Skip to content

Commit db0aa2e

Browse files
dhowellsbrauner
authored andcommitted
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a list of folio_queue structures to be used to provide a buffer to iov_iter-taking functions, such as sendmsg and recvmsg. The folio_queue structure looks like: struct folio_queue { struct folio_batch vec; u8 orders[PAGEVEC_SIZE]; struct folio_queue *next; struct folio_queue *prev; unsigned long marks; unsigned long marks2; }; It does not use a list_head so that next and/or prev can be set to NULL at the ends of the list, allowing iov_iter-handling routines to determine that they *are* the ends without needing to store a head pointer in the iov_iter struct. A folio_batch struct is used to hold the folio pointers which allows the batch to be passed to batch handling functions. Two mark bits are available per slot. The intention is to use at least one of them to mark folios that need putting, but that might not be ultimately necessary. Accessor functions are used to access the slots to do the masking and an additional accessor function is used to indicate the size of the array. The order of each folio is also stored in the structure to avoid the need for iov_iter_advance() and iov_iter_revert() to have to query each folio to find its size. With careful barriering, this can be used as an extending buffer with new folios inserted and new folio_queue structs added without the need for a lock. Further, provided we always keep at least one struct in the buffer, we can also remove consumed folios and consumed structs from the head end as we go without the need for locks. [Questions/thoughts] (1) To manage this, I need a head pointer, a tail pointer, a tail slot number (assuming insertion happens at the tail end and the next pointers point from head to tail). Should I put these into a struct of their own, say "folio_queue_head" or "rolling_buffer"? 
I will end up with two of these in netfs_io_request eventually, one keeping track of the pagecache I'm dealing with for buffered I/O and the other to hold a bounce buffer when we need one. (2) Should I make the slots {folio,off,len} or bio_vec? (3) This is intended to replace ITER_XARRAY eventually. Using an xarray in I/O iteration requires the taking of the RCU read lock, doing copying under the RCU read lock, walking the xarray (which may change under us), handling retries and dealing with special values. The advantage of ITER_XARRAY is that when we're dealing with the pagecache directly, we don't need any allocation - but if we're doing encrypted comms, there's a good chance we'd be using a bounce buffer anyway. This will require afs, erofs, cifs, orangefs and fscache to be converted to not use this. afs still uses it for dirs and symlinks; some of erofs usages should be easy to change, but there's one which won't be so easy; ceph's use via fscache can be fixed by porting ceph to netfslib; cifs is using xarray as a bounce buffer - that can be moved to use sheaves instead; and orangefs has a similar problem to erofs - maybe orangefs could use netfslib? Signed-off-by: David Howells <[email protected]> cc: Matthew Wilcox <[email protected]> cc: Jeff Layton <[email protected]> cc: Steve French <[email protected]> cc: Ilya Dryomov <[email protected]> cc: Gao Xiang <[email protected]> cc: Mike Marshall <[email protected]> cc: [email protected] cc: [email protected] cc: [email protected] cc: [email protected] cc: [email protected] cc: [email protected] cc: [email protected] cc: [email protected] Link: https://lore.kernel.org/r/[email protected]/ # v2 Signed-off-by: Christian Brauner <[email protected]>
1 parent 22de489 commit db0aa2e

File tree

6 files changed

+771
-4
lines changed

6 files changed

+771
-4
lines changed

include/linux/folio_queue.h

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/* SPDX-License-Identifier: GPL-2.0-or-later */
2+
/* Queue of folios definitions
3+
*
4+
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
5+
* Written by David Howells ([email protected])
6+
*/
7+
8+
#ifndef _LINUX_FOLIO_QUEUE_H
9+
#define _LINUX_FOLIO_QUEUE_H
10+
11+
#include <linux/pagevec.h>
12+
13+
/*
14+
* Segment in a queue of running buffers. Each segment can hold a number of
15+
* folios and a portion of the queue can be referenced with the ITER_FOLIOQ
16+
* iterator. The possibility exists of inserting non-folio elements into the
17+
* queue (such as gaps).
18+
*
19+
* Explicit prev and next pointers are used instead of a list_head to make it
20+
* easier to add segments to tail and remove them from the head without the
21+
* need for a lock.
22+
*/
23+
struct folio_queue {
24+
struct folio_batch vec; /* Folios in the queue segment */
25+
u8 orders[PAGEVEC_SIZE]; /* Order of each folio */
26+
struct folio_queue *next; /* Next queue segment or NULL */
27+
struct folio_queue *prev; /* Previous queue segment of NULL */
28+
unsigned long marks; /* 1-bit mark per folio */
29+
unsigned long marks2; /* Second 1-bit mark per folio */
30+
#if PAGEVEC_SIZE > BITS_PER_LONG
31+
#error marks is not big enough
32+
#endif
33+
};
34+
35+
static inline void folioq_init(struct folio_queue *folioq)
36+
{
37+
folio_batch_init(&folioq->vec);
38+
folioq->next = NULL;
39+
folioq->prev = NULL;
40+
folioq->marks = 0;
41+
folioq->marks2 = 0;
42+
}
43+
44+
static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq)
45+
{
46+
return PAGEVEC_SIZE;
47+
}
48+
49+
static inline unsigned int folioq_count(struct folio_queue *folioq)
50+
{
51+
return folio_batch_count(&folioq->vec);
52+
}
53+
54+
/**
 * folioq_full - Query whether a folio queue segment has no free slots
 * @folioq: The segment to query
 *
 * Return: true if the occupancy has reached (or exceeded) the slot capacity.
 */
static inline bool folioq_full(struct folio_queue *folioq)
{
	const unsigned int used = folioq_count(folioq);
	const unsigned int capacity = folioq_nr_slots(folioq);

	return used >= capacity;
}
59+
60+
static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot)
61+
{
62+
return test_bit(slot, &folioq->marks);
63+
}
64+
65+
static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot)
66+
{
67+
set_bit(slot, &folioq->marks);
68+
}
69+
70+
static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot)
71+
{
72+
clear_bit(slot, &folioq->marks);
73+
}
74+
75+
static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot)
76+
{
77+
return test_bit(slot, &folioq->marks2);
78+
}
79+
80+
static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot)
81+
{
82+
set_bit(slot, &folioq->marks2);
83+
}
84+
85+
static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot)
86+
{
87+
clear_bit(slot, &folioq->marks2);
88+
}
89+
90+
static inline unsigned int __folio_order(struct folio *folio)
91+
{
92+
if (!folio_test_large(folio))
93+
return 0;
94+
return folio->_flags_1 & 0xff;
95+
}
96+
97+
static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio)
98+
{
99+
unsigned int slot = folioq->vec.nr++;
100+
101+
folioq->vec.folios[slot] = folio;
102+
folioq->orders[slot] = __folio_order(folio);
103+
return slot;
104+
}
105+
106+
static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio)
107+
{
108+
unsigned int slot = folioq->vec.nr++;
109+
110+
folioq->vec.folios[slot] = folio;
111+
folioq->orders[slot] = __folio_order(folio);
112+
folioq_mark(folioq, slot);
113+
return slot;
114+
}
115+
116+
static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot)
117+
{
118+
return folioq->vec.folios[slot];
119+
}
120+
121+
static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot)
122+
{
123+
return folioq->orders[slot];
124+
}
125+
126+
static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot)
127+
{
128+
return PAGE_SIZE << folioq_folio_order(folioq, slot);
129+
}
130+
131+
static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot)
132+
{
133+
folioq->vec.folios[slot] = NULL;
134+
folioq_unmark(folioq, slot);
135+
folioq_unmark2(folioq, slot);
136+
}
137+
138+
#endif /* _LINUX_FOLIO_QUEUE_H */

include/linux/iov_iter.h

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <linux/uio.h>
1212
#include <linux/bvec.h>
13+
#include <linux/folio_queue.h>
1314

1415
typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
1516
void *priv, void *priv2);
@@ -140,6 +141,60 @@ size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
140141
return progress;
141142
}
142143

144+
/*
145+
* Handle ITER_FOLIOQ.
146+
*/
147+
static __always_inline
148+
size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2,
149+
iov_step_f step)
150+
{
151+
const struct folio_queue *folioq = iter->folioq;
152+
unsigned int slot = iter->folioq_slot;
153+
size_t progress = 0, skip = iter->iov_offset;
154+
155+
if (slot == folioq_nr_slots(folioq)) {
156+
/* The iterator may have been extended. */
157+
folioq = folioq->next;
158+
slot = 0;
159+
}
160+
161+
do {
162+
struct folio *folio = folioq_folio(folioq, slot);
163+
size_t part, remain, consumed;
164+
size_t fsize;
165+
void *base;
166+
167+
if (!folio)
168+
break;
169+
170+
fsize = folioq_folio_size(folioq, slot);
171+
base = kmap_local_folio(folio, skip);
172+
part = umin(len, PAGE_SIZE - skip % PAGE_SIZE);
173+
remain = step(base, progress, part, priv, priv2);
174+
kunmap_local(base);
175+
consumed = part - remain;
176+
len -= consumed;
177+
progress += consumed;
178+
skip += consumed;
179+
if (skip >= fsize) {
180+
skip = 0;
181+
slot++;
182+
if (slot == folioq_nr_slots(folioq) && folioq->next) {
183+
folioq = folioq->next;
184+
slot = 0;
185+
}
186+
}
187+
if (remain)
188+
break;
189+
} while (len);
190+
191+
iter->folioq_slot = slot;
192+
iter->folioq = folioq;
193+
iter->iov_offset = skip;
194+
iter->count -= progress;
195+
return progress;
196+
}
197+
143198
/*
144199
* Handle ITER_XARRAY.
145200
*/
@@ -249,6 +304,8 @@ size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
249304
return iterate_bvec(iter, len, priv, priv2, step);
250305
if (iov_iter_is_kvec(iter))
251306
return iterate_kvec(iter, len, priv, priv2, step);
307+
if (iov_iter_is_folioq(iter))
308+
return iterate_folioq(iter, len, priv, priv2, step);
252309
if (iov_iter_is_xarray(iter))
253310
return iterate_xarray(iter, len, priv, priv2, step);
254311
return iterate_discard(iter, len, priv, priv2, step);

include/linux/uio.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <uapi/linux/uio.h>
1212

1313
struct page;
14+
struct folio_queue;
1415

1516
typedef unsigned int __bitwise iov_iter_extraction_t;
1617

@@ -25,6 +26,7 @@ enum iter_type {
2526
ITER_IOVEC,
2627
ITER_BVEC,
2728
ITER_KVEC,
29+
ITER_FOLIOQ,
2830
ITER_XARRAY,
2931
ITER_DISCARD,
3032
};
@@ -66,6 +68,7 @@ struct iov_iter {
6668
const struct iovec *__iov;
6769
const struct kvec *kvec;
6870
const struct bio_vec *bvec;
71+
const struct folio_queue *folioq;
6972
struct xarray *xarray;
7073
void __user *ubuf;
7174
};
@@ -74,6 +77,7 @@ struct iov_iter {
7477
};
7578
union {
7679
unsigned long nr_segs;
80+
u8 folioq_slot;
7781
loff_t xarray_start;
7882
};
7983
};
@@ -126,6 +130,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i)
126130
return iov_iter_type(i) == ITER_DISCARD;
127131
}
128132

133+
static inline bool iov_iter_is_folioq(const struct iov_iter *i)
134+
{
135+
return iov_iter_type(i) == ITER_FOLIOQ;
136+
}
137+
129138
static inline bool iov_iter_is_xarray(const struct iov_iter *i)
130139
{
131140
return iov_iter_type(i) == ITER_XARRAY;
@@ -273,6 +282,9 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec
273282
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
274283
unsigned long nr_segs, size_t count);
275284
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
285+
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
286+
const struct folio_queue *folioq,
287+
unsigned int first_slot, unsigned int offset, size_t count);
276288
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
277289
loff_t start, size_t count);
278290
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,

0 commit comments

Comments
 (0)