Commit bd61848

mina authored and Paolo Abeni committed
net: devmem: Implement TX path
Augment dmabuf binding to be able to handle TX. In addition to all the RX binding state, we also create the tx_vec needed for the TX path.

Provide an API for sendmsg to be able to send dmabufs bound to this device:

- Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from.
- MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf.

Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY implementation, while disabling the instances where MSG_ZEROCOPY falls back to copying.

We additionally pipe the binding down to the new zerocopy_fill_skb_from_devmem, which fills a TX skb with net_iov netmems instead of the traditional page netmems.

We also special-case skb_frag_dma_map to return the dma-address of these dmabuf net_iovs instead of attempting to map pages.

The TX path may release the dmabuf in a context where we cannot wait. This happens when the user unbinds a TX dmabuf while there are still references to its netmems in the TX path. In that case, the netmems will be put_netmem'd from a context where we can't unmap the dmabuf. Resolve this by making __net_devmem_dmabuf_binding_free schedule_work'd.

Based on work by Stanislav Fomichev <[email protected]>. A lot of the meat of the implementation came from devmem TCP RFC v1 [1], which included the TX path, but Stan did all the rebasing on top of netmem/net_iov.

Cc: Stanislav Fomichev <[email protected]>
Signed-off-by: Kaiyuan Zhang <[email protected]>
Signed-off-by: Mina Almasry <[email protected]>
Acked-by: Stanislav Fomichev <[email protected]>
Link: https://patch.msgid.link/[email protected]
Signed-off-by: Paolo Abeni <[email protected]>
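As a rough illustration of the sendmsg API described above (not part of this commit's diff): the sketch below assumes the dma-buf has already been bound for TX through the netdev netlink API and returned dmabuf_id, that SO_ZEROCOPY is enabled on the socket, and that the cmsg payload is a dmabuf_tx_cmsg carrying that id, as the commit message describes. The UAPI header is not shown in this excerpt, so treat the struct name, its include location, and the availability of SCM_DEVMEM_DMABUF in your headers as assumptions.

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/types.h>
#include <linux/uio.h>          /* struct dmabuf_tx_cmsg (assumed location) */

static ssize_t send_from_dmabuf(int fd, size_t dmabuf_off, size_t len,
                                __u32 dmabuf_id)
{
        /* iov_base is not a pointer here: it is a byte offset into the bound
         * dma-buf, which is how zerocopy_fill_skb_from_devmem() interprets it.
         */
        struct iovec iov = {
                .iov_base = (void *)dmabuf_off,
                .iov_len  = len,
        };
        char ctrl[CMSG_SPACE(sizeof(struct dmabuf_tx_cmsg))] = {0};
        struct dmabuf_tx_cmsg ddmabuf = { .dmabuf_id = dmabuf_id };
        struct msghdr msg = {
                .msg_iov        = &iov,
                .msg_iovlen     = 1,
                .msg_control    = ctrl,
                .msg_controllen = sizeof(ctrl),
        };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type  = SCM_DEVMEM_DMABUF;   /* from recent kernel headers */
        cmsg->cmsg_len   = CMSG_LEN(sizeof(ddmabuf));
        memcpy(CMSG_DATA(cmsg), &ddmabuf, sizeof(ddmabuf));

        /* Devmem is uncopyable, so MSG_ZEROCOPY is mandatory; completions are
         * reported on the socket error queue as with regular MSG_ZEROCOPY.
         */
        return sendmsg(fd, &msg, MSG_ZEROCOPY);
}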
1 parent 8802087 commit bd61848

13 files changed (+340, -60 lines)


include/linux/skbuff.h

Lines changed: 13 additions & 4 deletions
@@ -1707,26 +1707,31 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
 extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;
 
 struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
-                                       struct ubuf_info *uarg);
+                                       struct ubuf_info *uarg, bool devmem);
 
 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
+struct net_devmem_dmabuf_binding;
+
 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
                             struct sk_buff *skb, struct iov_iter *from,
-                            size_t length);
+                            size_t length,
+                            struct net_devmem_dmabuf_binding *binding);
 
 int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
                                 struct iov_iter *from, size_t length);
 
 static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
                                           struct msghdr *msg, int len)
 {
-        return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
+        return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
+                                       NULL);
 }
 
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                              struct msghdr *msg, int len,
-                             struct ubuf_info *uarg);
+                             struct ubuf_info *uarg,
+                             struct net_devmem_dmabuf_binding *binding);
 
 /* Internal */
 #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB)))
@@ -3697,6 +3702,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
                                             size_t offset, size_t size,
                                             enum dma_data_direction dir)
 {
+        if (skb_frag_is_net_iov(frag)) {
+                return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
+                       frag->offset;
+        }
         return dma_map_page(dev, skb_frag_page(frag),
                             skb_frag_off(frag) + offset, size, dir);
 }

include/net/sock.h

Lines changed: 1 addition & 0 deletions
@@ -1851,6 +1851,7 @@ struct sockcm_cookie {
         u32 tsflags;
         u32 ts_opt_id;
         u32 priority;
+        u32 dmabuf_id;
 };
 
 static inline void sockcm_init(struct sockcm_cookie *sockc,

io_uring/zcrx.c

Lines changed: 1 addition & 1 deletion
@@ -810,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
                 return io_zcrx_copy_frag(req, ifq, frag, off, len);
 
         niov = netmem_to_net_iov(frag->netmem);
-        if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
+        if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
             io_pp_to_ifq(niov->pp) != ifq)
                 return -EFAULT;
 

net/core/datagram.c

Lines changed: 46 additions & 2 deletions
@@ -63,6 +63,8 @@
 #include <net/busy_poll.h>
 #include <crypto/hash.h>
 
+#include "devmem.h"
+
 /*
  *        Is a socket 'connection oriented' ?
  */
@@ -691,16 +693,58 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
         return 0;
 }
 
+static int
+zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
+                              int length,
+                              struct net_devmem_dmabuf_binding *binding)
+{
+        int i = skb_shinfo(skb)->nr_frags;
+        size_t virt_addr, size, off;
+        struct net_iov *niov;
+
+        /* Devmem filling works by taking an IOVEC from the user where the
+         * iov_addrs are interpreted as an offset in bytes into the dma-buf to
+         * send from. We do not support other iter types.
+         */
+        if (iov_iter_type(from) != ITER_IOVEC)
+                return -EFAULT;
+
+        while (length && iov_iter_count(from)) {
+                if (i == MAX_SKB_FRAGS)
+                        return -EMSGSIZE;
+
+                virt_addr = (size_t)iter_iov_addr(from);
+                niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
+                if (!niov)
+                        return -EFAULT;
+
+                size = min_t(size_t, size, length);
+                size = min_t(size_t, size, iter_iov_len(from));
+
+                get_netmem(net_iov_to_netmem(niov));
+                skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
+                                       size, PAGE_SIZE);
+                iov_iter_advance(from, size);
+                length -= size;
+                i++;
+        }
+
+        return 0;
+}
+
 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
                             struct sk_buff *skb, struct iov_iter *from,
-                            size_t length)
+                            size_t length,
+                            struct net_devmem_dmabuf_binding *binding)
 {
         unsigned long orig_size = skb->truesize;
         unsigned long truesize;
         int ret;
 
         if (msg && msg->msg_ubuf && msg->sg_from_iter)
                 ret = msg->sg_from_iter(skb, from, length);
+        else if (binding)
+                ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
         else
                 ret = zerocopy_fill_skb_from_iter(skb, from, length);
 
@@ -734,7 +778,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
         if (skb_copy_datagram_from_iter(skb, 0, from, copy))
                 return -EFAULT;
 
-        return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
+        return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
 }
 EXPORT_SYMBOL(zerocopy_sg_from_iter);
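A side note on zerocopy_fill_skb_from_devmem() above: each iovec address is interpreted as a byte offset into the bound dma-buf, and the data is chopped into skb frags that never cross a PAGE_SIZE boundary of the binding. Below is a standalone userspace sketch of that chunking arithmetic (not kernel code; PAGE_SIZE is assumed to be 4096, and the offset and length are made-up example values).

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        size_t virt_addr = 5000;        /* example offset into the dma-buf */
        size_t length    = 10000;       /* example number of bytes to send */

        while (length) {
                size_t off  = virt_addr % PAGE_SIZE;    /* intra-page offset */
                size_t size = PAGE_SIZE - off;          /* room left in that page */

                if (size > length)
                        size = length;

                /* One line per skb frag the kernel path would add via
                 * skb_add_rx_frag_netmem(): which net_iov, at what offset,
                 * for how many bytes.
                 */
                printf("frag: niov %zu, off %zu, len %zu\n",
                       virt_addr / PAGE_SIZE, off, size);

                virt_addr += size;
                length    -= size;
        }
        return 0;
}

The niov index and intra-page offset here are exactly what net_devmem_get_niov_at() computes from the tx_vec further down in this commit.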

net/core/devmem.c

Lines changed: 99 additions & 19 deletions
@@ -16,6 +16,7 @@
 #include <net/netdev_rx_queue.h>
 #include <net/page_pool/helpers.h>
 #include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
 #include <trace/events/page_pool.h>
 
 #include "devmem.h"
@@ -52,8 +53,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
                ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
 }
 
-void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
 {
+        struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
+
         size_t size, avail;
 
         gen_pool_for_each_chunk(binding->chunk_pool,
@@ -71,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
         dma_buf_detach(binding->dmabuf, binding->attachment);
         dma_buf_put(binding->dmabuf);
         xa_destroy(&binding->bound_rxqs);
+        kvfree(binding->tx_vec);
         kfree(binding);
 }
+EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free);
 
 struct net_iov *
 net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
@@ -117,6 +122,13 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
         unsigned long xa_idx;
         unsigned int rxq_idx;
 
+        xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+        /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
+         * erase.
+         */
+        synchronize_net();
+
         if (binding->list.next)
                 list_del(&binding->list);
 
@@ -131,8 +143,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
                 __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
         }
 
-        xa_erase(&net_devmem_dmabuf_bindings, binding->id);
-
         net_devmem_dmabuf_binding_put(binding);
 }
 

@@ -166,8 +176,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 }
 
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
-                       struct netlink_ext_ack *extack)
+net_devmem_bind_dmabuf(struct net_device *dev,
+                       enum dma_data_direction direction,
+                       unsigned int dmabuf_fd, struct netlink_ext_ack *extack)
 {
         struct net_devmem_dmabuf_binding *binding;
         static u32 id_alloc_next;
@@ -189,13 +200,6 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
         }
 
         binding->dev = dev;
-
-        err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
-                              binding, xa_limit_32b, &id_alloc_next,
-                              GFP_KERNEL);
-        if (err < 0)
-                goto err_free_binding;
-
         xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
 
         refcount_set(&binding->ref, 1);
@@ -206,26 +210,36 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
         if (IS_ERR(binding->attachment)) {
                 err = PTR_ERR(binding->attachment);
                 NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
-                goto err_free_id;
+                goto err_free_binding;
         }
 
         binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
-                                                       DMA_FROM_DEVICE);
+                                                       direction);
         if (IS_ERR(binding->sgt)) {
                 err = PTR_ERR(binding->sgt);
                 NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
                 goto err_detach;
         }
 
+        if (direction == DMA_TO_DEVICE) {
+                binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+                                                 sizeof(struct net_iov *),
+                                                 GFP_KERNEL);
+                if (!binding->tx_vec) {
+                        err = -ENOMEM;
+                        goto err_unmap;
+                }
+        }
+
         /* For simplicity we expect to make PAGE_SIZE allocations, but the
          * binding can be much more flexible than that. We may be able to
          * allocate MTU sized chunks here. Leave that for future work...
          */
-        binding->chunk_pool =
-                gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev));
+        binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+                                              dev_to_node(&dev->dev));
         if (!binding->chunk_pool) {
                 err = -ENOMEM;
-                goto err_unmap;
+                goto err_tx_vec;
         }
 
         virtual = 0;
@@ -270,31 +284,54 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
                         niov->owner = &owner->area;
                         page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
                                                       net_devmem_get_dma_addr(niov));
+                        if (direction == DMA_TO_DEVICE)
+                                binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
                 }
 
                 virtual += len;
         }
 
+        err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
+                              binding, xa_limit_32b, &id_alloc_next,
+                              GFP_KERNEL);
+        if (err < 0)
+                goto err_free_chunks;
+
         return binding;
 
 err_free_chunks:
         gen_pool_for_each_chunk(binding->chunk_pool,
                                 net_devmem_dmabuf_free_chunk_owner, NULL);
         gen_pool_destroy(binding->chunk_pool);
+err_tx_vec:
+        kvfree(binding->tx_vec);
 err_unmap:
         dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
                                           DMA_FROM_DEVICE);
 err_detach:
         dma_buf_detach(dmabuf, binding->attachment);
-err_free_id:
-        xa_erase(&net_devmem_dmabuf_bindings, binding->id);
 err_free_binding:
         kfree(binding);
 err_put_dmabuf:
         dma_buf_put(dmabuf);
         return ERR_PTR(err);
 }
 
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+        struct net_devmem_dmabuf_binding *binding;
+
+        rcu_read_lock();
+        binding = xa_load(&net_devmem_dmabuf_bindings, id);
+        if (binding) {
+                if (!net_devmem_dmabuf_binding_get(binding))
+                        binding = NULL;
+        }
+        rcu_read_unlock();
+
+        return binding;
+}
+
 void net_devmem_get_net_iov(struct net_iov *niov)
 {
         net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
@@ -305,6 +342,49 @@ void net_devmem_put_net_iov(struct net_iov *niov)
         net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
 }
 
+struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
+                                                         unsigned int dmabuf_id)
+{
+        struct net_devmem_dmabuf_binding *binding;
+        struct dst_entry *dst = __sk_dst_get(sk);
+        int err = 0;
+
+        binding = net_devmem_lookup_dmabuf(dmabuf_id);
+        if (!binding || !binding->tx_vec) {
+                err = -EINVAL;
+                goto out_err;
+        }
+
+        /* The dma-addrs in this binding are only reachable to the corresponding
+         * net_device.
+         */
+        if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) {
+                err = -ENODEV;
+                goto out_err;
+        }
+
+        return binding;
+
+out_err:
+        if (binding)
+                net_devmem_dmabuf_binding_put(binding);
+
+        return ERR_PTR(err);
+}
+
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
+                       size_t virt_addr, size_t *off, size_t *size)
+{
+        if (virt_addr >= binding->dmabuf->size)
+                return NULL;
+
+        *off = virt_addr % PAGE_SIZE;
+        *size = PAGE_SIZE - *off;
+
+        return binding->tx_vec[virt_addr / PAGE_SIZE];
+}
+
 /*** "Dmabuf devmem memory provider" ***/
 
 int mp_dmabuf_devmem_init(struct page_pool *pool)
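The work-based __net_devmem_dmabuf_binding_free() above implies a matching put helper that defers the actual unmap/detach when the final reference is dropped from a context that cannot sleep, which is the scenario the commit message calls out. That helper is not part of this excerpt; the following is a plausible sketch only, assuming the binding embeds the unbind_w work_struct used by the container_of() above and the ref refcount initialised in net_devmem_bind_dmabuf().

#include <linux/refcount.h>
#include <linux/workqueue.h>

/* Hypothetical sketch; the real helper lives outside this excerpt and may
 * differ in detail.
 */
static inline void
net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
{
        if (!refcount_dec_and_test(&binding->ref))
                return;

        /* The last reference can drop from a TX completion path that must not
         * sleep, so defer the dma-buf unmap/detach to a workqueue, where
         * __net_devmem_dmabuf_binding_free() runs.
         */
        INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
        schedule_work(&binding->unbind_w);
}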
