Commit bab632d

mstsirkin authored and davem330 committed
vhost: vhost TX zero-copy support
From: Shirley Ma <[email protected]>

This adds experimental zero copy support in vhost-net, disabled by default. To enable, set the experimental_zcopytx module option to 1.

This patch maintains the outstanding userspace buffers in the sequence they are delivered to vhost. An outstanding userspace buffer is marked as done once the lower device has finished DMA on it; this is monitored through the last-reference kfree_skb callback. Two buffer indices are used for this purpose.

The vhost-net device passes the userspace buffer info to the lower device's skb through message control. DMA-done status checking and guest notification are handled by handle_tx: in the worst case all buffers in the vq are in pending/done status, so we need to notify the guest to release DMA-done buffers before we can get any new buffers from the vq.

One known problem is that if the guest stops submitting buffers, buffers might never get used until some further action, e.g. device reset. This does not seem to affect Linux guests.

Signed-off-by: Shirley <[email protected]>
Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 5c74501 commit bab632d

3 files changed, 220 insertions(+), 16 deletions(-)
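The scheme the message describes — record each submitted buffer at upend_idx, mark it done from the skb callback, and return only the contiguous done prefix starting at done_idx to the guest — is small enough to demo standalone. Below is a minimal plain-C sketch of that bookkeeping (hypothetical names; the real logic is vhost_zerocopy_signal_used() in the vhost.c hunks further down):

/* Standalone sketch of the two-index scheme: upend_idx marks where the
 * next in-flight buffer is recorded, done_idx marks the oldest buffer
 * not yet returned to the guest.  Entries complete out of order; only
 * the contiguous completed prefix starting at done_idx is signalled. */
#include <stdio.h>

#define RING 8          /* the real code uses UIO_MAXIOV */
#define DMA_DONE_LEN 1  /* VHOST_DMA_DONE_LEN */
#define DMA_CLEAR_LEN 2 /* VHOST_DMA_CLEAR_LEN, value assumed */

static unsigned len[RING];      /* stands in for vq->heads[i].len */
static int done_idx, upend_idx;

/* mirrors vhost_zerocopy_signal_used() */
static int signal_used(void)
{
        int i, j = 0;

        for (i = done_idx; i != upend_idx; i = (i + 1) % RING) {
                if (len[i] != DMA_DONE_LEN)
                        break;          /* stop at first still-pending entry */
                len[i] = DMA_CLEAR_LEN; /* would vhost_add_used_and_signal() here */
                ++j;
        }
        if (j)
                done_idx = i;
        return j;
}

int main(void)
{
        /* three buffers submitted... */
        len[0] = 100; len[1] = 200; len[2] = 300;
        upend_idx = 3;

        len[1] = DMA_DONE_LEN;                   /* ...completing out of order */
        printf("signalled %d\n", signal_used()); /* 0: entry 0 still pending */

        len[0] = len[2] = DMA_DONE_LEN;
        printf("signalled %d\n", signal_used()); /* 3: prefix now contiguous */
        return 0;
}

With the module built, the feature is switched on via the module option this patch adds, e.g. modprobe vhost_net experimental_zcopytx=1.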

drivers/vhost/net.c

Lines changed: 76 additions & 1 deletion
@@ -12,6 +12,7 @@
 #include <linux/virtio_net.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
 #include <linux/rcupdate.h>
@@ -28,10 +29,18 @@
 
 #include "vhost.h"
 
+static int experimental_zcopytx;
+module_param(experimental_zcopytx, int, 0444);
+MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX");
+
 /* Max number of bytes transferred before requeueing the job.
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+/* MAX number of TX used buffers for outstanding zerocopy */
+#define VHOST_MAX_PEND 128
+#define VHOST_GOODCOPY_LEN 256
+
 enum {
         VHOST_NET_VQ_RX = 0,
         VHOST_NET_VQ_TX = 1,
@@ -54,6 +63,12 @@ struct vhost_net {
         enum vhost_net_poll_state tx_poll_state;
 };
 
+static bool vhost_sock_zcopy(struct socket *sock)
+{
+        return unlikely(experimental_zcopytx) &&
+                sock_flag(sock->sk, SOCK_ZEROCOPY);
+}
+
 /* Pop first len bytes from iovec. Return number of segments used. */
 static int move_iovec_hdr(struct iovec *from, struct iovec *to,
                           size_t len, int iov_count)
@@ -129,6 +144,8 @@ static void handle_tx(struct vhost_net *net)
         int err, wmem;
         size_t hdr_size;
         struct socket *sock;
+        struct vhost_ubuf_ref *uninitialized_var(ubufs);
+        bool zcopy;
 
         /* TODO: check that we are running from vhost_worker? */
         sock = rcu_dereference_check(vq->private_data, 1);
@@ -149,8 +166,13 @@ static void handle_tx(struct vhost_net *net)
         if (wmem < sock->sk->sk_sndbuf / 2)
                 tx_poll_stop(net);
         hdr_size = vq->vhost_hlen;
+        zcopy = vhost_sock_zcopy(sock);
 
         for (;;) {
+                /* Release DMAs done buffers first */
+                if (zcopy)
+                        vhost_zerocopy_signal_used(vq);
+
                 head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
                                          ARRAY_SIZE(vq->iov),
                                          &out, &in,
@@ -166,6 +188,13 @@
                                 set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
                                 break;
                         }
+                        /* If more outstanding DMAs, queue the work */
+                        if (unlikely(vq->upend_idx - vq->done_idx >
+                                     VHOST_MAX_PEND)) {
+                                tx_poll_start(net, sock);
+                                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+                                break;
+                        }
                         if (unlikely(vhost_enable_notify(&net->dev, vq))) {
                                 vhost_disable_notify(&net->dev, vq);
                                 continue;
@@ -188,17 +217,48 @@
                                iov_length(vq->hdr, s), hdr_size);
                         break;
                 }
+                /* use msg_control to pass vhost zerocopy ubuf info to skb */
+                if (zcopy) {
+                        vq->heads[vq->upend_idx].id = head;
+                        if (len < VHOST_GOODCOPY_LEN) {
+                                /* copy don't need to wait for DMA done */
+                                vq->heads[vq->upend_idx].len =
+                                        VHOST_DMA_DONE_LEN;
+                                msg.msg_control = NULL;
+                                msg.msg_controllen = 0;
+                                ubufs = NULL;
+                        } else {
+                                struct ubuf_info *ubuf = &vq->ubuf_info[head];
+
+                                vq->heads[vq->upend_idx].len = len;
+                                ubuf->callback = vhost_zerocopy_callback;
+                                ubuf->arg = vq->ubufs;
+                                ubuf->desc = vq->upend_idx;
+                                msg.msg_control = ubuf;
+                                msg.msg_controllen = sizeof(ubuf);
+                                ubufs = vq->ubufs;
+                                kref_get(&ubufs->kref);
+                        }
+                        vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+                }
                 /* TODO: Check specific error and bomb out unless ENOBUFS? */
                 err = sock->ops->sendmsg(NULL, sock, &msg, len);
                 if (unlikely(err < 0)) {
+                        if (zcopy) {
+                                if (ubufs)
+                                        vhost_ubuf_put(ubufs);
+                                vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
+                                        UIO_MAXIOV;
+                        }
                         vhost_discard_vq_desc(vq, 1);
                         tx_poll_start(net, sock);
                         break;
                 }
                 if (err != len)
                         pr_debug("Truncated TX packet: "
                                  " len %d != %zd\n", err, len);
-                vhost_add_used_and_signal(&net->dev, vq, head, 0);
+                if (!zcopy)
+                        vhost_add_used_and_signal(&net->dev, vq, head, 0);
                 total_len += len;
                 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
                         vhost_poll_queue(&vq->poll);
@@ -603,6 +663,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 {
         struct socket *sock, *oldsock;
         struct vhost_virtqueue *vq;
+        struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
         int r;
 
         mutex_lock(&n->dev.mutex);
@@ -632,13 +693,23 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
         oldsock = rcu_dereference_protected(vq->private_data,
                                             lockdep_is_held(&vq->mutex));
         if (sock != oldsock) {
+                ubufs = vhost_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock));
+                if (IS_ERR(ubufs)) {
+                        r = PTR_ERR(ubufs);
+                        goto err_ubufs;
+                }
+                oldubufs = vq->ubufs;
+                vq->ubufs = ubufs;
                 vhost_net_disable_vq(n, vq);
                 rcu_assign_pointer(vq->private_data, sock);
                 vhost_net_enable_vq(n, vq);
         }
 
         mutex_unlock(&vq->mutex);
 
+        if (oldubufs)
+                vhost_ubuf_put_and_wait(oldubufs);
+
         if (oldsock) {
                 vhost_net_flush_vq(n, index);
                 fput(oldsock->file);
@@ -647,6 +718,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
         mutex_unlock(&n->dev.mutex);
         return 0;
 
+err_ubufs:
+        fput(sock->file);
 err_vq:
         mutex_unlock(&vq->mutex);
 err:
@@ -776,6 +849,8 @@ static struct miscdevice vhost_net_misc = {
 
 static int vhost_net_init(void)
 {
+        if (experimental_zcopytx)
+                vhost_enable_zcopy(VHOST_NET_VQ_TX);
         return misc_register(&vhost_net_misc);
 }
 module_init(vhost_net_init);
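The msg_control plumbing above only works if the lower device cooperates. Below is a hedged sketch of the consumer side — the real code lives in the tun/macvtap drivers and net/core/skbuff.c, which are outside this diff, so the names are assumptions based on the commit description:

/* Hypothetical consumer-side sketch -- NOT part of this commit. */

/* transmit path of a zero-copy capable backend: stash the ubuf_info
 * that arrived in msg_control on the skb before handing it onward */
static void zc_attach(struct sk_buff *skb, struct ubuf_info *ubuf)
{
        skb_shinfo(skb)->destructor_arg = ubuf;
        skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;  /* flag name assumed */
}

/* free path: when the last reference to the skb data goes away (the
 * "last reference of kfree_skb callback" in the commit message),
 * report the DMA as done */
static void zc_release(struct sk_buff *skb)
{
        struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;

        if (uarg && uarg->callback)
                uarg->callback(uarg);  /* ends up in vhost_zerocopy_callback() */
}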

drivers/vhost/vhost.c

Lines changed: 113 additions & 15 deletions
@@ -37,6 +37,8 @@ enum {
         VHOST_MEMORY_F_LOG = 0x1,
 };
 
+static unsigned vhost_zcopy_mask __read_mostly;
+
 #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
 
@@ -179,6 +181,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
         vq->call_ctx = NULL;
         vq->call = NULL;
         vq->log_ctx = NULL;
+        vq->upend_idx = 0;
+        vq->done_idx = 0;
+        vq->ubufs = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -225,10 +230,28 @@ static int vhost_worker(void *data)
         return 0;
 }
 
+static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
+{
+        kfree(vq->indirect);
+        vq->indirect = NULL;
+        kfree(vq->log);
+        vq->log = NULL;
+        kfree(vq->heads);
+        vq->heads = NULL;
+        kfree(vq->ubuf_info);
+        vq->ubuf_info = NULL;
+}
+
+void vhost_enable_zcopy(int vq)
+{
+        vhost_zcopy_mask |= 0x1 << vq;
+}
+
 /* Helper to allocate iovec buffers for all vqs. */
 static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 {
         int i;
+        bool zcopy;
 
         for (i = 0; i < dev->nvqs; ++i) {
                 dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect *
@@ -237,34 +260,30 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
                                                GFP_KERNEL);
                 dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
                                             UIO_MAXIOV, GFP_KERNEL);
-
+                zcopy = vhost_zcopy_mask & (0x1 << i);
+                if (zcopy)
+                        dev->vqs[i].ubuf_info =
+                                kmalloc(sizeof *dev->vqs[i].ubuf_info *
+                                        UIO_MAXIOV, GFP_KERNEL);
                 if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
-                    !dev->vqs[i].heads)
+                    !dev->vqs[i].heads ||
+                    (zcopy && !dev->vqs[i].ubuf_info))
                         goto err_nomem;
         }
         return 0;
 
 err_nomem:
-        for (; i >= 0; --i) {
-                kfree(dev->vqs[i].indirect);
-                kfree(dev->vqs[i].log);
-                kfree(dev->vqs[i].heads);
-        }
+        for (; i >= 0; --i)
+                vhost_vq_free_iovecs(&dev->vqs[i]);
         return -ENOMEM;
 }
 
 static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 {
         int i;
 
-        for (i = 0; i < dev->nvqs; ++i) {
-                kfree(dev->vqs[i].indirect);
-                dev->vqs[i].indirect = NULL;
-                kfree(dev->vqs[i].log);
-                dev->vqs[i].log = NULL;
-                kfree(dev->vqs[i].heads);
-                dev->vqs[i].heads = NULL;
-        }
+        for (i = 0; i < dev->nvqs; ++i)
+                vhost_vq_free_iovecs(&dev->vqs[i]);
 }
 
 long vhost_dev_init(struct vhost_dev *dev,
@@ -287,6 +306,7 @@ long vhost_dev_init(struct vhost_dev *dev,
                 dev->vqs[i].log = NULL;
                 dev->vqs[i].indirect = NULL;
                 dev->vqs[i].heads = NULL;
+                dev->vqs[i].ubuf_info = NULL;
                 dev->vqs[i].dev = dev;
                 mutex_init(&dev->vqs[i].mutex);
                 vhost_vq_reset(dev, dev->vqs + i);
@@ -390,6 +410,30 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
         return 0;
 }
 
+/* In case of DMA done not in order in lower device driver for some reason.
+ * upend_idx is used to track end of used idx, done_idx is used to track head
+ * of used idx. Once lower device DMA done contiguously, we will signal KVM
+ * guest used idx.
+ */
+int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
+{
+        int i;
+        int j = 0;
+
+        for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
+                if ((vq->heads[i].len == VHOST_DMA_DONE_LEN)) {
+                        vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+                        vhost_add_used_and_signal(vq->dev, vq,
+                                                  vq->heads[i].id, 0);
+                        ++j;
+                } else
+                        break;
+        }
+        if (j)
+                vq->done_idx = i;
+        return j;
+}
+
 /* Caller should have device mutex */
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
@@ -400,6 +444,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
                         vhost_poll_stop(&dev->vqs[i].poll);
                         vhost_poll_flush(&dev->vqs[i].poll);
                 }
+                /* Wait for all lower device DMAs done. */
+                if (dev->vqs[i].ubufs)
+                        vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
+
+                /* Signal guest as appropriate. */
+                vhost_zerocopy_signal_used(&dev->vqs[i]);
+
                 if (dev->vqs[i].error_ctx)
                         eventfd_ctx_put(dev->vqs[i].error_ctx);
                 if (dev->vqs[i].error)
@@ -1486,3 +1537,50 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
                                &vq->used->flags, r);
         }
 }
+
+static void vhost_zerocopy_done_signal(struct kref *kref)
+{
+        struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
+                                                    kref);
+        wake_up(&ubufs->wait);
+}
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
+                                        bool zcopy)
+{
+        struct vhost_ubuf_ref *ubufs;
+        /* No zero copy backend? Nothing to count. */
+        if (!zcopy)
+                return NULL;
+        ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL);
+        if (!ubufs)
+                return ERR_PTR(-ENOMEM);
+        kref_init(&ubufs->kref);
+        kref_get(&ubufs->kref);
+        init_waitqueue_head(&ubufs->wait);
+        ubufs->vq = vq;
+        return ubufs;
+}
+
+void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs)
+{
+        kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+}
+
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
+{
+        kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+        wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+        kfree(ubufs);
+}
+
+void vhost_zerocopy_callback(void *arg)
+{
+        struct ubuf_info *ubuf = arg;
+        struct vhost_ubuf_ref *ubufs = ubuf->arg;
+        struct vhost_virtqueue *vq = ubufs->vq;
+
+        /* set len = 1 to mark this desc buffers done DMA */
+        vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
+        kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+}
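The net.c and vhost.c hunks account for 189 of the 220 added lines; the remaining declarations land in drivers/vhost/vhost.h, the third changed file, which this page does not render. From the usage above, those missing pieces are presumably along these lines (a reconstructed sketch, not the verbatim header):

/* Reconstructed sketch of the vhost.h additions -- assumed, not shown above.
 * struct vhost_virtqueue also presumably gains upend_idx, done_idx, ubufs
 * and ubuf_info fields, given how they are initialized and freed above. */
#define VHOST_DMA_DONE_LEN      1       /* DMA finished ("len = 1" per the callback comment) */
#define VHOST_DMA_CLEAR_LEN     2       /* entry already signalled to the guest; value assumed */

struct vhost_ubuf_ref {
        struct kref kref;               /* one ref per in-flight zerocopy skb + one base ref */
        wait_queue_head_t wait;         /* woken by vhost_zerocopy_done_signal() */
        struct vhost_virtqueue *vq;     /* vq whose heads[] slots these ubufs mark */
};

void vhost_enable_zcopy(int vq);
int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
void vhost_zerocopy_callback(void *arg);
struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy);
void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs);
void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs);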
