Commit fb42c83

Merge branch 'TX-used-ring-batched-updating-for-vhost'
Jason Wang says:

====================
TX used ring batched updating for vhost

This series implements batched updating of the used ring for TX, which helps
reduce cache contention on the used ring. The idea is to first split the
datacopy path from the zerocopy path and batch only for datacopy, since
zerocopy already supports its own batching.

TX PPS increased by 25.8%, and Netperf TCP does not show obvious differences.

The split of the datapath will also be helpful for future work such as
in-order completion.
====================

Acked-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
2 parents: 0ae0d60 + 4afb52c
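The core of the datacopy change is the batching pattern itself: completed TX heads are accumulated in vq->heads, and the used ring is published and the guest signalled with a single vhost_add_used_and_signal_n() call once nvq->done_idx reaches VHOST_NET_BATCH (64) or the send loop exits. The userspace C sketch below illustrates only that pattern under invented names (tx_batch, publish_and_signal, complete_one are hypothetical); the real implementation is vhost_net_signal_used() and the done_idx bookkeeping in the diff that follows.

/*
 * Minimal userspace sketch of the batching idea only -- not the kernel code.
 * All types and names here are hypothetical stand-ins for the structures in
 * drivers/vhost/net.c (vhost_net_signal_used(), nvq->done_idx, VHOST_NET_BATCH).
 */
#include <stdio.h>
#include <stddef.h>

#define BATCH 64                          /* mirrors VHOST_NET_BATCH */

struct used_elem { unsigned int id; unsigned int len; };

struct tx_batch {
	struct used_elem heads[BATCH];
	size_t done_idx;                  /* completions not yet published */
};

/* Stand-in for vhost_add_used_and_signal_n(): publish and notify once. */
static void publish_and_signal(struct tx_batch *b)
{
	if (!b->done_idx)
		return;
	printf("publish %zu used entries, signal guest once\n", b->done_idx);
	b->done_idx = 0;
}

/* Record one completed descriptor; flush only when the batch is full. */
static void complete_one(struct tx_batch *b, unsigned int head)
{
	b->heads[b->done_idx].id = head;
	b->heads[b->done_idx].len = 0;    /* TX: nothing is written back */
	if (++b->done_idx >= BATCH)
		publish_and_signal(b);
}

int main(void)
{
	struct tx_batch b = { .done_idx = 0 };

	for (unsigned int head = 0; head < 150; head++)
		complete_one(&b, head);   /* two full batches ... */

	publish_and_signal(&b);           /* ... plus a final partial flush */
	return 0;
}

Signalling once per flush instead of once per packet is what reduces the used-ring cache-line bouncing that the cover letter describes.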


drivers/vhost/net.c

Lines changed: 179 additions & 70 deletions
@@ -94,7 +94,7 @@ struct vhost_net_ubuf_ref {
 	struct vhost_virtqueue *vq;
 };
 
-#define VHOST_RX_BATCH 64
+#define VHOST_NET_BATCH 64
 struct vhost_net_buf {
 	void **queue;
 	int tail;
@@ -168,7 +168,7 @@ static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
 
 	rxq->head = 0;
 	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
-					     VHOST_RX_BATCH);
+					     VHOST_NET_BATCH);
 	return rxq->tail;
 }
 
@@ -428,17 +428,31 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 	return vhost_poll_start(poll, sock->file);
 }
 
+static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
+{
+	struct vhost_virtqueue *vq = &nvq->vq;
+	struct vhost_dev *dev = vq->dev;
+
+	if (!nvq->done_idx)
+		return;
+
+	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
+	nvq->done_idx = 0;
+}
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
-				    struct vhost_virtqueue *vq,
-				    struct iovec iov[], unsigned int iov_size,
+				    struct vhost_net_virtqueue *nvq,
 				    unsigned int *out_num, unsigned int *in_num,
 				    bool *busyloop_intr)
 {
+	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned long uninitialized_var(endtime);
 	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
 				  out_num, in_num, NULL, NULL);
 
 	if (r == vq->num && vq->busyloop_timeout) {
+		if (!vhost_sock_zcopy(vq->private_data))
+			vhost_net_signal_used(nvq);
 		preempt_disable();
 		endtime = busy_clock() + vq->busyloop_timeout;
 		while (vhost_can_busy_poll(endtime)) {
@@ -467,9 +481,62 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
 	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
 }
 
-/* Expects to be always run from workqueue - which acts as
- * read-size critical section for our kind of RCU. */
-static void handle_tx(struct vhost_net *net)
+static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
+			    size_t hdr_size, int out)
+{
+	/* Skip header. TODO: support TSO. */
+	size_t len = iov_length(vq->iov, out);
+
+	iov_iter_init(iter, WRITE, vq->iov, out, len);
+	iov_iter_advance(iter, hdr_size);
+
+	return iov_iter_count(iter);
+}
+
+static bool vhost_exceeds_weight(int pkts, int total_len)
+{
+	return total_len >= VHOST_NET_WEIGHT ||
+	       pkts >= VHOST_NET_PKT_WEIGHT;
+}
+
+static int get_tx_bufs(struct vhost_net *net,
+		       struct vhost_net_virtqueue *nvq,
+		       struct msghdr *msg,
+		       unsigned int *out, unsigned int *in,
+		       size_t *len, bool *busyloop_intr)
+{
+	struct vhost_virtqueue *vq = &nvq->vq;
+	int ret;
+
+	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
+
+	if (ret < 0 || ret == vq->num)
+		return ret;
+
+	if (*in) {
+		vq_err(vq, "Unexpected descriptor format for TX: out %d, int %d\n",
+		       *out, *in);
+		return -EFAULT;
+	}
+
+	/* Sanity check */
+	*len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
+	if (*len == 0) {
+		vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
+		       *len, nvq->vhost_hlen);
+		return -EFAULT;
+	}
+
+	return ret;
+}
+
+static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
+{
+	return total_len < VHOST_NET_WEIGHT &&
+	       !vhost_vq_avail_empty(vq->dev, vq);
+}
+
+static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
@@ -484,37 +551,86 @@ static void handle_tx(struct vhost_net *net)
 	};
 	size_t len, total_len = 0;
 	int err;
-	size_t hdr_size;
-	struct socket *sock;
-	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
-	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
-	mutex_lock(&vq->mutex);
-	sock = vq->private_data;
-	if (!sock)
-		goto out;
+	for (;;) {
+		bool busyloop_intr = false;
 
-	if (!vq_iotlb_prefetch(vq))
-		goto out;
+		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
+				   &busyloop_intr);
+		/* On error, stop handling until the next kick. */
+		if (unlikely(head < 0))
+			break;
+		/* Nothing new? Wait for eventfd to tell us they refilled. */
+		if (head == vq->num) {
+			if (unlikely(busyloop_intr)) {
+				vhost_poll_queue(&vq->poll);
+			} else if (unlikely(vhost_enable_notify(&net->dev,
+								vq))) {
+				vhost_disable_notify(&net->dev, vq);
+				continue;
+			}
+			break;
+		}
 
-	vhost_disable_notify(&net->dev, vq);
-	vhost_net_disable_vq(net, vq);
+		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
+		vq->heads[nvq->done_idx].len = 0;
 
-	hdr_size = nvq->vhost_hlen;
-	zcopy = nvq->ubufs;
+		total_len += len;
+		if (tx_can_batch(vq, total_len))
+			msg.msg_flags |= MSG_MORE;
+		else
+			msg.msg_flags &= ~MSG_MORE;
+
+		/* TODO: Check specific error and bomb out unless ENOBUFS? */
+		err = sock->ops->sendmsg(sock, &msg, len);
+		if (unlikely(err < 0)) {
+			vhost_discard_vq_desc(vq, 1);
+			vhost_net_enable_vq(net, vq);
+			break;
+		}
+		if (err != len)
+			pr_debug("Truncated TX packet: len %d != %zd\n",
+				 err, len);
+		if (++nvq->done_idx >= VHOST_NET_BATCH)
+			vhost_net_signal_used(nvq);
+		if (vhost_exceeds_weight(++sent_pkts, total_len)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+
+	vhost_net_signal_used(nvq);
+}
+
+static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
+{
+	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+	struct vhost_virtqueue *vq = &nvq->vq;
+	unsigned out, in;
+	int head;
+	struct msghdr msg = {
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_control = NULL,
+		.msg_controllen = 0,
+		.msg_flags = MSG_DONTWAIT,
+	};
+	size_t len, total_len = 0;
+	int err;
+	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+	bool zcopy_used;
+	int sent_pkts = 0;
 
 	for (;;) {
 		bool busyloop_intr;
 
 		/* Release DMAs done buffers first */
-		if (zcopy)
-			vhost_zerocopy_signal_used(net, vq);
+		vhost_zerocopy_signal_used(net, vq);
 
 		busyloop_intr = false;
-		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
-						ARRAY_SIZE(vq->iov),
-						&out, &in, &busyloop_intr);
+		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
+				   &busyloop_intr);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(head < 0))
 			break;
@@ -528,27 +644,10 @@ static void handle_tx(struct vhost_net *net)
 			}
 			break;
 		}
-		if (in) {
-			vq_err(vq, "Unexpected descriptor format for TX: "
-			       "out %d, int %d\n", out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO. */
-		len = iov_length(vq->iov, out);
-		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
-		iov_iter_advance(&msg.msg_iter, hdr_size);
-		/* Sanity check */
-		if (!msg_data_left(&msg)) {
-			vq_err(vq, "Unexpected header len for TX: "
-			       "%zd expected %zd\n",
-			       len, hdr_size);
-			break;
-		}
-		len = msg_data_left(&msg);
 
-		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
-			     && !vhost_exceeds_maxpend(net)
-			     && vhost_net_tx_select_zcopy(net);
+		zcopy_used = len >= VHOST_GOODCOPY_LEN
+			     && !vhost_exceeds_maxpend(net)
+			     && vhost_net_tx_select_zcopy(net);
 
 		/* use msg_control to pass vhost zerocopy ubuf info to skb */
 		if (zcopy_used) {
@@ -570,10 +669,8 @@ static void handle_tx(struct vhost_net *net)
 			msg.msg_control = NULL;
 			ubufs = NULL;
 		}
-
 		total_len += len;
-		if (total_len < VHOST_NET_WEIGHT &&
-		    !vhost_vq_avail_empty(&net->dev, vq) &&
+		if (tx_can_batch(vq, total_len) &&
 		    likely(!vhost_exceeds_maxpend(net))) {
 			msg.msg_flags |= MSG_MORE;
 		} else {
@@ -600,12 +697,37 @@ static void handle_tx(struct vhost_net *net)
 		else
 			vhost_zerocopy_signal_used(net, vq);
 		vhost_net_tx_packet(net);
-		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
-		    unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
+		if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
 			vhost_poll_queue(&vq->poll);
 			break;
 		}
 	}
+}
+
+/* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+static void handle_tx(struct vhost_net *net)
+{
+	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+	struct vhost_virtqueue *vq = &nvq->vq;
+	struct socket *sock;
+
+	mutex_lock(&vq->mutex);
+	sock = vq->private_data;
+	if (!sock)
+		goto out;
+
+	if (!vq_iotlb_prefetch(vq))
+		goto out;
+
+	vhost_disable_notify(&net->dev, vq);
+	vhost_net_disable_vq(net, vq);
+
+	if (vhost_sock_zcopy(sock))
+		handle_tx_zerocopy(net, sock);
+	else
+		handle_tx_copy(net, sock);
+
 out:
 	mutex_unlock(&vq->mutex);
 }
@@ -641,18 +763,6 @@ static int sk_has_rx_data(struct sock *sk)
 	return skb_queue_empty(&sk->sk_receive_queue);
 }
 
-static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
-{
-	struct vhost_virtqueue *vq = &nvq->vq;
-	struct vhost_dev *dev = vq->dev;
-
-	if (!nvq->done_idx)
-		return;
-
-	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
-	nvq->done_idx = 0;
-}
-
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
 				      bool *busyloop_intr)
 {
@@ -665,7 +775,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
 
 	if (!len && tvq->busyloop_timeout) {
 		/* Flush batched heads first */
-		vhost_rx_signal_used(rnvq);
+		vhost_net_signal_used(rnvq);
 		/* Both tx vq and rx socket were polled here */
 		mutex_lock_nested(&tvq->mutex, 1);
 		vhost_disable_notify(&net->dev, tvq);
@@ -907,13 +1017,12 @@ static void handle_rx(struct vhost_net *net)
 			goto out;
 		}
 		nvq->done_idx += headcount;
-		if (nvq->done_idx > VHOST_RX_BATCH)
-			vhost_rx_signal_used(nvq);
+		if (nvq->done_idx > VHOST_NET_BATCH)
+			vhost_net_signal_used(nvq);
 		if (unlikely(vq_log))
 			vhost_log_write(vq, vq_log, log, vhost_len);
 		total_len += vhost_len;
-		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
-		    unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
+		if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
 			vhost_poll_queue(&vq->poll);
 			goto out;
 		}
@@ -923,7 +1032,7 @@ static void handle_rx(struct vhost_net *net)
 	else
 		vhost_net_enable_vq(net, vq);
 out:
-	vhost_rx_signal_used(nvq);
+	vhost_net_signal_used(nvq);
 	mutex_unlock(&vq->mutex);
 }
 
@@ -976,7 +1085,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		return -ENOMEM;
 	}
 
-	queue = kmalloc_array(VHOST_RX_BATCH, sizeof(void *),
+	queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *),
 			      GFP_KERNEL);
 	if (!queue) {
 		kfree(vqs);
