@@ -94,7 +94,7 @@ struct vhost_net_ubuf_ref {
 	struct vhost_virtqueue *vq;
 };
 
-#define VHOST_RX_BATCH 64
+#define VHOST_NET_BATCH 64
 struct vhost_net_buf {
 	void **queue;
 	int tail;
@@ -168,7 +168,7 @@ static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
 
 	rxq->head = 0;
 	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
-					      VHOST_RX_BATCH);
+					      VHOST_NET_BATCH);
 	return rxq->tail;
 }
 
@@ -428,17 +428,31 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 	return vhost_poll_start(poll, sock->file);
 }
 
+static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
+{
+	struct vhost_virtqueue *vq = &nvq->vq;
+	struct vhost_dev *dev = vq->dev;
+
+	if (!nvq->done_idx)
+		return;
+
+	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
+	nvq->done_idx = 0;
+}
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
-				    struct vhost_virtqueue *vq,
-				    struct iovec iov[], unsigned int iov_size,
+				    struct vhost_net_virtqueue *nvq,
 				    unsigned int *out_num, unsigned int *in_num,
 				    bool *busyloop_intr)
 {
+	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned long uninitialized_var(endtime);
 	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
 				  out_num, in_num, NULL, NULL);
 
 	if (r == vq->num && vq->busyloop_timeout) {
+		if (!vhost_sock_zcopy(vq->private_data))
+			vhost_net_signal_used(nvq);
 		preempt_disable();
 		endtime = busy_clock() + vq->busyloop_timeout;
 		while (vhost_can_busy_poll(endtime)) {
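The hunk above adds the core batching helper: instead of signalling the guest once per packet, completed heads accumulate in nvq->done_idx and are pushed to the used ring in one vhost_add_used_and_signal_n() call. Below is a minimal, self-contained userspace sketch of that accumulate-then-flush pattern; the names used here (fake_vq, signal_used, add_used, NET_BATCH) are invented for the example and are not the kernel API.

/* Userspace model of batched "used" signalling (illustration only). */
#include <stdio.h>

#define NET_BATCH 64

struct fake_vq {
	int heads[NET_BATCH];	/* stand-in for vq->heads[] */
	int done_idx;		/* pending, not-yet-signalled heads */
};

/* Counterpart of vhost_net_signal_used(): flush all pending heads at once. */
static void signal_used(struct fake_vq *vq)
{
	if (!vq->done_idx)
		return;
	printf("signal guest: %d used buffers\n", vq->done_idx);
	vq->done_idx = 0;
}

/* Record one completed head; flush only when the batch is full. */
static void add_used(struct fake_vq *vq, int head)
{
	vq->heads[vq->done_idx] = head;
	if (++vq->done_idx >= NET_BATCH)
		signal_used(vq);
}

int main(void)
{
	struct fake_vq vq = { .done_idx = 0 };

	for (int head = 0; head < 150; head++)
		add_used(&vq, head);
	signal_used(&vq);	/* final flush, like the end of handle_tx_copy() */
	return 0;
}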
@@ -467,9 +481,62 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
 	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
 }
 
-/* Expects to be always run from workqueue - which acts as
- * read-size critical section for our kind of RCU. */
-static void handle_tx(struct vhost_net *net)
+static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
+			    size_t hdr_size, int out)
+{
+	/* Skip header. TODO: support TSO. */
+	size_t len = iov_length(vq->iov, out);
+
+	iov_iter_init(iter, WRITE, vq->iov, out, len);
+	iov_iter_advance(iter, hdr_size);
+
+	return iov_iter_count(iter);
+}
+
+static bool vhost_exceeds_weight(int pkts, int total_len)
+{
+	return total_len >= VHOST_NET_WEIGHT ||
+	       pkts >= VHOST_NET_PKT_WEIGHT;
+}
+
+static int get_tx_bufs(struct vhost_net *net,
+		       struct vhost_net_virtqueue *nvq,
+		       struct msghdr *msg,
+		       unsigned int *out, unsigned int *in,
+		       size_t *len, bool *busyloop_intr)
+{
+	struct vhost_virtqueue *vq = &nvq->vq;
+	int ret;
+
+	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
+
+	if (ret < 0 || ret == vq->num)
+		return ret;
+
+	if (*in) {
+		vq_err(vq, "Unexpected descriptor format for TX: out %d, int %d\n",
+		       *out, *in);
+		return -EFAULT;
+	}
+
+	/* Sanity check */
+	*len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
+	if (*len == 0) {
+		vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
+		       *len, nvq->vhost_hlen);
+		return -EFAULT;
+	}
+
+	return ret;
+}
+
+static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
+{
+	return total_len < VHOST_NET_WEIGHT &&
+	       !vhost_vq_avail_empty(vq->dev, vq);
+}
+
+static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
@@ -484,37 +551,86 @@ static void handle_tx(struct vhost_net *net)
 	};
 	size_t len, total_len = 0;
 	int err;
-	size_t hdr_size;
-	struct socket *sock;
-	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
-	bool zcopy, zcopy_used;
 	int sent_pkts = 0;
 
-	mutex_lock(&vq->mutex);
-	sock = vq->private_data;
-	if (!sock)
-		goto out;
+	for (;;) {
+		bool busyloop_intr = false;
 
-	if (!vq_iotlb_prefetch(vq))
-		goto out;
+		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
+				   &busyloop_intr);
+		/* On error, stop handling until the next kick. */
+		if (unlikely(head < 0))
+			break;
+		/* Nothing new? Wait for eventfd to tell us they refilled. */
+		if (head == vq->num) {
+			if (unlikely(busyloop_intr)) {
+				vhost_poll_queue(&vq->poll);
+			} else if (unlikely(vhost_enable_notify(&net->dev,
+								vq))) {
+				vhost_disable_notify(&net->dev, vq);
+				continue;
+			}
+			break;
+		}
 
-	vhost_disable_notify(&net->dev, vq);
-	vhost_net_disable_vq(net, vq);
+		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
+		vq->heads[nvq->done_idx].len = 0;
 
-	hdr_size = nvq->vhost_hlen;
-	zcopy = nvq->ubufs;
+		total_len += len;
+		if (tx_can_batch(vq, total_len))
+			msg.msg_flags |= MSG_MORE;
+		else
+			msg.msg_flags &= ~MSG_MORE;
+
+		/* TODO: Check specific error and bomb out unless ENOBUFS? */
+		err = sock->ops->sendmsg(sock, &msg, len);
+		if (unlikely(err < 0)) {
+			vhost_discard_vq_desc(vq, 1);
+			vhost_net_enable_vq(net, vq);
+			break;
+		}
+		if (err != len)
+			pr_debug("Truncated TX packet: len %d != %zd\n",
+				 err, len);
+		if (++nvq->done_idx >= VHOST_NET_BATCH)
+			vhost_net_signal_used(nvq);
+		if (vhost_exceeds_weight(++sent_pkts, total_len)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+
+	vhost_net_signal_used(nvq);
+}
+
+static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
+{
+	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+	struct vhost_virtqueue *vq = &nvq->vq;
+	unsigned out, in;
+	int head;
+	struct msghdr msg = {
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_control = NULL,
+		.msg_controllen = 0,
+		.msg_flags = MSG_DONTWAIT,
+	};
+	size_t len, total_len = 0;
+	int err;
+	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+	bool zcopy_used;
+	int sent_pkts = 0;
 
 	for (;;) {
 		bool busyloop_intr;
 
 		/* Release DMAs done buffers first */
-		if (zcopy)
-			vhost_zerocopy_signal_used(net, vq);
+		vhost_zerocopy_signal_used(net, vq);
 
 		busyloop_intr = false;
-		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
-						ARRAY_SIZE(vq->iov),
-						&out, &in, &busyloop_intr);
+		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
+				   &busyloop_intr);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(head < 0))
 			break;
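The copy-path send loop above couples two throttles: tx_can_batch() sets MSG_MORE only while the byte budget is not exhausted and more descriptors are still queued in the avail ring, and vhost_exceeds_weight() caps how much work one worker invocation may do before requeueing itself. The sketch below is a small userspace model of those two predicates; the constants mirror VHOST_NET_WEIGHT and VHOST_NET_PKT_WEIGHT, but the function names and the simulated loop are made up for illustration.

/* Userspace model of the TX batching/weight limits (illustration only). */
#include <stdbool.h>
#include <stdio.h>

#define NET_WEIGHT	0x80000	/* byte budget per worker invocation */
#define NET_PKT_WEIGHT	256	/* packet budget per worker invocation */

/* Counterpart of vhost_exceeds_weight(): stop once enough work was done. */
static bool exceeds_weight(int pkts, int total_len)
{
	return total_len >= NET_WEIGHT || pkts >= NET_PKT_WEIGHT;
}

/* Counterpart of tx_can_batch(): hint MSG_MORE only while under budget
 * and while the (simulated) avail ring still has descriptors queued. */
static bool can_batch(bool avail_empty, size_t total_len)
{
	return total_len < NET_WEIGHT && !avail_empty;
}

int main(void)
{
	size_t total_len = 0;
	int sent_pkts = 0;

	for (;;) {
		size_t len = 1500;			/* pretend packet size */
		bool more = can_batch(false, total_len + len);

		total_len += len;
		sent_pkts++;
		if (!more || exceeds_weight(sent_pkts, total_len))
			break;	/* flush and yield, like vhost_poll_queue() */
	}
	printf("stopped after %d packets, %zu bytes\n", sent_pkts, total_len);
	return 0;
}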
@@ -528,27 +644,10 @@ static void handle_tx(struct vhost_net *net)
 			}
 			break;
 		}
-		if (in) {
-			vq_err(vq, "Unexpected descriptor format for TX: "
-			       "out %d, int %d\n", out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO. */
-		len = iov_length(vq->iov, out);
-		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
-		iov_iter_advance(&msg.msg_iter, hdr_size);
-		/* Sanity check */
-		if (!msg_data_left(&msg)) {
-			vq_err(vq, "Unexpected header len for TX: "
-			       "%zd expected %zd\n",
-			       len, hdr_size);
-			break;
-		}
-		len = msg_data_left(&msg);
 
-		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
-				   && !vhost_exceeds_maxpend(net)
-				   && vhost_net_tx_select_zcopy(net);
+		zcopy_used = len >= VHOST_GOODCOPY_LEN
+			     && !vhost_exceeds_maxpend(net)
+			     && vhost_net_tx_select_zcopy(net);
 
 		/* use msg_control to pass vhost zerocopy ubuf info to skb */
 		if (zcopy_used) {
@@ -570,10 +669,8 @@ static void handle_tx(struct vhost_net *net)
 			msg.msg_control = NULL;
 			ubufs = NULL;
 		}
-
 		total_len += len;
-		if (total_len < VHOST_NET_WEIGHT &&
-		    !vhost_vq_avail_empty(&net->dev, vq) &&
+		if (tx_can_batch(vq, total_len) &&
 		    likely(!vhost_exceeds_maxpend(net))) {
 			msg.msg_flags |= MSG_MORE;
 		} else {
@@ -600,12 +697,37 @@ static void handle_tx(struct vhost_net *net)
 		else
 			vhost_zerocopy_signal_used(net, vq);
 		vhost_net_tx_packet(net);
-		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
-		    unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
+		if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
 			vhost_poll_queue(&vq->poll);
 			break;
 		}
 	}
+}
+
+/* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+static void handle_tx(struct vhost_net *net)
+{
+	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+	struct vhost_virtqueue *vq = &nvq->vq;
+	struct socket *sock;
+
+	mutex_lock(&vq->mutex);
+	sock = vq->private_data;
+	if (!sock)
+		goto out;
+
+	if (!vq_iotlb_prefetch(vq))
+		goto out;
+
+	vhost_disable_notify(&net->dev, vq);
+	vhost_net_disable_vq(net, vq);
+
+	if (vhost_sock_zcopy(sock))
+		handle_tx_zerocopy(net, sock);
+	else
+		handle_tx_copy(net, sock);
+
 out:
 	mutex_unlock(&vq->mutex);
 }
@@ -641,18 +763,6 @@ static int sk_has_rx_data(struct sock *sk)
 	return skb_queue_empty(&sk->sk_receive_queue);
 }
 
-static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
-{
-	struct vhost_virtqueue *vq = &nvq->vq;
-	struct vhost_dev *dev = vq->dev;
-
-	if (!nvq->done_idx)
-		return;
-
-	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
-	nvq->done_idx = 0;
-}
-
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
 				      bool *busyloop_intr)
 {
@@ -665,7 +775,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
 
 	if (!len && tvq->busyloop_timeout) {
 		/* Flush batched heads first */
-		vhost_rx_signal_used(rnvq);
+		vhost_net_signal_used(rnvq);
 		/* Both tx vq and rx socket were polled here */
 		mutex_lock_nested(&tvq->mutex, 1);
 		vhost_disable_notify(&net->dev, tvq);
@@ -907,13 +1017,12 @@ static void handle_rx(struct vhost_net *net)
 			goto out;
 		}
 		nvq->done_idx += headcount;
-		if (nvq->done_idx > VHOST_RX_BATCH)
-			vhost_rx_signal_used(nvq);
+		if (nvq->done_idx > VHOST_NET_BATCH)
+			vhost_net_signal_used(nvq);
 		if (unlikely(vq_log))
 			vhost_log_write(vq, vq_log, log, vhost_len);
 		total_len += vhost_len;
-		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
-		    unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
+		if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
 			vhost_poll_queue(&vq->poll);
 			goto out;
 		}
@@ -923,7 +1032,7 @@ static void handle_rx(struct vhost_net *net)
 	else
 		vhost_net_enable_vq(net, vq);
 out:
-	vhost_rx_signal_used(nvq);
+	vhost_net_signal_used(nvq);
 	mutex_unlock(&vq->mutex);
 }
 
@@ -976,7 +1085,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		return -ENOMEM;
 	}
 
-	queue = kmalloc_array(VHOST_RX_BATCH, sizeof(void *),
+	queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *),
 			      GFP_KERNEL);
 	if (!queue) {
 		kfree(vqs);