@@ -100,6 +100,8 @@ do { \
100
100
} while (0)
101
101
#endif
102
102
103
+ #define GOODCOPY_LEN 128 /* bytes tun_get_user() copies into the skb on the zerocopy path when the computed copy length would otherwise be zero */
104
+
103
105
#define FLT_EXACT_COUNT 8
104
106
struct tap_filter {
105
107
unsigned int count ; /* Number of addrs. Zero means disabled */
@@ -604,19 +606,100 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
604
606
return skb ;
605
607
}
606
608
609
+ /* set skb frags from iovec, this can move to core network code for reuse */
610
+ static int zerocopy_sg_from_iovec (struct sk_buff * skb , const struct iovec * from ,
611
+ int offset , size_t count )
612
+ {
613
+ int len = iov_length (from , count ) - offset ;
614
+ int copy = skb_headlen (skb );
615
+ int size , offset1 = 0 ;
616
+ int i = 0 ;
617
+
618
+ /* Skip over from offset */
619
+ while (count && (offset >= from -> iov_len )) {
620
+ offset -= from -> iov_len ;
621
+ ++ from ;
622
+ -- count ;
623
+ }
624
+
625
+ /* copy up to skb headlen */
626
+ while (count && (copy > 0 )) {
627
+ size = min_t (unsigned int , copy , from -> iov_len - offset );
628
+ if (copy_from_user (skb -> data + offset1 , from -> iov_base + offset ,
629
+ size ))
630
+ return - EFAULT ;
631
+ if (copy > size ) {
632
+ ++ from ;
633
+ -- count ;
634
+ offset = 0 ;
635
+ } else
636
+ offset += size ;
637
+ copy -= size ;
638
+ offset1 += size ;
639
+ }
640
+
641
+ if (len == offset1 )
642
+ return 0 ;
643
+
644
+ while (count -- ) {
645
+ struct page * page [MAX_SKB_FRAGS ];
646
+ int num_pages ;
647
+ unsigned long base ;
648
+ unsigned long truesize ;
649
+
650
+ len = from -> iov_len - offset ;
651
+ if (!len ) {
652
+ offset = 0 ;
653
+ ++ from ;
654
+ continue ;
655
+ }
656
+ base = (unsigned long )from -> iov_base + offset ;
657
+ size = ((base & ~PAGE_MASK ) + len + ~PAGE_MASK ) >> PAGE_SHIFT ;
658
+ if (i + size > MAX_SKB_FRAGS )
659
+ return - EMSGSIZE ;
660
+ num_pages = get_user_pages_fast (base , size , 0 , & page [i ]);
661
+ if (num_pages != size ) {
662
+ for (i = 0 ; i < num_pages ; i ++ )
663
+ put_page (page [i ]);
664
+ return - EFAULT ;
665
+ }
666
+ truesize = size * PAGE_SIZE ;
667
+ skb -> data_len += len ;
668
+ skb -> len += len ;
669
+ skb -> truesize += truesize ;
670
+ atomic_add (truesize , & skb -> sk -> sk_wmem_alloc );
671
+ while (len ) {
672
+ int off = base & ~PAGE_MASK ;
673
+ int size = min_t (int , len , PAGE_SIZE - off );
674
+ __skb_fill_page_desc (skb , i , page [i ], off , size );
675
+ skb_shinfo (skb )-> nr_frags ++ ;
676
+ /* increase sk_wmem_alloc */
677
+ base += size ;
678
+ len -= size ;
679
+ i ++ ;
680
+ }
681
+ offset = 0 ;
682
+ ++ from ;
683
+ }
684
+ return 0 ;
685
+ }
686
+
607
687
/* Get packet from user space buffer */
608
- static ssize_t tun_get_user (struct tun_struct * tun ,
609
- const struct iovec * iv , size_t count ,
610
- int noblock )
688
+ static ssize_t tun_get_user (struct tun_struct * tun , void * msg_control ,
689
+ const struct iovec * iv , size_t total_len ,
690
+ size_t count , int noblock )
611
691
{
612
692
struct tun_pi pi = { 0 , cpu_to_be16 (ETH_P_IP ) };
613
693
struct sk_buff * skb ;
614
- size_t len = count , align = NET_SKB_PAD ;
694
+ size_t len = total_len , align = NET_SKB_PAD ;
615
695
struct virtio_net_hdr gso = { 0 };
616
696
int offset = 0 ;
697
+ int copylen ;
698
+ bool zerocopy = false;
699
+ int err ;
617
700
618
701
if (!(tun -> flags & TUN_NO_PI )) {
619
- if ((len -= sizeof (pi )) > count )
702
+ if ((len -= sizeof (pi )) > total_len )
620
703
return - EINVAL ;
621
704
622
705
if (memcpy_fromiovecend ((void * )& pi , iv , 0 , sizeof (pi )))
@@ -625,7 +708,7 @@ static ssize_t tun_get_user(struct tun_struct *tun,
625
708
}
626
709
627
710
if (tun -> flags & TUN_VNET_HDR ) {
628
- if ((len -= tun -> vnet_hdr_sz ) > count )
711
+ if ((len -= tun -> vnet_hdr_sz ) > total_len )
629
712
return - EINVAL ;
630
713
631
714
if (memcpy_fromiovecend ((void * )& gso , iv , offset , sizeof (gso )))
@@ -647,14 +730,46 @@ static ssize_t tun_get_user(struct tun_struct *tun,
647
730
return - EINVAL ;
648
731
}
649
732
650
- skb = tun_alloc_skb (tun , align , len , gso .hdr_len , noblock );
733
+ if (msg_control )
734
+ zerocopy = true;
735
+
736
+ if (zerocopy ) {
737
+ /* Userspace may produce vectors with count greater than
738
+ * MAX_SKB_FRAGS, so we need to linearize parts of the skb
739
+ * so that the rest of the data fits in the frags.
740
+ */
741
+ if (count > MAX_SKB_FRAGS ) {
742
+ copylen = iov_length (iv , count - MAX_SKB_FRAGS );
743
+ if (copylen < offset )
744
+ copylen = 0 ;
745
+ else
746
+ copylen -= offset ;
747
+ } else
748
+ copylen = 0 ;
749
+ /* By default GOODCOPY_LEN bytes are copied into the skb, so there is enough
750
+ * room for skb expand head in case it is used.
751
+ * The rest of the buffer is mapped from userspace.
752
+ */
753
+ if (copylen < gso .hdr_len )
754
+ copylen = gso .hdr_len ;
755
+ if (!copylen )
756
+ copylen = GOODCOPY_LEN ;
757
+ } else
758
+ copylen = len ;
759
+
760
+ skb = tun_alloc_skb (tun , align , copylen , gso .hdr_len , noblock );
651
761
if (IS_ERR (skb )) {
652
762
if (PTR_ERR (skb ) != - EAGAIN )
653
763
tun -> dev -> stats .rx_dropped ++ ;
654
764
return PTR_ERR (skb );
655
765
}
656
766
657
- if (skb_copy_datagram_from_iovec (skb , 0 , iv , offset , len )) {
767
+ if (zerocopy )
768
+ err = zerocopy_sg_from_iovec (skb , iv , offset , count );
769
+ else
770
+ err = skb_copy_datagram_from_iovec (skb , 0 , iv , offset , len );
771
+
772
+ if (err ) {
658
773
tun -> dev -> stats .rx_dropped ++ ;
659
774
kfree_skb (skb );
660
775
return - EFAULT ;
@@ -728,12 +843,18 @@ static ssize_t tun_get_user(struct tun_struct *tun,
728
843
skb_shinfo (skb )-> gso_segs = 0 ;
729
844
}
730
845
846
+ /* copy skb_ubuf_info for callback when skb has no error */
847
+ if (zerocopy ) {
848
+ skb_shinfo (skb )-> destructor_arg = msg_control ;
849
+ skb_shinfo (skb )-> tx_flags |= SKBTX_DEV_ZEROCOPY ;
850
+ }
851
+
731
852
netif_rx_ni (skb );
732
853
733
854
tun -> dev -> stats .rx_packets ++ ;
734
855
tun -> dev -> stats .rx_bytes += len ;
735
856
736
- return count ;
857
+ return total_len ;
737
858
}
738
859
739
860
static ssize_t tun_chr_aio_write (struct kiocb * iocb , const struct iovec * iv ,
@@ -748,7 +869,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
748
869
749
870
tun_debug (KERN_INFO , tun , "tun_chr_write %ld\n" , count );
750
871
751
- result = tun_get_user (tun , iv , iov_length (iv , count ),
872
+ result = tun_get_user (tun , NULL , iv , iov_length (iv , count ), count ,
752
873
file -> f_flags & O_NONBLOCK );
753
874
754
875
tun_put (tun );
@@ -962,8 +1083,8 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
962
1083
struct msghdr * m , size_t total_len )
963
1084
{
964
1085
struct tun_struct * tun = container_of (sock , struct tun_struct , socket );
965
- return tun_get_user (tun , m -> msg_iov , total_len ,
966
- m -> msg_flags & MSG_DONTWAIT );
1086
+ return tun_get_user (tun , m -> msg_control , m -> msg_iov , total_len ,
1087
+ m -> msg_iovlen , m -> msg_flags & MSG_DONTWAIT );
967
1088
}
968
1089
969
1090
static int tun_recvmsg (struct kiocb * iocb , struct socket * sock ,
@@ -1133,6 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
1133
1254
sock_init_data (& tun -> socket , sk );
1134
1255
sk -> sk_write_space = tun_sock_write_space ;
1135
1256
sk -> sk_sndbuf = INT_MAX ;
1257
+ sock_set_flag (sk , SOCK_ZEROCOPY );
1136
1258
1137
1259
tun_sk (sk )-> tun = tun ;
1138
1260
0 commit comments