Skip to content

Commit d346a3f

Browse files
Daniel Borkmanndavem330
authored andcommitted
packet: introduce PACKET_QDISC_BYPASS socket option
This patch introduces a PACKET_QDISC_BYPASS socket option, that allows for using a similar xmit() function as in pktgen instead of taking the dev_queue_xmit() path. This can be very useful when PF_PACKET applications are required to be used in a similar scenario as pktgen, but with full, flexible packet payload that needs to be provided, for example. On default, nothing changes in behaviour for normal PF_PACKET TX users, so everything stays as is for applications. New users, however, can now set PACKET_QDISC_BYPASS if needed to prevent own packets from i) reentering packet_rcv() and ii) to directly push the frame to the driver. In doing so we can increase pps (here 64 byte packets) for PF_PACKET a bit: # CPUs -- QDISC_BYPASS -- qdisc path -- qdisc path[**] 1 CPU == 1,509,628 pps -- 1,208,708 -- 1,247,436 2 CPUs == 3,198,659 pps -- 2,536,012 -- 1,605,779 3 CPUs == 4,787,992 pps -- 3,788,740 -- 1,735,610 4 CPUs == 6,173,956 pps -- 4,907,799 -- 1,909,114 5 CPUs == 7,495,676 pps -- 5,956,499 -- 2,014,422 6 CPUs == 9,001,496 pps -- 7,145,064 -- 2,155,261 7 CPUs == 10,229,776 pps -- 8,190,596 -- 2,220,619 8 CPUs == 11,040,732 pps -- 9,188,544 -- 2,241,879 9 CPUs == 12,009,076 pps -- 10,275,936 -- 2,068,447 10 CPUs == 11,380,052 pps -- 11,265,337 -- 1,578,689 11 CPUs == 11,672,676 pps -- 11,845,344 -- 1,297,412 [...] 20 CPUs == 11,363,192 pps -- 11,014,933 -- 1,245,081 [**]: qdisc path with packet_rcv(), how probably most people seem to use it (hopefully not anymore if not needed) The test was done using a modified trafgen, sending a simple static 64 bytes packet, on all CPUs. The trick in the fast "qdisc path" case, is to avoid reentering packet_rcv() by setting the RAW socket protocol to zero, like: socket(PF_PACKET, SOCK_RAW, 0); Tradeoffs are documented as well in this patch, clearly, if queues are busy, we will drop more packets, tc disciplines are ignored, and these packets are not visible to taps anymore. 
For a pktgen like scenario, we argue that this is acceptable. The pointer to the xmit function has been placed in packet socket structure hole between cached_dev and prot_hook that is hot anyway as we're working on cached_dev in each send path. Done in joint work together with Jesper Dangaard Brouer. Signed-off-by: Daniel Borkmann <[email protected]> Signed-off-by: Jesper Dangaard Brouer <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 4262e5c commit d346a3f

File tree

4 files changed

+102
-12
lines changed

4 files changed

+102
-12
lines changed

Documentation/networking/packet_mmap.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -952,6 +952,27 @@ int main(int argc, char **argp)
952952
return 0;
953953
}
954954

955+
-------------------------------------------------------------------------------
956+
+ PACKET_QDISC_BYPASS
957+
-------------------------------------------------------------------------------
958+
959+
If there is a requirement to load the network with many packets in a similar
960+
fashion as pktgen does, you might set the following option after socket
961+
creation:
962+
963+
int one = 1;
964+
setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
965+
966+
This has the side effect that packets sent through PF_PACKET will bypass the
967+
kernel's qdisc layer and are forcibly pushed to the driver directly. Meaning,
968+
packets are not buffered, tc disciplines are ignored, increased loss can occur
969+
and such packets are also not visible to other PF_PACKET sockets anymore. So,
970+
you have been warned; generally, this can be useful for stress testing various
971+
components of a system.
972+
973+
By default, PACKET_QDISC_BYPASS is disabled and needs to be explicitly enabled
974+
on PF_PACKET sockets.
975+
955976
-------------------------------------------------------------------------------
956977
+ PACKET_TIMESTAMP
957978
-------------------------------------------------------------------------------

include/uapi/linux/if_packet.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ struct sockaddr_ll {
5151
#define PACKET_TIMESTAMP 17
5252
#define PACKET_FANOUT 18
5353
#define PACKET_TX_HAS_OFF 19
54+
#define PACKET_QDISC_BYPASS 20
5455

5556
#define PACKET_FANOUT_HASH 0
5657
#define PACKET_FANOUT_LB 1

net/packet/af_packet.c

Lines changed: 79 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,48 @@ struct packet_skb_cb {
237237
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
238238
static void __fanout_link(struct sock *sk, struct packet_sock *po);
239239

240+
/* Transmit an skb straight to the device driver, bypassing the qdisc
 * layer (used when PACKET_QDISC_BYPASS is set on the socket).
 *
 * Consumes @skb on every path: it is either handed to the driver or
 * freed here. Returns NET_XMIT_DROP, or the NETDEV_TX_* status from
 * the driver (NETDEV_TX_BUSY if the tx queue was frozen/stopped).
 */
static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	const struct net_device_ops *ops = dev->netdev_ops;
	netdev_features_t features;
	struct netdev_queue *txq;
	u16 queue_map;
	int ret;

	/* Device must be up and have carrier; otherwise drop the frame. */
	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev))) {
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	/* Linearize if the device's features can't handle this skb's
	 * geometry; drop if linearization fails (e.g. allocation failure).
	 */
	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb)) {
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	/* Queue mapping was set by the send path (packet_pick_tx_queue). */
	queue_map = skb_get_queue_mapping(skb);
	txq = netdev_get_tx_queue(dev, queue_map);

	/* Serialize with other senders on this tx queue, BHs disabled. */
	__netif_tx_lock_bh(txq);
	if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
		ret = NETDEV_TX_BUSY;
		kfree_skb(skb);
		goto out;
	}

	ret = ops->ndo_start_xmit(skb, dev);
	if (likely(dev_xmit_complete(ret)))
		/* Driver accepted the skb; record trans start for the
		 * tx watchdog.
		 */
		txq_trans_update(txq);
	else
		/* Driver did not consume the skb; free it here. */
		kfree_skb(skb);
out:
	__netif_tx_unlock_bh(txq);
	return ret;
}
281+
240282
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
241283
{
242284
struct net_device *dev;
@@ -261,6 +303,16 @@ static void packet_cached_dev_reset(struct packet_sock *po)
261303
RCU_INIT_POINTER(po->cached_dev, NULL);
262304
}
263305

306+
static bool packet_use_direct_xmit(const struct packet_sock *po)
307+
{
308+
return po->xmit == packet_direct_xmit;
309+
}
310+
311+
static u16 packet_pick_tx_queue(struct net_device *dev)
312+
{
313+
return (u16) smp_processor_id() % dev->real_num_tx_queues;
314+
}
315+
264316
/* register_prot_hook must be invoked with the po->bind_lock held,
265317
* or from a context in which asynchronous accesses to the packet
266318
* socket is not possible (packet_create()).
@@ -1994,9 +2046,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
19942046

19952047
skb_reserve(skb, hlen);
19962048
skb_reset_network_header(skb);
1997-
skb_probe_transport_header(skb, 0);
19982049

1999-
if (po->tp_tx_has_off) {
2050+
if (!packet_use_direct_xmit(po))
2051+
skb_probe_transport_header(skb, 0);
2052+
if (unlikely(po->tp_tx_has_off)) {
20002053
int off_min, off_max, off;
20012054
off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
20022055
off_max = po->tx_ring.frame_size - tp_len;
@@ -2166,12 +2219,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
21662219
}
21672220
}
21682221

2222+
skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
21692223
skb->destructor = tpacket_destruct_skb;
21702224
__packet_set_status(po, ph, TP_STATUS_SENDING);
21712225
atomic_inc(&po->tx_ring.pending);
21722226

21732227
status = TP_STATUS_SEND_REQUEST;
2174-
err = dev_queue_xmit(skb);
2228+
err = po->xmit(skb);
21752229
if (unlikely(err > 0)) {
21762230
err = net_xmit_errno(err);
21772231
if (err && __packet_get_status(po, ph) ==
@@ -2230,8 +2284,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
22302284
return skb;
22312285
}
22322286

2233-
static int packet_snd(struct socket *sock,
2234-
struct msghdr *msg, size_t len)
2287+
static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
22352288
{
22362289
struct sock *sk = sock->sk;
22372290
struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
@@ -2376,6 +2429,7 @@ static int packet_snd(struct socket *sock,
23762429
skb->dev = dev;
23772430
skb->priority = sk->sk_priority;
23782431
skb->mark = sk->sk_mark;
2432+
skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
23792433

23802434
if (po->has_vnet_hdr) {
23812435
if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
@@ -2396,16 +2450,12 @@ static int packet_snd(struct socket *sock,
23962450
len += vnet_hdr_len;
23972451
}
23982452

2399-
skb_probe_transport_header(skb, reserve);
2400-
2453+
if (!packet_use_direct_xmit(po))
2454+
skb_probe_transport_header(skb, reserve);
24012455
if (unlikely(extra_len == 4))
24022456
skb->no_fcs = 1;
24032457

2404-
/*
2405-
* Now send it
2406-
*/
2407-
2408-
err = dev_queue_xmit(skb);
2458+
err = po->xmit(skb);
24092459
if (err > 0 && (err = net_xmit_errno(err)) != 0)
24102460
goto out_unlock;
24112461

@@ -2427,6 +2477,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
24272477
{
24282478
struct sock *sk = sock->sk;
24292479
struct packet_sock *po = pkt_sk(sk);
2480+
24302481
if (po->tx_ring.pg_vec)
24312482
return tpacket_snd(po, msg);
24322483
else
@@ -2641,6 +2692,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
26412692
po = pkt_sk(sk);
26422693
sk->sk_family = PF_PACKET;
26432694
po->num = proto;
2695+
po->xmit = dev_queue_xmit;
26442696

26452697
packet_cached_dev_reset(po);
26462698

@@ -3220,6 +3272,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
32203272
po->tp_tx_has_off = !!val;
32213273
return 0;
32223274
}
3275+
case PACKET_QDISC_BYPASS:
3276+
{
3277+
int val;
3278+
3279+
if (optlen != sizeof(val))
3280+
return -EINVAL;
3281+
if (copy_from_user(&val, optval, sizeof(val)))
3282+
return -EFAULT;
3283+
3284+
po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3285+
return 0;
3286+
}
32233287
default:
32243288
return -ENOPROTOOPT;
32253289
}
@@ -3312,6 +3376,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
33123376
case PACKET_TX_HAS_OFF:
33133377
val = po->tp_tx_has_off;
33143378
break;
3379+
case PACKET_QDISC_BYPASS:
3380+
val = packet_use_direct_xmit(po);
3381+
break;
33153382
default:
33163383
return -ENOPROTOOPT;
33173384
}

net/packet/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ struct packet_sock {
114114
unsigned int tp_tx_has_off:1;
115115
unsigned int tp_tstamp;
116116
struct net_device __rcu *cached_dev;
117+
int (*xmit)(struct sk_buff *skb);
117118
struct packet_type prot_hook ____cacheline_aligned_in_smp;
118119
};
119120

0 commit comments

Comments
 (0)