Commit 2ccdbaa

wdebruij authored and davem330 committed
packet: rollover lock contention avoidance
Rollover has to call packet_rcv_has_room on sockets in the fanout
group to find a socket to migrate to. This operation is expensive
especially if the packet sockets use rings, when a lock has to be
acquired.

Avoid pounding on the lock by all sockets by temporarily marking a
socket as "under memory pressure" when such pressure is detected.
While set, only the socket owner may call packet_rcv_has_room on the
socket. Once it detects normal conditions, it clears the flag. The
socket is not used as a victim by any other socket in the meantime.

Under reasonably balanced load, each socket writer frequently calls
packet_rcv_has_room and clears its own pressure field. As a backup
for when the socket is rarely written to, also clear the flag on
reading (packet_recvmsg, packet_poll) if this can be done cheaply
(i.e., without calling packet_rcv_has_room). This is only for edge
cases.

Tested:
  Ran bench_rollover: a process with 8 sockets in a single fanout
  group, each pinned to a single cpu that receives one nic recv
  interrupt. RPS and RFS are disabled. The benchmark uses packet
  rx_ring, which has to take a lock when determining whether a
  socket has room.

  Sent 3.5 Mpps of UDP traffic with sufficient entropy to spread
  uniformly across the packet sockets (and inserted an iptables
  rule to drop in PREROUTING to avoid protocol stack processing).

  Without this patch, all sockets try to migrate traffic to
  neighbors, causing lock contention when searching for a
  non-empty neighbor. The lock is the top 9 entries.

  perf record -a -g sleep 5

  -  17.82%  bench_rollover  [kernel.kallsyms]  [k] _raw_spin_lock
     - _raw_spin_lock
        - 99.00% spin_lock
           + 81.77% packet_rcv_has_room.isra.41
           + 18.23% tpacket_rcv
        + 0.84% packet_rcv_has_room.isra.41
  +   5.20%  ksoftirqd/6     [kernel.kallsyms]  [k] _raw_spin_lock
  +   5.15%  ksoftirqd/1     [kernel.kallsyms]  [k] _raw_spin_lock
  +   5.14%  ksoftirqd/2     [kernel.kallsyms]  [k] _raw_spin_lock
  +   5.12%  ksoftirqd/7     [kernel.kallsyms]  [k] _raw_spin_lock
  +   5.12%  ksoftirqd/5     [kernel.kallsyms]  [k] _raw_spin_lock
  +   5.10%  ksoftirqd/4     [kernel.kallsyms]  [k] _raw_spin_lock
  +   4.66%  ksoftirqd/0     [kernel.kallsyms]  [k] _raw_spin_lock
  +   4.45%  ksoftirqd/3     [kernel.kallsyms]  [k] _raw_spin_lock
  +   1.55%  bench_rollover  [kernel.kallsyms]  [k] packet_rcv_has_room.isra.41

  On net-next with this patch, this lock contention is no longer a
  top entry. Most time is spent in the actual read function. Next up
  are other locks:

  +  15.52%  bench_rollover  bench_rollover     [.] reader
  +   4.68%  swapper         [kernel.kallsyms]  [k] memcpy_erms
  +   2.77%  swapper         [kernel.kallsyms]  [k] packet_lookup_frame.isra.51
  +   2.56%  ksoftirqd/1     [kernel.kallsyms]  [k] memcpy_erms
  +   2.16%  swapper         [kernel.kallsyms]  [k] tpacket_rcv
  +   1.93%  swapper         [kernel.kallsyms]  [k] mlx4_en_process_rx_cq

  Looking closer at the remaining _raw_spin_lock, the cost of probing
  in rollover is now comparable to the cost of taking the lock later
  in tpacket_rcv.

  -   1.51%  swapper  [kernel.kallsyms]  [k] _raw_spin_lock
     - _raw_spin_lock
        + 33.41% packet_rcv_has_room
        + 28.15% tpacket_rcv
        + 19.54% enqueue_to_backlog
        +  6.45% __free_pages_ok
        +  2.78% packet_rcv_fanout
        +  2.13% fanout_demux_rollover
        +  2.01% netif_receive_skb_internal

Signed-off-by: Willem de Bruijn <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
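As a rough illustration of the protocol described above, here is a
minimal userspace sketch using C11 atomics in place of the kernel's
xchg(). All names (sock_state, owner_update_pressure, peer_may_probe)
are invented for this sketch; the patch itself operates on struct
packet_sock, as shown in the diff below.

/* Minimal sketch of the pressure-flag protocol. C11 atomics stand in
 * for the kernel's xchg(); names are illustrative only. */
#include <stdatomic.h>
#include <stdbool.h>

struct sock_state {
        atomic_int pressure;    /* nonzero while the owner last saw no room */
};

/* Owner path: after probing its own queue (under its own lock), the
 * socket rewrites the flag only when the probe result differs from
 * the stored value, so steady-state traffic does not dirty the line. */
static void owner_update_pressure(struct sock_state *s, bool has_room)
{
        if (atomic_load(&s->pressure) == (int)has_room)
                atomic_exchange(&s->pressure, !has_room);
}

/* Peer (rollover) path: a cheap flag read replaces taking the peer's
 * receive-queue lock; sockets advertising pressure are skipped. */
static bool peer_may_probe(struct sock_state *s)
{
        return !atomic_load(&s->pressure);
}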
1 parent 9954729 commit 2ccdbaa

2 files changed: 32 additions, 7 deletions

net/packet/af_packet.c

Lines changed: 31 additions & 7 deletions
@@ -1265,14 +1265,14 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
 	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
 }
 
-static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 {
 	struct sock *sk = &po->sk;
 	int ret = ROOM_NONE;
 
 	if (po->prot_hook.func != tpacket_rcv) {
 		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
-					  - skb->truesize;
+					  - (skb ? skb->truesize : 0);
 		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
 			return ROOM_NORMAL;
 		else if (avail > 0)
@@ -1281,7 +1281,6 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 		return ROOM_NONE;
 	}
 
-	spin_lock(&sk->sk_receive_queue.lock);
 	if (po->tp_version == TPACKET_V3) {
 		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
 			ret = ROOM_NORMAL;
@@ -1293,7 +1292,26 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 		else if (__tpacket_has_room(po, 0))
 			ret = ROOM_LOW;
 	}
-	spin_unlock(&sk->sk_receive_queue.lock);
+
+	return ret;
+}
+
+static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+{
+	int ret;
+	bool has_room;
+
+	if (po->prot_hook.func == tpacket_rcv) {
+		spin_lock(&po->sk.sk_receive_queue.lock);
+		ret = __packet_rcv_has_room(po, skb);
+		spin_unlock(&po->sk.sk_receive_queue.lock);
+	} else {
+		ret = __packet_rcv_has_room(po, skb);
+	}
+
+	has_room = ret == ROOM_NORMAL;
+	if (po->pressure == has_room)
+		xchg(&po->pressure, !has_room);
 
 	return ret;
 }
@@ -1362,7 +1380,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
 					  unsigned int idx, bool try_self,
 					  unsigned int num)
 {
-	struct packet_sock *po;
+	struct packet_sock *po, *po_next;
 	unsigned int i, j;
 
 	po = pkt_sk(f->arr[idx]);
@@ -1371,8 +1389,9 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
 
 	i = j = min_t(int, po->rollover->sock, num - 1);
 	do {
-		if (i != idx &&
-		    packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) {
+		po_next = pkt_sk(f->arr[i]);
+		if (po_next != po && !po_next->pressure &&
+		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
 			if (i != j)
 				po->rollover->sock = i;
 			return i;
@@ -3000,6 +3019,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	if (skb == NULL)
 		goto out;
 
+	if (pkt_sk(sk)->pressure)
+		packet_rcv_has_room(pkt_sk(sk), NULL);
+
 	if (pkt_sk(sk)->has_vnet_hdr) {
 		struct virtio_net_hdr vnet_hdr = { 0 };
 
@@ -3755,6 +3777,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
 						TP_STATUS_KERNEL))
 			mask |= POLLIN | POLLRDNORM;
 	}
+	if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
+		xchg(&po->pressure, 0);
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
 	spin_lock_bh(&sk->sk_write_queue.lock);
 	if (po->tx_ring.pg_vec) {
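
A note on the flag update in the new packet_rcv_has_room: the
condition reads the flag before writing, so the xchg fires only on a
state transition. An annotated restatement of those two lines
(comments mine, not in the patch):

	has_room = ret == ROOM_NORMAL;          /* fresh result of the probe */
	if (po->pressure == has_room)           /* stored flag is stale...   */
		xchg(&po->pressure, !has_room); /* ...update it atomically   */

In steady state the branch is not taken and the shared field is only
read, which presumably avoids cache-line ping-pong among the fanout
sockets that poll it in fanout_demux_rollover.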

net/packet/internal.h

Lines changed: 1 addition & 0 deletions
@@ -105,6 +105,7 @@ struct packet_sock {
 				auxdata:1,
 				origdev:1,
 				has_vnet_hdr:1;
+	int			pressure;
 	int			ifindex;	/* bound device */
 	__be16			num;
 	struct packet_rollover	*rollover;
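
The new field is a plain int rather than another bit in the adjacent
auxdata/origdev/has_vnet_hdr bitfield, presumably because xchg()
needs the address of a naturally aligned word, and C provides no way
to take the address of a bitfield.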
