Skip to content

Commit 9f0a74d

Browse files
committed
Merge branch 'packet_rollover'
Willem de Bruijn says: ==================== refine packet socket rollover: 1. mitigate a case of lock contention 2. avoid exporting resource exhaustion to other sockets, by migrating only to a victim socket that has ample room 3. avoid reordering of most flows on the socket, by migrating first the flow responsible for load imbalance 4. help processes detect load imbalance, by exporting rollover counters Context: rollover implements flow migration in packet socket fanout groups in case of extreme load imbalance. It is a specific implementation of migration that minimizes reordering by selecting the same victim socket when possible (and by selecting subsequent victims in a round robin fashion, from which its name derives). Changes: v2 -> v3: - statistics: replace unsigned long with __aligned_u64 v1 -> v2: - huge flow detection: run lockless - huge flow detection: replace stored index with random - contention avoidance: test in packet_poll while lock held - contention avoidance: clear pressure sooner packet_poll and packet_recvmsg would clear only if the sock is empty to avoid taking the necessary lock. But, * packet_poll already holds this lock, so a lockless variant __packet_rcv_has_room is cheap. * packet_recvmsg is usually called only for non-ring sockets, which also runs lockless. - preparation: drop "single return" patch packet_rcv_has_room is now a locked wrapper around __packet_rcv_has_room, achieving the same (single footer). The benchmark mentioned in the patches is at https://github.com/wdebruij/kerneltools/blob/master/tests/bench_rollover.c ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents 7d771aa + a9b6391 commit 9f0a74d

File tree

3 files changed

+163
-28
lines changed

3 files changed

+163
-28
lines changed

include/uapi/linux/if_packet.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ struct sockaddr_ll {
5454
#define PACKET_FANOUT 18
5555
#define PACKET_TX_HAS_OFF 19
5656
#define PACKET_QDISC_BYPASS 20
57+
#define PACKET_ROLLOVER_STATS 21
5758

5859
#define PACKET_FANOUT_HASH 0
5960
#define PACKET_FANOUT_LB 1
@@ -75,6 +76,12 @@ struct tpacket_stats_v3 {
7576
unsigned int tp_freeze_q_cnt;
7677
};
7778

79+
/* Per-socket rollover counters, exported via getsockopt(PACKET_ROLLOVER_STATS).
 * __aligned_u64 keeps the layout identical for 32- and 64-bit userspace.
 */
struct tpacket_rollover_stats {
	__aligned_u64	tp_all;		/* total packets migrated to another socket */
	__aligned_u64	tp_huge;	/* migrations where own ring had room, but the
					 * flow was huge (load-imbalance indicator) */
	__aligned_u64	tp_failed;	/* rollover attempts where no socket had room */
};
84+
7885
union tpacket_stats_u {
7986
struct tpacket_stats stats1;
8087
struct tpacket_stats_v3 stats3;

net/packet/af_packet.c

Lines changed: 145 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1234,27 +1234,86 @@ static void packet_free_pending(struct packet_sock *po)
12341234
free_percpu(po->tx_ring.pending_refcnt);
12351235
}
12361236

1237-
static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1237+
#define ROOM_POW_OFF 2
1238+
#define ROOM_NONE 0x0
1239+
#define ROOM_LOW 0x1
1240+
#define ROOM_NORMAL 0x2
1241+
1242+
/* Room probe for TPACKET_V1/V2 frame rings.
 *
 * Tests whether the frame len/2^pow_off slots ahead of head is still
 * owned by the kernel (TP_STATUS_KERNEL), i.e. at least that fraction
 * of the ring is free.  pow_off == 0 tests only the next slot.
 * Caller must hold sk_receive_queue.lock for a stable view.
 */
static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;	/* ring capacity in frames */
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;		/* probe ahead of head */
	if (idx >= len)				/* wrap around ring end */
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
1254+
1255+
/* Room probe for TPACKET_V3 block rings: same scheme as
 * __tpacket_has_room(), but operating on blocks instead of frames.
 * Caller must hold sk_receive_queue.lock for a stable view.
 */
static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;		/* ring capacity in blocks */
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;			/* probe ahead of active block */
	if (idx >= len)					/* wrap around ring end */
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
1267+
1268+
/* Estimate receive-queue room for @po, optionally accounting for @skb.
 *
 * Returns ROOM_NORMAL when more than a 1/2^ROOM_POW_OFF fraction of
 * capacity is free, ROOM_LOW when some but little space remains, and
 * ROOM_NONE otherwise.
 *
 * For non-ring sockets this reads sk_rmem_alloc losslessly and runs
 * lockless.  For mmap ring sockets (tpacket_rcv) the caller must hold
 * sk_receive_queue.lock; packet_rcv_has_room() is the locked wrapper.
 */
static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		/* Plain socket receive queue: compare free rcvbuf space
		 * (minus the candidate skb) against the same 1/2^ROOM_POW_OFF
		 * threshold used for rings.  skb may be NULL when only the
		 * queue state is being polled.
		 */
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	/* Ring socket: probe the ring variant matching the tpacket version. */
	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}
1298+
1299+
/* Locked wrapper around __packet_rcv_has_room().
 *
 * Takes sk_receive_queue.lock only for ring sockets, where the probe
 * needs a consistent ring view; non-ring sockets run lockless.  Also
 * maintains po->pressure, the hint other fanout members test lockless
 * in fanout_demux_rollover() to skip sockets that are known-full.
 */
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	if (po->prot_hook.func == tpacket_rcv) {
		spin_lock(&po->sk.sk_receive_queue.lock);
		ret = __packet_rcv_has_room(po, skb);
		spin_unlock(&po->sk.sk_receive_queue.lock);
	} else {
		ret = __packet_rcv_has_room(po, skb);
	}

	/* Flip the pressure hint only when the state actually changed
	 * (pressure set while room is normal, or clear while it is not).
	 * NOTE(review): xchg rather than a plain store — presumably for
	 * its implied memory barrier so remote CPUs see the hint promptly;
	 * confirm against the original commit discussion.
	 */
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		xchg(&po->pressure, !has_room);

	return ret;
}
12591318

12601319
static void packet_sock_destruct(struct sock *sk)
@@ -1282,6 +1341,20 @@ static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
12821341
return x;
12831342
}
12841343

1344+
/* Heuristic detector for a flow that dominates this socket's load.
 *
 * Counts how many slots of the small per-socket flow-hash history match
 * @skb's flow hash, then records the hash in a randomly chosen slot.
 * A flow occupying more than half the history is considered "huge".
 * Runs lockless by design: concurrent updates may race, which only
 * perturbs the heuristic, not correctness.
 */
static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (po->rollover->history[i] == rxhash)
			count++;

	/* Random replacement avoids a stored index and keeps this lockless. */
	po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
	return count > (ROLLOVER_HLEN >> 1);
}
1357+
12851358
static unsigned int fanout_demux_hash(struct packet_fanout *f,
12861359
struct sk_buff *skb,
12871360
unsigned int num)
@@ -1318,22 +1391,39 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f,
13181391

13191392
/* Select a fanout socket for @skb, migrating away from f->arr[idx] when
 * it is under memory pressure.
 *
 * @try_self: when true (secondary rollover after another demux policy
 * chose @idx), keep the original socket if it has normal room, or low
 * room and the flow is not huge — this migrates first the flow causing
 * the imbalance and minimizes reordering of other flows.  When false
 * (pure PACKET_FANOUT_ROLLOVER), always roll from @idx.
 *
 * Victim search starts at the previously chosen victim (rollover->sock)
 * and proceeds round robin; only sockets with ample room (ROOM_NORMAL)
 * are accepted, so resource exhaustion is not exported to sockets that
 * are themselves nearly full.  Returns the chosen array index, or @idx
 * unchanged if no socket has room.
 */
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
	}

	/* Resume at the last victim to keep a migrated flow on one socket
	 * (minimizes reordering); clamp in case the group shrank.
	 */
	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		/* po_next->pressure is a lockless hint that lets us skip
		 * the locked room check on sockets known to be full.
		 */
		if (po_next != po && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;	/* remember new victim */
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	/* No socket in the group had room; deliver to @idx anyway. */
	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}
13391429

@@ -1386,17 +1476,14 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
13861476
idx = fanout_demux_qm(f, skb, num);
13871477
break;
13881478
case PACKET_FANOUT_ROLLOVER:
1389-
idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
1479+
idx = fanout_demux_rollover(f, skb, 0, false, num);
13901480
break;
13911481
}
13921482

1393-
po = pkt_sk(f->arr[idx]);
1394-
if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1395-
unlikely(!packet_rcv_has_room(po, skb))) {
1396-
idx = fanout_demux_rollover(f, skb, idx, idx, num);
1397-
po = pkt_sk(f->arr[idx]);
1398-
}
1483+
if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1484+
idx = fanout_demux_rollover(f, skb, idx, true, num);
13991485

1486+
po = pkt_sk(f->arr[idx]);
14001487
return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
14011488
}
14021489

@@ -1467,6 +1554,15 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
14671554
if (po->fanout)
14681555
return -EALREADY;
14691556

1557+
if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) {
1558+
po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
1559+
if (!po->rollover)
1560+
return -ENOMEM;
1561+
atomic_long_set(&po->rollover->num, 0);
1562+
atomic_long_set(&po->rollover->num_huge, 0);
1563+
atomic_long_set(&po->rollover->num_failed, 0);
1564+
}
1565+
14701566
mutex_lock(&fanout_mutex);
14711567
match = NULL;
14721568
list_for_each_entry(f, &fanout_list, list) {
@@ -1515,6 +1611,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
15151611
}
15161612
out:
15171613
mutex_unlock(&fanout_mutex);
1614+
if (err) {
1615+
kfree(po->rollover);
1616+
po->rollover = NULL;
1617+
}
15181618
return err;
15191619
}
15201620

@@ -1536,6 +1636,8 @@ static void fanout_release(struct sock *sk)
15361636
kfree(f);
15371637
}
15381638
mutex_unlock(&fanout_mutex);
1639+
1640+
kfree(po->rollover);
15391641
}
15401642

15411643
static const struct proto_ops packet_ops;
@@ -2865,6 +2967,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
28652967

28662968
spin_lock_init(&po->bind_lock);
28672969
mutex_init(&po->pg_vec_lock);
2970+
po->rollover = NULL;
28682971
po->prot_hook.func = packet_rcv;
28692972

28702973
if (sock->type == SOCK_PACKET)
@@ -2942,6 +3045,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
29423045
if (skb == NULL)
29433046
goto out;
29443047

3048+
if (pkt_sk(sk)->pressure)
3049+
packet_rcv_has_room(pkt_sk(sk), NULL);
3050+
29453051
if (pkt_sk(sk)->has_vnet_hdr) {
29463052
struct virtio_net_hdr vnet_hdr = { 0 };
29473053

@@ -3485,6 +3591,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
34853591
struct packet_sock *po = pkt_sk(sk);
34863592
void *data = &val;
34873593
union tpacket_stats_u st;
3594+
struct tpacket_rollover_stats rstats;
34883595

34893596
if (level != SOL_PACKET)
34903597
return -ENOPROTOOPT;
@@ -3560,6 +3667,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
35603667
((u32)po->fanout->flags << 24)) :
35613668
0);
35623669
break;
3670+
case PACKET_ROLLOVER_STATS:
3671+
if (!po->rollover)
3672+
return -EINVAL;
3673+
rstats.tp_all = atomic_long_read(&po->rollover->num);
3674+
rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3675+
rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3676+
data = &rstats;
3677+
lv = sizeof(rstats);
3678+
break;
35633679
case PACKET_TX_HAS_OFF:
35643680
val = po->tp_tx_has_off;
35653681
break;
@@ -3697,6 +3813,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
36973813
TP_STATUS_KERNEL))
36983814
mask |= POLLIN | POLLRDNORM;
36993815
}
3816+
if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
3817+
xchg(&po->pressure, 0);
37003818
spin_unlock_bh(&sk->sk_receive_queue.lock);
37013819
spin_lock_bh(&sk->sk_write_queue.lock);
37023820
if (po->tx_ring.pg_vec) {

net/packet/internal.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,20 @@ struct packet_fanout {
8282
atomic_t rr_cur;
8383
struct list_head list;
8484
struct sock *arr[PACKET_FANOUT_MAX];
85-
int next[PACKET_FANOUT_MAX];
8685
spinlock_t lock;
8786
atomic_t sk_ref;
8887
struct packet_type prot_hook ____cacheline_aligned_in_smp;
8988
};
9089

90+
/* Per-socket rollover state, allocated in fanout_add() when
 * PACKET_FANOUT_FLAG_ROLLOVER is set; freed in fanout_release().
 */
struct packet_rollover {
	int			sock;		/* last victim index, round-robin resume point */
	atomic_long_t		num;		/* successful migrations (tp_all) */
	atomic_long_t		num_huge;	/* migrations of huge flows (tp_huge) */
	atomic_long_t		num_failed;	/* attempts with no room anywhere (tp_failed) */
	/* Flow-hash history for huge-flow detection; sized and aligned to
	 * one cache line since it is updated lockless on the hot path.
	 */
#define ROLLOVER_HLEN	(L1_CACHE_BYTES / sizeof(u32))
	u32			history[ROLLOVER_HLEN] ____cacheline_aligned;
} ____cacheline_aligned_in_smp;
98+
9199
struct packet_sock {
92100
/* struct sock has to be the first member of packet_sock */
93101
struct sock sk;
@@ -102,8 +110,10 @@ struct packet_sock {
102110
auxdata:1,
103111
origdev:1,
104112
has_vnet_hdr:1;
113+
int pressure;
105114
int ifindex; /* bound device */
106115
__be16 num;
116+
struct packet_rollover *rollover;
107117
struct packet_mclist *mclist;
108118
atomic_t mapped;
109119
enum tpacket_versions tp_version;

0 commit comments

Comments
 (0)