Skip to content

Commit e32ea7e

Browse files
kraigatgoog authored and davem330 committed
soreuseport: fast reuseport UDP socket selection
Include a struct sock_reuseport instance when a UDP socket binds to a specific address for the first time with the reuseport flag set. When selecting a socket for an incoming UDP packet, use the information available in sock_reuseport if present.

This required adding an additional field to the UDP source address equality function to differentiate between exact and wildcard matches. The original use case allowed wildcard matches when checking for existing port uses during bind. The new use case of adding a socket to a reuseport group requires exact address matching.

Performance test (using a machine with 2 CPU sockets and a total of 48 cores): Create reuseport groups of varying size. Use one socket from this group per user thread (pinning each thread to a different core) calling recvmmsg in a tight loop. Record number of messages received per second while saturating a 10G link.

  10 sockets: 18% increase (~2.8M -> 3.3M pkts/s)
  20 sockets: 14% increase (~2.9M -> 3.3M pkts/s)
  40 sockets: 13% increase (~3.0M -> 3.4M pkts/s)

This work is based off a similar implementation written by Ying Cai <[email protected]> for implementing policy-based reuseport selection.

Signed-off-by: Craig Gallek <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent ef45614 commit e32ea7e

File tree

5 files changed

+141
-35
lines changed

5 files changed

+141
-35
lines changed

include/net/addrconf.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -87,7 +87,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
8787
u32 banned_flags);
8888
int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
8989
u32 banned_flags);
90-
int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2);
90+
int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
91+
bool match_wildcard);
9192
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
9293
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
9394

include/net/udp.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ static inline void udp_lib_close(struct sock *sk, long timeout)
191191
}
192192

193193
int udp_lib_get_port(struct sock *sk, unsigned short snum,
194-
int (*)(const struct sock *, const struct sock *),
194+
int (*)(const struct sock *, const struct sock *, bool),
195195
unsigned int hash2_nulladdr);
196196

197197
u32 udp_flow_hashrnd(void);

net/ipv4/udp.c

Lines changed: 97 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,7 @@
113113
#include <trace/events/skb.h>
114114
#include <net/busy_poll.h>
115115
#include "udp_impl.h"
116+
#include <net/sock_reuseport.h>
116117

117118
struct udp_table udp_table __read_mostly;
118119
EXPORT_SYMBOL(udp_table);
@@ -137,7 +138,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
137138
unsigned long *bitmap,
138139
struct sock *sk,
139140
int (*saddr_comp)(const struct sock *sk1,
140-
const struct sock *sk2),
141+
const struct sock *sk2,
142+
bool match_wildcard),
141143
unsigned int log)
142144
{
143145
struct sock *sk2;
@@ -152,8 +154,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
152154
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
153155
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
154156
(!sk2->sk_reuseport || !sk->sk_reuseport ||
157+
rcu_access_pointer(sk->sk_reuseport_cb) ||
155158
!uid_eq(uid, sock_i_uid(sk2))) &&
156-
saddr_comp(sk, sk2)) {
159+
saddr_comp(sk, sk2, true)) {
157160
if (!bitmap)
158161
return 1;
159162
__set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
@@ -170,7 +173,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
170173
struct udp_hslot *hslot2,
171174
struct sock *sk,
172175
int (*saddr_comp)(const struct sock *sk1,
173-
const struct sock *sk2))
176+
const struct sock *sk2,
177+
bool match_wildcard))
174178
{
175179
struct sock *sk2;
176180
struct hlist_nulls_node *node;
@@ -186,8 +190,9 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
186190
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
187191
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
188192
(!sk2->sk_reuseport || !sk->sk_reuseport ||
193+
rcu_access_pointer(sk->sk_reuseport_cb) ||
189194
!uid_eq(uid, sock_i_uid(sk2))) &&
190-
saddr_comp(sk, sk2)) {
195+
saddr_comp(sk, sk2, true)) {
191196
res = 1;
192197
break;
193198
}
@@ -196,6 +201,35 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
196201
return res;
197202
}
198203

204+
static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
205+
int (*saddr_same)(const struct sock *sk1,
206+
const struct sock *sk2,
207+
bool match_wildcard))
208+
{
209+
struct net *net = sock_net(sk);
210+
struct hlist_nulls_node *node;
211+
kuid_t uid = sock_i_uid(sk);
212+
struct sock *sk2;
213+
214+
sk_nulls_for_each(sk2, node, &hslot->head) {
215+
if (net_eq(sock_net(sk2), net) &&
216+
sk2 != sk &&
217+
sk2->sk_family == sk->sk_family &&
218+
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
219+
(udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
220+
(sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
221+
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
222+
(*saddr_same)(sk, sk2, false)) {
223+
return reuseport_add_sock(sk, sk2);
224+
}
225+
}
226+
227+
/* Initial allocation may have already happened via setsockopt */
228+
if (!rcu_access_pointer(sk->sk_reuseport_cb))
229+
return reuseport_alloc(sk);
230+
return 0;
231+
}
232+
199233
/**
200234
* udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
201235
*
@@ -207,7 +241,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
207241
*/
208242
int udp_lib_get_port(struct sock *sk, unsigned short snum,
209243
int (*saddr_comp)(const struct sock *sk1,
210-
const struct sock *sk2),
244+
const struct sock *sk2,
245+
bool match_wildcard),
211246
unsigned int hash2_nulladdr)
212247
{
213248
struct udp_hslot *hslot, *hslot2;
@@ -290,6 +325,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
290325
udp_sk(sk)->udp_port_hash = snum;
291326
udp_sk(sk)->udp_portaddr_hash ^= snum;
292327
if (sk_unhashed(sk)) {
328+
if (sk->sk_reuseport &&
329+
udp_reuseport_add_sock(sk, hslot, saddr_comp)) {
330+
inet_sk(sk)->inet_num = 0;
331+
udp_sk(sk)->udp_port_hash = 0;
332+
udp_sk(sk)->udp_portaddr_hash ^= snum;
333+
goto fail_unlock;
334+
}
335+
293336
sk_nulls_add_node_rcu(sk, &hslot->head);
294337
hslot->count++;
295338
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -309,13 +352,22 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
309352
}
310353
EXPORT_SYMBOL(udp_lib_get_port);
311354

312-
static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
355+
/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
356+
* match_wildcard == false: addresses must be exactly the same, i.e.
357+
* 0.0.0.0 only equals to 0.0.0.0
358+
*/
359+
static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
360+
bool match_wildcard)
313361
{
314362
struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
315363

316-
return (!ipv6_only_sock(sk2) &&
317-
(!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
318-
inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
364+
if (!ipv6_only_sock(sk2)) {
365+
if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
366+
return 1;
367+
if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
368+
return match_wildcard;
369+
}
370+
return 0;
319371
}
320372

321373
static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
@@ -459,8 +511,14 @@ static struct sock *udp4_lib_lookup2(struct net *net,
459511
badness = score;
460512
reuseport = sk->sk_reuseport;
461513
if (reuseport) {
514+
struct sock *sk2;
462515
hash = udp_ehashfn(net, daddr, hnum,
463516
saddr, sport);
517+
sk2 = reuseport_select_sock(sk, hash);
518+
if (sk2) {
519+
result = sk2;
520+
goto found;
521+
}
464522
matches = 1;
465523
}
466524
} else if (score == badness && reuseport) {
@@ -478,6 +536,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
478536
if (get_nulls_value(node) != slot2)
479537
goto begin;
480538
if (result) {
539+
found:
481540
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
482541
result = NULL;
483542
else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -540,8 +599,14 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
540599
badness = score;
541600
reuseport = sk->sk_reuseport;
542601
if (reuseport) {
602+
struct sock *sk2;
543603
hash = udp_ehashfn(net, daddr, hnum,
544604
saddr, sport);
605+
sk2 = reuseport_select_sock(sk, hash);
606+
if (sk2) {
607+
result = sk2;
608+
goto found;
609+
}
545610
matches = 1;
546611
}
547612
} else if (score == badness && reuseport) {
@@ -560,6 +625,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
560625
goto begin;
561626

562627
if (result) {
628+
found:
563629
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
564630
result = NULL;
565631
else if (unlikely(compute_score(result, net, saddr, hnum, sport,
@@ -587,7 +653,8 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
587653
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
588654
__be32 daddr, __be16 dport, int dif)
589655
{
590-
return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
656+
return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif,
657+
&udp_table);
591658
}
592659
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
593660

@@ -1398,6 +1465,8 @@ void udp_lib_unhash(struct sock *sk)
13981465
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
13991466

14001467
spin_lock_bh(&hslot->lock);
1468+
if (rcu_access_pointer(sk->sk_reuseport_cb))
1469+
reuseport_detach_sock(sk);
14011470
if (sk_nulls_del_node_init_rcu(sk)) {
14021471
hslot->count--;
14031472
inet_sk(sk)->inet_num = 0;
@@ -1425,22 +1494,28 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
14251494
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
14261495
nhslot2 = udp_hashslot2(udptable, newhash);
14271496
udp_sk(sk)->udp_portaddr_hash = newhash;
1428-
if (hslot2 != nhslot2) {
1497+
1498+
if (hslot2 != nhslot2 ||
1499+
rcu_access_pointer(sk->sk_reuseport_cb)) {
14291500
hslot = udp_hashslot(udptable, sock_net(sk),
14301501
udp_sk(sk)->udp_port_hash);
14311502
/* we must lock primary chain too */
14321503
spin_lock_bh(&hslot->lock);
1433-
1434-
spin_lock(&hslot2->lock);
1435-
hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
1436-
hslot2->count--;
1437-
spin_unlock(&hslot2->lock);
1438-
1439-
spin_lock(&nhslot2->lock);
1440-
hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
1441-
&nhslot2->head);
1442-
nhslot2->count++;
1443-
spin_unlock(&nhslot2->lock);
1504+
if (rcu_access_pointer(sk->sk_reuseport_cb))
1505+
reuseport_detach_sock(sk);
1506+
1507+
if (hslot2 != nhslot2) {
1508+
spin_lock(&hslot2->lock);
1509+
hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
1510+
hslot2->count--;
1511+
spin_unlock(&hslot2->lock);
1512+
1513+
spin_lock(&nhslot2->lock);
1514+
hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
1515+
&nhslot2->head);
1516+
nhslot2->count++;
1517+
spin_unlock(&nhslot2->lock);
1518+
}
14441519

14451520
spin_unlock_bh(&hslot->lock);
14461521
}

net/ipv6/inet6_connection_sock.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -51,12 +51,12 @@ int inet6_csk_bind_conflict(const struct sock *sk,
5151
(sk2->sk_state != TCP_TIME_WAIT &&
5252
!uid_eq(uid,
5353
sock_i_uid((struct sock *)sk2))))) {
54-
if (ipv6_rcv_saddr_equal(sk, sk2))
54+
if (ipv6_rcv_saddr_equal(sk, sk2, true))
5555
break;
5656
}
5757
if (!relax && reuse && sk2->sk_reuse &&
5858
sk2->sk_state != TCP_LISTEN &&
59-
ipv6_rcv_saddr_equal(sk, sk2))
59+
ipv6_rcv_saddr_equal(sk, sk2, true))
6060
break;
6161
}
6262
}

net/ipv6/udp.c

Lines changed: 39 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@
4747
#include <net/xfrm.h>
4848
#include <net/inet6_hashtables.h>
4949
#include <net/busy_poll.h>
50+
#include <net/sock_reuseport.h>
5051

5152
#include <linux/proc_fs.h>
5253
#include <linux/seq_file.h>
@@ -76,24 +77,39 @@ static u32 udp6_ehashfn(const struct net *net,
7677
udp_ipv6_hash_secret + net_hash_mix(net));
7778
}
7879

79-
int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
80+
/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
81+
* only, and any IPv4 addresses if not IPv6 only
82+
* match_wildcard == false: addresses must be exactly the same, i.e.
83+
* IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
84+
* and 0.0.0.0 equals to 0.0.0.0 only
85+
*/
86+
int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
87+
bool match_wildcard)
8088
{
8189
const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
8290
int sk2_ipv6only = inet_v6_ipv6only(sk2);
8391
int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
8492
int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
8593

8694
/* if both are mapped, treat as IPv4 */
87-
if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED)
88-
return (!sk2_ipv6only &&
89-
(!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr ||
90-
sk->sk_rcv_saddr == sk2->sk_rcv_saddr));
95+
if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
96+
if (!sk2_ipv6only) {
97+
if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
98+
return 1;
99+
if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
100+
return match_wildcard;
101+
}
102+
return 0;
103+
}
104+
105+
if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
106+
return 1;
91107

92-
if (addr_type2 == IPV6_ADDR_ANY &&
108+
if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
93109
!(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
94110
return 1;
95111

96-
if (addr_type == IPV6_ADDR_ANY &&
112+
if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
97113
!(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
98114
return 1;
99115

@@ -253,8 +269,14 @@ static struct sock *udp6_lib_lookup2(struct net *net,
253269
badness = score;
254270
reuseport = sk->sk_reuseport;
255271
if (reuseport) {
272+
struct sock *sk2;
256273
hash = udp6_ehashfn(net, daddr, hnum,
257274
saddr, sport);
275+
sk2 = reuseport_select_sock(sk, hash);
276+
if (sk2) {
277+
result = sk2;
278+
goto found;
279+
}
258280
matches = 1;
259281
}
260282
} else if (score == badness && reuseport) {
@@ -273,6 +295,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
273295
goto begin;
274296

275297
if (result) {
298+
found:
276299
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
277300
result = NULL;
278301
else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -332,8 +355,14 @@ struct sock *__udp6_lib_lookup(struct net *net,
332355
badness = score;
333356
reuseport = sk->sk_reuseport;
334357
if (reuseport) {
358+
struct sock *sk2;
335359
hash = udp6_ehashfn(net, daddr, hnum,
336360
saddr, sport);
361+
sk2 = reuseport_select_sock(sk, hash);
362+
if (sk2) {
363+
result = sk2;
364+
goto found;
365+
}
337366
matches = 1;
338367
}
339368
} else if (score == badness && reuseport) {
@@ -352,6 +381,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
352381
goto begin;
353382

354383
if (result) {
384+
found:
355385
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
356386
result = NULL;
357387
else if (unlikely(compute_score(result, net, hnum, saddr, sport,
@@ -549,8 +579,8 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
549579
int err;
550580
struct net *net = dev_net(skb->dev);
551581

552-
sk = __udp6_lib_lookup(net, daddr, uh->dest,
553-
saddr, uh->source, inet6_iif(skb), udptable);
582+
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
583+
inet6_iif(skb), udptable);
554584
if (!sk) {
555585
ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
556586
ICMP6_MIB_INERRORS);

0 commit comments

Comments
 (0)