Skip to content

Commit c125e80

Browse files
kraigatgoog authored and davem330 committed
soreuseport: fast reuseport TCP socket selection
This change extends the fast SO_REUSEPORT socket lookup implemented for UDP to TCP. Listener sockets with SO_REUSEPORT and the same receive address are additionally added to an array for faster random access. This means that only a single socket from the group must be found in the listener list before any socket in the group can be used to receive a packet. Previously, every socket in the group needed to be considered before handing off the incoming packet.

This feature also exposes the ability to use a BPF program when selecting a socket from a reuseport group.

Signed-off-by: Craig Gallek <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent fa46349 commit c125e80

File tree

6 files changed

+93
-12
lines changed

6 files changed

+93
-12
lines changed

include/net/inet_hashtables.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,10 @@ void inet_hashinfo_init(struct inet_hashinfo *h);
207207

208208
bool inet_ehash_insert(struct sock *sk, struct sock *osk);
209209
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
210-
void __inet_hash(struct sock *sk, struct sock *osk);
210+
int __inet_hash(struct sock *sk, struct sock *osk,
211+
int (*saddr_same)(const struct sock *sk1,
212+
const struct sock *sk2,
213+
bool match_wildcard));
211214
int inet_hash(struct sock *sk);
212215
void inet_unhash(struct sock *sk);
213216

net/ipv4/inet_connection_sock.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <net/tcp_states.h>
2525
#include <net/xfrm.h>
2626
#include <net/tcp.h>
27+
#include <net/sock_reuseport.h>
2728

2829
#ifdef INET_CSK_DEBUG
2930
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
6768
if ((!reuse || !sk2->sk_reuse ||
6869
sk2->sk_state == TCP_LISTEN) &&
6970
(!reuseport || !sk2->sk_reuseport ||
70-
(sk2->sk_state != TCP_TIME_WAIT &&
71+
rcu_access_pointer(sk->sk_reuseport_cb) ||
72+
(sk2->sk_state != TCP_TIME_WAIT &&
7173
!uid_eq(uid, sock_i_uid(sk2))))) {
7274

7375
if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
@@ -132,6 +134,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
132134
sk->sk_state != TCP_LISTEN) ||
133135
(tb->fastreuseport > 0 &&
134136
sk->sk_reuseport &&
137+
!rcu_access_pointer(sk->sk_reuseport_cb) &&
135138
uid_eq(tb->fastuid, uid))) &&
136139
(tb->num_owners < smallest_size || smallest_size == -1)) {
137140
smallest_size = tb->num_owners;
@@ -193,15 +196,18 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
193196
if (((tb->fastreuse > 0 &&
194197
sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
195198
(tb->fastreuseport > 0 &&
196-
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
197-
smallest_size == -1) {
199+
sk->sk_reuseport &&
200+
!rcu_access_pointer(sk->sk_reuseport_cb) &&
201+
uid_eq(tb->fastuid, uid))) && smallest_size == -1) {
198202
goto success;
199203
} else {
200204
ret = 1;
201205
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
202206
if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
203207
(tb->fastreuseport > 0 &&
204-
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
208+
sk->sk_reuseport &&
209+
!rcu_access_pointer(sk->sk_reuseport_cb) &&
210+
uid_eq(tb->fastuid, uid))) &&
205211
smallest_size != -1 && --attempts >= 0) {
206212
spin_unlock(&head->lock);
207213
goto again;

net/ipv4/inet_hashtables.c

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@
2020
#include <linux/wait.h>
2121
#include <linux/vmalloc.h>
2222

23+
#include <net/addrconf.h>
2324
#include <net/inet_connection_sock.h>
2425
#include <net/inet_hashtables.h>
2526
#include <net/secure_seq.h>
2627
#include <net/ip.h>
28+
#include <net/sock_reuseport.h>
2729

2830
static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
2931
const __u16 lport, const __be32 faddr,
@@ -215,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net,
215217
unsigned int hash = inet_lhashfn(net, hnum);
216218
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
217219
int score, hiscore, matches = 0, reuseport = 0;
220+
bool select_ok = true;
218221
u32 phash = 0;
219222

220223
rcu_read_lock();
@@ -230,6 +233,15 @@ struct sock *__inet_lookup_listener(struct net *net,
230233
if (reuseport) {
231234
phash = inet_ehashfn(net, daddr, hnum,
232235
saddr, sport);
236+
if (select_ok) {
237+
struct sock *sk2;
238+
sk2 = reuseport_select_sock(sk, phash,
239+
skb, doff);
240+
if (sk2) {
241+
result = sk2;
242+
goto found;
243+
}
244+
}
233245
matches = 1;
234246
}
235247
} else if (score == hiscore && reuseport) {
@@ -247,11 +259,13 @@ struct sock *__inet_lookup_listener(struct net *net,
247259
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
248260
goto begin;
249261
if (result) {
262+
found:
250263
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
251264
result = NULL;
252265
else if (unlikely(compute_score(result, net, hnum, daddr,
253266
dif) < hiscore)) {
254267
sock_put(result);
268+
select_ok = false;
255269
goto begin;
256270
}
257271
}
@@ -450,34 +464,74 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
450464
}
451465
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
452466

453-
void __inet_hash(struct sock *sk, struct sock *osk)
467+
static int inet_reuseport_add_sock(struct sock *sk,
468+
struct inet_listen_hashbucket *ilb,
469+
int (*saddr_same)(const struct sock *sk1,
470+
const struct sock *sk2,
471+
bool match_wildcard))
472+
{
473+
struct sock *sk2;
474+
struct hlist_nulls_node *node;
475+
kuid_t uid = sock_i_uid(sk);
476+
477+
sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
478+
if (sk2 != sk &&
479+
sk2->sk_family == sk->sk_family &&
480+
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
481+
sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
482+
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
483+
saddr_same(sk, sk2, false))
484+
return reuseport_add_sock(sk, sk2);
485+
}
486+
487+
/* Initial allocation may have already happened via setsockopt */
488+
if (!rcu_access_pointer(sk->sk_reuseport_cb))
489+
return reuseport_alloc(sk);
490+
return 0;
491+
}
492+
493+
int __inet_hash(struct sock *sk, struct sock *osk,
494+
int (*saddr_same)(const struct sock *sk1,
495+
const struct sock *sk2,
496+
bool match_wildcard))
454497
{
455498
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
456499
struct inet_listen_hashbucket *ilb;
500+
int err = 0;
457501

458502
if (sk->sk_state != TCP_LISTEN) {
459503
inet_ehash_nolisten(sk, osk);
460-
return;
504+
return 0;
461505
}
462506
WARN_ON(!sk_unhashed(sk));
463507
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
464508

465509
spin_lock(&ilb->lock);
510+
if (sk->sk_reuseport) {
511+
err = inet_reuseport_add_sock(sk, ilb, saddr_same);
512+
if (err)
513+
goto unlock;
514+
}
466515
__sk_nulls_add_node_rcu(sk, &ilb->head);
467516
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
517+
unlock:
468518
spin_unlock(&ilb->lock);
519+
520+
return err;
469521
}
470522
EXPORT_SYMBOL(__inet_hash);
471523

472524
int inet_hash(struct sock *sk)
473525
{
526+
int err = 0;
527+
474528
if (sk->sk_state != TCP_CLOSE) {
475529
local_bh_disable();
476-
__inet_hash(sk, NULL);
530+
err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
477531
local_bh_enable();
478532
}
479533

480-
return 0;
534+
return err;
481535
}
482536
EXPORT_SYMBOL_GPL(inet_hash);
483537

@@ -496,6 +550,8 @@ void inet_unhash(struct sock *sk)
496550
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
497551

498552
spin_lock_bh(lock);
553+
if (rcu_access_pointer(sk->sk_reuseport_cb))
554+
reuseport_detach_sock(sk);
499555
done = __sk_nulls_del_node_init_rcu(sk);
500556
if (done)
501557
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);

net/ipv4/udp.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -356,8 +356,8 @@ EXPORT_SYMBOL(udp_lib_get_port);
356356
* match_wildcard == false: addresses must be exactly the same, i.e.
357357
* 0.0.0.0 only equals to 0.0.0.0
358358
*/
359-
static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
360-
bool match_wildcard)
359+
int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
360+
bool match_wildcard)
361361
{
362362
struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
363363

net/ipv6/inet6_connection_sock.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <net/ip6_route.h>
2727
#include <net/sock.h>
2828
#include <net/inet6_connection_sock.h>
29+
#include <net/sock_reuseport.h>
2930

3031
int inet6_csk_bind_conflict(const struct sock *sk,
3132
const struct inet_bind_bucket *tb, bool relax)
@@ -48,6 +49,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
4849
if ((!reuse || !sk2->sk_reuse ||
4950
sk2->sk_state == TCP_LISTEN) &&
5051
(!reuseport || !sk2->sk_reuseport ||
52+
rcu_access_pointer(sk->sk_reuseport_cb) ||
5153
(sk2->sk_state != TCP_TIME_WAIT &&
5254
!uid_eq(uid,
5355
sock_i_uid((struct sock *)sk2))))) {

net/ipv6/inet6_hashtables.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@
1717
#include <linux/module.h>
1818
#include <linux/random.h>
1919

20+
#include <net/addrconf.h>
2021
#include <net/inet_connection_sock.h>
2122
#include <net/inet_hashtables.h>
2223
#include <net/inet6_hashtables.h>
2324
#include <net/secure_seq.h>
2425
#include <net/ip.h>
26+
#include <net/sock_reuseport.h>
2527

2628
u32 inet6_ehashfn(const struct net *net,
2729
const struct in6_addr *laddr, const u16 lport,
@@ -131,6 +133,7 @@ struct sock *inet6_lookup_listener(struct net *net,
131133
const struct hlist_nulls_node *node;
132134
struct sock *result;
133135
int score, hiscore, matches = 0, reuseport = 0;
136+
bool select_ok = true;
134137
u32 phash = 0;
135138
unsigned int hash = inet_lhashfn(net, hnum);
136139
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
@@ -148,6 +151,15 @@ struct sock *inet6_lookup_listener(struct net *net,
148151
if (reuseport) {
149152
phash = inet6_ehashfn(net, daddr, hnum,
150153
saddr, sport);
154+
if (select_ok) {
155+
struct sock *sk2;
156+
sk2 = reuseport_select_sock(sk, phash,
157+
skb, doff);
158+
if (sk2) {
159+
result = sk2;
160+
goto found;
161+
}
162+
}
151163
matches = 1;
152164
}
153165
} else if (score == hiscore && reuseport) {
@@ -165,11 +177,13 @@ struct sock *inet6_lookup_listener(struct net *net,
165177
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
166178
goto begin;
167179
if (result) {
180+
found:
168181
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
169182
result = NULL;
170183
else if (unlikely(compute_score(result, net, hnum, daddr,
171184
dif) < hiscore)) {
172185
sock_put(result);
186+
select_ok = false;
173187
goto begin;
174188
}
175189
}
@@ -283,7 +297,7 @@ int inet6_hash(struct sock *sk)
283297
{
284298
if (sk->sk_state != TCP_CLOSE) {
285299
local_bh_disable();
286-
__inet_hash(sk, NULL);
300+
__inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
287301
local_bh_enable();
288302
}
289303

0 commit comments

Comments
 (0)