Skip to content

Commit 936a192

Browse files
q2vendavem330
authored andcommitted
tcp: Add TIME_WAIT sockets in bhash2.
Jiri Slaby reported regression of bind() with a simple repro. [0] The repro creates a TIME_WAIT socket and tries to bind() a new socket with the same local address and port. Before commit 28044fc ("net: Add a bhash2 table hashed by port and address"), the bind() failed with -EADDRINUSE, but now it succeeds. The cited commit should have put TIME_WAIT sockets into bhash2; otherwise, inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind() requests if the address is not a wildcard one. The straight option is to move sk_bind2_node from struct sock to struct sock_common to add twsk to bhash2 as implemented as RFC. [1] However, the binary layout change in the struct sock could affect performances moving hot fields on different cachelines. To avoid that, we add another TIME_WAIT list in inet_bind2_bucket and check it while validating bind(). [0]: https://lore.kernel.org/netdev/[email protected]/ [1]: https://lore.kernel.org/netdev/[email protected]/ Fixes: 28044fc ("net: Add a bhash2 table hashed by port and address") Reported-by: Jiri Slaby <[email protected]> Suggested-by: Paolo Abeni <[email protected]> Signed-off-by: Kuniyuki Iwashima <[email protected]> Acked-by: Joanne Koong <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 40cab44 commit 936a192

File tree

5 files changed

+65
-9
lines changed

5 files changed

+65
-9
lines changed

include/net/inet_hashtables.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ struct inet_bind2_bucket {
108108
struct hlist_node node;
109109
/* List of sockets hashed to this bucket */
110110
struct hlist_head owners;
111+
/* bhash has twsk in owners, but bhash2 has twsk in
112+
* deathrow not to add a member in struct sock_common.
113+
*/
114+
struct hlist_head deathrow;
111115
};
112116

113117
static inline struct net *ib_net(const struct inet_bind_bucket *ib)

include/net/inet_timewait_sock.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,14 @@ struct inet_timewait_sock {
7373
u32 tw_priority;
7474
struct timer_list tw_timer;
7575
struct inet_bind_bucket *tw_tb;
76+
struct inet_bind2_bucket *tw_tb2;
77+
struct hlist_node tw_bind2_node;
7678
};
7779
#define tw_tclass tw_tos
7880

81+
#define twsk_for_each_bound_bhash2(__tw, list) \
82+
hlist_for_each_entry(__tw, list, tw_bind2_node)
83+
7984
static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
8085
{
8186
return (struct inet_timewait_sock *)sk;

net/ipv4/inet_connection_sock.c

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -173,22 +173,40 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
173173
return false;
174174
}
175175

176+
static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
177+
kuid_t sk_uid, bool relax,
178+
bool reuseport_cb_ok, bool reuseport_ok)
179+
{
180+
if (sk->sk_family == AF_INET && ipv6_only_sock(sk2))
181+
return false;
182+
183+
return inet_bind_conflict(sk, sk2, sk_uid, relax,
184+
reuseport_cb_ok, reuseport_ok);
185+
}
186+
176187
static bool inet_bhash2_conflict(const struct sock *sk,
177188
const struct inet_bind2_bucket *tb2,
178189
kuid_t sk_uid,
179190
bool relax, bool reuseport_cb_ok,
180191
bool reuseport_ok)
181192
{
193+
struct inet_timewait_sock *tw2;
182194
struct sock *sk2;
183195

184196
sk_for_each_bound_bhash2(sk2, &tb2->owners) {
185-
if (sk->sk_family == AF_INET && ipv6_only_sock(sk2))
186-
continue;
197+
if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
198+
reuseport_cb_ok, reuseport_ok))
199+
return true;
200+
}
187201

188-
if (inet_bind_conflict(sk, sk2, sk_uid, relax,
189-
reuseport_cb_ok, reuseport_ok))
202+
twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) {
203+
sk2 = (struct sock *)tw2;
204+
205+
if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
206+
reuseport_cb_ok, reuseport_ok))
190207
return true;
191208
}
209+
192210
return false;
193211
}
194212

net/ipv4/inet_hashtables.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb,
116116
#endif
117117
tb->rcv_saddr = sk->sk_rcv_saddr;
118118
INIT_HLIST_HEAD(&tb->owners);
119+
INIT_HLIST_HEAD(&tb->deathrow);
119120
hlist_add_head(&tb->node, &head->chain);
120121
}
121122

@@ -137,7 +138,7 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
137138
/* Caller must hold hashbucket lock for this tb with local BH disabled */
138139
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
139140
{
140-
if (hlist_empty(&tb->owners)) {
141+
if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) {
141142
__hlist_del(&tb->node);
142143
kmem_cache_free(cachep, tb);
143144
}
@@ -1103,15 +1104,16 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
11031104
/* Head lock still held and bh's disabled */
11041105
inet_bind_hash(sk, tb, tb2, port);
11051106

1106-
spin_unlock(&head2->lock);
1107-
11081107
if (sk_unhashed(sk)) {
11091108
inet_sk(sk)->inet_sport = htons(port);
11101109
inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
11111110
}
11121111
if (tw)
11131112
inet_twsk_bind_unhash(tw, hinfo);
1113+
1114+
spin_unlock(&head2->lock);
11141115
spin_unlock(&head->lock);
1116+
11151117
if (tw)
11161118
inet_twsk_deschedule_put(tw);
11171119
local_bh_enable();

net/ipv4/inet_timewait_sock.c

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
3030
struct inet_hashinfo *hashinfo)
3131
{
32+
struct inet_bind2_bucket *tb2 = tw->tw_tb2;
3233
struct inet_bind_bucket *tb = tw->tw_tb;
3334

3435
if (!tb)
@@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
3738
__hlist_del(&tw->tw_bind_node);
3839
tw->tw_tb = NULL;
3940
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
41+
42+
__hlist_del(&tw->tw_bind2_node);
43+
tw->tw_tb2 = NULL;
44+
inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
45+
4046
__sock_put((struct sock *)tw);
4147
}
4248

@@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
4551
{
4652
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
4753
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
48-
struct inet_bind_hashbucket *bhead;
54+
struct inet_bind_hashbucket *bhead, *bhead2;
4955

5056
spin_lock(lock);
5157
sk_nulls_del_node_init_rcu((struct sock *)tw);
@@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
5460
/* Disassociate with bind bucket. */
5561
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
5662
hashinfo->bhash_size)];
63+
bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
64+
twsk_net(tw), tw->tw_num);
5765

5866
spin_lock(&bhead->lock);
67+
spin_lock(&bhead2->lock);
5968
inet_twsk_bind_unhash(tw, hashinfo);
69+
spin_unlock(&bhead2->lock);
6070
spin_unlock(&bhead->lock);
6171

6272
refcount_dec(&tw->tw_dr->tw_refcount);
@@ -93,6 +103,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
93103
hlist_add_head(&tw->tw_bind_node, list);
94104
}
95105

106+
static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
107+
struct hlist_head *list)
108+
{
109+
hlist_add_head(&tw->tw_bind2_node, list);
110+
}
111+
96112
/*
97113
* Enter the time wait state. This is called with locally disabled BH.
98114
* Essentially we whip up a timewait bucket, copy the relevant info into it
@@ -105,17 +121,28 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
105121
const struct inet_connection_sock *icsk = inet_csk(sk);
106122
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
107123
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
108-
struct inet_bind_hashbucket *bhead;
124+
struct inet_bind_hashbucket *bhead, *bhead2;
125+
109126
/* Step 1: Put TW into bind hash. Original socket stays there too.
110127
Note, that any socket with inet->num != 0 MUST be bound in
111128
binding cache, even if it is closed.
112129
*/
113130
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
114131
hashinfo->bhash_size)];
132+
bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
133+
115134
spin_lock(&bhead->lock);
135+
spin_lock(&bhead2->lock);
136+
116137
tw->tw_tb = icsk->icsk_bind_hash;
117138
WARN_ON(!icsk->icsk_bind_hash);
118139
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
140+
141+
tw->tw_tb2 = icsk->icsk_bind2_hash;
142+
WARN_ON(!icsk->icsk_bind2_hash);
143+
inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow);
144+
145+
spin_unlock(&bhead2->lock);
119146
spin_unlock(&bhead->lock);
120147

121148
spin_lock(lock);

0 commit comments

Comments
 (0)