
Commit 4589725

Florian Westphal authored and ummakynes committed
netfilter: snat: evict closing tcp entries on reply tuple collision
When all tried source tuples are in use, the connection request (skb) and the new conntrack will be dropped in nf_confirm() due to the non-recoverable clash.

Make it so that the last 32 attempts are allowed to evict a colliding entry if this connection is already closing and the new sequence number has advanced past the old one.

Such an "all tuples taken" scenario can happen with tcp-rpc workloads where the same dst:dport gets queried repeatedly.

Signed-off-by: Florian Westphal <[email protected]>
Signed-off-by: Pablo Neira Ayuso <[email protected]>
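The eviction condition "the new sequence number has advanced past the old one" is checked with wraparound-safe serial arithmetic: the unsigned 32-bit difference of the two sequence numbers is reinterpreted as signed, as nf_seq_has_advanced() does in the diff below. A minimal standalone sketch of just that comparison (plain userspace C; the helper name and values are illustrative, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "new is past old" check: compute the unsigned
 * 32-bit difference, then interpret it as signed. A positive delta
 * means new_end is ahead of old_end even if the counter wrapped.
 */
static int seq_has_advanced(uint32_t old_end, uint32_t new_end)
{
        return (int32_t)(new_end - old_end) > 0;
}

int main(void)
{
        printf("%d\n", seq_has_advanced(1000, 2000));         /* 1: plainly ahead */
        printf("%d\n", seq_has_advanced(0xfffffff0u, 0x10u)); /* 1: ahead across the wrap */
        printf("%d\n", seq_has_advanced(2000, 1000));         /* 0: behind */
        return 0;
}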
1 parent 96b2ef9 commit 4589725

File tree

1 file changed: +88, -4 lines changed

net/netfilter/nf_nat_core.c

Lines changed: 88 additions & 4 deletions
@@ -27,6 +27,9 @@
 
 #include "nf_internals.h"
 
+#define NF_NAT_MAX_ATTEMPTS     128
+#define NF_NAT_HARDER_THRESH    (NF_NAT_MAX_ATTEMPTS / 4)
+
 static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
 
 static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -197,6 +200,88 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
         return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
 }
 
+static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
+{
+        static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
+                                                  IPS_DYING;
+        static const unsigned long flags_needed = IPS_SRC_NAT;
+        enum tcp_conntrack old_state;
+
+        old_state = READ_ONCE(ct->proto.tcp.state);
+        if (old_state < TCP_CONNTRACK_TIME_WAIT)
+                return false;
+
+        if (flags & flags_refuse)
+                return false;
+
+        return (flags & flags_needed) == flags_needed;
+}
+
+/* reverse direction will send packets to new source, so
+ * make sure such packets are invalid.
+ */
+static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
+{
+        return (__s32)(new->proto.tcp.seen[0].td_end -
+                       old->proto.tcp.seen[0].td_end) > 0;
+}
+
+static int
+nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
+                         const struct nf_conn *ignored_conntrack,
+                         unsigned int attempts_left)
+{
+        static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
+        struct nf_conntrack_tuple_hash *thash;
+        const struct nf_conntrack_zone *zone;
+        struct nf_conntrack_tuple reply;
+        unsigned long flags;
+        struct nf_conn *ct;
+        bool taken = true;
+        struct net *net;
+
+        nf_ct_invert_tuple(&reply, tuple);
+
+        if (attempts_left > NF_NAT_HARDER_THRESH ||
+            tuple->dst.protonum != IPPROTO_TCP ||
+            ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
+                return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+
+        /* Last few attempts to find a free tcp port. Destructive
+         * action: evict the colliding entry if it's in timewait state
+         * and the tcp sequence number has advanced past the one used
+         * by the old entry.
+         */
+        net = nf_ct_net(ignored_conntrack);
+        zone = nf_ct_zone(ignored_conntrack);
+
+        thash = nf_conntrack_find_get(net, zone, &reply);
+        if (!thash)
+                return false;
+
+        ct = nf_ct_tuplehash_to_ctrack(thash);
+
+        if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
+                goto out;
+
+        if (WARN_ON_ONCE(ct == ignored_conntrack))
+                goto out;
+
+        flags = READ_ONCE(ct->status);
+        if (!nf_nat_may_kill(ct, flags))
+                goto out;
+
+        if (!nf_seq_has_advanced(ct, ignored_conntrack))
+                goto out;
+
+        /* Even if we can evict do not reuse if entry is offloaded. */
+        if (nf_ct_kill(ct))
+                taken = flags & flags_offload;
+out:
+        nf_ct_put(ct);
+        return taken;
+}
+
 static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
                                  const struct nf_nat_range2 *range)
 {
@@ -385,7 +470,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
         unsigned int range_size, min, max, i, attempts;
         __be16 *keyptr;
         u16 off;
-        static const unsigned int max_attempts = 128;
 
         switch (tuple->dst.protonum) {
         case IPPROTO_ICMP:
@@ -471,8 +555,8 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
         off = get_random_u16();
 
         attempts = range_size;
-        if (attempts > max_attempts)
-                attempts = max_attempts;
+        if (attempts > NF_NAT_MAX_ATTEMPTS)
+                attempts = NF_NAT_MAX_ATTEMPTS;
 
         /* We are in softirq; doing a search of the entire range risks
          * soft lockup when all tuples are already used.
@@ -483,7 +567,7 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
 another_round:
         for (i = 0; i < attempts; i++, off++) {
                 *keyptr = htons(min + off % range_size);
-                if (!nf_nat_used_tuple(tuple, ct))
+                if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
                         return;
         }
 

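As a side note on the arithmetic: NF_NAT_HARDER_THRESH is NF_NAT_MAX_ATTEMPTS / 4 = 128 / 4 = 32, which is where the "last 32 attempts" in the commit message comes from. A small standalone sketch (illustrative userspace C, not kernel code) of how the attempts_left countdown in the loop above gates the destructive path:

#include <stdio.h>

#define NF_NAT_MAX_ATTEMPTS     128
#define NF_NAT_HARDER_THRESH    (NF_NAT_MAX_ATTEMPTS / 4)

int main(void)
{
        unsigned int attempts = NF_NAT_MAX_ATTEMPTS, i;

        for (i = 0; i < attempts; i++) {
                unsigned int attempts_left = attempts - i;

                /* mirrors the early-return check in nf_nat_used_tuple_harder():
                 * only the final NF_NAT_HARDER_THRESH tries may evict
                 */
                if (attempts_left <= NF_NAT_HARDER_THRESH) {
                        printf("eviction allowed from attempt %u on (%u tries remain)\n",
                               i, attempts_left);
                        break;
                }
        }
        return 0;
}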