Skip to content
This repository was archived by the owner on Nov 8, 2023. It is now read-only.

Commit 5b04f41

Browse files
Florian Westphal authored and gregkh committed
netfilter: nf_nat: don't try nat source port reallocation for reverse dir clash
[ Upstream commit d8f84a9 ] A conntrack entry can be inserted to the connection tracking table if there is no existing entry with an identical tuple in either direction. Example: INITIATOR -> NAT/PAT -> RESPONDER Initiator passes through NAT/PAT ("us") and SNAT is done (saddr rewrite). Then, later, NAT/PAT machine itself also wants to connect to RESPONDER. This will not work if the SNAT done earlier has the same IP:PORT source pair. Conntrack table has: ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT and new locally originating connection wants: ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT This is handled by the NAT engine which will do a source port reallocation for the locally originating connection that is colliding with an existing tuple by attempting a source port rewrite. This is done even if this new connection attempt did not go through a masquerade/snat rule. There is a rare race condition with connection-less protocols like UDP, where we do the port reallocation even though it's not needed. This happens when new packets from the same, pre-existing flow are received in both directions at the exact same time on different CPUs after the conntrack table was flushed (or conntrack becomes active for the first time). With strict ordering/single cpu, the first packet creates a new ct entry and the second packet is resolved as an established reply packet. With parallel processing, both packets are picked up as new and both get their own ct entry. In this case, the 'reply' packet (picked up as ORIGINAL) can be mangled by the NAT engine because a port collision is detected. This change isn't enough to prevent a packet drop later during nf_conntrack_confirm(); the existing clash resolution strategy will not detect such a reverse clash case. This is resolved by a followup patch. 
Signed-off-by: Florian Westphal <[email protected]> Signed-off-by: Pablo Neira Ayuso <[email protected]> Signed-off-by: Sasha Levin <[email protected]>
1 parent 1215e29 commit 5b04f41

File tree

1 file changed

+118
-2
lines changed

1 file changed

+118
-2
lines changed

net/netfilter/nf_nat_core.c

Lines changed: 118 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,35 @@ hash_by_src(const struct net *net,
183183
return reciprocal_scale(hash, nf_nat_htable_size);
184184
}
185185

186-
/* Is this tuple already taken? (not by us) */
186+
/**
187+
* nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry
188+
* @tuple: proposed NAT binding
189+
* @ignored_conntrack: our (unconfirmed) conntrack entry
190+
*
191+
* A conntrack entry can be inserted to the connection tracking table
192+
* if there is no existing entry with an identical tuple in either direction.
193+
*
194+
* Example:
195+
* INITIATOR -> NAT/PAT -> RESPONDER
196+
*
197+
* INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite).
198+
* Then, later, NAT/PAT itself also connects to RESPONDER.
199+
*
200+
* This will not work if the SNAT done earlier has same IP:PORT source pair.
201+
*
202+
* Conntrack table has:
203+
* ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT
204+
* REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
205+
*
206+
* and new locally originating connection wants:
207+
* ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT
208+
* REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
209+
*
210+
* ... which would mean incoming packets cannot be distinguished between
211+
* the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple).
212+
*
213+
* @return: true if the proposed NAT mapping collides with an existing entry.
214+
*/
187215
static int
188216
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
189217
const struct nf_conn *ignored_conntrack)
@@ -200,6 +228,94 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
200228
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
201229
}
202230

231+
static bool nf_nat_allow_clash(const struct nf_conn *ct)
232+
{
233+
return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash;
234+
}
235+
236+
/**
237+
* nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry
238+
* @tuple: proposed NAT binding
239+
* @ignored_ct: our (unconfirmed) conntrack entry
240+
*
241+
* Same as nf_nat_used_tuple, but also check for rare clash in reverse
242+
* direction. Should be called only when @tuple has not been altered, i.e.
243+
* @ignored_conntrack will not be subject to NAT.
244+
*
245+
* @return: true if the proposed NAT mapping collides with existing entry.
246+
*/
247+
static noinline bool
248+
nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple,
249+
const struct nf_conn *ignored_ct)
250+
{
251+
static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT;
252+
const struct nf_conntrack_tuple_hash *thash;
253+
const struct nf_conntrack_zone *zone;
254+
struct nf_conn *ct;
255+
bool taken = true;
256+
struct net *net;
257+
258+
if (!nf_nat_used_tuple(tuple, ignored_ct))
259+
return false;
260+
261+
if (!nf_nat_allow_clash(ignored_ct))
262+
return true;
263+
264+
/* Initial choice clashes with existing conntrack.
265+
* Check for (rare) reverse collision.
266+
*
267+
* This can happen when new packets are received in both directions
268+
* at the exact same time on different CPUs.
269+
*
270+
* Without SMP, first packet creates new conntrack entry and second
271+
* packet is resolved as established reply packet.
272+
*
273+
* With parallel processing, both packets could be picked up as
274+
* new and both get their own ct entry allocated.
275+
*
276+
* If ignored_conntrack and colliding ct are not subject to NAT then
277+
* pretend the tuple is available and let later clash resolution
278+
* handle this at insertion time.
279+
*
280+
* Without it, the 'reply' packet has its source port rewritten
281+
* by nat engine.
282+
*/
283+
if (READ_ONCE(ignored_ct->status) & uses_nat)
284+
return true;
285+
286+
net = nf_ct_net(ignored_ct);
287+
zone = nf_ct_zone(ignored_ct);
288+
289+
thash = nf_conntrack_find_get(net, zone, tuple);
290+
if (unlikely(!thash)) /* clashing entry went away */
291+
return false;
292+
293+
ct = nf_ct_tuplehash_to_ctrack(thash);
294+
295+
/* NB: IP_CT_DIR_ORIGINAL should be impossible because
296+
* nf_nat_used_tuple() handles origin collisions.
297+
*
298+
* Handle remote chance other CPU confirmed its ct right after.
299+
*/
300+
if (thash->tuple.dst.dir != IP_CT_DIR_REPLY)
301+
goto out;
302+
303+
/* clashing connection subject to NAT? Retry with new tuple. */
304+
if (READ_ONCE(ct->status) & uses_nat)
305+
goto out;
306+
307+
if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
308+
&ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) &&
309+
nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
310+
&ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) {
311+
taken = false;
312+
goto out;
313+
}
314+
out:
315+
nf_ct_put(ct);
316+
return taken;
317+
}
318+
203319
static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
204320
{
205321
static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
@@ -608,7 +724,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
608724
!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
609725
/* try the original tuple first */
610726
if (nf_in_range(orig_tuple, range)) {
611-
if (!nf_nat_used_tuple(orig_tuple, ct)) {
727+
if (!nf_nat_used_tuple_new(orig_tuple, ct)) {
612728
*tuple = *orig_tuple;
613729
return;
614730
}

0 commit comments

Comments
 (0)