Skip to content

Commit 6f1a298

Browse files
committed
Merge branch 'inet-add-drop-monitor-support'
Eric Dumazet says:

====================
inet: add drop monitor support

I recently tried to analyse flakes in ip_defrag selftest.
This failed miserably.

IPv4 and IPv6 reassembly units are causing false kfree_skb()
notifications. It is time to deal with this issue.

First two patches are changing core networking to better
deal with eventual skb frag_list chains, in respect
of kfree_skb/consume_skb status.

Last three patches are adding three new drop reasons,
and make sure skbs that have been reassembled into
a large datagram are no longer viewed as dropped ones.

After this, understanding why ip_defrag selftest is flaky
is possible using standard drop monitoring tools.
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents b98deb2 + 3bdfb04 commit 6f1a298

File tree

8 files changed

+71
-30
lines changed

8 files changed

+71
-30
lines changed

include/net/dropreason.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@
6868
FN(IP_INADDRERRORS) \
6969
FN(IP_INNOROUTES) \
7070
FN(PKT_TOO_BIG) \
71+
FN(DUP_FRAG) \
72+
FN(FRAG_REASM_TIMEOUT) \
73+
FN(FRAG_TOO_FAR) \
7174
FNe(MAX)
7275

7376
/**
@@ -80,6 +83,8 @@ enum skb_drop_reason {
8083
* @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case)
8184
*/
8285
SKB_NOT_DROPPED_YET = 0,
86+
/** @SKB_CONSUMED: packet has been consumed */
87+
SKB_CONSUMED,
8388
/** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */
8489
SKB_DROP_REASON_NOT_SPECIFIED,
8590
/** @SKB_DROP_REASON_NO_SOCKET: socket not found */
@@ -298,6 +303,15 @@ enum skb_drop_reason {
298303
* MTU)
299304
*/
300305
SKB_DROP_REASON_PKT_TOO_BIG,
306+
/** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */
307+
SKB_DROP_REASON_DUP_FRAG,
308+
/** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */
309+
SKB_DROP_REASON_FRAG_REASM_TIMEOUT,
310+
/**
311+
* @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far.
312+
* (/proc/sys/net/ipv4/ipfrag_max_dist)
313+
*/
314+
SKB_DROP_REASON_FRAG_TOO_FAR,
301315
/**
302316
* @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be
303317
* used as a real 'reason'

include/net/inet_frag.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <linux/in6.h>
88
#include <linux/rbtree_types.h>
99
#include <linux/refcount.h>
10+
#include <net/dropreason.h>
1011

1112
/* Per netns frag queues directory */
1213
struct fqdir {
@@ -34,12 +35,14 @@ struct fqdir {
3435
* @INET_FRAG_LAST_IN: final fragment has arrived
3536
* @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
3637
* @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable
38+
* @INET_FRAG_DROP: if skbs must be dropped (instead of being consumed)
3739
*/
3840
enum {
3941
INET_FRAG_FIRST_IN = BIT(0),
4042
INET_FRAG_LAST_IN = BIT(1),
4143
INET_FRAG_COMPLETE = BIT(2),
4244
INET_FRAG_HASH_DEAD = BIT(3),
45+
INET_FRAG_DROP = BIT(4),
4346
};
4447

4548
struct frag_v4_compare_key {
@@ -139,7 +142,8 @@ void inet_frag_destroy(struct inet_frag_queue *q);
139142
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);
140143

141144
/* Free all skbs in the queue; return the sum of their truesizes. */
142-
unsigned int inet_frag_rbtree_purge(struct rb_root *root);
145+
unsigned int inet_frag_rbtree_purge(struct rb_root *root,
146+
enum skb_drop_reason reason);
143147

144148
static inline void inet_frag_put(struct inet_frag_queue *q)
145149
{

include/net/ipv6_frag.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
7676
if (fq->q.flags & INET_FRAG_COMPLETE)
7777
goto out;
7878

79+
fq->q.flags |= INET_FRAG_DROP;
7980
inet_frag_kill(&fq->q);
8081

8182
dev = dev_get_by_index_rcu(net, fq->iif);
@@ -101,7 +102,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
101102
spin_unlock(&fq->q.lock);
102103

103104
icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
104-
kfree_skb(head);
105+
kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
105106
goto out_rcu_unlock;
106107

107108
out:

net/core/skbuff.c

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
9494
#undef FN
9595
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
9696
const char * const drop_reasons[] = {
97+
[SKB_CONSUMED] = "CONSUMED",
9798
DEFINE_DROP_REASON(FN, FN)
9899
};
99100
EXPORT_SYMBOL(drop_reasons);
@@ -768,7 +769,7 @@ static void skb_free_head(struct sk_buff *skb)
768769
}
769770
}
770771

771-
static void skb_release_data(struct sk_buff *skb)
772+
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
772773
{
773774
struct skb_shared_info *shinfo = skb_shinfo(skb);
774775
int i;
@@ -791,7 +792,7 @@ static void skb_release_data(struct sk_buff *skb)
791792

792793
free_head:
793794
if (shinfo->frag_list)
794-
kfree_skb_list(shinfo->frag_list);
795+
kfree_skb_list_reason(shinfo->frag_list, reason);
795796

796797
skb_free_head(skb);
797798
exit:
@@ -854,11 +855,11 @@ void skb_release_head_state(struct sk_buff *skb)
854855
}
855856

856857
/* Free everything but the sk_buff shell. */
857-
static void skb_release_all(struct sk_buff *skb)
858+
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
858859
{
859860
skb_release_head_state(skb);
860861
if (likely(skb->head))
861-
skb_release_data(skb);
862+
skb_release_data(skb, reason);
862863
}
863864

864865
/**
@@ -872,7 +873,7 @@ static void skb_release_all(struct sk_buff *skb)
872873

873874
void __kfree_skb(struct sk_buff *skb)
874875
{
875-
skb_release_all(skb);
876+
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
876877
kfree_skbmem(skb);
877878
}
878879
EXPORT_SYMBOL(__kfree_skb);
@@ -894,7 +895,10 @@ kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
894895

895896
DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
896897

897-
trace_kfree_skb(skb, __builtin_return_address(0), reason);
898+
if (reason == SKB_CONSUMED)
899+
trace_consume_skb(skb);
900+
else
901+
trace_kfree_skb(skb, __builtin_return_address(0), reason);
898902
__kfree_skb(skb);
899903
}
900904
EXPORT_SYMBOL(kfree_skb_reason);
@@ -1052,7 +1056,7 @@ EXPORT_SYMBOL(consume_skb);
10521056
void __consume_stateless_skb(struct sk_buff *skb)
10531057
{
10541058
trace_consume_skb(skb);
1055-
skb_release_data(skb);
1059+
skb_release_data(skb, SKB_CONSUMED);
10561060
kfree_skbmem(skb);
10571061
}
10581062

@@ -1077,7 +1081,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
10771081

10781082
void __kfree_skb_defer(struct sk_buff *skb)
10791083
{
1080-
skb_release_all(skb);
1084+
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
10811085
napi_skb_cache_put(skb);
10821086
}
10831087

@@ -1115,7 +1119,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
11151119
return;
11161120
}
11171121

1118-
skb_release_all(skb);
1122+
skb_release_all(skb, SKB_CONSUMED);
11191123
napi_skb_cache_put(skb);
11201124
}
11211125
EXPORT_SYMBOL(napi_consume_skb);
@@ -1246,7 +1250,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
12461250
*/
12471251
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
12481252
{
1249-
skb_release_all(dst);
1253+
skb_release_all(dst, SKB_CONSUMED);
12501254
return __skb_clone(dst, src);
12511255
}
12521256
EXPORT_SYMBOL_GPL(skb_morph);
@@ -1869,7 +1873,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
18691873
if (skb_has_frag_list(skb))
18701874
skb_clone_fraglist(skb);
18711875

1872-
skb_release_data(skb);
1876+
skb_release_data(skb, SKB_CONSUMED);
18731877
} else {
18741878
skb_free_head(skb);
18751879
}
@@ -6209,7 +6213,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
62096213
skb_frag_ref(skb, i);
62106214
if (skb_has_frag_list(skb))
62116215
skb_clone_fraglist(skb);
6212-
skb_release_data(skb);
6216+
skb_release_data(skb, SKB_CONSUMED);
62136217
} else {
62146218
/* we can reuse existing recount- all we did was
62156219
* relocate values
@@ -6352,7 +6356,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
63526356
kfree(data);
63536357
return -ENOMEM;
63546358
}
6355-
skb_release_data(skb);
6359+
skb_release_data(skb, SKB_CONSUMED);
63566360

63576361
skb->head = data;
63586362
skb->head_frag = 0;

net/ipv4/inet_fragment.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
133133
count = del_timer_sync(&fq->timer) ? 1 : 0;
134134

135135
spin_lock_bh(&fq->lock);
136+
fq->flags |= INET_FRAG_DROP;
136137
if (!(fq->flags & INET_FRAG_COMPLETE)) {
137138
fq->flags |= INET_FRAG_COMPLETE;
138139
count++;
@@ -260,7 +261,8 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
260261
kmem_cache_free(f->frags_cachep, q);
261262
}
262263

263-
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
264+
unsigned int inet_frag_rbtree_purge(struct rb_root *root,
265+
enum skb_drop_reason reason)
264266
{
265267
struct rb_node *p = rb_first(root);
266268
unsigned int sum = 0;
@@ -274,7 +276,7 @@ unsigned int inet_frag_rbtree_purge(struct rb_root *root)
274276
struct sk_buff *next = FRAG_CB(skb)->next_frag;
275277

276278
sum += skb->truesize;
277-
kfree_skb(skb);
279+
kfree_skb_reason(skb, reason);
278280
skb = next;
279281
}
280282
}
@@ -284,17 +286,21 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge);
284286

285287
void inet_frag_destroy(struct inet_frag_queue *q)
286288
{
287-
struct fqdir *fqdir;
288289
unsigned int sum, sum_truesize = 0;
290+
enum skb_drop_reason reason;
289291
struct inet_frags *f;
292+
struct fqdir *fqdir;
290293

291294
WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
295+
reason = (q->flags & INET_FRAG_DROP) ?
296+
SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
297+
SKB_CONSUMED;
292298
WARN_ON(del_timer(&q->timer) != 0);
293299

294300
/* Release all fragment data. */
295301
fqdir = q->fqdir;
296302
f = fqdir->f;
297-
sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
303+
sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason);
298304
sum = sum_truesize + f->qsize;
299305

300306
call_rcu(&q->rcu, inet_frag_destroy_rcu);

net/ipv4/ip_fragment.c

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ static void ip_expire(struct timer_list *t)
153153
if (qp->q.flags & INET_FRAG_COMPLETE)
154154
goto out;
155155

156+
qp->q.flags |= INET_FRAG_DROP;
156157
ipq_kill(qp);
157158
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
158159
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
@@ -194,7 +195,7 @@ static void ip_expire(struct timer_list *t)
194195
spin_unlock(&qp->q.lock);
195196
out_rcu_unlock:
196197
rcu_read_unlock();
197-
kfree_skb(head);
198+
kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
198199
ipq_put(qp);
199200
}
200201

@@ -254,7 +255,8 @@ static int ip_frag_reinit(struct ipq *qp)
254255
return -ETIMEDOUT;
255256
}
256257

257-
sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
258+
sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments,
259+
SKB_DROP_REASON_FRAG_TOO_FAR);
258260
sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
259261

260262
qp->q.flags = 0;
@@ -278,10 +280,14 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
278280
struct net_device *dev;
279281
unsigned int fragsize;
280282
int err = -ENOENT;
283+
SKB_DR(reason);
281284
u8 ecn;
282285

283-
if (qp->q.flags & INET_FRAG_COMPLETE)
286+
/* If reassembly is already done, @skb must be a duplicate frag. */
287+
if (qp->q.flags & INET_FRAG_COMPLETE) {
288+
SKB_DR_SET(reason, DUP_FRAG);
284289
goto err;
290+
}
285291

286292
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
287293
unlikely(ip_frag_too_far(qp)) &&
@@ -382,16 +388,17 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
382388

383389
insert_error:
384390
if (err == IPFRAG_DUP) {
385-
kfree_skb(skb);
386-
return -EINVAL;
391+
SKB_DR_SET(reason, DUP_FRAG);
392+
err = -EINVAL;
393+
goto err;
387394
}
388395
err = -EINVAL;
389396
__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
390397
discard_qp:
391398
inet_frag_kill(&qp->q);
392399
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
393400
err:
394-
kfree_skb(skb);
401+
kfree_skb_reason(skb, reason);
395402
return err;
396403
}
397404

net/ipv6/netfilter/nf_conntrack_reasm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
253253
if (err) {
254254
if (err == IPFRAG_DUP) {
255255
/* No error for duplicates, pretend they got queued. */
256-
kfree_skb(skb);
256+
kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG);
257257
return -EINPROGRESS;
258258
}
259259
goto insert_error;

net/ipv6/reassembly.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,14 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
112112
struct sk_buff *prev_tail;
113113
struct net_device *dev;
114114
int err = -ENOENT;
115+
SKB_DR(reason);
115116
u8 ecn;
116117

117-
if (fq->q.flags & INET_FRAG_COMPLETE)
118+
/* If reassembly is already done, @skb must be a duplicate frag. */
119+
if (fq->q.flags & INET_FRAG_COMPLETE) {
120+
SKB_DR_SET(reason, DUP_FRAG);
118121
goto err;
122+
}
119123

120124
err = -EINVAL;
121125
offset = ntohs(fhdr->frag_off) & ~0x7;
@@ -226,8 +230,9 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
226230

227231
insert_error:
228232
if (err == IPFRAG_DUP) {
229-
kfree_skb(skb);
230-
return -EINVAL;
233+
SKB_DR_SET(reason, DUP_FRAG);
234+
err = -EINVAL;
235+
goto err;
231236
}
232237
err = -EINVAL;
233238
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
@@ -237,7 +242,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
237242
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
238243
IPSTATS_MIB_REASMFAILS);
239244
err:
240-
kfree_skb(skb);
245+
kfree_skb_reason(skb, reason);
241246
return err;
242247
}
243248

0 commit comments

Comments
 (0)