
Commit edb09eb

Eric Dumazet authored and davem330 committed
net: sched: do not acquire qdisc spinlock in qdisc/class stats dump
Large tc dumps (tc -s {qdisc|class} sh dev ethX) done by Google BwE host agent [1] are problematic at scale:

For each qdisc/class found in the dump, we currently lock the root qdisc spinlock in order to get stats. Sampling stats every 5 seconds from thousands of HTB classes is a challenge when the root qdisc spinlock is under high pressure. Not only do the dumps take time, they also slow down the fast path (enqueue/dequeue packets) by 10% to 20% in some cases.

An audit of existing qdiscs showed that sch_fq_codel is the only qdisc that might need the qdisc lock in fq_codel_dump_stats() and fq_codel_dump_class_stats().

In v2 of this patch, I now use the Qdisc running seqcount to provide consistent reads of packets/bytes counters, regardless of 32/64 bit arches.

I also changed rate estimators to use the same infrastructure so that they no longer need to lock the root qdisc lock.

[1] http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43838.pdf

Signed-off-by: Eric Dumazet <[email protected]>
Cc: Cong Wang <[email protected]>
Cc: Jamal Hadi Salim <[email protected]>
Cc: John Fastabend <[email protected]>
Cc: Kevin Athey <[email protected]>
Cc: Xiaotian Pei <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent f9eb8ae commit edb09eb
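Editor's note: the core idea in the commit message is a seqcount read/retry loop on the dump side, so that the 64-bit byte/packet counters can be snapshotted consistently without taking the root qdisc spinlock. Below is a minimal, self-contained user-space sketch of that pattern, not kernel code: struct stats, writer_update() and reader_snapshot() are hypothetical stand-ins for qdisc->running, the dequeue-side update and __gnet_stats_copy_basic(), and the sketch deliberately glosses over the memory barriers a real seqcount_t (read_seqcount_begin()/read_seqcount_retry()) provides.

/* Illustrative sketch only: a seqcount-style reader/writer pair,
 * mirroring how the dump path snapshots bytes/packets without a lock.
 * All names are hypothetical stand-ins for the kernel primitives. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct stats {
	atomic_uint seq;	/* even: stable, odd: writer in progress */
	uint64_t bytes;
	uint64_t packets;
};

/* Writer side (analogous to the dequeue path owning qdisc->running). */
static void writer_update(struct stats *s, uint64_t len)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* now odd */
	s->bytes += len;
	s->packets += 1;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* even again */
}

/* Reader side (analogous to the stats dump): retry until a stable pair. */
static void reader_snapshot(struct stats *s, uint64_t *bytes, uint64_t *packets)
{
	unsigned int start;

	do {
		start = atomic_load_explicit(&s->seq, memory_order_acquire);
		*bytes = s->bytes;
		*packets = s->packets;
	} while ((start & 1) ||
		 start != atomic_load_explicit(&s->seq, memory_order_acquire));
}

int main(void)
{
	struct stats st = { .seq = 0, .bytes = 0, .packets = 0 };
	uint64_t b, p;

	writer_update(&st, 1500);
	reader_snapshot(&st, &b, &p);
	printf("bytes=%llu packets=%llu\n",
	       (unsigned long long)b, (unsigned long long)p);
	return 0;
}

The writer bumps the sequence to an odd value while updating, so a reader that observes an odd or changed sequence simply retries. That is what lets the dump read a consistent bytes/packets pair on 32-bit as well as 64-bit architectures without blocking enqueue/dequeue.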

20 files changed: +126 / -69 lines

Documentation/networking/gen_stats.txt

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ struct mystruct {
 	...
 };
 
-Update statistics:
+Update statistics, in dequeue() methods only, (while owning qdisc->running)
 	mystruct->tstats.packet++;
 	mystruct->qstats.backlog += skb->pkt_len;

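Editor's note: the documentation change above narrows where these counters may be touched: only the dequeue path, while it owns qdisc->running, updates them, which is what makes the lockless read side sketched earlier safe. A purely illustrative user-space rendering of that rule follows; struct mystruct, struct fake_skb and account_packet() are hypothetical stand-ins, not the kernel types.

/* Illustrative sketch of the documented rule: counters are bumped only
 * from the dequeue-equivalent path, by the single owner of the running
 * seqcount. Hypothetical user-space types, not kernel structures. */
#include <stdint.h>
#include <stdio.h>

struct fake_skb {
	uint32_t pkt_len;
};

struct mystruct {
	struct {
		uint64_t packets;
		uint64_t bytes;
	} tstats;
	struct {
		uint32_t backlog;
	} qstats;
};

/* Called only from the dequeue-equivalent path, per the documentation. */
static void account_packet(struct mystruct *q, const struct fake_skb *skb)
{
	q->tstats.packets++;
	q->tstats.bytes += skb->pkt_len;
	q->qstats.backlog += skb->pkt_len;
}

int main(void)
{
	struct mystruct q = { 0 };
	struct fake_skb skb = { .pkt_len = 1500 };

	account_packet(&q, &skb);
	printf("packets=%llu backlog=%u\n",
	       (unsigned long long)q.tstats.packets, q.qstats.backlog);
	return 0;
}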
include/net/gen_stats.h

Lines changed: 8 additions & 4 deletions
@@ -33,10 +33,12 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
 				 spinlock_t *lock, struct gnet_dump *d,
 				 int padattr);
 
-int gnet_stats_copy_basic(struct gnet_dump *d,
+int gnet_stats_copy_basic(const seqcount_t *running,
+			  struct gnet_dump *d,
 			  struct gnet_stats_basic_cpu __percpu *cpu,
 			  struct gnet_stats_basic_packed *b);
-void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+void __gnet_stats_copy_basic(const seqcount_t *running,
+			     struct gnet_stats_basic_packed *bstats,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
@@ -52,13 +54,15 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct gnet_stats_rate_est64 *rate_est,
-		      spinlock_t *stats_lock, struct nlattr *opt);
+		      spinlock_t *stats_lock,
+		      seqcount_t *running, struct nlattr *opt);
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_rate_est64 *rate_est);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct gnet_stats_rate_est64 *rate_est,
-			  spinlock_t *stats_lock, struct nlattr *opt);
+			  spinlock_t *stats_lock,
+			  seqcount_t *running, struct nlattr *opt);
 bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
 			  const struct gnet_stats_rate_est64 *rate_est);
 #endif

include/net/sch_generic.h

Lines changed: 8 additions & 0 deletions
@@ -314,6 +314,14 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
 	return qdisc_lock(root);
 }
 
+static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
+{
+	struct Qdisc *root = qdisc_root_sleeping(qdisc);
+
+	ASSERT_RTNL();
+	return &root->running;
+}
+
 static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
 {
 	return qdisc->dev_queue->dev;

net/core/gen_estimator.c

Lines changed: 16 additions & 8 deletions
@@ -84,6 +84,7 @@ struct gen_estimator
 	struct gnet_stats_basic_packed	*bstats;
 	struct gnet_stats_rate_est64	*rate_est;
 	spinlock_t		*stats_lock;
+	seqcount_t		*running;
 	int			ewma_log;
 	u32			last_packets;
 	unsigned long		avpps;
@@ -121,26 +122,28 @@ static void est_timer(unsigned long arg)
 			unsigned long rate;
 			u64 brate;
 
-			spin_lock(e->stats_lock);
+			if (e->stats_lock)
+				spin_lock(e->stats_lock);
 			read_lock(&est_lock);
 			if (e->bstats == NULL)
 				goto skip;
 
-			__gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
+			__gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
 
 			brate = (b.bytes - e->last_bytes)<<(7 - idx);
 			e->last_bytes = b.bytes;
 			e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
-			e->rate_est->bps = (e->avbps+0xF)>>5;
+			WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
 
 			rate = b.packets - e->last_packets;
 			rate <<= (7 - idx);
 			e->last_packets = b.packets;
 			e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
-			e->rate_est->pps = (e->avpps + 0xF) >> 5;
+			WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
 skip:
 			read_unlock(&est_lock);
-			spin_unlock(e->stats_lock);
+			if (e->stats_lock)
+				spin_unlock(e->stats_lock);
 		}
 
 		if (!list_empty(&elist[idx].list))
@@ -194,6 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  * @cpu_bstats: bstats per cpu
  * @rate_est: rate estimator statistics
  * @stats_lock: statistics lock
+ * @running: qdisc running seqcount
  * @opt: rate estimator configuration TLV
  *
  * Creates a new rate estimator with &bstats as source and &rate_est
@@ -209,6 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct gnet_stats_rate_est64 *rate_est,
 		      spinlock_t *stats_lock,
+		      seqcount_t *running,
 		      struct nlattr *opt)
 {
 	struct gen_estimator *est;
@@ -226,12 +231,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	if (est == NULL)
 		return -ENOBUFS;
 
-	__gnet_stats_copy_basic(&b, cpu_bstats, bstats);
+	__gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
 
 	idx = parm->interval + 2;
 	est->bstats = bstats;
 	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
+	est->running = running;
 	est->ewma_log = parm->ewma_log;
 	est->last_bytes = b.bytes;
 	est->avbps = rate_est->bps<<5;
@@ -291,6 +297,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
  * @cpu_bstats: bstats per cpu
  * @rate_est: rate estimator statistics
  * @stats_lock: statistics lock
+ * @running: qdisc running seqcount (might be NULL)
  * @opt: rate estimator configuration TLV
 *
 * Replaces the configuration of a rate estimator by calling
@@ -301,10 +308,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct gnet_stats_rate_est64 *rate_est,
-			  spinlock_t *stats_lock, struct nlattr *opt)
+			  spinlock_t *stats_lock,
+			  seqcount_t *running, struct nlattr *opt)
 {
 	gen_kill_estimator(bstats, rate_est);
-	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
+	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);

net/core/gen_stats.c

Lines changed: 23 additions & 11 deletions
@@ -32,10 +32,11 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
 	return 0;
 
 nla_put_failure:
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	kfree(d->xstats);
 	d->xstats = NULL;
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return -1;
 }
 
@@ -65,15 +66,16 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
 {
 	memset(d, 0, sizeof(*d));
 
-	spin_lock_bh(lock);
-	d->lock = lock;
 	if (type)
 		d->tail = (struct nlattr *)skb_tail_pointer(skb);
 	d->skb = skb;
 	d->compat_tc_stats = tc_stats_type;
 	d->compat_xstats = xstats_type;
 	d->padattr = padattr;
-
+	if (lock) {
+		d->lock = lock;
+		spin_lock_bh(lock);
+	}
 	if (d->tail)
 		return gnet_stats_copy(d, type, NULL, 0, padattr);
 
@@ -126,16 +128,23 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
 }
 
 void
-__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+__gnet_stats_copy_basic(const seqcount_t *running,
+			struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_basic_cpu __percpu *cpu,
 			struct gnet_stats_basic_packed *b)
 {
+	unsigned int seq;
+
 	if (cpu) {
 		__gnet_stats_copy_basic_cpu(bstats, cpu);
-	} else {
+		return;
+	}
+	do {
+		if (running)
+			seq = read_seqcount_begin(running);
 		bstats->bytes = b->bytes;
 		bstats->packets = b->packets;
-	}
+	} while (running && read_seqcount_retry(running, seq));
 }
 EXPORT_SYMBOL(__gnet_stats_copy_basic);
 
@@ -152,13 +161,14 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
  * if the room in the socket buffer was not sufficient.
  */
 int
-gnet_stats_copy_basic(struct gnet_dump *d,
+gnet_stats_copy_basic(const seqcount_t *running,
+		      struct gnet_dump *d,
 		      struct gnet_stats_basic_cpu __percpu *cpu,
 		      struct gnet_stats_basic_packed *b)
 {
 	struct gnet_stats_basic_packed bstats = {0};
 
-	__gnet_stats_copy_basic(&bstats, cpu, b);
+	__gnet_stats_copy_basic(running, &bstats, cpu, b);
 
 	if (d->compat_tc_stats) {
 		d->tc_stats.bytes = bstats.bytes;
@@ -328,8 +338,9 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
 	return 0;
 
 err_out:
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return -1;
 }
 EXPORT_SYMBOL(gnet_stats_copy_app);
@@ -363,10 +374,11 @@ gnet_stats_finish_copy(struct gnet_dump *d)
 			return -1;
 	}
 
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	kfree(d->xstats);
 	d->xstats = NULL;
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return 0;
 }
 EXPORT_SYMBOL(gnet_stats_finish_copy);

net/netfilter/xt_RATEEST.c

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 	cfg.est.ewma_log	= info->ewma_log;
 
 	ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
-				&est->lock, &cfg.opt);
+				&est->lock, NULL, &cfg.opt);
 	if (ret < 0)
 		goto err2;

net/sched/act_api.c

Lines changed: 2 additions & 2 deletions
@@ -287,7 +287,7 @@ int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 	if (est) {
 		err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
 					&p->tcfc_rate_est,
-					&p->tcfc_lock, est);
+					&p->tcfc_lock, NULL, est);
 		if (err) {
 			free_percpu(p->cpu_qstats);
 			goto err2;
@@ -671,7 +671,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 	if (err < 0)
 		goto errout;
 
-	if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
+	if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
 	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
 				     &p->tcfc_rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, p->cpu_qstats,

net/sched/act_police.c

Lines changed: 2 additions & 1 deletion
@@ -185,7 +185,8 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
 	if (est) {
 		err = gen_replace_estimator(&police->tcf_bstats, NULL,
 					    &police->tcf_rate_est,
-					    &police->tcf_lock, est);
+					    &police->tcf_lock,
+					    NULL, est);
 		if (err)
 			goto failure_unlock;
 	} else if (tb[TCA_POLICE_AVRATE] &&

net/sched/sch_api.c

Lines changed: 11 additions & 10 deletions
@@ -982,7 +982,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 		rcu_assign_pointer(sch->stab, stab);
 	}
 	if (tca[TCA_RATE]) {
-		spinlock_t *root_lock;
+		seqcount_t *running;
 
 		err = -EOPNOTSUPP;
 		if (sch->flags & TCQ_F_MQROOT)
@@ -991,14 +991,15 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 		if ((sch->parent != TC_H_ROOT) &&
 		    !(sch->flags & TCQ_F_INGRESS) &&
 		    (!p || !(p->flags & TCQ_F_MQROOT)))
-			root_lock = qdisc_root_sleeping_lock(sch);
+			running = qdisc_root_sleeping_running(sch);
 		else
-			root_lock = qdisc_lock(sch);
+			running = &sch->running;
 
 		err = gen_new_estimator(&sch->bstats,
 					sch->cpu_bstats,
 					&sch->rate_est,
-					root_lock,
+					NULL,
+					running,
 					tca[TCA_RATE]);
 		if (err)
 			goto err_out4;
@@ -1061,7 +1062,8 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 		gen_replace_estimator(&sch->bstats,
 				      sch->cpu_bstats,
 				      &sch->rate_est,
-				      qdisc_root_sleeping_lock(sch),
+				      NULL,
+				      qdisc_root_sleeping_running(sch),
 				      tca[TCA_RATE]);
 	}
 out:
@@ -1369,8 +1371,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 		goto nla_put_failure;
 
 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
-					 qdisc_root_sleeping_lock(q), &d,
-					 TCA_PAD) < 0)
+					 NULL, &d, TCA_PAD) < 0)
 		goto nla_put_failure;
 
 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
@@ -1381,7 +1382,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 		cpu_qstats = q->cpu_qstats;
 	}
 
-	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
+				  &d, cpu_bstats, &q->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
 		goto nla_put_failure;
@@ -1684,8 +1686,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 		goto nla_put_failure;
 
 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
-					 qdisc_root_sleeping_lock(q), &d,
-					 TCA_PAD) < 0)
+					 NULL, &d, TCA_PAD) < 0)
 		goto nla_put_failure;
 
 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)

net/sched/sch_atm.c

Lines changed: 2 additions & 1 deletion
@@ -637,7 +637,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 {
 	struct atm_flow_data *flow = (struct atm_flow_data *)arg;
 
-	if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+				  d, NULL, &flow->bstats) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
 		return -1;

net/sched/sch_cbq.c

Lines changed: 6 additions & 3 deletions
@@ -1600,7 +1600,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	if (cl->undertime != PSCHED_PASTPERFECT)
 		cl->xstats.undertime = cl->undertime - q->now;
 
-	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+				  d, NULL, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
 		return -1;
@@ -1755,7 +1756,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 		if (tca[TCA_RATE]) {
 			err = gen_replace_estimator(&cl->bstats, NULL,
 						    &cl->rate_est,
-						    qdisc_root_sleeping_lock(sch),
+						    NULL,
+						    qdisc_root_sleeping_running(sch),
 						    tca[TCA_RATE]);
 			if (err) {
 				qdisc_put_rtab(rtab);
@@ -1848,7 +1850,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 
 	if (tca[TCA_RATE]) {
 		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
-					qdisc_root_sleeping_lock(sch),
+					NULL,
+					qdisc_root_sleeping_running(sch),
 					tca[TCA_RATE]);
 		if (err) {
 			kfree(cl);
