
Commit 56b765b

jvimal authored and davem330 committed
htb: improved accuracy at high rates
Current HTB (and TBF) uses a rate table computed by the "tc" userspace
program, which has the following issue: the rate table has 256 entries
to map packet lengths to tokens (time units). With TSO-sized packets,
the 256-entry granularity leads to loss/gain of rate, making the token
bucket inaccurate.

Thus, instead of relying on the rate table, this patch explicitly
computes the time and accounts for packet transmission times with
nanosecond granularity. This greatly improves the accuracy of HTB over
a wide range of packet sizes.

Example:

tc qdisc add dev $dev root handle 1: \
        htb default 1

tc class add dev $dev classid 1:1 parent 1: \
        rate 5Gbit mtu 64k

Here is an example of inaccuracy:

$ iperf -c host -t 10 -i 1

With old htb:
eth4: 34.76 Mb/s In  5827.98 Mb/s Out - 65836.0 p/s In  481273.0 p/s Out
[SUM]  9.0-10.0 sec   669 MBytes  5.61 Gbits/sec
[SUM]  0.0-10.0 sec  6.50 GBytes  5.58 Gbits/sec

With new htb:
eth4: 28.36 Mb/s In  5208.06 Mb/s Out - 53704.0 p/s In  430076.0 p/s Out
[SUM]  9.0-10.0 sec   594 MBytes  4.98 Gbits/sec
[SUM]  0.0-10.0 sec  5.80 GBytes  4.98 Gbits/sec

The bits per second on the wire is still 5200 Mb/s with the new HTB
because the qdisc accounts for packet length using skb->len, which is
smaller than the total bytes on the wire if GSO is used. But that is
for another patch, regardless of how time is accounted.

Many thanks to Eric Dumazet for review and feedback.

Signed-off-by: Vimalkumar <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent afb9718 commit 56b765b
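For context, here is a minimal userspace model of where the 256-entry rate table loses accuracy. This is an illustration only, not kernel or tc code; the exact rounding within a cell varies across tc versions, so round-down is an assumption here.

/* Model of the rate-table quantization: with "mtu 64k", tc picks
 * cell_log = 8 (65536 >> 8 == 256 slots), so every lookup collapses
 * the packet length to a 256-byte cell before computing the time.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t rate_bps = 5000000000ULL;	/* 5 Gbit/s, as in the example */
	int cell_log = 8;			/* assumed: 64k mtu / 256 slots */
	unsigned int len = 1514;		/* ordinary Ethernet frame */

	/* Exact wire time in nanoseconds. */
	uint64_t exact_ns = (uint64_t)len * 8 * 1000000000ULL / rate_bps;

	/* Table-style time: length collapsed to its cell (round-down assumed). */
	unsigned int cell = (len >> cell_log) << cell_log;
	uint64_t table_ns = (uint64_t)cell * 8 * 1000000000ULL / rate_bps;

	printf("exact %llu ns, table %llu ns\n",
	       (unsigned long long)exact_ns, (unsigned long long)table_ns);
	/* Prints: exact 2422 ns, table 2048 ns -- a ~15% undercharge,
	 * which the token bucket turns directly into rate error. */
	return 0;
}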


net/sched/sch_htb.c

Lines changed: 90 additions & 38 deletions
@@ -71,6 +71,12 @@ enum htb_cmode {
 	HTB_CAN_SEND		/* class can send */
 };
 
+struct htb_rate_cfg {
+	u64 rate_bps;
+	u32 mult;
+	u32 shift;
+};
+
 /* interior & leaf nodes; props specific to leaves are marked L: */
 struct htb_class {
 	struct Qdisc_class_common common;
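The new struct htb_rate_cfg keeps the configured rate in bits per second plus a precomputed fixed-point pair (mult, shift), chosen so that length-to-time conversion needs only a multiply and a shift: time_ns = len * 8 * NSEC_PER_SEC / rate_bps ~= (len * mult) >> shift. At 1 Gbit/s, for instance, one byte costs 8 ns; the calibration below settles on shift = 15 and mult = 8 << 15 = 262144, so a 1514-byte packet is charged (1514 * 262144) >> 15 = 12112 ns, exactly 1514 * 8.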
@@ -118,11 +124,11 @@ struct htb_class {
 	int			filter_cnt;
 
 	/* token bucket parameters */
-	struct qdisc_rate_table *rate;	/* rate table of the class itself */
-	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
-	long			buffer, cbuffer;	/* token bucket depth/rate */
+	struct htb_rate_cfg	rate;
+	struct htb_rate_cfg	ceil;
+	s64			buffer, cbuffer;	/* token bucket depth/rate */
 	psched_tdiff_t		mbuffer;	/* max wait time */
-	long			tokens, ctokens;	/* current number of tokens */
+	s64			tokens, ctokens;	/* current number of tokens */
 	psched_time_t		t_c;		/* checkpoint time */
 };
 
@@ -162,6 +168,45 @@ struct htb_sched {
 	struct work_struct work;
 };
 
+static u64 l2t_ns(struct htb_rate_cfg *r, unsigned int len)
+{
+	return ((u64)len * r->mult) >> r->shift;
+}
+
+static void htb_precompute_ratedata(struct htb_rate_cfg *r)
+{
+	u64 factor;
+	u64 mult;
+	int shift;
+
+	r->shift = 0;
+	r->mult = 1;
+	/*
+	 * Calibrate mult, shift so that token counting is accurate
+	 * for smallest packet size (64 bytes).  Token (time in ns) is
+	 * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps.  It will
+	 * work as long as the smallest packet transfer time can be
+	 * accurately represented in nanosec.
+	 */
+	if (r->rate_bps > 0) {
+		/*
+		 * Higher shift gives better accuracy.  Find the largest
+		 * shift such that mult fits in 32 bits.
+		 */
+		for (shift = 0; shift < 16; shift++) {
+			r->shift = shift;
+			factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
+			mult = div64_u64(factor, r->rate_bps);
+			if (mult > UINT_MAX)
+				break;
+		}
+
+		r->shift = shift - 1;
+		factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
+		r->mult = div64_u64(factor, r->rate_bps);
+	}
+}
+
 /* find class in global hash table using given handle */
 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
 {
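The two helpers above can be checked with a standalone userspace harness. This is a sketch: div64_u64 is replaced by plain 64-bit division, kernel types by stdint ones, and main() with its probe lengths is an illustrative addition, not part of the patch.

/* Userspace sketch of l2t_ns() and htb_precompute_ratedata() above. */
#include <stdio.h>
#include <stdint.h>
#include <limits.h>

#define NSEC_PER_SEC 1000000000ULL

struct htb_rate_cfg {
	uint64_t rate_bps;
	uint32_t mult;
	uint32_t shift;
};

static uint64_t l2t_ns(struct htb_rate_cfg *r, unsigned int len)
{
	return ((uint64_t)len * r->mult) >> r->shift;
}

static void htb_precompute_ratedata(struct htb_rate_cfg *r)
{
	uint64_t factor, mult;
	int shift;

	r->shift = 0;
	r->mult = 1;
	if (!r->rate_bps)
		return;
	/* Largest shift (up to 15) whose mult still fits in 32 bits. */
	for (shift = 0; shift < 16; shift++) {
		r->shift = shift;
		factor = 8ULL * NSEC_PER_SEC * (1ULL << r->shift);
		mult = factor / r->rate_bps;
		if (mult > UINT_MAX)
			break;
	}
	r->shift = shift - 1;
	factor = 8ULL * NSEC_PER_SEC * (1ULL << r->shift);
	r->mult = factor / r->rate_bps;
}

int main(void)
{
	struct htb_rate_cfg r = { .rate_bps = 5000000000ULL };	/* 5 Gbit/s */
	unsigned int lens[] = { 64, 1514, 65536 };
	unsigned int i;

	htb_precompute_ratedata(&r);
	printf("mult=%u shift=%u\n", (unsigned)r.mult, (unsigned)r.shift);
	for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
		printf("len %5u -> %llu ns (exact %llu)\n", lens[i],
		       (unsigned long long)l2t_ns(&r, lens[i]),
		       (unsigned long long)(lens[i] * 8ULL * NSEC_PER_SEC /
					    r.rate_bps));
	return 0;
}

At 5 Gbit/s this settles on mult = 52428, shift = 15; a 64-byte packet is charged 102 ns (exact: 102.4) and a 64 KB TSO frame 104,856 ns (exact: 104,857.6), so the error stays within a couple of nanoseconds across the whole range instead of growing with the rate-table cell size.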
@@ -273,7 +318,7 @@ static void htb_add_to_id_tree(struct rb_root *root,
  * already in the queue.
  */
 static void htb_add_to_wait_tree(struct htb_sched *q,
-				 struct htb_class *cl, long delay)
+				 struct htb_class *cl, s64 delay)
 {
 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
 
@@ -441,14 +486,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 		htb_remove_class_from_row(q, cl, mask);
 }
 
-static inline long htb_lowater(const struct htb_class *cl)
+static inline s64 htb_lowater(const struct htb_class *cl)
 {
 	if (htb_hysteresis)
 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
 	else
 		return 0;
 }
-static inline long htb_hiwater(const struct htb_class *cl)
+static inline s64 htb_hiwater(const struct htb_class *cl)
 {
 	if (htb_hysteresis)
 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
@@ -469,9 +514,9 @@ static inline long htb_hiwater(const struct htb_class *cl)
  * mode transitions per time unit.  The speed gain is about 1/6.
  */
 static inline enum htb_cmode
-htb_class_mode(struct htb_class *cl, long *diff)
+htb_class_mode(struct htb_class *cl, s64 *diff)
 {
-	long toks;
+	s64 toks;
 
 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
 		*diff = -toks;
@@ -495,7 +540,7 @@ htb_class_mode(struct htb_class *cl, long *diff)
  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
  */
 static void
-htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
+htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
 {
 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
 
@@ -581,26 +626,26 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_SUCCESS;
 }
 
-static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
 {
-	long toks = diff + cl->tokens;
+	s64 toks = diff + cl->tokens;
 
 	if (toks > cl->buffer)
 		toks = cl->buffer;
-	toks -= (long) qdisc_l2t(cl->rate, bytes);
+	toks -= (s64) l2t_ns(&cl->rate, bytes);
 	if (toks <= -cl->mbuffer)
 		toks = 1 - cl->mbuffer;
 
 	cl->tokens = toks;
 }
 
-static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
 {
-	long toks = diff + cl->ctokens;
+	s64 toks = diff + cl->ctokens;
 
 	if (toks > cl->cbuffer)
 		toks = cl->cbuffer;
-	toks -= (long) qdisc_l2t(cl->ceil, bytes);
+	toks -= (s64) l2t_ns(&cl->ceil, bytes);
 	if (toks <= -cl->mbuffer)
 		toks = 1 - cl->mbuffer;
 
@@ -623,10 +668,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 {
 	int bytes = qdisc_pkt_len(skb);
 	enum htb_cmode old_mode;
-	long diff;
+	s64 diff;
 
 	while (cl) {
-		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 		if (cl->level >= level) {
 			if (cl->level == level)
 				cl->xstats.lends++;
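Token accounting is now pure nanosecond arithmetic: diff is the real time elapsed since the class's last checkpoint (clamped to mbuffer), tokens are capped at the configured burst, and each packet debits its exact transmission time through l2t_ns(). Continuing the 5 Gbit/s numbers from the harness above (mult = 52428, shift = 15), a 1514-byte packet debits (1514 * 52428) >> 15 = 2422 ns against an exact wire time of 2422.4 ns.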
@@ -673,7 +718,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
 	unsigned long stop_at = start + 2;
 	while (time_before(jiffies, stop_at)) {
 		struct htb_class *cl;
-		long diff;
+		s64 diff;
 		struct rb_node *p = rb_first(&q->wait_pq[level]);
 
 		if (!p)
@@ -684,7 +729,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
 			return cl->pq_key;
 
 		htb_safe_rb_erase(p, q->wait_pq + level);
-		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 		htb_change_class_mode(q, cl, &diff);
 		if (cl->cmode != HTB_CAN_SEND)
 			htb_add_to_wait_tree(q, cl, diff);
@@ -834,7 +879,6 @@ static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
 	} while (cl != start);
 
 	if (likely(skb != NULL)) {
-		bstats_update(&cl->bstats, skb);
 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
 		if (cl->un.leaf.deficit[level] < 0) {
 			cl->un.leaf.deficit[level] += cl->quantum;
@@ -871,10 +915,10 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 
 	if (!sch->q.qlen)
 		goto fin;
-	q->now = psched_get_time();
+	q->now = ktime_to_ns(ktime_get());
 	start_at = jiffies;
 
-	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
+	next_event = q->now + 5 * NSEC_PER_SEC;
 
 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
 		/* common case optimization - skip event handler quickly */
@@ -884,7 +928,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 		if (q->now >= q->near_ev_cache[level]) {
 			event = htb_do_events(q, level, start_at);
 			if (!event)
-				event = q->now + PSCHED_TICKS_PER_SEC;
+				event = q->now + NSEC_PER_SEC;
 			q->near_ev_cache[level] = event;
 		} else
 			event = q->near_ev_cache[level];
@@ -903,10 +947,17 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 		}
 	}
 	sch->qstats.overlimits++;
-	if (likely(next_event > q->now))
-		qdisc_watchdog_schedule(&q->watchdog, next_event);
-	else
+	if (likely(next_event > q->now)) {
+		if (!test_bit(__QDISC_STATE_DEACTIVATED,
+			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
+			ktime_t time = ns_to_ktime(next_event);
+			qdisc_throttled(q->watchdog.qdisc);
+			hrtimer_start(&q->watchdog.timer, time,
+				      HRTIMER_MODE_ABS);
+		}
+	} else {
 		schedule_work(&q->work);
+	}
 fin:
 	return skb;
 }
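Because next_event is now an absolute nanosecond timestamp rather than a psched tick count, the dequeue path arms the watchdog hrtimer directly with ns_to_ktime() instead of going through qdisc_watchdog_schedule(), which expects a psched_time_t; the __QDISC_STATE_DEACTIVATED test reproduces the guard that helper applies internally.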
@@ -1082,9 +1133,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 
 	memset(&opt, 0, sizeof(opt));
 
-	opt.rate = cl->rate->rate;
+	opt.rate.rate = cl->rate.rate_bps >> 3;
 	opt.buffer = cl->buffer;
-	opt.ceil = cl->ceil->rate;
+	opt.ceil.rate = cl->ceil.rate_bps >> 3;
 	opt.cbuffer = cl->cbuffer;
 	opt.quantum = cl->quantum;
 	opt.prio = cl->prio;
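Userspace still expects tc_ratespec.rate in bytes per second, hence the >> 3 on the way out: the 5 Gbit/s class from the example is reported as 625,000,000 bytes/s.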
@@ -1203,9 +1254,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 		qdisc_destroy(cl->un.leaf.q);
 	}
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
-	qdisc_put_rtab(cl->rate);
-	qdisc_put_rtab(cl->ceil);
-
 	tcf_destroy_chain(&cl->filter_list);
 	kfree(cl);
 }
@@ -1460,12 +1508,16 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 	cl->buffer = hopt->buffer;
 	cl->cbuffer = hopt->cbuffer;
-	if (cl->rate)
-		qdisc_put_rtab(cl->rate);
-	cl->rate = rtab;
-	if (cl->ceil)
-		qdisc_put_rtab(cl->ceil);
-	cl->ceil = ctab;
+
+	cl->rate.rate_bps = (u64)rtab->rate.rate << 3;
+	cl->ceil.rate_bps = (u64)ctab->rate.rate << 3;
+
+	htb_precompute_ratedata(&cl->rate);
+	htb_precompute_ratedata(&cl->ceil);
+
+	cl->buffer = hopt->buffer << PSCHED_SHIFT;
+	cl->cbuffer = hopt->buffer << PSCHED_SHIFT;
+
 	sch_tree_unlock(sch);
 
 	qdisc_class_hash_grow(sch, &q->clhash);
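One more unit change hides at the end: hopt->buffer arrives from userspace in psched ticks, and with tokens now counted in nanoseconds it is shifted up by PSCHED_SHIFT (6 at the time of this patch, i.e. one tick = 64 ns). A burst of 15625 ticks, for example, becomes 15625 << 6 = 1,000,000 ns, a 1 ms bucket.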
