Skip to content

Commit c9d0c6e

Browse files
committed
Merge branch 'pie-next'
Leslie Monis says: ==================== net: sched: pie: align PIE implementation with RFC 8033 The current implementation of the PIE queuing discipline is according to the IETF draft [http://tools.ietf.org/html/draft-pan-aqm-pie-00] and the paper [PIE: A Lightweight Control Scheme to Address the Bufferbloat Problem]. However, a lot of necessary modifications and enhancements have been proposed in RFC 8033, which have not yet been incorporated in the source code of Linux. This patch series helps in achieving the same. Performance tests carried out using Flent [https://flent.org/] Changes from v2 to v3: - Used div_u64() instead of direct division after explicit type casting as recommended by David Changes from v1 to v2: - Excluded the patch setting PIE dynamically active/inactive as the test results were unsatisfactory - Fixed a scaling issue when adding more auto-tuning cases which caused local variables to underflow - Changed the long if/else chain to a loop as suggested by Stephen - Changed the position of the accu_prob variable in the pie_vars structure as recommended by Stephen ==================== Acked-by: Dave Taht <[email protected]> Acked-by: Jamal Hadi Salim <[email protected]> Signed-off-by: David S. Miller <[email protected]>
2 parents 7884406 + c9d2ac5 commit c9d0c6e

File tree

2 files changed

+66
-43
lines changed

2 files changed

+66
-43
lines changed

include/uapi/linux/pkt_sched.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -954,7 +954,7 @@ enum {
954954
#define TCA_PIE_MAX (__TCA_PIE_MAX - 1)
955955

956956
struct tc_pie_xstats {
957-
__u32 prob; /* current probability */
957+
__u64 prob; /* current probability */
958958
__u32 delay; /* current delay in ms */
959959
__u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */
960960
__u32 packets_in; /* total number of packets enqueued */

net/sched/sch_pie.c

Lines changed: 65 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,7 @@
1717
* University of Oslo, Norway.
1818
*
1919
* References:
20-
* IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
21-
* IEEE Conference on High Performance Switching and Routing 2013 :
22-
* "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
20+
* RFC 8033: https://tools.ietf.org/html/rfc8033
2321
*/
2422

2523
#include <linux/module.h>
@@ -31,9 +29,9 @@
3129
#include <net/pkt_sched.h>
3230
#include <net/inet_ecn.h>
3331

34-
#define QUEUE_THRESHOLD 10000
32+
#define QUEUE_THRESHOLD 16384
3533
#define DQCOUNT_INVALID -1
36-
#define MAX_PROB 0xffffffff
34+
#define MAX_PROB 0xffffffffffffffff
3735
#define PIE_SCALE 8
3836

3937
/* parameters used */
@@ -49,14 +47,16 @@ struct pie_params {
4947

5048
/* variables used */
5149
struct pie_vars {
52-
u32 prob; /* probability but scaled by u32 limit. */
50+
u64 prob; /* probability but scaled by u64 limit. */
5351
psched_time_t burst_time;
5452
psched_time_t qdelay;
5553
psched_time_t qdelay_old;
5654
u64 dq_count; /* measured in bytes */
5755
psched_time_t dq_tstamp; /* drain rate */
56+
u64 accu_prob; /* accumulated drop probability */
5857
u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
5958
u32 qlen_old; /* in bytes */
59+
u8 accu_prob_overflows; /* overflows of accu_prob */
6060
};
6161

6262
/* statistics gathering */
@@ -81,26 +81,28 @@ static void pie_params_init(struct pie_params *params)
8181
{
8282
params->alpha = 2;
8383
params->beta = 20;
84-
params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */
84+
params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */
8585
params->limit = 1000; /* default of 1000 packets */
86-
params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */
86+
params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */
8787
params->ecn = false;
8888
params->bytemode = false;
8989
}
9090

9191
static void pie_vars_init(struct pie_vars *vars)
9292
{
9393
vars->dq_count = DQCOUNT_INVALID;
94+
vars->accu_prob = 0;
9495
vars->avg_dq_rate = 0;
95-
/* default of 100 ms in pschedtime */
96-
vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
96+
/* default of 150 ms in pschedtime */
97+
vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC);
98+
vars->accu_prob_overflows = 0;
9799
}
98100

99101
static bool drop_early(struct Qdisc *sch, u32 packet_size)
100102
{
101103
struct pie_sched_data *q = qdisc_priv(sch);
102-
u32 rnd;
103-
u32 local_prob = q->vars.prob;
104+
u64 rnd;
105+
u64 local_prob = q->vars.prob;
104106
u32 mtu = psched_mtu(qdisc_dev(sch));
105107

106108
/* If there is still burst allowance left skip random early drop */
@@ -124,13 +126,33 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
124126
* probability. Smaller packets will have lower drop prob in this case
125127
*/
126128
if (q->params.bytemode && packet_size <= mtu)
127-
local_prob = (local_prob / mtu) * packet_size;
129+
local_prob = (u64)packet_size * div_u64(local_prob, mtu);
128130
else
129131
local_prob = q->vars.prob;
130132

131-
rnd = prandom_u32();
132-
if (rnd < local_prob)
133+
if (local_prob == 0) {
134+
q->vars.accu_prob = 0;
135+
q->vars.accu_prob_overflows = 0;
136+
}
137+
138+
if (local_prob > MAX_PROB - q->vars.accu_prob)
139+
q->vars.accu_prob_overflows++;
140+
141+
q->vars.accu_prob += local_prob;
142+
143+
if (q->vars.accu_prob_overflows == 0 &&
144+
q->vars.accu_prob < (MAX_PROB / 100) * 85)
145+
return false;
146+
if (q->vars.accu_prob_overflows == 8 &&
147+
q->vars.accu_prob >= MAX_PROB / 2)
148+
return true;
149+
150+
prandom_bytes(&rnd, 8);
151+
if (rnd < local_prob) {
152+
q->vars.accu_prob = 0;
153+
q->vars.accu_prob_overflows = 0;
133154
return true;
155+
}
134156

135157
return false;
136158
}
@@ -168,6 +190,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
168190

169191
out:
170192
q->stats.dropped++;
193+
q->vars.accu_prob = 0;
194+
q->vars.accu_prob_overflows = 0;
171195
return qdisc_drop(skb, sch, to_free);
172196
}
173197

@@ -317,9 +341,10 @@ static void calculate_probability(struct Qdisc *sch)
317341
u32 qlen = sch->qstats.backlog; /* queue size in bytes */
318342
psched_time_t qdelay = 0; /* in pschedtime */
319343
psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
320-
s32 delta = 0; /* determines the change in probability */
321-
u32 oldprob;
322-
u32 alpha, beta;
344+
s64 delta = 0; /* determines the change in probability */
345+
u64 oldprob;
346+
u64 alpha, beta;
347+
u32 power;
323348
bool update_prob = true;
324349

325350
q->vars.qdelay_old = q->vars.qdelay;
@@ -339,38 +364,36 @@ static void calculate_probability(struct Qdisc *sch)
339364
* value for alpha as 0.125. In this implementation, we use values 0-32
340365
* passed from user space to represent this. Also, alpha and beta have
341366
* unit of HZ and need to be scaled before they can be used to update
342-
* probability. alpha/beta are updated locally below by 1) scaling them
343-
* appropriately 2) scaling down by 16 to come to 0-2 range.
344-
* Please see paper for details.
345-
*
346-
* We scale alpha and beta differently depending on whether we are in
347-
* light, medium or high dropping mode.
367+
* probability. alpha/beta are updated locally below by scaling down
368+
* by 16 to come to 0-2 range.
348369
*/
349-
if (q->vars.prob < MAX_PROB / 100) {
350-
alpha =
351-
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
352-
beta =
353-
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
354-
} else if (q->vars.prob < MAX_PROB / 10) {
355-
alpha =
356-
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
357-
beta =
358-
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
359-
} else {
360-
alpha =
361-
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
362-
beta =
363-
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
370+
alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
371+
beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
372+
373+
/* We scale alpha and beta differently depending on how heavy the
374+
* congestion is. Please see RFC 8033 for details.
375+
*/
376+
if (q->vars.prob < MAX_PROB / 10) {
377+
alpha >>= 1;
378+
beta >>= 1;
379+
380+
power = 100;
381+
while (q->vars.prob < div_u64(MAX_PROB, power) &&
382+
power <= 1000000) {
383+
alpha >>= 2;
384+
beta >>= 2;
385+
power *= 10;
386+
}
364387
}
365388

366389
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
367-
delta += alpha * ((qdelay - q->params.target));
368-
delta += beta * ((qdelay - qdelay_old));
390+
delta += alpha * (u64)(qdelay - q->params.target);
391+
delta += beta * (u64)(qdelay - qdelay_old);
369392

370393
oldprob = q->vars.prob;
371394

372395
/* to ensure we increase probability in steps of no more than 2% */
373-
if (delta > (s32)(MAX_PROB / (100 / 2)) &&
396+
if (delta > (s64)(MAX_PROB / (100 / 2)) &&
374397
q->vars.prob >= MAX_PROB / 10)
375398
delta = (MAX_PROB / 100) * 2;
376399

0 commit comments

Comments
 (0)