17
17
* University of Oslo, Norway.
18
18
*
19
19
* References:
20
- * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
21
- * IEEE Conference on High Performance Switching and Routing 2013 :
22
- * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
20
+ * RFC 8033: https://tools.ietf.org/html/rfc8033
23
21
*/
24
22
25
23
#include <linux/module.h>
31
29
#include <net/pkt_sched.h>
32
30
#include <net/inet_ecn.h>
33
31
34
- #define QUEUE_THRESHOLD 10000
32
+ #define QUEUE_THRESHOLD 16384
35
33
#define DQCOUNT_INVALID -1
36
- #define MAX_PROB 0xffffffff
34
+ #define MAX_PROB 0xffffffffffffffff
37
35
#define PIE_SCALE 8
38
36
39
37
/* parameters used */
@@ -49,14 +47,16 @@ struct pie_params {
49
47
50
48
/* variables used */
51
49
struct pie_vars {
52
- u32 prob ; /* probability but scaled by u32 limit. */
50
+ u64 prob ; /* probability but scaled by u64 limit. */
53
51
psched_time_t burst_time ;
54
52
psched_time_t qdelay ;
55
53
psched_time_t qdelay_old ;
56
54
u64 dq_count ; /* measured in bytes */
57
55
psched_time_t dq_tstamp ; /* drain rate */
56
+ u64 accu_prob ; /* accumulated drop probability */
58
57
u32 avg_dq_rate ; /* bytes per pschedtime tick,scaled */
59
58
u32 qlen_old ; /* in bytes */
59
+ u8 accu_prob_overflows ; /* overflows of accu_prob */
60
60
};
61
61
62
62
/* statistics gathering */
@@ -81,26 +81,28 @@ static void pie_params_init(struct pie_params *params)
81
81
{
82
82
params -> alpha = 2 ;
83
83
params -> beta = 20 ;
84
- params -> tupdate = usecs_to_jiffies (30 * USEC_PER_MSEC ); /* 30 ms */
84
+ params -> tupdate = usecs_to_jiffies (15 * USEC_PER_MSEC ); /* 15 ms */
85
85
params -> limit = 1000 ; /* default of 1000 packets */
86
- params -> target = PSCHED_NS2TICKS (20 * NSEC_PER_MSEC ); /* 20 ms */
86
+ params -> target = PSCHED_NS2TICKS (15 * NSEC_PER_MSEC ); /* 15 ms */
87
87
params -> ecn = false;
88
88
params -> bytemode = false;
89
89
}
90
90
91
91
static void pie_vars_init (struct pie_vars * vars )
92
92
{
93
93
vars -> dq_count = DQCOUNT_INVALID ;
94
+ vars -> accu_prob = 0 ;
94
95
vars -> avg_dq_rate = 0 ;
95
- /* default of 100 ms in pschedtime */
96
- vars -> burst_time = PSCHED_NS2TICKS (100 * NSEC_PER_MSEC );
96
+ /* default of 150 ms in pschedtime */
97
+ vars -> burst_time = PSCHED_NS2TICKS (150 * NSEC_PER_MSEC );
98
+ vars -> accu_prob_overflows = 0 ;
97
99
}
98
100
99
101
static bool drop_early (struct Qdisc * sch , u32 packet_size )
100
102
{
101
103
struct pie_sched_data * q = qdisc_priv (sch );
102
- u32 rnd ;
103
- u32 local_prob = q -> vars .prob ;
104
+ u64 rnd ;
105
+ u64 local_prob = q -> vars .prob ;
104
106
u32 mtu = psched_mtu (qdisc_dev (sch ));
105
107
106
108
/* If there is still burst allowance left skip random early drop */
@@ -124,13 +126,33 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
124
126
* probability. Smaller packets will have lower drop prob in this case
125
127
*/
126
128
if (q -> params .bytemode && packet_size <= mtu )
127
- local_prob = (local_prob / mtu ) * packet_size ;
129
+ local_prob = (u64 ) packet_size * div_u64 ( local_prob , mtu ) ;
128
130
else
129
131
local_prob = q -> vars .prob ;
130
132
131
- rnd = prandom_u32 ();
132
- if (rnd < local_prob )
133
+ if (local_prob == 0 ) {
134
+ q -> vars .accu_prob = 0 ;
135
+ q -> vars .accu_prob_overflows = 0 ;
136
+ }
137
+
138
+ if (local_prob > MAX_PROB - q -> vars .accu_prob )
139
+ q -> vars .accu_prob_overflows ++ ;
140
+
141
+ q -> vars .accu_prob += local_prob ;
142
+
143
+ if (q -> vars .accu_prob_overflows == 0 &&
144
+ q -> vars .accu_prob < (MAX_PROB / 100 ) * 85 )
145
+ return false;
146
+ if (q -> vars .accu_prob_overflows == 8 &&
147
+ q -> vars .accu_prob >= MAX_PROB / 2 )
148
+ return true;
149
+
150
+ prandom_bytes (& rnd , 8 );
151
+ if (rnd < local_prob ) {
152
+ q -> vars .accu_prob = 0 ;
153
+ q -> vars .accu_prob_overflows = 0 ;
133
154
return true;
155
+ }
134
156
135
157
return false;
136
158
}
@@ -168,6 +190,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
168
190
169
191
out :
170
192
q -> stats .dropped ++ ;
193
+ q -> vars .accu_prob = 0 ;
194
+ q -> vars .accu_prob_overflows = 0 ;
171
195
return qdisc_drop (skb , sch , to_free );
172
196
}
173
197
@@ -317,9 +341,10 @@ static void calculate_probability(struct Qdisc *sch)
317
341
u32 qlen = sch -> qstats .backlog ; /* queue size in bytes */
318
342
psched_time_t qdelay = 0 ; /* in pschedtime */
319
343
psched_time_t qdelay_old = q -> vars .qdelay ; /* in pschedtime */
320
- s32 delta = 0 ; /* determines the change in probability */
321
- u32 oldprob ;
322
- u32 alpha , beta ;
344
+ s64 delta = 0 ; /* determines the change in probability */
345
+ u64 oldprob ;
346
+ u64 alpha , beta ;
347
+ u32 power ;
323
348
bool update_prob = true;
324
349
325
350
q -> vars .qdelay_old = q -> vars .qdelay ;
@@ -339,38 +364,36 @@ static void calculate_probability(struct Qdisc *sch)
339
364
* value for alpha as 0.125. In this implementation, we use values 0-32
340
365
* passed from user space to represent this. Also, alpha and beta have
341
366
* unit of HZ and need to be scaled before they can be used to update
342
- * probability. alpha/beta are updated locally below by 1) scaling them
343
- * appropriately 2) scaling down by 16 to come to 0-2 range.
344
- * Please see paper for details.
345
- *
346
- * We scale alpha and beta differently depending on whether we are in
347
- * light, medium or high dropping mode.
367
+ * probability. alpha/beta are updated locally below by scaling down
368
+ * by 16 to come to 0-2 range.
348
369
*/
349
- if (q -> vars .prob < MAX_PROB / 100 ) {
350
- alpha =
351
- (q -> params .alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC )) >> 7 ;
352
- beta =
353
- (q -> params .beta * (MAX_PROB / PSCHED_TICKS_PER_SEC )) >> 7 ;
354
- } else if (q -> vars .prob < MAX_PROB / 10 ) {
355
- alpha =
356
- (q -> params .alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC )) >> 5 ;
357
- beta =
358
- (q -> params .beta * (MAX_PROB / PSCHED_TICKS_PER_SEC )) >> 5 ;
359
- } else {
360
- alpha =
361
- (q -> params .alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC )) >> 4 ;
362
- beta =
363
- (q -> params .beta * (MAX_PROB / PSCHED_TICKS_PER_SEC )) >> 4 ;
370
+ alpha = ((u64 )q -> params .alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC )) >> 4 ;
371
+ beta = ((u64 )q -> params .beta * (MAX_PROB / PSCHED_TICKS_PER_SEC )) >> 4 ;
372
+
373
+ /* We scale alpha and beta differently depending on how heavy the
374
+ * congestion is. Please see RFC 8033 for details.
375
+ */
376
+ if (q -> vars .prob < MAX_PROB / 10 ) {
377
+ alpha >>= 1 ;
378
+ beta >>= 1 ;
379
+
380
+ power = 100 ;
381
+ while (q -> vars .prob < div_u64 (MAX_PROB , power ) &&
382
+ power <= 1000000 ) {
383
+ alpha >>= 2 ;
384
+ beta >>= 2 ;
385
+ power *= 10 ;
386
+ }
364
387
}
365
388
366
389
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
367
- delta += alpha * ((qdelay - q -> params .target ) );
368
- delta += beta * ((qdelay - qdelay_old ) );
390
+ delta += alpha * (u64 ) (qdelay - q -> params .target );
391
+ delta += beta * (u64 ) (qdelay - qdelay_old );
369
392
370
393
oldprob = q -> vars .prob ;
371
394
372
395
/* to ensure we increase probability in steps of no more than 2% */
373
- if (delta > (s32 )(MAX_PROB / (100 / 2 )) &&
396
+ if (delta > (s64 )(MAX_PROB / (100 / 2 )) &&
374
397
q -> vars .prob >= MAX_PROB / 10 )
375
398
delta = (MAX_PROB / 100 ) * 2 ;
376
399
0 commit comments