/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES	8
-#define HYSTART_DELAY_MIN	(4U<<3)
-#define HYSTART_DELAY_MAX	(16U<<3)
+#define HYSTART_DELAY_MIN	(4000U)	/* 4 ms */
+#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
#define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

static int fast_convergence __read_mostly = 1;
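/* Illustrative note (not part of the patch): the old bounds were stored in
 * 1/8-millisecond fixed point, the new ones in plain microseconds, so both
 * encode the same 4 ms and 16 ms limits. A hypothetical helper showing the
 * conversion between the two representations:
 */
static inline u32 hystart_old_to_usec(u32 v)	/* v in "msec << 3" units */
{
	return v * 125U;	/* 1000 / 8; (4U<<3) -> 4000, (16U<<3) -> 16000 */
}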
@@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1;
static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
-static int hystart_ack_delta __read_mostly = 2;
+static int hystart_ack_delta_us __read_mostly = 2000;

static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
@@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
		 " 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
-module_param(hystart_ack_delta, int, 0644);
-MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
+module_param(hystart_ack_delta_us, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");

/* BIC TCP Parameters */
struct bictcp {
@@ -89,7 +89,7 @@ struct bictcp {
	u32	bic_origin_point;/* origin point of bic function */
	u32	bic_K;		/* time to origin point
				   from the beginning of the current epoch */
-	u32	delay_min;	/* min delay (msec << 3) */
+	u32	delay_min;	/* min delay (usec) */
	u32	epoch_start;	/* beginning of an epoch */
	u32	ack_cnt;	/* number of acks */
	u32	tcp_cwnd;	/* estimated tcp cwnd */
@@ -117,23 +117,19 @@ static inline void bictcp_reset(struct bictcp *ca)
	ca->found = 0;
}

-static inline u32 bictcp_clock(void)
+static inline u32 bictcp_clock_us(const struct sock *sk)
{
-#if HZ < 1000
-	return ktime_to_ms(ktime_get_real());
-#else
-	return jiffies_to_msecs(jiffies);
-#endif
+	return tcp_sk(sk)->tcp_mstamp;
}

static inline void bictcp_hystart_reset(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

-	ca->round_start = ca->last_ack = bictcp_clock();
+	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
	ca->end_seq = tp->snd_nxt;
-	ca->curr_rtt = 0;
+	ca->curr_rtt = ~0U;
	ca->sample_cnt = 0;
}
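/* Illustrative sketch (not part of the patch): seeding ca->curr_rtt with ~0U
 * turns the per-round minimum-RTT tracking into a plain "keep the smaller
 * value" update, which is why the curr_rtt == 0 special case can be dropped
 * later in hystart_update(). Hypothetical helpers showing the pattern:
 */
static inline void round_rtt_reset(u32 *curr_rtt)
{
	*curr_rtt = ~0U;		/* "no sample yet" */
}

static inline void round_rtt_sample(u32 *curr_rtt, u32 rtt_us)
{
	if (*curr_rtt > rtt_us)		/* the first sample always wins vs ~0U */
		*curr_rtt = rtt_us;
}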
@@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
	 */

	t = (s32)(tcp_jiffies32 - ca->epoch_start);
-	t += msecs_to_jiffies(ca->delay_min >> 3);
+	t += usecs_to_jiffies(ca->delay_min);
	/* change the unit from HZ to bictcp_HZ */
	t <<= BICTCP_HZ;
	do_div(t, HZ);
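/* Unit check (illustrative, not part of the patch): since delay_min is now
 * kept in usec, usecs_to_jiffies(delay_min) adds the same amount of time to t
 * that msecs_to_jiffies(delay_min >> 3) did. For a hypothetical 20 ms minimum RTT:
 */
static inline unsigned long delay_min_jiffies_example(void)
{
	u32 delay_min_us = 20000U;		/* 20 ms in the new usec encoding */

	/* old code: msecs_to_jiffies((20U << 3) >> 3) == msecs_to_jiffies(20) */
	return usecs_to_jiffies(delay_min_us);	/* likewise 20 ms worth of jiffies */
}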
@@ -380,18 +376,26 @@ static void hystart_update(struct sock *sk, u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
-
-	if (ca->found & hystart_detect)
-		return;
+	u32 threshold;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
-		u32 now = bictcp_clock();
+		u32 now = bictcp_clock_us(sk);

		/* first detection parameter - ack-train detection */
-		if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			ca->last_ack = now;
-			if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
-				ca->found |= HYSTART_ACK_TRAIN;
+
+			threshold = ca->delay_min;
+			/* Hystart ack train triggers if we get ack past
+			 * ca->delay_min/2.
+			 * Pacing might have delayed packets up to RTT/2
+			 * during slow start.
+			 */
+			if (sk->sk_pacing_status == SK_PACING_NONE)
+				threshold >>= 1;
+
+			if ((s32)(now - ca->round_start) > threshold) {
+				ca->found = 1;
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
				NET_ADD_STATS(sock_net(sk),
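/* Worked example (illustrative, not part of the patch): with a hypothetical
 * ca->delay_min of 20000 us, the ack train now has to stretch past the full
 * 20 ms on a paced flow, or past 10 ms when pacing is off, before the train
 * detector fires; the old code always compared against half of delay_min.
 */
static u32 ack_train_threshold_example(bool paced)
{
	u32 delay_min = 20000;		/* 20 ms minimum RTT, in usec */
	u32 threshold = delay_min;	/* paced flow: full delay_min (20 ms) */

	if (!paced)
		threshold >>= 1;	/* unpaced flow: delay_min / 2 (10 ms) */
	return threshold;
}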
@@ -405,14 +409,14 @@ static void hystart_update(struct sock *sk, u32 delay)
	if (hystart_detect & HYSTART_DELAY) {
		/* obtain the minimum delay of more than sampling packets */
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
-			if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
+			if (ca->curr_rtt > delay)
				ca->curr_rtt = delay;

			ca->sample_cnt++;
		} else {
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
-				ca->found |= HYSTART_DELAY;
+				ca->found = 1;
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
				NET_ADD_STATS(sock_net(sk),
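/* Worked example (illustrative, not part of the patch): the delay test exits
 * slow start once the per-round minimum RTT exceeds
 * delay_min + clamp(delay_min / 8, 4 ms, 16 ms). With a hypothetical
 * delay_min of 20000 us:
 */
static u32 hystart_delay_exit_example(void)
{
	u32 delay_min = 20000;			/* 20 ms, in usec */
	u32 thresh = clamp(delay_min >> 3,	/* 2500 us ... */
			   HYSTART_DELAY_MIN,	/* ... clamped up to 4000 us */
			   HYSTART_DELAY_MAX);

	return delay_min + thresh;		/* 24000 us = 24 ms */
}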
@@ -424,9 +428,6 @@ static void hystart_update(struct sock *sk, u32 delay)
	}
}

-/* Track delayed acknowledgment ratio using sliding window
- * ratio = (15*ratio + sample) / 16
- */
static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
@@ -441,16 +442,35 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
	if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
		return;

-	delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
+	delay = sample->rtt_us;
	if (delay == 0)
		delay = 1;

	/* first time call or link delay decreases */
-	if (ca->delay_min == 0 || ca->delay_min > delay)
-		ca->delay_min = delay;
+	if (ca->delay_min == 0 || ca->delay_min > delay) {
+		unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
+
+		/* Account for TSO/GRO delays.
+		 * Otherwise short RTT flows could get too small ssthresh,
+		 * since during slow start we begin with small TSO packets
+		 * and could lower ca->delay_min too much.
+		 * Ideally even with a very small RTT we would like to have
+		 * at least one TSO packet being sent and received by GRO,
+		 * and another one in qdisc layer.
+		 * We apply another 100% factor because @rate is doubled at
+		 * this point.
+		 * We cap the cushion to 1ms.
+		 */
+		if (rate)
+			delay += min_t(u64, USEC_PER_MSEC,
+				       div64_ul((u64)GSO_MAX_SIZE *
+						4 * USEC_PER_SEC, rate));
+		if (ca->delay_min == 0 || ca->delay_min > delay)
+			ca->delay_min = delay;
+	}

	/* hystart triggers when cwnd is larger than some threshold */
-	if (hystart && tcp_in_slow_start(tp) &&
+	if (!ca->found && hystart && tcp_in_slow_start(tp) &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}
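/* Worked example (illustrative, not part of the patch; rates are hypothetical):
 * the cushion added to a fresh delay_min sample is
 * min(1 ms, GSO_MAX_SIZE * 4 * USEC_PER_SEC / sk_pacing_rate).
 * At 2.5 GB/s (roughly a 10 Gbit flow with the doubled slow-start pacing rate)
 * that is about 104 us; at 125 MB/s (1 Gbit) the raw value (~2097 us) is
 * capped at USEC_PER_MSEC.
 */
static u64 tso_gro_cushion_example(unsigned long rate)	/* bytes per second */
{
	if (!rate)
		return 0;
	return min_t(u64, USEC_PER_MSEC,
		     div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
}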