Skip to content

Commit 36a7886

Browse files
committed
Merge branch 'tcp_cubic-various-fixes'
Eric Dumazet says: ==================== tcp_cubic: various fixes This patch series converts tcp_cubic to usec clock resolution for Hystart logic. This makes Hystart more relevant for data-center flows. Prior to this series, Hystart was not kicking, or was kicking without good reason, since the 1ms clock was too coarse. Last patch also fixes an issue with Hystart vs TCP pacing. v2: removed a last-minute debug chunk from last patch ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents 2bbc078 + ede656e commit 36a7886

File tree

1 file changed

+51
-31
lines changed

1 file changed

+51
-31
lines changed

net/ipv4/tcp_cubic.c

Lines changed: 51 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -40,8 +40,8 @@
4040

4141
/* Number of delay samples for detecting the increase of delay */
4242
#define HYSTART_MIN_SAMPLES 8
43-
#define HYSTART_DELAY_MIN (4U<<3)
44-
#define HYSTART_DELAY_MAX (16U<<3)
43+
#define HYSTART_DELAY_MIN (4000U) /* 4 ms */
44+
#define HYSTART_DELAY_MAX (16000U) /* 16 ms */
4545
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
4646

4747
static int fast_convergence __read_mostly = 1;
@@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1;
5353
static int hystart __read_mostly = 1;
5454
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
5555
static int hystart_low_window __read_mostly = 16;
56-
static int hystart_ack_delta __read_mostly = 2;
56+
static int hystart_ack_delta_us __read_mostly = 2000;
5757

5858
static u32 cube_rtt_scale __read_mostly;
5959
static u32 beta_scale __read_mostly;
@@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
7777
" 1: packet-train 2: delay 3: both packet-train and delay");
7878
module_param(hystart_low_window, int, 0644);
7979
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
80-
module_param(hystart_ack_delta, int, 0644);
81-
MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
80+
module_param(hystart_ack_delta_us, int, 0644);
81+
MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");
8282

8383
/* BIC TCP Parameters */
8484
struct bictcp {
@@ -89,7 +89,7 @@ struct bictcp {
8989
u32 bic_origin_point;/* origin point of bic function */
9090
u32 bic_K; /* time to origin point
9191
from the beginning of the current epoch */
92-
u32 delay_min; /* min delay (msec << 3) */
92+
u32 delay_min; /* min delay (usec) */
9393
u32 epoch_start; /* beginning of an epoch */
9494
u32 ack_cnt; /* number of acks */
9595
u32 tcp_cwnd; /* estimated tcp cwnd */
@@ -117,23 +117,19 @@ static inline void bictcp_reset(struct bictcp *ca)
117117
ca->found = 0;
118118
}
119119

120-
static inline u32 bictcp_clock(void)
120+
static inline u32 bictcp_clock_us(const struct sock *sk)
121121
{
122-
#if HZ < 1000
123-
return ktime_to_ms(ktime_get_real());
124-
#else
125-
return jiffies_to_msecs(jiffies);
126-
#endif
122+
return tcp_sk(sk)->tcp_mstamp;
127123
}
128124

129125
static inline void bictcp_hystart_reset(struct sock *sk)
130126
{
131127
struct tcp_sock *tp = tcp_sk(sk);
132128
struct bictcp *ca = inet_csk_ca(sk);
133129

134-
ca->round_start = ca->last_ack = bictcp_clock();
130+
ca->round_start = ca->last_ack = bictcp_clock_us(sk);
135131
ca->end_seq = tp->snd_nxt;
136-
ca->curr_rtt = 0;
132+
ca->curr_rtt = ~0U;
137133
ca->sample_cnt = 0;
138134
}
139135

@@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
276272
*/
277273

278274
t = (s32)(tcp_jiffies32 - ca->epoch_start);
279-
t += msecs_to_jiffies(ca->delay_min >> 3);
275+
t += usecs_to_jiffies(ca->delay_min);
280276
/* change the unit from HZ to bictcp_HZ */
281277
t <<= BICTCP_HZ;
282278
do_div(t, HZ);
@@ -380,18 +376,26 @@ static void hystart_update(struct sock *sk, u32 delay)
380376
{
381377
struct tcp_sock *tp = tcp_sk(sk);
382378
struct bictcp *ca = inet_csk_ca(sk);
383-
384-
if (ca->found & hystart_detect)
385-
return;
379+
u32 threshold;
386380

387381
if (hystart_detect & HYSTART_ACK_TRAIN) {
388-
u32 now = bictcp_clock();
382+
u32 now = bictcp_clock_us(sk);
389383

390384
/* first detection parameter - ack-train detection */
391-
if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
385+
if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
392386
ca->last_ack = now;
393-
if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
394-
ca->found |= HYSTART_ACK_TRAIN;
387+
388+
threshold = ca->delay_min;
389+
/* Hystart ack train triggers if we get ack past
390+
* ca->delay_min/2.
391+
* Pacing might have delayed packets up to RTT/2
392+
* during slow start.
393+
*/
394+
if (sk->sk_pacing_status == SK_PACING_NONE)
395+
threshold >>= 1;
396+
397+
if ((s32)(now - ca->round_start) > threshold) {
398+
ca->found = 1;
395399
NET_INC_STATS(sock_net(sk),
396400
LINUX_MIB_TCPHYSTARTTRAINDETECT);
397401
NET_ADD_STATS(sock_net(sk),
@@ -405,14 +409,14 @@ static void hystart_update(struct sock *sk, u32 delay)
405409
if (hystart_detect & HYSTART_DELAY) {
406410
/* obtain the minimum delay of more than sampling packets */
407411
if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
408-
if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
412+
if (ca->curr_rtt > delay)
409413
ca->curr_rtt = delay;
410414

411415
ca->sample_cnt++;
412416
} else {
413417
if (ca->curr_rtt > ca->delay_min +
414418
HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
415-
ca->found |= HYSTART_DELAY;
419+
ca->found = 1;
416420
NET_INC_STATS(sock_net(sk),
417421
LINUX_MIB_TCPHYSTARTDELAYDETECT);
418422
NET_ADD_STATS(sock_net(sk),
@@ -424,9 +428,6 @@ static void hystart_update(struct sock *sk, u32 delay)
424428
}
425429
}
426430

427-
/* Track delayed acknowledgment ratio using sliding window
428-
* ratio = (15*ratio + sample) / 16
429-
*/
430431
static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
431432
{
432433
const struct tcp_sock *tp = tcp_sk(sk);
@@ -441,16 +442,35 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
441442
if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
442443
return;
443444

444-
delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
445+
delay = sample->rtt_us;
445446
if (delay == 0)
446447
delay = 1;
447448

448449
/* first time call or link delay decreases */
449-
if (ca->delay_min == 0 || ca->delay_min > delay)
450-
ca->delay_min = delay;
450+
if (ca->delay_min == 0 || ca->delay_min > delay) {
451+
unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
452+
453+
/* Account for TSO/GRO delays.
454+
* Otherwise short RTT flows could get too small ssthresh,
455+
* since during slow start we begin with small TSO packets
456+
* and could lower ca->delay_min too much.
457+
* Ideally even with a very small RTT we would like to have
458+
* at least one TSO packet being sent and received by GRO,
459+
* and another one in qdisc layer.
460+
* We apply another 100% factor because @rate is doubled at
461+
* this point.
462+
* We cap the cushion to 1ms.
463+
*/
464+
if (rate)
465+
delay += min_t(u64, USEC_PER_MSEC,
466+
div64_ul((u64)GSO_MAX_SIZE *
467+
4 * USEC_PER_SEC, rate));
468+
if (ca->delay_min == 0 || ca->delay_min > delay)
469+
ca->delay_min = delay;
470+
}
451471

452472
/* hystart triggers when cwnd is larger than some threshold */
453-
if (hystart && tcp_in_slow_start(tp) &&
473+
if (!ca->found && hystart && tcp_in_slow_start(tp) &&
454474
tp->snd_cwnd >= hystart_low_window)
455475
hystart_update(sk, delay);
456476
}

0 commit comments

Comments (0)