Skip to content

Commit 740b0f1

Browse files
edumazet authored and davem330 committed
tcp: switch rtt estimations to usec resolution
Upcoming congestion controls for TCP require usec resolution for RTT estimations. Millisecond resolution is simply not enough these days.

FQ/pacing in DC environments also requires this change, for finer control and for removal of the bimodal behavior caused by the current hack in tcp_update_pacing_rate() for 'small rtt'.

TCP_CONG_RTT_STAMP is no longer needed.

As Julian Anastasov pointed out, we need to keep user compatibility: tcp_metrics used to export RTT and RTTVAR in msec resolution, so we added RTT_US and RTTVAR_US. An iproute2 patch is needed to use the new attributes if provided by the kernel.

In this example, the ss command displays a srtt of 32 usecs (10Gbit link):

lpk51:~# ./ss -i dst lpk52
Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port
tcp ESTAB 0 1 10.246.11.51:42959 10.246.11.52:64614
cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448 cwnd:10 send 3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559

The updated iproute2 ip command displays:

lpk51:~# ./ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source 10.246.11.51

The old binary displays:

lpk51:~# ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source 10.246.11.51

With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng.

Signed-off-by: Eric Dumazet <[email protected]>
Acked-by: Neal Cardwell <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Yuchung Cheng <[email protected]>
Cc: Larry Brakmo <[email protected]>
Cc: Julian Anastasov <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 363ec39 commit 740b0f1

File tree

17 files changed

+174
-169
lines changed

17 files changed

+174
-169
lines changed

include/linux/tcp.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -201,10 +201,10 @@ struct tcp_sock {
201201
u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */
202202

203203
/* RTT measurement */
204-
u32 srtt; /* smoothed round trip time << 3 */
205-
u32 mdev; /* medium deviation */
206-
u32 mdev_max; /* maximal mdev for the last rtt period */
207-
u32 rttvar; /* smoothed mdev_max */
204+
u32 srtt_us; /* smoothed round trip time << 3 in usecs */
205+
u32 mdev_us; /* medium deviation */
206+
u32 mdev_max_us; /* maximal mdev for the last rtt period */
207+
u32 rttvar_us; /* smoothed mdev_max */
208208
u32 rtt_seq; /* sequence number to update rttvar */
209209

210210
u32 packets_out; /* Packets which are "in flight" */

include/net/tcp.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <linux/crypto.h>
3232
#include <linux/cryptohash.h>
3333
#include <linux/kref.h>
34+
#include <linux/ktime.h>
3435

3536
#include <net/inet_connection_sock.h>
3637
#include <net/inet_timewait_sock.h>
@@ -478,7 +479,6 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
478479
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
479480
struct ip_options *opt);
480481
#ifdef CONFIG_SYN_COOKIES
481-
#include <linux/ktime.h>
482482

483483
/* Syncookies use a monotonic timer which increments every 64 seconds.
484484
* This counter is used both as a hash input and partially encoded into
@@ -619,7 +619,7 @@ static inline void tcp_bound_rto(const struct sock *sk)
619619

620620
static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
621621
{
622-
return (tp->srtt >> 3) + tp->rttvar;
622+
return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
623623
}
624624

625625
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
@@ -656,6 +656,11 @@ static inline u32 tcp_rto_min(struct sock *sk)
656656
return rto_min;
657657
}
658658

659+
static inline u32 tcp_rto_min_us(struct sock *sk)
660+
{
661+
return jiffies_to_usecs(tcp_rto_min(sk));
662+
}
663+
659664
/* Compute the actual receive window we are currently advertising.
660665
* Rcv_nxt can be after the window if our peer push more data
661666
* than the offered window.
@@ -778,7 +783,6 @@ enum tcp_ca_event {
778783
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
779784

780785
#define TCP_CONG_NON_RESTRICTED 0x1
781-
#define TCP_CONG_RTT_STAMP 0x2
782786

783787
struct tcp_congestion_ops {
784788
struct list_head list;

include/uapi/linux/tcp_metrics.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,15 @@
1111
#define TCP_METRICS_GENL_VERSION 0x1
1212

1313
enum tcp_metric_index {
14-
TCP_METRIC_RTT,
15-
TCP_METRIC_RTTVAR,
14+
TCP_METRIC_RTT, /* in ms units */
15+
TCP_METRIC_RTTVAR, /* in ms units */
1616
TCP_METRIC_SSTHRESH,
1717
TCP_METRIC_CWND,
1818
TCP_METRIC_REORDERING,
1919

20+
TCP_METRIC_RTT_US, /* in usec units */
21+
TCP_METRIC_RTTVAR_US, /* in usec units */
22+
2023
/* Always last. */
2124
__TCP_METRIC_MAX,
2225
};

net/ipv4/tcp.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk)
387387
INIT_LIST_HEAD(&tp->tsq_node);
388388

389389
icsk->icsk_rto = TCP_TIMEOUT_INIT;
390-
tp->mdev = TCP_TIMEOUT_INIT;
390+
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
391391

392392
/* So many TCP implementations out there (incorrectly) count the
393393
* initial SYN frame in their delayed-ACK and congestion control
@@ -2339,7 +2339,7 @@ int tcp_disconnect(struct sock *sk, int flags)
23392339

23402340
sk->sk_shutdown = 0;
23412341
sock_reset_flag(sk, SOCK_DONE);
2342-
tp->srtt = 0;
2342+
tp->srtt_us = 0;
23432343
if ((tp->write_seq += tp->max_window + 2) == 0)
23442344
tp->write_seq = 1;
23452345
icsk->icsk_backoff = 0;
@@ -2783,8 +2783,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
27832783

27842784
info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
27852785
info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2786-
info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2787-
info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2786+
info->tcpi_rtt = tp->srtt_us >> 3;
2787+
info->tcpi_rttvar = tp->mdev_us >> 2;
27882788
info->tcpi_snd_ssthresh = tp->snd_ssthresh;
27892789
info->tcpi_snd_cwnd = tp->snd_cwnd;
27902790
info->tcpi_advmss = tp->advmss;

net/ipv4/tcp_cubic.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -476,10 +476,6 @@ static int __init cubictcp_register(void)
476476
/* divide by bic_scale and by constant Srtt (100ms) */
477477
do_div(cube_factor, bic_scale * 10);
478478

479-
/* hystart needs ms clock resolution */
480-
if (hystart && HZ < 1000)
481-
cubictcp.flags |= TCP_CONG_RTT_STAMP;
482-
483479
return tcp_register_congestion_control(&cubictcp);
484480
}
485481

net/ipv4/tcp_hybla.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ struct hybla {
2121
u32 rho2; /* Rho * Rho, integer part */
2222
u32 rho_3ls; /* Rho parameter, <<3 */
2323
u32 rho2_7ls; /* Rho^2, <<7 */
24-
u32 minrtt; /* Minimum smoothed round trip time value seen */
24+
u32 minrtt_us; /* Minimum smoothed round trip time value seen */
2525
};
2626

2727
/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
@@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk)
3535
{
3636
struct hybla *ca = inet_csk_ca(sk);
3737

38-
ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
38+
ca->rho_3ls = max_t(u32,
39+
tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
40+
8U);
3941
ca->rho = ca->rho_3ls >> 3;
4042
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
4143
ca->rho2 = ca->rho2_7ls >> 7;
@@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk)
5961
hybla_recalc_param(sk);
6062

6163
/* set minimum rtt as this is the 1st ever seen */
62-
ca->minrtt = tp->srtt;
64+
ca->minrtt_us = tp->srtt_us;
6365
tp->snd_cwnd = ca->rho;
6466
}
6567

@@ -94,9 +96,9 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
9496
int is_slowstart = 0;
9597

9698
/* Recalculate rho only if this srtt is the lowest */
97-
if (tp->srtt < ca->minrtt){
99+
if (tp->srtt_us < ca->minrtt_us) {
98100
hybla_recalc_param(sk);
99-
ca->minrtt = tp->srtt;
101+
ca->minrtt_us = tp->srtt_us;
100102
}
101103

102104
if (!tcp_is_cwnd_limited(sk, in_flight))

net/ipv4/tcp_illinois.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,6 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
325325
}
326326

327327
static struct tcp_congestion_ops tcp_illinois __read_mostly = {
328-
.flags = TCP_CONG_RTT_STAMP,
329328
.init = tcp_illinois_init,
330329
.ssthresh = tcp_illinois_ssthresh,
331330
.cong_avoid = tcp_illinois_cong_avoid,

0 commit comments

Comments
 (0)