Skip to content

Commit a842fe1

Browse files
edumazet authored and davem330 committed
tcp: add optional per socket transmit delay
Adding delays to TCP flows is crucial for studying behavior of TCP stacks, including congestion control modules. Linux offers the netem module, but it has impractical constraints: - Need root access to change qdisc - Hard to set up on egress if combined with a non-trivial qdisc like FQ - Single delay for all flows. EDT (Earliest Departure Time) adoption in the TCP stack allows us to enable a per socket delay at a very small cost. Networking tools can now establish thousands of flows, each of them with a different delay, simulating real world conditions. This requires the FQ packet scheduler or an EDT-enabled NIC. This patch adds the TCP_TX_DELAY socket option, to set a delay in usec units. unsigned int tx_delay = 10000; /* 10 msec */ setsockopt(fd, SOL_TCP, TCP_TX_DELAY, &tx_delay, sizeof(tx_delay)); Note that FQ packet scheduler limits might need some tweaking : man tc-fq PARAMETERS limit Hard limit on the real queue size. When this limit is reached, new packets are dropped. If the value is lowered, packets are dropped so that the new limit is met. Default is 10000 packets. flow_limit Hard limit on the maximum number of packets queued per flow. Default value is 100. Use of the TCP_TX_DELAY option will increase the number of skbs in the FQ qdisc, so packets would be dropped if either of the previous limits is hit. Use of a jump label makes this support runtime-free, for hosts never using the option. Also note that TSQ (TCP Small Queues) limits are slightly changed with this patch: we need to account for the fact that artificially delayed skbs won't stop us from providing more skbs to feed the pipe (netem uses skb_orphan_partial() for this purpose, but FQ can not use this trick). Because of that, using big delays might very well trigger old bugs in the TSO auto defer logic and/or sndbuf limited detection. Signed-off-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent e0ffbd3 commit a842fe1

File tree

8 files changed

+76
-8
lines changed

8 files changed

+76
-8
lines changed

include/linux/tcp.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ struct tcp_sock {
245245
syn_smc:1; /* SYN includes SMC */
246246
u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */
247247

248+
u32 tcp_tx_delay; /* delay (in usec) added to TX packets */
248249
u64 tcp_wstamp_ns; /* departure time for next sent data packet */
249250
u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
250251

@@ -436,6 +437,7 @@ struct tcp_timewait_sock {
436437
u32 tw_last_oow_ack_time;
437438

438439
int tw_ts_recent_stamp;
440+
u32 tw_tx_delay;
439441
#ifdef CONFIG_TCP_MD5SIG
440442
struct tcp_md5sig_key *tw_md5_key;
441443
#endif

include/net/tcp.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2232,4 +2232,23 @@ void clean_acked_data_disable(struct inet_connection_sock *icsk);
22322232
void clean_acked_data_flush(void);
22332233
#endif
22342234

2235+
DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
2236+
static inline void tcp_add_tx_delay(struct sk_buff *skb,
2237+
const struct tcp_sock *tp)
2238+
{
2239+
if (static_branch_unlikely(&tcp_tx_delay_enabled))
2240+
skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
2241+
}
2242+
2243+
static inline void tcp_set_tx_time(struct sk_buff *skb,
2244+
const struct sock *sk)
2245+
{
2246+
if (static_branch_unlikely(&tcp_tx_delay_enabled)) {
2247+
u32 delay = (sk->sk_state == TCP_TIME_WAIT) ?
2248+
tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay;
2249+
2250+
skb->skb_mstamp_ns = tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
2251+
}
2252+
}
2253+
22352254
#endif /* _TCP_H */

include/uapi/linux/tcp.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,9 @@ enum {
127127

128128
#define TCP_CM_INQ TCP_INQ
129129

130+
#define TCP_TX_DELAY 37 /* delay outgoing packets by XX usec */
131+
132+
130133
#define TCP_REPAIR_ON 1
131134
#define TCP_REPAIR_OFF 0
132135
#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */

net/ipv4/tcp.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2736,6 +2736,21 @@ static int tcp_repair_options_est(struct sock *sk,
27362736
return 0;
27372737
}
27382738

2739+
DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
2740+
EXPORT_SYMBOL(tcp_tx_delay_enabled);
2741+
2742+
static void tcp_enable_tx_delay(void)
2743+
{
2744+
if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
2745+
static int __tcp_tx_delay_enabled = 0;
2746+
2747+
if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
2748+
static_branch_enable(&tcp_tx_delay_enabled);
2749+
pr_info("TCP_TX_DELAY enabled\n");
2750+
}
2751+
}
2752+
}
2753+
27392754
/*
27402755
* Socket option code for TCP.
27412756
*/
@@ -3087,6 +3102,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
30873102
else
30883103
tp->recvmsg_inq = val;
30893104
break;
3105+
case TCP_TX_DELAY:
3106+
if (val)
3107+
tcp_enable_tx_delay();
3108+
tp->tcp_tx_delay = val;
3109+
break;
30903110
default:
30913111
err = -ENOPROTOOPT;
30923112
break;
@@ -3546,6 +3566,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
35463566
val = tp->fastopen_no_cookie;
35473567
break;
35483568

3569+
case TCP_TX_DELAY:
3570+
val = tp->tcp_tx_delay;
3571+
break;
3572+
35493573
case TCP_TIMESTAMP:
35503574
val = tcp_time_stamp_raw() + tp->tsoffset;
35513575
break;

net/ipv4/tcp_ipv4.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -767,9 +767,11 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
767767
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
768768
local_bh_disable();
769769
ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
770-
if (sk)
770+
if (sk) {
771771
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
772772
inet_twsk(sk)->tw_mark : sk->sk_mark;
773+
tcp_set_tx_time(skb, sk);
774+
}
773775
ip_send_unicast_reply(ctl_sk,
774776
skb, &TCP_SKB_CB(skb)->header.h4.opt,
775777
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
@@ -859,9 +861,9 @@ static void tcp_v4_send_ack(const struct sock *sk,
859861
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
860862
local_bh_disable();
861863
ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
862-
if (sk)
863-
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
864-
inet_twsk(sk)->tw_mark : sk->sk_mark;
864+
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
865+
inet_twsk(sk)->tw_mark : sk->sk_mark;
866+
tcp_set_tx_time(skb, sk);
865867
ip_send_unicast_reply(ctl_sk,
866868
skb, &TCP_SKB_CB(skb)->header.h4.opt,
867869
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,

net/ipv4/tcp_minisocks.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
274274
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
275275
tcptw->tw_ts_offset = tp->tsoffset;
276276
tcptw->tw_last_oow_ack_time = 0;
277-
277+
tcptw->tw_tx_delay = tp->tcp_tx_delay;
278278
#if IS_ENABLED(CONFIG_IPV6)
279279
if (tw->tw_family == PF_INET6) {
280280
struct ipv6_pinfo *np = inet6_sk(sk);

net/ipv4/tcp_output.c

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
11531153
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
11541154
sizeof(struct inet6_skb_parm)));
11551155

1156+
tcp_add_tx_delay(skb, tp);
1157+
11561158
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
11571159

11581160
if (unlikely(err > 0)) {
@@ -2234,6 +2236,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
22342236
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
22352237
limit <<= factor;
22362238

2239+
if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
2240+
tcp_sk(sk)->tcp_tx_delay) {
2241+
u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
2242+
2243+
/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
2244+
* approximate our needs assuming an ~100% skb->truesize overhead.
2245+
* USEC_PER_SEC is approximated by 2^20.
2246+
* do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
2247+
*/
2248+
extra_bytes >>= (20 - 1);
2249+
limit += extra_bytes;
2250+
}
22372251
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
22382252
/* Always send skb if rtx queue is empty.
22392253
* No need to wait for TX completion to call us back,
@@ -3212,6 +3226,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
32123226
int tcp_header_size;
32133227
struct tcphdr *th;
32143228
int mss;
3229+
u64 now;
32153230

32163231
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
32173232
if (unlikely(!skb)) {
@@ -3243,13 +3258,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
32433258
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
32443259

32453260
memset(&opts, 0, sizeof(opts));
3261+
now = tcp_clock_ns();
32463262
#ifdef CONFIG_SYN_COOKIES
32473263
if (unlikely(req->cookie_ts))
32483264
skb->skb_mstamp_ns = cookie_init_timestamp(req);
32493265
else
32503266
#endif
32513267
{
3252-
skb->skb_mstamp_ns = tcp_clock_ns();
3268+
skb->skb_mstamp_ns = now;
32533269
if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
32543270
tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
32553271
}
@@ -3292,8 +3308,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
32923308
rcu_read_unlock();
32933309
#endif
32943310

3295-
/* Do not fool tcpdump (if any), clean our debris */
3296-
skb->tstamp = 0;
3311+
skb->skb_mstamp_ns = now;
3312+
tcp_add_tx_delay(skb, tp);
3313+
32973314
return skb;
32983315
}
32993316
EXPORT_SYMBOL(tcp_make_synack);

net/ipv6/tcp_ipv6.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -892,6 +892,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
892892
} else {
893893
mark = sk->sk_mark;
894894
}
895+
tcp_set_tx_time(buff, sk);
895896
}
896897
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
897898
fl6.fl6_dport = t1->dest;

0 commit comments

Comments
 (0)