Skip to content

Commit d7722e8

Browse files
soheilhy authored and davem330 committed
tcp: track application-limited rate samples
This commit adds code to track whether the delivery rate represented by each rate_sample was limited by the application.

Upon each transmit, we store in the is_app_limited field in the skb a boolean bit indicating whether there is a known "bubble in the pipe": a point in the rate sample interval where the sender was application-limited, and did not transmit even though the cwnd and pacing rate allowed it.

This logic marks the flow app-limited on a write if *all* of the following are true:

1) There is less than 1 MSS of unsent data in the write queue available to transmit.
2) There is no packet in the sender's queues (e.g. in fq or the NIC tx queue).
3) The connection is not limited by cwnd.
4) There are no lost packets to retransmit.

The tcp_rate_check_app_limited() code in tcp_rate.c determines whether the connection is application-limited at the moment. If the flow is application-limited, it sets the tp->app_limited field. If the flow is application-limited then that means there is effectively a "bubble" of silence in the pipe now, and this silence will be reflected in a lower bandwidth sample for any rate samples from now until we get an ACK indicating this bubble has exited the pipe: specifically, until we get an ACK for the next packet we transmit.

When we send every skb we record in scb->tx.is_app_limited whether the resulting rate sample will be application-limited.

The code in tcp_rate_gen() checks to see when it is safe to mark all known application-limited bubbles of silence as having exited the pipe. It does this by checking to see when the delivered count moves past the tp->app_limited marker. At this point it zeroes the tp->app_limited marker, as all known bubbles are out of the pipe.

We make room for the tx.is_app_limited bit in the skb by borrowing a bit from the in_flight field used by NV to record the number of bytes in flight. The receive window in the TCP header is 16 bits, and the max receive window scaling shift factor is 14 (RFC 1323). So the max receive window offered by the TCP protocol is 2^(16+14) = 2^30. So we only need 30 bits for the tx.in_flight used by NV.

Signed-off-by: Van Jacobson <[email protected]>
Signed-off-by: Neal Cardwell <[email protected]>
Signed-off-by: Yuchung Cheng <[email protected]>
Signed-off-by: Nandita Dukkipati <[email protected]>
Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: Soheil Hassas Yeganeh <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent b9f6482 commit d7722e8

File tree

5 files changed

+45
-2
lines changed

5 files changed

+45
-2
lines changed

include/linux/tcp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ struct tcp_sock {
268268
u32 prr_out; /* Total number of pkts sent during Recovery. */
269269
u32 delivered; /* Total data packets delivered incl. rexmits */
270270
u32 lost; /* Total data packets lost incl. rexmits */
271+
u32 app_limited; /* limited until "delivered" reaches this val */
271272
struct skb_mstamp first_tx_mstamp; /* start of window send phase */
272273
struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
273274

include/net/tcp.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -764,7 +764,9 @@ struct tcp_skb_cb {
764764
union {
765765
struct {
766766
/* There is space for up to 24 bytes */
767-
__u32 in_flight;/* Bytes in flight when packet sent */
767+
__u32 in_flight:30,/* Bytes in flight at transmit */
768+
is_app_limited:1, /* cwnd not fully used? */
769+
unused:1;
768770
/* pkts S/ACKed so far upon tx of skb, incl retrans: */
769771
__u32 delivered;
770772
/* start of send pipeline phase */
@@ -883,6 +885,7 @@ struct rate_sample {
883885
int losses; /* number of packets marked lost upon ACK */
884886
u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
885887
u32 prior_in_flight; /* in flight before this ACK */
888+
bool is_app_limited; /* is sample from packet with bubble in pipe? */
886889
bool is_retrans; /* is sample from retransmission? */
887890
};
888891

@@ -978,6 +981,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
978981
struct rate_sample *rs);
979982
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
980983
struct skb_mstamp *now, struct rate_sample *rs);
984+
void tcp_rate_check_app_limited(struct sock *sk);
981985

982986
/* These functions determine how the current flow behaves in respect of SACK
983987
* handling. SACK is negotiated with the peer, and therefore it can vary

net/ipv4/tcp.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk)
396396
*/
397397
tp->snd_cwnd = TCP_INIT_CWND;
398398

399+
/* There's a bubble in the pipe until at least the first ACK. */
400+
tp->app_limited = ~0U;
401+
399402
/* See draft-stevens-tcpca-spec-01 for discussion of the
400403
* initialization of these values.
401404
*/
@@ -1014,6 +1017,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
10141017
flags);
10151018

10161019
lock_sock(sk);
1020+
1021+
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
1022+
10171023
res = do_tcp_sendpages(sk, page, offset, size, flags);
10181024
release_sock(sk);
10191025
return res;
@@ -1115,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
11151121

11161122
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
11171123

1124+
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
1125+
11181126
/* Wait for a connection to finish. One exception is TCP Fast Open
11191127
* (passive side) where data is allowed to be sent before a connection
11201128
* is fully established.

net/ipv4/tcp_minisocks.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
487487
newtp->snd_cwnd = TCP_INIT_CWND;
488488
newtp->snd_cwnd_cnt = 0;
489489

490+
/* There's a bubble in the pipe until at least the first ACK. */
491+
newtp->app_limited = ~0U;
492+
490493
tcp_init_xmit_timers(newsk);
491494
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
492495

net/ipv4/tcp_rate.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,13 @@
2626
* other factors like applications or receiver window limits. The estimator
2727
* deliberately avoids using the inter-packet spacing approach because that
2828
* approach requires a large number of samples and sophisticated filtering.
29+
*
30+
* TCP flows can often be application-limited in request/response workloads.
31+
* The estimator marks a bandwidth sample as application-limited if there
32+
* was some moment during the sampled window of packets when there was no data
33+
* ready to send in the write queue.
2934
*/
3035

31-
3236
/* Snapshot the current delivery information in the skb, to generate
3337
* a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
3438
*/
@@ -58,6 +62,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
5862
TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
5963
TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
6064
TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
65+
TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
6166
}
6267

6368
/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
@@ -80,6 +85,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
8085
after(scb->tx.delivered, rs->prior_delivered)) {
8186
rs->prior_delivered = scb->tx.delivered;
8287
rs->prior_mstamp = scb->tx.delivered_mstamp;
88+
rs->is_app_limited = scb->tx.is_app_limited;
8389
rs->is_retrans = scb->sacked & TCPCB_RETRANS;
8490

8591
/* Find the duration of the "send phase" of this window: */
@@ -105,6 +111,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
105111
struct tcp_sock *tp = tcp_sk(sk);
106112
u32 snd_us, ack_us;
107113

114+
/* Clear app limited if bubble is acked and gone. */
115+
if (tp->app_limited && after(tp->delivered, tp->app_limited))
116+
tp->app_limited = 0;
117+
108118
/* TODO: there are multiple places throughout tcp_ack() to get
109119
* current time. Refactor the code using a new "tcp_acktag_state"
110120
* to carry current time, flags, stats like "tcp_sacktag_state".
@@ -147,3 +157,20 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
147157
tp->rx_opt.sack_ok, tcp_min_rtt(tp));
148158
}
149159
}
160+
161+
/* If a gap is detected between sends, mark the socket application-limited. */
162+
void tcp_rate_check_app_limited(struct sock *sk)
163+
{
164+
struct tcp_sock *tp = tcp_sk(sk);
165+
166+
if (/* We have less than one packet to send. */
167+
tp->write_seq - tp->snd_nxt < tp->mss_cache &&
168+
/* Nothing in sending host's qdisc queues or NIC tx queue. */
169+
sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
170+
/* We are not limited by CWND. */
171+
tcp_packets_in_flight(tp) < tp->snd_cwnd &&
172+
/* All lost packets have been retransmitted. */
173+
tp->lost_out <= tp->retrans_out)
174+
tp->app_limited =
175+
(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
176+
}

0 commit comments

Comments
 (0)