Skip to content

Commit 30e502a

Browse files
Daniel Borkmann authored and davem330 committed
net: tcp: add flag for ca to indicate that ECN is required
This patch adds a flag to TCP congestion algorithms that allows for requesting to mark IPv4/IPv6 sockets with transport as ECN capable, that is, ECT(0), when required by a congestion algorithm.

It is currently used and needed in DataCenter TCP (DCTCP), as it requires both peers to assert ECT on all IP packets sent - it uses ECN feedback (i.e. CE, Congestion Encountered information) from switches inside the data center to derive feedback to the end hosts.

Therefore, simply add a new flag to icsk_ca_ops. Note that DCTCP's algorithm/behaviour slightly diverges from RFC3168, therefore this is only (!) enabled iff the assigned congestion control ops module has requested this. By that, we can tightly couple this logic really only to the provided congestion control ops.

Joint work with Florian Westphal and Glenn Judd.

Signed-off-by: Daniel Borkmann <[email protected]>
Signed-off-by: Florian Westphal <[email protected]>
Signed-off-by: Glenn Judd <[email protected]>
Acked-by: Stephen Hemminger <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 55d8694 commit 30e502a

File tree

3 files changed

+63
-25
lines changed

3 files changed

+63
-25
lines changed

include/net/tcp.h

Lines changed: 44 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -733,23 +733,6 @@ struct tcp_skb_cb {
733733

734734
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
735735

736-
/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
737-
*
738-
* If we receive a SYN packet with these bits set, it means a network is
739-
* playing bad games with TOS bits. In order to avoid possible false congestion
740-
* notifications, we disable TCP ECN negociation.
741-
*/
742-
static inline void
743-
TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
744-
struct net *net)
745-
{
746-
const struct tcphdr *th = tcp_hdr(skb);
747-
748-
if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
749-
INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
750-
inet_rsk(req)->ecn_ok = 1;
751-
}
752-
753736
/* Due to TSO, an SKB can be composed of multiple actual
754737
* packets. To keep these tracked properly, we use this.
755738
*/
@@ -791,7 +774,10 @@ enum tcp_ca_event {
791774
#define TCP_CA_MAX 128
792775
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
793776

777+
/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
794778
#define TCP_CONG_NON_RESTRICTED 0x1
779+
/* Requires ECN/ECT set on all packets */
780+
#define TCP_CONG_NEEDS_ECN 0x2
795781

796782
struct tcp_congestion_ops {
797783
struct list_head list;
@@ -840,6 +826,13 @@ u32 tcp_reno_ssthresh(struct sock *sk);
840826
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
841827
extern struct tcp_congestion_ops tcp_reno;
842828

829+
static inline bool tcp_ca_needs_ecn(const struct sock *sk)
830+
{
831+
const struct inet_connection_sock *icsk = inet_csk(sk);
832+
833+
return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
834+
}
835+
843836
static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
844837
{
845838
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -857,6 +850,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
857850
icsk->icsk_ca_ops->cwnd_event(sk, event);
858851
}
859852

853+
/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
854+
*
855+
* If we receive a SYN packet with these bits set, it means a
856+
* network is playing bad games with TOS bits. In order to
857+
* avoid possible false congestion notifications, we disable
858+
* TCP ECN negociation.
859+
*
860+
* Exception: tcp_ca wants ECN. This is required for DCTCP
861+
* congestion control; it requires setting ECT on all packets,
862+
* including SYN. We inverse the test in this case: If our
863+
* local socket wants ECN, but peer only set ece/cwr (but not
864+
* ECT in IP header) its probably a non-DCTCP aware sender.
865+
*/
866+
static inline void
867+
TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
868+
const struct sock *listen_sk)
869+
{
870+
const struct tcphdr *th = tcp_hdr(skb);
871+
const struct net *net = sock_net(listen_sk);
872+
bool th_ecn = th->ece && th->cwr;
873+
bool ect, need_ecn;
874+
875+
if (!th_ecn)
876+
return;
877+
878+
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
879+
need_ecn = tcp_ca_needs_ecn(listen_sk);
880+
881+
if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
882+
inet_rsk(req)->ecn_ok = 1;
883+
else if (ect && need_ecn)
884+
inet_rsk(req)->ecn_ok = 1;
885+
}
886+
860887
/* These functions determine how the current flow behaves in respect of SACK
861888
* handling. SACK is negotiated with the peer, and therefore it can vary
862889
* between different flows.

net/ipv4/tcp_input.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5944,7 +5944,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
59445944
goto drop_and_free;
59455945

59465946
if (!want_cookie || tmp_opt.tstamp_ok)
5947-
TCP_ECN_create_request(req, skb, sock_net(sk));
5947+
TCP_ECN_create_request(req, skb, sk);
59485948

59495949
if (want_cookie) {
59505950
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);

net/ipv4/tcp_output.c

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk)
318318
}
319319

320320
/* Packet ECN state for a SYN-ACK */
321-
static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
321+
static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb)
322322
{
323+
const struct tcp_sock *tp = tcp_sk(sk);
324+
323325
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
324326
if (!(tp->ecn_flags & TCP_ECN_OK))
325327
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
328+
else if (tcp_ca_needs_ecn(sk))
329+
INET_ECN_xmit(sk);
326330
}
327331

328332
/* Packet ECN state for a SYN. */
@@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
331335
struct tcp_sock *tp = tcp_sk(sk);
332336

333337
tp->ecn_flags = 0;
334-
if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
338+
if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
339+
tcp_ca_needs_ecn(sk)) {
335340
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
336341
tp->ecn_flags = TCP_ECN_OK;
342+
if (tcp_ca_needs_ecn(sk))
343+
INET_ECN_xmit(sk);
337344
}
338345
}
339346

340347
static __inline__ void
341-
TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
348+
TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th,
349+
struct sock *sk)
342350
{
343-
if (inet_rsk(req)->ecn_ok)
351+
if (inet_rsk(req)->ecn_ok) {
344352
th->ece = 1;
353+
if (tcp_ca_needs_ecn(sk))
354+
INET_ECN_xmit(sk);
355+
}
345356
}
346357

347358
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
362373
tcp_hdr(skb)->cwr = 1;
363374
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
364375
}
365-
} else {
376+
} else if (!tcp_ca_needs_ecn(sk)) {
366377
/* ACK or retransmitted segment: clear ECT|CE */
367378
INET_ECN_dontxmit(sk);
368379
}
@@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk)
27892800
}
27902801

27912802
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
2792-
TCP_ECN_send_synack(tcp_sk(sk), skb);
2803+
TCP_ECN_send_synack(sk, skb);
27932804
}
27942805
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
27952806
}
@@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
28482859
memset(th, 0, sizeof(struct tcphdr));
28492860
th->syn = 1;
28502861
th->ack = 1;
2851-
TCP_ECN_make_synack(req, th);
2862+
TCP_ECN_make_synack(req, th, sk);
28522863
th->source = htons(ireq->ir_num);
28532864
th->dest = ireq->ir_rmt_port;
28542865
/* Setting of flags are superfluous here for callers (and ECE is

0 commit comments

Comments
 (0)