Skip to content

Commit a11238e

Browse files
committed
Merge branch 'dctcp'
Daniel Borkmann says:

====================
net: tcp: DCTCP congestion control algorithm

This patch series adds support for the DataCenter TCP (DCTCP) congestion
control algorithm. Please see individual patches for the details.

The last patch adds DCTCP as a congestion control module, and previous
ones add needed infrastructure to extend the congestion control framework.

Joint work between Florian Westphal, Daniel Borkmann and Glenn Judd.

v3 -> v2:
- No changes anywhere, just a resend as requested by Dave
- Added Stephen's ACK

v1 -> v2:
- Rebased to latest net-next
- Addressed Eric's feedback, thanks!
- Update stale comment wrt. DCTCP ECN usage
- Don't call INET_ECN_xmit for every packet
- Add dctcp ss/inetdiag support to expose internal stats to userspace
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents 53dfd50 + e3118e8 commit a11238e

File tree

12 files changed

+574
-78
lines changed

12 files changed

+574
-78
lines changed

Documentation/networking/dctcp.txt

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
DCTCP (DataCenter TCP)
2+
----------------------
3+
4+
DCTCP is an enhancement to the TCP congestion control algorithm for data
5+
center networks and leverages Explicit Congestion Notification (ECN) in
6+
the data center network to provide multi-bit feedback to the end hosts.
7+
8+
To enable it on end hosts:
9+
10+
sysctl -w net.ipv4.tcp_congestion_control=dctcp
11+
12+
All switches in the data center network running DCTCP must support ECN
13+
marking and be configured for marking when reaching defined switch buffer
14+
thresholds. The default ECN marking threshold heuristic for DCTCP on
15+
switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps,
16+
but might need further careful tweaking.
17+
18+
For more details, see the documents below:
19+
20+
Paper:
21+
22+
The algorithm is further described in detail in the following two
23+
SIGCOMM/SIGMETRICS papers:
24+
25+
i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
26+
Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
27+
"Data Center TCP (DCTCP)", Data Center Networks session
28+
Proc. ACM SIGCOMM, New Delhi, 2010.
29+
http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
30+
http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192
31+
32+
ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
33+
"Analysis of DCTCP: Stability, Convergence, and Fairness"
34+
Proc. ACM SIGMETRICS, San Jose, 2011.
35+
http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
36+
37+
IETF informational draft:
38+
39+
http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00
40+
41+
DCTCP site:
42+
43+
http://simula.stanford.edu/~alizade/Site/DCTCP.html

include/net/tcp.h

Lines changed: 58 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -733,23 +733,6 @@ struct tcp_skb_cb {
733733

734734
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
735735

736-
/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
737-
*
738-
* If we receive a SYN packet with these bits set, it means a network is
739-
* playing bad games with TOS bits. In order to avoid possible false congestion
740-
* notifications, we disable TCP ECN negotiation.
741-
*/
742-
static inline void
743-
TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
744-
struct net *net)
745-
{
746-
const struct tcphdr *th = tcp_hdr(skb);
747-
748-
if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
749-
INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
750-
inet_rsk(req)->ecn_ok = 1;
751-
}
752-
753736
/* Due to TSO, an SKB can be composed of multiple actual
754737
* packets. To keep these tracked properly, we use this.
755738
*/
@@ -780,8 +763,17 @@ enum tcp_ca_event {
780763
CA_EVENT_CWND_RESTART, /* congestion window restart */
781764
CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */
782765
CA_EVENT_LOSS, /* loss timeout */
783-
CA_EVENT_FAST_ACK, /* in sequence ack */
784-
CA_EVENT_SLOW_ACK, /* other ack */
766+
CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
767+
CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */
768+
CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */
769+
CA_EVENT_NON_DELAYED_ACK,
770+
};
771+
772+
/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
773+
enum tcp_ca_ack_event_flags {
774+
CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */
775+
CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */
776+
CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */
785777
};
786778

787779
/*
@@ -791,7 +783,10 @@ enum tcp_ca_event {
791783
#define TCP_CA_MAX 128
792784
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
793785

786+
/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
794787
#define TCP_CONG_NON_RESTRICTED 0x1
788+
/* Requires ECN/ECT set on all packets */
789+
#define TCP_CONG_NEEDS_ECN 0x2
795790

796791
struct tcp_congestion_ops {
797792
struct list_head list;
@@ -810,6 +805,8 @@ struct tcp_congestion_ops {
810805
void (*set_state)(struct sock *sk, u8 new_state);
811806
/* call when cwnd event occurs (optional) */
812807
void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
808+
/* call when ack arrives (optional) */
809+
void (*in_ack_event)(struct sock *sk, u32 flags);
813810
/* new value of cwnd after loss (optional) */
814811
u32 (*undo_cwnd)(struct sock *sk);
815812
/* hook for packet ack accounting (optional) */
@@ -824,6 +821,7 @@ struct tcp_congestion_ops {
824821
int tcp_register_congestion_control(struct tcp_congestion_ops *type);
825822
void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
826823

824+
void tcp_assign_congestion_control(struct sock *sk);
827825
void tcp_init_congestion_control(struct sock *sk);
828826
void tcp_cleanup_congestion_control(struct sock *sk);
829827
int tcp_set_default_congestion_control(const char *name);
@@ -835,11 +833,17 @@ int tcp_set_congestion_control(struct sock *sk, const char *name);
835833
int tcp_slow_start(struct tcp_sock *tp, u32 acked);
836834
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
837835

838-
extern struct tcp_congestion_ops tcp_init_congestion_ops;
839836
u32 tcp_reno_ssthresh(struct sock *sk);
840837
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
841838
extern struct tcp_congestion_ops tcp_reno;
842839

840+
static inline bool tcp_ca_needs_ecn(const struct sock *sk)
841+
{
842+
const struct inet_connection_sock *icsk = inet_csk(sk);
843+
844+
return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
845+
}
846+
843847
static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
844848
{
845849
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -857,6 +861,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
857861
icsk->icsk_ca_ops->cwnd_event(sk, event);
858862
}
859863

864+
/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
865+
*
866+
* If we receive a SYN packet with these bits set, it means a
867+
* network is playing bad games with TOS bits. In order to
868+
* avoid possible false congestion notifications, we disable
869+
* TCP ECN negotiation.
870+
*
871+
* Exception: tcp_ca wants ECN. This is required for DCTCP
872+
* congestion control; it requires setting ECT on all packets,
873+
* including SYN. We invert the test in this case: If our
874+
* local socket wants ECN, but peer only set ece/cwr (but not
875+
* ECT in IP header) it's probably a non-DCTCP-aware sender.
876+
*/
877+
static inline void
878+
TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
879+
const struct sock *listen_sk)
880+
{
881+
const struct tcphdr *th = tcp_hdr(skb);
882+
const struct net *net = sock_net(listen_sk);
883+
bool th_ecn = th->ece && th->cwr;
884+
bool ect, need_ecn;
885+
886+
if (!th_ecn)
887+
return;
888+
889+
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
890+
need_ecn = tcp_ca_needs_ecn(listen_sk);
891+
892+
if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
893+
inet_rsk(req)->ecn_ok = 1;
894+
else if (ect && need_ecn)
895+
inet_rsk(req)->ecn_ok = 1;
896+
}
897+
860898
/* These functions determine how the current flow behaves in respect of SACK
861899
* handling. SACK is negotiated with the peer, and therefore it can vary
862900
* between different flows.

include/uapi/linux/inet_diag.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,10 @@ enum {
110110
INET_DIAG_TCLASS,
111111
INET_DIAG_SKMEMINFO,
112112
INET_DIAG_SHUTDOWN,
113+
INET_DIAG_DCTCPINFO,
113114
};
114115

115-
#define INET_DIAG_MAX INET_DIAG_SHUTDOWN
116-
116+
#define INET_DIAG_MAX INET_DIAG_DCTCPINFO
117117

118118
/* INET_DIAG_MEM */
119119

@@ -133,5 +133,14 @@ struct tcpvegas_info {
133133
__u32 tcpv_minrtt;
134134
};
135135

136+
/* INET_DIAG_DCTCPINFO */
137+
138+
struct tcp_dctcp_info {
139+
__u16 dctcp_enabled;
140+
__u16 dctcp_ce_state;
141+
__u32 dctcp_alpha;
142+
__u32 dctcp_ab_ecn;
143+
__u32 dctcp_ab_tot;
144+
};
136145

137146
#endif /* _UAPI_INET_DIAG_H_ */

net/ipv4/Kconfig

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS
570570
For further details see:
571571
http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
572572

573+
config TCP_CONG_DCTCP
574+
tristate "DataCenter TCP (DCTCP)"
575+
default n
576+
---help---
577+
DCTCP leverages Explicit Congestion Notification (ECN) in the network to
578+
provide multi-bit feedback to the end hosts. It is designed to provide:
579+
580+
- High burst tolerance (incast due to partition/aggregate),
581+
- Low latency (short flows, queries),
582+
- High throughput (continuous data updates, large file transfers) with
583+
commodity, shallow-buffered switches.
584+
585+
All switches in the data center network running DCTCP must support
586+
ECN marking and be configured for marking when reaching defined switch
587+
buffer thresholds. The default ECN marking threshold heuristic for
588+
DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
589+
(~100KB) at 10Gbps, but might need further careful tweaking.
590+
591+
For further details see:
592+
http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
593+
573594
choice
574595
prompt "Default TCP congestion control"
575596
default DEFAULT_CUBIC
@@ -598,9 +619,11 @@ choice
598619
config DEFAULT_WESTWOOD
599620
bool "Westwood" if TCP_CONG_WESTWOOD=y
600621

622+
config DEFAULT_DCTCP
623+
bool "DCTCP" if TCP_CONG_DCTCP=y
624+
601625
config DEFAULT_RENO
602626
bool "Reno"
603-
604627
endchoice
605628

606629
endif
@@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG
620643
default "westwood" if DEFAULT_WESTWOOD
621644
default "veno" if DEFAULT_VENO
622645
default "reno" if DEFAULT_RENO
646+
default "dctcp" if DEFAULT_DCTCP
623647
default "cubic"
624648

625649
config TCP_MD5SIG

net/ipv4/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
4343
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
4444
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
4545
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
46+
obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
4647
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
4748
obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
4849
obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o

net/ipv4/tcp.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,7 @@ void tcp_init_sock(struct sock *sk)
405405

406406
tp->reordering = sysctl_tcp_reordering;
407407
tcp_enable_early_retrans(tp);
408-
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
408+
tcp_assign_congestion_control(sk);
409409

410410
tp->tsoffset = 0;
411411

@@ -3258,8 +3258,6 @@ void __init tcp_init(void)
32583258
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
32593259

32603260
tcp_metrics_init();
3261-
3262-
tcp_register_congestion_control(&tcp_reno);
3263-
3261+
BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
32643262
tcp_tasklet_init();
32653263
}

net/ipv4/tcp_cong.c

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
7474
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
7575

7676
/* Assign choice of congestion control. */
77-
void tcp_init_congestion_control(struct sock *sk)
77+
void tcp_assign_congestion_control(struct sock *sk)
7878
{
7979
struct inet_connection_sock *icsk = inet_csk(sk);
8080
struct tcp_congestion_ops *ca;
8181

82-
/* if no choice made yet assign the current value set as default */
83-
if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
84-
rcu_read_lock();
85-
list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
86-
if (try_module_get(ca->owner)) {
87-
icsk->icsk_ca_ops = ca;
88-
break;
89-
}
90-
91-
/* fallback to next available */
82+
rcu_read_lock();
83+
list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
84+
if (likely(try_module_get(ca->owner))) {
85+
icsk->icsk_ca_ops = ca;
86+
goto out;
9287
}
93-
rcu_read_unlock();
88+
/* Fallback to next available. The last really
89+
* guaranteed fallback is Reno from this list.
90+
*/
9491
}
92+
out:
93+
rcu_read_unlock();
94+
95+
/* Clear out private data, since diag may read it
96+
* before the ca has been initialized.
97+
*/
98+
if (ca->get_info)
99+
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
100+
}
101+
102+
void tcp_init_congestion_control(struct sock *sk)
103+
{
104+
const struct inet_connection_sock *icsk = inet_csk(sk);
95105

96106
if (icsk->icsk_ca_ops->init)
97107
icsk->icsk_ca_ops->init(sk);
@@ -345,15 +355,3 @@ struct tcp_congestion_ops tcp_reno = {
345355
.ssthresh = tcp_reno_ssthresh,
346356
.cong_avoid = tcp_reno_cong_avoid,
347357
};
348-
349-
/* Initial congestion control used (until SYN)
350-
* really reno under another name so we can tell difference
351-
* during tcp_set_default_congestion_control
352-
*/
353-
struct tcp_congestion_ops tcp_init_congestion_ops = {
354-
.name = "",
355-
.owner = THIS_MODULE,
356-
.ssthresh = tcp_reno_ssthresh,
357-
.cong_avoid = tcp_reno_cong_avoid,
358-
};
359-
EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);

0 commit comments

Comments
 (0)