Skip to content

Commit d095c46

Browse files
committed
Merge branch 'tcp-add-tos-reflection-feature'
Wei Wang says:

====================
tcp: add tos reflection feature

This patch series adds a new TCP feature that reflects the TOS value received in a SYN: the value is sent back out in the SYN-ACK, and the TOS of the established socket is eventually set to this reflected value. This provides a way to make the traffic class/QoS level of all traffic in a connection the same as that of the incoming SYN. It could be useful for datacenters to provide equivalent QoS according to the incoming request.

This feature is guarded by /proc/sys/net/ipv4/tcp_reflect_tos and is turned off by default.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents 3a8c4ad + ac8f171 commit d095c46

File tree

10 files changed

+42
-10
lines changed

10 files changed

+42
-10
lines changed

include/linux/tcp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ struct tcp_request_sock {
134134
* FastOpen it's the seq#
135135
* after data-in-SYN.
136136
*/
137+
u8 syn_tos;
137138
};
138139

139140
static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)

include/net/ip.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ int igmp_mc_init(void);
151151

152152
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
153153
__be32 saddr, __be32 daddr,
154-
struct ip_options_rcu *opt);
154+
struct ip_options_rcu *opt, u8 tos);
155155
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
156156
struct net_device *orig_dev);
157157
void ip_list_rcv(struct list_head *head, struct packet_type *pt,

include/net/netns/ipv4.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ struct netns_ipv4 {
183183
unsigned int sysctl_tcp_fastopen_blackhole_timeout;
184184
atomic_t tfo_active_disable_times;
185185
unsigned long tfo_active_disable_stamp;
186+
int sysctl_tcp_reflect_tos;
186187

187188
int sysctl_udp_wmem_min;
188189
int sysctl_udp_rmem_min;

net/dccp/ipv4.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,8 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
495495
rcu_read_lock();
496496
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
497497
ireq->ir_rmt_addr,
498-
rcu_dereference(ireq->ireq_opt));
498+
rcu_dereference(ireq->ireq_opt),
499+
inet_sk(sk)->tos);
499500
rcu_read_unlock();
500501
err = net_xmit_eval(err);
501502
}
@@ -537,7 +538,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
537538
local_bh_disable();
538539
bh_lock_sock(ctl_sk);
539540
err = ip_build_and_send_pkt(skb, ctl_sk,
540-
rxiph->daddr, rxiph->saddr, NULL);
541+
rxiph->daddr, rxiph->saddr, NULL,
542+
inet_sk(ctl_sk)->tos);
541543
bh_unlock_sock(ctl_sk);
542544

543545
if (net_xmit_eval(err) == 0) {

net/ipv4/ip_output.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,8 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
142142
*
143143
*/
144144
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
145-
__be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
145+
__be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
146+
u8 tos)
146147
{
147148
struct inet_sock *inet = inet_sk(sk);
148149
struct rtable *rt = skb_rtable(skb);
@@ -155,7 +156,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
155156
iph = ip_hdr(skb);
156157
iph->version = 4;
157158
iph->ihl = 5;
158-
iph->tos = inet->tos;
159+
iph->tos = tos;
159160
iph->ttl = ip_select_ttl(inet, &rt->dst);
160161
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
161162
iph->saddr = saddr;

net/ipv4/syncookies.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -286,11 +286,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
286286
struct sock *sk,
287287
struct sk_buff *skb)
288288
{
289+
struct tcp_request_sock *treq;
289290
struct request_sock *req;
290291

291292
#ifdef CONFIG_MPTCP
292-
struct tcp_request_sock *treq;
293-
294293
if (sk_is_mptcp(sk))
295294
ops = &mptcp_subflow_request_sock_ops;
296295
#endif
@@ -299,8 +298,9 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
299298
if (!req)
300299
return NULL;
301300

302-
#if IS_ENABLED(CONFIG_MPTCP)
303301
treq = tcp_rsk(req);
302+
treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
303+
#if IS_ENABLED(CONFIG_MPTCP)
304304
treq->is_mptcp = sk_is_mptcp(sk);
305305
if (treq->is_mptcp) {
306306
int err = mptcp_subflow_init_cookie_req(req, sk, skb);

net/ipv4/sysctl_net_ipv4.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1329,6 +1329,15 @@ static struct ctl_table ipv4_net_table[] = {
13291329
.extra1 = SYSCTL_ZERO,
13301330
.extra2 = &comp_sack_nr_max,
13311331
},
1332+
{
1333+
.procname = "tcp_reflect_tos",
1334+
.data = &init_net.ipv4.sysctl_tcp_reflect_tos,
1335+
.maxlen = sizeof(int),
1336+
.mode = 0644,
1337+
.proc_handler = proc_dointvec_minmax,
1338+
.extra1 = SYSCTL_ZERO,
1339+
.extra2 = SYSCTL_ONE,
1340+
},
13321341
{
13331342
.procname = "udp_rmem_min",
13341343
.data = &init_net.ipv4.sysctl_udp_rmem_min,

net/ipv4/tcp_input.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6834,6 +6834,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
68346834

68356835
tcp_rsk(req)->snt_isn = isn;
68366836
tcp_rsk(req)->txhash = net_tx_rndhash();
6837+
tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
68376838
tcp_openreq_init_rwin(req, sk, dst);
68386839
sk_rx_queue_set(req_to_sk(req), skb);
68396840
if (!want_cookie) {

net/ipv4/tcp_ipv4.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -972,20 +972,25 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
972972
struct flowi4 fl4;
973973
int err = -1;
974974
struct sk_buff *skb;
975+
u8 tos;
975976

976977
/* First, grab a route. */
977978
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
978979
return -1;
979980

980981
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
981982

983+
tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
984+
tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
985+
982986
if (skb) {
983987
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
984988

985989
rcu_read_lock();
986990
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
987991
ireq->ir_rmt_addr,
988-
rcu_dereference(ireq->ireq_opt));
992+
rcu_dereference(ireq->ireq_opt),
993+
tos & ~INET_ECN_MASK);
989994
rcu_read_unlock();
990995
err = net_xmit_eval(err);
991996
}
@@ -1530,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
15301535
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
15311536
newinet->inet_id = prandom_u32();
15321537

1538+
/* Set ToS of the new socket based upon the value of incoming SYN. */
1539+
if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1540+
newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1541+
15331542
if (!dst) {
15341543
dst = inet_csk_route_child_sock(sk, newsk, req);
15351544
if (!dst)

net/ipv6/tcp_ipv6.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
510510
struct flowi6 *fl6 = &fl->u.ip6;
511511
struct sk_buff *skb;
512512
int err = -ENOMEM;
513+
u8 tclass;
513514

514515
/* First, grab a route. */
515516
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
@@ -528,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
528529

529530
rcu_read_lock();
530531
opt = ireq->ipv6_opt;
532+
tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
533+
tcp_rsk(req)->syn_tos : np->tclass;
531534
if (!opt)
532535
opt = rcu_dereference(np->opt);
533-
err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
536+
err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
537+
tclass & ~INET_ECN_MASK,
534538
sk->sk_priority);
535539
rcu_read_unlock();
536540
err = net_xmit_eval(err);
@@ -1310,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
13101314
if (np->repflow)
13111315
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
13121316

1317+
/* Set ToS of the new socket based upon the value of incoming SYN. */
1318+
if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1319+
newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1320+
13131321
/* Clone native IPv6 options from listening socket (if any)
13141322
13151323
Yes, keeping reference count would be much more clever,

0 commit comments

Comments
 (0)