Skip to content

Commit ac8f171

Browse files
tracywwnj authored and davem330 committed
tcp: reflect tos value received in SYN to the socket
This commit adds a new TCP feature that reflects the ToS value received in a SYN: the value is sent back out on the SYN-ACK, and is eventually set as the ToS value of the established socket. This provides a way to give all traffic in the same connection the same traffic class/QoS level as the incoming SYN request. It could be useful in data centers to provide equivalent QoS according to the incoming request.

This feature is guarded by /proc/sys/net/ipv4/tcp_reflect_tos, and is turned off by default.

Signed-off-by: Wei Wang <[email protected]>
Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent de033b7 commit ac8f171

File tree

4 files changed

+28
-2
lines changed

4 files changed

+28
-2
lines changed

include/net/netns/ipv4.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ struct netns_ipv4 {
183183
unsigned int sysctl_tcp_fastopen_blackhole_timeout;
184184
atomic_t tfo_active_disable_times;
185185
unsigned long tfo_active_disable_stamp;
186+
int sysctl_tcp_reflect_tos;
186187

187188
int sysctl_udp_wmem_min;
188189
int sysctl_udp_rmem_min;

net/ipv4/sysctl_net_ipv4.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1329,6 +1329,15 @@ static struct ctl_table ipv4_net_table[] = {
13291329
.extra1 = SYSCTL_ZERO,
13301330
.extra2 = &comp_sack_nr_max,
13311331
},
1332+
{
1333+
.procname = "tcp_reflect_tos",
1334+
.data = &init_net.ipv4.sysctl_tcp_reflect_tos,
1335+
.maxlen = sizeof(int),
1336+
.mode = 0644,
1337+
.proc_handler = proc_dointvec_minmax,
1338+
.extra1 = SYSCTL_ZERO,
1339+
.extra2 = SYSCTL_ONE,
1340+
},
13321341
{
13331342
.procname = "udp_rmem_min",
13341343
.data = &init_net.ipv4.sysctl_udp_rmem_min,

net/ipv4/tcp_ipv4.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -972,21 +972,25 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
972972
struct flowi4 fl4;
973973
int err = -1;
974974
struct sk_buff *skb;
975+
u8 tos;
975976

976977
/* First, grab a route. */
977978
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
978979
return -1;
979980

980981
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
981982

983+
tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
984+
tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
985+
982986
if (skb) {
983987
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
984988

985989
rcu_read_lock();
986990
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
987991
ireq->ir_rmt_addr,
988992
rcu_dereference(ireq->ireq_opt),
989-
inet_sk(sk)->tos);
993+
tos & ~INET_ECN_MASK);
990994
rcu_read_unlock();
991995
err = net_xmit_eval(err);
992996
}
@@ -1531,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
15311535
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
15321536
newinet->inet_id = prandom_u32();
15331537

1538+
/* Set ToS of the new socket based upon the value of incoming SYN. */
1539+
if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1540+
newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1541+
15341542
if (!dst) {
15351543
dst = inet_csk_route_child_sock(sk, newsk, req);
15361544
if (!dst)

net/ipv6/tcp_ipv6.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
510510
struct flowi6 *fl6 = &fl->u.ip6;
511511
struct sk_buff *skb;
512512
int err = -ENOMEM;
513+
u8 tclass;
513514

514515
/* First, grab a route. */
515516
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
@@ -528,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
528529

529530
rcu_read_lock();
530531
opt = ireq->ipv6_opt;
532+
tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
533+
tcp_rsk(req)->syn_tos : np->tclass;
531534
if (!opt)
532535
opt = rcu_dereference(np->opt);
533-
err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
536+
err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
537+
tclass & ~INET_ECN_MASK,
534538
sk->sk_priority);
535539
rcu_read_unlock();
536540
err = net_xmit_eval(err);
@@ -1310,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
13101314
if (np->repflow)
13111315
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
13121316

1317+
/* Set ToS of the new socket based upon the value of incoming SYN. */
1318+
if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1319+
newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1320+
13131321
/* Clone native IPv6 options from listening socket (if any)
13141322
13151323
Yes, keeping reference count would be much more clever,

0 commit comments

Comments
 (0)