Skip to content

Commit 03e2777

Browse files
committed
Merge branch 'ipv4-handle-tos-and-scope-properly-for-icmp-redirects-and-pmtu-updates'
Guillaume Nault says:

====================
ipv4: Handle TOS and scope properly for ICMP redirects and PMTU updates

ICMPv4 PMTU and redirect handlers didn't properly initialise the struct flowi4 they used for route lookups:

* ECN bits sometimes weren't cleared from ->flowi4_tos.
* The RTO_ONLINK flag wasn't taken into account for ->flowi4_scope.

In some special cases, this resulted in ICMP redirects and PMTU updates not being taken into account because fib_lookup() couldn't retrieve the correct route.
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents 6bd0c76 + ec730c3 commit 03e2777

File tree

2 files changed

+151
-8
lines changed

2 files changed

+151
-8
lines changed

net/ipv4/route.c

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,15 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
499499
}
500500
EXPORT_SYMBOL(__ip_select_ident);
501501

502+
/*
 * Normalise the TOS and scope fields of a flow key, in place, before it is
 * used for a route lookup.
 *
 * The raw TOS is read once through RT_FL_TOS(fl4) and kept in a local so
 * that both assignments below work from the same, unmodified value:
 *   - ->flowi4_tos is overwritten with that value masked by IPTOS_RT_MASK
 *     (per the commit message, this is what drops the ECN bits);
 *   - ->flowi4_scope becomes RT_SCOPE_LINK when the RTO_ONLINK bit is set
 *     in the raw TOS, and RT_SCOPE_UNIVERSE otherwise.
 *
 * No other field of @fl4 is touched and nothing is returned.
 */
static void ip_rt_fix_tos(struct flowi4 *fl4)
503+
{
504+
__u8 tos = RT_FL_TOS(fl4);
505+
506+
fl4->flowi4_tos = tos & IPTOS_RT_MASK;
507+
fl4->flowi4_scope = tos & RTO_ONLINK ?
508+
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
509+
}
510+
502511
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
503512
const struct sock *sk,
504513
const struct iphdr *iph,
@@ -824,6 +833,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
824833
rt = (struct rtable *) dst;
825834

826835
__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
836+
ip_rt_fix_tos(&fl4);
827837
__ip_do_redirect(rt, skb, &fl4, true);
828838
}
829839

@@ -1048,6 +1058,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
10481058
struct flowi4 fl4;
10491059

10501060
ip_rt_build_flow_key(&fl4, sk, skb);
1061+
ip_rt_fix_tos(&fl4);
10511062

10521063
/* Don't make lookup fail for bridged encapsulations */
10531064
if (skb && netif_is_any_bridge_port(skb->dev))
@@ -1122,6 +1133,8 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
11221133
goto out;
11231134

11241135
new = true;
1136+
} else {
1137+
ip_rt_fix_tos(&fl4);
11251138
}
11261139

11271140
__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
@@ -2603,7 +2616,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
26032616
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
26042617
const struct sk_buff *skb)
26052618
{
2606-
__u8 tos = RT_FL_TOS(fl4);
26072619
struct fib_result res = {
26082620
.type = RTN_UNSPEC,
26092621
.fi = NULL,
@@ -2613,9 +2625,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
26132625
struct rtable *rth;
26142626

26152627
fl4->flowi4_iif = LOOPBACK_IFINDEX;
2616-
fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2617-
fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2618-
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2628+
ip_rt_fix_tos(fl4);
26192629

26202630
rcu_read_lock();
26212631
rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);

tools/testing/selftests/net/pmtu.sh

Lines changed: 137 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,15 @@
2626
# - pmtu_ipv6
2727
# Same as pmtu_ipv4, except for locked PMTU tests, using IPv6
2828
#
29+
# - pmtu_ipv4_dscp_icmp_exception
30+
# Set up the same network topology as pmtu_ipv4, but use non-default
31+
# routing table in A. A fib-rule is used to jump to this routing table
32+
# based on DSCP. Send ICMPv4 packets with the expected DSCP value and
33+
# verify that ECN doesn't interfere with the creation of PMTU exceptions.
34+
#
35+
# - pmtu_ipv4_dscp_udp_exception
36+
# Same as pmtu_ipv4_dscp_icmp_exception, but use UDP instead of ICMP.
37+
#
2938
# - pmtu_ipv4_vxlan4_exception
3039
# Set up the same network topology as pmtu_ipv4, create a VXLAN tunnel
3140
# over IPv4 between A and B, routed via R1. On the link between R1 and B,
@@ -203,6 +212,8 @@ which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
203212
tests="
204213
pmtu_ipv4_exception ipv4: PMTU exceptions 1
205214
pmtu_ipv6_exception ipv6: PMTU exceptions 1
215+
pmtu_ipv4_dscp_icmp_exception ICMPv4 with DSCP and ECN: PMTU exceptions 1
216+
pmtu_ipv4_dscp_udp_exception UDPv4 with DSCP and ECN: PMTU exceptions 1
206217
pmtu_ipv4_vxlan4_exception IPv4 over vxlan4: PMTU exceptions 1
207218
pmtu_ipv6_vxlan4_exception IPv6 over vxlan4: PMTU exceptions 1
208219
pmtu_ipv4_vxlan6_exception IPv4 over vxlan6: PMTU exceptions 1
@@ -323,6 +334,9 @@ routes_nh="
323334
B 6 default 61
324335
"
325336

337+
policy_mark=0x04
338+
rt_table=main
339+
326340
veth4_a_addr="192.168.1.1"
327341
veth4_b_addr="192.168.1.2"
328342
veth4_c_addr="192.168.2.10"
@@ -346,6 +360,7 @@ dummy6_mask="64"
346360
err_buf=
347361
tcpdump_pids=
348362
nettest_pids=
363+
socat_pids=
349364

350365
err() {
351366
err_buf="${err_buf}${1}
@@ -723,7 +738,7 @@ setup_routing_old() {
723738

724739
ns_name="$(nsname ${ns})"
725740

726-
ip -n ${ns_name} route add ${addr} via ${gw}
741+
ip -n "${ns_name}" route add "${addr}" table "${rt_table}" via "${gw}"
727742

728743
ns=""; addr=""; gw=""
729744
done
@@ -753,7 +768,7 @@ setup_routing_new() {
753768

754769
ns_name="$(nsname ${ns})"
755770

756-
ip -n ${ns_name} -${fam} route add ${addr} nhid ${nhid}
771+
ip -n "${ns_name}" -"${fam}" route add "${addr}" table "${rt_table}" nhid "${nhid}"
757772

758773
ns=""; fam=""; addr=""; nhid=""
759774
done
@@ -798,6 +813,24 @@ setup_routing() {
798813
return 0
799814
}
800815

816+
# Set up routing plus a DSCP-based policy rule in namespace A.
#
# Runs setup_routing first, then adds an IPv4 fib rule in ${NS_A} that sends
# packets whose dsfield equals ${policy_mark} to routing table ${rt_table}.
# Finally installs a prio qdisc and a flower/pedit filter on both veth_A-R1
# and veth_A-R2 so that outgoing IPv4 UDP packets get the DF bit set and
# their IP and UDP checksums recomputed.
#
# Reads globals: NS_A, policy_mark, rt_table.
# The exit status is that of the last tc command; earlier failures are not
# checked here.
setup_policy_routing() {
817+
setup_routing
818+
819+
ip -netns "${NS_A}" -4 rule add dsfield "${policy_mark}" \
820+
table "${rt_table}"
821+
822+
# Set the IPv4 Don't Fragment bit with tc, since socat doesn't seem to
823+
# have an option to do it.
824+
tc -netns "${NS_A}" qdisc replace dev veth_A-R1 root prio
825+
tc -netns "${NS_A}" qdisc replace dev veth_A-R2 root prio
826+
tc -netns "${NS_A}" filter add dev veth_A-R1 \
827+
protocol ipv4 flower ip_proto udp \
828+
action pedit ex munge ip df set 0x40 pipe csum ip and udp
829+
tc -netns "${NS_A}" filter add dev veth_A-R2 \
830+
protocol ipv4 flower ip_proto udp \
831+
action pedit ex munge ip df set 0x40 pipe csum ip and udp
832+
}
833+
801834
setup_bridge() {
802835
run_cmd ${ns_a} ip link add br0 type bridge || return $ksft_skip
803836
run_cmd ${ns_a} ip link set br0 up
@@ -903,6 +936,11 @@ cleanup() {
903936
done
904937
nettest_pids=
905938

939+
for pid in ${socat_pids}; do
940+
kill "${pid}"
941+
done
942+
socat_pids=
943+
906944
for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do
907945
ip netns del ${n} 2> /dev/null
908946
done
@@ -950,15 +988,21 @@ link_get_mtu() {
950988
# Print the route that would be used to reach a destination.
#
# Arguments:
#   $1  command prefix used to run ip (expanded unquoted, so it may contain
#       several words)
#   $2  destination address
#   $3  optional dsfield value for the lookup; defaults to 0 when empty
#
# Output: whatever "ip route get" prints for that destination and dsfield.
# NOTE(review): this hunk shows both the removed (954-) and the added (997+)
# form of the lookup command; only the added one takes the dsfield argument.
route_get_dst_exception() {
951989
ns_cmd="${1}"
952990
dst="${2}"
991+
dsfield="${3}"
953992

954-
${ns_cmd} ip route get "${dst}"
993+
if [ -z "${dsfield}" ]; then
994+
dsfield=0
995+
fi
996+
997+
${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
955998
}
956999

9571000
# Print the PMTU recorded for the route towards a destination.
#
# Arguments:
#   $1  command prefix used to run ip, passed on to route_get_dst_exception
#   $2  destination address
#   $3  optional dsfield value, passed on to route_get_dst_exception
#
# The route text returned by route_get_dst_exception is handed to mtu_parse,
# whose output becomes the output of this function.
# NOTE(review): this hunk shows both the removed (961-) and the added (1005+)
# form of the call; only the added one forwards the dsfield argument.
route_get_dst_pmtu_from_exception() {
9581001
ns_cmd="${1}"
9591002
dst="${2}"
1003+
dsfield="${3}"
9601004

961-
mtu_parse "$(route_get_dst_exception "${ns_cmd}" ${dst})"
1005+
mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
9621006
}
9631007

9641008
check_pmtu_value() {
@@ -1068,6 +1112,95 @@ test_pmtu_ipv6_exception() {
10681112
test_pmtu_ipvX 6
10691113
}
10701114

1115+
# Test: PMTU exceptions for ICMPv4 traffic steered by a DSCP-based fib rule.
#
# Selects routing table 100 through the rt_table global, builds the topology
# with the policy_routing setup step, and lowers the MTU to 1400 on the R1-B
# link and to 1500 on the R2-B link. It then pings dst1 with dsfield equal to
# policy_mark and dst2 with dsfield equal to policy_mark + 0x02 (an ECN bit
# set). Both exceptions are looked up afterwards with policy_mark alone, so
# the check on dst2 only passes if the ECN bits did not get in the way.
#
# Returns $ksft_skip when setup fails and 1 when check_pmtu_value fails;
# otherwise the status of the last check_pmtu_value call.
test_pmtu_ipv4_dscp_icmp_exception() {
1116+
rt_table=100
1117+
1118+
setup namespaces policy_routing || return $ksft_skip
1119+
trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
1120+
"${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
1121+
"${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
1122+
"${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
1123+
1124+
# Set up initial MTU values
1125+
mtu "${ns_a}" veth_A-R1 2000
1126+
mtu "${ns_r1}" veth_R1-A 2000
1127+
mtu "${ns_r1}" veth_R1-B 1400
1128+
mtu "${ns_b}" veth_B-R1 1400
1129+
1130+
mtu "${ns_a}" veth_A-R2 2000
1131+
mtu "${ns_r2}" veth_R2-A 2000
1132+
mtu "${ns_r2}" veth_R2-B 1500
1133+
mtu "${ns_b}" veth_B-R2 1500
1134+
1135+
len=$((2000 - 20 - 8)) # Fills MTU of veth_A-R1
1136+
1137+
dst1="${prefix4}.${b_r1}.1"
1138+
dst2="${prefix4}.${b_r2}.1"
1139+
1140+
# Create route exceptions
1141+
dsfield=${policy_mark} # No ECN bit set (Not-ECT)
1142+
run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst1}"
1143+
1144+
dsfield=$(printf "%#x" $((policy_mark + 0x02))) # ECN=2 (ECT(0))
1145+
run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
1146+
1147+
# Check that exceptions have been created with the correct PMTU
1148+
pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
1149+
check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
1150+
1151+
pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
1152+
check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
1153+
}
1154+
1155+
# Test: PMTU exceptions for UDPv4 traffic steered by a DSCP-based fib rule.
#
# Same scenario as the ICMP variant, but the oversized packets are UDP
# datagrams sent with socat. The test is skipped when socat is not installed.
# A background socat listener on UDP port 50000 is started in namespace B and
# its PID is appended to socat_pids. Namespace A then sends ${len} bytes to
# dst1 with tos equal to policy_mark and to dst2 with tos equal to
# policy_mark + 0x02 (an ECN bit set). Both exceptions are looked up
# afterwards with policy_mark alone.
#
# Returns $ksft_skip when socat is missing or setup fails and 1 when
# check_pmtu_value fails; otherwise the status of the last check_pmtu_value
# call.
test_pmtu_ipv4_dscp_udp_exception() {
1156+
rt_table=100
1157+
1158+
if ! which socat > /dev/null 2>&1; then
1159+
echo "'socat' command not found; skipping tests"
1160+
return $ksft_skip
1161+
fi
1162+
1163+
setup namespaces policy_routing || return $ksft_skip
1164+
trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
1165+
"${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
1166+
"${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
1167+
"${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
1168+
1169+
# Set up initial MTU values
1170+
mtu "${ns_a}" veth_A-R1 2000
1171+
mtu "${ns_r1}" veth_R1-A 2000
1172+
mtu "${ns_r1}" veth_R1-B 1400
1173+
mtu "${ns_b}" veth_B-R1 1400
1174+
1175+
mtu "${ns_a}" veth_A-R2 2000
1176+
mtu "${ns_r2}" veth_R2-A 2000
1177+
mtu "${ns_r2}" veth_R2-B 1500
1178+
mtu "${ns_b}" veth_B-R2 1500
1179+
1180+
len=$((2000 - 20 - 8)) # Fills MTU of veth_A-R1
1181+
1182+
dst1="${prefix4}.${b_r1}.1"
1183+
dst2="${prefix4}.${b_r2}.1"
1184+
1185+
# Create route exceptions
1186+
run_cmd_bg "${ns_b}" socat UDP-LISTEN:50000 OPEN:/dev/null,wronly=1
1187+
socat_pids="${socat_pids} $!"
1188+
1189+
dsfield=${policy_mark} # No ECN bit set (Not-ECT)
1190+
run_cmd "${ns_a}" socat OPEN:/dev/zero,rdonly=1,readbytes="${len}" \
1191+
UDP:"${dst1}":50000,tos="${dsfield}"
1192+
1193+
dsfield=$(printf "%#x" $((policy_mark + 0x02))) # ECN=2 (ECT(0))
1194+
run_cmd "${ns_a}" socat OPEN:/dev/zero,rdonly=1,readbytes="${len}" \
1195+
UDP:"${dst2}":50000,tos="${dsfield}"
1196+
1197+
# Check that exceptions have been created with the correct PMTU
1198+
pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
1199+
check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
1200+
pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
1201+
check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
1202+
}
1203+
10711204
test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() {
10721205
type=${1}
10731206
family=${2}

0 commit comments

Comments
 (0)