Skip to content

Commit 73bf1fc

Browse files
committed
Merge branch 'net-ipv6-Fix-route-append-and-replace-use-cases'
David Ahern says: ==================== net/ipv6: Fix route append and replace use cases This patch set fixes a few append and replace uses cases for IPv6 and adds test cases that codifies the expectations of how append and replace are expected to work. In paricular it allows a multipath route to have a dev-only nexthop, something Thomas tried to accomplish with commit edd7ceb ("ipv6: Allow non-gateway ECMP for IPv6") which had to be reverted because of breakage, and to replace an existing FIB entry with a reject route. There are a number of inconsistent and surprising aspects to the Linux API for adding, deleting, replacing and changing FIB entries. For example, with IPv4 NLM_F_APPEND means insert the route after any existing entries with the same key (prefix + priority + TOS for IPv4) and NLM_F_CREATE without the append flag inserts the new route before any existing entries. IPv6 on the other hand attempts to guess whether a new route should be appended to an existing one, possibly creating a multipath route, or to add a new entry after any existing ones. This applies to both the 'append' (NLM_F_CREATE + NLM_F_APPEND) and 'prepend' (NLM_F_CREATE only) cases meaning for IPv6 the NLM_F_APPEND is basically ignored. This guessing whether the route should be added to a multipath route (gateway routes) or inserted after existing entries (non-gateway based routes) means a multipath route can not have a dev only nexthop (potentially required in some cases - tunnels or VRF route leaking for example) and route 'replace' is a bit adhoc treating gateway based routes and dev-only / reject routes differently. This has led to frustration with developers working on routing suites such as FRR where workarounds such as delete and add are used instead of replace. After this patch set there are 2 differences between IPv4 and IPv6: 1. 'ip ro prepend' = NLM_F_CREATE only IPv4 adds the new route before any existing ones IPv6 adds new route after any existing ones 2. 'ip ro append' = NLM_F_CREATE|NLM_F_APPEND IPv4 adds the new route after any existing ones IPv6 adds the nexthop to existing routes converting to multipath For the former, there are cases where we want same prefix routes added after existing ones (e.g., multicast, prefix routes for macvlan when used for virtual router redundancy). Requiring the APPEND flag to add a new route to an existing one helps here but is a slight change in behavior since prepend with gateway routes now create a separate entry. For the latter IPv6 behavior is preferred - appending a route for the same prefix and metric to make a multipath route, so really IPv4 not allowing an existing route to be updated is the limiter. This will be fixed when nexthops become separate objects - a future patch set. Thank you to Thomas and Ido for testing earlier versions of this set, and to Ido for providing an update to the mlxsw driver. Changes since RFC - cleanup wording in test script; add comments about expected failures and why ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents e3bb946 + abb1860 commit 73bf1fc

File tree

5 files changed

+743
-104
lines changed

5 files changed

+743
-104
lines changed

drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5725,6 +5725,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
57255725

57265726
switch (fib_work->event) {
57275727
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
5728+
case FIB_EVENT_ENTRY_APPEND: /* fall through */
57285729
case FIB_EVENT_ENTRY_ADD:
57295730
replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
57305731
err = mlxsw_sp_router_fib6_add(mlxsw_sp,
@@ -5831,6 +5832,7 @@ static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work,
58315832

58325833
switch (fib_work->event) {
58335834
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
5835+
case FIB_EVENT_ENTRY_APPEND: /* fall through */
58345836
case FIB_EVENT_ENTRY_ADD: /* fall through */
58355837
case FIB_EVENT_ENTRY_DEL:
58365838
fen6_info = container_of(info, struct fib6_entry_notifier_info,

include/net/ip6_route.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
6666
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
6767
}
6868

69-
static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
70-
{
71-
return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
72-
RTF_GATEWAY;
73-
}
74-
7569
void ip6_route_input(struct sk_buff *skb);
7670
struct dst_entry *ip6_route_input_lookup(struct net *net,
7771
struct net_device *dev,

net/ipv6/ip6_fib.c

Lines changed: 71 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -934,19 +934,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
934934
{
935935
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
936936
lockdep_is_held(&rt->fib6_table->tb6_lock));
937-
struct fib6_info *iter = NULL;
937+
struct fib6_info *iter = NULL, *match = NULL;
938938
struct fib6_info __rcu **ins;
939-
struct fib6_info __rcu **fallback_ins = NULL;
940939
int replace = (info->nlh &&
941940
(info->nlh->nlmsg_flags & NLM_F_REPLACE));
941+
int append = (info->nlh &&
942+
(info->nlh->nlmsg_flags & NLM_F_APPEND));
942943
int add = (!info->nlh ||
943944
(info->nlh->nlmsg_flags & NLM_F_CREATE));
944945
int found = 0;
945-
bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
946946
u16 nlflags = NLM_F_EXCL;
947947
int err;
948948

949-
if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
949+
if (append)
950950
nlflags |= NLM_F_APPEND;
951951

952952
ins = &fn->leaf;
@@ -968,13 +968,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
968968

969969
nlflags &= ~NLM_F_EXCL;
970970
if (replace) {
971-
if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
972-
found++;
973-
break;
974-
}
975-
if (rt_can_ecmp)
976-
fallback_ins = fallback_ins ?: ins;
977-
goto next_iter;
971+
found++;
972+
break;
978973
}
979974

980975
if (rt6_duplicate_nexthop(iter, rt)) {
@@ -989,86 +984,67 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
989984
fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
990985
return -EEXIST;
991986
}
992-
/* If we have the same destination and the same metric,
993-
* but not the same gateway, then the route we try to
994-
* add is sibling to this route, increment our counter
995-
* of siblings, and later we will add our route to the
996-
* list.
997-
* Only static routes (which don't have flag
998-
* RTF_EXPIRES) are used for ECMPv6.
999-
*
1000-
* To avoid long list, we only had siblings if the
1001-
* route have a gateway.
1002-
*/
1003-
if (rt_can_ecmp &&
1004-
rt6_qualify_for_ecmp(iter))
1005-
rt->fib6_nsiblings++;
987+
988+
/* first route that matches */
989+
if (!match)
990+
match = iter;
1006991
}
1007992

1008993
if (iter->fib6_metric > rt->fib6_metric)
1009994
break;
1010995

1011-
next_iter:
1012996
ins = &iter->fib6_next;
1013997
}
1014998

1015-
if (fallback_ins && !found) {
1016-
/* No ECMP-able route found, replace first non-ECMP one */
1017-
ins = fallback_ins;
1018-
iter = rcu_dereference_protected(*ins,
1019-
lockdep_is_held(&rt->fib6_table->tb6_lock));
1020-
found++;
1021-
}
1022-
1023999
/* Reset round-robin state, if necessary */
10241000
if (ins == &fn->leaf)
10251001
fn->rr_ptr = NULL;
10261002

10271003
/* Link this route to others same route. */
1028-
if (rt->fib6_nsiblings) {
1029-
unsigned int fib6_nsiblings;
1004+
if (append && match) {
10301005
struct fib6_info *sibling, *temp_sibling;
10311006

1032-
/* Find the first route that have the same metric */
1033-
sibling = leaf;
1034-
while (sibling) {
1035-
if (sibling->fib6_metric == rt->fib6_metric &&
1036-
rt6_qualify_for_ecmp(sibling)) {
1037-
list_add_tail(&rt->fib6_siblings,
1038-
&sibling->fib6_siblings);
1039-
break;
1040-
}
1041-
sibling = rcu_dereference_protected(sibling->fib6_next,
1042-
lockdep_is_held(&rt->fib6_table->tb6_lock));
1007+
if (rt->fib6_flags & RTF_REJECT) {
1008+
NL_SET_ERR_MSG(extack,
1009+
"Can not append a REJECT route");
1010+
return -EINVAL;
1011+
} else if (match->fib6_flags & RTF_REJECT) {
1012+
NL_SET_ERR_MSG(extack,
1013+
"Can not append to a REJECT route");
1014+
return -EINVAL;
10431015
}
1016+
rt->fib6_nsiblings = match->fib6_nsiblings;
1017+
list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
1018+
match->fib6_nsiblings++;
1019+
10441020
/* For each sibling in the list, increment the counter of
10451021
* siblings. BUG() if counters does not match, list of siblings
10461022
* is broken!
10471023
*/
1048-
fib6_nsiblings = 0;
10491024
list_for_each_entry_safe(sibling, temp_sibling,
1050-
&rt->fib6_siblings, fib6_siblings) {
1025+
&match->fib6_siblings, fib6_siblings) {
10511026
sibling->fib6_nsiblings++;
1052-
BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
1053-
fib6_nsiblings++;
1027+
BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
10541028
}
1055-
BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
1056-
rt6_multipath_rebalance(temp_sibling);
1029+
1030+
rt6_multipath_rebalance(match);
10571031
}
10581032

10591033
/*
10601034
* insert node
10611035
*/
10621036
if (!replace) {
1037+
enum fib_event_type event;
1038+
10631039
if (!add)
10641040
pr_warn("NLM_F_CREATE should be set when creating new route\n");
10651041

10661042
add:
10671043
nlflags |= NLM_F_CREATE;
10681044

1069-
err = call_fib6_entry_notifiers(info->nl_net,
1070-
FIB_EVENT_ENTRY_ADD,
1071-
rt, extack);
1045+
event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD;
1046+
err = call_fib6_entry_notifiers(info->nl_net, event, rt,
1047+
extack);
10721048
if (err)
10731049
return err;
10741050

@@ -1086,7 +1062,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
10861062
}
10871063

10881064
} else {
1089-
int nsiblings;
1065+
struct fib6_info *tmp;
10901066

10911067
if (!found) {
10921068
if (add)
@@ -1101,48 +1077,57 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
11011077
if (err)
11021078
return err;
11031079

1080+
/* if route being replaced has siblings, set tmp to
1081+
* last one, otherwise tmp is current route. this is
1082+
* used to set fib6_next for new route
1083+
*/
1084+
if (iter->fib6_nsiblings)
1085+
tmp = list_last_entry(&iter->fib6_siblings,
1086+
struct fib6_info,
1087+
fib6_siblings);
1088+
else
1089+
tmp = iter;
1090+
1091+
/* insert new route */
11041092
atomic_inc(&rt->fib6_ref);
11051093
rcu_assign_pointer(rt->fib6_node, fn);
1106-
rt->fib6_next = iter->fib6_next;
1094+
rt->fib6_next = tmp->fib6_next;
11071095
rcu_assign_pointer(*ins, rt);
1096+
11081097
if (!info->skip_notify)
11091098
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
11101099
if (!(fn->fn_flags & RTN_RTINFO)) {
11111100
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
11121101
fn->fn_flags |= RTN_RTINFO;
11131102
}
1114-
nsiblings = iter->fib6_nsiblings;
1115-
iter->fib6_node = NULL;
1116-
fib6_purge_rt(iter, fn, info->nl_net);
1117-
if (rcu_access_pointer(fn->rr_ptr) == iter)
1118-
fn->rr_ptr = NULL;
1119-
fib6_info_release(iter);
11201103

1121-
if (nsiblings) {
1104+
/* delete old route */
1105+
rt = iter;
1106+
1107+
if (rt->fib6_nsiblings) {
1108+
struct fib6_info *tmp;
1109+
11221110
/* Replacing an ECMP route, remove all siblings */
1123-
ins = &rt->fib6_next;
1124-
iter = rcu_dereference_protected(*ins,
1125-
lockdep_is_held(&rt->fib6_table->tb6_lock));
1126-
while (iter) {
1127-
if (iter->fib6_metric > rt->fib6_metric)
1128-
break;
1129-
if (rt6_qualify_for_ecmp(iter)) {
1130-
*ins = iter->fib6_next;
1131-
iter->fib6_node = NULL;
1132-
fib6_purge_rt(iter, fn, info->nl_net);
1133-
if (rcu_access_pointer(fn->rr_ptr) == iter)
1134-
fn->rr_ptr = NULL;
1135-
fib6_info_release(iter);
1136-
nsiblings--;
1137-
info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
1138-
} else {
1139-
ins = &iter->fib6_next;
1140-
}
1141-
iter = rcu_dereference_protected(*ins,
1142-
lockdep_is_held(&rt->fib6_table->tb6_lock));
1111+
list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
1112+
fib6_siblings) {
1113+
iter->fib6_node = NULL;
1114+
fib6_purge_rt(iter, fn, info->nl_net);
1115+
if (rcu_access_pointer(fn->rr_ptr) == iter)
1116+
fn->rr_ptr = NULL;
1117+
fib6_info_release(iter);
1118+
1119+
rt->fib6_nsiblings--;
1120+
info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
11431121
}
1144-
WARN_ON(nsiblings != 0);
11451122
}
1123+
1124+
WARN_ON(rt->fib6_nsiblings != 0);
1125+
1126+
rt->fib6_node = NULL;
1127+
fib6_purge_rt(rt, fn, info->nl_net);
1128+
if (rcu_access_pointer(fn->rr_ptr) == rt)
1129+
fn->rr_ptr = NULL;
1130+
fib6_info_release(rt);
11461131
}
11471132

11481133
return 0;

net/ipv6/route.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3791,7 +3791,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
37913791
lockdep_is_held(&rt->fib6_table->tb6_lock));
37923792
while (iter) {
37933793
if (iter->fib6_metric == rt->fib6_metric &&
3794-
rt6_qualify_for_ecmp(iter))
3794+
iter->fib6_nsiblings)
37953795
return iter;
37963796
iter = rcu_dereference_protected(iter->fib6_next,
37973797
lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -4381,6 +4381,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
43814381
*/
43824382
cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
43834383
NLM_F_REPLACE);
4384+
cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
43844385
nhn++;
43854386
}
43864387

0 commit comments

Comments
 (0)