Skip to content

Commit a6db449

Browse files
David Ahern authored and davem330 committed
net: ipv4: Consider failed nexthops in multipath routes
Multipath route lookups should consider knowledge about next hops and not
select a hop that is known to be failed.

Example:

              [h2]                   [h3]   15.0.0.5
               |                      |
              3|                     3|
             [SP1]                  [SP2]--+
              1  2                   1     2
              |  |     /-------------+     |
              |   \   /                    |
              |     X                      |
              |    / \                     |
              |   /   \---------------\    |
              1  2                     1   2
  12.0.0.2  [TOR1] 3-----------------3 [TOR2] 12.0.0.3
              4                         4
               \                       /
                 \                    /
                  \                  /
                   -------|   |-----/
                          1   2
                         [TOR3]
                           3|
                            |
                           [h1]  12.0.0.1

host h1 with IP 12.0.0.1 has 2 paths to host h3 at 15.0.0.5:

    root@h1:~# ip ro ls
    ...
    12.0.0.0/24 dev swp1 proto kernel scope link src 12.0.0.1
    15.0.0.0/16
            nexthop via 12.0.0.2 dev swp1 weight 1
            nexthop via 12.0.0.3 dev swp1 weight 1
    ...

If the link between tor3 and tor1 is down, and the link between tor1 and
tor2 is down as well, then tor1 is effectively cut off from h1. Yet the
route lookups in h1 are alternating between the 2 routes: ping 15.0.0.5
gets one and ssh 15.0.0.5 gets the other. Connections that attempt to use
the 12.0.0.2 nexthop fail since that neighbor is not reachable:

    root@h1:~# ip neigh show
    ...
    12.0.0.3 dev swp1 lladdr 00:02:00:00:00:1b REACHABLE
    12.0.0.2 dev swp1 FAILED
    ...

The failed path can be avoided by considering known neighbor information
when selecting next hops. If the neighbor lookup fails we have no
knowledge about the nexthop, so give it a shot. If there is an entry then
only select the nexthop if the state is sane. This is similar to what
fib_detect_death does.

To maintain backward compatibility, use of the neighbor information is
based on a new sysctl, fib_multipath_use_neigh.

Signed-off-by: David Ahern <[email protected]>
Reviewed-by: Julian Anastasov <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 0b0e30c commit a6db449

File tree

4 files changed: 53 additions and 5 deletions.

Documentation/networking/ip-sysctl.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,16 @@ fwmark_reflect - BOOLEAN
6363
fwmark of the packet they are replying to.
6464
Default: 0
6565

66+
fib_multipath_use_neigh - BOOLEAN
67+
Use status of existing neighbor entry when determining nexthop for
68+
multipath routes. If disabled, neighbor information is not used and
69+
packets could be directed to a failed nexthop. Only valid for kernels
70+
built with CONFIG_IP_ROUTE_MULTIPATH enabled.
71+
Default: 0 (disabled)
72+
Possible values:
73+
0 - disabled
74+
1 - enabled
75+
6676
route/max_size - INTEGER
6777
Maximum number of routes allowed in the kernel. Increase
6878
this when using large numbers of interfaces and/or routes.

include/net/netns/ipv4.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ struct netns_ipv4 {
132132
struct list_head mr_tables;
133133
struct fib_rules_ops *mr_rules_ops;
134134
#endif
135+
#endif
136+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
137+
int sysctl_fib_multipath_use_neigh;
135138
#endif
136139
atomic_t rt_genid;
137140
};

net/ipv4/fib_semantics.c

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,21 +1559,45 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
15591559
}
15601560

15611561
#ifdef CONFIG_IP_ROUTE_MULTIPATH
1562+
static bool fib_good_nh(const struct fib_nh *nh)
1563+
{
1564+
int state = NUD_REACHABLE;
1565+
1566+
if (nh->nh_scope == RT_SCOPE_LINK) {
1567+
struct neighbour *n;
1568+
1569+
rcu_read_lock_bh();
1570+
1571+
n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw);
1572+
if (n)
1573+
state = n->nud_state;
1574+
1575+
rcu_read_unlock_bh();
1576+
}
1577+
1578+
return !!(state & NUD_VALID);
1579+
}
15621580

15631581
void fib_select_multipath(struct fib_result *res, int hash)
15641582
{
15651583
struct fib_info *fi = res->fi;
1584+
struct net *net = fi->fib_net;
1585+
bool first = false;
15661586

15671587
for_nexthops(fi) {
15681588
if (hash > atomic_read(&nh->nh_upper_bound))
15691589
continue;
15701590

1571-
res->nh_sel = nhsel;
1572-
return;
1591+
if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
1592+
fib_good_nh(nh)) {
1593+
res->nh_sel = nhsel;
1594+
return;
1595+
}
1596+
if (!first) {
1597+
res->nh_sel = nhsel;
1598+
first = true;
1599+
}
15731600
} endfor_nexthops(fi);
1574-
1575-
/* Race condition: route has just become dead. */
1576-
res->nh_sel = 0;
15771601
}
15781602
#endif
15791603

net/ipv4/sysctl_net_ipv4.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,17 @@ static struct ctl_table ipv4_net_table[] = {
960960
.mode = 0644,
961961
.proc_handler = proc_dointvec,
962962
},
963+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
964+
{
965+
.procname = "fib_multipath_use_neigh",
966+
.data = &init_net.ipv4.sysctl_fib_multipath_use_neigh,
967+
.maxlen = sizeof(int),
968+
.mode = 0644,
969+
.proc_handler = proc_dointvec_minmax,
970+
.extra1 = &zero,
971+
.extra2 = &one,
972+
},
973+
#endif
963974
{ }
964975
};
965976

0 commit comments

Comments
 (0)