Skip to content

Commit 51ebd31

Browse files
NicolasDichteldavem330
authored andcommitted
ipv6: add support of equal cost multipath (ECMP)
Each nexthop is added like a single route in the routing table. All routes that have the same metric/weight and destination but not the same gateway are considered ECMP routes. They are linked together through a list called rt6i_siblings. ECMP routes can be added in one shot, with the RTA_MULTIPATH attribute, or one after the other (in both cases, the flag NLM_F_EXCL should not be set). The patch is based on previous work from Luc Saillard <[email protected]>. Signed-off-by: Nicolas Dichtel <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent d94ce9b commit 51ebd31

File tree

3 files changed

+200
-3
lines changed

3 files changed

+200
-3
lines changed

include/net/ip6_fib.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ struct fib6_config {
4747
unsigned long fc_expires;
4848
struct nlattr *fc_mx;
4949
int fc_mx_len;
50+
int fc_mp_len;
51+
struct nlattr *fc_mp;
5052

5153
struct nl_info fc_nlinfo;
5254
};
@@ -99,6 +101,14 @@ struct rt6_info {
99101

100102
struct in6_addr rt6i_gateway;
101103

104+
/* Multipath routes:
105+
* siblings is a list of rt6_info that have the same metric/weight,
106+
* destination, but not the same gateway. nsiblings is just a cache
107+
* to speed up lookup.
108+
*/
109+
struct list_head rt6i_siblings;
110+
unsigned int rt6i_nsiblings;
111+
102112
atomic_t rt6i_ref;
103113

104114
/* These are in a separate cache line. */

net/ipv6/ip6_fib.c

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
672672
iter->rt6i_idev == rt->rt6i_idev &&
673673
ipv6_addr_equal(&iter->rt6i_gateway,
674674
&rt->rt6i_gateway)) {
675+
if (rt->rt6i_nsiblings)
676+
rt->rt6i_nsiblings = 0;
675677
if (!(iter->rt6i_flags & RTF_EXPIRES))
676678
return -EEXIST;
677679
if (!(rt->rt6i_flags & RTF_EXPIRES))
@@ -680,6 +682,21 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
680682
rt6_set_expires(iter, rt->dst.expires);
681683
return -EEXIST;
682684
}
685+
/* If we have the same destination and the same metric,
686+
* but not the same gateway, then the route we try to
687+
* add is sibling to this route, increment our counter
688+
* of siblings, and later we will add our route to the
689+
* list.
690+
* Only static routes (which don't have flag
691+
* RTF_EXPIRES) are used for ECMPv6.
692+
*
693+
* To avoid long lists, we only add siblings if the
694+
* route has a gateway.
695+
*/
696+
if (rt->rt6i_flags & RTF_GATEWAY &&
697+
!(rt->rt6i_flags & RTF_EXPIRES) &&
698+
!(iter->rt6i_flags & RTF_EXPIRES))
699+
rt->rt6i_nsiblings++;
683700
}
684701

685702
if (iter->rt6i_metric > rt->rt6i_metric)
@@ -692,6 +709,35 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
692709
if (ins == &fn->leaf)
693710
fn->rr_ptr = NULL;
694711

712+
/* Link this route to others same route. */
713+
if (rt->rt6i_nsiblings) {
714+
unsigned int rt6i_nsiblings;
715+
struct rt6_info *sibling, *temp_sibling;
716+
717+
/* Find the first route that has the same metric */
718+
sibling = fn->leaf;
719+
while (sibling) {
720+
if (sibling->rt6i_metric == rt->rt6i_metric) {
721+
list_add_tail(&rt->rt6i_siblings,
722+
&sibling->rt6i_siblings);
723+
break;
724+
}
725+
sibling = sibling->dst.rt6_next;
726+
}
727+
/* For each sibling in the list, increment the counter of
728+
* siblings. BUG() if the counters do not match, the list of siblings
729+
* is broken!
730+
*/
731+
rt6i_nsiblings = 0;
732+
list_for_each_entry_safe(sibling, temp_sibling,
733+
&rt->rt6i_siblings, rt6i_siblings) {
734+
sibling->rt6i_nsiblings++;
735+
BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
736+
rt6i_nsiblings++;
737+
}
738+
BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
739+
}
740+
695741
/*
696742
* insert node
697743
*/
@@ -1193,6 +1239,17 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
11931239
if (fn->rr_ptr == rt)
11941240
fn->rr_ptr = NULL;
11951241

1242+
/* Remove this entry from other siblings */
1243+
if (rt->rt6i_nsiblings) {
1244+
struct rt6_info *sibling, *next_sibling;
1245+
1246+
list_for_each_entry_safe(sibling, next_sibling,
1247+
&rt->rt6i_siblings, rt6i_siblings)
1248+
sibling->rt6i_nsiblings--;
1249+
rt->rt6i_nsiblings = 0;
1250+
list_del_init(&rt->rt6i_siblings);
1251+
}
1252+
11961253
/* Adjust walkers */
11971254
read_lock(&fib6_walker_lock);
11981255
FOR_WALKERS(w) {

net/ipv6/route.c

Lines changed: 133 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
#include <net/xfrm.h>
5858
#include <net/netevent.h>
5959
#include <net/netlink.h>
60+
#include <net/nexthop.h>
6061

6162
#include <asm/uaccess.h>
6263

@@ -289,6 +290,8 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
289290
memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
290291
rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
291292
rt->rt6i_genid = rt_genid(net);
293+
INIT_LIST_HEAD(&rt->rt6i_siblings);
294+
rt->rt6i_nsiblings = 0;
292295
}
293296
return rt;
294297
}
@@ -385,6 +388,69 @@ static bool rt6_need_strict(const struct in6_addr *daddr)
385388
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
386389
}
387390

391+
/* Multipath route selection:
392+
* Hash based function using packet header and flowlabel.
393+
* Adapted from fib_info_hashfn()
394+
*/
395+
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
396+
const struct flowi6 *fl6)
397+
{
398+
unsigned int val = fl6->flowi6_proto;
399+
400+
val ^= fl6->daddr.s6_addr32[0];
401+
val ^= fl6->daddr.s6_addr32[1];
402+
val ^= fl6->daddr.s6_addr32[2];
403+
val ^= fl6->daddr.s6_addr32[3];
404+
405+
val ^= fl6->saddr.s6_addr32[0];
406+
val ^= fl6->saddr.s6_addr32[1];
407+
val ^= fl6->saddr.s6_addr32[2];
408+
val ^= fl6->saddr.s6_addr32[3];
409+
410+
/* Work only if this not encapsulated */
411+
switch (fl6->flowi6_proto) {
412+
case IPPROTO_UDP:
413+
case IPPROTO_TCP:
414+
case IPPROTO_SCTP:
415+
val ^= fl6->fl6_sport;
416+
val ^= fl6->fl6_dport;
417+
break;
418+
419+
case IPPROTO_ICMPV6:
420+
val ^= fl6->fl6_icmp_type;
421+
val ^= fl6->fl6_icmp_code;
422+
break;
423+
}
424+
/* RFC6438 recommands to use flowlabel */
425+
val ^= fl6->flowlabel;
426+
427+
/* Perhaps, we need to tune, this function? */
428+
val = val ^ (val >> 7) ^ (val >> 12);
429+
return val % candidate_count;
430+
}
431+
432+
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
433+
struct flowi6 *fl6)
434+
{
435+
struct rt6_info *sibling, *next_sibling;
436+
int route_choosen;
437+
438+
route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
439+
/* Don't change the route, if route_choosen == 0
440+
* (siblings does not include ourself)
441+
*/
442+
if (route_choosen)
443+
list_for_each_entry_safe(sibling, next_sibling,
444+
&match->rt6i_siblings, rt6i_siblings) {
445+
route_choosen--;
446+
if (route_choosen == 0) {
447+
match = sibling;
448+
break;
449+
}
450+
}
451+
return match;
452+
}
453+
388454
/*
389455
* Route lookup. Any table->tb6_lock is implied.
390456
*/
@@ -702,6 +768,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
702768
restart:
703769
rt = fn->leaf;
704770
rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
771+
if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
772+
rt = rt6_multipath_select(rt, fl6);
705773
BACKTRACK(net, &fl6->saddr);
706774
out:
707775
dst_use(&rt->dst, jiffies);
@@ -863,7 +931,8 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
863931

864932
restart:
865933
rt = rt6_select(fn, oif, strict | reachable);
866-
934+
if (rt->rt6i_nsiblings && oif == 0)
935+
rt = rt6_multipath_select(rt, fl6);
867936
BACKTRACK(net, &fl6->saddr);
868937
if (rt == net->ipv6.ip6_null_entry ||
869938
rt->rt6i_flags & RTF_CACHE)
@@ -2249,6 +2318,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
22492318
[RTA_IIF] = { .type = NLA_U32 },
22502319
[RTA_PRIORITY] = { .type = NLA_U32 },
22512320
[RTA_METRICS] = { .type = NLA_NESTED },
2321+
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
22522322
};
22532323

22542324
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2326,11 +2396,65 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
23262396
if (tb[RTA_TABLE])
23272397
cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
23282398

2399+
if (tb[RTA_MULTIPATH]) {
2400+
cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2401+
cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2402+
}
2403+
23292404
err = 0;
23302405
errout:
23312406
return err;
23322407
}
23332408

2409+
static int ip6_route_multipath(struct fib6_config *cfg, int add)
2410+
{
2411+
struct fib6_config r_cfg;
2412+
struct rtnexthop *rtnh;
2413+
int remaining;
2414+
int attrlen;
2415+
int err = 0, last_err = 0;
2416+
2417+
beginning:
2418+
rtnh = (struct rtnexthop *)cfg->fc_mp;
2419+
remaining = cfg->fc_mp_len;
2420+
2421+
/* Parse a Multipath Entry */
2422+
while (rtnh_ok(rtnh, remaining)) {
2423+
memcpy(&r_cfg, cfg, sizeof(*cfg));
2424+
if (rtnh->rtnh_ifindex)
2425+
r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2426+
2427+
attrlen = rtnh_attrlen(rtnh);
2428+
if (attrlen > 0) {
2429+
struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2430+
2431+
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2432+
if (nla) {
2433+
nla_memcpy(&r_cfg.fc_gateway, nla, 16);
2434+
r_cfg.fc_flags |= RTF_GATEWAY;
2435+
}
2436+
}
2437+
err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2438+
if (err) {
2439+
last_err = err;
2440+
/* If we are trying to remove a route, do not stop the
2441+
* loop when ip6_route_del() fails (because next hop is
2442+
* already gone), we should try to remove all next hops.
2443+
*/
2444+
if (add) {
2445+
/* If add fails, we should try to delete all
2446+
* next hops that have been already added.
2447+
*/
2448+
add = 0;
2449+
goto beginning;
2450+
}
2451+
}
2452+
rtnh = rtnh_next(rtnh, &remaining);
2453+
}
2454+
2455+
return last_err;
2456+
}
2457+
23342458
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
23352459
{
23362460
struct fib6_config cfg;
@@ -2340,7 +2464,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
23402464
if (err < 0)
23412465
return err;
23422466

2343-
return ip6_route_del(&cfg);
2467+
if (cfg.fc_mp)
2468+
return ip6_route_multipath(&cfg, 0);
2469+
else
2470+
return ip6_route_del(&cfg);
23442471
}
23452472

23462473
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -2352,7 +2479,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
23522479
if (err < 0)
23532480
return err;
23542481

2355-
return ip6_route_add(&cfg);
2482+
if (cfg.fc_mp)
2483+
return ip6_route_multipath(&cfg, 1);
2484+
else
2485+
return ip6_route_add(&cfg);
23562486
}
23572487

23582488
static inline size_t rt6_nlmsg_size(void)

0 commit comments

Comments
 (0)