Skip to content

Commit 8448f91

Browse files
committed
Merge branch 'ipv6-Add-support-for-non-equal-cost-multipath'
Ido Schimmel says: ==================== ipv6: Add support for non-equal-cost multipath This set aims to add support for IPv6 non-equal-cost multipath routes. The first three patches convert multipath selection to use the hash-threshold method (RFC 2992) instead of modulo-N. The same method is employed by the IPv4 routing code since commit 0e884c7 ("ipv4: L3 hash-based multipath"). Unlike modulo-N, with hash-threshold only the flows near the region boundaries are affected when a nexthop is added or removed. In addition, it allows us to easily add support for non-equal-cost multipath in the last patch by sizing the different regions according to the provided weights. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents e2b3b35 + 398958a commit 8448f91

File tree

4 files changed

+126
-32
lines changed

4 files changed

+126
-32
lines changed

include/net/ip6_fib.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ struct rt6_info {
149149
*/
150150
struct list_head rt6i_siblings;
151151
unsigned int rt6i_nsiblings;
152+
atomic_t rt6i_nh_upper_bound;
152153

153154
atomic_t rt6i_ref;
154155

@@ -170,6 +171,7 @@ struct rt6_info {
170171
u32 rt6i_metric;
171172
u32 rt6i_pmtu;
172173
/* more non-fragment space at head required */
174+
int rt6i_nh_weight;
173175
unsigned short rt6i_nfheader_len;
174176
u8 rt6i_protocol;
175177
u8 exception_bucket_flushed:1,

include/net/ip6_route.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
6666
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
6767
}
6868

69+
/* Return true when, out of RTF_GATEWAY, RTF_ADDRCONF and RTF_DYNAMIC,
 * the only flag set in rt->rt6i_flags is RTF_GATEWAY.
 *
 * NOTE(review): judging by the name, this is the eligibility test for
 * grouping a route with multipath siblings - confirm against callers.
 */
static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt)
70+
{
71+
return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
72+
RTF_GATEWAY;
73+
}
74+
6975
void ip6_route_input(struct sk_buff *skb);
7076
struct dst_entry *ip6_route_input_lookup(struct net *net,
7177
struct net_device *dev,
@@ -171,6 +177,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
171177
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
172178
void rt6_disable_ip(struct net_device *dev, unsigned long event);
173179
void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
180+
void rt6_multipath_rebalance(struct rt6_info *rt);
174181

175182
static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
176183
{

net/ipv6/ip6_fib.c

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -796,12 +796,6 @@ static struct fib6_node *fib6_add_1(struct net *net,
796796
return ln;
797797
}
798798

799-
static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
800-
{
801-
return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
802-
RTF_GATEWAY;
803-
}
804-
805799
static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
806800
{
807801
int i;
@@ -991,6 +985,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
991985
rt6i_nsiblings++;
992986
}
993987
BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
988+
rt6_multipath_rebalance(temp_sibling);
994989
}
995990

996991
/*
@@ -1672,6 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
16721667
sibling->rt6i_nsiblings--;
16731668
rt->rt6i_nsiblings = 0;
16741669
list_del_init(&rt->rt6i_siblings);
1670+
rt6_multipath_rebalance(next_sibling);
16751671
}
16761672

16771673
/* Adjust walkers */

net/ipv6/route.c

Lines changed: 115 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -455,36 +455,26 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
455455
int strict)
456456
{
457457
struct rt6_info *sibling, *next_sibling;
458-
int route_choosen;
459458

460459
/* We might have already computed the hash for ICMPv6 errors. In such
461460
* case it will always be non-zero. Otherwise now is the time to do it.
462461
*/
463462
if (!fl6->mp_hash)
464463
fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465464

466-
route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467-
/* Don't change the route, if route_choosen == 0
468-
* (siblings does not include ourself)
469-
*/
470-
if (route_choosen)
471-
list_for_each_entry_safe(sibling, next_sibling,
472-
&match->rt6i_siblings, rt6i_siblings) {
473-
route_choosen--;
474-
if (route_choosen == 0) {
475-
struct inet6_dev *idev = sibling->rt6i_idev;
476-
477-
if (sibling->rt6i_nh_flags & RTNH_F_DEAD)
478-
break;
479-
if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN &&
480-
idev->cnf.ignore_routes_with_linkdown)
481-
break;
482-
if (rt6_score_route(sibling, oif, strict) < 0)
483-
break;
484-
match = sibling;
485-
break;
486-
}
487-
}
465+
if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
466+
return match;
467+
468+
list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
469+
rt6i_siblings) {
470+
if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
471+
continue;
472+
if (rt6_score_route(sibling, oif, strict) < 0)
473+
break;
474+
match = sibling;
475+
break;
476+
}
477+
488478
return match;
489479
}
490480

@@ -1833,10 +1823,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
18331823

18341824
if (skb) {
18351825
ip6_multipath_l3_keys(skb, &hash_keys);
1836-
return flow_hash_from_keys(&hash_keys);
1826+
return flow_hash_from_keys(&hash_keys) >> 1;
18371827
}
18381828

1839-
return get_hash_from_flowi6(fl6);
1829+
return get_hash_from_flowi6(fl6) >> 1;
18401830
}
18411831

18421832
void ip6_route_input(struct sk_buff *skb)
@@ -2604,6 +2594,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
26042594
#endif
26052595

26062596
rt->rt6i_metric = cfg->fc_metric;
2597+
rt->rt6i_nh_weight = 1;
26072598

26082599
/* We cannot add true routes via loopback here,
26092600
they would result in kernel looping; promote them to reject routes
@@ -3481,6 +3472,99 @@ struct arg_netdev_event {
34813472
};
34823473
};
34833474

3475+
/* Find the first route in rt's FIB node that can head rt's multipath group.
 *
 * Walks the route list starting at the leaf of rt->rt6i_node and following
 * rt6_next, and returns the first entry whose metric equals rt's metric and
 * which passes rt6_qualify_for_ecmp(). Returns NULL when no entry matches.
 *
 * The rcu_dereference_protected() annotations name
 * rt->rt6i_table->tb6_lock as the lock expected to be held by the caller.
 */
static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3476+
{
3477+
struct rt6_info *iter;
3478+
struct fib6_node *fn;
3479+

fn = rcu_dereference_protected(rt->rt6i_node,
3481+
lockdep_is_held(&rt->rt6i_table->tb6_lock));
3482+
iter = rcu_dereference_protected(fn->leaf,
3483+
lockdep_is_held(&rt->rt6i_table->tb6_lock));
3484+
while (iter) {
3485+
if (iter->rt6i_metric == rt->rt6i_metric &&
3486+
rt6_qualify_for_ecmp(iter))
3487+
return iter;
3488+
iter = rcu_dereference_protected(iter->rt6_next,
3489+
lockdep_is_held(&rt->rt6i_table->tb6_lock));
3490+
}
3491+

return NULL;
3493+
}
3494+
3495+
/* Tell whether rt's nexthop must be left out of multipath balancing.
 *
 * Returns true when RTNH_F_DEAD is set in rt->rt6i_nh_flags, or when
 * RTNH_F_LINKDOWN is set and the route's inet6 device has
 * cnf.ignore_routes_with_linkdown enabled. Returns false otherwise.
 */
static bool rt6_is_dead(const struct rt6_info *rt)
3496+
{
3497+
if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3498+
(rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3499+
rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3500+
return true;
3501+

return false;
3503+
}
3504+
3505+
/* Sum rt6i_nh_weight over rt and every route on its rt6i_siblings list,
 * skipping any route for which rt6_is_dead() returns true.
 *
 * Returns the accumulated weight; 0 if rt and all of its siblings are dead.
 */
static int rt6_multipath_total_weight(const struct rt6_info *rt)
3506+
{
3507+
struct rt6_info *iter;
3508+
int total = 0;
3509+

if (!rt6_is_dead(rt))
3511+
total += rt->rt6i_nh_weight;
3512+

list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3514+
if (!rt6_is_dead(iter))
3515+
total += iter->rt6i_nh_weight;
3516+
}
3517+

return total;
3519+
}
3520+
3521+
/* Assign rt's hash-threshold upper bound.
 *
 * @rt:     route whose rt6i_nh_upper_bound is written
 * @weight: running cumulative weight; increased by rt's weight only when
 *          rt is not dead
 * @total:  divisor for the scaled cumulative weight
 *
 * A live route gets (cumulative weight << 31) / total, rounded to the
 * closest integer, minus one. A dead route gets -1 and leaves *weight
 * untouched.
 *
 * NOTE(review): the 31-bit scale appears to pair with the ">> 1" applied
 * to the hash in rt6_multipath_hash() - confirm.
 */
static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3522+
{
3523+
int upper_bound = -1;
3524+

if (!rt6_is_dead(rt)) {
3526+
*weight += rt->rt6i_nh_weight;
3527+
upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3528+
total) - 1;
3529+
}
3530+
atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3531+
}
3532+
3533+
/* Assign upper bounds to rt and then to each route on its rt6i_siblings
 * list, in list order, threading one cumulative weight (starting at 0)
 * through successive rt6_upper_bound_set() calls.
 *
 * @total is passed unchanged to every call as the divisor.
 */
static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3534+
{
3535+
struct rt6_info *iter;
3536+
int weight = 0;
3537+

rt6_upper_bound_set(rt, &weight, total);
3539+

list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3541+
rt6_upper_bound_set(iter, &weight, total);
3542+
}
3543+
3544+
/* Recompute the hash-threshold upper bounds of the multipath group that
 * rt belongs to.
 *
 * Does nothing when rt has no siblings or has should_flush set. Otherwise
 * locates the group's first sibling, sums the weights of the non-dead
 * members and reassigns every member's upper bound starting from that
 * first sibling. Warns once and returns if no first sibling is found.
 */
void rt6_multipath_rebalance(struct rt6_info *rt)
3545+
{
3546+
struct rt6_info *first;
3547+
int total;
3548+

/* In case the entire multipath route was marked for flushing,
3550+
* then there is no need to rebalance upon the removal of every
3551+
* sibling route.
3552+
*/
3553+
if (!rt->rt6i_nsiblings || rt->should_flush)
3554+
return;
3555+

/* During lookup routes are evaluated in order, so we need to
3557+
* make sure upper bounds are assigned from the first sibling
3558+
* onwards.
3559+
*/
3560+
first = rt6_multipath_first_sibling(rt);
3561+
if (WARN_ON_ONCE(!first))
3562+
return;
3563+

total = rt6_multipath_total_weight(first);
3565+
rt6_multipath_upper_bound_set(first, total);
3566+
}
3567+
34843568
static int fib6_ifup(struct rt6_info *rt, void *p_arg)
34853569
{
34863570
const struct arg_netdev_event *arg = p_arg;
@@ -3489,6 +3573,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg)
34893573
if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
34903574
rt->rt6i_nh_flags &= ~arg->nh_flags;
34913575
fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3576+
rt6_multipath_rebalance(rt);
34923577
}
34933578

34943579
return 0;
@@ -3588,13 +3673,15 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
35883673
rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
35893674
RTNH_F_LINKDOWN);
35903675
fib6_update_sernum(rt);
3676+
rt6_multipath_rebalance(rt);
35913677
}
35923678
return -2;
35933679
case NETDEV_CHANGE:
35943680
if (rt->dst.dev != dev ||
35953681
rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
35963682
break;
35973683
rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3684+
rt6_multipath_rebalance(rt);
35983685
break;
35993686
}
36003687

@@ -3938,6 +4025,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
39384025
goto cleanup;
39394026
}
39404027

4028+
rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4029+
39414030
err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
39424031
if (err) {
39434032
dst_release_immediate(&rt->dst);
@@ -4160,7 +4249,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
41604249
if (!rtnh)
41614250
goto nla_put_failure;
41624251

4163-
rtnh->rtnh_hops = 0;
4252+
rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
41644253
rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
41654254

41664255
if (rt6_nexthop_info(skb, rt, &flags, true) < 0)

0 commit comments

Comments
 (0)