Skip to content

Commit 544fe7c

Browse files
roidayan authored and Saeed Mahameed committed
net/mlx5e: Activate HW multipath and handle port affinity based on FIB events
To support multipath offload we are going to track SW multipath route and related nexthops. To do that we register to FIB notifier and handle the route and next-hops events and reflect that as port affinity to HW. When there is a new multipath route entry that all next-hops are the ports of an HCA we will activate LAG in HW. Egress-wise, we use HW LAG as the means to emulate multipath on current HW which doesn't support port selection based on xmit hash. In the presence of multiple VFs which use multiple SQs (send queues) this yields fairly good distribution. HA-wise, HW LAG buys us the ability for a given RQ (receive queue) to receive traffic from both ports and for SQs to migrate xmitting over the active port if their base port fails. When the route entry is being updated to single path we will update the HW port affinity to use that port only. If a next-hop becomes dead we update the HW port affinity to the living port. When all next-hops are alive again we reset the affinity to default. Due to FW/HW limitations, when a route is deleted we are not disabling the HW LAG since doing so will not allow us to enable it again while VFs are bound. Typically this is just a temporary state when a routing daemon removes dead routes and later adds them back as needed. This patch only handles events for AF_INET. Signed-off-by: Roi Dayan <[email protected]> Reviewed-by: Or Gerlitz <[email protected]> Signed-off-by: Saeed Mahameed <[email protected]>
1 parent 724b509 commit 544fe7c

File tree

6 files changed

+326
-0
lines changed

6 files changed

+326
-0
lines changed

drivers/net/ethernet/mellanox/mlx5/core/eswitch.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2476,3 +2476,10 @@ bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1)
24762476

24772477
return false;
24782478
}
2479+
2480+
bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
2481+
struct mlx5_core_dev *dev1)
2482+
{
2483+
return (dev0->priv.eswitch->mode == SRIOV_OFFLOADS &&
2484+
dev1->priv.eswitch->mode == SRIOV_OFFLOADS);
2485+
}

drivers/net/ethernet/mellanox/mlx5/core/eswitch.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,8 @@ static inline bool mlx5_eswitch_vlan_actions_supported(struct mlx5_core_dev *dev
371371

372372
bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0,
373373
struct mlx5_core_dev *dev1);
374+
bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
375+
struct mlx5_core_dev *dev1);
374376

375377
#define MLX5_DEBUG_ESWITCH_MASK BIT(3)
376378

drivers/net/ethernet/mellanox/mlx5/core/lag.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include "mlx5_core.h"
3737
#include "eswitch.h"
3838
#include "lag.h"
39+
#include "lag_mp.h"
3940

4041
/* General purpose, use for short periods of time.
4142
* Beware of lock dependencies (preferably, no locks should be acquired
@@ -559,6 +560,7 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
559560
{
560561
struct mlx5_lag *ldev = NULL;
561562
struct mlx5_core_dev *tmp_dev;
563+
int err;
562564

563565
if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
564566
!MLX5_CAP_GEN(dev, lag_master) ||
@@ -586,6 +588,11 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
586588
mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
587589
}
588590
}
591+
592+
err = mlx5_lag_mp_init(ldev);
593+
if (err)
594+
mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
595+
err);
589596
}
590597

591598
int mlx5_lag_get_pf_num(struct mlx5_core_dev *dev, int *pf_num)
@@ -631,6 +638,7 @@ void mlx5_lag_remove(struct mlx5_core_dev *dev)
631638
if (i == MLX5_MAX_PORTS) {
632639
if (ldev->nb.notifier_call)
633640
unregister_netdevice_notifier(&ldev->nb);
641+
mlx5_lag_mp_cleanup(ldev);
634642
cancel_delayed_work_sync(&ldev->bond_work);
635643
mlx5_lag_dev_free(ldev);
636644
}

drivers/net/ethernet/mellanox/mlx5/core/lag.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#define __MLX5_LAG_H__
66

77
#include "mlx5_core.h"
8+
#include "lag_mp.h"
89

910
enum {
1011
MLX5_LAG_FLAG_ROCE = 1 << 0,
@@ -38,6 +39,7 @@ struct mlx5_lag {
3839
struct workqueue_struct *wq;
3940
struct delayed_work bond_work;
4041
struct notifier_block nb;
42+
struct lag_mp lag_mp;
4143
};
4244

4345
static inline struct mlx5_lag *

drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,18 @@
33

44
#include <linux/netdevice.h>
55
#include "lag.h"
6+
#include "lag_mp.h"
67
#include "mlx5_core.h"
78
#include "eswitch.h"
89

10+
static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev)
11+
{
12+
if (!ldev->pf[0].dev || !ldev->pf[1].dev)
13+
return false;
14+
15+
return mlx5_esw_multipath_prereq(ldev->pf[0].dev, ldev->pf[1].dev);
16+
}
17+
918
static bool __mlx5_lag_is_multipath(struct mlx5_lag *ldev)
1019
{
1120
return !!(ldev->flags & MLX5_LAG_FLAG_MULTIPATH);
@@ -21,3 +30,275 @@ bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev)
2130

2231
return res;
2332
}
33+
34+
/**
35+
* Set lag port affinity
36+
*
37+
* @ldev: lag device
38+
* @port:
39+
* 0 - set normal affinity.
40+
* 1 - set affinity to port 1.
41+
* 2 - set affinity to port 2.
42+
*
43+
**/
44+
static void mlx5_lag_set_port_affinity(struct mlx5_lag *ldev, int port)
45+
{
46+
struct lag_tracker tracker;
47+
48+
if (!__mlx5_lag_is_multipath(ldev))
49+
return;
50+
51+
switch (port) {
52+
case 0:
53+
tracker.netdev_state[0].tx_enabled = true;
54+
tracker.netdev_state[1].tx_enabled = true;
55+
tracker.netdev_state[0].link_up = true;
56+
tracker.netdev_state[1].link_up = true;
57+
break;
58+
case 1:
59+
tracker.netdev_state[0].tx_enabled = true;
60+
tracker.netdev_state[0].link_up = true;
61+
tracker.netdev_state[1].tx_enabled = false;
62+
tracker.netdev_state[1].link_up = false;
63+
break;
64+
case 2:
65+
tracker.netdev_state[0].tx_enabled = false;
66+
tracker.netdev_state[0].link_up = false;
67+
tracker.netdev_state[1].tx_enabled = true;
68+
tracker.netdev_state[1].link_up = true;
69+
break;
70+
default:
71+
mlx5_core_warn(ldev->pf[0].dev, "Invalid affinity port %d",
72+
port);
73+
return;
74+
}
75+
76+
mlx5_modify_lag(ldev, &tracker);
77+
}
78+
79+
static void mlx5_lag_fib_event_flush(struct notifier_block *nb)
80+
{
81+
struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb);
82+
struct mlx5_lag *ldev = container_of(mp, struct mlx5_lag, lag_mp);
83+
84+
flush_workqueue(ldev->wq);
85+
}
86+
87+
/* A FIB event deferred from atomic notifier context to the lag
 * workqueue for processing under RTNL.
 */
struct mlx5_fib_event_work {
	struct work_struct work;
	struct mlx5_lag *ldev;
	unsigned long event; /* FIB_EVENT_* code being handled */
	union {
		/* valid for FIB_EVENT_ENTRY_* events */
		struct fib_entry_notifier_info fen_info;
		/* valid for FIB_EVENT_NH_* events */
		struct fib_nh_notifier_info fnh_info;
	};
};
96+
97+
/* Handle a route add/replace/append/del event and reflect it as HW
 * port affinity.  A two-nexthop route over both ports of the HCA
 * activates multipath LAG and starts tracking that fib_info in
 * mp->mfi; a single-nexthop route pins affinity to that port.
 */
static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev,
				     unsigned long event,
				     struct fib_info *fi)
{
	struct lag_mp *mp = &ldev->lag_mp;

	/* Handle delete event */
	if (event == FIB_EVENT_ENTRY_DEL) {
		/* stop track */
		if (mp->mfi == fi)
			mp->mfi = NULL;
		return;
	}

	/* Handle add/replace event */
	if (fi->fib_nhs == 1) {
		if (__mlx5_lag_is_active(ldev)) {
			struct net_device *nh_dev = fi->fib_nh[0].nh_dev;
			int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev);

			/* netdev idx is 0-based, affinity port is 1-based;
			 * NOTE(review): if the nexthop dev is not one of
			 * ours, idx looks like it can be -1 -> port 0
			 * (normal affinity) — confirm against
			 * mlx5_lag_dev_get_netdev_idx().
			 */
			mlx5_lag_set_port_affinity(ldev, ++i);
		}
		return;
	}

	/* Only two-nexthop multipath routes can be offloaded */
	if (fi->fib_nhs != 2)
		return;

	/* Verify next hops are ports of the same hca */
	if (!(fi->fib_nh[0].nh_dev == ldev->pf[0].netdev &&
	      fi->fib_nh[1].nh_dev == ldev->pf[1].netdev) &&
	    !(fi->fib_nh[0].nh_dev == ldev->pf[1].netdev &&
	      fi->fib_nh[1].nh_dev == ldev->pf[0].netdev)) {
		mlx5_core_warn(ldev->pf[0].dev, "Multipath offload require two ports of the same HCA\n");
		return;
	}

	/* First time we see multipath route */
	if (!mp->mfi && !__mlx5_lag_is_active(ldev)) {
		struct lag_tracker tracker;

		tracker = ldev->tracker;
		mlx5_activate_lag(ldev, &tracker, MLX5_LAG_FLAG_MULTIPATH);
	}

	/* Both next-hops alive: normal affinity, and track this route */
	mlx5_lag_set_port_affinity(ldev, 0);
	mp->mfi = fi;
}
145+
146+
static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev,
147+
unsigned long event,
148+
struct fib_nh *fib_nh,
149+
struct fib_info *fi)
150+
{
151+
struct lag_mp *mp = &ldev->lag_mp;
152+
153+
/* Check the nh event is related to the route */
154+
if (!mp->mfi || mp->mfi != fi)
155+
return;
156+
157+
/* nh added/removed */
158+
if (event == FIB_EVENT_NH_DEL) {
159+
int i = mlx5_lag_dev_get_netdev_idx(ldev, fib_nh->nh_dev);
160+
161+
if (i >= 0) {
162+
i = (i + 1) % 2 + 1; /* peer port */
163+
mlx5_lag_set_port_affinity(ldev, i);
164+
}
165+
} else if (event == FIB_EVENT_NH_ADD &&
166+
fi->fib_nhs == 2) {
167+
mlx5_lag_set_port_affinity(ldev, 0);
168+
}
169+
}
170+
171+
/* Workqueue handler: process a deferred FIB event under RTNL and drop
 * the fib_info reference taken by the notifier callback.
 */
static void mlx5_lag_fib_update(struct work_struct *work)
{
	struct mlx5_fib_event_work *fib_work =
		container_of(work, struct mlx5_fib_event_work, work);
	struct mlx5_lag *ldev = fib_work->ldev;
	struct fib_nh *fib_nh;

	/* Protect internal structures from changes */
	rtnl_lock();
	switch (fib_work->event) {
	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
	case FIB_EVENT_ENTRY_APPEND: /* fall through */
	case FIB_EVENT_ENTRY_ADD: /* fall through */
	case FIB_EVENT_ENTRY_DEL:
		mlx5_lag_fib_route_event(ldev, fib_work->event,
					 fib_work->fen_info.fi);
		/* release the hold taken in mlx5_lag_fib_event() */
		fib_info_put(fib_work->fen_info.fi);
		break;
	case FIB_EVENT_NH_ADD: /* fall through */
	case FIB_EVENT_NH_DEL:
		fib_nh = fib_work->fnh_info.fib_nh;
		mlx5_lag_fib_nexthop_event(ldev,
					   fib_work->event,
					   fib_work->fnh_info.fib_nh,
					   fib_nh->nh_parent);
		/* release the hold taken in mlx5_lag_fib_event() */
		fib_info_put(fib_work->fnh_info.fib_nh->nh_parent);
		break;
	}

	rtnl_unlock();
	kfree(fib_work);
}
203+
204+
static struct mlx5_fib_event_work *
205+
mlx5_lag_init_fib_work(struct mlx5_lag *ldev, unsigned long event)
206+
{
207+
struct mlx5_fib_event_work *fib_work;
208+
209+
fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
210+
if (WARN_ON(!fib_work))
211+
return NULL;
212+
213+
INIT_WORK(&fib_work->work, mlx5_lag_fib_update);
214+
fib_work->ldev = ldev;
215+
fib_work->event = event;
216+
217+
return fib_work;
218+
}
219+
220+
/* FIB notifier callback.  Runs in atomic context: filter out events
 * we do not care about, take a reference on the fib_info so it stays
 * alive, and defer the real handling to the lag workqueue.
 */
static int mlx5_lag_fib_event(struct notifier_block *nb,
			      unsigned long event,
			      void *ptr)
{
	struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb);
	struct mlx5_lag *ldev = container_of(mp, struct mlx5_lag, lag_mp);
	struct fib_notifier_info *info = ptr;
	struct mlx5_fib_event_work *fib_work;
	struct fib_entry_notifier_info *fen_info;
	struct fib_nh_notifier_info *fnh_info;
	struct fib_info *fi;

	/* Only AF_INET is handled by this patchset */
	if (info->family != AF_INET)
		return NOTIFY_DONE;

	if (!mlx5_lag_multipath_check_prereq(ldev))
		return NOTIFY_DONE;

	switch (event) {
	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
	case FIB_EVENT_ENTRY_APPEND: /* fall through */
	case FIB_EVENT_ENTRY_ADD: /* fall through */
	case FIB_EVENT_ENTRY_DEL:
		fen_info = container_of(info, struct fib_entry_notifier_info,
					info);
		fi = fen_info->fi;
		/* ignore routes not egressing through one of our ports */
		if (fi->fib_dev != ldev->pf[0].netdev &&
		    fi->fib_dev != ldev->pf[1].netdev) {
			return NOTIFY_DONE;
		}
		fib_work = mlx5_lag_init_fib_work(ldev, event);
		if (!fib_work)
			return NOTIFY_DONE;
		fib_work->fen_info = *fen_info;
		/* Take reference on fib_info to prevent it from being
		 * freed while work is queued. Release it afterwards.
		 */
		fib_info_hold(fib_work->fen_info.fi);
		break;
	case FIB_EVENT_NH_ADD: /* fall through */
	case FIB_EVENT_NH_DEL:
		fnh_info = container_of(info, struct fib_nh_notifier_info,
					info);
		fib_work = mlx5_lag_init_fib_work(ldev, event);
		if (!fib_work)
			return NOTIFY_DONE;
		fib_work->fnh_info = *fnh_info;
		/* hold the parent fib_info until the work runs */
		fib_info_hold(fib_work->fnh_info.fib_nh->nh_parent);
		break;
	default:
		return NOTIFY_DONE;
	}

	queue_work(ldev->wq, &fib_work->work);

	return NOTIFY_DONE;
}
277+
278+
int mlx5_lag_mp_init(struct mlx5_lag *ldev)
279+
{
280+
struct lag_mp *mp = &ldev->lag_mp;
281+
int err;
282+
283+
if (mp->fib_nb.notifier_call)
284+
return 0;
285+
286+
mp->fib_nb.notifier_call = mlx5_lag_fib_event;
287+
err = register_fib_notifier(&mp->fib_nb,
288+
mlx5_lag_fib_event_flush);
289+
if (err)
290+
mp->fib_nb.notifier_call = NULL;
291+
292+
return err;
293+
}
294+
295+
void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev)
296+
{
297+
struct lag_mp *mp = &ldev->lag_mp;
298+
299+
if (!mp->fib_nb.notifier_call)
300+
return;
301+
302+
unregister_fib_notifier(&mp->fib_nb);
303+
mp->fib_nb.notifier_call = NULL;
304+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/* Copyright (c) 2019 Mellanox Technologies. */

#ifndef __MLX5_LAG_MP_H__
#define __MLX5_LAG_MP_H__

#include "lag.h"
#include "mlx5_core.h"

/* Multipath offload state, embedded in struct mlx5_lag */
struct lag_mp {
	struct notifier_block fib_nb; /* FIB notifier; notifier_call doubles as registered flag */
	struct fib_info *mfi; /* used in tracking fib events */
};

#ifdef CONFIG_MLX5_ESWITCH

int mlx5_lag_mp_init(struct mlx5_lag *ldev);
void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev);

#else /* CONFIG_MLX5_ESWITCH */

/* no-op stubs when eswitch support is compiled out */
static inline int mlx5_lag_mp_init(struct mlx5_lag *ldev) { return 0; }
static inline void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) {}

#endif /* CONFIG_MLX5_ESWITCH */
#endif /* __MLX5_LAG_MP_H__ */

0 commit comments

Comments
 (0)