
Commit 691c041

w1ldptr authored and Saeed Mahameed committed
net/mlx5e: Fix deadlock in tc route query code
Cited commit causes an ABBA deadlock[0] when peer flows are created while holding the devcom rw semaphore. Due to the peer flows offload implementation, the lock is taken much higher up the call chain and there is no obvious way to easily fix the deadlock. Instead, since the tc route query code needs the peer eswitch structure only to perform a lookup in an xarray and doesn't perform any sleeping operations with it, refactor the code for lockless execution in the following ways:

- RCUify the devcom 'data' pointer. When resetting the pointer, synchronously wait for an RCU grace period before returning. This is fine since devcom is currently only used for synchronization of pairing/unpairing of eswitches, which is rare and already expensive as-is.

- Wrap all usages of the 'paired' boolean in {READ|WRITE}_ONCE(). The flag had already been used in some unlocked contexts without proper annotations (e.g. users of the mlx5_devcom_is_paired() function), but it wasn't an issue since all relevant code paths checked it again after obtaining the devcom semaphore. Now it is also used by mlx5_devcom_get_peer_data_rcu() as a "best effort" check to return NULL when devcom is being unpaired. Note that while the RCU read lock doesn't prevent the 'paired' flag from being changed concurrently, it still guarantees that the reader can continue to use 'data'.

- Refactor the mlx5e_tc_query_route_vport() function to use the new mlx5_devcom_get_peer_data_rcu() API, which fixes the deadlock.

[0]:
[ 164.599612] ======================================================
[ 164.600142] WARNING: possible circular locking dependency detected
[ 164.600667] 6.3.0-rc3+ #1 Not tainted
[ 164.601021] ------------------------------------------------------
[ 164.601557] handler1/3456 is trying to acquire lock:
[ 164.601998] ffff88811f1714b0 (&esw->offloads.encap_tbl_lock){+.+.}-{3:3}, at: mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[ 164.603078] but task is already holding lock:
[ 164.603617] ffff88810137fc98 (&comp->sem){++++}-{3:3}, at: mlx5_devcom_get_peer_data+0x37/0x80 [mlx5_core]
[ 164.604459] which lock already depends on the new lock.
[ 164.605190] the existing dependency chain (in reverse order) is:
[ 164.605848] -> #1 (&comp->sem){++++}-{3:3}:
[ 164.606380]        down_read+0x39/0x50
[ 164.606772]        mlx5_devcom_get_peer_data+0x37/0x80 [mlx5_core]
[ 164.607336]        mlx5e_tc_query_route_vport+0x86/0xc0 [mlx5_core]
[ 164.607914]        mlx5e_tc_tun_route_lookup+0x1a4/0x1d0 [mlx5_core]
[ 164.608495]        mlx5e_attach_decap_route+0xc6/0x1e0 [mlx5_core]
[ 164.609063]        mlx5e_tc_add_fdb_flow+0x1ea/0x360 [mlx5_core]
[ 164.609627]        __mlx5e_add_fdb_flow+0x2d2/0x430 [mlx5_core]
[ 164.610175]        mlx5e_configure_flower+0x952/0x1a20 [mlx5_core]
[ 164.610741]        tc_setup_cb_add+0xd4/0x200
[ 164.611146]        fl_hw_replace_filter+0x14c/0x1f0 [cls_flower]
[ 164.611661]        fl_change+0xc95/0x18a0 [cls_flower]
[ 164.612116]        tc_new_tfilter+0x3fc/0xd20
[ 164.612516]        rtnetlink_rcv_msg+0x418/0x5b0
[ 164.612936]        netlink_rcv_skb+0x54/0x100
[ 164.613339]        netlink_unicast+0x190/0x250
[ 164.613746]        netlink_sendmsg+0x245/0x4a0
[ 164.614150]        sock_sendmsg+0x38/0x60
[ 164.614522]        ____sys_sendmsg+0x1d0/0x1e0
[ 164.614934]        ___sys_sendmsg+0x80/0xc0
[ 164.615320]        __sys_sendmsg+0x51/0x90
[ 164.615701]        do_syscall_64+0x3d/0x90
[ 164.616083]        entry_SYSCALL_64_after_hwframe+0x46/0xb0
[ 164.616568] -> #0 (&esw->offloads.encap_tbl_lock){+.+.}-{3:3}:
[ 164.617210]        __lock_acquire+0x159e/0x26e0
[ 164.617638]        lock_acquire+0xc2/0x2a0
[ 164.618018]        __mutex_lock+0x92/0xcd0
[ 164.618401]        mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[ 164.618943]        post_process_attr+0x153/0x2d0 [mlx5_core]
[ 164.619471]        mlx5e_tc_add_fdb_flow+0x164/0x360 [mlx5_core]
[ 164.620021]        __mlx5e_add_fdb_flow+0x2d2/0x430 [mlx5_core]
[ 164.620564]        mlx5e_configure_flower+0xe33/0x1a20 [mlx5_core]
[ 164.621125]        tc_setup_cb_add+0xd4/0x200
[ 164.621531]        fl_hw_replace_filter+0x14c/0x1f0 [cls_flower]
[ 164.622047]        fl_change+0xc95/0x18a0 [cls_flower]
[ 164.622500]        tc_new_tfilter+0x3fc/0xd20
[ 164.622906]        rtnetlink_rcv_msg+0x418/0x5b0
[ 164.623324]        netlink_rcv_skb+0x54/0x100
[ 164.623727]        netlink_unicast+0x190/0x250
[ 164.624138]        netlink_sendmsg+0x245/0x4a0
[ 164.624544]        sock_sendmsg+0x38/0x60
[ 164.624919]        ____sys_sendmsg+0x1d0/0x1e0
[ 164.625340]        ___sys_sendmsg+0x80/0xc0
[ 164.625731]        __sys_sendmsg+0x51/0x90
[ 164.626117]        do_syscall_64+0x3d/0x90
[ 164.626502]        entry_SYSCALL_64_after_hwframe+0x46/0xb0
[ 164.626995] other info that might help us debug this:
[ 164.627725]  Possible unsafe locking scenario:
[ 164.628268]        CPU0                    CPU1
[ 164.628683]        ----                    ----
[ 164.629098]   lock(&comp->sem);
[ 164.629421]                                lock(&esw->offloads.encap_tbl_lock);
[ 164.630066]                                lock(&comp->sem);
[ 164.630555]   lock(&esw->offloads.encap_tbl_lock);
[ 164.630993] *** DEADLOCK ***
[ 164.631575] 3 locks held by handler1/3456:
[ 164.631962]  #0: ffff888124b75130 (&block->cb_lock){++++}-{3:3}, at: tc_setup_cb_add+0x5b/0x200
[ 164.632703]  #1: ffff888116e512b8 (&esw->mode_lock){++++}-{3:3}, at: mlx5_esw_hold+0x39/0x50 [mlx5_core]
[ 164.633552]  #2: ffff88810137fc98 (&comp->sem){++++}-{3:3}, at: mlx5_devcom_get_peer_data+0x37/0x80 [mlx5_core]
[ 164.634435] stack backtrace:
[ 164.634883] CPU: 17 PID: 3456 Comm: handler1 Not tainted 6.3.0-rc3+ #1
[ 164.635431] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 164.636340] Call Trace:
[ 164.636616]  <TASK>
[ 164.636863]  dump_stack_lvl+0x47/0x70
[ 164.637217]  check_noncircular+0xfe/0x110
[ 164.637601]  __lock_acquire+0x159e/0x26e0
[ 164.637977]  ? mlx5_cmd_set_fte+0x5b0/0x830 [mlx5_core]
[ 164.638472]  lock_acquire+0xc2/0x2a0
[ 164.638828]  ? mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[ 164.639339]  ? lock_is_held_type+0x98/0x110
[ 164.639728]  __mutex_lock+0x92/0xcd0
[ 164.640074]  ? mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[ 164.640576]  ? __lock_acquire+0x382/0x26e0
[ 164.640958]  ? mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[ 164.641468]  ? mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[ 164.641965]  mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[ 164.642454]  ? lock_release+0xbf/0x240
[ 164.642819]  post_process_attr+0x153/0x2d0 [mlx5_core]
[ 164.643318]  mlx5e_tc_add_fdb_flow+0x164/0x360 [mlx5_core]
[ 164.643835]  __mlx5e_add_fdb_flow+0x2d2/0x430 [mlx5_core]
[ 164.644340]  mlx5e_configure_flower+0xe33/0x1a20 [mlx5_core]
[ 164.644862]  ? lock_acquire+0xc2/0x2a0
[ 164.645219]  tc_setup_cb_add+0xd4/0x200
[ 164.645588]  fl_hw_replace_filter+0x14c/0x1f0 [cls_flower]
[ 164.646067]  fl_change+0xc95/0x18a0 [cls_flower]
[ 164.646488]  tc_new_tfilter+0x3fc/0xd20
[ 164.646861]  ? tc_del_tfilter+0x810/0x810
[ 164.647236]  rtnetlink_rcv_msg+0x418/0x5b0
[ 164.647621]  ? rtnl_setlink+0x160/0x160
[ 164.647982]  netlink_rcv_skb+0x54/0x100
[ 164.648348]  netlink_unicast+0x190/0x250
[ 164.648722]  netlink_sendmsg+0x245/0x4a0
[ 164.649090]  sock_sendmsg+0x38/0x60
[ 164.649434]  ____sys_sendmsg+0x1d0/0x1e0
[ 164.649804]  ? copy_msghdr_from_user+0x6d/0xa0
[ 164.650213]  ___sys_sendmsg+0x80/0xc0
[ 164.650563]  ? lock_acquire+0xc2/0x2a0
[ 164.650926]  ? lock_acquire+0xc2/0x2a0
[ 164.651286]  ? __fget_files+0x5/0x190
[ 164.651644]  ? find_held_lock+0x2b/0x80
[ 164.652006]  ? __fget_files+0xb9/0x190
[ 164.652365]  ? lock_release+0xbf/0x240
[ 164.652723]  ? __fget_files+0xd3/0x190
[ 164.653079]  __sys_sendmsg+0x51/0x90
[ 164.653435]  do_syscall_64+0x3d/0x90
[ 164.653784]  entry_SYSCALL_64_after_hwframe+0x46/0xb0
[ 164.654229] RIP: 0033:0x7f378054f8bd
[ 164.654577] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 6a c3 f4 ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 be c3 f4 ff 48
[ 164.656041] RSP: 002b:00007f377fa114b0 EFLAGS: 00000293 ORIG_RAX: 000000000000002e
[ 164.656701] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f378054f8bd
[ 164.657297] RDX: 0000000000000000 RSI: 00007f377fa11540 RDI: 0000000000000014
[ 164.657885] RBP: 00007f377fa12278 R08: 0000000000000000 R09: 000000000000015c
[ 164.658472] R10: 00007f377fa123d0 R11: 0000000000000293 R12: 0000560962d99bd0
[ 164.665317] R13: 0000000000000000 R14: 0000560962d99bd0 R15: 00007f377fa11540

Fixes: f9d196b ("net/mlx5e: Use correct eswitch for stack devices with lag")
Signed-off-by: Vlad Buslov <[email protected]>
Reviewed-by: Roi Dayan <[email protected]>
Reviewed-by: Shay Drory <[email protected]>
Reviewed-by: Tariq Toukan <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
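The locking change the message describes boils down to a standard RCU publish/retract pattern. The sketch below is a condensed illustration of that pattern under the same constraints (the reader must not sleep, the writer path is rare and may block). The struct and function names here are hypothetical stand-ins, not the driver's actual layout:

#include <linux/rcupdate.h>
#include <linux/rwsem.h>

/* Hypothetical, condensed stand-in for the devcom component state. */
struct component {
	void __rcu *data;	 /* peer data, published/retracted below */
	bool paired;		 /* always accessed via READ_ONCE()/WRITE_ONCE() */
	struct rw_semaphore sem; /* still serializes the slow pairing paths */
};

/* Writer: retract the pointer, then wait out every in-flight reader.
 * Blocking in synchronize_rcu() is acceptable here because unpairing
 * is rare and already expensive, as the commit message argues.
 */
static void component_unpublish(struct component *comp)
{
	down_write(&comp->sem);
	WRITE_ONCE(comp->paired, false);
	RCU_INIT_POINTER(comp->data, NULL);
	up_write(&comp->sem);
	synchronize_rcu();	/* after this, no reader holds the old data */
}

/* Reader: lockless; caller must be inside rcu_read_lock().
 * The 'paired' test is best-effort only: even if it flips right after
 * the check, RCU guarantees 'data' stays valid until rcu_read_unlock().
 */
static void *component_peek(struct component *comp)
{
	if (!READ_ONCE(comp->paired))
		return NULL;
	return rcu_dereference(comp->data);
}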
1 parent a657351 commit 691c041

3 files changed (+48, -20 lines)

drivers/net/ethernet/mellanox/mlx5/core/en_tc.c

Lines changed: 10 additions & 9 deletions
@@ -1665,11 +1665,9 @@ bool mlx5e_tc_is_vf_tunnel(struct net_device *out_dev, struct net_device *route_
 int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *route_dev, u16 *vport)
 {
 	struct mlx5e_priv *out_priv, *route_priv;
-	struct mlx5_devcom *devcom = NULL;
 	struct mlx5_core_dev *route_mdev;
 	struct mlx5_eswitch *esw;
 	u16 vhca_id;
-	int err;
 
 	out_priv = netdev_priv(out_dev);
 	esw = out_priv->mdev->priv.eswitch;
@@ -1678,6 +1676,9 @@ int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *ro
 
 	vhca_id = MLX5_CAP_GEN(route_mdev, vhca_id);
 	if (mlx5_lag_is_active(out_priv->mdev)) {
+		struct mlx5_devcom *devcom;
+		int err;
+
 		/* In lag case we may get devices from different eswitch instances.
 		 * If we failed to get vport num, it means, mostly, that we on the wrong
 		 * eswitch.
@@ -1686,16 +1687,16 @@ int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *ro
 		if (err != -ENOENT)
 			return err;
 
+		rcu_read_lock();
 		devcom = out_priv->mdev->priv.devcom;
-		esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
-		if (!esw)
-			return -ENODEV;
+		esw = mlx5_devcom_get_peer_data_rcu(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
+		err = esw ? mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport) : -ENODEV;
+		rcu_read_unlock();
+
+		return err;
 	}
 
-	err = mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport);
-	if (devcom)
-		mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
-	return err;
+	return mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport);
 }
 
 static int
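One detail of the new lag branch is worth calling out: the result is fully computed between rcu_read_lock() and rcu_read_unlock(), so only the plain err integer, never the RCU-protected esw pointer, escapes the read-side section. A minimal sketch of that rule, reusing the hypothetical component_peek() helper from the earlier sketch; lookup() is likewise a hypothetical non-sleeping helper standing in for mlx5_eswitch_vhca_id_to_vport(), which per the commit message only does an xarray lookup:

/* Sketch: consume an RCU-protected pointer without letting it escape. */
static int query_route_vport_sketch(struct component *comp, u16 vhca_id, u16 *vport)
{
	void *esw;
	int err;

	rcu_read_lock();
	esw = component_peek(comp);	/* NULL if the peer is being unpaired */
	err = esw ? lookup(esw, vhca_id, vport) : -ENODEV;
	rcu_read_unlock();		/* esw must not be touched past here */

	return err;
}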

drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c

Lines changed: 37 additions & 11 deletions
@@ -13,7 +13,7 @@ static LIST_HEAD(devcom_list);
 
 struct mlx5_devcom_component {
 	struct {
-		void *data;
+		void __rcu *data;
 	} device[MLX5_DEVCOM_PORTS_SUPPORTED];
 
 	mlx5_devcom_event_handler_t handler;
@@ -162,7 +162,7 @@ void mlx5_devcom_register_component(struct mlx5_devcom *devcom,
 	comp = &devcom->priv->components[id];
 	down_write(&comp->sem);
 	comp->handler = handler;
-	comp->device[devcom->idx].data = data;
+	rcu_assign_pointer(comp->device[devcom->idx].data, data);
 	up_write(&comp->sem);
 }
 
@@ -176,8 +176,9 @@ void mlx5_devcom_unregister_component(struct mlx5_devcom *devcom,
 
 	comp = &devcom->priv->components[id];
 	down_write(&comp->sem);
-	comp->device[devcom->idx].data = NULL;
+	RCU_INIT_POINTER(comp->device[devcom->idx].data, NULL);
 	up_write(&comp->sem);
+	synchronize_rcu();
 }
 
 int mlx5_devcom_send_event(struct mlx5_devcom *devcom,
@@ -193,12 +194,15 @@ int mlx5_devcom_send_event(struct mlx5_devcom *devcom,
 
 	comp = &devcom->priv->components[id];
 	down_write(&comp->sem);
-	for (i = 0; i < MLX5_DEVCOM_PORTS_SUPPORTED; i++)
-		if (i != devcom->idx && comp->device[i].data) {
-			err = comp->handler(event, comp->device[i].data,
-					    event_data);
+	for (i = 0; i < MLX5_DEVCOM_PORTS_SUPPORTED; i++) {
+		void *data = rcu_dereference_protected(comp->device[i].data,
+						       lockdep_is_held(&comp->sem));
+
+		if (i != devcom->idx && data) {
+			err = comp->handler(event, data, event_data);
 			break;
 		}
+	}
 
 	up_write(&comp->sem);
 	return err;
@@ -213,7 +217,7 @@ void mlx5_devcom_set_paired(struct mlx5_devcom *devcom,
 	comp = &devcom->priv->components[id];
 	WARN_ON(!rwsem_is_locked(&comp->sem));
 
-	comp->paired = paired;
+	WRITE_ONCE(comp->paired, paired);
 }
 
 bool mlx5_devcom_is_paired(struct mlx5_devcom *devcom,
@@ -222,7 +226,7 @@ bool mlx5_devcom_is_paired(struct mlx5_devcom *devcom,
 	if (IS_ERR_OR_NULL(devcom))
 		return false;
 
-	return devcom->priv->components[id].paired;
+	return READ_ONCE(devcom->priv->components[id].paired);
 }
 
 void *mlx5_devcom_get_peer_data(struct mlx5_devcom *devcom,
@@ -236,7 +240,7 @@ void *mlx5_devcom_get_peer_data(struct mlx5_devcom *devcom,
 
 	comp = &devcom->priv->components[id];
 	down_read(&comp->sem);
-	if (!comp->paired) {
+	if (!READ_ONCE(comp->paired)) {
 		up_read(&comp->sem);
 		return NULL;
 	}
@@ -245,7 +249,29 @@
 		if (i != devcom->idx)
 			break;
 
-	return comp->device[i].data;
+	return rcu_dereference_protected(comp->device[i].data, lockdep_is_held(&comp->sem));
+}
+
+void *mlx5_devcom_get_peer_data_rcu(struct mlx5_devcom *devcom, enum mlx5_devcom_components id)
+{
+	struct mlx5_devcom_component *comp;
+	int i;
+
+	if (IS_ERR_OR_NULL(devcom))
+		return NULL;
+
+	for (i = 0; i < MLX5_DEVCOM_PORTS_SUPPORTED; i++)
+		if (i != devcom->idx)
+			break;
+
+	comp = &devcom->priv->components[id];
+	/* This can change concurrently, however 'data' pointer will remain
+	 * valid for the duration of RCU read section.
+	 */
+	if (!READ_ONCE(comp->paired))
+		return NULL;
+
+	return rcu_dereference(comp->device[i].data);
 }
 
 void mlx5_devcom_release_peer_data(struct mlx5_devcom *devcom,
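Two dereference flavors are deliberately mixed in this file. Paths that still hold comp->sem use rcu_dereference_protected() with lockdep_is_held(), which documents, and under CONFIG_PROVE_RCU verifies, that the access is serialized by the semaphore rather than by an RCU read-side section; the new lockless path uses plain rcu_dereference(), which is legal only under rcu_read_lock(). A minimal contrast on the hypothetical struct from the first sketch:

/* Serialized by the semaphore: no RCU read section required, and
 * lockdep can check the claim when RCU proving is enabled.
 */
static void *peek_locked(struct component *comp)
{
	return rcu_dereference_protected(comp->data,
					 lockdep_is_held(&comp->sem));
}

/* Lockless: only valid between rcu_read_lock() and rcu_read_unlock(). */
static void *peek_rcu(struct component *comp)
{
	return rcu_dereference(comp->data);
}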

drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ bool mlx5_devcom_is_paired(struct mlx5_devcom *devcom,
 
 void *mlx5_devcom_get_peer_data(struct mlx5_devcom *devcom,
 				enum mlx5_devcom_components id);
+void *mlx5_devcom_get_peer_data_rcu(struct mlx5_devcom *devcom, enum mlx5_devcom_components id);
 void mlx5_devcom_release_peer_data(struct mlx5_devcom *devcom,
 				   enum mlx5_devcom_components id);
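The header now exposes two lookup contracts side by side: mlx5_devcom_get_peer_data() returns with comp->sem read-held and must be balanced by mlx5_devcom_release_peer_data(), while mlx5_devcom_get_peer_data_rcu() takes no lock at all and is safe only inside an RCU read-side section. A sketch of the two calling conventions as this patch leaves them, with error handling trimmed:

/* Sleepable caller: the devcom semaphore is held from get to release. */
esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
if (esw) {
	/* ... work that may sleep ... */
	mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
}

/* Lockless caller: non-sleeping work only; copy results out first. */
rcu_read_lock();
esw = mlx5_devcom_get_peer_data_rcu(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
err = esw ? mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport) : -ENODEV;
rcu_read_unlock();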
4647
