Skip to content

Commit 2504b84

Browse files
walking-machineanguy11
authored andcommitted
ice: protect XDP configuration with a mutex
The main threat to data consistency in ice_xdp() is a possible asynchronous PF reset. It can be triggered by a user or by TX timeout handler. XDP setup and PF reset code access the same resources in the following sections: * ice_vsi_close() in ice_prepare_for_reset() - already rtnl-locked * ice_vsi_rebuild() for the PF VSI - not protected * ice_vsi_open() - already rtnl-locked With an unfortunate timing, such accesses can result in a crash such as the one below: [ +1.999878] ice 0000:b1:00.0: Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring 14 [ +2.002992] ice 0000:b1:00.0: Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring 18 [Mar15 18:17] ice 0000:b1:00.0 ens801f0np0: NETDEV WATCHDOG: CPU: 38: transmit queue 14 timed out 80692736 ms [ +0.000093] ice 0000:b1:00.0 ens801f0np0: tx_timeout: VSI_num: 6, Q 14, NTC: 0x0, HW_HEAD: 0x0, NTU: 0x0, INT: 0x4000001 [ +0.000012] ice 0000:b1:00.0 ens801f0np0: tx_timeout recovery level 1, txqueue 14 [ +0.394718] ice 0000:b1:00.0: PTP reset successful [ +0.006184] BUG: kernel NULL pointer dereference, address: 0000000000000098 [ +0.000045] #PF: supervisor read access in kernel mode [ +0.000023] #PF: error_code(0x0000) - not-present page [ +0.000023] PGD 0 P4D 0 [ +0.000018] Oops: 0000 [#1] PREEMPT SMP NOPTI [ +0.000023] CPU: 38 PID: 7540 Comm: kworker/38:1 Not tainted 6.8.0-rc7 #1 [ +0.000031] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0014.082620210524 08/26/2021 [ +0.000036] Workqueue: ice ice_service_task [ice] [ +0.000183] RIP: 0010:ice_clean_tx_ring+0xa/0xd0 [ice] [...] [ +0.000013] Call Trace: [ +0.000016] <TASK> [ +0.000014] ? __die+0x1f/0x70 [ +0.000029] ? page_fault_oops+0x171/0x4f0 [ +0.000029] ? schedule+0x3b/0xd0 [ +0.000027] ? exc_page_fault+0x7b/0x180 [ +0.000022] ? asm_exc_page_fault+0x22/0x30 [ +0.000031] ? ice_clean_tx_ring+0xa/0xd0 [ice] [ +0.000194] ice_free_tx_ring+0xe/0x60 [ice] [ +0.000186] ice_destroy_xdp_rings+0x157/0x310 [ice] [ +0.000151] ice_vsi_decfg+0x53/0xe0 [ice] [ +0.000180] ice_vsi_rebuild+0x239/0x540 [ice] [ +0.000186] ice_vsi_rebuild_by_type+0x76/0x180 [ice] [ +0.000145] ice_rebuild+0x18c/0x840 [ice] [ +0.000145] ? delay_tsc+0x4a/0xc0 [ +0.000022] ? delay_tsc+0x92/0xc0 [ +0.000020] ice_do_reset+0x140/0x180 [ice] [ +0.000886] ice_service_task+0x404/0x1030 [ice] [ +0.000824] process_one_work+0x171/0x340 [ +0.000685] worker_thread+0x277/0x3a0 [ +0.000675] ? preempt_count_add+0x6a/0xa0 [ +0.000677] ? _raw_spin_lock_irqsave+0x23/0x50 [ +0.000679] ? __pfx_worker_thread+0x10/0x10 [ +0.000653] kthread+0xf0/0x120 [ +0.000635] ? __pfx_kthread+0x10/0x10 [ +0.000616] ret_from_fork+0x2d/0x50 [ +0.000612] ? __pfx_kthread+0x10/0x10 [ +0.000604] ret_from_fork_asm+0x1b/0x30 [ +0.000604] </TASK> The previous way of handling this through returning -EBUSY is not viable, particularly when destroying AF_XDP socket, because the kernel proceeds with removal anyway. There is plenty of code between those calls and there is no need to create a large critical section that covers all of them, same as there is no need to protect ice_vsi_rebuild() with rtnl_lock(). Add xdp_state_lock mutex to protect ice_vsi_rebuild() and ice_xdp(). Leaving unprotected sections in between would result in two states that have to be considered: 1. when the VSI is closed, but not yet rebuild 2. when VSI is already rebuild, but not yet open The latter case is actually already handled through !netif_running() case, we just need to adjust flag checking a little. The former one is not as trivial, because between ice_vsi_close() and ice_vsi_rebuild(), a lot of hardware interaction happens, this can make adding/deleting rings exit with an error. Luckily, VSI rebuild is pending and can apply new configuration for us in a managed fashion. Therefore, add an additional VSI state flag ICE_VSI_REBUILD_PENDING to indicate that ice_xdp() can just hot-swap the program. Also, as ice_vsi_rebuild() flow is touched in this patch, make it more consistent by deconfiguring VSI when coalesce allocation fails. Fixes: 2d4238f ("ice: Add support for AF_XDP") Fixes: efc2214 ("ice: Add support for XDP") Reviewed-by: Wojciech Drewek <[email protected]> Reviewed-by: Jacob Keller <[email protected]> Tested-by: Chandan Kumar Rout <[email protected]> Signed-off-by: Larysa Zaremba <[email protected]> Reviewed-by: Maciej Fijalkowski <[email protected]> Signed-off-by: Tony Nguyen <[email protected]>
1 parent 2a5dc09 commit 2504b84

File tree

4 files changed

+39
-19
lines changed

4 files changed

+39
-19
lines changed

drivers/net/ethernet/intel/ice/ice.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,7 @@ enum ice_vsi_state {
318318
ICE_VSI_UMAC_FLTR_CHANGED,
319319
ICE_VSI_MMAC_FLTR_CHANGED,
320320
ICE_VSI_PROMISC_CHANGED,
321+
ICE_VSI_REBUILD_PENDING,
321322
ICE_VSI_STATE_NBITS /* must be last */
322323
};
323324

@@ -411,6 +412,7 @@ struct ice_vsi {
411412
struct ice_tx_ring **xdp_rings; /* XDP ring array */
412413
u16 num_xdp_txq; /* Used XDP queues */
413414
u8 xdp_mapping_mode; /* ICE_MAP_MODE_[CONTIG|SCATTER] */
415+
struct mutex xdp_state_lock;
414416

415417
struct net_device **target_netdevs;
416418

drivers/net/ethernet/intel/ice/ice_lib.c

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,7 @@ static void ice_vsi_free(struct ice_vsi *vsi)
447447

448448
ice_vsi_free_stats(vsi);
449449
ice_vsi_free_arrays(vsi);
450+
mutex_destroy(&vsi->xdp_state_lock);
450451
mutex_unlock(&pf->sw_mutex);
451452
devm_kfree(dev, vsi);
452453
}
@@ -626,6 +627,8 @@ static struct ice_vsi *ice_vsi_alloc(struct ice_pf *pf)
626627
pf->next_vsi = ice_get_free_slot(pf->vsi, pf->num_alloc_vsi,
627628
pf->next_vsi);
628629

630+
mutex_init(&vsi->xdp_state_lock);
631+
629632
unlock_pf:
630633
mutex_unlock(&pf->sw_mutex);
631634
return vsi;
@@ -2972,42 +2975,47 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags)
29722975
if (WARN_ON(vsi->type == ICE_VSI_VF && !vsi->vf))
29732976
return -EINVAL;
29742977

2978+
mutex_lock(&vsi->xdp_state_lock);
2979+
29752980
ret = ice_vsi_realloc_stat_arrays(vsi);
29762981
if (ret)
2977-
goto err_vsi_cfg;
2982+
goto unlock;
29782983

29792984
ice_vsi_decfg(vsi);
29802985
ret = ice_vsi_cfg_def(vsi);
29812986
if (ret)
2982-
goto err_vsi_cfg;
2987+
goto unlock;
29832988

29842989
coalesce = kcalloc(vsi->num_q_vectors,
29852990
sizeof(struct ice_coalesce_stored), GFP_KERNEL);
2986-
if (!coalesce)
2987-
return -ENOMEM;
2991+
if (!coalesce) {
2992+
ret = -ENOMEM;
2993+
goto decfg;
2994+
}
29882995

29892996
prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce);
29902997

29912998
ret = ice_vsi_cfg_tc_lan(pf, vsi);
29922999
if (ret) {
29933000
if (vsi_flags & ICE_VSI_FLAG_INIT) {
29943001
ret = -EIO;
2995-
goto err_vsi_cfg_tc_lan;
3002+
goto free_coalesce;
29963003
}
29973004

2998-
kfree(coalesce);
2999-
return ice_schedule_reset(pf, ICE_RESET_PFR);
3005+
ret = ice_schedule_reset(pf, ICE_RESET_PFR);
3006+
goto free_coalesce;
30003007
}
30013008

30023009
ice_vsi_rebuild_set_coalesce(vsi, coalesce, prev_num_q_vectors);
3003-
kfree(coalesce);
3010+
clear_bit(ICE_VSI_REBUILD_PENDING, vsi->state);
30043011

3005-
return 0;
3006-
3007-
err_vsi_cfg_tc_lan:
3008-
ice_vsi_decfg(vsi);
3012+
free_coalesce:
30093013
kfree(coalesce);
3010-
err_vsi_cfg:
3014+
decfg:
3015+
if (ret)
3016+
ice_vsi_decfg(vsi);
3017+
unlock:
3018+
mutex_unlock(&vsi->xdp_state_lock);
30113019
return ret;
30123020
}
30133021

drivers/net/ethernet/intel/ice/ice_main.c

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,7 @@ ice_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
616616
/* clear SW filtering DB */
617617
ice_clear_hw_tbls(hw);
618618
/* disable the VSIs and their queues that are not already DOWN */
619+
set_bit(ICE_VSI_REBUILD_PENDING, ice_get_main_vsi(pf)->state);
619620
ice_pf_dis_all_vsi(pf, false);
620621

621622
if (test_bit(ICE_FLAG_PTP_SUPPORTED, pf->flags))
@@ -3016,7 +3017,8 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog,
30163017
}
30173018

30183019
/* hot swap progs and avoid toggling link */
3019-
if (ice_is_xdp_ena_vsi(vsi) == !!prog) {
3020+
if (ice_is_xdp_ena_vsi(vsi) == !!prog ||
3021+
test_bit(ICE_VSI_REBUILD_PENDING, vsi->state)) {
30203022
ice_vsi_assign_bpf_prog(vsi, prog);
30213023
return 0;
30223024
}
@@ -3088,21 +3090,28 @@ static int ice_xdp(struct net_device *dev, struct netdev_bpf *xdp)
30883090
{
30893091
struct ice_netdev_priv *np = netdev_priv(dev);
30903092
struct ice_vsi *vsi = np->vsi;
3093+
int ret;
30913094

30923095
if (vsi->type != ICE_VSI_PF) {
30933096
NL_SET_ERR_MSG_MOD(xdp->extack, "XDP can be loaded only on PF VSI");
30943097
return -EINVAL;
30953098
}
30963099

3100+
mutex_lock(&vsi->xdp_state_lock);
3101+
30973102
switch (xdp->command) {
30983103
case XDP_SETUP_PROG:
3099-
return ice_xdp_setup_prog(vsi, xdp->prog, xdp->extack);
3104+
ret = ice_xdp_setup_prog(vsi, xdp->prog, xdp->extack);
3105+
break;
31003106
case XDP_SETUP_XSK_POOL:
3101-
return ice_xsk_pool_setup(vsi, xdp->xsk.pool,
3102-
xdp->xsk.queue_id);
3107+
ret = ice_xsk_pool_setup(vsi, xdp->xsk.pool, xdp->xsk.queue_id);
3108+
break;
31033109
default:
3104-
return -EINVAL;
3110+
ret = -EINVAL;
31053111
}
3112+
3113+
mutex_unlock(&vsi->xdp_state_lock);
3114+
return ret;
31063115
}
31073116

31083117
/**

drivers/net/ethernet/intel/ice/ice_xsk.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,8 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid)
390390
goto failure;
391391
}
392392

393-
if_running = netif_running(vsi->netdev) && ice_is_xdp_ena_vsi(vsi);
393+
if_running = !test_bit(ICE_VSI_DOWN, vsi->state) &&
394+
ice_is_xdp_ena_vsi(vsi);
394395

395396
if (if_running) {
396397
struct ice_rx_ring *rx_ring = vsi->rx_rings[qid];

0 commit comments

Comments
 (0)