Skip to content

Commit 37ca95e

Browse files
GavinLi-NV authored and Saeed Mahameed committed
net/mlx5: Increase FW pre-init timeout for health recovery
Currently, health recovery reloads the driver to recover it from fatal errors. During the driver's load process, it waits up to 120 seconds for FW to set the pre-init bit; beyond this threshold it aborts the load process. In some cases, such as a FW upgrade on the DPU, this timeout period is insufficient, and the user has no way to recover the host device.

To solve this issue, introduce a new FW pre-init timeout for health recovery, which is set to 2 hours.

The timeout for devlink reload and probe will use the original one because they are user-triggered flows, and therefore should not have a significantly long timeout, during which the user command would hang.

Signed-off-by: Gavin Li <[email protected]>
Reviewed-by: Moshe Shemesh <[email protected]>
Reviewed-by: Shay Drory <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
1 parent 8324a02 commit 37ca95e

File tree

6 files changed

+20
-13
lines changed

6 files changed

+20
-13
lines changed

drivers/net/ethernet/mellanox/mlx5/core/devlink.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,13 +178,13 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a
178178
*actions_performed = BIT(action);
179179
switch (action) {
180180
case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
181-
return mlx5_load_one(dev);
181+
return mlx5_load_one(dev, false);
182182
case DEVLINK_RELOAD_ACTION_FW_ACTIVATE:
183183
if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET)
184184
break;
185185
/* On fw_activate action, also driver is reloaded and reinit performed */
186186
*actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
187-
return mlx5_load_one(dev);
187+
return mlx5_load_one(dev, false);
188188
default:
189189
/* Unsupported action should not get to this function */
190190
WARN_ON(1);

drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
148148
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
149149
complete(&fw_reset->done);
150150
} else {
151-
mlx5_load_one(dev);
151+
mlx5_load_one(dev, false);
152152
devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
153153
BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
154154
BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));

drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ struct mlx5_timeouts {
1010

1111
static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = {
1212
[MLX5_TO_FW_PRE_INIT_TIMEOUT_MS] = 120000,
13+
[MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS] = 7200000,
1314
[MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000,
1415
[MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2,
1516
[MLX5_TO_FW_INIT_MS] = 2000,

drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
enum mlx5_timeouts_types {
88
/* pre init timeouts (not read from FW) */
99
MLX5_TO_FW_PRE_INIT_TIMEOUT_MS,
10+
MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS,
1011
MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS,
1112
MLX5_TO_FW_PRE_INIT_WAIT_MS,
1213

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,7 +1003,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
10031003
mlx5_devcom_unregister_device(dev->priv.devcom);
10041004
}
10051005

1006-
static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
1006+
static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
10071007
{
10081008
int err;
10091009

@@ -1018,11 +1018,11 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
10181018

10191019
/* wait for firmware to accept initialization segments configurations
10201020
*/
1021-
err = wait_fw_init(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT),
1021+
err = wait_fw_init(dev, timeout,
10221022
mlx5_tout_ms(dev, FW_PRE_INIT_WARN_MESSAGE_INTERVAL));
10231023
if (err) {
10241024
mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, aborting\n",
1025-
mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
1025+
timeout);
10261026
return err;
10271027
}
10281028

@@ -1272,7 +1272,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
12721272
mutex_lock(&dev->intf_state_mutex);
12731273
dev->state = MLX5_DEVICE_STATE_UP;
12741274

1275-
err = mlx5_function_setup(dev, true);
1275+
err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
12761276
if (err)
12771277
goto err_function;
12781278

@@ -1336,9 +1336,10 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev)
13361336
mutex_unlock(&dev->intf_state_mutex);
13371337
}
13381338

1339-
int mlx5_load_one(struct mlx5_core_dev *dev)
1339+
int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery)
13401340
{
13411341
int err = 0;
1342+
u64 timeout;
13421343

13431344
mutex_lock(&dev->intf_state_mutex);
13441345
if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
@@ -1348,7 +1349,11 @@ int mlx5_load_one(struct mlx5_core_dev *dev)
13481349
/* remove any previous indication of internal error */
13491350
dev->state = MLX5_DEVICE_STATE_UP;
13501351

1351-
err = mlx5_function_setup(dev, false);
1352+
if (recovery)
1353+
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
1354+
else
1355+
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
1356+
err = mlx5_function_setup(dev, timeout);
13521357
if (err)
13531358
goto err_function;
13541359

@@ -1719,7 +1724,7 @@ static void mlx5_pci_resume(struct pci_dev *pdev)
17191724

17201725
mlx5_pci_trace(dev, "Enter, loading driver..\n");
17211726

1722-
err = mlx5_load_one(dev);
1727+
err = mlx5_load_one(dev, false);
17231728

17241729
mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
17251730
!err ? "recovered" : "Failed");
@@ -1807,7 +1812,7 @@ static int mlx5_resume(struct pci_dev *pdev)
18071812
{
18081813
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
18091814

1810-
return mlx5_load_one(dev);
1815+
return mlx5_load_one(dev, false);
18111816
}
18121817

18131818
static const struct pci_device_id mlx5_core_pci_table[] = {
@@ -1852,7 +1857,7 @@ int mlx5_recover_device(struct mlx5_core_dev *dev)
18521857
return -EIO;
18531858
}
18541859

1855-
return mlx5_load_one(dev);
1860+
return mlx5_load_one(dev, true);
18561861
}
18571862

18581863
static struct pci_driver mlx5_core_driver = {

drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev);
290290
int mlx5_init_one(struct mlx5_core_dev *dev);
291291
void mlx5_uninit_one(struct mlx5_core_dev *dev);
292292
void mlx5_unload_one(struct mlx5_core_dev *dev);
293-
int mlx5_load_one(struct mlx5_core_dev *dev);
293+
int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery);
294294

295295
int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);
296296

0 commit comments

Comments
 (0)