Skip to content

Commit 9b98d39

Browse files
mosheshemesh2 authored and kuba-moo committed
net/mlx5: Start health poll at earlier stage of driver load
Start the health poll at an earlier stage, so that if a fatal FW issue occurs before or during initialization commands such as init_hca or set_hca_cap, the health poll can detect it and indicate that the driver is already in an error state.

Signed-off-by: Moshe Shemesh <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 16ab85e commit 9b98d39

File tree

3 files changed

+19
-10
lines changed

3 files changed

+19
-10
lines changed

drivers/net/ethernet/mellanox/mlx5/core/health.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -843,9 +843,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
843843

844844
health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms);
845845
add_timer(&health->timer);
846-
847-
if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc))
848-
queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0);
849846
}
850847

851848
void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
@@ -862,6 +859,14 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
862859
del_timer_sync(&health->timer);
863860
}
864861

862+
void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev)
863+
{
864+
struct mlx5_core_health *health = &dev->priv.health;
865+
866+
if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc))
867+
queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0);
868+
}
869+
865870
void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
866871
{
867872
struct mlx5_core_health *health = &dev->priv.health;

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,7 +1092,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
10921092
mlx5_devcom_unregister_device(dev->priv.devcom);
10931093
}
10941094

1095-
static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
1095+
static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot, u64 timeout)
10961096
{
10971097
int err;
10981098

@@ -1130,10 +1130,12 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
11301130

11311131
mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP);
11321132

1133+
mlx5_start_health_poll(dev);
1134+
11331135
err = mlx5_core_enable_hca(dev, 0);
11341136
if (err) {
11351137
mlx5_core_err(dev, "enable hca failed\n");
1136-
goto err_cmd_cleanup;
1138+
goto stop_health_poll;
11371139
}
11381140

11391141
err = mlx5_core_set_issi(dev);
@@ -1185,15 +1187,16 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
11851187
mlx5_core_err(dev, "query hca failed\n");
11861188
goto reclaim_boot_pages;
11871189
}
1188-
1189-
mlx5_start_health_poll(dev);
1190+
mlx5_start_health_fw_log_up(dev);
11901191

11911192
return 0;
11921193

11931194
reclaim_boot_pages:
11941195
mlx5_reclaim_startup_pages(dev);
11951196
err_disable_hca:
11961197
mlx5_core_disable_hca(dev, 0);
1198+
stop_health_poll:
1199+
mlx5_stop_health_poll(dev, boot);
11971200
err_cmd_cleanup:
11981201
mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN);
11991202
mlx5_cmd_cleanup(dev);
@@ -1205,14 +1208,14 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot)
12051208
{
12061209
int err;
12071210

1208-
mlx5_stop_health_poll(dev, boot);
12091211
err = mlx5_cmd_teardown_hca(dev);
12101212
if (err) {
12111213
mlx5_core_err(dev, "tear_down_hca failed, skip cleanup\n");
12121214
return err;
12131215
}
12141216
mlx5_reclaim_startup_pages(dev);
12151217
mlx5_core_disable_hca(dev, 0);
1218+
mlx5_stop_health_poll(dev, boot);
12161219
mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN);
12171220
mlx5_cmd_cleanup(dev);
12181221

@@ -1362,7 +1365,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
13621365
mutex_lock(&dev->intf_state_mutex);
13631366
dev->state = MLX5_DEVICE_STATE_UP;
13641367

1365-
err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
1368+
err = mlx5_function_setup(dev, true, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
13661369
if (err)
13671370
goto err_function;
13681371

@@ -1450,7 +1453,7 @@ int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery)
14501453
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
14511454
else
14521455
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
1453-
err = mlx5_function_setup(dev, timeout);
1456+
err = mlx5_function_setup(dev, false, timeout);
14541457
if (err)
14551458
goto err_function;
14561459

include/linux/mlx5/driver.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev);
10171017
int mlx5_health_init(struct mlx5_core_dev *dev);
10181018
void mlx5_start_health_poll(struct mlx5_core_dev *dev);
10191019
void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
1020+
void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev);
10201021
void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
10211022
void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
10221023
int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,

0 commit comments

Comments
 (0)