Skip to content

Commit 8812c24

Browse files
majdmellanox authored and Saeed Mahameed committed
net/mlx5: Add fast unload support in shutdown flow
Add support for flushing all HW resources with one FW command and skipping all the heavy unload flows of the driver on kernel shutdown. There's no need to free all the SW context, since a fresh kernel will be loaded afterwards. The FW resources, however, must be closed; otherwise they will leak in the FW. To accelerate this flow, we execute one command at the beginning that tells the FW that the driver isn't going to close any of the FW resources and asks the FW to clean up everything. Once the command completes, it's safe to close the PCI resources and finish the routine. Signed-off-by: Majd Dibbiny <[email protected]> Signed-off-by: Maor Gottlieb <[email protected]> Signed-off-by: Saeed Mahameed <[email protected]>
1 parent 4525abe commit 8812c24

File tree

5 files changed

+73
-8
lines changed

5 files changed

+73
-8
lines changed

drivers/net/ethernet/mellanox/mlx5/core/fw.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,3 +195,31 @@ int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev)
195195
MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA);
196196
return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
197197
}
198+
199+
/*
 * Issue the TEARDOWN_HCA command with the FORCE_CLOSE profile through the
 * polling command interface (mlx5_cmd_exec_polling()).
 *
 * Returns:
 *   -EOPNOTSUPP  the force_teardown capability bit is not set;
 *   <error>      whatever mlx5_cmd_exec_polling() returned on failure;
 *   -EIO         the command completed but its output reports
 *                MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL;
 *   0            otherwise.
 */
int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev)
200+
{
201+
u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0};
202+
u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0};
203+
int force_state;
204+
int ret;
205+
206+
/* Bail out early when the capability bit is not set. */
if (!MLX5_CAP_GEN(dev, force_teardown)) {
207+
mlx5_core_dbg(dev, "force teardown is not supported in the firmware\n");
208+
return -EOPNOTSUPP;
209+
}
210+
211+
/* Same opcode as the regular teardown; only the profile differs. */
MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA);
212+
MLX5_SET(teardown_hca_in, in, profile, MLX5_TEARDOWN_HCA_IN_PROFILE_FORCE_CLOSE);
213+
214+
ret = mlx5_cmd_exec_polling(dev, in, sizeof(in), out, sizeof(out));
215+
if (ret)
216+
return ret;
217+
218+
/*
 * A successful command execution is not sufficient: the outcome of the
 * forced teardown is reported separately in the force_state output field.
 */
force_state = MLX5_GET(teardown_hca_out, out, force_state);
219+
if (force_state == MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL) {
220+
mlx5_core_err(dev, "teardown with force mode failed\n");
221+
return -EIO;
222+
}
223+
224+
return 0;
225+
}

drivers/net/ethernet/mellanox/mlx5/core/health.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,14 +111,14 @@ static int in_fatal(struct mlx5_core_dev *dev)
111111
return 0;
112112
}
113113

114-
void mlx5_enter_error_state(struct mlx5_core_dev *dev)
114+
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
115115
{
116116
mutex_lock(&dev->intf_state_mutex);
117117
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
118118
goto unlock;
119119

120120
mlx5_core_err(dev, "start\n");
121-
if (pci_channel_offline(dev->pdev) || in_fatal(dev)) {
121+
if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) {
122122
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
123123
trigger_cmd_completions(dev);
124124
}

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1418,7 +1418,7 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
14181418

14191419
dev_info(&pdev->dev, "%s was called\n", __func__);
14201420

1421-
mlx5_enter_error_state(dev);
1421+
mlx5_enter_error_state(dev, false);
14221422
mlx5_unload_one(dev, priv, false);
14231423
/* In case of kernel call drain the health wq */
14241424
if (state) {
@@ -1505,15 +1505,43 @@ static const struct pci_error_handlers mlx5_err_handler = {
15051505
.resume = mlx5_pci_resume
15061506
};
15071507

1508+
/*
 * Try to tear the device down with a single forced-teardown firmware
 * command instead of the full unload flow.
 *
 * Returns:
 *   -EOPNOTSUPP  the force_teardown capability bit is not set;
 *   -EAGAIN      the device is already in
 *                MLX5_DEVICE_STATE_INTERNAL_ERROR;
 *   <error>      whatever mlx5_cmd_force_teardown_hca() returned;
 *   0            the command succeeded and mlx5_enter_error_state() was
 *                called with force set.
 *
 * shutdown() falls back to mlx5_unload_one() on any non-zero return.
 */
static int mlx5_try_fast_unload(struct mlx5_core_dev *dev)
1509+
{
1510+
int ret;
1511+
1512+
if (!MLX5_CAP_GEN(dev, force_teardown)) {
1513+
mlx5_core_dbg(dev, "force teardown is not supported in the firmware\n");
1514+
return -EOPNOTSUPP;
1515+
}
1516+
1517+
/* Do not attempt the fast path on a device already marked as failed. */
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
1518+
mlx5_core_dbg(dev, "Device in internal error state, giving up\n");
1519+
return -EAGAIN;
1520+
}
1521+
1522+
ret = mlx5_cmd_force_teardown_hca(dev);
1523+
if (ret) {
1524+
mlx5_core_dbg(dev, "Firmware couldn't do fast unload error: %d\n", ret);
1525+
return ret;
1526+
}
1527+
1528+
/*
 * force == true makes mlx5_enter_error_state() set the device state to
 * MLX5_DEVICE_STATE_INTERNAL_ERROR and trigger command completions
 * regardless of the PCI channel / in_fatal() checks.
 */
mlx5_enter_error_state(dev, true);
1529+
1530+
return 0;
1531+
}
1532+
15081533
static void shutdown(struct pci_dev *pdev)
15091534
{
15101535
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
15111536
struct mlx5_priv *priv = &dev->priv;
1537+
int err;
15121538

15131539
dev_info(&pdev->dev, "Shutdown was called\n");
15141540
/* Notify mlx5 clients that the kernel is being shut down */
15151541
set_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &dev->intf_state);
1516-
mlx5_unload_one(dev, priv, false);
1542+
err = mlx5_try_fast_unload(dev);
1543+
if (err)
1544+
mlx5_unload_one(dev, priv, false);
15171545
mlx5_pci_disable_device(dev);
15181546
}
15191547

drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,13 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev);
8383
int mlx5_query_board_id(struct mlx5_core_dev *dev);
8484
int mlx5_cmd_init_hca(struct mlx5_core_dev *dev);
8585
int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
86+
int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev);
8687
void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
8788
unsigned long param);
8889
void mlx5_core_page_fault(struct mlx5_core_dev *dev,
8990
struct mlx5_pagefault *pfault);
9091
void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
91-
void mlx5_enter_error_state(struct mlx5_core_dev *dev);
92+
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force);
9293
void mlx5_disable_device(struct mlx5_core_dev *dev);
9394
void mlx5_recover_device(struct mlx5_core_dev *dev);
9495
int mlx5_sriov_init(struct mlx5_core_dev *dev);

include/linux/mlx5/mlx5_ifc.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -801,7 +801,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
801801
u8 max_indirection[0x8];
802802
u8 fixed_buffer_size[0x1];
803803
u8 log_max_mrw_sz[0x7];
804-
u8 reserved_at_110[0x2];
804+
u8 force_teardown[0x1];
805+
u8 reserved_at_111[0x1];
805806
u8 log_max_bsf_list_size[0x6];
806807
u8 umr_extended_translation_offset[0x1];
807808
u8 null_mkey[0x1];
@@ -3094,18 +3095,25 @@ struct mlx5_ifc_tsar_element_bits {
30943095
u8 reserved_at_10[0x10];
30953096
};
30963097

3098+
/* Values of the force_state field in the teardown_hca_out layout. */
enum {
3099+
MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_SUCCESS = 0x0,
3100+
MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL = 0x1,
3101+
};
3102+
30973103
struct mlx5_ifc_teardown_hca_out_bits {
30983104
u8 status[0x8];
30993105
u8 reserved_at_8[0x18];
31003106

31013107
u8 syndrome[0x20];
31023108

3103-
u8 reserved_at_40[0x40];
3109+
u8 reserved_at_40[0x3f];
3110+
3111+
u8 force_state[0x1];
31043112
};
31053113

31063114
enum {
31073115
MLX5_TEARDOWN_HCA_IN_PROFILE_GRACEFUL_CLOSE = 0x0,
3108-
MLX5_TEARDOWN_HCA_IN_PROFILE_PANIC_CLOSE = 0x1,
3116+
MLX5_TEARDOWN_HCA_IN_PROFILE_FORCE_CLOSE = 0x1,
31093117
};
31103118

31113119
struct mlx5_ifc_teardown_hca_in_bits {

0 commit comments

Comments
 (0)