Skip to content

Commit cc5b9b4

Browse files
selvintxavier authored and rleon committed
RDMA/bnxt_re: Recover the device when FW error is detected
If the FW crashes, the L2 driver gets notified and it notifies the RoCE driver. Currently the driver doesn't re-initialize the device. Add support for re-initializing the RoCE device. The RoCE device is removed and re-attached in ulp_stop and ulp_start respectively. The recovery logic expects the RoCE driver to be registered with the L2 driver while it's being removed, so the driver avoids unregistering from the L2 driver in the recovery path. Signed-off-by: Chandramohan Akula <[email protected]> Signed-off-by: Kalesh AP <[email protected]> Signed-off-by: Selvin Xavier <[email protected]> Link: https://patch.msgid.link/[email protected] Signed-off-by: Leon Romanovsky <[email protected]>
1 parent 94a9dc6 commit cc5b9b4

File tree

3 files changed

+55
-31
lines changed

3 files changed

+55
-31
lines changed

drivers/infiniband/hw/bnxt_re/bnxt_re.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,15 @@ struct bnxt_re_ring_attr {
9191
u8 mode;
9292
};
9393

94+
/*
95+
* Operation types passed as the op_type argument of the device add/init
96+
* and remove/uninit paths, so that a full probe/remove can be told apart
97+
* from the remove/re-add cycle performed for FW error recovery.
* Callers compare op_type for equality against a single value.
*/
98+
#define BNXT_RE_PRE_RECOVERY_REMOVE 0x1 /* removal from bnxt_re_suspend(); L2 registration is kept */
99+
#define BNXT_RE_COMPLETE_REMOVE 0x2 /* full removal; also unregisters from the L2 driver */
100+
#define BNXT_RE_POST_RECOVERY_INIT 0x4 /* re-add from bnxt_re_resume(); skips L2 registration */
101+
#define BNXT_RE_COMPLETE_INIT 0x8 /* full init from bnxt_re_probe(); registers with the L2 driver */
102+
94103
struct bnxt_re_sqp_entries {
95104
struct bnxt_qplib_sge sge;
96105
u64 wrid;
@@ -224,4 +233,10 @@ static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev)
224233
}
225234

226235
extern const struct uapi_definition bnxt_re_uapi_defs[];
236+
237+
/*
 * Record in rdev->qplib_res.pacing_data->dev_err_state whether the
 * BNXT_RE_FLAG_ERR_DEVICE_DETACHED bit is currently set in rdev->flags.
 *
 * pacing_data is dereferenced without a NULL check; the caller in
 * bnxt_re_suspend() only invokes this when rdev->pacing.dbr_pacing is set.
 */
static inline void bnxt_re_set_pacing_dev_state(struct bnxt_re_dev *rdev)
238+
{
239+
rdev->qplib_res.pacing_data->dev_err_state =
240+
test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags);
241+
}
227242
#endif

drivers/infiniband/hw/bnxt_re/main.c

Lines changed: 39 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev);
8383
static int bnxt_re_netdev_event(struct notifier_block *notifier,
8484
unsigned long event, void *ptr);
8585
static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev);
86-
static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev);
86+
static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type);
8787
static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev);
8888

8989
static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len,
@@ -169,6 +169,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev)
169169

170170
en_dev = rdev->en_dev;
171171

172+
rdev->qplib_res.pdev = en_dev->pdev;
172173
chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL);
173174
if (!chip_ctx)
174175
return -ENOMEM;
@@ -301,7 +302,7 @@ static void bnxt_re_shutdown(struct auxiliary_device *adev)
301302

302303
rdev = en_info->rdev;
303304
ib_unregister_device(&rdev->ibdev);
304-
bnxt_re_dev_uninit(rdev);
305+
bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE);
305306
}
306307

307308
static void bnxt_re_stop_irq(void *handle)
@@ -385,14 +386,9 @@ static struct bnxt_ulp_ops bnxt_re_ulp_ops = {
385386
static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev)
386387
{
387388
struct bnxt_en_dev *en_dev;
388-
int rc;
389389

390390
en_dev = rdev->en_dev;
391-
392-
rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev);
393-
if (!rc)
394-
rdev->qplib_res.pdev = rdev->en_dev->pdev;
395-
return rc;
391+
return bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev);
396392
}
397393

398394
static void bnxt_re_init_hwrm_hdr(struct input *hdr, u16 opcd)
@@ -1593,7 +1589,7 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev)
15931589
return rc;
15941590
}
15951591

1596-
static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev)
1592+
static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type)
15971593
{
15981594
u8 type;
15991595
int rc;
@@ -1626,8 +1622,10 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev)
16261622
bnxt_re_deinitialize_dbr_pacing(rdev);
16271623

16281624
bnxt_re_destroy_chip_ctx(rdev);
1629-
if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags))
1630-
bnxt_unregister_dev(rdev->en_dev);
1625+
if (op_type == BNXT_RE_COMPLETE_REMOVE) {
1626+
if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags))
1627+
bnxt_unregister_dev(rdev->en_dev);
1628+
}
16311629
}
16321630

16331631
/* worker thread for polling periodic events. Now used for QoS programming*/
@@ -1640,7 +1638,7 @@ static void bnxt_re_worker(struct work_struct *work)
16401638
schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
16411639
}
16421640

1643-
static int bnxt_re_dev_init(struct bnxt_re_dev *rdev)
1641+
static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type)
16441642
{
16451643
struct bnxt_re_ring_attr rattr = {};
16461644
struct bnxt_qplib_creq_ctx *creq;
@@ -1649,12 +1647,14 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev)
16491647
u8 type;
16501648
int rc;
16511649

1652-
/* Registered a new RoCE device instance to netdev */
1653-
rc = bnxt_re_register_netdev(rdev);
1654-
if (rc) {
1655-
ibdev_err(&rdev->ibdev,
1656-
"Failed to register with netedev: %#x\n", rc);
1657-
return -EINVAL;
1650+
if (op_type == BNXT_RE_COMPLETE_INIT) {
1651+
/* Registered a new RoCE device instance to netdev */
1652+
rc = bnxt_re_register_netdev(rdev);
1653+
if (rc) {
1654+
ibdev_err(&rdev->ibdev,
1655+
"Failed to register with netedev: %#x\n", rc);
1656+
return -EINVAL;
1657+
}
16581658
}
16591659
set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
16601660

@@ -1807,7 +1807,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev)
18071807
free_rcfw:
18081808
bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
18091809
fail:
1810-
bnxt_re_dev_uninit(rdev);
1810+
bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE);
18111811

18121812
return rc;
18131813
}
@@ -1827,7 +1827,7 @@ static void bnxt_re_update_en_info_rdev(struct bnxt_re_dev *rdev,
18271827
rtnl_unlock();
18281828
}
18291829

1830-
static int bnxt_re_add_device(struct auxiliary_device *adev)
1830+
static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type)
18311831
{
18321832
struct bnxt_aux_priv *aux_priv =
18331833
container_of(adev, struct bnxt_aux_priv, aux_dev);
@@ -1839,8 +1839,6 @@ static int bnxt_re_add_device(struct auxiliary_device *adev)
18391839
en_info = auxiliary_get_drvdata(adev);
18401840
en_dev = en_info->en_dev;
18411841

1842-
/* en_dev should never be NULL as long as adev and aux_dev are valid. */
1843-
en_dev = aux_priv->edev;
18441842

18451843
rdev = bnxt_re_dev_add(aux_priv, en_dev);
18461844
if (!rdev || !rdev_to_dev(rdev)) {
@@ -1850,7 +1848,7 @@ static int bnxt_re_add_device(struct auxiliary_device *adev)
18501848

18511849
bnxt_re_update_en_info_rdev(rdev, en_info, adev);
18521850

1853-
rc = bnxt_re_dev_init(rdev);
1851+
rc = bnxt_re_dev_init(rdev, op_type);
18541852
if (rc)
18551853
goto re_dev_dealloc;
18561854

@@ -1875,7 +1873,7 @@ static int bnxt_re_add_device(struct auxiliary_device *adev)
18751873

18761874
re_dev_uninit:
18771875
bnxt_re_update_en_info_rdev(NULL, en_info, adev);
1878-
bnxt_re_dev_uninit(rdev);
1876+
bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE);
18791877
re_dev_dealloc:
18801878
ib_dealloc_device(&rdev->ibdev);
18811879
exit:
@@ -1958,7 +1956,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
19581956

19591957
#define BNXT_ADEV_NAME "bnxt_en"
19601958

1961-
static void bnxt_re_remove_device(struct bnxt_re_dev *rdev,
1959+
static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type,
19621960
struct auxiliary_device *aux_dev)
19631961
{
19641962
if (rdev->nb.notifier_call) {
@@ -1972,7 +1970,7 @@ static void bnxt_re_remove_device(struct bnxt_re_dev *rdev,
19721970
}
19731971
bnxt_re_setup_cc(rdev, false);
19741972
ib_unregister_device(&rdev->ibdev);
1975-
bnxt_re_dev_uninit(rdev);
1973+
bnxt_re_dev_uninit(rdev, op_type);
19761974
ib_dealloc_device(&rdev->ibdev);
19771975
}
19781976

@@ -1991,7 +1989,7 @@ static void bnxt_re_remove(struct auxiliary_device *adev)
19911989
rdev = en_info->rdev;
19921990

19931991
if (rdev)
1994-
bnxt_re_remove_device(rdev, adev);
1992+
bnxt_re_remove_device(rdev, BNXT_RE_COMPLETE_REMOVE, adev);
19951993
kfree(en_info);
19961994
mutex_unlock(&bnxt_re_mutex);
19971995
}
@@ -2017,7 +2015,7 @@ static int bnxt_re_probe(struct auxiliary_device *adev,
20172015

20182016
auxiliary_set_drvdata(adev, en_info);
20192017

2020-
rc = bnxt_re_add_device(adev);
2018+
rc = bnxt_re_add_device(adev, BNXT_RE_COMPLETE_INIT);
20212019
if (rc)
20222020
goto err;
20232021
mutex_unlock(&bnxt_re_mutex);
@@ -2033,12 +2031,14 @@ static int bnxt_re_probe(struct auxiliary_device *adev,
20332031
static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state)
20342032
{
20352033
struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
2034+
struct bnxt_en_dev *en_dev;
20362035
struct bnxt_re_dev *rdev;
20372036

20382037
if (!en_info)
20392038
return 0;
20402039

20412040
rdev = en_info->rdev;
2041+
en_dev = en_info->en_dev;
20422042
mutex_lock(&bnxt_re_mutex);
20432043
/* L2 driver may invoke this callback during device error/crash or device
20442044
* reset. Current RoCE driver doesn't recover the device in case of
@@ -2057,13 +2057,20 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state)
20572057
set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
20582058

20592059
bnxt_re_dev_stop(rdev);
2060-
bnxt_re_stop_irq(rdev);
2060+
bnxt_re_stop_irq(adev);
20612061
/* Move the device states to detached and avoid sending any more
20622062
* commands to HW
20632063
*/
20642064
set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags);
20652065
set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
20662066
wake_up_all(&rdev->rcfw.cmdq.waitq);
2067+
2068+
if (rdev->pacing.dbr_pacing)
2069+
bnxt_re_set_pacing_dev_state(rdev);
2070+
2071+
ibdev_info(&rdev->ibdev, "%s: L2 driver notified to stop en_state 0x%lx",
2072+
__func__, en_dev->en_state);
2073+
bnxt_re_remove_device(rdev, BNXT_RE_PRE_RECOVERY_REMOVE, adev);
20672074
mutex_unlock(&bnxt_re_mutex);
20682075

20692076
return 0;
@@ -2077,7 +2084,6 @@ static int bnxt_re_resume(struct auxiliary_device *adev)
20772084
if (!en_info)
20782085
return 0;
20792086

2080-
rdev = en_info->rdev;
20812087
mutex_lock(&bnxt_re_mutex);
20822088
/* L2 driver may invoke this callback during device recovery, resume.
20832089
* reset. Current RoCE driver doesn't recover the device in case of
@@ -2086,7 +2092,9 @@ static int bnxt_re_resume(struct auxiliary_device *adev)
20862092
* L2 driver want to modify the MSIx table.
20872093
*/
20882094

2089-
ibdev_info(&rdev->ibdev, "Handle device resume call");
2095+
bnxt_re_add_device(adev, BNXT_RE_POST_RECOVERY_INIT);
2096+
rdev = en_info->rdev;
2097+
ibdev_info(&rdev->ibdev, "Device resume completed");
20902098
mutex_unlock(&bnxt_re_mutex);
20912099

20922100
return 0;

drivers/infiniband/hw/bnxt_re/qplib_res.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ struct bnxt_qplib_db_pacing_data {
8282
u32 fifo_room_mask;
8383
u32 fifo_room_shift;
8484
u32 grc_reg_offset;
85+
u32 dev_err_state;
8586
};
8687

8788
#define BNXT_QPLIB_DBR_PF_DB_OFFSET 0x10000

0 commit comments

Comments
 (0)