
Commit 89ea94a

Maor Gottlieb authored and dledford committed
IB/mlx5: Reset flow support for IB kernel ULPs
The driver exposes interfaces that directly relate to HW state. Upon a fatal error, consumers of these interfaces (ULPs) that rely on completion of all their posted work requests could hang, thereby introducing dependencies in shutdown order. To prevent this from happening, we manage the relevant resources (CQs, QPs) that are used by the device.

Upon a fatal error, we now generate simulated completions for outstanding WQEs that were not completed at the time the HW was reset. This includes invoking the completion event handler for all involved CQs so that the ULPs will poll those CQs. When polled, we return simulated CQEs with the IB_WC_WR_FLUSH_ERR return code, enabling ULPs to clean up their resources rather than wait forever for completions after receiving remove_one.

This change requires an extra check in the data path to make sure that when the device is in the error state, the simulated CQEs are returned and no further WQEs are posted.

Signed-off-by: Maor Gottlieb <[email protected]>
Signed-off-by: Leon Romanovsky <[email protected]>
Signed-off-by: Doug Ledford <[email protected]>
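For readers less familiar with the verbs side, here is a minimal sketch (not part of this commit) of how a kernel ULP's completion handler would observe the simulated completions described above. ulp_cq_comp_handler() and ulp_release_request() are hypothetical ULP-side names; ib_poll_cq(), struct ib_wc and IB_WC_WR_FLUSH_ERR are the standard verbs API the commit relies on.

#include <rdma/ib_verbs.h>

/* Hypothetical ULP helper: release whatever resource the ULP tied to wr_id. */
static void ulp_release_request(u64 wr_id);

/* Sketch of a ULP completion handler. After a fatal error the driver invokes
 * the handler for every involved CQ; polling then returns the simulated
 * flush-error CQEs, so the ULP can reclaim its resources instead of waiting.
 */
static void ulp_cq_comp_handler(struct ib_cq *cq, void *ctx)
{
	struct ib_wc wc;

	while (ib_poll_cq(cq, 1, &wc) > 0) {
		if (wc.status == IB_WC_WR_FLUSH_ERR) {
			ulp_release_request(wc.wr_id);
			continue;
		}
		/* ... normal completion handling ... */
	}
}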
1 parent 7c2344c commit 89ea94a

File tree

7 files changed: +250 -21 lines changed


drivers/infiniband/hw/mlx5/cq.c

Lines changed: 86 additions & 1 deletion
@@ -424,6 +424,83 @@ static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
 	item->key = be32_to_cpu(cqe->mkey);
 }
 
+static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries,
+			 struct ib_wc *wc, int *npolled)
+{
+	struct mlx5_ib_wq *wq;
+	unsigned int cur;
+	unsigned int idx;
+	int np;
+	int i;
+
+	wq = &qp->sq;
+	cur = wq->head - wq->tail;
+	np = *npolled;
+
+	if (cur == 0)
+		return;
+
+	for (i = 0; i < cur && np < num_entries; i++) {
+		idx = wq->last_poll & (wq->wqe_cnt - 1);
+		wc->wr_id = wq->wrid[idx];
+		wc->status = IB_WC_WR_FLUSH_ERR;
+		wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+		wq->tail++;
+		np++;
+		wc->qp = &qp->ibqp;
+		wc++;
+		wq->last_poll = wq->w_list[idx].next;
+	}
+	*npolled = np;
+}
+
+static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries,
+			 struct ib_wc *wc, int *npolled)
+{
+	struct mlx5_ib_wq *wq;
+	unsigned int cur;
+	int np;
+	int i;
+
+	wq = &qp->rq;
+	cur = wq->head - wq->tail;
+	np = *npolled;
+
+	if (cur == 0)
+		return;
+
+	for (i = 0; i < cur && np < num_entries; i++) {
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		wc->status = IB_WC_WR_FLUSH_ERR;
+		wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+		wq->tail++;
+		np++;
+		wc->qp = &qp->ibqp;
+		wc++;
+	}
+	*npolled = np;
+}
+
+static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries,
+				 struct ib_wc *wc, int *npolled)
+{
+	struct mlx5_ib_qp *qp;
+
+	*npolled = 0;
+	/* Find uncompleted WQEs belonging to that cq and return flushed completions for them */
+	list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) {
+		sw_send_comp(qp, num_entries, wc + *npolled, npolled);
+		if (*npolled >= num_entries)
+			return;
+	}
+
+	list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) {
+		sw_recv_comp(qp, num_entries, wc + *npolled, npolled);
+		if (*npolled >= num_entries)
+			return;
+	}
+}
+
 static int mlx5_poll_one(struct mlx5_ib_cq *cq,
 			 struct mlx5_ib_qp **cur_qp,
 			 struct ib_wc *wc)
@@ -594,12 +671,18 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 {
 	struct mlx5_ib_cq *cq = to_mcq(ibcq);
 	struct mlx5_ib_qp *cur_qp = NULL;
+	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+	struct mlx5_core_dev *mdev = dev->mdev;
 	unsigned long flags;
 	int soft_polled = 0;
 	int npolled;
 	int err = 0;
 
 	spin_lock_irqsave(&cq->lock, flags);
+	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+		mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
+		goto out;
+	}
 
 	if (unlikely(!list_empty(&cq->wc_list)))
 		soft_polled = poll_soft_wc(cq, num_entries, wc);
@@ -612,7 +695,7 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 
 	if (npolled)
 		mlx5_cq_set_ci(&cq->mcq);
-
+out:
 	spin_unlock_irqrestore(&cq->lock, flags);
 
 	if (err == 0 || err == -EAGAIN)
@@ -843,6 +926,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 	cq->resize_buf = NULL;
 	cq->resize_umem = NULL;
 	cq->create_flags = attr->flags;
+	INIT_LIST_HEAD(&cq->list_send_qp);
+	INIT_LIST_HEAD(&cq->list_recv_qp);
 
 	if (context) {
 		err = create_cq_user(dev, udata, context, cq, entries,

drivers/infiniband/hw/mlx5/main.c

Lines changed: 62 additions & 0 deletions
@@ -1980,6 +1980,65 @@ static void pkey_change_handler(struct work_struct *work)
 	mutex_unlock(&ports->devr->mutex);
 }
 
+static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
+{
+	struct mlx5_ib_qp *mqp;
+	struct mlx5_ib_cq *send_mcq, *recv_mcq;
+	struct mlx5_core_cq *mcq;
+	struct list_head cq_armed_list;
+	unsigned long flags_qp;
+	unsigned long flags_cq;
+	unsigned long flags;
+
+	INIT_LIST_HEAD(&cq_armed_list);
+
+	/* Go over the qp list residing on that ibdev; sync with create/destroy qp. */
+	spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
+	list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
+		spin_lock_irqsave(&mqp->sq.lock, flags_qp);
+		if (mqp->sq.tail != mqp->sq.head) {
+			send_mcq = to_mcq(mqp->ibqp.send_cq);
+			spin_lock_irqsave(&send_mcq->lock, flags_cq);
+			if (send_mcq->mcq.comp &&
+			    mqp->ibqp.send_cq->comp_handler) {
+				if (!send_mcq->mcq.reset_notify_added) {
+					send_mcq->mcq.reset_notify_added = 1;
+					list_add_tail(&send_mcq->mcq.reset_notify,
+						      &cq_armed_list);
+				}
+			}
+			spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
+		}
+		spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
+		spin_lock_irqsave(&mqp->rq.lock, flags_qp);
+		/* no handling is needed for SRQ */
+		if (!mqp->ibqp.srq) {
+			if (mqp->rq.tail != mqp->rq.head) {
+				recv_mcq = to_mcq(mqp->ibqp.recv_cq);
+				spin_lock_irqsave(&recv_mcq->lock, flags_cq);
+				if (recv_mcq->mcq.comp &&
+				    mqp->ibqp.recv_cq->comp_handler) {
+					if (!recv_mcq->mcq.reset_notify_added) {
+						recv_mcq->mcq.reset_notify_added = 1;
+						list_add_tail(&recv_mcq->mcq.reset_notify,
+							      &cq_armed_list);
+					}
+				}
+				spin_unlock_irqrestore(&recv_mcq->lock,
+						       flags_cq);
+			}
+		}
+		spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
+	}
+	/* At this point all in-flight post_send calls have completed, since we
+	 * took and released the locks above; now arm all involved CQs.
+	 */
+	list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
+		mcq->comp(mcq);
+	}
+	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
+}
+
 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 			  enum mlx5_dev_event event, unsigned long param)
 {
@@ -1992,6 +2051,7 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 	case MLX5_DEV_EVENT_SYS_ERROR:
 		ibdev->ib_active = false;
 		ibev.event = IB_EVENT_DEVICE_FATAL;
+		mlx5_ib_handle_internal_error(ibdev);
 		break;
 
 	case MLX5_DEV_EVENT_PORT_UP:
@@ -2595,6 +2655,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 
 	mutex_init(&dev->flow_db.lock);
 	mutex_init(&dev->cap_mask_mutex);
+	INIT_LIST_HEAD(&dev->qp_list);
+	spin_lock_init(&dev->reset_flow_resource_lock);
 
 	if (ll == IB_LINK_LAYER_ETHERNET) {
 		err = mlx5_enable_roce(dev);
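For context on the mcq->comp() calls above: they eventually invoke the comp_handler that the ULP registered when it created the CQ. A purely illustrative sketch of that registration follows; ulp_create_cq(), ulp_cq_comp_handler and ulp_ctx are hypothetical names, while ib_create_cq() and struct ib_cq_init_attr are the standard verbs API.

#include <rdma/ib_verbs.h>

/* Illustrative only: the completion handler registered here is what the
 * reset flow ends up triggering through mcq->comp().
 */
static struct ib_cq *ulp_create_cq(struct ib_device *ibdev,
				   ib_comp_handler ulp_cq_comp_handler,
				   void *ulp_ctx)
{
	struct ib_cq_init_attr cq_attr = { .cqe = 256, .comp_vector = 0 };

	return ib_create_cq(ibdev, ulp_cq_comp_handler, NULL /* event handler */,
			    ulp_ctx, &cq_attr);
}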

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 8 additions & 0 deletions
@@ -380,6 +380,9 @@ struct mlx5_ib_qp {
 	spinlock_t		disable_page_faults_lock;
 	struct mlx5_ib_pfault	pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
 #endif
+	struct list_head	qps_list;
+	struct list_head	cq_recv_list;
+	struct list_head	cq_send_list;
 };
 
 struct mlx5_ib_cq_buf {
@@ -441,6 +444,8 @@ struct mlx5_ib_cq {
 	struct mlx5_ib_cq_buf	*resize_buf;
 	struct ib_umem		*resize_umem;
 	int			cqe_size;
+	struct list_head	list_send_qp;
+	struct list_head	list_recv_qp;
 	u32			create_flags;
 	struct list_head	wc_list;
 	enum ib_cq_notify_flags notify_flags;
@@ -621,6 +626,9 @@ struct mlx5_ib_dev {
 	struct srcu_struct	mr_srcu;
 #endif
 	struct mlx5_ib_flow_db	flow_db;
+	/* protect resources needed as part of reset flow */
+	spinlock_t		reset_flow_resource_lock;
+	struct list_head	qp_list;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)

drivers/infiniband/hw/mlx5/mr.c

Lines changed: 4 additions & 0 deletions
@@ -1193,12 +1193,16 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
+	struct mlx5_core_dev *mdev = dev->mdev;
 	struct umr_common *umrc = &dev->umrc;
 	struct mlx5_ib_umr_context umr_context;
 	struct mlx5_umr_wr umrwr = {};
 	struct ib_send_wr *bad;
 	int err;
 
+	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+		return 0;
+
 	mlx5_ib_init_umr_context(&umr_context);
 
 	umrwr.wr.wr_cqe = &umr_context.cqe;
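Only four of the seven changed files appear in this excerpt; the qp.c hunks that add the matching guard to the post verbs are not shown. As a rough sketch of the data-path check the commit message describes (not the commit's actual qp.c code; only the device-state test mirrors the mr.c hunk above), a post call would refuse new WQEs once the device has entered the internal-error state:

/* Illustrative guard only; the error-code choice and surrounding skeleton are
 * assumptions, not taken from this commit.
 */
int example_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
		      struct ib_send_wr **bad_wr)
{
	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);

	if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
		*bad_wr = wr;	/* report the first work request not posted */
		return -EIO;	/* completions will be simulated instead */
	}

	/* ... normal WQE build and doorbell ring ... */
	return 0;
}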
