
Commit c06f8c2

Erez Shitrit authored and gregkh committed
IB/mlx5: Fetch soft WQE's on fatal error state
commit 7b74a83 upstream.

On fatal error the driver simulates CQE's for ULPs that rely on
completion of all their posted work-requests.

For the GSI traffic, mlx5 has its own mechanism that sends the
completions via software CQE's directly to the relevant CQ. This
should be kept on fatal error too, so the driver should simulate such
CQE's with the specified error state in order to complete GSI QP work
requests.

Without the fix, the following deadlock might appear:

schedule_timeout+0x274/0x350
wait_for_common+0xec/0x240
mcast_remove_one+0xd0/0x120 [ib_core]
ib_unregister_device+0x12c/0x230 [ib_core]
mlx5_ib_remove+0xc4/0x270 [mlx5_ib]
mlx5_detach_device+0x184/0x1a0 [mlx5_core]
mlx5_unload_one+0x308/0x340 [mlx5_core]
mlx5_pci_err_detected+0x74/0xe0 [mlx5_core]

Cc: <[email protected]> # 4.7
Fixes: 89ea94a ("IB/mlx5: Reset flow support for IB kernel ULPs")
Signed-off-by: Erez Shitrit <[email protected]>
Signed-off-by: Leon Romanovsky <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent 96fb9b8 commit c06f8c2

File tree
  • drivers/infiniband/hw/mlx5

1 file changed, 12 additions, 3 deletions

drivers/infiniband/hw/mlx5/cq.c

Lines changed: 12 additions & 3 deletions
@@ -646,7 +646,7 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,
 }
 
 static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries,
-			struct ib_wc *wc)
+			struct ib_wc *wc, bool is_fatal_err)
 {
 	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
 	struct mlx5_ib_wc *soft_wc, *next;
@@ -659,6 +659,10 @@ static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries,
 		mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n",
 			    cq->mcq.cqn);
 
+		if (unlikely(is_fatal_err)) {
+			soft_wc->wc.status = IB_WC_WR_FLUSH_ERR;
+			soft_wc->wc.vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+		}
 		wc[npolled++] = soft_wc->wc;
 		list_del(&soft_wc->list);
 		kfree(soft_wc);
@@ -679,12 +683,17 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 
 	spin_lock_irqsave(&cq->lock, flags);
 	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
-		mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
+		/* make sure no soft wqe's are waiting */
+		if (unlikely(!list_empty(&cq->wc_list)))
+			soft_polled = poll_soft_wc(cq, num_entries, wc, true);
+
+		mlx5_ib_poll_sw_comp(cq, num_entries - soft_polled,
+				     wc + soft_polled, &npolled);
 		goto out;
 	}
 
 	if (unlikely(!list_empty(&cq->wc_list)))
-		soft_polled = poll_soft_wc(cq, num_entries, wc);
+		soft_polled = poll_soft_wc(cq, num_entries, wc, false);
 
 	for (npolled = 0; npolled < num_entries - soft_polled; npolled++) {
 		if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled))
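
For reference, the patched poll_soft_wc() looks roughly as follows. This is a sketch assembled from the context lines and additions above: the list_for_each_entry_safe() framing around the shown lines is an assumption based on those context lines, and the excerpt is kernel-internal code that is not buildable on its own.

/* Sketch of poll_soft_wc() after this patch; loop framing assumed from
 * the diff's context lines, not introduced by this commit. */
static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries,
			struct ib_wc *wc, bool is_fatal_err)
{
	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
	struct mlx5_ib_wc *soft_wc, *next;
	int npolled = 0;

	/* Drain the software-generated (GSI) completions queued on the CQ. */
	list_for_each_entry_safe(soft_wc, next, &cq->wc_list, list) {
		if (npolled >= num_entries)
			break;

		mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n",
			    cq->mcq.cqn);

		/* On a fatal device error, report the work request as flushed
		 * rather than with its original completion status. */
		if (unlikely(is_fatal_err)) {
			soft_wc->wc.status = IB_WC_WR_FLUSH_ERR;
			soft_wc->wc.vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
		}
		wc[npolled++] = soft_wc->wc;
		list_del(&soft_wc->list);
		kfree(soft_wc);
	}

	return npolled;
}

In the fatal-error branch of mlx5_ib_poll_cq() (third hunk above), this is now called with is_fatal_err = true before mlx5_ib_poll_sw_comp() fills the remaining num_entries - soft_polled slots. Draining the soft WQE list first keeps the caller's wc array contiguous and ensures GSI consumers see flush-error completions instead of waiting indefinitely, which is what resolves the mcast_remove_one() hang shown in the commit message.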
