
Commit b537428

Yixian Liu authored and Jason Gunthorpe committed
RDMA/hns: Delayed flush cqe process with workqueue
HiP08 RoCE hardware lacks the ability (a known hardware problem) to flush outstanding WQEs if the QP state gets into errored mode for some reason. To overcome this hardware problem, as a workaround, the driver needs to perform the flush itself whenever a QP is detected to be in the errored state during the various legs such as post send and post receive [1].

The earlier patch [1] sent to solve this hardware limitation, explained in its cover letter, had a bug in the software flushing leg: it acquired a mutex while modifying the QP state to errored and conveying it to the hardware via the mailbox. This caused that leg to sleep while holding a spinlock and led to a crash.

Suggested solution: defer the flushing of a QP in the errored state to a workqueue, working around the hardware limitation.

This patch specifically adds the calls to the flush handler from the places like post_send/post_recv where the QP state is found to be in errored mode.

[1] https://patchwork.kernel.org/patch/10534271/

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Yixian Liu <[email protected]>
Reviewed-by: Salil Mehta <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent ffd541d commit b537428
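
Note: the commit message above describes a common kernel pattern — in the spinlock-protected fast path only mark the QP with an atomic flag and queue work, and let the work handler, which runs in process context and may sleep, perform the mailbox-based state change. The sketch below shows that pattern in a minimal, self-contained form; all demo_* names are illustrative and not part of the hns driver, only the kernel primitives (test_and_set_bit, test_and_clear_bit, INIT_WORK, queue_work) are real APIs.

/* Illustrative sketch of the deferred-flush pattern, not driver code. */
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/bitops.h>

#define DEMO_FLUSH_FLAG 0

struct demo_qp {
	unsigned long flush_flag;	/* bit DEMO_FLUSH_FLAG: flush already scheduled */
	struct work_struct flush_work;
};

/* Runs in process context: safe to take a mutex or issue a mailbox
 * command (modelled by the pr_info() below), then clear the flag. */
static void demo_flush_handler(struct work_struct *work)
{
	struct demo_qp *qp = container_of(work, struct demo_qp, flush_work);

	if (test_and_clear_bit(DEMO_FLUSH_FLAG, &qp->flush_flag))
		pr_info("flushing QP: may sleep here\n");
}

/* Called from a post-send/post-recv style path that holds a spinlock:
 * never sleep here, only mark the QP and hand the real work to a workqueue. */
static void demo_error_detected(struct demo_qp *qp, struct workqueue_struct *wq)
{
	if (!test_and_set_bit(DEMO_FLUSH_FLAG, &qp->flush_flag))
		queue_work(wq, &qp->flush_work);
}

static void demo_qp_init(struct demo_qp *qp)
{
	qp->flush_flag = 0;
	INIT_WORK(&qp->flush_work, demo_flush_handler);
}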

File tree

3 files changed, +66 -50 lines changed


drivers/infiniband/hw/hns/hns_roce_device.h

Lines changed: 6 additions & 0 deletions
@@ -641,6 +641,10 @@ struct hns_roce_rinl_buf {
 	u32 wqe_cnt;
 };
 
+enum {
+	HNS_ROCE_FLUSH_FLAG = 0,
+};
+
 struct hns_roce_work {
 	struct hns_roce_dev *hr_dev;
 	struct work_struct work;
@@ -693,6 +697,8 @@ struct hns_roce_qp {
 	struct hns_roce_sge sge;
 	u32 next_sge;
 
+	/* 0: flush needed, 1: unneeded */
+	unsigned long flush_flag;
 	struct hns_roce_work flush_work;
 	struct hns_roce_rinl_buf rq_inl_buf;
 	struct list_head node; /* all qps are on a list */

drivers/infiniband/hw/hns/hns_roce_hw_v2.c

Lines changed: 53 additions & 46 deletions
@@ -220,11 +220,6 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	return 0;
 }
 
-static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
-				 const struct ib_qp_attr *attr,
-				 int attr_mask, enum ib_qp_state cur_state,
-				 enum ib_qp_state new_state);
-
 static int check_send_valid(struct hns_roce_dev *hr_dev,
 			    struct hns_roce_qp *hr_qp)
 {
@@ -261,15 +256,13 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 	struct hns_roce_wqe_frmr_seg *fseg;
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_v2_db sq_db;
-	struct ib_qp_attr attr;
 	unsigned int owner_bit;
 	unsigned int sge_idx;
 	unsigned int wqe_idx;
 	unsigned long flags;
 	int valid_num_sge;
 	void *wqe = NULL;
 	bool loopback;
-	int attr_mask;
 	u32 tmp_len;
 	u32 hr_op;
 	u8 *smac;
@@ -607,18 +600,19 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 
 		qp->next_sge = sge_idx;
 
-		if (qp->state == IB_QPS_ERR) {
-			attr_mask = IB_QP_STATE;
-			attr.qp_state = IB_QPS_ERR;
-
-			ret = hns_roce_v2_modify_qp(&qp->ibqp, &attr, attr_mask,
-						    qp->state, IB_QPS_ERR);
-			if (ret) {
-				spin_unlock_irqrestore(&qp->sq.lock, flags);
-				*bad_wr = wr;
-				return ret;
-			}
-		}
+		/*
+		 * Hip08 hardware cannot flush the WQEs in SQ if the QP state
+		 * gets into errored mode. Hence, as a workaround to this
+		 * hardware limitation, driver needs to assist in flushing. But
+		 * the flushing operation uses mailbox to convey the QP state to
+		 * the hardware and which can sleep due to the mutex protection
+		 * around the mailbox calls. Hence, use the deferred flush for
+		 * now.
+		 */
+		if (qp->state == IB_QPS_ERR)
+			if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG,
+					      &qp->flush_flag))
+				init_flush_work(hr_dev, qp);
 	}
 
 	spin_unlock_irqrestore(&qp->sq.lock, flags);
@@ -646,10 +640,8 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
 	struct hns_roce_v2_wqe_data_seg *dseg;
 	struct hns_roce_rinl_sge *sge_list;
 	struct device *dev = hr_dev->dev;
-	struct ib_qp_attr attr;
 	unsigned long flags;
 	void *wqe = NULL;
-	int attr_mask;
 	u32 wqe_idx;
 	int nreq;
 	int ret;
@@ -719,19 +711,19 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
 
 		*hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff;
 
-		if (hr_qp->state == IB_QPS_ERR) {
-			attr_mask = IB_QP_STATE;
-			attr.qp_state = IB_QPS_ERR;
-
-			ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, &attr,
-						    attr_mask, hr_qp->state,
-						    IB_QPS_ERR);
-			if (ret) {
-				spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
-				*bad_wr = wr;
-				return ret;
-			}
-		}
+		/*
+		 * Hip08 hardware cannot flush the WQEs in RQ if the QP state
+		 * gets into errored mode. Hence, as a workaround to this
+		 * hardware limitation, driver needs to assist in flushing. But
+		 * the flushing operation uses mailbox to convey the QP state to
+		 * the hardware and which can sleep due to the mutex protection
+		 * around the mailbox calls. Hence, use the deferred flush for
+		 * now.
+		 */
+		if (hr_qp->state == IB_QPS_ERR)
+			if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG,
+					      &hr_qp->flush_flag))
+				init_flush_work(hr_dev, hr_qp);
 	}
 	spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
 
@@ -3013,13 +3005,11 @@ static int hns_roce_v2_sw_poll_cq(struct hns_roce_cq *hr_cq, int num_entries,
 static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 				struct hns_roce_qp **cur_qp, struct ib_wc *wc)
 {
+	struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device);
 	struct hns_roce_srq *srq = NULL;
-	struct hns_roce_dev *hr_dev;
 	struct hns_roce_v2_cqe *cqe;
 	struct hns_roce_qp *hr_qp;
 	struct hns_roce_wq *wq;
-	struct ib_qp_attr attr;
-	int attr_mask;
 	int is_send;
 	u16 wqe_ctr;
 	u32 opcode;
@@ -3043,7 +3033,6 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		       V2_CQE_BYTE_16_LCL_QPN_S);
 
 	if (!*cur_qp || (qpn & HNS_ROCE_V2_CQE_QPN_MASK) != (*cur_qp)->qpn) {
-		hr_dev = to_hr_dev(hr_cq->ib_cq.device);
 		hr_qp = __hns_roce_qp_lookup(hr_dev, qpn);
 		if (unlikely(!hr_qp)) {
 			dev_err(hr_dev->dev, "CQ %06lx with entry for unknown QPN %06x\n",
@@ -3053,6 +3042,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		*cur_qp = hr_qp;
 	}
 
+	hr_qp = *cur_qp;
 	wc->qp = &(*cur_qp)->ibqp;
 	wc->vendor_err = 0;
 
@@ -3137,14 +3127,24 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		break;
 	}
 
-	/* flush cqe if wc status is error, excluding flush error */
-	if ((wc->status != IB_WC_SUCCESS) &&
-	    (wc->status != IB_WC_WR_FLUSH_ERR)) {
-		attr_mask = IB_QP_STATE;
-		attr.qp_state = IB_QPS_ERR;
-		return hns_roce_v2_modify_qp(&(*cur_qp)->ibqp,
-					     &attr, attr_mask,
-					     (*cur_qp)->state, IB_QPS_ERR);
+	/*
+	 * Hip08 hardware cannot flush the WQEs in SQ/RQ if the QP state gets
+	 * into errored mode. Hence, as a workaround to this hardware
+	 * limitation, driver needs to assist in flushing. But the flushing
+	 * operation uses mailbox to convey the QP state to the hardware and
+	 * which can sleep due to the mutex protection around the mailbox calls.
+	 * Hence, use the deferred flush for now. Once wc error detected, the
+	 * flushing operation is needed.
+	 */
+	if (wc->status != IB_WC_SUCCESS &&
+	    wc->status != IB_WC_WR_FLUSH_ERR) {
+		dev_err(hr_dev->dev, "error cqe status is: 0x%x\n",
+			status & HNS_ROCE_V2_CQE_STATUS_MASK);
+
+		if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &hr_qp->flush_flag))
+			init_flush_work(hr_dev, hr_qp);
+
+		return 0;
 	}
 
 	if (wc->status == IB_WC_WR_FLUSH_ERR)
@@ -4735,6 +4735,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 	struct hns_roce_v2_qp_context *context = ctx;
 	struct hns_roce_v2_qp_context *qpc_mask = ctx + 1;
 	struct device *dev = hr_dev->dev;
+	unsigned long sq_flag = 0;
+	unsigned long rq_flag = 0;
 	int ret;
 
 	/*
@@ -4752,6 +4754,9 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 
 	/* When QP state is err, SQ and RQ WQE should be flushed */
 	if (new_state == IB_QPS_ERR) {
+		spin_lock_irqsave(&hr_qp->sq.lock, sq_flag);
+		spin_lock_irqsave(&hr_qp->rq.lock, rq_flag);
+		hr_qp->state = IB_QPS_ERR;
 		roce_set_field(context->byte_160_sq_ci_pi,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S,
@@ -4769,6 +4774,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
 		}
+		spin_unlock_irqrestore(&hr_qp->rq.lock, rq_flag);
+		spin_unlock_irqrestore(&hr_qp->sq.lock, sq_flag);
 	}
 
 	/* Configure the optional fields */

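A note on the locking added to hns_roce_v2_modify_qp() above: the QP is moved to IB_QPS_ERR while both queue locks are held, so the post paths, each of which checks qp->state under its own queue lock before setting the flush flag, do not race with the state transition. A minimal sketch of that pairing follows (demo_* names are illustrative, not driver code):

#include <linux/spinlock.h>
#include <linux/bitops.h>

#define DEMO_QPS_ERR	6	/* stand-in for IB_QPS_ERR */
#define DEMO_FLUSH_FLAG	0

struct demo_locked_qp {
	spinlock_t sq_lock;
	spinlock_t rq_lock;
	int state;
	unsigned long flush_flag;
};

static void demo_locked_qp_init(struct demo_locked_qp *qp)
{
	spin_lock_init(&qp->sq_lock);
	spin_lock_init(&qp->rq_lock);
	qp->state = 0;
	qp->flush_flag = 0;
}

/* Writer side, mirroring hns_roce_v2_modify_qp(): take both queue locks
 * around the state change so neither post path races with it. */
static void demo_qp_to_error(struct demo_locked_qp *qp)
{
	unsigned long sq_flag, rq_flag;

	spin_lock_irqsave(&qp->sq_lock, sq_flag);
	spin_lock_irqsave(&qp->rq_lock, rq_flag);
	qp->state = DEMO_QPS_ERR;
	spin_unlock_irqrestore(&qp->rq_lock, rq_flag);
	spin_unlock_irqrestore(&qp->sq_lock, sq_flag);
}

/* Reader side, mirroring the post-send leg: caller already holds sq_lock,
 * so the state test and the flag update stay consistent with the writer. */
static bool demo_need_flush(struct demo_locked_qp *qp)
{
	return qp->state == DEMO_QPS_ERR &&
	       !test_and_set_bit(DEMO_FLUSH_FLAG, &qp->flush_flag);
}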
drivers/infiniband/hw/hns/hns_roce_qp.c

Lines changed: 7 additions & 4 deletions
@@ -57,10 +57,12 @@ static void flush_work_handle(struct work_struct *work)
 	attr_mask = IB_QP_STATE;
 	attr.qp_state = IB_QPS_ERR;
 
-	ret = hns_roce_modify_qp(&hr_qp->ibqp, &attr, attr_mask, NULL);
-	if (ret)
-		dev_err(dev, "Modify QP to error state failed(%d) during CQE flush\n",
-			ret);
+	if (test_and_clear_bit(HNS_ROCE_FLUSH_FLAG, &hr_qp->flush_flag)) {
+		ret = hns_roce_modify_qp(&hr_qp->ibqp, &attr, attr_mask, NULL);
+		if (ret)
+			dev_err(dev, "Modify QP to error state failed(%d) during CQE flush\n",
+				ret);
+	}
 
 	/*
 	 * make sure we signal QP destroy leg that flush QP was completed
@@ -764,6 +766,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	spin_lock_init(&hr_qp->rq.lock);
 
 	hr_qp->state = IB_QPS_RESET;
+	hr_qp->flush_flag = 0;
 
 	hr_qp->ibqp.qp_type = init_attr->qp_type;

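The call sites in hns_roce_hw_v2.c rely on init_flush_work(), whose body is not part of this commit (it comes from the preceding patch in the series). Purely as an assumption, based on the struct hns_roce_work fields in hns_roce_device.h and the flush_work_handle() shown above, a plausible shape is:

/* Sketch only: the real init_flush_work() is not shown in this diff; the
 * hr_dev->irq_workq workqueue and any reference counting it may take on
 * the QP are assumptions here. */
void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
{
	struct hns_roce_work *flush_work = &hr_qp->flush_work;

	flush_work->hr_dev = hr_dev;
	INIT_WORK(&flush_work->work, flush_work_handle);
	queue_work(hr_dev->irq_workq, &flush_work->work);
}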