Skip to content

Commit 0a1367a

Browse files
venkatsubra and Brian Maly
authored and committed
rds: RDS connection does not reconnect after CQ access violation error
The sequence that leads to this state is as follows. 1) First we see CQ error logged. Sep 29 22:32:33 dm54cel14 kernel: [471472.784371] mlx4_core 0000:46:00.0: CQ access violation on CQN 000419 syndrome=0x2 vendor_error_syndrome=0x0 2) That is followed by the drop of the associated RDS connection. Sep 29 22:32:33 dm54cel14 kernel: [471472.784403] RDS/IB: connection <192.168.54.43,192.168.54.1,0> dropped due to 'qp event' 3) We don't get the WR_FLUSH_ERRs for the posted receive buffers after that. 4) RDS is stuck in rds_ib_conn_shutdown while shutting down that connection. crash64> bt 62577 PID: 62577 TASK: ffff88143f045400 CPU: 4 COMMAND: "kworker/u224:1" #0 [ffff8813663bbb58] __schedule at ffffffff816ab68b #1 [ffff8813663bbbb0] schedule at ffffffff816abca7 #2 [ffff8813663bbbd0] schedule_timeout at ffffffff816aee71 #3 [ffff8813663bbc80] rds_ib_conn_shutdown at ffffffffa041f7d1 [rds_rdma] #4 [ffff8813663bbd10] rds_conn_shutdown at ffffffffa03dc6e2 [rds] #5 [ffff8813663bbdb0] rds_shutdown_worker at ffffffffa03e2699 [rds] #6 [ffff8813663bbe00] process_one_work at ffffffff8109cda1 #7 [ffff8813663bbe50] worker_thread at ffffffff8109d92b #8 [ffff8813663bbec0] kthread at ffffffff810a304b #9 [ffff8813663bbf50] ret_from_fork at ffffffff816b0752 crash64> It was stuck here in rds_ib_conn_shutdown for ever: /* quiesce tx and rx completion before tearing down */ while (!wait_event_timeout(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && (atomic_read(&ic->i_signaled_sends) == 0), msecs_to_jiffies(5000))) { /* Try to reap pending RX completions every 5 secs */ if (!rds_ib_ring_empty(&ic->i_recv_ring)) { spin_lock_bh(&ic->i_rx_lock); rds_ib_rx(ic); spin_unlock_bh(&ic->i_rx_lock); } } The recv ring was not empty. w_alloc_ptr = 560 w_free_ptr = 256 This is what Mellanox had to say: When CQ moves to error (e.g. 
due to CQ Overrun, CQ Access violation) FW will generate Async event to notify this error, also the QPs that tries to access this CQ will be put to error state but will not be flushed since we must not post CQEs to a broken CQ. The QP that tries to access will also issue an Async catas event. In summary we cannot wait for any more WR_FLUSH_ERRs in that state. Orabug: 28733324 Reviewed-by: Rama Nichanamatlu <[email protected]> Signed-off-by: Venkat Venkatsubra <[email protected]> Signed-off-by: Brian Maly <[email protected]>
1 parent 53da198 commit 0a1367a

File tree

2 files changed

+20
-6
lines changed

2 files changed

+20
-6
lines changed

net/rds/ib.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
#define NUM_RDS_RECV_SG (PAGE_ALIGN(RDS_MAX_FRAG_SIZE) / PAGE_SIZE)
5454

5555
#define RDS_IB_CLEAN_CACHE 1
56+
#define RDS_IB_CQ_ERR 2
5657

5758
#define RDS_IB_DEFAULT_FREG_PORT_NUM 1
5859
#define RDS_CM_RETRY_SEQ_EN BIT(7)

net/rds/ib_cm.c

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
319319

320320
ic->i_sl = ic->i_cm_id->route.path_rec->sl;
321321
atomic_set(&ic->i_cq_quiesce, 0);
322+
ic->i_flags &= ~RDS_IB_CQ_ERR;
322323

323324
/*
324325
* Init rings and fill recv. this needs to wait until protocol negotiation
@@ -438,8 +439,15 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
438439

439440
static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
440441
{
441-
rdsdebug("event %u (%s) data %p\n",
442+
struct rds_connection *conn = data;
443+
struct rds_ib_connection *ic = conn->c_transport_data;
444+
445+
pr_info("RDS/IB: event %u (%s) data %p\n",
442446
event->event, rds_ib_event_str(event->event), data);
447+
448+
ic->i_flags |= RDS_IB_CQ_ERR;
449+
if (waitqueue_active(&rds_ib_ring_empty_wait))
450+
wake_up(&rds_ib_ring_empty_wait);
443451
}
444452

445453
static void rds_ib_cq_comp_handler_fastreg(struct ib_cq *cq, void *context)
@@ -1409,11 +1417,15 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
14091417

14101418
/* quiesce tx and rx completion before tearing down */
14111419
while (!wait_event_timeout(rds_ib_ring_empty_wait,
1412-
rds_ib_ring_empty(&ic->i_recv_ring) &&
1413-
(atomic_read(&ic->i_signaled_sends) == 0) &&
1414-
(atomic_read(&ic->i_fastreg_wrs) ==
1415-
RDS_IB_DEFAULT_FREG_WR),
1416-
msecs_to_jiffies(5000))) {
1420+
(rds_ib_ring_empty(&ic->i_recv_ring) &&
1421+
(atomic_read(&ic->i_signaled_sends) == 0) &&
1422+
(atomic_read(&ic->i_fastreg_wrs) ==
1423+
RDS_IB_DEFAULT_FREG_WR)) ||
1424+
(ic->i_flags & RDS_IB_CQ_ERR),
1425+
msecs_to_jiffies(5000))) {
1426+
1427+
if (ic->i_flags & RDS_IB_CQ_ERR)
1428+
break;
14171429

14181430
/* Try to reap pending RX completions every 5 secs */
14191431
if (!rds_ib_ring_empty(&ic->i_recv_ring)) {
@@ -1427,6 +1439,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
14271439
tasklet_kill(&ic->i_rtasklet);
14281440

14291441
atomic_set(&ic->i_cq_quiesce, 1);
1442+
ic->i_flags &= ~RDS_IB_CQ_ERR;
14301443

14311444
/* first destroy the ib state that generates callbacks */
14321445
if (ic->i_cm_id->qp)

0 commit comments

Comments
 (0)