Skip to content

Commit 147954a

Browse files
Ajaykumar HotchandaniLinuxMinion
authored and committed
rds: find connection drop reason
This patch attempts to find connection drop details. The rationale for adding this type of patch is that there are too many places from which a connection can get dropped and, in some cases, we have no idea of the source of the connection drop. This is especially painful for issues which are reproducible only in a customer environment. The idea here is that we have a tracker variable which keeps the latest value of the connection drop source. We can fetch that tracker variable as per our need. Orabug: 22631108 Signed-off-by: Ajaykumar Hotchandani <[email protected]> Reviewed-by: Santosh Shilimkar <[email protected]> Acked-by: Wengang Wang <[email protected]> Signed-off-by: Brian Maly <[email protected]>
1 parent 67fb744 commit 147954a

File tree

14 files changed

+166
-13
lines changed

14 files changed

+166
-13
lines changed

net/rds/af_rds.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ static int rds_user_reset(struct rds_sock *rs, char __user *optval, int optlen)
334334
"<%u.%u.%u.%u,%u.%u.%u.%u,%d>\n",
335335
NIPQUAD(reset.src.s_addr),
336336
NIPQUAD(reset.dst.s_addr), conn->c_tos);
337+
conn->c_drop_source = 1;
337338
rds_conn_drop(conn);
338339
}
339340

net/rds/connection.c

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,7 @@ void rds_conn_shutdown(struct rds_connection *conn, int restart)
322322
mutex_lock(&conn->c_cm_lock);
323323
if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
324324
&& !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
325+
conn->c_drop_source = 2;
325326
rds_conn_error(conn, "shutdown called in state %d\n",
326327
atomic_read(&conn->c_state));
327328
mutex_unlock(&conn->c_cm_lock);
@@ -343,6 +344,7 @@ void rds_conn_shutdown(struct rds_connection *conn, int restart)
343344
* Quite reproduceable with loopback connections.
344345
* Mostly harmless.
345346
*/
347+
conn->c_drop_source = 3;
346348
rds_conn_error(conn,
347349
"%s: failed to transition to state DOWN, "
348350
"current state is %d\n",
@@ -398,6 +400,7 @@ void rds_conn_destroy(struct rds_connection *conn)
398400
synchronize_rcu();
399401

400402
/* shut the connection down */
403+
conn->c_drop_source = 4;
401404
rds_conn_drop(conn);
402405
flush_work(&conn->c_down_w);
403406

@@ -610,6 +613,94 @@ void rds_conn_exit(void)
610613
rds_conn_message_info_retrans);
611614
}
612615

616+
/**
 * conn_drop_reason_str - translate a connection drop reason code into text
 * @reason: drop reason code, as stored in a connection's c_drop_source
 *
 * Returns a pointer to a string literal describing @reason. Codes that have
 * no entry below (including 0) yield "unknown reason". The returned string
 * is a literal and must not be modified or freed by the caller.
 *
 * Distribution of drop reason codes:
 *
 *   0-19:    rds-core
 *
 *   20-119:  IB
 *     20-39:   ib_cm
 *     40-59:   event handling
 *     60-79:   data path
 *     80-119:  special features like active bonding
 *
 *   120-139: iWARP
 *
 *   140-159: TCP
 *
 *   160-255: any other future additions
 */
char *conn_drop_reason_str(u8 reason)
{
	switch (reason) {
	/* rds-core */
	case 1: return "user reset";
	case 2: return "invalid connection state";
	case 3: return "failure to move to DOWN state";
	case 4: return "connection destroy";
	case 5: return "zero lane went down";
	case 6: return "conn_connect failure";
	case 7: return "hb timeout";
	case 8: return "reconnect timeout";

	/* IB: ib_cm */
	case 20: return "race between ESTABLISHED event and drop";
	case 21: return "conn is not in CONNECTING state";
	case 22: return "qp event";
	case 23: return "base conn down";
	case 24: return "incoming REQ in CONN_UP state";
	case 25: return "incoming REQ in CONNECTING state";
	/*
	 * 26 is raised from rds_ib_cm_handle_connect() and 28 from
	 * rds_ib_cm_initiate_connect(); keep the texts distinct so the
	 * log line identifies which path failed.
	 */
	case 26: return "setup_qp failure (handle_connect)";
	case 27: return "rdma_accept failure";
	case 28: return "setup_qp failure (initiate_connect)";
	case 29: return "rdma_connect failure";

	/* IB: event handling */
	case 40: return "rdma_set_ib_paths failure";
	case 41: return "resolve_route failure";
	case 42: return "detected rdma_cm_id mismatch";
	case 43: return "ROUTE_ERROR event";
	case 44: return "ADDR_ERROR event";
	case 45: return "CONNECT_ERROR or UNREACHABLE or DEVICE_REMOVE event";
	case 46: return "CONSUMER_DEFINED reject";
	case 47: return "REJECTED event";
	case 48: return "ADDR_CHANGE event";
	case 49: return "DISCONNECTED event";
	case 50: return "TIMEWAIT_EXIT event";

	/* IB: data path */
	case 60: return "post_recv failure";
	case 61: return "send_ack failure";
	case 62: return "no header in incoming msg";
	case 63: return "corrupted header in incoming msg";
	case 64: return "fragment header mismatch";
	case 65: return "recv completion error";
	case 66: return "send completion error";
	case 67: return "post_send failure";

	/* IB: special features */
	case 80: return "rds_rdma module unload";
	case 81: return "active bonding failover";
	case 82: return "corresponding loopback conn drop";
	case 83: return "active bonding failback";

	/* iWARP: connection management */
	case 120: return "qp_event";
	/*
	 * 121 is raised by rds_iw_cm_handle_connect() when the incoming
	 * connect finds the connection in RDS_CONN_UP, mirroring IB code 24.
	 */
	case 121: return "incoming REQ in CONN_UP state";
	/*
	 * 122 is raised from rds_iw_cm_handle_connect() and 124 from
	 * rds_iw_cm_initiate_connect(); same split as IB codes 26/28.
	 */
	case 122: return "setup_qp failure (handle_connect)";
	case 123: return "rdma_accept failure";
	case 124: return "setup_qp failure (initiate_connect)";
	case 125: return "rdma_connect failure";

	/* iWARP: data path */
	case 130: return "post_recv failure";
	case 131: return "send_ack failure";
	case 132: return "no header in incoming msg";
	case 133: return "corrupted header in incoming msg";
	case 134: return "fragment header mismatch";
	case 135: return "recv completion error";
	case 136: return "send completion error";

	/* TCP */
	case 140: return "sk_state to TCP_CLOSE";
	case 141: return "tcp_send failure";

	default: return "unknown reason";
	}
}
703+
613704
static void rds_conn_probe_lanes(struct rds_connection *conn)
614705
{
615706
struct hlist_head *head =
@@ -632,6 +723,7 @@ static void rds_conn_probe_lanes(struct rds_connection *conn)
632723
NIPQUAD(tmp->c_faddr),
633724
tmp->c_tos);
634725

726+
conn->c_drop_source = 5;
635727
rds_conn_drop(tmp);
636728
}
637729
}
@@ -653,10 +745,11 @@ void rds_conn_drop(struct rds_connection *conn)
653745
conn->c_reconnect_err = 0;
654746
conn->c_reconnect_racing = 0;
655747
printk(KERN_INFO "RDS/IB: connection "
656-
"<%u.%u.%u.%u,%u.%u.%u.%u,%d> dropped\n",
748+
"<%u.%u.%u.%u,%u.%u.%u.%u,%d> dropped due to '%s'\n",
657749
NIPQUAD(conn->c_laddr),
658750
NIPQUAD(conn->c_faddr),
659-
conn->c_tos);
751+
conn->c_tos,
752+
conn_drop_reason_str(conn->c_drop_source));
660753

661754
if (conn->c_tos == 0)
662755
rds_conn_probe_lanes(conn);

net/rds/ib.c

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -197,8 +197,10 @@ void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
197197
"calling rds_conn_drop to drop all connections.\n");
198198

199199
spin_lock_irqsave(&rds_ibdev->spinlock, flags);
200-
list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
200+
list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) {
201+
ic->conn->c_drop_source = 80;
201202
rds_conn_drop(ic->conn);
203+
}
202204
spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
203205
}
204206

@@ -625,6 +627,7 @@ static void rds_ib_conn_drop(struct work_struct *_work)
625627
rds_rtd(RDS_RTD_CM_EXT,
626628
"conn: %p, calling rds_conn_drop\n", conn);
627629

630+
conn->c_drop_source = 81;
628631
rds_conn_drop(conn);
629632

630633
kfree(work);
@@ -852,10 +855,11 @@ static int rds_ib_move_ip(char *from_dev,
852855
ic->conn->c_faddr &&
853856
ic2->conn->c_faddr ==
854857
ic->conn->c_laddr) {
855-
rds_rtd(RDS_RTD_CM_EXT_P,
856-
"conn:%p, tos %d, calling rds_conn_drop\n",
857-
ic2->conn,
858-
ic2->conn->c_tos);
858+
rds_rtd(RDS_RTD_CM_EXT_P,
859+
"conn:%p, tos %d, calling rds_conn_drop\n",
860+
ic2->conn,
861+
ic2->conn->c_tos);
862+
ic2->conn->c_drop_source = 82;
859863
rds_conn_drop(ic2->conn);
860864
}
861865
}
@@ -884,6 +888,7 @@ static int rds_ib_move_ip(char *from_dev,
884888
rds_rtd(RDS_RTD_CM_EXT,
885889
"conn: %p, tos %d, calling rds_conn_drop\n",
886890
ic->conn, ic->conn->c_tos);
891+
ic->conn->c_drop_source = 83;
887892
rds_conn_drop(ic->conn);
888893
}
889894
}

net/rds/ib_cm.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
177177
rds_rtd(RDS_RTD_CM,
178178
"ic->i_cm_id is NULL, ic: %p, calling rds_conn_drop\n",
179179
ic);
180+
conn->c_drop_source = 20;
180181
rds_conn_drop(conn);
181182
return;
182183
}
@@ -187,6 +188,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
187188
rds_rtd(RDS_RTD_CM,
188189
"conn is in connecting state, conn: %p, calling rds_conn_drop\n",
189190
conn);
191+
conn->c_drop_source = 21;
190192
rds_conn_drop(conn);
191193
return;
192194
}
@@ -530,6 +532,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
530532
"Fatal QP Event %u (%s) - connection %pI4->%pI4 tos %d, reconnecting\n",
531533
event->event, rds_ib_event_str(event->event),
532534
&conn->c_laddr, &conn->c_faddr, conn->c_tos);
535+
conn->c_drop_source = 22;
533536
rds_conn_drop(conn);
534537
break;
535538
}
@@ -833,6 +836,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
833836
NIPQUAD(conn->c_laddr),
834837
NIPQUAD(conn->c_faddr),
835838
conn->c_tos);
839+
conn->c_drop_source = 23;
836840
rds_conn_drop(conn);
837841
}
838842

@@ -855,6 +859,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
855859
if (rds_conn_state(conn) == RDS_CONN_UP) {
856860
rds_rtd(RDS_RTD_CM_EXT_P,
857861
"incoming connect while connecting\n");
862+
conn->c_drop_source = 24;
858863
rds_conn_drop(conn);
859864
rds_ib_stats_inc(s_ib_listen_closed_stale);
860865
} else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
@@ -876,6 +881,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
876881
NIPQUAD(conn->c_laddr),
877882
NIPQUAD(conn->c_faddr),
878883
conn->c_tos);
884+
conn->c_drop_source = 25;
879885
rds_conn_drop(conn);
880886
rds_ib_stats_inc(s_ib_listen_closed_stale);
881887
} else {
@@ -914,6 +920,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
914920

915921
err = rds_ib_setup_qp(conn);
916922
if (err) {
923+
conn->c_drop_source = 26;
917924
rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
918925
goto out;
919926
}
@@ -928,8 +935,10 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
928935
#endif
929936
/* rdma_accept() calls rdma_reject() internally if it fails */
930937
err = rdma_accept(cm_id, &conn_param);
931-
if (err)
938+
if (err) {
939+
conn->c_drop_source = 27;
932940
rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
941+
}
933942
#if RDMA_RDS_APM_SUPPORTED
934943
else if (rds_ib_apm_enabled && !conn->c_loopback) {
935944
err = rdma_enable_apm(cm_id, RDMA_ALT_PATH_BEST);
@@ -968,15 +977,18 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
968977

969978
ret = rds_ib_setup_qp(conn);
970979
if (ret) {
980+
conn->c_drop_source = 28;
971981
rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
972982
goto out;
973983
}
974984

975985
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
976986
conn->c_proposed_version, UINT_MAX, UINT_MAX);
977987
ret = rdma_connect(cm_id, &conn_param);
978-
if (ret)
988+
if (ret) {
989+
conn->c_drop_source = 29;
979990
rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
991+
}
980992

981993
out:
982994
/* Beware - returning non-zero tells the rdma_cm to destroy

net/rds/ib_recv.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
596596
recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
597597
(long) sg_dma_address(&recv->r_frag->f_sg), ret);
598598
if (ret) {
599+
conn->c_drop_source = 60;
599600
rds_ib_conn_error(conn, "recv post on "
600601
"%pI4 returned %d, disconnecting and "
601602
"reconnecting\n", &conn->c_faddr,
@@ -863,6 +864,7 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi
863864

864865
rds_ib_stats_inc(s_ib_ack_send_failure);
865866

867+
ic->conn->c_drop_source = 61;
866868
rds_ib_conn_error(ic->conn, "sending ack failed\n");
867869
} else
868870
rds_ib_stats_inc(s_ib_ack_sent);
@@ -1039,6 +1041,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
10391041
data_len);
10401042

10411043
if (data_len < sizeof(struct rds_header)) {
1044+
conn->c_drop_source = 62;
10421045
rds_ib_conn_error(conn, "incoming message "
10431046
"from %pI4 didn't inclue a "
10441047
"header, disconnecting and "
@@ -1052,6 +1055,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
10521055

10531056
/* Validate the checksum. */
10541057
if (!rds_message_verify_checksum(ihdr)) {
1058+
conn->c_drop_source = 63;
10551059
rds_ib_conn_error(conn, "incoming message "
10561060
"from %pI4 has corrupted header - "
10571061
"forcing a reconnect\n",
@@ -1119,6 +1123,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
11191123
|| hdr->h_len != ihdr->h_len
11201124
|| hdr->h_sport != ihdr->h_sport
11211125
|| hdr->h_dport != ihdr->h_dport) {
1126+
conn->c_drop_source = 64;
11221127
rds_ib_conn_error(conn,
11231128
"fragment header mismatch; forcing reconnect\n");
11241129
return;
@@ -1279,6 +1284,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
12791284
} else {
12801285
/* We expect errors as the qp is drained during shutdown */
12811286
if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
1287+
conn->c_drop_source = 65;
12821288
rds_ib_conn_error(conn, "recv completion "
12831289
"<%pI4,%pI4,%d> had "
12841290
"status %u, disconnecting and "

net/rds/ib_send.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
341341

342342
/* We expect errors as the qp is drained during shutdown */
343343
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
344+
conn->c_drop_source = 66;
344345
rds_ib_conn_error(conn,
345346
"send completion <%u.%u.%u.%u,%u.%u.%u.%u,%d> status "
346347
"%u vendor_err %u, disconnecting and reconnecting\n",
@@ -807,6 +808,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
807808
prev->s_op = NULL;
808809
}
809810

811+
ic->conn->c_drop_source = 67;
810812
rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
811813
goto out;
812814
}

net/rds/iw_cm.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
160160
"- connection %pI4->%pI4, reconnecting\n",
161161
event->event, &conn->c_laddr,
162162
&conn->c_faddr);
163+
conn->c_drop_source = 120;
163164
rds_conn_drop(conn);
164165
break;
165166
}
@@ -415,6 +416,7 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
415416
if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
416417
if (rds_conn_state(conn) == RDS_CONN_UP) {
417418
rdsdebug("incoming connect while connecting\n");
419+
conn->c_drop_source = 121;
418420
rds_conn_drop(conn);
419421
rds_iw_stats_inc(s_iw_listen_closed_stale);
420422
} else
@@ -451,6 +453,7 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
451453

452454
err = rds_iw_setup_qp(conn);
453455
if (err) {
456+
conn->c_drop_source = 122;
454457
rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
455458
goto out;
456459
}
@@ -461,6 +464,7 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
461464
err = rdma_accept(cm_id, &conn_param);
462465
mutex_unlock(&conn->c_cm_lock);
463466
if (err) {
467+
conn->c_drop_source = 123;
464468
rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
465469
goto out;
466470
}
@@ -488,16 +492,18 @@ int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
488492

489493
ret = rds_iw_setup_qp(conn);
490494
if (ret) {
495+
conn->c_drop_source = 124;
491496
rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
492497
goto out;
493498
}
494499

495500
rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
496501

497502
ret = rdma_connect(cm_id, &conn_param);
498-
if (ret)
503+
if (ret) {
504+
conn->c_drop_source = 125;
499505
rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
500-
506+
}
501507
out:
502508
/* Beware - returning non-zero tells the rdma_cm to destroy
503509
* the cm_id. We should certainly not do it as long as we still

0 commit comments

Comments
 (0)