Skip to content

Commit c96b52e

Browse files
Ajaykumar Hotchandani, vijay-suman
authored and committed
rds: find connection drop reason
This patch attempts to record connection drop details. The rationale for adding this type of patch is that there are too many places from which a connection can get dropped, and in some cases we have no idea what the source of the connection drop was. This is especially painful for issues which are reproducible only in a customer environment. The idea here is to have a tracker variable which keeps the latest value of the connection drop source. We can fetch that tracker variable as needed. Orabug: 22631108 Signed-off-by: Ajaykumar Hotchandani <[email protected]> Signed-off-by: Brian Maly <[email protected]> Reviewed-by: Santosh Shilimkar <[email protected]> Acked-by: Wengang Wang <[email protected]> Orabug: 27364391 (cherry picked from commit 147954a) cherry-pick-repo=linux-uek.git Conflicts: net/rds/af_rds.c net/rds/connection.c net/rds/iw_cm.c net/rds/iw_recv.c net/rds/iw_send.c net/rds/rdma_transport.c net/rds/tcp_send.c Signed-off-by: Gerd Rausch <[email protected]> Signed-off-by: Somasundaram Krishnasamy <[email protected]> Orabug: 33590097 UEK6 => UEK7 (cherry picked from commit 4815d5c) cherry-pick-repo=UEK/production/linux-uek.git Signed-off-by: Gerd Rausch <[email protected]> Reviewed-by: William Kucharski <[email protected]> Orabug: 33590087 UEK7 => LUCI (cherry picked from commit 9f5a18c) cherry-pick-repo=UEK/production/linux-uek.git Signed-off-by: Gerd Rausch <[email protected]> Reviewed-by: William Kucharski <[email protected]>
1 parent 8c963a7 commit c96b52e

File tree

11 files changed

+151
-11
lines changed

11 files changed

+151
-11
lines changed

net/rds/af_rds.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ static int rds_user_reset(struct rds_sock *rs, char __user *optval, int optlen)
334334
"<%pI4,%pI4,%d>\n",
335335
&reset.src.s_addr,
336336
&reset.dst.s_addr, conn->c_tos);
337+
conn->c_drop_source = 1;
337338
rds_conn_drop(conn);
338339
}
339340

net/rds/connection.c

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,7 @@ void rds_conn_shutdown(struct rds_connection *conn, int restart)
322322
mutex_lock(&conn->c_cm_lock);
323323
if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
324324
&& !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
325+
conn->c_drop_source = 2;
325326
rds_conn_error(conn, "shutdown called in state %d\n",
326327
atomic_read(&conn->c_state));
327328
mutex_unlock(&conn->c_cm_lock);
@@ -343,6 +344,7 @@ void rds_conn_shutdown(struct rds_connection *conn, int restart)
343344
* Quite reproducible with loopback connections.
344345
* Mostly harmless.
345346
*/
347+
conn->c_drop_source = 3;
346348
rds_conn_error(conn,
347349
"%s: failed to transition to state DOWN, "
348350
"current state is %d\n",
@@ -398,6 +400,7 @@ void rds_conn_destroy(struct rds_connection *conn)
398400
synchronize_rcu();
399401

400402
/* shut the connection down */
403+
conn->c_drop_source = 4;
401404
rds_conn_drop(conn);
402405
flush_work(&conn->c_down_w);
403406

@@ -610,6 +613,94 @@ void rds_conn_exit(void)
610613
rds_conn_message_info_retrans);
611614
}
612615

616+
/*
 * conn_drop_reason_str - translate a connection drop source code into text.
 * @reason: drop source code, as stored in a connection's c_drop_source.
 *
 * Returns a pointer to a string literal describing @reason; the caller must
 * not modify or free it.  Codes that have no entry below (including 0) yield
 * "unknown reason".
 */
char *conn_drop_reason_str(u8 reason)
{
	/* Here is distribution of drop reason:
	 *
	 * 0-19: rds-core
	 *
	 * 20-119: IB
	 *	20-39: ib_cm
	 *	40-59: event handling
	 *	60-79: data path
	 *	80-119: special features like active bonding
	 *
	 * 120-139: iWARP
	 *
	 * 140-159: TCP
	 *
	 * 160-255: any other future additions
	 *
	 */
	switch (reason) {
	/* rds-core */
	case 1: return "user reset";
	case 2: return "invalid connection state";
	case 3: return "failure to move to DOWN state";
	case 4: return "connection destroy";
	case 5: return "zero lane went down";
	case 6: return "conn_connect failure";
	case 7: return "hb timeout";
	case 8: return "reconnect timeout";

	/* IB: ib_cm */
	case 20: return "race between ESTABLISHED event and drop";
	case 21: return "conn is not in CONNECTING state";
	case 22: return "qp event";
	case 23: return "base conn down";
	case 24: return "incoming REQ in CONN_UP state";
	case 25: return "incoming REQ in CONNECTING state";
	/*
	 * 26 is set on the rds_ib_cm_handle_connect() path and 28 on the
	 * rds_ib_cm_initiate_connect() path.  Only the text reaches the log,
	 * so the two must read differently to tell which side failed.
	 */
	case 26: return "setup_qp failure (handling incoming connect)";
	case 27: return "rdma_accept failure";
	case 28: return "setup_qp failure (initiating connect)";
	case 29: return "rdma_connect failure";

	/* IB: event handling */
	case 40: return "rdma_set_ib_paths failure";
	case 41: return "resolve_route failure";
	case 42: return "detected rdma_cm_id mismatch";
	case 43: return "ROUTE_ERROR event";
	case 44: return "ADDR_ERROR event";
	case 45: return "CONNECT_ERROR or UNREACHABLE or DEVICE_REMOVE event";
	case 46: return "CONSUMER_DEFINED reject";
	case 47: return "REJECTED event";
	case 48: return "ADDR_CHANGE event";
	case 49: return "DISCONNECTED event";
	case 50: return "TIMEWAIT_EXIT event";

	/* IB: data path */
	case 60: return "post_recv failure";
	case 61: return "send_ack failure";
	case 62: return "no header in incoming msg";
	case 63: return "corrupted header in incoming msg";
	case 64: return "fragment header mismatch";
	case 65: return "recv completion error";
	case 66: return "send completion error";
	case 67: return "post_send failure";

	/* IB: special features */
	case 80: return "rds_rdma module unload";
	case 81: return "active bonding failover";
	case 82: return "corresponding loopback conn drop";
	case 83: return "active bonding failback";

	/*
	 * iWARP
	 *
	 * NOTE(review): 122 and 124 share the same text; presumably they
	 * mirror IB codes 26/28 (passive vs. active side) -- confirm against
	 * the iWARP call sites and disambiguate the same way.
	 */
	case 120: return "qp_event";
	case 121: return "incoming REQ in connecting state";
	case 122: return "setup_qp failure";
	case 123: return "rdma_accept failure";
	case 124: return "setup_qp failure";
	case 125: return "rdma_connect failure";

	case 130: return "post_recv failure";
	case 131: return "send_ack failure";
	case 132: return "no header in incoming msg";
	case 133: return "corrupted header in incoming msg";
	case 134: return "fragment header mismatch";
	case 135: return "recv completion error";
	case 136: return "send completion error";

	/* TCP */
	case 140: return "sk_state to TCP_CLOSE";
	case 141: return "tcp_send failure";

	default: return "unknown reason";
	}
}
703+
613704
static void rds_conn_probe_lanes(struct rds_connection *conn)
614705
{
615706
struct hlist_head *head =
@@ -632,6 +723,7 @@ static void rds_conn_probe_lanes(struct rds_connection *conn)
632723
&tmp->c_faddr,
633724
tmp->c_tos);
634725

726+
conn->c_drop_source = 5;
635727
rds_conn_drop(tmp);
636728
}
637729
}
@@ -653,10 +745,11 @@ void rds_conn_drop(struct rds_connection *conn)
653745
conn->c_reconnect_err = 0;
654746
conn->c_reconnect_racing = 0;
655747
printk(KERN_INFO "RDS/IB: connection "
656-
"<%pI4,%pI4,%d> dropped\n",
748+
"<%pI4,%pI4,%d> dropped due to '%s'\n",
657749
&conn->c_laddr,
658750
&conn->c_faddr,
659-
conn->c_tos);
751+
conn->c_tos,
752+
conn_drop_reason_str(conn->c_drop_source));
660753

661754
if (conn->c_tos == 0)
662755
rds_conn_probe_lanes(conn);

net/rds/ib.c

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -197,8 +197,10 @@ void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
197197
"calling rds_conn_drop to drop all connections.\n");
198198

199199
spin_lock_irqsave(&rds_ibdev->spinlock, flags);
200-
list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
200+
list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) {
201+
ic->conn->c_drop_source = 80;
201202
rds_conn_drop(ic->conn);
203+
}
202204
spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
203205
}
204206

@@ -626,6 +628,7 @@ static void rds_ib_conn_drop(struct work_struct *_work)
626628
rds_rtd(RDS_RTD_CM_EXT,
627629
"conn: %p, calling rds_conn_drop\n", conn);
628630

631+
conn->c_drop_source = 81;
629632
rds_conn_drop(conn);
630633

631634
kfree(work);
@@ -853,10 +856,11 @@ static int rds_ib_move_ip(char *from_dev,
853856
ic->conn->c_faddr &&
854857
ic2->conn->c_faddr ==
855858
ic->conn->c_laddr) {
856-
rds_rtd(RDS_RTD_CM_EXT_P,
857-
"conn:%p, tos %d, calling rds_conn_drop\n",
858-
ic2->conn,
859-
ic2->conn->c_tos);
859+
rds_rtd(RDS_RTD_CM_EXT_P,
860+
"conn:%p, tos %d, calling rds_conn_drop\n",
861+
ic2->conn,
862+
ic2->conn->c_tos);
863+
ic2->conn->c_drop_source = 82;
860864
rds_conn_drop(ic2->conn);
861865
}
862866
}
@@ -885,6 +889,7 @@ static int rds_ib_move_ip(char *from_dev,
885889
rds_rtd(RDS_RTD_CM_EXT,
886890
"conn: %p, tos %d, calling rds_conn_drop\n",
887891
ic->conn, ic->conn->c_tos);
892+
ic->conn->c_drop_source = 83;
888893
rds_conn_drop(ic->conn);
889894
}
890895
}

net/rds/ib_cm.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
178178
rds_rtd(RDS_RTD_CM,
179179
"ic->i_cm_id is NULL, ic: %p, calling rds_conn_drop\n",
180180
ic);
181+
conn->c_drop_source = 20;
181182
rds_conn_drop(conn);
182183
return;
183184
}
@@ -188,6 +189,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
188189
rds_rtd(RDS_RTD_CM,
189190
"conn is in connecting state, conn: %p, calling rds_conn_drop\n",
190191
conn);
192+
conn->c_drop_source = 21;
191193
rds_conn_drop(conn);
192194
return;
193195
}
@@ -531,6 +533,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
531533
"Fatal QP Event %u (%s) - connection %pI4->%pI4 tos %d, reconnecting\n",
532534
event->event, rds_ib_event_str(event->event),
533535
&conn->c_laddr, &conn->c_faddr, conn->c_tos);
536+
conn->c_drop_source = 22;
534537
rds_conn_drop(conn);
535538
break;
536539
}
@@ -841,6 +844,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
841844
&conn->c_laddr,
842845
&conn->c_faddr,
843846
conn->c_tos);
847+
conn->c_drop_source = 23;
844848
rds_conn_drop(conn);
845849
}
846850

@@ -863,6 +867,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
863867
if (rds_conn_state(conn) == RDS_CONN_UP) {
864868
rds_rtd(RDS_RTD_CM_EXT_P,
865869
"incoming connect while connecting\n");
870+
conn->c_drop_source = 24;
866871
rds_conn_drop(conn);
867872
rds_ib_stats_inc(s_ib_listen_closed_stale);
868873
} else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
@@ -884,6 +889,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
884889
&conn->c_laddr,
885890
&conn->c_faddr,
886891
conn->c_tos);
892+
conn->c_drop_source = 25;
887893
rds_conn_drop(conn);
888894
rds_ib_stats_inc(s_ib_listen_closed_stale);
889895
} else {
@@ -922,6 +928,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
922928

923929
err = rds_ib_setup_qp(conn);
924930
if (err) {
931+
conn->c_drop_source = 26;
925932
rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
926933
goto out;
927934
}
@@ -936,8 +943,10 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
936943
#endif
937944
/* rdma_accept() calls rdma_reject() internally if it fails */
938945
err = rdma_accept(cm_id, &conn_param);
939-
if (err)
946+
if (err) {
947+
conn->c_drop_source = 27;
940948
rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
949+
}
941950
#if RDMA_RDS_APM_SUPPORTED
942951
else if (rds_ib_apm_enabled && !conn->c_loopback) {
943952
err = rdma_enable_apm(cm_id, RDMA_ALT_PATH_BEST);
@@ -976,15 +985,18 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
976985

977986
ret = rds_ib_setup_qp(conn);
978987
if (ret) {
988+
conn->c_drop_source = 28;
979989
rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
980990
goto out;
981991
}
982992

983993
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
984994
conn->c_proposed_version, UINT_MAX, UINT_MAX);
985995
ret = rdma_connect(cm_id, &conn_param);
986-
if (ret)
996+
if (ret) {
997+
conn->c_drop_source = 29;
987998
rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
999+
}
9881000

9891001
out:
9901002
/* Beware - returning non-zero tells the rdma_cm to destroy

net/rds/ib_recv.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, int can_wait)
591591
recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
592592
(long) sg_dma_address(&recv->r_frag->f_sg), ret);
593593
if (ret) {
594+
conn->c_drop_source = 60;
594595
rds_ib_conn_error(conn, "recv post on "
595596
"%pI4 returned %d, disconnecting and "
596597
"reconnecting\n", &conn->c_faddr,
@@ -858,6 +859,7 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi
858859

859860
rds_ib_stats_inc(s_ib_ack_send_failure);
860861

862+
ic->conn->c_drop_source = 61;
861863
rds_ib_conn_error(ic->conn, "sending ack failed\n");
862864
} else
863865
rds_ib_stats_inc(s_ib_ack_sent);
@@ -1034,6 +1036,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
10341036
data_len);
10351037

10361038
if (data_len < sizeof(struct rds_header)) {
1039+
conn->c_drop_source = 62;
10371040
rds_ib_conn_error(conn, "incoming message "
10381041
"from %pI4 didn't inclue a "
10391042
"header, disconnecting and "
@@ -1047,6 +1050,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
10471050

10481051
/* Validate the checksum. */
10491052
if (!rds_message_verify_checksum(ihdr)) {
1053+
conn->c_drop_source = 63;
10501054
rds_ib_conn_error(conn, "incoming message "
10511055
"from %pI4 has corrupted header - "
10521056
"forcing a reconnect\n",
@@ -1114,6 +1118,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
11141118
|| hdr->h_len != ihdr->h_len
11151119
|| hdr->h_sport != ihdr->h_sport
11161120
|| hdr->h_dport != ihdr->h_dport) {
1121+
conn->c_drop_source = 64;
11171122
rds_ib_conn_error(conn,
11181123
"fragment header mismatch; forcing reconnect\n");
11191124
return;
@@ -1274,6 +1279,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
12741279
} else {
12751280
/* We expect errors as the qp is drained during shutdown */
12761281
if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
1282+
conn->c_drop_source = 65;
12771283
rds_ib_conn_error(conn, "recv completion "
12781284
"<%pI4,%pI4,%d> had "
12791285
"status %u, disconnecting and "

net/rds/ib_send.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
341341

342342
/* We expect errors as the qp is drained during shutdown */
343343
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
344+
conn->c_drop_source = 66;
344345
rds_ib_conn_error(conn,
345346
"send completion <%pI4,%pI4,%d> status "
346347
"%u vendor_err %u, disconnecting and reconnecting\n",
@@ -807,6 +808,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
807808
prev->s_op = NULL;
808809
}
809810

811+
ic->conn->c_drop_source = 67;
810812
rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
811813
goto out;
812814
}

0 commit comments

Comments
 (0)