Skip to content

Commit ed999f8

Browse files
chucklever authored and jgunthorpe committed
RDMA/cma: Add trace points in RDMA Connection Manager
Record state transitions as each connection is established. The IP address of both peers and the Type of Service is reported. These trace points are not in performance hot paths.

Also, record each cm_event_handler call to ULPs. This eliminates the need for each ULP to add its own similar trace point in its CM event handler function.

These new trace points appear in a new trace subsystem called "rdma_cma".

Sample events:

<...>-220 [004] 121.430733: cm_id_create: cm.id=0
<...>-472 [003] 121.430991: cm_event_handler: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 ADDR_RESOLVED (0/0)
<...>-472 [003] 121.430995: cm_event_done: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 result=0
<...>-472 [003] 121.431172: cm_event_handler: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 ROUTE_RESOLVED (2/0)
<...>-472 [003] 121.431174: cm_event_done: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 result=0
<...>-220 [004] 121.433480: cm_qp_create: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 pd.id=2 qp_type=RC send_wr=4091 recv_wr=256 qp_num=521 rc=0
<...>-220 [004] 121.433577: cm_send_req: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 qp_num=521
kworker/1:2-973 [001] 121.436190: cm_send_mra: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0
kworker/1:2-973 [001] 121.436340: cm_send_rtu: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0
kworker/1:2-973 [001] 121.436359: cm_event_handler: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 ESTABLISHED (9/0)
kworker/1:2-973 [001] 121.436365: cm_event_done: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 result=0
<...>-1975 [005] 123.161954: cm_disconnect: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0
<...>-1975 [005] 123.161974: cm_sent_dreq: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0
<...>-220 [004] 123.162102: cm_disconnect: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0
kworker/0:1-13 [000] 123.162391: cm_event_handler: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 DISCONNECTED (10/0)
kworker/0:1-13 [000] 123.162393: cm_event_done: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 result=0
<...>-220 [004] 123.164456: cm_qp_destroy: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0 qp_num=521
<...>-220 [004] 123.165290: cm_id_destroy: cm.id=0 src=192.168.2.51:35090 dst=192.168.2.55:20049 tos=0

Some features to note:
- restracker ID of the rdma_cm_id is tagged on each trace event
- The source and destination IP addresses and TOS are reported
- CM event upcalls are shown with decoded event and status
- CM state transitions are reported
- rdma_cm_id lifetime events are captured
- The latency of ULP CM event handlers is reported
- Lifetime events of associated QPs are reported
- Device removal and insertion is reported

This patch is based on previous work by:

Saeed Mahameed <[email protected]>
Mukesh Kacker <[email protected]>
Ajaykumar Hotchandani <[email protected]>
Aron Silverton <[email protected]>
Avinash Repaka <[email protected]>
Somasundaram Krishnasamy <[email protected]>

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Chuck Lever <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent ad9efa0 commit ed999f8

File tree

4 files changed

+475
-23
lines changed

4 files changed

+475
-23
lines changed

drivers/infiniband/core/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ ib_cm-y := cm.o
2020

2121
iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o
2222

23-
rdma_cm-y := cma.o
23+
CFLAGS_cma_trace.o += -I$(src)
24+
rdma_cm-y := cma.o cma_trace.o
2425

2526
rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o
2627

drivers/infiniband/core/cma.c

Lines changed: 66 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636

3737
#include "core_priv.h"
3838
#include "cma_priv.h"
39+
#include "cma_trace.h"
3940

4041
MODULE_AUTHOR("Sean Hefty");
4142
MODULE_DESCRIPTION("Generic RDMA CM Agent");
@@ -877,6 +878,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net,
877878
id_priv->id.route.addr.dev_addr.net = get_net(net);
878879
id_priv->seq_num &= 0x00ffffff;
879880

881+
trace_cm_id_create(id_priv);
880882
return &id_priv->id;
881883
}
882884
EXPORT_SYMBOL(__rdma_create_id);
@@ -928,27 +930,34 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
928930
int ret;
929931

930932
id_priv = container_of(id, struct rdma_id_private, id);
931-
if (id->device != pd->device)
932-
return -EINVAL;
933+
if (id->device != pd->device) {
934+
ret = -EINVAL;
935+
goto out_err;
936+
}
933937

934938
qp_init_attr->port_num = id->port_num;
935939
qp = ib_create_qp(pd, qp_init_attr);
936-
if (IS_ERR(qp))
937-
return PTR_ERR(qp);
940+
if (IS_ERR(qp)) {
941+
ret = PTR_ERR(qp);
942+
goto out_err;
943+
}
938944

939945
if (id->qp_type == IB_QPT_UD)
940946
ret = cma_init_ud_qp(id_priv, qp);
941947
else
942948
ret = cma_init_conn_qp(id_priv, qp);
943949
if (ret)
944-
goto err;
950+
goto out_destroy;
945951

946952
id->qp = qp;
947953
id_priv->qp_num = qp->qp_num;
948954
id_priv->srq = (qp->srq != NULL);
955+
trace_cm_qp_create(id_priv, pd, qp_init_attr, 0);
949956
return 0;
950-
err:
957+
out_destroy:
951958
ib_destroy_qp(qp);
959+
out_err:
960+
trace_cm_qp_create(id_priv, pd, qp_init_attr, ret);
952961
return ret;
953962
}
954963
EXPORT_SYMBOL(rdma_create_qp);
@@ -958,6 +967,7 @@ void rdma_destroy_qp(struct rdma_cm_id *id)
958967
struct rdma_id_private *id_priv;
959968

960969
id_priv = container_of(id, struct rdma_id_private, id);
970+
trace_cm_qp_destroy(id_priv);
961971
mutex_lock(&id_priv->qp_mutex);
962972
ib_destroy_qp(id_priv->id.qp);
963973
id_priv->id.qp = NULL;
@@ -1811,6 +1821,7 @@ void rdma_destroy_id(struct rdma_cm_id *id)
18111821
enum rdma_cm_state state;
18121822

18131823
id_priv = container_of(id, struct rdma_id_private, id);
1824+
trace_cm_id_destroy(id_priv);
18141825
state = cma_exch(id_priv, RDMA_CM_DESTROYING);
18151826
cma_cancel_operation(id_priv, state);
18161827

@@ -1863,6 +1874,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
18631874
if (ret)
18641875
goto reject;
18651876

1877+
trace_cm_send_rtu(id_priv);
18661878
ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
18671879
if (ret)
18681880
goto reject;
@@ -1871,6 +1883,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
18711883
reject:
18721884
pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret);
18731885
cma_modify_qp_err(id_priv);
1886+
trace_cm_send_rej(id_priv);
18741887
ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
18751888
NULL, 0, NULL, 0);
18761889
return ret;
@@ -1890,6 +1903,17 @@ static void cma_set_rep_event_data(struct rdma_cm_event *event,
18901903
event->param.conn.qp_num = rep_data->remote_qpn;
18911904
}
18921905

1906+
static int cma_cm_event_handler(struct rdma_id_private *id_priv,
1907+
struct rdma_cm_event *event)
1908+
{
1909+
int ret;
1910+
1911+
trace_cm_event_handler(id_priv, event);
1912+
ret = id_priv->id.event_handler(&id_priv->id, event);
1913+
trace_cm_event_done(id_priv, event, ret);
1914+
return ret;
1915+
}
1916+
18931917
static int cma_ib_handler(struct ib_cm_id *cm_id,
18941918
const struct ib_cm_event *ib_event)
18951919
{
@@ -1912,8 +1936,10 @@ static int cma_ib_handler(struct ib_cm_id *cm_id,
19121936
break;
19131937
case IB_CM_REP_RECEIVED:
19141938
if (cma_comp(id_priv, RDMA_CM_CONNECT) &&
1915-
(id_priv->id.qp_type != IB_QPT_UD))
1939+
(id_priv->id.qp_type != IB_QPT_UD)) {
1940+
trace_cm_send_mra(id_priv);
19161941
ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
1942+
}
19171943
if (id_priv->id.qp) {
19181944
event.status = cma_rep_recv(id_priv);
19191945
event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
@@ -1958,7 +1984,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id,
19581984
goto out;
19591985
}
19601986

1961-
ret = id_priv->id.event_handler(&id_priv->id, &event);
1987+
ret = cma_cm_event_handler(id_priv, &event);
19621988
if (ret) {
19631989
/* Destroy the CM ID by returning a non-zero value. */
19641990
id_priv->cm_id.ib = NULL;
@@ -2119,6 +2145,7 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
21192145
if (IS_ERR(listen_id))
21202146
return PTR_ERR(listen_id);
21212147

2148+
trace_cm_req_handler(listen_id, ib_event->event);
21222149
if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) {
21232150
ret = -EINVAL;
21242151
goto net_dev_put;
@@ -2161,7 +2188,7 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
21612188
* until we're done accessing it.
21622189
*/
21632190
atomic_inc(&conn_id->refcount);
2164-
ret = conn_id->id.event_handler(&conn_id->id, &event);
2191+
ret = cma_cm_event_handler(conn_id, &event);
21652192
if (ret)
21662193
goto err3;
21672194
/*
@@ -2170,8 +2197,10 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
21702197
*/
21712198
mutex_lock(&lock);
21722199
if (cma_comp(conn_id, RDMA_CM_CONNECT) &&
2173-
(conn_id->id.qp_type != IB_QPT_UD))
2200+
(conn_id->id.qp_type != IB_QPT_UD)) {
2201+
trace_cm_send_mra(cm_id->context);
21742202
ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
2203+
}
21752204
mutex_unlock(&lock);
21762205
mutex_unlock(&conn_id->handler_mutex);
21772206
mutex_unlock(&listen_id->handler_mutex);
@@ -2286,7 +2315,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
22862315
event.status = iw_event->status;
22872316
event.param.conn.private_data = iw_event->private_data;
22882317
event.param.conn.private_data_len = iw_event->private_data_len;
2289-
ret = id_priv->id.event_handler(&id_priv->id, &event);
2318+
ret = cma_cm_event_handler(id_priv, &event);
22902319
if (ret) {
22912320
/* Destroy the CM ID by returning a non-zero value. */
22922321
id_priv->cm_id.iw = NULL;
@@ -2363,7 +2392,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
23632392
* until we're done accessing it.
23642393
*/
23652394
atomic_inc(&conn_id->refcount);
2366-
ret = conn_id->id.event_handler(&conn_id->id, &event);
2395+
ret = cma_cm_event_handler(conn_id, &event);
23672396
if (ret) {
23682397
/* User wants to destroy the CM ID */
23692398
conn_id->cm_id.iw = NULL;
@@ -2435,6 +2464,7 @@ static int cma_listen_handler(struct rdma_cm_id *id,
24352464

24362465
id->context = id_priv->id.context;
24372466
id->event_handler = id_priv->id.event_handler;
2467+
trace_cm_event_handler(id_priv, event);
24382468
return id_priv->id.event_handler(id, event);
24392469
}
24402470

@@ -2611,7 +2641,7 @@ static void cma_work_handler(struct work_struct *_work)
26112641
if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
26122642
goto out;
26132643

2614-
if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
2644+
if (cma_cm_event_handler(id_priv, &work->event)) {
26152645
cma_exch(id_priv, RDMA_CM_DESTROYING);
26162646
destroy = 1;
26172647
}
@@ -2634,7 +2664,7 @@ static void cma_ndev_work_handler(struct work_struct *_work)
26342664
id_priv->state == RDMA_CM_DEVICE_REMOVAL)
26352665
goto out;
26362666

2637-
if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
2667+
if (cma_cm_event_handler(id_priv, &work->event)) {
26382668
cma_exch(id_priv, RDMA_CM_DESTROYING);
26392669
destroy = 1;
26402670
}
@@ -3089,7 +3119,7 @@ static void addr_handler(int status, struct sockaddr *src_addr,
30893119
} else
30903120
event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
30913121

3092-
if (id_priv->id.event_handler(&id_priv->id, &event)) {
3122+
if (cma_cm_event_handler(id_priv, &event)) {
30933123
cma_exch(id_priv, RDMA_CM_DESTROYING);
30943124
mutex_unlock(&id_priv->handler_mutex);
30953125
rdma_destroy_id(&id_priv->id);
@@ -3736,7 +3766,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
37363766
goto out;
37373767
}
37383768

3739-
ret = id_priv->id.event_handler(&id_priv->id, &event);
3769+
ret = cma_cm_event_handler(id_priv, &event);
37403770

37413771
rdma_destroy_ah_attr(&event.param.ud.ah_attr);
37423772
if (ret) {
@@ -3800,6 +3830,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
38003830
req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
38013831
req.max_cm_retries = CMA_MAX_CM_RETRIES;
38023832

3833+
trace_cm_send_sidr_req(id_priv);
38033834
ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
38043835
if (ret) {
38053836
ib_destroy_cm_id(id_priv->cm_id.ib);
@@ -3873,6 +3904,7 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
38733904
req.max_cm_retries = CMA_MAX_CM_RETRIES;
38743905
req.srq = id_priv->srq ? 1 : 0;
38753906

3907+
trace_cm_send_req(id_priv);
38763908
ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
38773909
out:
38783910
if (ret && !IS_ERR(id)) {
@@ -3986,6 +4018,7 @@ static int cma_accept_ib(struct rdma_id_private *id_priv,
39864018
rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
39874019
rep.srq = id_priv->srq ? 1 : 0;
39884020

4021+
trace_cm_send_rep(id_priv);
39894022
ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
39904023
out:
39914024
return ret;
@@ -4035,6 +4068,7 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
40354068
rep.private_data = private_data;
40364069
rep.private_data_len = private_data_len;
40374070

4071+
trace_cm_send_sidr_rep(id_priv);
40384072
return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
40394073
}
40404074

@@ -4120,13 +4154,15 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
41204154
return -EINVAL;
41214155

41224156
if (rdma_cap_ib_cm(id->device, id->port_num)) {
4123-
if (id->qp_type == IB_QPT_UD)
4157+
if (id->qp_type == IB_QPT_UD) {
41244158
ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0,
41254159
private_data, private_data_len);
4126-
else
4160+
} else {
4161+
trace_cm_send_rej(id_priv);
41274162
ret = ib_send_cm_rej(id_priv->cm_id.ib,
41284163
IB_CM_REJ_CONSUMER_DEFINED, NULL,
41294164
0, private_data, private_data_len);
4165+
}
41304166
} else if (rdma_cap_iw_cm(id->device, id->port_num)) {
41314167
ret = iw_cm_reject(id_priv->cm_id.iw,
41324168
private_data, private_data_len);
@@ -4151,8 +4187,13 @@ int rdma_disconnect(struct rdma_cm_id *id)
41514187
if (ret)
41524188
goto out;
41534189
/* Initiate or respond to a disconnect. */
4154-
if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
4155-
ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
4190+
trace_cm_disconnect(id_priv);
4191+
if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) {
4192+
if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0))
4193+
trace_cm_sent_drep(id_priv);
4194+
} else {
4195+
trace_cm_sent_dreq(id_priv);
4196+
}
41564197
} else if (rdma_cap_iw_cm(id->device, id->port_num)) {
41574198
ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
41584199
} else
@@ -4218,7 +4259,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
42184259
} else
42194260
event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
42204261

4221-
ret = id_priv->id.event_handler(&id_priv->id, &event);
4262+
ret = cma_cm_event_handler(id_priv, &event);
42224263

42234264
rdma_destroy_ah_attr(&event.param.ud.ah_attr);
42244265
if (ret) {
@@ -4623,6 +4664,7 @@ static void cma_add_one(struct ib_device *device)
46234664
cma_listen_on_dev(id_priv, cma_dev);
46244665
mutex_unlock(&lock);
46254666

4667+
trace_cm_add_one(device);
46264668
return;
46274669

46284670
free_gid_type:
@@ -4653,7 +4695,7 @@ static int cma_remove_id_dev(struct rdma_id_private *id_priv)
46534695
goto out;
46544696

46554697
event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
4656-
ret = id_priv->id.event_handler(&id_priv->id, &event);
4698+
ret = cma_cm_event_handler(id_priv, &event);
46574699
out:
46584700
mutex_unlock(&id_priv->handler_mutex);
46594701
return ret;
@@ -4691,6 +4733,8 @@ static void cma_remove_one(struct ib_device *device, void *client_data)
46914733
{
46924734
struct cma_device *cma_dev = client_data;
46934735

4736+
trace_cm_remove_one(device);
4737+
46944738
if (!cma_dev)
46954739
return;
46964740

drivers/infiniband/core/cma_trace.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
/*
3+
* Trace points for the RDMA Connection Manager.
4+
*
5+
* Author: Chuck Lever <[email protected]>
6+
*
7+
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
8+
*/
9+
10+
#define CREATE_TRACE_POINTS
11+
12+
#include <rdma/rdma_cm.h>
13+
#include <rdma/ib_cm.h>
14+
#include "cma_priv.h"
15+
16+
#include "cma_trace.h"

0 commit comments

Comments
 (0)