Skip to content

Commit 20c9398

Browse files
Wen Gudavem330
authored andcommitted
net/smc: Resolve the race between SMC-R link access and clear
We encountered some crashes caused by the race between SMC-R link access and link clear that triggered by abnormal link group termination, such as port error. Here is an example of this kind of crashes: BUG: kernel NULL pointer dereference, address: 0000000000000000 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_llc_flow_initiate+0x44/0x190 [smc] Call Trace: <TASK> ? __smc_buf_create+0x75a/0x950 [smc] smcr_lgr_reg_rmbs+0x2a/0xbf [smc] smc_listen_work+0xf72/0x1230 [smc] ? process_one_work+0x25c/0x600 process_one_work+0x25c/0x600 worker_thread+0x4f/0x3a0 ? process_one_work+0x600/0x600 kthread+0x15d/0x1a0 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 </TASK> smc_listen_work() __smc_lgr_terminate() --------------------------------------------------------------- | smc_lgr_free() | |- smcr_link_clear() | |- memset(lnk, 0) smc_listen_rdma_reg() | |- smcr_lgr_reg_rmbs() | |- smc_llc_flow_initiate() | |- access lnk->lgr (panic) | These crashes are similarly caused by clearing SMC-R link resources when some functions is still accessing to them. This patch tries to fix the issue by introducing reference count of SMC-R links and ensuring that the sensitive resources of links won't be cleared until reference count reaches zero. The operation to the SMC-R link reference count can be concluded as follows: object [hold or initialized as 1] [put] -------------------------------------------------------------------- links smcr_link_init() smcr_link_clear() connections smc_conn_create() smc_conn_free() Through this way, the clear of SMC-R links is later than the free of all the smc connections above it, thus avoiding the unsafe reference to SMC-R links. Signed-off-by: Wen Gu <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent ea89c6c commit 20c9398

File tree

2 files changed

+44
-12
lines changed

2 files changed

+44
-12
lines changed

net/smc/smc_core.c

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
748748
}
749749
get_device(&lnk->smcibdev->ibdev->dev);
750750
atomic_inc(&lnk->smcibdev->lnk_cnt);
751+
refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
752+
lnk->clearing = 0;
751753
lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
752754
lnk->link_id = smcr_next_link_id(lgr);
753755
lnk->lgr = lgr;
@@ -998,8 +1000,12 @@ void smc_switch_link_and_count(struct smc_connection *conn,
9981000
struct smc_link *to_lnk)
9991001
{
10001002
atomic_dec(&conn->lnk->conn_cnt);
1003+
/* link_hold in smc_conn_create() */
1004+
smcr_link_put(conn->lnk);
10011005
conn->lnk = to_lnk;
10021006
atomic_inc(&conn->lnk->conn_cnt);
1007+
/* link_put in smc_conn_free() */
1008+
smcr_link_hold(conn->lnk);
10031009
}
10041010

10051011
struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
@@ -1162,6 +1168,8 @@ void smc_conn_free(struct smc_connection *conn)
11621168
if (!lgr->conns_num)
11631169
smc_lgr_schedule_free_work(lgr);
11641170
lgr_put:
1171+
if (!lgr->is_smcd)
1172+
smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */
11651173
smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */
11661174
}
11671175

@@ -1218,22 +1226,11 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk)
12181226
}
12191227
}
12201228

1221-
/* must be called under lgr->llc_conf_mutex lock */
1222-
void smcr_link_clear(struct smc_link *lnk, bool log)
1229+
static void __smcr_link_clear(struct smc_link *lnk)
12231230
{
12241231
struct smc_link_group *lgr = lnk->lgr;
12251232
struct smc_ib_device *smcibdev;
12261233

1227-
if (!lgr || lnk->state == SMC_LNK_UNUSED)
1228-
return;
1229-
lnk->peer_qpn = 0;
1230-
smc_llc_link_clear(lnk, log);
1231-
smcr_buf_unmap_lgr(lnk);
1232-
smcr_rtoken_clear_link(lnk);
1233-
smc_ib_modify_qp_error(lnk);
1234-
smc_wr_free_link(lnk);
1235-
smc_ib_destroy_queue_pair(lnk);
1236-
smc_ib_dealloc_protection_domain(lnk);
12371234
smc_wr_free_link_mem(lnk);
12381235
smc_ibdev_cnt_dec(lnk);
12391236
put_device(&lnk->smcibdev->ibdev->dev);
@@ -1245,6 +1242,35 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
12451242
smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */
12461243
}
12471244

1245+
/* must be called under lgr->llc_conf_mutex lock */
1246+
void smcr_link_clear(struct smc_link *lnk, bool log)
1247+
{
1248+
if (!lnk->lgr || lnk->clearing ||
1249+
lnk->state == SMC_LNK_UNUSED)
1250+
return;
1251+
lnk->clearing = 1;
1252+
lnk->peer_qpn = 0;
1253+
smc_llc_link_clear(lnk, log);
1254+
smcr_buf_unmap_lgr(lnk);
1255+
smcr_rtoken_clear_link(lnk);
1256+
smc_ib_modify_qp_error(lnk);
1257+
smc_wr_free_link(lnk);
1258+
smc_ib_destroy_queue_pair(lnk);
1259+
smc_ib_dealloc_protection_domain(lnk);
1260+
smcr_link_put(lnk); /* theoretically last link_put */
1261+
}
1262+
1263+
void smcr_link_hold(struct smc_link *lnk)
1264+
{
1265+
refcount_inc(&lnk->refcnt);
1266+
}
1267+
1268+
void smcr_link_put(struct smc_link *lnk)
1269+
{
1270+
if (refcount_dec_and_test(&lnk->refcnt))
1271+
__smcr_link_clear(lnk);
1272+
}
1273+
12481274
static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
12491275
struct smc_buf_desc *buf_desc)
12501276
{
@@ -1893,6 +1919,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
18931919
}
18941920
}
18951921
smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */
1922+
if (!conn->lgr->is_smcd)
1923+
smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */
18961924
conn->freed = 0;
18971925
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
18981926
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;

net/smc/smc_core.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,8 @@ struct smc_link {
137137
u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */
138138
u8 link_idx; /* index in lgr link array */
139139
u8 link_is_asym; /* is link asymmetric? */
140+
u8 clearing : 1; /* link is being cleared */
141+
refcount_t refcnt; /* link reference count */
140142
struct smc_link_group *lgr; /* parent link group */
141143
struct work_struct link_down_wrk; /* wrk to bring link down */
142144
char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */
@@ -526,6 +528,8 @@ void smc_core_exit(void);
526528
int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
527529
u8 link_idx, struct smc_init_info *ini);
528530
void smcr_link_clear(struct smc_link *lnk, bool log);
531+
void smcr_link_hold(struct smc_link *lnk);
532+
void smcr_link_put(struct smc_link *lnk);
529533
void smc_switch_link_and_count(struct smc_connection *conn,
530534
struct smc_link *to_lnk);
531535
int smcr_buf_map_lgr(struct smc_link *lnk);

0 commit comments

Comments
 (0)