
Commit 0676651

RDS: IB: split mr pool to improve 8K messages performance
8K message sizes are a pretty important use case for current RDS workloads, so we make provision to have 8K MRs available from the pool. Based on the number of SGs in the RDS message, we pick a pool to use.

Also, to make sure we don't under-utilise MRs when, say, 8K messages dominate and the 8K pool gets exhausted, we fall back to the 1M pool until the 8K pool recovers for use.

This helps to push at least ~55 kB/s of bidirectional data, which is a nice improvement.

Signed-off-by: Santosh Shilimkar <[email protected]>
Signed-off-by: Santosh Shilimkar <[email protected]>
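For orientation before reading the diff, here is a minimal standalone C sketch of the selection heuristic the patch adds to rds_ib_alloc_fmr(): a message that fits in at most RDS_FMR_8K_MSG_SIZE scatterlist pages takes an MR from the 8K pool, anything larger takes one from the 1M pool, and a pool whose dirty count has reached 90% of its maximum is abandoned in favour of the other one. The struct and helper names below are illustrative stand-ins, not the kernel types.

/* Illustrative sketch only: simplified stand-ins for the kernel's
 * rds_ib_mr_pool and the pool-selection logic in rds_ib_alloc_fmr().
 */
#define RDS_FMR_8K_MSG_SIZE 2

struct mr_pool {
        int pool_type;          /* RDS_IB_MR_8K_POOL or RDS_IB_MR_1M_POOL */
        int dirty_count;        /* MRs waiting to be unmapped/flushed */
        int max_items;          /* hard cap on MRs in this pool */
};

static struct mr_pool *pick_pool(struct mr_pool *pool_8k,
                                 struct mr_pool *pool_1m, int npages)
{
        /* Small transfers (up to RDS_FMR_8K_MSG_SIZE pages) prefer the 8K pool. */
        struct mr_pool *pool = (npages <= RDS_FMR_8K_MSG_SIZE) ? pool_8k : pool_1m;

        /* Fall back to the other pool when this one is ~90% exhausted. */
        if (pool->dirty_count >= pool->max_items * 9 / 10)
                pool = (pool == pool_8k) ? pool_1m : pool_8k;

        return pool;
}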
1 parent 41a4e96 commit 0676651

File tree

4 files changed: +147, -62 lines changed

net/rds/ib.c

Lines changed: 33 additions & 14 deletions
@@ -43,14 +43,14 @@
 #include "rds.h"
 #include "ib.h"
 
-static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
-unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
+unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE;
+unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE;
 unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
 
-module_param(fmr_pool_size, int, 0444);
-MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
-module_param(fmr_message_size, int, 0444);
-MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
+module_param(rds_ib_fmr_1m_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA");
+module_param(rds_ib_fmr_8k_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA");
 module_param(rds_ib_retry_count, int, 0444);
 MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
 
@@ -97,8 +97,10 @@ static void rds_ib_dev_free(struct work_struct *work)
         struct rds_ib_device *rds_ibdev = container_of(work,
                                         struct rds_ib_device, free_work);
 
-        if (rds_ibdev->mr_pool)
-                rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+        if (rds_ibdev->mr_8k_pool)
+                rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
+        if (rds_ibdev->mr_1m_pool)
+                rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
         if (rds_ibdev->pd)
                 ib_dealloc_pd(rds_ibdev->pd);
 
@@ -148,9 +150,13 @@ static void rds_ib_add_one(struct ib_device *device)
         rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
 
         rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
-        rds_ibdev->max_fmrs = dev_attr->max_mr ?
-                        min_t(unsigned int, dev_attr->max_mr, fmr_pool_size) :
-                        fmr_pool_size;
+        rds_ibdev->max_1m_fmrs = dev_attr->max_mr ?
+                min_t(unsigned int, (dev_attr->max_mr / 2),
+                      rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
+
+        rds_ibdev->max_8k_fmrs = dev_attr->max_mr ?
+                min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE),
+                      rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
 
         rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
         rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
@@ -162,12 +168,25 @@ static void rds_ib_add_one(struct ib_device *device)
                 goto put_dev;
         }
 
-        rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
-        if (IS_ERR(rds_ibdev->mr_pool)) {
-                rds_ibdev->mr_pool = NULL;
+        rds_ibdev->mr_1m_pool =
+                rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL);
+        if (IS_ERR(rds_ibdev->mr_1m_pool)) {
+                rds_ibdev->mr_1m_pool = NULL;
                 goto put_dev;
         }
 
+        rds_ibdev->mr_8k_pool =
+                rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL);
+        if (IS_ERR(rds_ibdev->mr_8k_pool)) {
+                rds_ibdev->mr_8k_pool = NULL;
+                goto put_dev;
+        }
+
+        rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
+                 dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
+                 rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
+                 rds_ibdev->max_8k_fmrs);
+
         INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
         INIT_LIST_HEAD(&rds_ibdev->conn_list);
 
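Note how rds_ib_add_one() now budgets the device capability: when dev_attr->max_mr is reported, half of it is reserved for 1M FMRs and the other half, multiplied by RDS_MR_8K_SCALE, for 8K FMRs, each capped by its respective module parameter.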

net/rds/ib.h

Lines changed: 31 additions & 12 deletions
@@ -9,8 +9,11 @@
 #include "rds.h"
 #include "rdma_transport.h"
 
-#define RDS_FMR_SIZE 256
-#define RDS_FMR_POOL_SIZE 8192
+#define RDS_FMR_1M_POOL_SIZE (8192 / 2)
+#define RDS_FMR_1M_MSG_SIZE 256
+#define RDS_FMR_8K_MSG_SIZE 2
+#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1))
+#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
 
 #define RDS_IB_MAX_SGE 8
 #define RDS_IB_RECV_SGE 2
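With these values, RDS_MR_8K_SCALE evaluates to 256 / (2 + 1) = 85, so RDS_FMR_8K_POOL_SIZE is 85 * (8192 / 2) = 348160 entries against 4096 for RDS_FMR_1M_POOL_SIZE: the original single pool of 8192 FMRs is split in half, and the 8K half is scaled up by roughly the ratio of the two mapping sizes (a 1M FMR maps up to 257 pages, an 8K FMR only 3).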
@@ -189,15 +192,23 @@ struct rds_ib_ipaddr {
         struct rcu_head rcu;
 };
 
+enum {
+        RDS_IB_MR_8K_POOL,
+        RDS_IB_MR_1M_POOL,
+};
+
 struct rds_ib_device {
         struct list_head list;
         struct list_head ipaddr_list;
         struct list_head conn_list;
         struct ib_device *dev;
         struct ib_pd *pd;
-        struct rds_ib_mr_pool *mr_pool;
-        unsigned int fmr_max_remaps;
         unsigned int max_fmrs;
+        struct rds_ib_mr_pool *mr_1m_pool;
+        struct rds_ib_mr_pool *mr_8k_pool;
+        unsigned int fmr_max_remaps;
+        unsigned int max_8k_fmrs;
+        unsigned int max_1m_fmrs;
         int max_sge;
         unsigned int max_wrs;
         unsigned int max_initiator_depth;
@@ -239,12 +250,18 @@ struct rds_ib_statistics {
         uint64_t s_ib_ack_send_delayed;
         uint64_t s_ib_ack_send_piggybacked;
         uint64_t s_ib_ack_received;
-        uint64_t s_ib_rdma_mr_alloc;
-        uint64_t s_ib_rdma_mr_free;
-        uint64_t s_ib_rdma_mr_used;
-        uint64_t s_ib_rdma_mr_pool_flush;
-        uint64_t s_ib_rdma_mr_pool_wait;
-        uint64_t s_ib_rdma_mr_pool_depleted;
+        uint64_t s_ib_rdma_mr_8k_alloc;
+        uint64_t s_ib_rdma_mr_8k_free;
+        uint64_t s_ib_rdma_mr_8k_used;
+        uint64_t s_ib_rdma_mr_8k_pool_flush;
+        uint64_t s_ib_rdma_mr_8k_pool_wait;
+        uint64_t s_ib_rdma_mr_8k_pool_depleted;
+        uint64_t s_ib_rdma_mr_1m_alloc;
+        uint64_t s_ib_rdma_mr_1m_free;
+        uint64_t s_ib_rdma_mr_1m_used;
+        uint64_t s_ib_rdma_mr_1m_pool_flush;
+        uint64_t s_ib_rdma_mr_1m_pool_wait;
+        uint64_t s_ib_rdma_mr_1m_pool_depleted;
         uint64_t s_ib_atomic_cswp;
         uint64_t s_ib_atomic_fadd;
 };
@@ -296,7 +313,8 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
 extern struct ib_client rds_ib_client;
 
-extern unsigned int fmr_message_size;
+extern unsigned int rds_ib_fmr_1m_pool_size;
+extern unsigned int rds_ib_fmr_8k_pool_size;
 extern unsigned int rds_ib_retry_count;
 
 extern spinlock_t ib_nodev_conns_lock;
@@ -326,7 +344,8 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_destroy_nodev_conns(void);
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
+                                             int npages);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,

net/rds/ib_rdma.c

Lines changed: 71 additions & 30 deletions
@@ -65,6 +65,7 @@ struct rds_ib_mr {
  * Our own little FMR pool
  */
 struct rds_ib_mr_pool {
+        unsigned int pool_type;
         struct mutex flush_lock;                /* serialize fmr invalidate */
         struct delayed_work flush_worker;       /* flush worker */
 
@@ -234,43 +235,47 @@ void rds_ib_destroy_nodev_conns(void)
                 rds_conn_destroy(ic->conn);
 }
 
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
+                                             int pool_type)
 {
         struct rds_ib_mr_pool *pool;
 
         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
         if (!pool)
                 return ERR_PTR(-ENOMEM);
 
+        pool->pool_type = pool_type;
         init_llist_head(&pool->free_list);
         init_llist_head(&pool->drop_list);
         init_llist_head(&pool->clean_list);
         mutex_init(&pool->flush_lock);
         init_waitqueue_head(&pool->flush_wait);
         INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
 
-        pool->fmr_attr.max_pages = fmr_message_size;
+        if (pool_type == RDS_IB_MR_1M_POOL) {
+                /* +1 allows for unaligned MRs */
+                pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1;
+                pool->max_items = RDS_FMR_1M_POOL_SIZE;
+        } else {
+                /* pool_type == RDS_IB_MR_8K_POOL */
+                pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1;
+                pool->max_items = RDS_FMR_8K_POOL_SIZE;
+        }
+
+        pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
         pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
         pool->fmr_attr.page_shift = PAGE_SHIFT;
-        pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
-
-        /* We never allow more than max_items MRs to be allocated.
-         * When we exceed more than max_items_soft, we start freeing
-         * items more aggressively.
-         * Make sure that max_items > max_items_soft > max_items / 2
-         */
         pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
-        pool->max_items = rds_ibdev->max_fmrs;
 
         return pool;
 }
 
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
 {
-        struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+        struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
 
-        iinfo->rdma_mr_max = pool->max_items;
-        iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
+        iinfo->rdma_mr_max = pool_1m->max_items;
+        iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
 }
 
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
@@ -312,15 +317,29 @@ static inline void wait_clean_list_grace(void)
         }
 }
 
-static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
+static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
+                                          int npages)
 {
-        struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+        struct rds_ib_mr_pool *pool;
         struct rds_ib_mr *ibmr = NULL;
         int err = 0, iter = 0;
 
+        if (npages <= RDS_FMR_8K_MSG_SIZE)
+                pool = rds_ibdev->mr_8k_pool;
+        else
+                pool = rds_ibdev->mr_1m_pool;
+
         if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
                 queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
 
+        /* Switch pools if one of the pool is reaching upper limit */
+        if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) {
+                if (pool->pool_type == RDS_IB_MR_8K_POOL)
+                        pool = rds_ibdev->mr_1m_pool;
+                else
+                        pool = rds_ibdev->mr_8k_pool;
+        }
+
         while (1) {
                 ibmr = rds_ib_reuse_fmr(pool);
                 if (ibmr)
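This hunk carries the core of the change described in the commit message: rds_ib_alloc_fmr() now takes the page count, picks the 8K or 1M pool from it, still schedules a flush once a pool is 10% dirty, and switches to the other pool when the preferred one has dirtied 90% of its max_items.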
@@ -341,12 +360,18 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
                 atomic_dec(&pool->item_count);
 
                 if (++iter > 2) {
-                        rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
+                        if (pool->pool_type == RDS_IB_MR_8K_POOL)
+                                rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
+                        else
+                                rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
                         return ERR_PTR(-EAGAIN);
                 }
 
                 /* We do have some empty MRs. Flush them out. */
-                rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
+                if (pool->pool_type == RDS_IB_MR_8K_POOL)
+                        rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
+                else
+                        rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
                 rds_ib_flush_mr_pool(pool, 0, &ibmr);
                 if (ibmr)
                         return ibmr;
@@ -371,7 +396,12 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
                 goto out_no_cigar;
         }
 
-        rds_ib_stats_inc(s_ib_rdma_mr_alloc);
+        ibmr->pool = pool;
+        if (pool->pool_type == RDS_IB_MR_8K_POOL)
+                rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+        else
+                rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
         return ibmr;
 
 out_no_cigar:
@@ -427,7 +457,7 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
         }
 
         page_cnt += len >> PAGE_SHIFT;
-        if (page_cnt > fmr_message_size)
+        if (page_cnt > ibmr->pool->fmr_attr.max_pages)
                 return -EINVAL;
 
         dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
@@ -459,7 +489,10 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
         ibmr->sg_dma_len = sg_dma_len;
         ibmr->remap_count++;
 
-        rds_ib_stats_inc(s_ib_rdma_mr_used);
+        if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+                rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
+        else
+                rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
         ret = 0;
 
 out:
@@ -591,7 +624,7 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
  * to free as many MRs as needed to get back to this limit.
  */
 static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
-                                int free_all, struct rds_ib_mr **ibmr_ret)
+                                int free_all, struct rds_ib_mr **ibmr_ret)
 {
         struct rds_ib_mr *ibmr, *next;
         struct llist_node *clean_nodes;
@@ -602,11 +635,14 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
         unsigned int nfreed = 0, dirty_to_clean = 0, free_goal;
         int ret = 0;
 
-        rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
+        if (pool->pool_type == RDS_IB_MR_8K_POOL)
+                rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush);
+        else
+                rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush);
 
         if (ibmr_ret) {
                 DEFINE_WAIT(wait);
-                while(!mutex_trylock(&pool->flush_lock)) {
+                while (!mutex_trylock(&pool->flush_lock)) {
                         ibmr = rds_ib_reuse_fmr(pool);
                         if (ibmr) {
                                 *ibmr_ret = ibmr;
@@ -663,8 +699,12 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
         list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
                 unpinned += ibmr->sg_len;
                 __rds_ib_teardown_mr(ibmr);
-                if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
-                        rds_ib_stats_inc(s_ib_rdma_mr_free);
+                if (nfreed < free_goal ||
+                    ibmr->remap_count >= pool->fmr_attr.max_maps) {
+                        if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+                                rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
+                        else
+                                rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
                         list_del(&ibmr->unmap_list);
                         ib_dealloc_fmr(ibmr->fmr);
                         kfree(ibmr);
@@ -756,10 +796,11 @@ void rds_ib_flush_mrs(void)
 
         down_read(&rds_ib_devices_lock);
         list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-                struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+                if (rds_ibdev->mr_8k_pool)
+                        rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL);
 
-                if (pool)
-                        rds_ib_flush_mr_pool(pool, 0, NULL);
+                if (rds_ibdev->mr_1m_pool)
+                        rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL);
         }
         up_read(&rds_ib_devices_lock);
 }
@@ -777,12 +818,12 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                 goto out;
         }
 
-        if (!rds_ibdev->mr_pool) {
+        if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) {
                 ret = -ENODEV;
                 goto out;
         }
 
-        ibmr = rds_ib_alloc_fmr(rds_ibdev);
+        ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
         if (IS_ERR(ibmr)) {
                 rds_ib_dev_put(rds_ibdev);
                 return ibmr;
