Skip to content

Commit dfeefe4

Browse files
Hans Westgaard Ry and vijay-suman
authored and committed
RDS/IB: Fix RDS IB SRQ implementation and tune it
The current version of SRQ, Shared Receive Queue, is rewritten and optimized for better performance.

First version is disabled by default; commits are posted to get a version merged and to make further development easier. Performance is not what we were hoping for, and we need to investigate further to figure out why and what we can do to improve it.

Orabug: 28388725

Signed-off-by: Hans Westgaard Ry <[email protected]>
Tested-by: Ivan Murillo <[email protected]>
Reviewed-by: William Kucharski <[email protected]>

luci => 6.5-rc6
* After commit 2dfe9f114f14 ("rds/ib: Replace cmpxchg_double with try_cmpxchg128"), we need to s/struct lfstack/union lfstack/
* The above substitute is done in three places
* This commit at large will be reverted in a newer luci commit

Signed-off-by: Håkon Bugge <[email protected]>
Reviewed-by: Sharath Srinivasan <[email protected]>

6.5-rc6 => 6.8-rc7
* make rds_ib_srq_refill_worker(), rds_ib_srq_prefill() and rds_ib_srq_destroy_one() static to avoid error: no previous prototype for '...' [-Werror=missing-prototypes]

Signed-off-by: Hans Westgaard Ry <[email protected]>
Reviewed-by: Håkon Bugge <[email protected]>
1 parent a24303d commit dfeefe4

File tree

11 files changed

+730
-588
lines changed

11 files changed

+730
-588
lines changed

net/rds/ib.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -609,10 +609,7 @@ static void rds_ib_dev_free(struct work_struct *work)
609609
bool last_to_free;
610610
int allocated;
611611

612-
if (rds_ibdev->srq) {
613-
rds_ib_srq_exit(rds_ibdev);
614-
kfree(rds_ibdev->srq);
615-
}
612+
rds_ib_srq_exit(rds_ibdev);
616613
rds_ib_free_caches(rds_ibdev);
617614

618615
if (rds_ibdev->mr_8k_pool)
@@ -892,7 +889,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
892889
if (ic) {
893890
iinfo->tos = conn->c_tos;
894891
iinfo->sl = ic->i_sl;
895-
iinfo->frag = ic->i_frag_sz;
892+
iinfo->frag = ic->i_cache_info.ci_frag_sz;
896893
}
897894

898895
if (rds_conn_state(conn) == RDS_CONN_UP) {
@@ -915,7 +912,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
915912
iinfo->flow_ctl_send_credit =
916913
IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits));
917914
rds_ib_get_mr_info(rds_ibdev, iinfo);
918-
iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs);
915+
iinfo->cache_allocs = atomic_read(&ic->i_cache_info.ci_cache_allocs);
919916
iinfo->send_alloc_ctr = ic->i_send_ring.w_alloc_ctr;
920917
iinfo->send_free_ctr =
921918
(uint32_t)atomic_read(&ic->i_send_ring.w_free_ctr);
@@ -964,7 +961,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
964961
if (ic) {
965962
iinfo6->tos = conn->c_tos;
966963
iinfo6->sl = ic->i_sl;
967-
iinfo6->frag = ic->i_frag_sz;
964+
iinfo6->frag = ic->i_cache_info.ci_frag_sz;
968965
}
969966

970967
if (rds_conn_state(conn) == RDS_CONN_UP) {
@@ -987,7 +984,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
987984
iinfo6->flow_ctl_send_credit =
988985
IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits));
989986
rds6_ib_get_mr_info(rds_ibdev, iinfo6);
990-
iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs);
987+
iinfo6->cache_allocs = atomic_read(&ic->i_cache_info.ci_cache_allocs);
991988
iinfo6->send_alloc_ctr = ic->i_send_ring.w_alloc_ctr;
992989
iinfo6->send_free_ctr =
993990
(uint32_t)atomic_read(&ic->i_send_ring.w_free_ctr);
@@ -1420,6 +1417,8 @@ int rds_ib_init(void)
14201417
#if IS_ENABLED(CONFIG_IPV6)
14211418
rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
14221419
#endif
1420+
pr_err("RDS/IB: Shared Receive Queues are %s\n",
1421+
rds_ib_srq_enabled ? "enabled" : "disabled");
14231422

14241423
/* Register with RDMA framework at last. Once registered, upcall
14251424
* can be made so everything should be set up first.

net/rds/ib.h

Lines changed: 78 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,6 @@
5959

6060
#define NUM_RDS_RECV_SG (PAGE_ALIGN(RDS_MAX_FRAG_SIZE) / PAGE_SIZE)
6161

62-
#define RDS_IB_CQ_ERR 2
63-
#define RDS_IB_NEED_SHUTDOWN 3
6462
static inline void set_bit_mb(long nr, unsigned long *flags)
6563
{
6664
/* set_bit() does not imply a memory barrier */
@@ -80,9 +78,12 @@ static inline void clear_bit_mb(long nr, unsigned long *flags)
8078
}
8179

8280
enum rds_ib_conn_flags {
83-
RDS_IB_CLEAN_CACHE,
84-
RDS_IB_CQ_ERR,
85-
RDS_IB_NEED_SHUTDOWN
81+
RDS_IB_CLEAN_CACHE, /* 0x01 */
82+
RDS_IB_CQ_ERR, /* 0x02 */
83+
RDS_IB_NEED_SHUTDOWN, /* 0x04 */
84+
RDS_IB_SRQ_NEED_FLUSH, /* 0x08 */
85+
RDS_IB_SRQ_LAST_WQE_REACHED, /* 0x10 */
86+
RDS_IB_SRQ_CQ_FLUSHED /* 0x20 */
8687
};
8788

8889
#define RDS_IB_DEFAULT_FREG_PORT_NUM 1
@@ -130,6 +131,15 @@ struct rds_ib_refill_cache {
130131

131132
};
132133

134+
struct rds_ib_cache_info {
135+
u16 ci_frag_sz; /* IB fragment size */
136+
u8 ci_frag_pages;
137+
u16 ci_frag_cache_inx;
138+
uint ci_irq_local_cpu;
139+
atomic_t ci_cache_allocs;
140+
struct rds_transport *ci_trans;
141+
};
142+
133143
struct rds_ib_conn_priv_cmn {
134144
u8 ricpc_protocol_major;
135145
u8 ricpc_protocol_minor;
@@ -185,8 +195,8 @@ struct rds_ib_recv_work {
185195
struct rds_page_frag *r_frag;
186196
struct ib_recv_wr r_wr;
187197
struct ib_sge r_sge[RDS_IB_MAX_SGE];
188-
struct rds_ib_connection *r_ic;
189-
int r_posted;
198+
unsigned long r_posted;
199+
struct lfstack_el r_stack_entry;
190200
};
191201

192202
struct rds_ib_work_ring {
@@ -335,11 +345,8 @@ struct rds_ib_connection {
335345

336346
/* Protocol version specific information */
337347
unsigned int i_flowctl:1; /* enable/disable flow ctl */
338-
u16 i_frag_sz; /* IB fragment size */
339-
u16 i_frag_cache_sz;
340-
u8 i_frag_pages;
348+
struct rds_ib_cache_info i_cache_info;
341349
unsigned long i_flags;
342-
u16 i_frag_cache_inx;
343350
u16 i_hca_sge;
344351

345352
/* Batched completions */
@@ -349,8 +356,6 @@ struct rds_ib_connection {
349356
unsigned int i_unsolicited_wrs;
350357
u8 i_sl;
351358

352-
atomic_t i_cache_allocs;
353-
354359
struct completion i_last_wqe_complete;
355360

356361
/* Active Bonding */
@@ -364,7 +369,6 @@ struct rds_ib_connection {
364369
spinlock_t i_rx_lock;
365370
unsigned int i_rx_wait_for_handler;
366371
atomic_t i_worker_has_rx;
367-
uint i_irq_local_cpu;
368372

369373
/* For handling delayed release of device related resource. */
370374
struct mutex i_delayed_free_lock;
@@ -399,18 +403,28 @@ struct rds_ib_ipaddr {
399403
struct rcu_head rcu_head;
400404
};
401405

406+
enum rds_ib_srq_flags {
407+
RDS_SRQ_REFILL, /* 0x01 */
408+
};
409+
410+
#define RDS_SRQ_NMBR_STACKS 8 /* must be 2^n */
402411
struct rds_ib_srq {
403412
struct rds_ib_device *rds_ibdev;
404413
struct ib_srq *s_srq;
405414
struct ib_event_handler s_event_handler;
406415
struct rds_ib_recv_work *s_recvs;
407416
u32 s_n_wr;
408-
struct rds_header *s_recv_hdrs;
409-
u64 s_recv_hdrs_dma;
417+
struct rds_header **s_recv_hdrs;
418+
dma_addr_t *s_recv_hdrs_dma;
419+
struct scatterlist *s_recv_hdrs_sg;
410420
atomic_t s_num_posted;
411-
unsigned long s_refill_gate;
421+
unsigned long s_flags;
412422
struct delayed_work s_refill_w;
413423
struct delayed_work s_rearm_w;
424+
atomic_t s_refill_ix;
425+
atomic_t s_release_ix;
426+
struct rds_ib_cache_info s_cache_info;
427+
union lfstack s_stack[RDS_SRQ_NMBR_STACKS];
414428
};
415429

416430

@@ -436,6 +450,7 @@ enum {
436450
};
437451

438452
#define RDS_FRAG_CACHE_ENTRIES (ilog2(RDS_MAX_FRAG_SIZE / PAGE_SIZE) + 1)
453+
#define NMBR_QOS 256
439454

440455
/* Each RDMA device maintains a list of RDS sockets associated with it. The
441456
* following struct is used to represent this association. This struct is
@@ -484,7 +499,14 @@ struct rds_ib_device {
484499
unsigned int max_initiator_depth;
485500
unsigned int max_responder_resources;
486501
spinlock_t spinlock; /* protect the above */
487-
struct rds_ib_srq *srq;
502+
atomic_t refcount;
503+
struct work_struct free_work;
504+
struct rds_ib_srq *srqs[NMBR_QOS];
505+
/* Several QOS connections may invoke rds_ib_srq_get
506+
* concurrently, hence we need protection for rds_ib_srq_get
507+
*/
508+
struct mutex srq_get_lock;
509+
488510
struct rds_ib_port *ports;
489511
struct ib_event_handler event_handler;
490512
int *vector_load;
@@ -551,6 +573,16 @@ struct rds_ib_statistics {
551573
uint64_t s_ib_rx_refill_from_cq;
552574
uint64_t s_ib_rx_refill_from_thread;
553575
uint64_t s_ib_rx_refill_lock_taken;
576+
uint64_t s_ib_srq_refill_from_cm;
577+
uint64_t s_ib_srq_refill_from_rx;
578+
uint64_t s_ib_srq_refill_from_event;
579+
uint64_t s_ib_srq_limit_reached_event;
580+
uint64_t s_ib_srq_refills;
581+
uint64_t s_ib_srq_empty_refills;
582+
uint64_t s_ib_srq_entries_refilled;
583+
uint64_t s_ib_srq_entries_from_stacks;
584+
uint64_t s_ib_srq_jiffies_refilled;
585+
uint64_t s_ib_srq_jiffies_from_stacks;
554586
uint64_t s_ib_rx_alloc_limit;
555587
uint64_t s_ib_rx_total_frags;
556588
uint64_t s_ib_rx_total_incs;
@@ -585,9 +617,6 @@ struct rds_ib_statistics {
585617
uint64_t s_ib_rdma_flush_mr_pool_avoided;
586618
uint64_t s_ib_atomic_cswp;
587619
uint64_t s_ib_atomic_fadd;
588-
uint64_t s_ib_srq_lows;
589-
uint64_t s_ib_srq_refills;
590-
uint64_t s_ib_srq_empty_refills;
591620
uint64_t s_ib_recv_added_to_cache;
592621
uint64_t s_ib_recv_removed_from_cache;
593622
uint64_t s_ib_recv_nmb_added_to_cache;
@@ -708,7 +737,19 @@ u32 __rds_find_ifindex_v4(struct net *net, __be32 addr);
708737
#if IS_ENABLED(CONFIG_IPV6)
709738
u32 __rds_find_ifindex_v6(struct net *net, const struct in6_addr *addr);
710739
#endif
711-
740+
void rds_ib_free_unmap_hdrs(struct ib_device *dev,
741+
struct rds_header ***_hdrs,
742+
dma_addr_t **_dma,
743+
struct scatterlist **_sg,
744+
const int n,
745+
enum dma_data_direction direction);
746+
int rds_ib_alloc_map_hdrs(struct ib_device *dev,
747+
struct rds_header ***_hdrs,
748+
dma_addr_t **_dma,
749+
struct scatterlist **_sg,
750+
char **reason,
751+
const int n,
752+
enum dma_data_direction direction);
712753
/* ib_rdma.c */
713754
struct rds_ib_device *rds_ib_get_device(const struct in6_addr *ipaddr);
714755
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
@@ -754,19 +795,16 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
754795
struct ib_wc *wc,
755796
struct rds_ib_ack_state *state);
756797
void rds_ib_recv_tasklet_fn(unsigned long data);
757-
void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
798+
void rds_ib_recv_init_ring(struct rds_ib_connection *ic, struct rds_ib_srq *srq);
758799
void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
759800
void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
760801
void rds_ib_attempt_ack(struct rds_ib_connection *ic);
761802
void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
762803
u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
763-
void rds_ib_srq_refill(struct work_struct *work);
764-
int rds_ib_srq_prefill_ring(struct rds_ib_device *rds_ibdev);
804+
void rds_ib_srq_refill(struct rds_ib_srq *srq, bool prefill, gfp_t gfp, bool use_worker);
765805
void rds_ib_srq_rearm(struct work_struct *work);
766806
void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required);
767-
void rds_ib_srq_process_recv(struct rds_connection *conn,
768-
struct rds_ib_recv_work *recv, u32 data_len,
769-
struct rds_ib_ack_state *state);
807+
struct rds_ib_srq *rds_ib_srq_get(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
770808
static inline int rds_ib_recv_acquire_refill(struct rds_connection *conn)
771809
{
772810
return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0;
@@ -786,6 +824,18 @@ static inline int rds_ib_recv_acquire_refill(struct rds_connection *conn)
786824
} \
787825
} while (false)
788826

827+
/* The goal here is to just make sure that someone, somewhere
828+
* is posting buffers. If we can't get the refill lock,
829+
* let them do their thing
830+
*/
831+
#define RDS_IB_SRQ_REFILL(srq, prefill, gfp, where, use_worker) do { \
832+
struct rds_ib_srq *s = (srq); \
833+
int np = atomic_read(&s->s_num_posted); \
834+
if (np < rds_ib_srq_hwm_refill) { \
835+
rds_ib_stats_inc(where); \
836+
rds_ib_srq_refill(s, prefill, gfp, use_worker); \
837+
} \
838+
} while (false)
789839
/* ib_ring.c */
790840
void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
791841
void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);

0 commit comments

Comments
 (0)