
Commit 930c491

Hakon-Bugge authored and vijay-suman committed
rds: Put back pages on the CPU that allocated them
The RDBMS usage model for RDS dictates that an RDS message, which is an
SG list populated with enough order-zero pages to hold the RDS message,
is allocated where the RDBMS process runs. And those processes run on
any CPU.

The reliability aspect of RDS requires RDS to keep the message until
the peer has acknowledged it, which happens by an explicit or an
implicit ACK. When said ACKs are received, the pages in the SG list are
freed (put). However, this only happens on the NUMA node local to the
HCA used for communication, due to how cellirqbalance works.

The above facts lead to a surplus of order-zero pages on the NUMA node
local to the HCA and, similarly, a deficit of order-zero pages on the
other NUMA nodes. Even though the SLAB allocation system is supposedly
lock-free for order-zero pages, locks are taken in the above scenario
in order to re-establish balance. This in turn leads to lock contention
in the kernel and reduced RDBMS IOPS for certain workloads.

This is fixed by maintaining a per-CPU cache of pages. In
rds_message_copy_from_user(), we allocate from the per-CPU cache if
possible. Likewise, when purging RDS messages, we put the pages back on
the per-CPU list of the CPU that allocated them, unless that list is
already at its cap.

To avoid pages being stuck forever in the per-CPU cache, a garbage
collector runs by default every second and cleans 10% of the possible
CPU caches per invocation. When the RDS module is removed, we stop the
worker thread and flush the caches of 100% of all possible CPUs.

This optimization provides an 11% improvement in update IOPS on an X10M
VM running Oracle RDBMS.

Orabug: 35768362

Suggested-by: Jane Chu <[email protected]>
Signed-off-by: Håkon Bugge <[email protected]>
Tested-by: Håkon Bugge <[email protected]>
Tested-by: Shih-Yu Huang <[email protected]>
Reviewed-by: William Kucharski <[email protected]>
Tested-by: Alexis Silva <[email protected]>

Orabug: 35768362

LUCI => v6.11

Conflicts:
	net/rds/sysctl.c - Merge conflict due to the missing ctl_table
	sentinel (Orabug 36936368)

Reviewed-by: Hans Westgaard Ry <[email protected]>
Signed-off-by: Håkon Bugge <[email protected]>
1 parent decdc40 commit 930c491
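The patch builds on a lock-free stack ("lfstack") abstraction; net/rds/message.c includes "lfstack.h", but that header is not part of this diff. For orientation only, here is a minimal sketch of a Treiber-style stack exposing the operations the patch calls (lfstack_pop(), lfstack_push_many(), lfstack_link(), lfstack_next()). The union layout and the lack of ABA protection are assumptions; the real header may well differ.

#include <linux/atomic.h>
#include <linux/compiler.h>

struct lfstack_el {
	struct lfstack_el *next;
};

union lfstack {
	struct lfstack_el *head;
	/* A real implementation would likely pair the pointer with a
	 * generation counter (or use other means) to defeat ABA races;
	 * omitted here for brevity.
	 */
};

/* Chain el in front of next while building a private (unpublished) list. */
static inline void lfstack_link(struct lfstack_el *el, struct lfstack_el *next)
{
	el->next = next;
}

static inline struct lfstack_el *lfstack_next(struct lfstack_el *el)
{
	return el->next;
}

/* Publish a privately linked chain first..last with a single CAS loop. */
static inline void lfstack_push_many(union lfstack *stack,
				     struct lfstack_el *first,
				     struct lfstack_el *last)
{
	struct lfstack_el *old = READ_ONCE(stack->head);

	do {
		last->next = old;
	} while (!try_cmpxchg(&stack->head, &old, first));
}

/* Pop one element, or return NULL if the stack is empty.
 * Note: the naive old->next read races with concurrent pop-and-free;
 * a production lfstack must guard against that as well as ABA.
 */
static inline struct lfstack_el *lfstack_pop(union lfstack *stack)
{
	struct lfstack_el *old = READ_ONCE(stack->head);

	do {
		if (!old)
			return NULL;
	} while (!try_cmpxchg(&stack->head, &old, old->next));

	return old;
}

With this shape in mind, rds_message_purge() below reads naturally: build a private chain of cache entries with lfstack_link()/lfstack_next(), then publish the whole chain with one lfstack_push_many(), so the put path takes no locks.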

File tree

5 files changed: +168, -11 lines


net/rds/af_rds.c

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2021 Oracle and/or its affiliates.
+ * Copyright (c) 2006, 2023, Oracle and/or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -1449,6 +1449,7 @@ static void __exit rds_exit(void)
 	rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
 #endif
 	kmem_cache_destroy(rds_rs_buf_info_slab);
+	rds_cfu_fini_cache();
 }

 module_exit(rds_exit);
@@ -1503,6 +1504,7 @@ static int __init rds_init(void)
 #endif

 	rds_qos_threshold_init();
+	rds_cfu_init_cache();

 	goto out;

net/rds/message.c

Lines changed: 136 additions & 8 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2020 Oracle and/or its affiliates.
+ * Copyright (c) 2006, 2023, Oracle and/or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -31,8 +31,10 @@
  *
  */
 #include <linux/kernel.h>
+#include <linux/topology.h>

 #include "rds.h"
+#include "lfstack.h"

 static unsigned int rds_exthdr_size[] = {
 	[RDS_EXTHDR_NONE]	= 0,
@@ -47,6 +49,22 @@ static unsigned int rds_exthdr_size[] = {
 	[RDS_EXTHDR_CSUM]	= sizeof(struct rds_ext_header_rdma_csum),
 };

+struct rds_cfu_cache_entry {
+	struct page *pg;
+	struct lfstack_el list;
+};
+
+struct rds_cfu_gc_control {
+	int percent_cpus_to_clean;
+	int next_cpu;
+	struct delayed_work work;
+};
+
+static struct rds_cfu_gc_control gc_control;
+
+DEFINE_PER_CPU(union lfstack, rds_cfu_cache) ____cacheline_aligned;
+DEFINE_PER_CPU(atomic_t, rds_cfu_entries) ____cacheline_aligned;
+static bool rds_cfu_cache_tearing_down;

 void rds_message_addref(struct rds_message *rm)
 {
@@ -55,21 +73,108 @@ void rds_message_addref(struct rds_message *rm)
 }
 EXPORT_SYMBOL_GPL(rds_message_addref);

+static void rds_cfu_cache_do_gc(void)
+{
+	int cpus_to_check = num_possible_cpus() * gc_control.percent_cpus_to_clean / 100;
+	int i;
+
+	if (!cpus_to_check)
+		cpus_to_check = 1;
+
+	for (i = 0; i < cpus_to_check; ++i) {
+		atomic_t *nmbr_entries_ptr = per_cpu_ptr(&rds_cfu_entries, gc_control.next_cpu);
+		union lfstack *stack = per_cpu_ptr(&rds_cfu_cache, gc_control.next_cpu);
+		unsigned int nmbr_cleaned = 0;
+		struct lfstack_el *el;
+
+		while ((el = lfstack_pop(stack))) {
+			struct rds_cfu_cache_entry *entry =
+				container_of(el, struct rds_cfu_cache_entry, list);
+
+			++nmbr_cleaned;
+			rds_page_free(entry->pg);
+		}
+
+		atomic_sub(nmbr_cleaned, nmbr_entries_ptr);
+		rds_stats_add(s_copy_from_user_cache_get, nmbr_cleaned);
+		if (++gc_control.next_cpu >= num_possible_cpus())
+			gc_control.next_cpu = 0;
+	}
+}
+
+static void rds_cfu_cache_gc_worker(struct work_struct *work)
+{
+	rds_cfu_cache_do_gc();
+
+	/* To pair with smp_store_release() below */
+	if (!smp_load_acquire(&rds_cfu_cache_tearing_down))
+		rds_queue_delayed_work(NULL, rds_wq, &gc_control.work,
+				       msecs_to_jiffies(rds_cfu_cache_gc_interval * 1000),
+				       "CFU_Cache_gc");
+}
+
+void rds_cfu_init_cache(void)
+{
+	INIT_DELAYED_WORK(&gc_control.work, rds_cfu_cache_gc_worker);
+
+	gc_control.percent_cpus_to_clean = 10;
+	gc_control.next_cpu = 0;
+	rds_queue_delayed_work(NULL, rds_wq, &gc_control.work,
+			       msecs_to_jiffies(rds_cfu_cache_gc_interval * 1000),
+			       "CFU_Cache_gc");
+}
+
+void rds_cfu_fini_cache(void)
+{
+	/* To pair with the smp_load_acquire() above */
+	smp_store_release(&rds_cfu_cache_tearing_down, true);
+	cancel_delayed_work_sync(&gc_control.work);
+
+	gc_control.percent_cpus_to_clean = 100;
+	rds_cfu_cache_do_gc();
+}
+
 /*
  * This relies on dma_map_sg() not touching sg[].page during merging.
  */
 static void rds_message_purge(struct rds_message *rm)
 {
+	atomic_t *nmbr_entries_ptr = per_cpu_ptr(&rds_cfu_entries, rm->m_alloc_cpu);
+	union lfstack *stack = per_cpu_ptr(&rds_cfu_cache, rm->m_alloc_cpu);
+	struct lfstack_el *first = NULL;
+	unsigned int cache_puts = 0;
+	struct lfstack_el *last;
 	unsigned long i;

 	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
 		return;

 	for (i = 0; i < rm->data.op_nents; i++) {
-		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
-		/* XXX will have to put_page for page refs */
-		rds_page_free(sg_page(&rm->data.op_sg[i]));
+		if (rm->m_alloc_cpu != NUMA_NO_NODE && rm->data.op_sg[i].length >= PAGE_SIZE &&
+		    atomic_read(nmbr_entries_ptr) < rds_sysctl_cfu_cache_cap) {
+			struct rds_cfu_cache_entry *entry =
+				page_address(sg_page(rm->data.op_sg + i));
+
+			++cache_puts;
+			if (!first) {
+				first = &entry->list;
+				last = first;
+			} else {
+				lfstack_link(last, &entry->list);
+				last = lfstack_next(last);
+			}
+			entry->pg = sg_page(rm->data.op_sg + i);
+			last->next = NULL;
+		} else {
+			rds_page_free(sg_page(&rm->data.op_sg[i]));
+		}
+	}
+	if (first) {
+		lfstack_push_many(stack, first, last);
+		atomic_add(cache_puts, nmbr_entries_ptr);
+		rds_stats_add(s_copy_from_user_cache_put, cache_puts);
 	}
+
 	rm->data.op_nents = 0;

 	if (rm->rdma.op_active)
@@ -272,6 +377,7 @@ struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)

 	rm->m_used_sgs = 0;
 	rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
+	rm->m_alloc_cpu = NUMA_NO_NODE;

 	atomic_set(&rm->m_refcount, 1);
 	INIT_LIST_HEAD(&rm->m_sock_item);
@@ -319,12 +425,34 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)

 	while (iov_iter_count(from)) {
 		if (!sg_page(sg)) {
-			ret = rds_page_remainder_alloc(sg, iov_iter_count(from),
-						       GFP_HIGHUSER, NUMA_NO_NODE);
-			if (ret)
-				return ret;
+			if (iov_iter_count(from) >= PAGE_SIZE) {
+				union lfstack *stack = per_cpu_ptr(&rds_cfu_cache,
+								   smp_processor_id());
+				struct lfstack_el *el = lfstack_pop(stack);
+
+				if (el) {
+					atomic_t *nmbr_entries_ptr =
+						per_cpu_ptr(&rds_cfu_entries, smp_processor_id());
+					struct rds_cfu_cache_entry *entry =
+						container_of(el, struct rds_cfu_cache_entry, list);
+
+					sg_set_page(sg, entry->pg, PAGE_SIZE, 0);
+					rds_stats_inc(s_copy_from_user_cache_get);
+					atomic_dec(nmbr_entries_ptr);
+				}
+			}
+
+			if (!sg_page(sg)) {
+				ret = rds_page_remainder_alloc(sg, iov_iter_count(from),
+							       GFP_HIGHUSER, NUMA_NO_NODE);
+				if (ret)
+					return ret;
+			}
+
 			rm->data.op_nents++;
 			sg_off = 0;
+			if (rm->m_alloc_cpu == NUMA_NO_NODE)
+				rm->m_alloc_cpu = smp_processor_id();
 		}

 		to_copy = min_t(unsigned long, iov_iter_count(from),
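Two details in rds_message_purge() are worth noting. First, the struct rds_cfu_cache_entry is stored via page_address() inside the very page being cached, so the cache needs no allocations of its own; the entry is simply overwritten once the page is reused for payload. Second, the garbage collector's coverage is easy to work out: with, say, 64 possible CPUs and the default percent_cpus_to_clean of 10, each invocation sweeps 64 * 10 / 100 = 6 caches, so at the default 1-second interval every cache is emptied roughly once every 11 seconds (the 64-CPU figure is purely illustrative).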

net/rds/rds.h

Lines changed: 7 additions & 0 deletions
@@ -725,6 +725,7 @@ struct rds_message {

 	struct rds_conn_path *m_conn_path;
 	struct rds_csum m_payload_csum;
+	int m_alloc_cpu;
 };

 /*
@@ -1036,6 +1037,8 @@ struct rds_statistics {
 	uint64_t s_page_remainder_miss;
 	uint64_t s_copy_to_user;
 	uint64_t s_copy_from_user;
+	uint64_t s_copy_from_user_cache_get;
+	uint64_t s_copy_from_user_cache_put;
 	uint64_t s_cong_update_queued;
 	uint64_t s_cong_update_received;
 	uint64_t s_cong_send_error;
@@ -1363,6 +1366,8 @@ void rds_message_addref(struct rds_message *rm);
 void rds_message_put(struct rds_message *rm);
 void rds_message_wait(struct rds_message *rm);
 void rds_message_unmapped(struct rds_message *rm);
+void rds_cfu_init_cache(void);
+void rds_cfu_fini_cache(void);

 static inline void rds_message_make_checksum(struct rds_header *hdr)
 {
@@ -1492,6 +1497,8 @@ extern unsigned int rds_sysctl_conn_hb_timeout;
 extern unsigned int rds_sysctl_conn_hb_interval;
 extern unsigned long rds_sysctl_dr_sock_cancel_jiffies;
 extern unsigned int rds_sysctl_enable_payload_csum;
+extern unsigned int rds_sysctl_cfu_cache_cap;
+extern unsigned int rds_cfu_cache_gc_interval;

 /* threads.c */
 int rds_threads_init(void);
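A subtlety in the new m_alloc_cpu field: it is initialized to NUMA_NO_NODE (which is -1 in the kernel) as a "not yet allocated" sentinel, but once set it holds a CPU number from smp_processor_id(), not a NUMA node id. The sentinel works because valid CPU ids are non-negative.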

net/rds/stats.c

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2023, Oracle and/or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -87,6 +87,8 @@ static char *rds_stat_names[] = {
 	"page_remainder_miss",
 	"copy_to_user",
 	"copy_from_user",
+	"copy_from_user_cache_get",
+	"copy_from_user_cache_put",
 	"cong_update_queued",
 	"cong_update_received",
 	"cong_send_error",

net/rds/sysctl.c

Lines changed: 19 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2023, Oracle and/or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -88,6 +88,10 @@ static int rds_sysctl_sol_rds = SOL_RDS;

 unsigned int rds_sysctl_enable_payload_csum;

+unsigned int rds_sysctl_cfu_cache_cap = 512;
+
+unsigned int rds_cfu_cache_gc_interval = 1;
+
 static struct ctl_table rds_sysctl_rds_table[] = {
 	{
 		.procname	= "reconnect_min_delay_ms",
@@ -204,6 +208,20 @@ static struct ctl_table rds_sysctl_rds_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_douintvec,
 	},
+	{
+		.procname	= "copy_from_user_per_cpu_cache_cap",
+		.data		= &rds_sysctl_cfu_cache_cap,
+		.maxlen		= sizeof(rds_sysctl_cfu_cache_cap),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec,
+	},
+	{
+		.procname	= "copy_from_user_gc_interval_secs",
+		.data		= &rds_cfu_cache_gc_interval,
+		.maxlen		= sizeof(rds_cfu_cache_gc_interval),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec,
+	},
 };

 void rds_sysctl_exit(void)
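Assuming the RDS sysctl table is registered under net/rds as in mainline, the two new knobs should surface as /proc/sys/net/rds/copy_from_user_per_cpu_cache_cap (per-CPU cap on cached pages, default 512) and /proc/sys/net/rds/copy_from_user_gc_interval_secs (GC period in seconds, default 1), both writable at runtime via proc_douintvec. Raising the cap trades memory pinned in the caches for fewer trips to the page allocator; the exact paths are inferred from the table and may differ if the registration prefix differs.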
