RDS: avoid queueing delayed work on an offlined cpu

pkannoju · jfvogel · commit adeee1b8caef · 2025-04-22T18:54:02.000-07:00
During cpu scaling operations, when an rds delayed_work with non-zero delay is scheduled on an offlined cpu, we've seen that the work gets stuck and resides in the send queue without getting transmitted. The stuck work is flushed out only when other traffic on that connection path is submitted from a non-worker context. This situation is causing latency in the rds traffic, especially visible in the rds-ping data. We've reproduced this issue in-house with simple cpu scale-down activity. Corresponding details are shown below. ----------------------------------------- [Tue Dec 24 06:47:33 2024] Unregister pv shared memory for cpu 52 [Tue Dec 24 06:47:33 2024] smpboot: CPU 52 is now offline [Tue Dec 24 06:47:35 2024] <::ffff:192.168.10.15,::ffff:192.168.10.17,0> work scheduled on offine cpu: 52, delay: 1, raw_smp_processor_id: 22 PID: 53903 Comm: ora_dia0_c219cd [Tue Dec 24 06:47:35 2024] CPU: 22 PID: 53903 Comm: ora_dia0_c219cd Kdump: loaded Tainted: P OE 5.4.17-2136.322.6.5.el8uek.x86_64 #2 [Tue Dec 24 06:47:35 2024] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-4.module+el8.10.0+90413+d8f5961d 04/01/2014 [Tue Dec 24 06:47:35 2024] Call Trace: [Tue Dec 24 06:47:35 2024] dump_stack+0x6d/0x8f [Tue Dec 24 06:47:35 2024] rds_queue_delayed_work_on+0x131/0x140 [ksplice_4nnxk5aq_rds_new] [Tue Dec 24 06:47:35 2024] rds_sendmsg+0x1339/0x1499 [rds] [Tue Dec 24 06:47:35 2024] ? __check_object_size+0x51/0x1c7 [Tue Dec 24 06:47:35 2024] ? _copy_from_user+0x34/0x64 [Tue Dec 24 06:47:35 2024] ? rw_copy_check_uvector+0x61/0x13f [Tue Dec 24 06:47:35 2024] sock_sendmsg+0x67/0x69 [Tue Dec 24 06:47:35 2024] ____sys_sendmsg+0x1fe/0x266 [Tue Dec 24 06:47:35 2024] ? copy_msghdr_from_user+0x60/0x8f [Tue Dec 24 06:47:35 2024] ___sys_sendmsg+0x7c/0xb9 [Tue Dec 24 06:47:35 2024] ? 
___sys_recvmsg+0x89/0xb8 [Tue Dec 24 06:47:35 2024] __sys_sendmsg+0x5c/0xa2 [Tue Dec 24 06:47:35 2024] __x64_sys_sendmsg+0x1f/0x25 [Tue Dec 24 06:47:35 2024] do_syscall_64+0x60/0x1cf [Tue Dec 24 06:47:35 2024] entry_SYSCALL_64_after_hwframe+0x175/0x0 [Tue Dec 24 06:47:35 2024] RIP: 0033:0x7f4bebd1aa85 ----------------------------------------- The above stack indicates that the oracle db process "ora_dia0_c219cd" issued rds-related work on the connection between 192.168.10.15 and 192.168.10.17 on lane 0, which was scheduled to run on CPU 52 at 06:47:35, which had just been offlined at 06:47:33. This started the increase in rds-ping latencies on the same connection. ----------------------------------------- [INFO:2024-12-24-06:42:20] numactl --cpunodebind=0 --membind=0 rds-ping -c 1 -i 5 -Q 0 -I 192.168.10.17 192.168.10.15: 1: 75 usec [INFO:2024-12-24-06:43:21] numactl --cpunodebind=0 --membind=0 rds-ping -c 1 -i 5 -Q 0 -I 192.168.10.17 192.168.10.15: 1: 90 usec [INFO:2024-12-24-06:44:41] numactl --cpunodebind=0 --membind=0 rds-ping -c 1 -i 5 -Q 0 -I 192.168.10.17 192.168.10.15: 1: 103 usec [INFO:2024-12-24-06:45:41] numactl --cpunodebind=0 --membind=0 rds-ping -c 1 -i 5 -Q 0 -I 192.168.10.17 192.168.10.15: 1: 97 usec [INFO:2024-12-24-06:46:41] numactl --cpunodebind=0 --membind=0 rds-ping -c 1 -i 5 -Q 0 -I 192.168.10.17 192.168.10.15: 1: 99 usec [INFO:2024-12-24-06:47:48] numactl --cpunodebind=0 --membind=0 rds-ping -c 1 -i 5 -Q 0 -I 192.168.10.17 192.168.10.15: 1: 1101878 usec [INFO:2024-12-24-06:48:48] numactl --cpunodebind=0 --membind=0 rds-ping -c 1 -i 5 -Q 0 -I 192.168.10.17 192.168.10.15: 1: 70558 usec [INFO:2024-12-24-06:49:50] numactl --cpunodebind=0 --membind=0 rds-ping -c 1 -i 5 -Q 0 -I 192.168.10.17 192.168.10.15: 1: 717324 usec ----------------------------------------- The patch we're proposing to fix this issue ensures that we execute the delayed work on a cpu which is online at the moment. 
If the cpu goes offline after this, the timer migrates to an available cpu and the work gets executed instead of remaining stuck. We've verified the performance through rds-stress tests to ensure there is no huge performance impact with this patch. QA tests for this patch are in progress. Orabug: 37260584 Signed-off-by: Praveen Kumar Kannoju <praveen.kannoju@oracle.com> Reviewed-by: Imran Khan <imran.f.khan@oracle.com> Acked-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Signed-off-by: Arumugam Kolappan <aru.kolappan@oracle.com> Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com> (cherry picked from commit dfcbc82) Orabug: 37551308 Signed-off-by: Arumugam Kolappan <aru.kolappan@oracle.com> Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
@@ -645,7 +645,7 @@ static void rds_ib_queue_delayed_work_on(struct rds_ib_device *rds_ibdev,
 					 char *reason)
 {
 	trace_rds_ib_queue_work(rds_ibdev, wq, &dwork->work, delay, reason);
-	queue_delayed_work_on(cpu, wq, dwork, delay);
+	__rds_queue_delayed_work_on(cpu, wq, dwork, delay);
 }
 
 static void rds_ib_queue_cancel_work(struct rds_ib_device *rds_ibdev,
diff --git a/net/rds/rds.h b/net/rds/rds.h
@@ -1244,6 +1244,10 @@ void rds_queue_delayed_work_on(struct rds_conn_path *cp, int cpu,
 			       struct workqueue_struct *wq,
 			       struct delayed_work *dwork,
 			       unsigned long delay, char *reason);
+void __rds_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+				 struct delayed_work *dwork,
+				 unsigned long delay);
+
 void rds_mod_delayed_work(struct rds_conn_path *cp,
 			  struct workqueue_struct *wq,
 			  struct delayed_work *dwork,
diff --git a/net/rds/threads.c b/net/rds/threads.c
@@ -112,6 +112,28 @@ void rds_queue_work(struct rds_conn_path *cp,
 }
 EXPORT_SYMBOL_GPL(rds_queue_work);
 
+void __rds_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+				 struct delayed_work *dwork,
+				 unsigned long delay)
+{
+	if (!delay || cpu == WORK_CPU_UNBOUND) {
+		queue_delayed_work_on(cpu, wq, dwork, delay);
+		return;
+	}
+
+	if (cpus_read_trylock()) {
+		if (cpu_online(cpu)) {
+			queue_delayed_work_on(cpu, wq, dwork, delay);
+			cpus_read_unlock();
+			return;
+		}
+		cpus_read_unlock();
+	}
+
+	queue_delayed_work(wq, dwork, delay);
+}
+EXPORT_SYMBOL_GPL(__rds_queue_delayed_work_on);
+
 void rds_queue_delayed_work(struct rds_conn_path *cp,
 			    struct workqueue_struct *wq,
 			    struct delayed_work *dwork,
@@ -125,7 +147,7 @@ void rds_queue_delayed_work(struct rds_conn_path *cp,
 
 	if (cp && cp->cp_conn->c_trans->conn_preferred_cpu) {
 		cpu = cp->cp_conn->c_trans->conn_preferred_cpu(cp->cp_conn, false);
-		queue_delayed_work_on(cpu, wq, dwork, delay);
+		__rds_queue_delayed_work_on(cpu, wq, dwork, delay);
 	} else
 		queue_delayed_work(wq, dwork, delay);
 }
@@ -140,7 +162,7 @@ void rds_queue_delayed_work_on(struct rds_conn_path *cp,
 {
 	trace_rds_queue_work(cp ? cp->cp_conn : NULL, cp, wq, &dwork->work,
 			     delay, reason);
-	queue_delayed_work_on(cpu, wq, dwork, delay);
+	__rds_queue_delayed_work_on(cpu, wq, dwork, delay);
 }
 EXPORT_SYMBOL_GPL(rds_queue_delayed_work_on);
 

Original file line number	Diff line number	Diff line change
`@@ -645,7 +645,7 @@ static void rds_ib_queue_delayed_work_on(struct rds_ib_device *rds_ibdev,`
`645`	`645`	`char *reason)`
`646`	`646`	`{`
`647`	`647`	`trace_rds_ib_queue_work(rds_ibdev, wq, &dwork->work, delay, reason);`
`648`		`- queue_delayed_work_on(cpu, wq, dwork, delay);`
	`648`	`+ __rds_queue_delayed_work_on(cpu, wq, dwork, delay);`
`649`	`649`	`}`
`650`	`650`
`651`	`651`	`static void rds_ib_queue_cancel_work(struct rds_ib_device *rds_ibdev,`