
Commit ebababc

kkdwivedi authored and Alexei Starovoitov committed
rqspinlock: Hardcode cond_acquire loops for arm64
Currently, for rqspinlock usage, the implementations of smp_cond_load_acquire (and thus, atomic_cond_read_acquire) are susceptible to stalls on arm64, because they do not guarantee that the conditional expression will be repeatedly invoked if the address being loaded from is not written to by other CPUs. When support for event-streams is absent (which unblocks stuck WFE-based loops every ~100us), we may end up being stuck forever. This causes a problem for us, as we need to repeatedly invoke RES_CHECK_TIMEOUT in the spin loop to break out when the timeout expires.

Let us import the smp_cond_load_acquire_timewait implementation Ankur is proposing in [0], and then fall back to it once it is merged.

While we rely on the implementation to amortize the cost of sampling check_timeout for us, that will not happen when event stream support is unavailable. This is not the common case, and it would be difficult to fit our logic in the time_expr_ns >= time_limit_ns comparison, hence just let it be.

[0]: https://lore.kernel.org/lkml/[email protected]

Cc: Ankur Arora <[email protected]>
Signed-off-by: Kumar Kartikeya Dwivedi <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
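To make the failure mode concrete, here is an illustrative fragment (not part of this commit; lock, ts, ret, and the mask are simplified stand-ins for what the slowpath in kernel/bpf/rqspinlock.c actually uses) showing how the timeout check ends up inside the wait condition, and why the stock arm64 wait can defeat it:

	/* Illustrative sketch only: the timeout check is folded into the
	 * condition of the acquire-wait, roughly like so. */
	val = atomic_cond_read_acquire(&lock->val,
				       !(VAL & _Q_LOCKED_PENDING_MASK) ||
				       RES_CHECK_TIMEOUT(ts, ret));
	/* The generic arm64 helper parks the CPU in WFE and re-evaluates the
	 * condition only after a store to the lock word or an event-stream
	 * tick (~100us). With no stores and no event stream, the
	 * RES_CHECK_TIMEOUT() half of the condition is never sampled again
	 * and the waiter can stall forever. */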
1 parent 14c48ee commit ebababc

2 files changed: +108 −0 lines

arch/arm64/include/asm/rqspinlock.h

Lines changed: 93 additions & 0 deletions
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_RQSPINLOCK_H
#define _ASM_RQSPINLOCK_H

#include <asm/barrier.h>

/*
 * Hardcode res_smp_cond_load_acquire implementations for arm64 to a custom
 * version based on [0]. In rqspinlock code, our conditional expression involves
 * checking the value _and_ additionally a timeout. However, on arm64, the
 * WFE-based implementation may never spin again if no stores occur to the
 * locked byte in the lock word. As such, we may be stuck forever if
 * event-stream based unblocking is not available on the platform for WFE spin
 * loops (arch_timer_evtstrm_available).
 *
 * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this
 * copy-paste.
 *
 * While we rely on the implementation to amortize the cost of sampling
 * cond_expr for us, that will not happen when event stream support is
 * unavailable; only the time_expr check is amortized. This is not the common
 * case, and it would be difficult to fit our logic in the time_expr_ns >=
 * time_limit_ns comparison, hence just let it be. In case of event-stream,
 * the loop is woken up at microsecond granularity.
 *
 * [0]: https://lore.kernel.org/lkml/[email protected]
 */

#ifndef smp_cond_load_acquire_timewait

#define smp_cond_time_check_count	200

#define __smp_cond_load_relaxed_spinwait(ptr, cond_expr, time_expr_ns,	\
					 time_limit_ns) ({		\
	typeof(ptr) __PTR = (ptr);					\
	__unqual_scalar_typeof(*ptr) VAL;				\
	unsigned int __count = 0;					\
	for (;;) {							\
		VAL = READ_ONCE(*__PTR);				\
		if (cond_expr)						\
			break;						\
		cpu_relax();						\
		if (__count++ < smp_cond_time_check_count)		\
			continue;					\
		if ((time_expr_ns) >= (time_limit_ns))			\
			break;						\
		__count = 0;						\
	}								\
	(typeof(*ptr))VAL;						\
})

#define __smp_cond_load_acquire_timewait(ptr, cond_expr,		\
					 time_expr_ns, time_limit_ns)	\
({									\
	typeof(ptr) __PTR = (ptr);					\
	__unqual_scalar_typeof(*ptr) VAL;				\
	for (;;) {							\
		VAL = smp_load_acquire(__PTR);				\
		if (cond_expr)						\
			break;						\
		__cmpwait_relaxed(__PTR, VAL);				\
		if ((time_expr_ns) >= (time_limit_ns))			\
			break;						\
	}								\
	(typeof(*ptr))VAL;						\
})

#define smp_cond_load_acquire_timewait(ptr, cond_expr,			\
				       time_expr_ns, time_limit_ns)	\
({									\
	__unqual_scalar_typeof(*ptr) _val;				\
	int __wfe = arch_timer_evtstrm_available();			\
									\
	if (likely(__wfe)) {						\
		_val = __smp_cond_load_acquire_timewait(ptr, cond_expr,	\
							time_expr_ns,	\
							time_limit_ns);	\
	} else {							\
		_val = __smp_cond_load_relaxed_spinwait(ptr, cond_expr,	\
							time_expr_ns,	\
							time_limit_ns);	\
		smp_acquire__after_ctrl_dep();				\
	}								\
	(typeof(*ptr))_val;						\
})

#endif

#define res_smp_cond_load_acquire_timewait(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1)

#include <asm-generic/rqspinlock.h>

#endif /* _ASM_RQSPINLOCK_H */
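For intuition, the spin-wait fallback above samples the time_expr_ns comparison only once every smp_cond_time_check_count (200) relaxed loads, so the relatively expensive time check is amortized across cheap spins. Below is a minimal userspace analogue of that amortization pattern; it is purely illustrative (wait_for_flag, deadline_ns, and now_ns are made-up names), and unlike the real macro it uses an acquire load per iteration instead of a relaxed load followed by smp_acquire__after_ctrl_dep():

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <time.h>

	static uint64_t now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
	}

	/* Spin until *flag becomes non-zero or deadline_ns passes; the clock
	 * is read only once every 200 iterations, mirroring
	 * smp_cond_time_check_count in the fallback above. */
	static bool wait_for_flag(const atomic_int *flag, uint64_t deadline_ns)
	{
		unsigned int count = 0;

		for (;;) {
			if (atomic_load_explicit(flag, memory_order_acquire))
				return true;
			if (++count < 200)
				continue;
			count = 0;
			if (now_ns() >= deadline_ns)
				return false;
		}
	}

The WFE path needs no such counter: with the event stream enabled, __cmpwait_relaxed() already limits wakeups, and therefore cond_expr/time_expr evaluations, to event-stream granularity.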

kernel/bpf/rqspinlock.c

Lines changed: 15 additions & 0 deletions
@@ -92,12 +92,21 @@ static noinline int check_timeout(struct rqspinlock_timeout *ts)
 	return 0;
 }

+/*
+ * Do not amortize with spins when res_smp_cond_load_acquire is defined,
+ * as the macro does internal amortization for us.
+ */
+#ifndef res_smp_cond_load_acquire
 #define RES_CHECK_TIMEOUT(ts, ret)                    \
 	({                                            \
 		if (!(ts).spin++)                     \
 			(ret) = check_timeout(&(ts)); \
 		(ret);                                \
 	})
+#else
+#define RES_CHECK_TIMEOUT(ts, ret, mask)              \
+	({ (ret) = check_timeout(&(ts)); })
+#endif

 /*
  * Initialize the 'spin' member.
@@ -118,6 +127,12 @@ static noinline int check_timeout(struct rqspinlock_timeout *ts)
  */
 static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);

+#ifndef res_smp_cond_load_acquire
+#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
+#endif
+
+#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+
 /**
  * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
  * @lock: Pointer to queued spinlock structure
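To spell out the "internal amortization" the new comment refers to: the default RES_CHECK_TIMEOUT() only pays for check_timeout() when its small spin counter wraps back to zero, whereas the arm64 variant calls check_timeout() on every evaluation because the timewait macro already rate-limits how often the condition is evaluated. A userspace-style illustration of the default throttle (the demo_* names and the uint16_t width are illustrative; the real counter lives in struct rqspinlock_timeout, and demo_check_timeout() merely stands in for the real check_timeout()):

	#include <errno.h>
	#include <stdint.h>

	struct demo_timeout {
		uint16_t spin;		/* small counter: wrapping re-arms the check */
		uint64_t deadline_ns;
	};

	/* Stand-in for the real check_timeout() deadline logic. */
	static int demo_check_timeout(struct demo_timeout *ts, uint64_t now_ns)
	{
		return now_ns >= ts->deadline_ns ? -ETIMEDOUT : 0;
	}

	/* Mirrors the default RES_CHECK_TIMEOUT(): the expensive check runs
	 * only when spin++ wraps to zero; in between, the last result is
	 * simply returned. */
	static int demo_res_check_timeout(struct demo_timeout *ts, uint64_t now_ns,
					  int *ret)
	{
		if (!ts->spin++)
			*ret = demo_check_timeout(ts, now_ns);
		return *ret;
	}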
