@@ -31,6 +31,8 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
  */
 static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
 
+#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING
+
 static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
 {
 	return &(to_vmx(vcpu)->pi_desc);
@@ -89,9 +91,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	 * current pCPU if the task was migrated.
 	 */
 	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
-		raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+		raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu);
+
+		/*
+		 * In addition to taking the wakeup lock for the regular/IRQ
+		 * context, tell lockdep it is being taken for the "sched out"
+		 * context as well.  vCPU loads happen in task context, and
+		 * this is taking the lock of the *previous* CPU, i.e. can race
+		 * with both the scheduler and the wakeup handler.
+		 */
+		raw_spin_lock(spinlock);
+		spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_);
 		list_del(&vmx->pi_wakeup_list);
-		raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+		spin_release(&spinlock->dep_map, _RET_IP_);
+		raw_spin_unlock(spinlock);
 	}
 
 	dest = cpu_physical_id(cpu);
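For context, here is a minimal sketch (not part of this patch) of the annotation pattern the hunk above relies on: the lock is taken once for real, and spin_acquire()/spin_release() record a second, lockdep-only acquisition in a nested subclass so both call paths are tracked against the same critical section. The lock and function names below are hypothetical; only the lockdep/spinlock APIs themselves are real.

```c
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/spinlock.h>

/* Hypothetical lock, standing in for wakeup_vcpus_on_cpu_lock. */
static DEFINE_RAW_SPINLOCK(demo_lock);

static void demo_dual_context_section(void)
{
	/* Real acquisition, tracked by lockdep in the default subclass. */
	raw_spin_lock(&demo_lock);

	/*
	 * Lockdep-only acquisition in the nested subclass (mirroring
	 * PI_LOCK_SCHED_OUT above); no additional locking is performed,
	 * the annotation only feeds lockdep's dependency tracking.
	 */
	spin_acquire(&demo_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_);

	/* ... critical section that can race with both contexts ... */

	spin_release(&demo_lock.dep_map, _RET_IP_);
	raw_spin_unlock(&demo_lock);
}
```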
@@ -148,11 +161,23 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct pi_desc old, new;
-	unsigned long flags;
 
-	local_irq_save(flags);
+	lockdep_assert_irqs_disabled();
 
-	raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+	/*
+	 * Acquire the wakeup lock using the "sched out" context to work around
+	 * a lockdep false positive.  When this is called, schedule() holds
+	 * various per-CPU scheduler locks.  When the wakeup handler runs, it
+	 * holds this CPU's wakeup lock while calling try_to_wake_up(), which
+	 * can eventually take the aforementioned scheduler locks, which causes
+	 * lockdep to assume there is deadlock.
+	 *
+	 * Deadlock can't actually occur because IRQs are disabled for the
+	 * entirety of the sched_out critical section, i.e. the wakeup handler
+	 * can't run while the scheduler locks are held.
+	 */
+	raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu),
+			     PI_LOCK_SCHED_OUT);
 	list_add_tail(&vmx->pi_wakeup_list,
 		      &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
 	raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
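As a rough illustration of the other half of the workaround (all names below are hypothetical, not taken from this patch): raw_spin_lock_nested() acquires the lock in a distinct lockdep subclass, so the dependency chain built in the IRQs-off sched-out path is kept separate from the chain built by the default-subclass user in IRQ context, and no false lock-inversion report is generated.

```c
#include <linux/lockdep.h>
#include <linux/spinlock.h>

/* Hypothetical per-CPU-style wakeup lock, for illustration only. */
static DEFINE_RAW_SPINLOCK(demo_wakeup_lock);

/* Hypothetical sched-out path; the caller guarantees IRQs are off. */
static void demo_sched_out_side(void)
{
	lockdep_assert_irqs_disabled();

	/*
	 * Subclass 1 (SINGLE_DEPTH_NESTING): lockdep tracks this acquisition
	 * separately from the subclass-0 acquisition in demo_irq_side(), so
	 * the two chains are never merged into an apparent deadlock.
	 */
	raw_spin_lock_nested(&demo_wakeup_lock, SINGLE_DEPTH_NESTING);
	/* ... add the task to a wakeup list, etc. ... */
	raw_spin_unlock(&demo_wakeup_lock);
}

/* Hypothetical IRQ-context path using the default subclass 0. */
static void demo_irq_side(void)
{
	raw_spin_lock(&demo_wakeup_lock);
	/* ... walk the wakeup list and wake tasks ... */
	raw_spin_unlock(&demo_wakeup_lock);
}
```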
@@ -176,8 +201,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 	 */
 	if (pi_test_on(&new))
 		__apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
-
-	local_irq_restore(flags);
 }
 
 static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)