Skip to content

Commit f0295e0

Browse files
mikeympe
authored andcommitted
powerpc/eeh: Fix race with driver un/bind
The current EEH callbacks can race with a driver unbind. This can result in a backtraces like this: EEH: Frozen PHB#0-PE#1fc detected EEH: PE location: S000009, PHB location: N/A CPU: 2 PID: 2312 Comm: kworker/u258:3 Not tainted 4.15.6-openpower1 #2 Workqueue: nvme-wq nvme_reset_work [nvme] Call Trace: dump_stack+0x9c/0xd0 (unreliable) eeh_dev_check_failure+0x420/0x470 eeh_check_failure+0xa0/0xa4 nvme_reset_work+0x138/0x1414 [nvme] process_one_work+0x1ec/0x328 worker_thread+0x2e4/0x3a8 kthread+0x14c/0x154 ret_from_kernel_thread+0x5c/0xc8 nvme nvme1: Removing after probe failure status: -19 <snip> cpu 0x23: Vector: 300 (Data Access) at [c000000ff50f3800] pc: c0080000089a0eb0: nvme_error_detected+0x4c/0x90 [nvme] lr: c000000000026564: eeh_report_error+0xe0/0x110 sp: c000000ff50f3a80 msr: 9000000000009033 dar: 400 dsisr: 40000000 current = 0xc000000ff507c000 paca = 0xc00000000fdc9d80 softe: 0 irq_happened: 0x01 pid = 782, comm = eehd Linux version 4.15.6-openpower1 (smc@smc-desktop) (gcc version 6.4.0 (Buildroot 2017.11.2-00008-g4b6188e)) #2 SM P Tue Feb 27 12:33:27 PST 2018 enter ? for help eeh_report_error+0xe0/0x110 eeh_pe_dev_traverse+0xc0/0xdc eeh_handle_normal_event+0x184/0x4c4 eeh_handle_event+0x30/0x288 eeh_event_handler+0x124/0x170 kthread+0x14c/0x154 ret_from_kernel_thread+0x5c/0xc8 The first part is an EEH (on boot), the second half is the resulting crash. nvme probe starts the nvme_reset_work() worker thread. This worker thread starts touching the device which see a device error (EEH) and hence queues up an event in the powerpc EEH worker thread. nvme_reset_work() then continues and runs nvme_remove_dead_ctrl_work() which results in unbinding the driver from the device and hence releases all resources. At the same time, the EEH worker thread starts doing the EEH .error_detected() driver callback, which no longer works since the resources have been freed. This fixes the problem in the same way the generic PCIe AER code (in drivers/pci/pcie/aer/aerdrv_core.c) does. It makes the EEH code hold the device_lock() while performing the driver EEH callbacks and associated code. This ensures either the callbacks are no longer register, or if they are registered the driver will not be removed from underneath us. This has been broken forever. The EEH call backs were first introduced in 2005 (in 77bd741) but it's not clear if a lock was needed back then. Fixes: 77bd741 ("[PATCH] powerpc: PCI Error Recovery: PPC64 core recovery routines") Cc: [email protected] # v2.6.16+ Signed-off-by: Michael Neuling <[email protected]> Reviewed-by: Benjamin Herrenschmidt <[email protected]> Signed-off-by: Michael Ellerman <[email protected]>
1 parent bf8a1ab commit f0295e0

File tree

1 file changed

+42
-26
lines changed

1 file changed

+42
-26
lines changed

arch/powerpc/kernel/eeh_driver.c

Lines changed: 42 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -207,18 +207,18 @@ static void *eeh_report_error(void *data, void *userdata)
207207

208208
if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
209209
return NULL;
210+
211+
device_lock(&dev->dev);
210212
dev->error_state = pci_channel_io_frozen;
211213

212214
driver = eeh_pcid_get(dev);
213-
if (!driver) return NULL;
215+
if (!driver) goto out_no_dev;
214216

215217
eeh_disable_irq(dev);
216218

217219
if (!driver->err_handler ||
218-
!driver->err_handler->error_detected) {
219-
eeh_pcid_put(dev);
220-
return NULL;
221-
}
220+
!driver->err_handler->error_detected)
221+
goto out;
222222

223223
rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
224224

@@ -227,8 +227,12 @@ static void *eeh_report_error(void *data, void *userdata)
227227
if (*res == PCI_ERS_RESULT_NONE) *res = rc;
228228

229229
edev->in_error = true;
230-
eeh_pcid_put(dev);
231230
pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
231+
232+
out:
233+
eeh_pcid_put(dev);
234+
out_no_dev:
235+
device_unlock(&dev->dev);
232236
return NULL;
233237
}
234238

@@ -251,23 +255,25 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
251255
if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
252256
return NULL;
253257

258+
device_lock(&dev->dev);
254259
driver = eeh_pcid_get(dev);
255-
if (!driver) return NULL;
260+
if (!driver) goto out_no_dev;
256261

257262
if (!driver->err_handler ||
258263
!driver->err_handler->mmio_enabled ||
259-
(edev->mode & EEH_DEV_NO_HANDLER)) {
260-
eeh_pcid_put(dev);
261-
return NULL;
262-
}
264+
(edev->mode & EEH_DEV_NO_HANDLER))
265+
goto out;
263266

264267
rc = driver->err_handler->mmio_enabled(dev);
265268

266269
/* A driver that needs a reset trumps all others */
267270
if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
268271
if (*res == PCI_ERS_RESULT_NONE) *res = rc;
269272

273+
out:
270274
eeh_pcid_put(dev);
275+
out_no_dev:
276+
device_unlock(&dev->dev);
271277
return NULL;
272278
}
273279

@@ -290,28 +296,31 @@ static void *eeh_report_reset(void *data, void *userdata)
290296

291297
if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
292298
return NULL;
299+
300+
device_lock(&dev->dev);
293301
dev->error_state = pci_channel_io_normal;
294302

295303
driver = eeh_pcid_get(dev);
296-
if (!driver) return NULL;
304+
if (!driver) goto out_no_dev;
297305

298306
eeh_enable_irq(dev);
299307

300308
if (!driver->err_handler ||
301309
!driver->err_handler->slot_reset ||
302310
(edev->mode & EEH_DEV_NO_HANDLER) ||
303-
(!edev->in_error)) {
304-
eeh_pcid_put(dev);
305-
return NULL;
306-
}
311+
(!edev->in_error))
312+
goto out;
307313

308314
rc = driver->err_handler->slot_reset(dev);
309315
if ((*res == PCI_ERS_RESULT_NONE) ||
310316
(*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
311317
if (*res == PCI_ERS_RESULT_DISCONNECT &&
312318
rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
313319

320+
out:
314321
eeh_pcid_put(dev);
322+
out_no_dev:
323+
device_unlock(&dev->dev);
315324
return NULL;
316325
}
317326

@@ -362,10 +371,12 @@ static void *eeh_report_resume(void *data, void *userdata)
362371

363372
if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
364373
return NULL;
374+
375+
device_lock(&dev->dev);
365376
dev->error_state = pci_channel_io_normal;
366377

367378
driver = eeh_pcid_get(dev);
368-
if (!driver) return NULL;
379+
if (!driver) goto out_no_dev;
369380

370381
was_in_error = edev->in_error;
371382
edev->in_error = false;
@@ -375,18 +386,20 @@ static void *eeh_report_resume(void *data, void *userdata)
375386
!driver->err_handler->resume ||
376387
(edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) {
377388
edev->mode &= ~EEH_DEV_NO_HANDLER;
378-
eeh_pcid_put(dev);
379-
return NULL;
389+
goto out;
380390
}
381391

382392
driver->err_handler->resume(dev);
383393

384-
eeh_pcid_put(dev);
385394
pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
395+
out:
396+
eeh_pcid_put(dev);
386397
#ifdef CONFIG_PCI_IOV
387398
if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev))
388399
eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
389400
#endif
401+
out_no_dev:
402+
device_unlock(&dev->dev);
390403
return NULL;
391404
}
392405

@@ -406,23 +419,26 @@ static void *eeh_report_failure(void *data, void *userdata)
406419

407420
if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
408421
return NULL;
422+
423+
device_lock(&dev->dev);
409424
dev->error_state = pci_channel_io_perm_failure;
410425

411426
driver = eeh_pcid_get(dev);
412-
if (!driver) return NULL;
427+
if (!driver) goto out_no_dev;
413428

414429
eeh_disable_irq(dev);
415430

416431
if (!driver->err_handler ||
417-
!driver->err_handler->error_detected) {
418-
eeh_pcid_put(dev);
419-
return NULL;
420-
}
432+
!driver->err_handler->error_detected)
433+
goto out;
421434

422435
driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
423436

424-
eeh_pcid_put(dev);
425437
pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
438+
out:
439+
eeh_pcid_put(dev);
440+
out_no_dev:
441+
device_unlock(&dev->dev);
426442
return NULL;
427443
}
428444

0 commit comments

Comments
 (0)