Skip to content

Commit 7fddff5

Browse files
mikeygregkh
authored andcommitted
powerpc/eeh: Fix race with driver un/bind
commit f0295e0 upstream. The current EEH callbacks can race with a driver unbind. This can result in a backtraces like this: EEH: Frozen PHB#0-PE#1fc detected EEH: PE location: S000009, PHB location: N/A CPU: 2 PID: 2312 Comm: kworker/u258:3 Not tainted 4.15.6-openpower1 #2 Workqueue: nvme-wq nvme_reset_work [nvme] Call Trace: dump_stack+0x9c/0xd0 (unreliable) eeh_dev_check_failure+0x420/0x470 eeh_check_failure+0xa0/0xa4 nvme_reset_work+0x138/0x1414 [nvme] process_one_work+0x1ec/0x328 worker_thread+0x2e4/0x3a8 kthread+0x14c/0x154 ret_from_kernel_thread+0x5c/0xc8 nvme nvme1: Removing after probe failure status: -19 <snip> cpu 0x23: Vector: 300 (Data Access) at [c000000ff50f3800] pc: c0080000089a0eb0: nvme_error_detected+0x4c/0x90 [nvme] lr: c000000000026564: eeh_report_error+0xe0/0x110 sp: c000000ff50f3a80 msr: 9000000000009033 dar: 400 dsisr: 40000000 current = 0xc000000ff507c000 paca = 0xc00000000fdc9d80 softe: 0 irq_happened: 0x01 pid = 782, comm = eehd Linux version 4.15.6-openpower1 (smc@smc-desktop) (gcc version 6.4.0 (Buildroot 2017.11.2-00008-g4b6188e)) #2 SM P Tue Feb 27 12:33:27 PST 2018 enter ? for help eeh_report_error+0xe0/0x110 eeh_pe_dev_traverse+0xc0/0xdc eeh_handle_normal_event+0x184/0x4c4 eeh_handle_event+0x30/0x288 eeh_event_handler+0x124/0x170 kthread+0x14c/0x154 ret_from_kernel_thread+0x5c/0xc8 The first part is an EEH (on boot), the second half is the resulting crash. nvme probe starts the nvme_reset_work() worker thread. This worker thread starts touching the device which see a device error (EEH) and hence queues up an event in the powerpc EEH worker thread. nvme_reset_work() then continues and runs nvme_remove_dead_ctrl_work() which results in unbinding the driver from the device and hence releases all resources. At the same time, the EEH worker thread starts doing the EEH .error_detected() driver callback, which no longer works since the resources have been freed. This fixes the problem in the same way the generic PCIe AER code (in drivers/pci/pcie/aer/aerdrv_core.c) does. It makes the EEH code hold the device_lock() while performing the driver EEH callbacks and associated code. This ensures either the callbacks are no longer register, or if they are registered the driver will not be removed from underneath us. This has been broken forever. The EEH call backs were first introduced in 2005 (in 77bd741) but it's not clear if a lock was needed back then. Fixes: 77bd741 ("[PATCH] powerpc: PCI Error Recovery: PPC64 core recovery routines") Cc: [email protected] # v2.6.16+ Signed-off-by: Michael Neuling <[email protected]> Reviewed-by: Benjamin Herrenschmidt <[email protected]> Signed-off-by: Michael Ellerman <[email protected]> Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent e5a290c commit 7fddff5

File tree

1 file changed

+38
-23
lines changed

1 file changed

+38
-23
lines changed

arch/powerpc/kernel/eeh_driver.c

Lines changed: 38 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -207,18 +207,18 @@ static void *eeh_report_error(void *data, void *userdata)
207207

208208
if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
209209
return NULL;
210+
211+
device_lock(&dev->dev);
210212
dev->error_state = pci_channel_io_frozen;
211213

212214
driver = eeh_pcid_get(dev);
213-
if (!driver) return NULL;
215+
if (!driver) goto out_no_dev;
214216

215217
eeh_disable_irq(dev);
216218

217219
if (!driver->err_handler ||
218-
!driver->err_handler->error_detected) {
219-
eeh_pcid_put(dev);
220-
return NULL;
221-
}
220+
!driver->err_handler->error_detected)
221+
goto out;
222222

223223
rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
224224

@@ -227,7 +227,10 @@ static void *eeh_report_error(void *data, void *userdata)
227227
if (*res == PCI_ERS_RESULT_NONE) *res = rc;
228228

229229
edev->in_error = true;
230+
out:
230231
eeh_pcid_put(dev);
232+
out_no_dev:
233+
device_unlock(&dev->dev);
231234
return NULL;
232235
}
233236

@@ -250,23 +253,25 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
250253
if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
251254
return NULL;
252255

256+
device_lock(&dev->dev);
253257
driver = eeh_pcid_get(dev);
254-
if (!driver) return NULL;
258+
if (!driver) goto out_no_dev;
255259

256260
if (!driver->err_handler ||
257261
!driver->err_handler->mmio_enabled ||
258-
(edev->mode & EEH_DEV_NO_HANDLER)) {
259-
eeh_pcid_put(dev);
260-
return NULL;
261-
}
262+
(edev->mode & EEH_DEV_NO_HANDLER))
263+
goto out;
262264

263265
rc = driver->err_handler->mmio_enabled(dev);
264266

265267
/* A driver that needs a reset trumps all others */
266268
if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
267269
if (*res == PCI_ERS_RESULT_NONE) *res = rc;
268270

271+
out:
269272
eeh_pcid_put(dev);
273+
out_no_dev:
274+
device_unlock(&dev->dev);
270275
return NULL;
271276
}
272277

@@ -289,28 +294,31 @@ static void *eeh_report_reset(void *data, void *userdata)
289294

290295
if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
291296
return NULL;
297+
298+
device_lock(&dev->dev);
292299
dev->error_state = pci_channel_io_normal;
293300

294301
driver = eeh_pcid_get(dev);
295-
if (!driver) return NULL;
302+
if (!driver) goto out_no_dev;
296303

297304
eeh_enable_irq(dev);
298305

299306
if (!driver->err_handler ||
300307
!driver->err_handler->slot_reset ||
301308
(edev->mode & EEH_DEV_NO_HANDLER) ||
302-
(!edev->in_error)) {
303-
eeh_pcid_put(dev);
304-
return NULL;
305-
}
309+
(!edev->in_error))
310+
goto out;
306311

307312
rc = driver->err_handler->slot_reset(dev);
308313
if ((*res == PCI_ERS_RESULT_NONE) ||
309314
(*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
310315
if (*res == PCI_ERS_RESULT_DISCONNECT &&
311316
rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
312317

318+
out:
313319
eeh_pcid_put(dev);
320+
out_no_dev:
321+
device_unlock(&dev->dev);
314322
return NULL;
315323
}
316324

@@ -361,10 +369,12 @@ static void *eeh_report_resume(void *data, void *userdata)
361369

362370
if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
363371
return NULL;
372+
373+
device_lock(&dev->dev);
364374
dev->error_state = pci_channel_io_normal;
365375

366376
driver = eeh_pcid_get(dev);
367-
if (!driver) return NULL;
377+
if (!driver) goto out_no_dev;
368378

369379
was_in_error = edev->in_error;
370380
edev->in_error = false;
@@ -374,13 +384,15 @@ static void *eeh_report_resume(void *data, void *userdata)
374384
!driver->err_handler->resume ||
375385
(edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) {
376386
edev->mode &= ~EEH_DEV_NO_HANDLER;
377-
eeh_pcid_put(dev);
378-
return NULL;
387+
goto out;
379388
}
380389

381390
driver->err_handler->resume(dev);
382391

392+
out:
383393
eeh_pcid_put(dev);
394+
out_no_dev:
395+
device_unlock(&dev->dev);
384396
return NULL;
385397
}
386398

@@ -400,22 +412,25 @@ static void *eeh_report_failure(void *data, void *userdata)
400412

401413
if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
402414
return NULL;
415+
416+
device_lock(&dev->dev);
403417
dev->error_state = pci_channel_io_perm_failure;
404418

405419
driver = eeh_pcid_get(dev);
406-
if (!driver) return NULL;
420+
if (!driver) goto out_no_dev;
407421

408422
eeh_disable_irq(dev);
409423

410424
if (!driver->err_handler ||
411-
!driver->err_handler->error_detected) {
412-
eeh_pcid_put(dev);
413-
return NULL;
414-
}
425+
!driver->err_handler->error_detected)
426+
goto out;
415427

416428
driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
417429

430+
out:
418431
eeh_pcid_put(dev);
432+
out_no_dev:
433+
device_unlock(&dev->dev);
419434
return NULL;
420435
}
421436

0 commit comments

Comments
 (0)