Skip to content

Commit 25baf3d

Browse files
oohalmpe
authored and committed
powerpc/eeh: Defer printing stack trace
Currently we print a stack trace in the event handler to help with debugging EEH issues. In the case of surprise hot-unplug this is unneeded, so we want to prevent printing the stack trace unless we know it's due to an actual device error. To accomplish this, we can save a stack trace at the point of detection and only print it once the EEH recovery handler has determined the freeze was due to an actual error. Since the whole point of this is to prevent spurious EEH output we also move a few prints out of the detection thread, or mark them as pr_debug so anyone interested can get output from eeh_dev_check_failure() if they want. Signed-off-by: Oliver O'Halloran <[email protected]> Signed-off-by: Michael Ellerman <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent b104af5 commit 25baf3d

File tree

4 files changed

+64
-26
lines changed

4 files changed

+64
-26
lines changed

arch/powerpc/include/asm/eeh.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,17 @@ struct eeh_pe {
8888
struct list_head child_list; /* List of PEs below this PE */
8989
struct list_head child; /* Memb. child_list/eeh_phb_pe */
9090
struct list_head edevs; /* List of eeh_dev in this PE */
91+
92+
/*
93+
* Saved stack trace. When we find a PE freeze in eeh_dev_check_failure
94+
* the stack trace is saved here so we can print it in the recovery
95+
* thread if it turns out to be due to a real problem rather than
96+
* a hot-remove.
97+
*
98+
* A max of 64 entries might be overkill, but it also might not be.
99+
*/
100+
unsigned long stack_trace[64];
101+
int trace_entries;
91102
};
92103

93104
#define eeh_pe_for_each_dev(pe, edev, tmp) \

arch/powerpc/kernel/eeh.c

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -420,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
420420
eeh_pe_mark_isolated(phb_pe);
421421
eeh_serialize_unlock(flags);
422422

423-
pr_err("EEH: PHB#%x failure detected, location: %s\n",
423+
pr_debug("EEH: PHB#%x failure detected, location: %s\n",
424424
phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
425-
dump_stack();
426425
eeh_send_failure_event(phb_pe);
427-
428426
return 1;
429427
out:
430428
eeh_serialize_unlock(flags);
@@ -451,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
451449
unsigned long flags;
452450
struct device_node *dn;
453451
struct pci_dev *dev;
454-
struct eeh_pe *pe, *parent_pe, *phb_pe;
452+
struct eeh_pe *pe, *parent_pe;
455453
int rc = 0;
456454
const char *location = NULL;
457455

@@ -581,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
581579
* a stack trace will help the device-driver authors figure
582580
* out what happened. So print that out.
583581
*/
584-
phb_pe = eeh_phb_pe_get(pe->phb);
585-
pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
586-
pe->phb->global_number, pe->addr);
587-
pr_err("EEH: PE location: %s, PHB location: %s\n",
588-
eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
589-
dump_stack();
590-
582+
pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n",
583+
__func__, pe->phb->global_number, pe->addr);
591584
eeh_send_failure_event(pe);
592585

593586
return 1;

arch/powerpc/kernel/eeh_driver.c

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -863,8 +863,44 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
863863
if (eeh_slot_presence_check(edev->pdev))
864864
devices++;
865865

866-
if (!devices)
866+
if (!devices) {
867+
pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
868+
pe->phb->global_number, pe->addr);
867869
goto out; /* nothing to recover */
870+
}
871+
872+
/* Log the event */
873+
if (pe->type & EEH_PE_PHB) {
874+
pr_err("EEH: PHB#%x failure detected, location: %s\n",
875+
pe->phb->global_number, eeh_pe_loc_get(pe));
876+
} else {
877+
struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);
878+
879+
pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
880+
pe->phb->global_number, pe->addr);
881+
pr_err("EEH: PE location: %s, PHB location: %s\n",
882+
eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
883+
}
884+
885+
/*
886+
* Print the saved stack trace now that we've verified there's
887+
* something to recover.
888+
*/
889+
if (pe->trace_entries) {
890+
void **ptrs = (void **) pe->stack_trace;
891+
int i;
892+
893+
pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
894+
pe->phb->global_number, pe->addr);
895+
896+
/* FIXME: Use the same format as dump_stack() */
897+
pr_err("EEH: Call Trace:\n");
898+
for (i = 0; i < pe->trace_entries; i++)
899+
pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);
900+
901+
pe->trace_entries = 0;
902+
}
903+
868904

869905
eeh_pe_update_time_stamp(pe);
870906
pe->freeze_count++;

arch/powerpc/kernel/eeh_event.c

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ static int eeh_event_handler(void * dummy)
4040
{
4141
unsigned long flags;
4242
struct eeh_event *event;
43-
struct eeh_pe *pe;
4443

4544
while (!kthread_should_stop()) {
4645
if (wait_for_completion_interruptible(&eeh_eventlist_event))
@@ -59,19 +58,10 @@ static int eeh_event_handler(void * dummy)
5958
continue;
6059

6160
/* We might have an event without a PE bound to it */
62-
pe = event->pe;
63-
if (pe) {
64-
if (pe->type & EEH_PE_PHB)
65-
pr_info("EEH: Detected error on PHB#%x\n",
66-
pe->phb->global_number);
67-
else
68-
pr_info("EEH: Detected PCI bus error on "
69-
"PHB#%x-PE#%x\n",
70-
pe->phb->global_number, pe->addr);
71-
eeh_handle_normal_event(pe);
72-
} else {
61+
if (event->pe)
62+
eeh_handle_normal_event(event->pe);
63+
else
7364
eeh_handle_special_event();
74-
}
7565

7666
kfree(event);
7767
}
@@ -126,8 +116,16 @@ int __eeh_send_failure_event(struct eeh_pe *pe)
126116
* This prevents the PE from being free()ed by a hotplug driver
127117
* while the PE is sitting in the event queue.
128118
*/
129-
if (pe)
119+
if (pe) {
120+
/*
121+
* Save the current stack trace so we can dump it from the
122+
* event handler thread.
123+
*/
124+
pe->trace_entries = stack_trace_save(pe->stack_trace,
125+
ARRAY_SIZE(pe->stack_trace), 0);
126+
130127
eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
128+
}
131129

132130
/* We may or may not be called in an interrupt context */
133131
spin_lock_irqsave(&eeh_eventlist_lock, flags);

0 commit comments

Comments
 (0)