Skip to content

Commit c1a797c

Browse files
kwan-intc authored and jgunthorpe committed
IB/hfi1: Ignore LNI errors before DC8051 transitions to Polling state
When DC8051 is requested to change its physical state back to Offline while it is still in the process of coming up, it will set the ERROR field in the DC8051_DBG_ERR_INFO_SET_BY_8051 register. This ERROR field remains set until the next time DC8051 transitions from Offline to Polling. Subsequently, when the host requests DC8051 to change its physical state to Polling again, it may receive a DC8051 interrupt with the stale ERROR field still in DC8051_DBG_ERR_INFO_SET_BY_8051. If the host link state has already been changed to Polling, this stale ERROR will force the host to transition to the Offline state, resulting in a vicious cycle of Polling -> Offline -> Polling -> Offline. On the other hand, if the host link state is still Offline when the stale ERROR is received, the stale ERROR will be ignored, and the link will come up correctly. This patch implements the correct behavior by changing the host link state to Polling only after DC8051 changes its physical state to Polling. Reviewed-by: Mike Marciniszyn <[email protected]> Signed-off-by: Krzysztof Goreczny <[email protected]> Signed-off-by: Kaike Wan <[email protected]> Signed-off-by: Dennis Dalessandro <[email protected]> Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent 937488a commit c1a797c

File tree

1 file changed

+46
-1
lines changed
  • drivers/infiniband/hw/hfi1

1 file changed

+46
-1
lines changed

drivers/infiniband/hw/hfi1/chip.c

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1072,6 +1072,8 @@ static void log_state_transition(struct hfi1_pportdata *ppd, u32 state);
10721072
static void log_physical_state(struct hfi1_pportdata *ppd, u32 state);
10731073
static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state,
10741074
int msecs);
1075+
static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
1076+
int msecs);
10751077
static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
10761078
static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
10771079
static void handle_temp_err(struct hfi1_devdata *dd);
@@ -10770,20 +10772,30 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
1077010772
break;
1077110773

1077210774
ppd->port_error_action = 0;
10773-
ppd->host_link_state = HLS_DN_POLL;
1077410775

1077510776
if (quick_linkup) {
1077610777
/* quick linkup does not go into polling */
1077710778
ret = do_quick_linkup(dd);
1077810779
} else {
1077910780
ret1 = set_physical_link_state(dd, PLS_POLLING);
10781+
if (!ret1)
10782+
ret1 = wait_phys_link_out_of_offline(ppd,
10783+
3000);
1078010784
if (ret1 != HCMD_SUCCESS) {
1078110785
dd_dev_err(dd,
1078210786
"Failed to transition to Polling link state, return 0x%x\n",
1078310787
ret1);
1078410788
ret = -EINVAL;
1078510789
}
1078610790
}
10791+
10792+
/*
10793+
* Change the host link state after requesting DC8051 to
10794+
* change its physical state so that we can ignore any
10795+
* interrupt with stale LNI(XX) error, which will not be
10796+
* cleared until DC8051 transitions to Polling state.
10797+
*/
10798+
ppd->host_link_state = HLS_DN_POLL;
1078710799
ppd->offline_disabled_reason =
1078810800
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
1078910801
/*
@@ -12927,6 +12939,39 @@ static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd,
1292712939
return read_state;
1292812940
}
1292912941

12942+
/*
12943+
* wait_phys_link_out_of_offline - wait for the physical link to leave Offline
12944+
* @ppd: port device
12945+
* @msecs: the number of milliseconds to wait
12946+
*
12947+
* Poll the physical link state, sleeping about 2 ms between reads, for up
12948+
* to msecs milliseconds until (state & 0xF0) no longer equals PLS_OFFLINE.
12949+
* Returns the physical state that was read (not 0) once the link is out of
* Offline, otherwise -ETIMEDOUT after reporting the last state read.
* NOTE(review): the caller in set_link_state() compares this result with
* HCMD_SUCCESS - confirm that a raw state value is what that check expects.
12950+
*/
12951+
static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
12952+
int msecs)
12953+
{
12954+
u32 read_state;
12955+
unsigned long timeout;
12956+
12957+
timeout = jiffies + msecs_to_jiffies(msecs); /* absolute deadline, in jiffies */
12958+
while (1) {
12959+
read_state = read_physical_state(ppd->dd);
12960+
if ((read_state & 0xF0) != PLS_OFFLINE) /* low nibble ignored; presumably the offline substate - confirm */
12961+
break;
12962+
if (time_after(jiffies, timeout)) { /* checked only after a read, so the state is sampled at least once */
12963+
dd_dev_err(ppd->dd,
12964+
"timeout waiting for phy link out of offline. Read state 0x%x, %dms\n",
12965+
read_state, msecs);
12966+
return -ETIMEDOUT;
12967+
}
12968+
usleep_range(1950, 2050); /* sleep 2ms-ish */
12969+
}
12970+
12971+
log_state_transition(ppd, read_state);
12972+
return read_state; /* the non-Offline state just read, not 0 */
12973+
}
12974+
1293012975
#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
1293112976
(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
1293212977

0 commit comments

Comments
 (0)