Skip to content

Commit ec8ee71

Browse files
emusln authored and davem330 committed
ionic: stretch heartbeat detection
The driver can be premature in detecting stalled firmware when the heartbeat is not updated because the firmware can occasionally take a long time (more than 2 seconds) to service a request, and doesn't update the heartbeat during that time. The firmware heartbeat is not necessarily a steady 1 second periodic beat, but better described as something that should progress at least once in every DEVCMD_TIMEOUT period. The single-threaded design in the FW means that if a devcmd or adminq request launches a large internal job, it is stuck waiting for that job to finish before it can get back to updating the heartbeat. Since all requests are "guaranteed" to finish within the DEVCMD_TIMEOUT period, the driver needs to be less aggressive in checking the heartbeat progress. We change our current 2 second window to something bigger than DEVCMD_TIMEOUT which should take care of most of the issue. We stop checking for the heartbeat while waiting for a request, as long as we're still watching for the FW status. Lastly, we make sure our FW status is up to date before running a devcmd request. Once we do this, we need to not check the heartbeat on DEV commands because it may be stalled while we're on the fw_down path. Instead, we can rely on the is_fw_running check. Fixes: b2b9a8d ("ionic: avoid races in ionic_heartbeat_check") Signed-off-by: Brett Creeley <[email protected]> Signed-off-by: Shannon Nelson <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent b1552a4 commit ec8ee71

File tree

4 files changed

+20
-24
lines changed

4 files changed

+20
-24
lines changed

drivers/net/ethernet/pensando/ionic/ionic.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ struct ionic_lif;
1818
#define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_PF 0x1002
1919
#define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF 0x1003
2020

21-
#define DEVCMD_TIMEOUT 10
21+
#define DEVCMD_TIMEOUT 5
2222
#define IONIC_ADMINQ_TIME_SLICE msecs_to_jiffies(100)
2323

2424
#define IONIC_PHC_UPDATE_NS 10000000000 /* 10s in nanoseconds */

drivers/net/ethernet/pensando/ionic/ionic_dev.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,9 +236,11 @@ int ionic_heartbeat_check(struct ionic *ionic)
236236
if (!idev->fw_status_ready)
237237
return -ENXIO;
238238

239-
/* wait at least one watchdog period since the last heartbeat */
239+
/* Because of some variability in the actual FW heartbeat, we
240+
* wait longer than the DEVCMD_TIMEOUT before checking again.
241+
*/
240242
last_check_time = idev->last_hb_time;
241-
if (time_before(check_time, last_check_time + ionic->watchdog_period))
243+
if (time_before(check_time, last_check_time + DEVCMD_TIMEOUT * 2 * HZ))
242244
return 0;
243245

244246
fw_hb = ioread32(&idev->dev_info_regs->fw_heartbeat);

drivers/net/ethernet/pensando/ionic/ionic_lif.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1787,7 +1787,7 @@ static void ionic_lif_quiesce(struct ionic_lif *lif)
17871787

17881788
err = ionic_adminq_post_wait(lif, &ctx);
17891789
if (err)
1790-
netdev_err(lif->netdev, "lif quiesce failed %d\n", err);
1790+
netdev_dbg(lif->netdev, "lif quiesce failed %d\n", err);
17911791
}
17921792

17931793
static void ionic_txrx_disable(struct ionic_lif *lif)

drivers/net/ethernet/pensando/ionic/ionic_main.c

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -358,13 +358,14 @@ int ionic_adminq_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx,
358358
if (remaining)
359359
break;
360360

361-
/* interrupt the wait if FW stopped */
361+
/* force a check of FW status and break out if FW reset */
362+
(void)ionic_heartbeat_check(lif->ionic);
362363
if ((test_bit(IONIC_LIF_F_FW_RESET, lif->state) &&
363364
!lif->ionic->idev.fw_status_ready) ||
364365
test_bit(IONIC_LIF_F_FW_STOPPING, lif->state)) {
365366
if (do_msg)
366-
netdev_err(netdev, "%s (%d) interrupted, FW in reset\n",
367-
name, ctx->cmd.cmd.opcode);
367+
netdev_warn(netdev, "%s (%d) interrupted, FW in reset\n",
368+
name, ctx->cmd.cmd.opcode);
368369
ctx->comp.comp.status = IONIC_RC_ERROR;
369370
return -ENXIO;
370371
}
@@ -425,9 +426,9 @@ static int __ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds,
425426
unsigned long start_time;
426427
unsigned long max_wait;
427428
unsigned long duration;
429+
int done = 0;
430+
bool fw_up;
428431
int opcode;
429-
int hb = 0;
430-
int done;
431432
int err;
432433

433434
/* Wait for dev cmd to complete, retrying if we get EAGAIN,
@@ -437,31 +438,24 @@ static int __ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds,
437438
try_again:
438439
opcode = readb(&idev->dev_cmd_regs->cmd.cmd.opcode);
439440
start_time = jiffies;
440-
do {
441+
for (fw_up = ionic_is_fw_running(idev);
442+
!done && fw_up && time_before(jiffies, max_wait);
443+
fw_up = ionic_is_fw_running(idev)) {
441444
done = ionic_dev_cmd_done(idev);
442445
if (done)
443446
break;
444447
usleep_range(100, 200);
445-
446-
/* Don't check the heartbeat on FW_CONTROL commands as they are
447-
* notorious for interrupting the firmware's heartbeat update.
448-
*/
449-
if (opcode != IONIC_CMD_FW_CONTROL)
450-
hb = ionic_heartbeat_check(ionic);
451-
} while (!done && !hb && time_before(jiffies, max_wait));
448+
}
452449
duration = jiffies - start_time;
453450

454451
dev_dbg(ionic->dev, "DEVCMD %s (%d) done=%d took %ld secs (%ld jiffies)\n",
455452
ionic_opcode_to_str(opcode), opcode,
456453
done, duration / HZ, duration);
457454

458-
if (!done && hb) {
459-
/* It is possible (but unlikely) that FW was busy and missed a
460-
* heartbeat check but is still alive and will process this
461-
* request, so don't clean the dev_cmd in this case.
462-
*/
463-
dev_dbg(ionic->dev, "DEVCMD %s (%d) failed - FW halted\n",
464-
ionic_opcode_to_str(opcode), opcode);
455+
if (!done && !fw_up) {
456+
ionic_dev_cmd_clean(ionic);
457+
dev_warn(ionic->dev, "DEVCMD %s (%d) interrupted - FW is down\n",
458+
ionic_opcode_to_str(opcode), opcode);
465459
return -ENXIO;
466460
}
467461

0 commit comments

Comments
 (0)