Skip to content

Commit 66093e8

Browse files
Eran Ben Elisha authored and Somasundaram Krishnasamy committed
net/mlx5e: Poll event queue upon TX timeout before performing full channels recovery
Up until this patch, on every TX timeout we would try to do channels recovery. However, in case of a lost interrupt for an EQ, the channel associated to it cannot be recovered if reopened as it would never get another interrupt on sent/received traffic, and eventually ends up with another TX timeout (Restarting the EQ is not part of channel recovery). This patch adds a mechanism for explicitly polling EQ in case of a TX timeout in order to recover from a lost interrupt. If this is not the case (no pending EQEs), perform a channels full recovery as usual. Once a lost EQE is recovered, it triggers the NAPI to run and handle all pending completions. This will free some budget in the bql (via calling netdev_tx_completed_queue) or by clearing pending TXWQEs and waking up the queue. One of the above actions will move the queue to be ready for transmit again. Signed-off-by: Eran Ben Elisha <[email protected]> Reviewed-by: Tariq Toukan <[email protected]> Signed-off-by: Saeed Mahameed <[email protected]> (cherry picked from commit 7ca560b) Orabug: 31753100 Signed-off-by: aru kolappan <[email protected]> Reviewed-by: Praveen Kannoju <[email protected]> Conflicts: drivers/net/ethernet/mellanox/mlx5/core/en_main.c (Resolved conflict due to variable name 'sched_work' changed to 'reopen_channels') Signed-off-by: Somasundaram Krishnasamy <[email protected]>
1 parent 153c66a commit 66093e8

File tree

3 files changed

+61
-7
lines changed

3 files changed

+61
-7
lines changed

drivers/net/ethernet/mellanox/mlx5/core/en_main.c

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3449,26 +3449,61 @@ static netdev_features_t mlx5e_features_check(struct sk_buff *skb,
34493449
return features;
34503450
}
34513451

3452+
static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev,
3453+
struct mlx5e_txqsq *sq)
3454+
{
3455+
struct mlx5e_priv *priv = netdev_priv(dev);
3456+
struct mlx5_core_dev *mdev = priv->mdev;
3457+
int irqn_not_used, eqn;
3458+
struct mlx5_eq *eq;
3459+
u32 eqe_count;
3460+
3461+
if (mlx5_vector2eqn(mdev, sq->cq.mcq.vector, &eqn, &irqn_not_used))
3462+
return false;
3463+
3464+
eq = mlx5_eqn2eq(mdev, eqn);
3465+
if (IS_ERR(eq))
3466+
return false;
3467+
3468+
netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
3469+
eqn, eq->cons_index, eq->irqn);
3470+
3471+
eqe_count = mlx5_eq_poll_irq_disabled(eq);
3472+
if (!eqe_count)
3473+
return false;
3474+
3475+
netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->eqn);
3476+
return true;
3477+
}
3478+
34523479
static void mlx5e_tx_timeout(struct net_device *dev)
34533480
{
34543481
struct mlx5e_priv *priv = netdev_priv(dev);
3455-
bool sched_work = false;
3482+
bool reopen_channels = false;
34563483
int i;
34573484

34583485
netdev_err(dev, "TX timeout detected\n");
34593486

34603487
for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) {
3488+
struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, i);
34613489
struct mlx5e_txqsq *sq = priv->txq2sq[i];
34623490

3463-
if (!netif_xmit_stopped(netdev_get_tx_queue(dev, i)))
3491+
if (!netif_xmit_stopped(dev_queue))
34643492
continue;
3465-
sched_work = true;
3466-
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
3467-
netdev_err(dev, "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x\n",
3468-
i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc);
3493+
netdev_err(dev, "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n",
3494+
i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
3495+
jiffies_to_usecs(jiffies - dev_queue->trans_start));
3496+
3497+
/* If we recover a lost interrupt, most likely TX timeout will
3498+
* be resolved, skip reopening channels
3499+
*/
3500+
if (!mlx5e_tx_timeout_eq_recover(dev, sq)) {
3501+
clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
3502+
reopen_channels = true;
3503+
}
34693504
}
34703505

3471-
if (sched_work && test_bit(MLX5E_STATE_OPENED, &priv->state))
3506+
if (reopen_channels && test_bit(MLX5E_STATE_OPENED, &priv->state))
34723507
schedule_work(&priv->tx_timeout_work);
34733508
}
34743509

drivers/net/ethernet/mellanox/mlx5/core/eq.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,24 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
544544
return IRQ_HANDLED;
545545
}
546546

547+
/* Some architectures don't latch interrupts when they are disabled, so using
548+
* mlx5_eq_poll_irq_disabled could end up losing interrupts while trying to
549+
* avoid losing them. It is not recommended to use it, unless this is the last
550+
* resort.
551+
*/
552+
u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq *eq)
553+
{
554+
u32 count_eqe;
555+
556+
disable_irq(eq->irqn);
557+
count_eqe = eq->cons_index;
558+
mlx5_eq_int(eq->irqn, eq);
559+
count_eqe = eq->cons_index - count_eqe;
560+
enable_irq(eq->irqn);
561+
562+
return count_eqe;
563+
}
564+
547565
static void init_eq_buf(struct mlx5_eq *eq)
548566
{
549567
struct mlx5_eqe *eqe;

drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev);
111111
/* This function should only be called after mlx5_cmd_force_teardown_hca */
112112
void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev);
113113
struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn);
114+
u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq *eq);
114115
void mlx5_cq_tasklet_cb(unsigned long data);
115116

116117
int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group,

0 commit comments

Comments
 (0)