Skip to content

Commit a3bfb61

Browse files
Hariprasad Shenai authored and davem330 committed
cxgb4: Move SGE Ingress DMA state monitor code to a new routine
Signed-off-by: Hariprasad Shenai <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 982b81e commit a3bfb61

File tree

3 files changed

+156
-79
lines changed

3 files changed

+156
-79
lines changed

drivers/net/ethernet/chelsio/cxgb4/cxgb4.h

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,17 @@ struct adapter_params {
328328
unsigned int max_ird_adapter; /* Max read depth per adapter */
329329
};
330330

331+
/* State needed to monitor the forward progress of SGE Ingress DMA activities
332+
* and possible hangs.
333+
*
* Every two-element array below holds one entry per SGE Ingress DMA channel;
* t4_idma_monitor() walks indices 0 and 1. All fields are unsigned, which
* matters for idma_warn because t4_idma_monitor() counts it down by
* subtraction.
*/
334+
struct sge_idma_monitor_state {
335+
unsigned int idma_1s_thresh; /* 1s threshold in Core Clock ticks; set by t4_idma_monitor_init() */
336+
unsigned int idma_stalled[2]; /* synthesized stall timers in hz ticks; 0 means "not stalled" */
337+
unsigned int idma_state[2]; /* IDMA state captured from SGE debug register 0 at warning time */
338+
unsigned int idma_qid[2]; /* Ingress Queue ID captured from SGE debug register 11 at warning time */
339+
unsigned int idma_warn[2]; /* hz ticks remaining until the next repeated warning */
340+
};
341+
331342
#include "t4fw_api.h"
332343

333344
#define FW_VERSION(chip) ( \
@@ -630,12 +641,7 @@ struct sge {
630641
u32 fl_align; /* response queue message alignment */
631642
u32 fl_starve_thres; /* Free List starvation threshold */
632643

633-
/* State variables for detecting an SGE Ingress DMA hang */
634-
unsigned int idma_1s_thresh;/* SGE same State Counter 1s threshold */
635-
unsigned int idma_stalled[2];/* SGE synthesized stalled timers in HZ */
636-
unsigned int idma_state[2]; /* SGE IDMA Hang detect state */
637-
unsigned int idma_qid[2]; /* SGE IDMA Hung Ingress Queue ID */
638-
644+
struct sge_idma_monitor_state idma_monitor;
639645
unsigned int egr_start;
640646
unsigned int egr_sz;
641647
unsigned int ingr_start;
@@ -1311,4 +1317,9 @@ int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox,
13111317
u32 addr, u32 val);
13121318
void t4_sge_decode_idma_state(struct adapter *adapter, int state);
13131319
void t4_free_mem(void *addr);
1320+
void t4_idma_monitor_init(struct adapter *adapter,
1321+
struct sge_idma_monitor_state *idma);
1322+
void t4_idma_monitor(struct adapter *adapter,
1323+
struct sge_idma_monitor_state *idma,
1324+
int hz, int ticks);
13141325
#endif /* __CXGB4_H__ */

drivers/net/ethernet/chelsio/cxgb4/sge.c

Lines changed: 12 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -100,16 +100,6 @@
100100
*/
101101
#define TX_QCHECK_PERIOD (HZ / 2)
102102

103-
/* SGE Hung Ingress DMA Threshold Warning time (in Hz) and Warning Repeat Rate
104-
* (in RX_QCHECK_PERIOD multiples). If we find one of the SGE Ingress DMA
105-
* State Machines in the same state for this amount of time (in HZ) then we'll
106-
* issue a warning about a potential hang. We'll repeat the warning as the
107-
* SGE Ingress DMA Channel appears to be hung every N RX_QCHECK_PERIODs till
108-
* the situation clears. If the situation clears, we'll note that as well.
109-
*/
110-
#define SGE_IDMA_WARN_THRESH (1 * HZ)
111-
#define SGE_IDMA_WARN_REPEAT (20 * RX_QCHECK_PERIOD)
112-
113103
/*
114104
* Max number of Tx descriptors to be reclaimed by the Tx timer.
115105
*/
@@ -2279,7 +2269,7 @@ irq_handler_t t4_intr_handler(struct adapter *adap)
22792269
static void sge_rx_timer_cb(unsigned long data)
22802270
{
22812271
unsigned long m;
2282-
unsigned int i, idma_same_state_cnt[2];
2272+
unsigned int i;
22832273
struct adapter *adap = (struct adapter *)data;
22842274
struct sge *s = &adap->sge;
22852275

@@ -2300,67 +2290,16 @@ static void sge_rx_timer_cb(unsigned long data)
23002290
set_bit(id, s->starving_fl);
23012291
}
23022292
}
2293+
/* The remainder of the SGE RX Timer Callback routine is dedicated to
2294+
* global Master PF activities like checking for chip ingress stalls,
2295+
* etc.
2296+
*/
2297+
if (!(adap->flags & MASTER_PF))
2298+
goto done;
23032299

2304-
t4_write_reg(adap, SGE_DEBUG_INDEX_A, 13);
2305-
idma_same_state_cnt[0] = t4_read_reg(adap, SGE_DEBUG_DATA_HIGH_A);
2306-
idma_same_state_cnt[1] = t4_read_reg(adap, SGE_DEBUG_DATA_LOW_A);
2307-
2308-
for (i = 0; i < 2; i++) {
2309-
u32 debug0, debug11;
2310-
2311-
/* If the Ingress DMA Same State Counter ("timer") is less
2312-
* than 1s, then we can reset our synthesized Stall Timer and
2313-
* continue. If we have previously emitted warnings about a
2314-
* potential stalled Ingress Queue, issue a note indicating
2315-
* that the Ingress Queue has resumed forward progress.
2316-
*/
2317-
if (idma_same_state_cnt[i] < s->idma_1s_thresh) {
2318-
if (s->idma_stalled[i] >= SGE_IDMA_WARN_THRESH)
2319-
CH_WARN(adap, "SGE idma%d, queue%u,resumed after %d sec\n",
2320-
i, s->idma_qid[i],
2321-
s->idma_stalled[i]/HZ);
2322-
s->idma_stalled[i] = 0;
2323-
continue;
2324-
}
2325-
2326-
/* Synthesize an SGE Ingress DMA Same State Timer in the Hz
2327-
* domain. The first time we get here it'll be because we
2328-
* passed the 1s Threshold; each additional time it'll be
2329-
* because the RX Timer Callback is being fired on its regular
2330-
* schedule.
2331-
*
2332-
* If the stall is below our Potential Hung Ingress Queue
2333-
* Warning Threshold, continue.
2334-
*/
2335-
if (s->idma_stalled[i] == 0)
2336-
s->idma_stalled[i] = HZ;
2337-
else
2338-
s->idma_stalled[i] += RX_QCHECK_PERIOD;
2339-
2340-
if (s->idma_stalled[i] < SGE_IDMA_WARN_THRESH)
2341-
continue;
2342-
2343-
/* We'll issue a warning every SGE_IDMA_WARN_REPEAT Hz */
2344-
if (((s->idma_stalled[i] - HZ) % SGE_IDMA_WARN_REPEAT) != 0)
2345-
continue;
2346-
2347-
/* Read and save the SGE IDMA State and Queue ID information.
2348-
* We do this every time in case it changes across time ...
2349-
*/
2350-
t4_write_reg(adap, SGE_DEBUG_INDEX_A, 0);
2351-
debug0 = t4_read_reg(adap, SGE_DEBUG_DATA_LOW_A);
2352-
s->idma_state[i] = (debug0 >> (i * 9)) & 0x3f;
2353-
2354-
t4_write_reg(adap, SGE_DEBUG_INDEX_A, 11);
2355-
debug11 = t4_read_reg(adap, SGE_DEBUG_DATA_LOW_A);
2356-
s->idma_qid[i] = (debug11 >> (i * 16)) & 0xffff;
2357-
2358-
CH_WARN(adap, "SGE idma%u, queue%u, maybe stuck state%u %dsecs (debug0=%#x, debug11=%#x)\n",
2359-
i, s->idma_qid[i], s->idma_state[i],
2360-
s->idma_stalled[i]/HZ, debug0, debug11);
2361-
t4_sge_decode_idma_state(adap, s->idma_state[i]);
2362-
}
2300+
t4_idma_monitor(adap, &s->idma_monitor, HZ, RX_QCHECK_PERIOD);
23632301

2302+
done:
23642303
mod_timer(&s->rx_timer, jiffies + RX_QCHECK_PERIOD);
23652304
}
23662305

@@ -3121,11 +3060,11 @@ int t4_sge_init(struct adapter *adap)
31213060
egress_threshold = EGRTHRESHOLDPACKING_G(sge_conm_ctrl);
31223061
s->fl_starve_thres = 2*egress_threshold + 1;
31233062

3063+
t4_idma_monitor_init(adap, &s->idma_monitor);
3064+
31243065
setup_timer(&s->rx_timer, sge_rx_timer_cb, (unsigned long)adap);
31253066
setup_timer(&s->tx_timer, sge_tx_timer_cb, (unsigned long)adap);
3126-
s->idma_1s_thresh = core_ticks_per_usec(adap) * 1000000; /* 1 s */
3127-
s->idma_stalled[0] = 0;
3128-
s->idma_stalled[1] = 0;
3067+
31293068
spin_lock_init(&s->intrq_lock);
31303069

31313070
return 0;

drivers/net/ethernet/chelsio/cxgb4/t4_hw.c

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5717,3 +5717,130 @@ void t4_tp_read_la(struct adapter *adap, u64 *la_buf, unsigned int *wrptr)
57175717
t4_write_reg(adap, TP_DBG_LA_CONFIG_A,
57185718
cfg | adap->params.tp.la_mask);
57195719
}
5720+
5721+
/* SGE Hung Ingress DMA Warning Threshold time and Warning Repeat Rate (in
5722+
* seconds). If we find one of the SGE Ingress DMA State Machines in the same
5723+
* state for more than the Warning Threshold then we'll issue a warning about
5724+
* a potential hang. We'll repeat the warning as the SGE Ingress DMA Channel
5725+
* appears to be hung every Warning Repeat Rate seconds until the situation clears.
5726+
* If the situation clears, we'll note that as well.
5727+
*/
5728+
#define SGE_IDMA_WARN_THRESH 1
5729+
#define SGE_IDMA_WARN_REPEAT 300
5730+
5731+
/**
5732+
* t4_idma_monitor_init - initialize SGE Ingress DMA Monitor
5733+
* @adapter: the adapter
5734+
* @idma: the adapter IDMA Monitor state
5735+
*
5736+
* Initialize the state of an SGE Ingress DMA Monitor.
5737+
*
* Only @idma->idma_1s_thresh and the two @idma->idma_stalled[] entries are
* written here. The idma_state[], idma_qid[] and idma_warn[] entries are
* left untouched; t4_idma_monitor() assigns them before it reads them
* (idma_warn[] when a stall is first seen, the other two right before a
* warning is printed).
*/
5738+
void t4_idma_monitor_init(struct adapter *adapter,
5739+
struct sge_idma_monitor_state *idma)
5740+
{
5741+
/* Initialize the state variables for detecting an SGE Ingress DMA
5742+
* hang. The SGE has internal counters which count up on each clock
5743+
* tick whenever the SGE finds its Ingress DMA State Engines in the
5744+
* same state they were on the previous clock tick. The clock used is
5745+
* the Core Clock so we have a limit on the maximum "time" they can
5746+
* record; typically a very small number of seconds. For instance,
5747+
* with a 600MHz Core Clock, we can only count up to a bit more than
5748+
* 7s. So we'll synthesize a larger counter in order to not run the
5749+
* risk of having the "timers" overflow and give us the flexibility to
5750+
* maintain a Hung SGE State Machine of our own which operates across
5751+
* a longer time frame.
5752+
*/
5753+
/* Core Clock ticks per microsecond times 1,000,000 gives the number of
* Core Clock ticks in one second, the unit the hardware counter uses.
*/
idma->idma_1s_thresh = core_ticks_per_usec(adapter) * 1000000; /* 1s */
5754+
/* A stall timer of 0 means "channel is not currently stalled". */
idma->idma_stalled[0] = 0;
5755+
idma->idma_stalled[1] = 0;
5756+
}
5757+
5758+
/**
5759+
* t4_idma_monitor - monitor SGE Ingress DMA state
5760+
* @adapter: the adapter
5761+
* @idma: the adapter IDMA Monitor state
5762+
* @hz: number of ticks/second
5763+
* @ticks: number of ticks since the last IDMA Monitor call
5764+
*
* For each of the two SGE Ingress DMA channels, compare the hardware Same
* State Counter with @idma->idma_1s_thresh. Below the threshold, the
* synthesized stall timer is cleared, and a "resumed" message is logged if
* the timer had already reached the warning threshold. Otherwise the stall
* timer is advanced by @ticks and, at most once per SGE_IDMA_WARN_REPEAT
* seconds, the current IDMA state and Ingress Queue ID are read back from
* the SGE debug registers, stored in @idma, and logged as a warning.
*
* @hz is used as a divisor when reporting elapsed seconds, so it must be
* non-zero.
*/
5765+
void t4_idma_monitor(struct adapter *adapter,
5766+
struct sge_idma_monitor_state *idma,
5767+
int hz, int ticks)
5768+
{
5769+
int i, idma_same_state_cnt[2];
5770+
5771+
/* Read the SGE Debug Ingress DMA Same State Count registers. These
5772+
* are counters inside the SGE which count up on each clock when the
5773+
* SGE finds its Ingress DMA State Engines in the same states they
5774+
* were in the previous clock. The counters will peg out at
5775+
* 0xffffffff without wrapping around so once they pass the 1s
5776+
* threshold they'll stay above that till the IDMA state changes.
5777+
*
* Debug index 13 selects the pair: the HIGH data register is stored as
* the count for channel 0 and the LOW data register for channel 1.
*/
5778+
t4_write_reg(adapter, SGE_DEBUG_INDEX_A, 13);
5779+
idma_same_state_cnt[0] = t4_read_reg(adapter, SGE_DEBUG_DATA_HIGH_A);
5780+
idma_same_state_cnt[1] = t4_read_reg(adapter, SGE_DEBUG_DATA_LOW_A);
5781+
5782+
for (i = 0; i < 2; i++) {
5783+
u32 debug0, debug11;
5784+
5785+
/* If the Ingress DMA Same State Counter ("timer") is less
5786+
* than 1s, then we can reset our synthesized Stall Timer and
5787+
* continue. If we have previously emitted warnings about a
5788+
* potential stalled Ingress Queue, issue a note indicating
5789+
* that the Ingress Queue has resumed forward progress.
5790+
*
* The count is held in an int but idma_1s_thresh is unsigned int,
* so this comparison is performed as unsigned.
*/
5791+
if (idma_same_state_cnt[i] < idma->idma_1s_thresh) {
5792+
if (idma->idma_stalled[i] >= SGE_IDMA_WARN_THRESH * hz)
5793+
dev_warn(adapter->pdev_dev, "SGE idma%d, queue %u, "
5794+
"resumed after %d seconds\n",
5795+
i, idma->idma_qid[i],
5796+
idma->idma_stalled[i] / hz);
5797+
idma->idma_stalled[i] = 0;
5798+
continue;
5799+
}
5800+
5801+
/* Synthesize an SGE Ingress DMA Same State Timer in the Hz
5802+
* domain. The first time we get here it'll be because we
5803+
* passed the 1s Threshold; each additional time it'll be
5804+
* because the RX Timer Callback is being fired on its regular
5805+
* schedule.
5806+
*
5807+
* If the stall is below our Potential Hung Ingress Queue
5808+
* Warning Threshold, continue.
5809+
*/
5810+
if (idma->idma_stalled[i] == 0) {
5811+
idma->idma_stalled[i] = hz;
5812+
idma->idma_warn[i] = 0;
5813+
} else {
5814+
idma->idma_stalled[i] += ticks;
5815+
/* NOTE(review): idma_warn[] is declared unsigned int, so
* this subtraction wraps to a large value instead of going
* negative when ticks exceeds the remaining count. The
* "> 0" test below would then keep suppressing warnings.
* Confirm that callers pass a ticks value that evenly
* divides SGE_IDMA_WARN_REPEAT * hz.
*/
idma->idma_warn[i] -= ticks;
5816+
}
5817+
5818+
if (idma->idma_stalled[i] < SGE_IDMA_WARN_THRESH * hz)
5819+
continue;
5820+
5821+
/* We'll issue a warning every SGE_IDMA_WARN_REPEAT seconds.
5822+
*/
5823+
if (idma->idma_warn[i] > 0)
5824+
continue;
5825+
idma->idma_warn[i] = SGE_IDMA_WARN_REPEAT * hz;
5826+
5827+
/* Read and save the SGE IDMA State and Queue ID information.
5828+
* We do this every time in case it changes across time ...
5829+
* can't be too careful ...
5830+
*
* Debug index 0 carries a 6-bit state field per channel at bit
* offset i * 9; debug index 11 carries a 16-bit queue ID per
* channel at bit offset i * 16.
*/
5831+
t4_write_reg(adapter, SGE_DEBUG_INDEX_A, 0);
5832+
debug0 = t4_read_reg(adapter, SGE_DEBUG_DATA_LOW_A);
5833+
idma->idma_state[i] = (debug0 >> (i * 9)) & 0x3f;
5834+
5835+
t4_write_reg(adapter, SGE_DEBUG_INDEX_A, 11);
5836+
debug11 = t4_read_reg(adapter, SGE_DEBUG_DATA_LOW_A);
5837+
idma->idma_qid[i] = (debug11 >> (i * 16)) & 0xffff;
5838+
5839+
dev_warn(adapter->pdev_dev, "SGE idma%u, queue %u, potentially stuck in "
5840+
"state %u for %d seconds (debug0=%#x, debug11=%#x)\n",
5841+
i, idma->idma_qid[i], idma->idma_state[i],
5842+
idma->idma_stalled[i] / hz,
5843+
debug0, debug11);
5844+
t4_sge_decode_idma_state(adapter, idma->idma_state[i]);
5845+
}
5846+
}

0 commit comments

Comments
 (0)