@@ -5717,3 +5717,130 @@ void t4_tp_read_la(struct adapter *adap, u64 *la_buf, unsigned int *wrptr)
5717
5717
t4_write_reg (adap , TP_DBG_LA_CONFIG_A ,
5718
5718
cfg | adap -> params .tp .la_mask );
5719
5719
}
5720
+
5721
/* Timing parameters, in seconds, for the SGE Hung Ingress DMA warnings.
 *
 * When one of the SGE Ingress DMA State Machines has been in the same state
 * for longer than the Warning Threshold, a warning about a potential hang is
 * issued.  For as long as the SGE Ingress DMA Channel keeps appearing hung,
 * that warning is repeated every Warning Repeat seconds.  Once the situation
 * clears, that is noted as well.
 */
#define SGE_IDMA_WARN_THRESH 1		/* seconds stalled before first warning */
#define SGE_IDMA_WARN_REPEAT 300	/* seconds between repeated warnings */
5730
+
5731
+ /**
5732
+ * t4_idma_monitor_init - initialize SGE Ingress DMA Monitor
5733
+ * @adapter: the adapter
5734
+ * @idma: the adapter IDMA Monitor state
5735
+ *
5736
+ * Initialize the state of an SGE Ingress DMA Monitor.
5737
+ */
5738
+ void t4_idma_monitor_init (struct adapter * adapter ,
5739
+ struct sge_idma_monitor_state * idma )
5740
+ {
5741
+ /* Initialize the state variables for detecting an SGE Ingress DMA
5742
+ * hang. The SGE has internal counters which count up on each clock
5743
+ * tick whenever the SGE finds its Ingress DMA State Engines in the
5744
+ * same state they were on the previous clock tick. The clock used is
5745
+ * the Core Clock so we have a limit on the maximum "time" they can
5746
+ * record; typically a very small number of seconds. For instance,
5747
+ * with a 600MHz Core Clock, we can only count up to a bit more than
5748
+ * 7s. So we'll synthesize a larger counter in order to not run the
5749
+ * risk of having the "timers" overflow and give us the flexibility to
5750
+ * maintain a Hung SGE State Machine of our own which operates across
5751
+ * a longer time frame.
5752
+ */
5753
+ idma -> idma_1s_thresh = core_ticks_per_usec (adapter ) * 1000000 ; /* 1s */
5754
+ idma -> idma_stalled [0 ] = 0 ;
5755
+ idma -> idma_stalled [1 ] = 0 ;
5756
+ }
5757
+
5758
+ /**
5759
+ * t4_idma_monitor - monitor SGE Ingress DMA state
5760
+ * @adapter: the adapter
5761
+ * @idma: the adapter IDMA Monitor state
5762
+ * @hz: number of ticks/second
5763
+ * @ticks: number of ticks since the last IDMA Monitor call
5764
+ */
5765
+ void t4_idma_monitor (struct adapter * adapter ,
5766
+ struct sge_idma_monitor_state * idma ,
5767
+ int hz , int ticks )
5768
+ {
5769
+ int i , idma_same_state_cnt [2 ];
5770
+
5771
+ /* Read the SGE Debug Ingress DMA Same State Count registers. These
5772
+ * are counters inside the SGE which count up on each clock when the
5773
+ * SGE finds its Ingress DMA State Engines in the same states they
5774
+ * were in the previous clock. The counters will peg out at
5775
+ * 0xffffffff without wrapping around so once they pass the 1s
5776
+ * threshold they'll stay above that till the IDMA state changes.
5777
+ */
5778
+ t4_write_reg (adapter , SGE_DEBUG_INDEX_A , 13 );
5779
+ idma_same_state_cnt [0 ] = t4_read_reg (adapter , SGE_DEBUG_DATA_HIGH_A );
5780
+ idma_same_state_cnt [1 ] = t4_read_reg (adapter , SGE_DEBUG_DATA_LOW_A );
5781
+
5782
+ for (i = 0 ; i < 2 ; i ++ ) {
5783
+ u32 debug0 , debug11 ;
5784
+
5785
+ /* If the Ingress DMA Same State Counter ("timer") is less
5786
+ * than 1s, then we can reset our synthesized Stall Timer and
5787
+ * continue. If we have previously emitted warnings about a
5788
+ * potential stalled Ingress Queue, issue a note indicating
5789
+ * that the Ingress Queue has resumed forward progress.
5790
+ */
5791
+ if (idma_same_state_cnt [i ] < idma -> idma_1s_thresh ) {
5792
+ if (idma -> idma_stalled [i ] >= SGE_IDMA_WARN_THRESH * hz )
5793
+ dev_warn (adapter -> pdev_dev , "SGE idma%d, queue %u, "
5794
+ "resumed after %d seconds\n" ,
5795
+ i , idma -> idma_qid [i ],
5796
+ idma -> idma_stalled [i ] / hz );
5797
+ idma -> idma_stalled [i ] = 0 ;
5798
+ continue ;
5799
+ }
5800
+
5801
+ /* Synthesize an SGE Ingress DMA Same State Timer in the Hz
5802
+ * domain. The first time we get here it'll be because we
5803
+ * passed the 1s Threshold; each additional time it'll be
5804
+ * because the RX Timer Callback is being fired on its regular
5805
+ * schedule.
5806
+ *
5807
+ * If the stall is below our Potential Hung Ingress Queue
5808
+ * Warning Threshold, continue.
5809
+ */
5810
+ if (idma -> idma_stalled [i ] == 0 ) {
5811
+ idma -> idma_stalled [i ] = hz ;
5812
+ idma -> idma_warn [i ] = 0 ;
5813
+ } else {
5814
+ idma -> idma_stalled [i ] += ticks ;
5815
+ idma -> idma_warn [i ] -= ticks ;
5816
+ }
5817
+
5818
+ if (idma -> idma_stalled [i ] < SGE_IDMA_WARN_THRESH * hz )
5819
+ continue ;
5820
+
5821
+ /* We'll issue a warning every SGE_IDMA_WARN_REPEAT seconds.
5822
+ */
5823
+ if (idma -> idma_warn [i ] > 0 )
5824
+ continue ;
5825
+ idma -> idma_warn [i ] = SGE_IDMA_WARN_REPEAT * hz ;
5826
+
5827
+ /* Read and save the SGE IDMA State and Queue ID information.
5828
+ * We do this every time in case it changes across time ...
5829
+ * can't be too careful ...
5830
+ */
5831
+ t4_write_reg (adapter , SGE_DEBUG_INDEX_A , 0 );
5832
+ debug0 = t4_read_reg (adapter , SGE_DEBUG_DATA_LOW_A );
5833
+ idma -> idma_state [i ] = (debug0 >> (i * 9 )) & 0x3f ;
5834
+
5835
+ t4_write_reg (adapter , SGE_DEBUG_INDEX_A , 11 );
5836
+ debug11 = t4_read_reg (adapter , SGE_DEBUG_DATA_LOW_A );
5837
+ idma -> idma_qid [i ] = (debug11 >> (i * 16 )) & 0xffff ;
5838
+
5839
+ dev_warn (adapter -> pdev_dev , "SGE idma%u, queue %u, potentially stuck in "
5840
+ "state %u for %d seconds (debug0=%#x, debug11=%#x)\n" ,
5841
+ i , idma -> idma_qid [i ], idma -> idma_state [i ],
5842
+ idma -> idma_stalled [i ] / hz ,
5843
+ debug0 , debug11 );
5844
+ t4_sge_decode_idma_state (adapter , idma -> idma_state [i ]);
5845
+ }
5846
+ }
0 commit comments