@@ -245,16 +245,6 @@ static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget)
 	tx_ring->q_vector->tx.total_bytes += total_bytes;
 	tx_ring->q_vector->tx.total_packets += total_packets;
 
-	/* check to see if there are any non-cache aligned descriptors
-	 * waiting to be written back, and kick the hardware to force
-	 * them to be written back in case of napi polling
-	 */
-	if (budget &&
-	    !((i & WB_STRIDE) == WB_STRIDE) &&
-	    !test_bit(__I40E_DOWN, &tx_ring->vsi->state) &&
-	    (I40E_DESC_UNUSED(tx_ring) != tx_ring->count))
-		tx_ring->arm_wb = true;
-
 	netdev_tx_completed_queue(netdev_get_tx_queue(tx_ring->netdev,
 						      tx_ring->queue_index),
 				  total_packets, total_bytes);
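
For context on what this hunk removes: the cleanup path used to kick a forced descriptor write-back whenever a NAPI poll left descriptors pending on a non cache-aligned index. Below is a hedged restatement of that condition with each clause annotated; needs_forced_wb() is a hypothetical helper name, not driver code, and it leans on the same driver types and macros (struct i40e_ring, WB_STRIDE, I40E_DESC_UNUSED) that appear in the hunk above.

/* Hypothetical helper, not part of the driver: spells out the condition that
 * used to arm a forced write-back at the end of i40e_clean_tx_irq().
 */
static bool needs_forced_wb(struct i40e_ring *tx_ring, int budget, u16 i)
{
	return budget &&					/* NAPI poll still had budget left */
	       ((i & WB_STRIDE) != WB_STRIDE) &&		/* last index is not on a 4-descriptor (cacheline) stride */
	       !test_bit(__I40E_DOWN, &tx_ring->vsi->state) &&	/* the VSI is still up */
	       (I40E_DESC_UNUSED(tx_ring) != tx_ring->count);	/* descriptors are still outstanding on the ring */
}
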
@@ -1770,6 +1760,9 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 	u32 td_tag = 0;
 	dma_addr_t dma;
 	u16 gso_segs;
+	u16 desc_count = 0;
+	bool tail_bump = true;
+	bool do_rs = false;
 
 	if (tx_flags & I40E_TX_FLAGS_HW_VLAN) {
 		td_cmd |= I40E_TX_DESC_CMD_IL2TAG1;
@@ -1810,6 +1803,8 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 
 			tx_desc++;
 			i++;
+			desc_count++;
+
 			if (i == tx_ring->count) {
 				tx_desc = I40E_TX_DESC(tx_ring, 0);
 				i = 0;
@@ -1829,6 +1824,8 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 
 		tx_desc++;
 		i++;
+		desc_count++;
+
 		if (i == tx_ring->count) {
 			tx_desc = I40E_TX_DESC(tx_ring, 0);
 			i = 0;
@@ -1843,35 +1840,7 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 		tx_bi = &tx_ring->tx_bi[i];
 	}
 
-	/* Place RS bit on last descriptor of any packet that spans across the
-	 * 4th descriptor (WB_STRIDE aka 0x3) in a 64B cacheline.
-	 */
 #define WB_STRIDE 0x3
-	if (((i & WB_STRIDE) != WB_STRIDE) &&
-	    (first <= &tx_ring->tx_bi[i]) &&
-	    (first >= &tx_ring->tx_bi[i & ~WB_STRIDE])) {
-		tx_desc->cmd_type_offset_bsz =
-			build_ctob(td_cmd, td_offset, size, td_tag) |
-			cpu_to_le64((u64)I40E_TX_DESC_CMD_EOP <<
-				    I40E_TXD_QW1_CMD_SHIFT);
-	} else {
-		tx_desc->cmd_type_offset_bsz =
-			build_ctob(td_cmd, td_offset, size, td_tag) |
-			cpu_to_le64((u64)I40E_TXD_CMD <<
-				    I40E_TXD_QW1_CMD_SHIFT);
-	}
-
-	netdev_tx_sent_queue(netdev_get_tx_queue(tx_ring->netdev,
-						 tx_ring->queue_index),
-			     first->bytecount);
-
-	/* Force memory writes to complete before letting h/w
-	 * know there are new descriptors to fetch.  (Only
-	 * applicable for weak-ordered memory model archs,
-	 * such as IA-64).
-	 */
-	wmb();
-
 	/* set next_to_watch value indicating a packet is present */
 	first->next_to_watch = tx_desc;
 
@@ -1881,15 +1850,78 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 
 	tx_ring->next_to_use = i;
 
+	netdev_tx_sent_queue(netdev_get_tx_queue(tx_ring->netdev,
+						 tx_ring->queue_index),
+			     first->bytecount);
 	i40evf_maybe_stop_tx(tx_ring, DESC_NEEDED);
+
+	/* Algorithm to optimize tail and RS bit setting:
+	 * if xmit_more is supported
+	 *	if xmit_more is true
+	 *		do not update tail and do not mark RS bit.
+	 *	if xmit_more is false and last xmit_more was false
+	 *		if every packet spanned less than 4 desc
+	 *			then set RS bit on 4th packet and update tail
+	 *			on every packet
+	 *		else
+	 *			update tail and set RS bit on every packet.
+	 *	if xmit_more is false and last_xmit_more was true
+	 *		update tail and set RS bit.
+	 * else (kernel < 3.18)
+	 *	if every packet spanned less than 4 desc
+	 *		then set RS bit on 4th packet and update tail
+	 *		on every packet
+	 *	else
+	 *		set RS bit on EOP for every packet and update tail
+	 *
+	 * Optimization: wmb to be issued only in case of tail update.
+	 * Also optimize the Descriptor WB path for RS bit with the same
+	 * algorithm.
+	 *
+	 * Note: If there are less than 4 packets
+	 * pending and interrupts were disabled the service task will
+	 * trigger a force WB.
+	 */
+	if (skb->xmit_more &&
+	    !netif_xmit_stopped(netdev_get_tx_queue(tx_ring->netdev,
+						    tx_ring->queue_index))) {
+		tx_ring->flags |= I40E_TXR_FLAGS_LAST_XMIT_MORE_SET;
+		tail_bump = false;
+	} else if (!skb->xmit_more &&
+		   !netif_xmit_stopped(netdev_get_tx_queue(tx_ring->netdev,
+							   tx_ring->queue_index)) &&
+		   (!(tx_ring->flags & I40E_TXR_FLAGS_LAST_XMIT_MORE_SET)) &&
+		   (tx_ring->packet_stride < WB_STRIDE) &&
+		   (desc_count < WB_STRIDE)) {
+		tx_ring->packet_stride++;
+	} else {
+		tx_ring->packet_stride = 0;
+		tx_ring->flags &= ~I40E_TXR_FLAGS_LAST_XMIT_MORE_SET;
+		do_rs = true;
+	}
+	if (do_rs)
+		tx_ring->packet_stride = 0;
+
+	tx_desc->cmd_type_offset_bsz =
+			build_ctob(td_cmd, td_offset, size, td_tag) |
+			cpu_to_le64((u64)(do_rs ? I40E_TXD_CMD :
+						  I40E_TX_DESC_CMD_EOP) <<
+						  I40E_TXD_QW1_CMD_SHIFT);
+
 	/* notify HW of packet */
-	if (!skb->xmit_more ||
-	    netif_xmit_stopped(netdev_get_tx_queue(tx_ring->netdev,
-						   tx_ring->queue_index)))
-		writel(i, tx_ring->tail);
-	else
+	if (!tail_bump)
 		prefetchw(tx_desc + 1);
 
+	if (tail_bump) {
+		/* Force memory writes to complete before letting h/w
+		 * know there are new descriptors to fetch.  (Only
+		 * applicable for weak-ordered memory model archs,
+		 * such as IA-64).
+		 */
+		wmb();
+		writel(i, tx_ring->tail);
+	}
+
 	return;
 
 dma_error:
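
The algorithm comment in the last hunk is easier to follow when the new tail/RS decision is pulled out as a small state machine. The sketch below is only an illustration under simplified assumptions: struct tx_state and decide_rs() are hypothetical names, and the two fields stand in for the ring's I40E_TXR_FLAGS_LAST_XMIT_MORE_SET flag and packet_stride counter (WB_STRIDE is 0x3).

#include <stdbool.h>

#define WB_STRIDE 0x3	/* request a write-back at most every 4th packet */

struct tx_state {
	bool last_xmit_more;		/* stands in for I40E_TXR_FLAGS_LAST_XMIT_MORE_SET */
	unsigned int packet_stride;	/* packets sent since the last RS bit */
};

/* Returns true when the RS (report status) bit should be set on this packet's
 * last descriptor; *bump_tail says whether to wmb() and write the tail register.
 */
static bool decide_rs(struct tx_state *s, bool xmit_more, bool queue_stopped,
		      unsigned int desc_count, bool *bump_tail)
{
	*bump_tail = true;

	if (xmit_more && !queue_stopped) {
		/* More frames are on the way: defer both tail bump and RS. */
		s->last_xmit_more = true;
		*bump_tail = false;
		return false;
	}

	if (!xmit_more && !queue_stopped && !s->last_xmit_more &&
	    s->packet_stride < WB_STRIDE && desc_count < WB_STRIDE) {
		/* Short packet and no deferred batch pending: bump the tail,
		 * but only ask for a write-back on every 4th packet.
		 */
		s->packet_stride++;
		return false;
	}

	/* Otherwise flush: set RS and reset the batching state. */
	s->packet_stride = 0;
	s->last_xmit_more = false;
	return true;
}

In the driver the queue-stopped input comes from netif_xmit_stopped(), and, per the patch comment, the service task still forces a write-back if fewer than four packets are pending while interrupts are disabled.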