Skip to content

Commit ad2c457

Browse files
Yuval Shaia authored and Mukesh Kacker committed
IB/ipoib: Scatter-Gather support in connected mode
By default, the IPoIB-CM driver uses a 64k MTU. A larger MTU gives better performance. This MTU plus overhead puts the memory allocation for IP-based packets at 32 4k pages (order 5), which have to be contiguous. When the system memory is under pressure, it was observed that allocating 128k of contiguous physical memory is difficult and causes serious errors (such as the system becoming unusable). This enhancement resolves the issue by removing the physically contiguous memory requirement, using the Scatter/Gather feature that exists in the Linux stack. With this fix, Scatter-Gather will also be supported in connected mode. This change also reverts the change made in commit e112373 ("IPoIB/cm: Reduce connected mode TX object size"). Orabug: 20422840 Reviewed-by: John Sobecki <[email protected]> Signed-off-by: Yuval Shaia <[email protected]>
1 parent 7e762be commit ad2c457

File tree

4 files changed

+92
-29
lines changed

4 files changed

+92
-29
lines changed

drivers/infiniband/ulp/ipoib/ipoib.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -176,11 +176,6 @@ struct ipoib_tx_buf {
176176
u64 mapping[MAX_SKB_FRAGS + 1];
177177
};
178178

179-
struct ipoib_cm_tx_buf {
180-
struct sk_buff *skb;
181-
u64 mapping;
182-
};
183-
184179
/* in order to call dst->ops->update_pmtu out of spin-lock*/
185180
struct ipoib_pmtu_update {
186181
struct work_struct work;
@@ -246,7 +241,7 @@ struct ipoib_cm_tx {
246241
struct net_device *dev;
247242
struct ipoib_neigh *neigh;
248243
struct ipoib_path *path;
249-
struct ipoib_cm_tx_buf *tx_ring;
244+
struct ipoib_tx_buf *tx_ring;
250245
unsigned tx_head;
251246
unsigned tx_tail;
252247
unsigned long flags;
@@ -511,6 +506,8 @@ int ipoib_mcast_stop_thread(struct net_device *dev);
511506
void ipoib_mcast_dev_down(struct net_device *dev);
512507
void ipoib_mcast_dev_flush(struct net_device *dev);
513508

509+
int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
510+
514511
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
515512
struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
516513
int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);

drivers/infiniband/ulp/ipoib/ipoib_cm.c

Lines changed: 87 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,31 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
8888
ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
8989
}
9090

91+
static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv,
92+
struct ipoib_tx_buf *tx_req)
93+
{
94+
struct sk_buff *skb;
95+
int i, offs;
96+
97+
skb = tx_req->skb;
98+
if (skb_shinfo(skb)->nr_frags) {
99+
offs = 0;
100+
if (skb_headlen(skb)) {
101+
ib_dma_unmap_single(priv->ca, tx_req->mapping[0],
102+
skb_headlen(skb), DMA_TO_DEVICE);
103+
offs = 1;
104+
}
105+
for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
106+
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
107+
108+
ib_dma_unmap_page(priv->ca, tx_req->mapping[i + offs],
109+
skb_frag_size(frag), DMA_TO_DEVICE);
110+
}
111+
} else
112+
ib_dma_unmap_single(priv->ca, tx_req->mapping[0], skb->len,
113+
DMA_TO_DEVICE);
114+
}
115+
91116
static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
92117
{
93118
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -707,11 +732,39 @@ static inline int post_send(struct ipoib_dev_priv *priv,
707732
return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
708733
}
709734

735+
static inline int post_send_sg(struct ipoib_dev_priv *priv,
736+
struct ipoib_cm_tx *tx,
737+
unsigned int wr_id,
738+
struct sk_buff *skb,
739+
u64 mapping[MAX_SKB_FRAGS + 1])
740+
{
741+
struct ib_send_wr *bad_wr;
742+
int i, off;
743+
skb_frag_t *frags = skb_shinfo(skb)->frags;
744+
int nr_frags = skb_shinfo(skb)->nr_frags;
745+
746+
if (skb_headlen(skb)) {
747+
priv->tx_sge[0].addr = mapping[0];
748+
priv->tx_sge[0].length = skb_headlen(skb);
749+
off = 1;
750+
} else
751+
off = 0;
752+
753+
for (i = 0; i < nr_frags; ++i) {
754+
priv->tx_sge[i + off].addr = mapping[i + off];
755+
priv->tx_sge[i + off].length = frags[i].size;
756+
}
757+
priv->tx_wr.num_sge = nr_frags + off;
758+
priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
759+
760+
return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
761+
}
762+
710763
void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
711764
{
712765
struct ipoib_dev_priv *priv = netdev_priv(dev);
713-
struct ipoib_cm_tx_buf *tx_req;
714-
u64 addr;
766+
struct ipoib_tx_buf *tx_req;
767+
u64 addr = 0;
715768
int rc;
716769

717770
if (unlikely(skb->len > tx->mtu)) {
@@ -735,24 +788,37 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
735788
*/
736789
tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
737790
tx_req->skb = skb;
738-
addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
739-
if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
740-
++dev->stats.tx_errors;
741-
dev_kfree_skb_any(skb);
742-
return;
743-
}
744791

745-
tx_req->mapping = addr;
792+
if (skb_shinfo(skb)->nr_frags) {
793+
if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
794+
++dev->stats.tx_errors;
795+
dev_kfree_skb_any(skb);
796+
return;
797+
}
798+
rc = post_send_sg(priv, tx, tx->tx_head &
799+
(ipoib_sendq_size - 1),
800+
skb, tx_req->mapping);
801+
} else {
802+
addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
803+
DMA_TO_DEVICE);
804+
if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
805+
++dev->stats.tx_errors;
806+
dev_kfree_skb_any(skb);
807+
return;
808+
}
809+
810+
tx_req->mapping[0] = addr;
746811

747-
skb_orphan(skb);
748-
skb_dst_drop(skb);
812+
skb_orphan(skb);
813+
skb_dst_drop(skb);
749814

750-
rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
751-
addr, skb->len);
815+
rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
816+
addr, skb->len);
817+
}
752818
if (unlikely(rc)) {
753819
ipoib_warn(priv, "post_send failed, error %d\n", rc);
754820
++dev->stats.tx_errors;
755-
ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
821+
ipoib_cm_dma_unmap_tx(priv, tx_req);
756822
dev_kfree_skb_any(skb);
757823
} else {
758824
dev->trans_start = jiffies;
@@ -777,7 +843,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
777843
struct ipoib_dev_priv *priv = netdev_priv(dev);
778844
struct ipoib_cm_tx *tx = wc->qp->qp_context;
779845
unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
780-
struct ipoib_cm_tx_buf *tx_req;
846+
struct ipoib_tx_buf *tx_req;
781847
unsigned long flags;
782848

783849
ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
@@ -791,7 +857,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
791857

792858
tx_req = &tx->tx_ring[wr_id];
793859

794-
ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
860+
ipoib_cm_dma_unmap_tx(priv, tx_req);
795861

796862
/* FIXME: is this right? Shouldn't we only increment on success? */
797863
++dev->stats.tx_packets;
@@ -1046,6 +1112,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
10461112

10471113
struct ib_qp *tx_qp;
10481114

1115+
if (dev->features & NETIF_F_SG)
1116+
attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
1117+
10491118
tx_qp = ib_create_qp(priv->pd, &attr);
10501119
if (PTR_ERR(tx_qp) == -EINVAL) {
10511120
ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
@@ -1180,7 +1249,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
11801249
static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
11811250
{
11821251
struct ipoib_dev_priv *priv = netdev_priv(p->dev);
1183-
struct ipoib_cm_tx_buf *tx_req;
1252+
struct ipoib_tx_buf *tx_req;
11841253
unsigned long begin;
11851254

11861255
ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
@@ -1207,8 +1276,7 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
12071276

12081277
while ((int) p->tx_tail - (int) p->tx_head < 0) {
12091278
tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
1210-
ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
1211-
DMA_TO_DEVICE);
1279+
ipoib_cm_dma_unmap_tx(priv, tx_req);
12121280
dev_kfree_skb_any(tx_req->skb);
12131281
++p->tx_tail;
12141282
netif_tx_lock_bh(p->dev);
@@ -1498,7 +1566,6 @@ static void ipoib_cm_stale_task(struct work_struct *work)
14981566
spin_unlock_irq(&priv->lock);
14991567
}
15001568

1501-
15021569
static ssize_t show_mode(struct device *d, struct device_attribute *attr,
15031570
char *buf)
15041571
{

drivers/infiniband/ulp/ipoib/ipoib_ib.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,8 +263,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
263263
"for buf %d\n", wr_id);
264264
}
265265

266-
static int ipoib_dma_map_tx(struct ib_device *ca,
267-
struct ipoib_tx_buf *tx_req)
266+
int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
268267
{
269268
struct sk_buff *skb = tx_req->skb;
270269
u64 *mapping = tx_req->mapping;

drivers/infiniband/ulp/ipoib/ipoib_main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu
198198
struct ipoib_dev_priv *priv = netdev_priv(dev);
199199

200200
if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
201-
features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
201+
features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
202202

203203
return features;
204204
}

0 commit comments

Comments
 (0)