Skip to content

Commit 839fcab

Browse files
Michael S. TsirkinRoland Dreier
authored andcommitted
IPoIB: Connected mode experimental support
The following patch adds experimental support for IPoIB connected mode, as defined by the draft from the IETF ipoib working group. The idea is to increase performance by increasing the MTU from the maximum of 2K (theoretically 4K) supported by IPoIB on top of UD. With this code, I'm able to get 800MByte/sec or more with netperf without options on a Mellanox 4x back-to-back DDR system. Some notes on code: 1. SRQ is used for scalability to large cluster sizes 2. Only RC connections are used (UC does not support SRQ now) 3. Retry count is set to 0 since spec draft warns against retries 4. Each connection is used for data transfers in only 1 direction, so each connection is either active(TX) or passive (RX). 2 sides that want to communicate create 2 connections. 5. Each active (TX) connection has a separate CQ for send completions - this keeps the code simple without CQ resize and other tricks 6. To detect stale passive side connections (where the remote side is down), we keep an LRU list of passive connections (updated once per second per connection) and destroy a connection after it has been unused for several seconds. The LRU rule makes it possible to avoid scanning connections that have recently been active. Signed-off-by: Michael S. Tsirkin <[email protected]> Signed-off-by: Roland Dreier <[email protected]>
1 parent 9a6b090 commit 839fcab

File tree

9 files changed

+1575
-32
lines changed

9 files changed

+1575
-32
lines changed

drivers/infiniband/ulp/ipoib/Kconfig

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,27 @@
11
config INFINIBAND_IPOIB
22
tristate "IP-over-InfiniBand"
3-
depends on INFINIBAND && NETDEVICES && INET
3+
depends on INFINIBAND && NETDEVICES && INET && (IPV6 || IPV6=n)
44
---help---
55
Support for the IP-over-InfiniBand protocol (IPoIB). This
66
transports IP packets over InfiniBand so you can use your IB
77
device as a fancy NIC.
88

99
See Documentation/infiniband/ipoib.txt for more information
1010

11+
config INFINIBAND_IPOIB_CM
12+
bool "IP-over-InfiniBand Connected Mode support"
13+
depends on INFINIBAND_IPOIB && EXPERIMENTAL
14+
default n
15+
---help---
16+
This option enables experimental support for IPoIB connected mode.
17+
After enabling this option, you need to switch to connected mode through
18+
/sys/class/net/ibXXX/mode to actually create connections, and then increase
19+
the interface MTU with e.g. ifconfig ib0 mtu 65520.
20+
21+
WARNING: Enabling connected mode will trigger some
22+
packet drops for multicast and UD mode traffic from this interface,
23+
unless you limit mtu for these destinations to 2044.
24+
1125
config INFINIBAND_IPOIB_DEBUG
1226
bool "IP-over-InfiniBand debugging" if EMBEDDED
1327
depends on INFINIBAND_IPOIB

drivers/infiniband/ulp/ipoib/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@ ib_ipoib-y := ipoib_main.o \
55
ipoib_multicast.o \
66
ipoib_verbs.o \
77
ipoib_vlan.o
8+
ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o
89
ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o
910

drivers/infiniband/ulp/ipoib/ipoib.h

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ enum {
6262

6363
IPOIB_ENCAP_LEN = 4,
6464

65+
IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */
66+
IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN,
67+
IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
68+
IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
6569
IPOIB_RX_RING_SIZE = 128,
6670
IPOIB_TX_RING_SIZE = 64,
6771
IPOIB_MAX_QUEUE_SIZE = 8192,
@@ -81,6 +85,8 @@ enum {
8185
IPOIB_MCAST_RUN = 6,
8286
IPOIB_STOP_REAPER = 7,
8387
IPOIB_MCAST_STARTED = 8,
88+
IPOIB_FLAG_NETIF_STOPPED = 9,
89+
IPOIB_FLAG_ADMIN_CM = 10,
8490

8591
IPOIB_MAX_BACKOFF_SECONDS = 16,
8692

@@ -90,6 +96,13 @@ enum {
9096
IPOIB_MCAST_FLAG_ATTACHED = 3,
9197
};
9298

99+
/*
 * Operation tag bits.
 * NOTE(review): these look like flags OR-ed into a work-request id so that
 * completions can be told apart -- confirm against the wr_id users.
 */
#define IPOIB_OP_RECV		(1ul << 31)

/* The SRQ tag evaluates to 0 when connected mode is compiled out. */
#ifndef CONFIG_INFINIBAND_IPOIB_CM
#define IPOIB_CM_OP_SRQ		(0)
#else
#define IPOIB_CM_OP_SRQ		(1ul << 30)
#endif
105+
93106
/* structs */
94107

95108
struct ipoib_header {
@@ -113,6 +126,59 @@ struct ipoib_tx_buf {
113126
u64 mapping;
114127
};
115128

129+
struct ib_cm_id;
130+
131+
struct ipoib_cm_data {
132+
__be32 qpn; /* High byte MUST be ignored on receive */
133+
__be32 mtu;
134+
};
135+
136+
struct ipoib_cm_rx {
137+
struct ib_cm_id *id;
138+
struct ib_qp *qp;
139+
struct list_head list;
140+
struct net_device *dev;
141+
unsigned long jiffies;
142+
};
143+
144+
struct ipoib_cm_tx {
145+
struct ib_cm_id *id;
146+
struct ib_cq *cq;
147+
struct ib_qp *qp;
148+
struct list_head list;
149+
struct net_device *dev;
150+
struct ipoib_neigh *neigh;
151+
struct ipoib_path *path;
152+
struct ipoib_tx_buf *tx_ring;
153+
unsigned tx_head;
154+
unsigned tx_tail;
155+
unsigned long flags;
156+
u32 mtu;
157+
struct ib_wc ibwc[IPOIB_NUM_WC];
158+
};
159+
160+
struct ipoib_cm_rx_buf {
161+
struct sk_buff *skb;
162+
u64 mapping[IPOIB_CM_RX_SG];
163+
};
164+
165+
struct ipoib_cm_dev_priv {
166+
struct ib_srq *srq;
167+
struct ipoib_cm_rx_buf *srq_ring;
168+
struct ib_cm_id *id;
169+
struct list_head passive_ids;
170+
struct work_struct start_task;
171+
struct work_struct reap_task;
172+
struct work_struct skb_task;
173+
struct delayed_work stale_task;
174+
struct sk_buff_head skb_queue;
175+
struct list_head start_list;
176+
struct list_head reap_list;
177+
struct ib_wc ibwc[IPOIB_NUM_WC];
178+
struct ib_sge rx_sge[IPOIB_CM_RX_SG];
179+
struct ib_recv_wr rx_wr;
180+
};
181+
116182
/*
117183
* Device private locking: tx_lock protects members used in TX fast
118184
* path (and we use LLTX so upper layers don't do extra locking).
@@ -179,6 +245,10 @@ struct ipoib_dev_priv {
179245
struct list_head child_intfs;
180246
struct list_head list;
181247

248+
#ifdef CONFIG_INFINIBAND_IPOIB_CM
249+
struct ipoib_cm_dev_priv cm;
250+
#endif
251+
182252
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
183253
struct list_head fs_list;
184254
struct dentry *mcg_dentry;
@@ -212,6 +282,9 @@ struct ipoib_path {
212282

213283
struct ipoib_neigh {
214284
struct ipoib_ah *ah;
285+
#ifdef CONFIG_INFINIBAND_IPOIB_CM
286+
struct ipoib_cm_tx *cm;
287+
#endif
215288
union ib_gid dgid;
216289
struct sk_buff_head queue;
217290

@@ -315,6 +388,146 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
315388
void ipoib_pkey_poll(struct work_struct *work);
316389
int ipoib_pkey_dev_delay_open(struct net_device *dev);
317390

391+
#ifdef CONFIG_INFINIBAND_IPOIB_CM
392+
393+
/* Connection-type flag bits found in byte 0 of a hardware address. */
#define IPOIB_FLAGS_RC		0x80
#define IPOIB_FLAGS_UC		0x40

/*
 * We don't support UC connections at the moment, so only the RC bit is
 * tested.  Evaluates to non-zero when byte 0 of @ha has IPOIB_FLAGS_RC set.
 * The argument is parenthesized so that pointer expressions such as
 * (addr + off) index correctly.
 */
#define IPOIB_CM_SUPPORTED(ha)	((ha)[0] & (IPOIB_FLAGS_RC))
398+
399+
static inline int ipoib_cm_admin_enabled(struct net_device *dev)
400+
{
401+
struct ipoib_dev_priv *priv = netdev_priv(dev);
402+
return IPOIB_CM_SUPPORTED(dev->dev_addr) &&
403+
test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
404+
}
405+
406+
static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
407+
{
408+
struct ipoib_dev_priv *priv = netdev_priv(dev);
409+
return IPOIB_CM_SUPPORTED(n->ha) &&
410+
test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
411+
}
412+
413+
static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
414+
415+
{
416+
return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags);
417+
}
418+
419+
static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
420+
{
421+
return neigh->cm;
422+
}
423+
424+
static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
425+
{
426+
neigh->cm = tx;
427+
}
428+
429+
/*
 * Connected-mode entry points, declared here and defined elsewhere
 * (presumably in ipoib_cm.c -- verify).
 */
void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb,
		   struct ipoib_cm_tx *tx);
int ipoib_cm_dev_open(struct net_device *dev);
void ipoib_cm_dev_stop(struct net_device *dev);
int ipoib_cm_dev_init(struct net_device *dev);
int ipoib_cm_add_mode_attr(struct net_device *dev);
void ipoib_cm_dev_cleanup(struct net_device *dev);
struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev,
				       struct ipoib_path *path,
				       struct ipoib_neigh *neigh);
void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
			   unsigned int mtu);
void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
441+
#else
442+
443+
struct ipoib_cm_tx;
444+
445+
/* Stub for builds without connected mode: never administratively enabled. */
static inline
int ipoib_cm_admin_enabled(struct net_device *dev)
{
	return 0;
}
449+
/* Stub for builds without connected mode: no neighbour can use it. */
static inline
int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
{
	return 0;
}
454+
455+
/* Stub for builds without connected mode: a connection is never up. */
static inline
int ipoib_cm_up(struct ipoib_neigh *neigh)
{
	return 0;
}
460+
461+
/* Stub for builds without connected mode: there is no TX context to return. */
static inline
struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
{
	return NULL;
}
465+
466+
/* Stub for builds without connected mode: nothing is stored. */
static inline
void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
{
}
469+
470+
/* Stub for builds without connected mode: does nothing with @skb. */
static inline void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb,
				 struct ipoib_cm_tx *tx)
{
}
475+
476+
/* Stub for builds without connected mode: opening always reports success (0). */
static inline int ipoib_cm_dev_open(struct net_device *dev)
{
	return 0;
}
481+
482+
/* Stub for builds without connected mode: nothing to stop. */
static inline void ipoib_cm_dev_stop(struct net_device *dev)
{
}
487+
488+
/*
 * Stub for builds without connected mode: initialization is reported as
 * not implemented (-ENOSYS).
 */
static inline int ipoib_cm_dev_init(struct net_device *dev)
{
	return -ENOSYS;
}
493+
494+
/* Stub for builds without connected mode: nothing to clean up. */
static inline void ipoib_cm_dev_cleanup(struct net_device *dev)
{
}
499+
500+
/* Stub for builds without connected mode: no TX context is ever created. */
static inline struct ipoib_cm_tx *
ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
		   struct ipoib_neigh *neigh)
{
	return NULL;
}
506+
507+
/* Stub for builds without connected mode: nothing to destroy. */
static inline void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
{
}
512+
513+
/*
 * Stub for builds without connected mode: no attribute is added and
 * success (0) is reported.
 */
static inline int ipoib_cm_add_mode_attr(struct net_device *dev)
{
	return 0;
}
518+
519+
/*
 * Stub for builds without connected mode: the oversized @skb is simply
 * released with dev_kfree_skb_any(); @dev and @mtu are unused.
 */
static inline
void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
			   unsigned int mtu)
{
	dev_kfree_skb_any(skb);
}
524+
525+
/* Stub for builds without connected mode: the work completion is ignored. */
static inline
void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
}
528+
529+
#endif
530+
318531
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
319532
void ipoib_create_debug_files(struct net_device *dev);
320533
void ipoib_delete_debug_files(struct net_device *dev);
@@ -392,4 +605,6 @@ extern int ipoib_debug_level;
392605

393606
#define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw)
394607

608+
/*
 * Extract the 24-bit QPN from hardware address @ha: the first four bytes
 * are read as a big-endian 32-bit word and the top byte is masked off.
 * The argument is parenthesized so that a pointer expression such as
 * (addr + off) is cast as a whole rather than having the cast bind to
 * its first operand only.
 */
#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) (ha)) & 0xffffff)
609+
395610
#endif /* _IPOIB_H */

0 commit comments

Comments
 (0)