Skip to content

Commit ee1e2c8

Browse files
Moni ShouaRoland Dreier
authored andcommitted
IPoIB: Refresh paths instead of flushing them on SM change events
The patch tries to solve the problem of device going down and paths being flushed on an SM change event. The method is to mark the paths as candidates for refresh (by setting the new valid flag to 0), and wait for an ARP probe a new path record query. The solution requires a different and less intrusive handling of SM change event. For that, the second argument of the flush function changes its meaning from a boolean flag to a level. In most cases, SM failover doesn't cause LID change so traffic won't stop. In the rare cases of LID change, the remote host (the one that hadn't changed its LID) will lose connectivity until paths are refreshed. This is no worse than the current state. In fact, preventing the device from going down saves packets that otherwise would be lost. Signed-off-by: Moni Levy <[email protected]> Signed-off-by: Moni Shoua <[email protected]> Signed-off-by: Roland Dreier <[email protected]>
1 parent 038919f commit ee1e2c8

File tree

4 files changed

+91
-30
lines changed

4 files changed

+91
-30
lines changed

drivers/infiniband/ulp/ipoib/ipoib.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@
5454

5555
/* constants */
5656

57+
enum ipoib_flush_level {
58+
IPOIB_FLUSH_LIGHT,
59+
IPOIB_FLUSH_NORMAL,
60+
IPOIB_FLUSH_HEAVY
61+
};
62+
5763
enum {
5864
IPOIB_ENCAP_LEN = 4,
5965

@@ -284,10 +290,11 @@ struct ipoib_dev_priv {
284290

285291
struct delayed_work pkey_poll_task;
286292
struct delayed_work mcast_task;
287-
struct work_struct flush_task;
293+
struct work_struct flush_light;
294+
struct work_struct flush_normal;
295+
struct work_struct flush_heavy;
288296
struct work_struct restart_task;
289297
struct delayed_work ah_reap_task;
290-
struct work_struct pkey_event_task;
291298

292299
struct ib_device *ca;
293300
u8 port;
@@ -369,6 +376,7 @@ struct ipoib_path {
369376

370377
struct rb_node rb_node;
371378
struct list_head list;
379+
int valid;
372380
};
373381

374382
struct ipoib_neigh {
@@ -433,11 +441,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
433441
struct ipoib_ah *address, u32 qpn);
434442
void ipoib_reap_ah(struct work_struct *work);
435443

444+
void ipoib_mark_paths_invalid(struct net_device *dev);
436445
void ipoib_flush_paths(struct net_device *dev);
437446
struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
438447

439448
int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
440-
void ipoib_ib_dev_flush(struct work_struct *work);
449+
void ipoib_ib_dev_flush_light(struct work_struct *work);
450+
void ipoib_ib_dev_flush_normal(struct work_struct *work);
451+
void ipoib_ib_dev_flush_heavy(struct work_struct *work);
441452
void ipoib_pkey_event(struct work_struct *work);
442453
void ipoib_ib_dev_cleanup(struct net_device *dev);
443454

drivers/infiniband/ulp/ipoib/ipoib_ib.c

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -902,7 +902,8 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
902902
return 0;
903903
}
904904

905-
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
905+
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
906+
enum ipoib_flush_level level)
906907
{
907908
struct ipoib_dev_priv *cpriv;
908909
struct net_device *dev = priv->dev;
@@ -915,7 +916,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
915916
* the parent is down.
916917
*/
917918
list_for_each_entry(cpriv, &priv->child_intfs, list)
918-
__ipoib_ib_dev_flush(cpriv, pkey_event);
919+
__ipoib_ib_dev_flush(cpriv, level);
919920

920921
mutex_unlock(&priv->vlan_mutex);
921922

@@ -929,7 +930,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
929930
return;
930931
}
931932

932-
if (pkey_event) {
933+
if (level == IPOIB_FLUSH_HEAVY) {
933934
if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
934935
clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
935936
ipoib_ib_dev_down(dev, 0);
@@ -947,11 +948,15 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
947948
priv->pkey_index = new_index;
948949
}
949950

950-
ipoib_dbg(priv, "flushing\n");
951+
if (level == IPOIB_FLUSH_LIGHT) {
952+
ipoib_mark_paths_invalid(dev);
953+
ipoib_mcast_dev_flush(dev);
954+
}
951955

952-
ipoib_ib_dev_down(dev, 0);
956+
if (level >= IPOIB_FLUSH_NORMAL)
957+
ipoib_ib_dev_down(dev, 0);
953958

954-
if (pkey_event) {
959+
if (level == IPOIB_FLUSH_HEAVY) {
955960
ipoib_ib_dev_stop(dev, 0);
956961
ipoib_ib_dev_open(dev);
957962
}
@@ -961,27 +966,34 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
961966
* we get here, don't bring it back up if it's not configured up
962967
*/
963968
if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
964-
ipoib_ib_dev_up(dev);
969+
if (level >= IPOIB_FLUSH_NORMAL)
970+
ipoib_ib_dev_up(dev);
965971
ipoib_mcast_restart_task(&priv->restart_task);
966972
}
967973
}
968974

969-
void ipoib_ib_dev_flush(struct work_struct *work)
975+
void ipoib_ib_dev_flush_light(struct work_struct *work)
976+
{
977+
struct ipoib_dev_priv *priv =
978+
container_of(work, struct ipoib_dev_priv, flush_light);
979+
980+
__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
981+
}
982+
983+
void ipoib_ib_dev_flush_normal(struct work_struct *work)
970984
{
971985
struct ipoib_dev_priv *priv =
972-
container_of(work, struct ipoib_dev_priv, flush_task);
986+
container_of(work, struct ipoib_dev_priv, flush_normal);
973987

974-
ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
975-
__ipoib_ib_dev_flush(priv, 0);
988+
__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
976989
}
977990

978-
void ipoib_pkey_event(struct work_struct *work)
991+
void ipoib_ib_dev_flush_heavy(struct work_struct *work)
979992
{
980993
struct ipoib_dev_priv *priv =
981-
container_of(work, struct ipoib_dev_priv, pkey_event_task);
994+
container_of(work, struct ipoib_dev_priv, flush_heavy);
982995

983-
ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
984-
__ipoib_ib_dev_flush(priv, 1);
996+
__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
985997
}
986998

987999
void ipoib_ib_dev_cleanup(struct net_device *dev)

drivers/infiniband/ulp/ipoib/ipoib_main.c

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,23 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
357357

358358
#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
359359

360+
void ipoib_mark_paths_invalid(struct net_device *dev)
361+
{
362+
struct ipoib_dev_priv *priv = netdev_priv(dev);
363+
struct ipoib_path *path, *tp;
364+
365+
spin_lock_irq(&priv->lock);
366+
367+
list_for_each_entry_safe(path, tp, &priv->path_list, list) {
368+
ipoib_dbg(priv, "mark path LID 0x%04x GID " IPOIB_GID_FMT " invalid\n",
369+
be16_to_cpu(path->pathrec.dlid),
370+
IPOIB_GID_ARG(path->pathrec.dgid));
371+
path->valid = 0;
372+
}
373+
374+
spin_unlock_irq(&priv->lock);
375+
}
376+
360377
void ipoib_flush_paths(struct net_device *dev)
361378
{
362379
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -393,6 +410,7 @@ static void path_rec_completion(int status,
393410
struct net_device *dev = path->dev;
394411
struct ipoib_dev_priv *priv = netdev_priv(dev);
395412
struct ipoib_ah *ah = NULL;
413+
struct ipoib_ah *old_ah;
396414
struct ipoib_neigh *neigh, *tn;
397415
struct sk_buff_head skqueue;
398416
struct sk_buff *skb;
@@ -416,6 +434,7 @@ static void path_rec_completion(int status,
416434

417435
spin_lock_irqsave(&priv->lock, flags);
418436

437+
old_ah = path->ah;
419438
path->ah = ah;
420439

421440
if (ah) {
@@ -428,6 +447,17 @@ static void path_rec_completion(int status,
428447
__skb_queue_tail(&skqueue, skb);
429448

430449
list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
450+
if (neigh->ah) {
451+
WARN_ON(neigh->ah != old_ah);
452+
/*
453+
* Dropping the ah reference inside
454+
* priv->lock is safe here, because we
455+
* will hold one more reference from
456+
* the original value of path->ah (ie
457+
* old_ah).
458+
*/
459+
ipoib_put_ah(neigh->ah);
460+
}
431461
kref_get(&path->ah->ref);
432462
neigh->ah = path->ah;
433463
memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
@@ -450,13 +480,17 @@ static void path_rec_completion(int status,
450480
while ((skb = __skb_dequeue(&neigh->queue)))
451481
__skb_queue_tail(&skqueue, skb);
452482
}
483+
path->valid = 1;
453484
}
454485

455486
path->query = NULL;
456487
complete(&path->done);
457488

458489
spin_unlock_irqrestore(&priv->lock, flags);
459490

491+
if (old_ah)
492+
ipoib_put_ah(old_ah);
493+
460494
while ((skb = __skb_dequeue(&skqueue))) {
461495
skb->dev = dev;
462496
if (dev_queue_xmit(skb))
@@ -630,8 +664,9 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
630664
spin_lock(&priv->lock);
631665

632666
path = __path_find(dev, phdr->hwaddr + 4);
633-
if (!path) {
634-
path = path_rec_create(dev, phdr->hwaddr + 4);
667+
if (!path || !path->valid) {
668+
if (!path)
669+
path = path_rec_create(dev, phdr->hwaddr + 4);
635670
if (path) {
636671
/* put pseudoheader back on for next time */
637672
skb_push(skb, sizeof *phdr);
@@ -1046,9 +1081,10 @@ static void ipoib_setup(struct net_device *dev)
10461081
INIT_LIST_HEAD(&priv->multicast_list);
10471082

10481083
INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
1049-
INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
10501084
INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
1051-
INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush);
1085+
INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
1086+
INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
1087+
INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
10521088
INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
10531089
INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
10541090
}

drivers/infiniband/ulp/ipoib/ipoib_verbs.c

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -290,15 +290,17 @@ void ipoib_event(struct ib_event_handler *handler,
290290
if (record->element.port_num != priv->port)
291291
return;
292292

293-
if (record->event == IB_EVENT_PORT_ERR ||
294-
record->event == IB_EVENT_PORT_ACTIVE ||
295-
record->event == IB_EVENT_LID_CHANGE ||
296-
record->event == IB_EVENT_SM_CHANGE ||
293+
ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
294+
record->device->name, record->element.port_num);
295+
296+
if (record->event == IB_EVENT_SM_CHANGE ||
297297
record->event == IB_EVENT_CLIENT_REREGISTER) {
298-
ipoib_dbg(priv, "Port state change event\n");
299-
queue_work(ipoib_workqueue, &priv->flush_task);
298+
queue_work(ipoib_workqueue, &priv->flush_light);
299+
} else if (record->event == IB_EVENT_PORT_ERR ||
300+
record->event == IB_EVENT_PORT_ACTIVE ||
301+
record->event == IB_EVENT_LID_CHANGE) {
302+
queue_work(ipoib_workqueue, &priv->flush_normal);
300303
} else if (record->event == IB_EVENT_PKEY_CHANGE) {
301-
ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
302-
queue_work(ipoib_workqueue, &priv->pkey_event_task);
304+
queue_work(ipoib_workqueue, &priv->flush_heavy);
303305
}
304306
}

0 commit comments

Comments
 (0)