Skip to content

Commit 4d19ae2

Browse files
Hakon-Bugge authored and jfvogel committed
net/rds: Implement ARP flushing correctly
If a remote peer has moved its IP address from one port to the other, the local node may have an incorrect ARP entry in its cache. During connection management, we will then get back a route-error-event from the CM. Current code attempts to flush the ARP entry from the cache. However, 1) it does not check for return values, 2) it does not supply the device name, 3) it does not iterate over all possible device names, and 4) it doesn't supply the correct flags. Due to 2-4 above, the flushing doesn't work. This commit fixes this. On a system with a single CX-3 and 16 VFs, fail-over just after a fail-back is reduced from ~60 seconds down to ~10 seconds with the fix (1156 RDS connections). The fix for UEK5 is slightly more complicated compared to the UEK4 variants, because rdmaip has moved stuff out of the rds_rdma module and due to RoCE. Hence, this commit detects possible IB link-layers and flushes the ARP cache for the possible devices accordingly. This is a temporary fix and should be moved out of the rds_rdma module and into the rdmaip module, as tracked by ER 28341928 - Move ARP flushing logic from rds_rdma to rdmaip. V1 -> V2: * Added correct use of netmask for the ATF_PUBL flag (Ka-Cheong) * Moved the link-layer detected flags into the rds_ib_transport struct (Ka-Cheong) V2 -> V3: * Added to commit message that this is a temporary fix (Santosh) * Added Santosh's r-b Orabug: 28219823 Signed-off-by: Håkon Bugge <[email protected]> Reviewed-by: [email protected]
1 parent 55fe097 commit 4d19ae2

File tree

4 files changed

+131
-23
lines changed

4 files changed

+131
-23
lines changed

net/rds/ib.c

Lines changed: 126 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,10 @@
3030
* SOFTWARE.
3131
*
3232
*/
33+
#include <linux/if_arp.h>
34+
#include <linux/sockios.h>
3335
#include <net/addrconf.h>
36+
#include <net/inet_common.h>
3437

3538
#include "ib.h"
3639
#include "rds_single_path.h"
@@ -463,6 +466,34 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
463466
return ret;
464467
}
465468

469+
/* Detect possible link-layers in order to flush ARP correctly */
470+
static void detect_link_layers(struct ib_device *ibdev)
471+
{
472+
if (ibdev->get_link_layer) {
473+
u8 port;
474+
475+
for (port = 1; port <= ibdev->phys_port_cnt; ++port) {
476+
switch (ibdev->get_link_layer(ibdev, port)) {
477+
case IB_LINK_LAYER_UNSPECIFIED:
478+
rds_ib_transport.t_ll_ib_detected = true;
479+
rds_ib_transport.t_ll_eth_detected = true;
480+
break;
481+
482+
case IB_LINK_LAYER_INFINIBAND:
483+
rds_ib_transport.t_ll_ib_detected = true;
484+
break;
485+
486+
case IB_LINK_LAYER_ETHERNET:
487+
rds_ib_transport.t_ll_eth_detected = true;
488+
break;
489+
}
490+
}
491+
} else {
492+
rds_ib_transport.t_ll_ib_detected = true;
493+
rds_ib_transport.t_ll_eth_detected = true;
494+
}
495+
}
496+
466497
void rds_ib_add_one(struct ib_device *device)
467498
{
468499
struct rds_ib_device *rds_ibdev;
@@ -477,6 +508,8 @@ void rds_ib_add_one(struct ib_device *device)
477508
if (device->node_type != RDMA_NODE_IB_CA)
478509
return;
479510

511+
detect_link_layers(device);
512+
480513
dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL);
481514
if (!dev_attr)
482515
return;
@@ -772,5 +805,97 @@ int rds_ib_inc_to_skb(struct rds_incoming *inc, struct sk_buff *skb)
772805
return ret;
773806
}
774807

775-
MODULE_LICENSE("GPL");
808+
static void __flush_arp_entry(struct arpreq *r, char name[IFNAMSIZ])
809+
{
810+
int ret;
776811

812+
r->arp_flags = ATF_PERM;
813+
((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr = htonl(0);
814+
strcpy(r->arp_dev, name);
815+
ret = inet_ioctl(rds_ib_inet_socket, SIOCDARP, (unsigned long)r);
816+
if ((ret == -ENOENT) || (ret == -ENXIO)) {
817+
r->arp_flags |= ATF_PUBL;
818+
((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr = htonl(0xFFFFFFFF);
819+
ret = inet_ioctl(rds_ib_inet_socket, SIOCDARP, (unsigned long)r);
820+
}
821+
822+
if (ret && (ret != -ENOENT) && (ret != -ENXIO))
823+
pr_err("SIOCDARP failed, err %d, addr %pI4, flags 0x%x, device %s\n",
824+
ret, &((struct sockaddr_in *)r)->sin_addr.s_addr,
825+
r->arp_flags, r->arp_dev);
826+
}
827+
828+
static void __flush_eth_arp_entry(struct arpreq *r)
829+
{
830+
struct rds_ib_device *rds_ibdev;
831+
832+
down_read(&rds_ib_devices_lock);
833+
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
834+
struct ib_device *ibdev = rds_ibdev->dev;
835+
u8 port;
836+
837+
if (!ibdev->get_netdev)
838+
continue;
839+
840+
for (port = 1; port <= ibdev->phys_port_cnt; ++port) {
841+
struct net_device *ndev = ibdev->get_netdev(ibdev, port);
842+
843+
if (ndev)
844+
__flush_arp_entry(r, ndev->name);
845+
}
846+
}
847+
up_read(&rds_ib_devices_lock);
848+
}
849+
850+
static void __flush_ib_arp_entry(struct arpreq *r)
851+
{
852+
struct net_device *ndev;
853+
854+
read_lock(&dev_base_lock);
855+
for_each_netdev(&init_net, ndev)
856+
if (ndev->type == ARPHRD_INFINIBAND)
857+
__flush_arp_entry(r, ndev->name);
858+
read_unlock(&dev_base_lock);
859+
}
860+
861+
void rds_ib_flush_arp_entry(struct in6_addr *prot_addr)
862+
{
863+
struct sockaddr_in *sin;
864+
struct page *page;
865+
struct arpreq *r;
866+
867+
if (!ipv6_addr_v4mapped(prot_addr)) {
868+
/* Addressed by bug 28220027 */
869+
pr_err("IPv6 addresses are not flushed from ARP cache");
870+
return;
871+
}
872+
873+
page = alloc_page(GFP_HIGHUSER);
874+
if (!page) {
875+
pr_err("alloc_page failed");
876+
return;
877+
}
878+
879+
r = (struct arpreq *)kmap(page);
880+
if (!r) {
881+
pr_err("kmap failed");
882+
goto out_free;
883+
}
884+
885+
memset(r, 0, sizeof(struct arpreq));
886+
sin = (struct sockaddr_in *)&r->arp_pa;
887+
sin->sin_family = AF_INET;
888+
sin->sin_addr.s_addr = prot_addr->s6_addr32[3];
889+
890+
if (rds_ib_transport.t_ll_eth_detected)
891+
__flush_eth_arp_entry(r);
892+
if (rds_ib_transport.t_ll_ib_detected)
893+
__flush_ib_arp_entry(r);
894+
895+
kunmap(page);
896+
897+
out_free:
898+
__free_page(page);
899+
}
900+
901+
MODULE_LICENSE("GPL");

net/rds/ib.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,7 @@ extern struct workqueue_struct *rds_aux_wq;
467467
extern struct rds_transport rds_ib_transport;
468468
extern void rds_ib_add_one(struct ib_device *device);
469469
extern void rds_ib_remove_one(struct ib_device *device, void *client_data);
470+
extern void rds_ib_flush_arp_entry(struct in6_addr *prot_addr);
470471
void rds_ib_srq_exit(struct rds_ib_device *rds_ibdev);
471472
int rds_ib_srq_init(struct rds_ib_device *rds_ibdev);
472473

net/rds/rdma_transport.c

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,6 @@ int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
9090
/* this can be null in the listening path */
9191
struct rds_connection *conn = cm_id->context;
9292
struct rds_transport *trans = &rds_ib_transport;
93-
struct page *page;
94-
struct arpreq *r;
95-
struct sockaddr_in *sin;
9693
int ret = 0;
9794
int *err;
9895

@@ -178,24 +175,7 @@ int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
178175

179176
case RDMA_CM_EVENT_ROUTE_ERROR:
180177
/* IP might have been moved so flush the ARP entry and retry */
181-
page = alloc_page(GFP_HIGHUSER);
182-
if (!page) {
183-
printk(KERN_ERR "alloc_page failed .. NO MEM\n");
184-
ret = -ENOMEM;
185-
} else {
186-
if (ipv6_addr_v4mapped(&conn->c_faddr)) {
187-
r = (struct arpreq *)kmap(page);
188-
memset(r, 0, sizeof(struct arpreq));
189-
sin = (struct sockaddr_in *)&r->arp_pa;
190-
sin->sin_family = AF_INET;
191-
sin->sin_addr.s_addr =
192-
conn->c_faddr.s6_addr32[3];
193-
inet_ioctl(rds_ib_inet_socket, SIOCDARP,
194-
(unsigned long)r);
195-
kunmap(page);
196-
__free_page(page);
197-
}
198-
}
178+
rds_ib_flush_arp_entry(&conn->c_faddr);
199179

200180
if (conn) {
201181
rds_rtd_ptr(RDS_RTD_ERR,

net/rds/rds.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -663,7 +663,9 @@ struct rds_transport {
663663
struct list_head t_item;
664664
struct module *t_owner;
665665
unsigned int t_prefer_loopback:1,
666-
t_mp_capable:1;
666+
t_mp_capable:1,
667+
t_ll_ib_detected:1,
668+
t_ll_eth_detected:1;
667669
unsigned int t_type;
668670

669671
int (*laddr_check)(struct net *net, const struct in6_addr *addr,

0 commit comments

Comments
 (0)