Skip to content

Commit 4fbae7d

Browse files
Mahesh Bandewar and davem330
authored and committed
ipvlan: Introduce l3s mode
In a typical IPvlan L3 setup, the master is in the default-ns and each slave is in a different (slave) ns. In this setup, egress packet processing for traffic originating from a slave-ns will hit all NF_HOOKs in the slave-ns as well as the default-ns. However, the same is not true for ingress processing: all these NF_HOOKs are hit only in the slave-ns, skipping them in the default-ns. IPvlan in L3 mode is restrictive, and if admins want to deploy iptables rules in the default-ns, this asymmetric data path makes it impossible to do so. This patch makes use of l3_rcv() (added as part of the l3mdev enhancements) to perform an input route lookup on RX packets without changing skb->dev, and then uses an nf_hook at NF_INET_LOCAL_IN to change skb->dev just before handing the skb over to L4. Signed-off-by: Mahesh Bandewar <[email protected]> CC: David Ahern <[email protected]> Reviewed-by: David Ahern <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent e8bffe0 commit 4fbae7d

File tree

6 files changed

+188
-8
lines changed

6 files changed

+188
-8
lines changed

Documentation/networking/ipvlan.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module
2222
There are no module parameters for this driver and it can be configured
2323
using IProute2/ip utility.
2424

25-
ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | L3 }
25+
ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | l3 | l3s }
2626

2727
e.g. ip link add link eth0 ipvl0 type ipvlan mode l2
2828

@@ -48,6 +48,11 @@ master device for the L2 processing and routing from that instance will be
4848
used before packets are queued on the outbound device. In this mode the slaves
4949
will not receive nor can send multicast / broadcast traffic.
5050

51+
4.3 L3S mode:
52+
This is very similar to the L3 mode except that iptables (conn-tracking)
53+
works in this mode and hence it is L3-symmetric (L3s). This will have slightly less
54+
performance but that shouldn't matter since you are choosing this mode over plain-L3
55+
mode to make conn-tracking work.
5156

5257
5. What to choose (macvlan vs. ipvlan)?
5358
These two devices are very similar in many regards and the specific use

drivers/net/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ config IPVLAN
149149
tristate "IP-VLAN support"
150150
depends on INET
151151
depends on IPV6
152+
depends on NET_L3_MASTER_DEV
152153
---help---
153154
This allows one to create virtual devices off of a main interface
154155
and packets will be delivered based on the dest L3 (IPv6/IPv4 addr)

drivers/net/ipvlan/ipvlan.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,13 @@
2323
#include <linux/if_vlan.h>
2424
#include <linux/ip.h>
2525
#include <linux/inetdevice.h>
26+
#include <linux/netfilter.h>
2627
#include <net/ip.h>
2728
#include <net/ip6_route.h>
2829
#include <net/rtnetlink.h>
2930
#include <net/route.h>
3031
#include <net/addrconf.h>
32+
#include <net/l3mdev.h>
3133

3234
#define IPVLAN_DRV "ipvlan"
3335
#define IPV_DRV_VER "0.1"
@@ -124,4 +126,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
124126
const void *iaddr, bool is_v6);
125127
bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
126128
void ipvlan_ht_addr_del(struct ipvl_addr *addr);
129+
struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb,
130+
u16 proto);
131+
unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
132+
const struct nf_hook_state *state);
127133
#endif /* __IPVLAN_H */

drivers/net/ipvlan/ipvlan_core.c

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
560560
case IPVLAN_MODE_L2:
561561
return ipvlan_xmit_mode_l2(skb, dev);
562562
case IPVLAN_MODE_L3:
563+
case IPVLAN_MODE_L3S:
563564
return ipvlan_xmit_mode_l3(skb, dev);
564565
}
565566

@@ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
664665
return ipvlan_handle_mode_l2(pskb, port);
665666
case IPVLAN_MODE_L3:
666667
return ipvlan_handle_mode_l3(pskb, port);
668+
case IPVLAN_MODE_L3S:
669+
return RX_HANDLER_PASS;
667670
}
668671

669672
/* Should not reach here */
@@ -672,3 +675,94 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
672675
kfree_skb(skb);
673676
return RX_HANDLER_CONSUMED;
674677
}
678+
679+
/*
 * Look up the ipvlan address entry that matches @skb on the ipvlan port @dev.
 *
 * Returns NULL if @dev is NULL or is not an ipvlan port, if the port is
 * missing or is not in L3S mode, or if no L3 header could be obtained from
 * @skb. Otherwise returns whatever ipvlan_addr_lookup() yields.
 */
static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb,
680+
struct net_device *dev)
681+
{
682+
struct ipvl_addr *addr = NULL;
683+
struct ipvl_port *port;
684+
void *lyr3h;
685+
int addr_type;
686+
687+
if (!dev || !netif_is_ipvlan_port(dev))
688+
goto out;
689+
/* NOTE(review): the _rcu accessor suggests callers must be inside an RCU
 * read-side section - confirm against the call sites.
 */
690+
port = ipvlan_port_get_rcu(dev);
691+
if (!port || port->mode != IPVLAN_MODE_L3S)
692+
goto out;
693+
694+
lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
695+
if (!lyr3h)
696+
goto out;
697+
/* NOTE(review): the trailing `true` presumably selects a lookup by
 * destination address - confirm against ipvlan_addr_lookup().
 */
698+
addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);
699+
out:
700+
return addr;
701+
}
702+
703+
/*
 * l3mdev receive handler (referenced by ipvl_l3mdev_ops.l3mdev_l3_rcv).
 *
 * When @skb matches an ipvlan address on the L3S port @dev, perform the input
 * route lookup with addr->master->dev as the input device (presumably the
 * slave net_device owning the address - confirm), while leaving skb->dev
 * untouched. @proto is compared against AF_INET and AF_INET6; any other
 * value is ignored.
 *
 * Always returns @skb, whether or not a route was attached.
 */
struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb,
704+
u16 proto)
705+
{
706+
struct ipvl_addr *addr;
707+
struct net_device *sdev;
708+
709+
addr = ipvlan_skb_to_addr(skb, dev);
710+
if (!addr)
711+
goto out;
712+
713+
sdev = addr->master->dev;
714+
switch (proto) {
715+
case AF_INET:
716+
{
717+
int err;
718+
struct iphdr *ip4h = ip_hdr(skb);
719+
/* On lookup failure the skb is returned as-is; the error is not
 * propagated to the caller.
 */
720+
err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr,
721+
ip4h->tos, sdev);
722+
if (unlikely(err))
723+
goto out;
724+
break;
725+
}
726+
case AF_INET6:
727+
{
728+
struct dst_entry *dst;
729+
struct ipv6hdr *ip6h = ipv6_hdr(skb);
730+
int flags = RT6_LOOKUP_F_HAS_SADDR;
731+
struct flowi6 fl6 = {
732+
.flowi6_iif = sdev->ifindex,
733+
.daddr = ip6h->daddr,
734+
.saddr = ip6h->saddr,
735+
.flowlabel = ip6_flowinfo(ip6h),
736+
.flowi6_mark = skb->mark,
737+
.flowi6_proto = ip6h->nexthdr,
738+
};
739+
/* Replace any dst already attached to the skb with the result of the
 * IPv6 input route lookup.
 * NOTE(review): the lookup result is attached without any check -
 * confirm ip6_route_input_lookup() cannot return NULL here.
 */
740+
skb_dst_drop(skb);
741+
dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags);
742+
skb_dst_set(skb, dst);
743+
break;
744+
}
745+
default:
746+
break;
747+
}
748+
749+
out:
750+
return skb;
751+
}
752+
753+
/*
 * Netfilter hook function (used by ipvl_nfops at NF_INET_LOCAL_IN for both
 * IPv4 and IPv6).
 *
 * If @skb matches an ipvlan address on an L3S port, repoint skb->dev to
 * addr->master->dev and account the packet through ipvlan_count_rx().
 * @priv and @state are not used. Always returns NF_ACCEPT.
 */
unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
754+
const struct nf_hook_state *state)
755+
{
756+
struct ipvl_addr *addr;
757+
unsigned int len;
758+
759+
addr = ipvlan_skb_to_addr(skb, skb->dev);
760+
if (!addr)
761+
goto out;
762+
763+
skb->dev = addr->master->dev;
/* NOTE(review): ETH_HLEN is added presumably because skb->len no longer
 * covers the Ethernet header at this hook - confirm. The meaning of the
 * two boolean arguments is defined by ipvlan_count_rx(), not shown here.
 */
764+
len = skb->len + ETH_HLEN;
765+
ipvlan_count_rx(addr->master, len, true, false);
766+
out:
767+
return NF_ACCEPT;
768+
}

drivers/net/ipvlan/ipvlan_main.c

Lines changed: 80 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,87 @@
99

1010
#include "ipvlan.h"
1111

12+
/* Number of users of the netfilter hooks below; the hooks are registered when
 * this goes from zero to one and unregistered when it drops back to zero
 * (see ipvlan_register_nf_hook() / ipvlan_unregister_nf_hook()).
 */
static u32 ipvl_nf_hook_refcnt = 0;
13+
14+
/* LOCAL_IN hooks for IPv4 and IPv6, both handled by ipvlan_nf_input().
 * INT_MAX is used as the priority; presumably this makes the hook run after
 * all other LOCAL_IN hooks - confirm against netfilter priority ordering.
 */
static struct nf_hook_ops ipvl_nfops[] __read_mostly = {
15+
{
16+
.hook = ipvlan_nf_input,
17+
.pf = NFPROTO_IPV4,
18+
.hooknum = NF_INET_LOCAL_IN,
19+
.priority = INT_MAX,
20+
},
21+
{
22+
.hook = ipvlan_nf_input,
23+
.pf = NFPROTO_IPV6,
24+
.hooknum = NF_INET_LOCAL_IN,
25+
.priority = INT_MAX,
26+
},
27+
};
28+
29+
/* l3mdev operations installed on the master device while its port is in L3S
 * mode; only the receive handler is provided.
 */
static struct l3mdev_ops ipvl_l3mdev_ops __read_mostly = {
30+
.l3mdev_l3_rcv = ipvlan_l3_rcv,
31+
};
32+
1233
static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
1334
{
1435
ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj;
1536
}
1637

17-
static void ipvlan_set_port_mode(struct ipvl_port *port, u16 nval)
38+
/*
 * Take a reference on the shared netfilter hooks, registering them on first
 * use.
 *
 * Returns 0 on success or the error from _nf_register_hooks(); on failure the
 * reference count stays at zero.
 *
 * NOTE(review): the counter is a plain u32 and nothing is locked here. The
 * only caller visible in this file, ipvlan_set_port_mode(), asserts RTNL -
 * confirm that every caller holds it.
 */
static int ipvlan_register_nf_hook(void)
39+
{
40+
int err = 0;
41+
42+
if (!ipvl_nf_hook_refcnt) {
43+
err = _nf_register_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops));
44+
if (!err)
45+
ipvl_nf_hook_refcnt = 1;
46+
} else {
47+
ipvl_nf_hook_refcnt++;
48+
}
49+
50+
return err;
51+
}
52+
53+
/*
 * Drop a reference on the shared netfilter hooks and unregister them when the
 * last reference goes away.
 *
 * NOTE(review): WARN_ON() only reports an unbalanced call; the decrement
 * below still runs. With the count already at zero the u32 wraps to its
 * maximum value, the hooks are not unregistered, and later register calls
 * merely increment the counter. Consider
 * `if (WARN_ON(!ipvl_nf_hook_refcnt)) return;`.
 */
static void ipvlan_unregister_nf_hook(void)
54+
{
55+
WARN_ON(!ipvl_nf_hook_refcnt);
56+
57+
ipvl_nf_hook_refcnt--;
58+
if (!ipvl_nf_hook_refcnt)
59+
_nf_unregister_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops));
60+
}
61+
62+
/*
 * Switch @port to mode @nval. Does nothing if the port is already in that
 * mode.
 *
 * Entering L3S takes a reference on the netfilter hooks and installs the
 * l3mdev ops and IFF_L3MDEV_MASTER flag on the port's device; leaving L3S
 * undoes this. IFF_NOARP is then updated on every device in port->ipvlans.
 *
 * Asserts that RTNL is held. Returns 0 on success, or the error from
 * ipvlan_register_nf_hook(), in which case the port mode is left unchanged.
 */
static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval)
1863
{
1964
struct ipvl_dev *ipvlan;
65+
struct net_device *mdev = port->dev;
66+
int err = 0;
2067

68+
ASSERT_RTNL();
2169
if (port->mode != nval) {
70+
if (nval == IPVLAN_MODE_L3S) {
71+
/* New mode is L3S */
72+
err = ipvlan_register_nf_hook();
73+
if (!err) {
74+
mdev->l3mdev_ops = &ipvl_l3mdev_ops;
75+
mdev->priv_flags |= IFF_L3MDEV_MASTER;
76+
} else
77+
return err;
78+
} else if (port->mode == IPVLAN_MODE_L3S) {
79+
/* Old mode was L3S */
/* The flag is cleared before the ops pointer is reset; presumably so
 * that nothing treats the device as an l3mdev master once the ops are
 * gone - confirm the intended ordering.
 */
80+
mdev->priv_flags &= ~IFF_L3MDEV_MASTER;
81+
ipvlan_unregister_nf_hook();
82+
mdev->l3mdev_ops = NULL;
83+
}
2284
/* Both L3 and L3S set IFF_NOARP on the member devices; any other mode
 * clears it.
 */
list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
23-
if (nval == IPVLAN_MODE_L3)
85+
if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S)
2486
ipvlan->dev->flags |= IFF_NOARP;
2587
else
2688
ipvlan->dev->flags &= ~IFF_NOARP;
2789
}
2890
port->mode = nval;
2991
}
92+
return err;
3093
}
3194

3295
static int ipvlan_port_create(struct net_device *dev)
@@ -74,6 +137,11 @@ static void ipvlan_port_destroy(struct net_device *dev)
74137
struct ipvl_port *port = ipvlan_port_get_rtnl(dev);
75138

76139
dev->priv_flags &= ~IFF_IPVLAN_MASTER;
140+
if (port->mode == IPVLAN_MODE_L3S) {
141+
dev->priv_flags &= ~IFF_L3MDEV_MASTER;
142+
ipvlan_unregister_nf_hook();
143+
dev->l3mdev_ops = NULL;
144+
}
77145
netdev_rx_handler_unregister(dev);
78146
cancel_work_sync(&port->wq);
79147
__skb_queue_purge(&port->backlog);
@@ -132,7 +200,8 @@ static int ipvlan_open(struct net_device *dev)
132200
struct net_device *phy_dev = ipvlan->phy_dev;
133201
struct ipvl_addr *addr;
134202

135-
if (ipvlan->port->mode == IPVLAN_MODE_L3)
203+
if (ipvlan->port->mode == IPVLAN_MODE_L3 ||
204+
ipvlan->port->mode == IPVLAN_MODE_L3S)
136205
dev->flags |= IFF_NOARP;
137206
else
138207
dev->flags &= ~IFF_NOARP;
@@ -372,13 +441,14 @@ static int ipvlan_nl_changelink(struct net_device *dev,
372441
{
373442
struct ipvl_dev *ipvlan = netdev_priv(dev);
374443
struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
444+
int err = 0;
375445

376446
if (data && data[IFLA_IPVLAN_MODE]) {
377447
u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
378448

379-
ipvlan_set_port_mode(port, nmode);
449+
err = ipvlan_set_port_mode(port, nmode);
380450
}
381-
return 0;
451+
return err;
382452
}
383453

384454
static size_t ipvlan_nl_getsize(const struct net_device *dev)
@@ -473,10 +543,13 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
473543
unregister_netdevice(dev);
474544
return err;
475545
}
546+
err = ipvlan_set_port_mode(port, mode);
547+
if (err) {
548+
unregister_netdevice(dev);
549+
return err;
550+
}
476551

477552
list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
478-
ipvlan_set_port_mode(port, mode);
479-
480553
netif_stacked_transfer_operstate(phy_dev, dev);
481554
return 0;
482555
}

include/uapi/linux/if_link.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,7 @@ enum {
464464
/* Operating modes of an ipvlan port. */
enum ipvlan_mode {
465465
IPVLAN_MODE_L2 = 0,
466466
IPVLAN_MODE_L3,
467+
/* L3-symmetric: the ipvlan documentation describes it as very similar to
 * L3 mode except that iptables (conn-tracking) works.
 */
IPVLAN_MODE_L3S,
467468
/* NOTE(review): presumably an upper-bound sentinel rather than a selectable
 * mode - confirm against the netlink validation code.
 */
IPVLAN_MODE_MAX
468469
};
469470

0 commit comments

Comments
 (0)