Skip to content

Commit ee122c7

Browse files
tgraf authored and davem330 committed
vxlan: Flow based tunneling
Allows putting a VXLAN device into a new flow-based mode in which skbs with an ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there, with the VXLAN device defaults taken into consideration. Similarly, on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate an ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfilter can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collection based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems, which allows supporting encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device, which improves the scalability. It also allows for OVS to leverage this mode, which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubbed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubbing, which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <[email protected]> Signed-off-by: Pravin B Shelar <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 0accfc2 commit ee122c7

File tree

6 files changed

+165
-23
lines changed

6 files changed

+165
-23
lines changed

drivers/net/vxlan.c

Lines changed: 127 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@
4949
#include <net/ip6_tunnel.h>
5050
#include <net/ip6_checksum.h>
5151
#endif
52+
#include <net/dst_metadata.h>
5253

5354
#define VXLAN_VERSION "0.1"
5455

@@ -140,6 +141,11 @@ struct vxlan_dev {
140141
static u32 vxlan_salt __read_mostly;
141142
static struct workqueue_struct *vxlan_wq;
142143

144+
static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
145+
{
146+
return vs->flags & VXLAN_F_COLLECT_METADATA;
147+
}
148+
143149
#if IS_ENABLED(CONFIG_IPV6)
144150
static inline
145151
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
@@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
11641170
/* Callback from net/ipv4/udp.c to receive packets */
11651171
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
11661172
{
1173+
struct metadata_dst *tun_dst = NULL;
1174+
struct ip_tunnel_info *info;
11671175
struct vxlan_sock *vs;
11681176
struct vxlanhdr *vxh;
11691177
u32 flags, vni;
1170-
struct vxlan_metadata md = {0};
1178+
struct vxlan_metadata _md;
1179+
struct vxlan_metadata *md = &_md;
11711180

11721181
/* Need Vxlan and inner Ethernet header to be present */
11731182
if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -1202,20 +1211,50 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
12021211
vni &= VXLAN_VNI_MASK;
12031212
}
12041213

1214+
if (vxlan_collect_metadata(vs)) {
1215+
const struct iphdr *iph = ip_hdr(skb);
1216+
1217+
tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC);
1218+
if (!tun_dst)
1219+
goto drop;
1220+
1221+
info = &tun_dst->u.tun_info;
1222+
info->key.ipv4_src = iph->saddr;
1223+
info->key.ipv4_dst = iph->daddr;
1224+
info->key.ipv4_tos = iph->tos;
1225+
info->key.ipv4_ttl = iph->ttl;
1226+
info->key.tp_src = udp_hdr(skb)->source;
1227+
info->key.tp_dst = udp_hdr(skb)->dest;
1228+
1229+
info->mode = IP_TUNNEL_INFO_RX;
1230+
info->key.tun_flags = TUNNEL_KEY;
1231+
info->key.tun_id = cpu_to_be64(vni >> 8);
1232+
if (udp_hdr(skb)->check != 0)
1233+
info->key.tun_flags |= TUNNEL_CSUM;
1234+
1235+
md = ip_tunnel_info_opts(info, sizeof(*md));
1236+
md->tun_dst = tun_dst;
1237+
} else {
1238+
memset(md, 0, sizeof(*md));
1239+
}
1240+
12051241
/* For backwards compatibility, only allow reserved fields to be
12061242
* used by VXLAN extensions if explicitly requested.
12071243
*/
12081244
if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) {
12091245
struct vxlanhdr_gbp *gbp;
12101246

12111247
gbp = (struct vxlanhdr_gbp *)vxh;
1212-
md.gbp = ntohs(gbp->policy_id);
1248+
md->gbp = ntohs(gbp->policy_id);
1249+
1250+
if (tun_dst)
1251+
info->key.tun_flags |= TUNNEL_VXLAN_OPT;
12131252

12141253
if (gbp->dont_learn)
1215-
md.gbp |= VXLAN_GBP_DONT_LEARN;
1254+
md->gbp |= VXLAN_GBP_DONT_LEARN;
12161255

12171256
if (gbp->policy_applied)
1218-
md.gbp |= VXLAN_GBP_POLICY_APPLIED;
1257+
md->gbp |= VXLAN_GBP_POLICY_APPLIED;
12191258

12201259
flags &= ~VXLAN_GBP_USED_BITS;
12211260
}
@@ -1233,8 +1272,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
12331272
goto bad_flags;
12341273
}
12351274

1236-
md.vni = vxh->vx_vni;
1237-
vs->rcv(vs, skb, &md);
1275+
md->vni = vxh->vx_vni;
1276+
vs->rcv(vs, skb, md);
12381277
return 0;
12391278

12401279
drop:
@@ -1247,6 +1286,9 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
12471286
ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
12481287

12491288
error:
1289+
if (tun_dst)
1290+
dst_release((struct dst_entry *)tun_dst);
1291+
12501292
/* Return non vxlan pkt */
12511293
return 1;
12521294
}
@@ -1263,7 +1305,12 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
12631305
int err = 0;
12641306
union vxlan_addr *remote_ip;
12651307

1266-
vni = ntohl(md->vni) >> 8;
1308+
/* For flow based devices, map all packets to VNI 0 */
1309+
if (vs->flags & VXLAN_F_FLOW_BASED)
1310+
vni = 0;
1311+
else
1312+
vni = ntohl(md->vni) >> 8;
1313+
12671314
/* Is this VNI defined? */
12681315
vxlan = vxlan_vs_find_vni(vs, vni);
12691316
if (!vxlan)
@@ -1292,12 +1339,19 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
12921339
#endif
12931340
}
12941341

1342+
if (md->tun_dst) {
1343+
skb_dst_set(skb, (struct dst_entry *)md->tun_dst);
1344+
md->tun_dst = NULL;
1345+
}
1346+
12951347
if ((vxlan->flags & VXLAN_F_LEARN) &&
12961348
vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
12971349
goto drop;
12981350

12991351
skb_reset_network_header(skb);
1300-
skb->mark = md->gbp;
1352+
/* In flow-based mode, GBP is carried in dst_metadata */
1353+
if (!(vs->flags & VXLAN_F_FLOW_BASED))
1354+
skb->mark = md->gbp;
13011355

13021356
if (oip6)
13031357
err = IP6_ECN_decapsulate(oip6, skb);
@@ -1330,6 +1384,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
13301384

13311385
return;
13321386
drop:
1387+
if (md->tun_dst)
1388+
dst_release((struct dst_entry *)md->tun_dst);
1389+
13331390
/* Consume bad packet */
13341391
kfree_skb(skb);
13351392
}
@@ -1878,22 +1935,40 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
18781935
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
18791936
struct vxlan_rdst *rdst, bool did_rsc)
18801937
{
1938+
struct ip_tunnel_info *info = skb_tunnel_info(skb);
18811939
struct vxlan_dev *vxlan = netdev_priv(dev);
18821940
struct sock *sk = vxlan->vn_sock->sock->sk;
18831941
struct rtable *rt = NULL;
18841942
const struct iphdr *old_iph;
18851943
struct flowi4 fl4;
18861944
union vxlan_addr *dst;
1887-
struct vxlan_metadata md;
1945+
union vxlan_addr remote_ip;
1946+
struct vxlan_metadata _md;
1947+
struct vxlan_metadata *md = &_md;
18881948
__be16 src_port = 0, dst_port;
18891949
u32 vni;
18901950
__be16 df = 0;
18911951
__u8 tos, ttl;
18921952
int err;
1953+
u32 flags = vxlan->flags;
18931954

1894-
dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
1895-
vni = rdst->remote_vni;
1896-
dst = &rdst->remote_ip;
1955+
if (rdst) {
1956+
dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
1957+
vni = rdst->remote_vni;
1958+
dst = &rdst->remote_ip;
1959+
} else {
1960+
if (!info) {
1961+
WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
1962+
dev->name);
1963+
goto drop;
1964+
}
1965+
1966+
dst_port = info->key.tp_dst ? : vxlan->dst_port;
1967+
vni = be64_to_cpu(info->key.tun_id);
1968+
remote_ip.sin.sin_family = AF_INET;
1969+
remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst;
1970+
dst = &remote_ip;
1971+
}
18971972

18981973
if (vxlan_addr_any(dst)) {
18991974
if (did_rsc) {
@@ -1918,8 +1993,25 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
19181993
vxlan->port_max, true);
19191994

19201995
if (dst->sa.sa_family == AF_INET) {
1996+
if (info) {
1997+
if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
1998+
df = htons(IP_DF);
1999+
if (info->key.tun_flags & TUNNEL_CSUM)
2000+
flags |= VXLAN_F_UDP_CSUM;
2001+
else
2002+
flags &= ~VXLAN_F_UDP_CSUM;
2003+
2004+
ttl = info->key.ipv4_ttl;
2005+
tos = info->key.ipv4_tos;
2006+
2007+
if (info->options_len)
2008+
md = ip_tunnel_info_opts(info, sizeof(*md));
2009+
} else {
2010+
md->gbp = skb->mark;
2011+
}
2012+
19212013
memset(&fl4, 0, sizeof(fl4));
1922-
fl4.flowi4_oif = rdst->remote_ifindex;
2014+
fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0;
19232015
fl4.flowi4_tos = RT_TOS(tos);
19242016
fl4.flowi4_mark = skb->mark;
19252017
fl4.flowi4_proto = IPPROTO_UDP;
@@ -1958,14 +2050,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
19582050

19592051
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
19602052
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
1961-
md.vni = htonl(vni << 8);
1962-
md.gbp = skb->mark;
1963-
2053+
md->vni = htonl(vni << 8);
19642054
err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,
19652055
dst->sin.sin_addr.s_addr, tos, ttl, df,
1966-
src_port, dst_port, &md,
2056+
src_port, dst_port, md,
19672057
!net_eq(vxlan->net, dev_net(vxlan->dev)),
1968-
vxlan->flags);
2058+
flags);
19692059
if (err < 0) {
19702060
/* skb is already freed. */
19712061
skb = NULL;
@@ -1980,7 +2070,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
19802070
u32 flags;
19812071

19822072
memset(&fl6, 0, sizeof(fl6));
1983-
fl6.flowi6_oif = rdst->remote_ifindex;
2073+
fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
19842074
fl6.daddr = dst->sin6.sin6_addr;
19852075
fl6.saddr = vxlan->saddr.sin6.sin6_addr;
19862076
fl6.flowi6_mark = skb->mark;
@@ -2018,11 +2108,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
20182108
}
20192109

20202110
ttl = ttl ? : ip6_dst_hoplimit(ndst);
2021-
md.vni = htonl(vni << 8);
2022-
md.gbp = skb->mark;
2111+
md->vni = htonl(vni << 8);
2112+
md->gbp = skb->mark;
20232113

20242114
err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
2025-
0, ttl, src_port, dst_port, &md,
2115+
0, ttl, src_port, dst_port, md,
20262116
!net_eq(vxlan->net, dev_net(vxlan->dev)),
20272117
vxlan->flags);
20282118
#endif
@@ -2051,6 +2141,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
20512141
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
20522142
{
20532143
struct vxlan_dev *vxlan = netdev_priv(dev);
2144+
const struct ip_tunnel_info *info = skb_tunnel_info(skb);
20542145
struct ethhdr *eth;
20552146
bool did_rsc = false;
20562147
struct vxlan_rdst *rdst, *fdst = NULL;
@@ -2078,6 +2169,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
20782169
#endif
20792170
}
20802171

2172+
if (vxlan->flags & VXLAN_F_FLOW_BASED &&
2173+
info && info->mode == IP_TUNNEL_INFO_TX) {
2174+
vxlan_xmit_one(skb, dev, NULL, false);
2175+
return NETDEV_TX_OK;
2176+
}
2177+
20812178
f = vxlan_find_mac(vxlan, eth->h_dest);
20822179
did_rsc = false;
20832180

@@ -2405,6 +2502,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
24052502
[IFLA_VXLAN_RSC] = { .type = NLA_U8 },
24062503
[IFLA_VXLAN_L2MISS] = { .type = NLA_U8 },
24072504
[IFLA_VXLAN_L3MISS] = { .type = NLA_U8 },
2505+
[IFLA_VXLAN_FLOWBASED] = { .type = NLA_U8 },
24082506
[IFLA_VXLAN_PORT] = { .type = NLA_U16 },
24092507
[IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 },
24102508
[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
@@ -2681,6 +2779,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
26812779
if (data[IFLA_VXLAN_LIMIT])
26822780
vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
26832781

2782+
if (data[IFLA_VXLAN_FLOWBASED] &&
2783+
nla_get_u8(data[IFLA_VXLAN_FLOWBASED]))
2784+
vxlan->flags |= VXLAN_F_FLOW_BASED;
2785+
26842786
if (data[IFLA_VXLAN_PORT_RANGE]) {
26852787
const struct ifla_vxlan_port_range *p
26862788
= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
@@ -2777,6 +2879,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
27772879
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */
27782880
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */
27792881
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */
2882+
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_FLOWBASED */
27802883
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
27812884
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
27822885
nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
@@ -2843,6 +2946,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
28432946
!!(vxlan->flags & VXLAN_F_L2MISS)) ||
28442947
nla_put_u8(skb, IFLA_VXLAN_L3MISS,
28452948
!!(vxlan->flags & VXLAN_F_L3MISS)) ||
2949+
nla_put_u8(skb, IFLA_VXLAN_FLOWBASED,
2950+
!!(vxlan->flags & VXLAN_F_FLOW_BASED)) ||
28462951
nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
28472952
nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) ||
28482953
nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) ||

include/linux/skbuff.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3469,5 +3469,6 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
34693469
skb_network_header(skb);
34703470
return hdr_len + skb_gso_transport_seglen(skb);
34713471
}
3472+
34723473
#endif /* __KERNEL__ */
34733474
#endif /* _LINUX_SKBUFF_H */

include/net/dst_metadata.h

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,9 @@
88
struct metadata_dst {
99
struct dst_entry dst;
1010
size_t opts_len;
11+
union {
12+
struct ip_tunnel_info tun_info;
13+
} u;
1114
};
1215

1316
static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
@@ -20,6 +23,16 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
2023
return NULL;
2124
}
2225

26+
static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
27+
{
28+
struct metadata_dst *md_dst = skb_metadata_dst(skb);
29+
30+
if (md_dst)
31+
return &md_dst->u.tun_info;
32+
33+
return NULL;
34+
}
35+
2336
static inline bool skb_valid_dst(const struct sk_buff *skb)
2437
{
2538
struct dst_entry *dst = skb_dst(skb);

include/net/ip_tunnels.h

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,10 +38,19 @@ struct ip_tunnel_key {
3838
__be16 tp_dst;
3939
} __packed __aligned(4); /* Minimize padding. */
4040

41+
/* Indicates whether the tunnel info structure represents receive
42+
* or transmit tunnel parameters.
43+
*/
44+
enum {
45+
IP_TUNNEL_INFO_RX,
46+
IP_TUNNEL_INFO_TX,
47+
};
48+
4149
struct ip_tunnel_info {
4250
struct ip_tunnel_key key;
4351
const void *options;
4452
u8 options_len;
53+
u8 mode;
4554
};
4655

4756
/* 6rd prefix/relay information */
@@ -284,6 +293,11 @@ static inline void iptunnel_xmit_stats(int err,
284293
}
285294
}
286295

296+
static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n)
297+
{
298+
return info + 1;
299+
}
300+
287301
#endif /* CONFIG_INET */
288302

289303
#endif /* __NET_IP_TUNNELS_H */

0 commit comments

Comments
 (0)