 #include <net/ip6_tunnel.h>
 #include <net/ip6_checksum.h>
 #endif
+#include <net/dst_metadata.h>
 
 #define VXLAN_VERSION "0.1"
@@ -140,6 +141,11 @@ struct vxlan_dev {
 static u32 vxlan_salt __read_mostly;
 static struct workqueue_struct *vxlan_wq;
 
+static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
+{
+        return vs->flags & VXLAN_F_COLLECT_METADATA;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static inline
 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
@@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
+        struct metadata_dst *tun_dst = NULL;
+        struct ip_tunnel_info *info;
         struct vxlan_sock *vs;
         struct vxlanhdr *vxh;
         u32 flags, vni;
-        struct vxlan_metadata md = {0};
+        struct vxlan_metadata _md;
+        struct vxlan_metadata *md = &_md;
 
         /* Need Vxlan and inner Ethernet header to be present */
         if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -1202,20 +1211,50 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                 vni &= VXLAN_VNI_MASK;
         }
 
+        if (vxlan_collect_metadata(vs)) {
+                const struct iphdr *iph = ip_hdr(skb);
+
+                tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC);
+                if (!tun_dst)
+                        goto drop;
+
+                info = &tun_dst->u.tun_info;
+                info->key.ipv4_src = iph->saddr;
+                info->key.ipv4_dst = iph->daddr;
+                info->key.ipv4_tos = iph->tos;
+                info->key.ipv4_ttl = iph->ttl;
+                info->key.tp_src = udp_hdr(skb)->source;
+                info->key.tp_dst = udp_hdr(skb)->dest;
+
+                info->mode = IP_TUNNEL_INFO_RX;
+                info->key.tun_flags = TUNNEL_KEY;
+                info->key.tun_id = cpu_to_be64(vni >> 8);
+                if (udp_hdr(skb)->check != 0)
+                        info->key.tun_flags |= TUNNEL_CSUM;
+
+                md = ip_tunnel_info_opts(info, sizeof(*md));
+                md->tun_dst = tun_dst;
+        } else {
+                memset(md, 0, sizeof(*md));
+        }
+
         /* For backwards compatibility, only allow reserved fields to be
          * used by VXLAN extensions if explicitly requested.
          */
         if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) {
                 struct vxlanhdr_gbp *gbp;
 
                 gbp = (struct vxlanhdr_gbp *)vxh;
-                md.gbp = ntohs(gbp->policy_id);
+                md->gbp = ntohs(gbp->policy_id);
+
+                if (tun_dst)
+                        info->key.tun_flags |= TUNNEL_VXLAN_OPT;
 
                 if (gbp->dont_learn)
-                        md.gbp |= VXLAN_GBP_DONT_LEARN;
+                        md->gbp |= VXLAN_GBP_DONT_LEARN;
 
                 if (gbp->policy_applied)
-                        md.gbp |= VXLAN_GBP_POLICY_APPLIED;
+                        md->gbp |= VXLAN_GBP_POLICY_APPLIED;
 
                 flags &= ~VXLAN_GBP_USED_BITS;
         }
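The collect-metadata branch above stores the outer IPv4/UDP header fields and the VNI in a metadata_dst rather than consuming them inside the driver. A minimal consumer sketch follows; example_inspect() is hypothetical and assumes this tree's skb_tunnel_info() helper and ip_tunnel_info field names:

static void example_inspect(struct sk_buff *skb)
{
        struct ip_tunnel_info *info = skb_tunnel_info(skb);

        /* Only valid once vxlan_rcv() has attached the metadata_dst */
        if (info && info->mode == IP_TUNNEL_INFO_RX)
                pr_debug("vni %llu, outer %pI4 -> %pI4\n",
                         (unsigned long long)be64_to_cpu(info->key.tun_id),
                         &info->key.ipv4_src, &info->key.ipv4_dst);
}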
@@ -1233,8 +1272,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                 goto bad_flags;
         }
 
-        md.vni = vxh->vx_vni;
-        vs->rcv(vs, skb, &md);
+        md->vni = vxh->vx_vni;
+        vs->rcv(vs, skb, md);
         return 0;
 
 drop:
@@ -1247,6 +1286,9 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                    ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
 
 error:
+        if (tun_dst)
+                dst_release((struct dst_entry *)tun_dst);
+
         /* Return non vxlan pkt */
         return 1;
 }
@@ -1263,7 +1305,12 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
         int err = 0;
         union vxlan_addr *remote_ip;
 
-        vni = ntohl(md->vni) >> 8;
+        /* For flow based devices, map all packets to VNI 0 */
+        if (vs->flags & VXLAN_F_FLOW_BASED)
+                vni = 0;
+        else
+                vni = ntohl(md->vni) >> 8;
+
         /* Is this VNI defined? */
         vxlan = vxlan_vs_find_vni(vs, vni);
         if (!vxlan)
@@ -1292,12 +1339,19 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 #endif
         }
 
+        if (md->tun_dst) {
+                skb_dst_set(skb, (struct dst_entry *)md->tun_dst);
+                md->tun_dst = NULL;
+        }
+
         if ((vxlan->flags & VXLAN_F_LEARN) &&
             vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
                 goto drop;
 
         skb_reset_network_header(skb);
-        skb->mark = md->gbp;
+        /* In flow-based mode, GBP is carried in dst_metadata */
+        if (!(vs->flags & VXLAN_F_FLOW_BASED))
+                skb->mark = md->gbp;
 
         if (oip6)
                 err = IP6_ECN_decapsulate(oip6, skb);
@@ -1330,6 +1384,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 
         return;
 drop:
+        if (md->tun_dst)
+                dst_release((struct dst_entry *)md->tun_dst);
+
         /* Consume bad packet */
         kfree_skb(skb);
 }
@@ -1878,22 +1935,40 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                            struct vxlan_rdst *rdst, bool did_rsc)
 {
+        struct ip_tunnel_info *info = skb_tunnel_info(skb);
         struct vxlan_dev *vxlan = netdev_priv(dev);
         struct sock *sk = vxlan->vn_sock->sock->sk;
         struct rtable *rt = NULL;
         const struct iphdr *old_iph;
         struct flowi4 fl4;
         union vxlan_addr *dst;
-        struct vxlan_metadata md;
+        union vxlan_addr remote_ip;
+        struct vxlan_metadata _md;
+        struct vxlan_metadata *md = &_md;
         __be16 src_port = 0, dst_port;
         u32 vni;
         __be16 df = 0;
         __u8 tos, ttl;
         int err;
+        u32 flags = vxlan->flags;
 
-        dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
-        vni = rdst->remote_vni;
-        dst = &rdst->remote_ip;
+        if (rdst) {
+                dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
+                vni = rdst->remote_vni;
+                dst = &rdst->remote_ip;
+        } else {
+                if (!info) {
+                        WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
+                                  dev->name);
+                        goto drop;
+                }
+
+                dst_port = info->key.tp_dst ? : vxlan->dst_port;
+                vni = be64_to_cpu(info->key.tun_id);
+                remote_ip.sin.sin_family = AF_INET;
+                remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst;
+                dst = &remote_ip;
+        }
 
         if (vxlan_addr_any(dst)) {
                 if (did_rsc) {
@@ -1918,8 +1993,25 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                                      vxlan->port_max, true);
 
         if (dst->sa.sa_family == AF_INET) {
+                if (info) {
+                        if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+                                df = htons(IP_DF);
+                        if (info->key.tun_flags & TUNNEL_CSUM)
+                                flags |= VXLAN_F_UDP_CSUM;
+                        else
+                                flags &= ~VXLAN_F_UDP_CSUM;
+
+                        ttl = info->key.ipv4_ttl;
+                        tos = info->key.ipv4_tos;
+
+                        if (info->options_len)
+                                md = ip_tunnel_info_opts(info, sizeof(*md));
+                } else {
+                        md->gbp = skb->mark;
+                }
+
                 memset(&fl4, 0, sizeof(fl4));
-                fl4.flowi4_oif = rdst->remote_ifindex;
+                fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0;
                 fl4.flowi4_tos = RT_TOS(tos);
                 fl4.flowi4_mark = skb->mark;
                 fl4.flowi4_proto = IPPROTO_UDP;
@@ -1958,14 +2050,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
         tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
         ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
-        md.vni = htonl(vni << 8);
-        md.gbp = skb->mark;
-
+        md->vni = htonl(vni << 8);
         err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,
                              dst->sin.sin_addr.s_addr, tos, ttl, df,
-                             src_port, dst_port, &md,
+                             src_port, dst_port, md,
                              !net_eq(vxlan->net, dev_net(vxlan->dev)),
-                             vxlan->flags);
+                             flags);
         if (err < 0) {
                 /* skb is already freed. */
                 skb = NULL;
@@ -1980,7 +2070,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                 u32 flags;
 
                 memset(&fl6, 0, sizeof(fl6));
-                fl6.flowi6_oif = rdst->remote_ifindex;
+                fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
                 fl6.daddr = dst->sin6.sin6_addr;
                 fl6.saddr = vxlan->saddr.sin6.sin6_addr;
                 fl6.flowi6_mark = skb->mark;
@@ -2018,11 +2108,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                 }
 
                 ttl = ttl ? : ip6_dst_hoplimit(ndst);
-                md.vni = htonl(vni << 8);
-                md.gbp = skb->mark;
+                md->vni = htonl(vni << 8);
+                md->gbp = skb->mark;
 
                 err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
-                                      0, ttl, src_port, dst_port, &md,
+                                      0, ttl, src_port, dst_port, md,
                                       !net_eq(vxlan->net, dev_net(vxlan->dev)),
                                       vxlan->flags);
 #endif
@@ -2051,6 +2141,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 {
         struct vxlan_dev *vxlan = netdev_priv(dev);
+        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
         struct ethhdr *eth;
         bool did_rsc = false;
         struct vxlan_rdst *rdst, *fdst = NULL;
@@ -2078,6 +2169,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 #endif
         }
 
+        if (vxlan->flags & VXLAN_F_FLOW_BASED &&
+            info && info->mode == IP_TUNNEL_INFO_TX) {
+                vxlan_xmit_one(skb, dev, NULL, false);
+                return NETDEV_TX_OK;
+        }
+
         f = vxlan_find_mac(vxlan, eth->h_dest);
         did_rsc = false;
 
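Passing a NULL rdst into vxlan_xmit_one() makes it take all encapsulation parameters from the skb's tunnel metadata instead of the FDB. A hypothetical sender sketch, assuming this tree's metadata_dst_alloc() signature and ip_tunnel_info layout (example_set_tunnel() is illustrative only, not part of the patch):

static int example_set_tunnel(struct sk_buff *skb, __be32 dst_ip, __be64 tun_id)
{
        struct metadata_dst *tun_dst;
        struct ip_tunnel_info *info;

        tun_dst = metadata_dst_alloc(0, GFP_ATOMIC);
        if (!tun_dst)
                return -ENOMEM;

        info = &tun_dst->u.tun_info;
        info->mode = IP_TUNNEL_INFO_TX;
        info->key.tun_flags = TUNNEL_KEY;
        info->key.tun_id = tun_id;
        info->key.ipv4_dst = dst_ip;

        /* vxlan_xmit() reads this back via skb_tunnel_info() and
         * dispatches to vxlan_xmit_one() with rdst == NULL.
         */
        skb_dst_drop(skb);
        skb_dst_set(skb, (struct dst_entry *)tun_dst);
        return 0;
}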
@@ -2405,6 +2502,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
         [IFLA_VXLAN_RSC]        = { .type = NLA_U8 },
         [IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
         [IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
+        [IFLA_VXLAN_FLOWBASED]  = { .type = NLA_U8 },
         [IFLA_VXLAN_PORT]       = { .type = NLA_U16 },
         [IFLA_VXLAN_UDP_CSUM]   = { .type = NLA_U8 },
         [IFLA_VXLAN_UDP_ZERO_CSUM6_TX]  = { .type = NLA_U8 },
@@ -2681,6 +2779,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
         if (data[IFLA_VXLAN_LIMIT])
                 vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
 
+        if (data[IFLA_VXLAN_FLOWBASED] &&
+            nla_get_u8(data[IFLA_VXLAN_FLOWBASED]))
+                vxlan->flags |= VXLAN_F_FLOW_BASED;
+
         if (data[IFLA_VXLAN_PORT_RANGE]) {
                 const struct ifla_vxlan_port_range *p
                         = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
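For completeness, a userspace sketch of creating a flow-based device over rtnetlink with libmnl. This is illustrative, not part of the patch; it assumes IFLA_VXLAN_FLOWBASED is exported in this tree's uapi headers and that vxlan_newlink() still requires IFLA_VXLAN_ID (set to 0 here, matching the VNI-0 mapping on receive):

#include <libmnl/libmnl.h>
#include <linux/if_link.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>
#include <time.h>

int main(void)
{
        char buf[MNL_SOCKET_BUFFER_SIZE];
        struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
        struct ifinfomsg *ifm;
        struct nlattr *linkinfo, *data;
        struct mnl_socket *nl;

        nlh->nlmsg_type = RTM_NEWLINK;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
        nlh->nlmsg_seq = time(NULL);
        ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
        ifm->ifi_family = AF_UNSPEC;

        mnl_attr_put_strz(nlh, IFLA_IFNAME, "vxlan-flow0");
        linkinfo = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
        mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
        data = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
        mnl_attr_put_u32(nlh, IFLA_VXLAN_ID, 0);   /* flow-based: VNI comes per packet */
        mnl_attr_put_u8(nlh, IFLA_VXLAN_FLOWBASED, 1);
        mnl_attr_nest_end(nlh, data);
        mnl_attr_nest_end(nlh, linkinfo);

        nl = mnl_socket_open(NETLINK_ROUTE);
        if (!nl || mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0)
                return 1;
        if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0)
                return 1;
        mnl_socket_close(nl);
        return 0;
}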
@@ -2777,6 +2879,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
+                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_FLOWBASED */
                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
                 nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
@@ -2843,6 +2946,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
                        !!(vxlan->flags & VXLAN_F_L2MISS)) ||
             nla_put_u8(skb, IFLA_VXLAN_L3MISS,
                        !!(vxlan->flags & VXLAN_F_L3MISS)) ||
+            nla_put_u8(skb, IFLA_VXLAN_FLOWBASED,
+                       !!(vxlan->flags & VXLAN_F_FLOW_BASED)) ||
             nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
             nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) ||
             nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) ||