Skip to content

Commit 3fcece1

Browse files
Jakub Kicinski and davem330
authored and committed
net: store port/representator id in metadata_dst
Switches and modern SR-IOV enabled NICs may multiplex traffic from port representators and control messages over a single set of hardware queues. Control messages and muxed traffic may need ordered delivery. Those requirements make it hard to comfortably use TC infrastructure today unless we have a way of attaching metadata to skbs at the upper device. Because a single set of queues is used for many netdevs, stopping the TC/sched queues of all of them reliably is impossible, and the lower device has to retreat to returning NETDEV_TX_BUSY and usually has to take extra locks on the fastpath. This patch attempts to enable port/representator devs to attach metadata which carries the port id to skbs. This way representators can be queueless and all queuing can be performed at the lower netdev in the usual way. Traffic arriving on the port/representator interfaces will have metadata attached and will subsequently be queued to the lower device for transmission. The lower device should recognize the metadata and translate it to a HW-specific format, which is most likely either a special header inserted before the network headers or descriptor/metadata fields. Metadata is associated with the lower device by storing the netdev pointer along with the port id, so that if TC decides to redirect or mirror, the new netdev will not try to interpret it. This is mostly for SR-IOV devices since switches don't have lower netdevs today. Signed-off-by: Jakub Kicinski <[email protected]> Signed-off-by: Sridhar Samudrala <[email protected]> Signed-off-by: Simon Horman <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent cf3db45 commit 3fcece1

File tree

5 files changed

+50
-17
lines changed

5 files changed

+50
-17
lines changed

include/net/dst_metadata.h

Lines changed: 32 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -5,10 +5,22 @@
55
#include <net/ip_tunnels.h>
66
#include <net/dst.h>
77

8+
enum metadata_type {
9+
METADATA_IP_TUNNEL,
10+
METADATA_HW_PORT_MUX,
11+
};
12+
13+
struct hw_port_info {
14+
struct net_device *lower_dev;
15+
u32 port_id;
16+
};
17+
818
struct metadata_dst {
919
struct dst_entry dst;
20+
enum metadata_type type;
1021
union {
1122
struct ip_tunnel_info tun_info;
23+
struct hw_port_info port_info;
1224
} u;
1325
};
1426

@@ -27,7 +39,7 @@ static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
2739
struct metadata_dst *md_dst = skb_metadata_dst(skb);
2840
struct dst_entry *dst;
2941

30-
if (md_dst)
42+
if (md_dst && md_dst->type == METADATA_IP_TUNNEL)
3143
return &md_dst->u.tun_info;
3244

3345
dst = skb_dst(skb);
@@ -55,22 +67,33 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
5567
a = (const struct metadata_dst *) skb_dst(skb_a);
5668
b = (const struct metadata_dst *) skb_dst(skb_b);
5769

58-
if (!a != !b || a->u.tun_info.options_len != b->u.tun_info.options_len)
70+
if (!a != !b || a->type != b->type)
5971
return 1;
6072

61-
return memcmp(&a->u.tun_info, &b->u.tun_info,
62-
sizeof(a->u.tun_info) + a->u.tun_info.options_len);
73+
switch (a->type) {
74+
case METADATA_HW_PORT_MUX:
75+
return memcmp(&a->u.port_info, &b->u.port_info,
76+
sizeof(a->u.port_info));
77+
case METADATA_IP_TUNNEL:
78+
return memcmp(&a->u.tun_info, &b->u.tun_info,
79+
sizeof(a->u.tun_info) +
80+
a->u.tun_info.options_len);
81+
default:
82+
return 1;
83+
}
6384
}
6485

6586
void metadata_dst_free(struct metadata_dst *);
66-
struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
67-
struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags);
87+
struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
88+
gfp_t flags);
89+
struct metadata_dst __percpu *
90+
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags);
6891

6992
static inline struct metadata_dst *tun_rx_dst(int md_size)
7093
{
7194
struct metadata_dst *tun_dst;
7295

73-
tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC);
96+
tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
7497
if (!tun_dst)
7598
return NULL;
7699

@@ -85,11 +108,11 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb)
85108
int md_size;
86109
struct metadata_dst *new_md;
87110

88-
if (!md_dst)
111+
if (!md_dst || md_dst->type != METADATA_IP_TUNNEL)
89112
return ERR_PTR(-EINVAL);
90113

91114
md_size = md_dst->u.tun_info.options_len;
92-
new_md = metadata_dst_alloc(md_size, GFP_ATOMIC);
115+
new_md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
93116
if (!new_md)
94117
return ERR_PTR(-ENOMEM);
95118

net/core/dst.c

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -264,7 +264,9 @@ static int dst_md_discard(struct sk_buff *skb)
264264
return 0;
265265
}
266266

267-
static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
267+
static void __metadata_dst_init(struct metadata_dst *md_dst,
268+
enum metadata_type type, u8 optslen)
269+
268270
{
269271
struct dst_entry *dst;
270272

@@ -276,17 +278,19 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
276278
dst->output = dst_md_discard_out;
277279

278280
memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
281+
md_dst->type = type;
279282
}
280283

281-
struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
284+
struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
285+
gfp_t flags)
282286
{
283287
struct metadata_dst *md_dst;
284288

285289
md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
286290
if (!md_dst)
287291
return NULL;
288292

289-
__metadata_dst_init(md_dst, optslen);
293+
__metadata_dst_init(md_dst, type, optslen);
290294

291295
return md_dst;
292296
}
@@ -300,7 +304,8 @@ void metadata_dst_free(struct metadata_dst *md_dst)
300304
kfree(md_dst);
301305
}
302306

303-
struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
307+
struct metadata_dst __percpu *
308+
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
304309
{
305310
int cpu;
306311
struct metadata_dst __percpu *md_dst;
@@ -311,7 +316,7 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
311316
return NULL;
312317

313318
for_each_possible_cpu(cpu)
314-
__metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen);
319+
__metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);
315320

316321
return md_dst;
317322
}

net/core/filter.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2565,6 +2565,7 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
25652565
* that is holding verifier mutex.
25662566
*/
25672567
md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
2568+
METADATA_IP_TUNNEL,
25682569
GFP_KERNEL);
25692570
if (!md_dst)
25702571
return NULL;

net/ipv4/ip_tunnel_core.c

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -134,10 +134,12 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
134134
struct metadata_dst *res;
135135
struct ip_tunnel_info *dst, *src;
136136

137-
if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
137+
if (!md || md->type != METADATA_IP_TUNNEL ||
138+
md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
139+
138140
return NULL;
139141

140-
res = metadata_dst_alloc(0, flags);
142+
res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags);
141143
if (!res)
142144
return NULL;
143145

net/openvswitch/flow_netlink.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2202,7 +2202,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
22022202
if (start < 0)
22032203
return start;
22042204

2205-
tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL);
2205+
tun_dst = metadata_dst_alloc(key.tun_opts_len, METADATA_IP_TUNNEL,
2206+
GFP_KERNEL);
2207+
22062208
if (!tun_dst)
22072209
return -ENOMEM;
22082210

0 commit comments

Comments
 (0)