Skip to content

Commit 79774d6

Browse files
committed
Merge branch 'fix-bpf_redirect'
Martin KaFai Lau says: ==================== bpf: Fix bpf_redirect to an ipip/ip6tnl dev This patch set fixes a bug in bpf_redirect(dev, flags) when dev is an ipip/ip6tnl. The current problem is that IP-EthHdr-IP is sent out instead of IP-IP. Patch 1 adds a dev->type test similar to dev_is_mac_header_xmit() in act_mirred.c, which is only available in net-next. We can consider refactoring it once this patch is pulled into net-next from net. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents 23dd831 + 90e0289 commit 79774d6

File tree

7 files changed

+567
-19
lines changed

7 files changed

+567
-19
lines changed

include/linux/netdevice.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3354,6 +3354,21 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
33543354
bool is_skb_forwardable(const struct net_device *dev,
33553355
const struct sk_buff *skb);
33563356

3357+
static __always_inline int ____dev_forward_skb(struct net_device *dev,
3358+
struct sk_buff *skb)
3359+
{
3360+
if (skb_orphan_frags(skb, GFP_ATOMIC) ||
3361+
unlikely(!is_skb_forwardable(dev, skb))) {
3362+
atomic_long_inc(&dev->rx_dropped);
3363+
kfree_skb(skb);
3364+
return NET_RX_DROP;
3365+
}
3366+
3367+
skb_scrub_packet(skb, true);
3368+
skb->priority = 0;
3369+
return 0;
3370+
}
3371+
33573372
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
33583373

33593374
extern int netdev_budget;

net/core/dev.c

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1766,19 +1766,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable);
17661766

17671767
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
17681768
{
1769-
if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1770-
unlikely(!is_skb_forwardable(dev, skb))) {
1771-
atomic_long_inc(&dev->rx_dropped);
1772-
kfree_skb(skb);
1773-
return NET_RX_DROP;
1774-
}
1769+
int ret = ____dev_forward_skb(dev, skb);
17751770

1776-
skb_scrub_packet(skb, true);
1777-
skb->priority = 0;
1778-
skb->protocol = eth_type_trans(skb, dev);
1779-
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1771+
if (likely(!ret)) {
1772+
skb->protocol = eth_type_trans(skb, dev);
1773+
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1774+
}
17801775

1781-
return 0;
1776+
return ret;
17821777
}
17831778
EXPORT_SYMBOL_GPL(__dev_forward_skb);
17841779

net/core/filter.c

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1628,6 +1628,19 @@ static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
16281628
return dev_forward_skb(dev, skb);
16291629
}
16301630

1631+
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
1632+
struct sk_buff *skb)
1633+
{
1634+
int ret = ____dev_forward_skb(dev, skb);
1635+
1636+
if (likely(!ret)) {
1637+
skb->dev = dev;
1638+
ret = netif_rx(skb);
1639+
}
1640+
1641+
return ret;
1642+
}
1643+
16311644
static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
16321645
{
16331646
int ret;
@@ -1647,6 +1660,51 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
16471660
return ret;
16481661
}
16491662

1663+
static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
1664+
u32 flags)
1665+
{
1666+
/* skb->mac_len is not set on normal egress */
1667+
unsigned int mlen = skb->network_header - skb->mac_header;
1668+
1669+
__skb_pull(skb, mlen);
1670+
1671+
/* At ingress, the mac header has already been pulled once.
1672+
* At egress, skb_postpull_rcsum has to be done in case that
1673+
* the skb is originated from ingress (i.e. a forwarded skb)
1674+
* to ensure that rcsum starts at net header.
1675+
*/
1676+
if (!skb_at_tc_ingress(skb))
1677+
skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
1678+
skb_pop_mac_header(skb);
1679+
skb_reset_mac_len(skb);
1680+
return flags & BPF_F_INGRESS ?
1681+
__bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
1682+
}
1683+
1684+
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
1685+
u32 flags)
1686+
{
1687+
bpf_push_mac_rcsum(skb);
1688+
return flags & BPF_F_INGRESS ?
1689+
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
1690+
}
1691+
1692+
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
1693+
u32 flags)
1694+
{
1695+
switch (dev->type) {
1696+
case ARPHRD_TUNNEL:
1697+
case ARPHRD_TUNNEL6:
1698+
case ARPHRD_SIT:
1699+
case ARPHRD_IPGRE:
1700+
case ARPHRD_VOID:
1701+
case ARPHRD_NONE:
1702+
return __bpf_redirect_no_mac(skb, dev, flags);
1703+
default:
1704+
return __bpf_redirect_common(skb, dev, flags);
1705+
}
1706+
}
1707+
16501708
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
16511709
{
16521710
struct net_device *dev;
@@ -1675,10 +1733,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
16751733
return -ENOMEM;
16761734
}
16771735

1678-
bpf_push_mac_rcsum(clone);
1679-
1680-
return flags & BPF_F_INGRESS ?
1681-
__bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone);
1736+
return __bpf_redirect(clone, dev, flags);
16821737
}
16831738

16841739
static const struct bpf_func_proto bpf_clone_redirect_proto = {
@@ -1722,10 +1777,7 @@ int skb_do_redirect(struct sk_buff *skb)
17221777
return -EINVAL;
17231778
}
17241779

1725-
bpf_push_mac_rcsum(skb);
1726-
1727-
return ri->flags & BPF_F_INGRESS ?
1728-
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
1780+
return __bpf_redirect(skb, dev, ri->flags);
17291781
}
17301782

17311783
static const struct bpf_func_proto bpf_redirect_proto = {

samples/bpf/Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ hostprogs-y += xdp2
2727
hostprogs-y += test_current_task_under_cgroup
2828
hostprogs-y += trace_event
2929
hostprogs-y += sampleip
30+
hostprogs-y += tc_l2_redirect
3031

3132
test_verifier-objs := test_verifier.o libbpf.o
3233
test_maps-objs := test_maps.o libbpf.o
@@ -56,6 +57,7 @@ test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
5657
test_current_task_under_cgroup_user.o
5758
trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
5859
sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
60+
tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o
5961

6062
# Tell kbuild to always build the programs
6163
always := $(hostprogs-y)
@@ -72,6 +74,7 @@ always += test_probe_write_user_kern.o
7274
always += trace_output_kern.o
7375
always += tcbpf1_kern.o
7476
always += tcbpf2_kern.o
77+
always += tc_l2_redirect_kern.o
7578
always += lathist_kern.o
7679
always += offwaketime_kern.o
7780
always += spintest_kern.o
@@ -111,6 +114,7 @@ HOSTLOADLIBES_xdp2 += -lelf
111114
HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
112115
HOSTLOADLIBES_trace_event += -lelf
113116
HOSTLOADLIBES_sampleip += -lelf
117+
HOSTLOADLIBES_tc_l2_redirect += -lelf
114118

115119
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
116120
# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang

samples/bpf/tc_l2_redirect.sh

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
#!/bin/bash
2+
3+
[[ -z $TC ]] && TC='tc'
4+
[[ -z $IP ]] && IP='ip'
5+
6+
REDIRECT_USER='./tc_l2_redirect'
7+
REDIRECT_BPF='./tc_l2_redirect_kern.o'
8+
9+
RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter)
10+
IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding)
11+
12+
function config_common {
13+
local tun_type=$1
14+
15+
$IP netns add ns1
16+
$IP netns add ns2
17+
$IP link add ve1 type veth peer name vens1
18+
$IP link add ve2 type veth peer name vens2
19+
$IP link set dev ve1 up
20+
$IP link set dev ve2 up
21+
$IP link set dev ve1 mtu 1500
22+
$IP link set dev ve2 mtu 1500
23+
$IP link set dev vens1 netns ns1
24+
$IP link set dev vens2 netns ns2
25+
26+
$IP -n ns1 link set dev lo up
27+
$IP -n ns1 link set dev vens1 up
28+
$IP -n ns1 addr add 10.1.1.101/24 dev vens1
29+
$IP -n ns1 addr add 2401:db01::65/64 dev vens1 nodad
30+
$IP -n ns1 route add default via 10.1.1.1 dev vens1
31+
$IP -n ns1 route add default via 2401:db01::1 dev vens1
32+
33+
$IP -n ns2 link set dev lo up
34+
$IP -n ns2 link set dev vens2 up
35+
$IP -n ns2 addr add 10.2.1.102/24 dev vens2
36+
$IP -n ns2 addr add 2401:db02::66/64 dev vens2 nodad
37+
$IP -n ns2 addr add 10.10.1.102 dev lo
38+
$IP -n ns2 addr add 2401:face::66/64 dev lo nodad
39+
$IP -n ns2 link add ipt2 type ipip local 10.2.1.102 remote 10.2.1.1
40+
$IP -n ns2 link add ip6t2 type ip6tnl mode any local 2401:db02::66 remote 2401:db02::1
41+
$IP -n ns2 link set dev ipt2 up
42+
$IP -n ns2 link set dev ip6t2 up
43+
$IP netns exec ns2 $TC qdisc add dev vens2 clsact
44+
$IP netns exec ns2 $TC filter add dev vens2 ingress bpf da obj $REDIRECT_BPF sec drop_non_tun_vip
45+
if [[ $tun_type == "ipip" ]]; then
46+
$IP -n ns2 route add 10.1.1.0/24 dev ipt2
47+
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
48+
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.ipt2.rp_filter=0
49+
else
50+
$IP -n ns2 route add 10.1.1.0/24 dev ip6t2
51+
$IP -n ns2 route add 2401:db01::/64 dev ip6t2
52+
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
53+
$IP netns exec ns2 sysctl -q -w net.ipv4.conf.ip6t2.rp_filter=0
54+
fi
55+
56+
$IP addr add 10.1.1.1/24 dev ve1
57+
$IP addr add 2401:db01::1/64 dev ve1 nodad
58+
$IP addr add 10.2.1.1/24 dev ve2
59+
$IP addr add 2401:db02::1/64 dev ve2 nodad
60+
61+
$TC qdisc add dev ve2 clsact
62+
$TC filter add dev ve2 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_forward
63+
64+
sysctl -q -w net.ipv4.conf.all.rp_filter=0
65+
sysctl -q -w net.ipv6.conf.all.forwarding=1
66+
}
67+
68+
function cleanup {
69+
set +e
70+
[[ -z $DEBUG ]] || set +x
71+
$IP netns delete ns1 >& /dev/null
72+
$IP netns delete ns2 >& /dev/null
73+
$IP link del ve1 >& /dev/null
74+
$IP link del ve2 >& /dev/null
75+
$IP link del ipt >& /dev/null
76+
$IP link del ip6t >& /dev/null
77+
sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER
78+
sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING
79+
rm -f /sys/fs/bpf/tc/globals/tun_iface
80+
[[ -z $DEBUG ]] || set -x
81+
set -e
82+
}
83+
84+
function l2_to_ipip {
85+
echo -n "l2_to_ipip $1: "
86+
87+
local dir=$1
88+
89+
config_common ipip
90+
91+
$IP link add ipt type ipip external
92+
$IP link set dev ipt up
93+
sysctl -q -w net.ipv4.conf.ipt.rp_filter=0
94+
sysctl -q -w net.ipv4.conf.ipt.forwarding=1
95+
96+
if [[ $dir == "egress" ]]; then
97+
$IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
98+
$TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
99+
sysctl -q -w net.ipv4.conf.ve1.forwarding=1
100+
else
101+
$TC qdisc add dev ve1 clsact
102+
$TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
103+
fi
104+
105+
$REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ipt/ifindex)
106+
107+
$IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
108+
109+
if [[ $dir == "egress" ]]; then
110+
# test direct egress to ve2 (i.e. not forwarding from
111+
# ve1 to ve2).
112+
ping -c1 10.10.1.102 >& /dev/null
113+
fi
114+
115+
cleanup
116+
117+
echo "OK"
118+
}
119+
120+
function l2_to_ip6tnl {
121+
echo -n "l2_to_ip6tnl $1: "
122+
123+
local dir=$1
124+
125+
config_common ip6tnl
126+
127+
$IP link add ip6t type ip6tnl mode any external
128+
$IP link set dev ip6t up
129+
sysctl -q -w net.ipv4.conf.ip6t.rp_filter=0
130+
sysctl -q -w net.ipv4.conf.ip6t.forwarding=1
131+
132+
if [[ $dir == "egress" ]]; then
133+
$IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
134+
$IP route add 2401:face::/64 via 2401:db02::66 dev ve2
135+
$TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
136+
sysctl -q -w net.ipv4.conf.ve1.forwarding=1
137+
else
138+
$TC qdisc add dev ve1 clsact
139+
$TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
140+
fi
141+
142+
$REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ip6t/ifindex)
143+
144+
$IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
145+
$IP netns exec ns1 ping -6 -c1 2401:face::66 >& /dev/null
146+
147+
if [[ $dir == "egress" ]]; then
148+
# test direct egress to ve2 (i.e. not forwarding from
149+
# ve1 to ve2).
150+
ping -c1 10.10.1.102 >& /dev/null
151+
ping -6 -c1 2401:face::66 >& /dev/null
152+
fi
153+
154+
cleanup
155+
156+
echo "OK"
157+
}
158+
159+
cleanup
160+
test_names="l2_to_ipip l2_to_ip6tnl"
161+
test_dirs="ingress egress"
162+
if [[ $# -ge 2 ]]; then
163+
test_names=$1
164+
test_dirs=$2
165+
elif [[ $# -ge 1 ]]; then
166+
test_names=$1
167+
fi
168+
169+
for t in $test_names; do
170+
for d in $test_dirs; do
171+
$t $d
172+
done
173+
done

0 commit comments

Comments
 (0)