
Commit 015632b

jrfastab authored and borkmann committed
bpf: sk_msg program helper bpf_sk_msg_pull_data
Currently, if a bpf sk msg program is run, the program can only parse data that the (start, end) pointers have already consumed. For sendmsg hooks this is likely the first scatterlist element. For sendpage this will be the range (0, 0) because the data is shared with userspace and by default we want to avoid allowing userspace to modify data while (or after) the BPF verdict is being decided.

To support pulling in additional bytes for parsing, use a new helper bpf_msg_pull_data(start, end, flags) which works similar to cls tc logic. This helper will attempt to point the data start pointer at 'start' bytes offset into the msg and the data end pointer at 'end' bytes offset into the message.

After basic sanity checks to ensure 'start' <= 'end' and 'end' <= msg_length, there are a few cases we need to handle.

First, the sendmsg hook has already copied the data from userspace and has exclusive access to it. Therefore, it is not necessary to copy the data. However, it may be required. After finding the scatterlist element containing the 'start' offset byte, there are two cases. One: the range (start, end) is entirely contained in that sg element and is already linear. All that is needed is to update the data pointers; no allocate/copy is needed. The other case is that (start, end) crosses sg element boundaries. In this case we allocate a block of size 'end - start' and copy the data to linearize it.

Next, the sendpage hook has not copied any data in its initial state, so the data pointers are (0, 0). In this case we handle it similar to the above sendmsg case, except the allocation/copy must always happen. Then, when sending the data, we have possibly three memory regions that need to be sent: (0, start - 1), (start, end), and (end + 1, msg_length). This is required to ensure any writes by the BPF program are correctly transmitted.

Lastly, this operation will invalidate any previous data checks, so BPF programs will have to revalidate pointers after making this BPF call.

Signed-off-by: John Fastabend <[email protected]>
Acked-by: David S. Miller <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
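For illustration only (not part of this commit): a minimal sk_msg program sketch showing how a BPF program would call the new helper and then revalidate its data pointers. The program name, offsets, and verdict policy are invented, and the sketch assumes present-day libbpf conventions (SEC(), bpf/bpf_helpers.h) rather than the selftest headers available at the time.

/* Hypothetical example: names, offsets, and policy are made up. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_msg")
int msg_prog(struct sk_msg_md *msg)
{
	__u8 *data, *data_end;

	/* Make bytes [0, 10) of the message directly readable. This may
	 * allocate and linearize scatterlist data behind the scenes, and
	 * it invalidates any previously loaded data/data_end pointers.
	 */
	if (bpf_msg_pull_data(msg, 0, 10, 0))
		return SK_DROP;

	/* Revalidate bounds after the pull, as the commit message requires. */
	data = msg->data;
	data_end = msg->data_end;
	if (data + 10 > data_end)
		return SK_DROP;

	/* Toy policy: the first byte selects the verdict. */
	return data[0] == 0x01 ? SK_PASS : SK_DROP;
}

char _license[] SEC("license") = "GPL";

The key point mirrors the commit message: after bpf_msg_pull_data() succeeds, msg->data and msg->data_end must be reloaded and bounds-checked again before any access.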
1 parent 91843d5 commit 015632b

File tree

include/uapi/linux/bpf.h
net/core/filter.c

2 files changed: +136 -2 lines


include/uapi/linux/bpf.h

Lines changed: 2 additions & 1 deletion
@@ -793,7 +793,8 @@ union bpf_attr {
 	FN(sock_ops_cb_flags_set),	\
 	FN(msg_redirect_map),		\
 	FN(msg_apply_bytes),		\
-	FN(msg_cork_bytes),
+	FN(msg_cork_bytes),		\
+	FN(msg_pull_data),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
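For context (not part of this diff): the FN() list above is a helper-ID mapper that the same header expands into the bpf_func_id enum, so adding FN(msg_pull_data) is what creates the BPF_FUNC_msg_pull_data id used by programs and matched in sk_msg_func_proto() below. Roughly, paraphrased from memory of include/uapi/linux/bpf.h:

/* Paraphrased sketch: each FN() entry becomes a BPF_FUNC_* enumerator,
 * so FN(msg_pull_data) defines BPF_FUNC_msg_pull_data.
 */
#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x
enum bpf_func_id {
	__BPF_FUNC_MAPPER(__BPF_ENUM_FN)
	__BPF_FUNC_MAX_ID,
};
#undef __BPF_ENUM_FN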

net/core/filter.c

Lines changed: 134 additions & 1 deletion
@@ -1956,6 +1956,136 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };

+BPF_CALL_4(bpf_msg_pull_data,
+	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+{
+	unsigned int len = 0, offset = 0, copy = 0;
+	struct scatterlist *sg = msg->sg_data;
+	int first_sg, last_sg, i, shift;
+	unsigned char *p, *to, *from;
+	int bytes = end - start;
+	struct page *page;
+
+	if (unlikely(flags || end <= start))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg_start;
+	do {
+		len = sg[i].length;
+		offset += len;
+		if (start < offset + len)
+			break;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != msg->sg_end);
+
+	if (unlikely(start >= offset + len))
+		return -EINVAL;
+
+	if (!msg->sg_copy[i] && bytes <= len)
+		goto out;
+
+	first_sg = i;
+
+	/* At this point we need to linearize multiple scatterlist
+	 * elements or a single shared page. Either way we need to
+	 * copy into a linear buffer exclusively owned by BPF. Then
+	 * place the buffer in the scatterlist and fixup the original
+	 * entries by removing the entries now in the linear buffer
+	 * and shifting the remaining entries. For now we do not try
+	 * to copy partial entries to avoid complexity of running out
+	 * of sg_entry slots. The downside is reading a single byte
+	 * will copy the entire sg entry.
+	 */
+	do {
+		copy += sg[i].length;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+		if (bytes < copy)
+			break;
+	} while (i != msg->sg_end);
+	last_sg = i;
+
+	if (unlikely(copy < end - start))
+		return -EINVAL;
+
+	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
+	if (unlikely(!page))
+		return -ENOMEM;
+	p = page_address(page);
+	offset = 0;
+
+	i = first_sg;
+	do {
+		from = sg_virt(&sg[i]);
+		len = sg[i].length;
+		to = p + offset;
+
+		memcpy(to, from, len);
+		offset += len;
+		sg[i].length = 0;
+		put_page(sg_page(&sg[i]));
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != last_sg);
+
+	sg[first_sg].length = copy;
+	sg_set_page(&sg[first_sg], page, copy, 0);
+
+	/* To repair sg ring we need to shift entries. If we only
+	 * had a single entry though we can just replace it and
+	 * be done. Otherwise walk the ring and shift the entries.
+	 */
+	shift = last_sg - first_sg - 1;
+	if (!shift)
+		goto out;
+
+	i = first_sg + 1;
+	do {
+		int move_from;
+
+		if (i + shift >= MAX_SKB_FRAGS)
+			move_from = i + shift - MAX_SKB_FRAGS;
+		else
+			move_from = i + shift;
+
+		if (move_from == msg->sg_end)
+			break;
+
+		sg[i] = sg[move_from];
+		sg[move_from].length = 0;
+		sg[move_from].page_link = 0;
+		sg[move_from].offset = 0;
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (1);
+	msg->sg_end -= shift;
+	if (msg->sg_end < 0)
+		msg->sg_end += MAX_SKB_FRAGS;
+out:
+	msg->data = sg_virt(&sg[i]) + start - offset;
+	msg->data_end = msg->data + bytes;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pull_data_proto = {
+	.func		= bpf_msg_pull_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -2897,7 +3027,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
 	    func == bpf_xdp_adjust_head ||
-	    func == bpf_xdp_adjust_meta)
+	    func == bpf_xdp_adjust_meta ||
+	    func == bpf_msg_pull_data)
 		return true;

 	return false;
@@ -3666,6 +3797,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 		return &bpf_msg_apply_bytes_proto;
 	case BPF_FUNC_msg_cork_bytes:
 		return &bpf_msg_cork_bytes_proto;
+	case BPF_FUNC_msg_pull_data:
+		return &bpf_msg_pull_data_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
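Not part of the commit: a tiny standalone sketch of the wrap-around index arithmetic the helper uses when walking and repairing the scatterlist ring. RING_SLOTS stands in for MAX_SKB_FRAGS (commonly 17 with 4 KiB pages); the values are illustrative only.

#include <assert.h>

#define RING_SLOTS 17	/* stand-in for MAX_SKB_FRAGS; illustrative */

/* Advance a ring index with wrap-around, mirroring the
 * "i++; if (i == MAX_SKB_FRAGS) i = 0;" pattern in the helper.
 */
static int ring_next(int i)
{
	return (i + 1 == RING_SLOTS) ? 0 : i + 1;
}

/* Source slot for the shift pass that closes the gap left by the
 * entries merged into the linear buffer, mirroring the "move_from"
 * computation in bpf_msg_pull_data().
 */
static int ring_move_from(int i, int shift)
{
	return (i + shift >= RING_SLOTS) ? i + shift - RING_SLOTS : i + shift;
}

int main(void)
{
	assert(ring_next(RING_SLOTS - 1) == 0);		/* wraps to slot 0 */
	assert(ring_move_from(RING_SLOTS - 2, 3) == 1);	/* wraps past the end */
	return 0;
}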
