Skip to content

Commit bcdb239

Browse files
committed
Merge branch 'bpf-Add-support-for-sock_ops'
Lawrence Brakmo says: ==================== bpf: Add support for sock_ops Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding struct that allows BPF programs of this type to access some of the socket's fields (such as IP addresses, ports, etc.) and to set connection parameters such as buffer sizes, initial window, SYN/SYN-ACK RTOs, etc. Unlike current BPF program types that expect to be called at a particular place in the network stack code, SOCK_OPS programs can be called at different places and use an "op" field to indicate the context. There are currently two types of operations, those whose effect is through their return value and those whose effect is through the new bpf_setsockopt BPF helper function. Example operands of the first type are: BPF_SOCK_OPS_TIMEOUT_INIT BPF_SOCK_OPS_RWND_INIT BPF_SOCK_OPS_NEEDS_ECN Example operands of the second type are: BPF_SOCK_OPS_TCP_CONNECT_CB BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB Current operands are only called during connection establishment so there should not be any BPF overheads after connection establishment. The main idea is to use connection information from both hosts, such as IP addresses and ports to allow setting of per connection parameters to optimize the connection's performance. Although there are already 3 mechanisms to set parameters (sysctls, route metrics and setsockopts), this new mechanism provides some distinct advantages. Unlike sysctls, it can set parameters per connection. In contrast to route metrics, it can also use port numbers and information provided by a user level program. In addition, it could set parameters probabilistically for evaluation purposes (i.e. do something different on 10% of the flows and compare results with the other 90% of the flows). 
Also, in cases where IPv6 addresses contain geographic information, the rules to make changes based on the distance (or RTT) between the hosts are much easier than route metric rules and can be global. Finally, unlike setsockopt, it does not require application changes and it can be updated easily at any time. It uses the existing bpf cgroups infrastructure so the programs can be attached per cgroup with full inheritance support. Although the bpf cgroup framework already contains a sock related program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type (BPF_PROG_TYPE_SOCK_OPS) because the existing type expects to be called only once during the connection's lifetime. In contrast, the new program type will be called multiple times from different places in the network stack code. For example, before sending SYN and SYN-ACKs to set an appropriate timeout, when the connection is established to set congestion control, etc. As a result it has an "op" field to specify the type of operation requested. This patch set also includes sample BPF programs to demonstrate the different features. v2: Formatting changes, rebased to latest net-next v3: Fixed build issues, changed socket_ops to sock_ops throughout, fixed formatting issues, removed the syscall to load sock_ops program and added functionality to use existing bpf attach and bpf detach system calls, removed reader/writer locks in sock_bpfops.c (used when saving sock_ops global program) and fixed missing module refcount increment. v4: Removed global sock_ops program and instead used existing cgroup bpf infrastructure to support a new BPF_CGROUP_ATTCH type. 
v5: fixed kbuild warning happening in bpf-cgroup.h removed automatic conversion to host byte order from some sock_ops fields (ipv4 and ipv6 addresses, remote port) Added conversion to host byte order in some of the sample programs Added to sample BPF program comments about using load_sock_ops to load Removed is_req_sock field from bpf_sock_ops_kern and related places, using sk_fullsock() instead. v6: fixes to BPF helper function setsockopt (possible NULL dereferencing, etc.) ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents 57a53a0 + 04df41e commit bcdb239

25 files changed

+1218
-26
lines changed

include/linux/bpf-cgroup.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
struct sock;
88
struct cgroup;
99
struct sk_buff;
10+
struct bpf_sock_ops_kern;
1011

1112
#ifdef CONFIG_CGROUP_BPF
1213

@@ -42,6 +43,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
4243
int __cgroup_bpf_run_filter_sk(struct sock *sk,
4344
enum bpf_attach_type type);
4445

46+
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
47+
struct bpf_sock_ops_kern *sock_ops,
48+
enum bpf_attach_type type);
49+
4550
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
4651
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
4752
({ \
@@ -75,6 +80,18 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
7580
__ret; \
7681
})
7782

83+
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \
84+
({ \
85+
int __ret = 0; \
86+
if (cgroup_bpf_enabled && (sock_ops)->sk) { \
87+
typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \
88+
if (sk_fullsock(__sk)) \
89+
__ret = __cgroup_bpf_run_filter_sock_ops(__sk, \
90+
sock_ops, \
91+
BPF_CGROUP_SOCK_OPS); \
92+
} \
93+
__ret; \
94+
})
7895
#else
7996

8097
struct cgroup_bpf {};
@@ -85,6 +102,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
85102
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
86103
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
87104
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
105+
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
88106

89107
#endif /* CONFIG_CGROUP_BPF */
90108

include/linux/bpf_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops)
1010
BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops)
1111
BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops)
1212
BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops)
13+
BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops)
1314
#endif
1415
#ifdef CONFIG_BPF_EVENTS
1516
BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops)

include/linux/filter.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -898,4 +898,13 @@ static inline int bpf_tell_extensions(void)
898898
return SKF_AD_MAX;
899899
}
900900

901+
struct bpf_sock_ops_kern {
902+
struct sock *sk;
903+
u32 op;
904+
union {
905+
u32 reply;
906+
u32 replylong[4];
907+
};
908+
};
909+
901910
#endif /* __LINUX_FILTER_H__ */

include/net/tcp.h

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@
4646
#include <linux/seq_file.h>
4747
#include <linux/memcontrol.h>
4848

49+
#include <linux/bpf.h>
50+
#include <linux/filter.h>
51+
#include <linux/bpf-cgroup.h>
52+
4953
extern struct inet_hashinfo tcp_hashinfo;
5054

5155
extern struct percpu_counter tcp_orphan_count;
@@ -1000,7 +1004,9 @@ void tcp_get_default_congestion_control(char *name);
10001004
void tcp_get_available_congestion_control(char *buf, size_t len);
10011005
void tcp_get_allowed_congestion_control(char *buf, size_t len);
10021006
int tcp_set_allowed_congestion_control(char *allowed);
1003-
int tcp_set_congestion_control(struct sock *sk, const char *name);
1007+
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load);
1008+
void tcp_reinit_congestion_control(struct sock *sk,
1009+
const struct tcp_congestion_ops *ca);
10041010
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
10051011
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
10061012

@@ -2021,4 +2027,62 @@ int tcp_set_ulp(struct sock *sk, const char *name);
20212027
void tcp_get_available_ulp(char *buf, size_t len);
20222028
void tcp_cleanup_ulp(struct sock *sk);
20232029

2030+
/* Call BPF_SOCK_OPS program that returns an int. If the return value
2031+
* is < 0, then the BPF op failed (for example if the loaded BPF
2032+
* program does not support the chosen operation or there is no BPF
2033+
* program loaded).
2034+
*/
2035+
#ifdef CONFIG_BPF
2036+
static inline int tcp_call_bpf(struct sock *sk, int op)
2037+
{
2038+
struct bpf_sock_ops_kern sock_ops;
2039+
int ret;
2040+
2041+
if (sk_fullsock(sk))
2042+
sock_owned_by_me(sk);
2043+
2044+
memset(&sock_ops, 0, sizeof(sock_ops));
2045+
sock_ops.sk = sk;
2046+
sock_ops.op = op;
2047+
2048+
ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
2049+
if (ret == 0)
2050+
ret = sock_ops.reply;
2051+
else
2052+
ret = -1;
2053+
return ret;
2054+
}
2055+
#else
2056+
static inline int tcp_call_bpf(struct sock *sk, int op)
2057+
{
2058+
return -EPERM;
2059+
}
2060+
#endif
2061+
2062+
static inline u32 tcp_timeout_init(struct sock *sk)
2063+
{
2064+
int timeout;
2065+
2066+
timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
2067+
2068+
if (timeout <= 0)
2069+
timeout = TCP_TIMEOUT_INIT;
2070+
return timeout;
2071+
}
2072+
2073+
static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
2074+
{
2075+
int rwnd;
2076+
2077+
rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
2078+
2079+
if (rwnd < 0)
2080+
rwnd = 0;
2081+
return rwnd;
2082+
}
2083+
2084+
static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
2085+
{
2086+
return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
2087+
}
20242088
#endif /* _TCP_H */

include/uapi/linux/bpf.h

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,12 +120,14 @@ enum bpf_prog_type {
120120
BPF_PROG_TYPE_LWT_IN,
121121
BPF_PROG_TYPE_LWT_OUT,
122122
BPF_PROG_TYPE_LWT_XMIT,
123+
BPF_PROG_TYPE_SOCK_OPS,
123124
};
124125

125126
enum bpf_attach_type {
126127
BPF_CGROUP_INET_INGRESS,
127128
BPF_CGROUP_INET_EGRESS,
128129
BPF_CGROUP_INET_SOCK_CREATE,
130+
BPF_CGROUP_SOCK_OPS,
129131
__MAX_BPF_ATTACH_TYPE
130132
};
131133

@@ -518,6 +520,17 @@ union bpf_attr {
518520
* Set full skb->hash.
519521
* @skb: pointer to skb
520522
* @hash: hash to set
523+
*
524+
* int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
525+
* Calls setsockopt. Not all opts are available, only those with
526+
* integer optvals plus TCP_CONGESTION.
527+
* Supported levels: SOL_SOCKET and IPPROTO_TCP
528+
* @bpf_socket: pointer to bpf_socket
529+
* @level: SOL_SOCKET or IPPROTO_TCP
530+
* @optname: option name
531+
* @optval: pointer to option value
532+
* @optlen: length of optval in bytes
533+
* Return: 0 or negative error
521534
*/
522535
#define __BPF_FUNC_MAPPER(FN) \
523536
FN(unspec), \
@@ -568,7 +581,8 @@ union bpf_attr {
568581
FN(probe_read_str), \
569582
FN(get_socket_cookie), \
570583
FN(get_socket_uid), \
571-
FN(set_hash),
584+
FN(set_hash), \
585+
FN(setsockopt),
572586

573587
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
574588
* function eBPF program intends to call
@@ -720,4 +734,56 @@ struct bpf_map_info {
720734
__u32 map_flags;
721735
} __attribute__((aligned(8)));
722736

737+
/* User bpf_sock_ops struct to access socket values and specify request ops
738+
* and their replies.
739+
* Some of these fields are in network (bigendian) byte order and may need
740+
* to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h).
741+
* New fields can only be added at the end of this structure
742+
*/
743+
struct bpf_sock_ops {
744+
__u32 op;
745+
union {
746+
__u32 reply;
747+
__u32 replylong[4];
748+
};
749+
__u32 family;
750+
__u32 remote_ip4; /* Stored in network byte order */
751+
__u32 local_ip4; /* Stored in network byte order */
752+
__u32 remote_ip6[4]; /* Stored in network byte order */
753+
__u32 local_ip6[4]; /* Stored in network byte order */
754+
__u32 remote_port; /* Stored in network byte order */
755+
__u32 local_port; /* stored in host byte order */
756+
};
757+
758+
/* List of known BPF sock_ops operators.
759+
* New entries can only be added at the end
760+
*/
761+
enum {
762+
BPF_SOCK_OPS_VOID,
763+
BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or
764+
* -1 if default value should be used
765+
*/
766+
BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized
767+
* window (in packets) or -1 if default
768+
* value should be used
769+
*/
770+
BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an
771+
* active connection is initialized
772+
*/
773+
BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an
774+
* active connection is
775+
* established
776+
*/
777+
BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a
778+
* passive connection is
779+
* established
780+
*/
781+
BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control
782+
* needs ECN
783+
*/
784+
};
785+
786+
#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
787+
#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */
788+
723789
#endif /* _UAPI__LINUX_BPF_H__ */

kernel/bpf/cgroup.c

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
236236
return ret;
237237
}
238238
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
239+
240+
/**
241+
* __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
242+
* @sk: socket to get cgroup from
243+
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
244+
* sk with connection information (IP addresses, etc.) May not contain
245+
* cgroup info if it is a req sock.
246+
* @type: The type of program to be executed
247+
*
248+
* socket passed is expected to be of type INET or INET6.
249+
*
250+
* The program type passed in via @type must be suitable for sock_ops
251+
* filtering. No further check is performed to assert that.
252+
*
253+
* This function will return %-EPERM if an attached program was found
254+
* and if it returned != 1 during execution. In all other cases, 0 is returned.
255+
*/
256+
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
257+
struct bpf_sock_ops_kern *sock_ops,
258+
enum bpf_attach_type type)
259+
{
260+
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
261+
struct bpf_prog *prog;
262+
int ret = 0;
263+
264+
265+
rcu_read_lock();
266+
267+
prog = rcu_dereference(cgrp->bpf.effective[type]);
268+
if (prog)
269+
ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
270+
271+
rcu_read_unlock();
272+
273+
return ret;
274+
}
275+
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);

kernel/bpf/syscall.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1079,6 +1079,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
10791079
case BPF_CGROUP_INET_SOCK_CREATE:
10801080
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
10811081
break;
1082+
case BPF_CGROUP_SOCK_OPS:
1083+
ptype = BPF_PROG_TYPE_SOCK_OPS;
1084+
break;
10821085
default:
10831086
return -EINVAL;
10841087
}
@@ -1119,6 +1122,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
11191122
case BPF_CGROUP_INET_INGRESS:
11201123
case BPF_CGROUP_INET_EGRESS:
11211124
case BPF_CGROUP_INET_SOCK_CREATE:
1125+
case BPF_CGROUP_SOCK_OPS:
11221126
cgrp = cgroup_get_from_fd(attr->target_fd);
11231127
if (IS_ERR(cgrp))
11241128
return PTR_ERR(cgrp);
@@ -1133,6 +1137,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
11331137

11341138
return ret;
11351139
}
1140+
11361141
#endif /* CONFIG_CGROUP_BPF */
11371142

11381143
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration

0 commit comments

Comments
 (0)