Skip to content

Commit ca6fb06

Browse files
Eric Dumazet authored and davem330 committed
tcp: attach SYNACK messages to request sockets instead of listener
If a listen backlog is very big (to avoid syncookies), then the listener sk->sk_wmem_alloc is the main source of false sharing, as we need to touch it twice per SYNACK re-transmit and TX completion. (One SYN packet takes listener lock once, but up to 6 SYNACK are generated) By attaching the skb to the request socket, we remove this source of contention. Tested: listen(fd, 10485760); // single listener (no SO_REUSEPORT) 16 RX/TX queue NIC Sustain a SYNFLOOD attack of ~320,000 SYN per second, Sending ~1,400,000 SYNACK per second. Perf profiles now show listener spinlock being next bottleneck. 20.29% [kernel] [k] queued_spin_lock_slowpath 10.06% [kernel] [k] __inet_lookup_established 5.12% [kernel] [k] reqsk_timer_handler 3.22% [kernel] [k] get_next_timer_interrupt 3.00% [kernel] [k] tcp_make_synack 2.77% [kernel] [k] ipt_do_table 2.70% [kernel] [k] run_timer_softirq 2.50% [kernel] [k] ip_finish_output 2.04% [kernel] [k] cascade Signed-off-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 1b33bc3 commit ca6fb06

File tree

8 files changed

+47
-32
lines changed

8 files changed

+47
-32
lines changed

include/net/tcp.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
462462
int tcp_connect(struct sock *sk);
463463
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
464464
struct request_sock *req,
465-
struct tcp_fastopen_cookie *foc);
465+
struct tcp_fastopen_cookie *foc,
466+
bool attach_req);
466467
int tcp_disconnect(struct sock *sk, int flags);
467468

468469
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
@@ -1715,7 +1716,8 @@ struct tcp_request_sock_ops {
17151716
__u32 (*init_seq)(const struct sk_buff *skb);
17161717
int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
17171718
struct flowi *fl, struct request_sock *req,
1718-
u16 queue_mapping, struct tcp_fastopen_cookie *foc);
1719+
u16 queue_mapping, struct tcp_fastopen_cookie *foc,
1720+
bool attach_req);
17191721
};
17201722

17211723
#ifdef CONFIG_SYN_COOKIES

net/ipv4/inet_connection_sock.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -628,7 +628,7 @@ static void reqsk_queue_hash_req(struct request_sock *req,
628628
* are committed to memory and refcnt initialized.
629629
*/
630630
smp_wmb();
631-
atomic_set(&req->rsk_refcnt, 2);
631+
atomic_set(&req->rsk_refcnt, 2 + 1);
632632
}
633633

634634
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,

net/ipv4/tcp_fastopen.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,13 +161,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
161161
tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
162162

163163
/* Activate the retrans timer so that SYNACK can be retransmitted.
164-
* The request socket is not added to the SYN table of the parent
164+
* The request socket is not added to the ehash
165165
* because it's been added to the accept queue directly.
166166
*/
167167
inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
168168
TCP_TIMEOUT_INIT, TCP_RTO_MAX);
169169

170-
atomic_set(&req->rsk_refcnt, 1);
170+
atomic_set(&req->rsk_refcnt, 2);
171171
/* Add the child socket directly into the accept queue */
172172
inet_csk_reqsk_queue_add(sk, req, child);
173173

net/ipv4/tcp_input.c

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6120,8 +6120,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
61206120
struct request_sock *req;
61216121
bool want_cookie = false;
61226122
struct flowi fl;
6123-
int err;
6124-
61256123

61266124
/* TW buckets are converted to open requests without
61276125
* limitations, they conserve resources and peer is
@@ -6230,21 +6228,24 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
62306228
tcp_rsk(req)->snt_isn = isn;
62316229
tcp_rsk(req)->txhash = net_tx_rndhash();
62326230
tcp_openreq_init_rwin(req, sk, dst);
6233-
if (!want_cookie)
6231+
if (!want_cookie) {
62346232
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
6235-
err = af_ops->send_synack(fastopen_sk ?: sk, dst, &fl, req,
6236-
skb_get_queue_mapping(skb), &foc);
6233+
tcp_reqsk_record_syn(sk, req, skb);
6234+
}
62376235
if (fastopen_sk) {
6236+
af_ops->send_synack(fastopen_sk, dst, &fl, req,
6237+
skb_get_queue_mapping(skb), &foc, false);
62386238
sock_put(fastopen_sk);
62396239
} else {
6240-
if (err || want_cookie)
6241-
goto drop_and_free;
6242-
62436240
tcp_rsk(req)->tfo_listener = false;
6244-
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6241+
if (!want_cookie)
6242+
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6243+
af_ops->send_synack(sk, dst, &fl, req,
6244+
skb_get_queue_mapping(skb), &foc, !want_cookie);
6245+
if (want_cookie)
6246+
goto drop_and_free;
62456247
}
6246-
tcp_reqsk_record_syn(sk, req, skb);
6247-
6248+
reqsk_put(req);
62486249
return 0;
62496250

62506251
drop_and_release:

net/ipv4/tcp_ipv4.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -822,7 +822,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
822822
struct flowi *fl,
823823
struct request_sock *req,
824824
u16 queue_mapping,
825-
struct tcp_fastopen_cookie *foc)
825+
struct tcp_fastopen_cookie *foc,
826+
bool attach_req)
826827
{
827828
const struct inet_request_sock *ireq = inet_rsk(req);
828829
struct flowi4 fl4;
@@ -833,7 +834,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
833834
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
834835
return -1;
835836

836-
skb = tcp_make_synack(sk, dst, req, foc);
837+
skb = tcp_make_synack(sk, dst, req, foc, attach_req);
837838

838839
if (skb) {
839840
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

net/ipv4/tcp_output.c

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2947,7 +2947,8 @@ int tcp_send_synack(struct sock *sk)
29472947
*/
29482948
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
29492949
struct request_sock *req,
2950-
struct tcp_fastopen_cookie *foc)
2950+
struct tcp_fastopen_cookie *foc,
2951+
bool attach_req)
29512952
{
29522953
struct inet_request_sock *ireq = inet_rsk(req);
29532954
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2959,18 +2960,25 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
29592960
u16 user_mss;
29602961
int mss;
29612962

2962-
/* sk is a const pointer, because we want to express multiple cpus
2963-
* might call us concurrently.
2964-
* sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way.
2965-
*/
2966-
skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
2963+
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
29672964
if (unlikely(!skb)) {
29682965
dst_release(dst);
29692966
return NULL;
29702967
}
29712968
/* Reserve space for headers. */
29722969
skb_reserve(skb, MAX_TCP_HEADER);
29732970

2971+
if (attach_req) {
2972+
skb->destructor = sock_edemux;
2973+
sock_hold(req_to_sk(req));
2974+
skb->sk = req_to_sk(req);
2975+
} else {
2976+
/* sk is a const pointer, because we want to express multiple
2977+
* cpu might call us concurrently.
2978+
* sk->sk_wmem_alloc in an atomic, we can promote to rw.
2979+
*/
2980+
skb_set_owner_w(skb, (struct sock *)sk);
2981+
}
29742982
skb_dst_set(skb, dst);
29752983

29762984
mss = dst_metric_advmss(dst);
@@ -3510,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
35103518
int res;
35113519

35123520
tcp_rsk(req)->txhash = net_tx_rndhash();
3513-
res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
3521+
res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true);
35143522
if (!res) {
35153523
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
35163524
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);

net/ipv6/tcp_ipv6.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
438438
struct flowi *fl,
439439
struct request_sock *req,
440440
u16 queue_mapping,
441-
struct tcp_fastopen_cookie *foc)
441+
struct tcp_fastopen_cookie *foc,
442+
bool attach_req)
442443
{
443444
struct inet_request_sock *ireq = inet_rsk(req);
444445
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
451452
IPPROTO_TCP)) == NULL)
452453
goto done;
453454

454-
skb = tcp_make_synack(sk, dst, req, foc);
455+
skb = tcp_make_synack(sk, dst, req, foc, attach_req);
455456

456457
if (skb) {
457458
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,

net/sched/sch_fq.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -224,13 +224,15 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
224224
if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
225225
return &q->internal;
226226

227-
/* SYNACK messages are attached to a listener socket.
228-
* 1) They are not part of a 'flow' yet
229-
* 2) We do not want to rate limit them (eg SYNFLOOD attack),
227+
/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
228+
* 1) request sockets are not full blown,
229+
* they do not contain sk_pacing_rate
230+
* 2) They are not part of a 'flow' yet
231+
* 3) We do not want to rate limit them (eg SYNFLOOD attack),
230232
* especially if the listener set SO_MAX_PACING_RATE
231-
* 3) We pretend they are orphaned
233+
* 4) We pretend they are orphaned
232234
*/
233-
if (!sk || sk->sk_state == TCP_LISTEN) {
235+
if (!sk || sk->sk_state == TCP_NEW_SYN_RECV) {
234236
unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
235237

236238
/* By forcing low order bit to 1, we make sure to not

0 commit comments

Comments (0)