Skip to content

Commit 789f558

Browse files
edumazet authored and davem330 committed
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago, when memory was expensive and machines had a single processor.

This does not scale; the code is ugly and a source of huge latencies (typically 30 ms have been seen, with CPUs spinning on the death_lock spinlock).

We can afford to use an extra 64 bytes per timewait sock and spread the timewait load across all CPUs to get better behavior.

Tested:

On the following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1 on the target (lpaa24).

Before patch:

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171

While the test is running, we can observe 25 or even 33 ms latencies.

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2

After patch:

About 90% increase of throughput:

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992

And latencies are kept to minimal values during this load, even though network utilization is 90% higher:

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms

Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 20a1d16 commit 789f558

File tree

11 files changed

+69
-386
lines changed

11 files changed

+69
-386
lines changed

include/net/inet_timewait_sock.h

Lines changed: 9 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -31,67 +31,14 @@
3131

3232
struct inet_hashinfo;
3333

34-
#define INET_TWDR_RECYCLE_SLOTS_LOG 5
35-
#define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG)
36-
37-
/*
38-
* If time > 4sec, it is "slow" path, no recycling is required,
39-
* so that we select tick to get range about 4 seconds.
40-
*/
41-
#if HZ <= 16 || HZ > 4096
42-
# error Unsupported: HZ <= 16 or HZ > 4096
43-
#elif HZ <= 32
44-
# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
45-
#elif HZ <= 64
46-
# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
47-
#elif HZ <= 128
48-
# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
49-
#elif HZ <= 256
50-
# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
51-
#elif HZ <= 512
52-
# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
53-
#elif HZ <= 1024
54-
# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
55-
#elif HZ <= 2048
56-
# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
57-
#else
58-
# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
59-
#endif
60-
61-
static inline u32 inet_tw_time_stamp(void)
62-
{
63-
return jiffies;
64-
}
65-
66-
/* TIME_WAIT reaping mechanism. */
67-
#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
68-
69-
#define INET_TWDR_TWKILL_QUOTA 100
70-
7134
struct inet_timewait_death_row {
72-
/* Short-time timewait calendar */
73-
int twcal_hand;
74-
unsigned long twcal_jiffie;
75-
struct timer_list twcal_timer;
76-
struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS];
77-
78-
spinlock_t death_lock;
79-
int tw_count;
80-
int period;
81-
u32 thread_slots;
82-
struct work_struct twkill_work;
83-
struct timer_list tw_timer;
84-
int slot;
85-
struct hlist_head cells[INET_TWDR_TWKILL_SLOTS];
86-
struct inet_hashinfo *hashinfo;
35+
atomic_t tw_count;
36+
37+
struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp;
8738
int sysctl_tw_recycle;
8839
int sysctl_max_tw_buckets;
8940
};
9041

91-
void inet_twdr_hangman(unsigned long data);
92-
void inet_twdr_twkill_work(struct work_struct *work);
93-
void inet_twdr_twcal_tick(unsigned long data);
94-
9542
struct inet_bind_bucket;
9643

9744
/*
@@ -133,52 +80,18 @@ struct inet_timewait_sock {
13380
__be16 tw_sport;
13481
kmemcheck_bitfield_begin(flags);
13582
/* And these are ours. */
136-
unsigned int tw_pad0 : 1, /* 1 bit hole */
83+
unsigned int tw_kill : 1,
13784
tw_transparent : 1,
13885
tw_flowlabel : 20,
13986
tw_pad : 2, /* 2 bits hole */
14087
tw_tos : 8;
14188
kmemcheck_bitfield_end(flags);
142-
u32 tw_ttd;
89+
struct timer_list tw_timer;
14390
struct inet_bind_bucket *tw_tb;
144-
struct hlist_node tw_death_node;
91+
struct inet_timewait_death_row *tw_dr;
14592
};
14693
#define tw_tclass tw_tos
14794

148-
static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
149-
{
150-
return !hlist_unhashed(&tw->tw_death_node);
151-
}
152-
153-
static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw)
154-
{
155-
tw->tw_death_node.pprev = NULL;
156-
}
157-
158-
static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
159-
{
160-
__hlist_del(&tw->tw_death_node);
161-
inet_twsk_dead_node_init(tw);
162-
}
163-
164-
static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
165-
{
166-
if (inet_twsk_dead_hashed(tw)) {
167-
__inet_twsk_del_dead_node(tw);
168-
return 1;
169-
}
170-
return 0;
171-
}
172-
173-
#define inet_twsk_for_each(tw, node, head) \
174-
hlist_nulls_for_each_entry(tw, node, head, tw_node)
175-
176-
#define inet_twsk_for_each_inmate(tw, jail) \
177-
hlist_for_each_entry(tw, jail, tw_death_node)
178-
179-
#define inet_twsk_for_each_inmate_safe(tw, safe, jail) \
180-
hlist_for_each_entry_safe(tw, safe, jail, tw_death_node)
181-
18295
static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
18396
{
18497
return (struct inet_timewait_sock *)sk;
@@ -193,16 +106,14 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
193106
struct inet_hashinfo *hashinfo);
194107

195108
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
109+
struct inet_timewait_death_row *dr,
196110
const int state);
197111

198112
void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
199113
struct inet_hashinfo *hashinfo);
200114

201-
void inet_twsk_schedule(struct inet_timewait_sock *tw,
202-
struct inet_timewait_death_row *twdr,
203-
const int timeo, const int timewait_len);
204-
void inet_twsk_deschedule(struct inet_timewait_sock *tw,
205-
struct inet_timewait_death_row *twdr);
115+
void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo);
116+
void inet_twsk_deschedule(struct inet_timewait_sock *tw);
206117

207118
void inet_twsk_purge(struct inet_hashinfo *hashinfo,
208119
struct inet_timewait_death_row *twdr, int family);

net/dccp/minisocks.c

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,28 +27,16 @@
2727

2828
struct inet_timewait_death_row dccp_death_row = {
2929
.sysctl_max_tw_buckets = NR_FILE * 2,
30-
.period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
31-
.death_lock = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock),
3230
.hashinfo = &dccp_hashinfo,
33-
.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
34-
(unsigned long)&dccp_death_row),
35-
.twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work,
36-
inet_twdr_twkill_work),
37-
/* Short-time timewait calendar */
38-
39-
.twcal_hand = -1,
40-
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
41-
(unsigned long)&dccp_death_row),
4231
};
4332

4433
EXPORT_SYMBOL_GPL(dccp_death_row);
4534

4635
void dccp_time_wait(struct sock *sk, int state, int timeo)
4736
{
48-
struct inet_timewait_sock *tw = NULL;
37+
struct inet_timewait_sock *tw;
4938

50-
if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
51-
tw = inet_twsk_alloc(sk, state);
39+
tw = inet_twsk_alloc(sk, &dccp_death_row, state);
5240

5341
if (tw != NULL) {
5442
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -71,8 +59,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
7159
if (state == DCCP_TIME_WAIT)
7260
timeo = DCCP_TIMEWAIT_LEN;
7361

74-
inet_twsk_schedule(tw, &dccp_death_row, timeo,
75-
DCCP_TIMEWAIT_LEN);
62+
inet_twsk_schedule(tw, timeo);
7663
inet_twsk_put(tw);
7764
} else {
7865
/* Sorry, if we're out of memory, just CLOSE this

net/ipv4/inet_diag.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ static int inet_twsk_diag_fill(struct sock *sk,
248248
struct inet_timewait_sock *tw = inet_twsk(sk);
249249
struct inet_diag_msg *r;
250250
struct nlmsghdr *nlh;
251-
s32 tmo;
251+
long tmo;
252252

253253
nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
254254
nlmsg_flags);
@@ -258,7 +258,7 @@ static int inet_twsk_diag_fill(struct sock *sk,
258258
r = nlmsg_data(nlh);
259259
BUG_ON(tw->tw_state != TCP_TIME_WAIT);
260260

261-
tmo = tw->tw_ttd - inet_tw_time_stamp();
261+
tmo = tw->tw_timer.expires - jiffies;
262262
if (tmo < 0)
263263
tmo = 0;
264264

net/ipv4/inet_hashtables.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
388388
*twp = tw;
389389
} else if (tw) {
390390
/* Silly. Should hash-dance instead... */
391-
inet_twsk_deschedule(tw, death_row);
391+
inet_twsk_deschedule(tw);
392392

393393
inet_twsk_put(tw);
394394
}
@@ -565,7 +565,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
565565
spin_unlock(&head->lock);
566566

567567
if (tw) {
568-
inet_twsk_deschedule(tw, death_row);
568+
inet_twsk_deschedule(tw);
569569
while (twrefcnt) {
570570
twrefcnt--;
571571
inet_twsk_put(tw);

0 commit comments

Comments
 (0)