
Commit 9f2f27a

Merge branch 'icmp-reply-optimize'
Jesper Dangaard Brouer says:

====================
net: optimize ICMP-reply code path

This patchset optimizes the ICMP-reply code path for ICMP packets that get rate limited. A remote party can easily trigger this code path by sending packets to a port number with no listening service.

Generally, the patchset moves the sysctl_icmp_msgs_per_sec ratelimit check earlier in the code path and removes an allocation.

Use-case: the specific case where I experienced this as a bottleneck is sending UDP packets to a port with no listener, which results in the kernel replying with ICMP Destination Unreachable (type 3), Port Unreachable (code 3); that reply path is the bottleneck. After Eric and Paolo optimized the UDP socket code, the kernel's PPS processing capability is lower for no-listen ports than for normal UDP sockets. This is bad for capacity planning when restarting a service.

UDP no-listen benchmark, 8x CPUs, using pktgen_sample04_many_flows.sh:
  Baseline:  6.6 Mpps
  Patch:    14.7 Mpps
Driver: mlx5 at 50 Gbit/s.
====================

Signed-off-by: David S. Miller <[email protected]>
2 parents aaa9c10 + 7ba91ec commit 9f2f27a
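The no-listen use-case is easy to reproduce without pktgen: any UDP datagram sent to a port nobody listens on makes the receiving kernel generate the ICMP Destination Unreachable / Port Unreachable reply that this series rate-limits earlier. A minimal sketch of such a sender follows; the target address (192.0.2.1, from TEST-NET-1), port 9, and the send count are placeholders, not part of the patchset.

/* Sketch: fire UDP datagrams at a closed port so the receiver generates
 * ICMP type 3 (Destination Unreachable), code 3 (Port Unreachable) replies.
 * Target address, port and loop count are placeholders.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in dst;
        const char payload[] = "x";
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port = htons(9);                        /* assume no listener here */
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr); /* placeholder target */

        for (int i = 0; i < 1000000; i++)
                sendto(fd, payload, sizeof(payload), 0,
                       (struct sockaddr *)&dst, sizeof(dst));

        close(fd);
        return 0;
}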

File tree

2 files changed: +123, -70 lines


net/ipv4/icmp.c

Lines changed: 76 additions & 49 deletions
@@ -209,27 +209,25 @@ static struct sock *icmp_sk(struct net *net)
 	return *this_cpu_ptr(net->ipv4.icmp_sk);
 }
 
+/* Called with BH disabled */
 static inline struct sock *icmp_xmit_lock(struct net *net)
 {
 	struct sock *sk;
 
-	local_bh_disable();
-
 	sk = icmp_sk(net);
 
 	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
 		/* This can happen if the output path signals a
 		 * dst_link_failure() for an outgoing ICMP packet.
 		 */
-		local_bh_enable();
 		return NULL;
 	}
 	return sk;
 }
 
 static inline void icmp_xmit_unlock(struct sock *sk)
 {
-	spin_unlock_bh(&sk->sk_lock.slock);
+	spin_unlock(&sk->sk_lock.slock);
 }
 
 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -282,6 +280,33 @@ bool icmp_global_allow(void)
 }
 EXPORT_SYMBOL(icmp_global_allow);
 
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+	if (type > NR_ICMP_TYPES)
+		return true;
+
+	/* Don't limit PMTU discovery. */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		return true;
+
+	/* Limit if icmp type is enabled in ratemask. */
+	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+		return true;
+
+	return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+	if (icmpv4_mask_allow(net, type, code))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  *	Send an ICMP frame.
  */
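The new icmpv4_mask_allow() helper carries over the existing exemption logic: a type escapes rate limiting when it is out of range, when it is the PMTU-discovery case, or when its bit is clear in net.ipv4.icmp_ratemask. A small userspace sketch of the same bit test follows; it uses 0x1818, the default icmp_ratemask value (destination unreachable, source quench, time exceeded, parameter problem), purely as an example mask.

/* Sketch of the ratemask test performed by icmpv4_mask_allow().
 * 0x1818 is used here only as an example mask value.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_ICMP_TYPES		18
#define ICMP_DEST_UNREACH	3
#define ICMP_FRAG_NEEDED	4	/* code of "fragmentation needed" */

static bool mask_allow(unsigned int ratemask, int type, int code)
{
	if (type > NR_ICMP_TYPES)
		return true;			/* unknown type: never limited */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
		return true;			/* don't limit PMTU discovery */
	if (!((1 << type) & ratemask))
		return true;			/* type not selected by ratemask */
	return false;				/* subject to rate limiting */
}

int main(void)
{
	unsigned int ratemask = 0x1818;		/* example: types 3, 4, 11, 12 */

	printf("port unreachable limited: %d\n",
	       !mask_allow(ratemask, ICMP_DEST_UNREACH, 3));
	printf("echo reply       limited: %d\n",
	       !mask_allow(ratemask, 0, 0));
	return 0;
}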
@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 			       struct flowi4 *fl4, int type, int code)
 {
 	struct dst_entry *dst = &rt->dst;
+	struct inet_peer *peer;
 	bool rc = true;
+	int vif;
 
-	if (type > NR_ICMP_TYPES)
-		goto out;
-
-	/* Don't limit PMTU discovery. */
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+	if (icmpv4_mask_allow(net, type, code))
 		goto out;
 
 	/* No rate limit on loopback */
 	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
 		goto out;
 
-	/* Limit if icmp type is enabled in ratemask. */
-	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
-		goto out;
-
-	rc = false;
-	if (icmp_global_allow()) {
-		int vif = l3mdev_master_ifindex(dst->dev);
-		struct inet_peer *peer;
-
-		peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
-		rc = inet_peer_xrlim_allow(peer,
-					   net->ipv4.sysctl_icmp_ratelimit);
-		if (peer)
-			inet_putpeer(peer);
-	}
+	vif = l3mdev_master_ifindex(dst->dev);
+	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+	rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+	if (peer)
+		inet_putpeer(peer);
 out:
 	return rc;
 }
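With the global sysctl_icmp_msgs_per_sec check moved to the callers, icmpv4_xrlim_allow() now only applies the per-destination limit via inet_peer_xrlim_allow(), which grants credit to each inet_peer as time passes and charges net.ipv4.icmp_ratelimit worth of credit per reply. The following is a simplified, hedged userspace sketch of that token-bucket idea, not the kernel implementation; the field names, the burst factor of 6, and the millisecond clock are illustrative assumptions.

/* Simplified token-bucket sketch of the per-peer ratelimit idea:
 * credit accumulates with elapsed time, is capped at a small burst,
 * and each allowed reply spends `timeout_ms` worth of credit.
 * Field names and the burst factor are assumptions for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

struct peer {
	unsigned long rate_tokens;	/* accumulated credit, in ms */
	unsigned long rate_last;	/* time of last refill, in ms */
};

static bool xrlim_allow(struct peer *p, unsigned long now_ms,
			unsigned long timeout_ms)
{
	unsigned long burst = 6 * timeout_ms;	/* assumed burst factor */

	p->rate_tokens += now_ms - p->rate_last;
	if (p->rate_tokens > burst)
		p->rate_tokens = burst;
	p->rate_last = now_ms;

	if (p->rate_tokens >= timeout_ms) {
		p->rate_tokens -= timeout_ms;	/* spend credit, allow reply */
		return true;
	}
	return false;				/* rate limited */
}

int main(void)
{
	struct peer p = { .rate_tokens = 0, .rate_last = 0 };

	/* With a 1000 ms timeout, roughly one reply per second is allowed. */
	for (unsigned long t = 0; t <= 5000; t += 500)
		printf("t=%lums allow=%d\n", t, xrlim_allow(&p, t, 1000));
	return 0;
}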
@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct inet_sock *inet;
 	__be32 daddr, saddr;
 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
+	int type = icmp_param->data.icmph.type;
+	int code = icmp_param->data.icmph.code;
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
 
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
+
+	/* global icmp_msgs_per_sec */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_bh_enable;
+
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		return;
+		goto out_bh_enable;
 	inet = inet_sk(sk);
 
 	icmp_param->data.icmph.checksum = 0;
@@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
 		goto out_unlock;
-	if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
-			       icmp_param->data.icmph.code))
+	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 {
 	struct iphdr *iph;
 	int room;
-	struct icmp_bxm *icmp_param;
+	struct icmp_bxm icmp_param;
 	struct rtable *rt = skb_rtable(skb_in);
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
@@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 	}
 
-	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
-	if (!icmp_param)
-		return;
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_bh_enable;
 
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		goto out_free;
+		goto out_bh_enable;
 
 	/*
 	 *	Construct source address and options.
@@ -681,33 +707,34 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 					   iph->tos;
 	mark = IP4_REPLY_MARK(net, skb_in->mark);
 
-	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
+	if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
 		goto out_unlock;
 
 
 	/*
 	 *	Prepare data for ICMP header.
 	 */
 
-	icmp_param->data.icmph.type	 = type;
-	icmp_param->data.icmph.code	 = code;
-	icmp_param->data.icmph.un.gateway = info;
-	icmp_param->data.icmph.checksum	 = 0;
-	icmp_param->skb	  = skb_in;
-	icmp_param->offset = skb_network_offset(skb_in);
+	icmp_param.data.icmph.type	 = type;
+	icmp_param.data.icmph.code	 = code;
+	icmp_param.data.icmph.un.gateway = info;
+	icmp_param.data.icmph.checksum	 = 0;
+	icmp_param.skb	  = skb_in;
+	icmp_param.offset = skb_network_offset(skb_in);
 	inet_sk(sk)->tos = tos;
 	sk->sk_mark = mark;
 	ipc.addr = iph->saddr;
-	ipc.opt = &icmp_param->replyopts.opt;
+	ipc.opt = &icmp_param.replyopts.opt;
 	ipc.tx_flags = 0;
 	ipc.ttl = 0;
 	ipc.tos = -1;
 
 	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
-			       type, code, icmp_param);
+			       type, code, &icmp_param);
 	if (IS_ERR(rt))
 		goto out_unlock;
 
+	/* peer icmp_ratelimit */
 	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		goto ende;
 
@@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	room = dst_mtu(&rt->dst);
 	if (room > 576)
 		room = 576;
-	room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
+	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
 	room -= sizeof(struct icmphdr);
 
-	icmp_param->data_len = skb_in->len - icmp_param->offset;
-	if (icmp_param->data_len > room)
-		icmp_param->data_len = room;
-	icmp_param->head_len = sizeof(struct icmphdr);
+	icmp_param.data_len = skb_in->len - icmp_param.offset;
+	if (icmp_param.data_len > room)
+		icmp_param.data_len = room;
+	icmp_param.head_len = sizeof(struct icmphdr);
 
-	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+	icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
 ende:
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
-out_free:
-	kfree(icmp_param);
+out_bh_enable:
+	local_bh_enable();
 out:;
 }
 EXPORT_SYMBOL(icmp_send);
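The tunables involved in this path are the existing sysctls: net.ipv4.icmp_msgs_per_sec (the global limit now checked early by icmpv4_global_allow), and net.ipv4.icmp_ratelimit together with net.ipv4.icmp_ratemask (the per-destination limit applied by icmpv4_xrlim_allow). A small sketch that dumps their current values from procfs:

/* Sketch: print the ICMP ratelimit sysctls that govern this code path. */
#include <stdio.h>

static void dump(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	dump("/proc/sys/net/ipv4/icmp_msgs_per_sec");
	dump("/proc/sys/net/ipv4/icmp_ratelimit");
	dump("/proc/sys/net/ipv4/icmp_ratemask");
	return 0;
}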
