
Commit 2c8cec5

ipv4: Cache learned PMTU information in inetpeer.
The general idea is that if we learn new PMTU information, we
bump the peer genid.  This triggers the dst_ops->check() code
to validate and if necessary propagate the new PMTU value into
the metrics.

Learned PMTU information self-expires.  This means that it is
not necessary to kill a cached route entry just because the
PMTU information is too old.

As a consequence:

1) When the path appears unreachable (dst_ops->link_failure or
   dst_ops->negative_advice) we unwind the PMTU state if it is
   out of date, instead of killing the cached route.  A redirected
   route will still be invalidated in these situations.

2) rt_check_expire(), rt_worker_func(), et al. are no longer
   necessary at all.

Signed-off-by: David S. Miller <[email protected]>
1 parent d606ef3 commit 2c8cec5
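
For orientation, the generation-count scheme the message describes can be
condensed into a small, self-contained userspace C sketch. This is not the
kernel code: the names (peer, cached_route, learn_pmtu, check_route) are
illustrative stand-ins for inet_peer/rtable machinery, the wall-clock expiry
stands in for ip_rt_mtu_expires, and the expiry unwind that the patch performs
in negative_advice()/link_failure() is folded into the check path for brevity.

	/* sketch.c - generation-count PMTU caching, modelled on this commit.
	 * All names are illustrative, not the kernel's.
	 */
	#include <stdatomic.h>
	#include <stdio.h>
	#include <time.h>

	struct peer {				/* stands in for struct inet_peer */
		unsigned int pmtu_learned;	/* PMTU learned from ICMP */
		unsigned int pmtu_orig;		/* metric to restore on expiry */
		time_t pmtu_expires;		/* 0 => nothing learned */
	};

	struct cached_route {			/* stands in for struct rtable */
		struct peer *peer;
		unsigned int mtu;		/* the RTAX_MTU-like metric */
		unsigned int peer_genid;	/* snapshot of the global genid */
	};

	static atomic_uint rt_peer_genid;	/* stands in for __rt_peer_genid */

	/* Learning side: record the PMTU and bump the global generation. */
	static void learn_pmtu(struct peer *p, unsigned int mtu, time_t now)
	{
		p->pmtu_learned = mtu;
		p->pmtu_expires = now + 600;	/* self-expires, ~ip_rt_mtu_expires */
		atomic_fetch_add(&rt_peer_genid, 1);
	}

	/* Check side: dst_ops->check()-style revalidation.  The expiry unwind,
	 * done in negative_advice()/link_failure() in the real patch, is
	 * folded in here for brevity.
	 */
	static void check_route(struct cached_route *rt, time_t now)
	{
		struct peer *p = rt->peer;

		if (!p->pmtu_expires)
			return;				/* nothing learned */

		if (now < p->pmtu_expires) {
			/* Still valid: propagate if the genid moved and the
			 * learned PMTU narrows the path.
			 */
			if (rt->peer_genid != atomic_load(&rt_peer_genid) &&
			    p->pmtu_learned < rt->mtu) {
				if (!p->pmtu_orig)
					p->pmtu_orig = rt->mtu;
				rt->mtu = p->pmtu_learned;
			}
		} else {
			/* Self-expired: unwind the metric, keep the route. */
			p->pmtu_expires = 0;
			rt->mtu = p->pmtu_orig;
		}
		rt->peer_genid = atomic_load(&rt_peer_genid);
	}

	int main(void)
	{
		struct peer p = { 0 };
		struct cached_route rt = { .peer = &p, .mtu = 1500 };
		time_t now = time(NULL);

		learn_pmtu(&p, 1400, now);
		check_route(&rt, now);
		printf("mtu after learning: %u\n", rt.mtu);	/* 1400 */

		check_route(&rt, now + 1200);			/* past expiry */
		printf("mtu after expiry:   %u\n", rt.mtu);	/* 1500 */
		return 0;
	}

In the patch itself these responsibilities are split: ip_rt_update_pmtu() and
ip_rt_frag_needed() are the learning side, check_peer_pmtu() called from
ipv4_dst_check() is the propagation side, and ipv4_negative_advice() /
ipv4_link_failure() perform the unwind.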


net/ipv4/route.c

Lines changed: 86 additions & 174 deletions
@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly = 256;
 static int rt_chain_length_max __read_mostly = 20;
 
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
-
 /*
  * Interface to generic destination cache.
  */
@@ -668,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
 static inline int rt_valuable(struct rtable *rth)
 {
 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		rth->dst.expires;
+		(rth->peer && rth->peer->pmtu_expires);
 }
 
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -679,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
 	if (atomic_read(&rth->dst.__refcnt))
 		goto out;
 
-	ret = 1;
-	if (rth->dst.expires &&
-	    time_after_eq(jiffies, rth->dst.expires))
-		goto out;
-
 	age = jiffies - rth->dst.lastuse;
-	ret = 0;
 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 	    (age <= tmo2 && rt_valuable(rth)))
 		goto out;
@@ -829,97 +820,6 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
 	return ONE;
 }
 
-static void rt_check_expire(void)
-{
-	static unsigned int rover;
-	unsigned int i = rover, goal;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long samples = 0;
-	unsigned long sum = 0, sum2 = 0;
-	unsigned long delta;
-	u64 mult;
-
-	delta = jiffies - expires_ljiffies;
-	expires_ljiffies = jiffies;
-	mult = ((u64)delta) << rt_hash_log;
-	if (ip_rt_gc_timeout > 1)
-		do_div(mult, ip_rt_gc_timeout);
-	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask)
-		goal = rt_hash_mask + 1;
-	for (; goal > 0; goal--) {
-		unsigned long tmo = ip_rt_gc_timeout;
-		unsigned long length;
-
-		i = (i + 1) & rt_hash_mask;
-		rthp = &rt_hash_table[i].chain;
-
-		if (need_resched())
-			cond_resched();
-
-		samples++;
-
-		if (rcu_dereference_raw(*rthp) == NULL)
-			continue;
-		length = 0;
-		spin_lock_bh(rt_hash_lock_addr(i));
-		while ((rth = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
-			prefetch(rth->dst.rt_next);
-			if (rt_is_expired(rth)) {
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				continue;
-			}
-			if (rth->dst.expires) {
-				/* Entry is expired even if it is in use */
-				if (time_before_eq(jiffies, rth->dst.expires)) {
-nofree:
-					tmo >>= 1;
-					rthp = &rth->dst.rt_next;
-					/*
-					 * We only count entries on
-					 * a chain with equal hash inputs once
-					 * so that entries for different QOS
-					 * levels, and other non-hash input
-					 * attributes don't unfairly skew
-					 * the length computation
-					 */
-					length += has_noalias(rt_hash_table[i].chain, rth);
-					continue;
-				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
-				goto nofree;
-
-			/* Cleanup aged off entries. */
-			*rthp = rth->dst.rt_next;
-			rt_free(rth);
-		}
-		spin_unlock_bh(rt_hash_lock_addr(i));
-		sum += length;
-		sum2 += length*length;
-	}
-	if (samples) {
-		unsigned long avg = sum / samples;
-		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
-		rt_chain_length_max = max_t(unsigned long,
-					ip_rt_gc_elasticity,
-					(avg + 4*sd) >> FRACT_BITS);
-	}
-	rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
-	rt_check_expire();
-	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
 /*
  * Pertubation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -1535,9 +1435,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 		if (dst->obsolete > 0) {
 			ip_rt_put(rt);
 			ret = NULL;
-		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
-			   (rt->dst.expires &&
-			    time_after_eq(jiffies, rt->dst.expires))) {
+		} else if (rt->rt_flags & RTCF_REDIRECTED) {
 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
 						rt->fl.oif,
 						rt_genid(dev_net(dst->dev)));
@@ -1547,6 +1445,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 #endif
 			rt_del(hash, rt);
 			ret = NULL;
+		} else if (rt->peer &&
+			   rt->peer->pmtu_expires &&
+			   time_after_eq(jiffies, rt->peer->pmtu_expires)) {
+			unsigned long orig = rt->peer->pmtu_expires;
+
+			if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+				dst_metric_set(dst, RTAX_MTU,
+					       rt->peer->pmtu_orig);
 		}
 	}
 	return ret;
@@ -1697,80 +1603,78 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 				 unsigned short new_mtu,
 				 struct net_device *dev)
 {
-	int i, k;
 	unsigned short old_mtu = ntohs(iph->tot_len);
-	struct rtable *rth;
-	int ikeys[2] = { dev->ifindex, 0 };
-	__be32 skeys[2] = { iph->saddr, 0, };
-	__be32 daddr = iph->daddr;
 	unsigned short est_mtu = 0;
+	struct inet_peer *peer;
 
-	for (k = 0; k < 2; k++) {
-		for (i = 0; i < 2; i++) {
-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
-						rt_genid(net));
-
-			rcu_read_lock();
-			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-			     rth = rcu_dereference(rth->dst.rt_next)) {
-				unsigned short mtu = new_mtu;
+	peer = inet_getpeer_v4(iph->daddr, 1);
+	if (peer) {
+		unsigned short mtu = new_mtu;
 
-				if (rth->fl.fl4_dst != daddr ||
-				    rth->fl.fl4_src != skeys[i] ||
-				    rth->rt_dst != daddr ||
-				    rth->rt_src != iph->saddr ||
-				    rth->fl.oif != ikeys[k] ||
-				    rt_is_input_route(rth) ||
-				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
-				    !net_eq(dev_net(rth->dst.dev), net) ||
-				    rt_is_expired(rth))
-					continue;
+		if (new_mtu < 68 || new_mtu >= old_mtu) {
+			/* BSD 4.2 derived systems incorrectly adjust
+			 * tot_len by the IP header length, and report
+			 * a zero MTU in the ICMP message.
+			 */
+			if (mtu == 0 &&
+			    old_mtu >= 68 + (iph->ihl << 2))
+				old_mtu -= iph->ihl << 2;
+			mtu = guess_mtu(old_mtu);
+		}
 
-				if (new_mtu < 68 || new_mtu >= old_mtu) {
+		if (mtu < ip_rt_min_pmtu)
+			mtu = ip_rt_min_pmtu;
+		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+			est_mtu = mtu;
+			peer->pmtu_learned = mtu;
+			peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
+		}
 
-					/* BSD 4.2 compatibility hack :-( */
-					if (mtu == 0 &&
-					    old_mtu >= dst_mtu(&rth->dst) &&
-					    old_mtu >= 68 + (iph->ihl << 2))
-						old_mtu -= iph->ihl << 2;
+		inet_putpeer(peer);
 
-					mtu = guess_mtu(old_mtu);
-				}
-				if (mtu <= dst_mtu(&rth->dst)) {
-					if (mtu < dst_mtu(&rth->dst)) {
-						dst_confirm(&rth->dst);
-						if (mtu < ip_rt_min_pmtu) {
-							u32 lock = dst_metric(&rth->dst,
-									      RTAX_LOCK);
-							mtu = ip_rt_min_pmtu;
-							lock |= (1 << RTAX_MTU);
-							dst_metric_set(&rth->dst, RTAX_LOCK,
-								       lock);
-						}
-						dst_metric_set(&rth->dst, RTAX_MTU, mtu);
-						dst_set_expires(&rth->dst,
-								ip_rt_mtu_expires);
-					}
-					est_mtu = mtu;
-				}
-			}
-			rcu_read_unlock();
-		}
+		atomic_inc(&__rt_peer_genid);
 	}
 	return est_mtu ? : new_mtu;
 }
 
+static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
+{
+	unsigned long expires = peer->pmtu_expires;
+
+	if (time_before(expires, jiffies)) {
+		u32 orig_dst_mtu = dst_mtu(dst);
+		if (peer->pmtu_learned < orig_dst_mtu) {
+			if (!peer->pmtu_orig)
+				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
+			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
+		}
+	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
+		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
+}
+
 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
-	if (dst_mtu(dst) > mtu && mtu >= 68 &&
-	    !(dst_metric_locked(dst, RTAX_MTU))) {
-		if (mtu < ip_rt_min_pmtu) {
-			u32 lock = dst_metric(dst, RTAX_LOCK);
+	struct rtable *rt = (struct rtable *) dst;
+	struct inet_peer *peer;
+
+	dst_confirm(dst);
+
+	if (!rt->peer)
+		rt_bind_peer(rt, 1);
+	peer = rt->peer;
+	if (peer) {
+		if (mtu < ip_rt_min_pmtu)
 			mtu = ip_rt_min_pmtu;
-			dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
+		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+			peer->pmtu_learned = mtu;
+			peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
+
+			atomic_inc(&__rt_peer_genid);
+			rt->rt_peer_genid = rt_peer_genid();
+
+			check_peer_pmtu(dst, peer);
 		}
-		dst_metric_set(dst, RTAX_MTU, mtu);
-		dst_set_expires(dst, ip_rt_mtu_expires);
+		inet_putpeer(peer);
 	}
 }
 
@@ -1781,9 +1685,15 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 	if (rt_is_expired(rt))
 		return NULL;
 	if (rt->rt_peer_genid != rt_peer_genid()) {
+		struct inet_peer *peer;
+
 		if (!rt->peer)
 			rt_bind_peer(rt, 0);
 
+		peer = rt->peer;
+		if (peer && peer->pmtu_expires)
+			check_peer_pmtu(dst, peer);
+
 		rt->rt_peer_genid = rt_peer_genid();
 	}
 	return dst;
@@ -1812,8 +1722,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
 	rt = skb_rtable(skb);
-	if (rt)
-		dst_set_expires(&rt->dst, 0);
+	if (rt &&
+	    rt->peer &&
+	    rt->peer->pmtu_expires) {
+		unsigned long orig = rt->peer->pmtu_expires;
+
+		if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+			dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
+	}
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -1911,6 +1827,9 @@ static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
 		memcpy(peer->metrics, fi->fib_metrics,
 		       sizeof(u32) * RTAX_MAX);
 		dst_init_metrics(&rt->dst, peer->metrics, false);
+
+		if (peer->pmtu_expires)
+			check_peer_pmtu(&rt->dst, peer);
 	} else {
 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
 			rt->fi = fi;
@@ -2961,7 +2880,8 @@ static int rt_fill_info(struct net *net,
 		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
 
 	error = rt->dst.error;
-	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
+	expires = (rt->peer && rt->peer->pmtu_expires) ?
+		rt->peer->pmtu_expires - jiffies : 0;
 	if (rt->peer) {
 		inet_peer_refcheck(rt->peer);
 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
@@ -3418,14 +3338,6 @@ int __init ip_rt_init(void)
 	devinet_init();
 	ip_fib_init();
 
-	/* All the timers, started at system startup tend
-	   to synchronize. Perturb it a bit.
-	 */
-	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
-	expires_ljiffies = jiffies;
-	schedule_delayed_work(&expires_work,
-		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
 	if (ip_rt_proc_init())
 		printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
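
One detail worth calling out in the hunks above: all three unwind sites
(ipv4_negative_advice(), ipv4_link_failure(), and check_peer_pmtu()) clear
peer->pmtu_expires with cmpxchg() rather than a plain store, so that when
several paths race to notice an expired PMTU, exactly one of them restores
the original metric. A minimal sketch of that idiom, assuming C11 atomics in
place of the kernel's cmpxchg() (the helper name is hypothetical):

	#include <stdatomic.h>
	#include <stdbool.h>

	/* unwind_pmtu_once() is a hypothetical helper, not from the patch. */
	static bool unwind_pmtu_once(_Atomic unsigned long *pmtu_expires,
				     unsigned int *mtu, unsigned int pmtu_orig)
	{
		unsigned long orig = atomic_load(pmtu_expires);

		/* Only the caller that swaps the old value for 0 restores the
		 * metric; this mirrors cmpxchg(&peer->pmtu_expires, orig, 0).
		 */
		if (orig && atomic_compare_exchange_strong(pmtu_expires, &orig, 0)) {
			*mtu = pmtu_orig;	/* ~ dst_metric_set(dst, RTAX_MTU, ...) */
			return true;
		}
		return false;			/* another path already unwound it */
	}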
