@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131
131
static int ip_rt_min_advmss __read_mostly = 256 ;
132
132
static int rt_chain_length_max __read_mostly = 20 ;
133
133
134
- static struct delayed_work expires_work ;
135
- static unsigned long expires_ljiffies ;
136
-
137
134
/*
138
135
* Interface to generic destination cache.
139
136
*/
@@ -668,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
668
665
static inline int rt_valuable (struct rtable * rth )
669
666
{
670
667
return (rth -> rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY )) ||
671
- rth -> dst . expires ;
668
+ ( rth -> peer && rth -> peer -> pmtu_expires ) ;
672
669
}
673
670
674
671
static int rt_may_expire (struct rtable * rth , unsigned long tmo1 , unsigned long tmo2 )
@@ -679,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
679
676
if (atomic_read (& rth -> dst .__refcnt ))
680
677
goto out ;
681
678
682
- ret = 1 ;
683
- if (rth -> dst .expires &&
684
- time_after_eq (jiffies , rth -> dst .expires ))
685
- goto out ;
686
-
687
679
age = jiffies - rth -> dst .lastuse ;
688
- ret = 0 ;
689
680
if ((age <= tmo1 && !rt_fast_clean (rth )) ||
690
681
(age <= tmo2 && rt_valuable (rth )))
691
682
goto out ;
@@ -829,97 +820,6 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
829
820
return ONE ;
830
821
}
831
822
832
- static void rt_check_expire (void )
833
- {
834
- static unsigned int rover ;
835
- unsigned int i = rover , goal ;
836
- struct rtable * rth ;
837
- struct rtable __rcu * * rthp ;
838
- unsigned long samples = 0 ;
839
- unsigned long sum = 0 , sum2 = 0 ;
840
- unsigned long delta ;
841
- u64 mult ;
842
-
843
- delta = jiffies - expires_ljiffies ;
844
- expires_ljiffies = jiffies ;
845
- mult = ((u64 )delta ) << rt_hash_log ;
846
- if (ip_rt_gc_timeout > 1 )
847
- do_div (mult , ip_rt_gc_timeout );
848
- goal = (unsigned int )mult ;
849
- if (goal > rt_hash_mask )
850
- goal = rt_hash_mask + 1 ;
851
- for (; goal > 0 ; goal -- ) {
852
- unsigned long tmo = ip_rt_gc_timeout ;
853
- unsigned long length ;
854
-
855
- i = (i + 1 ) & rt_hash_mask ;
856
- rthp = & rt_hash_table [i ].chain ;
857
-
858
- if (need_resched ())
859
- cond_resched ();
860
-
861
- samples ++ ;
862
-
863
- if (rcu_dereference_raw (* rthp ) == NULL )
864
- continue ;
865
- length = 0 ;
866
- spin_lock_bh (rt_hash_lock_addr (i ));
867
- while ((rth = rcu_dereference_protected (* rthp ,
868
- lockdep_is_held (rt_hash_lock_addr (i )))) != NULL ) {
869
- prefetch (rth -> dst .rt_next );
870
- if (rt_is_expired (rth )) {
871
- * rthp = rth -> dst .rt_next ;
872
- rt_free (rth );
873
- continue ;
874
- }
875
- if (rth -> dst .expires ) {
876
- /* Entry is expired even if it is in use */
877
- if (time_before_eq (jiffies , rth -> dst .expires )) {
878
- nofree :
879
- tmo >>= 1 ;
880
- rthp = & rth -> dst .rt_next ;
881
- /*
882
- * We only count entries on
883
- * a chain with equal hash inputs once
884
- * so that entries for different QOS
885
- * levels, and other non-hash input
886
- * attributes don't unfairly skew
887
- * the length computation
888
- */
889
- length += has_noalias (rt_hash_table [i ].chain , rth );
890
- continue ;
891
- }
892
- } else if (!rt_may_expire (rth , tmo , ip_rt_gc_timeout ))
893
- goto nofree ;
894
-
895
- /* Cleanup aged off entries. */
896
- * rthp = rth -> dst .rt_next ;
897
- rt_free (rth );
898
- }
899
- spin_unlock_bh (rt_hash_lock_addr (i ));
900
- sum += length ;
901
- sum2 += length * length ;
902
- }
903
- if (samples ) {
904
- unsigned long avg = sum / samples ;
905
- unsigned long sd = int_sqrt (sum2 / samples - avg * avg );
906
- rt_chain_length_max = max_t (unsigned long ,
907
- ip_rt_gc_elasticity ,
908
- (avg + 4 * sd ) >> FRACT_BITS );
909
- }
910
- rover = i ;
911
- }
912
-
913
- /*
914
- * rt_worker_func() is run in process context.
915
- * we call rt_check_expire() to scan part of the hash table
916
- */
917
- static void rt_worker_func (struct work_struct * work )
918
- {
919
- rt_check_expire ();
920
- schedule_delayed_work (& expires_work , ip_rt_gc_interval );
921
- }
922
-
923
823
/*
924
824
* Perturbation of rt_genid by a small quantity [1..256]
925
825
* Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -1535,9 +1435,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1535
1435
if (dst -> obsolete > 0 ) {
1536
1436
ip_rt_put (rt );
1537
1437
ret = NULL ;
1538
- } else if ((rt -> rt_flags & RTCF_REDIRECTED ) ||
1539
- (rt -> dst .expires &&
1540
- time_after_eq (jiffies , rt -> dst .expires ))) {
1438
+ } else if (rt -> rt_flags & RTCF_REDIRECTED ) {
1541
1439
unsigned hash = rt_hash (rt -> fl .fl4_dst , rt -> fl .fl4_src ,
1542
1440
rt -> fl .oif ,
1543
1441
rt_genid (dev_net (dst -> dev )));
@@ -1547,6 +1445,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1547
1445
#endif
1548
1446
rt_del (hash , rt );
1549
1447
ret = NULL ;
1448
+ } else if (rt -> peer &&
1449
+ rt -> peer -> pmtu_expires &&
1450
+ time_after_eq (jiffies , rt -> peer -> pmtu_expires )) {
1451
+ unsigned long orig = rt -> peer -> pmtu_expires ;
1452
+
1453
+ if (cmpxchg (& rt -> peer -> pmtu_expires , orig , 0 ) == orig )
1454
+ dst_metric_set (dst , RTAX_MTU ,
1455
+ rt -> peer -> pmtu_orig );
1550
1456
}
1551
1457
}
1552
1458
return ret ;
@@ -1697,80 +1603,78 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1697
1603
unsigned short new_mtu ,
1698
1604
struct net_device * dev )
1699
1605
{
1700
- int i , k ;
1701
1606
unsigned short old_mtu = ntohs (iph -> tot_len );
1702
- struct rtable * rth ;
1703
- int ikeys [2 ] = { dev -> ifindex , 0 };
1704
- __be32 skeys [2 ] = { iph -> saddr , 0 , };
1705
- __be32 daddr = iph -> daddr ;
1706
1607
unsigned short est_mtu = 0 ;
1608
+ struct inet_peer * peer ;
1707
1609
1708
- for (k = 0 ; k < 2 ; k ++ ) {
1709
- for (i = 0 ; i < 2 ; i ++ ) {
1710
- unsigned hash = rt_hash (daddr , skeys [i ], ikeys [k ],
1711
- rt_genid (net ));
1712
-
1713
- rcu_read_lock ();
1714
- for (rth = rcu_dereference (rt_hash_table [hash ].chain ); rth ;
1715
- rth = rcu_dereference (rth -> dst .rt_next )) {
1716
- unsigned short mtu = new_mtu ;
1610
+ peer = inet_getpeer_v4 (iph -> daddr , 1 );
1611
+ if (peer ) {
1612
+ unsigned short mtu = new_mtu ;
1717
1613
1718
- if (rth -> fl . fl4_dst != daddr ||
1719
- rth -> fl . fl4_src != skeys [ i ] ||
1720
- rth -> rt_dst != daddr ||
1721
- rth -> rt_src != iph -> saddr ||
1722
- rth -> fl . oif != ikeys [ k ] ||
1723
- rt_is_input_route ( rth ) ||
1724
- dst_metric_locked ( & rth -> dst , RTAX_MTU ) ||
1725
- ! net_eq ( dev_net ( rth -> dst . dev ), net ) ||
1726
- rt_is_expired ( rth ))
1727
- continue ;
1614
+ if (new_mtu < 68 || new_mtu >= old_mtu ) {
1615
+ /* BSD 4.2 derived systems incorrectly adjust
1616
+ * tot_len by the IP header length, and report
1617
+ * a zero MTU in the ICMP message.
1618
+ */
1619
+ if ( mtu == 0 &&
1620
+ old_mtu >= 68 + ( iph -> ihl << 2 ))
1621
+ old_mtu -= iph -> ihl << 2 ;
1622
+ mtu = guess_mtu ( old_mtu );
1623
+ }
1728
1624
1729
- if (new_mtu < 68 || new_mtu >= old_mtu ) {
1625
+ if (mtu < ip_rt_min_pmtu )
1626
+ mtu = ip_rt_min_pmtu ;
1627
+ if (!peer -> pmtu_expires || mtu < peer -> pmtu_learned ) {
1628
+ est_mtu = mtu ;
1629
+ peer -> pmtu_learned = mtu ;
1630
+ peer -> pmtu_expires = jiffies + ip_rt_mtu_expires ;
1631
+ }
1730
1632
1731
- /* BSD 4.2 compatibility hack :-( */
1732
- if (mtu == 0 &&
1733
- old_mtu >= dst_mtu (& rth -> dst ) &&
1734
- old_mtu >= 68 + (iph -> ihl << 2 ))
1735
- old_mtu -= iph -> ihl << 2 ;
1633
+ inet_putpeer (peer );
1736
1634
1737
- mtu = guess_mtu (old_mtu );
1738
- }
1739
- if (mtu <= dst_mtu (& rth -> dst )) {
1740
- if (mtu < dst_mtu (& rth -> dst )) {
1741
- dst_confirm (& rth -> dst );
1742
- if (mtu < ip_rt_min_pmtu ) {
1743
- u32 lock = dst_metric (& rth -> dst ,
1744
- RTAX_LOCK );
1745
- mtu = ip_rt_min_pmtu ;
1746
- lock |= (1 << RTAX_MTU );
1747
- dst_metric_set (& rth -> dst , RTAX_LOCK ,
1748
- lock );
1749
- }
1750
- dst_metric_set (& rth -> dst , RTAX_MTU , mtu );
1751
- dst_set_expires (& rth -> dst ,
1752
- ip_rt_mtu_expires );
1753
- }
1754
- est_mtu = mtu ;
1755
- }
1756
- }
1757
- rcu_read_unlock ();
1758
- }
1635
+ atomic_inc (& __rt_peer_genid );
1759
1636
}
1760
1637
return est_mtu ? : new_mtu ;
1761
1638
}
1762
1639
1640
+ static void check_peer_pmtu (struct dst_entry * dst , struct inet_peer * peer )
1641
+ {
1642
+ unsigned long expires = peer -> pmtu_expires ;
1643
+
1644
+ if (time_before (expires , jiffies )) {
1645
+ u32 orig_dst_mtu = dst_mtu (dst );
1646
+ if (peer -> pmtu_learned < orig_dst_mtu ) {
1647
+ if (!peer -> pmtu_orig )
1648
+ peer -> pmtu_orig = dst_metric_raw (dst , RTAX_MTU );
1649
+ dst_metric_set (dst , RTAX_MTU , peer -> pmtu_learned );
1650
+ }
1651
+ } else if (cmpxchg (& peer -> pmtu_expires , expires , 0 ) == expires )
1652
+ dst_metric_set (dst , RTAX_MTU , peer -> pmtu_orig );
1653
+ }
1654
+
1763
1655
static void ip_rt_update_pmtu (struct dst_entry * dst , u32 mtu )
1764
1656
{
1765
- if (dst_mtu (dst ) > mtu && mtu >= 68 &&
1766
- !(dst_metric_locked (dst , RTAX_MTU ))) {
1767
- if (mtu < ip_rt_min_pmtu ) {
1768
- u32 lock = dst_metric (dst , RTAX_LOCK );
1657
+ struct rtable * rt = (struct rtable * ) dst ;
1658
+ struct inet_peer * peer ;
1659
+
1660
+ dst_confirm (dst );
1661
+
1662
+ if (!rt -> peer )
1663
+ rt_bind_peer (rt , 1 );
1664
+ peer = rt -> peer ;
1665
+ if (peer ) {
1666
+ if (mtu < ip_rt_min_pmtu )
1769
1667
mtu = ip_rt_min_pmtu ;
1770
- dst_metric_set (dst , RTAX_LOCK , lock | (1 << RTAX_MTU ));
1668
+ if (!peer -> pmtu_expires || mtu < peer -> pmtu_learned ) {
1669
+ peer -> pmtu_learned = mtu ;
1670
+ peer -> pmtu_expires = jiffies + ip_rt_mtu_expires ;
1671
+
1672
+ atomic_inc (& __rt_peer_genid );
1673
+ rt -> rt_peer_genid = rt_peer_genid ();
1674
+
1675
+ check_peer_pmtu (dst , peer );
1771
1676
}
1772
- dst_metric_set (dst , RTAX_MTU , mtu );
1773
- dst_set_expires (dst , ip_rt_mtu_expires );
1677
+ inet_putpeer (peer );
1774
1678
}
1775
1679
}
1776
1680
@@ -1781,9 +1685,15 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1781
1685
if (rt_is_expired (rt ))
1782
1686
return NULL ;
1783
1687
if (rt -> rt_peer_genid != rt_peer_genid ()) {
1688
+ struct inet_peer * peer ;
1689
+
1784
1690
if (!rt -> peer )
1785
1691
rt_bind_peer (rt , 0 );
1786
1692
1693
+ peer = rt -> peer ;
1694
+ if (peer && peer -> pmtu_expires )
1695
+ check_peer_pmtu (dst , peer );
1696
+
1787
1697
rt -> rt_peer_genid = rt_peer_genid ();
1788
1698
}
1789
1699
return dst ;
@@ -1812,8 +1722,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
1812
1722
icmp_send (skb , ICMP_DEST_UNREACH , ICMP_HOST_UNREACH , 0 );
1813
1723
1814
1724
rt = skb_rtable (skb );
1815
- if (rt )
1816
- dst_set_expires (& rt -> dst , 0 );
1725
+ if (rt &&
1726
+ rt -> peer &&
1727
+ rt -> peer -> pmtu_expires ) {
1728
+ unsigned long orig = rt -> peer -> pmtu_expires ;
1729
+
1730
+ if (cmpxchg (& rt -> peer -> pmtu_expires , orig , 0 ) == orig )
1731
+ dst_metric_set (& rt -> dst , RTAX_MTU , rt -> peer -> pmtu_orig );
1732
+ }
1817
1733
}
1818
1734
1819
1735
static int ip_rt_bug (struct sk_buff * skb )
@@ -1911,6 +1827,9 @@ static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
1911
1827
memcpy (peer -> metrics , fi -> fib_metrics ,
1912
1828
sizeof (u32 ) * RTAX_MAX );
1913
1829
dst_init_metrics (& rt -> dst , peer -> metrics , false);
1830
+
1831
+ if (peer -> pmtu_expires )
1832
+ check_peer_pmtu (& rt -> dst , peer );
1914
1833
} else {
1915
1834
if (fi -> fib_metrics != (u32 * ) dst_default_metrics ) {
1916
1835
rt -> fi = fi ;
@@ -2961,7 +2880,8 @@ static int rt_fill_info(struct net *net,
2961
2880
NLA_PUT_BE32 (skb , RTA_MARK , rt -> fl .mark );
2962
2881
2963
2882
error = rt -> dst .error ;
2964
- expires = rt -> dst .expires ? rt -> dst .expires - jiffies : 0 ;
2883
+ expires = (rt -> peer && rt -> peer -> pmtu_expires ) ?
2884
+ rt -> peer -> pmtu_expires - jiffies : 0 ;
2965
2885
if (rt -> peer ) {
2966
2886
inet_peer_refcheck (rt -> peer );
2967
2887
id = atomic_read (& rt -> peer -> ip_id_count ) & 0xffff ;
@@ -3418,14 +3338,6 @@ int __init ip_rt_init(void)
3418
3338
devinet_init ();
3419
3339
ip_fib_init ();
3420
3340
3421
- /* All the timers, started at system startup tend
3422
- to synchronize. Perturb it a bit.
3423
- */
3424
- INIT_DELAYED_WORK_DEFERRABLE (& expires_work , rt_worker_func );
3425
- expires_ljiffies = jiffies ;
3426
- schedule_delayed_work (& expires_work ,
3427
- net_random () % ip_rt_gc_interval + ip_rt_gc_interval );
3428
-
3429
3341
if (ip_rt_proc_init ())
3430
3342
printk (KERN_ERR "Unable to create route proc files\n" );
3431
3343
#ifdef CONFIG_XFRM
0 commit comments