Skip to content

Commit df82dc2

Browse files
author
Mukesh Kacker
committed
RDS/IB: Tune failover-on-reboot scheduling
On certain platforms (e.g. X5-2) the IB devices are slow to come up, so the start of failover-on-reboot (in RDS active bonding) is changed to accommodate them. Otherwise all interfaces get de-activated and then re-activated on almost every reboot. We also make sure that when all interfaces do get de-activated by failover-on-reboot, the subsequent re-activation is not subject to the delayed startup that is applied after all-ports-down in other situations. The startup interval of the first scheduling of failover-on-reboot on module load is also turned into a module parameter. Orabug: 20063740 Signed-off-by: Mukesh Kacker <[email protected]> Acked-by: Ajaykumar Hotchandani <[email protected]>
1 parent 15f96ed commit df82dc2

File tree

1 file changed

+68
-10
lines changed

1 file changed

+68
-10
lines changed

net/rds/ib.c

Lines changed: 68 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ unsigned int rds_ib_active_bonding_enabled = 0;
6161
unsigned int rds_ib_active_bonding_fallback = 1;
6262
unsigned int rds_ib_active_bonding_reconnect_delay = 1;
6363
unsigned int rds_ib_active_bonding_trigger_delay_max_msecs; /* = 0; */
64+
unsigned int rds_ib_active_bonding_trigger_delay_min_msecs; /* = 0; */
6465
#if RDMA_RDS_APM_SUPPORTED
6566
unsigned int rds_ib_apm_timeout = RDS_IB_DEFAULT_TIMEOUT;
6667
#endif
@@ -104,6 +105,10 @@ MODULE_PARM_DESC(rds_ib_active_bonding_reconnect_delay, " Active Bonding reconne
104105
module_param(rds_ib_active_bonding_trigger_delay_max_msecs, int, 0444);
105106
MODULE_PARM_DESC(rds_ib_active_bonding_trigger_delay_max_msecs,
106107
" Active Bonding Max delay before active bonding is triggered(msecs)");
108+
module_param(rds_ib_active_bonding_trigger_delay_min_msecs, int, 0444);
109+
MODULE_PARM_DESC(rds_ib_active_bonding_trigger_delay_min_msecs,
110+
" Active Bonding Min delay before active "
111+
"bonding is triggered(msecs)");
107112
#if IB_RDS_CQ_VECTOR_SUPPORTED
108113
module_param(rds_ib_cq_balance_enabled, int, 0444);
109114
MODULE_PARM_DESC(rds_ib_cq_balance_enabled, " CQ load balance Enabled");
@@ -138,7 +143,7 @@ static struct rds_ib_excl_ips excl_ips_tbl[RDS_IB_MAX_EXCL_IPS];
138143
static u8 excl_ips_cnt = 0;
139144

140145
static int ip_config_init_phase_flag; /* = 0 */
141-
146+
static int initial_failovers_all_ports_deactivated_flag; /* = 0 */
142147
static int initial_failovers_iterations; /* = 0 */
143148

144149
/*
@@ -1571,6 +1576,7 @@ rds_ib_do_initial_failovers(struct work_struct *workarg)
15711576
container_of(workarg, struct rds_ib_initial_failovers_work,
15721577
dlywork.work);
15731578
unsigned int ii;
1579+
unsigned int ports_deactivated = 0;
15741580
int ret = 0;
15751581

15761582
/*
@@ -1632,11 +1638,15 @@ rds_ib_do_initial_failovers(struct work_struct *workarg)
16321638
ret = rds_ib_set_ip(NULL, NULL,
16331639
ip_config[ii].if_name,
16341640
0, 0, 0);
1641+
ports_deactivated++;
16351642

16361643
}
16371644
}
1638-
16391645
}
1646+
1647+
if (ports_deactivated == ip_port_cnt)
1648+
initial_failovers_all_ports_deactivated_flag = 1;
1649+
16401650
ip_config_init_phase_flag = 0; /* done with initial phase! */
16411651
kfree(riif_work);
16421652
}
@@ -1800,16 +1810,44 @@ sched_initial_failovers(unsigned int tot_devs,
18001810
* max time.
18011811
*
18021812
* Based on some empirical experiments, we put
1803-
* upper bound to be 30sec(30000msecs) and up.
1804-
* And we put min to be 10sec (10000msecs).
1813+
* upper bound to be 60sec(60000msecs) and up.
1814+
* And we put min to be 20sec (20000msecs).
18051815
*/
1806-
rds_ib_active_bonding_trigger_delay_max_msecs = 30000+
1816+
rds_ib_active_bonding_trigger_delay_max_msecs = 60000+
18071817
tot_ibdevs*1200+(tot_devs-tot_ibdevs)*1000;
18081818
}
18091819

1810-
trigger_delay_max_jiffies =
1811-
msecs_to_jiffies(rds_ib_active_bonding_trigger_delay_max_msecs);
1812-
trigger_delay_min_jiffies = msecs_to_jiffies(10000); /* 10 sec */
1820+
if (rds_ib_active_bonding_trigger_delay_min_msecs == 0) {
1821+
/*
1822+
* Derive guestimate of minimum time before we trigger the
1823+
* initial failovers for devices.
1824+
*/
1825+
rds_ib_active_bonding_trigger_delay_min_msecs =
1826+
msecs_to_jiffies(20000); /* 20 sec */
1827+
}
1828+
1829+
if (rds_ib_active_bonding_trigger_delay_min_msecs >=
1830+
rds_ib_active_bonding_trigger_delay_max_msecs) {
1831+
/*
1832+
* If these parameters are set inconsistently using
1833+
* module parameters, try to recover from it by deriving
1834+
* reasonable values such that max > min and log
1835+
* warning.
1836+
*/
1837+
printk(KERN_WARNING
1838+
"RDS/IB: rds active bonding trigger max delay(%u msecs)"
1839+
" is set less than min the minimum delay(%u msecs).\n",
1840+
rds_ib_active_bonding_trigger_delay_max_msecs,
1841+
rds_ib_active_bonding_trigger_delay_min_msecs);
1842+
1843+
/* set max slightly higher than min! */
1844+
rds_ib_active_bonding_trigger_delay_max_msecs =
1845+
rds_ib_active_bonding_trigger_delay_min_msecs + 10;
1846+
1847+
printk(KERN_WARNING "RDS/IB: rds active bonding trigger max "
1848+
"delay adjusted to %u msecs.\n",
1849+
rds_ib_active_bonding_trigger_delay_max_msecs);
1850+
}
18131851

18141852
riif_work = kzalloc(sizeof(struct rds_ib_initial_failovers_work),
18151853
GFP_KERNEL);
@@ -1820,9 +1858,17 @@ sched_initial_failovers(unsigned int tot_devs,
18201858
return;
18211859
}
18221860

1861+
trigger_delay_max_jiffies =
1862+
msecs_to_jiffies(rds_ib_active_bonding_trigger_delay_max_msecs);
18231863
riif_work->timeout = trigger_delay_max_jiffies;
18241864

1865+
trigger_delay_min_jiffies =
1866+
msecs_to_jiffies(rds_ib_active_bonding_trigger_delay_min_msecs);
1867+
18251868
INIT_DELAYED_WORK(&riif_work->dlywork, rds_ib_initial_failovers);
1869+
1870+
riif_work->timeout = trigger_delay_max_jiffies;
1871+
18261872
queue_delayed_work(rds_wq,
18271873
&riif_work->dlywork,
18281874
trigger_delay_min_jiffies);
@@ -2585,11 +2631,14 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve
25852631
if (rds_ib_active_bonding_fallback) {
25862632
/*
25872633
* Special case:
2588-
* If all interfaces were down OR
2634+
* If all interfaces were down
2635+
* (but NOT deactivated during initial failovers) OR
25892636
* transitioning port_state was in INIT
25902637
* use a larger timeout.
25912638
*/
2592-
if (all_ports_were_down || port_state_was_init) {
2639+
if ((all_ports_were_down &&
2640+
!initial_failovers_all_ports_deactivated_flag)
2641+
|| port_state_was_init) {
25932642
INIT_DELAYED_WORK(&work->work,
25942643
rds_ib_net_failback);
25952644
work->timeout = msecs_to_jiffies(10000);
@@ -2603,7 +2652,16 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve
26032652
} else
26042653
kfree(work);
26052654

2655+
/*
2656+
* clear this state - onetime use only to
2657+
* exclude the deactivation of ports
2658+
* during initial failovers from the
2659+
* 'special case' logic above!
2660+
*/
2661+
initial_failovers_all_ports_deactivated_flag = 0;
2662+
26062663
break;
2664+
26072665
case RDSIBP_TRANSITION_DOWN:
26082666
if (rds_ib_sysctl_active_bonding) {
26092667
INIT_DELAYED_WORK(&work->work, rds_ib_failover);

0 commit comments

Comments
 (0)