Skip to content

Commit 40c935b

Browse files
sudhakar-dindukurti authored and Somasundaram Krishnasamy committed
A/A Bonding: Flush all the delayed works posted to rdmaip_wq before destroying the workq
Whenever a hardware port goes down, Resilient RDMAIP gets a NETDEV_CHANGE event and a hardware port down event. Resilient RDMAIP processes such events and does failback if needed (i.e., moves the IP addresses from the down port to the other active port in the group). The Resilient RDMAIP module schedules a failback task with a delay of 10 seconds to do the IP migration. If the module is unloaded while a scheduled failback has not yet run to completion, the system can crash. The current code calls flush_workqueue() and destroy_workqueue(), but neither of these calls waits for the completion of delayed work posted to the workqueue.

task: ffff9fcf41f45f00 task.stack: ffffb8634c610000
RIP: 0010:__queue_work+0x80/0x3e0
RSP: 0018:ffff9feebfc03e40 EFLAGS: 00010046
Call Trace:
<IRQ>
? netif_schedule_queue+0x1c/0x1e
? execute_in_process_context+0x70/0x65
? execute_in_process_context+0x70/0x65
delayed_work_timer_fn+0x18/0x1a
call_timer_fn+0x3c/0x148
? execute_in_process_context+0x70/0x65
run_timer_softirq+0x18b/0x494
? timerqueue_add+0x59/0x82
? ktime_get+0x3e/0x95
__do_softirq+0xd9/0x28d
irq_exit+0xdf/0xe5
smp_apic_timer_interrupt+0x91/0x155
apic_timer_interrupt+0x1a2/0x1a7
</IRQ>
RIP: 0010:cpuidle_enter_state+0xda/0x2a5

The following changes are made to address the kernel crash.

1) A list of delayed works (with delay > 0) is maintained in a new linked list, rdmaip_delayed_work_list. During module unload, all of these work requests are canceled before the workqueue is destroyed.

2) No new delayed works are posted while module unload is in progress.

Orabug: 29379514

Signed-off-by: Sudhakar Dindukurti <[email protected]>
Reviewed-by: Zhu Yanjun <[email protected]>
Signed-off-by: Somasundaram Krishnasamy <[email protected]>
1 parent bddff61 commit 40c935b

File tree

2 files changed

+59
-3
lines changed

2 files changed

+59
-3
lines changed

drivers/infiniband/core/rdmaip.c

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ static void rdmaip_impl_inetaddr_event(struct work_struct *);
7575
static void rdmaip_inetaddr_unregister(void);
7676

7777
static DECLARE_DELAYED_WORK(riif_dlywork, rdmaip_initial_failovers);
78+
79+
static LIST_HEAD(rdmaip_delayed_work_list);
80+
7881
/*
7982
* This structure is registered with IB core. IB core calls
8083
* rdmaip_device_add() function whenever a new RDMA adapter
@@ -1184,6 +1187,12 @@ static void rdmaip_failback(struct work_struct *_work)
11841187
container_of(_work, struct rdmaip_port_ud_work, work.work);
11851188
u8 i, ip_active_port, port = work->port;
11861189

1190+
if (work->queued) {
1191+
list_del(&work->list);
1192+
work->queued = false;
1193+
RDMAIP_DBG2("Deleted %p work from the list\n", work);
1194+
}
1195+
11871196
if ((ip_config[port].port_state == RDMAIP_PORT_INIT) ||
11881197
(ip_config[port].port_state == RDMAIP_PORT_DOWN)) {
11891198
pr_err("rdmaip: devname %s failback request with port_state in %s state!",
@@ -1398,9 +1407,12 @@ static void rdmaip_sched_failover_failback(struct net_device *netdev, u8 port,
13981407
work->netdev = netdev;
13991408
if (rdmaip_active_bonding_failback) {
14001409
RDMAIP_DBG2("Schedule failback\n");
1410+
work->queued = true;
14011411
INIT_DELAYED_WORK(&work->work, rdmaip_failback);
14021412
queue_delayed_work(rdmaip_wq, &work->work,
14031413
rdmaip_get_failback_sync_jiffies(port));
1414+
list_add(&work->list, &rdmaip_delayed_work_list);
1415+
RDMAIP_DBG2("Adding %p work to the list\n", work);
14041416
} else
14051417
kfree(work);
14061418
} else {
@@ -1562,6 +1574,7 @@ static void rdmaip_event_handler(struct ib_event_handler *handler,
15621574
work->ib_port = event->element.port_num;
15631575

15641576
INIT_DELAYED_WORK(&work->work, rdmaip_impl_ib_event_handler);
1577+
work->queued = false;
15651578
queue_delayed_work(rdmaip_wq, &work->work, 0);
15661579

15671580
RDMAIP_DBG2("Queued IB event handler to process events : %s\n",
@@ -2363,6 +2376,12 @@ static void rdmaip_add_new_rdmaip_port_handler(struct work_struct *_work)
23632376
u8 port = 0;
23642377
u16 pkey_vid = 0;
23652378

2379+
if (work->queued) {
2380+
list_del(&work->list);
2381+
work->queued = false;
2382+
RDMAIP_DBG2("Deleted %p work from the list\n", work);
2383+
}
2384+
23662385
in_dev = in_dev_get(ndev);
23672386
if (rdmaip_inet6_socket)
23682387
in6_dev = in6_dev_get(ndev);
@@ -2421,10 +2440,13 @@ static void rdmaip_add_new_rdmaip_port(struct net_device *netdev)
24212440
if (work) {
24222441
work->netdev = netdev;
24232442
work->timeout = msecs_to_jiffies(10000);
2443+
work->queued = true;
24242444
INIT_DELAYED_WORK(&work->work,
24252445
rdmaip_add_new_rdmaip_port_handler);
24262446
queue_delayed_work(rdmaip_wq, &work->work,
24272447
msecs_to_jiffies(100));
2448+
list_add(&work->list, &rdmaip_delayed_work_list);
2449+
RDMAIP_DBG2("Adding %p work to the list\n", work);
24282450
} else
24292451
RDMAIP_DBG2("Failed to allocated memory for work\n");
24302452
}
@@ -2438,6 +2460,12 @@ static void rdmaip_impl_netdev_callback(struct work_struct *_work)
24382460
long int event = work->net_event;
24392461
struct net_device *ndev = work->netdev;
24402462

2463+
if (work->queued) {
2464+
list_del(&work->list);
2465+
work->queued = false;
2466+
RDMAIP_DBG2("Deleted %p work from the list\n", work);
2467+
}
2468+
24412469
mutex_lock(&rdmaip_global_flag_lock);
24422470
if (rdmaip_is_busy_flag_set() || rdmaip_is_teardown_flag_set()) {
24432471
rdmaip_set_event_pending();
@@ -2526,6 +2554,7 @@ static int rdmaip_netdev_callback(struct notifier_block *self,
25262554
work->event_type = RDMAIP_EVENT_NET;
25272555
work->net_event = event;
25282556
work->netdev = ndev;
2557+
work->queued = false;
25292558

25302559
INIT_DELAYED_WORK(&work->work, rdmaip_impl_netdev_callback);
25312560
queue_delayed_work(rdmaip_wq, &work->work, 0);
@@ -2636,9 +2665,10 @@ static void rdmaip_comm_inetaddr_handler(struct net_device *netdev,
26362665
work->event_type = RDMAIP_EVENT_INETADDR;
26372666
work->net_event = event;
26382667
work->netdev = netdev;
2668+
work->queued = false;
26392669

26402670
INIT_DELAYED_WORK(&work->work, rdmaip_impl_inetaddr_event);
2641-
queue_delayed_work(rdmaip_wq, &work->work, RDMAIP_100MSECS);
2671+
queue_delayed_work(rdmaip_wq, &work->work, 0);
26422672
}
26432673

26442674
/*
@@ -2682,6 +2712,14 @@ static void rdmaip_impl_inetaddr_event(struct work_struct *_work)
26822712
struct rdmaip_port_ud_work *work =
26832713
container_of(_work, struct rdmaip_port_ud_work, work.work);
26842714

2715+
mutex_lock(&rdmaip_global_flag_lock);
2716+
if (rdmaip_is_teardown_flag_set()) {
2717+
RDMAIP_DBG2("Teardown inprogress: skip inetaddr event\n");
2718+
mutex_unlock(&rdmaip_global_flag_lock);
2719+
return;
2720+
}
2721+
mutex_unlock(&rdmaip_global_flag_lock);
2722+
26852723
port = rdmaip_get_port_index(work->netdev);
26862724
if (!port) {
26872725
RDMAIP_DBG2("inetadd_event: rdmaip port not found\n");
@@ -2729,6 +2767,8 @@ static int rdmaip_inetaddr_event(struct notifier_block *this,
27292767
*/
27302768
void rdmaip_cleanup(void)
27312769
{
2770+
struct rdmaip_port_ud_work *work, *temp;
2771+
27322772
RDMAIP_DBG2("%s Enter rdmaip_init_flag = 0x%x\n", __func__,
27332773
rdmaip_init_flag);
27342774

@@ -2755,6 +2795,22 @@ void rdmaip_cleanup(void)
27552795
rdmaip_init_flag &= ~RDMAIP_REG_NETDEV_NOTIFIER;
27562796
}
27572797

2798+
/*
2799+
* Make sure all the queued (except delayed) works
2800+
* in rdmaip_wq callbacks run to completion. This
2801+
* also ensures that no new work is queued to the
2802+
* queue.
2803+
*/
2804+
flush_workqueue(rdmaip_wq);
2805+
2806+
/* Cancel all the delayed work items */
2807+
list_for_each_entry_safe(work, temp, &rdmaip_delayed_work_list, list) {
2808+
list_del(&work->list);
2809+
RDMAIP_DBG2("Cancelling %p delayed work\n", work);
2810+
cancel_delayed_work_sync(&work->work);
2811+
kfree(work);
2812+
}
2813+
27582814
rdmaip_destroy_workqs();
27592815

27602816
/*

drivers/infiniband/core/rdmaip.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,6 @@
5353
#define RDMAIP_DEFAULT_GIDTBL_LEN 64
5454
#define RDMAIP_MAX_NAME_LEN 32
5555

56-
#define RDMAIP_100MSECS 100
57-
5856
#define RDMAIP_DEV_TYPE_IB 0x1
5957
#define RDMAIP_DEV_TYPE_ETHER 0x2
6058

@@ -368,6 +366,8 @@ struct rdmaip_port_ud_work {
368366
int event_type;
369367
int ib_event;
370368
int net_event;
369+
bool queued;
370+
struct list_head list;
371371
};
372372

373373
enum {

0 commit comments

Comments
 (0)