@@ -292,6 +292,8 @@ struct o2hb_bio_wait_ctxt {
292
292
int wc_error ;
293
293
};
294
294
295
/* Negotiate timeout in milliseconds: half of O2HB_MAX_WRITE_TIMEOUT_MS.
 * Used as the delay when scheduling hr_nego_timeout_work and as the
 * duration reported in the "hb write hung" log messages. */
+ #define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2)
296
+
295
297
enum {
296
298
O2HB_NEGO_TIMEOUT_MSG = 1 ,
297
299
O2HB_NEGO_APPROVE_MSG = 2 ,
@@ -358,7 +360,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg)
358
360
cancel_delayed_work (& reg -> hr_nego_timeout_work );
359
361
/* negotiate timeout must be less than write timeout. */
360
362
schedule_delayed_work (& reg -> hr_nego_timeout_work ,
361
- msecs_to_jiffies (O2HB_MAX_WRITE_TIMEOUT_MS )/ 2 );
363
+ msecs_to_jiffies (O2HB_NEGO_TIMEOUT_MS ) );
362
364
memset (reg -> hr_nego_node_bitmap , 0 , sizeof (reg -> hr_nego_node_bitmap ));
363
365
}
364
366
@@ -389,7 +391,7 @@ static int o2hb_send_nego_msg(int key, int type, u8 target)
389
391
static void o2hb_nego_timeout (struct work_struct * work )
390
392
{
391
393
unsigned long live_node_bitmap [BITS_TO_LONGS (O2NM_MAX_NODES )];
392
- int master_node , i ;
394
+ int master_node , i , ret ;
393
395
struct o2hb_region * reg ;
394
396
395
397
reg = container_of (work , struct o2hb_region , hr_nego_timeout_work .work );
@@ -398,7 +400,12 @@ static void o2hb_nego_timeout(struct work_struct *work)
398
400
master_node = find_next_bit (live_node_bitmap , O2NM_MAX_NODES , 0 );
399
401
400
402
if (master_node == o2nm_this_node ()) {
401
- set_bit (master_node , reg -> hr_nego_node_bitmap );
403
+ if (!test_bit (master_node , reg -> hr_nego_node_bitmap )) {
404
+ printk (KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n" ,
405
+ o2nm_this_node (), O2HB_NEGO_TIMEOUT_MS /1000 ,
406
+ config_item_name (& reg -> hr_item ), reg -> hr_dev_name );
407
+ set_bit (master_node , reg -> hr_nego_node_bitmap );
408
+ }
402
409
if (memcmp (reg -> hr_nego_node_bitmap , live_node_bitmap ,
403
410
sizeof (reg -> hr_nego_node_bitmap ))) {
404
411
/* check negotiate bitmap every second to do timeout
@@ -410,6 +417,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
410
417
return ;
411
418
}
412
419
420
+ printk (KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n" ,
421
+ config_item_name (& reg -> hr_item ), reg -> hr_dev_name );
413
422
/* approve negotiate timeout request. */
414
423
o2hb_arm_timeout (reg );
415
424
@@ -419,13 +428,23 @@ static void o2hb_nego_timeout(struct work_struct *work)
419
428
if (i == master_node )
420
429
continue ;
421
430
422
- o2hb_send_nego_msg (reg -> hr_key ,
431
+ mlog (ML_HEARTBEAT , "send NEGO_APPROVE msg to node %d\n" , i );
432
+ ret = o2hb_send_nego_msg (reg -> hr_key ,
423
433
O2HB_NEGO_APPROVE_MSG , i );
434
+ if (ret )
435
+ mlog (ML_ERROR , "send NEGO_APPROVE msg to node %d fail %d\n" ,
436
+ i , ret );
424
437
}
425
438
} else {
426
439
/* negotiate timeout with master node. */
427
- o2hb_send_nego_msg (reg -> hr_key , O2HB_NEGO_TIMEOUT_MSG ,
428
- master_node );
440
+ printk (KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n" ,
441
+ o2nm_this_node (), O2HB_NEGO_TIMEOUT_MS /1000 , config_item_name (& reg -> hr_item ),
442
+ reg -> hr_dev_name , master_node );
443
+ ret = o2hb_send_nego_msg (reg -> hr_key , O2HB_NEGO_TIMEOUT_MSG ,
444
+ master_node );
445
+ if (ret )
446
+ mlog (ML_ERROR , "send NEGO_TIMEOUT msg to node %d fail %d\n" ,
447
+ master_node , ret );
429
448
}
430
449
}
431
450
@@ -436,6 +455,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
436
455
struct o2hb_nego_msg * nego_msg ;
437
456
438
457
nego_msg = (struct o2hb_nego_msg * )msg -> buf ;
458
+ printk (KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n" ,
459
+ nego_msg -> node_num , config_item_name (& reg -> hr_item ), reg -> hr_dev_name );
439
460
if (nego_msg -> node_num < O2NM_MAX_NODES )
440
461
set_bit (nego_msg -> node_num , reg -> hr_nego_node_bitmap );
441
462
else
@@ -447,7 +468,11 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
447
468
/*
 * Message handler for a negotiate-timeout approval.
 *
 * After this change, @data is interpreted as the struct o2hb_region the
 * message applies to. The handler logs (KERN_NOTICE) that the negotiate
 * timeout was approved by the master node, naming the region's config item
 * and device, and then calls o2hb_arm_timeout() on that region.
 *
 * @msg, @len and @ret_data are not used by the body shown here.
 * Always returns 0.
 */
static int o2hb_nego_approve_handler (struct o2net_msg * msg , u32 len , void * data ,
448
469
void * * ret_data )
449
470
{
450
- o2hb_arm_timeout (data );
471
+ struct o2hb_region * reg = data ;
472
+
473
+ printk (KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n" ,
474
+ config_item_name (& reg -> hr_item ), reg -> hr_dev_name );
475
+ o2hb_arm_timeout (reg );
451
476
return 0 ;
452
477
}
453
478
0 commit comments