Skip to content

Commit 8e9801d

Browse files
biger410 authored and torvalds committed
ocfs2: o2net: set tcp user timeout to max value
When the TCP retransmit timeout (about 15 minutes) expires, the connection is closed, and any messages still pending at that time may be lost. So we set the TCP user timeout to its maximum value to override the retransmit timeout. This is OK for ocfs2 because we have a disk heartbeat: if the peer crashes, its disk heartbeat will time out and it will be evicted. If the disk heartbeat does not time out but the connection has been idle for a long time, the cluster has entered a split-brain state; since fencing can't happen, we'd better keep the connection and wait for the network to recover. Signed-off-by: Junxiao Bi <[email protected]> Reviewed-by: Srinivas Eeda <[email protected]> Reviewed-by: Mark Fasheh <[email protected]> Cc: Joel Becker <[email protected]> Cc: Joseph Qi <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent c43c363 commit 8e9801d

File tree

2 files changed

+21
-0
lines changed

2 files changed

+21
-0
lines changed

fs/ocfs2/cluster/tcp.c

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock)
14801480
return ret;
14811481
}
14821482

1483+
static int o2net_set_usertimeout(struct socket *sock) /* Set the TCP_USER_TIMEOUT option on sock to O2NET_TCP_USER_TIMEOUT; returns kernel_setsockopt()'s result unchanged. */
1484+
{
1485+
int user_timeout = O2NET_TCP_USER_TIMEOUT; /* local copy: the option value is handed to kernel_setsockopt() by address */
1486+
1487+
return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
1488+
(char *)&user_timeout, sizeof(user_timeout));
1489+
}
1490+
14831491
static void o2net_initialize_handshake(void)
14841492
{
14851493
o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
@@ -1663,6 +1671,12 @@ static void o2net_start_connect(struct work_struct *work)
16631671
goto out;
16641672
}
16651673

1674+
ret = o2net_set_usertimeout(sock);
1675+
if (ret) {
1676+
mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
1677+
goto out;
1678+
}
1679+
16661680
o2net_register_callbacks(sc->sc_sock->sk, sc);
16671681

16681682
spin_lock(&nn->nn_lock);
@@ -1844,6 +1858,12 @@ static int o2net_accept_one(struct socket *sock, int *more)
18441858
goto out;
18451859
}
18461860

1861+
ret = o2net_set_usertimeout(new_sock);
1862+
if (ret) {
1863+
mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
1864+
goto out;
1865+
}
1866+
18471867
slen = sizeof(sin);
18481868
ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
18491869
&slen, 1);

fs/ocfs2/cluster/tcp.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
6363
#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
6464
#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
6565

66+
#define O2NET_TCP_USER_TIMEOUT 0x7fffffff /* value passed for the TCP_USER_TIMEOUT socket option; 0x7fffffff is the largest positive 32-bit signed int */
6667

6768
/* TODO: figure this out.... */
6869
static inline int o2net_link_down(int err, struct socket *sock)

0 commit comments

Comments
 (0)