Skip to content

Commit 560bc4b

Browse files
josefbacik authored and axboe committed
nbd: handle dead connections
Sometimes we like to upgrade our server without making all of our clients freak out and reconnect. This patch provides a way to specify a dead connection timeout to allow us to pause all requests and wait for new connections to be opened. With this in place I can take down the nbd server for less than the dead connection timeout and bring it back up, and everything resumes gracefully.

Signed-off-by: Josef Bacik <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
1 parent 2516ab1 commit 560bc4b

File tree

2 files changed

+60
-4
lines changed

2 files changed

+60
-4
lines changed

drivers/block/nbd.c

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,12 @@ struct link_dead_args {
7777
struct nbd_config {
7878
u32 flags;
7979
unsigned long runtime_flags;
80+
u64 dead_conn_timeout;
8081

8182
struct nbd_sock **socks;
8283
int num_connections;
84+
atomic_t live_connections;
85+
wait_queue_head_t conn_wait;
8386

8487
atomic_t recv_threads;
8588
wait_queue_head_t recv_wq;
@@ -178,8 +181,10 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
178181
queue_work(system_wq, &args->work);
179182
}
180183
}
181-
if (!nsock->dead)
184+
if (!nsock->dead) {
182185
kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
186+
atomic_dec(&nbd->config->live_connections);
187+
}
183188
nsock->dead = true;
184189
nsock->pending = NULL;
185190
nsock->sent = 0;
@@ -257,6 +262,14 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
257262
return BLK_EH_HANDLED;
258263
}
259264

265+
/* If we are waiting on our dead timer then we could get timeout
266+
* callbacks for our request. For this we just want to reset the timer
267+
* and let the queue side take care of everything.
268+
*/
269+
if (!completion_done(&cmd->send_complete)) {
270+
nbd_config_put(nbd);
271+
return BLK_EH_RESET_TIMER;
272+
}
260273
config = nbd->config;
261274

262275
if (config->num_connections > 1) {
@@ -665,6 +678,19 @@ static int find_fallback(struct nbd_device *nbd, int index)
665678
return new_index;
666679
}
667680

681+
/*
 * Block the caller until at least one connection is live again, or until
 * the configured dead-connection timeout runs out.
 *
 * Returns 0 immediately when no dead_conn_timeout is configured or when
 * NBD_DISCONNECTED is already set in runtime_flags. Otherwise returns the
 * value of config->live_connections read after the wait finishes; the
 * caller treats non-zero as "retry the send" and zero as "give up".
 *
 * dead_conn_timeout is stored pre-multiplied by HZ by the netlink
 * connect/reconfigure handlers.
 */
static int wait_for_reconnect(struct nbd_device *nbd)
682+
{
683+
struct nbd_config *config = nbd->config;
684+
if (!config->dead_conn_timeout)
685+
return 0;
686+
if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
687+
return 0;
688+
/*
 * The wait's own return value is deliberately ignored: whatever ended
 * the wait, the decision below is made by re-reading live_connections.
 * NOTE(review): presumably the wait can also end early on a signal
 * (interruptible variant) -- confirm against the wait_event API.
 */
wait_event_interruptible_timeout(config->conn_wait,
689+
atomic_read(&config->live_connections),
690+
config->dead_conn_timeout);
691+
return atomic_read(&config->live_connections);
692+
}
693+
668694
static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
669695
{
670696
struct request *req = blk_mq_rq_from_pdu(cmd);
@@ -691,12 +717,24 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
691717
nsock = config->socks[index];
692718
mutex_lock(&nsock->tx_lock);
693719
if (nsock->dead) {
720+
int old_index = index;
694721
index = find_fallback(nbd, index);
722+
mutex_unlock(&nsock->tx_lock);
695723
if (index < 0) {
696-
ret = -EIO;
697-
goto out;
724+
if (wait_for_reconnect(nbd)) {
725+
index = old_index;
726+
goto again;
727+
}
728+
/* All the sockets should already be down at this point,
729+
* we just want to make sure that DISCONNECTED is set so
730+
* any requests that come in that were queue'ed waiting
731+
* for the reconnect timer don't trigger the timer again
732+
* and instead just error out.
733+
*/
734+
sock_shutdown(nbd);
735+
nbd_config_put(nbd);
736+
return -EIO;
698737
}
699-
mutex_unlock(&nsock->tx_lock);
700738
goto again;
701739
}
702740

@@ -809,6 +847,7 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
809847
nsock->sent = 0;
810848
nsock->cookie = 0;
811849
socks[config->num_connections++] = nsock;
850+
atomic_inc(&config->live_connections);
812851

813852
return 0;
814853
}
@@ -860,6 +899,9 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
860899
* need to queue_work outside of the tx_mutex.
861900
*/
862901
queue_work(recv_workqueue, &args->work);
902+
903+
atomic_inc(&config->live_connections);
904+
wake_up(&config->conn_wait);
863905
return 0;
864906
}
865907
sockfd_put(sock);
@@ -1137,7 +1179,9 @@ static struct nbd_config *nbd_alloc_config(void)
11371179
return NULL;
11381180
atomic_set(&config->recv_threads, 0);
11391181
init_waitqueue_head(&config->recv_wq);
1182+
init_waitqueue_head(&config->conn_wait);
11401183
config->blksize = 1024;
1184+
atomic_set(&config->live_connections, 0);
11411185
try_module_get(THIS_MODULE);
11421186
return config;
11431187
}
@@ -1448,6 +1492,7 @@ static struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
14481492
[NBD_ATTR_SERVER_FLAGS] = { .type = NLA_U64 },
14491493
[NBD_ATTR_CLIENT_FLAGS] = { .type = NLA_U64 },
14501494
[NBD_ATTR_SOCKETS] = { .type = NLA_NESTED},
1495+
[NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 },
14511496
};
14521497

14531498
static struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
@@ -1534,6 +1579,11 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
15341579
nbd->tag_set.timeout = timeout * HZ;
15351580
blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
15361581
}
1582+
if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1583+
config->dead_conn_timeout =
1584+
nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1585+
config->dead_conn_timeout *= HZ;
1586+
}
15371587
if (info->attrs[NBD_ATTR_SERVER_FLAGS])
15381588
config->flags =
15391589
nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
@@ -1654,6 +1704,11 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
16541704
nbd->tag_set.timeout = timeout * HZ;
16551705
blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
16561706
}
1707+
if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1708+
config->dead_conn_timeout =
1709+
nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1710+
config->dead_conn_timeout *= HZ;
1711+
}
16571712

16581713
if (info->attrs[NBD_ATTR_SOCKETS]) {
16591714
struct nlattr *attr;

include/uapi/linux/nbd-netlink.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ enum {
3232
NBD_ATTR_SERVER_FLAGS,
3333
NBD_ATTR_CLIENT_FLAGS,
3434
NBD_ATTR_SOCKETS,
35+
NBD_ATTR_DEAD_CONN_TIMEOUT,
3536
__NBD_ATTR_MAX,
3637
};
3738
#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1)

0 commit comments

Comments
 (0)