@@ -77,9 +77,12 @@ struct link_dead_args {
 struct nbd_config {
	u32 flags;
	unsigned long runtime_flags;
+	u64 dead_conn_timeout;
 
	struct nbd_sock **socks;
	int num_connections;
+	atomic_t live_connections;
+	wait_queue_head_t conn_wait;
 
	atomic_t recv_threads;
	wait_queue_head_t recv_wq;
@@ -178,8 +181,10 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
			queue_work(system_wq, &args->work);
		}
	}
-	if (!nsock->dead)
+	if (!nsock->dead) {
		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+		atomic_dec(&nbd->config->live_connections);
+	}
	nsock->dead = true;
	nsock->pending = NULL;
	nsock->sent = 0;
@@ -257,6 +262,14 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
		return BLK_EH_HANDLED;
	}
 
+	/* If we are waiting on our dead timer then we could get timeout
+	 * callbacks for our request. For this we just want to reset the timer
+	 * and let the queue side take care of everything.
+	 */
+	if (!completion_done(&cmd->send_complete)) {
+		nbd_config_put(nbd);
+		return BLK_EH_RESET_TIMER;
+	}
	config = nbd->config;
 
	if (config->num_connections > 1) {
@@ -665,6 +678,19 @@ static int find_fallback(struct nbd_device *nbd, int index)
	return new_index;
 }
 
+static int wait_for_reconnect(struct nbd_device *nbd)
+{
+	struct nbd_config *config = nbd->config;
+	if (!config->dead_conn_timeout)
+		return 0;
+	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+		return 0;
+	wait_event_interruptible_timeout(config->conn_wait,
+					 atomic_read(&config->live_connections),
+					 config->dead_conn_timeout);
+	return atomic_read(&config->live_connections);
+}
+
 static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 {
	struct request *req = blk_mq_rq_from_pdu(cmd);
@@ -691,12 +717,24 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
	nsock = config->socks[index];
	mutex_lock(&nsock->tx_lock);
	if (nsock->dead) {
+		int old_index = index;
		index = find_fallback(nbd, index);
+		mutex_unlock(&nsock->tx_lock);
		if (index < 0) {
-			ret = -EIO;
-			goto out;
+			if (wait_for_reconnect(nbd)) {
+				index = old_index;
+				goto again;
+			}
+			/* All the sockets should already be down at this point,
+			 * we just want to make sure that DISCONNECTED is set so
+			 * any requests that come in that were queue'ed waiting
+			 * for the reconnect timer don't trigger the timer again
+			 * and instead just error out.
+			 */
+			sock_shutdown(nbd);
+			nbd_config_put(nbd);
+			return -EIO;
		}
-		mutex_unlock(&nsock->tx_lock);
		goto again;
	}
 
@@ -809,6 +847,7 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
	nsock->sent = 0;
	nsock->cookie = 0;
	socks[config->num_connections++] = nsock;
+	atomic_inc(&config->live_connections);
 
	return 0;
 }
@@ -860,6 +899,9 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
		 * need to queue_work outside of the tx_mutex.
		 */
		queue_work(recv_workqueue, &args->work);
+
+		atomic_inc(&config->live_connections);
+		wake_up(&config->conn_wait);
		return 0;
	}
	sockfd_put(sock);
@@ -1137,7 +1179,9 @@ static struct nbd_config *nbd_alloc_config(void)
		return NULL;
	atomic_set(&config->recv_threads, 0);
	init_waitqueue_head(&config->recv_wq);
+	init_waitqueue_head(&config->conn_wait);
	config->blksize = 1024;
+	atomic_set(&config->live_connections, 0);
	try_module_get(THIS_MODULE);
	return config;
 }
@@ -1448,6 +1492,7 @@ static struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
	[NBD_ATTR_SERVER_FLAGS]		= { .type = NLA_U64 },
	[NBD_ATTR_CLIENT_FLAGS]		= { .type = NLA_U64 },
	[NBD_ATTR_SOCKETS]		= { .type = NLA_NESTED },
+	[NBD_ATTR_DEAD_CONN_TIMEOUT]	= { .type = NLA_U64 },
 };
 
 static struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
@@ -1534,6 +1579,11 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
		nbd->tag_set.timeout = timeout * HZ;
		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
	}
+	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
+		config->dead_conn_timeout =
+			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
+		config->dead_conn_timeout *= HZ;
+	}
	if (info->attrs[NBD_ATTR_SERVER_FLAGS])
		config->flags =
			nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
@@ -1654,6 +1704,11 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
		nbd->tag_set.timeout = timeout * HZ;
		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
	}
+	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
+		config->dead_conn_timeout =
+			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
+		config->dead_conn_timeout *= HZ;
+	}
 
	if (info->attrs[NBD_ATTR_SOCKETS]) {
		struct nlattr *attr;
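For context, a minimal sketch of how a userspace client might exercise the new attribute over the nbd generic-netlink family. This is not part of the patch: it assumes libnl-3 (nl_socket_alloc, genl_ctrl_resolve, nla_put_u64 and friends) and the NBD_CMD_*/NBD_ATTR_* constants from the kernel's <linux/nbd-netlink.h> uapi header, where the companion uapi change would define NBD_ATTR_DEAD_CONN_TIMEOUT. Note the timeout is sent in seconds; the handlers above multiply by HZ to convert to jiffies.

/* Sketch: set a 30s dead-connection timeout on /dev/nbd0 via the nbd
 * generic-netlink family. Assumes libnl-3 and <linux/nbd-netlink.h>;
 * build (assumption): gcc set_timeout.c $(pkg-config --cflags --libs libnl-genl-3.0)
 */
#include <stdio.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/nbd-netlink.h>

int main(void)
{
	struct nl_sock *sock = nl_socket_alloc();
	struct nl_msg *msg;
	int family, err;

	if (!sock || genl_connect(sock))
		return 1;
	/* Resolve the "nbd" family id registered by the driver. */
	family = genl_ctrl_resolve(sock, NBD_GENL_FAMILY_NAME);
	if (family < 0)
		return 1;

	msg = nlmsg_alloc();
	if (!msg)
		return 1;
	/* NBD_CMD_RECONFIGURE updates an already-connected device;
	 * NBD_CMD_CONNECT accepts the same attribute at setup time. */
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NBD_CMD_RECONFIGURE, 0);
	nla_put_u32(msg, NBD_ATTR_INDEX, 0);	/* /dev/nbd0 */
	/* Seconds; nbd_genl_reconfigure() converts to jiffies (*= HZ). */
	nla_put_u64(msg, NBD_ATTR_DEAD_CONN_TIMEOUT, 30);

	err = nl_send_sync(sock, msg);	/* sends and waits for the ACK */
	if (err < 0)
		fprintf(stderr, "reconfigure failed: %d\n", err);
	nl_socket_free(sock);
	return err < 0;
}

Passing seconds over the wire and converting kernel-side keeps the netlink ABI independent of the kernel's HZ configuration.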