Skip to content

Commit 0ab7104

Browse files
Bharath Ravi, martinkpetersen
authored and committed
scsi: iscsi: Perform connection failure entirely in kernel space
Connection failure processing depends on a daemon being present to (at least) stop the connection and start recovery. This is a problem in a multipath scenario: if the daemon has failed for whatever reason, the SCSI path is never marked as down, multipath won't perform the failover, and IO to the device will wait forever for that connection to come back.

This patch performs the connection failure entirely inside the kernel. This way, the failover can happen and pending IO can continue even if the daemon is dead. Once the daemon comes alive again, it can execute recovery procedures if applicable.

Cc: Mike Christie <[email protected]>
Cc: Lee Duncan <[email protected]>
Cc: Bart Van Assche <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Co-developed-by: Dave Clausen <[email protected]>
Co-developed-by: Nick Black <[email protected]>
Co-developed-by: Vaibhav Nagarnaik <[email protected]>
Co-developed-by: Anatol Pomazau <[email protected]>
Co-developed-by: Tahsin Erdogan <[email protected]>
Co-developed-by: Frank Mayhar <[email protected]>
Co-developed-by: Junho Ryu <[email protected]>
Co-developed-by: Khazhismel Kumykov <[email protected]>
Reviewed-by: Khazhismel Kumykov <[email protected]>
Co-developed-by: Gabriel Krisman Bertazi <[email protected]>
Reviewed-by: Lee Duncan <[email protected]>
Signed-off-by: Bharath Ravi <[email protected]>
Signed-off-by: Dave Clausen <[email protected]>
Signed-off-by: Nick Black <[email protected]>
Signed-off-by: Vaibhav Nagarnaik <[email protected]>
Signed-off-by: Anatol Pomazau <[email protected]>
Signed-off-by: Tahsin Erdogan <[email protected]>
Signed-off-by: Frank Mayhar <[email protected]>
Signed-off-by: Junho Ryu <[email protected]>
Signed-off-by: Khazhismel Kumykov <[email protected]>
Signed-off-by: Gabriel Krisman Bertazi <[email protected]>
Signed-off-by: Martin K. Petersen <[email protected]>
1 parent 80363e1 commit 0ab7104

File tree

2 files changed

+69
-0
lines changed

2 files changed

+69
-0
lines changed

drivers/scsi/scsi_transport_iscsi.c

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@ struct iscsi_internal {
8686
struct transport_container session_cont;
8787
};
8888

89+
/* Worker to perform connection failure on unresponsive connections
90+
* completely in kernel space.
91+
*/
92+
static void stop_conn_work_fn(struct work_struct *work);
93+
static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
94+
8995
static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
9096
static struct workqueue_struct *iscsi_eh_timer_workq;
9197

@@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex);
16111617
static LIST_HEAD(sesslist);
16121618
static DEFINE_SPINLOCK(sesslock);
16131619
static LIST_HEAD(connlist);
1620+
static LIST_HEAD(connlist_err);
16141621
static DEFINE_SPINLOCK(connlock);
16151622

16161623
static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn)
@@ -2254,6 +2261,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)
22542261

22552262
mutex_init(&conn->ep_mutex);
22562263
INIT_LIST_HEAD(&conn->conn_list);
2264+
INIT_LIST_HEAD(&conn->conn_list_err);
22572265
conn->transport = transport;
22582266
conn->cid = cid;
22592267

@@ -2307,6 +2315,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn)
23072315

23082316
spin_lock_irqsave(&connlock, flags);
23092317
list_del(&conn->conn_list);
2318+
list_del(&conn->conn_list_err);
23102319
spin_unlock_irqrestore(&connlock, flags);
23112320

23122321
transport_unregister_device(&conn->dev);
@@ -2421,13 +2430,64 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
24212430
}
24222431
EXPORT_SYMBOL_GPL(iscsi_offload_mesg);
24232432

2433+
/*
 * stop_conn_work_fn - stop every connection queued on connlist_err.
 * @work: work item that triggered this run (not used by the body).
 *
 * Takes the whole connlist_err list under connlock, then, for each
 * connection on it, looks up the owning session and calls the transport's
 * stop_conn() callback while holding rx_queue_mutex. The connection is
 * removed from the error list whether or not its session was found.
 *
 * NOTE(review): recovery_list is walked without connlock held, while
 * iscsi_destroy_conn() unlinks conn_list_err under connlock -- confirm
 * that a connection cannot be destroyed while it sits on recovery_list.
 */
static void stop_conn_work_fn(struct work_struct *work)
2434+
{
2435+
struct iscsi_cls_conn *conn, *tmp;
2436+
unsigned long flags;
2437+
LIST_HEAD(recovery_list);
2438+
2439+
spin_lock_irqsave(&connlock, flags);
2440+
if (list_empty(&connlist_err)) {
2441+
spin_unlock_irqrestore(&connlock, flags);
2442+
return;
2443+
}
2444+
/* Move all pending entries to a private list (leaving connlist_err
 * empty) so that connlock can be dropped before connections are stopped.
 */
list_splice_init(&connlist_err, &recovery_list);
2445+
spin_unlock_irqrestore(&connlock, flags);
2446+
2447+
list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) {
2448+
uint32_t sid = iscsi_conn_get_sid(conn);
2449+
struct iscsi_cls_session *session;
2450+
2451+
mutex_lock(&rx_queue_mutex);
2452+
2453+
session = iscsi_session_lookup(sid);
2454+
if (session) {
2455+
/* When the system is not in the SYSTEM_RUNNING state, zero the
 * session's recovery timeout and terminate the connection;
 * otherwise stop it for recovery.
 */
if (system_state != SYSTEM_RUNNING) {
2456+
session->recovery_tmo = 0;
2457+
conn->transport->stop_conn(conn,
2458+
STOP_CONN_TERM);
2459+
} else {
2460+
conn->transport->stop_conn(conn,
2461+
STOP_CONN_RECOVER);
2462+
}
2463+
}
2464+
2465+
list_del_init(&conn->conn_list_err);
2466+
2467+
mutex_unlock(&rx_queue_mutex);
2468+
2469+
/* we don't want to hold rx_queue_mutex for too long,
2470+
* for instance if many conns failed at the same time,
2471+
* since this would stall other iscsi maintenance operations.
2472+
* Give other users a chance to proceed.
2473+
*/
2474+
cond_resched();
2475+
}
2476+
}
2477+
24242478
void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
24252479
{
24262480
struct nlmsghdr *nlh;
24272481
struct sk_buff *skb;
24282482
struct iscsi_uevent *ev;
24292483
struct iscsi_internal *priv;
24302484
int len = nlmsg_total_size(sizeof(*ev));
2485+
unsigned long flags;
2486+
2487+
spin_lock_irqsave(&connlock, flags);
2488+
list_add(&conn->conn_list_err, &connlist_err);
2489+
spin_unlock_irqrestore(&connlock, flags);
2490+
queue_work(system_unbound_wq, &stop_conn_work);
24312491

24322492
priv = iscsi_if_transport_lookup(conn->transport);
24332493
if (!priv)
@@ -2757,11 +2817,19 @@ static int
27572817
iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev)
27582818
{
27592819
struct iscsi_cls_conn *conn;
2820+
unsigned long flags;
27602821

27612822
conn = iscsi_conn_lookup(ev->u.d_conn.sid, ev->u.d_conn.cid);
27622823
if (!conn)
27632824
return -EINVAL;
27642825

2826+
spin_lock_irqsave(&connlock, flags);
2827+
if (!list_empty(&conn->conn_list_err)) {
2828+
spin_unlock_irqrestore(&connlock, flags);
2829+
return -EAGAIN;
2830+
}
2831+
spin_unlock_irqrestore(&connlock, flags);
2832+
27652833
ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n");
27662834
if (transport->destroy_conn)
27672835
transport->destroy_conn(conn);

include/scsi/scsi_transport_iscsi.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ extern void iscsi_ping_comp_event(uint32_t host_no,
190190

191191
struct iscsi_cls_conn {
192192
struct list_head conn_list; /* item in connlist */
193+
struct list_head conn_list_err; /* item in connlist_err */
193194
void *dd_data; /* LLD private data */
194195
struct iscsi_transport *transport;
195196
uint32_t cid; /* connection id */

0 commit comments

Comments
 (0)