Skip to content

Commit 12c058d

Browse files
sagigrimberg authored and gregkh committed
nvme-pci: allocate device queues storage space at probe
commit 147b27e upstream. It may cause race by setting 'nvmeq' in nvme_init_request() because .init_request is called inside switching io scheduler, which may happen when the NVMe device is being resetted and its nvme queues are being freed and created. We don't have any sync between the two pathes. This patch changes the nvmeq allocation to occur at probe time so there is no way we can dereference it at init_request. [ 93.268391] kernel BUG at drivers/nvme/host/pci.c:408! [ 93.274146] invalid opcode: 0000 [#1] SMP [ 93.278618] Modules linked in: nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache sunrpc ipmi_ssif vfat fat intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel iTCO_wdt intel_cstate ipmi_si iTCO_vendor_support intel_uncore mxm_wmi mei_me ipmi_devintf intel_rapl_perf pcspkr sg ipmi_msghandler lpc_ich dcdbas mei shpchp acpi_power_meter wmi dm_multipath ip_tables xfs libcrc32c sd_mod mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm ahci libahci nvme libata crc32c_intel nvme_core tg3 megaraid_sas ptp i2c_core pps_core dm_mirror dm_region_hash dm_log dm_mod [ 93.349071] CPU: 5 PID: 1842 Comm: sh Not tainted 4.15.0-rc2.ming+ #4 [ 93.356256] Hardware name: Dell Inc. 
PowerEdge R730xd/072T6D, BIOS 2.5.5 08/16/2017 [ 93.364801] task: 00000000fb8abf2a task.stack: 0000000028bd82d1 [ 93.371408] RIP: 0010:nvme_init_request+0x36/0x40 [nvme] [ 93.377333] RSP: 0018:ffffc90002537ca8 EFLAGS: 00010246 [ 93.383161] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000008 [ 93.391122] RDX: 0000000000000000 RSI: ffff880276ae0000 RDI: ffff88047bae9008 [ 93.399084] RBP: ffff88047bae9008 R08: ffff88047bae9008 R09: 0000000009dabc00 [ 93.407045] R10: 0000000000000004 R11: 000000000000299c R12: ffff880186bc1f00 [ 93.415007] R13: ffff880276ae0000 R14: 0000000000000000 R15: 0000000000000071 [ 93.422969] FS: 00007f33cf288740(0000) GS:ffff88047ba80000(0000) knlGS:0000000000000000 [ 93.431996] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 93.438407] CR2: 00007f33cf28e000 CR3: 000000047e5bb006 CR4: 00000000001606e0 [ 93.446368] Call Trace: [ 93.449103] blk_mq_alloc_rqs+0x231/0x2a0 [ 93.453579] blk_mq_sched_alloc_tags.isra.8+0x42/0x80 [ 93.459214] blk_mq_init_sched+0x7e/0x140 [ 93.463687] elevator_switch+0x5a/0x1f0 [ 93.467966] ? elevator_get.isra.17+0x52/0xc0 [ 93.472826] elv_iosched_store+0xde/0x150 [ 93.477299] queue_attr_store+0x4e/0x90 [ 93.481580] kernfs_fop_write+0xfa/0x180 [ 93.485958] __vfs_write+0x33/0x170 [ 93.489851] ? __inode_security_revalidate+0x4c/0x60 [ 93.495390] ? selinux_file_permission+0xda/0x130 [ 93.500641] ? 
_cond_resched+0x15/0x30 [ 93.504815] vfs_write+0xad/0x1a0 [ 93.508512] SyS_write+0x52/0xc0 [ 93.512113] do_syscall_64+0x61/0x1a0 [ 93.516199] entry_SYSCALL64_slow_path+0x25/0x25 [ 93.521351] RIP: 0033:0x7f33ce96aab0 [ 93.525337] RSP: 002b:00007ffe57570238 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 93.533785] RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007f33ce96aab0 [ 93.541746] RDX: 0000000000000006 RSI: 00007f33cf28e000 RDI: 0000000000000001 [ 93.549707] RBP: 00007f33cf28e000 R08: 000000000000000a R09: 00007f33cf288740 [ 93.557669] R10: 00007f33cf288740 R11: 0000000000000246 R12: 00007f33cec42400 [ 93.565630] R13: 0000000000000006 R14: 0000000000000001 R15: 0000000000000000 [ 93.573592] Code: 4c 8d 40 08 4c 39 c7 74 16 48 8b 00 48 8b 04 08 48 85 c0 74 16 48 89 86 78 01 00 00 31 c0 c3 8d 4a 01 48 63 c9 48 c1 e1 03 eb de <0f> 0b 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 85 f6 53 48 89 [ 93.594676] RIP: nvme_init_request+0x36/0x40 [nvme] RSP: ffffc90002537ca8 [ 93.602273] ---[ end trace 810dde3993e5f14e ]--- Reported-by: Yi Zhang <[email protected]> Signed-off-by: Sagi Grimberg <[email protected]> Signed-off-by: Christoph Hellwig <[email protected]> Signed-off-by: Jon Derrick <[email protected]> Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent 0ea7fcf commit 12c058d

File tree

1 file changed

+25
-36
lines changed

1 file changed

+25
-36
lines changed

drivers/nvme/host/pci.c

Lines changed: 25 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
7777
* Represents an NVM Express device. Each nvme_dev is a PCI function.
7878
*/
7979
struct nvme_dev {
80-
struct nvme_queue **queues;
80+
struct nvme_queue *queues;
8181
struct blk_mq_tag_set tagset;
8282
struct blk_mq_tag_set admin_tagset;
8383
u32 __iomem *dbs;
@@ -348,7 +348,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
348348
unsigned int hctx_idx)
349349
{
350350
struct nvme_dev *dev = data;
351-
struct nvme_queue *nvmeq = dev->queues[0];
351+
struct nvme_queue *nvmeq = &dev->queues[0];
352352

353353
WARN_ON(hctx_idx != 0);
354354
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@@ -370,7 +370,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
370370
unsigned int hctx_idx)
371371
{
372372
struct nvme_dev *dev = data;
373-
struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
373+
struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
374374

375375
if (!nvmeq->tags)
376376
nvmeq->tags = &dev->tagset.tags[hctx_idx];
@@ -386,7 +386,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
386386
struct nvme_dev *dev = set->driver_data;
387387
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
388388
int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
389-
struct nvme_queue *nvmeq = dev->queues[queue_idx];
389+
struct nvme_queue *nvmeq = &dev->queues[queue_idx];
390390

391391
BUG_ON(!nvmeq);
392392
iod->nvmeq = nvmeq;
@@ -900,7 +900,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
900900
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
901901
{
902902
struct nvme_dev *dev = to_nvme_dev(ctrl);
903-
struct nvme_queue *nvmeq = dev->queues[0];
903+
struct nvme_queue *nvmeq = &dev->queues[0];
904904
struct nvme_command c;
905905

906906
memset(&c, 0, sizeof(c));
@@ -1146,18 +1146,15 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
11461146
if (nvmeq->sq_cmds)
11471147
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
11481148
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1149-
kfree(nvmeq);
11501149
}
11511150

11521151
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
11531152
{
11541153
int i;
11551154

11561155
for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
1157-
struct nvme_queue *nvmeq = dev->queues[i];
11581156
dev->ctrl.queue_count--;
1159-
dev->queues[i] = NULL;
1160-
nvme_free_queue(nvmeq);
1157+
nvme_free_queue(&dev->queues[i]);
11611158
}
11621159
}
11631160

@@ -1189,10 +1186,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
11891186

11901187
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
11911188
{
1192-
struct nvme_queue *nvmeq = dev->queues[0];
1189+
struct nvme_queue *nvmeq = &dev->queues[0];
11931190

1194-
if (!nvmeq)
1195-
return;
11961191
if (nvme_suspend_queue(nvmeq))
11971192
return;
11981193

@@ -1246,13 +1241,10 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
12461241
return 0;
12471242
}
12481243

1249-
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1250-
int depth, int node)
1244+
static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
1245+
int depth, int node)
12511246
{
1252-
struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
1253-
node);
1254-
if (!nvmeq)
1255-
return NULL;
1247+
struct nvme_queue *nvmeq = &dev->queues[qid];
12561248

12571249
nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
12581250
&nvmeq->cq_dma_addr, GFP_KERNEL);
@@ -1271,17 +1263,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
12711263
nvmeq->q_depth = depth;
12721264
nvmeq->qid = qid;
12731265
nvmeq->cq_vector = -1;
1274-
dev->queues[qid] = nvmeq;
12751266
dev->ctrl.queue_count++;
12761267

1277-
return nvmeq;
1268+
return 0;
12781269

12791270
free_cqdma:
12801271
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
12811272
nvmeq->cq_dma_addr);
12821273
free_nvmeq:
1283-
kfree(nvmeq);
1284-
return NULL;
1274+
return -ENOMEM;
12851275
}
12861276

12871277
static int queue_request_irq(struct nvme_queue *nvmeq)
@@ -1468,14 +1458,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
14681458
if (result < 0)
14691459
return result;
14701460

1471-
nvmeq = dev->queues[0];
1472-
if (!nvmeq) {
1473-
nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
1474-
dev_to_node(dev->dev));
1475-
if (!nvmeq)
1476-
return -ENOMEM;
1477-
}
1461+
result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
1462+
dev_to_node(dev->dev));
1463+
if (result)
1464+
return result;
14781465

1466+
nvmeq = &dev->queues[0];
14791467
aqa = nvmeq->q_depth - 1;
14801468
aqa |= aqa << 16;
14811469

@@ -1505,7 +1493,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
15051493

15061494
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
15071495
/* vector == qid - 1, match nvme_create_queue */
1508-
if (!nvme_alloc_queue(dev, i, dev->q_depth,
1496+
if (nvme_alloc_queue(dev, i, dev->q_depth,
15091497
pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
15101498
ret = -ENOMEM;
15111499
break;
@@ -1514,7 +1502,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
15141502

15151503
max = min(dev->max_qid, dev->ctrl.queue_count - 1);
15161504
for (i = dev->online_queues; i <= max; i++) {
1517-
ret = nvme_create_queue(dev->queues[i], i);
1505+
ret = nvme_create_queue(&dev->queues[i], i);
15181506
if (ret)
15191507
break;
15201508
}
@@ -1770,7 +1758,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
17701758

17711759
static int nvme_setup_io_queues(struct nvme_dev *dev)
17721760
{
1773-
struct nvme_queue *adminq = dev->queues[0];
1761+
struct nvme_queue *adminq = &dev->queues[0];
17741762
struct pci_dev *pdev = to_pci_dev(dev->dev);
17751763
int result, nr_io_queues;
17761764
unsigned long size;
@@ -1896,7 +1884,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
18961884
retry:
18971885
timeout = ADMIN_TIMEOUT;
18981886
for (; i > 0; i--, sent++)
1899-
if (nvme_delete_queue(dev->queues[i], opcode))
1887+
if (nvme_delete_queue(&dev->queues[i], opcode))
19001888
break;
19011889

19021890
while (sent--) {
@@ -2081,15 +2069,15 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
20812069

20822070
queues = dev->online_queues - 1;
20832071
for (i = dev->ctrl.queue_count - 1; i > 0; i--)
2084-
nvme_suspend_queue(dev->queues[i]);
2072+
nvme_suspend_queue(&dev->queues[i]);
20852073

20862074
if (dead) {
20872075
/* A device might become IO incapable very soon during
20882076
* probe, before the admin queue is configured. Thus,
20892077
* queue_count can be 0 here.
20902078
*/
20912079
if (dev->ctrl.queue_count)
2092-
nvme_suspend_queue(dev->queues[0]);
2080+
nvme_suspend_queue(&dev->queues[0]);
20932081
} else {
20942082
nvme_disable_io_queues(dev, queues);
20952083
nvme_disable_admin_queue(dev, shutdown);
@@ -2345,7 +2333,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
23452333
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
23462334
if (!dev)
23472335
return -ENOMEM;
2348-
dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
2336+
2337+
dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(struct nvme_queue),
23492338
GFP_KERNEL, node);
23502339
if (!dev->queues)
23512340
goto free;

0 commit comments

Comments
 (0)