Skip to content

Commit 943e942

Browse files
axboe authored and Christoph Hellwig committed
nvme-pci: limit max IO size and segments to avoid high order allocations
nvme requires an sg table allocation for each request. If the request is large, then the allocation can become quite large. For instance, with our default software settings of 1280KB IO size, we'll need 10248 bytes of sg table. That turns into a 2nd order allocation, which we can't always guarantee. If we fail the allocation, blk-mq will retry it later. But there's no guarantee that we'll EVER be able to allocate that much contiguous memory. Limit the IO size such that we never need more than a single page of memory. That's a lot faster and more reliable. Then back that allocation with a mempool, so that we know we'll always be able to succeed the allocation at some point. Signed-off-by: Jens Axboe <[email protected]> Acked-by: Keith Busch <[email protected]> Signed-off-by: Christoph Hellwig <[email protected]>
1 parent 9f9cafc commit 943e942

File tree

3 files changed

+39
-5
lines changed

3 files changed

+39
-5
lines changed

drivers/nvme/host/core.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1808,6 +1808,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
18081808
u32 max_segments =
18091809
(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
18101810

1811+
max_segments = min_not_zero(max_segments, ctrl->max_segments);
18111812
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
18121813
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
18131814
}

drivers/nvme/host/nvme.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ struct nvme_ctrl {
170170
u64 cap;
171171
u32 page_size;
172172
u32 max_hw_sectors;
173+
u32 max_segments;
173174
u16 oncs;
174175
u16 oacs;
175176
u16 nssa;

drivers/nvme/host/pci.c

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,13 @@
3838

3939
#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
4040

41+
/*
42+
* These can be higher, but we need to ensure that any command doesn't
43+
* require an sg allocation that needs more than a page of data.
44+
*/
45+
#define NVME_MAX_KB_SZ 4096
46+
#define NVME_MAX_SEGS 127
47+
4148
static int use_threaded_interrupts;
4249
module_param(use_threaded_interrupts, int, 0);
4350

@@ -100,6 +107,8 @@ struct nvme_dev {
100107
struct nvme_ctrl ctrl;
101108
struct completion ioq_wait;
102109

110+
mempool_t *iod_mempool;
111+
103112
/* shadow doorbell buffer support: */
104113
u32 *dbbuf_dbs;
105114
dma_addr_t dbbuf_dbs_dma_addr;
@@ -477,10 +486,7 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
477486
iod->use_sgl = nvme_pci_use_sgls(dev, rq);
478487

479488
if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
480-
size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
481-
iod->use_sgl);
482-
483-
iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
489+
iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
484490
if (!iod->sg)
485491
return BLK_STS_RESOURCE;
486492
} else {
@@ -526,7 +532,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
526532
}
527533

528534
if (iod->sg != iod->inline_sg)
529-
kfree(iod->sg);
535+
mempool_free(iod->sg, dev->iod_mempool);
530536
}
531537

532538
#ifdef CONFIG_BLK_DEV_INTEGRITY
@@ -2280,6 +2286,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
22802286
blk_put_queue(dev->ctrl.admin_q);
22812287
kfree(dev->queues);
22822288
free_opal_dev(dev->ctrl.opal_dev);
2289+
mempool_destroy(dev->iod_mempool);
22832290
kfree(dev);
22842291
}
22852292

@@ -2334,6 +2341,13 @@ static void nvme_reset_work(struct work_struct *work)
23342341
if (result)
23352342
goto out;
23362343

2344+
/*
2345+
* Limit the max command size to prevent iod->sg allocations going
2346+
* over a single page.
2347+
*/
2348+
dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
2349+
dev->ctrl.max_segments = NVME_MAX_SEGS;
2350+
23372351
result = nvme_init_identify(&dev->ctrl);
23382352
if (result)
23392353
goto out;
@@ -2509,6 +2523,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
25092523
int node, result = -ENOMEM;
25102524
struct nvme_dev *dev;
25112525
unsigned long quirks = id->driver_data;
2526+
size_t alloc_size;
25122527

25132528
node = dev_to_node(&pdev->dev);
25142529
if (node == NUMA_NO_NODE)
@@ -2546,6 +2561,23 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
25462561
if (result)
25472562
goto release_pools;
25482563

2564+
/*
2565+
* Double check that our mempool alloc size will cover the biggest
2566+
* command we support.
2567+
*/
2568+
alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
2569+
NVME_MAX_SEGS, true);
2570+
WARN_ON_ONCE(alloc_size > PAGE_SIZE);
2571+
2572+
dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
2573+
mempool_kfree,
2574+
(void *) alloc_size,
2575+
GFP_KERNEL, node);
2576+
if (!dev->iod_mempool) {
2577+
result = -ENOMEM;
2578+
goto release_pools;
2579+
}
2580+
25492581
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
25502582

25512583
nvme_get_ctrl(&dev->ctrl);

0 commit comments

Comments
 (0)