Commit 64f1c21

blk-mq: make the polling code adaptive

The previous commit introduced the hybrid sleep/poll mode. Take that one
step further, and use the completion latencies to automatically sleep
for half the mean completion time. This is a good approximation.

This changes the 'io_poll_delay' sysfs file a bit to expose the various
options. Depending on the value, the polling code will behave
differently:

-1	Never enter hybrid sleep mode
 0	Use half of the completion mean for the sleep delay
>0	Use this specific value as the sleep delay

Signed-off-by: Jens Axboe <[email protected]>
Tested-By: Stephen Bates <[email protected]>
Reviewed-By: Stephen Bates <[email protected]>
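
As a quick, hypothetical illustration of the new interface (not part of the patch): the sketch below writes each of the three settings to a queue's 'io_poll_delay' sysfs attribute from userspace. The device name nvme0n1 is a placeholder; substitute the block device being tuned.

/*
 * Hypothetical userspace helper, not part of this patch: sets the
 * hybrid poll delay for one request queue via sysfs. "nvme0n1" is a
 * placeholder device name.
 *
 *   set_io_poll_delay(-1);   never enter hybrid sleep (classic polling)
 *   set_io_poll_delay(0);    sleep for half of the mean completion time
 *   set_io_poll_delay(20);   sleep for a fixed 20 usec before polling
 */
#include <stdio.h>

static int set_io_poll_delay(int usec)
{
	FILE *f = fopen("/sys/block/nvme0n1/queue/io_poll_delay", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", usec);
	return fclose(f);
}
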
Parent commit: 06426ad

3 files changed: +83 -12 lines changed


block/blk-mq.c

Lines changed: 64 additions & 3 deletions
@@ -2132,6 +2132,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	 */
 	q->nr_requests = set->queue_depth;
 
+	/*
+	 * Default to classic polling
+	 */
+	q->poll_nsec = -1;
+
 	if (set->ops->complete)
 		blk_queue_softirq_done(q, set->ops->complete);
 
@@ -2469,14 +2474,70 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
+				       struct blk_mq_hw_ctx *hctx,
+				       struct request *rq)
+{
+	struct blk_rq_stat stat[2];
+	unsigned long ret = 0;
+
+	/*
+	 * If stats collection isn't on, don't sleep but turn it on for
+	 * future users
+	 */
+	if (!blk_stat_enable(q))
+		return 0;
+
+	/*
+	 * We don't have to do this once per IO, should optimize this
+	 * to just use the current window of stats until it changes
+	 */
+	memset(&stat, 0, sizeof(stat));
+	blk_hctx_stat_get(hctx, stat);
+
+	/*
+	 * As an optimistic guess, use half of the mean service time
+	 * for this type of request. We can (and should) make this smarter.
+	 * For instance, if the completion latencies are tight, we can
+	 * get closer than just half the mean. This is especially
+	 * important on devices where the completion latencies are longer
+	 * than ~10 usec.
+	 */
+	if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples)
+		ret = (stat[BLK_STAT_READ].mean + 1) / 2;
+	else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples)
+		ret = (stat[BLK_STAT_WRITE].mean + 1) / 2;
+
+	return ret;
+}
+
 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
+				     struct blk_mq_hw_ctx *hctx,
 				     struct request *rq)
 {
 	struct hrtimer_sleeper hs;
 	enum hrtimer_mode mode;
+	unsigned int nsecs;
 	ktime_t kt;
 
-	if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+	if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+		return false;
+
+	/*
+	 * poll_nsec can be:
+	 *
+	 * -1:	don't ever hybrid sleep
+	 *  0:	use half of prev avg
+	 * >0:	use this specific value
+	 */
+	if (q->poll_nsec == -1)
+		return false;
+	else if (q->poll_nsec > 0)
+		nsecs = q->poll_nsec;
+	else
+		nsecs = blk_mq_poll_nsecs(q, hctx, rq);
+
+	if (!nsecs)
 		return false;
 
 	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
@@ -2485,7 +2546,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 	 * This will be replaced with the stats tracking code, using
 	 * 'avg_completion_time / 2' as the pre-sleep target.
 	 */
-	kt = ktime_set(0, q->poll_nsec);
+	kt = ktime_set(0, nsecs);
 
 	mode = HRTIMER_MODE_REL;
 	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
@@ -2520,7 +2581,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	 * the IO isn't complete, we'll get called again and will go
 	 * straight to the busy poll loop.
 	 */
-	if (blk_mq_poll_hybrid_sleep(q, rq))
+	if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
 		return true;
 
 	hctx->poll_considered++;
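
A minimal standalone sketch of the half-of-mean arithmetic used by blk_mq_poll_nsecs() above, with made-up sample values; in the kernel, the mean and sample count come from the block layer's completion statistics (struct blk_rq_stat) for the matching hardware queue.

/*
 * Standalone sketch of the half-of-mean heuristic, using invented
 * numbers; not kernel code.
 */
#include <stdio.h>

static unsigned long half_mean_nsecs(unsigned long mean, unsigned int nr_samples)
{
	if (!nr_samples)
		return 0;		/* no data yet: skip the hybrid sleep */
	return (mean + 1) / 2;		/* round up, matching (mean + 1) / 2 above */
}

int main(void)
{
	/* e.g. a mean read completion time of 8500 ns -> sleep 4250 ns */
	printf("%lu\n", half_mean_nsecs(8500, 128));
	return 0;
}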

block/blk-sysfs.c

Lines changed: 18 additions & 8 deletions
@@ -352,24 +352,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(q->poll_nsec / 1000, page);
+	int val;
+
+	if (q->poll_nsec == -1)
+		val = -1;
+	else
+		val = q->poll_nsec / 1000;
+
+	return sprintf(page, "%d\n", val);
 }
 
 static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
 				      size_t count)
 {
-	unsigned long poll_usec;
-	ssize_t ret;
+	int err, val;
 
 	if (!q->mq_ops || !q->mq_ops->poll)
 		return -EINVAL;
 
-	ret = queue_var_store(&poll_usec, page, count);
-	if (ret < 0)
-		return ret;
+	err = kstrtoint(page, 10, &val);
+	if (err < 0)
+		return err;
 
-	q->poll_nsec = poll_usec * 1000;
-	return ret;
+	if (val == -1)
+		q->poll_nsec = -1;
+	else
+		q->poll_nsec = val * 1000;
+
+	return count;
 }
 
 static ssize_t queue_poll_show(struct request_queue *q, char *page)

include/linux/blkdev.h

Lines changed: 1 addition & 1 deletion
@@ -509,7 +509,7 @@ struct request_queue {
 	unsigned int		request_fn_active;
 
 	unsigned int		rq_timeout;
-	unsigned int		poll_nsec;
+	int			poll_nsec;
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
 	struct list_head	timeout_list;
