Commit b6e68ee

jankara authored and axboe committed
blk-mq: Improve performance of non-mq IO schedulers with multiple HW queues
Currently when a non-mq aware IO scheduler (BFQ, mq-deadline) is used for a queue with multiple HW queues, the performance is rather bad. The problem is that these IO schedulers use queue-wide locking and their dispatch function does not respect the hctx it is passed in and returns any request it finds appropriate. Thus locality of request access is broken and dispatch from multiple CPUs just contends on IO scheduler locks. For these IO schedulers there's little point in dispatching from multiple CPUs. Instead dispatch always only from a single CPU to limit contention.

Below is a comparison of dbench runs on an XFS filesystem where the storage is a raid card with 64 HW queues with a single rotating disk attached to it. BFQ is used as the IO scheduler:

clients          MQ                  SQ                 MQ-Patched
Amean 1       39.12 ( 0.00%)      43.29 * -10.67%*      36.09 *  7.74%*
Amean 2      128.58 ( 0.00%)     101.30 *  21.22%*      96.14 * 25.23%*
Amean 4      577.42 ( 0.00%)     494.47 *  14.37%*     508.49 * 11.94%*
Amean 8      610.95 ( 0.00%)     363.86 *  40.44%*     362.12 * 40.73%*
Amean 16     391.78 ( 0.00%)     261.49 *  33.25%*     282.94 * 27.78%*
Amean 32     324.64 ( 0.00%)     267.71 *  17.54%*     233.00 * 28.23%*
Amean 64     295.04 ( 0.00%)     253.02 *  14.24%*     242.37 * 17.85%*
Amean 512  10281.61 ( 0.00%)   10211.16 *   0.69%*   10447.53 * -1.61%*

Numbers are times, so lower is better. MQ is the stock 5.10-rc6 kernel. SQ is the same kernel with megaraid_sas.host_tagset_enable=0 so that the card advertises just a single HW queue. MQ-Patched is a kernel with this patch applied.

You can see that multiple hardware queues heavily hurt performance in combination with BFQ. The patch restores the performance.

Signed-off-by: Jan Kara <[email protected]>
Reviewed-by: Ming Lei <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
1 parent 5ac83c6 commit b6e68ee
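As a side note on reproducing the comparison above: the IO scheduler active on a queue is exposed through sysfs, with the scheduler in use shown in square brackets (writing another name to the same attribute switches it, e.g. to bfq before running dbench). The snippet below is a minimal userspace sketch; the device name "sda" is an assumption for illustration and is not taken from this commit.

/*
 * Hedged sketch: print the active IO scheduler of /dev/sda.
 * Typical output: "[bfq] mq-deadline kyber none".
 */
#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* active scheduler is bracketed */
        fclose(f);
        return 0;
}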

File tree: 3 files changed, +63 −6 lines

block/blk-mq.c

Lines changed: 60 additions & 6 deletions
@@ -1646,21 +1646,66 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queue);
 
+/*
+ * Is the request queue handled by an IO scheduler that does not respect
+ * hardware queues when dispatching?
+ */
+static bool blk_mq_has_sqsched(struct request_queue *q)
+{
+        struct elevator_queue *e = q->elevator;
+
+        if (e && e->type->ops.dispatch_request &&
+            !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
+                return true;
+        return false;
+}
+
+/*
+ * Return prefered queue to dispatch from (if any) for non-mq aware IO
+ * scheduler.
+ */
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
+{
+        struct blk_mq_hw_ctx *hctx;
+
+        /*
+         * If the IO scheduler does not respect hardware queues when
+         * dispatching, we just don't bother with multiple HW queues and
+         * dispatch from hctx for the current CPU since running multiple queues
+         * just causes lock contention inside the scheduler and pointless cache
+         * bouncing.
+         */
+        hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
+                                     raw_smp_processor_id());
+        if (!blk_mq_hctx_stopped(hctx))
+                return hctx;
+        return NULL;
+}
+
 /**
  * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
  * @q: Pointer to the request queue to run.
  * @async: If we want to run the queue asynchronously.
  */
 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 {
-        struct blk_mq_hw_ctx *hctx;
+        struct blk_mq_hw_ctx *hctx, *sq_hctx;
         int i;
 
+        sq_hctx = NULL;
+        if (blk_mq_has_sqsched(q))
+                sq_hctx = blk_mq_get_sq_hctx(q);
         queue_for_each_hw_ctx(q, hctx, i) {
                 if (blk_mq_hctx_stopped(hctx))
                         continue;
-
-                blk_mq_run_hw_queue(hctx, async);
+                /*
+                 * Dispatch from this hctx either if there's no hctx preferred
+                 * by IO scheduler or if it has requests that bypass the
+                 * scheduler.
+                 */
+                if (!sq_hctx || sq_hctx == hctx ||
+                    !list_empty_careful(&hctx->dispatch))
+                        blk_mq_run_hw_queue(hctx, async);
         }
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queues);

@@ -1672,14 +1717,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
  */
 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 {
-        struct blk_mq_hw_ctx *hctx;
+        struct blk_mq_hw_ctx *hctx, *sq_hctx;
         int i;
 
+        sq_hctx = NULL;
+        if (blk_mq_has_sqsched(q))
+                sq_hctx = blk_mq_get_sq_hctx(q);
         queue_for_each_hw_ctx(q, hctx, i) {
                 if (blk_mq_hctx_stopped(hctx))
                         continue;
-
-                blk_mq_delay_run_hw_queue(hctx, msecs);
+                /*
+                 * Dispatch from this hctx either if there's no hctx preferred
+                 * by IO scheduler or if it has requests that bypass the
+                 * scheduler.
+                 */
+                if (!sq_hctx || sq_hctx == hctx ||
+                    !list_empty_careful(&hctx->dispatch))
+                        blk_mq_delay_run_hw_queue(hctx, msecs);
         }
 }
 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);

block/kyber-iosched.c

Lines changed: 1 addition & 0 deletions
@@ -1029,6 +1029,7 @@ static struct elevator_type kyber_sched = {
 #endif
         .elevator_attrs = kyber_sched_attrs,
         .elevator_name = "kyber",
+        .elevator_features = ELEVATOR_F_MQ_AWARE,
         .elevator_owner = THIS_MODULE,
 };

include/linux/elevator.h

Lines changed: 2 additions & 0 deletions
@@ -172,6 +172,8 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 
 /* Supports zoned block devices sequential write constraint */
 #define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0)
+/* Supports scheduling on multiple hardware queues */
+#define ELEVATOR_F_MQ_AWARE (1U << 1)
 
 #endif /* CONFIG_BLOCK */
 #endif
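For illustration, a scheduler that does honour the hctx passed to its dispatch callback would opt out of the single-queue dispatch path by advertising the new flag, exactly as the Kyber hunk above does. The sketch below is hypothetical and non-functional: the name "mq-aware-example" and the omitted elevator_mq_ops are made up for this example. It only shows where ELEVATOR_F_MQ_AWARE would be set so that blk_mq_has_sqsched() returns false and every hardware queue keeps being run.

/*
 * Hypothetical skeleton, not a working scheduler: all elevator_mq_ops are
 * omitted. It mirrors the kyber change above to show where the feature
 * flag is declared.
 */
#include <linux/module.h>
#include <linux/elevator.h>

static struct elevator_type mq_aware_example_sched = {
        /* .ops (dispatch_request etc.) intentionally left out in this sketch */
        .elevator_name = "mq-aware-example",
        .elevator_features = ELEVATOR_F_MQ_AWARE,
        .elevator_owner = THIS_MODULE,
};

static int __init mq_aware_example_init(void)
{
        return elv_register(&mq_aware_example_sched);
}

static void __exit mq_aware_example_exit(void)
{
        elv_unregister(&mq_aware_example_sched);
}

module_init(mq_aware_example_init);
module_exit(mq_aware_example_exit);
MODULE_LICENSE("GPL");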
