@@ -157,6 +157,7 @@ BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
 #undef BFQ_BFQQ_FNS						\
 
 /* Expiration time of sync (0) and async (1) requests, in ns. */
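
For context, BFQ_BFQQ_FNS(name) is the macro that stamps out the per-flag mark/clear/test helpers, so the added line yields bfq_mark_bfqq_has_waker(), bfq_clear_bfqq_has_waker() and bfq_bfqq_has_waker(), all used by the hunks below. The following standalone sketch models that expansion in plain C; the BFQQF_HAS_WAKER bit and the reduced struct bfq_queue are stand-ins for the kernel definitions, kept here only to show the pattern.

/* Userspace model of the helpers that BFQ_BFQQ_FNS(has_waker) generates. */
#include <stdio.h>

enum { BFQQF_HAS_WAKER = 1u << 0 };	/* stand-in for the kernel's flag bit */

struct bfq_queue { unsigned long flags; };	/* reduced stand-in */

static void bfq_mark_bfqq_has_waker(struct bfq_queue *bfqq)
{
	bfqq->flags |= BFQQF_HAS_WAKER;
}

static void bfq_clear_bfqq_has_waker(struct bfq_queue *bfqq)
{
	bfqq->flags &= ~BFQQF_HAS_WAKER;
}

static int bfq_bfqq_has_waker(const struct bfq_queue *bfqq)
{
	return !!(bfqq->flags & BFQQF_HAS_WAKER);
}

int main(void)
{
	struct bfq_queue q = { 0 };

	bfq_mark_bfqq_has_waker(&q);
	printf("has_waker=%d\n", bfq_bfqq_has_waker(&q));	/* prints 1 */
	bfq_clear_bfqq_has_waker(&q);
	printf("has_waker=%d\n", bfq_bfqq_has_waker(&q));	/* prints 0 */
	return 0;
}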
@@ -1814,6 +1815,111 @@ static void bfq_add_request(struct request *rq)
 	bfqd->queued++;
 
 	if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
+		/*
+		 * Detect whether bfqq's I/O seems synchronized with
+		 * that of some other queue, i.e., whether bfqq, after
+		 * remaining empty, happens to receive new I/O only
+		 * right after some I/O request of the other queue has
+		 * been completed. We call waker queue the other
+		 * queue, and we assume, for simplicity, that bfqq may
+		 * have at most one waker queue.
+		 *
+		 * A remarkable throughput boost can be reached by
+		 * unconditionally injecting the I/O of the waker
+		 * queue, every time a new bfq_dispatch_request
+		 * happens to be invoked while I/O is being plugged
+		 * for bfqq. In addition to boosting throughput, this
+		 * unblocks bfqq's I/O, thereby improving bandwidth
+		 * and latency for bfqq. Note that these same results
+		 * may be achieved with the general injection
+		 * mechanism, but less effectively. For details on
+		 * this aspect, see the comments on the choice of the
+		 * queue for injection in bfq_select_queue().
+		 *
+		 * Turning back to the detection of a waker queue, a
+		 * queue Q is deemed as a waker queue for bfqq if, for
+		 * two consecutive times, bfqq happens to become non
+		 * empty right after a request of Q has been
+		 * completed. In particular, on the first time, Q is
+		 * tentatively set as a candidate waker queue, while
+		 * on the second time, the flag
+		 * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
+		 * is a waker queue for bfqq. These detection steps
+		 * are performed only if bfqq has a long think time,
+		 * so as to make it more likely that bfqq's I/O is
+		 * actually being blocked by a synchronization. This
+		 * last filter, plus the above two-times requirement,
+		 * make false positives less likely.
+		 *
+		 * NOTE
+		 *
+		 * The sooner a waker queue is detected, the sooner
+		 * throughput can be boosted by injecting I/O from the
+		 * waker queue. Fortunately, detection is likely to be
+		 * actually fast, for the following reasons. While
+		 * blocked by synchronization, bfqq has a long think
+		 * time. This implies that bfqq's inject limit is at
+		 * least equal to 1 (see the comments in
+		 * bfq_update_inject_limit()). So, thanks to
+		 * injection, the waker queue is likely to be served
+		 * during the very first I/O-plugging time interval
+		 * for bfqq. This triggers the first step of the
+		 * detection mechanism. Thanks again to injection, the
+		 * candidate waker queue is then likely to be
+		 * confirmed no later than during the next
+		 * I/O-plugging interval for bfqq.
+		 */
+		if (!bfq_bfqq_has_short_ttime(bfqq) &&
+		    ktime_get_ns() - bfqd->last_completion <
+		    200 * NSEC_PER_USEC) {
+			if (bfqd->last_completed_rq_bfqq != bfqq &&
+			    bfqd->last_completed_rq_bfqq !=
+			    bfqq->waker_bfqq) {
+				/*
+				 * First synchronization detected with
+				 * a candidate waker queue, or with a
+				 * different candidate waker queue
+				 * from the current one.
+				 */
+				bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+
+				/*
+				 * If the waker queue disappears, then
+				 * bfqq->waker_bfqq must be reset. To
+				 * this goal, we maintain in each
+				 * waker queue a list, woken_list, of
+				 * all the queues that reference the
+				 * waker queue through their
+				 * waker_bfqq pointer. When the waker
+				 * queue exits, the waker_bfqq pointer
+				 * of all the queues in the woken_list
+				 * is reset.
+				 *
+				 * In addition, if bfqq is already in
+				 * the woken_list of a waker queue,
+				 * then, before being inserted into
+				 * the woken_list of a new waker
+				 * queue, bfqq must be removed from
+				 * the woken_list of the old waker
+				 * queue.
+				 */
+				if (!hlist_unhashed(&bfqq->woken_list_node))
+					hlist_del_init(&bfqq->woken_list_node);
+				hlist_add_head(&bfqq->woken_list_node,
+				    &bfqd->last_completed_rq_bfqq->woken_list);
+
+				bfq_clear_bfqq_has_waker(bfqq);
+			} else if (bfqd->last_completed_rq_bfqq ==
+				   bfqq->waker_bfqq &&
+				   !bfq_bfqq_has_waker(bfqq)) {
+				/*
+				 * synchronization with waker_bfqq
+				 * seen for the second time
+				 */
+				bfq_mark_bfqq_has_waker(bfqq);
+			}
+		}
+
 		/*
 		 * Periodically reset inject limit, to make sure that
 		 * the latter eventually drops in case workload
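
To make the two-step handshake above easier to follow, here is a minimal userspace model of the detection logic. struct queue, detect_waker() and the hard-coded timestamps are illustrative stand-ins, not kernel code; only the candidate/confirm steps, the long-think-time filter, and the 200 us window from the hunk above are kept.

/* Standalone model of the two-step waker detection in bfq_add_request(). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define WAKER_WINDOW_NS (200 * 1000ULL)	/* 200 * NSEC_PER_USEC */

struct queue {
	struct queue *waker;	/* candidate waker (waker_bfqq analogue) */
	bool has_waker;		/* confirmed flag (bfq_bfqq_has_waker) */
	bool short_ttime;	/* think time deemed short? */
};

/* Called when q becomes non-empty; `last` completed a request at last_ns. */
static void detect_waker(struct queue *q, struct queue *last,
			 uint64_t now_ns, uint64_t last_ns)
{
	/* long think time + short gap after a completion: filters
	 * that make false positives less likely */
	if (q->short_ttime || now_ns - last_ns >= WAKER_WINDOW_NS)
		return;
	if (last != q && last != q->waker) {
		q->waker = last;	/* step 1: tentative candidate */
		q->has_waker = false;
	} else if (last == q->waker && !q->has_waker) {
		q->has_waker = true;	/* step 2: confirmed */
	}
}

int main(void)
{
	struct queue q = { 0 }, w = { 0 };

	detect_waker(&q, &w, 1000, 900);	/* first sync: candidate */
	detect_waker(&q, &w, 2000, 1950);	/* second sync: confirmed */
	printf("confirmed: %d\n", q.has_waker);	/* prints 1 */
	return 0;
}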
@@ -4164,18 +4270,89 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 			bfqq->bic->bfqq[0] : NULL;
 
 		/*
-		 * If the process associated with bfqq has also async
-		 * I/O pending, then inject it
-		 * unconditionally. Injecting I/O from the same
-		 * process can cause no harm to the process. On the
-		 * contrary, it can only increase bandwidth and reduce
-		 * latency for the process.
+		 * The next three mutually-exclusive ifs decide
+		 * whether to try injection, and choose the queue to
+		 * pick an I/O request from.
+		 *
+		 * The first if checks whether the process associated
+		 * with bfqq has also async I/O pending. If so, it
+		 * injects such I/O unconditionally. Injecting async
+		 * I/O from the same process can cause no harm to the
+		 * process. On the contrary, it can only increase
+		 * bandwidth and reduce latency for the process.
+		 *
+		 * The second if checks whether there happens to be a
+		 * non-empty waker queue for bfqq, i.e., a queue whose
+		 * I/O needs to be completed for bfqq to receive new
+		 * I/O. This happens, e.g., if bfqq is associated with
+		 * a process that does some sync. A sync generates
+		 * extra blocking I/O, which must be completed before
+		 * the process associated with bfqq can go on with its
+		 * I/O. If the I/O of the waker queue is not served,
+		 * then bfqq remains empty, and no I/O is dispatched,
+		 * until the idle timeout fires for bfqq. This is
+		 * likely to result in lower bandwidth and higher
+		 * latencies for bfqq, and in a severe loss of total
+		 * throughput. The best action to take is therefore to
+		 * serve the waker queue as soon as possible. So do it
+		 * (without relying on the third alternative below for
+		 * eventually serving waker_bfqq's I/O; see the last
+		 * paragraph for further details). This systematic
+		 * injection of I/O from the waker queue does not
+		 * cause any delay to bfqq's I/O. On the contrary,
+		 * next bfqq's I/O is brought forward dramatically,
+		 * for it is not blocked for milliseconds.
+		 *
+		 * The third if checks whether bfqq is a queue for
+		 * which it is better to avoid injection. It is so if
+		 * bfqq delivers more throughput when served without
+		 * any further I/O from other queues in the middle, or
+		 * if the service times of bfqq's I/O requests both
+		 * count more than overall throughput, and may be
+		 * easily increased by injection (this happens if bfqq
+		 * has a short think time). If none of these
+		 * conditions holds, then a candidate queue for
+		 * injection is looked for through
+		 * bfq_choose_bfqq_for_injection(). Note that the
+		 * latter may return NULL (for example if the inject
+		 * limit for bfqq is currently 0).
+		 *
+		 * NOTE: motivation for the second alternative
+		 *
+		 * Thanks to the way the inject limit is updated in
+		 * bfq_update_has_short_ttime(), it is rather likely
+		 * that, if I/O is being plugged for bfqq and the
+		 * waker queue has pending I/O requests that are
+		 * blocking bfqq's I/O, then the third alternative
+		 * above lets the waker queue get served before the
+		 * I/O-plugging timeout fires. So one may deem the
+		 * second alternative superfluous. It is not, because
+		 * the third alternative may be way less effective in
+		 * case of a synchronization. For two main
+		 * reasons. First, throughput may be low because the
+		 * inject limit may be too low to guarantee the same
+		 * amount of injected I/O, from the waker queue or
+		 * other queues, that the second alternative
+		 * guarantees (the second alternative unconditionally
+		 * injects a pending I/O request of the waker queue
+		 * for each bfq_dispatch_request()). Second, with the
+		 * third alternative, the duration of the plugging,
+		 * i.e., the time before bfqq finally receives new I/O,
+		 * may not be minimized, because the waker queue may
+		 * happen to be served only after other queues.
 		 */
 		if (async_bfqq &&
 		    icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
 		    bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
 		    bfq_bfqq_budget_left(async_bfqq))
 			bfqq = bfqq->bic->bfqq[0];
+		else if (bfq_bfqq_has_waker(bfqq) &&
+			   bfq_bfqq_busy(bfqq->waker_bfqq) &&
+			   bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
+					      bfqq->waker_bfqq) <=
+			   bfq_bfqq_budget_left(bfqq->waker_bfqq)
+			)
+			bfqq = bfqq->waker_bfqq;
 		else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
 			 (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
 			  !bfq_bfqq_has_short_ttime(bfqq)))
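
The decision order of the three alternatives can be distilled as in the sketch below. The predicates are hypothetical stand-ins (the budget and busy checks are folded into the *_ok fields), so this models only the priority among the alternatives, not the kernel control flow itself.

/* Distilled priority order of the three injection alternatives. */
#include <stddef.h>

struct queue;	/* opaque; only pointers are needed here */

struct pick_ctx {
	struct queue *async_queue;	/* same-process async I/O, if any */
	struct queue *waker_queue;	/* confirmed, busy waker, if any */
	int async_ok;			/* async request fits its budget */
	int waker_ok;			/* waker request fits its budget */
	int avoid_injection;		/* bfqq better served alone */
	struct queue *(*choose_for_injection)(void);
};

static struct queue *pick_injection_queue(struct pick_ctx *c)
{
	if (c->async_queue && c->async_ok)
		return c->async_queue;		  /* 1: same-process async */
	if (c->waker_queue && c->waker_ok)
		return c->waker_queue;		  /* 2: unblock bfqq via waker */
	if (!c->avoid_injection)
		return c->choose_for_injection(); /* 3: may return NULL */
	return NULL;
}

static struct queue *no_candidate(void) { return NULL; }

int main(void)
{
	struct pick_ctx c = { .choose_for_injection = no_candidate };

	return pick_injection_queue(&c) == NULL ? 0 : 1;
}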
@@ -4564,6 +4741,9 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq)
 
 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
+	struct bfq_queue *item;
+	struct hlist_node *n;
+
 	if (bfqq == bfqd->in_service_queue) {
 		__bfq_bfqq_expire(bfqd, bfqq);
 		bfq_schedule_dispatch(bfqd);
@@ -4573,6 +4753,18 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 
 	bfq_put_cooperator(bfqq);
 
+	/* remove bfqq from woken list */
+	if (!hlist_unhashed(&bfqq->woken_list_node))
+		hlist_del_init(&bfqq->woken_list_node);
+
+	/* reset waker for all queues in woken list */
+	hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
+				  woken_list_node) {
+		item->waker_bfqq = NULL;
+		bfq_clear_bfqq_has_waker(item);
+		hlist_del_init(&item->woken_list_node);
+	}
+
 	bfq_put_queue(bfqq); /* release process reference */
 }
 
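
The cleanup contract above — detach bfqq from its waker's list, then orphan every queue that bfqq was itself waking — can be modeled in isolation as follows. A plain singly linked list stands in for the kernel's hlist, and all names here are illustrative.

/* Userspace model of the exit-time woken-list cleanup. */
#include <stdio.h>
#include <stddef.h>

struct mq {
	struct mq *waker;	/* analogue of waker_bfqq */
	int has_waker;		/* analogue of the has_waker flag */
	struct mq *woken_head;	/* analogue of woken_list */
	struct mq *woken_next;	/* analogue of woken_list_node */
};

/* Unlink q from its waker's woken list, like hlist_del_init(). */
static void woken_unlink(struct mq *q)
{
	struct mq **pp;

	if (!q->waker)
		return;
	for (pp = &q->waker->woken_head; *pp; pp = &(*pp)->woken_next) {
		if (*pp == q) {
			*pp = q->woken_next;
			q->woken_next = NULL;
			break;
		}
	}
}

/* Exit-time cleanup mirroring the two new steps in bfq_exit_bfqq(). */
static void mq_exit(struct mq *q)
{
	struct mq *item = q->woken_head;

	woken_unlink(q);		/* remove q from its waker's list */
	while (item) {			/* reset waker for all woken queues */
		struct mq *next = item->woken_next;

		item->waker = NULL;
		item->has_waker = 0;
		item->woken_next = NULL;
		item = next;
	}
	q->woken_head = NULL;
}

int main(void)
{
	struct mq waker = { 0 }, woken = { 0 };

	woken.waker = &waker;
	waker.woken_head = &woken;
	mq_exit(&waker);	/* the waker exits; woken is orphaned */
	printf("woken.waker=%p has_waker=%d\n",
	       (void *)woken.waker, woken.has_waker);
	return 0;
}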
@@ -4691,6 +4883,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	RB_CLEAR_NODE(&bfqq->entity.rb_node);
 	INIT_LIST_HEAD(&bfqq->fifo);
 	INIT_HLIST_NODE(&bfqq->burst_list_node);
+	INIT_HLIST_NODE(&bfqq->woken_list_node);
+	INIT_HLIST_HEAD(&bfqq->woken_list);
 
 	bfqq->ref = 0;
 	bfqq->bfqd = bfqd;
@@ -4909,28 +5103,27 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
 	 * bfqq may have a long think time because of a
 	 * synchronization with some other queue, i.e., because the
 	 * I/O of some other queue may need to be completed for bfqq
-	 * to receive new I/O. This happens, e.g., if bfqq is
-	 * associated with a process that does some sync. A sync
-	 * generates extra blocking I/O, which must be completed
-	 * before the process associated with bfqq can go on with its
-	 * I/O.
+	 * to receive new I/O. Details in the comments on the choice
+	 * of the queue for injection in bfq_select_queue().
 	 *
-	 * If such a synchronization is actually in place, then,
-	 * without injection on bfqq, the blocking I/O cannot happen
-	 * to served while bfqq is in service. As a consequence, if
-	 * bfqq is granted I/O-dispatch-plugging, then bfqq remains
-	 * empty, and no I/O is dispatched, until the idle timeout
-	 * fires. This is likely to result in lower bandwidth and
-	 * higher latencies for bfqq, and in a severe loss of total
-	 * throughput.
+	 * As stressed in those comments, if such a synchronization is
+	 * actually in place, then, without injection on bfqq, the
+	 * blocking I/O cannot happen to be served while bfqq is in
+	 * service. As a consequence, if bfqq is granted
+	 * I/O-dispatch-plugging, then bfqq remains empty, and no I/O
+	 * is dispatched, until the idle timeout fires. This is likely
+	 * to result in lower bandwidth and higher latencies for bfqq,
+	 * and in a severe loss of total throughput.
 	 *
 	 * On the opposite end, a non-zero inject limit may allow the
 	 * I/O that blocks bfqq to be executed soon, and therefore
-	 * bfqq to receive new I/O soon. But, if this actually
-	 * happens, then the next think-time sample for bfqq may be
-	 * very low. This in turn may cause bfqq's think time to be
-	 * deemed short. Without the 100 ms barrier, this new state
-	 * change would cause the body of the next if to be executed
+	 * bfqq to receive new I/O soon.
+	 *
+	 * But, if the blocking gets actually eliminated, then the
+	 * next think-time sample for bfqq may be very low. This in
+	 * turn may cause bfqq's think time to be deemed
+	 * short. Without the 100 ms barrier, this new state change
+	 * would cause the body of the next if to be executed
 	 * immediately. But this would set to 0 the inject
 	 * limit. Without injection, the blocking I/O would cause the
 	 * think time of bfqq to become long again, and therefore the
@@ -4941,11 +5134,11 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
 	 * In contrast, if the inject limit is not reset during such a
 	 * long time interval as 100 ms, then the number of short
 	 * think time samples can grow significantly before the reset
-	 * is allowed. As a consequence, the think time state can
-	 * become stable before the reset. There will be no state
-	 * change when the 100 ms elapse, and therefore no reset of
-	 * the inject limit. The inject limit remains steadily equal
-	 * to 1 both during and after the 100 ms. So injection can be
+	 * is performed. As a consequence, the think time state can
+	 * become stable before the reset. Therefore there will be no
+	 * state change when the 100 ms elapse, and no reset of the
+	 * inject limit. The inject limit remains steadily equal to 1
+	 * both during and after the 100 ms. So injection can be
 	 * performed at all times, and throughput gets boosted.
 	 *
 	 * An inject limit equal to 1 is however in conflict, in
* An inject limit equal to 1 is however in conflict, in
@@ -4960,10 +5153,20 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
4960
5153
* brought forward, because it is not blocked for
4961
5154
* milliseconds.
4962
5155
*
4963
- * In addition, during the 100 ms, the base value for the
4964
- * total service time is likely to get finally computed,
4965
- * freeing the inject limit from its relation with the think
4966
- * time.
5156
+ * In addition, serving the blocking I/O much sooner, and much
5157
+ * more frequently than once per I/O-plugging timeout, makes
5158
+ * it much quicker to detect a waker queue (the concept of
5159
+ * waker queue is defined in the comments in
5160
+ * bfq_add_request()). This makes it possible to start sooner
5161
+ * to boost throughput more effectively, by injecting the I/O
5162
+ * of the waker queue unconditionally on every
5163
+ * bfq_dispatch_request().
5164
+ *
5165
+ * One last, important benefit of not resetting the inject
5166
+ * limit before 100 ms is that, during this time interval, the
5167
+ * base value for the total service time is likely to get
5168
+ * finally computed for bfqq, freeing the inject limit from
5169
+ * its relation with the think time.
4967
5170
*/
4968
5171
if (state_changed && bfqq -> last_serv_time_ns == 0 &&
4969
5172
(time_is_before_eq_jiffies (bfqq -> decrease_time_jif +
@@ -5278,6 +5481,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 			1UL<<(BFQ_RATE_SHIFT - 10))
 		bfq_update_rate_reset(bfqd, NULL);
 	bfqd->last_completion = now_ns;
+	bfqd->last_completed_rq_bfqq = bfqq;
 
 	/*
 	 * If we are waiting to discover whether the request pattern