
Commit 8525e5f

Merge branch 'for-linus' into test
* for-linus:
  block, bfq: add requeue-request hook
  bcache: fix for data collapse after re-attaching an attached device
  bcache: return attach error when no cache set exist
  bcache: set writeback_rate_update_seconds in range [1, 60] seconds
  bcache: fix for allocator and register thread race
  bcache: set error_limit correctly
  bcache: properly set task state in bch_writeback_thread()
  bcache: fix high CPU occupancy during journal
  bcache: add journal statistic
  block: Add should_fail_bio() for bpf error injection
  blk-wbt: account flush requests correctly
2 parents 61a6951 + a787739 commit 8525e5f

12 files changed: +212 −63 lines changed

block/bfq-iosched.c

Lines changed: 82 additions & 25 deletions
@@ -3823,24 +3823,26 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 		}
 
 		/*
-		 * We exploit the bfq_finish_request hook to decrement
-		 * rq_in_driver, but bfq_finish_request will not be
-		 * invoked on this request. So, to avoid unbalance,
-		 * just start this request, without incrementing
-		 * rq_in_driver. As a negative consequence,
-		 * rq_in_driver is deceptively lower than it should be
-		 * while this request is in service. This may cause
-		 * bfq_schedule_dispatch to be invoked uselessly.
+		 * We exploit the bfq_finish_requeue_request hook to
+		 * decrement rq_in_driver, but
+		 * bfq_finish_requeue_request will not be invoked on
+		 * this request. So, to avoid unbalance, just start
+		 * this request, without incrementing rq_in_driver. As
+		 * a negative consequence, rq_in_driver is deceptively
+		 * lower than it should be while this request is in
+		 * service. This may cause bfq_schedule_dispatch to be
+		 * invoked uselessly.
 		 *
 		 * As for implementing an exact solution, the
-		 * bfq_finish_request hook, if defined, is probably
-		 * invoked also on this request. So, by exploiting
-		 * this hook, we could 1) increment rq_in_driver here,
-		 * and 2) decrement it in bfq_finish_request. Such a
-		 * solution would let the value of the counter be
-		 * always accurate, but it would entail using an extra
-		 * interface function. This cost seems higher than the
-		 * benefit, being the frequency of non-elevator-private
+		 * bfq_finish_requeue_request hook, if defined, is
+		 * probably invoked also on this request. So, by
+		 * exploiting this hook, we could 1) increment
+		 * rq_in_driver here, and 2) decrement it in
+		 * bfq_finish_requeue_request. Such a solution would
+		 * let the value of the counter be always accurate,
+		 * but it would entail using an extra interface
+		 * function. This cost seems higher than the benefit,
+		 * being the frequency of non-elevator-private
 		 * requests very low.
 		 */
 		goto start_rq;
@@ -4515,6 +4517,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
 					   unsigned int cmd_flags) {}
 #endif
 
+static void bfq_prepare_request(struct request *rq, struct bio *bio);
+
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       bool at_head)
 {
@@ -4541,6 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		else
 			list_add_tail(&rq->queuelist, &bfqd->dispatch);
 	} else {
+		if (WARN_ON_ONCE(!bfqq)) {
+			/*
+			 * This should never happen. Most likely rq is
+			 * a requeued regular request, being
+			 * re-inserted without being first
+			 * re-prepared. Do a prepare, to avoid
+			 * failure.
+			 */
+			bfq_prepare_request(rq, rq->bio);
+			bfqq = RQ_BFQQ(rq);
+		}
+
 		idle_timer_disabled = __bfq_insert_request(bfqd, rq);
 		/*
 		 * Update bfqq, because, if a queue merge has occurred
@@ -4697,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 		bfq_schedule_dispatch(bfqd);
 }
 
-static void bfq_finish_request_body(struct bfq_queue *bfqq)
+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
 {
 	bfqq->allocated--;
 
 	bfq_put_queue(bfqq);
 }
 
-static void bfq_finish_request(struct request *rq)
+/*
+ * Handle either a requeue or a finish for rq. The things to do are
+ * the same in both cases: all references to rq are to be dropped. In
+ * particular, rq is considered completed from the point of view of
+ * the scheduler.
+ */
+static void bfq_finish_requeue_request(struct request *rq)
 {
-	struct bfq_queue *bfqq;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	struct bfq_data *bfqd;
 
-	if (!rq->elv.icq)
+	/*
+	 * Requeue and finish hooks are invoked in blk-mq without
+	 * checking whether the involved request is actually still
+	 * referenced in the scheduler. To handle this fact, the
+	 * following two checks make this function exit in case of
+	 * spurious invocations, for which there is nothing to do.
+	 *
+	 * First, check whether rq has nothing to do with an elevator.
+	 */
+	if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
+		return;
+
+	/*
+	 * rq either is not associated with any icq, or is an already
+	 * requeued request that has not (yet) been re-inserted into
+	 * a bfq_queue.
+	 */
+	if (!rq->elv.icq || !bfqq)
 		return;
 
-	bfqq = RQ_BFQQ(rq);
 	bfqd = bfqq->bfqd;
 
 	if (rq->rq_flags & RQF_STARTED)
@@ -4727,13 +4765,14 @@ static void bfq_finish_request(struct request *rq)
 		spin_lock_irqsave(&bfqd->lock, flags);
 
 		bfq_completed_request(bfqq, bfqd);
-		bfq_finish_request_body(bfqq);
+		bfq_finish_requeue_request_body(bfqq);
 
 		spin_unlock_irqrestore(&bfqd->lock, flags);
 	} else {
 		/*
 		 * Request rq may be still/already in the scheduler,
-		 * in which case we need to remove it. And we cannot
+		 * in which case we need to remove it (this should
+		 * never happen in case of requeue). And we cannot
 		 * defer such a check and removal, to avoid
 		 * inconsistencies in the time interval from the end
 		 * of this function to the start of the deferred work.
@@ -4748,9 +4787,26 @@ static void bfq_finish_request(struct request *rq)
 			bfqg_stats_update_io_remove(bfqq_group(bfqq),
 						    rq->cmd_flags);
 		}
-		bfq_finish_request_body(bfqq);
+		bfq_finish_requeue_request_body(bfqq);
 	}
 
+	/*
+	 * Reset private fields. In case of a requeue, this allows
+	 * this function to correctly do nothing if it is spuriously
+	 * invoked again on this same request (see the check at the
+	 * beginning of the function). Probably, a better general
+	 * design would be to prevent blk-mq from invoking the requeue
+	 * or finish hooks of an elevator, for a request that is not
+	 * referred by that elevator.
+	 *
+	 * Resetting the following fields would break the
+	 * request-insertion logic if rq is re-inserted into a bfq
+	 * internal queue, without a re-preparation. Here we assume
+	 * that re-insertions of requeued requests, without
+	 * re-preparation, can happen only for pass_through or at_head
+	 * requests (which are not re-inserted into bfq internal
+	 * queues).
+	 */
 	rq->elv.priv[0] = NULL;
 	rq->elv.priv[1] = NULL;
 }
@@ -5426,7 +5482,8 @@ static struct elevator_type iosched_bfq_mq = {
 	.ops.mq = {
 		.limit_depth		= bfq_limit_depth,
 		.prepare_request	= bfq_prepare_request,
-		.finish_request		= bfq_finish_request,
+		.requeue_request	= bfq_finish_requeue_request,
+		.finish_request		= bfq_finish_requeue_request,
 		.exit_icq		= bfq_exit_icq,
 		.insert_requests	= bfq_insert_requests,
 		.dispatch_request	= bfq_dispatch_request,
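
The key design point in these hunks is that one handler, bfq_finish_requeue_request(), now serves both the requeue_request and the finish_request hooks, so it must tolerate spurious or repeated invocations for the same request. Below is a minimal user-space C sketch of that idempotency pattern; the types and names (fake_request, finish_or_requeue, has_elv_priv) are invented stand-ins for illustration, not blk-mq APIs. The guard checks on entry and the reset of private state on exit mirror the RQF_ELVPRIV test and the clearing of rq->elv.priv[] in the diff.

#include <stdio.h>

/* Hypothetical stand-ins for a request and its scheduler-private data. */
struct fake_queue { int allocated; };

struct fake_request {
	int has_elv_priv;		/* analogue of RQF_ELVPRIV    */
	struct fake_queue *priv;	/* analogue of rq->elv.priv[] */
};

/* One handler shared by the "requeue" and "finish" paths. */
static void finish_or_requeue(struct fake_request *rq)
{
	/* Spurious call: the request never belonged to this scheduler. */
	if (!rq->has_elv_priv)
		return;

	/* Spurious call: already torn down by an earlier invocation. */
	if (!rq->priv)
		return;

	rq->priv->allocated--;
	printf("torn down, allocated now %d\n", rq->priv->allocated);

	/* Reset private state so a repeated call does nothing. */
	rq->priv = NULL;
}

int main(void)
{
	struct fake_queue q = { .allocated = 1 };
	struct fake_request rq = { .has_elv_priv = 1, .priv = &q };

	finish_or_requeue(&rq);		/* does the real work     */
	finish_or_requeue(&rq);		/* second call is a no-op */
	return 0;
}

The same two-step defence (check on entry, clear on exit) is what lets the bfq hook share one teardown path between finish and requeue without double-dropping references.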

block/blk-core.c

Lines changed: 10 additions & 1 deletion
@@ -34,6 +34,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
 #include <linux/debugfs.h>
+#include <linux/bpf.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -2083,6 +2084,14 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
 	return false;
 }
 
+static noinline int should_fail_bio(struct bio *bio)
+{
+	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+		return -EIO;
+	return 0;
+}
+ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -2174,7 +2183,7 @@ generic_make_request_checks(struct bio *bio)
 	if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
 		goto not_supported;
 
-	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+	if (should_fail_bio(bio))
 		goto end_io;
 
 	if (!bio->bi_partno) {
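
The blk-core change factors the failure check into its own noinline function so that BPF-based error injection can override should_fail_bio()'s return value without touching the normal submission path. A rough user-space analogue of the same refactoring is sketched below; here the injection point is a plain function pointer (fail_override), purely illustrative, whereas in the kernel the override comes from the error-injection framework enabled by ALLOW_ERROR_INJECTION. The helper names (submit_bio_checks, always_fail) are invented for this sketch.

#include <errno.h>
#include <stdio.h>

struct bio { unsigned int size; };

/* Optional override hook, standing in for the kernel's injection point. */
static int (*fail_override)(struct bio *bio);

/*
 * Keep the check in a separate, non-inlined function so that a tool
 * (here: the function pointer) can force an error return for testing.
 */
static __attribute__((noinline)) int should_fail_bio(struct bio *bio)
{
	if (fail_override)
		return fail_override(bio);
	return 0;			/* normally: do not fail */
}

static int submit_bio_checks(struct bio *bio)
{
	if (should_fail_bio(bio))
		return -EIO;		/* injected failure path */
	return 0;
}

static int always_fail(struct bio *bio) { (void)bio; return -EIO; }

int main(void)
{
	struct bio b = { .size = 4096 };

	printf("no injection:   %d\n", submit_bio_checks(&b));
	fail_override = always_fail;
	printf("with injection: %d\n", submit_bio_checks(&b));
	return 0;
}

The design point is the same in both settings: the overridable function must carry an errno-style return and must not be inlined, otherwise there is no stable symbol for the injector to hook.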

block/blk-wbt.c

Lines changed: 9 additions & 1 deletion
@@ -697,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
 
 static int wbt_data_dir(const struct request *rq)
 {
-	return rq_data_dir(rq);
+	const int op = req_op(rq);
+
+	if (op == REQ_OP_READ)
+		return READ;
+	else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+		return WRITE;
+
+	/* don't account */
+	return -1;
 }
 
 int wbt_init(struct request_queue *q)
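
With this change, flush requests are explicitly accounted against the write bucket rather than whatever rq_data_dir() happened to report, and any other operation is excluded from accounting (-1). A small standalone C sketch of that classification follows; the enum values are local stand-ins for the kernel's REQ_OP_* constants, chosen only for illustration.

#include <stdio.h>

enum op { OP_READ, OP_WRITE, OP_FLUSH, OP_DISCARD };

/* Map an operation to an accounting bucket: 0 = read, 1 = write, -1 = skip. */
static int wbt_data_dir(enum op op)
{
	if (op == OP_READ)
		return 0;
	else if (op == OP_WRITE || op == OP_FLUSH)
		return 1;

	/* don't account */
	return -1;
}

int main(void)
{
	const char *names[] = { "read", "write", "flush", "discard" };

	for (int op = OP_READ; op <= OP_DISCARD; op++)
		printf("%-7s -> %d\n", names[op], wbt_data_dir((enum op)op));
	return 0;
}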

drivers/md/bcache/alloc.c

Lines changed: 3 additions & 1 deletion
@@ -287,8 +287,10 @@ do {									\
 			break;						\
 									\
 		mutex_unlock(&(ca)->set->bucket_lock);			\
-		if (kthread_should_stop())				\
+		if (kthread_should_stop()) {				\
+			set_current_state(TASK_RUNNING);		\
 			return 0;					\
+		}							\
 									\
 		schedule();						\
 		mutex_lock(&(ca)->set->bucket_lock);			\

drivers/md/bcache/bcache.h

Lines changed: 8 additions & 1 deletion
@@ -658,10 +658,15 @@ struct cache_set {
 	atomic_long_t		writeback_keys_done;
 	atomic_long_t		writeback_keys_failed;
 
+	atomic_long_t		reclaim;
+	atomic_long_t		flush_write;
+	atomic_long_t		retry_flush_write;
+
 	enum			{
 		ON_ERROR_UNREGISTER,
 		ON_ERROR_PANIC,
 	}			on_error;
+#define DEFAULT_IO_ERROR_LIMIT 8
 	unsigned		error_limit;
 	unsigned		error_decay;
 
@@ -675,6 +680,8 @@ struct cache_set {
 
 #define BUCKET_HASH_BITS	12
 	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
+
+	DECLARE_HEAP(struct btree *, flush_btree);
 };
 
 struct bbio {
@@ -917,7 +924,7 @@ void bcache_write_super(struct cache_set *);
 
 int bch_flash_dev_create(struct cache_set *c, uint64_t size);
 
-int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *);
 void bch_cached_dev_detach(struct cached_dev *);
 void bch_cached_dev_run(struct cached_dev *);
 void bcache_device_stop(struct bcache_device *);

drivers/md/bcache/btree.c

Lines changed: 6 additions & 3 deletions
@@ -1869,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c)
 	 */
 	for_each_cache(ca, c, i) {
 		for_each_bucket(b, ca) {
-			if (fifo_full(&ca->free[RESERVE_PRIO]))
+			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+			    fifo_full(&ca->free[RESERVE_BTREE]))
 				break;
 
 			if (bch_can_invalidate_bucket(ca, b) &&
 			    !GC_MARK(b)) {
 				__bch_invalidate_one_bucket(ca, b);
-				fifo_push(&ca->free[RESERVE_PRIO],
-					  b - ca->buckets);
+				if (!fifo_push(&ca->free[RESERVE_PRIO],
+				   b - ca->buckets))
+					fifo_push(&ca->free[RESERVE_BTREE],
+						  b - ca->buckets);
 			}
 		}
 	}
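
This hunk changes bch_initial_gc_finish() so that, once the prio reserve is full, invalidated buckets spill over into the btree reserve instead of the loop stopping early and leaving that reserve empty (part of the allocator/register-thread race fix listed in the merge). A minimal C sketch of the spill-over idea follows, using two tiny fixed-size queues in place of bcache's fifo_push()/fifo_full(); the names and capacity here are illustrative only.

#include <stdbool.h>
#include <stdio.h>

#define CAP 4

struct fifo { long data[CAP]; int used; };

static bool fifo_full(const struct fifo *f) { return f->used == CAP; }

static bool fifo_push(struct fifo *f, long v)
{
	if (fifo_full(f))
		return false;
	f->data[f->used++] = v;
	return true;
}

int main(void)
{
	struct fifo prio = { .used = 0 }, btree = { .used = 0 };

	for (long bucket = 0; bucket < 100; bucket++) {
		/* Stop only once *both* reserves are populated. */
		if (fifo_full(&prio) && fifo_full(&btree))
			break;

		/* Prefer the prio reserve, spill into the btree reserve. */
		if (!fifo_push(&prio, bucket))
			fifo_push(&btree, bucket);
	}

	printf("prio reserve: %d buckets, btree reserve: %d buckets\n",
	       prio.used, btree.used);
	return 0;
}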

drivers/md/bcache/journal.c

Lines changed: 37 additions & 15 deletions
@@ -368,35 +368,54 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 }
 
 /* Journalling */
+#define journal_max_cmp(l, r) \
+	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
+	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
+#define journal_min_cmp(l, r) \
+	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
+	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
 
 static void btree_flush_write(struct cache_set *c)
 {
 	/*
 	 * Try to find the btree node with that references the oldest journal
 	 * entry, best is our current candidate and is locked if non NULL:
 	 */
-	struct btree *b, *best;
-	unsigned i;
+	struct btree *b;
+	int i;
+
+	atomic_long_inc(&c->flush_write);
+
 retry:
-	best = NULL;
-
-	for_each_cached_btree(b, c, i)
-		if (btree_current_write(b)->journal) {
-			if (!best)
-				best = b;
-			else if (journal_pin_cmp(c,
-					btree_current_write(best)->journal,
-					btree_current_write(b)->journal)) {
-				best = b;
+	spin_lock(&c->journal.lock);
+	if (heap_empty(&c->flush_btree)) {
+		for_each_cached_btree(b, c, i)
+			if (btree_current_write(b)->journal) {
+				if (!heap_full(&c->flush_btree))
+					heap_add(&c->flush_btree, b,
+						 journal_max_cmp);
+				else if (journal_max_cmp(b,
+					 heap_peek(&c->flush_btree))) {
+					c->flush_btree.data[0] = b;
+					heap_sift(&c->flush_btree, 0,
						  journal_max_cmp);
+				}
 			}
-		}
 
-	b = best;
+		for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
+			heap_sift(&c->flush_btree, i, journal_min_cmp);
+	}
+
+	b = NULL;
+	heap_pop(&c->flush_btree, b, journal_min_cmp);
+	spin_unlock(&c->journal.lock);
+
 	if (b) {
 		mutex_lock(&b->write_lock);
 		if (!btree_current_write(b)->journal) {
 			mutex_unlock(&b->write_lock);
 			/* We raced */
+			atomic_long_inc(&c->retry_flush_write);
 			goto retry;
 		}
 
@@ -476,6 +495,8 @@ static void journal_reclaim(struct cache_set *c)
 	unsigned iter, n = 0;
 	atomic_t p;
 
+	atomic_long_inc(&c->reclaim);
+
 	while (!atomic_read(&fifo_front(&c->journal.pin)))
 		fifo_pop(&c->journal.pin, p);
 
@@ -819,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c)
 	j->w[0].c = c;
 	j->w[1].c = c;
 
-	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+	if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
+	    !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
 	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
 	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
 		return -ENOMEM;
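
The journal.c rework cuts CPU time in btree_flush_write() by scanning all cached btree nodes only when a small heap of flush candidates is empty: one pass fills a bounded heap with the nodes pinning the oldest journal entries, and subsequent calls simply pop the next candidate. The sketch below reproduces that idea in standalone C with plain integers as "journal ages" (smaller = older); the heap helpers are simplified local versions written for illustration, not bcache's heap_add()/heap_sift()/heap_pop(), and the candidate count is shrunk from 128 to 4 to keep the example readable.

#include <stdio.h>

#define NCAND 4	/* bounded candidate heap, like bcache's 128-entry flush_btree */

static int heap[NCAND];
static int used;

/* Sift element i down; cmp(parent, child) is true when parent may stay above child. */
static void heap_sift(int *h, int n, int i, int (*cmp)(int, int))
{
	for (;;) {
		int l = 2 * i + 1, r = l + 1, best = i, tmp;

		if (l < n && !cmp(h[best], h[l]))
			best = l;
		if (r < n && !cmp(h[best], h[r]))
			best = r;
		if (best == i)
			break;
		tmp = h[i]; h[i] = h[best]; h[best] = tmp;
		i = best;
	}
}

static int max_cmp(int parent, int child) { return parent >= child; }
static int min_cmp(int parent, int child) { return parent <= child; }

int main(void)
{
	/* "Journal ages" of btree nodes; smaller = older, flush those first. */
	int ages[] = { 9, 3, 17, 5, 12, 1, 8, 14, 2, 11 };
	int n = sizeof(ages) / sizeof(ages[0]);

	/* Single scan: keep the NCAND oldest entries in a max-heap. */
	for (int i = 0; i < n; i++) {
		if (used < NCAND) {
			heap[used++] = ages[i];
			for (int j = used / 2 - 1; j >= 0; j--)
				heap_sift(heap, used, j, max_cmp);
		} else if (ages[i] < heap[0]) {
			heap[0] = ages[i];
			heap_sift(heap, used, 0, max_cmp);
		}
	}

	/* Re-heapify as a min-heap so candidates pop oldest-first. */
	for (int j = used / 2 - 1; j >= 0; j--)
		heap_sift(heap, used, j, min_cmp);

	/* Later flushes just pop; no full rescan is needed until the heap drains. */
	while (used) {
		printf("flush node with journal age %d\n", heap[0]);
		heap[0] = heap[--used];
		heap_sift(heap, used, 0, min_cmp);
	}
	return 0;
}

This is the same amortization the patch performs with journal_max_cmp/journal_min_cmp: one O(n) scan per batch of up to 128 flushes instead of one scan per flush, with the new flush_write/retry_flush_write/reclaim counters exposing how often each path runs.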
