
Commit 8525e5f

Merge branch 'for-linus' into test
* for-linus:
  block, bfq: add requeue-request hook
  bcache: fix for data collapse after re-attaching an attached device
  bcache: return attach error when no cache set exist
  bcache: set writeback_rate_update_seconds in range [1, 60] seconds
  bcache: fix for allocator and register thread race
  bcache: set error_limit correctly
  bcache: properly set task state in bch_writeback_thread()
  bcache: fix high CPU occupancy during journal
  bcache: add journal statistic
  block: Add should_fail_bio() for bpf error injection
  blk-wbt: account flush requests correctly
2 parents 61a6951 + a787739 commit 8525e5f

12 files changed: +212 −63 lines changed

block/bfq-iosched.c

Lines changed: 82 additions & 25 deletions
@@ -3823,24 +3823,26 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 		}
 
 		/*
-		 * We exploit the bfq_finish_request hook to decrement
-		 * rq_in_driver, but bfq_finish_request will not be
-		 * invoked on this request. So, to avoid unbalance,
-		 * just start this request, without incrementing
-		 * rq_in_driver. As a negative consequence,
-		 * rq_in_driver is deceptively lower than it should be
-		 * while this request is in service. This may cause
-		 * bfq_schedule_dispatch to be invoked uselessly.
+		 * We exploit the bfq_finish_requeue_request hook to
+		 * decrement rq_in_driver, but
+		 * bfq_finish_requeue_request will not be invoked on
+		 * this request. So, to avoid unbalance, just start
+		 * this request, without incrementing rq_in_driver. As
+		 * a negative consequence, rq_in_driver is deceptively
+		 * lower than it should be while this request is in
+		 * service. This may cause bfq_schedule_dispatch to be
+		 * invoked uselessly.
 		 *
 		 * As for implementing an exact solution, the
-		 * bfq_finish_request hook, if defined, is probably
-		 * invoked also on this request. So, by exploiting
-		 * this hook, we could 1) increment rq_in_driver here,
-		 * and 2) decrement it in bfq_finish_request. Such a
-		 * solution would let the value of the counter be
-		 * always accurate, but it would entail using an extra
-		 * interface function. This cost seems higher than the
-		 * benefit, being the frequency of non-elevator-private
+		 * bfq_finish_requeue_request hook, if defined, is
+		 * probably invoked also on this request. So, by
+		 * exploiting this hook, we could 1) increment
+		 * rq_in_driver here, and 2) decrement it in
+		 * bfq_finish_requeue_request. Such a solution would
+		 * let the value of the counter be always accurate,
+		 * but it would entail using an extra interface
+		 * function. This cost seems higher than the benefit,
+		 * being the frequency of non-elevator-private
 		 * requests very low.
 		 */
 		goto start_rq;
@@ -4515,6 +4517,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
 					   unsigned int cmd_flags) {}
 #endif
 
+static void bfq_prepare_request(struct request *rq, struct bio *bio);
+
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       bool at_head)
 {
@@ -4541,6 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		else
 			list_add_tail(&rq->queuelist, &bfqd->dispatch);
 	} else {
+		if (WARN_ON_ONCE(!bfqq)) {
+			/*
+			 * This should never happen. Most likely rq is
+			 * a requeued regular request, being
+			 * re-inserted without being first
+			 * re-prepared. Do a prepare, to avoid
+			 * failure.
+			 */
+			bfq_prepare_request(rq, rq->bio);
+			bfqq = RQ_BFQQ(rq);
+		}
+
 		idle_timer_disabled = __bfq_insert_request(bfqd, rq);
 		/*
 		 * Update bfqq, because, if a queue merge has occurred
@@ -4697,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 		bfq_schedule_dispatch(bfqd);
 }
 
-static void bfq_finish_request_body(struct bfq_queue *bfqq)
+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
 {
 	bfqq->allocated--;
 
 	bfq_put_queue(bfqq);
 }
 
-static void bfq_finish_request(struct request *rq)
+/*
+ * Handle either a requeue or a finish for rq. The things to do are
+ * the same in both cases: all references to rq are to be dropped. In
+ * particular, rq is considered completed from the point of view of
+ * the scheduler.
+ */
+static void bfq_finish_requeue_request(struct request *rq)
 {
-	struct bfq_queue *bfqq;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	struct bfq_data *bfqd;
 
-	if (!rq->elv.icq)
+	/*
+	 * Requeue and finish hooks are invoked in blk-mq without
+	 * checking whether the involved request is actually still
+	 * referenced in the scheduler. To handle this fact, the
+	 * following two checks make this function exit in case of
+	 * spurious invocations, for which there is nothing to do.
+	 *
+	 * First, check whether rq has nothing to do with an elevator.
+	 */
+	if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
+		return;
+
+	/*
+	 * rq either is not associated with any icq, or is an already
+	 * requeued request that has not (yet) been re-inserted into
+	 * a bfq_queue.
+	 */
+	if (!rq->elv.icq || !bfqq)
 		return;
 
-	bfqq = RQ_BFQQ(rq);
 	bfqd = bfqq->bfqd;
 
 	if (rq->rq_flags & RQF_STARTED)
@@ -4727,13 +4765,14 @@ static void bfq_finish_request(struct request *rq)
 		spin_lock_irqsave(&bfqd->lock, flags);
 
 		bfq_completed_request(bfqq, bfqd);
-		bfq_finish_request_body(bfqq);
+		bfq_finish_requeue_request_body(bfqq);
 
 		spin_unlock_irqrestore(&bfqd->lock, flags);
 	} else {
 		/*
 		 * Request rq may be still/already in the scheduler,
-		 * in which case we need to remove it. And we cannot
+		 * in which case we need to remove it (this should
+		 * never happen in case of requeue). And we cannot
 		 * defer such a check and removal, to avoid
 		 * inconsistencies in the time interval from the end
 		 * of this function to the start of the deferred work.
@@ -4748,9 +4787,26 @@ static void bfq_finish_request(struct request *rq)
 			bfqg_stats_update_io_remove(bfqq_group(bfqq),
 						    rq->cmd_flags);
 		}
-		bfq_finish_request_body(bfqq);
+		bfq_finish_requeue_request_body(bfqq);
 	}
 
+	/*
+	 * Reset private fields. In case of a requeue, this allows
+	 * this function to correctly do nothing if it is spuriously
+	 * invoked again on this same request (see the check at the
+	 * beginning of the function). Probably, a better general
+	 * design would be to prevent blk-mq from invoking the requeue
+	 * or finish hooks of an elevator, for a request that is not
+	 * referred by that elevator.
+	 *
+	 * Resetting the following fields would break the
+	 * request-insertion logic if rq is re-inserted into a bfq
+	 * internal queue, without a re-preparation. Here we assume
+	 * that re-insertions of requeued requests, without
+	 * re-preparation, can happen only for pass_through or at_head
+	 * requests (which are not re-inserted into bfq internal
+	 * queues).
+	 */
 	rq->elv.priv[0] = NULL;
 	rq->elv.priv[1] = NULL;
 }
@@ -5426,7 +5482,8 @@ static struct elevator_type iosched_bfq_mq = {
 	.ops.mq = {
 		.limit_depth		= bfq_limit_depth,
 		.prepare_request	= bfq_prepare_request,
-		.finish_request		= bfq_finish_request,
+		.requeue_request	= bfq_finish_requeue_request,
+		.finish_request		= bfq_finish_requeue_request,
 		.exit_icq		= bfq_exit_icq,
 		.insert_requests	= bfq_insert_requests,
 		.dispatch_request	= bfq_dispatch_request,
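
The key design point in these hunks is that one handler, bfq_finish_requeue_request(), now serves both the requeue_request and the finish_request hooks, so it must tolerate spurious or repeated invocations for the same request. Below is a minimal user-space C sketch of that idempotency pattern; the types and names (fake_request, finish_or_requeue, has_elv_priv) are invented stand-ins for illustration, not blk-mq APIs. The guard checks on entry and the reset of private state on exit mirror the RQF_ELVPRIV test and the clearing of rq->elv.priv[] in the diff.

#include <stdio.h>

/* Hypothetical stand-ins for a request and its scheduler-private data. */
struct fake_queue { int allocated; };

struct fake_request {
	int has_elv_priv;		/* analogue of RQF_ELVPRIV    */
	struct fake_queue *priv;	/* analogue of rq->elv.priv[] */
};

/* One handler shared by the "requeue" and "finish" paths. */
static void finish_or_requeue(struct fake_request *rq)
{
	/* Spurious call: the request never belonged to this scheduler. */
	if (!rq->has_elv_priv)
		return;

	/* Spurious call: already torn down by an earlier invocation. */
	if (!rq->priv)
		return;

	rq->priv->allocated--;
	printf("torn down, allocated now %d\n", rq->priv->allocated);

	/* Reset private state so a repeated call does nothing. */
	rq->priv = NULL;
}

int main(void)
{
	struct fake_queue q = { .allocated = 1 };
	struct fake_request rq = { .has_elv_priv = 1, .priv = &q };

	finish_or_requeue(&rq);		/* does the real work     */
	finish_or_requeue(&rq);		/* second call is a no-op */
	return 0;
}

The same two-step defence (check on entry, clear on exit) is what lets the bfq hook share one teardown path between finish and requeue without double-dropping references.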

block/blk-core.c

Lines changed: 10 additions & 1 deletion
@@ -34,6 +34,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
 #include <linux/debugfs.h>
+#include <linux/bpf.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -2083,6 +2084,14 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
 	return false;
 }
 
+static noinline int should_fail_bio(struct bio *bio)
+{
+	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+		return -EIO;
+	return 0;
+}
+ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -2174,7 +2183,7 @@ generic_make_request_checks(struct bio *bio)
 	if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
 		goto not_supported;
 
-	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+	if (should_fail_bio(bio))
 		goto end_io;
 
 	if (!bio->bi_partno) {
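
The blk-core change factors the failure check into its own noinline function so that BPF-based error injection can override should_fail_bio()'s return value without touching the normal submission path. A rough user-space analogue of the same refactoring is sketched below; here the injection point is a plain function pointer (fail_override), purely illustrative, whereas in the kernel the override comes from the error-injection framework enabled by ALLOW_ERROR_INJECTION. The helper names (submit_bio_checks, always_fail) are invented for this sketch.

#include <errno.h>
#include <stdio.h>

struct bio { unsigned int size; };

/* Optional override hook, standing in for the kernel's injection point. */
static int (*fail_override)(struct bio *bio);

/*
 * Keep the check in a separate, non-inlined function so that a tool
 * (here: the function pointer) can force an error return for testing.
 */
static __attribute__((noinline)) int should_fail_bio(struct bio *bio)
{
	if (fail_override)
		return fail_override(bio);
	return 0;			/* normally: do not fail */
}

static int submit_bio_checks(struct bio *bio)
{
	if (should_fail_bio(bio))
		return -EIO;		/* injected failure path */
	return 0;
}

static int always_fail(struct bio *bio) { (void)bio; return -EIO; }

int main(void)
{
	struct bio b = { .size = 4096 };

	printf("no injection:   %d\n", submit_bio_checks(&b));
	fail_override = always_fail;
	printf("with injection: %d\n", submit_bio_checks(&b));
	return 0;
}

The design point is the same in both settings: the overridable function must carry an errno-style return and must not be inlined, otherwise there is no stable symbol for the injector to hook.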

block/blk-wbt.c

Lines changed: 9 additions & 1 deletion
@@ -697,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
 
 static int wbt_data_dir(const struct request *rq)
 {
-	return rq_data_dir(rq);
+	const int op = req_op(rq);
+
+	if (op == REQ_OP_READ)
+		return READ;
+	else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+		return WRITE;
+
+	/* don't account */
+	return -1;
 }
 
 int wbt_init(struct request_queue *q)
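
With this change, flush requests are explicitly accounted against the write bucket rather than whatever rq_data_dir() happened to report, and any other operation is excluded from accounting (-1). A small standalone C sketch of that classification follows; the enum values are local stand-ins for the kernel's REQ_OP_* constants, chosen only for illustration.

#include <stdio.h>

enum op { OP_READ, OP_WRITE, OP_FLUSH, OP_DISCARD };

/* Map an operation to an accounting bucket: 0 = read, 1 = write, -1 = skip. */
static int wbt_data_dir(enum op op)
{
	if (op == OP_READ)
		return 0;
	else if (op == OP_WRITE || op == OP_FLUSH)
		return 1;

	/* don't account */
	return -1;
}

int main(void)
{
	const char *names[] = { "read", "write", "flush", "discard" };

	for (int op = OP_READ; op <= OP_DISCARD; op++)
		printf("%-7s -> %d\n", names[op], wbt_data_dir((enum op)op));
	return 0;
}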

drivers/md/bcache/alloc.c

Lines changed: 3 additions & 1 deletion
@@ -287,8 +287,10 @@ do {									\
 			break;						\
 									\
 		mutex_unlock(&(ca)->set->bucket_lock);			\
-		if (kthread_should_stop())				\
+		if (kthread_should_stop()) {				\
+			set_current_state(TASK_RUNNING);		\
 			return 0;					\
+		}							\
 									\
 		schedule();						\
 		mutex_lock(&(ca)->set->bucket_lock);			\

drivers/md/bcache/bcache.h

Lines changed: 8 additions & 1 deletion
@@ -658,10 +658,15 @@ struct cache_set {
 	atomic_long_t		writeback_keys_done;
 	atomic_long_t		writeback_keys_failed;
 
+	atomic_long_t		reclaim;
+	atomic_long_t		flush_write;
+	atomic_long_t		retry_flush_write;
+
 	enum			{
 		ON_ERROR_UNREGISTER,
 		ON_ERROR_PANIC,
 	}			on_error;
+#define DEFAULT_IO_ERROR_LIMIT 8
 	unsigned		error_limit;
 	unsigned		error_decay;
 
@@ -675,6 +680,8 @@ struct cache_set {
 
 #define BUCKET_HASH_BITS	12
 	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
+
+	DECLARE_HEAP(struct btree *, flush_btree);
 };
 
 struct bbio {
@@ -917,7 +924,7 @@ void bcache_write_super(struct cache_set *);
 
 int bch_flash_dev_create(struct cache_set *c, uint64_t size);
 
-int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *);
 void bch_cached_dev_detach(struct cached_dev *);
 void bch_cached_dev_run(struct cached_dev *);
 void bcache_device_stop(struct bcache_device *);

drivers/md/bcache/btree.c

Lines changed: 6 additions & 3 deletions
@@ -1869,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c)
 	 */
 	for_each_cache(ca, c, i) {
 		for_each_bucket(b, ca) {
-			if (fifo_full(&ca->free[RESERVE_PRIO]))
+			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+			    fifo_full(&ca->free[RESERVE_BTREE]))
 				break;
 
 			if (bch_can_invalidate_bucket(ca, b) &&
 			    !GC_MARK(b)) {
 				__bch_invalidate_one_bucket(ca, b);
-				fifo_push(&ca->free[RESERVE_PRIO],
-					  b - ca->buckets);
+				if (!fifo_push(&ca->free[RESERVE_PRIO],
+				   b - ca->buckets))
+					fifo_push(&ca->free[RESERVE_BTREE],
+						  b - ca->buckets);
 			}
 		}
 	}
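
This hunk changes bch_initial_gc_finish() so that, once the prio reserve is full, invalidated buckets spill over into the btree reserve instead of the loop stopping early and leaving that reserve empty (part of the allocator/register-thread race fix listed in the merge). A minimal C sketch of the spill-over idea follows, using two tiny fixed-size queues in place of bcache's fifo_push()/fifo_full(); the names and capacity here are illustrative only.

#include <stdbool.h>
#include <stdio.h>

#define CAP 4

struct fifo { long data[CAP]; int used; };

static bool fifo_full(const struct fifo *f) { return f->used == CAP; }

static bool fifo_push(struct fifo *f, long v)
{
	if (fifo_full(f))
		return false;
	f->data[f->used++] = v;
	return true;
}

int main(void)
{
	struct fifo prio = { .used = 0 }, btree = { .used = 0 };

	for (long bucket = 0; bucket < 100; bucket++) {
		/* Stop only once *both* reserves are populated. */
		if (fifo_full(&prio) && fifo_full(&btree))
			break;

		/* Prefer the prio reserve, spill into the btree reserve. */
		if (!fifo_push(&prio, bucket))
			fifo_push(&btree, bucket);
	}

	printf("prio reserve: %d buckets, btree reserve: %d buckets\n",
	       prio.used, btree.used);
	return 0;
}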

drivers/md/bcache/journal.c

Lines changed: 37 additions & 15 deletions
@@ -368,35 +368,54 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 }
 
 /* Journalling */
+#define journal_max_cmp(l, r) \
+	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
+	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
+#define journal_min_cmp(l, r) \
+	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
+	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
 
 static void btree_flush_write(struct cache_set *c)
 {
 	/*
 	 * Try to find the btree node with that references the oldest journal
 	 * entry, best is our current candidate and is locked if non NULL:
 	 */
-	struct btree *b, *best;
-	unsigned i;
+	struct btree *b;
+	int i;
+
+	atomic_long_inc(&c->flush_write);
+
 retry:
-	best = NULL;
-
-	for_each_cached_btree(b, c, i)
-		if (btree_current_write(b)->journal) {
-			if (!best)
-				best = b;
-			else if (journal_pin_cmp(c,
-					btree_current_write(best)->journal,
-					btree_current_write(b)->journal)) {
-				best = b;
+	spin_lock(&c->journal.lock);
+	if (heap_empty(&c->flush_btree)) {
+		for_each_cached_btree(b, c, i)
+			if (btree_current_write(b)->journal) {
+				if (!heap_full(&c->flush_btree))
+					heap_add(&c->flush_btree, b,
+						 journal_max_cmp);
+				else if (journal_max_cmp(b,
+					 heap_peek(&c->flush_btree))) {
+					c->flush_btree.data[0] = b;
+					heap_sift(&c->flush_btree, 0,
						  journal_max_cmp);
+				}
 			}
-		}
 
-	b = best;
+		for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
+			heap_sift(&c->flush_btree, i, journal_min_cmp);
+	}
+
+	b = NULL;
+	heap_pop(&c->flush_btree, b, journal_min_cmp);
+	spin_unlock(&c->journal.lock);
+
 	if (b) {
 		mutex_lock(&b->write_lock);
 		if (!btree_current_write(b)->journal) {
 			mutex_unlock(&b->write_lock);
 			/* We raced */
+			atomic_long_inc(&c->retry_flush_write);
 			goto retry;
 		}
 
@@ -476,6 +495,8 @@ static void journal_reclaim(struct cache_set *c)
 	unsigned iter, n = 0;
 	atomic_t p;
 
+	atomic_long_inc(&c->reclaim);
+
 	while (!atomic_read(&fifo_front(&c->journal.pin)))
 		fifo_pop(&c->journal.pin, p);
 
@@ -819,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c)
 	j->w[0].c = c;
 	j->w[1].c = c;
 
-	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+	if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
+	    !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
 	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
 	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
 		return -ENOMEM;
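
The journal.c rework cuts CPU time in btree_flush_write() by scanning all cached btree nodes only when a small heap of flush candidates is empty: one pass fills a bounded heap with the nodes pinning the oldest journal entries, and subsequent calls simply pop the next candidate. The sketch below reproduces that idea in standalone C with plain integers as "journal ages" (smaller = older); the heap helpers are simplified local versions written for illustration, not bcache's heap_add()/heap_sift()/heap_pop(), and the candidate count is shrunk from 128 to 4 to keep the example readable.

#include <stdio.h>

#define NCAND 4	/* bounded candidate heap, like bcache's 128-entry flush_btree */

static int heap[NCAND];
static int used;

/* Sift element i down; cmp(parent, child) is true when parent may stay above child. */
static void heap_sift(int *h, int n, int i, int (*cmp)(int, int))
{
	for (;;) {
		int l = 2 * i + 1, r = l + 1, best = i, tmp;

		if (l < n && !cmp(h[best], h[l]))
			best = l;
		if (r < n && !cmp(h[best], h[r]))
			best = r;
		if (best == i)
			break;
		tmp = h[i]; h[i] = h[best]; h[best] = tmp;
		i = best;
	}
}

static int max_cmp(int parent, int child) { return parent >= child; }
static int min_cmp(int parent, int child) { return parent <= child; }

int main(void)
{
	/* "Journal ages" of btree nodes; smaller = older, flush those first. */
	int ages[] = { 9, 3, 17, 5, 12, 1, 8, 14, 2, 11 };
	int n = sizeof(ages) / sizeof(ages[0]);

	/* Single scan: keep the NCAND oldest entries in a max-heap. */
	for (int i = 0; i < n; i++) {
		if (used < NCAND) {
			heap[used++] = ages[i];
			for (int j = used / 2 - 1; j >= 0; j--)
				heap_sift(heap, used, j, max_cmp);
		} else if (ages[i] < heap[0]) {
			heap[0] = ages[i];
			heap_sift(heap, used, 0, max_cmp);
		}
	}

	/* Re-heapify as a min-heap so candidates pop oldest-first. */
	for (int j = used / 2 - 1; j >= 0; j--)
		heap_sift(heap, used, j, min_cmp);

	/* Later flushes just pop; no full rescan is needed until the heap drains. */
	while (used) {
		printf("flush node with journal age %d\n", heap[0]);
		heap[0] = heap[--used];
		heap_sift(heap, used, 0, min_cmp);
	}
	return 0;
}

This is the same amortization the patch performs with journal_max_cmp/journal_min_cmp: one O(n) scan per batch of up to 128 flushes instead of one scan per flush, with the new flush_write/retry_flush_write/reclaim counters exposing how often each path runs.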
