80 changes: 80 additions & 0 deletions Documentation/admin-guide/bcache.rst
@@ -147,6 +147,86 @@ the backing devices to passthrough mode.
writeback mode). It currently doesn't do anything intelligent if it fails to
read some of the dirty data, though.

SSD LONGEVITY: PER-PROCESS CACHE HINTING WITH IO PRIORITY
---------------------------------------------------------

Processes can be assigned an IO priority using `ionice`, and bcache will
either write back or bypass the cache based on the IO priority level
assigned to the process and the configuration of the sysfs ioprio
hints. If configured properly for your workload, this can both increase
performance and reduce SSD wear (erase/write cycles).

Having idle IOs bypass the cache can increase performance elsewhere,
since you probably do not care about their completion time. In
addition, this prevents idle IOs from being promoted into (and thus
polluting) your cache and evicting blocks that matter more to other
workloads.

Default sysfs values:
2,7: ioprio_bypass is hinted for process IOs at-or-below best-effort-7.
0,0: ioprio_writeback hinting is disabled by default.

Cache hinting is configured by writing 'class,level' pairs to sysfs.
In this example, we write the following::

  echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
  echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback

Thus, processes with the following IO class (ionice -c) and level (-n)
will behave as shown in this table:

(-c) IO Class (-n) Class level Action
-----------------------------------------------------
(1) Realtime 0-7 Writeback
(2) Best-effort 0 Writeback
(2) Best-effort 1-6 Normal, as if hinting were disabled
(2) Best-effort 7 Bypass cache
(3) Idle n/a Bypass cache

For processes at-or-below best-effort-7 (ionice -c2 -n7), the
ioprio_bypass behavior is as follows:

* Reads will come from the backing device and will not promote into
(pollute) your cache. If the block being read was already in the cache,
then it will be read from the cache (and remain cached).

* If you are using writeback mode, then low-priority bypass-hinted writes
will go directly to the backing device. If the write was dirty in
cache, it will cache-invalidate and write directly to the backing
device. If a high-priority task later writes the same block then it
will writeback so no performance is lost for write-after-write.

For read-after-bypassed-write, the block will be read from the backing
device (not the cache), so there may be a miss penalty when a
low-priority bypassed write is followed by a high-priority read that
would otherwise have hit. In practice this has not been an issue; to
date, no users have reported workloads that mix low-priority writes
with high-priority reads of the same blocks.

For processes in our example at-or-above best-effort-0 (ionice -c2 -n0),
the ioprio_writeback behavior is as follows:

* The writeback hint has no effect unless your 'cache_mode' is writeback.
Assuming writeback mode, all writes at this priority will writeback.
Of course this will increase SSD wear, so only use writeback hinting
if you need it.

* Reads are unaffected by ioprio_writeback, except that read-after-write
will of course read from the cache.

Linux assigns processes the best-effort class with a level of 4 when no
priority has been set explicitly. Thus, without `ionice`, your
processes will follow the normal bcache should_writeback/should_bypass
semantics, as if the ioprio_writeback/ioprio_bypass sysfs flags were
disabled.

Also note that in order to be hinted by ioprio_writeback/ioprio_bypass,
the process must have a valid ioprio setting as returned by
get_task_io_context()->ioprio. Thus, a process without an IO context
will be ignored by the ioprio_writeback/ioprio_bypass hints even if your
sysfs hints specify that best-effort-4 should be flagged for bypass
or writeback. If in doubt, explicitly set the process IO priority with
`ionice`.

See `man ionice` for more detail about per-process IO priority in Linux.

Howto/cookbook
--------------
5 changes: 2 additions & 3 deletions drivers/md/bcache/alloc.c
@@ -482,8 +482,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
unsigned int i;

for (i = 0; i < KEY_PTRS(k); i++)
__bch_bucket_free(PTR_CACHE(c, k, i),
PTR_BUCKET(c, k, i));
__bch_bucket_free(c->cache, PTR_BUCKET(c, k, i));
}

int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
@@ -674,7 +673,7 @@ bool bch_alloc_sectors(struct cache_set *c,
SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);

atomic_long_add(sectors,
&PTR_CACHE(c, &b->key, i)->sectors_written);
&c->cache->sectors_written);
}

if (b->sectors_free < c->cache->sb.block_size)
18 changes: 9 additions & 9 deletions drivers/md/bcache/bcache.h
@@ -373,6 +373,7 @@ struct cached_dev {
unsigned int partial_stripes_expensive:1;
unsigned int writeback_metadata:1;
unsigned int writeback_running:1;
unsigned int writeback_consider_fragment:1;
unsigned char writeback_percent;
unsigned int writeback_delay;

@@ -385,6 +386,9 @@ struct cached_dev {
unsigned int writeback_rate_update_seconds;
unsigned int writeback_rate_i_term_inverse;
unsigned int writeback_rate_p_term_inverse;
unsigned int writeback_rate_fp_term_low;
unsigned int writeback_rate_fp_term_mid;
unsigned int writeback_rate_fp_term_high;
unsigned int writeback_rate_minimum;

enum stop_on_failure stop_when_cache_set_failed;
@@ -393,6 +397,9 @@ struct cached_dev {
unsigned int error_limit;
unsigned int offline_seconds;

unsigned short ioprio_writeback;
unsigned short ioprio_bypass;

char backing_dev_name[BDEVNAME_SIZE];
};

@@ -800,13 +807,6 @@ static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
return s & (c->cache->sb.bucket_size - 1);
}

static inline struct cache *PTR_CACHE(struct cache_set *c,
const struct bkey *k,
unsigned int ptr)
{
return c->cache;
}

static inline size_t PTR_BUCKET_NR(struct cache_set *c,
const struct bkey *k,
unsigned int ptr)
@@ -818,7 +818,7 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
const struct bkey *k,
unsigned int ptr)
{
return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
return c->cache->buckets + PTR_BUCKET_NR(c, k, ptr);
}

static inline uint8_t gen_after(uint8_t a, uint8_t b)
@@ -837,7 +837,7 @@ static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
unsigned int i)
{
return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && c->cache;
}

/* Btree key macros */
4 changes: 2 additions & 2 deletions drivers/md/bcache/btree.c
@@ -426,7 +426,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent)
do_btree_node_write(b);

atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
&b->c->cache->btree_sectors_written);

b->written += set_blocks(i, block_bytes(b->c->cache));
}
@@ -1161,7 +1161,7 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)

for (i = 0; i < KEY_PTRS(k); i++)
SET_PTR_GEN(k, i,
bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
bch_inc_gen(b->c->cache,
PTR_BUCKET(b->c, &b->key, i)));

mutex_unlock(&b->c->bucket_lock);
2 changes: 1 addition & 1 deletion drivers/md/bcache/debug.c
@@ -50,7 +50,7 @@ void bch_btree_verify(struct btree *b)
v->keys.ops = b->keys.ops;

bio = bch_bbio_alloc(b->c);
bio_set_dev(bio, PTR_CACHE(b->c, &b->key, 0)->bdev);
	bio_set_dev(bio, b->c->cache->bdev);
bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9;
bio->bi_opf = REQ_OP_READ | REQ_META;
4 changes: 2 additions & 2 deletions drivers/md/bcache/extents.c
@@ -50,7 +50,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)

for (i = 0; i < KEY_PTRS(k); i++)
if (ptr_available(c, k, i)) {
struct cache *ca = PTR_CACHE(c, k, i);
struct cache *ca = c->cache;
size_t bucket = PTR_BUCKET_NR(c, k, i);
size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

@@ -71,7 +71,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)

for (i = 0; i < KEY_PTRS(k); i++)
if (ptr_available(c, k, i)) {
struct cache *ca = PTR_CACHE(c, k, i);
struct cache *ca = c->cache;
size_t bucket = PTR_BUCKET_NR(c, k, i);
size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

4 changes: 2 additions & 2 deletions drivers/md/bcache/io.c
@@ -36,7 +36,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
struct bbio *b = container_of(bio, struct bbio, bio);

bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev);
bio_set_dev(bio, c->cache->bdev);

b->submit_time_us = local_clock_us();
closure_bio_submit(c, bio, bio->bi_private);
@@ -137,7 +137,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
blk_status_t error, const char *m)
{
struct bbio *b = container_of(bio, struct bbio, bio);
struct cache *ca = PTR_CACHE(c, &b->key, 0);
struct cache *ca = c->cache;
int is_read = (bio_data_dir(bio) == READ ? 1 : 0);

unsigned int threshold = op_is_write(bio_op(bio))
2 changes: 1 addition & 1 deletion drivers/md/bcache/journal.c
@@ -768,7 +768,7 @@ static void journal_write_unlocked(struct closure *cl)
w->data->csum = csum_set(w->data);

for (i = 0; i < KEY_PTRS(k); i++) {
ca = PTR_CACHE(c, k, i);
ca = c->cache;
bio = &ca->journal.bio;

atomic_long_add(sectors, &ca->meta_sectors_written);
25 changes: 25 additions & 0 deletions drivers/md/bcache/request.c
@@ -367,6 +367,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
unsigned int sectors, congested;
struct task_struct *task = current;
struct io *i;
struct io_context *ioc;
unsigned short ioprio;

if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -394,6 +396,29 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
goto skip;
}

	/*
	 * If the ioprio already exists on the bio, use that. We assume that
	 * the upper layer properly assigned the calling process's ioprio to
	 * the bio being passed to bcache. Otherwise, use current's ioc.
	 */
ioprio = bio_prio(bio);
if (!ioprio_valid(ioprio)) {
ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
if (ioc) {
if (ioprio_valid(ioc->ioprio))
ioprio = ioc->ioprio;
put_io_context(ioc);
ioc = NULL;
}
}

	/*
	 * If the process ioprio is lower-or-equal to dc->ioprio_bypass, and
	 * the request is not REQ_META|REQ_PRIO, then hint for bypass. Note
	 * that a lower-priority IO class+value has a greater numeric value.
	 */
	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_bypass) &&
	    ioprio >= dc->ioprio_bypass) {
if (!(bio->bi_opf & (REQ_META|REQ_PRIO)))
goto skip;
}

if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) ||
bio_sectors(bio) & (c->cache->sb.block_size - 1)) {
pr_debug("skipping unaligned io\n");
25 changes: 12 additions & 13 deletions drivers/md/bcache/super.c
@@ -1052,6 +1052,7 @@ static int cached_dev_status_update(void *arg)

int bch_cached_dev_run(struct cached_dev *dc)
{
int ret = 0;
struct bcache_device *d = &dc->disk;
char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
char *env[] = {
@@ -1064,19 +1065,15 @@ int bch_cached_dev_run(struct cached_dev *dc)
if (dc->io_disable) {
pr_err("I/O disabled on cached dev %s\n",
dc->backing_dev_name);
kfree(env[1]);
kfree(env[2]);
kfree(buf);
return -EIO;
ret = -EIO;
goto out;
}

if (atomic_xchg(&dc->running, 1)) {
kfree(env[1]);
kfree(env[2]);
kfree(buf);
pr_info("cached dev %s is running already\n",
dc->backing_dev_name);
return -EBUSY;
ret = -EBUSY;
goto out;
}

if (!d->c &&
@@ -1097,15 +1094,13 @@ int bch_cached_dev_run(struct cached_dev *dc)
* only class / kset properties are persistent
*/
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
kfree(env[1]);
kfree(env[2]);
kfree(buf);

if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
sysfs_create_link(&disk_to_dev(d->disk)->kobj,
&d->kobj, "bcache")) {
pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n");
return -ENOMEM;
ret = -ENOMEM;
goto out;
}

dc->status_update_thread = kthread_run(cached_dev_status_update,
@@ -1114,7 +1109,11 @@ int bch_cached_dev_run(struct cached_dev *dc)
pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n");
}

return 0;
out:
kfree(env[1]);
kfree(env[2]);
kfree(buf);
return ret;
}

/*
Expand Down
Loading