diff --git a/Documentation/admin-guide/bcache.rst b/Documentation/admin-guide/bcache.rst index 8d3a2d045c0ae7..20636aca454078 100644 --- a/Documentation/admin-guide/bcache.rst +++ b/Documentation/admin-guide/bcache.rst @@ -147,6 +147,86 @@ the backing devices to passthrough mode. writeback mode). It currently doesn't do anything intelligent if it fails to read some of the dirty data, though. +SSD LONGEVITY: PER-PROCESS CACHE HINTING WITH IO PRIORITY +--------------------------------------------------------- + +Processes can be assigned an IO priority using `ionice` and bcache will +either try to writeback or bypass the cache based on the IO priority +level assigned to the process and the configuration of the sysfs ioprio +hints. If configured properly for your workload, this can both increase +performance and reduce SSD wear (erase/write cycles). + +Having idle IOs bypass the cache can increase performance elsewhere +since you probably don't care about their performance. In addition, +this prevents idle IOs from promoting into (polluting) your cache and +evicting blocks that are more important elsewhere. + +Default sysfs values: + 2,7: ioprio_bypass is hinted for process IOs at-or-below best-effort-7. + 0,0: ioprio_writeback hinting is disabled by default. + +Cache hinting is configured by writing 'class,level' pairs to sysfs. 
+In this example, we write the following: + + echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass + echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback + +Thus, processes with the following IO class (ionice -c) and level (-n) +will behave as shown in this table: + + (-c) IO Class (-n) Class level Action + ----------------------------------------------------- + (1) Realtime 0-7 Writeback + (2) Best-effort 0 Writeback + (2) Best-effort 1-6 Normal, as if hinting were disabled + (2) Best-effort 7 Bypass cache + (3) Idle n/a Bypass cache + +For processes at-or-below best-effort-7 (ionice -c2 -n7), the +ioprio_bypass behavior is as follows: + +* Reads will come from the backing device and will not promote into + (pollute) your cache. If the block being read was already in the cache, + then it will be read from the cache (and remain cached). + +* If you are using writeback mode, then low-priority bypass-hinted writes + will go directly to the backing device. If the write was dirty in + cache, it will cache-invalidate and write directly to the backing + device. If a high-priority task later writes the same block then it + will writeback so no performance is lost for write-after-write. + + For read-after-bypassed-write, the block will be read from the backing + device (not cached) so there may be a miss penalty when a low-priority + process write bypasses the cache followed by a high-priority read that + would otherwise have hit. In practice, this is not an issue; to date, + none have wanted low-priority writes and high-priority reads of the + same block. + +For processes in our example at-or-above best-effort-0 (ionice -c2 -n0), +the ioprio_writeback behavior is as follows: + +* The writeback hint has no effect unless your 'cache_mode' is writeback. + Assuming writeback mode, all writes at this priority will writeback. + Of course this will increase SSD wear, so only use writeback hinting + if you need it. 
+ +* Reads are unaffected by ioprio_writeback, except that read-after-write + will of course read from the cache. + +Linux assigns processes the best-effort class with a level of 4 if +no IO priority is assigned. Thus, without `ionice` your processes will +follow normal bcache should_writeback/should_bypass semantics as if the +ioprio_writeback/ioprio_bypass sysfs flags were disabled. + +Also note that in order to be hinted by ioprio_writeback/ioprio_bypass, +the process must have a valid ioprio setting as returned by +get_task_io_context()->ioprio. Thus, a process without an IO context +will be ignored by the ioprio_writeback/ioprio_bypass hints even if your +sysfs hints specify that best-effort-4 should be flagged for bypass +or writeback. If in doubt, explicitly set the process IO priority with +`ionice`. + +See `man ionice` for more detail about per-process IO priority in Linux. Howto/cookbook -------------- diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8c371d5eef8eb9..097577ae3c4717 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -482,8 +482,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) - __bch_bucket_free(PTR_CACHE(c, k, i), - PTR_BUCKET(c, k, i)); + __bch_bucket_free(c->cache, PTR_BUCKET(c, k, i)); } int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, @@ -674,7 +673,7 @@ bool bch_alloc_sectors(struct cache_set *c, SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); atomic_long_add(sectors, - &PTR_CACHE(c, &b->key, i)->sectors_written); + &c->cache->sectors_written); } if (b->sectors_free < c->cache->sb.block_size) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index e8bf4f752e8bee..eab16a8a69f670 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -373,6 +373,7 @@ struct cached_dev { unsigned int partial_stripes_expensive:1; unsigned int writeback_metadata:1; unsigned int 
writeback_running:1; + unsigned int writeback_consider_fragment:1; unsigned char writeback_percent; unsigned int writeback_delay; @@ -385,6 +386,9 @@ struct cached_dev { unsigned int writeback_rate_update_seconds; unsigned int writeback_rate_i_term_inverse; unsigned int writeback_rate_p_term_inverse; + unsigned int writeback_rate_fp_term_low; + unsigned int writeback_rate_fp_term_mid; + unsigned int writeback_rate_fp_term_high; unsigned int writeback_rate_minimum; enum stop_on_failure stop_when_cache_set_failed; @@ -393,6 +397,9 @@ struct cached_dev { unsigned int error_limit; unsigned int offline_seconds; + unsigned short ioprio_writeback; + unsigned short ioprio_bypass; + char backing_dev_name[BDEVNAME_SIZE]; }; @@ -800,13 +807,6 @@ static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) return s & (c->cache->sb.bucket_size - 1); } -static inline struct cache *PTR_CACHE(struct cache_set *c, - const struct bkey *k, - unsigned int ptr) -{ - return c->cache; -} - static inline size_t PTR_BUCKET_NR(struct cache_set *c, const struct bkey *k, unsigned int ptr) @@ -818,7 +818,7 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, const struct bkey *k, unsigned int ptr) { - return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); + return c->cache->buckets + PTR_BUCKET_NR(c, k, ptr); } static inline uint8_t gen_after(uint8_t a, uint8_t b) @@ -837,7 +837,7 @@ static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, static inline bool ptr_available(struct cache_set *c, const struct bkey *k, unsigned int i) { - return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && c->cache; } /* Btree key macros */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fe6dce125aba22..183a58c893774d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -426,7 +426,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent) 
do_btree_node_write(b); atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, - &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); + &b->c->cache->btree_sectors_written); b->written += set_blocks(i, block_bytes(b->c->cache)); } @@ -1161,7 +1161,7 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) SET_PTR_GEN(k, i, - bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + bch_inc_gen(b->c->cache, PTR_BUCKET(b->c, &b->key, i))); mutex_unlock(&b->c->bucket_lock); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index b00fd08d696b5f..b2eb59b9cd7107 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -50,7 +50,7 @@ void bch_btree_verify(struct btree *b) v->keys.ops = b->keys.ops; bio = bch_bbio_alloc(b->c); - bio_set_dev(bio, PTR_CACHE(b->c, &b->key, 0)->bdev); + bio_set_dev(bio, b->c->cache->bdev); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; bio->bi_opf = REQ_OP_READ | REQ_META; diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index f4658a1f37b862..d626ffcbecb99c 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -50,7 +50,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); + struct cache *ca = c->cache; size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); @@ -71,7 +71,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); + struct cache *ca = c->cache; size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index dad71a6b78891c..e4388fe3ab7ef9 100644 --- 
a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -36,7 +36,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) struct bbio *b = container_of(bio, struct bbio, bio); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); - bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); + bio_set_dev(bio, c->cache->bdev); b->submit_time_us = local_clock_us(); closure_bio_submit(c, bio, bio->bi_private); @@ -137,7 +137,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, blk_status_t error, const char *m) { struct bbio *b = container_of(bio, struct bbio, bio); - struct cache *ca = PTR_CACHE(c, &b->key, 0); + struct cache *ca = c->cache; int is_read = (bio_data_dir(bio) == READ ? 1 : 0); unsigned int threshold = op_is_write(bio_op(bio)) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index c6613e81733376..de2c0d7699cf54 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -768,7 +768,7 @@ static void journal_write_unlocked(struct closure *cl) w->data->csum = csum_set(w->data); for (i = 0; i < KEY_PTRS(k); i++) { - ca = PTR_CACHE(c, k, i); + ca = c->cache; bio = &ca->journal.bio; atomic_long_add(sectors, &ca->meta_sectors_written); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 21432638314562..28d922bbbf1c0f 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -367,6 +367,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) unsigned int sectors, congested; struct task_struct *task = current; struct io *i; + struct io_context *ioc; + unsigned short ioprio; if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || @@ -394,6 +396,29 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } + /* If the ioprio already exists on the bio, use that. 
We assume that + * the upper layer properly assigned the calling process's ioprio to + * the bio being passed to bcache. Otherwise, use current's ioc. */ + ioprio = bio_prio(bio); + if (!ioprio_valid(ioprio)) { + ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE); + if (ioc) { + if (ioprio_valid(ioc->ioprio)) + ioprio = ioc->ioprio; + put_io_context(ioc); + ioc = NULL; + } + } + + /* If the bypass hint is enabled and process ioprio is lower-or-equal to + * dc->ioprio_bypass, and the request is not REQ_META|REQ_PRIO, then hint + * for bypass. A lower-priority class+value has a greater numeric value. */ + if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_bypass) + && ioprio >= dc->ioprio_bypass) { + if (!(bio->bi_opf & (REQ_META|REQ_PRIO))) + goto skip; + } + if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) || bio_sectors(bio) & (c->cache->sb.block_size - 1)) { pr_debug("skipping unaligned io\n"); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 248bda63f08527..29d5705bda4fdb 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1052,6 +1052,7 @@ static int cached_dev_status_update(void *arg) int bch_cached_dev_run(struct cached_dev *dc) { + int ret = 0; struct bcache_device *d = &dc->disk; char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); char *env[] = { @@ -1064,19 +1065,15 @@ int bch_cached_dev_run(struct cached_dev *dc) if (dc->io_disable) { pr_err("I/O disabled on cached dev %s\n", dc->backing_dev_name); - kfree(env[1]); - kfree(env[2]); - kfree(buf); - return -EIO; + ret = -EIO; + goto out; } if (atomic_xchg(&dc->running, 1)) { - kfree(env[1]); - kfree(env[2]); - kfree(buf); pr_info("cached dev %s is running already\n", dc->backing_dev_name); - return -EBUSY; + ret = -EBUSY; + goto out; } if (!d->c && @@ -1097,15 +1094,13 @@ int bch_cached_dev_run(struct cached_dev *dc) * only class / kset properties are persistent */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); - 
kfree(env[1]); - kfree(env[2]); - kfree(buf); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } dc->status_update_thread = kthread_run(cached_dev_status_update, @@ -1114,7 +1109,11 @@ int bch_cached_dev_run(struct cached_dev *dc) pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n"); } - return 0; +out: + kfree(env[1]); + kfree(env[2]); + kfree(buf); + return ret; } /* diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 554e3afc9b688b..6a01e331b1e5bc 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -117,10 +117,14 @@ rw_attribute(writeback_running); rw_attribute(writeback_percent); rw_attribute(writeback_delay); rw_attribute(writeback_rate); +rw_attribute(writeback_consider_fragment); rw_attribute(writeback_rate_update_seconds); rw_attribute(writeback_rate_i_term_inverse); rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_fp_term_low); +rw_attribute(writeback_rate_fp_term_mid); +rw_attribute(writeback_rate_fp_term_high); rw_attribute(writeback_rate_minimum); read_attribute(writeback_rate_debug); @@ -149,6 +153,9 @@ rw_attribute(idle_max_writeback_rate); rw_attribute(gc_after_writeback); rw_attribute(size); +rw_attribute(ioprio_writeback); +rw_attribute(ioprio_bypass); + static ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], @@ -195,6 +202,7 @@ SHOW(__bch_cached_dev) var_printf(bypass_torture_test, "%i"); var_printf(writeback_metadata, "%i"); var_printf(writeback_running, "%i"); + var_printf(writeback_consider_fragment, "%i"); var_print(writeback_delay); var_print(writeback_percent); sysfs_hprint(writeback_rate, @@ -205,6 +213,9 @@ SHOW(__bch_cached_dev) var_print(writeback_rate_update_seconds); 
var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_fp_term_low); + var_print(writeback_rate_fp_term_mid); + var_print(writeback_rate_fp_term_high); var_print(writeback_rate_minimum); if (attr == &sysfs_writeback_rate_debug) { @@ -277,6 +288,16 @@ SHOW(__bch_cached_dev) return strlen(buf); } + if (attr == &sysfs_ioprio_bypass) + return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n", + IOPRIO_PRIO_CLASS(dc->ioprio_bypass), + IOPRIO_PRIO_DATA(dc->ioprio_bypass)); + + if (attr == &sysfs_ioprio_writeback) + return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n", + IOPRIO_PRIO_CLASS(dc->ioprio_writeback), + IOPRIO_PRIO_DATA(dc->ioprio_writeback)); + #undef var return 0; } @@ -289,6 +310,10 @@ STORE(__cached_dev) ssize_t v; struct cache_set *c; struct kobj_uevent_env *env; + unsigned ioprio_class = 0; /* invalid initial ioprio values */ + unsigned ioprio_level = IOPRIO_BE_NR; + unsigned short *ioprio_hint = NULL; + char *ioprio_type = NULL; /* no user space access if system is rebooting */ if (bcache_is_reboot) @@ -303,6 +328,7 @@ STORE(__cached_dev) sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test); sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata); sysfs_strtoul_bool(writeback_running, dc->writeback_running); + sysfs_strtoul_bool(writeback_consider_fragment, dc->writeback_consider_fragment); sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX); sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, @@ -331,6 +357,16 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_rate_p_term_inverse, dc->writeback_rate_p_term_inverse, 1, UINT_MAX); + sysfs_strtoul_clamp(writeback_rate_fp_term_low, + dc->writeback_rate_fp_term_low, + 1, dc->writeback_rate_fp_term_mid - 1); + sysfs_strtoul_clamp(writeback_rate_fp_term_mid, + dc->writeback_rate_fp_term_mid, + dc->writeback_rate_fp_term_low + 1, + dc->writeback_rate_fp_term_high - 1); + sysfs_strtoul_clamp(writeback_rate_fp_term_high, + 
dc->writeback_rate_fp_term_high, + dc->writeback_rate_fp_term_mid + 1, UINT_MAX); sysfs_strtoul_clamp(writeback_rate_minimum, dc->writeback_rate_minimum, 1, UINT_MAX); @@ -435,6 +471,57 @@ STORE(__cached_dev) if (attr == &sysfs_stop) bcache_device_stop(&dc->disk); + /* ioprio hinting: we use ioprio_hint to reduce duplicate printk verbiage */ + if (attr == &sysfs_ioprio_writeback) { + ioprio_hint = &dc->ioprio_writeback; + ioprio_type = "writeback"; + } + + if (attr == &sysfs_ioprio_bypass) { + ioprio_hint = &dc->ioprio_bypass; + ioprio_type = "bypass"; + } + + if (ioprio_hint != NULL) + { + if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_level) != 2 + || ioprio_class > IOPRIO_CLASS_IDLE + || ioprio_level >= IOPRIO_BE_NR) { + pr_err("ioprio_%s invalid, expecting: (class,level) but parsed (%u,%u); ignored.", + ioprio_type, + ioprio_class, ioprio_level); + return size; + } + + /* Use the maximum(/minimum) value in the class shift space to make integer + comparison correct for ioprio_writeback(/ioprio_bypass) for IOPRIO_CLASS_IDLE. + This is necessary because there are no ioprio levels for the idle class. */ + if (ioprio_class == IOPRIO_CLASS_IDLE) { + if (ioprio_hint == &dc->ioprio_writeback) + ioprio_level = IOPRIO_PRIO_MASK; + else + /* Same, but 0 for bypass (inverted vs. writeback) */ + ioprio_level = 0; + } + + *ioprio_hint = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_level); + + if (!ioprio_valid(*ioprio_hint)) + pr_info("disabled ioprio_%s hints.", ioprio_type); + else + pr_info("set hint for cache %s with priority %s: (class,level) = (%u,%u)", + ioprio_type, + ( ioprio_hint == &dc->ioprio_writeback ? 
"at-or-above" : "at-or-below" ), + ioprio_class, ioprio_level); + + if (ioprio_valid(dc->ioprio_writeback) + && ioprio_valid(dc->ioprio_bypass) + && dc->ioprio_writeback >= dc->ioprio_bypass) + pr_warn( + "warning: ioprio_writeback hint is neither disabled nor higher priority than the bypass hint; " + "will always writeback!\n"); + } + return size; } @@ -499,9 +586,13 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_delay, &sysfs_writeback_percent, &sysfs_writeback_rate, + &sysfs_writeback_consider_fragment, &sysfs_writeback_rate_update_seconds, &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, + &sysfs_writeback_rate_fp_term_low, + &sysfs_writeback_rate_fp_term_mid, + &sysfs_writeback_rate_fp_term_high, &sysfs_writeback_rate_minimum, &sysfs_writeback_rate_debug, &sysfs_io_errors, @@ -522,6 +613,8 @@ static struct attribute *bch_cached_dev_files[] = { #endif &sysfs_backing_dev_name, &sysfs_backing_dev_uuid, + &sysfs_ioprio_bypass, + &sysfs_ioprio_writeback, NULL }; KTYPE(bch_cached_dev); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 3c74996978dade..d1a5ddabb22af3 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -88,6 +88,44 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t integral_scaled; uint32_t new_rate; + /* + * We need to consider the number of dirty buckets as well + * when calculating the proportional_scaled, Otherwise we might + * have an unreasonable small writeback rate at a highly fragmented situation + * when very few dirty sectors consumed a lot dirty buckets, the + * worst case is when dirty buckets reached cutoff_writeback_sync and + * dirty data is still not even reached to writeback percent, so the rate + * still will be at the minimum value, which will cause the write + * stuck at a non-writeback mode. 
+ */ + struct cache_set *c = dc->disk.c; + + int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets; + + if (dc->writeback_consider_fragment && + c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) { + int64_t fragment = + div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty); + int64_t fp_term; + int64_t fps; + + if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) { + fp_term = (int64_t)dc->writeback_rate_fp_term_low * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW); + } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) { + fp_term = (int64_t)dc->writeback_rate_fp_term_mid * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID); + } else { + fp_term = (int64_t)dc->writeback_rate_fp_term_high * + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH); + } + fps = div_s64(dirty, dirty_buckets) * fp_term; + if (fragment > 3 && fps > proportional_scaled) { + /* Only overrite the p when fragment > 3 */ + proportional_scaled = fps; + } + } + if ((error < 0 && dc->writeback_rate_integral > 0) || (error > 0 && time_before64(local_clock(), dc->writeback_rate.next + NSEC_PER_MSEC))) { @@ -378,7 +416,7 @@ static void read_dirty_endio(struct bio *bio) struct dirty_io *io = w->private; /* is_read = 1 */ - bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), + bch_count_io_errors(io->dc->disk.c->cache, bio->bi_status, 1, "reading dirty data from cache"); @@ -472,8 +510,7 @@ static void read_dirty(struct cached_dev *dc) dirty_init(w); bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); - bio_set_dev(&io->bio, - PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); + bio_set_dev(&io->bio, dc->disk.c->cache->bdev); io->bio.bi_end_io = read_dirty_endio; if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) @@ -968,6 +1005,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_metadata = true; dc->writeback_running = false; + 
dc->writeback_consider_fragment = true; dc->writeback_percent = 10; dc->writeback_delay = 30; atomic_long_set(&dc->writeback_rate.rate, 1024); @@ -975,8 +1013,21 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_fp_term_low = 1; + dc->writeback_rate_fp_term_mid = 10; + dc->writeback_rate_fp_term_high = 1000; dc->writeback_rate_i_term_inverse = 10000; + /* + * These defaults provide the best SSD life by enabling bypass + * for priorities at-or-below BE-7. This also provides better + * performance (cache hits) by preventing (near-)idle processes from + * polluting the cache working set. Only set ioprio_writeback if + * you really need it: it will wear out your SSD sooner. + */ + dc->ioprio_writeback = IOPRIO_PRIO_VALUE(0, 0); + dc->ioprio_bypass = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, (IOPRIO_BE_NR-1)); + WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 3f1230e22de013..ffa41b1c27a74a 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -16,6 +16,10 @@ #define BCH_AUTO_GC_DIRTY_THRESHOLD 50 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW 50 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57 +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64 + #define BCH_DIRTY_INIT_THRD_MAX 64 /* * 14 (16384ths) is chosen here as something that each backing device @@ -99,6 +103,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned int cache_mode, bool would_skip) { unsigned int in_use = dc->disk.c->gc_stats.in_use; + struct io_context *ioc; + unsigned short ioprio; if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || @@ -116,6 +122,28 @@ static inline bool 
should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; + /* If the ioprio already exists on the bio, use that. We assume that + * the upper layer properly assigned the calling process's ioprio to + * the bio being passed to bcache. Otherwise, use current's ioc. */ + ioprio = bio_prio(bio); + if (!ioprio_valid(ioprio)) { + ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE); + if (ioc) { + if (ioprio_valid(ioc->ioprio)) + ioprio = ioc->ioprio; + put_io_context(ioc); + ioc = NULL; + } + } + + /* If process ioprio is higher-or-equal to dc->ioprio_writeback, then + * hint for writeback. Note that a higher-priority IO class+value + * has a lesser numeric value. */ + if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback) + && ioprio <= dc->ioprio_writeback) { + return true; + } + return (op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) || in_use <= bch_cutoff_writeback);