From 771f393e8ffc9b3066e4830ee5f7391b8e8874f1 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 18 Mar 2018 17:36:17 -0700 Subject: bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags When too many I/Os failed on cache device, bch_cache_set_error() is called in the error handling code path to retire whole problematic cache set. If new I/O requests continue to come and take refcount dc->count, the cache set won't be retired immediately, this is a problem. Further more, there are several kernel thread and self-armed kernel work may still running after bch_cache_set_error() is called. It needs to wait quite a while for them to stop, or they won't stop at all. They also prevent the cache set from being retired. The solution in this patch is, to add per cache set flag to disable I/O request on this cache and all attached backing devices. Then new coming I/O requests can be rejected in *_make_request() before taking refcount, kernel threads and self-armed kernel worker can stop very fast when flags bit CACHE_SET_IO_DISABLE is set. Because bcache also do internal I/Os for writeback, garbage collection, bucket allocation, journaling, this kind of I/O should be disabled after bch_cache_set_error() is called. So closure_bio_submit() is modified to check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set, closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and return, generic_make_request() won't be called. A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit from cache_set->flags, to disable or enable cache set I/O for debugging. It is helpful to trigger more corner case issues for failed cache device. Changelog v4, add wait_for_kthread_stop(), and call it before exits writeback and gc kernel threads. v3, change CACHE_SET_IO_DISABLE from 4 to 3, since it is bit index. remove "bcache: " prefix when printing out kernel message. v2, more changes by previous review, - Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui. - Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this is reported and inspired from origal patch of Pavel Vazharov. v1, initial version. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Reviewed-by: Michael Lyle Cc: Junhui Tang Cc: Michael Lyle Cc: Pavel Vazharov Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 6422846b546e..7aca308bee5b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -747,7 +747,7 @@ static void cached_dev_read_error(struct closure *cl) /* XXX: invalidate cache */ - closure_bio_submit(bio, cl); + closure_bio_submit(s->iop.c, bio, cl); } continue_at(cl, cached_dev_cache_miss_done, NULL); @@ -872,7 +872,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_miss = miss; s->iop.bio = cache_bio; bio_get(cache_bio); - closure_bio_submit(cache_bio, &s->cl); + closure_bio_submit(s->iop.c, cache_bio, &s->cl); return ret; out_put: @@ -880,7 +880,7 @@ out_put: out_submit: miss->bi_end_io = request_endio; miss->bi_private = &s->cl; - closure_bio_submit(miss, &s->cl); + closure_bio_submit(s->iop.c, miss, &s->cl); return ret; } @@ -945,7 +945,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) if ((bio_op(bio) != REQ_OP_DISCARD) || blk_queue_discard(bdev_get_queue(dc->bdev))) - closure_bio_submit(bio, cl); + closure_bio_submit(s->iop.c, bio, cl); } else if (s->iop.writeback) { bch_writeback_add(dc); s->iop.bio = bio; @@ -960,12 +960,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) flush->bi_private = cl; flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - closure_bio_submit(flush, cl); + closure_bio_submit(s->iop.c, flush, cl); } } else { s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - closure_bio_submit(bio, cl); + closure_bio_submit(s->iop.c, bio, cl); } closure_call(&s->iop.cl, bch_data_insert, NULL, cl); @@ -981,7 +981,7 @@ static void cached_dev_nodata(struct closure *cl) bch_journal_meta(s->iop.c, cl); /* If it's a flush, we send the flush to the backing device too */ - closure_bio_submit(bio, cl); + closure_bio_submit(s->iop.c, bio, cl); continue_at(cl, cached_dev_bio_complete, NULL); } @@ -996,6 +996,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); + if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + atomic_set(&dc->backing_idle, 0); generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); @@ -1112,6 +1118,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, struct bcache_device *d = bio->bi_disk->private_data; int rw = bio_data_dir(bio); + if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); s = search_alloc(bio, d); -- cgit v1.2.1 From bc082a55d25c837341709accaf11311c3a9af727 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Sun, 18 Mar 2018 17:36:19 -0700 Subject: bcache: fix inaccurate io state for detached bcache devices When we run IO in a detached device, and run iostat to shows IO status, normally it will show like bellow (Omitted some fields): Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util sdd ... 15.89 0.53 1.82 0.20 2.23 1.81 52.30 bcache0 ... 15.89 115.42 0.00 0.00 0.00 2.40 69.60 but after IO stopped, there are still very big avgqu-sz and %util values as bellow: Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util bcache0 ... 0 5326.32 0.00 0.00 0.00 0.00 100.10 The reason for this issue is that, only generic_start_io_acct() called and no generic_end_io_acct() called for detached device in cached_dev_make_request(). See the code: //start generic_start_io_acct() generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); if (cached_dev_get(dc)) { //will callback generic_end_io_acct() } else { //will not call generic_end_io_acct() } This patch calls generic_end_io_acct() in the end of IO for detached devices, so we can show IO state correctly. (Modified to use GFP_NOIO in kzalloc() by Coly Li) Changelog: v2: fix typo. v1: the initial version. Signed-off-by: Tang Junhui Reviewed-by: Coly Li Reviewed-by: Hannes Reinecke Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 58 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 7 deletions(-) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 7aca308bee5b..5c8ae69c8502 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -986,6 +986,55 @@ static void cached_dev_nodata(struct closure *cl) continue_at(cl, cached_dev_bio_complete, NULL); } +struct detached_dev_io_private { + struct bcache_device *d; + unsigned long start_time; + bio_end_io_t *bi_end_io; + void *bi_private; +}; + +static void detached_dev_end_io(struct bio *bio) +{ + struct detached_dev_io_private *ddip; + + ddip = bio->bi_private; + bio->bi_end_io = ddip->bi_end_io; + bio->bi_private = ddip->bi_private; + + generic_end_io_acct(ddip->d->disk->queue, + bio_data_dir(bio), + &ddip->d->disk->part0, ddip->start_time); + + kfree(ddip); + + bio->bi_end_io(bio); +} + +static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) +{ + struct detached_dev_io_private *ddip; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + + /* + * no need to call closure_get(&dc->disk.cl), + * because upper layer had already opened bcache device, + * which would call closure_get(&dc->disk.cl) + */ + ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); + ddip->d = d; + ddip->start_time = jiffies; + ddip->bi_end_io = bio->bi_end_io; + ddip->bi_private = bio->bi_private; + bio->bi_end_io = detached_dev_end_io; + bio->bi_private = ddip; + + if ((bio_op(bio) == REQ_OP_DISCARD) && + !blk_queue_discard(bdev_get_queue(dc->bdev))) + bio->bi_end_io(bio); + else + generic_make_request(bio); +} + /* Cached devices - read & write stuff */ static blk_qc_t cached_dev_make_request(struct request_queue *q, @@ -1028,13 +1077,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, else cached_dev_read(dc, s); } - } else { - if ((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(dc->bdev))) - bio_endio(bio); - else - generic_make_request(bio); - } + } else + detached_dev_do_request(d, bio); return BLK_QC_T_NONE; } -- cgit v1.2.1 From 27a40ab9269e79b55672312b324f8f29d94463d4 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 18 Mar 2018 17:36:24 -0700 Subject: bcache: add backing_request_endio() for bi_end_io In order to catch I/O error of backing device, a separate bi_end_io call back is required. Then a per backing device counter can record I/O errors number and retire the backing device if the counter reaches a per backing device I/O error limit. This patch adds backing_request_endio() to bcache backing device I/O code path, this is a preparation for further complicated backing device failure handling. So far there is no real code logic change, I make this change a separate patch to make sure it is stable and reliable for further work. Changelog: v2: Fix code comments typo, remove a redundant bch_writeback_add() line added in v4 patch set. v1: indeed this is new added in this patch set. [mlyle: truncated commit subject] Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Reviewed-by: Michael Lyle Cc: Junhui Tang Cc: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 93 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 16 deletions(-) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 5c8ae69c8502..b4a5768afbe9 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl) } op->insert_data_done = true; + /* get in bch_data_insert() */ bio_put(bio); out: continue_at(cl, bch_data_insert_keys, op->wq); @@ -630,6 +631,38 @@ static void request_endio(struct bio *bio) closure_put(cl); } +static void backing_request_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + + if (bio->bi_status) { + struct search *s = container_of(cl, struct search, cl); + /* + * If a bio has REQ_PREFLUSH for writeback mode, it is + * speically assembled in cached_dev_write() for a non-zero + * write request which has REQ_PREFLUSH. we don't set + * s->iop.status by this failure, the status will be decided + * by result of bch_data_insert() operation. + */ + if (unlikely(s->iop.writeback && + bio->bi_opf & REQ_PREFLUSH)) { + char buf[BDEVNAME_SIZE]; + + bio_devname(bio, buf); + pr_err("Can't flush %s: returned bi_status %i", + buf, bio->bi_status); + } else { + /* set to orig_bio->bi_status in bio_complete() */ + s->iop.status = bio->bi_status; + } + s->recoverable = false; + /* should count I/O error for backing device here */ + } + + bio_put(bio); + closure_put(cl); +} + static void bio_complete(struct search *s) { if (s->orig_bio) { @@ -644,13 +677,21 @@ static void bio_complete(struct search *s) } } -static void do_bio_hook(struct search *s, struct bio *orig_bio) +static void do_bio_hook(struct search *s, + struct bio *orig_bio, + bio_end_io_t *end_io_fn) { struct bio *bio = &s->bio.bio; bio_init(bio, NULL, 0); __bio_clone_fast(bio, orig_bio); - bio->bi_end_io = request_endio; + /* + * bi_end_io can be set separately somewhere else, e.g. the + * variants in, + * - cache_bio->bi_end_io from cached_dev_cache_miss() + * - n->bi_end_io from cache_lookup_fn() + */ + bio->bi_end_io = end_io_fn; bio->bi_private = &s->cl; bio_cnt_set(bio, 3); @@ -676,7 +717,7 @@ static inline struct search *search_alloc(struct bio *bio, s = mempool_alloc(d->c->search, GFP_NOIO); closure_init(&s->cl, NULL); - do_bio_hook(s, bio); + do_bio_hook(s, bio, request_endio); s->orig_bio = bio; s->cache_miss = NULL; @@ -743,10 +784,11 @@ static void cached_dev_read_error(struct closure *cl) trace_bcache_read_retry(s->orig_bio); s->iop.status = 0; - do_bio_hook(s, s->orig_bio); + do_bio_hook(s, s->orig_bio, backing_request_endio); /* XXX: invalidate cache */ + /* I/O request sent to backing device */ closure_bio_submit(s->iop.c, bio, cl); } @@ -859,7 +901,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, bio_copy_dev(cache_bio, miss); cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; - cache_bio->bi_end_io = request_endio; + cache_bio->bi_end_io = backing_request_endio; cache_bio->bi_private = &s->cl; bch_bio_map(cache_bio, NULL); @@ -872,14 +914,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_miss = miss; s->iop.bio = cache_bio; bio_get(cache_bio); + /* I/O request sent to backing device */ closure_bio_submit(s->iop.c, cache_bio, &s->cl); return ret; out_put: bio_put(cache_bio); out_submit: - miss->bi_end_io = request_endio; + miss->bi_end_io = backing_request_endio; miss->bi_private = &s->cl; + /* I/O request sent to backing device */ closure_bio_submit(s->iop.c, miss, &s->cl); return ret; } @@ -943,31 +987,46 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) s->iop.bio = s->orig_bio; bio_get(s->iop.bio); - if ((bio_op(bio) != REQ_OP_DISCARD) || - blk_queue_discard(bdev_get_queue(dc->bdev))) - closure_bio_submit(s->iop.c, bio, cl); + if (bio_op(bio) == REQ_OP_DISCARD && + !blk_queue_discard(bdev_get_queue(dc->bdev))) + goto insert_data; + + /* I/O request sent to backing device */ + bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); + } else if (s->iop.writeback) { bch_writeback_add(dc); s->iop.bio = bio; if (bio->bi_opf & REQ_PREFLUSH) { - /* Also need to send a flush to the backing device */ - struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, - dc->disk.bio_split); - + /* + * Also need to send a flush to the backing + * device. + */ + struct bio *flush; + + flush = bio_alloc_bioset(GFP_NOIO, 0, + dc->disk.bio_split); + if (!flush) { + s->iop.status = BLK_STS_RESOURCE; + goto insert_data; + } bio_copy_dev(flush, bio); - flush->bi_end_io = request_endio; + flush->bi_end_io = backing_request_endio; flush->bi_private = cl; flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - + /* I/O request sent to backing device */ closure_bio_submit(s->iop.c, flush, cl); } } else { s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - + /* I/O request sent to backing device */ + bio->bi_end_io = backing_request_endio; closure_bio_submit(s->iop.c, bio, cl); } +insert_data: closure_call(&s->iop.cl, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); } @@ -981,6 +1040,7 @@ static void cached_dev_nodata(struct closure *cl) bch_journal_meta(s->iop.c, cl); /* If it's a flush, we send the flush to the backing device too */ + bio->bi_end_io = backing_request_endio; closure_bio_submit(s->iop.c, bio, cl); continue_at(cl, cached_dev_bio_complete, NULL); @@ -1078,6 +1138,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, cached_dev_read(dc, s); } } else + /* I/O request sent to backing device */ detached_dev_do_request(d, bio); return BLK_QC_T_NONE; -- cgit v1.2.1 From c7b7bd07404c52d8b9c6fd2fe794052ac367a818 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 18 Mar 2018 17:36:25 -0700 Subject: bcache: add io_disable to struct cached_dev If a bcache device is configured to writeback mode, current code does not handle write I/O errors on backing devices properly. In writeback mode, write request is written to cache device, and latter being flushed to backing device. If I/O failed when writing from cache device to the backing device, bcache code just ignores the error and upper layer code is NOT noticed that the backing device is broken. This patch tries to handle backing device failure like how the cache device failure is handled, - Add a error counter 'io_errors' and error limit 'error_limit' in struct cached_dev. Add another io_disable to struct cached_dev to disable I/Os on the problematic backing device. - When I/O error happens on backing device, increase io_errors counter. And if io_errors reaches error_limit, set cache_dev->io_disable to true, and stop the bcache device. The result is, if backing device is broken of disconnected, and I/O errors reach its error limit, backing device will be disabled and the associated bcache device will be removed from system. Changelog: v2: remove "bcache: " prefix in pr_error(), and use correct name string to print out bcache device gendisk name. v1: indeed this is new added in v2 patch set. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Reviewed-by: Michael Lyle Cc: Michael Lyle Cc: Junhui Tang Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index b4a5768afbe9..5a82237c7025 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio) if (bio->bi_status) { struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, + struct cached_dev, disk); /* * If a bio has REQ_PREFLUSH for writeback mode, it is * speically assembled in cached_dev_write() for a non-zero @@ -657,6 +659,7 @@ static void backing_request_endio(struct bio *bio) } s->recoverable = false; /* should count I/O error for backing device here */ + bch_count_backing_io_errors(dc, bio); } bio_put(bio); @@ -1065,8 +1068,14 @@ static void detached_dev_end_io(struct bio *bio) bio_data_dir(bio), &ddip->d->disk->part0, ddip->start_time); - kfree(ddip); + if (bio->bi_status) { + struct cached_dev *dc = container_of(ddip->d, + struct cached_dev, disk); + /* should count I/O error for backing device here */ + bch_count_backing_io_errors(dc, bio); + } + kfree(ddip); bio->bi_end_io(bio); } @@ -1105,7 +1114,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); - if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { + if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || + dc->io_disable)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return BLK_QC_T_NONE; -- cgit v1.2.1 From 47344e330eabc1515cbe6061eb337100a3ab6d37 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 18 Mar 2018 17:36:29 -0700 Subject: bcache: Fix kernel-doc warnings Avoid that building with W=1 triggers warnings about the kernel-doc headers. Signed-off-by: Bart Van Assche Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md/bcache/request.c') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 5a82237c7025..a65e3365eeb9 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -296,6 +296,7 @@ err: /** * bch_data_insert - stick some data in the cache + * @cl: closure pointer. * * This is the starting point for any data to end up in a cache device; it could * be from a normal write, or a writeback write, or a write to a flash only -- cgit v1.2.1