From 0bd1ed4860d0f5f836aa8371797689a3779d1bf5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 10 Feb 2018 08:46:17 +0800 Subject: block: pass inclusive 'lend' parameter to truncate_inode_pages_range The 'lend' parameter of truncate_inode_pages_range is required to be inclusive, so follow the rule. This patch fixes one memory corruption triggered by discard. Cc: Cc: Dmitry Monakhov Fixes: 351499a172c0 ("block: Invalidate cache on discard v2") Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 1668506d8ed8..3884d810efd2 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -225,7 +225,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, if (start + len > i_size_read(bdev->bd_inode)) return -EINVAL; - truncate_inode_pages_range(mapping, start, start + len); + truncate_inode_pages_range(mapping, start, start + len - 1); return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, flags); } -- cgit v1.2.1 From 105976f517791aed3b11f8f53b308a2069d42055 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Feb 2018 23:36:56 +0800 Subject: blk-mq: don't call io sched's .requeue_request when requeueing rq to ->dispatch __blk_mq_requeue_request() covers two cases: - one is that the requeued request is added to hctx->dispatch, such as blk_mq_dispatch_rq_list() - another case is that the request is requeued to io scheduler, such as blk_mq_requeue_request(). We should call io sched's .requeue_request callback only for the 2nd case. Cc: Paolo Valente Cc: Omar Sandoval Fixes: bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers") Cc: stable@vger.kernel.org Reviewed-by: Bart Van Assche Acked-by: Paolo Valente Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 357492712b0e..16e83e6df404 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -712,7 +712,6 @@ static void __blk_mq_requeue_request(struct request *rq) trace_block_rq_requeue(q, rq); wbt_requeue(q->rq_wb, &rq->issue_stat); - blk_mq_sched_requeue_request(rq); if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) { blk_mq_rq_update_state(rq, MQ_RQ_IDLE); @@ -725,6 +724,9 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { __blk_mq_requeue_request(rq); + /* this request will be re-inserted to io scheduler queue */ + blk_mq_sched_requeue_request(rq); + BUG_ON(blk_queued_rq(rq)); blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } -- cgit v1.2.1 From ba989a01469d027861e55c8f1121edadef757797 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Feb 2018 23:36:57 +0800 Subject: block: kyber: fix domain token leak during requeue When requeuing request, the domain token should have been freed before re-inserting the request to io scheduler. Otherwise, the assigned domain token will be leaked, and IO hang can be caused. Cc: Paolo Valente Cc: Omar Sandoval Cc: stable@vger.kernel.org Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index f95c60774ce8..0d6d25e32e1f 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -833,6 +833,7 @@ static struct elevator_type kyber_sched = { .limit_depth = kyber_limit_depth, .prepare_request = kyber_prepare_request, .finish_request = kyber_finish_request, + .requeue_request = kyber_finish_request, .completed_request = kyber_completed_request, .dispatch_request = kyber_dispatch_request, .has_work = kyber_has_work, -- cgit v1.2.1 From d52987b524ccd2e2165ddad9bcc087a6c3f5332c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Feb 2018 13:01:37 +0100 Subject: genhd: Fix leaked module reference for NVME devices Commit 8ddcd653257c "block: introduce GENHD_FL_HIDDEN" added handling of hidden devices to get_gendisk() but forgot to drop module reference which is also acquired by get_disk(). Drop the reference as necessary. Arguably the function naming here is misleading as put_disk() is *not* the counterpart of get_disk() but let's fix that in the follow up commit since that will be more intrusive. Fixes: 8ddcd653257c18a669fcb75ee42c37054908e0d6 CC: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 88a53c188cb7..5098bffe6ba6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -817,7 +817,10 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { + struct module *owner = disk->fops->owner; + put_disk(disk); + module_put(owner); disk = NULL; } return disk; -- cgit v1.2.1 From 3079c22ea815775837a4f389ce2f7e1e7b202e09 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Feb 2018 13:01:38 +0100 Subject: genhd: Rename get_disk() to get_disk_and_module() Rename get_disk() to get_disk_and_module() to make sure what the function does. It's not a great name but at least it is now clear that put_disk() is not it's counterpart. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 5098bffe6ba6..21b2843b27d0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -547,7 +547,7 @@ static int exact_lock(dev_t devt, void *data) { struct gendisk *p = data; - if (!get_disk(p)) + if (!get_disk_and_module(p)) return -1; return 0; } @@ -809,7 +809,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_lock_bh(&ext_devt_lock); part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - if (part && get_disk(part_to_disk(part))) { + if (part && get_disk_and_module(part_to_disk(part))) { *partno = part->partno; disk = part_to_disk(part); } @@ -1456,7 +1456,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) } EXPORT_SYMBOL(__alloc_disk_node); -struct kobject *get_disk(struct gendisk *disk) +struct kobject *get_disk_and_module(struct gendisk *disk) { struct module *owner; struct kobject *kobj; @@ -1474,15 +1474,13 @@ struct kobject *get_disk(struct gendisk *disk) return kobj; } - -EXPORT_SYMBOL(get_disk); +EXPORT_SYMBOL(get_disk_and_module); void put_disk(struct gendisk *disk) { if (disk) kobject_put(&disk_to_dev(disk)->kobj); } - EXPORT_SYMBOL(put_disk); static void set_disk_ro_uevent(struct gendisk *gd, int ro) -- cgit v1.2.1 From 9df6c29912315186fef1c79cc15b758ace84175b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Feb 2018 13:01:39 +0100 Subject: genhd: Add helper put_disk_and_module() Add a proper counterpart to get_disk_and_module() - put_disk_and_module(). Currently it is opencoded in several places. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 11 ++--------- block/genhd.c | 20 ++++++++++++++++---- 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4117524ca45b..c2033a232a44 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -812,7 +812,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; - struct module *owner; unsigned int major, minor; int key_len, part, ret; char *body; @@ -904,9 +903,7 @@ fail_unlock: spin_unlock_irq(q->queue_lock); rcu_read_unlock(); fail: - owner = disk->fops->owner; - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -931,13 +928,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); void blkg_conf_finish(struct blkg_conf_ctx *ctx) __releases(ctx->disk->queue->queue_lock) __releases(rcu) { - struct module *owner; - spin_unlock_irq(ctx->disk->queue->queue_lock); rcu_read_unlock(); - owner = ctx->disk->fops->owner; - put_disk(ctx->disk); - module_put(owner); + put_disk_and_module(ctx->disk); } EXPORT_SYMBOL_GPL(blkg_conf_finish); diff --git a/block/genhd.c b/block/genhd.c index 21b2843b27d0..4c0590434591 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -817,10 +817,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); disk = NULL; } return disk; @@ -1483,6 +1480,21 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); +/* + * This is a counterpart of get_disk_and_module() and thus also of + * get_gendisk(). + */ +void put_disk_and_module(struct gendisk *disk) +{ + if (disk) { + struct module *owner = disk->fops->owner; + + put_disk(disk); + module_put(owner); + } +} +EXPORT_SYMBOL(put_disk_and_module); + static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; -- cgit v1.2.1 From 56c0908c855afbb2bdda17c15d2879949a091ad3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Feb 2018 13:01:41 +0100 Subject: genhd: Fix BUG in blkdev_open() When two blkdev_open() calls for a partition race with device removal and recreation, we can hit BUG_ON(!bd_may_claim(bdev, whole, holder)) in blkdev_open(). The race can happen as follows: CPU0 CPU1 CPU2 del_gendisk() bdev_unhash_inode(part1); blkdev_open(part1, O_EXCL) blkdev_open(part1, O_EXCL) bdev = bd_acquire() bdev = bd_acquire() blkdev_get(bdev) bd_start_claiming(bdev) - finds old inode 'whole' bd_prepare_to_claim() -> 0 bdev_unhash_inode(whole); blkdev_get(bdev); bd_start_claiming(bdev) - finds new inode 'whole' bd_prepare_to_claim() - this also succeeds as we have different 'whole' here... - bad things happen now as we have two exclusive openers of the same bdev The problem here is that block device opens can see various intermediate states while gendisk is shutting down and then being recreated. We fix the problem by introducing new lookup_sem in gendisk that synchronizes gendisk deletion with get_gendisk() and furthermore by making sure that get_gendisk() does not return gendisk that is being (or has been) deleted. This makes sure that once we ever manage to look up newly created bdev inode, we are also guaranteed that following get_gendisk() will either return failure (and we fail open) or it returns gendisk for the new device and following bdget_disk() will return new bdev inode (i.e., blkdev_open() follows the path as if it is completely run after new device is created). Reported-and-analyzed-by: Hou Tao Tested-by: Hou Tao Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 4c0590434591..9656f9e9f99e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -717,6 +717,11 @@ void del_gendisk(struct gendisk *disk) blk_integrity_del(disk); disk_del_events(disk); + /* + * Block lookups of the disk until all bdevs are unhashed and the + * disk is marked as dead (GENHD_FL_UP cleared). + */ + down_write(&disk->lookup_sem); /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); @@ -731,6 +736,7 @@ void del_gendisk(struct gendisk *disk) bdev_unhash_inode(disk_devt(disk)); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; + up_write(&disk->lookup_sem); if (!(disk->flags & GENHD_FL_HIDDEN)) sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -816,9 +822,21 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } - if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { + if (!disk) + return NULL; + + /* + * Synchronize with del_gendisk() to not return disk that is being + * destroyed. + */ + down_read(&disk->lookup_sem); + if (unlikely((disk->flags & GENHD_FL_HIDDEN) || + !(disk->flags & GENHD_FL_UP))) { + up_read(&disk->lookup_sem); put_disk_and_module(disk); disk = NULL; + } else { + up_read(&disk->lookup_sem); } return disk; } @@ -1418,6 +1436,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) kfree(disk); return NULL; } + init_rwsem(&disk->lookup_sem); disk->node_id = node_id; if (disk_expand_part_tbl(disk, 0)) { free_part_stats(&disk->part0); -- cgit v1.2.1 From f3bc78d2d4b489590540ab2788d5376583e28173 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 28 Feb 2018 09:35:29 -0800 Subject: mq-deadline: Make sure to always unlock zones In case of a failed write request (all retries failed) and when using libata, the SCSI error handler calls scsi_finish_command(). In the case of blk-mq this means that scsi_mq_done() does not get called, that blk_mq_complete_request() does not get called and also that the mq-deadline .completed_request() method is not called. This results in the target zone of the failed write request being left in a locked state, preventing that any new write requests are issued to the same zone. Fix this by replacing the .completed_request() method with the .finish_request() method as this method is always called whether or not a request completes successfully. Since the .finish_request() method is only called by the blk-mq core if a .prepare_request() method exists, add a dummy .prepare_request() method. Fixes: 5700f69178e9 ("mq-deadline: Introduce zone locking support") Cc: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Damien Le Moal [ bvanassche: edited patch description ] Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/mq-deadline.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index c56f211c8440..8ec0ba9f5386 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -535,13 +535,22 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, spin_unlock(&dd->lock); } +/* + * Nothing to do here. This is defined only to ensure that .finish_request + * method is called upon request completion. + */ +static void dd_prepare_request(struct request *rq, struct bio *bio) +{ +} + /* * For zoned block devices, write unlock the target zone of * completed write requests. Do this while holding the zone lock * spinlock so that the zone is never unlocked while deadline_fifo_request() - * while deadline_next_request() are executing. + * or deadline_next_request() are executing. This function is called for + * all requests, whether or not these requests complete successfully. */ -static void dd_completed_request(struct request *rq) +static void dd_finish_request(struct request *rq) { struct request_queue *q = rq->q; @@ -756,7 +765,8 @@ static struct elevator_type mq_deadline = { .ops.mq = { .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, - .completed_request = dd_completed_request, + .prepare_request = dd_prepare_request, + .finish_request = dd_finish_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, -- cgit v1.2.1 From 7c5a0dcf557c6511a61e092ba887de28882fe857 Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Tue, 27 Feb 2018 20:10:03 +0800 Subject: block: fix the count of PGPGOUT for WRITE_SAME The vm counters is counted in sectors, so we should do the conversation in submit_bio. Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index") Cc: stable@vger.kernel.org Reviewed-by: Omar Sandoval Reviewed-by: Christoph Hellwig Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 2d1a7bbe0634..6d82c4f7fadd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2434,7 +2434,7 @@ blk_qc_t submit_bio(struct bio *bio) unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = queue_logical_block_size(bio->bi_disk->queue); + count = queue_logical_block_size(bio->bi_disk->queue) >> 9; else count = bio_sectors(bio); -- cgit v1.2.1 From 9c0fb1e313aaf4e8edec22433c8b22dd308e466c Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Tue, 27 Feb 2018 20:10:18 +0800 Subject: block: display the correct diskname for bio bio_devname use __bdevname to display the device name, and can only show the major and minor of the part0, Fix this by using disk_name to display the correct name. Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index") Reviewed-by: Omar Sandoval Reviewed-by: Christoph Hellwig Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- block/partition-generic.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/partition-generic.c b/block/partition-generic.c index 91622db9aedf..08dabcd8b6ae 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -51,6 +51,12 @@ const char *bdevname(struct block_device *bdev, char *buf) EXPORT_SYMBOL(bdevname); +const char *bio_devname(struct bio *bio, char *buf) +{ + return disk_name(bio->bi_disk, bio->bi_partno, buf); +} +EXPORT_SYMBOL(bio_devname); + /* * There's very little reason to use this, you should really * have a struct block_device just about everywhere and use -- cgit v1.2.1