From b126194cbb799f9980b92a77e58db6ad794c8082 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Wed, 24 Jan 2018 12:17:38 +0800 Subject: MD: Free bioset when md_run fails Signed-off-by: Xiao Ni Acked-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index bc67ab6844f0..bcf4ab9ab3df 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5497,8 +5497,10 @@ int md_run(struct mddev *mddev) } if (mddev->sync_set == NULL) { mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (!mddev->sync_set) - return -ENOMEM; + if (!mddev->sync_set) { + err = -ENOMEM; + goto abort; + } } spin_lock(&pers_lock); @@ -5511,7 +5513,8 @@ int md_run(struct mddev *mddev) else pr_warn("md: personality for level %s is not loaded!\n", mddev->clevel); - return -EINVAL; + err = -EINVAL; + goto abort; } spin_unlock(&pers_lock); if (mddev->level != pers->level) { @@ -5524,7 +5527,8 @@ int md_run(struct mddev *mddev) pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ module_put(pers->owner); - return -EINVAL; + err = -EINVAL; + goto abort; } if (pers->sync_request) { @@ -5593,7 +5597,7 @@ int md_run(struct mddev *mddev) mddev->private = NULL; module_put(pers->owner); bitmap_destroy(mddev); - return err; + goto abort; } if (mddev->queue) { bool nonrot = true; @@ -5655,6 +5659,18 @@ int md_run(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_action); sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; + +abort: + if (mddev->bio_set) { + bioset_free(mddev->bio_set); + mddev->bio_set = NULL; + } + if (mddev->sync_set) { + bioset_free(mddev->sync_set); + mddev->sync_set = NULL; + } + + return err; } EXPORT_SYMBOL_GPL(md_run); -- cgit v1.2.1 From 4b6c1060eaa6495aa5b0032e8f2d51dd936b1257 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Fri, 2 Feb 2018 23:13:19 +0100 Subject: md: fix md_write_start() deadlock w/o metadata devices If no metadata devices are configured on raid1/4/5/6/10 (e.g. via dm-raid), md_write_start() unconditionally waits for superblocks to be written thus deadlocking. Fix introduces mddev->has_superblocks bool, defines it in md_run() and checks for it in md_write_start() to conditionally avoid waiting. Once on it, check for non-existing superblocks in md_super_write(). Link: https://bugzilla.kernel.org/show_bug.cgi?id=198647 Fixes: cc27b0c78c796 ("md: fix deadlock between mddev_suspend() and md_write_start()") Signed-off-by: Heinz Mauelshagen Signed-off-by: Shaohua Li --- drivers/md/md.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index bcf4ab9ab3df..9b73cf139b80 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -801,6 +801,9 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio; int ff = 0; + if (!page) + return; + if (test_bit(Faulty, &rdev->flags)) return; @@ -5452,6 +5455,7 @@ int md_run(struct mddev *mddev) * the only valid external interface is through the md * device. */ + mddev->has_superblocks = false; rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; @@ -5465,6 +5469,9 @@ int md_run(struct mddev *mddev) set_disk_ro(mddev->gendisk, 1); } + if (rdev->sb_page) + mddev->has_superblocks = true; + /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, * Internal Bitmap issues have been handled elsewhere. @@ -8065,6 +8072,7 @@ EXPORT_SYMBOL(md_done_sync); bool md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; + if (bio_data_dir(bi) != WRITE) return true; @@ -8097,6 +8105,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) rcu_read_unlock(); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); + if (!mddev->has_superblocks) + return true; wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || mddev->suspended); -- cgit v1.2.1 From 39772f0a7be3b3dc26c74ea13fe7847fd1522c8b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 3 Feb 2018 09:19:30 +1100 Subject: md: only allow remove_and_add_spares when no sync_thread running. The locking protocols in md assume that a device will never be removed from an array during resync/recovery/reshape. When that isn't happening, rcu or reconfig_mutex is needed to protect an rdev pointer while taking a refcount. When it is happening, that protection isn't needed. Unfortunately there are cases were remove_and_add_spares() is called when recovery might be happening: is state_store(), slot_store() and hot_remove_disk(). In each case, this is just an optimization, to try to expedite removal from the personality so the device can be removed from the array. If resync etc is happening, we just have to wait for md_check_recover to find a suitable time to call remove_and_add_spares(). This optimization and not essential so it doesn't matter if it fails. So change remove_and_add_spares() to abort early if resync/recovery/reshape is happening, unless it is called from md_check_recovery() as part of a newly started recovery. The parameter "this" is only NULL when called from md_check_recovery() so when it is NULL, there is no need to abort. As this can result in a NULL dereference, the fix is suitable for -stable. cc: yuyufen Cc: Tomasz Majchrzak Fixes: 8430e7e0af9a ("md: disconnect device from personality before trying to remove it.") Cc: stable@ver.kernel.org (v4.8+) Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9b73cf139b80..ba152dddaaa3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8595,6 +8595,10 @@ static int remove_and_add_spares(struct mddev *mddev, int removed = 0; bool remove_some = false; + if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + /* Mustn't remove devices when resync thread is running */ + return 0; + rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev->raid_disk >= 0 && -- cgit v1.2.1 From 8876391e440ba615b10eef729576e111f0315f87 Mon Sep 17 00:00:00 2001 From: BingJing Chang Date: Thu, 22 Feb 2018 13:34:46 +0800 Subject: md: fix a potential deadlock of raid5/raid10 reshape There is a potential deadlock if mount/umount happens when raid5_finish_reshape() tries to grow the size of emulated disk. How the deadlock happens? 1) The raid5 resync thread finished reshape (expanding array). 2) The mount or umount thread holds VFS sb->s_umount lock and tries to write through critical data into raid5 emulated block device. So it waits for raid5 kernel thread handling stripes in order to finish it I/Os. 3) In the routine of raid5 kernel thread, md_check_recovery() will be called first in order to reap the raid5 resync thread. That is, raid5_finish_reshape() will be called. In this function, it will try to update conf and call VFS revalidate_disk() to grow the raid5 emulated block device. It will try to acquire VFS sb->s_umount lock. The raid5 kernel thread cannot continue, so no one can handle mount/ umount I/Os (stripes). Once the write-through I/Os cannot be finished, mount/umount will not release sb->s_umount lock. The deadlock happens. The raid5 kernel thread is an emulated block device. It is responible to handle I/Os (stripes) from upper layers. The emulated block device should not request any I/Os on itself. That is, it should not call VFS layer functions. (If it did, it will try to acquire VFS locks to guarantee the I/Os sequence.) So we have the resync thread to send resync I/O requests and to wait for the results. For solving this potential deadlock, we can put the size growth of the emulated block device as the final step of reshape thread. 2017/12/29: Thanks to Guoqing Jiang , we confirmed that there is the same deadlock issue in raid10. It's reproducible and can be fixed by this patch. For raid10.c, we can remove the similar code to prevent deadlock as well since they has been called before. Reported-by: Alex Wu Reviewed-by: Alex Wu Reviewed-by: Chung-Chiang Cheng Signed-off-by: BingJing Chang Signed-off-by: Shaohua Li --- drivers/md/md.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index ba152dddaaa3..254e44e44668 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8569,6 +8569,19 @@ void md_do_sync(struct md_thread *thread) set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + mddev->delta_disks > 0 && + mddev->pers->finish_reshape && + mddev->pers->size && + mddev->queue) { + mddev_lock_nointr(mddev); + md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); + mddev_unlock(mddev); + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ -- cgit v1.2.1