From 4aaf7694f841edc96fe0f72958aabe59204b3611 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 4 Jul 2017 11:20:30 +0800 Subject: md/bitmap: don't read page from device with Bitmap_sync A device that owns the Bitmap_sync flag needs recovery to become fully in sync, so reading a page from a device of this type could return stale data. Also add comments for the Bitmap_sync bit per the suggestion from Shaohua and Neil. Previous discussion can be found here: https://marc.info/?t=149760428900004&r=1&w=2 Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 3 ++- drivers/md/md.h | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index f4eace5ea184..40f3cd7eab0f 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -156,7 +156,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset, rdev_for_each(rdev, mddev) { if (! test_bit(In_sync, &rdev->flags) - || test_bit(Faulty, &rdev->flags)) + || test_bit(Faulty, &rdev->flags) + || test_bit(Bitmap_sync, &rdev->flags)) continue; target = offset + index * (PAGE_SIZE/512); diff --git a/drivers/md/md.h b/drivers/md/md.h index 991f0fe2dcc6..b50eb4ac1b82 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -134,7 +134,9 @@ enum flag_bits { Faulty, /* device is known to have a fault */ In_sync, /* device is in_sync with rest of array */ Bitmap_sync, /* ..actually, not quite In_sync. Need a - * bitmap-based recovery to get fully in sync + * bitmap-based recovery to get fully in sync. + * The bit is only meaningful before device + * has been passed to pers->hot_add_disk. */ WriteMostly, /* Avoid reading if at all possible */ AutoDetected, /* added by auto-detect */ -- cgit v1.2.1 From b5d27718f38843a74552e9a93d32e2391fd3999f Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Wed, 5 Jul 2017 17:34:04 +0800 Subject: Raid5 should update rdev->sectors after reshape The raid5 md device is created from disks of which we don't use the total size. For example, each device is 5G in size and only 3G of it is used to create the raid5 device. Then change the chunk size and wait for the reshape to finish. After the reshape finishes, stop the raid and assemble it again. The assembly fails. mdadm -CR /dev/md0 -l5 -n3 /dev/loop[0-2] --size=3G --chunk=32 --assume-clean mdadm /dev/md0 --grow --chunk=64 wait reshape to finish mdadm -S /dev/md0 mdadm -As The error messages: [197519.814302] md: loop1 does not have a valid v1.2 superblock, not importing! [197519.821686] md: md_import_device returned -22 After the reshape the data offset is changed; the reshape selects the backwards direction in this condition. In super_1_load the available space of the underlying device is compared with sb->data_size. The new data offset gets bigger after the reshape, so super_1_load returns -EINVAL. rdev->sectors is updated in md_finish_reshape, and sb->data_size is then set in super_1_sync based on rdev->sectors. So call md_finish_reshape in end_reshape.
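For illustration, a minimal sketch of the size check in super_1_load() that rejects the device (simplified and abbreviated from the real function; exact surrounding code differs):

	/* usable data space on the underlying device */
	if (minor_version) {
		/* superblock near the start: data runs from data_offset
		 * to the end of the device */
		sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	/* after the reshape data_offset grew, but sb->data_size was never
	 * refreshed, so this fails and md_import_device() prints the
	 * "does not have a valid v1.2 superblock" message above */
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;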
Signed-off-by: Xiao Ni Acked-by: Guoqing Jiang Cc: stable@vger.kernel.org Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2ceb338b094b..aeeb8d6854e2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7951,12 +7951,10 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - rdev_for_each(rdev, conf->mddev) - rdev->data_offset = rdev->new_data_offset; + md_finish_reshape(conf->mddev); smp_wmb(); conf->reshape_progress = MaxSector; conf->mddev->reshape_position = MaxSector; -- cgit v1.2.1 From 6409e84ec58fc4c0085d8921f8e01815dc871971 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 11 Jul 2017 16:16:24 +0200 Subject: raid5-ppl: use BIOSET_NEED_BVECS when creating bioset This bioset is used for allocating bios with nr_iovecs > 0 so this flag must be set. Fixes: 011067b05668 ("blk: replace bioset_create_nobvec() with a flags arg to bioset_create()") Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 77cce3573aa8..44ad5baf3206 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1150,7 +1150,7 @@ int ppl_init_log(struct r5conf *conf) goto err; } - ppl_conf->bs = bioset_create(conf->raid_disks, 0, 0); + ppl_conf->bs = bioset_create(conf->raid_disks, 0, BIOSET_NEED_BVECS); if (!ppl_conf->bs) { ret = -ENOMEM; goto err; -- cgit v1.2.1 From 9dd59727dbc21d33a9add4c5b308a5775cd5a6ef Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 19 Jul 2017 11:23:40 -0400 Subject: dm integrity: fix inefficient allocation of journal space When using a block size greater than 512 bytes, the dm-integrity target allocates journal space inefficiently. It allocates one journal entry for each 512-byte chunk of data, fills an entry for each block of data and leaves the remaining entries unused. This issue doesn't cause data corruption, but all the unused journal entries degrade performance severely. For example, with 4k blocks and an 8k bio, it would allocate 16 journal entries but only use 2 entries. The remaining 14 entries were left unused. Fix this by adding the missing 'log2_sectors_per_block' shifts that are required to have each journal entry map to a full block. 
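To make the arithmetic concrete, a small sketch (hypothetical helper, not taken from the driver):

	/* With 4k blocks, ic->sb->log2_sectors_per_block = 3 (8 sectors of
	 * 512 bytes per block).  For an 8k bio, n_sectors = 16: the old
	 * code consumed 16 journal entries, the fixed code 16 >> 3 = 2,
	 * i.e. one entry per block actually written. */
	static unsigned journal_entries_needed(unsigned n_sectors,
					       unsigned log2_sectors_per_block)
	{
		return n_sectors >> log2_sectors_per_block;
	}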
Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Fixes: 7eada909bfd7 ("dm: add integrity target") Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 1b224aa9cf15..4b2fd524e38d 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1587,16 +1587,18 @@ retry: if (likely(ic->mode == 'J')) { if (dio->write) { unsigned next_entry, i, pos; - unsigned ws, we; + unsigned ws, we, range_sectors; - dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors); + dio->range.n_sectors = min(dio->range.n_sectors, + ic->free_sectors << ic->sb->log2_sectors_per_block); if (unlikely(!dio->range.n_sectors)) goto sleep; - ic->free_sectors -= dio->range.n_sectors; + range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block; + ic->free_sectors -= range_sectors; journal_section = ic->free_section; journal_entry = ic->free_section_entry; - next_entry = ic->free_section_entry + dio->range.n_sectors; + next_entry = ic->free_section_entry + range_sectors; ic->free_section_entry = next_entry % ic->journal_section_entries; ic->free_section += next_entry / ic->journal_section_entries; ic->n_uncommitted_sections += next_entry / ic->journal_section_entries; -- cgit v1.2.1 From a7c3e62bdc71d33f75803115d44e3ee7dab3d811 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 19 Jul 2017 11:24:08 -0400 Subject: dm integrity: use plugging when writing the journal When copying data from the journal to the appropriate place, we submit many IOs. Some of these IOs could go to adjacent areas. Use on-stack plugging so that adjacent IOs get merged during submission. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 4b2fd524e38d..be3b6f42095c 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1823,6 +1823,9 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, { unsigned i, j, n; struct journal_completion comp; + struct blk_plug plug; + + blk_start_plug(&plug); comp.ic = ic; comp.in_flight = (atomic_t)ATOMIC_INIT(1); @@ -1947,6 +1950,8 @@ skip_io: dm_bufio_write_dirty_buffers_async(ic->bufio); + blk_finish_plug(&plug); + complete_journal_op(&comp); wait_for_completion_io(&comp.comp); -- cgit v1.2.1 From 022e510fcbda79183fd2cdc01abb01b4be80d03f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 14 Jul 2017 16:14:42 +0800 Subject: md: remove 'idx' from 'struct resync_pages' bio_add_page() won't fail for a resync bio, and the page index for each bio is the same, so remove it. More importantly, the 'idx' of 'struct resync_pages' is initialized in the mempool allocator function. The current approach is wrong since the mempool is only responsible for allocation; we can't use it for initialization.
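A minimal sketch of why initialization in the allocator callback is wrong (hypothetical, simplified callback):

	/* The alloc callback runs only when mempool_alloc() has to create
	 * a new element; an element recycled from the pool's reserve skips
	 * it entirely, so any per-use state (like 'idx') set here can be a
	 * stale left-over from the previous user. */
	static void *rp_pool_alloc(gfp_t gfp_flags, void *pool_data)
	{
		struct resync_pages *rp = kmalloc(sizeof(*rp), gfp_flags);

		if (rp)
			rp->idx = 0;	/* wrong place: not re-run on reuse */
		return rp;
	}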
Suggested-by: NeilBrown Reported-by: NeilBrown Reported-and-tested-by: Patrick Fixes: f0250618361d(md: raid10: don't use bio's vec table to manage resync pages) Fixes: 98d30c5812c3(md: raid1: don't use bio's vec table to manage resync pages) Cc: stable@vger.kernel.org (4.12+) Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/md.h | 1 - drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 6 +++--- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.h b/drivers/md/md.h index b50eb4ac1b82..991769cc3615 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -738,7 +738,6 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio /* for managing resync I/O pages */ struct resync_pages { - unsigned idx; /* for get/put page from the pool */ void *raid_bio; struct page *pages[RESYNC_PAGES]; }; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3febfc8391fb..0896c772a560 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -170,7 +170,6 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) resync_get_all_pages(rp); } - rp->idx = 0; rp->raid_bio = r1_bio; bio->bi_private = rp; } @@ -2619,6 +2618,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, int good_sectors = RESYNC_SECTORS; int min_bad = 0; /* number of sectors that are bad in all devices */ int idx = sector_to_idx(sector_nr); + int page_idx = 0; if (!conf->r1buf_pool) if (init_resync(conf)) @@ -2846,7 +2846,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; rp = get_resync_pages(bio); if (bio->bi_end_io) { - page = resync_fetch_page(rp, rp->idx++); + page = resync_fetch_page(rp, page_idx); /* * won't fail because the vec table is big @@ -2858,7 +2858,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, nr_sectors += len>>9; sector_nr += len>>9; sync_blocks -= (len>>9); - } while (get_resync_pages(r1_bio->bios[disk]->bi_private)->idx < RESYNC_PAGES); + } while (++page_idx < RESYNC_PAGES); r1_bio->sectors = nr_sectors; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5026e7ad51d3..fa8bcf04e791 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -221,7 +221,6 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) resync_get_all_pages(rp); } - rp->idx = 0; rp->raid_bio = r10_bio; bio->bi_private = rp; if (rbio) { @@ -2853,6 +2852,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sectors_skipped = 0; int chunks_skipped = 0; sector_t chunk_mask = conf->geo.chunk_mask; + int page_idx = 0; if (!conf->r10buf_pool) if (init_resync(conf)) @@ -3355,7 +3355,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, break; for (bio= biolist ; bio ; bio=bio->bi_next) { struct resync_pages *rp = get_resync_pages(bio); - page = resync_fetch_page(rp, rp->idx++); + page = resync_fetch_page(rp, page_idx); /* * won't fail because the vec table is big enough * to hold all these pages @@ -3364,7 +3364,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } nr_sectors += len>>9; sector_nr += len>>9; - } while (get_resync_pages(biolist)->idx < RESYNC_PAGES); + } while (++page_idx < RESYNC_PAGES); r10_bio->sectors = nr_sectors; while (biolist) { -- cgit v1.2.1 From fb0eb5df09307603b21845af1d143cc910154593 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 14 Jul 2017 16:14:43 +0800 Subject: md: raid1/raid10: initialize bvec 
table via bio_add_page() We will support multipage bvec soon, so initialize the bvec table using the standard way instead of writing the table directly. Otherwise it won't work any more once multipage bvec is enabled. Acked-by: Guoqing Jiang Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1-10.c | 19 +++++++++++++++++++ drivers/md/raid1.c | 18 ++++-------------- drivers/md/raid10.c | 6 ++++-- 3 files changed, 27 insertions(+), 16 deletions(-) create mode 100644 drivers/md/raid1-10.c (limited to 'drivers/md') diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c new file mode 100644 index 000000000000..3adb5b9dc4b4 --- /dev/null +++ b/drivers/md/raid1-10.c @@ -0,0 +1,19 @@ +/* generally called after bio_reset() for reseting bvec */ +static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, + int size) +{ + int idx = 0; + + /* initialize bvec table again */ + do { + struct page *page = resync_fetch_page(rp, idx); + int len = min_t(int, size, PAGE_SIZE); + + /* + * won't fail because the vec table is big + * enough to hold all these pages + */ + bio_add_page(bio, page, len, 0); + size -= len; + } while (idx++ < RESYNC_PAGES && size > 0); +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0896c772a560..fe86ab18961b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -81,6 +81,8 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #define raid1_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) +#include "raid1-10.c" + /* * 'strct resync_pages' stores actual pages used for doing the resync * IO, and it is per-bio, so make .bi_private points to it. */ @@ -2085,10 +2087,7 @@ static void process_checks(struct r1bio *r1_bio) /* Fix variable parts of all bios */ vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); for (i = 0; i < conf->raid_disks * 2; i++) { - int j; - int size; blk_status_t status; - struct bio_vec *bi; struct bio *b = r1_bio->bios[i]; struct resync_pages *rp = get_resync_pages(b); if (b->bi_end_io != end_sync_read) @@ -2097,8 +2096,6 @@ static void process_checks(struct r1bio *r1_bio) status = b->bi_status; bio_reset(b); b->bi_status = status; - b->bi_vcnt = vcnt; - b->bi_iter.bi_size = r1_bio->sectors << 9; b->bi_iter.bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; b->bi_bdev = conf->mirrors[i].rdev->bdev; @@ -2106,15 +2103,8 @@ static void process_checks(struct r1bio *r1_bio) rp->raid_bio = r1_bio; b->bi_private = rp; - size = b->bi_iter.bi_size; - bio_for_each_segment_all(bi, b, j) { - bi->bv_offset = 0; - if (size > PAGE_SIZE) - bi->bv_len = PAGE_SIZE; - else - bi->bv_len = size; - size -= PAGE_SIZE; - } + /* initialize bvec table again */ + md_bio_reset_resync_pages(b, rp, r1_bio->sectors << 9); } for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index fa8bcf04e791..9952721e1cde 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -110,6 +110,8 @@ static void end_reshape(struct r10conf *conf); #define raid10_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) +#include "raid1-10.c" + /* * 'strct resync_pages' stores actual pages used for doing the resync * IO, and it is per-bio, so make .bi_private points to it. 
@@ -2086,8 +2088,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) rp = get_resync_pages(tbio); bio_reset(tbio); - tbio->bi_vcnt = vcnt; - tbio->bi_iter.bi_size = fbio->bi_iter.bi_size; + md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size); + rp->raid_bio = r10_bio; tbio->bi_private = rp; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; -- cgit v1.2.1 From be453e7761d0e72d8a1b2fcfde6d1a7e53881190 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 14 Jul 2017 16:14:44 +0800 Subject: md: raid1-10: move raid1/raid10 common code into raid1-10.c No function change, just move 'struct resync_pages' and related helpers into raid1-10.c Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/md.h | 53 ------------------------------------------- drivers/md/raid1-10.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/raid1.c | 9 -------- drivers/md/raid10.c | 9 -------- 4 files changed, 62 insertions(+), 71 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.h b/drivers/md/md.h index 991769cc3615..09db03455801 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -731,57 +731,4 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) mddev->queue->limits.max_write_zeroes_sectors = 0; } - -/* Maximum size of each resync request */ -#define RESYNC_BLOCK_SIZE (64*1024) -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) - -/* for managing resync I/O pages */ -struct resync_pages { - void *raid_bio; - struct page *pages[RESYNC_PAGES]; -}; - -static inline int resync_alloc_pages(struct resync_pages *rp, - gfp_t gfp_flags) -{ - int i; - - for (i = 0; i < RESYNC_PAGES; i++) { - rp->pages[i] = alloc_page(gfp_flags); - if (!rp->pages[i]) - goto out_free; - } - - return 0; - -out_free: - while (--i >= 0) - put_page(rp->pages[i]); - return -ENOMEM; -} - -static inline void resync_free_pages(struct resync_pages *rp) -{ - int i; - - for (i = 0; i < RESYNC_PAGES; i++) - put_page(rp->pages[i]); -} - -static inline void resync_get_all_pages(struct resync_pages *rp) -{ - int i; - - for (i = 0; i < RESYNC_PAGES; i++) - get_page(rp->pages[i]); -} - -static inline struct page *resync_fetch_page(struct resync_pages *rp, - unsigned idx) -{ - if (WARN_ON_ONCE(idx >= RESYNC_PAGES)) - return NULL; - return rp->pages[idx]; -} #endif /* _MD_MD_H */ diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 3adb5b9dc4b4..9f2670b45f31 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -1,3 +1,65 @@ +/* Maximum size of each resync request */ +#define RESYNC_BLOCK_SIZE (64*1024) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) + +/* for managing resync I/O pages */ +struct resync_pages { + void *raid_bio; + struct page *pages[RESYNC_PAGES]; +}; + +static inline int resync_alloc_pages(struct resync_pages *rp, + gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) { + rp->pages[i] = alloc_page(gfp_flags); + if (!rp->pages[i]) + goto out_free; + } + + return 0; + +out_free: + while (--i >= 0) + put_page(rp->pages[i]); + return -ENOMEM; +} + +static inline void resync_free_pages(struct resync_pages *rp) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) + put_page(rp->pages[i]); +} + +static inline void resync_get_all_pages(struct resync_pages *rp) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) + get_page(rp->pages[i]); +} + +static inline struct page *resync_fetch_page(struct resync_pages 
*rp, + unsigned idx) +{ + if (WARN_ON_ONCE(idx >= RESYNC_PAGES)) + return NULL; + return rp->pages[idx]; +} + +/* + * 'strct resync_pages' stores actual pages used for doing the resync + * IO, and it is per-bio, so make .bi_private points to it. + */ +static inline struct resync_pages *get_resync_pages(struct bio *bio) +{ + return bio->bi_private; +} + /* generally called after bio_reset() for reseting bvec */ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, int size) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fe86ab18961b..8387eb1540cd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -83,15 +83,6 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" -/* - * 'strct resync_pages' stores actual pages used for doing the resync - * IO, and it is per-bio, so make .bi_private points to it. - */ -static inline struct resync_pages *get_resync_pages(struct bio *bio) -{ - return bio->bi_private; -} - /* * for resync bio, r1bio pointer can be retrieved from the per-bio * 'struct resync_pages'. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 9952721e1cde..e2617d0f37dc 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -112,15 +112,6 @@ static void end_reshape(struct r10conf *conf); #include "raid1-10.c" -/* - * 'strct resync_pages' stores actual pages used for doing the resync - * IO, and it is per-bio, so make .bi_private points to it. - */ -static inline struct resync_pages *get_resync_pages(struct bio *bio) -{ - return bio->bi_private; -} - /* * for resync bio, r10bio pointer can be retrieved from the per-bio * 'struct resync_pages'. -- cgit v1.2.1 From 16d56e2fcc1fc15b981369653c3b41d7ff0b443d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 17 Jul 2017 14:33:48 -0700 Subject: md/raid1: fix writebehind bio clone After a bio is submitted, we should not clone it, as its bi_iter might be invalidated by the driver. This is the case for behind_master_bio. In certain situations, we could dispatch behind_master_bio immediately for the first disk and then clone it for the other disks. 
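The underlying rule the fix enforces, sketched with hypothetical names and simplified types:

	/* Hypothetical sketch: take every clone from the master bio while
	 * its bi_iter is still pristine, and submit only afterwards.  Once
	 * a bio has been submitted the driver may advance its bi_iter, so
	 * a later clone would copy an invalid iterator. */
	static void clone_then_submit(struct bio *master, struct bio **clone,
				      int ndisks, struct bio_set *bs)
	{
		int i;

		for (i = 0; i < ndisks; i++)	/* clone first ... */
			clone[i] = bio_clone_fast(master, GFP_NOIO, bs);
		for (i = 0; i < ndisks; i++)	/* ... submit last */
			submit_bio(clone[i]);
	}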
https://bugzilla.kernel.org/show_bug.cgi?id=196383 Reported-and-tested-by: Markus Reviewed-by: Ming Lei Fixes: 841c1316c7da(md: raid1: improve write behind) Cc: stable@vger.kernel.org (4.12+) Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8387eb1540cd..1d235cc8b402 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -484,10 +484,6 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - /* we release behind master bio when all write are done */ - if (r1_bio->behind_master_bio == bio) - to_put = NULL; - if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -1080,7 +1076,7 @@ static void unfreeze_array(struct r1conf *conf) wake_up(&conf->wait_barrier); } -static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, +static void alloc_behind_master_bio(struct r1bio *r1_bio, struct bio *bio) { int size = bio->bi_iter.bi_size; @@ -1090,11 +1086,13 @@ static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev); if (!behind_bio) - goto fail; + return; /* discard op, we don't support writezero/writesame yet */ - if (!bio_has_data(bio)) + if (!bio_has_data(bio)) { + behind_bio->bi_iter.bi_size = size; goto skip_copy; + } while (i < vcnt && size) { struct page *page; @@ -1115,14 +1113,13 @@ skip_copy: r1_bio->behind_master_bio = behind_bio;; set_bit(R1BIO_BehindIO, &r1_bio->state); - return behind_bio; + return; free_pages: pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_iter.bi_size); bio_free_pages(behind_bio); -fail: - return behind_bio; + bio_put(behind_bio); } struct raid1_plug_cb { @@ -1475,7 +1472,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { - mbio = alloc_behind_master_bio(r1_bio, bio); + alloc_behind_master_bio(r1_bio, bio); } bitmap_startwrite(bitmap, r1_bio->sector, @@ -1485,14 +1482,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, first_clone = 0; } - if (!mbio) { - if (r1_bio->behind_master_bio) - mbio = bio_clone_fast(r1_bio->behind_master_bio, - GFP_NOIO, - mddev->bio_set); - else - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - } + if (r1_bio->behind_master_bio) + mbio = bio_clone_fast(r1_bio->behind_master_bio, + GFP_NOIO, mddev->bio_set); + else + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); if (r1_bio->behind_master_bio) { if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) @@ -2346,8 +2340,6 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) wbio = bio_clone_fast(r1_bio->behind_master_bio, GFP_NOIO, mddev->bio_set); - /* We really need a _all clone */ - wbio->bi_iter = (struct bvec_iter){ 0 }; } else { wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, mddev->bio_set); -- cgit v1.2.1 From 6308d8e3d42bea15461c696df1ad74c2944b5c23 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 21 Jul 2017 16:33:44 +0800 Subject: md: simplify code with bio_io_error Since bio_io_error() sets bi_status to BLK_STS_IOERR and calls bio_endio(), we can use it directly. And as mentioned by Shaohua, there are also two places in raid5.c that can use bio_io_error as well. 
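For reference, the helper being substituted is essentially this (as defined in include/linux/bio.h after the blk_status_t conversion):

	static inline void bio_io_error(struct bio *bio)
	{
		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
	}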
Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 3 +-- drivers/md/raid10.c | 6 ++---- drivers/md/raid5.c | 9 +++------ 3 files changed, 6 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1d235cc8b402..f50958ded9f0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -790,8 +790,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e2617d0f37dc..f55d4cc085f6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -905,8 +905,7 @@ static void flush_pending_writes(struct r10conf *conf) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ @@ -1090,8 +1089,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index aeeb8d6854e2..4904dffec915 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3381,9 +3381,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); - bio_endio(bi); + bio_io_error(bi); bi = nextbi; } if (bitmap_end) @@ -3403,9 +3402,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); - bio_endio(bi); + bio_io_error(bi); bi = bi2; } @@ -3429,8 +3427,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_status = BLK_STS_IOERR; - bio_endio(bi); + bio_io_error(bi); bi = nextbi; } } -- cgit v1.2.1 From 7e96d559634b73a8158ee99a7abece2eacec2668 Mon Sep 17 00:00:00 2001 From: Ofer Heifetz Date: Mon, 24 Jul 2017 09:17:40 +0300 Subject: md/raid5: add thread_group worker async_tx_issue_pending_all Since the thread_group worker and the raid5d kthread are not in sync, if the worker writes a stripe before raid5d does, requests will be left waiting for issue_pending. The issue was observed when building raid5 with ext4: in some build runs jbd2 would get hung and requests were sitting in the HW engine waiting to be issued. Fix this by adding a call to async_tx_issue_pending_all in raid5_do_work. 
Signed-off-by: Ofer Heifetz Cc: stable@vger.kernel.org Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4904dffec915..0fc2748aaf95 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6234,6 +6234,8 @@ static void raid5_do_work(struct work_struct *work) pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); + + async_tx_issue_pending_all(); blk_finish_plug(&plug); pr_debug("--- raid5worker inactive\n"); -- cgit v1.2.1 From aa03a91ffaefcffb397cddf88b97215b3eff726d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 21 Jul 2017 13:16:06 -0400 Subject: dm integrity: WARN_ON if variables representing journal usage get out of sync If this WARN_ON triggers it speaks to programmer error, and likely implies corruption, but no released kernel should trigger it. This WARN_ON serves to assist DM integrity developers as changes are made/tested in the future. BUG_ON is excessive for catching programmer error, if a user or developer would like warnings to trigger a panic, they can enable that via /proc/sys/kernel/panic_on_warn Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index be3b6f42095c..a7a3708700c0 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1729,6 +1729,8 @@ static void pad_uncommitted(struct dm_integrity_c *ic) wraparound_section(ic, &ic->free_section); ic->n_uncommitted_sections++; } + WARN_ON(ic->journal_sections * ic->journal_section_entries != + (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors); } static void integrity_commit(struct work_struct *w) -- cgit v1.2.1 From bc86a41e96c5b6f07453c405e036d95acc673389 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 21 Jul 2017 11:58:38 -0400 Subject: dm integrity: test for corrupted disk format during table load If the dm-integrity superblock was corrupted in such a way that the journal_sections field was zero, the integrity target would deadlock because it would wait forever for free space in the journal. Detect this situation and refuse to activate the device. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Fixes: 7eada909bfd7 ("dm: add integrity target") Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index a7a3708700c0..3acce09bba35 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3028,6 +3028,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Block size doesn't match the information in superblock"; goto bad; } + if (!le32_to_cpu(ic->sb->journal_sections)) { + r = -EINVAL; + ti->error = "Corrupted superblock, journal_sections is 0"; + goto bad; + } /* make sure that ti->max_io_len doesn't overflow */ if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { -- cgit v1.2.1 From edc11d49f88e3281457853a530c9e5a5540b355a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jul 2017 10:26:34 +0300 Subject: dm bufio: fix error code in dm_bufio_write_dirty_buffers() We should be returning normal negative error codes here. 
The "a" variables comes from &c->async_write_error which is a blk_status_t converted to a regular error code. In the current code, the blk_status_t gets propogated back to pool_create() and eventually results in an Oops. Fixes: 4e4cbee93d56 ("block: switch bios to blk_status_t") Signed-off-by: Dan Carpenter Reviewed-by: Christoph Hellwig Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 850ff6c67994..44f4a8ac95bd 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1258,8 +1258,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); */ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) { - blk_status_t a; - int f; + int a, f; unsigned long buffers_processed = 0; struct dm_buffer *b, *tmp; -- cgit v1.2.1 From bbac1e06a415c0658dd328ba9c5f640c2d97be3a Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 13 Jul 2017 17:33:22 +0200 Subject: dm raid: remove WARN_ON() in raid10_md_layout_to_format() Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2e10c2f13a34..b409015c6909 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -564,9 +564,10 @@ static const char *raid10_md_layout_to_format(int layout) if (__raid10_near_copies(layout) > 1) return "near"; - WARN_ON(__raid10_far_copies(layout) < 2); + if (__raid10_far_copies(layout) > 1) + return "far"; - return "far"; + return "unknown"; } /* Return md raid10 algorithm for @name */ -- cgit v1.2.1 From f4af3f82daed14ea06ac22eac198a45f56eb2cb1 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 13 Jul 2017 17:36:12 +0200 Subject: dm raid: fix activation check in validate_raid_redundancy() During growing reshapes (i.e. stripes being added to a raid set), the new stripe images are not in-sync and not part of the raid set until the reshape is started. LVM2 has to request multiple table reloads involving superblock updates in order to reflect proper size of SubLVs in the cluster. Before a stripe adding reshape starts, validate_raid_redundancy() fails as a result of that because it checks the total number of devices against the number of rebuild ones rather than the actual ones in the raid set (as retrieved from the superblock) thus resulting in failed raid4/5/6/10 redundancy checks. E.g. convert 3 stripes -> 7 stripes raid5 (which only allows for maximum 1 device to fail) requesting +4 delta disks causing 4 devices to rebuild during reshaping thus failing activation. To fix this, move validate_raid_redundancy() to get access to the current raid_set members. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b409015c6909..c256a723b964 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2541,11 +2541,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (!freshest) return 0; - if (validate_raid_redundancy(rs)) { - rs->ti->error = "Insufficient redundancy to activate array"; - return -EINVAL; - } - /* * Validation of the freshest device provides the source of * validation for the remaining devices. 
@@ -2554,6 +2549,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (super_validate(rs, freshest)) return -EINVAL; + if (validate_raid_redundancy(rs)) { + rs->ti->error = "Insufficient redundancy to activate array"; + return -EINVAL; + } + rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && rdev != freshest && -- cgit v1.2.1 From 0cf352e5a00a0ccf362e4ae60dcaa3318933f6e4 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 13 Jul 2017 17:34:24 +0200 Subject: dm raid: avoid mddev->suspended access Use runtime flag to ensure that an mddev gets suspended/resumed just once. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c256a723b964..757355b2f1a1 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -208,6 +208,7 @@ struct raid_dev { #define RT_FLAG_RS_BITMAP_LOADED 2 #define RT_FLAG_UPDATE_SBS 3 #define RT_FLAG_RESHAPE_RS 4 +#define RT_FLAG_RS_SUSPENDED 5 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -3169,6 +3170,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) } mddev_suspend(&rs->md); + set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags); /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ if (rs_is_raid456(rs)) { @@ -3626,7 +3628,7 @@ static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; - if (!rs->md.suspended) + if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) mddev_suspend(&rs->md); rs->md.ro = 1; @@ -3760,7 +3762,7 @@ static int rs_start_reshape(struct raid_set *rs) return r; /* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */ - if (mddev->suspended) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) mddev_resume(mddev); /* @@ -3787,8 +3789,8 @@ static int rs_start_reshape(struct raid_set *rs) } /* Suspend because a resume will happen in raid_resume() */ - if (!mddev->suspended) - mddev_suspend(mddev); + set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags); + mddev_suspend(mddev); /* * Now reshape got set up, update superblocks to @@ -3884,7 +3886,7 @@ static void raid_resume(struct dm_target *ti) if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (mddev->suspended) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) mddev_resume(mddev); } -- cgit v1.2.1 From ac6a318888f6b94925de603d4563edf7e86e04c8 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 13 Jul 2017 17:52:18 +0200 Subject: dm raid: bump target version Bump the dm-raid target version to 1.12.1 to reflect that commit cc27b0c78c ("md: fix deadlock between mddev_suspend() and md_write_start()") is available. This version change allows userspace to detect that the MD fix is available. 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 757355b2f1a1..5bfe285ea9d1 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3892,7 +3892,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 11, 1}, + .version = {1, 12, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit v1.2.1 From edbe9597aca2e2601e6e152bdea200b6cf1b8b47 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 21 Jul 2017 11:56:46 -0400 Subject: dm zoned: remove test for impossible REQ_OP_FLUSH conditions The value REQ_OP_FLUSH is only used by the block code for request-based devices. Remove the tests for REQ_OP_FLUSH from the bio-based dm-zoned-target. Signed-off-by: Mikulas Patocka Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-target.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 2b538fa817f4..71eae8a23dae 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -588,7 +588,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) bio->bi_bdev = dev->bdev; - if (!nr_sectors && (bio_op(bio) != REQ_OP_FLUSH) && (bio_op(bio) != REQ_OP_WRITE)) + if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE) return DM_MAPIO_REMAPPED; /* The BIO should be block aligned */ @@ -603,7 +603,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) bioctx->status = BLK_STS_OK; /* Set the BIO pending in the flush list */ - if (bio_op(bio) == REQ_OP_FLUSH || (!nr_sectors && bio_op(bio) == REQ_OP_WRITE)) { + if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) { spin_lock(&dmz->flush_lock); bio_list_add(&dmz->flush_list, bio); spin_unlock(&dmz->flush_lock); -- cgit v1.2.1 From ed9b66d21866ae3bf16557406258ebe6c00a9a84 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 25 Jul 2017 15:18:13 -0700 Subject: MD: fix warning for UP case spin_is_locked always returns 0 for the UP case, so ignore it Reported-by: Joshua Kinard Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8cdca0296749..c99634612fc4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2287,7 +2287,7 @@ static void export_array(struct mddev *mddev) static bool set_in_sync(struct mddev *mddev) { - WARN_ON_ONCE(!spin_is_locked(&mddev->lock)); + WARN_ON_ONCE(NR_CPUS != 1 && !spin_is_locked(&mddev->lock)); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); -- cgit v1.2.1 From 4218a9554653bd5be6e3c740749282b57434bd73 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 24 Jul 2017 16:44:37 +0900 Subject: dm zoned: use GFP_NOIO in I/O path Use GFP_NOIO for memory allocations in the I/O path. Other memory allocations in the initialization path can use GFP_KERNEL. 
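Hypothetical call sites illustrating the rule the patch draws:

	/* In the I/O path, GFP_NOIO keeps direct reclaim from issuing new
	 * I/O that could recurse into this same stacked device and
	 * deadlock; at construction time (.ctr) no I/O is outstanding, so
	 * the stronger GFP_KERNEL reclaim is safe. */
	cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);	/* while servicing a bio */
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);		/* from the constructor */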
Reported-by: Mikulas Patocka Signed-off-by: Damien Le Moal Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 12 ++++++------ drivers/md/dm-zoned-reclaim.c | 2 +- drivers/md/dm-zoned-target.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 884ff7c170a0..a4fa2ada6883 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -624,7 +624,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page); if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); return ret; } @@ -658,7 +658,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); return ret; } @@ -722,7 +722,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); goto out; } @@ -927,7 +927,7 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); } - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_NOIO); if (!page) return -ENOMEM; @@ -1183,7 +1183,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) /* Get zone information from disk */ ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), - &blkz, &nr_blkz, GFP_KERNEL); + &blkz, &nr_blkz, GFP_NOIO); if (ret) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); @@ -1257,7 +1257,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) ret = blkdev_reset_zones(dev->bdev, dmz_start_sect(zmd, zone), - dev->zone_nr_sectors, GFP_KERNEL); + dev->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", dmz_id(zmd, zone), ret); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 05c0a126f5c8..44a119e12f1a 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -75,7 +75,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, nr_blocks = block - wp_block; ret = blkdev_issue_zeroout(zrc->dev->bdev, dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), - dmz_blk2sect(nr_blocks), GFP_NOFS, false); + dmz_blk2sect(nr_blocks), GFP_NOIO, 0); if (ret) { dmz_dev_err(zrc->dev, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 71eae8a23dae..b08bbbd4d902 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -541,7 +541,7 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) int ret; /* Create a new chunk work */ - cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOFS); + cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO); if (!cw) goto out; @@ -785,7 +785,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Chunk BIO work */ mutex_init(&dmz->chunk_lock); - INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOFS); + 
INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL); dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND, 0, dev->name); if (!dmz->chunk_wq) { -- cgit v1.2.1 From 34c96507e8f6be497c15497be05f489fb34c5880 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2017 12:13:00 +1000 Subject: dm verity fec: fix GFP flags used with mempool_alloc() mempool_alloc() cannot fail for GFP_NOIO allocation, so there is no point testing for failure. One place the code tested for failure was passing "0" as the GFP flags. This is most unusual and is probably meant to be GFP_NOIO, so that is changed. Also, allocation from ->extra_pool and ->prealloc_pool are repeated before releasing the previous allocation. This can deadlock if the code is servicing a write under high memory pressure. To avoid deadlocks, change these to use GFP_NOWAIT and leave the error handling in place. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-fec.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 504ba3fa328b..e13f90832b6b 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -308,19 +308,14 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) { unsigned n; - if (!fio->rs) { - fio->rs = mempool_alloc(v->fec->rs_pool, 0); - if (unlikely(!fio->rs)) { - DMERR("failed to allocate RS"); - return -ENOMEM; - } - } + if (!fio->rs) + fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO); fec_for_each_prealloc_buffer(n) { if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO); + fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOWAIT); if (unlikely(!fio->bufs[n])) { DMERR("failed to allocate FEC buffer"); return -ENOMEM; @@ -332,22 +327,16 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO); + fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT); /* we can manage with even one buffer if necessary */ if (unlikely(!fio->bufs[n])) break; } fio->nbufs = n; - if (!fio->output) { + if (!fio->output) fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO); - if (!fio->output) { - DMERR("failed to allocate FEC page"); - return -ENOMEM; - } - } - return 0; } -- cgit v1.2.1 From 273752c9ff03eb83856601b2a3458218bb949e46 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 26 Jul 2017 09:35:09 -0400 Subject: dm, dax: Make sure dm_dax_flush() is called if device supports it Currently dm_dax_flush() is not being called, even if underlying dax device supports write cache, because DAXDEV_WRITE_CACHE is not being propagated up to the DM dax device. If the underlying dax device supports write cache, set DAXDEV_WRITE_CACHE on the DM dax device. This will cause dm_dax_flush() to be called. 
Fixes: abebfbe2f7 ("dm: add ->flush() dax operation support") Signed-off-by: Vivek Goyal Acked-by: Dan Williams Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index a39bcd9b982a..28a4071cdf85 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -20,6 +20,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "table" @@ -1630,6 +1631,37 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) return false; } +static int device_dax_write_cache_enabled(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct dax_device *dax_dev = dev->dax_dev; + + if (!dax_dev) + return false; + + if (dax_write_cache_enabled(dax_dev)) + return true; + return false; +} + +static int dm_table_supports_dax_write_cache(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, + device_dax_write_cache_enabled, NULL)) + return true; + } + + return false; +} + static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1785,6 +1817,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); + if (dm_table_supports_dax_write_cache(t)) + dax_write_cache(t->md->dax_dev, true); + /* Ensure that all underlying devices are non-rotational. */ if (dm_table_all_devices_attribute(t, device_is_nonrot)) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); -- cgit v1.2.1 From 33182d15c6bf182f7ae32a66ea4a547d979cd6d7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Aug 2017 16:56:36 +1000 Subject: md: always clear ->safemode when md_check_recovery gets the mddev lock. If ->safemode == 1, md_check_recovery() will try to get the mddev lock and perform various other checks. If mddev->in_sync is zero, it will call set_in_sync, and clear ->safemode. However if mddev->in_sync is not zero, ->safemode will not be cleared. When md_check_recovery() drops the mddev lock, the thread is woken up again. Normally it would just check if there was anything else to do, find nothing, and go to sleep. However as ->safemode was not cleared, it will take the mddev lock again, then wake itself up when unlocking. This results in an infinite loop, repeatedly calling md_check_recovery(), which RCU or the soft-lockup detector will eventually complain about. Prior to commit 4ad23a976413 ("MD: use per-cpu counter for writes_pending"), safemode would only be set to one when the writes_pending counter reached zero, and would be cleared again when writes_pending is incremented. Since that patch, safemode is set more freely, but is not reliably cleared. So in md_check_recovery() clear ->safemode before checking ->in_sync. 
Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending") Cc: stable@vger.kernel.org (4.12+) Reported-by: Dominik Brodowski Reported-by: David R Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index c99634612fc4..d84aceede1cb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8656,6 +8656,9 @@ void md_check_recovery(struct mddev *mddev) if (mddev_trylock(mddev)) { int spares = 0; + if (mddev->safemode == 1) + mddev->safemode = 0; + if (mddev->ro) { struct md_rdev *rdev; if (!mddev->external && mddev->in_sync) -- cgit v1.2.1 From 81fe48e9aa00bdd509bd3c37a76d1132da6b9f09 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Aug 2017 16:56:36 +1000 Subject: md: fix test in md_write_start() md_write_start() needs to clear the in_sync flag if it is set, or if there might be a race with set_in_sync() such that the latter will set it very soon. In the latter case it is sufficient to take the spinlock to synchronize with set_in_sync(), and then set the flag if needed. The current test is incorrect. It should be: if "flag is set" or "race is possible" "flag is set" is trivially "mddev->in_sync". "race is possible" should be tested by "mddev->sync_checkers". If sync_checkers is 0, then there can be no race. set_in_sync() will wait in percpu_ref_switch_to_atomic_sync() for an RCU grace period, and as md_write_start() holds the rcu_read_lock(), set_in_sync() will be sure to see the update to writes_pending. If sync_checkers is > 0, there could be a race. If md_write_start() happened entirely between if (!mddev->in_sync && percpu_ref_is_zero(&mddev->writes_pending)) { and mddev->in_sync = 1; in set_in_sync(), then it would not see that in_sync had been set, and set_in_sync() would not see that writes_pending had been incremented. This bug means that in_sync is sometimes not set when it should be. Consequently there is a small chance that the array will be marked as "clean" when in fact it is inconsistent. Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending") cc: stable@vger.kernel.org (v4.12+) Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index d84aceede1cb..e4ba95f6cd59 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7996,7 +7996,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (mddev->safemode == 1) mddev->safemode = 0; /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ - if (mddev->in_sync || !mddev->sync_checkers) { + if (mddev->in_sync || mddev->sync_checkers) { spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; -- cgit v1.2.1 From b44886c54a999771060371c3a05d5fedfc7e2102 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 31 Jul 2017 14:52:26 -0700 Subject: md/r5cache: call mddev_lock/unlock() in r5c_journal_mode_set In r5c_journal_mode_set(), it is necessary to call mddev_lock() before accessing conf and conf->log. Otherwise, the conf->log may change (and become NULL). 
Shaohua: fix unlock in failure cases Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index bfa1e907c472..ce98414a6e34 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2540,23 +2540,32 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) */ int r5c_journal_mode_set(struct mddev *mddev, int mode) { - struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; - - if (!log) - return -ENODEV; + struct r5conf *conf; + int err; if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || mode > R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf || !conf->log) { + mddev_unlock(mddev); + return -ENODEV; + } + if (raid5_calc_degraded(conf) > 0 && - mode == R5C_JOURNAL_MODE_WRITE_BACK) + mode == R5C_JOURNAL_MODE_WRITE_BACK) { + mddev_unlock(mddev); return -EINVAL; + } mddev_suspend(mddev); conf->log->r5c_journal_mode = mode; mddev_resume(mddev); + mddev_unlock(mddev); pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", mdname(mddev), mode, r5c_journal_mode_str[mode]); -- cgit v1.2.1 From a9501d742127e613d744e29814e9532bacb147e8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Aug 2017 10:03:17 -0700 Subject: md/r5cache: fix io_unit handling in r5l_log_endio() In r5l_log_endio(), once log->io_list_lock is released, the io unit may be accessed (or even freed) by other threads. The current code doesn't handle the io_unit properly, which leads to potential race conditions. This patch solves this race condition by: 1. Add a pending_stripe count for flush_payload. Multiple flush_payloads are counted as only one pending_stripe. Flag has_flush_payload is added to show whether the io unit has a flush_payload; 2. In r5l_log_endio(), check the flags has_null_flush and has_flush_payload with log->io_list_lock held. After the lock is released, this IO unit is only accessed when we know the pending_stripe counter cannot be zeroed by other threads. 
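Condensed, the pattern the fix applies looks like this (simplified from the diff below):

	/* snapshot the fields while the lock still pins the io_unit */
	spin_lock_irqsave(&log->io_list_lock, flags);
	has_null_flush = io->has_null_flush;
	has_flush_payload = io->has_flush_payload;
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	/* after unlock, pending_stripe is the ownership count: only the
	 * thread that drops it to zero may finish (and free) the io_unit;
	 * everyone else must stop dereferencing 'io' once it decrements */
	if (atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);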
Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index ce98414a6e34..2dcbafa8e66c 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -236,9 +236,10 @@ struct r5l_io_unit { bool need_split_bio; struct bio *split_bio; - unsigned int has_flush:1; /* include flush request */ - unsigned int has_fua:1; /* include fua request */ - unsigned int has_null_flush:1; /* include empty flush request */ + unsigned int has_flush:1; /* include flush request */ + unsigned int has_fua:1; /* include fua request */ + unsigned int has_null_flush:1; /* include null flush request */ + unsigned int has_flush_payload:1; /* include flush payload */ /* * io isn't sent yet, flush/fua request can only be submitted till it's * the first IO in running_ios list @@ -571,6 +572,8 @@ static void r5l_log_endio(struct bio *bio) struct r5l_io_unit *io_deferred; struct r5l_log *log = io->log; unsigned long flags; + bool has_null_flush; + bool has_flush_payload; if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); @@ -580,6 +583,16 @@ static void r5l_log_endio(struct bio *bio) spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); + + /* + * if the io doesn't not have null_flush or flush payload, + * it is not safe to access it after releasing io_list_lock. + * Therefore, it is necessary to check the condition with + * the lock held. + */ + has_null_flush = io->has_null_flush; + has_flush_payload = io->has_flush_payload; + if (log->need_cache_flush && !list_empty(&io->stripe_list)) r5l_move_to_end_ios(log); else @@ -600,19 +613,23 @@ static void r5l_log_endio(struct bio *bio) if (log->need_cache_flush) md_wakeup_thread(log->rdev->mddev->thread); - if (io->has_null_flush) { + /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ + if (has_null_flush) { struct bio *bi; WARN_ON(bio_list_empty(&io->flush_barriers)); while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { bio_endio(bi); - atomic_dec(&io->pending_stripe); + if (atomic_dec_and_test(&io->pending_stripe)) { + __r5l_stripe_write_finished(io); + return; + } } } - - /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ - if (atomic_read(&io->pending_stripe) == 0) - __r5l_stripe_write_finished(io); + /* decrease pending_stripe for flush payload */ + if (has_flush_payload) + if (atomic_dec_and_test(&io->pending_stripe)) + __r5l_stripe_write_finished(io); } static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) @@ -881,6 +898,11 @@ static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) payload->size = cpu_to_le32(sizeof(__le64)); payload->flush_stripes[0] = cpu_to_le64(sect); io->meta_offset += meta_size; + /* multiple flush payloads count as one pending_stripe */ + if (!io->has_flush_payload) { + io->has_flush_payload = 1; + atomic_inc(&io->pending_stripe); + } mutex_unlock(&log->io_mutex); } -- cgit v1.2.1 From afc1f55ca44e257f69da8f43e0714a76686ae8d1 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 11 Aug 2017 20:34:45 -0700 Subject: MD: not clear ->safemode for external metadata array ->safemode should be triggered by mdadm for an external metadata array, otherwise the array's state confuses mdadm. Fixes: 33182d15c6bf(md: always clear ->safemode when md_check_recovery gets the mddev lock.) 
From afc1f55ca44e257f69da8f43e0714a76686ae8d1 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 11 Aug 2017 20:34:45 -0700
Subject: MD: not clear ->safemode for external metadata array

->safemode should be triggered by mdadm for an external metadata
array; otherwise the array's state confuses mdadm.

Fixes: 33182d15c6bf ("md: always clear ->safemode when md_check_recovery gets the mddev lock.")
Cc: NeilBrown
Signed-off-by: Shaohua Li
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e4ba95f6cd59..b01e458d31e9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8656,7 +8656,7 @@ void md_check_recovery(struct mddev *mddev)
 	if (mddev_trylock(mddev)) {
 		int spares = 0;
 
-		if (mddev->safemode == 1)
+		if (!mddev->external && mddev->safemode == 1)
 			mddev->safemode = 0;
 
 		if (mddev->ro) {
-- cgit v1.2.1


From 54385bf75cc6451f29f3a149582584d5015d2c98 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 9 Aug 2017 11:32:10 -0700
Subject: dm: fix the second dec_pending() argument in __split_and_process_bio()

Detected by sparse.

Fixes: 4e4cbee93d56 ("block: switch bios to blk_status_t")
Signed-off-by: Bart Van Assche
Reviewed-by: Christoph Hellwig
Tested-by: Laurence Oberman
Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2edbcc2d7d3f..8298670757e9 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1523,7 +1523,7 @@ static void __split_and_process_bio(struct mapped_device *md,
 	}
 
 	/* drop the extra reference count */
-	dec_pending(ci.io, error);
+	dec_pending(ci.io, errno_to_blk_status(error));
 }
 /*-----------------------------------------------------------------
  * CRUD END
-- cgit v1.2.1


From 68515cc72191f5c5d86f6ef7091dc8ba714d96e4 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 9 Aug 2017 11:32:15 -0700
Subject: dm mpath: retry BLK_STS_RESOURCE errors

Retry requests instead of failing them if an out-of-memory error
occurs or the block driver below dm-mpath is busy. This restores the
v4.12 behavior of noretry_error(), namely that -ENOMEM results in a
retry.

Fixes: 2a842acab109 ("block: introduce new block status code type")
Signed-off-by: Bart Van Assche
Reviewed-by: Christoph Hellwig
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-mpath.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 0e8ab5bb3575..3486c829fa50 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1458,7 +1458,6 @@ static int noretry_error(blk_status_t error)
 	case BLK_STS_TARGET:
 	case BLK_STS_NEXUS:
 	case BLK_STS_MEDIUM:
-	case BLK_STS_RESOURCE:
 		return 1;
 	}
 
-- cgit v1.2.1


From 604407890ecf624c2fb41013c82b22aade59b455 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 9 Aug 2017 11:32:11 -0700
Subject: dm: fix printk() rate limiting code

Using the same rate-limiting state for different kinds of messages is
wrong because a high-frequency message can then suppress a report of a
low-frequency message. Hence use a unique rate-limiting state per
message type.

Fixes: 71a16736a15e ("dm: use local printk ratelimit")
Cc: stable@vger.kernel.org
Signed-off-by: Bart Van Assche
Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8298670757e9..d669fddd9290 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -27,16 +27,6 @@
 
 #define DM_MSG_PREFIX "core"
 
-#ifdef CONFIG_PRINTK
-/*
- * ratelimit state to be used in DMXXX_LIMIT().
- */
-DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
-		       DEFAULT_RATELIMIT_INTERVAL,
-		       DEFAULT_RATELIMIT_BURST);
-EXPORT_SYMBOL(dm_ratelimit_state);
-#endif
-
 /*
  * Cookies are numeric values sent with CHANGE and REMOVE
  * uevents while resuming, removing or renaming the device.
-- cgit v1.2.1
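The scheme this fix moves to can be sketched in plain C. This is not the kernel implementation (there it is DEFINE_RATELIMIT_STATE()/__ratelimit() from include/linux/ratelimit.h, with a per-macro state behind DMERR_LIMIT() and friends); rl_state, rl_allow and LOG_LIMIT below are invented for illustration. The essential move is the static variable inside the macro body: every textual expansion site gets its own state, so a chatty message can no longer exhaust the budget of a rare one.

#include <stdio.h>
#include <time.h>

struct rl_state {
	time_t window_start;
	int printed;
};

/* allow at most 'burst' messages per 'interval' seconds */
static int rl_allow(struct rl_state *rs, int burst, int interval)
{
	time_t now = time(NULL);

	if (now - rs->window_start >= interval) {
		rs->window_start = now;	/* new window: reset the budget */
		rs->printed = 0;
	}
	return rs->printed++ < burst;
}

/* one static state per expansion site, hence one budget per message type;
 * ##__VA_ARGS__ is the GNU extension the kernel itself relies on */
#define LOG_LIMIT(fmt, ...) do {				\
	static struct rl_state _rs;				\
	if (rl_allow(&_rs, 10, 5))				\
		fprintf(stderr, fmt "\n", ##__VA_ARGS__);	\
} while (0)

int main(void)
{
	for (int i = 0; i < 1000; i++) {
		LOG_LIMIT("noisy event %d", i);
		if (i == 500)
			LOG_LIMIT("rare event");	/* separate budget: not suppressed */
	}
	return 0;
}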
From 1c23484c355ec360ca2f37914f8a4802c6baeead Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 9 Aug 2017 11:32:12 -0700
Subject: dm mpath: do not lock up a CPU with requeuing activity

When using the block layer in single queue mode, get_request()
returns ERR_PTR(-EAGAIN) if the queue is dying and the REQ_NOWAIT
flag has been passed to get_request(). Avoid soft lockup complaints
from the kernel in this case due to continuous requeuing activity.

Fixes: 7083abbbf ("dm mpath: avoid that path removal can trigger an infinite loop")
Cc: stable@vger.kernel.org
Signed-off-by: Bart Van Assche
Tested-by: Laurence Oberman
Reviewed-by: Christoph Hellwig
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-mpath.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 3486c829fa50..d24e4b05f5da 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -504,7 +504,6 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
 	if (queue_dying) {
 		atomic_inc(&m->pg_init_in_progress);
 		activate_or_offline_path(pgpath);
-		return DM_MAPIO_REQUEUE;
 	}
 	return DM_MAPIO_DELAY_REQUEUE;
 }
-- cgit v1.2.1
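The difference between an immediate and a delayed requeue can be sketched in userspace C, illustrative only: try_submit and its -EAGAIN contract are invented, and in the kernel the delay comes from the block layer honoring DM_MAPIO_DELAY_REQUEUE, not from the target sleeping.

#include <errno.h>
#include <time.h>

/* pretend submission path: "resource not ready" for the first few tries */
static int try_submit(void)
{
	static int attempts;

	return ++attempts < 5 ? -EAGAIN : 0;
}

static void requeue_until_ready(void)
{
	/* 100 ms back-off between attempts */
	const struct timespec delay = { .tv_sec = 0, .tv_nsec = 100000000L };

	while (try_submit() == -EAGAIN) {
		/*
		 * Retrying immediately here, with no sleep, would spin a
		 * CPU at 100% for as long as the queue stays unusable:
		 * the soft-lockup scenario the patch above avoids.
		 */
		nanosleep(&delay, NULL);	/* the "delay" in DELAY_REQUEUE */
	}
}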