author    Jiri Kosina <jkosina@suse.cz>  2016-04-18 11:18:55 +0200
committer Jiri Kosina <jkosina@suse.cz>  2016-04-18 11:18:55 +0200
commit    9938b04472d5c59f8bd8152a548533a8599596a2
tree      0fc8318100878c5e446076613ec02a97aa179119 /block
parent    bd7ced98812dbb906950d8b0ec786f14f631cede
parent    c3b46c73264b03000d1e18b22f5caf63332547c9
Merge branch 'master' into for-next

Sync with Linus' tree so that patches against newer codebase can be applied.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig             |  13
-rw-r--r--  block/Makefile            |   4
-rw-r--r--  block/badblocks.c         | 585
-rw-r--r--  block/bio-integrity.c     |  13
-rw-r--r--  block/bio.c               | 101
-rw-r--r--  block/blk-cgroup.c        |  15
-rw-r--r--  block/blk-core.c          | 196
-rw-r--r--  block/blk-ioc.c           |   2
-rw-r--r--  block/blk-iopoll.c        | 224
-rw-r--r--  block/blk-map.c           |  91
-rw-r--r--  block/blk-merge.c         |  77
-rw-r--r--  block/blk-mq-cpumap.c     |   2
-rw-r--r--  block/blk-mq-sysfs.c      |  25
-rw-r--r--  block/blk-mq-tag.c        |  11
-rw-r--r--  block/blk-mq.c            | 312
-rw-r--r--  block/blk-mq.h            |  13
-rw-r--r--  block/blk-settings.c      |  46
-rw-r--r--  block/blk-sysfs.c         |  51
-rw-r--r--  block/blk-timeout.c       |  25
-rw-r--r--  block/blk.h               |   4
-rw-r--r--  block/cfq-iosched.c       |  45
-rw-r--r--  block/compat_ioctl.c      |   4
-rw-r--r--  block/deadline-iosched.c  |   3
-rw-r--r--  block/genhd.c             |  32
-rw-r--r--  block/ioctl.c             |  37
-rw-r--r--  block/ioprio.c            |   6
-rw-r--r--  block/noop-iosched.c      |  10
-rw-r--r--  block/partition-generic.c |  48
-rw-r--r--  block/partitions/mac.c    |  10
-rw-r--r--  block/scsi_ioctl.c        |   6
30 files changed, 1402 insertions(+), 609 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index 161491d0a879..0363cd731320 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -88,6 +88,19 @@ config BLK_DEV_INTEGRITY
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
+config BLK_DEV_DAX
+ bool "Block device DAX support"
+ depends on FS_DAX
+ depends on BROKEN
+ help
+ When DAX support is available (CONFIG_FS_DAX) raw block
+ devices can also support direct userspace access to the
+ storage capacity via mmap(2), similar to a file on a
+ DAX-enabled filesystem. However, the DAX I/O path disables
+ some standard I/O statistics, and the mmap(2) path has some
+ operational differences due to bypassing the page
+ cache. If in doubt, say N.
+
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
depends on BLK_CGROUP=y
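The BLK_DEV_DAX help text above describes mapping a raw block device directly into user space with mmap(2), bypassing the page cache. A minimal user-space sketch of that access pattern; /dev/pmem0 is a hypothetical DAX-capable device and the single-page mapping is illustrative only:

/* Sketch: load/store access to a DAX block device via mmap(2).
 * Assumes a hypothetical DAX-capable device at /dev/pmem0; error
 * handling is minimal.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/pmem0", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* With DAX, this mapping reaches the storage directly,
	 * bypassing the page cache. */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	memcpy(p, "hello", 5);		/* store goes to the device */
	msync(p, 4096, MS_SYNC);

	munmap(p, 4096);
	close(fd);
	return 0;
}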
diff --git a/block/Makefile b/block/Makefile
index 00ecc97629db..9eda2322b2d4 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,10 +5,10 @@
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
- partitions/
+ badblocks.o partitions/
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
diff --git a/block/badblocks.c b/block/badblocks.c
new file mode 100644
index 000000000000..7be53cb1cc3c
--- /dev/null
+++ b/block/badblocks.c
@@ -0,0 +1,585 @@
+/*
+ * Bad block management
+ *
+ * - Heavily based on MD badblocks code from Neil Brown
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/badblocks.h>
+#include <linux/seqlock.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+/**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: sector (start) at which to check for badblocks
+ * @sectors: number of sectors to check for badblocks
+ * @first_bad: pointer to store location of the first badblock
+ * @bad_sectors: pointer to store number of badblocks after @first_bad
+ *
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64 bits wide. This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ * A 'shift' can be set so that larger blocks are tracked and
+ * consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so badblocks_check
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad. So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ *
+ * Return:
+ * 0: there are no known bad blocks in the range
+ * 1: there are known bad blocks which are all acknowledged
+ * -1: there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ int hi;
+ int lo;
+ u64 *p = bb->page;
+ int rv;
+ sector_t target = s + sectors;
+ unsigned seq;
+
+ if (bb->shift > 0) {
+ /* round the start down, and the end up */
+ s >>= bb->shift;
+ target += (1<<bb->shift) - 1;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+ /* 'target' is now the first block after the bad range */
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+ lo = 0;
+ rv = 0;
+ hi = bb->count;
+
+ /* Binary search between lo and hi for 'target'
+ * i.e. for the last range that starts before 'target'
+ */
+ /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+ * are known not to be the last range before target.
+ * VARIANT: hi-lo is the number of possible
+ * ranges, and decreases until it reaches 1
+ */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a < target)
+ /* This could still be the one, earlier ranges
+ * could not.
+ */
+ lo = mid;
+ else
+ /* This and later ranges are definitely out. */
+ hi = mid;
+ }
+ /* 'lo' might be the last that started before target, but 'hi' isn't */
+ if (hi > lo) {
+ /* need to check all ranges that end after 's' to see if
+ * any are unacknowledged.
+ */
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ if (BB_OFFSET(p[lo]) < target) {
+ /* starts before the end, and finishes after
+ * the start, so they must overlap
+ */
+ if (rv != -1 && BB_ACK(p[lo]))
+ rv = 1;
+ else
+ rv = -1;
+ *first_bad = BB_OFFSET(p[lo]);
+ *bad_sectors = BB_LEN(p[lo]);
+ }
+ lo--;
+ }
+ }
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_check);
+
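The kernel-doc above describes the packed 64-bit table entry: 9 bits holding (length - 1), a 54-bit start sector, and the acknowledged flag in the most significant bit. A user-space sketch of that encoding; the macro bodies are meant to mirror the BB_* helpers in include/linux/badblocks.h, and the sector numbers are arbitrary:

/* Sketch of the 64-bit bad-block entry layout described above:
 * bits 0-8 store (length - 1), bits 9-62 the start sector, bit 63
 * the 'acknowledged' flag.
 */
#include <stdint.h>
#include <stdio.h>

#define BB_LEN_MASK	0x00000000000001FFULL
#define BB_OFFSET_MASK	0x7FFFFFFFFFFFFE00ULL
#define BB_ACK_MASK	0x8000000000000000ULL
#define BB_MAX_LEN	512

#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) \
	(((uint64_t)(a) << 9) | ((l) - 1) | ((uint64_t)(!!(ack)) << 63))

int main(void)
{
	/* Record 8 bad sectors starting at sector 123456, acknowledged. */
	uint64_t e = BB_MAKE(123456ULL, 8, 1);

	printf("start=%llu len=%llu ack=%d\n",
	       (unsigned long long)BB_OFFSET(e),
	       (unsigned long long)BB_LEN(e),
	       BB_ACK(e));	/* prints: start=123456 len=8 ack=1 */
	return 0;
}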
+/**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ * @acknowledged: whether to mark the bad sectors as acknowledged
+ *
+ * This might extend the table, or might contract it if two adjacent ranges
+ * can be merged. We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to set badblocks (out of space)
+ */
+int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+{
+ u64 *p;
+ int lo, hi;
+ int rv = 0;
+ unsigned long flags;
+
+ if (bb->shift < 0)
+ /* badblocks are disabled */
+ return 0;
+
+ if (bb->shift) {
+ /* round the start down, and the end up */
+ sector_t next = s + sectors;
+
+ s >>= bb->shift;
+ next += (1<<bb->shift) - 1;
+ next >>= bb->shift;
+ sectors = next - s;
+ }
+
+ write_seqlock_irqsave(&bb->lock, flags);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts at-or-before 's' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a <= s)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo && BB_OFFSET(p[lo]) > s)
+ hi = lo;
+
+ if (hi > lo) {
+ /* we found a range that might merge with the start
+ * of our new range
+ */
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t e = a + BB_LEN(p[lo]);
+ int ack = BB_ACK(p[lo]);
+
+ if (e >= s) {
+ /* Yes, we can merge with a previous range */
+ if (s == a && s + sectors >= e)
+ /* new range covers old */
+ ack = acknowledged;
+ else
+ ack = ack && acknowledged;
+
+ if (e < s + sectors)
+ e = s + sectors;
+ if (e - a <= BB_MAX_LEN) {
+ p[lo] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ /* does not all fit in one range,
+ * make p[lo] maximal
+ */
+ if (BB_LEN(p[lo]) != BB_MAX_LEN)
+ p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ }
+ }
+ if (sectors && hi < bb->count) {
+ /* 'hi' points to the first range that starts after 's'.
+ * Maybe we can merge with the start of that range
+ */
+ sector_t a = BB_OFFSET(p[hi]);
+ sector_t e = a + BB_LEN(p[hi]);
+ int ack = BB_ACK(p[hi]);
+
+ if (a <= s + sectors) {
+ /* merging is possible */
+ if (e <= s + sectors) {
+ /* full overlap */
+ e = s + sectors;
+ ack = acknowledged;
+ } else
+ ack = ack && acknowledged;
+
+ a = s;
+ if (e - a <= BB_MAX_LEN) {
+ p[hi] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ lo = hi;
+ hi++;
+ }
+ }
+ if (sectors == 0 && hi < bb->count) {
+ /* we might be able to combine lo and hi */
+ /* Note: 's' is at the end of 'lo' */
+ sector_t a = BB_OFFSET(p[hi]);
+ int lolen = BB_LEN(p[lo]);
+ int hilen = BB_LEN(p[hi]);
+ int newlen = lolen + hilen - (s - a);
+
+ if (s >= a && newlen < BB_MAX_LEN) {
+ /* yes, we can combine them */
+ int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+
+ p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+ memmove(p + hi, p + hi + 1,
+ (bb->count - hi - 1) * 8);
+ bb->count--;
+ }
+ }
+ while (sectors) {
+ /* didn't merge (it all).
+ * Need to add a range just before 'hi'
+ */
+ if (bb->count >= MAX_BADBLOCKS) {
+ /* No room for more */
+ rv = 1;
+ break;
+ } else {
+ int this_sectors = sectors;
+
+ memmove(p + hi + 1, p + hi,
+ (bb->count - hi) * 8);
+ bb->count++;
+
+ if (this_sectors > BB_MAX_LEN)
+ this_sectors = BB_MAX_LEN;
+ p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+ sectors -= this_sectors;
+ s += this_sectors;
+ }
+ }
+
+ bb->changed = 1;
+ if (!acknowledged)
+ bb->unacked_exist = 1;
+ write_sequnlock_irqrestore(&bb->lock, flags);
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_set);
+
+/**
+ * badblocks_clear() - Remove a range of bad blocks from the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to clear
+ * @sectors: number of sectors to clear
+ *
+ * This may involve extending the table if we split a region,
+ * but it must not fail. So if the table becomes full, we just
+ * drop the remove request.
+ *
+ * Return:
+ * 0: success
+ * -ENOSPC: failed to clear badblocks (no room to split an existing range)
+ */
+int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+ u64 *p;
+ int lo, hi;
+ sector_t target = s + sectors;
+ int rv = 0;
+
+ if (bb->shift > 0) {
+ /* When clearing we round the start up and the end down.
+ * This should not matter as the shift should align with
+ * the block size and no rounding should ever be needed.
+ * However it is better to think a block is bad when it
+ * isn't than to think a block is not bad when it is.
+ */
+ s += (1<<bb->shift) - 1;
+ s >>= bb->shift;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+
+ write_seqlock_irq(&bb->lock);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts before 'target' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a < target)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo) {
+ /* p[lo] is the last range that could overlap the
+ * current range. Earlier ranges could also overlap,
+ * but only this one can overlap the end of the range.
+ */
+ if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
+ /* Partial overlap, leave the tail of this range */
+ int ack = BB_ACK(p[lo]);
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t end = a + BB_LEN(p[lo]);
+
+ if (a < s) {
+ /* we need to split this range */
+ if (bb->count >= MAX_BADBLOCKS) {
+ rv = -ENOSPC;
+ goto out;
+ }
+ memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+ bb->count++;
+ p[lo] = BB_MAKE(a, s-a, ack);
+ lo++;
+ }
+ p[lo] = BB_MAKE(target, end - target, ack);
+ /* there is no longer an overlap */
+ hi = lo;
+ lo--;
+ }
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ /* This range does overlap */
+ if (BB_OFFSET(p[lo]) < s) {
+ /* Keep the early parts of this range. */
+ int ack = BB_ACK(p[lo]);
+ sector_t start = BB_OFFSET(p[lo]);
+
+ p[lo] = BB_MAKE(start, s - start, ack);
+ /* now 'lo' doesn't overlap, so.. */
+ break;
+ }
+ lo--;
+ }
+ /* 'lo' is strictly before, 'hi' is strictly after,
+ * anything between needs to be discarded
+ */
+ if (hi - lo > 1) {
+ memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+ bb->count -= (hi - lo - 1);
+ }
+ }
+
+ bb->changed = 1;
+out:
+ write_sequnlock_irq(&bb->lock);
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_clear);
+
+/**
+ * ack_all_badblocks() - Acknowledge all bad blocks in a list.
+ * @bb: the badblocks structure that holds all badblock information
+ *
+ * This only succeeds if ->changed is clear. It is used by
+ * in-kernel metadata updates.
+ */
+void ack_all_badblocks(struct badblocks *bb)
+{
+ if (bb->page == NULL || bb->changed)
+ /* no point even trying */
+ return;
+ write_seqlock_irq(&bb->lock);
+
+ if (bb->changed == 0 && bb->unacked_exist) {
+ u64 *p = bb->page;
+ int i;
+
+ for (i = 0; i < bb->count ; i++) {
+ if (!BB_ACK(p[i])) {
+ sector_t start = BB_OFFSET(p[i]);
+ int len = BB_LEN(p[i]);
+
+ p[i] = BB_MAKE(start, len, 1);
+ }
+ }
+ bb->unacked_exist = 0;
+ }
+ write_sequnlock_irq(&bb->lock);
+}
+EXPORT_SYMBOL_GPL(ack_all_badblocks);
+
+/**
+ * badblocks_show() - sysfs access to bad-blocks list
+ * @bb: the badblocks structure that holds all badblock information
+ * @page: buffer received from sysfs
+ * @unack: whether to show unacknowledged badblocks
+ *
+ * Return:
+ * Length of returned data
+ */
+ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
+{
+ size_t len;
+ int i;
+ u64 *p = bb->page;
+ unsigned seq;
+
+ if (bb->shift < 0)
+ return 0;
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ len = 0;
+ i = 0;
+
+ while (len < PAGE_SIZE && i < bb->count) {
+ sector_t s = BB_OFFSET(p[i]);
+ unsigned int length = BB_LEN(p[i]);
+ int ack = BB_ACK(p[i]);
+
+ i++;
+
+ if (unack && ack)
+ continue;
+
+ len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
+ (unsigned long long)s << bb->shift,
+ length << bb->shift);
+ }
+ if (unack && len == 0)
+ bb->unacked_exist = 0;
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(badblocks_show);
+
+/**
+ * badblocks_store() - sysfs access to bad-blocks list
+ * @bb: the badblocks structure that holds all badblock information
+ * @page: buffer received from sysfs
+ * @len: length of data received from sysfs
+ * @unack: whether to store the badblocks as unacknowledged
+ *
+ * Return:
+ * Length of the buffer processed, or a negative errno on error.
+ */
+ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
+ int unack)
+{
+ unsigned long long sector;
+ int length;
+ char newline;
+
+ switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
+ case 3:
+ if (newline != '\n')
+ return -EINVAL;
+ /* fall through */
+ case 2:
+ if (length <= 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (badblocks_set(bb, sector, length, !unack))
+ return -ENOSPC;
+ else
+ return len;
+}
+EXPORT_SYMBOL_GPL(badblocks_store);
+
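badblocks_store() above accepts "sector length" with an optional trailing newline and rejects anything else. A user-space sketch of the same sscanf() discipline; parse_badblocks() is a hypothetical stand-in for the kernel function:

/* Sketch of the parse rule badblocks_store() applies to sysfs input:
 * "%llu %d%c" must match either two fields, or three fields with the
 * third being the trailing newline.
 */
#include <stdio.h>

static int parse_badblocks(const char *page,
			   unsigned long long *sector, int *length)
{
	char newline;

	switch (sscanf(page, "%llu %d%c", sector, length, &newline)) {
	case 3:
		if (newline != '\n')
			return -1;	/* trailing garbage */
		/* fall through */
	case 2:
		if (*length <= 0)
			return -1;
		return 0;
	default:
		return -1;
	}
}

int main(void)
{
	unsigned long long s;
	int len;

	printf("%d\n", parse_badblocks("123456 8\n", &s, &len)); /* 0 */
	printf("%d\n", parse_badblocks("123456 8x", &s, &len));  /* -1 */
	return 0;
}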
+static int __badblocks_init(struct device *dev, struct badblocks *bb,
+ int enable)
+{
+ bb->dev = dev;
+ bb->count = 0;
+ if (enable)
+ bb->shift = 0;
+ else
+ bb->shift = -1;
+ if (dev)
+ bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
+ else
+ bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!bb->page) {
+ bb->shift = -1;
+ return -ENOMEM;
+ }
+ seqlock_init(&bb->lock);
+
+ return 0;
+}
+
+/**
+ * badblocks_init() - initialize the badblocks structure
+ * @bb: the badblocks structure that holds all badblock information
+ * @enable: whether to enable badblocks accounting
+ *
+ * Return:
+ * 0: success
+ * negative errno: on error
+ */
+int badblocks_init(struct badblocks *bb, int enable)
+{
+ return __badblocks_init(NULL, bb, enable);
+}
+EXPORT_SYMBOL_GPL(badblocks_init);
+
+int devm_init_badblocks(struct device *dev, struct badblocks *bb)
+{
+ if (!bb)
+ return -EINVAL;
+ return __badblocks_init(dev, bb, 1);
+}
+EXPORT_SYMBOL_GPL(devm_init_badblocks);
+
+/**
+ * badblocks_exit() - free the badblocks structure
+ * @bb: the badblocks structure that holds all badblock information
+ */
+void badblocks_exit(struct badblocks *bb)
+{
+ if (!bb)
+ return;
+ if (bb->dev)
+ devm_kfree(bb->dev, bb->page);
+ else
+ kfree(bb->page);
+ bb->page = NULL;
+}
+EXPORT_SYMBOL_GPL(badblocks_exit);
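Taken together, the exports in this new file form a small init/use/exit API. A hedged kernel-C sketch of how a block driver might drive it; the function names, device pointer, and sector numbers are illustrative, not taken from this patch:

/* Hedged sketch of a driver using the badblocks API added above.
 * my_dev_setup(), my_dev_check() and the sector numbers are
 * illustrative only.
 */
#include <linux/badblocks.h>
#include <linux/device.h>
#include <linux/errno.h>

static struct badblocks bb;

static int my_dev_setup(struct device *dev)
{
	int ret;

	/* devm variant: bb.page is freed automatically with @dev */
	ret = devm_init_badblocks(dev, &bb);
	if (ret)
		return ret;

	/* record 8 unacknowledged bad sectors starting at 123456 */
	if (badblocks_set(&bb, 123456, 8, 0))
		return -ENOSPC;
	return 0;
}

static int my_dev_check(sector_t s, int sectors)
{
	sector_t first_bad;
	int bad_sectors;

	/* 0: clean, 1: all acknowledged, -1: unacknowledged */
	return badblocks_check(&bb, s, sectors, &first_bad, &bad_sectors);
}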
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index f6325d573c10..711e4d8de6fa 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
}
if (unlikely(!bip))
- return NULL;
+ return ERR_PTR(-ENOMEM);
memset(bip, 0, sizeof(*bip));
@@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
return bip;
err:
mempool_free(bip, bs->bio_integrity_pool);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(bio_integrity_alloc);
@@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio)
/* Allocate bio integrity payload and integrity vectors */
bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
- if (unlikely(bip == NULL)) {
+ if (IS_ERR(bip)) {
printk(KERN_ERR "could not allocate data integrity bioset\n");
kfree(buf);
- return -EIO;
+ return PTR_ERR(bip);
}
bip->bip_flags |= BIP_BLOCK_INTEGRITY;
@@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
BUG_ON(bip_src == NULL);
bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
-
- if (bip == NULL)
- return -EIO;
+ if (IS_ERR(bip))
+ return PTR_ERR(bip);
memcpy(bip->bip_vec, bip_src->bip_vec,
bip_src->bip_vcnt * sizeof(struct bio_vec));
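The hunk above switches bio_integrity_alloc() from returning NULL to the kernel's ERR_PTR() convention, so callers learn why the allocation failed instead of assuming -EIO. A simplified user-space rendering of the idiom; the constants follow include/linux/err.h, where small negative errno values are encoded at the top of the pointer range:

/* Simplified rendering of the ERR_PTR()/IS_ERR()/PTR_ERR() idiom used
 * in the bio-integrity conversion above.
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int thing;

static void *alloc_thing(int fail)
{
	if (fail)
		return ERR_PTR(-ENOMEM);	/* not NULL: carries why */
	return &thing;
}

int main(void)
{
	void *p = alloc_thing(1);

	if (IS_ERR(p))
		printf("failed: %ld\n", PTR_ERR(p));	/* failed: -12 */
	return 0;
}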
diff --git a/block/bio.c b/block/bio.c
index ad3f276d74bc..807d25e466ec 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -211,7 +211,7 @@ fallback:
bvl = mempool_alloc(pool, gfp_mask);
} else {
struct biovec_slab *bvs = bvec_slabs + *idx;
- gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
+ gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
/*
* Make this allocation restricted and don't dump info on
@@ -221,11 +221,11 @@ fallback:
__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
/*
- * Try a slab allocation. If this fails and __GFP_WAIT
+ * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
* is set, retry with the 1-entry mempool
*/
bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
- if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
+ if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
*idx = BIOVEC_MAX_IDX;
goto fallback;
}
@@ -296,13 +296,19 @@ void bio_reset(struct bio *bio)
}
EXPORT_SYMBOL(bio_reset);
-static void bio_chain_endio(struct bio *bio)
+static struct bio *__bio_chain_endio(struct bio *bio)
{
struct bio *parent = bio->bi_private;
- parent->bi_error = bio->bi_error;
- bio_endio(parent);
+ if (!parent->bi_error)
+ parent->bi_error = bio->bi_error;
bio_put(bio);
+ return parent;
+}
+
+static void bio_chain_endio(struct bio *bio)
+{
+ bio_endio(__bio_chain_endio(bio));
}
/*
@@ -395,12 +401,12 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
* If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
* backed by the @bs's mempool.
*
- * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
- * able to allocate a bio. This is due to the mempool guarantees. To make this
- * work, callers must never allocate more than 1 bio at a time from this pool.
- * Callers that need to allocate more than 1 bio must always submit the
- * previously allocated bio for IO before attempting to allocate a new one.
- * Failure to do so can cause deadlocks under memory pressure.
+ * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
+ * always be able to allocate a bio. This is due to the mempool guarantees.
+ * To make this work, callers must never allocate more than 1 bio at a time
+ * from this pool. Callers that need to allocate more than 1 bio must always
+ * submit the previously allocated bio for IO before attempting to allocate
+ * a new one. Failure to do so can cause deadlocks under memory pressure.
*
* Note that when running under generic_make_request() (i.e. any block
* driver), bios are not submitted until after you return - see the code in
@@ -459,13 +465,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
* We solve this, and guarantee forward progress, with a rescuer
* workqueue per bio_set. If we go to allocate and there are
* bios on current->bio_list, we first try the allocation
- * without __GFP_WAIT; if that fails, we punt those bios we
- * would be blocking to the rescuer workqueue before we retry
- * with the original gfp_flags.
+ * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
+ * bios we would be blocking to the rescuer workqueue before
+ * we retry with the original gfp_flags.
*/
if (current->bio_list && !bio_list_empty(current->bio_list))
- gfp_mask &= ~__GFP_WAIT;
+ gfp_mask &= ~__GFP_DIRECT_RECLAIM;
p = mempool_alloc(bs->bio_pool, gfp_mask);
if (!p && gfp_mask != saved_gfp) {
@@ -874,7 +880,7 @@ int submit_bio_wait(int rw, struct bio *bio)
bio->bi_private = &ret;
bio->bi_end_io = submit_bio_wait_endio;
submit_bio(rw, bio);
- wait_for_completion(&ret.event);
+ wait_for_completion_io(&ret.event);
return ret.error;
}
@@ -1090,9 +1096,12 @@ int bio_uncopy_user(struct bio *bio)
if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
/*
* if we're in a workqueue, the request is orphaned, so
- * don't copy into a random user address space, just free.
+ * don't copy into a random user address space, just free
+ * and return -EINTR so user space doesn't expect any data.
*/
- if (current->mm && bio_data_dir(bio) == READ)
+ if (!current->mm)
+ ret = -EINTR;
+ else if (bio_data_dir(bio) == READ)
ret = bio_copy_to_iter(bio, bmd->iter);
if (bmd->is_our_pages)
bio_free_pages(bio);
@@ -1125,7 +1134,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
int i, ret;
int nr_pages = 0;
unsigned int len = iter->count;
- unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
+ unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0;
for (i = 0; i < iter->nr_segs; i++) {
unsigned long uaddr;
@@ -1304,7 +1313,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
goto out_unmap;
}
- offset = uaddr & ~PAGE_MASK;
+ offset = offset_in_page(uaddr);
for (j = cur_page; j < page_limit; j++) {
unsigned int bytes = PAGE_SIZE - offset;
@@ -1330,7 +1339,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
* release the pages we didn't map into the bio, if any
*/
while (j < page_limit)
- page_cache_release(pages[j++]);
+ put_page(pages[j++]);
}
kfree(pages);
@@ -1356,7 +1365,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
for (j = 0; j < nr_pages; j++) {
if (!pages[j])
break;
- page_cache_release(pages[j]);
+ put_page(pages[j]);
}
out:
kfree(pages);
@@ -1376,7 +1385,7 @@ static void __bio_unmap_user(struct bio *bio)
if (bio_data_dir(bio) == READ)
set_page_dirty_lock(bvec->bv_page);
- page_cache_release(bvec->bv_page);
+ put_page(bvec->bv_page);
}
bio_put(bio);
@@ -1606,8 +1615,8 @@ static void bio_release_pages(struct bio *bio)
* the BIO and the offending pages and re-dirty the pages in process context.
*
* It is expected that bio_check_pages_dirty() will wholly own the BIO from
- * here on. It will run one page_cache_release() against each page and will
- * run one bio_put() against the BIO.
+ * here on. It will run one put_page() against each page and will run one
+ * bio_put() against the BIO.
*/
static void bio_dirty_fn(struct work_struct *work);
@@ -1649,7 +1658,7 @@ void bio_check_pages_dirty(struct bio *bio)
struct page *page = bvec->bv_page;
if (PageDirty(page) || PageCompound(page)) {
- page_cache_release(page);
+ put_page(page);
bvec->bv_page = NULL;
} else {
nr_clean_pages++;
@@ -1739,29 +1748,25 @@ static inline bool bio_remaining_done(struct bio *bio)
**/
void bio_endio(struct bio *bio)
{
- while (bio) {
- if (unlikely(!bio_remaining_done(bio)))
- break;
+again:
+ if (!bio_remaining_done(bio))
+ return;
- /*
- * Need to have a real endio function for chained bios,
- * otherwise various corner cases will break (like stacking
- * block devices that save/restore bi_end_io) - however, we want
- * to avoid unbounded recursion and blowing the stack. Tail call
- * optimization would handle this, but compiling with frame
- * pointers also disables gcc's sibling call optimization.
- */
- if (bio->bi_end_io == bio_chain_endio) {
- struct bio *parent = bio->bi_private;
- parent->bi_error = bio->bi_error;
- bio_put(bio);
- bio = parent;
- } else {
- if (bio->bi_end_io)
- bio->bi_end_io(bio);
- bio = NULL;
- }
+ /*
+ * Need to have a real endio function for chained bios, otherwise
+ * various corner cases will break (like stacking block devices that
+ * save/restore bi_end_io) - however, we want to avoid unbounded
+ * recursion and blowing the stack. Tail call optimization would
+ * handle this, but compiling with frame pointers also disables
+ * gcc's sibling call optimization.
+ */
+ if (bio->bi_end_io == bio_chain_endio) {
+ bio = __bio_chain_endio(bio);
+ goto again;
}
+
+ if (bio->bi_end_io)
+ bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);
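The bio_endio() rewrite above replaces the open-coded chain unwinding with the __bio_chain_endio() helper and a goto loop, so completing an arbitrarily long chain of parent bios uses constant stack instead of recursion. A toy user-space model of the pattern; struct toy_bio is made up for illustration:

/* Toy model of the pattern used in bio_endio() above: completing a
 * chained node hands back its parent, and a loop (rather than a
 * recursive call) walks up the chain so stack depth stays constant.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_bio {
	struct toy_bio *parent;	/* NULL for the chain head */
	int error;
};

static struct toy_bio *toy_chain_endio(struct toy_bio *bio)
{
	struct toy_bio *parent = bio->parent;

	if (!parent->error)
		parent->error = bio->error;	/* first error wins */
	free(bio);
	return parent;
}

static void toy_endio(struct toy_bio *bio)
{
again:
	if (bio->parent) {
		bio = toy_chain_endio(bio);
		goto again;	/* iterate instead of recursing */
	}
	printf("chain done, error=%d\n", bio->error);
}

int main(void)
{
	struct toy_bio *head = calloc(1, sizeof(*head));
	struct toy_bio *child = calloc(1, sizeof(*child));

	child->parent = head;
	child->error = -5;	/* -EIO */
	toy_endio(child);	/* prints: chain done, error=-5 */
	free(head);
	return 0;
}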
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5bcdfc10c23a..66e6f1aae02e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -788,6 +788,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
{
struct gendisk *disk;
struct blkcg_gq *blkg;
+ struct module *owner;
unsigned int major, minor;
int key_len, part, ret;
char *body;
@@ -804,7 +805,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (!disk)
return -ENODEV;
if (part) {
+ owner = disk->fops->owner;
put_disk(disk);
+ module_put(owner);
return -ENODEV;
}
@@ -820,7 +823,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
ret = PTR_ERR(blkg);
rcu_read_unlock();
spin_unlock_irq(disk->queue->queue_lock);
+ owner = disk->fops->owner;
put_disk(disk);
+ module_put(owner);
/*
* If queue was bypassing, we should retry. Do so after a
* short msleep(). It isn't strictly necessary but queue
@@ -851,9 +856,13 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
__releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
+ struct module *owner;
+
spin_unlock_irq(ctx->disk->queue->queue_lock);
rcu_read_unlock();
+ owner = ctx->disk->fops->owner;
put_disk(ctx->disk);
+ module_put(owner);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
@@ -1127,15 +1136,15 @@ void blkcg_exit_queue(struct request_queue *q)
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
-static int blkcg_can_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static int blkcg_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
+ struct cgroup_subsys_state *dst_css;
struct io_context *ioc;
int ret = 0;
/* task_lock() is needed to avoid races with exit_io_context() */
- cgroup_taskset_for_each(task, tset) {
+ cgroup_taskset_for_each(task, dst_css, tset) {
task_lock(task);
ioc = task->io_context;
if (ioc && atomic_read(&ioc->nr_tasks) > 1)
diff --git a/block/blk-core.c b/block/blk-core.c
index 89eec7965870..b60537b2c35b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -51,7 +51,7 @@ DEFINE_IDA(blk_queue_ida);
/*
* For the allocated request tables
*/
-struct kmem_cache *request_cachep = NULL;
+struct kmem_cache *request_cachep;
/*
* For queue allocation
@@ -207,6 +207,22 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs)
EXPORT_SYMBOL(blk_delay_queue);
/**
+ * blk_start_queue_async - asynchronously restart a previously stopped queue
+ * @q: The &struct request_queue in question
+ *
+ * Description:
+ * blk_start_queue_async() will clear the stop flag on the queue, and
+ * ensure that the request_fn for the queue is run from an async
+ * context.
+ **/
+void blk_start_queue_async(struct request_queue *q)
+{
+ queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+ blk_run_queue_async(q);
+}
+EXPORT_SYMBOL(blk_start_queue_async);
+
+/**
* blk_start_queue - restart a previously stopped queue
* @q: The &struct request_queue in question
*
@@ -630,7 +646,7 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
}
EXPORT_SYMBOL(blk_alloc_queue);
-int blk_queue_enter(struct request_queue *q, gfp_t gfp)
+int blk_queue_enter(struct request_queue *q, bool nowait)
{
while (true) {
int ret;
@@ -638,7 +654,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp)
if (percpu_ref_tryget_live(&q->q_usage_counter))
return 0;
- if (!(gfp & __GFP_WAIT))
+ if (nowait)
return -EBUSY;
ret = wait_event_interruptible(q->mq_freeze_wq,
@@ -664,6 +680,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
wake_up_all(&q->mq_freeze_wq);
}
+static void blk_rq_timed_out_timer(unsigned long data)
+{
+ struct request_queue *q = (struct request_queue *)data;
+
+ kblockd_schedule_work(&q->timeout_work);
+}
+
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
struct request_queue *q;
@@ -683,7 +706,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
goto fail_id;
q->backing_dev_info.ra_pages =
- (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->backing_dev_info.name = "block";
q->node = node_id;
@@ -809,7 +832,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
}
EXPORT_SYMBOL(blk_init_queue_node);
-static void blk_queue_bio(struct request_queue *q, struct bio *bio);
+static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
@@ -825,6 +848,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
goto fail;
+ INIT_WORK(&q->timeout_work, blk_timeout_work);
q->request_fn = rfn;
q->prep_rq_fn = NULL;
q->unprep_rq_fn = NULL;
@@ -1206,8 +1230,8 @@ rq_starved:
* @bio: bio to allocate request for (can be %NULL)
* @gfp_mask: allocation mask
*
- * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this
- * function keeps retrying under memory pressure and fails iff @q is dead.
+ * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
+ * this function keeps retrying under memory pressure and fails iff @q is dead.
*
* Must be called with @q->queue_lock held and,
* Returns ERR_PTR on failure, with @q->queue_lock held.
@@ -1227,7 +1251,7 @@ retry:
if (!IS_ERR(rq))
return rq;
- if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
+ if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
blk_put_rl(rl);
return rq;
}
@@ -1276,7 +1300,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
if (q->mq_ops)
- return blk_mq_alloc_request(q, rw, gfp_mask, false);
+ return blk_mq_alloc_request(q, rw,
+ (gfp_mask & __GFP_DIRECT_RECLAIM) ?
+ 0 : BLK_MQ_REQ_NOWAIT);
else
return blk_old_get_request(q, rw, gfp_mask);
}
@@ -1305,11 +1331,11 @@ EXPORT_SYMBOL(blk_get_request);
* BUG.
*
* WARNING: When allocating/cloning a bio-chain, careful consideration should be
- * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
- * anything but the first bio in the chain. Otherwise you risk waiting for IO
- * completion of a bio that hasn't been submitted yet, thus resulting in a
- * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
- * of bio_alloc(), as that avoids the mempool deadlock.
+ * given to how you allocate bios. In particular, you cannot use
+ * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise
+ * you risk waiting for IO completion of a bio that hasn't been submitted yet,
+ * thus resulting in a deadlock. Alternatively bios should be allocated using
+ * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock.
* If possible a big IO should be split into smaller parts when allocation
* fails. Partial allocation should not be an error, or you risk a live-lock.
*/
@@ -1575,6 +1601,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @request_count: out parameter for number of traversed plugged requests
+ * @same_queue_rq: pointer to &struct request that gets filled in when
+ * another request associated with @q is found on the plug list
+ * (optional, may be %NULL)
*
* Determine whether @bio being queued on @q can be merged with a request
* on %current's plugged list. Returns %true if merge was successful,
@@ -1678,7 +1707,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
blk_rq_bio_prep(req->q, req, bio);
}
-static void blk_queue_bio(struct request_queue *q, struct bio *bio)
+static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
{
const bool sync = !!(bio->bi_rw & REQ_SYNC);
struct blk_plug *plug;
@@ -1686,8 +1715,6 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
struct request *req;
unsigned int request_count = 0;
- blk_queue_split(q, &bio, q->bio_split);
-
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
@@ -1695,10 +1722,12 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
*/
blk_queue_bounce(q, &bio);
+ blk_queue_split(q, &bio, q->bio_split);
+
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio->bi_error = -EIO;
bio_endio(bio);
- return;
+ return BLK_QC_T_NONE;
}
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
@@ -1713,7 +1742,7 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
*/
if (!blk_queue_nomerges(q)) {
if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
- return;
+ return BLK_QC_T_NONE;
} else
request_count = blk_plug_queued_count(q);
@@ -1791,6 +1820,8 @@ get_rq:
out_unlock:
spin_unlock_irq(q->queue_lock);
}
+
+ return BLK_QC_T_NONE;
}
/*
@@ -1996,12 +2027,13 @@ end_io:
* a lower device by calling into generic_make_request recursively, which
* means the bio should NOT be touched after the call to ->make_request_fn.
*/
-void generic_make_request(struct bio *bio)
+blk_qc_t generic_make_request(struct bio *bio)
{
struct bio_list bio_list_on_stack;
+ blk_qc_t ret = BLK_QC_T_NONE;
if (!generic_make_request_checks(bio))
- return;
+ goto out;
/*
* We only want one ->make_request_fn to be active at a time, else
@@ -2015,7 +2047,7 @@ void generic_make_request(struct bio *bio)
*/
if (current->bio_list) {
bio_list_add(current->bio_list, bio);
- return;
+ goto out;
}
/* following loop may be a bit non-obvious, and so deserves some
@@ -2038,9 +2070,8 @@ void generic_make_request(struct bio *bio)
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) {
-
- q->make_request_fn(q, bio);
+ if (likely(blk_queue_enter(q, false) == 0)) {
+ ret = q->make_request_fn(q, bio);
blk_queue_exit(q);
@@ -2053,6 +2084,9 @@ void generic_make_request(struct bio *bio)
}
} while (bio);
current->bio_list = NULL; /* deactivate */
+
+out:
+ return ret;
}
EXPORT_SYMBOL(generic_make_request);
@@ -2066,7 +2100,7 @@ EXPORT_SYMBOL(generic_make_request);
* interfaces; @bio must be presetup and ready for I/O.
*
*/
-void submit_bio(int rw, struct bio *bio)
+blk_qc_t submit_bio(int rw, struct bio *bio)
{
bio->bi_rw |= rw;
@@ -2100,12 +2134,13 @@ void submit_bio(int rw, struct bio *bio)
}
}
- generic_make_request(bio);
+ return generic_make_request(bio);
}
EXPORT_SYMBOL(submit_bio);
/**
- * blk_rq_check_limits - Helper function to check a request for the queue limit
+ * blk_cloned_rq_check_limits - Helper function to check a cloned request
+ * for the new queue limits
* @q: the queue
* @rq: the request being checked
*
@@ -2116,20 +2151,13 @@ EXPORT_SYMBOL(submit_bio);
* after it is inserted to @q, it should be checked against @q before
* the insertion using this generic function.
*
- * This function should also be useful for request stacking drivers
- * in some cases below, so export this function.
* Request stacking drivers like request-based dm may change the queue
- * limits while requests are in the queue (e.g. dm's table swapping).
- * Such request stacking drivers should check those requests against
- * the new queue limits again when they dispatch those requests,
- * although such checkings are also done against the old queue limits
- * when submitting requests.
+ * limits when retrying requests on other queues. Those requests need
+ * to be checked against the new queue limits again during dispatch.
*/
-int blk_rq_check_limits(struct request_queue *q, struct request *rq)
+static int blk_cloned_rq_check_limits(struct request_queue *q,
+ struct request *rq)
{
- if (!rq_mergeable(rq))
- return 0;
-
if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) {
printk(KERN_ERR "%s: over max size limit.\n", __func__);
return -EIO;
@@ -2149,7 +2177,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
return 0;
}
-EXPORT_SYMBOL_GPL(blk_rq_check_limits);
/**
* blk_insert_cloned_request - Helper for stacking drivers to submit a request
@@ -2161,7 +2188,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
unsigned long flags;
int where = ELEVATOR_INSERT_BACK;
- if (blk_rq_check_limits(q, rq))
+ if (blk_cloned_rq_check_limits(q, rq))
return -EIO;
if (rq->rq_disk &&
@@ -2171,7 +2198,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
if (q->mq_ops) {
if (blk_queue_io_stat(q))
blk_account_io_start(rq, true);
- blk_mq_insert_request(rq, false, true, true);
+ blk_mq_insert_request(rq, false, true, false);
return 0;
}
@@ -2428,14 +2455,16 @@ struct request *blk_peek_request(struct request_queue *q)
rq = NULL;
break;
- } else if (ret == BLKPREP_KILL) {
+ } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
+ int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO;
+
rq->cmd_flags |= REQ_QUIET;
/*
* Mark this request as started so we don't trigger
* any debug logic in the end I/O path.
*/
blk_start_request(rq);
- __blk_end_request_all(rq, -EIO);
+ __blk_end_request_all(rq, err);
} else {
printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
break;
@@ -3306,6 +3335,47 @@ void blk_finish_plug(struct blk_plug *plug)
}
EXPORT_SYMBOL(blk_finish_plug);
+bool blk_poll(struct request_queue *q, blk_qc_t cookie)
+{
+ struct blk_plug *plug;
+ long state;
+
+ if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ return false;
+
+ plug = current->plug;
+ if (plug)
+ blk_flush_plug_list(plug, false);
+
+ state = current->state;
+ while (!need_resched()) {
+ unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
+ struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
+ int ret;
+
+ hctx->poll_invoked++;
+
+ ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
+ if (ret > 0) {
+ hctx->poll_success++;
+ set_current_state(TASK_RUNNING);
+ return true;
+ }
+
+ if (signal_pending_state(state, current))
+ set_current_state(TASK_RUNNING);
+
+ if (current->state == TASK_RUNNING)
+ return true;
+ if (ret < 0)
+ break;
+ cpu_relax();
+ }
+
+ return false;
+}
+
#ifdef CONFIG_PM
/**
* blk_pm_runtime_init - Block layer runtime PM initialization routine
@@ -3362,6 +3432,9 @@ int blk_pre_runtime_suspend(struct request_queue *q)
{
int ret = 0;
+ if (!q->dev)
+ return ret;
+
spin_lock_irq(q->queue_lock);
if (q->nr_pending) {
ret = -EBUSY;
@@ -3389,6 +3462,9 @@ EXPORT_SYMBOL(blk_pre_runtime_suspend);
*/
void blk_post_runtime_suspend(struct request_queue *q, int err)
{
+ if (!q->dev)
+ return;
+
spin_lock_irq(q->queue_lock);
if (!err) {
q->rpm_status = RPM_SUSPENDED;
@@ -3413,6 +3489,9 @@ EXPORT_SYMBOL(blk_post_runtime_suspend);
*/
void blk_pre_runtime_resume(struct request_queue *q)
{
+ if (!q->dev)
+ return;
+
spin_lock_irq(q->queue_lock);
q->rpm_status = RPM_RESUMING;
spin_unlock_irq(q->queue_lock);
@@ -3435,6 +3514,9 @@ EXPORT_SYMBOL(blk_pre_runtime_resume);
*/
void blk_post_runtime_resume(struct request_queue *q, int err)
{
+ if (!q->dev)
+ return;
+
spin_lock_irq(q->queue_lock);
if (!err) {
q->rpm_status = RPM_ACTIVE;
@@ -3447,6 +3529,30 @@ void blk_post_runtime_resume(struct request_queue *q, int err)
spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL(blk_post_runtime_resume);
+
+/**
+ * blk_set_runtime_active - Force runtime status of the queue to be active
+ * @q: the queue of the device
+ *
+ * If the device is left runtime suspended during system suspend the resume
+ * hook typically resumes the device and corrects runtime status
+ * accordingly. However, that does not affect the queue runtime PM status
+ * which is still "suspended". This prevents processing requests from the
+ * queue.
+ *
+ * This function can be used in driver's resume hook to correct queue
+ * runtime PM status and re-enable peeking requests from the queue. It
+ * should be called before first request is added to the queue.
+ */
+void blk_set_runtime_active(struct request_queue *q)
+{
+ spin_lock_irq(q->queue_lock);
+ q->rpm_status = RPM_ACTIVE;
+ pm_runtime_mark_last_busy(q->dev);
+ pm_request_autosuspend(q->dev);
+ spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_set_runtime_active);
#endif
int __init blk_dev_init(void)
@@ -3463,7 +3569,7 @@ int __init blk_dev_init(void)
request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request), 0, SLAB_PANIC, NULL);
- blk_requestq_cachep = kmem_cache_create("blkdev_queue",
+ blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
return 0;
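The blk_set_runtime_active() kernel-doc above says the helper belongs in a driver's system-resume hook when the device was left runtime-suspended across suspend. A hedged kernel-C sketch of such a call site; struct my_dev and my_dev_resume() are illustrative:

/* Hedged sketch of the call site blk_set_runtime_active()'s kernel-doc
 * describes: a system-resume hook re-syncing the queue's runtime PM
 * status after waking the hardware. struct my_dev is illustrative.
 */
#include <linux/blkdev.h>
#include <linux/device.h>

struct my_dev {
	struct request_queue *queue;
};

static int my_dev_resume(struct device *dev)
{
	struct my_dev *md = dev_get_drvdata(dev);

	/* ... wake the hardware first ... */

	/*
	 * The device is active again, but q->rpm_status may still say
	 * RPM_SUSPENDED; fix it up before the first request arrives.
	 */
	blk_set_runtime_active(md->queue);
	return 0;
}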
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 1a27f45ec776..381cb50a673c 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -289,7 +289,7 @@ struct io_context *get_task_io_context(struct task_struct *task,
{
struct io_context *ioc;
- might_sleep_if(gfp_flags & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(gfp_flags));
do {
task_lock(task);
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
deleted file mode 100644
index 0736729d6494..000000000000
--- a/block/blk-iopoll.c
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Functions related to interrupt-poll handling in the block layer. This
- * is similar to NAPI for network devices.
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/blk-iopoll.h>
-#include <linux/delay.h>
-
-#include "blk.h"
-
-static unsigned int blk_iopoll_budget __read_mostly = 256;
-
-static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
-
-/**
- * blk_iopoll_sched - Schedule a run of the iopoll handler
- * @iop: The parent iopoll structure
- *
- * Description:
- * Add this blk_iopoll structure to the pending poll list and trigger the
- * raise of the blk iopoll softirq. The driver must already have gotten a
- * successful return from blk_iopoll_sched_prep() before calling this.
- **/
-void blk_iopoll_sched(struct blk_iopoll *iop)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
- __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(blk_iopoll_sched);
-
-/**
- * __blk_iopoll_complete - Mark this @iop as un-polled again
- * @iop: The parent iopoll structure
- *
- * Description:
- * See blk_iopoll_complete(). This function must be called with interrupts
- * disabled.
- **/
-void __blk_iopoll_complete(struct blk_iopoll *iop)
-{
- list_del(&iop->list);
- smp_mb__before_atomic();
- clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
-}
-EXPORT_SYMBOL(__blk_iopoll_complete);
-
-/**
- * blk_iopoll_complete - Mark this @iop as un-polled again
- * @iop: The parent iopoll structure
- *
- * Description:
- * If a driver consumes less than the assigned budget in its run of the
- * iopoll handler, it'll end the polled mode by calling this function. The
- * iopoll handler will not be invoked again before blk_iopoll_sched_prep()
- * is called.
- **/
-void blk_iopoll_complete(struct blk_iopoll *iop)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __blk_iopoll_complete(iop);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(blk_iopoll_complete);
-
-static void blk_iopoll_softirq(struct softirq_action *h)
-{
- struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
- int rearm = 0, budget = blk_iopoll_budget;
- unsigned long start_time = jiffies;
-
- local_irq_disable();
-
- while (!list_empty(list)) {
- struct blk_iopoll *iop;
- int work, weight;
-
- /*
- * If softirq window is exhausted then punt.
- */
- if (budget <= 0 || time_after(jiffies, start_time)) {
- rearm = 1;
- break;
- }
-
- local_irq_enable();
-
- /* Even though interrupts have been re-enabled, this
- * access is safe because interrupts can only add new
- * entries to the tail of this list, and only ->poll()
- * calls can remove this head entry from the list.
- */
- iop = list_entry(list->next, struct blk_iopoll, list);
-
- weight = iop->weight;
- work = 0;
- if (test_bit(IOPOLL_F_SCHED, &iop->state))
- work = iop->poll(iop, weight);
-
- budget -= work;
-
- local_irq_disable();
-
- /*
- * Drivers must not modify the iopoll state, if they
- * consume their assigned weight (or more, some drivers can't
- * easily just stop processing, they have to complete an
- * entire mask of commands).In such cases this code
- * still "owns" the iopoll instance and therefore can
- * move the instance around on the list at-will.
- */
- if (work >= weight) {
- if (blk_iopoll_disable_pending(iop))
- __blk_iopoll_complete(iop);
- else
- list_move_tail(&iop->list, list);
- }
- }
-
- if (rearm)
- __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
-
- local_irq_enable();
-}
-
-/**
- * blk_iopoll_disable - Disable iopoll on this @iop
- * @iop: The parent iopoll structure
- *
- * Description:
- * Disable io polling and wait for any pending callbacks to have completed.
- **/
-void blk_iopoll_disable(struct blk_iopoll *iop)
-{
- set_bit(IOPOLL_F_DISABLE, &iop->state);
- while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
- msleep(1);
- clear_bit(IOPOLL_F_DISABLE, &iop->state);
-}
-EXPORT_SYMBOL(blk_iopoll_disable);
-
-/**
- * blk_iopoll_enable - Enable iopoll on this @iop
- * @iop: The parent iopoll structure
- *
- * Description:
- * Enable iopoll on this @iop. Note that the handler run will not be
- * scheduled, it will only mark it as active.
- **/
-void blk_iopoll_enable(struct blk_iopoll *iop)
-{
- BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
- smp_mb__before_atomic();
- clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
-}
-EXPORT_SYMBOL(blk_iopoll_enable);
-
-/**
- * blk_iopoll_init - Initialize this @iop
- * @iop: The parent iopoll structure
- * @weight: The default weight (or command completion budget)
- * @poll_fn: The handler to invoke
- *
- * Description:
- * Initialize this blk_iopoll structure. Before being actively used, the
- * driver must call blk_iopoll_enable().
- **/
-void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
-{
- memset(iop, 0, sizeof(*iop));
- INIT_LIST_HEAD(&iop->list);
- iop->weight = weight;
- iop->poll = poll_fn;
- set_bit(IOPOLL_F_SCHED, &iop->state);
-}
-EXPORT_SYMBOL(blk_iopoll_init);
-
-static int blk_iopoll_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- /*
- * If a CPU goes away, splice its entries to the current CPU
- * and trigger a run of the softirq
- */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- int cpu = (unsigned long) hcpu;
-
- local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
- this_cpu_ptr(&blk_cpu_iopoll));
- __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
- local_irq_enable();
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block blk_iopoll_cpu_notifier = {
- .notifier_call = blk_iopoll_cpu_notify,
-};
-
-static __init int blk_iopoll_setup(void)
-{
- int i;
-
- for_each_possible_cpu(i)
- INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
-
- open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
- register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
- return 0;
-}
-subsys_initcall(blk_iopoll_setup);
diff --git a/block/blk-map.c b/block/blk-map.c
index f565e11f465a..a54f0543b956 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio)
return ret;
}
+static int __blk_rq_map_user_iov(struct request *rq,
+ struct rq_map_data *map_data, struct iov_iter *iter,
+ gfp_t gfp_mask, bool copy)
+{
+ struct request_queue *q = rq->q;
+ struct bio *bio, *orig_bio;
+ int ret;
+
+ if (copy)
+ bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
+ else
+ bio = bio_map_user_iov(q, iter, gfp_mask);
+
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ if (map_data && map_data->null_mapped)
+ bio_set_flag(bio, BIO_NULL_MAPPED);
+
+ iov_iter_advance(iter, bio->bi_iter.bi_size);
+ if (map_data)
+ map_data->offset += bio->bi_iter.bi_size;
+
+ orig_bio = bio;
+ blk_queue_bounce(q, &bio);
+
+ /*
+ * We link the bounce buffer in and could have to traverse it
+ * later so we have to get a ref to prevent it from being freed
+ */
+ bio_get(bio);
+
+ ret = blk_rq_append_bio(q, rq, bio);
+ if (ret) {
+ bio_endio(bio);
+ __blk_rq_unmap_user(orig_bio);
+ bio_put(bio);
+ return ret;
+ }
+
+ return 0;
+}
+
/**
* blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
@@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data,
const struct iov_iter *iter, gfp_t gfp_mask)
{
- struct bio *bio;
- int unaligned = 0;
- struct iov_iter i;
struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0};
+ bool copy = (q->dma_pad_mask & iter->count) || map_data;
+ struct bio *bio = NULL;
+ struct iov_iter i;
+ int ret;
if (!iter || !iter->count)
return -EINVAL;
@@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
*/
if ((uaddr & queue_dma_alignment(q)) ||
iovec_gap_to_prv(q, &prv, &iov))
- unaligned = 1;
+ copy = true;
prv.iov_base = iov.iov_base;
prv.iov_len = iov.iov_len;
}
- if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
- bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
- else
- bio = bio_map_user_iov(q, iter, gfp_mask);
-
- if (IS_ERR(bio))
- return PTR_ERR(bio);
-
- if (map_data && map_data->null_mapped)
- bio_set_flag(bio, BIO_NULL_MAPPED);
-
- if (bio->bi_iter.bi_size != iter->count) {
- /*
- * Grab an extra reference to this bio, as bio_unmap_user()
- * expects to be able to drop it twice as it happens on the
- * normal IO completion path
- */
- bio_get(bio);
- bio_endio(bio);
- __blk_rq_unmap_user(bio);
- return -EINVAL;
- }
+ i = *iter;
+ do {
+ ret = __blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
+ if (ret)
+ goto unmap_rq;
+ if (!bio)
+ bio = rq->bio;
+ } while (iov_iter_count(&i));
if (!bio_flagged(bio, BIO_USER_MAPPED))
rq->cmd_flags |= REQ_COPY_USER;
-
- blk_queue_bounce(q, &bio);
- bio_get(bio);
- blk_rq_bio_prep(q, rq, bio);
return 0;
+
+unmap_rq:
+ __blk_rq_unmap_user(bio);
+ rq->bio = NULL;
+ return -EINVAL;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index de5716d8e525..261353166dcf 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -7,6 +7,8 @@
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
+#include <trace/events/block.h>
+
#include "blk.h"
static struct bio *blk_bio_discard_split(struct request_queue *q,
@@ -68,6 +70,18 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
}
+static inline unsigned get_max_io_size(struct request_queue *q,
+ struct bio *bio)
+{
+ unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
+ unsigned mask = queue_logical_block_size(q) - 1;
+
+ /* aligned to logical block size */
+ sectors &= ~(mask >> 9);
+
+ return sectors;
+}
+
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
@@ -76,11 +90,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
unsigned seg_size = 0, nsegs = 0, sectors = 0;
+ unsigned front_seg_size = bio->bi_seg_front_size;
+ bool do_split = true;
+ struct bio *new = NULL;
+ const unsigned max_sectors = get_max_io_size(q, bio);
bio_for_each_segment(bv, bio, iter) {
- if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q))
- goto split;
-
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
@@ -88,6 +103,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
goto split;
+ if (sectors + (bv.bv_len >> 9) > max_sectors) {
+ /*
+ * Consider this a new segment if we're splitting in
+ * the middle of this vector.
+ */
+ if (nsegs < queue_max_segments(q) &&
+ sectors < max_sectors) {
+ nsegs++;
+ sectors = max_sectors;
+ }
+ if (sectors)
+ goto split;
+ /* Make this single bvec the first segment */
+ }
+
if (bvprvp && blk_queue_cluster(q)) {
if (seg_size + bv.bv_len > queue_max_segment_size(q))
goto new_segment;
@@ -98,8 +128,11 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
seg_size += bv.bv_len;
bvprv = bv;
- bvprvp = &bv;
+ bvprvp = &bvprv;
sectors += bv.bv_len >> 9;
+
+ if (nsegs == 1 && seg_size > front_seg_size)
+ front_seg_size = seg_size;
continue;
}
new_segment:
@@ -108,16 +141,29 @@ new_segment:
nsegs++;
bvprv = bv;
- bvprvp = &bv;
+ bvprvp = &bvprv;
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
+
+ if (nsegs == 1 && seg_size > front_seg_size)
+ front_seg_size = seg_size;
}
- *segs = nsegs;
- return NULL;
+ do_split = false;
split:
*segs = nsegs;
- return bio_split(bio, sectors, GFP_NOIO, bs);
+
+ if (do_split) {
+ new = bio_split(bio, sectors, GFP_NOIO, bs);
+ if (new)
+ bio = new;
+ }
+
+ bio->bi_seg_front_size = front_seg_size;
+ if (seg_size > bio->bi_seg_back_size)
+ bio->bi_seg_back_size = seg_size;
+
+ return do_split ? new : NULL;
}
void blk_queue_split(struct request_queue *q, struct bio **bio,
@@ -143,6 +189,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
split->bi_rw |= REQ_NOMERGE;
bio_chain(split, *bio);
+ trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
generic_make_request(*bio);
*bio = split;
}
@@ -257,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
struct bio *nxt)
{
struct bio_vec end_bv = { NULL }, nxt_bv;
- struct bvec_iter iter;
if (!blk_queue_cluster(q))
return 0;
@@ -269,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
if (!bio_has_data(bio))
return 1;
- bio_for_each_segment(end_bv, bio, iter)
- if (end_bv.bv_len == iter.bi_size)
- break;
-
- nxt_bv = bio_iovec(nxt);
+ bio_get_last_bvec(bio, &end_bv);
+ bio_get_first_bvec(nxt, &nxt_bv);
if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
return 0;
@@ -412,6 +455,12 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
if (sg)
sg_mark_end(sg);
+ /*
+ * Something must have gone wrong if the computed number of
+ * segments is bigger than the number of req's physical segments
+ */
+ WARN_ON(nsegs > rq->nr_phys_segments);
+
return nsegs;
}
EXPORT_SYMBOL(blk_rq_map_sg);
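
For reference, the masking in get_max_io_size() above rounds the per-offset sector limit down to a whole number of logical blocks. A minimal standalone sketch of that arithmetic (illustrative values only, not part of the patch; assumes a power-of-two logical block size and 512-byte sectors):

#include <stdio.h>

/* Mirror of the masking in get_max_io_size(): round a sector count
 * down to a multiple of the logical block size. */
static unsigned int align_down_to_lbs(unsigned int sectors, unsigned int lbs)
{
	unsigned int mask = lbs - 1;		/* lbs must be a power of two */

	return sectors & ~(mask >> 9);		/* mask >> 9 == sectors per block - 1 */
}

int main(void)
{
	/* 4096-byte logical blocks: 255 sectors rounds down to 248. */
	printf("%u\n", align_down_to_lbs(255, 4096));
	return 0;
}
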
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 8764c241e5bb..d0634bcf322f 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -113,7 +113,7 @@ int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
for_each_possible_cpu(i) {
if (index == mq_map[i])
- return cpu_to_node(i);
+ return local_memory_node(cpu_to_node(i));
}
return NUMA_NO_NODE;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 6f57a110289c..4ea4dd8a1eed 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -174,6 +174,11 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
return ret;
}
+static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+ return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success);
+}
+
static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
@@ -295,6 +300,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
.attr = {.name = "cpu_list", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_cpus_show,
};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
+ .attr = {.name = "io_poll", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_poll_show,
+};
static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_queued.attr,
@@ -304,6 +313,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_tags.attr,
&blk_mq_hw_sysfs_cpus.attr,
&blk_mq_hw_sysfs_active.attr,
+ &blk_mq_hw_sysfs_poll.attr,
NULL,
};
@@ -398,19 +408,22 @@ void blk_mq_unregister_disk(struct gendisk *disk)
blk_mq_enable_hotplug();
}
+void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
+{
+ kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
+}
+
static void blk_mq_sysfs_init(struct request_queue *q)
{
- struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
- int i;
+ int cpu;
kobject_init(&q->mq_kobj, &blk_mq_ktype);
- queue_for_each_hw_ctx(q, hctx, i)
- kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
-
- queue_for_each_ctx(q, ctx, i)
+ for_each_possible_cpu(cpu) {
+ ctx = per_cpu_ptr(q->queue_ctx, cpu);
kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
+ }
}
int blk_mq_register_disk(struct gendisk *disk)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 60ac684c8b8c..abdbb47405cb 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
if (tag != -1)
return tag;
- if (!(data->gfp & __GFP_WAIT))
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
return -1;
bs = bt_wait_ptr(bt, hctx);
@@ -303,7 +303,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
data->ctx = blk_mq_get_ctx(data->q);
data->hctx = data->q->mq_ops->map_queue(data->q,
data->ctx->cpu);
- if (data->reserved) {
+ if (data->flags & BLK_MQ_REQ_RESERVED) {
bt = &data->hctx->tags->breserved_tags;
} else {
last_tag = &data->ctx->last_tag;
@@ -349,10 +349,9 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
- if (!data->reserved)
- return __blk_mq_get_tag(data);
-
- return __blk_mq_get_reserved_tag(data);
+ if (data->flags & BLK_MQ_REQ_RESERVED)
+ return __blk_mq_get_reserved_tag(data);
+ return __blk_mq_get_tag(data);
}
static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1c27b3eaef64..1699baf39b78 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -229,8 +229,8 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
return NULL;
}
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
- bool reserved)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+ unsigned int flags)
{
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
@@ -238,24 +238,22 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
struct blk_mq_alloc_data alloc_data;
int ret;
- ret = blk_queue_enter(q, gfp);
+ ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
if (ret)
return ERR_PTR(ret);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
- reserved, ctx, hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
- if (!rq && (gfp & __GFP_WAIT)) {
+ if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) {
__blk_mq_run_hw_queue(hctx);
blk_mq_put_ctx(ctx);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
- hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
ctx = alloc_data.ctx;
}
@@ -358,7 +356,7 @@ static void blk_mq_ipi_complete_request(struct request *rq)
put_cpu();
}
-void __blk_mq_complete_request(struct request *rq)
+static void __blk_mq_complete_request(struct request *rq)
{
struct request_queue *q = rq->q;
@@ -546,7 +544,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
- return tags->rqs[tag];
+ if (tag < tags->nr_tags)
+ return tags->rqs[tag];
+
+ return NULL;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);
@@ -601,12 +602,12 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
* If a request wasn't started before the queue was
* marked dying, kill it here or it'll go unnoticed.
*/
- if (unlikely(blk_queue_dying(rq->q)))
- blk_mq_complete_request(rq, -EIO);
+ if (unlikely(blk_queue_dying(rq->q))) {
+ rq->errors = -EIO;
+ blk_mq_end_request(rq, rq->errors);
+ }
return;
}
- if (rq->cmd_flags & REQ_NO_TIMEOUT)
- return;
if (time_after_eq(jiffies, rq->deadline)) {
if (!blk_mark_rq_complete(rq))
@@ -617,15 +618,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
}
}
-static void blk_mq_rq_timer(unsigned long priv)
+static void blk_mq_timeout_work(struct work_struct *work)
{
- struct request_queue *q = (struct request_queue *)priv;
+ struct request_queue *q =
+ container_of(work, struct request_queue, timeout_work);
struct blk_mq_timeout_data data = {
.next = 0,
.next_set = 0,
};
int i;
+ if (blk_queue_enter(q, true))
+ return;
+
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
if (data.next_set) {
@@ -640,6 +645,7 @@ static void blk_mq_rq_timer(unsigned long priv)
blk_mq_tag_idle(hctx);
}
}
+ blk_queue_exit(q);
}
/*
@@ -1175,8 +1181,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
rw |= REQ_SYNC;
trace_block_getrq(q, bio, rw);
- blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
- hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
if (unlikely(!rq)) {
__blk_mq_run_hw_queue(hctx);
@@ -1185,8 +1190,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q,
- __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
ctx = alloc_data.ctx;
hctx = alloc_data.hctx;
@@ -1198,7 +1202,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
return rq;
}
-static int blk_mq_direct_issue_request(struct request *rq)
+static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
{
int ret;
struct request_queue *q = rq->q;
@@ -1209,6 +1213,7 @@ static int blk_mq_direct_issue_request(struct request *rq)
.list = NULL,
.last = 1
};
+ blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
/*
* For OK queue, we are done. For error, kill it. Any other
@@ -1216,18 +1221,21 @@ static int blk_mq_direct_issue_request(struct request *rq)
* would have done
*/
ret = q->mq_ops->queue_rq(hctx, &bd);
- if (ret == BLK_MQ_RQ_QUEUE_OK)
+ if (ret == BLK_MQ_RQ_QUEUE_OK) {
+ *cookie = new_cookie;
return 0;
- else {
- __blk_mq_requeue_request(rq);
+ }
- if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
- rq->errors = -EIO;
- blk_mq_end_request(rq, rq->errors);
- return 0;
- }
- return -1;
+ __blk_mq_requeue_request(rq);
+
+ if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+ *cookie = BLK_QC_T_NONE;
+ rq->errors = -EIO;
+ blk_mq_end_request(rq, rq->errors);
+ return 0;
}
+
+ return -1;
}
/*
@@ -1235,7 +1243,7 @@ static int blk_mq_direct_issue_request(struct request *rq)
* but will attempt to bypass the hctx queueing if we can go straight to
* hardware for SYNC IO.
*/
-static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = rw_is_sync(bio->bi_rw);
const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
@@ -1244,12 +1252,13 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
unsigned int request_count = 0;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
+ blk_qc_t cookie;
blk_queue_bounce(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio_io_error(bio);
- return;
+ return BLK_QC_T_NONE;
}
blk_queue_split(q, &bio, q->bio_split);
@@ -1257,13 +1266,15 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
if (!is_flush_fua && !blk_queue_nomerges(q)) {
if (blk_attempt_plug_merge(q, bio, &request_count,
&same_queue_rq))
- return;
+ return BLK_QC_T_NONE;
} else
request_count = blk_plug_queued_count(q);
rq = blk_mq_map_request(q, bio, &data);
if (unlikely(!rq))
- return;
+ return BLK_QC_T_NONE;
+
+ cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
if (unlikely(is_flush_fua)) {
blk_mq_bio_to_request(rq, bio);
@@ -1284,15 +1295,16 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_bio_to_request(rq, bio);
/*
- * we do limited pluging. If bio can be merged, do merge.
+ * We do limited plugging. If the bio can be merged, do that.
* Otherwise the existing request in the plug list will be
* issued. So the plug list will have one request at most
*/
if (plug) {
/*
* The plug list might get flushed before this. If that
- * happens, same_queue_rq is invalid and plug list is empty
- **/
+ * happens, same_queue_rq is invalid and plug list is
+ * empty
+ */
if (same_queue_rq && !list_empty(&plug->mq_list)) {
old_rq = same_queue_rq;
list_del_init(&old_rq->queuelist);
@@ -1302,11 +1314,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
old_rq = rq;
blk_mq_put_ctx(data.ctx);
if (!old_rq)
- return;
- if (!blk_mq_direct_issue_request(old_rq))
- return;
+ goto done;
+ if (!blk_mq_direct_issue_request(old_rq, &cookie))
+ goto done;
blk_mq_insert_request(old_rq, false, true, true);
- return;
+ goto done;
}
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1320,13 +1332,15 @@ run_queue:
blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
}
blk_mq_put_ctx(data.ctx);
+done:
+ return cookie;
}
/*
* Single hardware queue variant. This will attempt to use any per-process
* plug for merging and IO deferral.
*/
-static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = rw_is_sync(bio->bi_rw);
const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
@@ -1334,23 +1348,26 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
unsigned int request_count = 0;
struct blk_map_ctx data;
struct request *rq;
+ blk_qc_t cookie;
blk_queue_bounce(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio_io_error(bio);
- return;
+ return BLK_QC_T_NONE;
}
blk_queue_split(q, &bio, q->bio_split);
if (!is_flush_fua && !blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, &request_count, NULL))
- return;
+ return BLK_QC_T_NONE;
rq = blk_mq_map_request(q, bio, &data);
if (unlikely(!rq))
- return;
+ return BLK_QC_T_NONE;
+
+ cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
if (unlikely(is_flush_fua)) {
blk_mq_bio_to_request(rq, bio);
@@ -1368,13 +1385,16 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_bio_to_request(rq, bio);
if (!request_count)
trace_block_plug(q);
- else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+
+ blk_mq_put_ctx(data.ctx);
+
+ if (request_count >= BLK_MAX_REQUEST_COUNT) {
blk_flush_plug_list(plug, false);
trace_block_plug(q);
}
+
list_add_tail(&rq->queuelist, &plug->mq_list);
- blk_mq_put_ctx(data.ctx);
- return;
+ return cookie;
}
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1389,6 +1409,7 @@ run_queue:
}
blk_mq_put_ctx(data.ctx);
+ return cookie;
}
/*
@@ -1726,31 +1747,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
return -1;
}
-static int blk_mq_init_hw_queues(struct request_queue *q,
- struct blk_mq_tag_set *set)
-{
- struct blk_mq_hw_ctx *hctx;
- unsigned int i;
-
- /*
- * Initialize hardware queues
- */
- queue_for_each_hw_ctx(q, hctx, i) {
- if (blk_mq_init_hctx(q, set, hctx, i))
- break;
- }
-
- if (i == q->nr_hw_queues)
- return 0;
-
- /*
- * Init failed
- */
- blk_mq_exit_hw_queues(q, set, i);
-
- return 1;
-}
-
static void blk_mq_init_cpu_queues(struct request_queue *q,
unsigned int nr_hw_queues)
{
@@ -1777,7 +1773,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
* not, we remain on the home node of the device
*/
if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
- hctx->numa_node = cpu_to_node(i);
+ hctx->numa_node = local_memory_node(cpu_to_node(i));
}
}
@@ -1802,12 +1798,14 @@ static void blk_mq_map_swqueue(struct request_queue *q,
/*
* Map software to hardware queues
*/
- queue_for_each_ctx(q, ctx, i) {
+ for_each_possible_cpu(i) {
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpumask_test_cpu(i, online_mask))
continue;
+ ctx = per_cpu_ptr(q->queue_ctx, i);
hctx = q->mq_ops->map_queue(q, i);
+
cpumask_set_cpu(i, hctx->cpumask);
ctx->index_hw = hctx->nr_ctx;
hctx->ctxs[hctx->nr_ctx++] = ctx;
@@ -1837,6 +1835,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
hctx->tags = set->tags[i];
WARN_ON(!hctx->tags);
+ cpumask_copy(hctx->tags->cpumask, hctx->cpumask);
/*
* Set the map size to the number of mapped software queues.
* This is more accurate and more efficient than looping
@@ -1850,14 +1849,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
hctx->next_cpu = cpumask_first(hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
-
- queue_for_each_ctx(q, ctx, i) {
- if (!cpumask_test_cpu(i, online_mask))
- continue;
-
- hctx = q->mq_ops->map_queue(q, i);
- cpumask_set_cpu(i, hctx->tags->cpumask);
- }
}
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
@@ -1963,56 +1954,93 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_init_queue);
-struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
- struct request_queue *q)
+static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+ struct request_queue *q)
{
- struct blk_mq_hw_ctx **hctxs;
- struct blk_mq_ctx __percpu *ctx;
- unsigned int *map;
- int i;
-
- ctx = alloc_percpu(struct blk_mq_ctx);
- if (!ctx)
- return ERR_PTR(-ENOMEM);
-
- hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
- set->numa_node);
-
- if (!hctxs)
- goto err_percpu;
-
- map = blk_mq_make_queue_map(set);
- if (!map)
- goto err_map;
+ int i, j;
+ struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
+ blk_mq_sysfs_unregister(q);
for (i = 0; i < set->nr_hw_queues; i++) {
- int node = blk_mq_hw_queue_to_node(map, i);
+ int node;
+ if (hctxs[i])
+ continue;
+
+ node = blk_mq_hw_queue_to_node(q->mq_map, i);
hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
GFP_KERNEL, node);
if (!hctxs[i])
- goto err_hctxs;
+ break;
if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
- node))
- goto err_hctxs;
+ node)) {
+ kfree(hctxs[i]);
+ hctxs[i] = NULL;
+ break;
+ }
atomic_set(&hctxs[i]->nr_active, 0);
hctxs[i]->numa_node = node;
hctxs[i]->queue_num = i;
+
+ if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
+ free_cpumask_var(hctxs[i]->cpumask);
+ kfree(hctxs[i]);
+ hctxs[i] = NULL;
+ break;
+ }
+ blk_mq_hctx_kobj_init(hctxs[i]);
+ }
+ for (j = i; j < q->nr_hw_queues; j++) {
+ struct blk_mq_hw_ctx *hctx = hctxs[j];
+
+ if (hctx) {
+ if (hctx->tags) {
+ blk_mq_free_rq_map(set, hctx->tags, j);
+ set->tags[j] = NULL;
+ }
+ blk_mq_exit_hctx(q, set, hctx, j);
+ free_cpumask_var(hctx->cpumask);
+ kobject_put(&hctx->kobj);
+ kfree(hctx->ctxs);
+ kfree(hctx);
+ hctxs[j] = NULL;
+
+ }
}
+ q->nr_hw_queues = i;
+ blk_mq_sysfs_register(q);
+}
+
+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+ struct request_queue *q)
+{
+ /* mark the queue as mq asap */
+ q->mq_ops = set->ops;
+
+ q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
+ if (!q->queue_ctx)
+ return ERR_PTR(-ENOMEM);
+
+ q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
+ GFP_KERNEL, set->numa_node);
+ if (!q->queue_hw_ctx)
+ goto err_percpu;
+
+ q->mq_map = blk_mq_make_queue_map(set);
+ if (!q->mq_map)
+ goto err_map;
- setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
+ blk_mq_realloc_hw_ctxs(set, q);
+ if (!q->nr_hw_queues)
+ goto err_hctxs;
+
+ INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
q->nr_queues = nr_cpu_ids;
- q->nr_hw_queues = set->nr_hw_queues;
- q->mq_map = map;
-
- q->queue_ctx = ctx;
- q->queue_hw_ctx = hctxs;
- q->mq_ops = set->ops;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
if (!(set->flags & BLK_MQ_F_SG_MERGE))
@@ -2039,9 +2067,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
- if (blk_mq_init_hw_queues(q, set))
- goto err_hctxs;
-
get_online_cpus();
mutex_lock(&all_q_mutex);
@@ -2055,17 +2080,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
return q;
err_hctxs:
- kfree(map);
- for (i = 0; i < set->nr_hw_queues; i++) {
- if (!hctxs[i])
- break;
- free_cpumask_var(hctxs[i]->cpumask);
- kfree(hctxs[i]);
- }
+ kfree(q->mq_map);
err_map:
- kfree(hctxs);
+ kfree(q->queue_hw_ctx);
err_percpu:
- free_percpu(ctx);
+ free_percpu(q->queue_ctx);
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
@@ -2273,9 +2292,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
set->nr_hw_queues = 1;
set->queue_depth = min(64U, set->queue_depth);
}
+ /*
+ * There is no use for more h/w queues than cpus.
+ */
+ if (set->nr_hw_queues > nr_cpu_ids)
+ set->nr_hw_queues = nr_cpu_ids;
- set->tags = kmalloc_node(set->nr_hw_queues *
- sizeof(struct blk_mq_tags *),
+ set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node);
if (!set->tags)
return -ENOMEM;
@@ -2298,7 +2321,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
int i;
- for (i = 0; i < set->nr_hw_queues; i++) {
+ for (i = 0; i < nr_cpu_ids; i++) {
if (set->tags[i])
blk_mq_free_rq_map(set, set->tags[i], i);
}
@@ -2319,6 +2342,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->tags)
+ continue;
ret = blk_mq_tag_update_depth(hctx->tags, nr);
if (ret)
break;
@@ -2330,6 +2355,35 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
+void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
+{
+ struct request_queue *q;
+
+ if (nr_hw_queues > nr_cpu_ids)
+ nr_hw_queues = nr_cpu_ids;
+ if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
+ return;
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_freeze_queue(q);
+
+ set->nr_hw_queues = nr_hw_queues;
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ blk_mq_realloc_hw_ctxs(set, q);
+
+ if (q->nr_hw_queues > 1)
+ blk_queue_make_request(q, blk_mq_make_request);
+ else
+ blk_queue_make_request(q, blk_sq_make_request);
+
+ blk_mq_queue_reinit(q, cpu_online_mask);
+ }
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_unfreeze_queue(q);
+}
+EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
+
void blk_mq_disable_hotplug(void)
{
mutex_lock(&all_q_mutex);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b44dce165761..9087b11037b7 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -25,7 +25,6 @@ struct blk_mq_ctx {
struct kobject kobj;
} ____cacheline_aligned_in_smp;
-void __blk_mq_complete_request(struct request *rq);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
@@ -58,6 +57,7 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
*/
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
+extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
@@ -97,8 +97,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
struct blk_mq_alloc_data {
/* input parameter */
struct request_queue *q;
- gfp_t gfp;
- bool reserved;
+ unsigned int flags;
/* input & output parameter */
struct blk_mq_ctx *ctx;
@@ -106,13 +105,11 @@ struct blk_mq_alloc_data {
};
static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
- struct request_queue *q, gfp_t gfp, bool reserved,
- struct blk_mq_ctx *ctx,
- struct blk_mq_hw_ctx *hctx)
+ struct request_queue *q, unsigned int flags,
+ struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
{
data->q = q;
- data->gfp = gfp;
- data->reserved = reserved;
+ data->flags = flags;
data->ctx = ctx;
data->hctx = hctx;
}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 7d8f129a1516..331e4eee0dda 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -92,6 +92,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->virt_boundary_mask = 0;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+ lim->max_dev_sectors = 0;
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_discard_sectors = 0;
@@ -127,6 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_hw_sectors = UINT_MAX;
lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
+ lim->max_dev_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -214,8 +216,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
EXPORT_SYMBOL(blk_queue_bounce_limit);
/**
- * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request
- * @limits: the queue limits
+ * blk_queue_max_hw_sectors - set max sectors for a request for this queue
+ * @q: the request queue for the device
* @max_hw_sectors: max hardware sectors in the usual 512b unit
*
* Description:
@@ -224,36 +226,29 @@ EXPORT_SYMBOL(blk_queue_bounce_limit);
* the device driver based upon the capabilities of the I/O
* controller.
*
+ * max_dev_sectors is a hard limit imposed by the storage device for
+ * READ/WRITE requests. It is set by the disk driver.
+ *
* max_sectors is a soft limit imposed by the block layer for
* filesystem type requests. This value can be overridden on a
* per-device basis in /sys/block/<device>/queue/max_sectors_kb.
* The soft limit can not exceed max_hw_sectors.
**/
-void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors)
+void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
{
- if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
- max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
+ struct queue_limits *limits = &q->limits;
+ unsigned int max_sectors;
+
+ if ((max_hw_sectors << 9) < PAGE_SIZE) {
+ max_hw_sectors = 1 << (PAGE_SHIFT - 9);
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_hw_sectors);
}
limits->max_hw_sectors = max_hw_sectors;
- limits->max_sectors = min_t(unsigned int, max_hw_sectors,
- BLK_DEF_MAX_SECTORS);
-}
-EXPORT_SYMBOL(blk_limits_max_hw_sectors);
-
-/**
- * blk_queue_max_hw_sectors - set max sectors for a request for this queue
- * @q: the request queue for the device
- * @max_hw_sectors: max hardware sectors in the usual 512b unit
- *
- * Description:
- * See description for blk_limits_max_hw_sectors().
- **/
-void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
-{
- blk_limits_max_hw_sectors(&q->limits, max_hw_sectors);
+ max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
+ max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
+ limits->max_sectors = max_sectors;
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);
@@ -334,8 +329,8 @@ EXPORT_SYMBOL(blk_queue_max_segments);
**/
void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
{
- if (max_size < PAGE_CACHE_SIZE) {
- max_size = PAGE_CACHE_SIZE;
+ if (max_size < PAGE_SIZE) {
+ max_size = PAGE_SIZE;
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_size);
}
@@ -527,6 +522,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+ t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_same_sectors = min(t->max_write_same_sectors,
b->max_write_same_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
@@ -764,8 +760,8 @@ EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
**/
void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
{
- if (mask < PAGE_CACHE_SIZE - 1) {
- mask = PAGE_CACHE_SIZE - 1;
+ if (mask < PAGE_SIZE - 1) {
+ mask = PAGE_SIZE - 1;
printk(KERN_INFO "%s: set to minimum %lx\n",
__func__, mask);
}
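
The reworked blk_queue_max_hw_sectors() above derives max_sectors by treating a zero max_dev_sectors as "no device limit" and then capping the result at BLK_DEF_MAX_SECTORS. A standalone sketch of that derivation (the 2560-sector default cap and the min_not_zero() helper are re-created here purely for illustration):

#include <stdio.h>

/* Userspace stand-in for the kernel's min_not_zero(): zero means "unset". */
static unsigned int min_not_zero(unsigned int a, unsigned int b)
{
	if (a == 0)
		return b;
	if (b == 0)
		return a;
	return a < b ? a : b;
}

static unsigned int derive_max_sectors(unsigned int max_hw, unsigned int max_dev)
{
	unsigned int def_cap = 2560;	/* illustrative BLK_DEF_MAX_SECTORS */
	unsigned int max_sectors = min_not_zero(max_hw, max_dev);

	return max_sectors < def_cap ? max_sectors : def_cap;
}

int main(void)
{
	printf("%u\n", derive_max_sectors(8192, 0));	/* no device limit -> 2560 */
	printf("%u\n", derive_max_sectors(8192, 1024));	/* device limit wins -> 1024 */
	return 0;
}
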
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 31849e328b45..995b58d46ed1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -76,7 +76,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
static ssize_t queue_ra_show(struct request_queue *q, char *page)
{
unsigned long ra_kb = q->backing_dev_info.ra_pages <<
- (PAGE_CACHE_SHIFT - 10);
+ (PAGE_SHIFT - 10);
return queue_var_show(ra_kb, (page));
}
@@ -90,7 +90,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
if (ret < 0)
return ret;
- q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
+ q->backing_dev_info.ra_pages = ra_kb >> (PAGE_SHIFT - 10);
return ret;
}
@@ -117,7 +117,7 @@ static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
if (blk_queue_cluster(q))
return queue_var_show(queue_max_segment_size(q), (page));
- return queue_var_show(PAGE_CACHE_SIZE, (page));
+ return queue_var_show(PAGE_SIZE, (page));
}
static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
@@ -147,10 +147,9 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page)
{
- unsigned long long val;
- val = q->limits.max_hw_discard_sectors << 9;
- return sprintf(page, "%llu\n", val);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)q->limits.max_hw_discard_sectors << 9);
}
static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
@@ -199,12 +198,15 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long max_sectors_kb,
max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
- page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
+ page_kb = 1 << (PAGE_SHIFT - 10);
ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
if (ret < 0)
return ret;
+ max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long)
+ q->limits.max_dev_sectors >> 1);
+
if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
return -EINVAL;
@@ -317,6 +319,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
return ret;
}
+static ssize_t queue_poll_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
+}
+
+static ssize_t queue_poll_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ unsigned long poll_on;
+ ssize_t ret;
+
+ if (!q->mq_ops || !q->mq_ops->poll)
+ return -EINVAL;
+
+ ret = queue_var_store(&poll_on, page, count);
+ if (ret < 0)
+ return ret;
+
+ spin_lock_irq(q->queue_lock);
+ if (poll_on)
+ queue_flag_set(QUEUE_FLAG_POLL, q);
+ else
+ queue_flag_clear(QUEUE_FLAG_POLL, q);
+ spin_unlock_irq(q->queue_lock);
+
+ return ret;
+}
+
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
@@ -442,6 +472,12 @@ static struct queue_sysfs_entry queue_random_entry = {
.store = queue_store_random,
};
+static struct queue_sysfs_entry queue_poll_entry = {
+ .attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_poll_show,
+ .store = queue_poll_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -466,6 +502,7 @@ static struct attribute *default_attrs[] = {
&queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,
&queue_random_entry.attr,
+ &queue_poll_entry.attr,
NULL,
};
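
The queue_discard_max_hw_show() rewrite above is not just cosmetic: shifting the 32-bit sector count left by 9 before widening it can overflow, while casting to unsigned long long first keeps the full byte value. A small sketch of the difference (values chosen for illustration):

#include <stdio.h>

int main(void)
{
	unsigned int sectors = 1U << 28;	/* 2^28 sectors, i.e. 128 GiB of 512-byte sectors */

	/* Old style: the shift happens in 32 bits and wraps to 0. */
	unsigned long long wrong = sectors << 9;
	/* New style: widen first, then shift into bytes. */
	unsigned long long right = (unsigned long long)sectors << 9;

	printf("%llu %llu\n", wrong, right);	/* prints: 0 137438953472 */
	return 0;
}
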
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 246dfb16c3d9..a30441a200c0 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
}
}
-void blk_rq_timed_out_timer(unsigned long data)
+void blk_timeout_work(struct work_struct *work)
{
- struct request_queue *q = (struct request_queue *) data;
+ struct request_queue *q =
+ container_of(work, struct request_queue, timeout_work);
unsigned long flags, next = 0;
struct request *rq, *tmp;
int next_set = 0;
+ if (blk_queue_enter(q, true))
+ return;
spin_lock_irqsave(q->queue_lock, flags);
list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
@@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data)
mod_timer(&q->timeout, round_jiffies_up(next));
spin_unlock_irqrestore(q->queue_lock, flags);
+ blk_queue_exit(q);
}
/**
@@ -158,11 +162,13 @@ void blk_abort_request(struct request *req)
{
if (blk_mark_rq_complete(req))
return;
- blk_delete_timer(req);
- if (req->q->mq_ops)
+
+ if (req->q->mq_ops) {
blk_mq_rq_timed_out(req, false);
- else
+ } else {
+ blk_delete_timer(req);
blk_rq_timed_out(req);
+ }
}
EXPORT_SYMBOL_GPL(blk_abort_request);
@@ -184,15 +190,13 @@ unsigned long blk_rq_timeout(unsigned long timeout)
* Notes:
* Each request has its own timer, and as it is added to the queue, we
* set up the timer. When the request completes, we cancel the timer.
+ * Queue lock must be held for the non-mq case; the mq case doesn't care.
*/
void blk_add_timer(struct request *req)
{
struct request_queue *q = req->q;
unsigned long expiry;
- if (req->cmd_flags & REQ_NO_TIMEOUT)
- return;
-
/* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
if (!q->mq_ops && !q->rq_timed_out_fn)
return;
@@ -207,6 +211,11 @@ void blk_add_timer(struct request *req)
req->timeout = q->rq_timeout;
req->deadline = jiffies + req->timeout;
+
+ /*
+ * Only the non-mq case needs to add the request to a protected list.
+ * For the mq case we simply scan the tag map.
+ */
if (!q->mq_ops)
list_add_tail(&req->timeout_list, &req->q->timeout_list);
diff --git a/block/blk.h b/block/blk.h
index da722eb786df..70e4aee9cdcb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -72,8 +72,6 @@ void blk_dequeue_request(struct request *rq);
void __blk_queue_free_tags(struct request_queue *q);
bool __blk_end_bidi_request(struct request *rq, int error,
unsigned int nr_bytes, unsigned int bidi_bytes);
-int blk_queue_enter(struct request_queue *q, gfp_t gfp);
-void blk_queue_exit(struct request_queue *q);
void blk_freeze_queue(struct request_queue *q);
static inline void blk_queue_enter_live(struct request_queue *q)
@@ -95,7 +93,7 @@ static inline void blk_flush_integrity(void)
}
#endif
-void blk_rq_timed_out_timer(unsigned long data);
+void blk_timeout_work(struct work_struct *work);
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
void blk_delete_timer(struct request *);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1f9093e901da..4a349787bc62 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -632,6 +632,13 @@ static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
return pblkg ? blkg_to_cfqg(pblkg) : NULL;
}
+static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
+ struct cfq_group *ancestor)
+{
+ return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
+ cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
+}
+
static inline void cfqg_get(struct cfq_group *cfqg)
{
return blkg_get(cfqg_to_blkg(cfqg));
@@ -758,6 +765,11 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
#else /* CONFIG_CFQ_GROUP_IOSCHED */
static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
+static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
+ struct cfq_group *ancestor)
+{
+ return true;
+}
static inline void cfqg_get(struct cfq_group *cfqg) { }
static inline void cfqg_put(struct cfq_group *cfqg) { }
@@ -2897,6 +2909,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq = cfqd->active_queue;
+ struct cfq_rb_root *st = cfqq->service_tree;
struct cfq_io_cq *cic;
unsigned long sl, group_idle = 0;
@@ -2947,8 +2960,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
return;
}
- /* There are other queues in the group, don't do group idle */
- if (group_idle && cfqq->cfqg->nr_cfqq > 1)
+ /*
+ * There are other queues in the group, or this is the only group
+ * and its thinktime is too big; don't do group idle.
+ */
+ if (group_idle &&
+ (cfqq->cfqg->nr_cfqq > 1 ||
+ cfq_io_thinktime_big(cfqd, &st->ttime, true)))
return;
cfq_mark_cfqq_wait_request(cfqq);
@@ -3947,16 +3965,27 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
return true;
- if (new_cfqq->cfqg != cfqq->cfqg)
+ /*
+ * Treat ancestors of current cgroup the same way as current cgroup.
+ * For anybody else we disallow preemption to guarantee service
+ * fairness among cgroups.
+ */
+ if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
return false;
if (cfq_slice_used(cfqq))
return true;
+ /*
+ * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
+ */
+ if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
+ return true;
+
+ WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
/* Allow preemption only if we are idling on sync-noidle tree */
if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
- new_cfqq->service_tree->count == 2 &&
RB_EMPTY_ROOT(&cfqq->sort_list))
return true;
@@ -3967,12 +3996,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
return true;
- /*
- * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
- */
- if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
- return true;
-
/* An idle queue should not be idle now for some reason */
if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
return true;
@@ -4052,7 +4075,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
* idle timer unplug to continue working.
*/
if (cfq_cfqq_wait_request(cfqq)) {
- if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
+ if (blk_rq_bytes(rq) > PAGE_SIZE ||
cfqd->busy_queues > 1) {
cfq_del_timer(cfqd, cfqq);
cfq_clear_cfqq_wait_request(cfqq);
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index f678c733df40..556826ac7cb4 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -710,7 +710,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return -EINVAL;
bdi = blk_get_backing_dev_info(bdev);
return compat_put_long(arg,
- (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
+ (bdi->ra_pages * PAGE_SIZE) / 512);
case BLKROGET: /* compatible */
return compat_put_int(arg, bdev_read_only(bdev) != 0);
case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
@@ -729,7 +729,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
bdi = blk_get_backing_dev_info(bdev);
- bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
+ bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0;
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index a753df2b3fc2..d0dd7882d8c7 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -39,7 +39,6 @@ struct deadline_data {
*/
struct request *next_rq[2];
unsigned int batching; /* number of sequential requests made */
- sector_t last_sector; /* head position */
unsigned int starved; /* times reads have starved writes */
/*
@@ -210,8 +209,6 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
dd->next_rq[WRITE] = NULL;
dd->next_rq[data_dir] = deadline_latter_request(rq);
- dd->last_sector = rq_end_sector(rq);
-
/*
* take it off the sort and fifo list, move
* to dispatch queue
diff --git a/block/genhd.c b/block/genhd.c
index e5cafa51567c..9f42526b4d62 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -20,6 +20,7 @@
#include <linux/idr.h>
#include <linux/log2.h>
#include <linux/pm_runtime.h>
+#include <linux/badblocks.h>
#include "blk.h"
@@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk)
kobject_put(disk->part0.holder_dir);
kobject_put(disk->slave_dir);
- disk->driverfs_dev = NULL;
if (!sysfs_deprecated)
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
@@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk)
}
EXPORT_SYMBOL(del_gendisk);
+/* sysfs access to bad-blocks list. */
+static ssize_t disk_badblocks_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!disk->bb)
+ return sprintf(page, "\n");
+
+ return badblocks_show(disk->bb, page, 0);
+}
+
+static ssize_t disk_badblocks_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *page, size_t len)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!disk->bb)
+ return -ENXIO;
+
+ return badblocks_store(disk->bb, page, len, 0);
+}
+
/**
* get_gendisk - get partitioning information for a given device
* @devt: device to get partitioning information for
@@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
+ disk_badblocks_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_capability.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
+ &dev_attr_badblocks.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1421,7 +1449,7 @@ static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);
/* disable in-kernel polling by default */
-static unsigned long disk_events_dfl_poll_msecs = 0;
+static unsigned long disk_events_dfl_poll_msecs;
static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
{
diff --git a/block/ioctl.c b/block/ioctl.c
index 0918aed2d847..4ff1f92f89ca 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -4,6 +4,7 @@
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
+#include <linux/badblocks.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
@@ -406,6 +407,35 @@ static inline int is_unrecognized_ioctl(int ret)
ret == -ENOIOCTLCMD;
}
+#ifdef CONFIG_FS_DAX
+bool blkdev_dax_capable(struct block_device *bdev)
+{
+ struct gendisk *disk = bdev->bd_disk;
+
+ if (!disk->fops->direct_access)
+ return false;
+
+ /*
+ * If the partition is not aligned on a page boundary, we can't
+ * do dax I/O to it.
+ */
+ if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
+ || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+ return false;
+
+ /*
+ * If the device has known bad blocks, force all I/O through the
+ * driver / page cache.
+ *
+ * TODO: support finer grained dax error handling
+ */
+ if (disk->bb && disk->bb->count)
+ return false;
+
+ return true;
+}
+#endif
+
static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
@@ -520,7 +550,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
if (!arg)
return -EINVAL;
bdi = blk_get_backing_dev_info(bdev);
- return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
+ return put_long(arg, (bdi->ra_pages * PAGE_SIZE) / 512);
case BLKROGET:
return put_int(arg, bdev_read_only(bdev) != 0);
case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
@@ -548,7 +578,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
if(!capable(CAP_SYS_ADMIN))
return -EACCES;
bdi = blk_get_backing_dev_info(bdev);
- bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
+ bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0;
case BLKBSZSET:
return blkdev_bszset(bdev, mode, argp);
@@ -568,6 +598,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
+ case BLKDAXGET:
+ return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
+ break;
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, argp);
case IOC_PR_RESERVE:
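
The alignment test in blkdev_dax_capable() above only asks whether the partition starts and ends on a page boundary, expressed in 512-byte sectors. A standalone sketch of that check, assuming a 4096-byte page size for illustration:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL	/* assumed page size for this sketch */

/* A partition qualifies for DAX mappings only if both its start sector
 * and its length are whole pages (PAGE_SIZE / 512 sectors per page). */
static bool partition_page_aligned(unsigned long long start_sect,
				   unsigned long long nr_sects)
{
	unsigned long long sectors_per_page = PAGE_SIZE / 512;

	return (start_sect % sectors_per_page) == 0 &&
	       (nr_sects % sectors_per_page) == 0;
}

int main(void)
{
	printf("%d\n", partition_page_aligned(2048, 1048576));	/* 1: page aligned */
	printf("%d\n", partition_page_aligned(63, 1048576));	/* 0: legacy DOS offset */
	return 0;
}
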
diff --git a/block/ioprio.c b/block/ioprio.c
index 31666c92b46a..cc7800e9eb44 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -123,7 +123,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
break;
do_each_thread(g, p) {
- if (!uid_eq(task_uid(p), uid))
+ if (!uid_eq(task_uid(p), uid) ||
+ !task_pid_vnr(p))
continue;
ret = set_task_ioprio(p, ioprio);
if (ret)
@@ -220,7 +221,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
break;
do_each_thread(g, p) {
- if (!uid_eq(task_uid(p), user->uid))
+ if (!uid_eq(task_uid(p), user->uid) ||
+ !task_pid_vnr(p))
continue;
tmpio = get_task_ioprio(p);
if (tmpio < 0)
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 3de89d4690f3..a163c487cf38 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -21,10 +21,10 @@ static void noop_merged_requests(struct request_queue *q, struct request *rq,
static int noop_dispatch(struct request_queue *q, int force)
{
struct noop_data *nd = q->elevator->elevator_data;
+ struct request *rq;
- if (!list_empty(&nd->queue)) {
- struct request *rq;
- rq = list_entry(nd->queue.next, struct request, queuelist);
+ rq = list_first_entry_or_null(&nd->queue, struct request, queuelist);
+ if (rq) {
list_del_init(&rq->queuelist);
elv_dispatch_sort(q, rq);
return 1;
@@ -46,7 +46,7 @@ noop_former_request(struct request_queue *q, struct request *rq)
if (rq->queuelist.prev == &nd->queue)
return NULL;
- return list_entry(rq->queuelist.prev, struct request, queuelist);
+ return list_prev_entry(rq, queuelist);
}
static struct request *
@@ -56,7 +56,7 @@ noop_latter_request(struct request_queue *q, struct request *rq)
if (rq->queuelist.next == &nd->queue)
return NULL;
- return list_entry(rq->queuelist.next, struct request, queuelist);
+ return list_next_entry(rq, queuelist);
}
static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 3b030157ec85..d7eb77e1e3a8 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -16,6 +16,7 @@
#include <linux/kmod.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
+#include <linux/dax.h>
#include <linux/blktrace_api.h>
#include "partitions/check.h"
@@ -216,10 +217,21 @@ static void part_release(struct device *dev)
kfree(p);
}
+static int part_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct hd_struct *part = dev_to_part(dev);
+
+ add_uevent_var(env, "PARTN=%u", part->partno);
+ if (part->info && part->info->volname[0])
+ add_uevent_var(env, "PARTNAME=%s", part->info->volname);
+ return 0;
+}
+
struct device_type part_type = {
.name = "partition",
.groups = part_attr_groups,
.release = part_release,
+ .uevent = part_uevent,
};
static void delete_partition_rcu_cb(struct rcu_head *head)
@@ -349,15 +361,20 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
goto out_del;
}
+ err = hd_ref_init(p);
+ if (err) {
+ if (flags & ADDPART_FLAG_WHOLEDISK)
+ goto out_remove_file;
+ goto out_del;
+ }
+
/* everything is up and running, commence */
rcu_assign_pointer(ptbl->part[partno], p);
/* suppress uevent if the disk suppresses it */
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
-
- if (!hd_ref_init(p))
- return p;
+ return p;
out_free_info:
free_part_info(p);
@@ -366,6 +383,8 @@ out_free_stats:
out_free:
kfree(p);
return ERR_PTR(err);
+out_remove_file:
+ device_remove_file(pdev, &dev_attr_whole_disk);
out_del:
kobject_put(p->holder_dir);
device_del(pdev);
@@ -397,7 +416,7 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
struct hd_struct *part;
int res;
- if (bdev->bd_part_count)
+ if (bdev->bd_part_count || bdev->bd_super)
return -EBUSY;
res = invalidate_partition(disk, 0);
if (res)
@@ -550,20 +569,31 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
return 0;
}
-unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
+static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
- struct page *page;
- page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
+ return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)),
NULL);
+}
+
+unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
+{
+ struct page *page;
+
+ /* don't populate page cache for dax capable devices */
+ if (IS_DAX(bdev->bd_inode))
+ page = read_dax_sector(bdev, n);
+ else
+ page = read_pagecache_sector(bdev, n);
+
if (!IS_ERR(page)) {
if (PageError(page))
goto fail;
p->v = page;
- return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9);
+ return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << 9);
fail:
- page_cache_release(page);
+ put_page(page);
}
p->v = NULL;
return NULL;
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index c2c48ec64b27..621317ac4d59 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -32,7 +32,7 @@ int mac_partition(struct parsed_partitions *state)
Sector sect;
unsigned char *data;
int slot, blocks_in_map;
- unsigned secsize;
+ unsigned secsize, datasize, partoffset;
#ifdef CONFIG_PPC_PMAC
int found_root = 0;
int found_root_goodness = 0;
@@ -50,10 +50,14 @@ int mac_partition(struct parsed_partitions *state)
}
secsize = be16_to_cpu(md->block_size);
put_dev_sector(sect);
- data = read_part_sector(state, secsize/512, &sect);
+ datasize = round_down(secsize, 512);
+ data = read_part_sector(state, datasize / 512, &sect);
if (!data)
return -1;
- part = (struct mac_partition *) (data + secsize%512);
+ partoffset = secsize % 512;
+ if (partoffset + sizeof(*part) > datasize)
+ return -1;
+ part = (struct mac_partition *) (data + partoffset);
if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
put_dev_sector(sect);
return 0; /* not a MacOS disk */
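
The partitions/mac.c change above hardens the partition-map lookup against a corrupted block_size field: the old code read the sector at secsize/512 and then indexed secsize%512 into it, so a bogus secsize could place the entry outside the data that was actually read. A standalone sketch of the added bounds check (entry_size stands in for sizeof(struct mac_partition); the values are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define SECTOR_SIZE 512ULL

/* Mirror of the new check: round block_size down to a sector boundary,
 * take the entry's offset within the block, and reject the map if the
 * entry would extend past the rounded-down size. */
static bool entry_fits(unsigned long long secsize, unsigned long long entry_size)
{
	unsigned long long datasize = secsize - (secsize % SECTOR_SIZE);	/* round_down(secsize, 512) */
	unsigned long long partoffset = secsize % SECTOR_SIZE;

	return partoffset + entry_size <= datasize;
}

int main(void)
{
	printf("%d\n", entry_fits(512, 140));	/* 1: sane block_size */
	printf("%d\n", entry_fits(300, 140));	/* 0: entry would overrun */
	return 0;
}
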
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index dda653ce7b24..0774799942e0 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -444,7 +444,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
}
- rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
+ rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
goto error_free_buffer;
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
break;
}
- if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
+ if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) {
err = DRIVER_ERROR << 24;
goto error;
}
@@ -536,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
struct request *rq;
int err;
- rq = blk_get_request(q, WRITE, __GFP_WAIT);
+ rq = blk_get_request(q, WRITE, __GFP_RECLAIM);
if (IS_ERR(rq))
return PTR_ERR(rq);
blk_rq_set_block_pc(rq);