summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorMauro Carvalho Chehab <mchehab@s-opensource.com>2017-07-17 11:17:36 -0300
committerMauro Carvalho Chehab <mchehab@s-opensource.com>2017-07-17 11:17:36 -0300
commita3db9d60a118571e696b684a6e8c692a2b064941 (patch)
treeff7bae0f79b7a2ee0bce03de4f883550200c52a9 /mm
parent2748e76ddb2967c4030171342ebdd3faa6a5e8e8 (diff)
parent5771a8c08880cdca3bfb4a3fc6d309d6bba20877 (diff)
downloadlinux-a3db9d60a118571e696b684a6e8c692a2b064941.tar.gz
linux-a3db9d60a118571e696b684a6e8c692a2b064941.tar.xz
Merge tag 'v4.13-rc1' into patchwork
Linux v4.13-rc1 * tag 'v4.13-rc1': (11136 commits) Linux v4.13-rc1 random: reorder READ_ONCE() in get_random_uXX random: suppress spammy warnings about unseeded randomness replace incorrect strscpy use in FORTIFY_SOURCE kmod: throttle kmod thread limit kmod: add test driver to stress test the module loader MAINTAINERS: give kmod some maintainer love xtensa: use generic fb.h fault-inject: add /proc/<pid>/fail-nth fault-inject: simplify access check for fail-nth fault-inject: make fail-nth read/write interface symmetric fault-inject: parse as natural 1-based value for fail-nth write interface fault-inject: automatically detect the number base for fail-nth write interface kernel/watchdog.c: use better pr_fmt prefix MAINTAINERS: move the befs tree to kernel.org lib/atomic64_test.c: add a test that atomic64_inc_not_zero() returns an int mm: fix overflow check in expand_upwards() ubifs: Set double hash cookie also for RENAME_EXCHANGE ubifs: Massage assert in ubifs_xattr_set() wrt. init_xattrs ubifs: Don't leak kernel memory to the MTD ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig55
-rw-r--r--mm/Makefile1
-rw-r--r--mm/balloon_compaction.c2
-rw-r--r--mm/cleancache.c2
-rw-r--r--mm/cma.c20
-rw-r--r--mm/compaction.c5
-rw-r--r--mm/filemap.c212
-rw-r--r--mm/gup.c211
-rw-r--r--mm/huge_memory.c31
-rw-r--r--mm/hugetlb.c360
-rw-r--r--mm/internal.h2
-rw-r--r--mm/kasan/kasan.c152
-rw-r--r--mm/kasan/kasan_init.c12
-rw-r--r--mm/kasan/report.c2
-rw-r--r--mm/khugepaged.c4
-rw-r--r--mm/kmemleak.c136
-rw-r--r--mm/ksm.c820
-rw-r--r--mm/list_lru.c14
-rw-r--r--mm/madvise.c46
-rw-r--r--mm/memblock.c3
-rw-r--r--mm/memcontrol.c143
-rw-r--r--mm/memory-failure.c331
-rw-r--r--mm/memory.c12
-rw-r--r--mm/memory_hotplug.c661
-rw-r--r--mm/mempolicy.c184
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/migrate.c38
-rw-r--r--mm/mmap.c44
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/nobootmem.c2
-rw-r--r--mm/oom_kill.c12
-rw-r--r--mm/page-writeback.c44
-rw-r--r--mm/page_alloc.c214
-rw-r--r--mm/page_io.c27
-rw-r--r--mm/page_isolation.c44
-rw-r--r--mm/page_owner.c6
-rw-r--r--mm/page_vma_mapped.c3
-rw-r--r--mm/pagewalk.c3
-rw-r--r--mm/percpu-internal.h166
-rw-r--r--mm/percpu-km.c11
-rw-r--r--mm/percpu-stats.c222
-rw-r--r--mm/percpu-vm.c12
-rw-r--r--mm/percpu.c85
-rw-r--r--mm/rmap.c31
-rw-r--r--mm/shmem.c23
-rw-r--r--mm/slab.c20
-rw-r--r--mm/slab.h18
-rw-r--r--mm/slab_common.c5
-rw-r--r--mm/slub.c153
-rw-r--r--mm/sparse-vmemmap.c4
-rw-r--r--mm/sparse.c104
-rw-r--r--mm/swap.c12
-rw-r--r--mm/swap_cgroup.c40
-rw-r--r--mm/swap_slots.c21
-rw-r--r--mm/swap_state.c108
-rw-r--r--mm/swapfile.c291
-rw-r--r--mm/truncate.c10
-rw-r--r--mm/util.c38
-rw-r--r--mm/vmalloc.c34
-rw-r--r--mm/vmpressure.c122
-rw-r--r--mm/vmscan.c100
-rw-r--r--mm/vmstat.c50
-rw-r--r--mm/workingset.c9
-rw-r--r--mm/zsmalloc.c54
-rw-r--r--mm/zswap.c11
65 files changed, 3577 insertions, 2039 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index beb7a455915d..48b1af447fa7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
bool
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
bool
config ARCH_DISCARD_MEMBLOCK
@@ -149,32 +149,6 @@ config NO_BOOTMEM
config MEMORY_ISOLATION
bool
-config MOVABLE_NODE
- bool "Enable to assign a node which has only movable memory"
- depends on HAVE_MEMBLOCK
- depends on NO_BOOTMEM
- depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
- depends on NUMA
- default n
- help
- Allow a node to have only movable memory. Pages used by the kernel,
- such as direct mapping pages cannot be migrated. So the corresponding
- memory device cannot be hotplugged. This option allows the following
- two things:
- - When the system is booting, node full of hotpluggable memory can
- be arranged to have only movable memory so that the whole node can
- be hot-removed. (need movable_node boot option specified).
- - After the system is up, the option allows users to online all the
- memory of a node as movable memory so that the whole node can be
- hot-removed.
-
- Users who don't use the memory hotplug feature are fine with this
- option on since they don't specify movable_node boot option or they
- don't online memory as movable.
-
- Say Y here if you want to hotplug a whole node.
- Say N here if you want kernel to use memory on all nodes evenly.
-
#
# Only be set on architectures that have completely implemented memory hotplug
# feature. If you are not sure, don't touch it.
@@ -187,7 +161,6 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
- depends on COMPILE_TEST || !KASAN
config MEMORY_HOTPLUG_SPARSE
def_bool y
@@ -446,6 +419,18 @@ choice
benefit.
endchoice
+config ARCH_WANTS_THP_SWAP
+ def_bool n
+
+config THP_SWAP
+ def_bool y
+ depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
+ help
+ Swap transparent huge pages in one piece, without splitting.
+ XXX: For now this only does clustered swap space allocation.
+
+ For selection by architectures with reasonable THP sizes.
+
config TRANSPARENT_HUGE_PAGECACHE
def_bool y
depends on TRANSPARENT_HUGEPAGE
@@ -683,12 +668,16 @@ config IDLE_PAGE_TRACKING
See Documentation/vm/idle_page_tracking.txt for more details.
+# arch_add_memory() comprehends device memory
+config ARCH_HAS_ZONE_DEVICE
+ bool
+
config ZONE_DEVICE
bool "Device memory (pmem, etc...) hotplug support"
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
- depends on X86_64 #arch_add_memory() comprehends device memory
+ depends on ARCH_HAS_ZONE_DEVICE
help
Device memory hotplug support allows for establishing pmem,
@@ -706,3 +695,11 @@ config ARCH_USES_HIGH_VMA_FLAGS
bool
config ARCH_HAS_PKEYS
bool
+
+config PERCPU_STATS
+ bool "Collect percpu memory statistics"
+ default n
+ help
+ This feature collects and exposes statistics via debugfs. The
+ information includes global and per chunk statistics, which can
+ be used to help understand percpu memory usage.
diff --git a/mm/Makefile b/mm/Makefile
index 026f6a828a50..411bd24d4a7c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -103,3 +103,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
+obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index da91df50ba31..9075aa54e955 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -24,7 +24,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
{
unsigned long flags;
struct page *page = alloc_page(balloon_mapping_gfp_mask() |
- __GFP_NOMEMALLOC | __GFP_NORETRY);
+ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO);
if (!page)
return NULL;
diff --git a/mm/cleancache.c b/mm/cleancache.c
index ba5d8f3e6d68..f7b9fdc79d97 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -130,7 +130,7 @@ void __cleancache_init_shared_fs(struct super_block *sb)
int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
if (cleancache_ops) {
- pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
+ pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE);
if (pool_id < 0)
pool_id = CLEANCACHE_NO_POOL;
}
diff --git a/mm/cma.c b/mm/cma.c
index 978b4a1441ef..c0da318c020e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -59,7 +59,7 @@ const char *cma_get_name(const struct cma *cma)
}
static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
- int align_order)
+ unsigned int align_order)
{
if (align_order <= cma->order_per_bit)
return 0;
@@ -67,17 +67,14 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
}
/*
- * Find a PFN aligned to the specified order and return an offset represented in
- * order_per_bits.
+ * Find the offset of the base PFN from the specified align_order.
+ * The value returned is represented in order_per_bits.
*/
static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
- int align_order)
+ unsigned int align_order)
{
- if (align_order <= cma->order_per_bit)
- return 0;
-
- return (ALIGN(cma->base_pfn, (1UL << align_order))
- - cma->base_pfn) >> cma->order_per_bit;
+ return (cma->base_pfn & ((1UL << align_order) - 1))
+ >> cma->order_per_bit;
}
static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
@@ -127,7 +124,7 @@ static int __init cma_activate_area(struct cma *cma)
* to be in the same zone.
*/
if (page_zone(pfn_to_page(pfn)) != zone)
- goto err;
+ goto not_in_zone;
}
init_cma_reserved_pageblock(pfn_to_page(base_pfn));
} while (--i);
@@ -141,7 +138,8 @@ static int __init cma_activate_area(struct cma *cma)
return 0;
-err:
+not_in_zone:
+ pr_err("CMA area %s could not be activated\n", cma->name);
kfree(cma->bitmap);
cma->count = 0;
return -EINVAL;
diff --git a/mm/compaction.c b/mm/compaction.c
index 613c59e928cb..fb548e4c7bd4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -236,10 +236,9 @@ static void __reset_isolation_suitable(struct zone *zone)
cond_resched();
- if (!pfn_valid(pfn))
+ page = pfn_to_online_page(pfn);
+ if (!page)
continue;
-
- page = pfn_to_page(pfn);
if (zone != page_zone(page))
continue;
diff --git a/mm/filemap.c b/mm/filemap.c
index 6f1be573a5e6..a49702445ce0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -239,14 +239,16 @@ void __delete_from_page_cache(struct page *page, void *shadow)
/* Leave page->index set: truncation lookup relies upon it */
/* hugetlb pages do not participate in page cache accounting. */
- if (!PageHuge(page))
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+ if (PageHuge(page))
+ return;
+
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) {
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
} else {
- VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page);
+ VM_BUG_ON_PAGE(PageTransHuge(page), page);
}
/*
@@ -309,6 +311,16 @@ int filemap_check_errors(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_check_errors);
+static int filemap_check_and_keep_errors(struct address_space *mapping)
+{
+ /* Check for outstanding write errors */
+ if (test_bit(AS_EIO, &mapping->flags))
+ return -EIO;
+ if (test_bit(AS_ENOSPC, &mapping->flags))
+ return -ENOSPC;
+ return 0;
+}
+
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
@@ -376,17 +388,48 @@ int filemap_flush(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_flush);
-static int __filemap_fdatawait_range(struct address_space *mapping,
+/**
+ * filemap_range_has_page - check if a page exists in range.
+ * @mapping: address space within which to check
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback.
+ */
+bool filemap_range_has_page(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ pgoff_t index = start_byte >> PAGE_SHIFT;
+ pgoff_t end = end_byte >> PAGE_SHIFT;
+ struct pagevec pvec;
+ bool ret;
+
+ if (end_byte < start_byte)
+ return false;
+
+ if (mapping->nrpages == 0)
+ return false;
+
+ pagevec_init(&pvec, 0);
+ if (!pagevec_lookup(&pvec, mapping, index, 1))
+ return false;
+ ret = (pvec.pages[0]->index <= end);
+ pagevec_release(&pvec);
+ return ret;
+}
+EXPORT_SYMBOL(filemap_range_has_page);
+
+static void __filemap_fdatawait_range(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
pgoff_t index = start_byte >> PAGE_SHIFT;
pgoff_t end = end_byte >> PAGE_SHIFT;
struct pagevec pvec;
int nr_pages;
- int ret = 0;
if (end_byte < start_byte)
- goto out;
+ return;
pagevec_init(&pvec, 0);
while ((index <= end) &&
@@ -403,14 +446,11 @@ static int __filemap_fdatawait_range(struct address_space *mapping,
continue;
wait_on_page_writeback(page);
- if (TestClearPageError(page))
- ret = -EIO;
+ ClearPageError(page);
}
pagevec_release(&pvec);
cond_resched();
}
-out:
- return ret;
}
/**
@@ -430,14 +470,8 @@ out:
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
loff_t end_byte)
{
- int ret, ret2;
-
- ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
- ret2 = filemap_check_errors(mapping);
- if (!ret)
- ret = ret2;
-
- return ret;
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);
@@ -453,15 +487,17 @@ EXPORT_SYMBOL(filemap_fdatawait_range);
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
* fsfreeze(8)
*/
-void filemap_fdatawait_keep_errors(struct address_space *mapping)
+int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
loff_t i_size = i_size_read(mapping->host);
if (i_size == 0)
- return;
+ return 0;
__filemap_fdatawait_range(mapping, 0, i_size - 1);
+ return filemap_check_and_keep_errors(mapping);
}
+EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
/**
* filemap_fdatawait - wait for all under-writeback pages to complete
@@ -503,6 +539,9 @@ int filemap_write_and_wait(struct address_space *mapping)
int err2 = filemap_fdatawait(mapping);
if (!err)
err = err2;
+ } else {
+ /* Clear any previously stored errors */
+ filemap_check_errors(mapping);
}
} else {
err = filemap_check_errors(mapping);
@@ -537,6 +576,9 @@ int filemap_write_and_wait_range(struct address_space *mapping,
lstart, lend);
if (!err)
err = err2;
+ } else {
+ /* Clear any previously stored errors */
+ filemap_check_errors(mapping);
}
} else {
err = filemap_check_errors(mapping);
@@ -545,6 +587,90 @@ int filemap_write_and_wait_range(struct address_space *mapping,
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
+void __filemap_set_wb_err(struct address_space *mapping, int err)
+{
+ errseq_t eseq = __errseq_set(&mapping->wb_err, err);
+
+ trace_filemap_set_wb_err(mapping, eseq);
+}
+EXPORT_SYMBOL(__filemap_set_wb_err);
+
+/**
+ * file_check_and_advance_wb_err - report wb error (if any) that was previously
+ * and advance wb_err to current one
+ * @file: struct file on which the error is being reported
+ *
+ * When userland calls fsync (or something like nfsd does the equivalent), we
+ * want to report any writeback errors that occurred since the last fsync (or
+ * since the file was opened if there haven't been any).
+ *
+ * Grab the wb_err from the mapping. If it matches what we have in the file,
+ * then just quickly return 0. The file is all caught up.
+ *
+ * If it doesn't match, then take the mapping value, set the "seen" flag in
+ * it and try to swap it into place. If it works, or another task beat us
+ * to it with the new value, then update the f_wb_err and return the error
+ * portion. The error at this point must be reported via proper channels
+ * (a'la fsync, or NFS COMMIT operation, etc.).
+ *
+ * While we handle mapping->wb_err with atomic operations, the f_wb_err
+ * value is protected by the f_lock since we must ensure that it reflects
+ * the latest value swapped in for this file descriptor.
+ */
+int file_check_and_advance_wb_err(struct file *file)
+{
+ int err = 0;
+ errseq_t old = READ_ONCE(file->f_wb_err);
+ struct address_space *mapping = file->f_mapping;
+
+ /* Locklessly handle the common case where nothing has changed */
+ if (errseq_check(&mapping->wb_err, old)) {
+ /* Something changed, must use slow path */
+ spin_lock(&file->f_lock);
+ old = file->f_wb_err;
+ err = errseq_check_and_advance(&mapping->wb_err,
+ &file->f_wb_err);
+ trace_file_check_and_advance_wb_err(file, old);
+ spin_unlock(&file->f_lock);
+ }
+ return err;
+}
+EXPORT_SYMBOL(file_check_and_advance_wb_err);
+
+/**
+ * file_write_and_wait_range - write out & wait on a file range
+ * @file: file pointing to address_space with pages
+ * @lstart: offset in bytes where the range starts
+ * @lend: offset in bytes where the range ends (inclusive)
+ *
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that @lend is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (end = -1).
+ *
+ * After writing out and waiting on the data, we check and advance the
+ * f_wb_err cursor to the latest value, and return any errors detected there.
+ */
+int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
+{
+ int err = 0, err2;
+ struct address_space *mapping = file->f_mapping;
+
+ if ((!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional)) {
+ err = __filemap_fdatawrite_range(mapping, lstart, lend,
+ WB_SYNC_ALL);
+ /* See comment of filemap_write_and_wait() */
+ if (err != -EIO)
+ __filemap_fdatawait_range(mapping, lstart, lend);
+ }
+ err2 = file_check_and_advance_wb_err(file);
+ if (!err)
+ err = err2;
+ return err;
+}
+EXPORT_SYMBOL(file_write_and_wait_range);
+
/**
* replace_page_cache_page - replace a pagecache page with a new one
* @old: page to be replaced
@@ -768,10 +894,10 @@ struct wait_page_key {
struct wait_page_queue {
struct page *page;
int bit_nr;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
};
-static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
@@ -834,7 +960,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, bool lock)
{
struct wait_page_queue wait_page;
- wait_queue_t *wait = &wait_page.wait;
+ wait_queue_entry_t *wait = &wait_page.wait;
int ret = 0;
init_wait(wait);
@@ -845,9 +971,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
for (;;) {
spin_lock_irq(&q->lock);
- if (likely(list_empty(&wait->task_list))) {
+ if (likely(list_empty(&wait->entry))) {
if (lock)
- __add_wait_queue_tail_exclusive(q, wait);
+ __add_wait_queue_entry_tail_exclusive(q, wait);
else
__add_wait_queue(q, wait);
SetPageWaiters(page);
@@ -907,7 +1033,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
*
* Add an arbitrary @waiter to the wait queue for the nominated @page.
*/
-void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
wait_queue_head_t *q = page_waitqueue(page);
unsigned long flags;
@@ -2038,10 +2164,17 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t size;
size = i_size_read(inode);
- retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (retval < 0)
- goto out;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_has_page(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1))
+ return -EAGAIN;
+ } else {
+ retval = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (retval < 0)
+ goto out;
+ }
file_accessed(file);
@@ -2226,7 +2359,7 @@ int filemap_fault(struct vm_fault *vmf)
/* No page in the page cache at all */
do_sync_mmap_readahead(vmf->vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
+ count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
page = find_get_page(mapping, offset);
@@ -2642,6 +2775,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
pos = iocb->ki_pos;
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
+
if (limit != RLIM_INFINITY) {
if (iocb->ki_pos >= limit) {
send_sig(SIGXFSZ, current, 0);
@@ -2710,9 +2846,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
- written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
- if (written)
- goto out;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* If there are pages to writeback, return */
+ if (filemap_range_has_page(inode->i_mapping, pos,
+ pos + iov_iter_count(from)))
+ return -EAGAIN;
+ } else {
+ written = filemap_write_and_wait_range(mapping, pos,
+ pos + write_len - 1);
+ if (written)
+ goto out;
+ }
/*
* After a write we want buffered reads to be sure to go to disk to get
diff --git a/mm/gup.c b/mm/gup.c
index 576c4df58882..23f01c40c88f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -208,72 +208,28 @@ no_page:
return no_page_table(vma, flags);
}
-/**
- * follow_page_mask - look up a page descriptor from a user-virtual address
- * @vma: vm_area_struct mapping @address
- * @address: virtual address to look up
- * @flags: flags modifying lookup behaviour
- * @page_mask: on output, *page_mask is set according to the size of the page
- *
- * @flags can have FOLL_ flags set, defined in <linux/mm.h>
- *
- * Returns the mapped (struct page *), %NULL if no mapping exists, or
- * an error pointer if there is a mapping to something not represented
- * by a page descriptor (see also vm_normal_page()).
- */
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask)
+static struct page *follow_pmd_mask(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp,
+ unsigned int flags, unsigned int *page_mask)
{
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
pmd_t *pmd;
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
- *page_mask = 0;
-
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- BUG_ON(flags & FOLL_GET);
- return page;
- }
-
- pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- return no_page_table(vma, flags);
- p4d = p4d_offset(pgd, address);
- if (p4d_none(*p4d))
- return no_page_table(vma, flags);
- BUILD_BUG_ON(p4d_huge(*p4d));
- if (unlikely(p4d_bad(*p4d)))
- return no_page_table(vma, flags);
- pud = pud_offset(p4d, address);
- if (pud_none(*pud))
+ pmd = pmd_offset(pudp, address);
+ if (pmd_none(*pmd))
return no_page_table(vma, flags);
- if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pud(mm, address, pud, flags);
+ if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
+ page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
return no_page_table(vma, flags);
}
- if (pud_devmap(*pud)) {
- ptl = pud_lock(mm, pud);
- page = follow_devmap_pud(vma, address, pud, flags);
- spin_unlock(ptl);
- if (page)
- return page;
- }
- if (unlikely(pud_bad(*pud)))
- return no_page_table(vma, flags);
-
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
- return no_page_table(vma, flags);
- if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pmd(mm, address, pmd, flags);
+ if (is_hugepd(__hugepd(pmd_val(*pmd)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(pmd_val(*pmd)), flags,
+ PMD_SHIFT);
if (page)
return page;
return no_page_table(vma, flags);
@@ -319,13 +275,131 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
return ret ? ERR_PTR(ret) :
follow_page_pte(vma, address, pmd, flags);
}
-
page = follow_trans_huge_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
*page_mask = HPAGE_PMD_NR - 1;
return page;
}
+
+static struct page *follow_pud_mask(struct vm_area_struct *vma,
+ unsigned long address, p4d_t *p4dp,
+ unsigned int flags, unsigned int *page_mask)
+{
+ pud_t *pud;
+ spinlock_t *ptl;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ pud = pud_offset(p4dp, address);
+ if (pud_none(*pud))
+ return no_page_table(vma, flags);
+ if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
+ page = follow_huge_pud(mm, address, pud, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ if (is_hugepd(__hugepd(pud_val(*pud)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(pud_val(*pud)), flags,
+ PUD_SHIFT);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ if (pud_devmap(*pud)) {
+ ptl = pud_lock(mm, pud);
+ page = follow_devmap_pud(vma, address, pud, flags);
+ spin_unlock(ptl);
+ if (page)
+ return page;
+ }
+ if (unlikely(pud_bad(*pud)))
+ return no_page_table(vma, flags);
+
+ return follow_pmd_mask(vma, address, pud, flags, page_mask);
+}
+
+
+static struct page *follow_p4d_mask(struct vm_area_struct *vma,
+ unsigned long address, pgd_t *pgdp,
+ unsigned int flags, unsigned int *page_mask)
+{
+ p4d_t *p4d;
+ struct page *page;
+
+ p4d = p4d_offset(pgdp, address);
+ if (p4d_none(*p4d))
+ return no_page_table(vma, flags);
+ BUILD_BUG_ON(p4d_huge(*p4d));
+ if (unlikely(p4d_bad(*p4d)))
+ return no_page_table(vma, flags);
+
+ if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(p4d_val(*p4d)), flags,
+ P4D_SHIFT);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ return follow_pud_mask(vma, address, p4d, flags, page_mask);
+}
+
+/**
+ * follow_page_mask - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+ * @address: virtual address to look up
+ * @flags: flags modifying lookup behaviour
+ * @page_mask: on output, *page_mask is set according to the size of the page
+ *
+ * @flags can have FOLL_ flags set, defined in <linux/mm.h>
+ *
+ * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * an error pointer if there is a mapping to something not represented
+ * by a page descriptor (see also vm_normal_page()).
+ */
+struct page *follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned int *page_mask)
+{
+ pgd_t *pgd;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ *page_mask = 0;
+
+ /* make this handle hugepd */
+ page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+ if (!IS_ERR(page)) {
+ BUG_ON(flags & FOLL_GET);
+ return page;
+ }
+
+ pgd = pgd_offset(mm, address);
+
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ return no_page_table(vma, flags);
+
+ if (pgd_huge(*pgd)) {
+ page = follow_huge_pgd(mm, address, pgd, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(pgd_val(*pgd)), flags,
+ PGDIR_SHIFT);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+
+ return follow_p4d_mask(vma, address, pgd, flags, page_mask);
+}
+
static int get_gate_page(struct mm_struct *mm, unsigned long address,
unsigned int gup_flags, struct vm_area_struct **vma,
struct page **page)
@@ -1146,7 +1220,7 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */
/*
- * Generic RCU Fast GUP
+ * Generic Fast GUP
*
* get_user_pages_fast attempts to pin user pages by walking the page
* tables directly and avoids taking locks. Thus the walker needs to be
@@ -1167,8 +1241,8 @@ struct page *get_dump_page(unsigned long addr)
* Before activating this code, please be aware that the following assumptions
* are currently made:
*
- * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
- * pages containing page tables.
+ * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ * free pages containing page tables or TLB flushing requires IPI broadcast.
*
* *) ptes can be read atomically by the architecture.
*
@@ -1178,7 +1252,7 @@ struct page *get_dump_page(unsigned long addr)
*
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
-#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+#ifdef CONFIG_HAVE_GENERIC_GUP
#ifndef gup_get_pte
/*
@@ -1349,16 +1423,15 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return __gup_device_huge_pmd(orig, addr, end, pages, nr);
refs = 0;
- head = pmd_page(orig);
- page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
+ head = compound_head(pmd_page(orig));
if (!page_cache_add_speculative(head, refs)) {
*nr -= refs;
return 0;
@@ -1388,16 +1461,15 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return __gup_device_huge_pud(orig, addr, end, pages, nr);
refs = 0;
- head = pud_page(orig);
- page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
+ head = compound_head(pud_page(orig));
if (!page_cache_add_speculative(head, refs)) {
*nr -= refs;
return 0;
@@ -1426,16 +1498,15 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
BUILD_BUG_ON(pgd_devmap(orig));
refs = 0;
- head = pgd_page(orig);
- page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
+ page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
+ head = compound_head(pgd_page(orig));
if (!page_cache_add_speculative(head, refs)) {
*nr -= refs;
return 0;
@@ -1668,4 +1739,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
return ret;
}
-#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
+#endif /* CONFIG_HAVE_GENERIC_GUP */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 88c6167f194d..86975dec0ba1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1575,8 +1575,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
get_page(page);
spin_unlock(ptl);
split_huge_page(page);
- put_page(page);
unlock_page(page);
+ put_page(page);
goto out_unlocked;
}
@@ -2203,7 +2203,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
* atomic_set() here would be safe on all archs (and not only on x86),
* it's safer to use atomic_inc()/atomic_add().
*/
- if (PageAnon(head)) {
+ if (PageAnon(head) && !PageSwapCache(head)) {
page_ref_inc(page_tail);
} else {
/* Additional pin to radix tree */
@@ -2214,6 +2214,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
page_tail->flags |= (head->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
+ (1L << PG_swapcache) |
(1L << PG_mlocked) |
(1L << PG_uptodate) |
(1L << PG_active) |
@@ -2276,7 +2277,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageCompound(head);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
- page_ref_inc(head);
+ /* Additional pin to radix tree of swap cache */
+ if (PageSwapCache(head))
+ page_ref_add(head, 2);
+ else
+ page_ref_inc(head);
} else {
/* Additional pin to radix tree */
page_ref_add(head, 2);
@@ -2385,6 +2390,21 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
return ret;
}
+/* Racy check whether the huge page can be split */
+bool can_split_huge_page(struct page *page, int *pextra_pins)
+{
+ int extra_pins;
+
+ /* Additional pins from radix tree */
+ if (PageAnon(page))
+ extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
+ else
+ extra_pins = HPAGE_PMD_NR;
+ if (pextra_pins)
+ *pextra_pins = extra_pins;
+ return total_mapcount(page) == page_count(page) - extra_pins - 1;
+}
+
/*
* This function splits huge page into normal pages. @page can point to any
* subpage of huge page to split. Split doesn't change the position of @page.
@@ -2432,7 +2452,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
ret = -EBUSY;
goto out;
}
- extra_pins = 0;
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
@@ -2444,8 +2463,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out;
}
- /* Addidional pins from radix tree */
- extra_pins = HPAGE_PMD_NR;
anon_vma = NULL;
i_mmap_lock_read(mapping);
}
@@ -2454,7 +2471,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* Racy check if we can split the page, before freeze_page() will
* split PMDs
*/
- if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
+ if (!can_split_huge_page(head, &extra_pins)) {
ret = -EBUSY;
goto out_unlock;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3eedb187e549..bc48ee783dd9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -20,9 +20,9 @@
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
+#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
-#include <linux/page-isolation.h>
#include <linux/jhash.h>
#include <asm/page.h>
@@ -867,12 +867,12 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++;
}
-static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
- if (!is_migrate_isolate_page(page))
+ if (!PageHWPoison(page))
break;
/*
* if 'non-isolated free hugepage' not found on the list,
@@ -887,6 +887,42 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
return page;
}
+static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
+ nodemask_t *nmask)
+{
+ unsigned int cpuset_mems_cookie;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+ int node = -1;
+
+ zonelist = node_zonelist(nid, gfp_mask);
+
+retry_cpuset:
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
+ struct page *page;
+
+ if (!cpuset_zone_allowed(zone, gfp_mask))
+ continue;
+ /*
+ * no need to ask again on the same node. Pool is node rather than
+ * zone aware
+ */
+ if (zone_to_nid(zone) == node)
+ continue;
+ node = zone_to_nid(zone);
+
+ page = dequeue_huge_page_node_exact(h, node);
+ if (page)
+ return page;
+ }
+ if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
+ return NULL;
+}
+
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
@@ -901,13 +937,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
unsigned long address, int avoid_reserve,
long chg)
{
- struct page *page = NULL;
+ struct page *page;
struct mempolicy *mpol;
+ gfp_t gfp_mask;
nodemask_t *nodemask;
- struct zonelist *zonelist;
- struct zone *zone;
- struct zoneref *z;
- unsigned int cpuset_mems_cookie;
+ int nid;
/*
* A child process with MAP_PRIVATE mappings created by their parent
@@ -922,31 +956,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
-retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = huge_zonelist(vma, address,
- htlb_alloc_mask(h), &mpol, &nodemask);
-
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- MAX_NR_ZONES - 1, nodemask) {
- if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
- page = dequeue_huge_page_node(h, zone_to_nid(zone));
- if (page) {
- if (avoid_reserve)
- break;
- if (!vma_has_reserves(vma, chg))
- break;
-
- SetPagePrivate(page);
- h->resv_huge_pages--;
- break;
- }
- }
+ gfp_mask = htlb_alloc_mask(h);
+ nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+ if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
+ SetPagePrivate(page);
+ h->resv_huge_pages--;
}
mpol_cond_put(mpol);
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
- goto retry_cpuset;
return page;
err:
@@ -1024,9 +1042,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \
- ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
- defined(CONFIG_CMA))
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{
@@ -1158,8 +1174,7 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
return 0;
}
-static inline bool gigantic_page_supported(void) { return true; }
-#else
+#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static inline bool gigantic_page_supported(void) { return false; }
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
@@ -1369,7 +1384,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
page = __alloc_pages_node(nid,
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
- __GFP_REPEAT|__GFP_NOWARN,
+ __GFP_RETRY_MAYFAIL|__GFP_NOWARN,
huge_page_order(h));
if (page) {
prep_new_huge_page(h, page, nid);
@@ -1444,7 +1459,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
* number of free hugepages would be reduced below the number of reserved
* hugepages.
*/
-static int dissolve_free_huge_page(struct page *page)
+int dissolve_free_huge_page(struct page *page)
{
int rc = 0;
@@ -1457,6 +1472,14 @@ static int dissolve_free_huge_page(struct page *page)
rc = -EBUSY;
goto out;
}
+ /*
+ * Move PageHWPoison flag from head page to the raw error page,
+ * which makes any subpages rather than the error page reusable.
+ */
+ if (PageHWPoison(head) && page != head) {
+ SetPageHWPoison(page);
+ ClearPageHWPoison(head);
+ }
list_del(&head->lru);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
@@ -1497,82 +1520,19 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
return rc;
}
-/*
- * There are 3 ways this can get called:
- * 1. With vma+addr: we use the VMA's memory policy
- * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
- * page from any node, and let the buddy allocator itself figure
- * it out.
- * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page
- * strictly from 'nid'
- */
static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr, int nid)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
int order = huge_page_order(h);
- gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
- unsigned int cpuset_mems_cookie;
- /*
- * We need a VMA to get a memory policy. If we do not
- * have one, we use the 'nid' argument.
- *
- * The mempolicy stuff below has some non-inlined bits
- * and calls ->vm_ops. That makes it hard to optimize at
- * compile-time, even when NUMA is off and it does
- * nothing. This helps the compiler optimize it out.
- */
- if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
- /*
- * If a specific node is requested, make sure to
- * get memory from there, but only when a node
- * is explicitly specified.
- */
- if (nid != NUMA_NO_NODE)
- gfp |= __GFP_THISNODE;
- /*
- * Make sure to call something that can handle
- * nid=NUMA_NO_NODE
- */
- return alloc_pages_node(nid, gfp, order);
- }
-
- /*
- * OK, so we have a VMA. Fetch the mempolicy and try to
- * allocate a huge page with it. We will only reach this
- * when CONFIG_NUMA=y.
- */
- do {
- struct page *page;
- struct mempolicy *mpol;
- struct zonelist *zl;
- nodemask_t *nodemask;
-
- cpuset_mems_cookie = read_mems_allowed_begin();
- zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
- mpol_cond_put(mpol);
- page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
- if (page)
- return page;
- } while (read_mems_allowed_retry(cpuset_mems_cookie));
-
- return NULL;
+ gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+ return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
}
-/*
- * There are two ways to allocate a huge page:
- * 1. When you have a VMA and an address (like a fault)
- * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
- *
- * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in
- * this case which signifies that the allocation should be done with
- * respect for the VMA's memory policy.
- *
- * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
- * implies that memory policies will not be taken in to account.
- */
-static struct page *__alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr, int nid)
+static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
struct page *page;
unsigned int r_nid;
@@ -1581,15 +1541,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
return NULL;
/*
- * Make sure that anyone specifying 'nid' is not also specifying a VMA.
- * This makes sure the caller is picking _one_ of the modes with which
- * we can call this function, not both.
- */
- if (vma || (addr != -1)) {
- VM_WARN_ON_ONCE(addr == -1);
- VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
- }
- /*
* Assume we will successfully allocate the surplus page to
* prevent racing processes from causing the surplus to exceed
* overcommit
@@ -1622,7 +1573,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
}
spin_unlock(&hugetlb_lock);
- page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
+ page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
spin_lock(&hugetlb_lock);
if (page) {
@@ -1647,26 +1598,23 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
}
/*
- * Allocate a huge page from 'nid'. Note, 'nid' may be
- * NUMA_NO_NODE, which means that it may be allocated
- * anywhere.
- */
-static
-struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
-{
- unsigned long addr = -1;
-
- return __alloc_buddy_huge_page(h, NULL, addr, nid);
-}
-
-/*
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+ struct page *page;
+ struct mempolicy *mpol;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
+ int nid;
+ nodemask_t *nodemask;
+
+ nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
+ page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+ mpol_cond_put(mpol);
+
+ return page;
}
/*
@@ -1676,19 +1624,46 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
*/
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
+ gfp_t gfp_mask = htlb_alloc_mask(h);
struct page *page = NULL;
+ if (nid != NUMA_NO_NODE)
+ gfp_mask |= __GFP_THISNODE;
+
spin_lock(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0)
- page = dequeue_huge_page_node(h, nid);
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
spin_unlock(&hugetlb_lock);
if (!page)
- page = __alloc_buddy_huge_page_no_mpol(h, nid);
+ page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
return page;
}
+
+struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+ nodemask_t *nmask)
+{
+ gfp_t gfp_mask = htlb_alloc_mask(h);
+
+ spin_lock(&hugetlb_lock);
+ if (h->free_huge_pages - h->resv_huge_pages > 0) {
+ struct page *page;
+
+ page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
+ if (page) {
+ spin_unlock(&hugetlb_lock);
+ return page;
+ }
+ }
+ spin_unlock(&hugetlb_lock);
+
+ /* No reservations, try to overcommit */
+
+ return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
/*
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
@@ -1714,12 +1689,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
+ page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+ NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
break;
}
list_add(&page->lru, &surplus_list);
+ cond_resched();
}
allocated += i;
@@ -2188,8 +2165,16 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
} else if (!alloc_fresh_huge_page(h,
&node_states[N_MEMORY]))
break;
+ cond_resched();
+ }
+ if (i < h->max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
+ h->max_huge_pages, buf, i);
+ h->max_huge_pages = i;
}
- h->max_huge_pages = i;
}
static void __init hugetlb_init_hstates(void)
@@ -2207,26 +2192,16 @@ static void __init hugetlb_init_hstates(void)
VM_BUG_ON(minimum_order == UINT_MAX);
}
-static char * __init memfmt(char *buf, unsigned long n)
-{
- if (n >= (1UL << 30))
- sprintf(buf, "%lu GB", n >> 30);
- else if (n >= (1UL << 20))
- sprintf(buf, "%lu MB", n >> 20);
- else
- sprintf(buf, "%lu KB", n >> 10);
- return buf;
-}
-
static void __init report_hugepages(void)
{
struct hstate *h;
for_each_hstate(h) {
char buf[32];
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
- memfmt(buf, huge_page_size(h)),
- h->free_huge_pages);
+ buf, h->free_huge_pages);
}
}
@@ -2785,6 +2760,11 @@ static int __init hugetlb_init(void)
return 0;
if (!size_to_hstate(default_hstate_size)) {
+ if (default_hstate_size != 0) {
+ pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
+ default_hstate_size, HPAGE_SIZE);
+ }
+
default_hstate_size = HPAGE_SIZE;
if (!size_to_hstate(default_hstate_size))
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
@@ -3185,17 +3165,17 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}
-static int is_hugetlb_entry_migration(pte_t pte)
+bool is_hugetlb_entry_migration(pte_t pte)
{
swp_entry_t swp;
if (huge_pte_none(pte) || pte_present(pte))
- return 0;
+ return false;
swp = pte_to_swp_entry(pte);
if (non_swap_entry(swp) && is_migration_entry(swp))
- return 1;
+ return true;
else
- return 0;
+ return false;
}
static int is_hugetlb_entry_hwpoisoned(pte_t pte)
@@ -3233,7 +3213,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
- src_pte = huge_pte_offset(src, addr);
+ src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, addr, sz);
@@ -3263,9 +3243,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
*/
make_migration_entry_read(&swp_entry);
entry = swp_entry_to_pte(swp_entry);
- set_huge_pte_at(src, addr, src_pte, entry);
+ set_huge_swap_pte_at(src, addr, src_pte,
+ entry, sz);
}
- set_huge_pte_at(dst, addr, dst_pte, entry);
+ set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
} else {
if (cow) {
huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -3317,7 +3298,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
for (; address < end; address += sz) {
- ptep = huge_pte_offset(mm, address);
+ ptep = huge_pte_offset(mm, address, sz);
if (!ptep)
continue;
@@ -3338,7 +3319,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
- huge_pte_clear(mm, address, ptep);
+ huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
}
@@ -3535,7 +3516,8 @@ retry_avoidcopy:
unmap_ref_private(mm, vma, old_page, address);
BUG_ON(huge_pte_none(pte));
spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h),
+ huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -3574,7 +3556,8 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h),
+ huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearPagePrivate(new_page);
@@ -3861,7 +3844,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
address &= huge_page_mask(h);
- ptep = huge_pte_offset(mm, address);
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
if (ptep) {
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
@@ -4118,7 +4101,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*
* Note that page table lock is not held when pte is null.
*/
- pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
+ huge_page_size(h));
if (pte)
ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
@@ -4257,7 +4241,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
- ptep = huge_pte_offset(mm, address);
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
@@ -4279,7 +4263,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
make_migration_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
- set_huge_pte_at(mm, address, ptep, newpte);
+ set_huge_swap_pte_at(mm, address, ptep,
+ newpte, huge_page_size(h));
pages++;
}
spin_unlock(ptl);
@@ -4521,7 +4506,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr);
+ spte = huge_pte_offset(svma->vm_mm, saddr,
+ vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
break;
@@ -4617,7 +4603,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
return pte;
}
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4653,6 +4640,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address,
}
struct page * __weak
+follow_huge_pd(struct vm_area_struct *vma,
+ unsigned long address, hugepd_t hpd, int flags, int pdshift)
+{
+ WARN(1, "hugepd follow called with no support for hugepage directory format\n");
+ return NULL;
+}
+
+struct page * __weak
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int flags)
{
@@ -4699,39 +4694,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
}
-#ifdef CONFIG_MEMORY_FAILURE
-
-/*
- * This function is called from memory failure code.
- */
-int dequeue_hwpoisoned_huge_page(struct page *hpage)
+struct page * __weak
+follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
{
- struct hstate *h = page_hstate(hpage);
- int nid = page_to_nid(hpage);
- int ret = -EBUSY;
+ if (flags & FOLL_GET)
+ return NULL;
- spin_lock(&hugetlb_lock);
- /*
- * Just checking !page_huge_active is not enough, because that could be
- * an isolated/hwpoisoned hugepage (which have >0 refcount).
- */
- if (!page_huge_active(hpage) && !page_count(hpage)) {
- /*
- * Hwpoisoned hugepage isn't linked to activelist or freelist,
- * but dangling hpage->lru can trigger list-debug warnings
- * (this happens when we call unpoison_memory() on it),
- * so let it point to itself with list_del_init().
- */
- list_del_init(&hpage->lru);
- set_page_refcounted(hpage);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- ret = 0;
- }
- spin_unlock(&hugetlb_lock);
- return ret;
+ return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
}
-#endif
bool isolate_huge_page(struct page *page, struct list_head *list)
{
diff --git a/mm/internal.h b/mm/internal.h
index 0e4f558412fb..24d88f084705 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -23,7 +23,7 @@
* hints such as HIGHMEM usage.
*/
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
- __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
+ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
__GFP_ATOMIC)
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index c81549d5c833..ca11bc4ce205 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -134,97 +134,33 @@ static __always_inline bool memory_is_poisoned_1(unsigned long addr)
return false;
}
-static __always_inline bool memory_is_poisoned_2(unsigned long addr)
+static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
+ unsigned long size)
{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
-
- if (unlikely(*shadow_addr)) {
- if (memory_is_poisoned_1(addr + 1))
- return true;
-
- /*
- * If single shadow byte covers 2-byte access, we don't
- * need to do anything more. Otherwise, test the first
- * shadow byte.
- */
- if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
- return false;
-
- return unlikely(*(u8 *)shadow_addr);
- }
-
- return false;
-}
-
-static __always_inline bool memory_is_poisoned_4(unsigned long addr)
-{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+ u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
- if (unlikely(*shadow_addr)) {
- if (memory_is_poisoned_1(addr + 3))
- return true;
-
- /*
- * If single shadow byte covers 4-byte access, we don't
- * need to do anything more. Otherwise, test the first
- * shadow byte.
- */
- if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
- return false;
-
- return unlikely(*(u8 *)shadow_addr);
- }
-
- return false;
-}
-
-static __always_inline bool memory_is_poisoned_8(unsigned long addr)
-{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
-
- if (unlikely(*shadow_addr)) {
- if (memory_is_poisoned_1(addr + 7))
- return true;
-
- /*
- * If single shadow byte covers 8-byte access, we don't
- * need to do anything more. Otherwise, test the first
- * shadow byte.
- */
- if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
- return false;
-
- return unlikely(*(u8 *)shadow_addr);
- }
+ /*
+ * Access crosses 8(shadow size)-byte boundary. Such access maps
+ * into 2 shadow bytes, so we need to check them both.
+ */
+ if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
+ return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
- return false;
+ return memory_is_poisoned_1(addr + size - 1);
}
static __always_inline bool memory_is_poisoned_16(unsigned long addr)
{
- u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr);
-
- if (unlikely(*shadow_addr)) {
- u16 shadow_first_bytes = *(u16 *)shadow_addr;
-
- if (unlikely(shadow_first_bytes))
- return true;
-
- /*
- * If two shadow bytes covers 16-byte access, we don't
- * need to do anything more. Otherwise, test the last
- * shadow byte.
- */
- if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
- return false;
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
- return memory_is_poisoned_1(addr + 15);
- }
+ /* Unaligned 16-bytes access maps into 3 shadow bytes. */
+ if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
+ return *shadow_addr || memory_is_poisoned_1(addr + 15);
- return false;
+ return *shadow_addr;
}
-static __always_inline unsigned long bytes_is_zero(const u8 *start,
+static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
size_t size)
{
while (size) {
@@ -237,7 +173,7 @@ static __always_inline unsigned long bytes_is_zero(const u8 *start,
return 0;
}
-static __always_inline unsigned long memory_is_zero(const void *start,
+static __always_inline unsigned long memory_is_nonzero(const void *start,
const void *end)
{
unsigned int words;
@@ -245,11 +181,11 @@ static __always_inline unsigned long memory_is_zero(const void *start,
unsigned int prefix = (unsigned long)start % 8;
if (end - start <= 16)
- return bytes_is_zero(start, end - start);
+ return bytes_is_nonzero(start, end - start);
if (prefix) {
prefix = 8 - prefix;
- ret = bytes_is_zero(start, prefix);
+ ret = bytes_is_nonzero(start, prefix);
if (unlikely(ret))
return ret;
start += prefix;
@@ -258,12 +194,12 @@ static __always_inline unsigned long memory_is_zero(const void *start,
words = (end - start) / 8;
while (words) {
if (unlikely(*(u64 *)start))
- return bytes_is_zero(start, 8);
+ return bytes_is_nonzero(start, 8);
start += 8;
words--;
}
- return bytes_is_zero(start, (end - start) % 8);
+ return bytes_is_nonzero(start, (end - start) % 8);
}
static __always_inline bool memory_is_poisoned_n(unsigned long addr,
@@ -271,7 +207,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr,
{
unsigned long ret;
- ret = memory_is_zero(kasan_mem_to_shadow((void *)addr),
+ ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
kasan_mem_to_shadow((void *)addr + size - 1) + 1);
if (unlikely(ret)) {
@@ -292,11 +228,9 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
case 1:
return memory_is_poisoned_1(addr);
case 2:
- return memory_is_poisoned_2(addr);
case 4:
- return memory_is_poisoned_4(addr);
case 8:
- return memory_is_poisoned_8(addr);
+ return memory_is_poisoned_2_4_8(addr, size);
case 16:
return memory_is_poisoned_16(addr);
default:
@@ -803,17 +737,47 @@ void __asan_unpoison_stack_memory(const void *addr, size_t size)
EXPORT_SYMBOL(__asan_unpoison_stack_memory);
#ifdef CONFIG_MEMORY_HOTPLUG
-static int kasan_mem_notifier(struct notifier_block *nb,
+static int __meminit kasan_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
- return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK;
+ struct memory_notify *mem_data = data;
+ unsigned long nr_shadow_pages, start_kaddr, shadow_start;
+ unsigned long shadow_end, shadow_size;
+
+ nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
+ start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
+ shadow_size = nr_shadow_pages << PAGE_SHIFT;
+ shadow_end = shadow_start + shadow_size;
+
+ if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) ||
+ WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT)))
+ return NOTIFY_BAD;
+
+ switch (action) {
+ case MEM_GOING_ONLINE: {
+ void *ret;
+
+ ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
+ shadow_end, GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD,
+ pfn_to_nid(mem_data->start_pfn),
+ __builtin_return_address(0));
+ if (!ret)
+ return NOTIFY_BAD;
+
+ kmemleak_ignore(ret);
+ return NOTIFY_OK;
+ }
+ case MEM_OFFLINE:
+ vfree((void *)shadow_start);
+ }
+
+ return NOTIFY_OK;
}
static int __init kasan_memhotplug_init(void)
{
- pr_info("WARNING: KASAN doesn't support memory hot-add\n");
- pr_info("Memory hot-add will be disabled\n");
-
hotplug_memory_notifier(kasan_mem_notifier, 0);
return 0;
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index b96a5f773d88..554e4c0f23a2 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -118,6 +118,18 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
do {
next = p4d_addr_end(addr, end);
+ if (IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) {
+ pud_t *pud;
+ pmd_t *pmd;
+
+ p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud));
+ pud = pud_offset(p4d, addr);
+ pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
+ pmd = pmd_offset(pud, addr);
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_zero_pte));
+ continue;
+ }
if (p4d_none(*p4d)) {
p4d_populate(&init_mm, p4d,
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index beee0e980e2d..04bb1d3eb9ec 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -107,7 +107,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
return bug_type;
}
-const char *get_wild_bug_type(struct kasan_access_info *info)
+static const char *get_wild_bug_type(struct kasan_access_info *info)
{
const char *bug_type = "unknown-crash";
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 945fd1ca49b5..c01f177a1120 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -652,7 +652,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
- cond_resched();
}
}
@@ -817,7 +816,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
static bool hugepage_vma_check(struct vm_area_struct *vma)
{
if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vma->vm_flags & VM_NOHUGEPAGE))
+ (vma->vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
if (shmem_file(vma->vm_file)) {
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 20036d4f9f13..7780cd83a495 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -150,7 +150,7 @@ struct kmemleak_scan_area {
*/
struct kmemleak_object {
spinlock_t lock;
- unsigned long flags; /* object status flags */
+ unsigned int flags; /* object status flags */
struct list_head object_list;
struct list_head gray_list;
struct rb_node rb_node;
@@ -159,6 +159,8 @@ struct kmemleak_object {
atomic_t use_count;
unsigned long pointer;
size_t size;
+ /* pass surplus references to this pointer */
+ unsigned long excess_ref;
/* minimum number of a pointers found before it is considered leak */
int min_count;
/* the total number of pointers found pointing to this object */
@@ -253,7 +255,8 @@ enum {
KMEMLEAK_NOT_LEAK,
KMEMLEAK_IGNORE,
KMEMLEAK_SCAN_AREA,
- KMEMLEAK_NO_SCAN
+ KMEMLEAK_NO_SCAN,
+ KMEMLEAK_SET_EXCESS_REF
};
/*
@@ -262,9 +265,12 @@ enum {
*/
struct early_log {
int op_type; /* kmemleak operation type */
- const void *ptr; /* allocated/freed memory block */
- size_t size; /* memory block size */
int min_count; /* minimum reference count */
+ const void *ptr; /* allocated/freed memory block */
+ union {
+ size_t size; /* memory block size */
+ unsigned long excess_ref; /* surplus reference passing */
+ };
unsigned long trace[MAX_TRACE]; /* stack trace */
unsigned int trace_len; /* stack trace length */
};
@@ -393,7 +399,7 @@ static void dump_object_info(struct kmemleak_object *object)
object->comm, object->pid, object->jiffies);
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
- pr_notice(" flags = 0x%lx\n", object->flags);
+ pr_notice(" flags = 0x%x\n", object->flags);
pr_notice(" checksum = %u\n", object->checksum);
pr_notice(" backtrace:\n");
print_stack_trace(&trace, 4);
@@ -562,6 +568,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
object->flags = OBJECT_ALLOCATED;
object->pointer = ptr;
object->size = size;
+ object->excess_ref = 0;
object->min_count = min_count;
object->count = 0; /* white color initially */
object->jiffies = jiffies;
@@ -795,6 +802,30 @@ out:
}
/*
+ * Any surplus references (object already gray) to 'ptr' are passed to
+ * 'excess_ref'. This is used in the vmalloc() case where a pointer to
+ * vm_struct may be used as an alternative reference to the vmalloc'ed object
+ * (see free_thread_stack()).
+ */
+static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+ kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n",
+ ptr);
+ return;
+ }
+
+ spin_lock_irqsave(&object->lock, flags);
+ object->excess_ref = excess_ref;
+ spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
+/*
* Set the OBJECT_NO_SCAN flag for the object corresponding to the give
* pointer. Such object will not be scanned by kmemleak but references to it
* are searched.
@@ -908,7 +939,7 @@ static void early_alloc_percpu(struct early_log *log)
* @gfp: kmalloc() flags used for kmemleak internal memory allocations
*
* This function is called from the kernel allocators when a new object
- * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.).
*/
void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
gfp_t gfp)
@@ -952,6 +983,36 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
/**
+ * kmemleak_vmalloc - register a newly vmalloc'ed object
+ * @area: pointer to vm_struct
+ * @size: size of the object
+ * @gfp: __vmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the vmalloc() kernel allocator when a new
+ * object (memory block) is allocated.
+ */
+void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp)
+{
+ pr_debug("%s(0x%p, %zu)\n", __func__, area, size);
+
+ /*
+ * A min_count = 2 is needed because vm_struct contains a reference to
+ * the virtual address of the vmalloc'ed block.
+ */
+ if (kmemleak_enabled) {
+ create_object((unsigned long)area->addr, size, 2, gfp);
+ object_set_excess_ref((unsigned long)area,
+ (unsigned long)area->addr);
+ } else if (kmemleak_early_log) {
+ log_early(KMEMLEAK_ALLOC, area->addr, size, 2);
+ /* reusing early_log.size for storing area->addr */
+ log_early(KMEMLEAK_SET_EXCESS_REF,
+ area, (unsigned long)area->addr, 0);
+ }
+}
+EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
+
+/**
* kmemleak_free - unregister a previously registered object
* @ptr: pointer to beginning of the object
*
@@ -1188,6 +1249,30 @@ static bool update_checksum(struct kmemleak_object *object)
}
/*
+ * Update an object's references. object->lock must be held by the caller.
+ */
+static void update_refs(struct kmemleak_object *object)
+{
+ if (!color_white(object)) {
+ /* non-orphan, ignored or new */
+ return;
+ }
+
+ /*
+ * Increase the object's reference count (number of pointers to the
+ * memory block). If this count reaches the required minimum, the
+ * object's color will become gray and it will be added to the
+ * gray_list.
+ */
+ object->count++;
+ if (color_gray(object)) {
+ /* put_object() called when removing from gray_list */
+ WARN_ON(!get_object(object));
+ list_add_tail(&object->gray_list, &gray_list);
+ }
+}
+
+/*
* Memory scanning is a long process and it needs to be interruptable. This
* function checks whether such interrupt condition occurred.
*/
@@ -1224,6 +1309,7 @@ static void scan_block(void *_start, void *_end,
for (ptr = start; ptr < end; ptr++) {
struct kmemleak_object *object;
unsigned long pointer;
+ unsigned long excess_ref;
if (scan_should_stop())
break;
@@ -1259,25 +1345,27 @@ static void scan_block(void *_start, void *_end,
* enclosed by scan_mutex.
*/
spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
- if (!color_white(object)) {
- /* non-orphan, ignored or new */
- spin_unlock(&object->lock);
- continue;
- }
-
- /*
- * Increase the object's reference count (number of pointers
- * to the memory block). If this count reaches the required
- * minimum, the object's color will become gray and it will be
- * added to the gray_list.
- */
- object->count++;
+ /* only pass surplus references (object already gray) */
if (color_gray(object)) {
- /* put_object() called when removing from gray_list */
- WARN_ON(!get_object(object));
- list_add_tail(&object->gray_list, &gray_list);
+ excess_ref = object->excess_ref;
+ /* no need for update_refs() if object already gray */
+ } else {
+ excess_ref = 0;
+ update_refs(object);
}
spin_unlock(&object->lock);
+
+ if (excess_ref) {
+ object = lookup_object(excess_ref, 0);
+ if (!object)
+ continue;
+ if (object == scanned)
+ /* circular reference, ignore */
+ continue;
+ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ update_refs(object);
+ spin_unlock(&object->lock);
+ }
}
read_unlock_irqrestore(&kmemleak_lock, flags);
}
@@ -1980,6 +2068,10 @@ void __init kmemleak_init(void)
case KMEMLEAK_NO_SCAN:
kmemleak_no_scan(log->ptr);
break;
+ case KMEMLEAK_SET_EXCESS_REF:
+ object_set_excess_ref((unsigned long)log->ptr,
+ log->excess_ref);
+ break;
default:
kmemleak_warn("Unknown early log operation: %d\n",
log->op_type);
diff --git a/mm/ksm.c b/mm/ksm.c
index 216184af0e19..4dc92f138786 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -128,9 +128,12 @@ struct ksm_scan {
* struct stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
* @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
* @list: linked into migrate_nodes, pending placement in the proper node tree
* @hlist: hlist head of rmap_items using this ksm page
* @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @chain_prune_time: time of the last full garbage collection
+ * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
* @nid: NUMA node id of stable tree in which linked (may not match kpfn)
*/
struct stable_node {
@@ -138,11 +141,24 @@ struct stable_node {
struct rb_node node; /* when node of stable tree */
struct { /* when listed for migration */
struct list_head *head;
- struct list_head list;
+ struct {
+ struct hlist_node hlist_dup;
+ struct list_head list;
+ };
};
};
struct hlist_head hlist;
- unsigned long kpfn;
+ union {
+ unsigned long kpfn;
+ unsigned long chain_prune_time;
+ };
+ /*
+ * STABLE_NODE_CHAIN can be any negative number in
+ * rmap_hlist_len negative range, but better not -1 to be able
+ * to reliably detect underflows.
+ */
+#define STABLE_NODE_CHAIN -1024
+ int rmap_hlist_len;
#ifdef CONFIG_NUMA
int nid;
#endif
@@ -192,6 +208,7 @@ static struct rb_root *root_unstable_tree = one_unstable_tree;
/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
+#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -219,6 +236,18 @@ static unsigned long ksm_pages_unshared;
/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;
+/* The number of stable_node chains */
+static unsigned long ksm_stable_node_chains;
+
+/* The number of stable_node dups linked to the stable_node chains */
+static unsigned long ksm_stable_node_dups;
+
+/* Delay in pruning stale stable_node_dups in the stable_node_chains */
+static int ksm_stable_node_chains_prune_millisecs = 2000;
+
+/* Maximum number of page slots sharing a stable node */
+static int ksm_max_page_sharing = 256;
+
/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;
@@ -287,6 +316,45 @@ static void __init ksm_slab_free(void)
mm_slot_cache = NULL;
}
+static __always_inline bool is_stable_node_chain(struct stable_node *chain)
+{
+ return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
+}
+
+static __always_inline bool is_stable_node_dup(struct stable_node *dup)
+{
+ return dup->head == STABLE_NODE_DUP_HEAD;
+}
+
+static inline void stable_node_chain_add_dup(struct stable_node *dup,
+ struct stable_node *chain)
+{
+ VM_BUG_ON(is_stable_node_dup(dup));
+ dup->head = STABLE_NODE_DUP_HEAD;
+ VM_BUG_ON(!is_stable_node_chain(chain));
+ hlist_add_head(&dup->hlist_dup, &chain->hlist);
+ ksm_stable_node_dups++;
+}
+
+static inline void __stable_node_dup_del(struct stable_node *dup)
+{
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ hlist_del(&dup->hlist_dup);
+ ksm_stable_node_dups--;
+}
+
+static inline void stable_node_dup_del(struct stable_node *dup)
+{
+ VM_BUG_ON(is_stable_node_chain(dup));
+ if (is_stable_node_dup(dup))
+ __stable_node_dup_del(dup);
+ else
+ rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
+#ifdef CONFIG_DEBUG_VM
+ dup->head = NULL;
+#endif
+}
+
static inline struct rmap_item *alloc_rmap_item(void)
{
struct rmap_item *rmap_item;
@@ -317,6 +385,8 @@ static inline struct stable_node *alloc_stable_node(void)
static inline void free_stable_node(struct stable_node *stable_node)
{
+ VM_BUG_ON(stable_node->rmap_hlist_len &&
+ !is_stable_node_chain(stable_node));
kmem_cache_free(stable_node_cache, stable_node);
}
@@ -498,25 +568,82 @@ static inline int get_kpfn_nid(unsigned long kpfn)
return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
+static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
+ struct rb_root *root)
+{
+ struct stable_node *chain = alloc_stable_node();
+ VM_BUG_ON(is_stable_node_chain(dup));
+ if (likely(chain)) {
+ INIT_HLIST_HEAD(&chain->hlist);
+ chain->chain_prune_time = jiffies;
+ chain->rmap_hlist_len = STABLE_NODE_CHAIN;
+#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
+ chain->nid = -1; /* debug */
+#endif
+ ksm_stable_node_chains++;
+
+ /*
+ * Put the stable node chain in the first dimension of
+ * the stable tree and at the same time remove the old
+ * stable node.
+ */
+ rb_replace_node(&dup->node, &chain->node, root);
+
+ /*
+ * Move the old stable node to the second dimension
+ * queued in the hlist_dup. The invariant is that all
+ * dup stable_nodes in the chain->hlist point to pages
+ * that are wrprotected and have the exact same
+ * content.
+ */
+ stable_node_chain_add_dup(dup, chain);
+ }
+ return chain;
+}
+
+static inline void free_stable_node_chain(struct stable_node *chain,
+ struct rb_root *root)
+{
+ rb_erase(&chain->node, root);
+ free_stable_node(chain);
+ ksm_stable_node_chains--;
+}
+
static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
struct rmap_item *rmap_item;
+ /* check it's not STABLE_NODE_CHAIN or negative */
+ BUG_ON(stable_node->rmap_hlist_len < 0);
+
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
if (rmap_item->hlist.next)
ksm_pages_sharing--;
else
ksm_pages_shared--;
+ VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
+ stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
+ /*
+ * We need the second aligned pointer of the migrate_nodes
+ * list_head to stay clear from the rb_parent_color union
+ * (aligned and different than any node) and also different
+ * from &migrate_nodes. This will verify that future list.h changes
+ * don't break STABLE_NODE_DUP_HEAD.
+ */
+#if GCC_VERSION >= 40903 /* only recent gcc can handle it */
+ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
+ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
+#endif
+
if (stable_node->head == &migrate_nodes)
list_del(&stable_node->list);
else
- rb_erase(&stable_node->node,
- root_stable_tree + NUMA(stable_node->nid));
+ stable_node_dup_del(stable_node);
free_stable_node(stable_node);
}
@@ -635,6 +762,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
ksm_pages_sharing--;
else
ksm_pages_shared--;
+ VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
+ stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
@@ -743,6 +872,31 @@ static int remove_stable_node(struct stable_node *stable_node)
return err;
}
+static int remove_stable_node_chain(struct stable_node *stable_node,
+ struct rb_root *root)
+{
+ struct stable_node *dup;
+ struct hlist_node *hlist_safe;
+
+ if (!is_stable_node_chain(stable_node)) {
+ VM_BUG_ON(is_stable_node_dup(stable_node));
+ if (remove_stable_node(stable_node))
+ return true;
+ else
+ return false;
+ }
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ if (remove_stable_node(dup))
+ return true;
+ }
+ BUG_ON(!hlist_empty(&stable_node->hlist));
+ free_stable_node_chain(stable_node, root);
+ return false;
+}
+
static int remove_all_stable_nodes(void)
{
struct stable_node *stable_node, *next;
@@ -753,7 +907,8 @@ static int remove_all_stable_nodes(void)
while (root_stable_tree[nid].rb_node) {
stable_node = rb_entry(root_stable_tree[nid].rb_node,
struct stable_node, node);
- if (remove_stable_node(stable_node)) {
+ if (remove_stable_node_chain(stable_node,
+ root_stable_tree + nid)) {
err = -EBUSY;
break; /* proceed to next nid */
}
@@ -1138,6 +1293,214 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
return err ? NULL : page;
}
+static __always_inline
+bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
+{
+ VM_BUG_ON(stable_node->rmap_hlist_len < 0);
+ /*
+ * Check that at least one mapping still exists, otherwise
+ * there's no much point to merge and share with this
+ * stable_node, as the underlying tree_page of the other
+ * sharer is going to be freed soon.
+ */
+ return stable_node->rmap_hlist_len &&
+ stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
+}
+
+static __always_inline
+bool is_page_sharing_candidate(struct stable_node *stable_node)
+{
+ return __is_page_sharing_candidate(stable_node, 0);
+}
+
+struct page *stable_node_dup(struct stable_node **_stable_node_dup,
+ struct stable_node **_stable_node,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
+{
+ struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
+ struct hlist_node *hlist_safe;
+ struct page *_tree_page, *tree_page = NULL;
+ int nr = 0;
+ int found_rmap_hlist_len;
+
+ if (!prune_stale_stable_nodes ||
+ time_before(jiffies, stable_node->chain_prune_time +
+ msecs_to_jiffies(
+ ksm_stable_node_chains_prune_millisecs)))
+ prune_stale_stable_nodes = false;
+ else
+ stable_node->chain_prune_time = jiffies;
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ cond_resched();
+ /*
+ * We must walk all stable_node_dup to prune the stale
+ * stable nodes during lookup.
+ *
+ * get_ksm_page can drop the nodes from the
+ * stable_node->hlist if they point to freed pages
+ * (that's why we do a _safe walk). The "dup"
+ * stable_node parameter itself will be freed from
+ * under us if it returns NULL.
+ */
+ _tree_page = get_ksm_page(dup, false);
+ if (!_tree_page)
+ continue;
+ nr += 1;
+ if (is_page_sharing_candidate(dup)) {
+ if (!found ||
+ dup->rmap_hlist_len > found_rmap_hlist_len) {
+ if (found)
+ put_page(tree_page);
+ found = dup;
+ found_rmap_hlist_len = found->rmap_hlist_len;
+ tree_page = _tree_page;
+
+ /* skip put_page for found dup */
+ if (!prune_stale_stable_nodes)
+ break;
+ continue;
+ }
+ }
+ put_page(_tree_page);
+ }
+
+ if (found) {
+ /*
+ * nr is counting all dups in the chain only if
+ * prune_stale_stable_nodes is true, otherwise we may
+ * break the loop at nr == 1 even if there are
+ * multiple entries.
+ */
+ if (prune_stale_stable_nodes && nr == 1) {
+ /*
+ * If there's not just one entry it would
+ * corrupt memory, better BUG_ON. In KSM
+ * context with no lock held it's not even
+ * fatal.
+ */
+ BUG_ON(stable_node->hlist.first->next);
+
+ /*
+ * There's just one entry and it is below the
+ * deduplication limit so drop the chain.
+ */
+ rb_replace_node(&stable_node->node, &found->node,
+ root);
+ free_stable_node(stable_node);
+ ksm_stable_node_chains--;
+ ksm_stable_node_dups--;
+ /*
+ * NOTE: the caller depends on the stable_node
+ * to be equal to stable_node_dup if the chain
+ * was collapsed.
+ */
+ *_stable_node = found;
+ /*
+ * Just for robustneess as stable_node is
+ * otherwise left as a stable pointer, the
+ * compiler shall optimize it away at build
+ * time.
+ */
+ stable_node = NULL;
+ } else if (stable_node->hlist.first != &found->hlist_dup &&
+ __is_page_sharing_candidate(found, 1)) {
+ /*
+ * If the found stable_node dup can accept one
+ * more future merge (in addition to the one
+ * that is underway) and is not at the head of
+ * the chain, put it there so next search will
+ * be quicker in the !prune_stale_stable_nodes
+ * case.
+ *
+ * NOTE: it would be inaccurate to use nr > 1
+ * instead of checking the hlist.first pointer
+ * directly, because in the
+ * prune_stale_stable_nodes case "nr" isn't
+ * the position of the found dup in the chain,
+ * but the total number of dups in the chain.
+ */
+ hlist_del(&found->hlist_dup);
+ hlist_add_head(&found->hlist_dup,
+ &stable_node->hlist);
+ }
+ }
+
+ *_stable_node_dup = found;
+ return tree_page;
+}
+
+static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
+ struct rb_root *root)
+{
+ if (!is_stable_node_chain(stable_node))
+ return stable_node;
+ if (hlist_empty(&stable_node->hlist)) {
+ free_stable_node_chain(stable_node, root);
+ return NULL;
+ }
+ return hlist_entry(stable_node->hlist.first,
+ typeof(*stable_node), hlist_dup);
+}
+
+/*
+ * Like for get_ksm_page, this function can free the *_stable_node and
+ * *_stable_node_dup if the returned tree_page is NULL.
+ *
+ * It can also free and overwrite *_stable_node with the found
+ * stable_node_dup if the chain is collapsed (in which case
+ * *_stable_node will be equal to *_stable_node_dup like if the chain
+ * never existed). It's up to the caller to verify tree_page is not
+ * NULL before dereferencing *_stable_node or *_stable_node_dup.
+ *
+ * *_stable_node_dup is really a second output parameter of this
+ * function and will be overwritten in all cases, the caller doesn't
+ * need to initialize it.
+ */
+static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
+ struct stable_node **_stable_node,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
+{
+ struct stable_node *stable_node = *_stable_node;
+ if (!is_stable_node_chain(stable_node)) {
+ if (is_page_sharing_candidate(stable_node)) {
+ *_stable_node_dup = stable_node;
+ return get_ksm_page(stable_node, false);
+ }
+ /*
+ * _stable_node_dup set to NULL means the stable_node
+ * reached the ksm_max_page_sharing limit.
+ */
+ *_stable_node_dup = NULL;
+ return NULL;
+ }
+ return stable_node_dup(_stable_node_dup, _stable_node, root,
+ prune_stale_stable_nodes);
+}
+
+static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
+ struct stable_node **s_n,
+ struct rb_root *root)
+{
+ return __stable_node_chain(s_n_d, s_n, root, true);
+}
+
+static __always_inline struct page *chain(struct stable_node **s_n_d,
+ struct stable_node *s_n,
+ struct rb_root *root)
+{
+ struct stable_node *old_stable_node = s_n;
+ struct page *tree_page;
+
+ tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
+ /* not pruning dups so s_n cannot have changed */
+ VM_BUG_ON(s_n != old_stable_node);
+ return tree_page;
+}
+
/*
* stable_tree_search - search for page inside the stable tree
*
@@ -1153,7 +1516,7 @@ static struct page *stable_tree_search(struct page *page)
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node;
+ struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
struct stable_node *page_node;
page_node = page_stable_node(page);
@@ -1175,7 +1538,44 @@ again:
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
- tree_page = get_ksm_page(stable_node, false);
+ stable_node_any = NULL;
+ tree_page = chain_prune(&stable_node_dup, &stable_node, root);
+ /*
+ * NOTE: stable_node may have been freed by
+ * chain_prune() if the returned stable_node_dup is
+ * not NULL. stable_node_dup may have been inserted in
+ * the rbtree instead as a regular stable_node (in
+ * order to collapse the stable_node chain if a single
+ * stable_node dup was found in it). In such case the
+ * stable_node is overwritten by the calleee to point
+ * to the stable_node_dup that was collapsed in the
+ * stable rbtree and stable_node will be equal to
+ * stable_node_dup like if the chain never existed.
+ */
+ if (!stable_node_dup) {
+ /*
+ * Either all stable_node dups were full in
+ * this stable_node chain, or this chain was
+ * empty and should be rb_erased.
+ */
+ stable_node_any = stable_node_dup_any(stable_node,
+ root);
+ if (!stable_node_any) {
+ /* rb_erase just run */
+ goto again;
+ }
+ /*
+ * Take any of the stable_node dups page of
+ * this stable_node chain to let the tree walk
+ * continue. All KSM pages belonging to the
+ * stable_node dups in a stable_node chain
+ * have the same content and they're
+ * wrprotected at all times. Any will work
+ * fine to continue the walk.
+ */
+ tree_page = get_ksm_page(stable_node_any, false);
+ }
+ VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
/*
* If we walked over a stale stable_node,
@@ -1198,6 +1598,34 @@ again:
else if (ret > 0)
new = &parent->rb_right;
else {
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ /*
+ * Test if the migrated page should be merged
+ * into a stable node dup. If the mapcount is
+ * 1 we can migrate it with another KSM page
+ * without adding it to the chain.
+ */
+ if (page_mapcount(page) > 1)
+ goto chain_append;
+ }
+
+ if (!stable_node_dup) {
+ /*
+ * If the stable_node is a chain and
+ * we got a payload match in memcmp
+ * but we cannot merge the scanned
+ * page in any of the existing
+ * stable_node dups because they're
+ * all full, we need to wait the
+ * scanned page to find itself a match
+ * in the unstable tree to create a
+ * brand new KSM page to add later to
+ * the dups of this stable_node.
+ */
+ return NULL;
+ }
+
/*
* Lock and unlock the stable_node's page (which
* might already have been migrated) so that page
@@ -1205,23 +1633,21 @@ again:
* It would be more elegant to return stable_node
* than kpage, but that involves more changes.
*/
- tree_page = get_ksm_page(stable_node, true);
- if (tree_page) {
- unlock_page(tree_page);
- if (get_kpfn_nid(stable_node->kpfn) !=
- NUMA(stable_node->nid)) {
- put_page(tree_page);
- goto replace;
- }
- return tree_page;
- }
- /*
- * There is now a place for page_node, but the tree may
- * have been rebalanced, so re-evaluate parent and new.
- */
- if (page_node)
+ tree_page = get_ksm_page(stable_node_dup, true);
+ if (unlikely(!tree_page))
+ /*
+ * The tree may have been rebalanced,
+ * so re-evaluate parent and new.
+ */
goto again;
- return NULL;
+ unlock_page(tree_page);
+
+ if (get_kpfn_nid(stable_node_dup->kpfn) !=
+ NUMA(stable_node_dup->nid)) {
+ put_page(tree_page);
+ goto replace;
+ }
+ return tree_page;
}
}
@@ -1232,22 +1658,95 @@ again:
DO_NUMA(page_node->nid = nid);
rb_link_node(&page_node->node, parent, new);
rb_insert_color(&page_node->node, root);
- get_page(page);
- return page;
+out:
+ if (is_page_sharing_candidate(page_node)) {
+ get_page(page);
+ return page;
+ } else
+ return NULL;
replace:
- if (page_node) {
- list_del(&page_node->list);
- DO_NUMA(page_node->nid = nid);
- rb_replace_node(&stable_node->node, &page_node->node, root);
- get_page(page);
+ /*
+ * If stable_node was a chain and chain_prune collapsed it,
+ * stable_node has been updated to be the new regular
+ * stable_node. A collapse of the chain is indistinguishable
+ * from the case there was no chain in the stable
+ * rbtree. Otherwise stable_node is the chain and
+ * stable_node_dup is the dup to replace.
+ */
+ if (stable_node_dup == stable_node) {
+ VM_BUG_ON(is_stable_node_chain(stable_node_dup));
+ VM_BUG_ON(is_stable_node_dup(stable_node_dup));
+ /* there is no chain */
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ rb_replace_node(&stable_node_dup->node,
+ &page_node->node,
+ root);
+ if (is_page_sharing_candidate(page_node))
+ get_page(page);
+ else
+ page = NULL;
+ } else {
+ rb_erase(&stable_node_dup->node, root);
+ page = NULL;
+ }
} else {
- rb_erase(&stable_node->node, root);
- page = NULL;
+ VM_BUG_ON(!is_stable_node_chain(stable_node));
+ __stable_node_dup_del(stable_node_dup);
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ stable_node_chain_add_dup(page_node, stable_node);
+ if (is_page_sharing_candidate(page_node))
+ get_page(page);
+ else
+ page = NULL;
+ } else {
+ page = NULL;
+ }
}
- stable_node->head = &migrate_nodes;
- list_add(&stable_node->list, stable_node->head);
+ stable_node_dup->head = &migrate_nodes;
+ list_add(&stable_node_dup->list, stable_node_dup->head);
return page;
+
+chain_append:
+ /* stable_node_dup could be null if it reached the limit */
+ if (!stable_node_dup)
+ stable_node_dup = stable_node_any;
+ /*
+ * If stable_node was a chain and chain_prune collapsed it,
+ * stable_node has been updated to be the new regular
+ * stable_node. A collapse of the chain is indistinguishable
+ * from the case there was no chain in the stable
+ * rbtree. Otherwise stable_node is the chain and
+ * stable_node_dup is the dup to replace.
+ */
+ if (stable_node_dup == stable_node) {
+ VM_BUG_ON(is_stable_node_chain(stable_node_dup));
+ VM_BUG_ON(is_stable_node_dup(stable_node_dup));
+ /* chain is missing so create it */
+ stable_node = alloc_stable_node_chain(stable_node_dup,
+ root);
+ if (!stable_node)
+ return NULL;
+ }
+ /*
+ * Add this stable_node dup that was
+ * migrated to the stable_node chain
+ * of the current nid for this page
+ * content.
+ */
+ VM_BUG_ON(!is_stable_node_chain(stable_node));
+ VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ stable_node_chain_add_dup(page_node, stable_node);
+ goto out;
}
/*
@@ -1264,7 +1763,8 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node;
+ struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ bool need_chain = false;
kpfn = page_to_pfn(kpage);
nid = get_kpfn_nid(kpfn);
@@ -1279,7 +1779,32 @@ again:
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
- tree_page = get_ksm_page(stable_node, false);
+ stable_node_any = NULL;
+ tree_page = chain(&stable_node_dup, stable_node, root);
+ if (!stable_node_dup) {
+ /*
+ * Either all stable_node dups were full in
+ * this stable_node chain, or this chain was
+ * empty and should be rb_erased.
+ */
+ stable_node_any = stable_node_dup_any(stable_node,
+ root);
+ if (!stable_node_any) {
+ /* rb_erase just run */
+ goto again;
+ }
+ /*
+ * Take any of the stable_node dups page of
+ * this stable_node chain to let the tree walk
+ * continue. All KSM pages belonging to the
+ * stable_node dups in a stable_node chain
+ * have the same content and they're
+ * wrprotected at all times. Any will work
+ * fine to continue the walk.
+ */
+ tree_page = get_ksm_page(stable_node_any, false);
+ }
+ VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
/*
* If we walked over a stale stable_node,
@@ -1302,27 +1827,37 @@ again:
else if (ret > 0)
new = &parent->rb_right;
else {
- /*
- * It is not a bug that stable_tree_search() didn't
- * find this node: because at that time our page was
- * not yet write-protected, so may have changed since.
- */
- return NULL;
+ need_chain = true;
+ break;
}
}
- stable_node = alloc_stable_node();
- if (!stable_node)
+ stable_node_dup = alloc_stable_node();
+ if (!stable_node_dup)
return NULL;
- INIT_HLIST_HEAD(&stable_node->hlist);
- stable_node->kpfn = kpfn;
- set_page_stable_node(kpage, stable_node);
- DO_NUMA(stable_node->nid = nid);
- rb_link_node(&stable_node->node, parent, new);
- rb_insert_color(&stable_node->node, root);
+ INIT_HLIST_HEAD(&stable_node_dup->hlist);
+ stable_node_dup->kpfn = kpfn;
+ set_page_stable_node(kpage, stable_node_dup);
+ stable_node_dup->rmap_hlist_len = 0;
+ DO_NUMA(stable_node_dup->nid = nid);
+ if (!need_chain) {
+ rb_link_node(&stable_node_dup->node, parent, new);
+ rb_insert_color(&stable_node_dup->node, root);
+ } else {
+ if (!is_stable_node_chain(stable_node)) {
+ struct stable_node *orig = stable_node;
+ /* chain is missing so create it */
+ stable_node = alloc_stable_node_chain(orig, root);
+ if (!stable_node) {
+ free_stable_node(stable_node_dup);
+ return NULL;
+ }
+ }
+ stable_node_chain_add_dup(stable_node_dup, stable_node);
+ }
- return stable_node;
+ return stable_node_dup;
}
/*
@@ -1412,8 +1947,27 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
* the same ksm page.
*/
static void stable_tree_append(struct rmap_item *rmap_item,
- struct stable_node *stable_node)
+ struct stable_node *stable_node,
+ bool max_page_sharing_bypass)
{
+ /*
+ * rmap won't find this mapping if we don't insert the
+ * rmap_item in the right stable_node
+ * duplicate. page_migration could break later if rmap breaks,
+ * so we can as well crash here. We really need to check for
+ * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
+ * for other negative values as an undeflow if detected here
+ * for the first time (and not when decreasing rmap_hlist_len)
+ * would be sign of memory corruption in the stable_node.
+ */
+ BUG_ON(stable_node->rmap_hlist_len < 0);
+
+ stable_node->rmap_hlist_len++;
+ if (!max_page_sharing_bypass)
+ /* possibly non fatal but unexpected overflow, only warn */
+ WARN_ON_ONCE(stable_node->rmap_hlist_len >
+ ksm_max_page_sharing);
+
rmap_item->head = stable_node;
rmap_item->address |= STABLE_FLAG;
hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
@@ -1441,19 +1995,26 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
struct page *kpage;
unsigned int checksum;
int err;
+ bool max_page_sharing_bypass = false;
stable_node = page_stable_node(page);
if (stable_node) {
if (stable_node->head != &migrate_nodes &&
- get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
- rb_erase(&stable_node->node,
- root_stable_tree + NUMA(stable_node->nid));
+ get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
+ NUMA(stable_node->nid)) {
+ stable_node_dup_del(stable_node);
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
}
if (stable_node->head != &migrate_nodes &&
rmap_item->head == stable_node)
return;
+ /*
+ * If it's a KSM fork, allow it to go over the sharing limit
+ * without warnings.
+ */
+ if (!is_page_sharing_candidate(stable_node))
+ max_page_sharing_bypass = true;
}
/* We first start with searching the page inside the stable tree */
@@ -1473,7 +2034,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
* add its rmap_item to the stable tree.
*/
lock_page(kpage);
- stable_tree_append(rmap_item, page_stable_node(kpage));
+ stable_tree_append(rmap_item, page_stable_node(kpage),
+ max_page_sharing_bypass);
unlock_page(kpage);
}
put_page(kpage);
@@ -1523,8 +2085,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
lock_page(kpage);
stable_node = stable_tree_insert(kpage);
if (stable_node) {
- stable_tree_append(tree_rmap_item, stable_node);
- stable_tree_append(rmap_item, stable_node);
+ stable_tree_append(tree_rmap_item, stable_node,
+ false);
+ stable_tree_append(rmap_item, stable_node,
+ false);
}
unlock_page(kpage);
@@ -2028,6 +2592,48 @@ static void wait_while_offlining(void)
}
}
+static bool stable_node_dup_remove_range(struct stable_node *stable_node,
+ unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ if (stable_node->kpfn >= start_pfn &&
+ stable_node->kpfn < end_pfn) {
+ /*
+ * Don't get_ksm_page, page has already gone:
+ * which is why we keep kpfn instead of page*
+ */
+ remove_node_from_stable_tree(stable_node);
+ return true;
+ }
+ return false;
+}
+
+static bool stable_node_chain_remove_range(struct stable_node *stable_node,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ struct rb_root *root)
+{
+ struct stable_node *dup;
+ struct hlist_node *hlist_safe;
+
+ if (!is_stable_node_chain(stable_node)) {
+ VM_BUG_ON(is_stable_node_dup(stable_node));
+ return stable_node_dup_remove_range(stable_node, start_pfn,
+ end_pfn);
+ }
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ stable_node_dup_remove_range(dup, start_pfn, end_pfn);
+ }
+ if (hlist_empty(&stable_node->hlist)) {
+ free_stable_node_chain(stable_node, root);
+ return true; /* notify caller that tree was rebalanced */
+ } else
+ return false;
+}
+
static void ksm_check_stable_tree(unsigned long start_pfn,
unsigned long end_pfn)
{
@@ -2039,15 +2645,12 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
node = rb_first(root_stable_tree + nid);
while (node) {
stable_node = rb_entry(node, struct stable_node, node);
- if (stable_node->kpfn >= start_pfn &&
- stable_node->kpfn < end_pfn) {
- /*
- * Don't get_ksm_page, page has already gone:
- * which is why we keep kpfn instead of page*
- */
- remove_node_from_stable_tree(stable_node);
+ if (stable_node_chain_remove_range(stable_node,
+ start_pfn, end_pfn,
+ root_stable_tree +
+ nid))
node = rb_first(root_stable_tree + nid);
- } else
+ else
node = rb_next(node);
cond_resched();
}
@@ -2293,6 +2896,47 @@ static ssize_t use_zero_pages_store(struct kobject *kobj,
}
KSM_ATTR(use_zero_pages);
+static ssize_t max_page_sharing_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_max_page_sharing);
+}
+
+static ssize_t max_page_sharing_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ int knob;
+
+ err = kstrtoint(buf, 10, &knob);
+ if (err)
+ return err;
+ /*
+ * When a KSM page is created it is shared by 2 mappings. This
+ * being a signed comparison, it implicitly verifies it's not
+ * negative.
+ */
+ if (knob < 2)
+ return -EINVAL;
+
+ if (READ_ONCE(ksm_max_page_sharing) == knob)
+ return count;
+
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksm_max_page_sharing != knob) {
+ if (ksm_pages_shared || remove_all_stable_nodes())
+ err = -EBUSY;
+ else
+ ksm_max_page_sharing = knob;
+ }
+ mutex_unlock(&ksm_thread_mutex);
+
+ return err ? err : count;
+}
+KSM_ATTR(max_page_sharing);
+
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -2331,6 +2975,46 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
}
KSM_ATTR_RO(pages_volatile);
+static ssize_t stable_node_dups_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_stable_node_dups);
+}
+KSM_ATTR_RO(stable_node_dups);
+
+static ssize_t stable_node_chains_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_stable_node_chains);
+}
+KSM_ATTR_RO(stable_node_chains);
+
+static ssize_t
+stable_node_chains_prune_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
+}
+
+static ssize_t
+stable_node_chains_prune_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ ksm_stable_node_chains_prune_millisecs = msecs;
+
+ return count;
+}
+KSM_ATTR(stable_node_chains_prune_millisecs);
+
static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -2350,6 +3034,10 @@ static struct attribute *ksm_attrs[] = {
#ifdef CONFIG_NUMA
&merge_across_nodes_attr.attr,
#endif
+ &max_page_sharing_attr.attr,
+ &stable_node_chains_attr.attr,
+ &stable_node_dups_attr.attr,
+ &stable_node_chains_prune_millisecs_attr.attr,
&use_zero_pages_attr.attr,
NULL,
};
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 234676e31edd..7a40fa2be858 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -117,6 +117,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
l = list_lru_from_kmem(nlru, item);
list_add_tail(item, &l->list);
l->nr_items++;
+ nlru->nr_items++;
spin_unlock(&nlru->lock);
return true;
}
@@ -136,6 +137,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
l = list_lru_from_kmem(nlru, item);
list_del_init(item);
l->nr_items--;
+ nlru->nr_items--;
spin_unlock(&nlru->lock);
return true;
}
@@ -183,15 +185,10 @@ EXPORT_SYMBOL_GPL(list_lru_count_one);
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
{
- long count = 0;
- int memcg_idx;
+ struct list_lru_node *nlru;
- count += __list_lru_count_one(lru, nid, -1);
- if (list_lru_memcg_aware(lru)) {
- for_each_memcg_cache_index(memcg_idx)
- count += __list_lru_count_one(lru, nid, memcg_idx);
- }
- return count;
+ nlru = &lru->node[nid];
+ return nlru->nr_items;
}
EXPORT_SYMBOL_GPL(list_lru_count_node);
@@ -226,6 +223,7 @@ restart:
assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
isolated++;
+ nlru->nr_items--;
/*
* If the lru lock has been dropped, our list
* traversal is now invalid and so we have to
diff --git a/mm/madvise.c b/mm/madvise.c
index 25b78ee4fc2c..9976852f1e1c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -205,7 +205,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
continue;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, index);
+ vma, index, false);
if (page)
put_page(page);
}
@@ -246,7 +246,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
}
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
- NULL, 0);
+ NULL, 0, false);
if (page)
put_page(page);
}
@@ -451,9 +451,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
- if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
- return -EINVAL;
-
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
return -EINVAL;
@@ -477,14 +474,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
return 0;
}
-static long madvise_free(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
-{
- *prev = vma;
- return madvise_free_single_vma(vma, start, end);
-}
-
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
@@ -504,9 +493,17 @@ static long madvise_free(struct vm_area_struct *vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ zap_page_range(vma, start, end - start);
+ return 0;
+}
+
+static long madvise_dontneed_free(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ int behavior)
{
*prev = vma;
if (!can_madv_dontneed_vma(vma))
@@ -526,7 +523,8 @@ static long madvise_dontneed(struct vm_area_struct *vma,
* is also < vma->vm_end. If start <
* vma->vm_start it means an hole materialized
* in the user address space within the
- * virtual range passed to MADV_DONTNEED.
+ * virtual range passed to MADV_DONTNEED
+ * or MADV_FREE.
*/
return -ENOMEM;
}
@@ -537,7 +535,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
* Don't fail if end > vma->vm_end. If the old
* vma was splitted while the mmap_sem was
* released the effect of the concurrent
- * operation may not cause MADV_DONTNEED to
+ * operation may not cause madvise() to
* have an undefined result. There may be an
* adjacent next vma that we'll walk
* next. userfaultfd_remove() will generate an
@@ -549,8 +547,13 @@ static long madvise_dontneed(struct vm_area_struct *vma,
}
VM_WARN_ON(start >= end);
}
- zap_page_range(vma, start, end - start);
- return 0;
+
+ if (behavior == MADV_DONTNEED)
+ return madvise_dontneed_single_vma(vma, start, end);
+ else if (behavior == MADV_FREE)
+ return madvise_free_single_vma(vma, start, end);
+ else
+ return -EINVAL;
}
/*
@@ -656,9 +659,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_FREE:
- return madvise_free(vma, prev, start, end);
case MADV_DONTNEED:
- return madvise_dontneed(vma, prev, start, end);
+ return madvise_dontneed_free(vma, prev, start, end, behavior);
default:
return madvise_behavior(vma, prev, start, end, behavior);
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 7b8a5db76a2f..2cb25fe4452c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -54,9 +54,6 @@ struct memblock memblock __initdata_memblock = {
};
int memblock_debug __initdata_memblock;
-#ifdef CONFIG_MOVABLE_NODE
-bool movable_node_enabled __initdata_memblock = false;
-#endif
static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 94172089f52f..3df3c04d73ab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -170,7 +170,7 @@ struct mem_cgroup_event {
*/
poll_table pt;
wait_queue_head_t *wqh;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct work_struct remove;
};
@@ -631,7 +631,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
val = __this_cpu_read(memcg->stat->nr_page_events);
next = __this_cpu_read(memcg->stat->targets[target]);
/* from time_after() in jiffies.h */
- if ((long)next - (long)val < 0) {
+ if ((long)(next - val) < 0) {
switch (target) {
case MEM_CGROUP_TARGET_THRESH:
next = val + THRESHOLDS_EVENTS_TARGET;
@@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
struct mem_cgroup *memcg;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
};
-static int memcg_oom_wake_function(wait_queue_t *wait,
+static int memcg_oom_wake_function(wait_queue_entry_t *wait,
unsigned mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
@@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
- INIT_LIST_HEAD(&owait.wait.task_list);
+ INIT_LIST_HEAD(&owait.wait.entry);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
mem_cgroup_mark_under_oom(memcg);
@@ -2376,10 +2376,9 @@ void mem_cgroup_split_huge_fixup(struct page *head)
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
- bool charge)
+ int nr_entries)
{
- int val = (charge) ? 1 : -1;
- this_cpu_add(memcg->stat->count[MEMCG_SWAP], val);
+ this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries);
}
/**
@@ -2405,8 +2404,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
new_id = mem_cgroup_id(to);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
- mem_cgroup_swap_statistics(from, false);
- mem_cgroup_swap_statistics(to, true);
+ mem_cgroup_swap_statistics(from, -1);
+ mem_cgroup_swap_statistics(to, 1);
return 0;
}
return -EINVAL;
@@ -3574,6 +3573,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
+ seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
return 0;
}
@@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work)
*
* Called with wqh->lock held and interrupts disabled.
*/
-static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
struct mem_cgroup_event *event =
@@ -4122,6 +4122,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
+ pn->lruvec_stat = alloc_percpu(struct lruvec_stat);
+ if (!pn->lruvec_stat) {
+ kfree(pn);
+ return 1;
+ }
+
lruvec_init(&pn->lruvec);
pn->usage_in_excess = 0;
pn->on_tree = false;
@@ -4133,7 +4139,10 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
- kfree(memcg->nodeinfo[node]);
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+
+ free_percpu(pn->lruvec_stat);
+ kfree(pn);
}
static void __mem_cgroup_free(struct mem_cgroup *memcg)
@@ -5165,6 +5174,7 @@ static int memory_events_show(struct seq_file *m, void *v)
seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
+ seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
return 0;
}
@@ -5197,8 +5207,8 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "kernel_stack %llu\n",
(u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
seq_printf(m, "slab %llu\n",
- (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
- stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+ (u64)(stat[NR_SLAB_RECLAIMABLE] +
+ stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
(u64)stat[MEMCG_SOCK] * PAGE_SIZE);
@@ -5222,15 +5232,25 @@ static int memory_stat_show(struct seq_file *m, void *v)
}
seq_printf(m, "slab_reclaimable %llu\n",
- (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+ (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
/* Accumulated memory events */
seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+ seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
+ seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
+ events[PGSCAN_DIRECT]);
+ seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
+ events[PGSTEAL_DIRECT]);
+ seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
+ seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
+ seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
+ seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
+
seq_printf(m, "workingset_refault %lu\n",
stat[WORKINGSET_REFAULT]);
seq_printf(m, "workingset_activate %lu\n",
@@ -5297,38 +5317,52 @@ struct cgroup_subsys memory_cgrp_subsys = {
/**
* mem_cgroup_low - check if memory consumption is below the normal range
- * @root: the highest ancestor to consider
+ * @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
* Returns %true if memory consumption of @memcg, and that of all
- * configurable ancestors up to @root, is below the normal range.
+ * ancestors up to (but not including) @root, is below the normal range.
+ *
+ * @root is exclusive; it is never low when looked at directly and isn't
+ * checked when traversing the hierarchy.
+ *
+ * Excluding @root enables using memory.low to prioritize memory usage
+ * between cgroups within a subtree of the hierarchy that is limited by
+ * memory.high or memory.max.
+ *
+ * For example, given cgroup A with children B and C:
+ *
+ * A
+ * / \
+ * B C
+ *
+ * and
+ *
+ * 1. A/memory.current > A/memory.high
+ * 2. A/B/memory.current < A/B/memory.low
+ * 3. A/C/memory.current >= A/C/memory.low
+ *
+ * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
+ * should reclaim from 'C' until 'A' is no longer high or until we can
+ * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
+ * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered
+ * low and we will reclaim indiscriminately from both 'B' and 'C'.
*/
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
return false;
- /*
- * The toplevel group doesn't have a configurable range, so
- * it's never low when looked at directly, and it is not
- * considered an ancestor when assessing the hierarchy.
- */
-
- if (memcg == root_mem_cgroup)
- return false;
-
- if (page_counter_read(&memcg->memory) >= memcg->low)
+ if (!root)
+ root = root_mem_cgroup;
+ if (memcg == root)
return false;
- while (memcg != root) {
- memcg = parent_mem_cgroup(memcg);
-
- if (memcg == root_mem_cgroup)
- break;
-
+ for (; memcg != root; memcg = parent_mem_cgroup(memcg)) {
if (page_counter_read(&memcg->memory) >= memcg->low)
return false;
}
+
return true;
}
@@ -5445,7 +5479,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
* let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate.
*/
- mem_cgroup_uncharge_swap(entry);
+ mem_cgroup_uncharge_swap(entry, nr_pages);
}
}
@@ -5873,9 +5907,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg));
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(swap_memcg, true);
+ mem_cgroup_swap_statistics(swap_memcg, 1);
page->mem_cgroup = NULL;
@@ -5902,19 +5936,20 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
css_put(&memcg->css);
}
-/*
- * mem_cgroup_try_charge_swap - try charging a swap entry
+/**
+ * mem_cgroup_try_charge_swap - try charging swap space for a page
* @page: page being added to swap
* @entry: swap entry to charge
*
- * Try to charge @entry to the memcg that @page belongs to.
+ * Try to charge @page's memcg for the swap space at @entry.
*
* Returns 0 on success, -ENOMEM on failure.
*/
int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
- struct mem_cgroup *memcg;
+ unsigned int nr_pages = hpage_nr_pages(page);
struct page_counter *counter;
+ struct mem_cgroup *memcg;
unsigned short oldid;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
@@ -5929,25 +5964,27 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
memcg = mem_cgroup_id_get_online(memcg);
if (!mem_cgroup_is_root(memcg) &&
- !page_counter_try_charge(&memcg->swap, 1, &counter)) {
+ !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
mem_cgroup_id_put(memcg);
return -ENOMEM;
}
- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ /* Get references for the tail pages, too */
+ if (nr_pages > 1)
+ mem_cgroup_id_get_many(memcg, nr_pages - 1);
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(memcg, true);
+ mem_cgroup_swap_statistics(memcg, nr_pages);
return 0;
}
/**
- * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * mem_cgroup_uncharge_swap - uncharge swap space
* @entry: swap entry to uncharge
- *
- * Drop the swap charge associated with @entry.
+ * @nr_pages: the amount of swap space to uncharge
*/
-void mem_cgroup_uncharge_swap(swp_entry_t entry)
+void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
struct mem_cgroup *memcg;
unsigned short id;
@@ -5955,18 +5992,18 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
if (!do_swap_account)
return;
- id = swap_cgroup_record(entry, 0);
+ id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
if (!mem_cgroup_is_root(memcg)) {
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->swap, 1);
+ page_counter_uncharge(&memcg->swap, nr_pages);
else
- page_counter_uncharge(&memcg->memsw, 1);
+ page_counter_uncharge(&memcg->memsw, nr_pages);
}
- mem_cgroup_swap_statistics(memcg, false);
- mem_cgroup_id_put(memcg);
+ mem_cgroup_swap_statistics(memcg, -nr_pages);
+ mem_cgroup_id_put_many(memcg, nr_pages);
}
rcu_read_unlock();
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ecc183fd94f3..1cd3b3569af8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -49,7 +49,6 @@
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
-#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
@@ -555,6 +554,39 @@ static int delete_from_lru_cache(struct page *p)
return -EIO;
}
+static int truncate_error_page(struct page *p, unsigned long pfn,
+ struct address_space *mapping)
+{
+ int ret = MF_FAILED;
+
+ if (mapping->a_ops->error_remove_page) {
+ int err = mapping->a_ops->error_remove_page(mapping, p);
+
+ if (err != 0) {
+ pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
+ pfn, err);
+ } else if (page_has_private(p) &&
+ !try_to_release_page(p, GFP_NOIO)) {
+ pr_info("Memory failure: %#lx: failed to release buffers\n",
+ pfn);
+ } else {
+ ret = MF_RECOVERED;
+ }
+ } else {
+ /*
+ * If the file system doesn't support it just invalidate
+ * This fails on dirty or anything with private pages
+ */
+ if (invalidate_inode_page(p))
+ ret = MF_RECOVERED;
+ else
+ pr_info("Memory failure: %#lx: Failed to invalidate\n",
+ pfn);
+ }
+
+ return ret;
+}
+
/*
* Error hit kernel page.
* Do nothing, try to be lucky and not touch this instead. For a few cases we
@@ -579,8 +611,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
- int err;
- int ret = MF_FAILED;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -612,30 +642,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- if (mapping->a_ops->error_remove_page) {
- err = mapping->a_ops->error_remove_page(mapping, p);
- if (err != 0) {
- pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
- pfn, err);
- } else if (page_has_private(p) &&
- !try_to_release_page(p, GFP_NOIO)) {
- pr_info("Memory failure: %#lx: failed to release buffers\n",
- pfn);
- } else {
- ret = MF_RECOVERED;
- }
- } else {
- /*
- * If the file system doesn't support it just invalidate
- * This fails on dirty or anything with private pages
- */
- if (invalidate_inode_page(p))
- ret = MF_RECOVERED;
- else
- pr_info("Memory failure: %#lx: Failed to invalidate\n",
- pfn);
- }
- return ret;
+ return truncate_error_page(p, pfn, mapping);
}
/*
@@ -684,7 +691,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* the first EIO, but we're not worse than other parts
* of the kernel.
*/
- mapping_set_error(mapping, EIO);
+ mapping_set_error(mapping, -EIO);
}
return me_pagecache_clean(p, pfn);
@@ -741,24 +748,29 @@ static int me_huge_page(struct page *p, unsigned long pfn)
{
int res = 0;
struct page *hpage = compound_head(p);
+ struct address_space *mapping;
if (!PageHuge(hpage))
return MF_DELAYED;
- /*
- * We can safely recover from error on free or reserved (i.e.
- * not in-use) hugepage by dequeuing it from freelist.
- * To check whether a hugepage is in-use or not, we can't use
- * page->lru because it can be used in other hugepage operations,
- * such as __unmap_hugepage_range() and gather_surplus_pages().
- * So instead we use page_mapping() and PageAnon().
- */
- if (!(page_mapping(hpage) || PageAnon(hpage))) {
- res = dequeue_hwpoisoned_huge_page(hpage);
- if (!res)
- return MF_RECOVERED;
+ mapping = page_mapping(hpage);
+ if (mapping) {
+ res = truncate_error_page(hpage, pfn, mapping);
+ } else {
+ unlock_page(hpage);
+ /*
+ * migration entry prevents later access on error anonymous
+ * hugepage, so we can free and dissolve it into buddy to
+ * save healthy subpages.
+ */
+ if (PageAnon(hpage))
+ put_page(hpage);
+ dissolve_free_huge_page(p);
+ res = MF_RECOVERED;
+ lock_page(hpage);
}
- return MF_DELAYED;
+
+ return res;
}
/*
@@ -857,7 +869,7 @@ static int page_action(struct page_state *ps, struct page *p,
count = page_count(p) - 1;
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
- if (count != 0) {
+ if (count > 0) {
pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
pfn, action_page_types[ps->type], count);
result = MF_FAILED;
@@ -1010,20 +1022,84 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
return unmap_success;
}
-static void set_page_hwpoison_huge_page(struct page *hpage)
+static int identify_page_state(unsigned long pfn, struct page *p,
+ unsigned long page_flags)
{
- int i;
- int nr_pages = 1 << compound_order(hpage);
- for (i = 0; i < nr_pages; i++)
- SetPageHWPoison(hpage + i);
+ struct page_state *ps;
+
+ /*
+ * The first check uses the current page flags which may not have any
+ * relevant information. The second check with the saved page flags is
+ * carried out only if the first check can't determine the page status.
+ */
+ for (ps = error_states;; ps++)
+ if ((p->flags & ps->mask) == ps->res)
+ break;
+
+ page_flags |= (p->flags & (1UL << PG_dirty));
+
+ if (!ps->mask)
+ for (ps = error_states;; ps++)
+ if ((page_flags & ps->mask) == ps->res)
+ break;
+ return page_action(ps, p, pfn);
}
-static void clear_page_hwpoison_huge_page(struct page *hpage)
+static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags)
{
- int i;
- int nr_pages = 1 << compound_order(hpage);
- for (i = 0; i < nr_pages; i++)
- ClearPageHWPoison(hpage + i);
+ struct page *p = pfn_to_page(pfn);
+ struct page *head = compound_head(p);
+ int res;
+ unsigned long page_flags;
+
+ if (TestSetPageHWPoison(head)) {
+ pr_err("Memory failure: %#lx: already hardware poisoned\n",
+ pfn);
+ return 0;
+ }
+
+ num_poisoned_pages_inc();
+
+ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
+ /*
+ * Check "filter hit" and "race with other subpage."
+ */
+ lock_page(head);
+ if (PageHWPoison(head)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != head && TestSetPageHWPoison(head))) {
+ num_poisoned_pages_dec();
+ unlock_page(head);
+ return 0;
+ }
+ }
+ unlock_page(head);
+ dissolve_free_huge_page(p);
+ action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
+ return 0;
+ }
+
+ lock_page(head);
+ page_flags = head->flags;
+
+ if (!PageHWPoison(head)) {
+ pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
+ num_poisoned_pages_dec();
+ unlock_page(head);
+ put_hwpoison_page(head);
+ return 0;
+ }
+
+ if (!hwpoison_user_mappings(p, pfn, trapno, flags, &head)) {
+ action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
+ res = identify_page_state(pfn, p, page_flags);
+out:
+ unlock_page(head);
+ return res;
}
/**
@@ -1046,12 +1122,10 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
*/
int memory_failure(unsigned long pfn, int trapno, int flags)
{
- struct page_state *ps;
struct page *p;
struct page *hpage;
struct page *orig_head;
int res;
- unsigned int nr_pages;
unsigned long page_flags;
if (!sysctl_memory_failure_recovery)
@@ -1064,34 +1138,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
p = pfn_to_page(pfn);
- orig_head = hpage = compound_head(p);
+ if (PageHuge(p))
+ return memory_failure_hugetlb(pfn, trapno, flags);
if (TestSetPageHWPoison(p)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
return 0;
}
- /*
- * Currently errors on hugetlbfs pages are measured in hugepage units,
- * so nr_pages should be 1 << compound_order. OTOH when errors are on
- * transparent hugepages, they are supposed to be split and error
- * measurement is done in normal page units. So nr_pages should be one
- * in this case.
- */
- if (PageHuge(p))
- nr_pages = 1 << compound_order(hpage);
- else /* normal page or thp */
- nr_pages = 1;
- num_poisoned_pages_add(nr_pages);
+ orig_head = hpage = compound_head(p);
+ num_poisoned_pages_inc();
/*
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
* prep_new_page() will be the gate keeper.
- * 2) it's a free hugepage, which is also safe:
- * an affected hugepage will be dequeued from hugepage freelist,
- * so there's no concern about reusing it ever after.
- * 3) it's part of a non-compound high order page.
+ * 2) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
@@ -1102,32 +1164,13 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (is_free_buddy_page(p)) {
action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
return 0;
- } else if (PageHuge(hpage)) {
- /*
- * Check "filter hit" and "race with other subpage."
- */
- lock_page(hpage);
- if (PageHWPoison(hpage)) {
- if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
- || (p != hpage && TestSetPageHWPoison(hpage))) {
- num_poisoned_pages_sub(nr_pages);
- unlock_page(hpage);
- return 0;
- }
- }
- set_page_hwpoison_huge_page(hpage);
- res = dequeue_hwpoisoned_huge_page(hpage);
- action_result(pfn, MF_MSG_FREE_HUGE,
- res ? MF_IGNORED : MF_DELAYED);
- unlock_page(hpage);
- return res;
} else {
action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
return -EBUSY;
}
}
- if (!PageHuge(p) && PageTransHuge(hpage)) {
+ if (PageTransHuge(hpage)) {
lock_page(p);
if (!PageAnon(p) || unlikely(split_huge_page(p))) {
unlock_page(p);
@@ -1138,7 +1181,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
pr_err("Memory failure: %#lx: thp split failed\n",
pfn);
if (TestClearPageHWPoison(p))
- num_poisoned_pages_sub(nr_pages);
+ num_poisoned_pages_dec();
put_hwpoison_page(p);
return -EBUSY;
}
@@ -1165,7 +1208,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
- lock_page(hpage);
+ lock_page(p);
/*
* The page could have changed compound pages during the locking.
@@ -1194,42 +1237,23 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (!PageHWPoison(p)) {
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
- num_poisoned_pages_sub(nr_pages);
- unlock_page(hpage);
- put_hwpoison_page(hpage);
+ num_poisoned_pages_dec();
+ unlock_page(p);
+ put_hwpoison_page(p);
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- num_poisoned_pages_sub(nr_pages);
- unlock_page(hpage);
- put_hwpoison_page(hpage);
+ num_poisoned_pages_dec();
+ unlock_page(p);
+ put_hwpoison_page(p);
return 0;
}
- if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
+ if (!PageTransTail(p) && !PageLRU(p))
goto identify_page_state;
/*
- * For error on the tail page, we should set PG_hwpoison
- * on the head page to show that the hugepage is hwpoisoned
- */
- if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
- action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
- unlock_page(hpage);
- put_hwpoison_page(hpage);
- return 0;
- }
- /*
- * Set PG_hwpoison on all pages in an error hugepage,
- * because containment is done in hugepage unit for now.
- * Since we have done TestSetPageHWPoison() for the head page with
- * page lock held, we can safely set PG_hwpoison bits on tail pages.
- */
- if (PageHuge(p))
- set_page_hwpoison_huge_page(hpage);
-
- /*
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
@@ -1258,25 +1282,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
identify_page_state:
- res = -EBUSY;
- /*
- * The first check uses the current page flags which may not have any
- * relevant information. The second check with the saved page flagss is
- * carried out only if the first check can't determine the page status.
- */
- for (ps = error_states;; ps++)
- if ((p->flags & ps->mask) == ps->res)
- break;
-
- page_flags |= (p->flags & (1UL << PG_dirty));
-
- if (!ps->mask)
- for (ps = error_states;; ps++)
- if ((page_flags & ps->mask) == ps->res)
- break;
- res = page_action(ps, p, pfn);
+ res = identify_page_state(pfn, p, page_flags);
out:
- unlock_page(hpage);
+ unlock_page(p);
return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
@@ -1398,7 +1406,6 @@ int unpoison_memory(unsigned long pfn)
struct page *page;
struct page *p;
int freeit = 0;
- unsigned int nr_pages;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -1443,20 +1450,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
- nr_pages = 1 << compound_order(page);
-
if (!get_hwpoison_page(p)) {
- /*
- * Since HWPoisoned hugepage should have non-zero refcount,
- * race between memory failure and unpoison seems to happen.
- * In such case unpoison fails and memory failure runs
- * to the end.
- */
- if (PageHuge(page)) {
- unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n",
- pfn, &unpoison_rs);
- return 0;
- }
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
@@ -1474,10 +1468,8 @@ int unpoison_memory(unsigned long pfn)
if (TestClearPageHWPoison(page)) {
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
pfn, &unpoison_rs);
- num_poisoned_pages_sub(nr_pages);
+ num_poisoned_pages_dec();
freeit = 1;
- if (PageHuge(page))
- clear_page_hwpoison_huge_page(page);
}
unlock_page(page);
@@ -1492,11 +1484,8 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
int nid = page_to_nid(p);
- if (PageHuge(p))
- return alloc_huge_page_node(page_hstate(compound_head(p)),
- nid);
- else
- return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+
+ return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
}
/*
@@ -1603,15 +1592,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
- /* overcommit hugetlb page will be freed to buddy */
- if (PageHuge(page)) {
- set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- num_poisoned_pages_add(1 << compound_order(hpage));
- } else {
- SetPageHWPoison(page);
- num_poisoned_pages_inc();
- }
+ if (PageHuge(page))
+ dissolve_free_huge_page(page);
}
return ret;
}
@@ -1727,15 +1709,12 @@ static int soft_offline_in_use_page(struct page *page, int flags)
static void soft_offline_free_page(struct page *page)
{
- if (PageHuge(page)) {
- struct page *hpage = compound_head(page);
+ struct page *head = compound_head(page);
- set_page_hwpoison_huge_page(hpage);
- if (!dequeue_hwpoisoned_huge_page(hpage))
- num_poisoned_pages_add(1 << compound_order(hpage));
- } else {
- if (!TestSetPageHWPoison(page))
- num_poisoned_pages_inc();
+ if (!TestSetPageHWPoison(head)) {
+ num_poisoned_pages_inc();
+ if (PageHuge(head))
+ dissolve_free_huge_page(page);
}
}
diff --git a/mm/memory.c b/mm/memory.c
index bb11c474857e..0e517be91a89 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2719,7 +2719,7 @@ int do_swap_page(struct vm_fault *vmf)
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
@@ -3262,14 +3262,14 @@ static int fault_around_bytes_set(void *data, u64 val)
fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
+DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
static int __init fault_around_debugfs(void)
{
void *ret;
- ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
+ ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
&fault_around_bytes_fops);
if (!ret)
pr_warn("Failed to create fault_around_bytes in debugfs");
@@ -3591,7 +3591,7 @@ out:
return 0;
}
-static int create_huge_pmd(struct vm_fault *vmf)
+static inline int create_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
@@ -3837,7 +3837,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT);
+ count_memcg_event_mm(vma->vm_mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
@@ -4014,8 +4014,6 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
- if (!ptep)
- goto out;
if (!pte_present(*ptep))
goto unlock;
*ptepp = ptep;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b63d7d1239df..8dccc317aac2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -52,32 +52,19 @@ static void generic_online_page(struct page *page);
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
-/* The same as the cpu_hotplug lock, but for memory hotplug. */
-static struct {
- struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
- /*
- * Also blocks the new readers during
- * an ongoing mem hotplug operation.
- */
- int refcount;
+DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-} mem_hotplug = {
- .active_writer = NULL,
- .lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
- .refcount = 0,
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "mem_hotplug.lock" },
-#endif
-};
+void get_online_mems(void)
+{
+ percpu_down_read(&mem_hotplug_lock);
+}
-/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
-#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
-#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
-#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+void put_online_mems(void)
+{
+ percpu_up_read(&mem_hotplug_lock);
+}
+
+bool movable_node_enabled = false;
#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
@@ -97,60 +84,16 @@ static int __init setup_memhp_default_state(char *str)
}
__setup("memhp_default_state=", setup_memhp_default_state);
-void get_online_mems(void)
-{
- might_sleep();
- if (mem_hotplug.active_writer == current)
- return;
- memhp_lock_acquire_read();
- mutex_lock(&mem_hotplug.lock);
- mem_hotplug.refcount++;
- mutex_unlock(&mem_hotplug.lock);
-
-}
-
-void put_online_mems(void)
-{
- if (mem_hotplug.active_writer == current)
- return;
- mutex_lock(&mem_hotplug.lock);
-
- if (WARN_ON(!mem_hotplug.refcount))
- mem_hotplug.refcount++; /* try to fix things up */
-
- if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
- wake_up_process(mem_hotplug.active_writer);
- mutex_unlock(&mem_hotplug.lock);
- memhp_lock_release();
-
-}
-
-/* Serializes write accesses to mem_hotplug.active_writer. */
-static DEFINE_MUTEX(memory_add_remove_lock);
-
void mem_hotplug_begin(void)
{
- mutex_lock(&memory_add_remove_lock);
-
- mem_hotplug.active_writer = current;
-
- memhp_lock_acquire();
- for (;;) {
- mutex_lock(&mem_hotplug.lock);
- if (likely(!mem_hotplug.refcount))
- break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&mem_hotplug.lock);
- schedule();
- }
+ cpus_read_lock();
+ percpu_down_write(&mem_hotplug_lock);
}
void mem_hotplug_done(void)
{
- mem_hotplug.active_writer = NULL;
- mutex_unlock(&mem_hotplug.lock);
- memhp_lock_release();
- mutex_unlock(&memory_add_remove_lock);
+ percpu_up_write(&mem_hotplug_lock);
+ cpus_read_unlock();
}
/* add this memory to iomem resource */
@@ -300,229 +243,38 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
-static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long old_zone_end_pfn;
-
- zone_span_writelock(zone);
-
- old_zone_end_pfn = zone_end_pfn(zone);
- if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
- zone->zone_start_pfn = start_pfn;
-
- zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
- zone->zone_start_pfn;
-
- zone_span_writeunlock(zone);
-}
-
-static void resize_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- zone_span_writelock(zone);
-
- if (end_pfn - start_pfn) {
- zone->zone_start_pfn = start_pfn;
- zone->spanned_pages = end_pfn - start_pfn;
- } else {
- /*
- * make it consist as free_area_init_core(),
- * if spanned_pages = 0, then keep start_pfn = 0
- */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
- }
-
- zone_span_writeunlock(zone);
-}
-
-static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- enum zone_type zid = zone_idx(zone);
- int nid = zone->zone_pgdat->node_id;
- unsigned long pfn;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
- set_page_links(pfn_to_page(pfn), zid, nid, pfn);
-}
-
-/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
- * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
-static int __ref ensure_zone_is_initialized(struct zone *zone,
- unsigned long start_pfn, unsigned long num_pages)
-{
- if (!zone_is_initialized(zone))
- return init_currently_empty_zone(zone, start_pfn, num_pages);
-
- return 0;
-}
-
-static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- int ret;
- unsigned long flags;
- unsigned long z1_start_pfn;
-
- ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
- if (ret)
- return ret;
-
- pgdat_resize_lock(z1->zone_pgdat, &flags);
-
- /* can't move pfns which are higher than @z2 */
- if (end_pfn > zone_end_pfn(z2))
- goto out_fail;
- /* the move out part must be at the left most of @z2 */
- if (start_pfn > z2->zone_start_pfn)
- goto out_fail;
- /* must included/overlap */
- if (end_pfn <= z2->zone_start_pfn)
- goto out_fail;
-
- /* use start_pfn for z1's start_pfn if z1 is empty */
- if (!zone_is_empty(z1))
- z1_start_pfn = z1->zone_start_pfn;
- else
- z1_start_pfn = start_pfn;
-
- resize_zone(z1, z1_start_pfn, end_pfn);
- resize_zone(z2, end_pfn, zone_end_pfn(z2));
-
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
-
- fix_zone_id(z1, start_pfn, end_pfn);
-
- return 0;
-out_fail:
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
- return -1;
-}
-
-static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
- unsigned long start_pfn, unsigned long end_pfn)
+static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
+ bool want_memblock)
{
int ret;
- unsigned long flags;
- unsigned long z2_end_pfn;
-
- ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
- if (ret)
- return ret;
-
- pgdat_resize_lock(z1->zone_pgdat, &flags);
-
- /* can't move pfns which are lower than @z1 */
- if (z1->zone_start_pfn > start_pfn)
- goto out_fail;
- /* the move out part mast at the right most of @z1 */
- if (zone_end_pfn(z1) > end_pfn)
- goto out_fail;
- /* must included/overlap */
- if (start_pfn >= zone_end_pfn(z1))
- goto out_fail;
-
- /* use end_pfn for z2's end_pfn if z2 is empty */
- if (!zone_is_empty(z2))
- z2_end_pfn = zone_end_pfn(z2);
- else
- z2_end_pfn = end_pfn;
-
- resize_zone(z1, z1->zone_start_pfn, start_pfn);
- resize_zone(z2, start_pfn, z2_end_pfn);
-
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
-
- fix_zone_id(z2, start_pfn, end_pfn);
-
- return 0;
-out_fail:
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
- return -1;
-}
-
-static struct zone * __meminit move_pfn_range(int zone_shift,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- struct zone *zone = page_zone(pfn_to_page(start_pfn));
- int ret = 0;
-
- if (zone_shift < 0)
- ret = move_pfn_range_left(zone + zone_shift, zone,
- start_pfn, end_pfn);
- else if (zone_shift)
- ret = move_pfn_range_right(zone, zone + zone_shift,
- start_pfn, end_pfn);
-
- if (ret)
- return NULL;
-
- return zone + zone_shift;
-}
-
-static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
-
- if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
- pgdat->node_start_pfn = start_pfn;
-
- pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
- pgdat->node_start_pfn;
-}
+ int i;
-static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- int nr_pages = PAGES_PER_SECTION;
- int nid = pgdat->node_id;
- int zone_type;
- unsigned long flags, pfn;
- int ret;
+ if (pfn_valid(phys_start_pfn))
+ return -EEXIST;
- zone_type = zone - pgdat->node_zones;
- ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
- if (ret)
+ ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
+ if (ret < 0)
return ret;
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
- grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
- phys_start_pfn + nr_pages);
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
- memmap_init_zone(nr_pages, nid, zone_type,
- phys_start_pfn, MEMMAP_HOTPLUG);
-
- /* online_page_range is called later and expects pages reserved */
- for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
+ /*
+ * Make all the pages reserved so that nobody will stumble over half
+ * initialized state.
+ * FIXME: We also have to associate it with a node because pfn_to_node
+ * relies on having page with the proper node.
+ */
+ for (i = 0; i < PAGES_PER_SECTION; i++) {
+ unsigned long pfn = phys_start_pfn + i;
+ struct page *page;
if (!pfn_valid(pfn))
continue;
- SetPageReserved(pfn_to_page(pfn));
+ page = pfn_to_page(pfn);
+ set_page_node(page, nid);
+ SetPageReserved(page);
}
- return 0;
-}
-
-static int __meminit __add_section(int nid, struct zone *zone,
- unsigned long phys_start_pfn)
-{
- int ret;
-
- if (pfn_valid(phys_start_pfn))
- return -EEXIST;
-
- ret = sparse_add_one_section(zone, phys_start_pfn);
-
- if (ret < 0)
- return ret;
- ret = __add_zone(zone, phys_start_pfn);
-
- if (ret < 0)
- return ret;
+ if (!want_memblock)
+ return 0;
return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}
@@ -533,16 +285,14 @@ static int __meminit __add_section(int nid, struct zone *zone,
* call this function after deciding the zone to which to
* add the new pages.
*/
-int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages)
+int __ref __add_pages(int nid, unsigned long phys_start_pfn,
+ unsigned long nr_pages, bool want_memblock)
{
unsigned long i;
int err = 0;
int start_sec, end_sec;
struct vmem_altmap *altmap;
- clear_zone_contiguous(zone);
-
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -562,7 +312,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
}
for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, zone, section_nr_to_pfn(i));
+ err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
/*
* EEXIST is finally dealt with by ioresource collision
@@ -575,7 +325,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
}
vmemmap_populate_print_last();
out:
- set_zone_contiguous(zone);
return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
@@ -772,11 +521,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nr_pages = PAGES_PER_SECTION;
- int zone_type;
unsigned long flags;
- zone_type = zone - pgdat->node_zones;
-
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
@@ -939,33 +685,20 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
unsigned long i;
unsigned long onlined_pages = *(unsigned long *)arg;
struct page *page;
+
if (PageReserved(pfn_to_page(start_pfn)))
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(start_pfn + i);
(*online_page_callback)(page);
onlined_pages++;
}
+
+ online_mem_sections(start_pfn, start_pfn + nr_pages);
+
*(unsigned long *)arg = onlined_pages;
return 0;
}
-#ifdef CONFIG_MOVABLE_NODE
-/*
- * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
- * normal memory.
- */
-static bool can_online_high_movable(struct zone *zone)
-{
- return true;
-}
-#else /* CONFIG_MOVABLE_NODE */
-/* ensure every online node has NORMAL memory */
-static bool can_online_high_movable(struct zone *zone)
-{
- return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
-}
-#endif /* CONFIG_MOVABLE_NODE */
-
/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
@@ -1040,39 +773,144 @@ static void node_states_set_node(int node, struct memory_notify *arg)
node_set_state(node, N_MEMORY);
}
-bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
- enum zone_type target, int *zone_shift)
+bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
{
- struct zone *zone = page_zone(pfn_to_page(pfn));
- enum zone_type idx = zone_idx(zone);
- int i;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+ struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
- *zone_shift = 0;
+ /*
+ * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
+ * physically before ZONE_MOVABLE. All we need is they do not
+ * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
+ * though so let's stick with it for simplicity for now.
+ * TODO make sure we do not overlap with ZONE_DEVICE
+ */
+ if (online_type == MMOP_ONLINE_KERNEL) {
+ if (zone_is_empty(movable_zone))
+ return true;
+ return movable_zone->zone_start_pfn >= pfn + nr_pages;
+ } else if (online_type == MMOP_ONLINE_MOVABLE) {
+ return zone_end_pfn(default_zone) <= pfn;
+ }
- if (idx < target) {
- /* pages must be at end of current zone */
- if (pfn + nr_pages != zone_end_pfn(zone))
- return false;
+ /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
+ return online_type == MMOP_ONLINE_KEEP;
+}
+
+static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long old_end_pfn = zone_end_pfn(zone);
+
+ if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
+ zone->zone_start_pfn = start_pfn;
+
+ zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+}
- /* no zones in use between current zone and target */
- for (i = idx + 1; i < target; i++)
- if (zone_is_initialized(zone - idx + i))
- return false;
+static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
+
+ if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
+ pgdat->node_start_pfn = start_pfn;
+
+ pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
+}
+
+void __ref move_pfn_range_to_zone(struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int nid = pgdat->node_id;
+ unsigned long flags;
+
+ if (zone_is_empty(zone))
+ init_currently_empty_zone(zone, start_pfn, nr_pages);
+
+ clear_zone_contiguous(zone);
+
+ /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
+ pgdat_resize_lock(pgdat, &flags);
+ zone_span_writelock(zone);
+ resize_zone_range(zone, start_pfn, nr_pages);
+ zone_span_writeunlock(zone);
+ resize_pgdat_range(pgdat, start_pfn, nr_pages);
+ pgdat_resize_unlock(pgdat, &flags);
+
+ /*
+ * TODO now we have a visible range of pages which are not associated
+ * with their zone properly. Not nice but set_pfnblock_flags_mask
+ * expects the zone spans the pfn range. All the pages in the range
+ * are reserved so nobody should be touching them so we should be safe
+ */
+ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
+
+ set_zone_contiguous(zone);
+}
+
+/*
+ * Returns a default kernel memory zone for the given pfn range.
+ * If no kernel zone covers this pfn range it will automatically go
+ * to the ZONE_NORMAL.
+ */
+struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ int zid;
+
+ for (zid = 0; zid <= ZONE_NORMAL; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (zone_intersects(zone, start_pfn, nr_pages))
+ return zone;
}
- if (target < idx) {
- /* pages must be at beginning of current zone */
- if (pfn != zone->zone_start_pfn)
- return false;
+ return &pgdat->node_zones[ZONE_NORMAL];
+}
+
+static inline bool movable_pfn_range(int nid, struct zone *default_zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
+ MMOP_ONLINE_KERNEL))
+ return true;
+
+ if (!movable_node_is_enabled())
+ return false;
- /* no zones in use between current zone and target */
- for (i = target + 1; i < idx; i++)
- if (zone_is_initialized(zone - idx + i))
- return false;
+ return !zone_intersects(default_zone, start_pfn, nr_pages);
+}
+
+/*
+ * Associates the given pfn range with the given node and the zone appropriate
+ * for the given online type.
+ */
+static struct zone * __meminit move_pfn_range(int online_type, int nid,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
+
+ if (online_type == MMOP_ONLINE_KEEP) {
+ struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+ /*
+ * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
+ * movable zone if that is not possible (e.g. we are within
+ * or past the existing movable zone). movable_node overrides
+ * this default and defaults to movable zone
+ */
+ if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
+ zone = movable_zone;
+ } else if (online_type == MMOP_ONLINE_MOVABLE) {
+ zone = &pgdat->node_zones[ZONE_MOVABLE];
}
- *zone_shift = target - idx;
- return true;
+ move_pfn_range_to_zone(zone, start_pfn, nr_pages);
+ return zone;
}
/* Must be protected by mem_hotplug_begin() */
@@ -1085,38 +923,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int nid;
int ret;
struct memory_notify arg;
- int zone_shift = 0;
-
- /*
- * This doesn't need a lock to do pfn_to_page().
- * The section can't be removed here because of the
- * memory_block->state_mutex.
- */
- zone = page_zone(pfn_to_page(pfn));
- if ((zone_idx(zone) > ZONE_NORMAL ||
- online_type == MMOP_ONLINE_MOVABLE) &&
- !can_online_high_movable(zone))
+ nid = pfn_to_nid(pfn);
+ if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
return -EINVAL;
- if (online_type == MMOP_ONLINE_KERNEL) {
- if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
- return -EINVAL;
- } else if (online_type == MMOP_ONLINE_MOVABLE) {
- if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
- return -EINVAL;
- }
-
- zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
- if (!zone)
- return -EINVAL;
+ /* associate pfn range with the zone */
+ zone = move_pfn_range(online_type, nid, pfn, nr_pages);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = zone_to_nid(zone);
-
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
if (ret)
@@ -1311,39 +1129,6 @@ static int check_hotplug_memory_range(u64 start, u64 size)
return 0;
}
-/*
- * If movable zone has already been setup, newly added memory should be check.
- * If its address is higher than movable zone, it should be added as movable.
- * Without this check, movable zone may overlap with other zone.
- */
-static int should_add_memory_movable(int nid, u64 start, u64 size)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- pg_data_t *pgdat = NODE_DATA(nid);
- struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;
-
- if (zone_is_empty(movable_zone))
- return 0;
-
- if (movable_zone->zone_start_pfn <= start_pfn)
- return 1;
-
- return 0;
-}
-
-int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
- bool for_device)
-{
-#ifdef CONFIG_ZONE_DEVICE
- if (for_device)
- return ZONE_DEVICE;
-#endif
- if (should_add_memory_movable(nid, start, size))
- return ZONE_MOVABLE;
-
- return zone_default;
-}
-
static int online_memory_block(struct memory_block *mem, void *arg)
{
return device_online(&mem->dev);
@@ -1389,7 +1174,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
}
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, false);
+ ret = arch_add_memory(nid, start, size, true);
if (ret < 0)
goto error;
@@ -1398,7 +1183,22 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
node_set_online(nid);
if (new_node) {
- ret = register_one_node(nid);
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ ret = __register_one_node(nid);
+ if (ret)
+ goto register_fail;
+
+ /*
+ * link memory sections under this node. This is already
+ * done when creatig memory section in register_new_memory
+ * but that depends to have the node registered so offline
+ * nodes have to go through register_node.
+ * TODO clean up this mess.
+ */
+ ret = link_mem_sections(nid, start_pfn, nr_pages);
+register_fail:
/*
* If sysfs file of new node can't create, cpu on the node
* can't be hot-added. There is no rollback way now.
@@ -1419,7 +1219,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
error:
/* rollback pgdat allocation and others */
- if (new_pgdat)
+ if (new_pgdat && pgdat)
rollback_node_hotadd(nid, pgdat);
memblock_remove(start, size);
@@ -1571,34 +1371,19 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
static struct page *new_node_page(struct page *page, unsigned long private,
int **result)
{
- gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
int nid = page_to_nid(page);
nodemask_t nmask = node_states[N_MEMORY];
- struct page *new_page = NULL;
/*
- * TODO: allocate a destination hugepage from a nearest neighbor node,
- * accordance with memory policy of the user process if possible. For
- * now as a simple work-around, we use the next node for destination.
+ * try to allocate from a different node but reuse this node if there
+ * are no other online nodes to be used (e.g. we are offlining a part
+ * of the only existing node)
*/
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- next_node_in(nid, nmask));
-
node_clear(nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(nid, nmask);
- if (PageHighMem(page)
- || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
- gfp_mask |= __GFP_HIGHMEM;
-
- if (!nodes_empty(nmask))
- new_page = __alloc_pages_nodemask(gfp_mask, 0,
- node_zonelist(nid, gfp_mask), &nmask);
- if (!new_page)
- new_page = __alloc_pages(gfp_mask, 0,
- node_zonelist(nid, gfp_mask));
-
- return new_page;
+ return new_page_nodemask(page, nid, &nmask);
}
#define NR_OFFLINE_AT_ONCE_PAGES (256)
@@ -1725,47 +1510,12 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
-#ifdef CONFIG_MOVABLE_NODE
-/*
- * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
- * normal memory.
- */
-static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
-{
- return true;
-}
-#else /* CONFIG_MOVABLE_NODE */
-/* ensure the node has NORMAL memory if it is still online */
-static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long present_pages = 0;
- enum zone_type zt;
-
- for (zt = 0; zt <= ZONE_NORMAL; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
-
- if (present_pages > nr_pages)
- return true;
-
- present_pages = 0;
- for (; zt <= ZONE_MOVABLE; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
-
- /*
- * we can't offline the last normal memory until all
- * higher memory is offlined.
- */
- return present_pages == 0;
-}
-#endif /* CONFIG_MOVABLE_NODE */
-
static int __init cmdline_parse_movable_node(char *p)
{
-#ifdef CONFIG_MOVABLE_NODE
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
movable_node_enabled = true;
#else
- pr_warn("movable_node option not supported\n");
+ pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
#endif
return 0;
}
@@ -1887,9 +1637,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
- if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
- return -EINVAL;
-
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE, true);
@@ -1919,7 +1666,7 @@ repeat:
goto failed_removal;
ret = 0;
if (drain) {
- lru_add_drain_all();
+ lru_add_drain_all_cpuslocked();
cond_resched();
drain_all_pages(zone);
}
@@ -1940,7 +1687,7 @@ repeat:
}
}
/* drain all zone's lru pagevec, this is asynchronous... */
- lru_add_drain_all();
+ lru_add_drain_all_cpuslocked();
yield();
/* drain pcp pages, this is synchronous. */
drain_all_pages(zone);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 37d0b334bfe9..d911fa5cb2a7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p)
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
- /*
- * If read-side task has no lock to protect task->mempolicy, write-side
- * task will rebind the task->mempolicy by two step. The first step is
- * setting all the newly nodes, and the second step is cleaning all the
- * disallowed nodes. In this way, we can avoid finding no node to alloc
- * page.
- * If we have a lock to protect task->mempolicy in read-side, we do
- * rebind directly.
- *
- * step:
- * MPOL_REBIND_ONCE - do rebind work at once
- * MPOL_REBIND_STEP1 - set all the newly nodes
- * MPOL_REBIND_STEP2 - clean all the disallowed nodes
- */
- void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
- enum mpol_rebind_step step);
+ void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p)
kmem_cache_free(policy_cache, p);
}
-static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
- enum mpol_rebind_step step)
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
-/*
- * step:
- * MPOL_REBIND_ONCE - do rebind work at once
- * MPOL_REBIND_STEP1 - set all the newly nodes
- * MPOL_REBIND_STEP2 - clean all the disallowed nodes
- */
-static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
- enum mpol_rebind_step step)
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
nodemask_t tmp;
@@ -325,41 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- /*
- * if step == 1, we use ->w.cpuset_mems_allowed to cache the
- * result
- */
- if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
- nodes_remap(tmp, pol->v.nodes,
- pol->w.cpuset_mems_allowed, *nodes);
- pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
- } else if (step == MPOL_REBIND_STEP2) {
- tmp = pol->w.cpuset_mems_allowed;
- pol->w.cpuset_mems_allowed = *nodes;
- } else
- BUG();
+ nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
+ *nodes);
+ pol->w.cpuset_mems_allowed = tmp;
}
if (nodes_empty(tmp))
tmp = *nodes;
- if (step == MPOL_REBIND_STEP1)
- nodes_or(pol->v.nodes, pol->v.nodes, tmp);
- else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
- pol->v.nodes = tmp;
- else
- BUG();
-
- if (!node_isset(current->il_next, tmp)) {
- current->il_next = next_node_in(current->il_next, tmp);
- if (current->il_next >= MAX_NUMNODES)
- current->il_next = numa_node_id();
- }
+ pol->v.nodes = tmp;
}
static void mpol_rebind_preferred(struct mempolicy *pol,
- const nodemask_t *nodes,
- enum mpol_rebind_step step)
+ const nodemask_t *nodes)
{
nodemask_t tmp;
@@ -385,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
- * If read-side task has no lock to protect task->mempolicy, write-side
- * task will rebind the task->mempolicy by two step. The first step is
- * setting all the newly nodes, and the second step is cleaning all the
- * disallowed nodes. In this way, we can avoid finding no node to alloc
- * page.
- * If we have a lock to protect task->mempolicy in read-side, we do
- * rebind directly.
- *
- * step:
- * MPOL_REBIND_ONCE - do rebind work at once
- * MPOL_REBIND_STEP1 - set all the newly nodes
- * MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * policies are protected by task->mems_allowed_seq to prevent a premature
+ * OOM/allocation failure due to parallel nodemask modification.
*/
-static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
- enum mpol_rebind_step step)
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
+ if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
- if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
- return;
-
- if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
- BUG();
-
- if (step == MPOL_REBIND_STEP1)
- pol->flags |= MPOL_F_REBINDING;
- else if (step == MPOL_REBIND_STEP2)
- pol->flags &= ~MPOL_F_REBINDING;
- else if (step >= MPOL_REBIND_NSTEP)
- BUG();
-
- mpol_ops[pol->mode].rebind(pol, newmask, step);
+ mpol_ops[pol->mode].rebind(pol, newmask);
}
/*
@@ -430,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
* Called with task's alloc_lock held.
*/
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
- enum mpol_rebind_step step)
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
- mpol_rebind_policy(tsk->mempolicy, new, step);
+ mpol_rebind_policy(tsk->mempolicy, new);
}
/*
@@ -448,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
down_write(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next)
- mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
+ mpol_rebind_policy(vma->vm_policy, new);
up_write(&mm->mmap_sem);
}
@@ -812,9 +743,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
}
old = current->mempolicy;
current->mempolicy = new;
- if (new && new->mode == MPOL_INTERLEAVE &&
- nodes_weight(new->v.nodes))
- current->il_next = first_node(new->v.nodes);
+ if (new && new->mode == MPOL_INTERLEAVE)
+ current->il_prev = MAX_NUMNODES-1;
task_unlock(current);
mpol_put(old);
ret = 0;
@@ -916,7 +846,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
- *policy = current->il_next;
+ *policy = next_node_in(current->il_prev, pol->v.nodes);
} else {
err = -EINVAL;
goto out;
@@ -1148,7 +1078,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
/*
* if !vma, alloc_page_vma() will use task or system default policy
*/
- return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
+ vma, address);
}
#else
@@ -1676,9 +1607,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
return NULL;
}
-/* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
- int nd)
+/* Return the node id preferred by the given mempolicy, or the given id */
+static int policy_node(gfp_t gfp, struct mempolicy *policy,
+ int nd)
{
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
@@ -1691,20 +1622,19 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
- return node_zonelist(nd, gfp);
+ return nd;
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
- unsigned nid, next;
+ unsigned next;
struct task_struct *me = current;
- nid = me->il_next;
- next = next_node_in(nid, policy->v.nodes);
+ next = next_node_in(me->il_prev, policy->v.nodes);
if (next < MAX_NUMNODES)
- me->il_next = next;
- return nid;
+ me->il_prev = next;
+ return next;
}
/*
@@ -1799,38 +1729,37 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
#ifdef CONFIG_HUGETLBFS
/*
- * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
+ * huge_node(@vma, @addr, @gfp_flags, @mpol)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup and interleave policy
* @gfp_flags: for requested zone
* @mpol: pointer to mempolicy pointer for reference counted mempolicy
* @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
*
- * Returns a zonelist suitable for a huge page allocation and a pointer
+ * Returns a nid suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
*
* Must be protected by read_mems_allowed_begin()
*/
-struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
- gfp_t gfp_flags, struct mempolicy **mpol,
- nodemask_t **nodemask)
+int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
+ struct mempolicy **mpol, nodemask_t **nodemask)
{
- struct zonelist *zl;
+ int nid;
*mpol = get_vma_policy(vma, addr);
*nodemask = NULL; /* assume !MPOL_BIND */
if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
- zl = node_zonelist(interleave_nid(*mpol, vma, addr,
- huge_page_shift(hstate_vma(vma))), gfp_flags);
+ nid = interleave_nid(*mpol, vma, addr,
+ huge_page_shift(hstate_vma(vma)));
} else {
- zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
+ nid = policy_node(gfp_flags, *mpol, numa_node_id());
if ((*mpol)->mode == MPOL_BIND)
*nodemask = &(*mpol)->v.nodes;
}
- return zl;
+ return nid;
}
/*
@@ -1932,12 +1861,10 @@ out:
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
unsigned nid)
{
- struct zonelist *zl;
struct page *page;
- zl = node_zonelist(nid, gfp);
- page = __alloc_pages(gfp, order, zl);
- if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
+ page = __alloc_pages(gfp, order, nid);
+ if (page && page_to_nid(page) == nid)
inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
return page;
}
@@ -1971,13 +1898,10 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
{
struct mempolicy *pol;
struct page *page;
- unsigned int cpuset_mems_cookie;
- struct zonelist *zl;
+ int preferred_nid;
nodemask_t *nmask;
-retry_cpuset:
pol = get_vma_policy(vma, addr);
- cpuset_mems_cookie = read_mems_allowed_begin();
if (pol->mode == MPOL_INTERLEAVE) {
unsigned nid;
@@ -2015,12 +1939,10 @@ retry_cpuset:
}
nmask = policy_nodemask(gfp, pol);
- zl = policy_zonelist(gfp, pol, node);
- page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+ preferred_nid = policy_node(gfp, pol, node);
+ page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
- goto retry_cpuset;
return page;
}
@@ -2038,23 +1960,15 @@ out:
* Allocate a page from the kernel page pool. When not in
* interrupt context and apply the current process NUMA policy.
* Returns NULL when no page can be allocated.
- *
- * Don't call cpuset_update_task_memory_state() unless
- * 1) it's ok to take cpuset_sem (can WAIT), and
- * 2) allocating for current task (not interrupt).
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = &default_policy;
struct page *page;
- unsigned int cpuset_mems_cookie;
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
-retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
-
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
@@ -2063,12 +1977,9 @@ retry_cpuset:
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
page = __alloc_pages_nodemask(gfp, order,
- policy_zonelist(gfp, pol, numa_node_id()),
+ policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
- goto retry_cpuset;
-
return page;
}
EXPORT_SYMBOL(alloc_pages_current);
@@ -2112,10 +2023,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
- if (new->flags & MPOL_F_REBINDING)
- mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
- else
- mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
+ mpol_rebind_policy(new, &mems);
}
atomic_set(&new->refcnt, 1);
return new;
diff --git a/mm/mempool.c b/mm/mempool.c
index 47a659dedd44..1c0294858527 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
gfp_t gfp_temp;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
diff --git a/mm/migrate.c b/mm/migrate.c
index 89a0a1707f4c..627671551873 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -227,25 +227,26 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
+ flush_dcache_page(new);
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
pte = arch_make_huge_pte(pte, vma, new, 0);
- }
-#endif
- flush_dcache_page(new);
- set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
-
- if (PageHuge(new)) {
+ set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, pvmw.address);
else
page_dup_rmap(new, true);
- } else if (PageAnon(new))
- page_add_anon_rmap(new, vma, pvmw.address, false);
- else
- page_add_file_rmap(new, false);
+ } else
+#endif
+ {
+ set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
+ if (PageAnon(new))
+ page_add_anon_rmap(new, vma, pvmw.address, false);
+ else
+ page_add_file_rmap(new, false);
+ }
if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
@@ -1251,6 +1252,8 @@ put_anon:
out:
if (rc != -EAGAIN)
putback_active_hugepage(hpage);
+ if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage))
+ num_poisoned_pages_inc();
/*
* If migration was not successful and there's a freeing callback, use
@@ -1913,7 +1916,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int page_lru = page_is_file_cache(page);
unsigned long mmun_start = address & HPAGE_PMD_MASK;
unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
- pmd_t orig_entry;
/*
* Rate-limit the amount of data that is being migrated to a node.
@@ -1956,8 +1958,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* Recheck the target PMD */
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
-fail_putback:
+ if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1979,7 +1980,6 @@ fail_putback:
goto out_unlock;
}
- orig_entry = *pmd;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1996,15 +1996,7 @@ fail_putback:
set_pmd_at(mm, mmun_start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
- if (page_count(page) != 2) {
- set_pmd_at(mm, mmun_start, pmd, orig_entry);
- flush_pmd_tlb_range(vma, mmun_start, mmun_end);
- mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
- update_mmu_cache_pmd(vma, address, &entry);
- page_remove_rmap(new_page, true);
- goto fail_putback;
- }
-
+ page_ref_unfreeze(page, 2);
mlock_migrate_page(new_page, page);
page_remove_rmap(page, true);
set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
diff --git a/mm/mmap.c b/mm/mmap.c
index 8e07976d5e47..f19efcf75418 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -94,7 +94,7 @@ static void unmap_region(struct mm_struct *mm,
* w: (no) no
* x: (yes) yes
*/
-pgprot_t protection_map[16] = {
+pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
@@ -1817,7 +1817,8 @@ check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
return -ENOMEM;
- if (gap_end >= low_limit && gap_end - gap_start >= length)
+ if (gap_end >= low_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit right subtree if it looks promising */
@@ -1920,7 +1921,8 @@ check_current:
gap_end = vm_start_gap(vma);
if (gap_end < low_limit)
return -ENOMEM;
- if (gap_start <= high_limit && gap_end - gap_start >= length)
+ if (gap_start <= high_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit left subtree if it looks promising */
@@ -2175,7 +2177,6 @@ static int acct_stack_growth(struct vm_area_struct *vma,
unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
- struct rlimit *rlim = current->signal->rlim;
unsigned long new_start;
/* address space limit tests */
@@ -2183,7 +2184,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
return -ENOMEM;
/* Stack limit test */
- if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ if (size > rlimit(RLIMIT_STACK))
return -ENOMEM;
/* mlock limit tests */
@@ -2191,7 +2192,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
- limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
+ limit = rlimit(RLIMIT_MEMLOCK);
limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
return -ENOMEM;
@@ -2228,18 +2229,22 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
- /* Guard against wrapping around to address 0. */
+ /* Guard against exceeding limits of the address space. */
address &= PAGE_MASK;
- address += PAGE_SIZE;
- if (!address)
+ if (address >= (TASK_SIZE & PAGE_MASK))
return -ENOMEM;
+ address += PAGE_SIZE;
/* Enforce stack_guard_gap */
gap_addr = address + stack_guard_gap;
- if (gap_addr < address)
- return -ENOMEM;
+
+ /* Guard against overflow */
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
next = vma->vm_next;
- if (next && next->vm_start < gap_addr) {
+ if (next && next->vm_start < gap_addr &&
+ (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
@@ -2310,7 +2315,6 @@ int expand_downwards(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
- unsigned long gap_addr;
int error;
address &= PAGE_MASK;
@@ -2319,14 +2323,12 @@ int expand_downwards(struct vm_area_struct *vma,
return error;
/* Enforce stack_guard_gap */
- gap_addr = address - stack_guard_gap;
- if (gap_addr > address)
- return -ENOMEM;
prev = vma->vm_prev;
- if (prev && prev->vm_end > gap_addr) {
- if (!(prev->vm_flags & VM_GROWSDOWN))
+ /* Check that both stack segments have the same anon_vma? */
+ if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
+ (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
+ if (address - prev->vm_end < stack_guard_gap)
return -ENOMEM;
- /* Check that both stack segments have the same anon_vma? */
}
/* We must make sure the anon_vma is allocated. */
@@ -3181,8 +3183,12 @@ static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
struct vm_special_mapping *sm = new_vma->vm_private_data;
+ if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
+ return -EFAULT;
+
if (sm->mremap)
return sm->mremap(sm, new_vma);
+
return 0;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8edd0d576254..1a8c9ca83e48 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -58,8 +58,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* reading.
*/
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (!pte)
- return 0;
/* Get target node for single threaded private VMAs */
if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 487dad610731..36454d0f96ee 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -118,7 +118,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
unsigned long end_pfn = min_t(unsigned long,
PFN_DOWN(end), max_low_pfn);
- if (start_pfn > end_pfn)
+ if (start_pfn >= end_pfn)
return 0;
__free_pages_memory(start_pfn, end_pfn);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 04c9143a8625..9e8b4f030c1c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -490,6 +490,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
if (!down_read_trylock(&mm->mmap_sem)) {
ret = false;
+ trace_skip_task_reaping(tsk->pid);
goto unlock_oom;
}
@@ -500,9 +501,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
*/
if (!mmget_not_zero(mm)) {
up_read(&mm->mmap_sem);
+ trace_skip_task_reaping(tsk->pid);
goto unlock_oom;
}
+ trace_start_task_reaping(tsk->pid);
+
/*
* Tell all users of get_user/copy_from_user etc... that the content
* is no longer stable. No barriers really needed because unmapping
@@ -544,6 +548,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* put the oom_reaper out of the way.
*/
mmput_async(mm);
+ trace_finish_task_reaping(tsk->pid);
unlock_oom:
mutex_unlock(&oom_lock);
return ret;
@@ -615,6 +620,7 @@ static void wake_oom_reaper(struct task_struct *tsk)
tsk->oom_reaper_list = oom_reaper_list;
oom_reaper_list = tsk;
spin_unlock(&oom_reaper_lock);
+ trace_wake_reaper(tsk->pid);
wake_up(&oom_reaper_wait);
}
@@ -666,6 +672,7 @@ static void mark_oom_victim(struct task_struct *tsk)
*/
__thaw_task(tsk);
atomic_inc(&oom_victims);
+ trace_mark_victim(tsk->pid);
}
/**
@@ -876,6 +883,11 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
/* Get a reference to safely compare mm after task_unlock(victim) */
mm = victim->mm;
mmgrab(mm);
+
+ /* Raise event before sending signal: task reaper must see this */
+ count_vm_event(OOM_KILL);
+ count_memcg_event_mm(mm, OOM_KILL);
+
/*
* We should send SIGKILL before setting TIF_MEMDIE in order to prevent
* the OOM victim from depleting the memory reserves from the user
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 143c1c25d680..96e93b214d31 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -601,7 +601,7 @@ static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
struct wb_domain *cgdom;
- __inc_wb_stat(wb, WB_WRITTEN);
+ inc_wb_stat(wb, WB_WRITTEN);
wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
wb->bdi->max_prop_frac);
@@ -2366,15 +2366,15 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
}
/**
- * write_one_page - write out a single page and optionally wait on I/O
+ * write_one_page - write out a single page and wait on I/O
* @page: the page to write
- * @wait: if true, wait on writeout
*
* The page must be locked by the caller and will be unlocked upon return.
*
- * write_one_page() returns a negative error code if I/O failed.
+ * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
+ * function returns.
*/
-int write_one_page(struct page *page, int wait)
+int write_one_page(struct page *page)
{
struct address_space *mapping = page->mapping;
int ret = 0;
@@ -2385,21 +2385,20 @@ int write_one_page(struct page *page, int wait)
BUG_ON(!PageLocked(page));
- if (wait)
- wait_on_page_writeback(page);
+ wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
get_page(page);
ret = mapping->a_ops->writepage(page, &wbc);
- if (ret == 0 && wait) {
+ if (ret == 0)
wait_on_page_writeback(page);
- if (PageError(page))
- ret = -EIO;
- }
put_page(page);
} else {
unlock_page(page);
}
+
+ if (!ret)
+ ret = filemap_check_errors(mapping);
return ret;
}
EXPORT_SYMBOL(write_one_page);
@@ -2433,12 +2432,11 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
- inc_memcg_page_state(page, NR_FILE_DIRTY);
- __inc_node_page_state(page, NR_FILE_DIRTY);
+ __inc_lruvec_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
__inc_node_page_state(page, NR_DIRTIED);
- __inc_wb_stat(wb, WB_RECLAIMABLE);
- __inc_wb_stat(wb, WB_DIRTIED);
+ inc_wb_stat(wb, WB_RECLAIMABLE);
+ inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
@@ -2455,8 +2453,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
struct bdi_writeback *wb)
{
if (mapping_cap_account_dirty(mapping)) {
- dec_memcg_page_state(page, NR_FILE_DIRTY);
- dec_node_page_state(page, NR_FILE_DIRTY);
+ dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_SIZE);
@@ -2712,8 +2709,7 @@ int clear_page_dirty_for_io(struct page *page)
*/
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
- dec_memcg_page_state(page, NR_FILE_DIRTY);
- dec_node_page_state(page, NR_FILE_DIRTY);
+ dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
@@ -2745,7 +2741,7 @@ int test_clear_page_writeback(struct page *page)
if (bdi_cap_account_writeback(bdi)) {
struct bdi_writeback *wb = inode_to_wb(inode);
- __dec_wb_stat(wb, WB_WRITEBACK);
+ dec_wb_stat(wb, WB_WRITEBACK);
__wb_writeout_inc(wb);
}
}
@@ -2759,8 +2755,7 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- dec_memcg_page_state(page, NR_WRITEBACK);
- dec_node_page_state(page, NR_WRITEBACK);
+ dec_lruvec_page_state(page, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
}
@@ -2791,7 +2786,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
- __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
+ inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
/*
* We can come through here when swapping anonymous
@@ -2814,8 +2809,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
ret = TestSetPageWriteback(page);
}
if (!ret) {
- inc_memcg_page_state(page, NR_WRITEBACK);
- inc_node_page_state(page, NR_WRITEBACK);
+ inc_lruvec_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
}
unlock_page_memcg(page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2302f250d6b1..6d30e914afb6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -113,9 +113,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
-#ifdef CONFIG_MOVABLE_NODE
[N_MEMORY] = { { [0] = 1UL } },
-#endif
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
@@ -511,7 +509,7 @@ static int page_is_consistent(struct zone *zone, struct page *page)
/*
* Temporary debugging check for pages not lying within a given zone.
*/
-static int bad_range(struct zone *zone, struct page *page)
+static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return 1;
@@ -521,7 +519,7 @@ static int bad_range(struct zone *zone, struct page *page)
return 0;
}
#else
-static inline int bad_range(struct zone *zone, struct page *page)
+static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
return 0;
}
@@ -1297,8 +1295,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+static inline bool __meminit __maybe_unused
+meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
{
int nid;
@@ -1320,8 +1319,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+static inline bool __meminit __maybe_unused
+meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
{
return true;
}
@@ -1365,7 +1365,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
return NULL;
- start_page = pfn_to_page(start_pfn);
+ start_page = pfn_to_online_page(start_pfn);
+ if (!start_page)
+ return NULL;
if (page_zone(start_page) != zone)
return NULL;
@@ -2204,19 +2206,26 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
+ *
+ * The use of signed ints for order and current_order is a deliberate
+ * deviation from the rest of this file, to make the for loop
+ * condition simpler.
*/
static inline bool
-__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area *area;
- unsigned int current_order;
+ int current_order;
struct page *page;
int fallback_mt;
bool can_steal;
- /* Find the largest possible block of pages in the other list */
- for (current_order = MAX_ORDER-1;
- current_order >= order && current_order <= MAX_ORDER-1;
+ /*
+ * Find the largest available free page in the other list. This roughly
+ * approximates finding the pageblock with the most free pages, which
+ * would be too costly to do exactly.
+ */
+ for (current_order = MAX_ORDER - 1; current_order >= order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2224,19 +2233,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
if (fallback_mt == -1)
continue;
- page = list_first_entry(&area->free_list[fallback_mt],
- struct page, lru);
+ /*
+ * We cannot steal all free pages from the pageblock and the
+ * requested migratetype is movable. In that case it's better to
+ * steal and split the smallest available page instead of the
+ * largest available page, because even if the next movable
+ * allocation falls back into a different pageblock than this
+ * one, it won't cause permanent fragmentation.
+ */
+ if (!can_steal && start_migratetype == MIGRATE_MOVABLE
+ && current_order > order)
+ goto find_smallest;
- steal_suitable_fallback(zone, page, start_migratetype,
- can_steal);
+ goto do_steal;
+ }
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
+ return false;
- return true;
+find_smallest:
+ for (current_order = order; current_order < MAX_ORDER;
+ current_order++) {
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt != -1)
+ break;
}
- return false;
+ /*
+ * This should not happen - we already found a suitable fallback
+ * when looking for the largest page.
+ */
+ VM_BUG_ON(current_order == MAX_ORDER);
+
+do_steal:
+ page = list_first_entry(&area->free_list[fallback_mt],
+ struct page, lru);
+
+ steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+
+ return true;
+
}
/*
@@ -3244,6 +3284,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
+ /*
+ * We have already exhausted all our reclaim opportunities without any
+ * success so it is time to admit defeat. We will skip the OOM killer
+ * because it is very likely that the caller has a more reasonable
+ * fallback than shooting a random task.
+ */
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
+ goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
@@ -3373,7 +3421,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
}
/*
- * !costly requests are much more important than __GFP_REPEAT
+ * !costly requests are much more important than __GFP_RETRY_MAYFAIL
* costly ones because they are de facto nofail and invoke OOM
* killer to move on while costly can fail and users are ready
* to cope with that. 1/4 retries is rather arbitrary but we
@@ -3673,6 +3721,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
return false;
}
+static inline bool
+check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
+{
+ /*
+ * It's possible that cpuset's mems_allowed and the nodemask from
+ * mempolicy don't intersect. This should be normally dealt with by
+ * policy_nodemask(), but it's possible to race with cpuset update in
+ * such a way the check therein was true, and then it became false
+ * before we got our cpuset_mems_cookie here.
+ * This assumes that for all allocations, ac->nodemask can come only
+ * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
+ * when it does not intersect with the cpuset restrictions) or the
+ * caller can deal with a violated nodemask.
+ */
+ if (cpusets_enabled() && ac->nodemask &&
+ !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
+ ac->nodemask = NULL;
+ return true;
+ }
+
+ /*
+ * When updating a task's mems_allowed or mempolicy nodemask, it is
+ * possible to race with parallel threads in such a way that our
+ * allocation can fail while the mask is being updated. If we are about
+ * to fail, check if the cpuset changed during allocation and if so,
+ * retry.
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ return true;
+
+ return false;
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
@@ -3847,9 +3928,9 @@ retry:
/*
* Do not retry costly high order allocations unless they are
- * __GFP_REPEAT
+ * __GFP_RETRY_MAYFAIL
*/
- if (costly_order && !(gfp_mask & __GFP_REPEAT))
+ if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -3868,11 +3949,9 @@ retry:
&compaction_retries))
goto retry;
- /*
- * It's possible we raced with cpuset update so the OOM would be
- * premature (see below the nopage: label for full explanation).
- */
- if (read_mems_allowed_retry(cpuset_mems_cookie))
+
+ /* Deal with possible cpuset update races before we start OOM killing */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/* Reclaim has failed us, start killing things */
@@ -3893,14 +3972,8 @@ retry:
}
nopage:
- /*
- * When updating a task's mems_allowed or mempolicy nodemask, it is
- * possible to race with parallel threads in such a way that our
- * allocation can fail while the mask is being updated. If we are about
- * to fail, check if the cpuset changed during allocation and if so,
- * retry.
- */
- if (read_mems_allowed_retry(cpuset_mems_cookie))
+ /* Deal with possible cpuset update races before we fail */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/*
@@ -3951,12 +4024,12 @@ got_pg:
}
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask,
+ int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
ac->high_zoneidx = gfp_zone(gfp_mask);
- ac->zonelist = zonelist;
+ ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
ac->migratetype = gfpflags_to_migratetype(gfp_mask);
@@ -4001,8 +4074,8 @@ static inline void finalise_ac(gfp_t gfp_mask,
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
+ nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -4010,7 +4083,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct alloc_context ac = { };
gfp_mask &= gfp_allowed_mask;
- if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+ if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
finalise_ac(gfp_mask, order, &ac);
@@ -4614,8 +4687,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " slab_reclaimable:%lukB"
- " slab_unreclaimable:%lukB"
" kernel_stack:%lukB"
" pagetables:%lukB"
" bounce:%lukB"
@@ -4637,8 +4708,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone->present_pages),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
- K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
@@ -5124,6 +5193,7 @@ static void build_zonelists(pg_data_t *pgdat)
*/
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void setup_zone_pageset(struct zone *zone);
/*
@@ -5216,7 +5286,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
#endif
/* we have to stop all cpus to guarantee there is no user
of zonelist */
- stop_machine(__build_all_zonelists, pgdat, NULL);
+ stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -5528,7 +5598,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
zone_batchsize(zone));
}
-int __meminit init_currently_empty_zone(struct zone *zone,
+void __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
unsigned long size)
{
@@ -5546,8 +5616,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,
zone_init_free_lists(zone);
zone->initialized = 1;
-
- return 0;
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6005,7 +6073,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- int ret;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
@@ -6027,6 +6094,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
spin_lock_init(&pgdat->lru_lock);
lruvec_init(node_lruvec(pgdat));
+ pgdat->per_cpu_nodestats = &boot_nodestats;
+
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
@@ -6087,8 +6156,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
- ret = init_currently_empty_zone(zone, zone_start_pfn, size);
- BUG_ON(ret);
+ init_currently_empty_zone(zone, zone_start_pfn, size);
memmap_init(size, nid, j, zone_start_pfn);
}
}
@@ -7182,6 +7250,21 @@ static unsigned long __init arch_reserved_kernel_pages(void)
#endif
/*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have large physical memory, where this scaling
+ * makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE (64ul << 30)
+#define ADAPT_SCALE_SHIFT 2
+#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
+
+/*
* allocate a large system hash table from bootmem
* - it is assumed that the hash table must contain an exact power-of-2
* quantity of entries
@@ -7200,6 +7283,7 @@ void *__init alloc_large_system_hash(const char *tablename,
unsigned long long max = high_limit;
unsigned long log2qty, size;
void *table = NULL;
+ gfp_t gfp_flags;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -7211,6 +7295,16 @@ void *__init alloc_large_system_hash(const char *tablename,
if (PAGE_SHIFT < 20)
numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
+#if __BITS_PER_LONG > 32
+ if (!high_limit) {
+ unsigned long adapt;
+
+ for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+ adapt <<= ADAPT_SCALE_SHIFT)
+ scale++;
+ }
+#endif
+
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
numentries >>= (scale - PAGE_SHIFT);
@@ -7244,12 +7338,17 @@ void *__init alloc_large_system_hash(const char *tablename,
log2qty = ilog2(numentries);
+ /*
+ * memblock allocator returns zeroed memory already, so HASH_ZERO is
+ * currently not used when HASH_EARLY is specified.
+ */
+ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
- table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+ table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -7257,8 +7356,8 @@ void *__init alloc_large_system_hash(const char *tablename,
* alloc_pages_exact() automatically does
*/
if (get_order(size) < MAX_ORDER) {
- table = alloc_pages_exact(size, GFP_ATOMIC);
- kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+ table = alloc_pages_exact(size, gfp_flags);
+ kmemleak_alloc(table, size, 1, gfp_flags);
}
}
} while (!table && size > PAGE_SIZE && --log2qty);
@@ -7660,6 +7759,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
break;
if (pfn == end_pfn)
return;
+ offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
pfn = start_pfn;
diff --git a/mm/page_io.c b/mm/page_io.c
index 23f6d0d3470f..b6c4ac388209 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
- if (bio->bi_error) {
+ if (bio->bi_status) {
SetPageError(page);
/*
* We failed to write the page out to swap-space.
@@ -117,8 +117,9 @@ static void swap_slot_free_notify(struct page *page)
static void end_swap_bio_read(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
+ struct task_struct *waiter = bio->bi_private;
- if (bio->bi_error) {
+ if (bio->bi_status) {
SetPageError(page);
ClearPageUptodate(page);
pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
@@ -132,7 +133,9 @@ static void end_swap_bio_read(struct bio *bio)
swap_slot_free_notify(page);
out:
unlock_page(page);
+ WRITE_ONCE(bio->bi_private, NULL);
bio_put(bio);
+ wake_up_process(waiter);
}
int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -329,11 +332,13 @@ out:
return ret;
}
-int swap_readpage(struct page *page)
+int swap_readpage(struct page *page, bool do_poll)
{
struct bio *bio;
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
+ blk_qc_t qc;
+ struct block_device *bdev;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -372,9 +377,23 @@ int swap_readpage(struct page *page)
ret = -ENOMEM;
goto out;
}
+ bdev = bio->bi_bdev;
+ bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
count_vm_event(PSWPIN);
- submit_bio(bio);
+ bio_get(bio);
+ qc = submit_bio(bio);
+ while (do_poll) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(bio->bi_private))
+ break;
+
+ if (!blk_mq_poll(bdev_get_queue(bdev), qc))
+ break;
+ }
+ __set_current_state(TASK_RUNNING);
+ bio_put(bio);
+
out:
return ret;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5092e4ef00c8..757410d9f758 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,6 +8,7 @@
#include <linux/memory.h>
#include <linux/hugetlb.h>
#include <linux/page_owner.h>
+#include <linux/migrate.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -138,12 +139,18 @@ static inline struct page *
__first_valid_page(unsigned long pfn, unsigned long nr_pages)
{
int i;
- for (i = 0; i < nr_pages; i++)
- if (pfn_valid_within(pfn + i))
- break;
- if (unlikely(i == nr_pages))
- return NULL;
- return pfn_to_page(pfn + i);
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page;
+
+ if (!pfn_valid_within(pfn + i))
+ continue;
+ page = pfn_to_online_page(pfn + i);
+ if (!page)
+ continue;
+ return page;
+ }
+ return NULL;
}
/*
@@ -184,8 +191,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
- pfn += pageblock_nr_pages)
- unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
+ pfn += pageblock_nr_pages) {
+ struct page *page = pfn_to_online_page(pfn);
+ if (!page)
+ continue;
+ unset_migratetype_isolate(page, migratetype);
+ }
return -EBUSY;
}
@@ -284,20 +295,5 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
struct page *alloc_migrate_target(struct page *page, unsigned long private,
int **resultp)
{
- gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
-
- /*
- * TODO: allocate a destination hugepage from a nearest neighbor node,
- * accordance with memory policy of the user process if possible. For
- * now as a simple work-around, we use the next node for destination.
- */
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- next_node_in(page_to_nid(page),
- node_online_map));
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
- return alloc_page(gfp_mask);
+ return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]);
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 60634dc53a88..0fd9dcf2c5dc 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -281,7 +281,11 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
continue;
if (PageBuddy(page)) {
- pfn += (1UL << page_order(page)) - 1;
+ unsigned long freepage_order;
+
+ freepage_order = page_order_unsafe(page);
+ if (freepage_order < MAX_ORDER)
+ pfn += (1UL << freepage_order) - 1;
continue;
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index de9c40d7304a..8ec6ba230bb9 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -116,7 +116,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address);
+ pvmw->pte = huge_pte_offset(mm, pvmw->address,
+ PAGE_SIZE << compound_order(page));
if (!pvmw->pte)
return false;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 60f7856e508f..1a4197965415 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -180,12 +180,13 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct hstate *h = hstate_vma(vma);
unsigned long next;
unsigned long hmask = huge_page_mask(h);
+ unsigned long sz = huge_page_size(h);
pte_t *pte;
int err = 0;
do {
next = hugetlb_entry_end(h, addr, end);
- pte = huge_pte_offset(walk->mm, addr & hmask);
+ pte = huge_pte_offset(walk->mm, addr & hmask, sz);
if (pte && walk->hugetlb_entry)
err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
if (err)
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
new file mode 100644
index 000000000000..cd2442e13d8f
--- /dev/null
+++ b/mm/percpu-internal.h
@@ -0,0 +1,166 @@
+#ifndef _MM_PERCPU_INTERNAL_H
+#define _MM_PERCPU_INTERNAL_H
+
+#include <linux/types.h>
+#include <linux/percpu.h>
+
+struct pcpu_chunk {
+#ifdef CONFIG_PERCPU_STATS
+ int nr_alloc; /* # of allocations */
+ size_t max_alloc_size; /* largest allocation size */
+#endif
+
+ struct list_head list; /* linked to pcpu_slot lists */
+ int free_size; /* free bytes in the chunk */
+ int contig_hint; /* max contiguous size hint */
+ void *base_addr; /* base address of this chunk */
+
+ int map_used; /* # of map entries used before the sentry */
+ int map_alloc; /* # of map entries allocated */
+ int *map; /* allocation map */
+ struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
+
+ void *data; /* chunk data */
+ int first_free; /* no free below this */
+ bool immutable; /* no [de]population allowed */
+ bool has_reserved; /* Indicates if chunk has reserved space
+ at the beginning. Reserved chunk will
+ contain reservation for static chunk.
+ Dynamic chunk will contain reservation
+ for static and reserved chunks. */
+ int nr_populated; /* # of populated pages */
+ unsigned long populated[]; /* populated bitmap */
+};
+
+extern spinlock_t pcpu_lock;
+
+extern struct list_head *pcpu_slot;
+extern int pcpu_nr_slots;
+
+extern struct pcpu_chunk *pcpu_first_chunk;
+extern struct pcpu_chunk *pcpu_reserved_chunk;
+
+#ifdef CONFIG_PERCPU_STATS
+
+#include <linux/spinlock.h>
+
+struct percpu_stats {
+ u64 nr_alloc; /* lifetime # of allocations */
+ u64 nr_dealloc; /* lifetime # of deallocations */
+ u64 nr_cur_alloc; /* current # of allocations */
+ u64 nr_max_alloc; /* max # of live allocations */
+ u32 nr_chunks; /* current # of live chunks */
+ u32 nr_max_chunks; /* max # of live chunks */
+ size_t min_alloc_size; /* min allocaiton size */
+ size_t max_alloc_size; /* max allocation size */
+};
+
+extern struct percpu_stats pcpu_stats;
+extern struct pcpu_alloc_info pcpu_stats_ai;
+
+/*
+ * For debug purposes. We don't care about the flexible array.
+ */
+static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
+{
+ memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));
+
+ /* initialize min_alloc_size to unit_size */
+ pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
+}
+
+/*
+ * pcpu_stats_area_alloc - increment area allocation stats
+ * @chunk: the location of the area being allocated
+ * @size: size of area to allocate in bytes
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+{
+ lockdep_assert_held(&pcpu_lock);
+
+ pcpu_stats.nr_alloc++;
+ pcpu_stats.nr_cur_alloc++;
+ pcpu_stats.nr_max_alloc =
+ max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc);
+ pcpu_stats.min_alloc_size =
+ min(pcpu_stats.min_alloc_size, size);
+ pcpu_stats.max_alloc_size =
+ max(pcpu_stats.max_alloc_size, size);
+
+ chunk->nr_alloc++;
+ chunk->max_alloc_size = max(chunk->max_alloc_size, size);
+}
+
+/*
+ * pcpu_stats_area_dealloc - decrement allocation stats
+ * @chunk: the location of the area being deallocated
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+{
+ lockdep_assert_held(&pcpu_lock);
+
+ pcpu_stats.nr_dealloc++;
+ pcpu_stats.nr_cur_alloc--;
+
+ chunk->nr_alloc--;
+}
+
+/*
+ * pcpu_stats_chunk_alloc - increment chunk stats
+ */
+static inline void pcpu_stats_chunk_alloc(void)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&pcpu_lock, flags);
+
+ pcpu_stats.nr_chunks++;
+ pcpu_stats.nr_max_chunks =
+ max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks);
+
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+}
+
+/*
+ * pcpu_stats_chunk_dealloc - decrement chunk stats
+ */
+static inline void pcpu_stats_chunk_dealloc(void)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&pcpu_lock, flags);
+
+ pcpu_stats.nr_chunks--;
+
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+}
+
+#else
+
+static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
+{
+}
+
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+{
+}
+
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+{
+}
+
+static inline void pcpu_stats_chunk_alloc(void)
+{
+}
+
+static inline void pcpu_stats_chunk_dealloc(void)
+{
+}
+
+#endif /* !CONFIG_PERCPU_STATS */
+
+#endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index d66911ff42d9..eb58aa4c0997 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -72,6 +72,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
pcpu_chunk_populated(chunk, 0, nr_pages);
spin_unlock_irq(&pcpu_lock);
+ pcpu_stats_chunk_alloc();
+ trace_percpu_create_chunk(chunk->base_addr);
+
return chunk;
}
@@ -79,7 +82,13 @@ static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
- if (chunk && chunk->data)
+ if (!chunk)
+ return;
+
+ pcpu_stats_chunk_dealloc();
+ trace_percpu_destroy_chunk(chunk->base_addr);
+
+ if (chunk->data)
__free_pages(chunk->data, order_base_2(nr_pages));
pcpu_free_chunk(chunk);
}
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
new file mode 100644
index 000000000000..03524a56eeff
--- /dev/null
+++ b/mm/percpu-stats.c
@@ -0,0 +1,222 @@
+/*
+ * mm/percpu-debug.c
+ *
+ * Copyright (C) 2017 Facebook Inc.
+ * Copyright (C) 2017 Dennis Zhou <dennisz@fb.com>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Prints statistics about the percpu allocator and backing chunks.
+ */
+#include <linux/debugfs.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/sort.h>
+#include <linux/vmalloc.h>
+
+#include "percpu-internal.h"
+
+#define P(X, Y) \
+ seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y)
+
+struct percpu_stats pcpu_stats;
+struct pcpu_alloc_info pcpu_stats_ai;
+
+static int cmpint(const void *a, const void *b)
+{
+ return *(int *)a - *(int *)b;
+}
+
+/*
+ * Iterates over all chunks to find the max # of map entries used.
+ */
+static int find_max_map_used(void)
+{
+ struct pcpu_chunk *chunk;
+ int slot, max_map_used;
+
+ max_map_used = 0;
+ for (slot = 0; slot < pcpu_nr_slots; slot++)
+ list_for_each_entry(chunk, &pcpu_slot[slot], list)
+ max_map_used = max(max_map_used, chunk->map_used);
+
+ return max_map_used;
+}
+
+/*
+ * Prints out chunk state. Fragmentation is considered between
+ * the beginning of the chunk to the last allocation.
+ */
+static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
+ void *buffer)
+{
+ int i, s_index, last_alloc, alloc_sign, as_len;
+ int *alloc_sizes, *p;
+ /* statistics */
+ int sum_frag = 0, max_frag = 0;
+ int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
+
+ alloc_sizes = buffer;
+ s_index = chunk->has_reserved ? 1 : 0;
+
+ /* find last allocation */
+ last_alloc = -1;
+ for (i = chunk->map_used - 1; i >= s_index; i--) {
+ if (chunk->map[i] & 1) {
+ last_alloc = i;
+ break;
+ }
+ }
+
+ /* if the chunk is not empty - ignoring reserve */
+ if (last_alloc >= s_index) {
+ as_len = last_alloc + 1 - s_index;
+
+ /*
+ * Iterate through chunk map computing size info.
+ * The first bit is overloaded to be a used flag.
+ * negative = free space, positive = allocated
+ */
+ for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) {
+ alloc_sign = (*p & 1) ? 1 : -1;
+ alloc_sizes[i] = alloc_sign *
+ ((p[1] & ~1) - (p[0] & ~1));
+ }
+
+ sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL);
+
+ /* Iterate through the unallocated fragements. */
+ for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
+ sum_frag -= *p;
+ max_frag = max(max_frag, -1 * (*p));
+ }
+
+ cur_min_alloc = alloc_sizes[i];
+ cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2];
+ cur_max_alloc = alloc_sizes[as_len - 1];
+ }
+
+ P("nr_alloc", chunk->nr_alloc);
+ P("max_alloc_size", chunk->max_alloc_size);
+ P("free_size", chunk->free_size);
+ P("contig_hint", chunk->contig_hint);
+ P("sum_frag", sum_frag);
+ P("max_frag", max_frag);
+ P("cur_min_alloc", cur_min_alloc);
+ P("cur_med_alloc", cur_med_alloc);
+ P("cur_max_alloc", cur_max_alloc);
+ seq_putc(m, '\n');
+}
+
+static int percpu_stats_show(struct seq_file *m, void *v)
+{
+ struct pcpu_chunk *chunk;
+ int slot, max_map_used;
+ void *buffer;
+
+alloc_buffer:
+ spin_lock_irq(&pcpu_lock);
+ max_map_used = find_max_map_used();
+ spin_unlock_irq(&pcpu_lock);
+
+ buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0]));
+ if (!buffer)
+ return -ENOMEM;
+
+ spin_lock_irq(&pcpu_lock);
+
+ /* if the buffer allocated earlier is too small */
+ if (max_map_used < find_max_map_used()) {
+ spin_unlock_irq(&pcpu_lock);
+ vfree(buffer);
+ goto alloc_buffer;
+ }
+
+#define PL(X) \
+ seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X)
+
+ seq_printf(m,
+ "Percpu Memory Statistics\n"
+ "Allocation Info:\n"
+ "----------------------------------------\n");
+ PL(unit_size);
+ PL(static_size);
+ PL(reserved_size);
+ PL(dyn_size);
+ PL(atom_size);
+ PL(alloc_size);
+ seq_putc(m, '\n');
+
+#undef PL
+
+#define PU(X) \
+ seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X)
+
+ seq_printf(m,
+ "Global Stats:\n"
+ "----------------------------------------\n");
+ PU(nr_alloc);
+ PU(nr_dealloc);
+ PU(nr_cur_alloc);
+ PU(nr_max_alloc);
+ PU(nr_chunks);
+ PU(nr_max_chunks);
+ PU(min_alloc_size);
+ PU(max_alloc_size);
+ seq_putc(m, '\n');
+
+#undef PU
+
+ seq_printf(m,
+ "Per Chunk Stats:\n"
+ "----------------------------------------\n");
+
+ if (pcpu_reserved_chunk) {
+ seq_puts(m, "Chunk: <- Reserved Chunk\n");
+ chunk_map_stats(m, pcpu_reserved_chunk, buffer);
+ }
+
+ for (slot = 0; slot < pcpu_nr_slots; slot++) {
+ list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+ if (chunk == pcpu_first_chunk) {
+ seq_puts(m, "Chunk: <- First Chunk\n");
+ chunk_map_stats(m, chunk, buffer);
+
+
+ } else {
+ seq_puts(m, "Chunk:\n");
+ chunk_map_stats(m, chunk, buffer);
+ }
+
+ }
+ }
+
+ spin_unlock_irq(&pcpu_lock);
+
+ vfree(buffer);
+
+ return 0;
+}
+
+static int percpu_stats_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, percpu_stats_show, NULL);
+}
+
+static const struct file_operations percpu_stats_fops = {
+ .open = percpu_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init init_percpu_stats_debugfs(void)
+{
+ debugfs_create_file("percpu_stats", 0444, NULL, NULL,
+ &percpu_stats_fops);
+
+ return 0;
+}
+
+late_initcall(init_percpu_stats_debugfs);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9ac639499bd1..15dab691ea70 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -343,12 +343,22 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
chunk->data = vms;
chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
+
+ pcpu_stats_chunk_alloc();
+ trace_percpu_create_chunk(chunk->base_addr);
+
return chunk;
}
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
{
- if (chunk && chunk->data)
+ if (!chunk)
+ return;
+
+ pcpu_stats_chunk_dealloc();
+ trace_percpu_destroy_chunk(chunk->base_addr);
+
+ if (chunk->data)
pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
pcpu_free_chunk(chunk);
}
diff --git a/mm/percpu.c b/mm/percpu.c
index e0aa8ae7bde7..bd4130a69bbc 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,11 @@
#include <asm/tlbflush.h>
#include <asm/io.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/percpu.h>
+
+#include "percpu-internal.h"
+
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
@@ -103,53 +108,35 @@
#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
#endif /* CONFIG_SMP */
-struct pcpu_chunk {
- struct list_head list; /* linked to pcpu_slot lists */
- int free_size; /* free bytes in the chunk */
- int contig_hint; /* max contiguous size hint */
- void *base_addr; /* base address of this chunk */
-
- int map_used; /* # of map entries used before the sentry */
- int map_alloc; /* # of map entries allocated */
- int *map; /* allocation map */
- struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
-
- void *data; /* chunk data */
- int first_free; /* no free below this */
- bool immutable; /* no [de]population allowed */
- int nr_populated; /* # of populated pages */
- unsigned long populated[]; /* populated bitmap */
-};
-
-static int pcpu_unit_pages __read_mostly;
-static int pcpu_unit_size __read_mostly;
-static int pcpu_nr_units __read_mostly;
-static int pcpu_atom_size __read_mostly;
-static int pcpu_nr_slots __read_mostly;
-static size_t pcpu_chunk_struct_size __read_mostly;
+static int pcpu_unit_pages __ro_after_init;
+static int pcpu_unit_size __ro_after_init;
+static int pcpu_nr_units __ro_after_init;
+static int pcpu_atom_size __ro_after_init;
+int pcpu_nr_slots __ro_after_init;
+static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
-static unsigned int pcpu_low_unit_cpu __read_mostly;
-static unsigned int pcpu_high_unit_cpu __read_mostly;
+static unsigned int pcpu_low_unit_cpu __ro_after_init;
+static unsigned int pcpu_high_unit_cpu __ro_after_init;
/* the address of the first chunk which starts with the kernel static area */
-void *pcpu_base_addr __read_mostly;
+void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
-static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
-const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
+static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */
/* group information, used for vm allocation */
-static int pcpu_nr_groups __read_mostly;
-static const unsigned long *pcpu_group_offsets __read_mostly;
-static const size_t *pcpu_group_sizes __read_mostly;
+static int pcpu_nr_groups __ro_after_init;
+static const unsigned long *pcpu_group_offsets __ro_after_init;
+static const size_t *pcpu_group_sizes __ro_after_init;
/*
* The first chunk which always exists. Note that unlike other
* chunks, this one can be allocated and mapped in several different
* ways and thus often doesn't live in the vmalloc area.
*/
-static struct pcpu_chunk *pcpu_first_chunk;
+struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/*
* Optional reserved chunk. This chunk reserves part of the first
@@ -158,13 +145,13 @@ static struct pcpu_chunk *pcpu_first_chunk;
* area doesn't exist, the following variables contain NULL and 0
* respectively.
*/
-static struct pcpu_chunk *pcpu_reserved_chunk;
-static int pcpu_reserved_chunk_limit;
+struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
+static int pcpu_reserved_chunk_limit __ro_after_init;
-static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
+DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
-static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);
@@ -672,6 +659,9 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
int to_free = 0;
int *p;
+ lockdep_assert_held(&pcpu_lock);
+ pcpu_stats_area_dealloc(chunk);
+
freeme |= 1; /* we are searching for <given offset, in use> pair */
i = 0;
@@ -735,6 +725,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map[0] = 0;
chunk->map[1] = pcpu_unit_size | 1;
chunk->map_used = 1;
+ chunk->has_reserved = false;
INIT_LIST_HEAD(&chunk->list);
INIT_LIST_HEAD(&chunk->map_extend_list);
@@ -965,8 +956,10 @@ restart:
* tasks to create chunks simultaneously. Serialize and create iff
* there's still no empty chunk after grabbing the mutex.
*/
- if (is_atomic)
+ if (is_atomic) {
+ err = "atomic alloc failed, no space left";
goto fail;
+ }
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
chunk = pcpu_create_chunk();
@@ -984,6 +977,7 @@ restart:
goto restart;
area_found:
+ pcpu_stats_area_alloc(chunk, size);
spin_unlock_irqrestore(&pcpu_lock, flags);
/* populate if not all pages are already there */
@@ -1026,11 +1020,17 @@ area_found:
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
kmemleak_alloc_percpu(ptr, size, gfp);
+
+ trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
+ chunk->base_addr, off, ptr);
+
return ptr;
fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
+ trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
+
if (!is_atomic && warn_limit) {
pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
size, align, is_atomic, err);
@@ -1280,6 +1280,8 @@ void free_percpu(void __percpu *ptr)
}
}
+ trace_percpu_free_percpu(chunk->base_addr, off, ptr);
+
spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);
@@ -1656,6 +1658,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
+ pcpu_stats_save_ai(ai);
+
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
@@ -1699,6 +1703,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (schunk->free_size)
schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
schunk->map[schunk->map_used] |= 1;
+ schunk->has_reserved = true;
/* init dynamic chunk if necessary */
if (dyn_size) {
@@ -1717,6 +1722,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
dchunk->map[1] = pcpu_reserved_chunk_limit;
dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
dchunk->map_used = 2;
+ dchunk->has_reserved = true;
}
/* link the first chunk in */
@@ -1725,6 +1731,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_count_occupied_pages(pcpu_first_chunk, 1);
pcpu_chunk_relocate(pcpu_first_chunk, -1);
+ pcpu_stats_chunk_alloc();
+ trace_percpu_create_chunk(base_addr);
+
/* we're done */
pcpu_base_addr = base_addr;
return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index d405f0e0ee96..ced14f1af6dc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
void try_to_unmap_flush(void)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- int cpu;
if (!tlb_ubc->flush_required)
return;
- cpu = get_cpu();
-
- if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- local_flush_tlb();
- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
- }
-
- if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
- cpumask_clear(&tlb_ubc->cpumask);
+ arch_tlbbatch_flush(&tlb_ubc->arch);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
- put_cpu();
}
/* Flush iff there are potentially writable TLB entries that can race with IO */
@@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+ arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
/*
@@ -1157,8 +1145,7 @@ void page_add_file_rmap(struct page *page, bool compound)
if (!atomic_inc_and_test(&page->_mapcount))
goto out;
}
- __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
- mod_memcg_page_state(page, NR_FILE_MAPPED, nr);
+ __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
unlock_page_memcg(page);
}
@@ -1193,12 +1180,11 @@ static void page_remove_file_rmap(struct page *page, bool compound)
}
/*
- * We use the irq-unsafe __{inc|mod}_zone_page_state because
+ * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
- mod_memcg_page_state(page, NR_FILE_MAPPED, -nr);
+ __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
@@ -1379,15 +1365,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
update_hiwater_rss(mm);
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+ pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
int nr = 1 << compound_order(page);
hugetlb_count_sub(nr, mm);
+ set_huge_swap_pte_at(mm, address,
+ pvmw.pte, pteval,
+ vma_mmu_pagesize(vma));
} else {
dec_mm_counter(mm, mm_counter(page));
+ set_pte_at(mm, address, pvmw.pte, pteval);
}
- pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
- set_pte_at(mm, address, pvmw.pte, pteval);
} else if (pte_unused(pteval)) {
/*
* The guest indicated that the page content is of no
diff --git a/mm/shmem.c b/mm/shmem.c
index e67d6ba4e98e..b0aa6075d164 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -75,6 +75,7 @@ static struct vfsmount *shm_mnt;
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
+#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
@@ -1290,7 +1291,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
SetPageUptodate(page);
}
- swap = get_swap_page();
+ swap = get_swap_page(page);
if (!swap.val)
goto redirty;
@@ -1326,7 +1327,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
mutex_unlock(&shmem_swaplist_mutex);
free_swap:
- swapcache_free(swap);
+ put_swap_page(page, swap);
redirty:
set_page_dirty(page);
if (wbc->for_reclaim)
@@ -1645,8 +1646,7 @@ repeat:
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(charge_mm,
- PGMAJFAULT);
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
}
/* Here we actually start the io */
page = shmem_swapin(swap, gfp, info, index);
@@ -1902,10 +1902,10 @@ unlock:
* entry unconditionally - even if something else had already woken the
* target.
*/
-static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
- list_del_init(&wait->task_list);
+ list_del_init(&wait->entry);
return ret;
}
@@ -1977,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf)
}
sgp = SGP_CACHE;
- if (vma->vm_flags & VM_HUGEPAGE)
- sgp = SGP_HUGE;
- else if (vma->vm_flags & VM_NOHUGEPAGE)
+
+ if ((vma->vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
sgp = SGP_NOHUGE;
+ else if (vma->vm_flags & VM_HUGEPAGE)
+ sgp = SGP_HUGE;
error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
gfp, vma, vmf, &ret);
@@ -2840,7 +2842,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
spin_lock(&inode->i_lock);
inode->i_private = NULL;
wake_up_all(&shmem_falloc_waitq);
- WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
+ WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
spin_unlock(&inode->i_lock);
error = 0;
goto out;
@@ -3761,6 +3763,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_TMPFS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
+ uuid_gen(&sb->s_uuid);
inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
if (!inode)
diff --git a/mm/slab.c b/mm/slab.c
index 2a31ee3c5814..04dec48c3ed7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1425,11 +1425,9 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
- add_zone_page_state(page_zone(page),
- NR_SLAB_RECLAIMABLE, nr_pages);
+ mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages);
else
- add_zone_page_state(page_zone(page),
- NR_SLAB_UNRECLAIMABLE, nr_pages);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages);
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
@@ -1459,11 +1457,9 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
kmemcheck_free_shadow(page, order);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
- sub_zone_page_state(page_zone(page),
- NR_SLAB_RECLAIMABLE, nr_freed);
+ mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);
else
- sub_zone_page_state(page_zone(page),
- NR_SLAB_UNRECLAIMABLE, nr_freed);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed);
BUG_ON(!PageSlab(page));
__ClearPageSlabPfmemalloc(page);
@@ -2040,17 +2036,13 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*/
- if (size & (BYTES_PER_WORD - 1)) {
- size += (BYTES_PER_WORD - 1);
- size &= ~(BYTES_PER_WORD - 1);
- }
+ size = ALIGN(size, BYTES_PER_WORD);
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
* aligned, by adjusting the object size accordingly. */
- size += REDZONE_ALIGN - 1;
- size &= ~(REDZONE_ALIGN - 1);
+ size = ALIGN(size, REDZONE_ALIGN);
}
/* 3) caller mandated alignment */
diff --git a/mm/slab.h b/mm/slab.h
index 9cfcf099709c..6885e1192ec5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -274,22 +274,11 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
- int ret;
-
if (!memcg_kmem_enabled())
return 0;
if (is_root_cache(s))
return 0;
-
- ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
- if (ret)
- return ret;
-
- memcg_kmem_update_page_stat(page,
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
- 1 << order);
- return 0;
+ return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
}
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
@@ -297,11 +286,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
{
if (!memcg_kmem_enabled())
return;
-
- memcg_kmem_update_page_stat(page,
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
- -(1 << order));
memcg_kmem_uncharge(page, order);
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 01a0fe2eb332..904a83be82de 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -47,13 +47,12 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
/*
* Merge control. If this is set then no merging of slab caches will occur.
- * (Could be removed. This was introduced to pacify the merge skeptics.)
*/
-static int slab_nomerge;
+static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
static int __init setup_slab_nomerge(char *str)
{
- slab_nomerge = 1;
+ slab_nomerge = true;
return 1;
}
diff --git a/mm/slub.c b/mm/slub.c
index 7449593fca72..1d3f9835f4ea 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1615,7 +1615,7 @@ out:
if (!page)
return NULL;
- mod_zone_page_state(page_zone(page),
+ mod_lruvec_page_state(page,
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1 << oo_order(oo));
@@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
kmemcheck_free_shadow(page, compound_order(page));
- mod_zone_page_state(page_zone(page),
+ mod_lruvec_page_state(page,
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-pages);
@@ -1829,7 +1829,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
stat(s, CPU_PARTIAL_NODE);
}
if (!kmem_cache_has_cpu_partial(s)
- || available > s->cpu_partial / 2)
+ || available > slub_cpu_partial(s) / 2)
break;
}
@@ -1993,7 +1993,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s)
* Remove the cpu slab
*/
static void deactivate_slab(struct kmem_cache *s, struct page *page,
- void *freelist)
+ void *freelist, struct kmem_cache_cpu *c)
{
enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -2132,6 +2132,9 @@ redo:
discard_slab(s, page);
stat(s, FREE_SLAB);
}
+
+ c->page = NULL;
+ c->freelist = NULL;
}
/*
@@ -2266,11 +2269,9 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
stat(s, CPUSLAB_FLUSH);
- deactivate_slab(s, c->page, c->freelist);
+ deactivate_slab(s, c->page, c->freelist, c);
c->tid = next_tid(c->tid);
- c->page = NULL;
- c->freelist = NULL;
}
/*
@@ -2302,7 +2303,7 @@ static bool has_cpu_slab(int cpu, void *info)
struct kmem_cache *s = info;
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- return c->page || c->partial;
+ return c->page || slub_percpu_partial(c);
}
static void flush_all(struct kmem_cache *s)
@@ -2521,9 +2522,7 @@ redo:
if (unlikely(!node_match(page, searchnode))) {
stat(s, ALLOC_NODE_MISMATCH);
- deactivate_slab(s, page, c->freelist);
- c->page = NULL;
- c->freelist = NULL;
+ deactivate_slab(s, page, c->freelist, c);
goto new_slab;
}
}
@@ -2534,9 +2533,7 @@ redo:
* information when the page leaves the per-cpu allocator
*/
if (unlikely(!pfmemalloc_match(page, gfpflags))) {
- deactivate_slab(s, page, c->freelist);
- c->page = NULL;
- c->freelist = NULL;
+ deactivate_slab(s, page, c->freelist, c);
goto new_slab;
}
@@ -2568,11 +2565,10 @@ load_freelist:
new_slab:
- if (c->partial) {
- page = c->page = c->partial;
- c->partial = page->next;
+ if (slub_percpu_partial(c)) {
+ page = c->page = slub_percpu_partial(c);
+ slub_set_percpu_partial(c, page);
stat(s, CPU_PARTIAL_ALLOC);
- c->freelist = NULL;
goto redo;
}
@@ -2592,9 +2588,7 @@ new_slab:
!alloc_debug_processing(s, page, freelist, addr))
goto new_slab; /* Slab failed checks. Next slab needed */
- deactivate_slab(s, page, get_freepointer(s, freelist));
- c->page = NULL;
- c->freelist = NULL;
+ deactivate_slab(s, page, get_freepointer(s, freelist), c);
return freelist;
}
@@ -3410,6 +3404,39 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
s->min_partial = min;
}
+static void set_cpu_partial(struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ /*
+ * cpu_partial determined the maximum number of objects kept in the
+ * per cpu partial lists of a processor.
+ *
+ * Per cpu partial lists mainly contain slabs that just have one
+ * object freed. If they are used for allocation then they can be
+ * filled up again with minimal effort. The slab will never hit the
+ * per node partial lists and therefore no locking will be required.
+ *
+ * This setting also determines
+ *
+ * A) The number of objects from per cpu partial slabs dumped to the
+ * per node list when we reach the limit.
+ * B) The number of objects in cpu partial slabs to extract from the
+ * per node list when we run out of per cpu objects. We only fetch
+ * 50% to keep some capacity around for frees.
+ */
+ if (!kmem_cache_has_cpu_partial(s))
+ s->cpu_partial = 0;
+ else if (s->size >= PAGE_SIZE)
+ s->cpu_partial = 2;
+ else if (s->size >= 1024)
+ s->cpu_partial = 6;
+ else if (s->size >= 256)
+ s->cpu_partial = 13;
+ else
+ s->cpu_partial = 30;
+#endif
+}
+
/*
* calculate_sizes() determines the order and the distribution of data within
* a slab object.
@@ -3568,33 +3595,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
*/
set_min_partial(s, ilog2(s->size) / 2);
- /*
- * cpu_partial determined the maximum number of objects kept in the
- * per cpu partial lists of a processor.
- *
- * Per cpu partial lists mainly contain slabs that just have one
- * object freed. If they are used for allocation then they can be
- * filled up again with minimal effort. The slab will never hit the
- * per node partial lists and therefore no locking will be required.
- *
- * This setting also determines
- *
- * A) The number of objects from per cpu partial slabs dumped to the
- * per node list when we reach the limit.
- * B) The number of objects in cpu partial slabs to extract from the
- * per node list when we run out of per cpu objects. We only fetch
- * 50% to keep some capacity around for frees.
- */
- if (!kmem_cache_has_cpu_partial(s))
- s->cpu_partial = 0;
- else if (s->size >= PAGE_SIZE)
- s->cpu_partial = 2;
- else if (s->size >= 1024)
- s->cpu_partial = 6;
- else if (s->size >= 256)
- s->cpu_partial = 13;
- else
- s->cpu_partial = 30;
+ set_cpu_partial(s);
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
@@ -3981,7 +3982,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)
* Disable empty slabs caching. Used to avoid pinning offline
* memory cgroups by kmem pages that can be freed.
*/
- s->cpu_partial = 0;
+ slub_set_cpu_partial(s, 0);
s->min_partial = 0;
/*
@@ -4760,7 +4761,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
total += x;
nodes[node] += x;
- page = READ_ONCE(c->partial);
+ page = slub_percpu_partial_read_once(c);
if (page) {
node = page_to_nid(page);
if (flags & SO_TOTAL)
@@ -4921,7 +4922,7 @@ SLAB_ATTR(min_partial);
static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->cpu_partial);
+ return sprintf(buf, "%u\n", slub_cpu_partial(s));
}
static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
@@ -4936,7 +4937,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
if (objects && !kmem_cache_has_cpu_partial(s))
return -EINVAL;
- s->cpu_partial = objects;
+ slub_set_cpu_partial(s, objects);
flush_all(s);
return length;
}
@@ -4988,7 +4989,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
int len;
for_each_online_cpu(cpu) {
- struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
+ struct page *page;
+
+ page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
if (page) {
pages += page->pages;
@@ -5000,7 +5003,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
#ifdef CONFIG_SMP
for_each_online_cpu(cpu) {
- struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial;
+ struct page *page;
+
+ page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
if (page && len < PAGE_SIZE - 20)
len += sprintf(buf + len, " C%d=%d(%d)", cpu,
@@ -5625,6 +5630,28 @@ static char *create_unique_id(struct kmem_cache *s)
return name;
}
+static void sysfs_slab_remove_workfn(struct work_struct *work)
+{
+ struct kmem_cache *s =
+ container_of(work, struct kmem_cache, kobj_remove_work);
+
+ if (!s->kobj.state_in_sysfs)
+ /*
+ * For a memcg cache, this may be called during
+ * deactivation and again on shutdown. Remove only once.
+ * A cache is never shut down before deactivation is
+ * complete, so no need to worry about synchronization.
+ */
+ return;
+
+#ifdef CONFIG_MEMCG
+ kset_unregister(s->memcg_kset);
+#endif
+ kobject_uevent(&s->kobj, KOBJ_REMOVE);
+ kobject_del(&s->kobj);
+ kobject_put(&s->kobj);
+}
+
static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
@@ -5632,6 +5659,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
+ INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
+
if (!kset) {
kobject_init(&s->kobj, &slab_ktype);
return 0;
@@ -5695,20 +5724,8 @@ static void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
- if (!s->kobj.state_in_sysfs)
- /*
- * For a memcg cache, this may be called during
- * deactivation and again on shutdown. Remove only once.
- * A cache is never shut down before deactivation is
- * complete, so no need to worry about synchronization.
- */
- return;
-
-#ifdef CONFIG_MEMCG
- kset_unregister(s->memcg_kset);
-#endif
- kobject_uevent(&s->kobj, KOBJ_REMOVE);
- kobject_del(&s->kobj);
+ kobject_get(&s->kobj);
+ schedule_work(&s->kobj_remove_work);
}
void sysfs_slab_release(struct kmem_cache *s)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a56c3989f773..c50b1a14d55e 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -56,11 +56,11 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
if (node_state(node, N_HIGH_MEMORY))
page = alloc_pages_node(
- node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
+ node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
get_order(size));
else
page = alloc_pages(
- GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
+ GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
get_order(size));
if (page)
return page_address(page);
diff --git a/mm/sparse.c b/mm/sparse.c
index 6903c8fc3085..7b4be3fd5cac 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -168,6 +168,44 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
}
+/*
+ * There are a number of times that we loop over NR_MEM_SECTIONS,
+ * looking for section_present() on each. But, when we have very
+ * large physical address spaces, NR_MEM_SECTIONS can also be
+ * very large which makes the loops quite long.
+ *
+ * Keeping track of this gives us an easy way to break out of
+ * those loops early.
+ */
+int __highest_present_section_nr;
+static void section_mark_present(struct mem_section *ms)
+{
+ int section_nr = __section_nr(ms);
+
+ if (section_nr > __highest_present_section_nr)
+ __highest_present_section_nr = section_nr;
+
+ ms->section_mem_map |= SECTION_MARKED_PRESENT;
+}
+
+static inline int next_present_section_nr(int section_nr)
+{
+ do {
+ section_nr++;
+ if (present_section_nr(section_nr))
+ return section_nr;
+ } while ((section_nr < NR_MEM_SECTIONS) &&
+ (section_nr <= __highest_present_section_nr));
+
+ return -1;
+}
+#define for_each_present_section_nr(start, section_nr) \
+ for (section_nr = next_present_section_nr(start-1); \
+ ((section_nr >= 0) && \
+ (section_nr < NR_MEM_SECTIONS) && \
+ (section_nr <= __highest_present_section_nr)); \
+ section_nr = next_present_section_nr(section_nr))
+
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -183,9 +221,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
set_section_nid(section, nid);
ms = __nr_to_section(section);
- if (!ms->section_mem_map)
+ if (!ms->section_mem_map) {
ms->section_mem_map = sparse_encode_early_nid(nid) |
- SECTION_MARKED_PRESENT;
+ SECTION_IS_ONLINE;
+ section_mark_present(ms);
+ }
}
}
@@ -476,23 +516,19 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
int nodeid_begin = 0;
unsigned long pnum_begin = 0;
- for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ for_each_present_section_nr(0, pnum) {
struct mem_section *ms;
- if (!present_section_nr(pnum))
- continue;
ms = __nr_to_section(pnum);
nodeid_begin = sparse_early_nid(ms);
pnum_begin = pnum;
break;
}
map_count = 1;
- for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ for_each_present_section_nr(pnum_begin + 1, pnum) {
struct mem_section *ms;
int nodeid;
- if (!present_section_nr(pnum))
- continue;
ms = __nr_to_section(pnum);
nodeid = sparse_early_nid(ms);
if (nodeid == nodeid_begin) {
@@ -561,10 +597,7 @@ void __init sparse_init(void)
(void *)map_map);
#endif
- for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
- if (!present_section_nr(pnum))
- continue;
-
+ for_each_present_section_nr(0, pnum) {
usemap = usemap_map[pnum];
if (!usemap)
continue;
@@ -590,6 +623,48 @@ void __init sparse_init(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
+
+/* Mark all memory sections within the pfn range as online */
+void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ struct mem_section *ms;
+
+ /* onlining code should never touch invalid ranges */
+ if (WARN_ON(!valid_section_nr(section_nr)))
+ continue;
+
+ ms = __nr_to_section(section_nr);
+ ms->section_mem_map |= SECTION_IS_ONLINE;
+ }
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/* Mark all memory sections within the pfn range as online */
+void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ struct mem_section *ms;
+
+ /*
+ * TODO this needs some double checking. Offlining code makes
+ * sure to check pfn_valid but those checks might be just bogus
+ */
+ if (WARN_ON(!valid_section_nr(section_nr)))
+ continue;
+
+ ms = __nr_to_section(section_nr);
+ ms->section_mem_map &= ~SECTION_IS_ONLINE;
+ }
+}
+#endif
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
{
@@ -686,10 +761,9 @@ static void free_map_bootmem(struct page *memmap)
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
+int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
- struct pglist_data *pgdat = zone->zone_pgdat;
struct mem_section *ms;
struct page *memmap;
unsigned long *usemap;
@@ -722,7 +796,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
- ms->section_mem_map |= SECTION_MARKED_PRESENT;
+ section_mark_present(ms);
ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
diff --git a/mm/swap.c b/mm/swap.c
index 98d08b4579fa..60b1d2a75852 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -591,6 +591,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
__count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
+ count_memcg_page_event(page, PGLAZYFREE);
update_page_reclaim_stat(lruvec, 1, 0);
}
}
@@ -687,7 +688,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-void lru_add_drain_all(void)
+void lru_add_drain_all_cpuslocked(void)
{
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
@@ -701,7 +702,6 @@ void lru_add_drain_all(void)
return;
mutex_lock(&lock);
- get_online_cpus();
cpumask_clear(&has_work);
for_each_online_cpu(cpu) {
@@ -721,10 +721,16 @@ void lru_add_drain_all(void)
for_each_cpu(cpu, &has_work)
flush_work(&per_cpu(lru_add_drain_work, cpu));
- put_online_cpus();
mutex_unlock(&lock);
}
+void lru_add_drain_all(void)
+{
+ get_online_cpus();
+ lru_add_drain_all_cpuslocked();
+ put_online_cpus();
+}
+
/**
* release_pages - batched put_page()
* @pages: array of pages to release
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 3405b4ee1757..fcd2740f4ed7 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -61,21 +61,27 @@ not_enough_page:
return -ENOMEM;
}
+static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl,
+ pgoff_t offset)
+{
+ struct page *mappage;
+ struct swap_cgroup *sc;
+
+ mappage = ctrl->map[offset / SC_PER_PAGE];
+ sc = page_address(mappage);
+ return sc + offset % SC_PER_PAGE;
+}
+
static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
struct swap_cgroup_ctrl **ctrlp)
{
pgoff_t offset = swp_offset(ent);
struct swap_cgroup_ctrl *ctrl;
- struct page *mappage;
- struct swap_cgroup *sc;
ctrl = &swap_cgroup_ctrl[swp_type(ent)];
if (ctrlp)
*ctrlp = ctrl;
-
- mappage = ctrl->map[offset / SC_PER_PAGE];
- sc = page_address(mappage);
- return sc + offset % SC_PER_PAGE;
+ return __lookup_swap_cgroup(ctrl, offset);
}
/**
@@ -108,25 +114,39 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
}
/**
- * swap_cgroup_record - record mem_cgroup for this swp_entry.
- * @ent: swap entry to be recorded into
+ * swap_cgroup_record - record mem_cgroup for a set of swap entries
+ * @ent: the first swap entry to be recorded into
* @id: mem_cgroup to be recorded
+ * @nr_ents: number of swap entries to be recorded
*
* Returns old value at success, 0 at failure.
* (Of course, old value can be 0.)
*/
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
+unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
+ unsigned int nr_ents)
{
struct swap_cgroup_ctrl *ctrl;
struct swap_cgroup *sc;
unsigned short old;
unsigned long flags;
+ pgoff_t offset = swp_offset(ent);
+ pgoff_t end = offset + nr_ents;
sc = lookup_swap_cgroup(ent, &ctrl);
spin_lock_irqsave(&ctrl->lock, flags);
old = sc->id;
- sc->id = id;
+ for (;;) {
+ VM_BUG_ON(sc->id != old);
+ sc->id = id;
+ offset++;
+ if (offset == end)
+ break;
+ if (offset % SC_PER_PAGE)
+ sc++;
+ else
+ sc = __lookup_swap_cgroup(ctrl, offset);
+ }
spin_unlock_irqrestore(&ctrl->lock, flags);
return old;
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 58f6c78f1dad..13a174006b91 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -263,7 +263,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
cache->cur = 0;
if (swap_slot_cache_active)
- cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots);
+ cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
+ cache->slots);
return cache->nr;
}
@@ -272,11 +273,11 @@ int free_swap_slot(swp_entry_t entry)
{
struct swap_slots_cache *cache;
- cache = &get_cpu_var(swp_slots);
+ cache = raw_cpu_ptr(&swp_slots);
if (use_swap_slot_cache && cache->slots_ret) {
spin_lock_irq(&cache->free_lock);
/* Swap slots cache may be deactivated before acquiring lock */
- if (!use_swap_slot_cache) {
+ if (!use_swap_slot_cache || !cache->slots_ret) {
spin_unlock_irq(&cache->free_lock);
goto direct_free;
}
@@ -296,16 +297,23 @@ int free_swap_slot(swp_entry_t entry)
direct_free:
swapcache_free_entries(&entry, 1);
}
- put_cpu_var(swp_slots);
return 0;
}
-swp_entry_t get_swap_page(void)
+swp_entry_t get_swap_page(struct page *page)
{
swp_entry_t entry, *pentry;
struct swap_slots_cache *cache;
+ entry.val = 0;
+
+ if (PageTransHuge(page)) {
+ if (IS_ENABLED(CONFIG_THP_SWAP))
+ get_swap_pages(1, true, &entry);
+ return entry;
+ }
+
/*
* Preemption is allowed here, because we may sleep
* in refill_swap_slots_cache(). But it is safe, because
@@ -317,7 +325,6 @@ swp_entry_t get_swap_page(void)
*/
cache = raw_cpu_ptr(&swp_slots);
- entry.val = 0;
if (check_cache_active()) {
mutex_lock(&cache->alloc_lock);
if (cache->slots) {
@@ -337,7 +344,7 @@ repeat:
return entry;
}
- get_swap_pages(1, &entry);
+ get_swap_pages(1, false, &entry);
return entry;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 539b8885e3d1..b68c93014f50 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -19,6 +19,7 @@
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
+#include <linux/huge_mm.h>
#include <asm/pgtable.h>
@@ -38,6 +39,7 @@ struct address_space *swapper_spaces[MAX_SWAPFILES];
static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
+#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0)
static struct {
unsigned long add_total;
@@ -90,39 +92,46 @@ void show_swap_cache_info(void)
*/
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
- int error;
+ int error, i, nr = hpage_nr_pages(page);
struct address_space *address_space;
+ pgoff_t idx = swp_offset(entry);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- get_page(page);
+ page_ref_add(page, nr);
SetPageSwapCache(page);
- set_page_private(page, entry.val);
address_space = swap_address_space(entry);
spin_lock_irq(&address_space->tree_lock);
- error = radix_tree_insert(&address_space->page_tree,
- swp_offset(entry), page);
- if (likely(!error)) {
- address_space->nrpages++;
- __inc_node_page_state(page, NR_FILE_PAGES);
- INC_CACHE_INFO(add_total);
+ for (i = 0; i < nr; i++) {
+ set_page_private(page + i, entry.val + i);
+ error = radix_tree_insert(&address_space->page_tree,
+ idx + i, page + i);
+ if (unlikely(error))
+ break;
}
- spin_unlock_irq(&address_space->tree_lock);
-
- if (unlikely(error)) {
+ if (likely(!error)) {
+ address_space->nrpages += nr;
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
+ ADD_CACHE_INFO(add_total, nr);
+ } else {
/*
* Only the context which have set SWAP_HAS_CACHE flag
* would call add_to_swap_cache().
* So add_to_swap_cache() doesn't returns -EEXIST.
*/
VM_BUG_ON(error == -EEXIST);
- set_page_private(page, 0UL);
+ set_page_private(page + i, 0UL);
+ while (i--) {
+ radix_tree_delete(&address_space->page_tree, idx + i);
+ set_page_private(page + i, 0UL);
+ }
ClearPageSwapCache(page);
- put_page(page);
+ page_ref_sub(page, nr);
}
+ spin_unlock_irq(&address_space->tree_lock);
return error;
}
@@ -132,7 +141,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
int error;
- error = radix_tree_maybe_preload(gfp_mask);
+ error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
if (!error) {
error = __add_to_swap_cache(page, entry);
radix_tree_preload_end();
@@ -146,8 +155,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
*/
void __delete_from_swap_cache(struct page *page)
{
- swp_entry_t entry;
struct address_space *address_space;
+ int i, nr = hpage_nr_pages(page);
+ swp_entry_t entry;
+ pgoff_t idx;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
@@ -155,12 +166,15 @@ void __delete_from_swap_cache(struct page *page)
entry.val = page_private(page);
address_space = swap_address_space(entry);
- radix_tree_delete(&address_space->page_tree, swp_offset(entry));
- set_page_private(page, 0);
+ idx = swp_offset(entry);
+ for (i = 0; i < nr; i++) {
+ radix_tree_delete(&address_space->page_tree, idx + i);
+ set_page_private(page + i, 0);
+ }
ClearPageSwapCache(page);
- address_space->nrpages--;
- __dec_node_page_state(page, NR_FILE_PAGES);
- INC_CACHE_INFO(del_total);
+ address_space->nrpages -= nr;
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+ ADD_CACHE_INFO(del_total, nr);
}
/**
@@ -170,7 +184,7 @@ void __delete_from_swap_cache(struct page *page)
* Allocate swap space for the page and add the page to the
* swap cache. Caller needs to hold the page lock.
*/
-int add_to_swap(struct page *page, struct list_head *list)
+int add_to_swap(struct page *page)
{
swp_entry_t entry;
int err;
@@ -178,20 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list)
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageUptodate(page), page);
- entry = get_swap_page();
+ entry = get_swap_page(page);
if (!entry.val)
return 0;
- if (mem_cgroup_try_charge_swap(page, entry)) {
- swapcache_free(entry);
- return 0;
- }
-
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page_to_list(page, list))) {
- swapcache_free(entry);
- return 0;
- }
+ if (mem_cgroup_try_charge_swap(page, entry))
+ goto fail;
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
@@ -206,17 +212,19 @@ int add_to_swap(struct page *page, struct list_head *list)
*/
err = add_to_swap_cache(page, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
-
- if (!err) {
- return 1;
- } else { /* -ENOMEM radix-tree allocation failure */
+ /* -ENOMEM radix-tree allocation failure */
+ if (err)
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
*/
- swapcache_free(entry);
- return 0;
- }
+ goto fail;
+
+ return 1;
+
+fail:
+ put_swap_page(page, entry);
+ return 0;
}
/*
@@ -237,8 +245,8 @@ void delete_from_swap_cache(struct page *page)
__delete_from_swap_cache(page);
spin_unlock_irq(&address_space->tree_lock);
- swapcache_free(entry);
- put_page(page);
+ put_swap_page(page, entry);
+ page_ref_sub(page, hpage_nr_pages(page));
}
/*
@@ -295,7 +303,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
page = find_get_page(swap_address_space(entry), swp_offset(entry));
- if (page) {
+ if (page && likely(!PageTransCompound(page))) {
INC_CACHE_INFO(find_success);
if (TestClearPageReadahead(page))
atomic_inc(&swapin_readahead_hits);
@@ -389,7 +397,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
*/
- swapcache_free(entry);
+ put_swap_page(new_page, entry);
} while (err != -ENOMEM);
if (new_page)
@@ -404,14 +412,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* the swap entry is no longer in use.
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr)
+ struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
bool page_was_allocated;
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
vma, addr, &page_was_allocated);
if (page_was_allocated)
- swap_readpage(retpage);
+ swap_readpage(retpage, do_poll);
return retpage;
}
@@ -488,11 +496,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long start_offset, end_offset;
unsigned long mask;
struct blk_plug plug;
+ bool do_poll = true;
mask = swapin_nr_pages(offset) - 1;
if (!mask)
goto skip;
+ do_poll = false;
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
end_offset = offset | mask;
@@ -503,10 +513,10 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
- gfp_mask, vma, addr);
+ gfp_mask, vma, addr, false);
if (!page)
continue;
- if (offset != entry_offset)
+ if (offset != entry_offset && likely(!PageTransCompound(page)))
SetPageReadahead(page);
put_page(page);
}
@@ -514,7 +524,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
- return read_swap_cache_async(entry, gfp_mask, vma, addr);
+ return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4f6cba1b6632..6ba4aab2db0b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -37,6 +37,7 @@
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
+#include <linux/sort.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -199,7 +200,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
}
}
+#ifdef CONFIG_THP_SWAP
+#define SWAPFILE_CLUSTER HPAGE_PMD_NR
+#else
#define SWAPFILE_CLUSTER 256
+#endif
#define LATENCY_LIMIT 256
static inline void cluster_set_flag(struct swap_cluster_info *info,
@@ -374,6 +379,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
schedule_work(&si->discard_work);
}
+static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+ struct swap_cluster_info *ci = si->cluster_info;
+
+ cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
+ cluster_list_add_tail(&si->free_clusters, ci, idx);
+}
+
/*
* Doing discard actually. After a cluster discard is finished, the cluster
* will be added to free cluster list. caller should hold si->lock.
@@ -394,10 +407,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
spin_lock(&si->lock);
ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
- cluster_set_flag(ci, CLUSTER_FLAG_FREE);
- unlock_cluster(ci);
- cluster_list_add_tail(&si->free_clusters, info, idx);
- ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
+ __free_cluster(si, idx);
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
0, SWAPFILE_CLUSTER);
unlock_cluster(ci);
@@ -415,6 +425,34 @@ static void swap_discard_work(struct work_struct *work)
spin_unlock(&si->lock);
}
+static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+ struct swap_cluster_info *ci = si->cluster_info;
+
+ VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
+ cluster_list_del_first(&si->free_clusters, ci);
+ cluster_set_count_flag(ci + idx, 0, 0);
+}
+
+static void free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+ struct swap_cluster_info *ci = si->cluster_info + idx;
+
+ VM_BUG_ON(cluster_count(ci) != 0);
+ /*
+ * If the swap is discardable, prepare discard the cluster
+ * instead of free it immediately. The cluster will be freed
+ * after discard.
+ */
+ if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
+ (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
+ swap_cluster_schedule_discard(si, idx);
+ return;
+ }
+
+ __free_cluster(si, idx);
+}
+
/*
* The cluster corresponding to page_nr will be used. The cluster will be
* removed from free cluster list and its usage counter will be increased.
@@ -426,11 +464,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
if (!cluster_info)
return;
- if (cluster_is_free(&cluster_info[idx])) {
- VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx);
- cluster_list_del_first(&p->free_clusters, cluster_info);
- cluster_set_count_flag(&cluster_info[idx], 0, 0);
- }
+ if (cluster_is_free(&cluster_info[idx]))
+ alloc_cluster(p, idx);
VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
cluster_set_count(&cluster_info[idx],
@@ -454,21 +489,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
cluster_set_count(&cluster_info[idx],
cluster_count(&cluster_info[idx]) - 1);
- if (cluster_count(&cluster_info[idx]) == 0) {
- /*
- * If the swap is discardable, prepare discard the cluster
- * instead of free it immediately. The cluster will be freed
- * after discard.
- */
- if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
- (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
- swap_cluster_schedule_discard(p, idx);
- return;
- }
-
- cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
- cluster_list_add_tail(&p->free_clusters, cluster_info, idx);
- }
+ if (cluster_count(&cluster_info[idx]) == 0)
+ free_cluster(p, idx);
}
/*
@@ -558,6 +580,60 @@ new_cluster:
return found_free;
}
+static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+ unsigned int nr_entries)
+{
+ unsigned int end = offset + nr_entries - 1;
+
+ if (offset == si->lowest_bit)
+ si->lowest_bit += nr_entries;
+ if (end == si->highest_bit)
+ si->highest_bit -= nr_entries;
+ si->inuse_pages += nr_entries;
+ if (si->inuse_pages == si->pages) {
+ si->lowest_bit = si->max;
+ si->highest_bit = 0;
+ spin_lock(&swap_avail_lock);
+ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+}
+
+static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
+ unsigned int nr_entries)
+{
+ unsigned long end = offset + nr_entries - 1;
+ void (*swap_slot_free_notify)(struct block_device *, unsigned long);
+
+ if (offset < si->lowest_bit)
+ si->lowest_bit = offset;
+ if (end > si->highest_bit) {
+ bool was_full = !si->highest_bit;
+
+ si->highest_bit = end;
+ if (was_full && (si->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ WARN_ON(!plist_node_empty(&si->avail_list));
+ if (plist_node_empty(&si->avail_list))
+ plist_add(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+ }
+ atomic_long_add(nr_entries, &nr_swap_pages);
+ si->inuse_pages -= nr_entries;
+ if (si->flags & SWP_BLKDEV)
+ swap_slot_free_notify =
+ si->bdev->bd_disk->fops->swap_slot_free_notify;
+ else
+ swap_slot_free_notify = NULL;
+ while (offset <= end) {
+ frontswap_invalidate_page(si->type, offset);
+ if (swap_slot_free_notify)
+ swap_slot_free_notify(si->bdev, offset);
+ offset++;
+ }
+}
+
static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned char usage, int nr,
swp_entry_t slots[])
@@ -676,18 +752,7 @@ checks:
inc_cluster_info_page(si, si->cluster_info, offset);
unlock_cluster(ci);
- if (offset == si->lowest_bit)
- si->lowest_bit++;
- if (offset == si->highest_bit)
- si->highest_bit--;
- si->inuse_pages++;
- if (si->inuse_pages == si->pages) {
- si->lowest_bit = si->max;
- si->highest_bit = 0;
- spin_lock(&swap_avail_lock);
- plist_del(&si->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- }
+ swap_range_alloc(si, offset, 1);
si->cluster_next = offset + 1;
slots[n_ret++] = swp_entry(si->type, offset);
@@ -766,6 +831,52 @@ no_page:
return n_ret;
}
+#ifdef CONFIG_THP_SWAP
+static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+{
+ unsigned long idx;
+ struct swap_cluster_info *ci;
+ unsigned long offset, i;
+ unsigned char *map;
+
+ if (cluster_list_empty(&si->free_clusters))
+ return 0;
+
+ idx = cluster_list_first(&si->free_clusters);
+ offset = idx * SWAPFILE_CLUSTER;
+ ci = lock_cluster(si, offset);
+ alloc_cluster(si, idx);
+ cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0);
+
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++)
+ map[i] = SWAP_HAS_CACHE;
+ unlock_cluster(ci);
+ swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
+ *slot = swp_entry(si->type, offset);
+
+ return 1;
+}
+
+static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+ unsigned long offset = idx * SWAPFILE_CLUSTER;
+ struct swap_cluster_info *ci;
+
+ ci = lock_cluster(si, offset);
+ cluster_set_count_flag(ci, 0, 0);
+ free_cluster(si, idx);
+ unlock_cluster(ci);
+ swap_range_free(si, offset, SWAPFILE_CLUSTER);
+}
+#else
+static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+{
+ VM_WARN_ON_ONCE(1);
+ return 0;
+}
+#endif /* CONFIG_THP_SWAP */
+
static unsigned long scan_swap_map(struct swap_info_struct *si,
unsigned char usage)
{
@@ -781,13 +892,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
+int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
{
+ unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
- avail_pgs = atomic_long_read(&nr_swap_pages);
+ /* Only single cluster request supported */
+ WARN_ON_ONCE(n_goal > 1 && cluster);
+
+ avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
if (avail_pgs <= 0)
goto noswap;
@@ -797,7 +912,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
if (n_goal > avail_pgs)
n_goal = avail_pgs;
- atomic_long_sub(n_goal, &nr_swap_pages);
+ atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
spin_lock(&swap_avail_lock);
@@ -823,10 +938,13 @@ start_over:
spin_unlock(&si->lock);
goto nextsi;
}
- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
- n_goal, swp_entries);
+ if (cluster)
+ n_ret = swap_alloc_cluster(si, swp_entries);
+ else
+ n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+ n_goal, swp_entries);
spin_unlock(&si->lock);
- if (n_ret)
+ if (n_ret || cluster)
goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n",
si->type);
@@ -852,7 +970,8 @@ nextsi:
check_out:
if (n_ret < n_goal)
- atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages);
+ atomic_long_add((long)(n_goal - n_ret) * nr_pages,
+ &nr_swap_pages);
noswap:
return n_ret;
}
@@ -1008,32 +1127,8 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
dec_cluster_info_page(p, p->cluster_info, offset);
unlock_cluster(ci);
- mem_cgroup_uncharge_swap(entry);
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit) {
- bool was_full = !p->highest_bit;
-
- p->highest_bit = offset;
- if (was_full && (p->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- WARN_ON(!plist_node_empty(&p->avail_list));
- if (plist_node_empty(&p->avail_list))
- plist_add(&p->avail_list,
- &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- }
- }
- atomic_long_inc(&nr_swap_pages);
- p->inuse_pages--;
- frontswap_invalidate_page(p->type, offset);
- if (p->flags & SWP_BLKDEV) {
- struct gendisk *disk = p->bdev->bd_disk;
-
- if (disk->fops->swap_slot_free_notify)
- disk->fops->swap_slot_free_notify(p->bdev,
- offset);
- }
+ mem_cgroup_uncharge_swap(entry, 1);
+ swap_range_free(p, offset, 1);
}
/*
@@ -1054,7 +1149,7 @@ void swap_free(swp_entry_t entry)
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
-void swapcache_free(swp_entry_t entry)
+static void swapcache_free(swp_entry_t entry)
{
struct swap_info_struct *p;
@@ -1065,6 +1160,52 @@ void swapcache_free(swp_entry_t entry)
}
}
+#ifdef CONFIG_THP_SWAP
+static void swapcache_free_cluster(swp_entry_t entry)
+{
+ unsigned long offset = swp_offset(entry);
+ unsigned long idx = offset / SWAPFILE_CLUSTER;
+ struct swap_cluster_info *ci;
+ struct swap_info_struct *si;
+ unsigned char *map;
+ unsigned int i;
+
+ si = swap_info_get(entry);
+ if (!si)
+ return;
+
+ ci = lock_cluster(si, offset);
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ VM_BUG_ON(map[i] != SWAP_HAS_CACHE);
+ map[i] = 0;
+ }
+ unlock_cluster(ci);
+ mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+ swap_free_cluster(si, idx);
+ spin_unlock(&si->lock);
+}
+#else
+static inline void swapcache_free_cluster(swp_entry_t entry)
+{
+}
+#endif /* CONFIG_THP_SWAP */
+
+void put_swap_page(struct page *page, swp_entry_t entry)
+{
+ if (!PageTransHuge(page))
+ swapcache_free(entry);
+ else
+ swapcache_free_cluster(entry);
+}
+
+static int swp_entry_cmp(const void *ent1, const void *ent2)
+{
+ const swp_entry_t *e1 = ent1, *e2 = ent2;
+
+ return (int)swp_type(*e1) - (int)swp_type(*e2);
+}
+
void swapcache_free_entries(swp_entry_t *entries, int n)
{
struct swap_info_struct *p, *prev;
@@ -1075,6 +1216,14 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
prev = NULL;
p = NULL;
+
+ /*
+ * Sort swap entries by swap device, so each lock is only taken once.
+ * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
+ * so low that it isn't necessary to optimize further.
+ */
+ if (nr_swapfiles > 1)
+ sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
for (i = 0; i < n; ++i) {
p = swap_info_get_cont(entries[i], prev);
if (p)
@@ -1719,7 +1868,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
swap_map = &si->swap_map[i];
entry = swp_entry(type, i);
page = read_swap_cache_async(entry,
- GFP_HIGHUSER_MOVABLE, NULL, 0);
+ GFP_HIGHUSER_MOVABLE, NULL, 0, false);
if (!page) {
/*
* Either swap_duplicate() failed because entry
diff --git a/mm/truncate.c b/mm/truncate.c
index 6479ed2afc53..2330223841fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -530,9 +530,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
} else if (PageTransHuge(page)) {
index += HPAGE_PMD_NR - 1;
i += HPAGE_PMD_NR - 1;
- /* 'end' is in the middle of THP */
- if (index == round_down(end, HPAGE_PMD_NR))
+ /*
+ * 'end' is in the middle of THP. Don't
+ * invalidate the page as the part outside of
+ * 'end' could be still useful.
+ */
+ if (index > end) {
+ unlock_page(page);
continue;
+ }
}
ret = invalidate_inode_page(page);
diff --git a/mm/util.c b/mm/util.c
index 26be6407abd7..7b07ec852e01 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -83,6 +83,8 @@ EXPORT_SYMBOL(kstrdup_const);
* @s: the string to duplicate
* @max: read at most @max chars from @s
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Note: Use kmemdup_nul() instead if the size is known exactly.
*/
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
@@ -121,6 +123,28 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
EXPORT_SYMBOL(kmemdup);
/**
+ * kmemdup_nul - Create a NUL-terminated string from unterminated data
+ * @s: The data to stringify
+ * @len: The size of the data
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ */
+char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
+{
+ char *buf;
+
+ if (!s)
+ return NULL;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (buf) {
+ memcpy(buf, s, len);
+ buf[len] = '\0';
+ }
+ return buf;
+}
+EXPORT_SYMBOL(kmemdup_nul);
+
+/**
* memdup_user - duplicate memory region from user space
*
* @src: source address in user space
@@ -339,9 +363,9 @@ EXPORT_SYMBOL(vm_mmap);
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
- * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT
- * is supported only for large (>32kB) allocations, and it should be used only if
- * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks.
+ * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
*
* Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people.
*/
@@ -366,13 +390,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (size > PAGE_SIZE) {
kmalloc_flags |= __GFP_NOWARN;
- /*
- * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly
- * requests because there is no other way to tell the allocator
- * that we want to fail rather than retry endlessly.
- */
- if (!(kmalloc_flags & __GFP_REPEAT) ||
- (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+ if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
kmalloc_flags |= __GFP_NORETRY;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 34a1c3e46ed7..8698c1c86c4d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -287,10 +287,21 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (p4d_none(*p4d))
return NULL;
pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
+
+ /*
+ * Don't dereference bad PUD or PMD (below) entries. This will also
+ * identify huge mappings, which we may encounter on architectures
+ * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
+ * identified as vmalloc addresses by is_vmalloc_addr(), but are
+ * not [unambiguously] associated with a struct page, so there is
+ * no correct value to return for them.
+ */
+ WARN_ON_ONCE(pud_bad(*pud));
+ if (pud_none(*pud) || pud_bad(*pud))
return NULL;
pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
+ WARN_ON_ONCE(pmd_bad(*pmd));
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
return NULL;
ptep = pte_offset_map(pmd, addr);
@@ -314,6 +325,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
+#define VM_LAZY_FREE 0x02
#define VM_VM_AREA 0x04
static DEFINE_SPINLOCK(vmap_area_lock);
@@ -1486,6 +1498,7 @@ struct vm_struct *remove_vm_area(const void *addr)
spin_lock(&vmap_area_lock);
va->vm = NULL;
va->flags &= ~VM_VM_AREA;
+ va->flags |= VM_LAZY_FREE;
spin_unlock(&vmap_area_lock);
vmap_debug_free_range(va->va_start, va->va_end);
@@ -1759,12 +1772,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
*/
clear_vm_uninitialized_flag(area);
- /*
- * A ref_count = 2 is needed because vm_struct allocated in
- * __get_vm_area_node() contains a reference to the virtual address of
- * the vmalloc'ed block.
- */
- kmemleak_alloc(addr, real_size, 2, gfp_mask);
+ kmemleak_vmalloc(area, size, gfp_mask);
return addr;
@@ -1787,7 +1795,7 @@ fail:
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*
- * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT
+ * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
* and __GFP_NOFAIL are not supported
*
* Any use of gfp flags outside of GFP_KERNEL should be consulted
@@ -2698,8 +2706,14 @@ static int s_show(struct seq_file *m, void *p)
* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
* behalf of vmap area is being tear down or vm_map_ram allocation.
*/
- if (!(va->flags & VM_VM_AREA))
+ if (!(va->flags & VM_VM_AREA)) {
+ seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start,
+ va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
+
return 0;
+ }
v = va->vm;
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index ce0618bfa8d0..85350ce2d25d 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -93,12 +93,25 @@ enum vmpressure_levels {
VMPRESSURE_NUM_LEVELS,
};
+enum vmpressure_modes {
+ VMPRESSURE_NO_PASSTHROUGH = 0,
+ VMPRESSURE_HIERARCHY,
+ VMPRESSURE_LOCAL,
+ VMPRESSURE_NUM_MODES,
+};
+
static const char * const vmpressure_str_levels[] = {
[VMPRESSURE_LOW] = "low",
[VMPRESSURE_MEDIUM] = "medium",
[VMPRESSURE_CRITICAL] = "critical",
};
+static const char * const vmpressure_str_modes[] = {
+ [VMPRESSURE_NO_PASSTHROUGH] = "default",
+ [VMPRESSURE_HIERARCHY] = "hierarchy",
+ [VMPRESSURE_LOCAL] = "local",
+};
+
static enum vmpressure_levels vmpressure_level(unsigned long pressure)
{
if (pressure >= vmpressure_level_critical)
@@ -141,27 +154,31 @@ out:
struct vmpressure_event {
struct eventfd_ctx *efd;
enum vmpressure_levels level;
+ enum vmpressure_modes mode;
struct list_head node;
};
static bool vmpressure_event(struct vmpressure *vmpr,
- enum vmpressure_levels level)
+ const enum vmpressure_levels level,
+ bool ancestor, bool signalled)
{
struct vmpressure_event *ev;
- bool signalled = false;
+ bool ret = false;
mutex_lock(&vmpr->events_lock);
-
list_for_each_entry(ev, &vmpr->events, node) {
- if (level >= ev->level) {
- eventfd_signal(ev->efd, 1);
- signalled = true;
- }
+ if (ancestor && ev->mode == VMPRESSURE_LOCAL)
+ continue;
+ if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
+ continue;
+ if (level < ev->level)
+ continue;
+ eventfd_signal(ev->efd, 1);
+ ret = true;
}
-
mutex_unlock(&vmpr->events_lock);
- return signalled;
+ return ret;
}
static void vmpressure_work_fn(struct work_struct *work)
@@ -170,6 +187,8 @@ static void vmpressure_work_fn(struct work_struct *work)
unsigned long scanned;
unsigned long reclaimed;
enum vmpressure_levels level;
+ bool ancestor = false;
+ bool signalled = false;
spin_lock(&vmpr->sr_lock);
/*
@@ -194,12 +213,9 @@ static void vmpressure_work_fn(struct work_struct *work)
level = vmpressure_calc_level(scanned, reclaimed);
do {
- if (vmpressure_event(vmpr, level))
- break;
- /*
- * If not handled, propagate the event upward into the
- * hierarchy.
- */
+ if (vmpressure_event(vmpr, level, ancestor, signalled))
+ signalled = true;
+ ancestor = true;
} while ((vmpr = vmpressure_parent(vmpr)));
}
@@ -326,17 +342,40 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
vmpressure(gfp, memcg, true, vmpressure_win, 0);
}
+static enum vmpressure_levels str_to_level(const char *arg)
+{
+ enum vmpressure_levels level;
+
+ for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++)
+ if (!strcmp(vmpressure_str_levels[level], arg))
+ return level;
+ return -1;
+}
+
+static enum vmpressure_modes str_to_mode(const char *arg)
+{
+ enum vmpressure_modes mode;
+
+ for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++)
+ if (!strcmp(vmpressure_str_modes[mode], arg))
+ return mode;
+ return -1;
+}
+
+#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
+
/**
* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
* @memcg: memcg that is interested in vmpressure notifications
* @eventfd: eventfd context to link notifications with
- * @args: event arguments (used to set up a pressure level threshold)
+ * @args: event arguments (pressure level threshold, optional mode)
*
* This function associates eventfd context with the vmpressure
* infrastructure, so that the notifications will be delivered to the
- * @eventfd. The @args parameter is a string that denotes pressure level
- * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
- * "critical").
+ * @eventfd. The @args parameter is a comma-delimited string that denotes a
+ * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
+ * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
+ * "hierarchy" or "local").
*
* To be used as memcg event method.
*/
@@ -345,28 +384,53 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
- int level;
+ enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
+ enum vmpressure_levels level = -1;
+ char *spec, *spec_orig;
+ char *token;
+ int ret = 0;
+
+ spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL);
+ if (!spec) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN);
- for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
- if (!strcmp(vmpressure_str_levels[level], args))
- break;
+ /* Find required level */
+ token = strsep(&spec, ",");
+ level = str_to_level(token);
+ if (level == -1) {
+ ret = -EINVAL;
+ goto out;
}
- if (level >= VMPRESSURE_NUM_LEVELS)
- return -EINVAL;
+ /* Find optional mode */
+ token = strsep(&spec, ",");
+ if (token) {
+ mode = str_to_mode(token);
+ if (mode == -1) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
- if (!ev)
- return -ENOMEM;
+ if (!ev) {
+ ret = -ENOMEM;
+ goto out;
+ }
ev->efd = eventfd;
ev->level = level;
+ ev->mode = mode;
mutex_lock(&vmpr->events_lock);
list_add(&ev->node, &vmpr->events);
mutex_unlock(&vmpr->events_lock);
-
- return 0;
+out:
+ kfree(spec_orig);
+ return ret;
}
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ad39bbc79e6..a1af041930a6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -708,7 +708,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- swapcache_free(swap);
+ put_swap_page(page, swap);
} else {
void (*freepage)(struct page *);
void *shadow = NULL;
@@ -1125,8 +1125,36 @@ static unsigned long shrink_page_list(struct list_head *page_list,
!PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
- if (!add_to_swap(page, page_list))
+ if (PageTransHuge(page)) {
+ /* cannot split THP, skip it */
+ if (!can_split_huge_page(page, NULL))
+ goto activate_locked;
+ /*
+ * Split pages without a PMD map right
+ * away. Chances are some or all of the
+ * tail pages can be freed without IO.
+ */
+ if (!compound_mapcount(page) &&
+ split_huge_page_to_list(page, page_list))
+ goto activate_locked;
+ }
+ if (!add_to_swap(page)) {
+ if (!PageTransHuge(page))
+ goto activate_locked;
+ /* Split THP and swap individual base pages */
+ if (split_huge_page_to_list(page, page_list))
+ goto activate_locked;
+ if (!add_to_swap(page))
+ goto activate_locked;
+ }
+
+ /* XXX: We don't support THP writes */
+ if (PageTransHuge(page) &&
+ split_huge_page_to_list(page, page_list)) {
+ delete_from_swap_cache(page);
goto activate_locked;
+ }
+
may_enter_fs = 1;
/* Adding to swap updated mapping */
@@ -1266,6 +1294,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
count_vm_event(PGLAZYFREED);
+ count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true))
goto keep_locked;
/*
@@ -1295,6 +1324,7 @@ activate_locked:
if (!PageMlocked(page)) {
SetPageActive(page);
pgactivate++;
+ count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
unlock_page(page);
@@ -1734,11 +1764,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
- if (global_reclaim(sc)) {
- if (current_is_kswapd())
+ if (current_is_kswapd()) {
+ if (global_reclaim(sc))
__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
- else
+ count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
+ nr_scanned);
+ } else {
+ if (global_reclaim(sc))
__count_vm_events(PGSCAN_DIRECT, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
+ nr_scanned);
}
spin_unlock_irq(&pgdat->lru_lock);
@@ -1750,11 +1785,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
- if (global_reclaim(sc)) {
- if (current_is_kswapd())
+ if (current_is_kswapd()) {
+ if (global_reclaim(sc))
__count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
- else
+ count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
+ nr_reclaimed);
+ } else {
+ if (global_reclaim(sc))
__count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
+ nr_reclaimed);
}
putback_inactive_pages(lruvec, &page_list);
@@ -1899,8 +1939,11 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
}
}
- if (!is_active_lru(lru))
+ if (!is_active_lru(lru)) {
__count_vm_events(PGDEACTIVATE, nr_moved);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ nr_moved);
+ }
return nr_moved;
}
@@ -1938,6 +1981,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2184,8 +2228,17 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
}
if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
- scan_balance = SCAN_ANON;
- goto out;
+ /*
+ * Force SCAN_ANON if there are enough inactive
+ * anonymous pages on the LRU in eligible zones.
+ * Otherwise, the small LRU gets thrashed.
+ */
+ if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
+ >> sc->priority) {
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
}
}
@@ -2453,18 +2506,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return false;
/* Consider stopping depending on scan and reclaim activity */
- if (sc->gfp_mask & __GFP_REPEAT) {
+ if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
/*
- * For __GFP_REPEAT allocations, stop reclaiming if the
+ * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
* full LRU list has been scanned and we are still failing
* to reclaim pages. This full LRU scan is potentially
- * expensive but a __GFP_REPEAT caller really wants to succeed
+ * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
*/
if (!nr_reclaimed && !nr_scanned)
return false;
} else {
/*
- * For non-__GFP_REPEAT allocations which can presumably
+ * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
* fail without consequence, stop if we failed to reclaim
* any pages from the last SWAP_CLUSTER_MAX number of
* pages that were scanned. This will return to the
@@ -2967,7 +3020,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
- .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
+ .gfp_mask = current_gfp_context(gfp_mask),
.reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
@@ -2982,12 +3035,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
* 1 is returned so that the page allocator does not OOM kill at this
* point.
*/
- if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
+ if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
- gfp_mask,
+ sc.gfp_mask,
sc.reclaim_idx);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -3652,7 +3705,7 @@ int kswapd_run(int nid)
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
- BUG_ON(system_state == SYSTEM_BOOTING);
+ BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
@@ -3774,17 +3827,16 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
- int classzone_idx = gfp_zone(gfp_mask);
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
+ .gfp_mask = current_gfp_context(gfp_mask),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
- .reclaim_idx = classzone_idx,
+ .reclaim_idx = gfp_zone(gfp_mask),
};
cond_resched();
@@ -3795,7 +3847,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- lockdep_set_current_reclaim_state(gfp_mask);
+ lockdep_set_current_reclaim_state(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -3831,7 +3883,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
* unmapped file backed pages.
*/
if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
- sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
return NODE_RECLAIM_FULL;
/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 76f73670200a..9a4441bbeef2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -928,8 +928,6 @@ const char * const vmstat_text[] = {
"nr_zone_unevictable",
"nr_zone_write_pending",
"nr_mlock",
- "nr_slab_reclaimable",
- "nr_slab_unreclaimable",
"nr_page_table_pages",
"nr_kernel_stack",
"nr_bounce",
@@ -952,6 +950,8 @@ const char * const vmstat_text[] = {
"nr_inactive_file",
"nr_active_file",
"nr_unevictable",
+ "nr_slab_reclaimable",
+ "nr_slab_unreclaimable",
"nr_isolated_anon",
"nr_isolated_file",
"workingset_refault",
@@ -1018,6 +1018,7 @@ const char * const vmstat_text[] = {
"drop_pagecache",
"drop_slab",
+ "oom_kill",
#ifdef CONFIG_NUMA_BALANCING
"numa_pte_updates",
@@ -1129,7 +1130,7 @@ static void frag_stop(struct seq_file *m, void *arg)
* If @assert_populated is true, only use callback for zones that are populated.
*/
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
- bool assert_populated,
+ bool assert_populated, bool nolock,
void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
struct zone *zone;
@@ -1140,9 +1141,11 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
if (assert_populated && !populated_zone(zone))
continue;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!nolock)
+ spin_lock_irqsave(&zone->lock, flags);
print(m, pgdat, zone);
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (!nolock)
+ spin_unlock_irqrestore(&zone->lock, flags);
}
}
#endif
@@ -1165,7 +1168,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
static int frag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, true, frag_show_print);
+ walk_zones_in_node(m, pgdat, true, false, frag_show_print);
return 0;
}
@@ -1206,7 +1209,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
seq_printf(m, "%6d ", order);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print);
+ walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
return 0;
}
@@ -1223,11 +1226,10 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
struct page *page;
- if (!pfn_valid(pfn))
+ page = pfn_to_online_page(pfn);
+ if (!page)
continue;
- page = pfn_to_page(pfn);
-
/* Watch for unexpected holes punched in the memmap */
if (!memmap_valid_within(pfn, page, zone))
continue;
@@ -1258,7 +1260,8 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print);
+ walk_zones_in_node(m, pgdat, true, false,
+ pagetypeinfo_showblockcount_print);
return 0;
}
@@ -1284,7 +1287,8 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print);
+ walk_zones_in_node(m, pgdat, true, true,
+ pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}
@@ -1322,7 +1326,7 @@ static int fragmentation_open(struct inode *inode, struct file *file)
return seq_open(file, &fragmentation_op);
}
-static const struct file_operations fragmentation_file_operations = {
+static const struct file_operations buddyinfo_file_operations = {
.open = fragmentation_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1341,7 +1345,7 @@ static int pagetypeinfo_open(struct inode *inode, struct file *file)
return seq_open(file, &pagetypeinfo_op);
}
-static const struct file_operations pagetypeinfo_file_ops = {
+static const struct file_operations pagetypeinfo_file_operations = {
.open = pagetypeinfo_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1446,7 +1450,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
static int zoneinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, false, zoneinfo_show_print);
+ walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
return 0;
}
@@ -1463,7 +1467,7 @@ static int zoneinfo_open(struct inode *inode, struct file *file)
return seq_open(file, &zoneinfo_op);
}
-static const struct file_operations proc_zoneinfo_file_operations = {
+static const struct file_operations zoneinfo_file_operations = {
.open = zoneinfo_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1552,7 +1556,7 @@ static int vmstat_open(struct inode *inode, struct file *file)
return seq_open(file, &vmstat_op);
}
-static const struct file_operations proc_vmstat_file_operations = {
+static const struct file_operations vmstat_file_operations = {
.open = vmstat_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1785,10 +1789,10 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
#ifdef CONFIG_PROC_FS
- proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
- proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
- proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
- proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
+ proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
+ proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
+ proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
+ proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
#endif
}
@@ -1852,7 +1856,7 @@ static int unusable_show(struct seq_file *m, void *arg)
if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
- walk_zones_in_node(m, pgdat, true, unusable_show_print);
+ walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
return 0;
}
@@ -1904,7 +1908,7 @@ static int extfrag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, true, extfrag_show_print);
+ walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
return 0;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index b8c9ab678479..7119cd745ace 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -288,12 +288,10 @@ bool workingset_refault(void *shadow)
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
- inc_node_state(pgdat, WORKINGSET_REFAULT);
- inc_memcg_state(memcg, WORKINGSET_REFAULT);
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
if (refault_distance <= active_file) {
- inc_node_state(pgdat, WORKINGSET_ACTIVATE);
- inc_memcg_state(memcg, WORKINGSET_ACTIVATE);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
rcu_read_unlock();
return true;
}
@@ -474,8 +472,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
}
if (WARN_ON_ONCE(node->exceptional))
goto out_invalid;
- inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
- inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
+ inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
__radix_tree_delete_node(&mapping->page_tree, node,
workingset_update_node, mapping);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d41edd28298b..013eea76685e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -116,6 +116,11 @@
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
+#define FULLNESS_BITS 2
+#define CLASS_BITS 8
+#define ISOLATED_BITS 3
+#define MAGIC_VAL_BITS 8
+
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
@@ -137,6 +142,8 @@
* (reason above)
*/
#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS)
+#define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
+ ZS_SIZE_CLASS_DELTA) + 1)
enum fullness_group {
ZS_EMPTY,
@@ -169,11 +176,6 @@ static struct vfsmount *zsmalloc_mnt;
#endif
/*
- * number of size_classes
- */
-static int zs_size_classes;
-
-/*
* We assign a page to ZS_ALMOST_EMPTY fullness group when:
* n <= N / f, where
* n = number of allocated objects
@@ -244,7 +246,7 @@ struct link_free {
struct zs_pool {
const char *name;
- struct size_class **size_class;
+ struct size_class *size_class[ZS_SIZE_CLASSES];
struct kmem_cache *handle_cachep;
struct kmem_cache *zspage_cachep;
@@ -268,11 +270,6 @@ struct zs_pool {
#endif
};
-#define FULLNESS_BITS 2
-#define CLASS_BITS 8
-#define ISOLATED_BITS 3
-#define MAGIC_VAL_BITS 8
-
struct zspage {
struct {
unsigned int fullness:FULLNESS_BITS;
@@ -469,7 +466,7 @@ static bool is_zspage_isolated(struct zspage *zspage)
return zspage->isolated;
}
-static int is_first_page(struct page *page)
+static __maybe_unused int is_first_page(struct page *page)
{
return PagePrivate(page);
}
@@ -551,7 +548,7 @@ static int get_size_class_index(int size)
idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
ZS_SIZE_CLASS_DELTA);
- return min(zs_size_classes - 1, idx);
+ return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
static inline void zs_stat_inc(struct size_class *class,
@@ -610,7 +607,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
"obj_allocated", "obj_used", "pages_used",
"pages_per_zspage", "freeable");
- for (i = 0; i < zs_size_classes; i++) {
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
class = pool->size_class[i];
if (class->index != i)
@@ -1294,17 +1291,6 @@ static int zs_cpu_dead(unsigned int cpu)
return 0;
}
-static void __init init_zs_size_classes(void)
-{
- int nr;
-
- nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
- if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
- nr += 1;
-
- zs_size_classes = nr;
-}
-
static bool can_merge(struct size_class *prev, int pages_per_zspage,
int objs_per_zspage)
{
@@ -2145,7 +2131,7 @@ static void async_free_zspage(struct work_struct *work)
struct zs_pool *pool = container_of(work, struct zs_pool,
free_work);
- for (i = 0; i < zs_size_classes; i++) {
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
class = pool->size_class[i];
if (class->index != i)
continue;
@@ -2263,7 +2249,7 @@ unsigned long zs_compact(struct zs_pool *pool)
int i;
struct size_class *class;
- for (i = zs_size_classes - 1; i >= 0; i--) {
+ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
if (!class)
continue;
@@ -2309,7 +2295,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
shrinker);
- for (i = zs_size_classes - 1; i >= 0; i--) {
+ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
if (!class)
continue;
@@ -2361,12 +2347,6 @@ struct zs_pool *zs_create_pool(const char *name)
return NULL;
init_deferred_free(pool);
- pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
- GFP_KERNEL);
- if (!pool->size_class) {
- kfree(pool);
- return NULL;
- }
pool->name = kstrdup(name, GFP_KERNEL);
if (!pool->name)
@@ -2379,7 +2359,7 @@ struct zs_pool *zs_create_pool(const char *name)
* Iterate reversely, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
*/
- for (i = zs_size_classes - 1; i >= 0; i--) {
+ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
int size;
int pages_per_zspage;
int objs_per_zspage;
@@ -2453,7 +2433,7 @@ void zs_destroy_pool(struct zs_pool *pool)
zs_unregister_migration(pool);
zs_pool_stat_destroy(pool);
- for (i = 0; i < zs_size_classes; i++) {
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
int fg;
struct size_class *class = pool->size_class[i];
@@ -2492,8 +2472,6 @@ static int __init zs_init(void)
if (ret)
goto hp_setup_fail;
- init_zs_size_classes();
-
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
#endif
diff --git a/mm/zswap.c b/mm/zswap.c
index eedc27894b10..d39581a076c3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -371,10 +371,9 @@ static int zswap_dstmem_prepare(unsigned int cpu)
u8 *dst;
dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!dst) {
- pr_err("can't allocate compressor buffer\n");
+ if (!dst)
return -ENOMEM;
- }
+
per_cpu(zswap_dstmem, cpu) = dst;
return 0;
}
@@ -515,10 +514,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
}
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
- if (!pool) {
- pr_err("pool alloc failed\n");
+ if (!pool)
return NULL;
- }
/* unique name for each pool specifically required by zsmalloc */
snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
@@ -1158,7 +1155,7 @@ static void zswap_frontswap_init(unsigned type)
{
struct zswap_tree *tree;
- tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
+ tree = kzalloc(sizeof(*tree), GFP_KERNEL);
if (!tree) {
pr_err("alloc failed, zswap disabled for swap type %d\n", type);
return;