From acfa4380efe77e290d3a96b11cd4c9f24f4fbb18 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Dec 2008 10:06:33 -0500 Subject: inode->i_op is never NULL We used to have rather schizophrenic set of checks for NULL ->i_op even though it had been eliminated years ago. You'd need to go out of your way to set it to NULL explicitly _and_ a bunch of code would die on such inodes anyway. After killing two remaining places that still did that bogosity, all that crap can go away. Signed-off-by: Al Viro --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 0a2010a9518c..7b9db658aca2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2266,7 +2266,7 @@ int vmtruncate(struct inode * inode, loff_t offset) unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); } - if (inode->i_op && inode->i_op->truncate) + if (inode->i_op->truncate) inode->i_op->truncate(inode); return 0; @@ -2286,7 +2286,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) * a way to truncate a range of blocks (punch a hole) - * we should return failure right now. */ - if (!inode->i_op || !inode->i_op->truncate_range) + if (!inode->i_op->truncate_range) return -ENOSYS; mutex_lock(&inode->i_mutex); -- cgit v1.2.1 From bf3f3bc5e734706730c12a323f9b2068052aa1f0 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:38:55 -0800 Subject: mm: don't mark_page_accessed in fault path Doing a mark_page_accessed at fault-time, then doing SetPageReferenced at unmap-time if the pte is young has a number of problems. mark_page_accessed is supposed to be roughly the equivalent of a young pte for unmapped references. Unfortunately it doesn't come with any context: after being called, reclaim doesn't know who or why the page was touched. So calling mark_page_accessed not only adds extra lru or PG_referenced manipulations for pages that are already going to have pte_young ptes anyway, but it also adds these references which are difficult to work with from the context of vma specific references (eg. MADV_SEQUENTIAL pte_young may not wish to contribute to the page being referenced). Then, simply doing SetPageReferenced when zapping a pte and finding it is young, is not a really good solution either. SetPageReferenced does not correctly promote the page to the active list for example. So after removing mark_page_accessed from the fault path, several mmap()+touch+munmap() would have a very different result from several read(2) calls for example, which is not really desirable. Signed-off-by: Nick Piggin Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 7b9db658aca2..5e0e91cc6b67 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -768,7 +768,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent)) - SetPageReferenced(page); + mark_page_accessed(page); file_rss--; } page_remove_rmap(page, vma); -- cgit v1.2.1 From 4917e5d0499b5ae7b26b56fccaefddf9aec9369c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 Jan 2009 14:39:17 -0800 Subject: mm: more likely reclaim MADV_SEQUENTIAL mappings File pages mapped only in sequentially read mappings are perfect reclaim canditates. This patch makes these mappings behave like weak references, their pages will be reclaimed unless they have a strong reference from a normal mapping as well. It changes the reclaim and the unmap path where they check if the page has been referenced. In both cases, accesses through sequentially read mappings will be ignored. Benchmark results from KOSAKI Motohiro: http://marc.info/?l=linux-mm&m=122485301925098&w=2 Signed-off-by: Johannes Weiner Signed-off-by: Rik van Riel Acked-by: KOSAKI Motohiro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 5e0e91cc6b67..99e8d5c7b312 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -767,7 +767,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, else { if (pte_dirty(ptent)) set_page_dirty(page); - if (pte_young(ptent)) + if (pte_young(ptent) && + likely(!VM_SequentialReadHint(vma))) mark_page_accessed(page); file_rss--; } -- cgit v1.2.1 From 38e0edb15bd07c6a0caf0cfe39f8f90bd98601b2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 6 Jan 2009 14:39:21 -0800 Subject: mm/apply_to_range: call pte function with lazy updates Make the pte-level function in apply_to_range be called in lazy mmu mode, so that any pagetable modifications can be batched. Signed-off-by: Jeremy Fitzhardinge Cc: Johannes Weiner Cc: Nick Piggin Cc: Venkatesh Pallipadi Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 99e8d5c7b312..b5af358b8b22 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1645,6 +1645,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(pmd_huge(*pmd)); + arch_enter_lazy_mmu_mode(); + token = pmd_pgtable(*pmd); do { @@ -1653,6 +1655,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, break; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + if (mm != &init_mm) pte_unmap_unlock(pte-1, ptl); return err; -- cgit v1.2.1 From b5934c531849ff4a51ce0f290141efe564290e40 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:25 -0800 Subject: mm: add_active_or_unevictable into rmap lru_cache_add_active_or_unevictable() and page_add_new_anon_rmap() always appear together. Save some symbol table space and some jumping around by removing lru_cache_add_active_or_unevictable(), folding its code into page_add_new_anon_rmap(): like how we add file pages to lru just after adding them to page cache. Remove the nearby "TODO: is this safe?" comments (yes, it is safe), and change page_add_new_anon_rmap()'s address BUG_ON to VM_BUG_ON as originally intended. Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Cc: Lee Schermerhorn Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index b5af358b8b22..a138c50dc39a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1949,10 +1949,7 @@ gotten: */ ptep_clear_flush_notify(vma, address, page_table); SetPageSwapBacked(new_page); - lru_cache_add_active_or_unevictable(new_page, vma); page_add_new_anon_rmap(new_page, vma, address); - -//TODO: is this safe? do_anonymous_page() does it this way. set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { @@ -2448,7 +2445,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto release; inc_mm_counter(mm, anon_rss); SetPageSwapBacked(page); - lru_cache_add_active_or_unevictable(page, vma); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2597,7 +2593,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (anon) { inc_mm_counter(mm, anon_rss); SetPageSwapBacked(page); - lru_cache_add_active_or_unevictable(page, vma); page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); @@ -2607,7 +2602,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); } } -//TODO: is this safe? do_anonymous_page() does it this way. set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ -- cgit v1.2.1 From cbf84b7add8103b92aaa84928e335df726bfc8da Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:27 -0800 Subject: mm: further cleanup page_add_new_anon_rmap Moving lru_cache_add_active_or_unevictable() into page_add_new_anon_rmap() was good but stupid: we can and should SetPageSwapBacked() there too; and we know for sure that this anonymous, swap-backed page is not file cache. Signed-off-by: Hugh Dickins Cc: Lee Schermerhorn Cc: Nick Piggin Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index a138c50dc39a..122d965e820f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1948,7 +1948,6 @@ gotten: * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); - SetPageSwapBacked(new_page); page_add_new_anon_rmap(new_page, vma, address); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); @@ -2444,7 +2443,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; inc_mm_counter(mm, anon_rss); - SetPageSwapBacked(page); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2592,7 +2590,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { inc_mm_counter(mm, anon_rss); - SetPageSwapBacked(page); page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); -- cgit v1.2.1 From 878b63ac889df706d01048f2c110e322ad2f996d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:32 -0800 Subject: mm: gup persist for write permission do_wp_page()'s VM_FAULT_WRITE return value tells __get_user_pages() that COW has been done if necessary, though it may be leaving the pte without write permission - for the odd case of forced writing to a readonly vma for ptrace. At present GUP then retries the follow_page() without asking for write permission, to escape an endless loop when forced. But an application may be relying on GUP to guarantee a writable page which won't be COWed again when written from userspace, whereas a race here might leave a readonly pte in place? Change the VM_FAULT_WRITE handling to ask follow_page() for write permission again, except in that odd case of forced writing to a readonly vma. Signed-off-by: Hugh Dickins Cc: Lee Schermerhorn Cc: Rik van Riel Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 122d965e820f..f594bb65a9f1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1264,9 +1264,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * do_wp_page has broken COW when necessary, * even if maybe_mkwrite decided not to set * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. + * page lookups as if they were reads. But only + * do so when looping for pte_write is futile: + * in some cases userspace may also be wanting + * to write to the gotten user page, which a + * read fault here might prevent (a readonly + * page might get reCOWed by userspace write). */ - if (ret & VM_FAULT_WRITE) + if ((ret & VM_FAULT_WRITE) && + !(vma->vm_flags & VM_WRITE)) foll_flags &= ~FOLL_WRITE; cond_resched(); -- cgit v1.2.1 From ab967d86015a19777955370deebc8262d50fed63 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:33 -0800 Subject: mm: wp lock page before deciding cow An application may rely on get_user_pages() to give it pages writable from userspace and shared with a driver, GUP breaking COW if necessary. It may mprotect() the pages' writability, off and on, from time to time. Normally this works fine (so long as the app does not fork); but just occasionally, under memory pressure, a readonly pte in a newly writable area is COWed unnecessarily, breaking the link with the driver: because do_wp_page() does trylock_page, and falls back to COW whenever that fails. For reliable behaviour in the unshared case, when the trylock_page fails, now unlock pagetable, lock page and relock pagetable, before deciding whether Copy-On-Write is really necessary. Reported-by: Zhou Yingchao Signed-off-by: Hugh Dickins Cc: Lee Schermerhorn Cc: Rik van Riel Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index f594bb65a9f1..3922ffcf3dff 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1848,10 +1848,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page)) { - if (trylock_page(old_page)) { - reuse = can_share_swap_page(old_page); - unlock_page(old_page); + if (!trylock_page(old_page)) { + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + lock_page(old_page); + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + page_cache_release(old_page); + goto unlock; + } + page_cache_release(old_page); } + reuse = can_share_swap_page(old_page); + unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { /* -- cgit v1.2.1 From 7b1fe59793e61f826bef053107b57b23954833bb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:34 -0800 Subject: mm: reuse_swap_page replaces can_share_swap_page A good place to free up old swap is where do_wp_page(), or do_swap_page(), is about to redirty the page: the data on disk is then stale and won't be read again; and if we do decide to write the page out later, using the previous swap location makes an unnecessary disk seek very likely. So give can_share_swap_page() the side-effect of delete_from_swap_cache() when it safely can. And can_share_swap_page() was always a misleading name, the more so if it has a side-effect: rename it reuse_swap_page(). Irrelevant cleanup nearby: remove swap_token_default_timeout definition from swap.h: it's used nowhere. Signed-off-by: Hugh Dickins Cc: Lee Schermerhorn Acked-by: Rik van Riel Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 3922ffcf3dff..8f471edcb985 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1861,7 +1861,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(old_page); } - reuse = can_share_swap_page(old_page); + reuse = reuse_swap_page(old_page); unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2392,7 +2392,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) { + if (write_access && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); write_access = 0; } -- cgit v1.2.1 From a2c43eed8334e878702fca713b212ae2a11d84b9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:36 -0800 Subject: mm: try_to_free_swap replaces remove_exclusive_swap_page remove_exclusive_swap_page(): its problem is in living up to its name. It doesn't matter if someone else has a reference to the page (raised page_count); it doesn't matter if the page is mapped into userspace (raised page_mapcount - though that hints it may be worth keeping the swap): all that matters is that there be no more references to the swap (and no writeback in progress). swapoff (try_to_unuse) has been removing pages from swapcache for years, with no concern for page count or page mapcount, and we used to have a comment in lookup_swap_cache() recognizing that: if you go for a page of swapcache, you'll get the right page, but it could have been removed from swapcache by the time you get page lock. So, give up asking for exclusivity: get rid of remove_exclusive_swap_page(), and remove_exclusive_swap_page_ref() and remove_exclusive_swap_page_count() which were spawned for the recent LRU work: replace them by the simpler try_to_free_swap() which just checks page_swapcount(). Similarly, remove the page_count limitation from free_swap_and_count(), but assume that it's worth holding on to the swap if page is mapped and swap nowhere near full. Add a vm_swap_full() test in free_swap_cache()? It would be consistent, but I think we probably have enough for now. Signed-off-by: Hugh Dickins Cc: Lee Schermerhorn Cc: Rik van Riel Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 8f471edcb985..1a83fe5339a9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2403,7 +2403,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - remove_exclusive_swap_page(page); + try_to_free_swap(page); unlock_page(page); if (write_access) { -- cgit v1.2.1 From 2bc7273b0e3a509fb598abfc5b9fe50158b830d2 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 6 Jan 2009 14:39:43 -0800 Subject: mm: make maddr __iomem sparse output following warnings. mm/memory.c:2936:8: warning: incorrect type in assignment (different address spaces) mm/memory.c:2936:8: expected void *maddr mm/memory.c:2936:8: got void [noderef] cleanup here. Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 1a83fe5339a9..89339c61f8e5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2966,7 +2966,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, { resource_size_t phys_addr; unsigned long prot = 0; - void *maddr; + void __iomem *maddr; int offset = addr & (PAGE_SIZE-1); if (follow_phys(vma, addr, write, &prot, &phys_addr)) -- cgit v1.2.1 From 3dc147414ccad81dc33edb80774b1fed12a38c08 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:08 -0800 Subject: badpage: replace page_remove_rmap Eeek and BUG Now that bad pages are kept out of circulation, there is no need for the infamous page_remove_rmap() BUG() - once that page is freed, its negative mapcount will issue a "Bad page state" message and the page won't be freed. Removing the BUG() allows more info, on subsequent pages, to be gathered. We do have more info about the page at this point than bad_page() can know - notably, what the pmd is, which might pinpoint something like low 64kB corruption - but page_remove_rmap() isn't given the address to find that. In practice, there is only one call to page_remove_rmap() which has ever reported anything, that from zap_pte_range() (usually on exit, sometimes on munmap). It has all the info, so remove page_remove_rmap()'s "Eeek" message and leave it all to zap_pte_range(). mm/memory.c already has a hardly used print_bad_pte() function, showing some of the appropriate info: extend it to show what we want for the rmap case: pte info, page info (when there is a page) and vma info to compare. zap_pte_range() already knows the pmd, but print_bad_pte() is easier to use if it works that out for itself. Some of this info is also shown in bad_page()'s "Bad page state" message. Keep them separate, but adjust them to match each other as far as possible. Say "Bad page map" in print_bad_pte(), and add a TAINT_BAD_PAGE there too. print_bad_pte() show current->comm unconditionally (though it should get repeated in the usually irrelevant stack trace): sorry, I misled Nick Piggin to make it conditional on vm_mm == current->mm, but current->mm is already NULL in the exit case. Usually current->comm is good, though exceptionally it may not be that of the mm (when "swapoff" for example). Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 52 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 89339c61f8e5..cda04b19f733 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -52,6 +52,9 @@ #include #include #include +#include +#include +#include #include #include @@ -59,9 +62,6 @@ #include #include -#include -#include - #include "internal.h" #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -375,15 +375,41 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) * * The calling function must still handle the error. */ -static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, - unsigned long vaddr) -{ - printk(KERN_ERR "Bad pte = %08llx, process = %s, " - "vm_flags = %lx, vaddr = %lx\n", - (long long)pte_val(pte), - (vma->vm_mm == current->mm ? current->comm : "???"), - vma->vm_flags, vaddr); +static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, struct page *page) +{ + pgd_t *pgd = pgd_offset(vma->vm_mm, addr); + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + struct address_space *mapping; + pgoff_t index; + + mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; + index = linear_page_index(vma, addr); + + printk(KERN_EMERG "Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); + if (page) { + printk(KERN_EMERG + "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", + page, (void *)page->flags, page_count(page), + page_mapcount(page), page->mapping, page->index); + } + printk(KERN_EMERG + "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + /* + * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y + */ + if (vma->vm_ops) + print_symbol(KERN_EMERG "vma->vm_ops->fault: %s\n", + (unsigned long)vma->vm_ops->fault); + if (vma->vm_file && vma->vm_file->f_op) + print_symbol(KERN_EMERG "vma->vm_file->f_op->mmap: %s\n", + (unsigned long)vma->vm_file->f_op->mmap); dump_stack(); + add_taint(TAINT_BAD_PAGE); } static inline int is_cow_mapping(unsigned int flags) @@ -773,6 +799,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, file_rss--; } page_remove_rmap(page, vma); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); tlb_remove_page(tlb, page); continue; } @@ -2684,7 +2712,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * Page table corrupted: show pte and kill process. */ - print_bad_pte(vma, orig_pte, address); + print_bad_pte(vma, address, orig_pte, NULL); return VM_FAULT_OOM; } -- cgit v1.2.1 From 22b31eec63e5f2e219a3ee15f456897272bc73e8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:09 -0800 Subject: badpage: vm_normal_page use print_bad_pte print_bad_pte() is so far being called only when zap_pte_range() finds negative page_mapcount, or there's a fault on a pte_file where it does not belong. That's weak coverage when we suspect pagetable corruption. Originally, it was called when vm_normal_page() found an invalid pfn: but pfn_valid is expensive on some architectures and configurations, so 2.6.24 put that under CONFIG_DEBUG_VM (which doesn't help in the field), then 2.6.26 replaced it by a VM_BUG_ON (likewise). Reinstate the print_bad_pte() in vm_normal_page(), but use a cheaper test than pfn_valid(): memmap_init_zone() (used in bootup and hotplug) keep a __read_mostly note of the highest_memmap_pfn, vm_normal_page() then check pfn against that. We could call this pfn_plausible() or pfn_sane(), but I doubt we'll need it elsewhere: of course it's not reliable, but gives much stronger pagetable validation on many boxes. Also use print_bad_pte() when the pte_special bit is found outside a VM_PFNMAP or VM_MIXEDMAP area, instead of VM_BUG_ON. Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index cda04b19f733..890095f5f36d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -467,21 +467,18 @@ static inline int is_cow_mapping(unsigned int flags) struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - unsigned long pfn; + unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte))) { - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - return pte_page(pte); - } - VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); + if (likely(!pte_special(pte))) + goto check_pfn; + if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) + print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !HAVE_PTE_SPECIAL case follows: */ - pfn = pte_pfn(pte); - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) @@ -497,11 +494,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } - VM_BUG_ON(!pfn_valid(pfn)); +check_pfn: + if (unlikely(pfn > highest_memmap_pfn)) { + print_bad_pte(vma, addr, pte, NULL); + return NULL; + } /* * NOTE! We still have PageReserved() pages in the page tables. - * * eg. VDSO mappings can cause them to exist. */ out: -- cgit v1.2.1 From 2509ef26db4699a5d9fa876e90ddfc107afcab84 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:10 -0800 Subject: badpage: zap print_bad_pte on swap and file Complete zap_pte_range()'s coverage of bad pagetable entries by calling print_bad_pte() on a pte_file in a linear vma and on a bad swap entry. That needs free_swap_and_cache() to tell it, which will also have shown one of those "swap_free" errors (but with much less information). Similar checks in fork's copy_one_pte()? No, that would be more noisy than helpful: we'll see them when parent and child exec or exit. Where do_nonlinear_fault() calls print_bad_pte(): omit !VM_CAN_NONLINEAR case, that could only be a bug in sys_remap_file_pages(), not a bad pte. VM_FAULT_OOM rather than VM_FAULT_SIGBUS? Well, okay, that is consistent with what happens if do_swap_page() operates a bad swap entry; but don't we have patches to be more careful about killing when VM_FAULT_OOM? Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 890095f5f36d..b273cc12b15d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -810,8 +810,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, */ if (unlikely(details)) continue; - if (!pte_file(ptent)) - free_swap_and_cache(pte_to_swp_entry(ptent)); + if (pte_file(ptent)) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) + print_bad_pte(vma, addr, ptent, NULL); + } else if + (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) + print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); @@ -2707,8 +2711,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return 0; - if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || - !(vma->vm_flags & VM_CAN_NONLINEAR))) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { /* * Page table corrupted: show pte and kill process. */ -- cgit v1.2.1 From edc315fd222497ae4f4b959a9e31ada1e68a4755 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:11 -0800 Subject: badpage: remove vma from page_remove_rmap Remove page_remove_rmap()'s vma arg, which was only for the Eeek message. And remove the BUG_ON(page_mapcount(page) == 0) from CONFIG_DEBUG_VM's page_dup_rmap(): we're trying to be more resilient about that than BUGs. Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index b273cc12b15d..0f9abbaf618c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -798,7 +798,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, mark_page_accessed(page); file_rss--; } - page_remove_rmap(page, vma); + page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); tlb_remove_page(tlb, page); @@ -2023,7 +2023,7 @@ gotten: * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, vma); + page_remove_rmap(old_page); } /* Free the old page.. */ -- cgit v1.2.1 From d936cf9b39b06c8d2e0d7fb5e7b4f176e18dec69 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:12 -0800 Subject: badpage: ratelimit print_bad_pte and bad_page print_bad_pte() and bad_page() might each need ratelimiting - especially for their dump_stacks, almost never of interest, yet not quite dispensible. Correlating corruption across neighbouring entries can be very helpful, so allow a burst of 60 reports before keeping quiet for the remainder of that minute (or allow a steady drip of one report per second). Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 0f9abbaf618c..b12888c1b4e3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -383,6 +383,29 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + return; + } + if (nr_unshown) { + printk(KERN_EMERG + "Bad page map: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); -- cgit v1.2.1 From 1e9e63650d6cb88e6d6d2ca6cc3ee276c26de4a3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:13 -0800 Subject: badpage: KERN_ALERT BUG instead of KERN_EMERG bad_page() and rmap Eeek messages have said KERN_EMERG for a few years, which I've followed in print_bad_pte(). These are serious system errors, on a par with BUGs, but they're not quite emergencies, and we do our best to carry on: say KERN_ALERT "BUG: " like the x86 oops does. And remove the "Trying to fix it up, but a reboot is needed" line: it's not untrue, but I hope the KERN_ALERT "BUG: " conveys as much. Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index b12888c1b4e3..db68af8e0bc4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -397,8 +397,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, return; } if (nr_unshown) { - printk(KERN_EMERG - "Bad page map: %lu messages suppressed\n", + printk(KERN_ALERT + "BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } @@ -410,26 +410,27 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - printk(KERN_EMERG "Bad page map in process %s pte:%08llx pmd:%08llx\n", + printk(KERN_ALERT + "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) { - printk(KERN_EMERG + printk(KERN_ALERT "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", page, (void *)page->flags, page_count(page), page_mapcount(page), page->mapping, page->index); } - printk(KERN_EMERG + printk(KERN_ALERT "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ if (vma->vm_ops) - print_symbol(KERN_EMERG "vma->vm_ops->fault: %s\n", + print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", (unsigned long)vma->vm_ops->fault); if (vma->vm_file && vma->vm_file->f_op) - print_symbol(KERN_EMERG "vma->vm_file->f_op->mmap: %s\n", + print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", (unsigned long)vma->vm_file->f_op->mmap); dump_stack(); add_taint(TAINT_BAD_PAGE); -- cgit v1.2.1 From 4779280d1ea4d361af13ae77ba55217fbcd16d4c Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 6 Jan 2009 14:40:18 -0800 Subject: mm: make get_user_pages() interruptible The initial implementation of checking TIF_MEMDIE covers the cases of OOM killing. If the process has been OOM killed, the TIF_MEMDIE is set and it return immediately. This patch includes: 1. add the case that the SIGKILL is sent by user processes. The process can try to get_user_pages() unlimited memory even if a user process has sent a SIGKILL to it(maybe a monitor find the process exceed its memory limit and try to kill it). In the old implementation, the SIGKILL won't be handled until the get_user_pages() returns. 2. change the return value to be ERESTARTSYS. It makes no sense to return ENOMEM if the get_user_pages returned by getting a SIGKILL signal. Considering the general convention for a system call interrupted by a signal is ERESTARTNOSYS, so the current return value is consistant to that. Lee: An unfortunate side effect of "make-get_user_pages-interruptible" is that it prevents a SIGKILL'd task from munlock-ing pages that it had mlocked, resulting in freeing of mlocked pages. Freeing of mlocked pages, in itself, is not so bad. We just count them now--altho' I had hoped to remove this stat and add PG_MLOCKED to the free pages flags check. However, consider pages in shared libraries mapped by more than one task that a task mlocked--e.g., via mlockall(). If the task that mlocked the pages exits via SIGKILL, these pages would be left mlocked and unevictable. Proposed fix: Add another GUP flag to ignore sigkill when calling get_user_pages from munlock()--similar to Kosaki Motohiro's 'IGNORE_VMA_PERMISSIONS flag for the same purpose. We are not actually allocating memory in this case, which "make-get_user_pages-interruptible" intends to avoid. We're just munlocking pages that are already resident and mapped, and we're reusing get_user_pages() to access those pages. ?? Maybe we should combine 'IGNORE_VMA_PERMISSIONS and '_IGNORE_SIGKILL into a single flag: GUP_FLAGS_MUNLOCK ??? [Lee.Schermerhorn@hp.com: ignore sigkill in get_user_pages during munlock] Signed-off-by: Paul Menage Signed-off-by: Ying Han Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Nick Piggin Cc: Hugh Dickins Cc: Oleg Nesterov Cc: Lee Schermerhorn Cc: Rohit Seth Cc: David Rientjes Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index db68af8e0bc4..3f8fa06b963b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1210,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int write = !!(flags & GUP_FLAGS_WRITE); int force = !!(flags & GUP_FLAGS_FORCE); int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); + int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); if (len <= 0) return 0; @@ -1288,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct page *page; /* - * If tsk is ooming, cut off its access to large memory - * allocations. It has a pending SIGKILL, but it can't - * be processed until returning to user space. + * If we have a pending SIGKILL, don't keep faulting + * pages and potentially allocating memory, unless + * current is handling munlock--e.g., on exit. In + * that case, we are not allocating memory. Rather, + * we're only unlocking already resident/mapped pages. */ - if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) - return i ? i : -ENOMEM; + if (unlikely(!ignore_sigkill && + fatal_signal_pending(current))) + return i ? i : -ERESTARTSYS; if (write) foll_flags |= FOLL_WRITE; -- cgit v1.2.1 From 7a81b88cb53e335ff7d019e6398c95792c817d93 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:07:48 -0800 Subject: memcg: introduce charge-commit-cancel style of functions There is a small race in do_swap_page(). When the page swapped-in is charged, the mapcount can be greater than 0. But, at the same time some process (shares it ) call unmap and make mapcount 1->0 and the page is uncharged. CPUA CPUB mapcount == 1. (1) charge if mapcount==0 zap_pte_range() (2) mapcount 1 => 0. (3) uncharge(). (success) (4) set page's rmap() mapcount 0=>1 Then, this swap page's account is leaked. For fixing this, I added a new interface. - charge account to res_counter by PAGE_SIZE and try to free pages if necessary. - commit register page_cgroup and add to LRU if necessary. - cancel uncharge PAGE_SIZE because of do_swap_page failure. CPUA (1) charge (always) (2) set page's rmap (mapcount > 0) (3) commit charge was necessary or not after set_pte(). This protocol uses PCG_USED bit on page_cgroup for avoiding over accounting. Usual mem_cgroup_charge_common() does charge -> commit at a time. And this patch also adds following function to clarify all charges. - mem_cgroup_newpage_charge() ....replacement for mem_cgroup_charge() called against newly allocated anon pages. - mem_cgroup_charge_migrate_fixup() called only from remove_migration_ptes(). we'll have to rewrite this later.(this patch just keeps old behavior) This function will be removed by additional patch to make migration clearer. Good for clarifying "what we do" Then, we have 4 following charge points. - newpage - swap-in - add-to-cache. - migration. [akpm@linux-foundation.org: add missing inline directives to stubs] Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 3f8fa06b963b..7f210f160990 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2000,7 +2000,7 @@ gotten: cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); - if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; /* @@ -2392,6 +2392,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; swp_entry_t entry; pte_t pte; + struct mem_cgroup *ptr = NULL; int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) @@ -2430,7 +2431,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + if (mem_cgroup_try_charge(mm, GFP_KERNEL, &ptr) == -ENOMEM) { ret = VM_FAULT_OOM; unlock_page(page); goto out; @@ -2460,6 +2461,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) @@ -2480,7 +2482,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge_swapin(ptr); pte_unmap_unlock(page_table, ptl); unlock_page(page); page_cache_release(page); @@ -2510,7 +2512,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto oom; __SetPageUptodate(page); - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -2601,7 +2603,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { ret = VM_FAULT_OOM; page_cache_release(page); goto out; -- cgit v1.2.1 From bced0520fe462bb94021dcabd32e99630c171be2 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:07:49 -0800 Subject: memcg: fix gfp_mask of callers of charge Fix misuse of gfp_kernel. Now, most of callers of mem_cgroup_charge_xxx functions uses GFP_KERNEL. I think that this is from the fact that page_cgroup *was* dynamically allocated. But now, we allocate all page_cgroup at boot. And mem_cgroup_try_to_free_pages() reclaim memory from GFP_HIGHUSER_MOVABLE + specified GFP_RECLAIM_MASK. * This is because we just want to reduce memory usage. "Where we should reclaim from ?" is not a problem in memcg. This patch modifies gfp masks to be GFP_HIGUSER_MOVABLE if possible. Note: This patch is not for fixing behavior but for showing sane information in source code. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 7f210f160990..ba5189e322e6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2000,7 +2000,7 @@ gotten: cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_newpage_charge(new_page, mm, GFP_HIGHUSER_MOVABLE)) goto oom_free_new; /* @@ -2431,7 +2431,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - if (mem_cgroup_try_charge(mm, GFP_KERNEL, &ptr) == -ENOMEM) { + if (mem_cgroup_try_charge(mm, GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) { ret = VM_FAULT_OOM; unlock_page(page); goto out; @@ -2512,7 +2512,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto oom; __SetPageUptodate(page); - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) + if (mem_cgroup_newpage_charge(page, mm, GFP_HIGHUSER_MOVABLE)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -2603,7 +2603,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { + if (mem_cgroup_newpage_charge(page, + mm, GFP_HIGHUSER_MOVABLE)) { ret = VM_FAULT_OOM; page_cache_release(page); goto out; -- cgit v1.2.1 From 8c7c6e34a1256a5082d38c8e9bd1474476912715 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:08:00 -0800 Subject: memcg: mem+swap controller core This patch implements per cgroup limit for usage of memory+swap. However there are SwapCache, double counting of swap-cache and swap-entry is avoided. Mem+Swap controller works as following. - memory usage is limited by memory.limit_in_bytes. - memory + swap usage is limited by memory.memsw_limit_in_bytes. This has following benefits. - A user can limit total resource usage of mem+swap. Without this, because memory resource controller doesn't take care of usage of swap, a process can exhaust all the swap (by memory leak.) We can avoid this case. And Swap is shared resource but it cannot be reclaimed (goes back to memory) until it's used. This characteristic can be trouble when the memory is divided into some parts by cpuset or memcg. Assume group A and group B. After some application executes, the system can be.. Group A -- very large free memory space but occupy 99% of swap. Group B -- under memory shortage but cannot use swap...it's nearly full. Ability to set appropriate swap limit for each group is required. Maybe someone wonder "why not swap but mem+swap ?" - The global LRU(kswapd) can swap out arbitrary pages. Swap-out means to move account from memory to swap...there is no change in usage of mem+swap. In other words, when we want to limit the usage of swap without affecting global LRU, mem+swap limit is better than just limiting swap. Accounting target information is stored in swap_cgroup which is per swap entry record. Charge is done as following. map - charge page and memsw. unmap - uncharge page/memsw if not SwapCache. swap-out (__delete_from_swap_cache) - uncharge page - record mem_cgroup information to swap_cgroup. swap-in (do_swap_page) - charged as page and memsw. record in swap_cgroup is cleared. memsw accounting is decremented. swap-free (swap_free()) - if swap entry is freed, memsw is uncharged by PAGE_SIZE. There are people work under never-swap environments and consider swap as something bad. For such people, this mem+swap controller extension is just an overhead. This overhead is avoided by config or boot option. (see Kconfig. detail is not in this patch.) TODO: - maybe more optimization can be don in swap-in path. (but not very safe.) But we just do simple accounting at this stage. [nishimura@mxp.nes.nec.co.jp: make resize limit hold mutex] [hugh@veritas.com: memswap controller core swapcache fixes] Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Daisuke Nishimura Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index ba5189e322e6..1358012ffa73 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2431,7 +2431,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - if (mem_cgroup_try_charge(mm, GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) { + if (mem_cgroup_try_charge_swapin(mm, page, + GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) { ret = VM_FAULT_OOM; unlock_page(page); goto out; @@ -2449,8 +2450,20 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_nomap; } - /* The page isn't present yet, go ahead with the fault. */ + /* + * The page isn't present yet, go ahead with the fault. + * + * Be careful about the sequence of operations here. + * To get its accounting right, reuse_swap_page() must be called + * while the page is counted on swap but not yet in mapcount i.e. + * before page_add_anon_rmap() and swap_free(); try_to_free_swap() + * must be called after the swap_free(), or it will never succeed. + * And mem_cgroup_commit_charge_swapin(), which uses the swp_entry + * in page->private, must be called before reuse_swap_page(), + * which may delete_from_swap_cache(). + */ + mem_cgroup_commit_charge_swapin(page, ptr); inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); if (write_access && reuse_swap_page(page)) { @@ -2461,7 +2474,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); - mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) -- cgit v1.2.1 From 2c26fdd70c3094fa3e84caf9ef434911933d5477 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:08:10 -0800 Subject: memcg: revert gfp mask fix My patch, memcg-fix-gfp_mask-of-callers-of-charge.patch changed gfp_mask of callers of charge to be GFP_HIGHUSER_MOVABLE for showing what will happen at memory reclaim. But in recent discussion, it's NACKed because it sounds ugly. This patch is for reverting it and add some clean up to gfp_mask of callers of charge. No behavior change but need review before generating HUNK in deep queue. This patch also adds explanation to meaning of gfp_mask passed to charge functions in memcontrol.h. Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Hugh Dickins Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 1358012ffa73..e5bfbe6b594c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2000,7 +2000,7 @@ gotten: cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); - if (mem_cgroup_newpage_charge(new_page, mm, GFP_HIGHUSER_MOVABLE)) + if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; /* @@ -2431,8 +2431,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - if (mem_cgroup_try_charge_swapin(mm, page, - GFP_HIGHUSER_MOVABLE, &ptr) == -ENOMEM) { + if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; unlock_page(page); goto out; @@ -2524,7 +2523,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto oom; __SetPageUptodate(page); - if (mem_cgroup_newpage_charge(page, mm, GFP_HIGHUSER_MOVABLE)) + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -2615,8 +2614,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } - if (mem_cgroup_newpage_charge(page, - mm, GFP_HIGHUSER_MOVABLE)) { + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { ret = VM_FAULT_OOM; page_cache_release(page); goto out; -- cgit v1.2.1 From 03f3c433648a97ae7c86be789edba67690f6ea60 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 7 Jan 2009 18:08:31 -0800 Subject: memcg: fix swap accounting leak Fix swapin charge operation of memcg. Now, memcg has hooks to swap-out operation and checks SwapCache is really unused or not. That check depends on contents of struct page. I.e. If PageAnon(page) && page_mapped(page), the page is recoginized as still-in-use. Now, reuse_swap_page() calles delete_from_swap_cache() before establishment of any rmap. Then, in followinig sequence (Page fault with WRITE) try_charge() (charge += PAGESIZE) commit_charge() (Check page_cgroup is used or not..) reuse_swap_page() -> delete_from_swapcache() -> mem_cgroup_uncharge_swapcache() (charge -= PAGESIZE) ...... New charge is uncharged soon.... To avoid this, move commit_charge() after page_mapcount() goes up to 1. By this, try_charge() (usage += PAGESIZE) reuse_swap_page() (may usage -= PAGESIZE if PCG_USED is set) commit_charge() (If page_cgroup is not marked as PCG_USED, add new charge.) Accounting will be correct. Changelog (v2) -> (v3) - fixed invalid charge to swp_entry==0. - updated documentation. Changelog (v1) -> (v2) - fixed comment. [nishimura@mxp.nes.nec.co.jp: swap accounting leak doc fix] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Tested-by: Balbir Singh Cc: Hugh Dickins Cc: Daisuke Nishimura Signed-off-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index e5bfbe6b594c..e009ce870859 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2457,22 +2457,23 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * while the page is counted on swap but not yet in mapcount i.e. * before page_add_anon_rmap() and swap_free(); try_to_free_swap() * must be called after the swap_free(), or it will never succeed. - * And mem_cgroup_commit_charge_swapin(), which uses the swp_entry - * in page->private, must be called before reuse_swap_page(), - * which may delete_from_swap_cache(). + * Because delete_from_swap_page() may be called by reuse_swap_page(), + * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry + * in page->private. In this case, a record in swap_cgroup is silently + * discarded at swap_free(). */ - mem_cgroup_commit_charge_swapin(page, ptr); inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); if (write_access && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); write_access = 0; } - flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + /* It's better to call commit-charge after rmap is established */ + mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) -- cgit v1.2.1 From 95156f0051cba60ec674bbaa5cf7dc74a74c5612 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Jan 2009 13:02:11 +0100 Subject: lockdep, mm: fix might_fault() annotation Some code (nfs/sunrpc) uses socket ops on kernel memory while holding the mmap_sem, this is safe because kernel memory doesn't get paged out, therefore we'll never actually fault, and the might_fault() annotations will generate false positives. Reported-by: "J. Bruce Fields" Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- mm/memory.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index e009ce870859..c2d4c477e5bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3165,6 +3165,15 @@ void print_vma_addr(char *prefix, unsigned long ip) #ifdef CONFIG_PROVE_LOCKING void might_fault(void) { + /* + * Some code (nfs/sunrpc) uses socket ops on kernel memory while + * holding the mmap_sem, this is safe because kernel memory doesn't + * get paged out, therefore we'll never actually fault, and the + * below annotations will generate false positives. + */ + if (segment_eq(get_fs(), KERNEL_DS)) + return; + might_sleep(); /* * it would be nicer only to annotate paths which are not under -- cgit v1.2.1 From a36706131182f5507d1e2cfbf391b0fa8d72203c Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 9 Jan 2009 16:13:09 -0800 Subject: x86 PAT: remove PFNMAP type on track_pfn_vma_new() error Impact: fix (harmless) double-free of memtype entries and avoid warning On track_pfn_vma_new() failure, reset the vm_flags so that there will be no second cleanup happening when upper level routines call unmap_vmas(). This patch fixes part of the bug reported here: http://marc.info/?l=linux-kernel&m=123108883716357&w=2 Specifically the error message: X:5010 freeing invalid memtype d0000000-d0101000 Is due to multiple frees on error path, will not happen with the patch below. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- mm/memory.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index c2d4c477e5bb..d3ee2ea5615c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1672,8 +1672,14 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size)); - if (err) + if (err) { + /* + * To indicate that track_pfn related cleanup is not + * needed from higher level routine calling unmap_vmas + */ + vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); return -EINVAL; + } BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; -- cgit v1.2.1 From e4b866ed197cef9989348e0479fed8d864ea465b Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 9 Jan 2009 16:13:11 -0800 Subject: x86 PAT: change track_pfn_vma_new to take pgprot_t pointer param Impact: cleanup Change the protection parameter for track_pfn_vma_new() into a pgprot_t pointer. Subsequent patch changes the x86 PAT handling to return a compatible memtype in pgprot_t, if what was requested cannot be allowed due to conflicts. No fuctionality change in this patch. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- mm/memory.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index d3ee2ea5615c..22bfa7a47a0b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1511,6 +1511,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { int ret; + pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1525,10 +1526,10 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE)) + if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) return -EINVAL; - ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot); + ret = insert_pfn(vma, addr, pfn, pgprot); if (ret) untrack_pfn_vma(vma, pfn, PAGE_SIZE); @@ -1671,7 +1672,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size)); + err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); if (err) { /* * To indicate that track_pfn related cleanup is not -- cgit v1.2.1 From ab92661d5d9514647346047f30f67a7f35ffea67 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Feb 2009 15:12:16 -0800 Subject: do_wp_page: fix regression with execute in place Fix do_wp_page for VM_MIXEDMAP mappings. In the case where pfn_valid returns 0 for a pfn at the beginning of do_wp_page and the mapping is not shared writable, the code branches to label `gotten:' with old_page == NULL. In case the vma is locked (vma->vm_flags & VM_LOCKED), lock_page, clear_page_mlock, and unlock_page try to access the old_page. This patch checks whether old_page is valid before it is dereferenced. The regression was introduced by "mlock: mlocked pages are unevictable" (commit b291f000393f5a0b679012b39d79fbc85c018233). Signed-off-by: Carsten Otte Cc: Nick Piggin Cc: Heiko Carstens Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 22bfa7a47a0b..baa999e87cd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1999,7 +1999,7 @@ gotten: * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */ - if (vma->vm_flags & VM_LOCKED) { + if ((vma->vm_flags & VM_LOCKED) && old_page) { lock_page(old_page); /* for LRU manipulation */ clear_page_mlock(old_page); unlock_page(old_page); -- cgit v1.2.1