From 72fad7139b6829f71d7f41f39eb30da5760d90a8 Mon Sep 17 00:00:00 2001
From: Adam Litke
Date: Mon, 10 Dec 2007 15:49:28 -0800
Subject: hugetlb: handle write-protection faults in follow_hugetlb_page

The follow_hugetlb_page() fix I posted (merged as git commit
5b23dbe8173c212d6a326e35347b038705603d39) missed one case.  If the pte is
present, but not writable and write access is requested by the caller to
get_user_pages(), the code will do the wrong thing.  Rather than calling
hugetlb_fault to make the pte writable, it notes the presence of the pte
and continues.

This simple one-liner makes sure we also fault on the pte for this case.

Please apply.

Signed-off-by: Adam Litke
Acked-by: Dave Kleikamp
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6121b57bbe9..6f978218c2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -907,7 +907,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

-		if (!pte || pte_none(*pte)) {
+		if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
 			int ret;

 			spin_unlock(&mm->page_table_lock);
--
cgit v1.2.3

From a5ee6daa525c04079baee6f393c0b2dab3f61253 Mon Sep 17 00:00:00 2001
From: Geoff Levand
Date: Mon, 17 Dec 2007 16:19:53 -0800
Subject: sparsemem: make SPARSEMEM_VMEMMAP selectable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SPARSEMEM_VMEMMAP needs to be a selectable config option to support
building the kernel both with and without sparsemem vmemmap support.  This
selection is desirable for platforms which could be configured one way for
platform specific builds and the other for multi-platform builds.

Signed-off-by: Miguel Botón
Signed-off-by: Geoff Levand
Acked-by: Yasunori Goto
Cc: Christoph Lameter
Cc: Andy Whitcroft
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/Kconfig | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index c070ec0c15b..9ef97417a0b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -112,18 +112,17 @@ config SPARSEMEM_EXTREME
 	def_bool y
 	depends on SPARSEMEM && !SPARSEMEM_STATIC

-#
-# SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page
-# and page_to_pfn.  The most efficient option where kernel virtual space is
-# not under pressure.
-#
 config SPARSEMEM_VMEMMAP_ENABLE
 	def_bool n

 config SPARSEMEM_VMEMMAP
-	bool
-	depends on SPARSEMEM
-	default y if (SPARSEMEM_VMEMMAP_ENABLE)
+	bool "Sparse Memory virtual memmap"
+	depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
+	default y
+	help
+	 SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
+	 pfn_to_page and page_to_pfn operations.  This is the most
+	 efficient option when sufficient kernel resources are available.

 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
--
cgit v1.2.3

From af0cd5a7c3cded50c25e98acd94912d17a0eb914 Mon Sep 17 00:00:00 2001
From: WANG Cong
Date: Mon, 17 Dec 2007 16:19:58 -0800
Subject: mm/sparse.c: check the return value of sparse_index_alloc()

Since sparse_index_alloc() can return NULL on memory allocation failure,
we must deal with the failure condition when calling it.
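Sketched, the calling convention this leads to looks roughly like the
following (illustrative only; the -EEXIST handling comes from a follow-up
patch later in this series):

	ret = sparse_index_init(section_nr, pgdat->node_id);
	if (ret < 0 && ret != -EEXIST)
		return ret;	/* -ENOMEM propagated from sparse_index_alloc() */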
Signed-off-by: WANG Cong
Cc: Christoph Lameter
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/sparse.c b/mm/sparse.c
index e06f514fe04..d245e59048a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -83,6 +83,8 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
 		return -EEXIST;

 	section = sparse_index_alloc(nid);
+	if (!section)
+		return -ENOMEM;
 	/*
 	 * This lock keeps two different sections from
 	 * reallocating for the same index
--
cgit v1.2.3

From bbd0682596f7a434467ee551fee18d5f0b818539 Mon Sep 17 00:00:00 2001
From: WANG Cong
Date: Mon, 17 Dec 2007 16:19:59 -0800
Subject: mm/sparse.c: improve the error handling for sparse_add_one_section()

Improve the error handling for mm/sparse.c::sparse_add_one_section().  And
I see no reason to defer the 'usemap' check until the 'pgdat_resize_lock'
is held.

[geoffrey.levand@am.sony.com: sparse_index_init() returns -EEXIST]
Cc: Christoph Lameter
Acked-by: Dave Hansen
Cc: Rik van Riel
Acked-by: Yasunori Goto
Cc: Andy Whitcroft
Signed-off-by: WANG Cong
Signed-off-by: Geoff Levand
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/sparse.c b/mm/sparse.c
index d245e59048a..a2183cb5d52 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -391,9 +391,17 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 	 * no locking for this, because it does its own
 	 * plus, it does a kmalloc
 	 */
-	sparse_index_init(section_nr, pgdat->node_id);
+	ret = sparse_index_init(section_nr, pgdat->node_id);
+	if (ret < 0 && ret != -EEXIST)
+		return ret;
 	memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
+	if (!memmap)
+		return -ENOMEM;
 	usemap = __kmalloc_section_usemap();
+	if (!usemap) {
+		__kfree_section_memmap(memmap, nr_pages);
+		return -ENOMEM;
+	}

 	pgdat_resize_lock(pgdat, &flags);

@@ -403,18 +411,16 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 		goto out;
 	}

-	if (!usemap) {
-		ret = -ENOMEM;
-		goto out;
-	}
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;

 	ret = sparse_init_one_section(ms, section_nr, memmap, usemap);

 out:
 	pgdat_resize_unlock(pgdat, &flags);
-	if (ret <= 0)
+	if (ret <= 0) {
+		kfree(usemap);
 		__kfree_section_memmap(memmap, nr_pages);
+	}
 	return ret;
 }
 #endif
--
cgit v1.2.3

From 81eabcbe0b991ddef5216f30ae91c4b226d54b6d Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Mon, 17 Dec 2007 16:20:05 -0800
Subject: mm: fix page allocation for larger I/O segments

In some cases the IO subsystem is able to merge requests if the pages are
adjacent in physical memory.  This was achieved in the allocator by having
expand() return pages in physically contiguous order in situations where a
large buddy was split.  However, list-based anti-fragmentation changed the
order pages were returned in to avoid searching in buffered_rmqueue() for
a page of the appropriate migrate type.

This patch restores the behaviour of rmqueue_bulk(), preserving the
physical order of pages returned by the allocator without incurring
increased search costs for anti-fragmentation.
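The trick, sketched below (simplified from the hunk that follows, not
verbatim), is to advance the insertion point to the page just linked in,
so each page handed back by expand() is appended behind the previous one
and the list stays in ascending physical order:

	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order, migratetype);
		if (unlikely(page == NULL))
			break;
		list_add(&page->lru, list);	/* insert right after 'list' */
		set_page_private(page, migratetype);
		list = &page->lru;		/* next page goes behind this one */
	}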
Signed-off-by: Mel Gorman
Cc: James Bottomley
Cc: Jens Axboe
Cc: Mark Lord
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b5a58d476c1..d73bfad1c32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -847,8 +847,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		struct page *page = __rmqueue(zone, order, migratetype);
 		if (unlikely(page == NULL))
 			break;
+
+		/*
+		 * Split buddy pages returned by expand() are received here
+		 * in physical page order. The page is added to the caller's
+		 * list and the list head then moves forward. From the caller's
+		 * perspective, the linked list is ordered by page number in
+		 * some conditions. This is useful for IO devices that can
+		 * merge IO requests if the physical pages are ordered
+		 * properly.
+		 */
 		list_add(&page->lru, list);
 		set_page_private(page, migratetype);
+		list = &page->lru;
 	}
 	spin_unlock(&zone->lock);
 	return i;
--
cgit v1.2.3

From d1c3fb1f8f29c41b0d098d7cfb3c32939043631f Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan
Date: Mon, 17 Dec 2007 16:20:12 -0800
Subject: hugetlb: introduce nr_overcommit_hugepages sysctl

While examining the code to support /proc/sys/vm/hugetlb_dynamic_pool, I
became convinced that having a boolean sysctl was insufficient:

1) To support per-node control of hugepages, I have previously submitted
   patches to add a sysfs attribute related to nr_hugepages.  However, with
   a boolean global value and per-mount quota enforcement constraining the
   dynamic pool, adding corresponding control of the dynamic pool on a
   per-node basis seems inconsistent to me.

2) Administration of the hugetlb dynamic pool with multiple hugetlbfs mount
   points is, arguably, more arduous than it needs to be.  Each quota would
   need to be set separately, and the sum would need to be monitored.

To ease the administration, and to help make the way for per-node control
of the static & dynamic hugepage pool, I added a separate sysctl,
nr_overcommit_hugepages.  This value serves as a high watermark for the
overall hugepage pool, while nr_hugepages serves as a low watermark.  The
boolean sysctl can then be removed, as the condition
nr_overcommit_hugepages > 0 indicates the same administrative setting as
hugetlb_dynamic_pool == 1.

Quotas still serve as local enforcement of the size of the pool on a
per-mount basis.

A few caveats:

1) There is a race whereby the global surplus huge page counter is
   incremented before a hugepage has been allocated.  Another process could
   then try to grow the pool, and fail to convert a surplus huge page to a
   normal huge page and instead allocate a fresh huge page.  I believe this
   is benign, as no memory is leaked (the actual pages are still tracked
   correctly) and the counters won't go out of sync.

2) Shrinking the static pool while a surplus is in effect will allow the
   number of surplus huge pages to exceed the overcommit value.  As long as
   this condition holds, however, no more surplus huge pages will be
   allowed on the system until one of the two sysctls is increased
   sufficiently, or the surplus huge pages go out of use and are freed.

Successfully tested on x86_64 with the current libhugetlbfs snapshot,
modified to use the new sysctl.
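Concretely, the allocation-time check added below (sketched here in
simplified form, not verbatim) treats nr_overcommit_hugepages as the
ceiling on surplus pages:

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		/* high watermark reached: refuse to grow the surplus pool */
		spin_unlock(&hugetlb_lock);
		return NULL;
	}
	/* optimistically account the surplus page before allocating it */
	nr_huge_pages++;
	surplus_huge_pages++;
	spin_unlock(&hugetlb_lock);

So, for example, 'echo 10 > /proc/sys/vm/nr_overcommit_hugepages' permits
up to 10 surplus huge pages on top of the static nr_hugepages pool, while
'echo 0' restores the old no-dynamic-pool behaviour.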
Signed-off-by: Nishanth Aravamudan
Acked-by: Adam Litke
Cc: William Lee Irwin III
Cc: Dave Hansen
Cc: David Gibson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 61 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6f978218c2c..3a790651475 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -32,6 +32,7 @@ static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
 int hugetlb_dynamic_pool;
+unsigned long nr_overcommit_huge_pages;
 static int hugetlb_next_nid;

 /*
@@ -227,22 +228,62 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 						unsigned long address)
 {
 	struct page *page;
+	unsigned int nid;

 	/* Check if the dynamic pool is enabled */
 	if (!hugetlb_dynamic_pool)
 		return NULL;

+	/*
+	 * Assume we will successfully allocate the surplus page to
+	 * prevent racing processes from causing the surplus to exceed
+	 * overcommit
+	 *
+	 * This however introduces a different race, where a process B
+	 * tries to grow the static hugepage pool while alloc_pages() is
+	 * called by process A. B will only examine the per-node
+	 * counters in determining if surplus huge pages can be
+	 * converted to normal huge pages in adjust_pool_surplus(). A
+	 * won't be able to increment the per-node counter, until the
+	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
+	 * no more huge pages can be converted from surplus to normal
+	 * state (and doesn't try to convert again). Thus, we have a
+	 * case where a surplus huge page exists, the pool is grown, and
+	 * the surplus huge page still exists after, even though it
+	 * should just have been converted to a normal huge page. This
+	 * does not leak memory, though, as the hugepage will be freed
+	 * once it is out of use. It also does not allow the counters to
+	 * go out of whack in adjust_pool_surplus() as we don't modify
+	 * the node values until we've gotten the hugepage and only the
+	 * per-node value is checked there.
+	 */
+	spin_lock(&hugetlb_lock);
+	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
+		spin_unlock(&hugetlb_lock);
+		return NULL;
+	} else {
+		nr_huge_pages++;
+		surplus_huge_pages++;
+	}
+	spin_unlock(&hugetlb_lock);
+
 	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
 					HUGETLB_PAGE_ORDER);
+
+	spin_lock(&hugetlb_lock);
 	if (page) {
+		nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
-		spin_lock(&hugetlb_lock);
-		nr_huge_pages++;
-		nr_huge_pages_node[page_to_nid(page)]++;
-		surplus_huge_pages++;
-		surplus_huge_pages_node[page_to_nid(page)]++;
-		spin_unlock(&hugetlb_lock);
+		/*
+		 * We incremented the global counters already
+		 */
+		nr_huge_pages_node[nid]++;
+		surplus_huge_pages_node[nid]++;
+	} else {
+		nr_huge_pages--;
+		surplus_huge_pages--;
 	}
+	spin_unlock(&hugetlb_lock);

 	return page;
 }
@@ -481,6 +522,12 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	 * Increase the pool size
 	 * First take pages out of surplus state. Then make up the
 	 * remaining difference by allocating fresh huge pages.
+	 *
+	 * We might race with alloc_buddy_huge_page() here and be unable
+	 * to convert a surplus huge page to a normal huge page. That is
+	 * not critical, though, it just means the overall size of the
+	 * pool might be one hugepage larger than it needs to be, but
+	 * within all the constraints specified by the sysctls.
 	 */
 	spin_lock(&hugetlb_lock);
 	while (surplus_huge_pages && count > persistent_huge_pages) {
@@ -509,6 +556,14 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	 * to keep enough around to satisfy reservations). Then place
 	 * pages into surplus state as needed so the pool will shrink
 	 * to the desired size as pages become free.
+	 *
+	 * By placing pages into the surplus state independent of the
+	 * overcommit value, we are allowing the surplus pool size to
+	 * exceed overcommit. There are few sane options here. Since
+	 * alloc_buddy_huge_page() is checking the global counter,
+	 * though, we'll note that we're not allowed to exceed surplus
+	 * and won't grow the pool anywhere else. Not until one of the
+	 * sysctls are changed, or the surplus pages go out of use.
 	 */
 	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
 	min_count = max(count, min_count);
--
cgit v1.2.3

From 368d2c6358c3c62b3820a8a73f9fe9c8b540cdea Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan
Date: Mon, 17 Dec 2007 16:20:22 -0800
Subject: Revert "hugetlb: Add hugetlb_dynamic_pool sysctl"

This reverts commit 54f9f80d6543fb7b157d3b11e2e7911dc1379790 ("hugetlb:
Add hugetlb_dynamic_pool sysctl")

Given the new sysctl nr_overcommit_hugepages, the boolean dynamic pool
sysctl is not needed, as its semantics can be expressed by 0 in the
overcommit sysctl (no dynamic pool) and non-0 in the overcommit sysctl
(pool enabled).

(Needed in 2.6.24 since it reverts a post-2.6.23 userspace-visible change)

Signed-off-by: Nishanth Aravamudan
Acked-by: Adam Litke
Cc: William Lee Irwin III
Cc: Dave Hansen
Cc: David Gibson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a790651475..7224a4f0710 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,7 +31,6 @@ static unsigned int free_huge_pages_node[MAX_NUMNODES];
 static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
-int hugetlb_dynamic_pool;
 unsigned long nr_overcommit_huge_pages;
 static int hugetlb_next_nid;

@@ -230,10 +229,6 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 	struct page *page;
 	unsigned int nid;

-	/* Check if the dynamic pool is enabled */
-	if (!hugetlb_dynamic_pool)
-		return NULL;
-
 	/*
--
cgit v1.2.3

From 3811dbf67162bd08412f1b0e02e554f353e93bdb Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 17 Dec 2007 16:20:27 -0800
Subject: SLUB: remove useless masking of GFP_ZERO

Remove a recently added useless masking of GFP_ZERO.  GFP_ZERO is already
masked out in new_slab() (See how it calls allocate_slab).  No need to do
it twice.

This reverts the SLUB parts of 7fd272550bd43cc1d7289ef0ab2fa50de137e767.
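For reference, a rough sketch of where __GFP_ZERO is actually honoured in
SLUB at this point (simplified, not the verbatim source): the allocation
entry point zeroes the object after __slab_alloc() returns, and new_slab()
strips the flag before it reaches the page allocator, so masking it again
inside __slab_alloc() buys nothing:

	object = __slab_alloc(s, gfpflags, node, addr, c);

	/* __GFP_ZERO is implemented by the caller, not by __slab_alloc() */
	if (unlikely((gfpflags & __GFP_ZERO) && object))
		memset(object, 0, c->objsize);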
Cc: Matt Mackall
Reviewed-by: Pekka Enberg
Signed-off-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/slub.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 9c1d9f3b364..b9f37cb0f2e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1468,9 +1468,6 @@ static void *__slab_alloc(struct kmem_cache *s,
 	void **object;
 	struct page *new;

-	/* We handle __GFP_ZERO in the caller */
-	gfpflags &= ~__GFP_ZERO;
-
 	if (!c->page)
 		goto new_slab;
--
cgit v1.2.3

From 3a6927906f1b2adf5a31b789322d32eb8559ada0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 19 Dec 2007 14:05:13 -0800
Subject: Do dirty page accounting when removing a page from the page cache

Krzysztof Oledzki noticed a dirty page accounting leak on some of his
machines, causing the machine to eventually lock up when the kernel
decided that there was too much dirty data, but nobody could actually
write anything out to fix it.

The culprit turns out to be filesystems (cough ext3 with data=journal
cough) that re-dirty the page when the "->invalidatepage()" callback is
called.

Fix it up by doing a final dirty page accounting check when we actually
remove the page from the page cache.

This fixes bugzilla entry 9182:

	http://bugzilla.kernel.org/show_bug.cgi?id=9182

Tested-by: Ingo Molnar
Tested-by: Krzysztof Oledzki
Cc: Andrew Morton
Cc: Nick Piggin
Cc: Peter Zijlstra
Signed-off-by: Linus Torvalds
---
 mm/filemap.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 188cf5fd3e8..f4d0cded0e1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,6 +124,18 @@ void __remove_from_page_cache(struct page *page)
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	BUG_ON(page_mapped(page));
+
+	/*
+	 * Some filesystems seem to re-dirty the page even after
+	 * the VM has canceled the dirty bit (eg ext3 journaling).
+	 *
+	 * Fix it up by doing a final dirty accounting check after
+	 * having removed the page entirely.
+	 */
+	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+		dec_zone_page_state(page, NR_FILE_DIRTY);
+		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+	}
 }

 void remove_from_page_cache(struct page *page)
--
cgit v1.2.3
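For orientation, the failing sequence in the last patch above can be
pictured roughly as follows (a simplified sketch of the 2.6.24-era
truncate path, not verbatim code):

	truncate_complete_page(mapping, page)
	  cancel_dirty_page(page, PAGE_CACHE_SIZE)  /* dirty accounting dropped */
	  do_invalidatepage(page, 0)                 /* ext3 data=journal may set
	                                                PageDirty again here */
	  remove_from_page_cache(page)               /* page leaves the cache
	                                                still marked dirty */

Without the final check in __remove_from_page_cache(), nothing ever writes
the page back, so NR_FILE_DIRTY and BDI_RECLAIMABLE stay elevated until
the dirty limits wedge the machine.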