author     Paul Mackerras <paulus@samba.org>   2007-12-21 22:21:08 +1100
committer  Paul Mackerras <paulus@samba.org>   2007-12-21 22:21:08 +1100
commit     c2a7dcad9f0d92d7a96e735abb8bec7b9c621536 (patch)
tree       bf9b20fdd5ab07e5b0e4e0b95c6a3dbab1005cb9 /mm
parent     373a6da165ac3012a74fd072da340eabca55d031 (diff)
parent     ea67db4cdbbf7f4e74150e71da0984e25121f500 (diff)
Merge branch 'linux-2.6'
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig      | 15
-rw-r--r--  mm/filemap.c    | 12
-rw-r--r--  mm/hugetlb.c    | 70
-rw-r--r--  mm/page_alloc.c | 11
-rw-r--r--  mm/slub.c       |  3
-rw-r--r--  mm/sparse.c     | 20
6 files changed, 104 insertions, 27 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c070ec0c15b..9ef97417a0b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -112,18 +112,17 @@ config SPARSEMEM_EXTREME
 	def_bool y
 	depends on SPARSEMEM && !SPARSEMEM_STATIC
 
-#
-# SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page
-# and page_to_pfn. The most efficient option where kernel virtual space is
-# not under pressure.
-#
 config SPARSEMEM_VMEMMAP_ENABLE
 	def_bool n
 
 config SPARSEMEM_VMEMMAP
-	bool
-	depends on SPARSEMEM
-	default y if (SPARSEMEM_VMEMMAP_ENABLE)
+	bool "Sparse Memory virtual memmap"
+	depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
+	default y
+	help
+	 SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
+	 pfn_to_page and page_to_pfn operations. This is the most
+	 efficient option when sufficient kernel resources are available.
 
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
diff --git a/mm/filemap.c b/mm/filemap.c
index 188cf5fd3e8..f4d0cded0e1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,6 +124,18 @@ void __remove_from_page_cache(struct page *page)
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	BUG_ON(page_mapped(page));
+
+	/*
+	 * Some filesystems seem to re-dirty the page even after
+	 * the VM has canceled the dirty bit (eg ext3 journaling).
+	 *
+	 * Fix it up by doing a final dirty accounting check after
+	 * having removed the page entirely.
+	 */
+	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+		dec_zone_page_state(page, NR_FILE_DIRTY);
+		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+	}
 }
 
 void remove_from_page_cache(struct page *page)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6121b57bbe9..7224a4f0710 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,7 +31,7 @@ static unsigned int free_huge_pages_node[MAX_NUMNODES];
 static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
-int hugetlb_dynamic_pool;
+unsigned long nr_overcommit_huge_pages;
 static int hugetlb_next_nid;
 
 /*
@@ -227,22 +227,58 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 						unsigned long address)
 {
 	struct page *page;
+	unsigned int nid;
 
-	/* Check if the dynamic pool is enabled */
-	if (!hugetlb_dynamic_pool)
+	/*
+	 * Assume we will successfully allocate the surplus page to
+	 * prevent racing processes from causing the surplus to exceed
+	 * overcommit
+	 *
+	 * This however introduces a different race, where a process B
+	 * tries to grow the static hugepage pool while alloc_pages() is
+	 * called by process A. B will only examine the per-node
+	 * counters in determining if surplus huge pages can be
+	 * converted to normal huge pages in adjust_pool_surplus(). A
+	 * won't be able to increment the per-node counter, until the
+	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
+	 * no more huge pages can be converted from surplus to normal
+	 * state (and doesn't try to convert again). Thus, we have a
+	 * case where a surplus huge page exists, the pool is grown, and
+	 * the surplus huge page still exists after, even though it
+	 * should just have been converted to a normal huge page. This
+	 * does not leak memory, though, as the hugepage will be freed
+	 * once it is out of use. It also does not allow the counters to
+	 * go out of whack in adjust_pool_surplus() as we don't modify
+	 * the node values until we've gotten the hugepage and only the
+	 * per-node value is checked there.
+	 */
+	spin_lock(&hugetlb_lock);
+	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
+		spin_unlock(&hugetlb_lock);
 		return NULL;
+	} else {
+		nr_huge_pages++;
+		surplus_huge_pages++;
+	}
+	spin_unlock(&hugetlb_lock);
 
 	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
 					HUGETLB_PAGE_ORDER);
+
+	spin_lock(&hugetlb_lock);
 	if (page) {
+		nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
-		spin_lock(&hugetlb_lock);
-		nr_huge_pages++;
-		nr_huge_pages_node[page_to_nid(page)]++;
-		surplus_huge_pages++;
-		surplus_huge_pages_node[page_to_nid(page)]++;
-		spin_unlock(&hugetlb_lock);
+		/*
+		 * We incremented the global counters already
+		 */
+		nr_huge_pages_node[nid]++;
+		surplus_huge_pages_node[nid]++;
+	} else {
+		nr_huge_pages--;
+		surplus_huge_pages--;
 	}
+	spin_unlock(&hugetlb_lock);
 
 	return page;
 }
@@ -481,6 +517,12 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	 * Increase the pool size
 	 * First take pages out of surplus state. Then make up the
 	 * remaining difference by allocating fresh huge pages.
+	 *
+	 * We might race with alloc_buddy_huge_page() here and be unable
+	 * to convert a surplus huge page to a normal huge page. That is
+	 * not critical, though, it just means the overall size of the
+	 * pool might be one hugepage larger than it needs to be, but
+	 * within all the constraints specified by the sysctls.
 	 */
 	spin_lock(&hugetlb_lock);
 	while (surplus_huge_pages && count > persistent_huge_pages) {
@@ -509,6 +551,14 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	 * to keep enough around to satisfy reservations). Then place
 	 * pages into surplus state as needed so the pool will shrink
 	 * to the desired size as pages become free.
+	 *
+	 * By placing pages into the surplus state independent of the
+	 * overcommit value, we are allowing the surplus pool size to
+	 * exceed overcommit. There are few sane options here. Since
+	 * alloc_buddy_huge_page() is checking the global counter,
+	 * though, we'll note that we're not allowed to exceed surplus
+	 * and won't grow the pool anywhere else. Not until one of the
+	 * sysctls are changed, or the surplus pages go out of use.
 	 */
 	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
 	min_count = max(count, min_count);
@@ -907,7 +957,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-		if (!pte || pte_none(*pte)) {
+		if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
 			int ret;
 
 			spin_unlock(&mm->page_table_lock);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b5a58d476c1..d73bfad1c32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -847,8 +847,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		struct page *page = __rmqueue(zone, order, migratetype);
 		if (unlikely(page == NULL))
 			break;
+
+		/*
+		 * Split buddy pages returned by expand() are received here
+		 * in physical page order. The page is added to the callers and
+		 * list and the list head then moves forward. From the callers
+		 * perspective, the linked list is ordered by page number in
+		 * some conditions. This is useful for IO devices that can
+		 * merge IO requests if the physical pages are ordered
+		 * properly.
+		 */
 		list_add(&page->lru, list);
 		set_page_private(page, migratetype);
+		list = &page->lru;
 	}
 	spin_unlock(&zone->lock);
 	return i;
diff --git a/mm/slub.c b/mm/slub.c
index 9c1d9f3b364..b9f37cb0f2e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1468,9 +1468,6 @@ static void *__slab_alloc(struct kmem_cache *s,
 	void **object;
 	struct page *new;
 
-	/* We handle __GFP_ZERO in the caller */
-	gfpflags &= ~__GFP_ZERO;
-
 	if (!c->page)
 		goto new_slab;
 
diff --git a/mm/sparse.c b/mm/sparse.c
index e06f514fe04..a2183cb5d52 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -83,6 +83,8 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
 		return -EEXIST;
 
 	section = sparse_index_alloc(nid);
+	if (!section)
+		return -ENOMEM;
 	/*
 	 * This lock keeps two different sections from
 	 * reallocating for the same index
@@ -389,9 +391,17 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 	 * no locking for this, because it does its own
 	 * plus, it does a kmalloc
 	 */
-	sparse_index_init(section_nr, pgdat->node_id);
+	ret = sparse_index_init(section_nr, pgdat->node_id);
+	if (ret < 0 && ret != -EEXIST)
+		return ret;
 	memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
+	if (!memmap)
+		return -ENOMEM;
 	usemap = __kmalloc_section_usemap();
+	if (!usemap) {
+		__kfree_section_memmap(memmap, nr_pages);
+		return -ENOMEM;
+	}
 
 	pgdat_resize_lock(pgdat, &flags);
 
@@ -401,18 +411,16 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 		goto out;
 	}
 
-	if (!usemap) {
-		ret = -ENOMEM;
-		goto out;
-	}
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 
 	ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
 
 out:
 	pgdat_resize_unlock(pgdat, &flags);
-	if (ret <= 0)
+	if (ret <= 0) {
+		kfree(usemap);
 		__kfree_section_memmap(memmap, nr_pages);
+	}
 	return ret;
 }
 #endif
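The heart of the hugetlb.c change is the accounting order in alloc_buddy_huge_page(): the surplus slot is reserved under hugetlb_lock before alloc_pages() runs, so racing allocators can never push surplus_huge_pages past nr_overcommit_huge_pages, and the counters are rolled back if the allocation fails. Below is a minimal userspace sketch of that reserve-then-roll-back pattern, not kernel code; pool_lock, overcommit_limit, fake_alloc() and alloc_surplus_page() are made-up stand-ins.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static long nr_pages, surplus, overcommit_limit = 4;

/* Stand-in for alloc_pages(): succeeds a few times, then runs dry. */
static void *fake_alloc(void)
{
	static int budget = 3;
	return budget-- > 0 ? malloc(4096) : NULL;
}

static void *alloc_surplus_page(void)
{
	void *page;

	/*
	 * Reserve the slot before allocating, so concurrent callers
	 * cannot collectively exceed the overcommit limit.
	 */
	pthread_mutex_lock(&pool_lock);
	if (surplus >= overcommit_limit) {
		pthread_mutex_unlock(&pool_lock);
		return NULL;
	}
	nr_pages++;
	surplus++;
	pthread_mutex_unlock(&pool_lock);

	page = fake_alloc();

	pthread_mutex_lock(&pool_lock);
	if (!page) {
		/* Allocation failed: undo the optimistic reservation. */
		nr_pages--;
		surplus--;
	}
	pthread_mutex_unlock(&pool_lock);
	return page;
}

int main(void)
{
	for (int i = 0; i < 6; i++) {
		void *p = alloc_surplus_page();
		printf("attempt %d -> %s (nr=%ld surplus=%ld)\n",
		       i, p ? "page" : "NULL", nr_pages, surplus);
		/* A real pool would also drop the counters when pages are freed. */
		free(p);
	}
	return 0;
}

The same reasoning explains the kernel comment: reserving first trades a rare, harmless over-count (fixed up on the failure path) for the guarantee that concurrent allocations cannot blow past the limit.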
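The rmqueue_bulk() comment rests on one idiom: after each list_add() the local list pointer is advanced to the page just added, so the next page is linked behind it and the batch keeps the order __rmqueue() produced instead of being reversed. The sketch below reproduces that idiom with a minimal list_head in the style of <linux/list.h>; the toy struct page carries only a pfn and is not the kernel's.

#include <stddef.h>
#include <stdio.h>

/* Minimal circular doubly-linked list in the style of <linux/list.h>. */
struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *head)
{
	head->next = head->prev = head;
}

/* Insert 'entry' right after 'head', like the kernel's list_add(). */
static void list_add(struct list_head *entry, struct list_head *head)
{
	entry->next = head->next;
	entry->prev = head;
	head->next->prev = entry;
	head->next = entry;
}

struct page {
	int pfn;
	struct list_head lru;
};

int main(void)
{
	struct page pages[4] = { { 10 }, { 11 }, { 12 }, { 13 } };
	struct list_head out;
	struct list_head *list = &out;

	INIT_LIST_HEAD(&out);

	for (int i = 0; i < 4; i++) {
		/*
		 * list_add() at a fixed head would reverse the order;
		 * advancing 'list' to the node just added keeps the
		 * pages in ascending pfn order, which is what lets
		 * I/O requests on physically contiguous pages merge.
		 */
		list_add(&pages[i].lru, list);
		list = &pages[i].lru;
	}

	for (struct list_head *pos = out.next; pos != &out; pos = pos->next) {
		struct page *page = (struct page *)
			((char *)pos - offsetof(struct page, lru));
		printf("pfn %d\n", page->pfn);
	}
	return 0;
}

Run, this prints pfn 10 through 13 in ascending order; dropping the "list = &pages[i].lru;" line prints them reversed, which is exactly the behaviour the one-line kernel change avoids.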
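The sparse.c hunks are straightforward error-path hardening: each allocation is checked, and when a later step fails, everything allocated earlier in the function is released before returning. A toy illustration of that unwind rule, assuming made-up names setup_section(), map_bytes and use_bytes for the memmap/usemap pair:

#include <errno.h>
#include <stdlib.h>
#include <string.h>

/*
 * Toy version of the sparse_add_one_section() error handling: two
 * allocations, and every failure path frees whatever already succeeded.
 */
static int setup_section(size_t map_bytes, size_t use_bytes)
{
	void *memmap, *usemap;

	memmap = malloc(map_bytes);
	if (!memmap)
		return -ENOMEM;		/* nothing to unwind yet */

	usemap = malloc(use_bytes);
	if (!usemap) {
		free(memmap);		/* second step failed: drop the first */
		return -ENOMEM;
	}

	memset(memmap, 0, map_bytes);
	memset(usemap, 0, use_bytes);

	/* A real caller would hand both buffers over to the section here. */
	free(usemap);
	free(memmap);
	return 0;
}

int main(void)
{
	return setup_section(4096, 64) ? EXIT_FAILURE : EXIT_SUCCESS;
}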