aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@samba.org>2007-12-21 22:21:08 +1100
committerPaul Mackerras <paulus@samba.org>2007-12-21 22:21:08 +1100
commitc2a7dcad9f0d92d7a96e735abb8bec7b9c621536 (patch)
treebf9b20fdd5ab07e5b0e4e0b95c6a3dbab1005cb9 /mm
parent373a6da165ac3012a74fd072da340eabca55d031 (diff)
parentea67db4cdbbf7f4e74150e71da0984e25121f500 (diff)
Merge branch 'linux-2.6'
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig15
-rw-r--r--mm/filemap.c12
-rw-r--r--mm/hugetlb.c70
-rw-r--r--mm/page_alloc.c11
-rw-r--r--mm/slub.c3
-rw-r--r--mm/sparse.c20
6 files changed, 104 insertions, 27 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c070ec0c15b..9ef97417a0b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -112,18 +112,17 @@ config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
-#
-# SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page
-# and page_to_pfn. The most efficient option where kernel virtual space is
-# not under pressure.
-#
config SPARSEMEM_VMEMMAP_ENABLE
def_bool n
config SPARSEMEM_VMEMMAP
- bool
- depends on SPARSEMEM
- default y if (SPARSEMEM_VMEMMAP_ENABLE)
+ bool "Sparse Memory virtual memmap"
+ depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
+ default y
+ help
+ SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
+ pfn_to_page and page_to_pfn operations. This is the most
+ efficient option when sufficient kernel resources are available.
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
diff --git a/mm/filemap.c b/mm/filemap.c
index 188cf5fd3e8..f4d0cded0e1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,6 +124,18 @@ void __remove_from_page_cache(struct page *page)
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
BUG_ON(page_mapped(page));
+
+ /*
+ * Some filesystems seem to re-dirty the page even after
+ * the VM has canceled the dirty bit (eg ext3 journaling).
+ *
+ * Fix it up by doing a final dirty accounting check after
+ * having removed the page entirely.
+ */
+ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ }
}
void remove_from_page_cache(struct page *page)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6121b57bbe9..7224a4f0710 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,7 +31,7 @@ static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
-int hugetlb_dynamic_pool;
+unsigned long nr_overcommit_huge_pages;
static int hugetlb_next_nid;
/*
@@ -227,22 +227,58 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
unsigned long address)
{
struct page *page;
+ unsigned int nid;
- /* Check if the dynamic pool is enabled */
- if (!hugetlb_dynamic_pool)
+ /*
+ * Assume we will successfully allocate the surplus page to
+ * prevent racing processes from causing the surplus to exceed
+ * overcommit
+ *
+ * This however introduces a different race, where a process B
+ * tries to grow the static hugepage pool while alloc_pages() is
+ * called by process A. B will only examine the per-node
+ * counters in determining if surplus huge pages can be
+ * converted to normal huge pages in adjust_pool_surplus(). A
+ * won't be able to increment the per-node counter, until the
+ * lock is dropped by B, but B doesn't drop hugetlb_lock until
+ * no more huge pages can be converted from surplus to normal
+ * state (and doesn't try to convert again). Thus, we have a
+ * case where a surplus huge page exists, the pool is grown, and
+ * the surplus huge page still exists after, even though it
+ * should just have been converted to a normal huge page. This
+ * does not leak memory, though, as the hugepage will be freed
+ * once it is out of use. It also does not allow the counters to
+ * go out of whack in adjust_pool_surplus() as we don't modify
+ * the node values until we've gotten the hugepage and only the
+ * per-node value is checked there.
+ */
+ spin_lock(&hugetlb_lock);
+ if (surplus_huge_pages >= nr_overcommit_huge_pages) {
+ spin_unlock(&hugetlb_lock);
return NULL;
+ } else {
+ nr_huge_pages++;
+ surplus_huge_pages++;
+ }
+ spin_unlock(&hugetlb_lock);
page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
+
+ spin_lock(&hugetlb_lock);
if (page) {
+ nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
- spin_lock(&hugetlb_lock);
- nr_huge_pages++;
- nr_huge_pages_node[page_to_nid(page)]++;
- surplus_huge_pages++;
- surplus_huge_pages_node[page_to_nid(page)]++;
- spin_unlock(&hugetlb_lock);
+ /*
+ * We incremented the global counters already
+ */
+ nr_huge_pages_node[nid]++;
+ surplus_huge_pages_node[nid]++;
+ } else {
+ nr_huge_pages--;
+ surplus_huge_pages--;
}
+ spin_unlock(&hugetlb_lock);
return page;
}
@@ -481,6 +517,12 @@ static unsigned long set_max_huge_pages(unsigned long count)
* Increase the pool size
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
+ *
+ * We might race with alloc_buddy_huge_page() here and be unable
+ * to convert a surplus huge page to a normal huge page. That is
+ * not critical, though, it just means the overall size of the
+ * pool might be one hugepage larger than it needs to be, but
+ * within all the constraints specified by the sysctls.
*/
spin_lock(&hugetlb_lock);
while (surplus_huge_pages && count > persistent_huge_pages) {
@@ -509,6 +551,14 @@ static unsigned long set_max_huge_pages(unsigned long count)
* to keep enough around to satisfy reservations). Then place
* pages into surplus state as needed so the pool will shrink
* to the desired size as pages become free.
+ *
+ * By placing pages into the surplus state independent of the
+ * overcommit value, we are allowing the surplus pool size to
+ * exceed overcommit. There are few sane options here. Since
+ * alloc_buddy_huge_page() is checking the global counter,
+ * though, we'll note that we're not allowed to exceed surplus
+ * and won't grow the pool anywhere else. Not until one of the
+ * sysctls are changed, or the surplus pages go out of use.
*/
min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
min_count = max(count, min_count);
@@ -907,7 +957,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
- if (!pte || pte_none(*pte)) {
+ if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
int ret;
spin_unlock(&mm->page_table_lock);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b5a58d476c1..d73bfad1c32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -847,8 +847,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
struct page *page = __rmqueue(zone, order, migratetype);
if (unlikely(page == NULL))
break;
+
+ /*
+ * Split buddy pages returned by expand() are received here
+ * in physical page order. The page is added to the callers and
+ * list and the list head then moves forward. From the callers
+ * perspective, the linked list is ordered by page number in
+ * some conditions. This is useful for IO devices that can
+ * merge IO requests if the physical pages are ordered
+ * properly.
+ */
list_add(&page->lru, list);
set_page_private(page, migratetype);
+ list = &page->lru;
}
spin_unlock(&zone->lock);
return i;
diff --git a/mm/slub.c b/mm/slub.c
index 9c1d9f3b364..b9f37cb0f2e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1468,9 +1468,6 @@ static void *__slab_alloc(struct kmem_cache *s,
void **object;
struct page *new;
- /* We handle __GFP_ZERO in the caller */
- gfpflags &= ~__GFP_ZERO;
-
if (!c->page)
goto new_slab;
diff --git a/mm/sparse.c b/mm/sparse.c
index e06f514fe04..a2183cb5d52 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -83,6 +83,8 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
return -EEXIST;
section = sparse_index_alloc(nid);
+ if (!section)
+ return -ENOMEM;
/*
* This lock keeps two different sections from
* reallocating for the same index
@@ -389,9 +391,17 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
* no locking for this, because it does its own
* plus, it does a kmalloc
*/
- sparse_index_init(section_nr, pgdat->node_id);
+ ret = sparse_index_init(section_nr, pgdat->node_id);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
+ if (!memmap)
+ return -ENOMEM;
usemap = __kmalloc_section_usemap();
+ if (!usemap) {
+ __kfree_section_memmap(memmap, nr_pages);
+ return -ENOMEM;
+ }
pgdat_resize_lock(pgdat, &flags);
@@ -401,18 +411,16 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
goto out;
}
- if (!usemap) {
- ret = -ENOMEM;
- goto out;
- }
ms->section_mem_map |= SECTION_MARKED_PRESENT;
ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
out:
pgdat_resize_unlock(pgdat, &flags);
- if (ret <= 0)
+ if (ret <= 0) {
+ kfree(usemap);
__kfree_section_memmap(memmap, nr_pages);
+ }
return ret;
}
#endif