aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig21
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bootmem.c46
-rw-r--r--mm/filemap.c26
-rw-r--r--mm/filemap_xip.c22
-rw-r--r--mm/fremap.c101
-rw-r--r--mm/highmem.c14
-rw-r--r--mm/hugetlb.c214
-rw-r--r--mm/madvise.c13
-rw-r--r--mm/memory.c1307
-rw-r--r--mm/memory_hotplug.c138
-rw-r--r--mm/mempolicy.c471
-rw-r--r--mm/mempool.c8
-rw-r--r--mm/mmap.c121
-rw-r--r--mm/mprotect.c11
-rw-r--r--mm/mremap.c193
-rw-r--r--mm/msync.c80
-rw-r--r--mm/nommu.c25
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page_alloc.c540
-rw-r--r--mm/page_io.c8
-rw-r--r--mm/pdflush.c13
-rw-r--r--mm/readahead.c31
-rw-r--r--mm/rmap.c198
-rw-r--r--mm/shmem.c35
-rw-r--r--mm/slab.c244
-rw-r--r--mm/sparse.c99
-rw-r--r--mm/swap.c14
-rw-r--r--mm/swap_state.c14
-rw-r--r--mm/swapfile.c44
-rw-r--r--mm/thrash.c12
-rw-r--r--mm/tiny-shmem.c5
-rw-r--r--mm/truncate.c17
-rw-r--r--mm/vmalloc.c77
-rw-r--r--mm/vmscan.c62
35 files changed, 2425 insertions, 1803 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 391ffc54d13..21eb51d4da8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -111,3 +111,24 @@ config SPARSEMEM_STATIC
config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# eventually, we can have this option just 'select SPARSEMEM'
+config MEMORY_HOTPLUG
+ bool "Allow for memory hot-add"
+ depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+
+comment "Memory hotplug is currently incompatible with Software Suspend"
+ depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
+#
+config SPLIT_PTLOCK_CPUS
+ int
+ default "4096" if ARM && !CPU_CACHE_VIPT
+ default "4096" if PARISC && !PA20
+ default "4"
diff --git a/mm/Makefile b/mm/Makefile
index 4cd69e3ce42..2fa6d2ca9f2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
-
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8ec4e4c2a17..e8c567177dc 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -61,17 +61,9 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
{
bootmem_data_t *bdata = pgdat->bdata;
unsigned long mapsize = ((end - start)+7)/8;
- static struct pglist_data *pgdat_last;
-
- pgdat->pgdat_next = NULL;
- /* Add new nodes last so that bootmem always starts
- searching in the first nodes, not the last ones */
- if (pgdat_last)
- pgdat_last->pgdat_next = pgdat;
- else {
- pgdat_list = pgdat;
- pgdat_last = pgdat;
- }
+
+ pgdat->pgdat_next = pgdat_list;
+ pgdat_list = pgdat;
mapsize = ALIGN(mapsize, sizeof(long));
bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
@@ -162,10 +154,10 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
*/
static void * __init
__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
- unsigned long align, unsigned long goal)
+ unsigned long align, unsigned long goal, unsigned long limit)
{
unsigned long offset, remaining_size, areasize, preferred;
- unsigned long i, start = 0, incr, eidx;
+ unsigned long i, start = 0, incr, eidx, end_pfn = bdata->node_low_pfn;
void *ret;
if(!size) {
@@ -174,7 +166,14 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
}
BUG_ON(align & (align-1));
- eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+ if (limit && bdata->node_boot_start >= limit)
+ return NULL;
+
+ limit >>=PAGE_SHIFT;
+ if (limit && end_pfn > limit)
+ end_pfn = limit;
+
+ eidx = end_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
offset = 0;
if (align &&
(bdata->node_boot_start & (align - 1UL)) != 0)
@@ -186,11 +185,12 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
* first, then we try to allocate lower pages.
*/
if (goal && (goal >= bdata->node_boot_start) &&
- ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
+ ((goal >> PAGE_SHIFT) < end_pfn)) {
preferred = goal - bdata->node_boot_start;
if (bdata->last_success >= preferred)
- preferred = bdata->last_success;
+ if (!limit || (limit && limit > bdata->last_success))
+ preferred = bdata->last_success;
} else
preferred = 0;
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
if (j + 16 < BITS_PER_LONG)
prefetchw(page + j + 16);
__ClearPageReserved(page + j);
+ set_page_count(page + j, 0);
}
__free_pages(page, order);
i += BITS_PER_LONG;
@@ -390,14 +391,15 @@ unsigned long __init free_all_bootmem (void)
return(free_all_bootmem_core(NODE_DATA(0)));
}
-void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
+void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal,
+ unsigned long limit)
{
pg_data_t *pgdat = pgdat_list;
void *ptr;
for_each_pgdat(pgdat)
if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
- align, goal)))
+ align, goal, limit)))
return(ptr);
/*
@@ -408,14 +410,16 @@ void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned
return NULL;
}
-void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal)
+
+void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
{
void *ptr;
- ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal);
+ ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit);
if (ptr)
return (ptr);
- return __alloc_bootmem(size, align, goal);
+ return __alloc_bootmem_limit(size, align, goal, limit);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index b5346576e58..33a28bfde15 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
*
* ->mmap_sem
* ->i_mmap_lock
- * ->page_table_lock (various places, mainly in mmap.c)
+ * ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_sem
@@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
- * ->page_table_lock (anon_vma_prepare and various)
+ * ->page_table_lock or pte_lock (anon_vma_prepare and various)
*
- * ->page_table_lock
+ * ->page_table_lock or pte_lock
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
@@ -134,7 +134,7 @@ static int sync_page(void *word)
struct address_space *mapping;
struct page *page;
- page = container_of((page_flags_t *)word, struct page, flags);
+ page = container_of((unsigned long *)word, struct page, flags);
/*
* page_mapping() is being called without PG_locked held.
@@ -152,7 +152,7 @@ static int sync_page(void *word)
* in the ->sync_page() methods make essential use of the
* page_mapping(), merely passing the page down to the backing
* device's unplug functions when it's non-NULL, which in turn
- * ignore it for all cases but swap, where only page->private is
+ * ignore it for all cases but swap, where only page_private(page) is
* of interest. When page_mapping() does go NULL, the entire
* call stack gracefully ignores the page and returns.
* -- wli
@@ -377,7 +377,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
* This function does not add the page to the LRU. The caller must do that.
*/
int add_to_page_cache(struct page *page, struct address_space *mapping,
- pgoff_t offset, int gfp_mask)
+ pgoff_t offset, gfp_t gfp_mask)
{
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
@@ -401,7 +401,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
EXPORT_SYMBOL(add_to_page_cache);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
- pgoff_t offset, int gfp_mask)
+ pgoff_t offset, gfp_t gfp_mask)
{
int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
if (ret == 0)
@@ -591,7 +591,7 @@ EXPORT_SYMBOL(find_lock_page);
* memory exhaustion.
*/
struct page *find_or_create_page(struct address_space *mapping,
- unsigned long index, unsigned int gfp_mask)
+ unsigned long index, gfp_t gfp_mask)
{
struct page *page, *cached_page = NULL;
int err;
@@ -683,7 +683,7 @@ struct page *
grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
struct page *page = find_get_page(mapping, index);
- unsigned int gfp_mask;
+ gfp_t gfp_mask;
if (page) {
if (!TestSetPageLocked(page))
@@ -1030,8 +1030,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
desc.error = 0;
do_generic_file_read(filp,ppos,&desc,file_read_actor);
retval += desc.written;
- if (!retval) {
- retval = desc.error;
+ if (desc.error) {
+ retval = retval ?: desc.error;
break;
}
}
@@ -1520,7 +1520,7 @@ repeat:
page_cache_release(page);
return err;
}
- } else {
+ } else if (vma->vm_flags & VM_NONLINEAR) {
/* No page was found just because we can't read it in now (being
* here implies nonblock != 0), but the page may exist, so set
* the PTE to fault it in later. */
@@ -1537,6 +1537,7 @@ repeat:
return 0;
}
+EXPORT_SYMBOL(filemap_populate);
struct vm_operations_struct generic_file_vm_ops = {
.nopage = filemap_nopage,
@@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
-EXPORT_SYMBOL(filemap_populate);
/*
* This is for filesystems which do not implement ->writepage.
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8c199f53773..9cf687e4a29 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,6 +174,8 @@ __xip_unmap (struct address_space * mapping,
unsigned long address;
pte_t *pte;
pte_t pteval;
+ spinlock_t *ptl;
+ struct page *page;
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -181,19 +183,17 @@ __xip_unmap (struct address_space * mapping,
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- pte = page_check_address(ZERO_PAGE(address), mm,
- address);
- if (!IS_ERR(pte)) {
+ page = ZERO_PAGE(address);
+ pte = page_check_address(page, mm, address, &ptl);
+ if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
+ page_remove_rmap(page);
+ dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
+ page_cache_release(page);
}
}
spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +228,7 @@ xip_file_nopage(struct vm_area_struct * area,
page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
if (!IS_ERR(page)) {
- return page;
+ goto out;
}
if (PTR_ERR(page) != -ENODATA)
return NULL;
@@ -249,6 +249,8 @@ xip_file_nopage(struct vm_area_struct * area,
page = ZERO_PAGE(address);
}
+out:
+ page_cache_get(page);
return page;
}
diff --git a/mm/fremap.c b/mm/fremap.c
index 3235fb77c13..9f381e58bf4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -20,33 +20,28 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
+ struct page *page = NULL;
- if (pte_none(pte))
- return;
if (pte_present(pte)) {
- unsigned long pfn = pte_pfn(pte);
-
- flush_cache_page(vma, addr, pfn);
+ flush_cache_page(vma, addr, pte_pfn(pte));
pte = ptep_clear_flush(vma, addr, ptep);
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
- if (!PageReserved(page)) {
- if (pte_dirty(pte))
- set_page_dirty(page);
- page_remove_rmap(page);
- page_cache_release(page);
- dec_mm_counter(mm, rss);
- }
+ page = vm_normal_page(vma, addr, pte);
+ if (page) {
+ if (pte_dirty(pte))
+ set_page_dirty(page);
+ page_remove_rmap(page);
+ page_cache_release(page);
}
} else {
if (!pte_file(pte))
free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear(mm, addr, ptep);
}
+ return !!page;
}
/*
@@ -60,25 +55,12 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
pgoff_t size;
int err = -ENOMEM;
pte_t *pte;
- pmd_t *pmd;
- pud_t *pud;
- pgd_t *pgd;
pte_t pte_val;
+ spinlock_t *ptl;
- pgd = pgd_offset(mm, addr);
- spin_lock(&mm->page_table_lock);
-
- pud = pud_alloc(mm, pgd, addr);
- if (!pud)
- goto err_unlock;
-
- pmd = pmd_alloc(mm, pud, addr);
- if (!pmd)
- goto err_unlock;
-
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
- goto err_unlock;
+ goto out;
/*
* This page may have been truncated. Tell the
@@ -88,26 +70,27 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
inode = vma->vm_file->f_mapping->host;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (!page->mapping || page->index >= size)
- goto err_unlock;
+ goto unlock;
+ err = -ENOMEM;
+ if (page_mapcount(page) > INT_MAX/2)
+ goto unlock;
- zap_pte(mm, vma, addr, pte);
+ if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
+ inc_mm_counter(mm, file_rss);
- inc_mm_counter(mm,rss);
flush_icache_page(vma, page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
page_add_file_rmap(page);
pte_val = *pte;
- pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
-
err = 0;
-err_unlock:
- spin_unlock(&mm->page_table_lock);
+unlock:
+ pte_unmap_unlock(pte, ptl);
+out:
return err;
}
EXPORT_SYMBOL(install_page);
-
/*
* Install a file pte to a given virtual memory address, release any
* previously existing mapping.
@@ -117,41 +100,27 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
{
int err = -ENOMEM;
pte_t *pte;
- pmd_t *pmd;
- pud_t *pud;
- pgd_t *pgd;
pte_t pte_val;
+ spinlock_t *ptl;
- pgd = pgd_offset(mm, addr);
- spin_lock(&mm->page_table_lock);
-
- pud = pud_alloc(mm, pgd, addr);
- if (!pud)
- goto err_unlock;
-
- pmd = pmd_alloc(mm, pud, addr);
- if (!pmd)
- goto err_unlock;
-
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
- goto err_unlock;
+ goto out;
- zap_pte(mm, vma, addr, pte);
+ if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, file_rss);
+ }
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
pte_val = *pte;
- pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
- spin_unlock(&mm->page_table_lock);
- return 0;
-
-err_unlock:
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
+ err = 0;
+out:
return err;
}
-
/***
* sys_remap_file_pages - remap arbitrary pages of a shared backing store
* file within an existing vma.
@@ -207,12 +176,10 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
* Make sure the vma is shared, that it supports prefaulting,
* and that the remapped range is valid and fully within
* the single existing vma. vm_private_data is used as a
- * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED
- * or VM_LOCKED, but VM_LOCKED could be revoked later on).
+ * swapout cursor in a VM_NONLINEAR vma.
*/
if (vma && (vma->vm_flags & VM_SHARED) &&
- (!vma->vm_private_data ||
- (vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) &&
+ (!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
vma->vm_ops && vma->vm_ops->populate &&
end > start && start >= vma->vm_start &&
end <= vma->vm_end) {
diff --git a/mm/highmem.c b/mm/highmem.c
index 40091159946..ce2e7e8bbfa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,11 +30,9 @@
static mempool_t *page_pool, *isa_page_pool;
-static void *page_pool_alloc(unsigned int __nocast gfp_mask, void *data)
+static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data)
{
- unsigned int gfp = gfp_mask | (unsigned int) (long) data;
-
- return alloc_page(gfp);
+ return alloc_page(gfp_mask | GFP_DMA);
}
static void page_pool_free(void *page, void *data)
@@ -51,6 +49,12 @@ static void page_pool_free(void *page, void *data)
* n means that there are (n-1) current users of it.
*/
#ifdef CONFIG_HIGHMEM
+
+static void *page_pool_alloc(gfp_t gfp_mask, void *data)
+{
+ return alloc_page(gfp_mask);
+}
+
static int pkmap_count[LAST_PKMAP];
static unsigned int last_pkmap_nr;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
@@ -267,7 +271,7 @@ int init_emergency_isa_pool(void)
if (isa_page_pool)
return 0;
- isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA);
+ isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL);
if (!isa_page_pool)
BUG();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 901ac523a1c..3e52df7c471 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,6 +22,10 @@ unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
+
+/*
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ */
static DEFINE_SPINLOCK(hugetlb_lock);
static void enqueue_huge_page(struct page *page)
@@ -61,8 +65,10 @@ static struct page *alloc_fresh_huge_page(void)
HUGETLB_PAGE_ORDER);
nid = (nid + 1) % num_online_nodes();
if (page) {
+ spin_lock(&hugetlb_lock);
nr_huge_pages++;
nr_huge_pages_node[page_to_nid(page)]++;
+ spin_unlock(&hugetlb_lock);
}
return page;
}
@@ -103,6 +109,9 @@ static int __init hugetlb_init(void)
unsigned long i;
struct page *page;
+ if (HPAGE_SHIFT == 0)
+ return 0;
+
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&hugepage_freelists[i]);
@@ -234,7 +243,6 @@ unsigned long hugetlb_total_pages(void)
{
return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}
-EXPORT_SYMBOL(hugetlb_total_pages);
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
@@ -274,21 +282,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
{
pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
- unsigned long addr = vma->vm_start;
- unsigned long end = vma->vm_end;
+ unsigned long addr;
- while (addr < end) {
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ src_pte = huge_pte_offset(src, addr);
+ if (!src_pte)
+ continue;
dst_pte = huge_pte_alloc(dst, addr);
if (!dst_pte)
goto nomem;
- src_pte = huge_pte_offset(src, addr);
- BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
- entry = *src_pte;
- ptepage = pte_page(entry);
- get_page(ptepage);
- add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(dst, addr, dst_pte, entry);
- addr += HPAGE_SIZE;
+ spin_lock(&dst->page_table_lock);
+ spin_lock(&src->page_table_lock);
+ if (!pte_none(*src_pte)) {
+ entry = *src_pte;
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ }
+ spin_unlock(&src->page_table_lock);
+ spin_unlock(&dst->page_table_lock);
}
return 0;
@@ -309,12 +322,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);
+ spin_lock(&mm->page_table_lock);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
- if (! ptep)
- /* This can happen on truncate, or if an
- * mmap() is aborted due to an error before
- * the prefault */
+ if (!ptep)
continue;
pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -323,74 +338,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
page = pte_page(pte);
put_page(page);
+ add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
}
- add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
+
+ spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
}
-void zap_hugepage_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long length)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+ unsigned long idx)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+ int err;
+ struct inode *inode = mapping->host;
+ unsigned long size;
+
+retry:
+ page = find_lock_page(mapping, idx);
+ if (page)
+ goto out;
+
+ /* Check to make sure the mapping hasn't been truncated */
+ size = i_size_read(inode) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto out;
+
+ if (hugetlb_get_quota(mapping))
+ goto out;
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ goto out;
+ }
- spin_lock(&mm->page_table_lock);
- unmap_hugepage_range(vma, start, start + length);
- spin_unlock(&mm->page_table_lock);
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ page = NULL;
+ }
+out:
+ return page;
}
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
+ int ret = VM_FAULT_SIGBUS;
+ unsigned long idx;
+ unsigned long size;
+ pte_t *pte;
+ struct page *page;
+ struct address_space *mapping;
- WARN_ON(!is_vm_hugetlb_page(vma));
- BUG_ON(vma->vm_start & ~HPAGE_MASK);
- BUG_ON(vma->vm_end & ~HPAGE_MASK);
+ pte = huge_pte_alloc(mm, address);
+ if (!pte)
+ goto out;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- hugetlb_prefault_arch_hook(mm);
+ /*
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
+ */
+ page = find_lock_huge_page(mapping, idx);
+ if (!page)
+ goto out;
spin_lock(&mm->page_table_lock);
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
- unsigned long idx;
- pte_t *pte = huge_pte_alloc(mm, addr);
- struct page *page;
+ size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto backout;
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = VM_FAULT_MINOR;
+ if (!pte_none(*pte))
+ goto backout;
- idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- page = find_get_page(mapping, idx);
- if (!page) {
- /* charge the fs quota first */
- if (hugetlb_get_quota(mapping)) {
- ret = -ENOMEM;
- goto out;
- }
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- ret = -ENOMEM;
- goto out;
- }
- ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
- if (! ret) {
- unlock_page(page);
- } else {
- hugetlb_put_quota(mapping);
- free_huge_page(page);
- goto out;
- }
- }
- add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
- }
-out:
+ add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+out:
return ret;
+
+backout:
+ spin_unlock(&mm->page_table_lock);
+ hugetlb_put_quota(mapping);
+ unlock_page(page);
+ put_page(page);
+ goto out;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -400,28 +440,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vpfn, vaddr = *position;
int remainder = *length;
- BUG_ON(!is_vm_hugetlb_page(vma));
-
vpfn = vaddr/PAGE_SIZE;
+ spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
+ pte_t *pte;
+ struct page *page;
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- /* Some archs (sparc64, sh*) have multiple
- * pte_ts to each hugepage. We have to make
- * sure we get the first, for the page
- * indexing below to work. */
- pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+ /*
+ * Some archs (sparc64, sh*) have multiple pte_ts to
+ * each hugepage. We have to make * sure we get the
+ * first, for the page indexing below to work.
+ */
+ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
- /* hugetlb should be locked, and hence, prefaulted */
- WARN_ON(!pte || pte_none(*pte));
+ if (!pte || pte_none(*pte)) {
+ int ret;
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+ spin_unlock(&mm->page_table_lock);
+ ret = hugetlb_fault(mm, vma, vaddr, 0);
+ spin_lock(&mm->page_table_lock);
+ if (ret == VM_FAULT_MINOR)
+ continue;
- WARN_ON(!PageCompound(page));
+ remainder = 0;
+ if (!i)
+ i = -EFAULT;
+ break;
+ }
+ if (pages) {
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
get_page(page);
pages[i] = page;
}
@@ -434,7 +482,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
--remainder;
++i;
}
-
+ spin_unlock(&mm->page_table_lock);
*length = remainder;
*position = vaddr;
diff --git a/mm/madvise.c b/mm/madvise.c
index 4454936f87d..2b7cf0400a2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -83,6 +83,9 @@ static long madvise_willneed(struct vm_area_struct * vma,
{
struct file *file = vma->vm_file;
+ if (!file)
+ return -EBADF;
+
if (file->f_mapping->a_ops->get_xip_page) {
/* no bad return value, but ignore advice */
return 0;
@@ -123,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
unsigned long start, unsigned long end)
{
*prev = vma;
- if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
return -EINVAL;
if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
@@ -141,11 +144,7 @@ static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
- struct file *filp = vma->vm_file;
- long error = -EBADF;
-
- if (!filp)
- goto out;
+ long error;
switch (behavior) {
case MADV_NORMAL:
@@ -166,8 +165,6 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
error = -EINVAL;
break;
}
-
-out:
return error;
}
diff --git a/mm/memory.c b/mm/memory.c
index ae8161f1f45..aa8af0e2026 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
struct page *page = pmd_page(*pmd);
pmd_clear(pmd);
+ pte_lock_deinit(page);
pte_free_tlb(tlb, page);
dec_page_state(nr_page_table_pages);
tlb->mm->nr_ptes--;
@@ -249,7 +250,7 @@ void free_pgd_range(struct mmu_gather **tlb,
free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
- if (!tlb_is_full_mm(*tlb))
+ if (!(*tlb)->fullmm)
flush_tlb_pgtables((*tlb)->mm, start, end);
}
@@ -260,6 +261,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;
+ /*
+ * Hide vma from rmap and vmtruncate before freeing pgtables
+ */
+ anon_vma_unlink(vma);
+ unlink_file_vma(vma);
+
if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
@@ -272,6 +279,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
HPAGE_SIZE)) {
vma = next;
next = vma->vm_next;
+ anon_vma_unlink(vma);
+ unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
@@ -280,75 +289,133 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
}
}
-pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
- if (!pmd_present(*pmd)) {
- struct page *new;
+ struct page *new = pte_alloc_one(mm, address);
+ if (!new)
+ return -ENOMEM;
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pmd_present(*pmd)) {
- pte_free(new);
- goto out;
- }
+ pte_lock_init(new);
+ spin_lock(&mm->page_table_lock);
+ if (pmd_present(*pmd)) { /* Another has populated it */
+ pte_lock_deinit(new);
+ pte_free(new);
+ } else {
mm->nr_ptes++;
inc_page_state(nr_page_table_pages);
pmd_populate(mm, pmd, new);
}
-out:
- return pte_offset_map(pmd, address);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+}
+
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+{
+ pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+ if (!new)
+ return -ENOMEM;
+
+ spin_lock(&init_mm.page_table_lock);
+ if (pmd_present(*pmd)) /* Another has populated it */
+ pte_free_kernel(new);
+ else
+ pmd_populate_kernel(&init_mm, pmd, new);
+ spin_unlock(&init_mm.page_table_lock);
+ return 0;
}
-pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
- if (!pmd_present(*pmd)) {
- pte_t *new;
+ if (file_rss)
+ add_mm_counter(mm, file_rss, file_rss);
+ if (anon_rss)
+ add_mm_counter(mm, anon_rss, anon_rss);
+}
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one_kernel(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
+/*
+ * This function is called to print an error when a bad pte
+ * is found. For example, we might have a PFN-mapped pte in
+ * a region that doesn't allow it.
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+ printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+ "vm_flags = %lx, vaddr = %lx\n",
+ (long long)pte_val(pte),
+ (vma->vm_mm == current->mm ? current->comm : "???"),
+ vma->vm_flags, vaddr);
+ dump_stack();
+}
+
+/*
+ * This function gets the "struct page" associated with a pte.
+ *
+ * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
+ * will have each page table entry just pointing to a raw page frame
+ * number, and as far as the VM layer is concerned, those do not have
+ * pages associated with them - even if the PFN might point to memory
+ * that otherwise is perfectly fine and has a "struct page".
+ *
+ * The way we recognize those mappings is through the rules set up
+ * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
+ * and the vm_pgoff will point to the first PFN mapped: thus every
+ * page that is a raw mapping will always honor the rule
+ *
+ * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
+ *
+ * and if that isn't true, the page has been COW'ed (in which case it
+ * _does_ have a "struct page" associated with it even if it is in a
+ * VM_PFNMAP range).
+ */
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ if (vma->vm_flags & VM_PFNMAP) {
+ unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
+ if (pfn == vma->vm_pgoff + off)
return NULL;
+ }
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pmd_present(*pmd)) {
- pte_free_kernel(new);
- goto out;
- }
- pmd_populate_kernel(mm, pmd, new);
+ /*
+ * Add some anal sanity checks for now. Eventually,
+ * we should just do "return pfn_to_page(pfn)", but
+ * in the meantime we check that we get a valid pfn,
+ * and that the resulting page looks ok.
+ *
+ * Remove this test eventually!
+ */
+ if (unlikely(!pfn_valid(pfn))) {
+ print_bad_pte(vma, pte, addr);
+ return NULL;
}
-out:
- return pte_offset_kernel(pmd, address);
+
+ /*
+ * NOTE! We still have PageReserved() pages in the page
+ * tables.
+ *
+ * The PAGE_ZERO() pages and various VDSO mappings can
+ * cause them to exist.
+ */
+ return pfn_to_page(pfn);
}
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
*/
static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
- unsigned long addr)
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+ unsigned long addr, int *rss)
{
+ unsigned long vm_flags = vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
- unsigned long pfn;
/* pte contains position in swap or file, so copy. */
if (unlikely(!pte_present(pte))) {
@@ -357,27 +424,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
- list_add(&dst_mm->mmlist, &src_mm->mmlist);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
}
- set_pte_at(dst_mm, addr, dst_pte, pte);
- return;
- }
-
- pfn = pte_pfn(pte);
- /* the pte points outside of valid memory, the
- * mapping is assumed to be good, meaningful
- * and not mapped via rmap - duplicate the
- * mapping as is.
- */
- page = NULL;
- if (pfn_valid(pfn))
- page = pfn_to_page(pfn);
-
- if (!page || PageReserved(page)) {
- set_pte_at(dst_mm, addr, dst_pte, pte);
- return;
+ goto out_set_pte;
}
/*
@@ -396,12 +449,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
- get_page(page);
- inc_mm_counter(dst_mm, rss);
- if (PageAnon(page))
- inc_mm_counter(dst_mm, anon_rss);
+
+ page = vm_normal_page(vma, addr, pte);
+ if (page) {
+ get_page(page);
+ page_dup_rmap(page);
+ rss[!!PageAnon(page)]++;
+ }
+
+out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
- page_dup_rmap(page);
}
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -409,38 +466,44 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
unsigned long addr, unsigned long end)
{
pte_t *src_pte, *dst_pte;
- unsigned long vm_flags = vma->vm_flags;
- int progress;
+ spinlock_t *src_ptl, *dst_ptl;
+ int progress = 0;
+ int rss[2];
again:
- dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
+ rss[1] = rss[0] = 0;
+ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
+ src_ptl = pte_lockptr(src_mm, src_pmd);
+ spin_lock(src_ptl);
- progress = 0;
- spin_lock(&src_mm->page_table_lock);
do {
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
*/
- if (progress >= 32 && (need_resched() ||
- need_lockbreak(&src_mm->page_table_lock) ||
- need_lockbreak(&dst_mm->page_table_lock)))
- break;
+ if (progress >= 32) {
+ progress = 0;
+ if (need_resched() ||
+ need_lockbreak(src_ptl) ||
+ need_lockbreak(dst_ptl))
+ break;
+ }
if (pte_none(*src_pte)) {
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
+ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
- spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(src_ptl);
pte_unmap_nested(src_pte - 1);
- pte_unmap(dst_pte - 1);
- cond_resched_lock(&dst_mm->page_table_lock);
+ add_mm_rss(dst_mm, rss[0], rss[1]);
+ pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ cond_resched();
if (addr != end)
goto again;
return 0;
@@ -504,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
- if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) {
+ if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) {
if (!vma->anon_vma)
return 0;
}
@@ -525,25 +588,30 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return 0;
}
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
+ struct mm_struct *mm = tlb->mm;
pte_t *pte;
+ spinlock_t *ptl;
+ int file_rss = 0;
+ int anon_rss = 0;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
do {
pte_t ptent = *pte;
- if (pte_none(ptent))
+ if (pte_none(ptent)) {
+ (*zap_work)--;
continue;
+ }
if (pte_present(ptent)) {
- struct page *page = NULL;
- unsigned long pfn = pte_pfn(ptent);
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (PageReserved(page))
- page = NULL;
- }
+ struct page *page;
+
+ (*zap_work) -= PAGE_SIZE;
+
+ page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
@@ -562,7 +630,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
page->index > details->last_index))
continue;
}
- ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
@@ -570,15 +638,17 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
addr) != page->index)
- set_pte_at(tlb->mm, addr, pte,
+ set_pte_at(mm, addr, pte,
pgoff_to_pte(page->index));
- if (pte_dirty(ptent))
- set_page_dirty(page);
if (PageAnon(page))
- dec_mm_counter(tlb->mm, anon_rss);
- else if (pte_young(ptent))
- mark_page_accessed(page);
- tlb->freed++;
+ anon_rss--;
+ else {
+ if (pte_dirty(ptent))
+ set_page_dirty(page);
+ if (pte_young(ptent))
+ mark_page_accessed(page);
+ file_rss--;
+ }
page_remove_rmap(page);
tlb_remove_page(tlb, page);
continue;
@@ -591,14 +661,19 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
continue;
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
- pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_clear_full(mm, addr, pte, tlb->fullmm);
+ } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+
+ add_mm_rss(mm, file_rss, anon_rss);
+ pte_unmap_unlock(pte - 1, ptl);
+
+ return addr;
}
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
@@ -606,15 +681,21 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_clear_bad(pmd)) {
+ (*zap_work)--;
continue;
- zap_pte_range(tlb, pmd, addr, next, details);
- } while (pmd++, addr = next, addr != end);
+ }
+ next = zap_pte_range(tlb, vma, pmd, addr, next,
+ zap_work, details);
+ } while (pmd++, addr = next, (addr != end && *zap_work > 0));
+
+ return addr;
}
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
pud_t *pud;
unsigned long next;
@@ -622,15 +703,21 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
+ if (pud_none_or_clear_bad(pud)) {
+ (*zap_work)--;
continue;
- zap_pmd_range(tlb, pud, addr, next, details);
- } while (pud++, addr = next, addr != end);
+ }
+ next = zap_pmd_range(tlb, vma, pud, addr, next,
+ zap_work, details);
+ } while (pud++, addr = next, (addr != end && *zap_work > 0));
+
+ return addr;
}
-static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+static unsigned long unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
@@ -643,11 +730,16 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
+ if (pgd_none_or_clear_bad(pgd)) {
+ (*zap_work)--;
continue;
- zap_pud_range(tlb, pgd, addr, next, details);
- } while (pgd++, addr = next, addr != end);
+ }
+ next = zap_pud_range(tlb, vma, pgd, addr, next,
+ zap_work, details);
+ } while (pgd++, addr = next, (addr != end && *zap_work > 0));
tlb_end_vma(tlb, vma);
+
+ return addr;
}
#ifdef CONFIG_PREEMPT
@@ -660,7 +752,6 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlbp: address of the caller's struct mmu_gather
- * @mm: the controlling mm_struct
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
@@ -669,10 +760,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
*
* Returns the end address of the unmapping (restart addr if interrupted).
*
- * Unmap all pages in the vma list. Called under page_table_lock.
+ * Unmap all pages in the vma list.
*
- * We aim to not hold page_table_lock for too long (for scheduling latency
- * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
+ * We aim to not hold locks for too long (for scheduling latency reasons).
+ * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
* return the ending mmu_gather to the caller.
*
* Only addresses between `start' and `end' will be unmapped.
@@ -684,17 +775,17 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlbp,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
- unsigned long zap_bytes = ZAP_BLOCK_SIZE;
+ long zap_work = ZAP_BLOCK_SIZE;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
unsigned long start = start_addr;
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
- int fullmm = tlb_is_full_mm(*tlbp);
+ int fullmm = (*tlbp)->fullmm;
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long end;
@@ -710,45 +801,39 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
*nr_accounted += (end - start) >> PAGE_SHIFT;
while (start != end) {
- unsigned long block;
-
if (!tlb_start_valid) {
tlb_start = start;
tlb_start_valid = 1;
}
- if (is_vm_hugetlb_page(vma)) {
- block = end - start;
+ if (unlikely(is_vm_hugetlb_page(vma))) {
unmap_hugepage_range(vma, start, end);
- } else {
- block = min(zap_bytes, end - start);
- unmap_page_range(*tlbp, vma, start,
- start + block, details);
+ zap_work -= (end - start) /
+ (HPAGE_SIZE / PAGE_SIZE);
+ start = end;
+ } else
+ start = unmap_page_range(*tlbp, vma,
+ start, end, &zap_work, details);
+
+ if (zap_work > 0) {
+ BUG_ON(start != end);
+ break;
}
- start += block;
- zap_bytes -= block;
- if ((long)zap_bytes > 0)
- continue;
-
tlb_finish_mmu(*tlbp, tlb_start, start);
if (need_resched() ||
- need_lockbreak(&mm->page_table_lock) ||
(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
if (i_mmap_lock) {
- /* must reset count of rss freed */
- *tlbp = tlb_gather_mmu(mm, fullmm);
+ *tlbp = NULL;
goto out;
}
- spin_unlock(&mm->page_table_lock);
cond_resched();
- spin_lock(&mm->page_table_lock);
}
- *tlbp = tlb_gather_mmu(mm, fullmm);
+ *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
tlb_start_valid = 0;
- zap_bytes = ZAP_BLOCK_SIZE;
+ zap_work = ZAP_BLOCK_SIZE;
}
}
out:
@@ -770,123 +855,92 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long end = address + size;
unsigned long nr_accounted = 0;
- if (is_vm_hugetlb_page(vma)) {
- zap_hugepage_range(vma, address, size);
- return end;
- }
-
lru_add_drain();
- spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
- end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
- tlb_finish_mmu(tlb, address, end);
- spin_unlock(&mm->page_table_lock);
+ update_hiwater_rss(mm);
+ end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+ if (tlb)
+ tlb_finish_mmu(tlb, address, end);
return end;
}
/*
* Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
*/
-static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
- int read, int write, int accessed)
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
- unsigned long pfn;
+ spinlock_t *ptl;
struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
- page = follow_huge_addr(mm, address, write);
- if (! IS_ERR(page))
- return page;
+ page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+ if (!IS_ERR(page)) {
+ BUG_ON(flags & FOLL_GET);
+ goto out;
+ }
+ page = NULL;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- goto out;
+ goto no_page_table;
pud = pud_offset(pgd, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
- goto out;
+ goto no_page_table;
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ goto no_page_table;
+
+ if (pmd_huge(*pmd)) {
+ BUG_ON(flags & FOLL_GET);
+ page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
- if (pmd_huge(*pmd))
- return follow_huge_pmd(mm, address, pmd, write);
+ }
- ptep = pte_offset_map(pmd, address);
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
goto out;
pte = *ptep;
- pte_unmap(ptep);
- if (pte_present(pte)) {
- if (write && !pte_write(pte))
- goto out;
- if (read && !pte_read(pte))
- goto out;
- pfn = pte_pfn(pte);
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (accessed) {
- if (write && !pte_dirty(pte) &&!PageDirty(page))
- set_page_dirty(page);
- mark_page_accessed(page);
- }
- return page;
- }
+ if (!pte_present(pte))
+ goto unlock;
+ if ((flags & FOLL_WRITE) && !pte_write(pte))
+ goto unlock;
+ page = vm_normal_page(vma, address, pte);
+ if (unlikely(!page))
+ goto unlock;
+
+ if (flags & FOLL_GET)
+ get_page(page);
+ if (flags & FOLL_TOUCH) {
+ if ((flags & FOLL_WRITE) &&
+ !pte_dirty(pte) && !PageDirty(page))
+ set_page_dirty(page);
+ mark_page_accessed(page);
}
-
+unlock:
+ pte_unmap_unlock(ptep, ptl);
out:
- return NULL;
-}
-
-inline struct page *
-follow_page(struct mm_struct *mm, unsigned long address, int write)
-{
- return __follow_page(mm, address, 0, write, 1);
-}
-
-/*
- * check_user_page_readable() can be called frm niterrupt context by oprofile,
- * so we need to avoid taking any non-irq-safe locks
- */
-int check_user_page_readable(struct mm_struct *mm, unsigned long address)
-{
- return __follow_page(mm, address, 1, 0, 0) != NULL;
-}
-EXPORT_SYMBOL(check_user_page_readable);
-
-static inline int
-untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
- unsigned long address)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
-
- /* Check if the vma is for an anonymous mapping. */
- if (vma->vm_ops && vma->vm_ops->nopage)
- return 0;
-
- /* Check if page directory entry exists. */
- pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- return 1;
-
- pud = pud_offset(pgd, address);
- if (pud_none(*pud) || unlikely(pud_bad(*pud)))
- return 1;
-
- /* Check if page middle directory entry exists. */
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- return 1;
+ return page;
- /* There is a pte slot for 'address' in 'mm'. */
- return 0;
+no_page_table:
+ /*
+ * When core dumping an enormous anonymous area that nobody
+ * has touched so far, we don't want to allocate page tables.
+ */
+ if (flags & FOLL_ANON) {
+ page = ZERO_PAGE(address);
+ if (flags & FOLL_GET)
+ get_page(page);
+ BUG_ON(flags & FOLL_WRITE);
+ }
+ return page;
}
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +948,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
struct page **pages, struct vm_area_struct **vmas)
{
int i;
- unsigned int flags;
+ unsigned int vm_flags;
/*
* Require read or write permissions.
* If 'force' is set, we only require the "MAY" flags.
*/
- flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
- flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+ vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
i = 0;
do {
- struct vm_area_struct * vma;
+ struct vm_area_struct *vma;
+ unsigned int foll_flags;
vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(tsk, start)) {
@@ -933,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
return i ? : -EFAULT;
}
if (pages) {
- pages[i] = pte_page(*pte);
- get_page(pages[i]);
+ struct page *page = vm_normal_page(gate_vma, start, *pte);
+ pages[i] = page;
+ if (page)
+ get_page(page);
}
pte_unmap(pte);
if (vmas)
@@ -946,7 +1003,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
if (!vma || (vma->vm_flags & VM_IO)
- || !(flags & vma->vm_flags))
+ || !(vm_flags & vma->vm_flags))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +1011,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
&start, &len, i);
continue;
}
- spin_lock(&mm->page_table_lock);
+
+ foll_flags = FOLL_TOUCH;
+ if (pages)
+ foll_flags |= FOLL_GET;
+ if (!write && !(vma->vm_flags & VM_LOCKED) &&
+ (!vma->vm_ops || !vma->vm_ops->nopage))
+ foll_flags |= FOLL_ANON;
+
do {
- int write_access = write;
struct page *page;
- cond_resched_lock(&mm->page_table_lock);
- while (!(page = follow_page(mm, start, write_access))) {
- int ret;
-
- /*
- * Shortcut for anonymous pages. We don't want
- * to force the creation of pages tables for
- * insanely big anonymously mapped areas that
- * nobody touched so far. This is important
- * for doing a core dump for these mappings.
- */
- if (!write && untouched_anonymous_page(mm,vma,start)) {
- page = ZERO_PAGE(start);
- break;
- }
- spin_unlock(&mm->page_table_lock);
- ret = __handle_mm_fault(mm, vma, start, write_access);
+ if (write)
+ foll_flags |= FOLL_WRITE;
+ cond_resched();
+ while (!(page = follow_page(vma, start, foll_flags))) {
+ int ret;
+ ret = __handle_mm_fault(mm, vma, start,
+ foll_flags & FOLL_WRITE);
/*
* The VM_FAULT_WRITE bit tells us that do_wp_page has
* broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +1037,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
* subsequent page lookups as if they were reads.
*/
if (ret & VM_FAULT_WRITE)
- write_access = 0;
+ foll_flags &= ~FOLL_WRITE;
switch (ret & ~VM_FAULT_WRITE) {
case VM_FAULT_MINOR:
@@ -1000,13 +1053,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
default:
BUG();
}
- spin_lock(&mm->page_table_lock);
}
if (pages) {
pages[i] = page;
flush_dcache_page(page);
- if (!PageReserved(page))
- page_cache_get(page);
}
if (vmas)
vmas[i] = vma;
@@ -1014,7 +1064,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
start += PAGE_SIZE;
len--;
} while (len && start < vma->vm_end);
- spin_unlock(&mm->page_table_lock);
} while (len);
return i;
}
@@ -1024,16 +1073,21 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t prot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
do {
- pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
+ struct page *page = ZERO_PAGE(addr);
+ pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+ page_cache_get(page);
+ page_add_file_rmap(page);
+ inc_mm_counter(mm, file_rss);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, zero_pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
return 0;
}
@@ -1083,17 +1137,138 @@ int zeromap_page_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = zeromap_pud_range(mm, pgd, addr, next, prot);
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
return err;
}
+pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
+{
+ pgd_t * pgd = pgd_offset(mm, addr);
+ pud_t * pud = pud_alloc(mm, pgd, addr);
+ if (pud) {
+ pmd_t * pmd = pmd_alloc(mm, pud, addr);
+ if (pmd)
+ return pte_alloc_map_lock(mm, pmd, addr, ptl);
+ }
+ return NULL;
+}
+
+/*
+ * This is the old fallback for page remapping.
+ *
+ * For historical reasons, it only allows reserved pages. Only
+ * old drivers should use this, and they needed to mark their
+ * pages reserved for the old functions anyway.
+ */
+static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
+{
+ int retval;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ retval = -EINVAL;
+ if (PageAnon(page))
+ goto out;
+ retval = -ENOMEM;
+ flush_dcache_page(page);
+ pte = get_locked_pte(mm, addr, &ptl);
+ if (!pte)
+ goto out;
+ retval = -EBUSY;
+ if (!pte_none(*pte))
+ goto out_unlock;
+
+ /* Ok, finally just insert the thing.. */
+ get_page(page);
+ inc_mm_counter(mm, file_rss);
+ page_add_file_rmap(page);
+ set_pte_at(mm, addr, pte, mk_pte(page, prot));
+
+ retval = 0;
+out_unlock:
+ pte_unmap_unlock(pte, ptl);
+out:
+ return retval;
+}
+
+/*
+ * This allows drivers to insert individual pages they've allocated
+ * into a user vma.
+ *
+ * The page has to be a nice clean _individual_ kernel allocation.
+ * If you allocate a compound page, you need to have marked it as
+ * such (__GFP_COMP), or manually just split the page up yourself
+ * (which is mainly an issue of doing "set_page_count(page, 1)" for
+ * each sub-page, and then freeing them one by one when you free
+ * them rather than freeing it as a compound page).
+ *
+ * NOTE! Traditionally this was done with "remap_pfn_range()" which
+ * took an arbitrary page protection parameter. This doesn't allow
+ * that. Your vma protection will have to be set up correctly, which
+ * means that if you want a shared writable mapping, you'd better
+ * ask for a shared writable mapping!
+ *
+ * The page does not need to be reserved.
+ */
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+{
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return -EFAULT;
+ if (!page_count(page))
+ return -EINVAL;
+ return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_page);
+
+/*
+ * Somebody does a pfn remapping that doesn't actually work as a vma.
+ *
+ * Do it as individual pages instead, and warn about it. It's bad form,
+ * and very inefficient.
+ */
+static int incomplete_pfn_remap(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ static int warn = 10;
+ struct page *page;
+ int retval;
+
+ if (!(vma->vm_flags & VM_INCOMPLETE)) {
+ if (warn) {
+ warn--;
+ printk("%s does an incomplete pfn remapping", current->comm);
+ dump_stack();
+ }
+ }
+ vma->vm_flags |= VM_INCOMPLETE | VM_IO | VM_RESERVED;
+
+ if (start < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ if (!pfn_valid(pfn))
+ return -EINVAL;
+
+ page = pfn_to_page(pfn);
+ if (!PageReserved(page))
+ return -EINVAL;
+
+ retval = 0;
+ while (start < end) {
+ retval = insert_page(vma->vm_mm, start, page, prot);
+ if (retval < 0)
+ break;
+ start += PAGE_SIZE;
+ page++;
+ }
+ return retval;
+}
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
@@ -1104,17 +1279,17 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long pfn, pgprot_t prot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
do {
BUG_ON(!pte_none(*pte));
- if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
- set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+ set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
return 0;
}
@@ -1168,21 +1343,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
struct mm_struct *mm = vma->vm_mm;
int err;
+ if (addr != vma->vm_start || end != vma->vm_end)
+ return incomplete_pfn_remap(vma, addr, end, pfn, prot);
+
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
* VM_IO tells people not to look at these pages
* (accesses can have side effects).
- * VM_RESERVED tells swapout not to try to touch
- * this region.
+ * VM_RESERVED is specified all over the place, because
+ * in 2.4 it kept swapout's vma scan off this vma; but
+ * in 2.6 the LRU scan won't even find its pages, so this
+ * flag means no more than count its pages in reserved_vm,
+ * and omit it from core dump, even when VM_IO turned off.
+ * VM_PFNMAP tells the core MM that the base pages are just
+ * raw PFN mappings, and do not have a "struct page" associated
+ * with them.
*/
- vma->vm_flags |= VM_IO | VM_RESERVED;
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+ vma->vm_pgoff = pfn;
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = remap_pud_range(mm, pgd, addr, next,
@@ -1190,12 +1374,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
/*
+ * handle_pte_fault chooses page fault handler according to an entry
+ * which was read non-atomically. Before making any commitment, on
+ * those architectures or configurations (e.g. i386 with PAE) which
+ * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * must check under lock before unmapping the pte and proceeding
+ * (but do_wp_page is only called after already making such a check;
+ * and do_anonymous_page and do_no_page can safely check later on).
+ */
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
+ pte_t *page_table, pte_t orig_pte)
+{
+ int same = 1;
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+ if (sizeof(pte_t) > sizeof(unsigned long)) {
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ same = pte_same(*page_table, orig_pte);
+ spin_unlock(ptl);
+ }
+#endif
+ pte_unmap(page_table);
+ return same;
+}
+
+/*
* Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
* servicing faults for write access. In the normal case, do always want
* pte_mkwrite. But get_user_pages can cause write faults for mappings
@@ -1208,19 +1416,31 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}
-/*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
- */
-static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
- pte_t *page_table)
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
{
- pte_t entry;
+ /*
+ * If the source page was a PFN mapping, we don't have
+ * a "struct page" for it. We do a best-effort copy by
+ * just copying from the original user address. If that
+ * fails, we just zero-fill it. Live with it.
+ */
+ if (unlikely(!src)) {
+ void *kaddr = kmap_atomic(dst, KM_USER0);
+ void __user *uaddr = (void __user *)(va & PAGE_MASK);
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
- vma);
- ptep_establish(vma, address, page_table, entry);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
+ /*
+ * This really shouldn't fail, because the page is there
+ * in the page tables. But it might just be unreadable,
+ * in which case we just give up and fill the result with
+ * zeroes.
+ */
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
+ memset(kaddr, 0, PAGE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ return;
+
+ }
+ copy_user_highpage(dst, src, va);
}
/*
@@ -1228,9 +1448,6 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
*
- * Goto-purists beware: the only reason for goto's here is that it results
- * in better assembly code.. The "default" path will see no jumps at all.
- *
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
* Thus we can safely just mark it writable once we've done any necessary
@@ -1240,82 +1457,76 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*
- * We hold the mm semaphore and the page_table_lock on entry and exit
- * with the page_table_lock released.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ spinlock_t *ptl, pte_t orig_pte)
{
struct page *old_page, *new_page;
- unsigned long pfn = pte_pfn(pte);
pte_t entry;
- int ret;
+ int ret = VM_FAULT_MINOR;
- if (unlikely(!pfn_valid(pfn))) {
- /*
- * This should really halt the system so it can be debugged or
- * at least the kernel stops what it's doing before it corrupts
- * data, but for the moment just pretend this is OOM.
- */
- pte_unmap(page_table);
- printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
- address);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_OOM;
- }
- old_page = pfn_to_page(pfn);
+ old_page = vm_normal_page(vma, address, orig_pte);
+ if (!old_page)
+ goto gotten;
if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
int reuse = can_share_swap_page(old_page);
unlock_page(old_page);
if (reuse) {
- flush_cache_page(vma, address, pfn);
- entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
- vma);
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = pte_mkyoung(orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
ptep_set_access_flags(vma, address, page_table, entry, 1);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_MINOR|VM_FAULT_WRITE;
+ ret |= VM_FAULT_WRITE;
+ goto unlock;
}
}
- pte_unmap(page_table);
/*
* Ok, we need to copy. Oh, well..
*/
- if (!PageReserved(old_page))
- page_cache_get(old_page);
- spin_unlock(&mm->page_table_lock);
+ page_cache_get(old_page);
+gotten:
+ pte_unmap_unlock(page_table, ptl);
if (unlikely(anon_vma_prepare(vma)))
- goto no_new_page;
+ goto oom;
if (old_page == ZERO_PAGE(address)) {
new_page = alloc_zeroed_user_highpage(vma, address);
if (!new_page)
- goto no_new_page;
+ goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
if (!new_page)
- goto no_new_page;
- copy_user_highpage(new_page, old_page, address);
+ goto oom;
+ cow_user_page(new_page, old_page, address);
}
+
/*
* Re-check the pte - we dropped the lock
*/
- ret = VM_FAULT_MINOR;
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, address);
- if (likely(pte_same(*page_table, pte))) {
- if (PageAnon(old_page))
- dec_mm_counter(mm, anon_rss);
- if (PageReserved(old_page))
- inc_mm_counter(mm, rss);
- else
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (likely(pte_same(*page_table, orig_pte))) {
+ if (old_page) {
page_remove_rmap(old_page);
- flush_cache_page(vma, address, pfn);
- break_cow(vma, new_page, address, page_table);
+ if (!PageAnon(old_page)) {
+ dec_mm_counter(mm, file_rss);
+ inc_mm_counter(mm, anon_rss);
+ }
+ } else
+ inc_mm_counter(mm, anon_rss);
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ ptep_establish(vma, address, page_table, entry);
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
lru_cache_add_active(new_page);
page_add_anon_rmap(new_page, vma, address);
@@ -1323,14 +1534,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
new_page = old_page;
ret |= VM_FAULT_WRITE;
}
- pte_unmap(page_table);
- page_cache_release(new_page);
- page_cache_release(old_page);
- spin_unlock(&mm->page_table_lock);
+ if (new_page)
+ page_cache_release(new_page);
+ if (old_page)
+ page_cache_release(old_page);
+unlock:
+ pte_unmap_unlock(page_table, ptl);
return ret;
-
-no_new_page:
- page_cache_release(old_page);
+oom:
+ if (old_page)
+ page_cache_release(old_page);
return VM_FAULT_OOM;
}
@@ -1399,13 +1612,6 @@ again:
restart_addr = zap_page_range(vma, start_addr,
end_addr - start_addr, details);
-
- /*
- * We cannot rely on the break test in unmap_vmas:
- * on the one hand, we don't want to restart our loop
- * just because that broke out for the page_table_lock;
- * on the other hand, it does no test when vma is small.
- */
need_break = need_resched() ||
need_lockbreak(details->i_mmap_lock);
@@ -1654,38 +1860,37 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
}
/*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_swap_page(struct mm_struct * mm,
- struct vm_area_struct * vma, unsigned long address,
- pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
+static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access, pte_t orig_pte)
{
+ spinlock_t *ptl;
struct page *page;
- swp_entry_t entry = pte_to_swp_entry(orig_pte);
+ swp_entry_t entry;
pte_t pte;
int ret = VM_FAULT_MINOR;
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+ goto out;
+
+ entry = pte_to_swp_entry(orig_pte);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
/*
- * Back out if somebody else faulted in this pte while
- * we released the page table lock.
+ * Back out if somebody else faulted in this pte
+ * while we released the pte lock.
*/
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, address);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_MINOR;
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- goto out;
+ goto unlock;
}
/* Had to read the page from swap area: Major fault */
@@ -1698,15 +1903,11 @@ static int do_swap_page(struct mm_struct * mm,
lock_page(page);
/*
- * Back out if somebody else faulted in this pte while we
- * released the page table lock.
+ * Back out if somebody else already faulted in this pte.
*/
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, address);
- if (unlikely(!pte_same(*page_table, orig_pte))) {
- ret = VM_FAULT_MINOR;
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (unlikely(!pte_same(*page_table, orig_pte)))
goto out_nomap;
- }
if (unlikely(!PageUptodate(page))) {
ret = VM_FAULT_SIGBUS;
@@ -1715,7 +1916,7 @@ static int do_swap_page(struct mm_struct * mm,
/* The page isn't present yet, go ahead with the fault. */
- inc_mm_counter(mm, rss);
+ inc_mm_counter(mm, anon_rss);
pte = mk_pte(page, vma->vm_page_prot);
if (write_access && can_share_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1733,7 +1934,7 @@ static int do_swap_page(struct mm_struct * mm,
if (write_access) {
if (do_wp_page(mm, vma, address,
- page_table, pmd, pte) == VM_FAULT_OOM)
+ page_table, pmd, ptl, pte) == VM_FAULT_OOM)
ret = VM_FAULT_OOM;
goto out;
}
@@ -1741,74 +1942,76 @@ static int do_swap_page(struct mm_struct * mm,
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
lazy_mmu_prot_update(pte);
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+unlock:
+ pte_unmap_unlock(page_table, ptl);
out:
return ret;
out_nomap:
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(page_table, ptl);
unlock_page(page);
page_cache_release(page);
- goto out;
+ return ret;
}
/*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int
-do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
- pte_t *page_table, pmd_t *pmd, int write_access,
- unsigned long addr)
+static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
{
+ struct page *page;
+ spinlock_t *ptl;
pte_t entry;
- struct page * page = ZERO_PAGE(addr);
-
- /* Read-only mapping of ZERO_PAGE. */
- entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
- /* ..except if it's a write access */
if (write_access) {
/* Allocate our own private page. */
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
if (unlikely(anon_vma_prepare(vma)))
- goto no_mem;
- page = alloc_zeroed_user_highpage(vma, addr);
+ goto oom;
+ page = alloc_zeroed_user_highpage(vma, address);
if (!page)
- goto no_mem;
+ goto oom;
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, addr);
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (!pte_none(*page_table)) {
- pte_unmap(page_table);
- page_cache_release(page);
- spin_unlock(&mm->page_table_lock);
- goto out;
- }
- inc_mm_counter(mm, rss);
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
- vma->vm_page_prot)),
- vma);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_none(*page_table))
+ goto release;
+ inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
SetPageReferenced(page);
- page_add_anon_rmap(page, vma, addr);
+ page_add_anon_rmap(page, vma, address);
+ } else {
+ /* Map the ZERO_PAGE - vm_page_prot is readonly */
+ page = ZERO_PAGE(address);
+ page_cache_get(page);
+ entry = mk_pte(page, vma->vm_page_prot);
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (!pte_none(*page_table))
+ goto release;
+ inc_mm_counter(mm, file_rss);
+ page_add_file_rmap(page);
}
- set_pte_at(mm, addr, page_table, entry);
- pte_unmap(page_table);
+ set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, addr, entry);
+ update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- spin_unlock(&mm->page_table_lock);
-out:
+unlock:
+ pte_unmap_unlock(page_table, ptl);
return VM_FAULT_MINOR;
-no_mem:
+release:
+ page_cache_release(page);
+ goto unlock;
+oom:
return VM_FAULT_OOM;
}
@@ -1821,25 +2024,24 @@ no_mem:
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
- * This is called with the MM semaphore held and the page table
- * spinlock held. Exit with the spinlock released.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int
-do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
{
- struct page * new_page;
+ spinlock_t *ptl;
+ struct page *new_page;
struct address_space *mapping = NULL;
pte_t entry;
unsigned int sequence = 0;
int ret = VM_FAULT_MINOR;
int anon = 0;
- if (!vma->vm_ops || !vma->vm_ops->nopage)
- return do_anonymous_page(mm, vma, page_table,
- pmd, write_access, address);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ BUG_ON(vma->vm_flags & VM_PFNMAP);
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
@@ -1847,7 +2049,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
smp_rmb(); /* serializes i_size against truncate_count */
}
retry:
- cond_resched();
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
/*
* No smp_rmb is needed here as long as there's a full
@@ -1880,19 +2081,20 @@ retry:
anon = 1;
}
- spin_lock(&mm->page_table_lock);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
* For a file-backed vma, someone could have truncated or otherwise
* invalidated this page. If unmap_mapping_range got called,
* retry getting the page.
*/
if (mapping && unlikely(sequence != mapping->truncate_count)) {
- sequence = mapping->truncate_count;
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(page_table, ptl);
page_cache_release(new_page);
+ cond_resched();
+ sequence = mapping->truncate_count;
+ smp_rmb();
goto retry;
}
- page_table = pte_offset_map(pmd, address);
/*
* This silly early PAGE_DIRTY setting removes a race
@@ -1906,68 +2108,67 @@ retry:
*/
/* Only go through if we didn't race with anybody else... */
if (pte_none(*page_table)) {
- if (!PageReserved(new_page))
- inc_mm_counter(mm, rss);
-
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte_at(mm, address, page_table, entry);
if (anon) {
+ inc_mm_counter(mm, anon_rss);
lru_cache_add_active(new_page);
page_add_anon_rmap(new_page, vma, address);
- } else
+ } else {
+ inc_mm_counter(mm, file_rss);
page_add_file_rmap(new_page);
- pte_unmap(page_table);
+ }
} else {
/* One of our sibling threads was faster, back out. */
- pte_unmap(page_table);
page_cache_release(new_page);
- spin_unlock(&mm->page_table_lock);
- goto out;
+ goto unlock;
}
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- spin_unlock(&mm->page_table_lock);
-out:
+unlock:
+ pte_unmap_unlock(page_table, ptl);
return ret;
oom:
page_cache_release(new_page);
- ret = VM_FAULT_OOM;
- goto out;
+ return VM_FAULT_OOM;
}
/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access, pte_t orig_pte)
{
- unsigned long pgoff;
+ pgoff_t pgoff;
int err;
- BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
- /*
- * Fall back to the linear mapping if the fs does not support
- * ->populate:
- */
- if (!vma->vm_ops->populate ||
- (write_access && !(vma->vm_flags & VM_SHARED))) {
- pte_clear(mm, address, pte);
- return do_no_page(mm, vma, address, write_access, pte, pmd);
- }
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+ return VM_FAULT_MINOR;
- pgoff = pte_to_pgoff(*pte);
-
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+ /*
+ * Page table corrupted: show pte and kill process.
+ */
+ print_bad_pte(vma, orig_pte, address);
+ return VM_FAULT_OOM;
+ }
+ /* We can then assume vm->vm_ops && vma->vm_ops->populate */
- err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
+ pgoff = pte_to_pgoff(orig_pte);
+ err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
+ vma->vm_page_prot, pgoff, 0);
if (err == -ENOMEM)
return VM_FAULT_OOM;
if (err)
@@ -1984,56 +2185,68 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
- * The adding of pages is protected by the MM semaphore (which we hold),
- * so we don't need to worry about a page being suddenly been added into
- * our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct * vma, unsigned long address,
- int write_access, pte_t *pte, pmd_t *pmd)
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, int write_access)
{
pte_t entry;
+ pte_t old_entry;
+ spinlock_t *ptl;
- entry = *pte;
+ old_entry = entry = *pte;
if (!pte_present(entry)) {
- /*
- * If it truly wasn't present, we know that kswapd
- * and the PTE updates will not touch it later. So
- * drop the lock.
- */
- if (pte_none(entry))
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ if (pte_none(entry)) {
+ if (!vma->vm_ops || !vma->vm_ops->nopage)
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, write_access);
+ return do_no_page(mm, vma, address,
+ pte, pmd, write_access);
+ }
if (pte_file(entry))
- return do_file_page(mm, vma, address, write_access, pte, pmd);
- return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
+ return do_file_page(mm, vma, address,
+ pte, pmd, write_access, entry);
+ return do_swap_page(mm, vma, address,
+ pte, pmd, write_access, entry);
}
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*pte, entry)))
+ goto unlock;
if (write_access) {
if (!pte_write(entry))
- return do_wp_page(mm, vma, address, pte, pmd, entry);
+ return do_wp_page(mm, vma, address,
+ pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- ptep_set_access_flags(vma, address, pte, entry, write_access);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ if (!pte_same(old_entry, entry)) {
+ ptep_set_access_flags(vma, address, pte, entry, write_access);
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
+ } else {
+ /*
+ * This is needed only for protection faults but the arch code
+ * is not yet telling us if this is a protection fault or not.
+ * This still avoids useless tlb flushes for .text page faults
+ * with threads.
+ */
+ if (write_access)
+ flush_tlb_page(vma, address);
+ }
+unlock:
+ pte_unmap_unlock(pte, ptl);
return VM_FAULT_MINOR;
}
/*
* By the time we get here, we already hold the mm semaphore
*/
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access)
{
pgd_t *pgd;
@@ -2045,103 +2258,81 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
inc_page_state(pgfault);
- if (is_vm_hugetlb_page(vma))
- return VM_FAULT_SIGBUS; /* mapping truncation does this. */
+ if (unlikely(is_vm_hugetlb_page(vma)))
+ return hugetlb_fault(mm, vma, address, write_access);
- /*
- * We need the page table lock to synchronize with kswapd
- * and the SMP-safe atomic PTE updates.
- */
pgd = pgd_offset(mm, address);
- spin_lock(&mm->page_table_lock);
-
pud = pud_alloc(mm, pgd, address);
if (!pud)
- goto oom;
-
+ return VM_FAULT_OOM;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
- goto oom;
-
+ return VM_FAULT_OOM;
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
- goto oom;
-
- return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+ return VM_FAULT_OOM;
- oom:
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_OOM;
+ return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
}
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
*/
-pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
- pud_t *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pud_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
+ pud_t *new = pud_alloc_one(mm, address);
if (!new)
- return NULL;
+ return -ENOMEM;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pgd_present(*pgd)) {
+ spin_lock(&mm->page_table_lock);
+ if (pgd_present(*pgd)) /* Another has populated it */
pud_free(new);
- goto out;
- }
- pgd_populate(mm, pgd, new);
- out:
- return pud_offset(pgd, address);
+ else
+ pgd_populate(mm, pgd, new);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+}
+#else
+/* Workaround for gcc 2.96 */
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+ return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
*/
-pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
- pmd_t *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pmd_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
+ pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
- return NULL;
+ return -ENOMEM;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
+ spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
- if (pud_present(*pud)) {
+ if (pud_present(*pud)) /* Another has populated it */
pmd_free(new);
- goto out;
- }
- pud_populate(mm, pud, new);
+ else
+ pud_populate(mm, pud, new);
#else
- if (pgd_present(*pud)) {
+ if (pgd_present(*pud)) /* Another has populated it */
pmd_free(new);
- goto out;
- }
- pgd_populate(mm, pud, new);
+ else
+ pgd_populate(mm, pud, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
-
- out:
- return pmd_offset(pud, address);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+}
+#else
+/* Workaround for gcc 2.96 */
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+{
+ return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */
@@ -2206,22 +2397,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
EXPORT_SYMBOL(vmalloc_to_pfn);
-/*
- * update_mem_hiwater
- * - update per process rss and vm high water data
- */
-void update_mem_hiwater(struct task_struct *tsk)
-{
- if (tsk->mm) {
- unsigned long rss = get_mm_counter(tsk->mm, rss);
-
- if (tsk->mm->hiwater_rss < rss)
- tsk->mm->hiwater_rss = rss;
- if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
- tsk->mm->hiwater_vm = tsk->mm->total_vm;
- }
-}
-
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
new file mode 100644
index 00000000000..431a64f021c
--- /dev/null
+++ b/mm/memory_hotplug.c
@@ -0,0 +1,138 @@
+/*
+ * linux/mm/memory_hotplug.c
+ *
+ * Copyright (C)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/pagevec.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+
+extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+ unsigned long size);
+static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int nr_pages = PAGES_PER_SECTION;
+ int nid = pgdat->node_id;
+ int zone_type;
+
+ zone_type = zone - pgdat->node_zones;
+ memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+ zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+}
+
+extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+ int nr_pages);
+static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int nr_pages = PAGES_PER_SECTION;
+ int ret;
+
+ ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
+
+ if (ret < 0)
+ return ret;
+
+ __add_zone(zone, phys_start_pfn);
+ return register_new_memory(__pfn_to_section(phys_start_pfn));
+}
+
+/*
+ * Reasonably generic function for adding memory. It is
+ * expected that archs that support memory hotplug will
+ * call this function after deciding the zone to which to
+ * add the new pages.
+ */
+int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i;
+ int err = 0;
+
+ for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
+ err = __add_section(zone, phys_start_pfn + i);
+
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+static void grow_zone_span(struct zone *zone,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long old_zone_end_pfn;
+
+ zone_span_writelock(zone);
+
+ old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ if (start_pfn < zone->zone_start_pfn)
+ zone->zone_start_pfn = start_pfn;
+
+ if (end_pfn > old_zone_end_pfn)
+ zone->spanned_pages = end_pfn - zone->zone_start_pfn;
+
+ zone_span_writeunlock(zone);
+}
+
+static void grow_pgdat_span(struct pglist_data *pgdat,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long old_pgdat_end_pfn =
+ pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+ if (start_pfn < pgdat->node_start_pfn)
+ pgdat->node_start_pfn = start_pfn;
+
+ if (end_pfn > old_pgdat_end_pfn)
+ pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages;
+}
+
+int online_pages(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long i;
+ unsigned long flags;
+ unsigned long onlined_pages = 0;
+ struct zone *zone;
+
+ /*
+ * This doesn't need a lock to do pfn_to_page().
+ * The section can't be removed here because of the
+ * memory_block->state_sem.
+ */
+ zone = page_zone(pfn_to_page(pfn));
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ grow_zone_span(zone, pfn, pfn + nr_pages);
+ grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pfn_to_page(pfn + i);
+ online_page(page);
+ onlined_pages++;
+ }
+ zone->present_pages += onlined_pages;
+
+ setup_per_zone_pages_min();
+
+ return 0;
+}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9033f0859aa..bec88c81244 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2,6 +2,7 @@
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
* Subject to the GNU Public License, version 2.
*
* NUMA policy allows the user to give hints in which node(s) memory should
@@ -17,13 +18,19 @@
* offset into the backing object or offset into the mapping
* for anonymous memory. For process policy an process counter
* is used.
+ *
* bind Only allocate memory on a specific set of nodes,
* no fallback.
+ * FIXME: memory is allocated starting with the first node
+ * to the last. It would be better if bind would truly restrict
+ * the allocation to memory nodes instead
+ *
* preferred Try a specific node first before normal fallback.
* As a special case node -1 here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
+ *
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
@@ -93,23 +100,10 @@ struct mempolicy default_policy = {
.policy = MPOL_DEFAULT,
};
-/* Check if all specified nodes are online */
-static int nodes_online(unsigned long *nodes)
-{
- DECLARE_BITMAP(online2, MAX_NUMNODES);
-
- bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
- if (bitmap_empty(online2, MAX_NUMNODES))
- set_bit(0, online2);
- if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
- return -EINVAL;
- return 0;
-}
-
/* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, unsigned long *nodes)
+static int mpol_check_policy(int mode, nodemask_t *nodes)
{
- int empty = bitmap_empty(nodes, MAX_NUMNODES);
+ int empty = nodes_empty(*nodes);
switch (mode) {
case MPOL_DEFAULT:
@@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes)
return -EINVAL;
break;
}
- return nodes_online(nodes);
-}
-
-/* Copy a node mask from user space. */
-static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
- unsigned long maxnode, int mode)
-{
- unsigned long k;
- unsigned long nlongs;
- unsigned long endmask;
-
- --maxnode;
- bitmap_zero(nodes, MAX_NUMNODES);
- if (maxnode == 0 || !nmask)
- return 0;
-
- nlongs = BITS_TO_LONGS(maxnode);
- if ((maxnode % BITS_PER_LONG) == 0)
- endmask = ~0UL;
- else
- endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
- /* When the user specified more nodes than supported just check
- if the non supported part is all zero. */
- if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
- if (nlongs > PAGE_SIZE/sizeof(long))
- return -EINVAL;
- for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
- unsigned long t;
- if (get_user(t, nmask + k))
- return -EFAULT;
- if (k == nlongs - 1) {
- if (t & endmask)
- return -EINVAL;
- } else if (t)
- return -EINVAL;
- }
- nlongs = BITS_TO_LONGS(MAX_NUMNODES);
- endmask = ~0UL;
- }
-
- if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
- return -EFAULT;
- nodes[nlongs-1] &= endmask;
- /* Update current mems_allowed */
- cpuset_update_current_mems_allowed();
- /* Ignore nodes not set in current->mems_allowed */
- cpuset_restrict_to_mems_allowed(nodes);
- return mpol_check_policy(mode, nodes);
+ return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
-
/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(unsigned long *nodes)
+static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
struct zonelist *zl;
int num, max, nd;
- max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
+ max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
if (!zl)
return NULL;
num = 0;
- for (nd = find_first_bit(nodes, MAX_NUMNODES);
- nd < MAX_NUMNODES;
- nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
+ for_each_node_mask(nd, *nodes) {
int k;
for (k = MAX_NR_ZONES-1; k >= 0; k--) {
struct zone *z = &NODE_DATA(nd)->node_zones[k];
@@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes)
policy_zone = k;
}
}
- BUG_ON(num >= max);
zl->zones[num] = NULL;
return zl;
}
/* Create a new policy */
-static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
+static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
struct mempolicy *policy;
- PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
+ PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
if (mode == MPOL_DEFAULT)
return NULL;
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
atomic_set(&policy->refcnt, 1);
switch (mode) {
case MPOL_INTERLEAVE:
- bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
+ policy->v.nodes = *nodes;
break;
case MPOL_PREFERRED:
- policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
+ policy->v.preferred_node = first_node(*nodes);
if (policy->v.preferred_node >= MAX_NUMNODES)
policy->v.preferred_node = -1;
break;
@@ -238,34 +180,33 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
}
/* Ensure all existing pages follow the policy. */
-static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end, nodemask_t *nodes)
{
pte_t *orig_pte;
pte_t *pte;
+ spinlock_t *ptl;
- spin_lock(&mm->page_table_lock);
- orig_pte = pte = pte_offset_map(pmd, addr);
+ orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
- unsigned long pfn;
+ struct page *page;
unsigned int nid;
if (!pte_present(*pte))
continue;
- pfn = pte_pfn(*pte);
- if (!pfn_valid(pfn))
+ page = vm_normal_page(vma, addr, *pte);
+ if (!page)
continue;
- nid = pfn_to_nid(pfn);
- if (!test_bit(nid, nodes))
+ nid = page_to_nid(page);
+ if (!node_isset(nid, *nodes))
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(orig_pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(orig_pte, ptl);
return addr != end;
}
-static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end, nodemask_t *nodes)
{
pmd_t *pmd;
unsigned long next;
@@ -275,14 +216,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- if (check_pte_range(mm, pmd, addr, next, nodes))
+ if (check_pte_range(vma, pmd, addr, next, nodes))
return -EIO;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end, nodemask_t *nodes)
{
pud_t *pud;
unsigned long next;
@@ -292,24 +233,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- if (check_pmd_range(mm, pud, addr, next, nodes))
+ if (check_pmd_range(vma, pud, addr, next, nodes))
return -EIO;
} while (pud++, addr = next, addr != end);
return 0;
}
-static inline int check_pgd_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pgd_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end, nodemask_t *nodes)
{
pgd_t *pgd;
unsigned long next;
- pgd = pgd_offset(mm, addr);
+ pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- if (check_pud_range(mm, pgd, addr, next, nodes))
+ if (check_pud_range(vma, pgd, addr, next, nodes))
return -EIO;
} while (pgd++, addr = next, addr != end);
return 0;
@@ -318,7 +259,7 @@ static inline int check_pgd_range(struct mm_struct *mm,
/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
- unsigned long *nodes, unsigned long flags)
+ nodemask_t *nodes, unsigned long flags)
{
int err;
struct vm_area_struct *first, *vma, *prev;
@@ -338,8 +279,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
endvma = end;
if (vma->vm_start > start)
start = vma->vm_start;
- err = check_pgd_range(vma->vm_mm,
- start, endvma, nodes);
+ err = check_pgd_range(vma, start, endvma, nodes);
if (err) {
first = ERR_PTR(err);
break;
@@ -393,17 +333,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
return err;
}
-/* Change policy for a memory range */
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
- unsigned long mode,
- unsigned long __user *nmask, unsigned long maxnode,
- unsigned flags)
+static int contextualize_policy(int mode, nodemask_t *nodes)
+{
+ if (!nodes)
+ return 0;
+
+ /* Update current mems_allowed */
+ cpuset_update_current_mems_allowed();
+ /* Ignore nodes not set in current->mems_allowed */
+ cpuset_restrict_to_mems_allowed(nodes->bits);
+ return mpol_check_policy(mode, nodes);
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+ unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
- DECLARE_BITMAP(nodes, MAX_NUMNODES);
int err;
if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -418,20 +366,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
return -EINVAL;
if (end == start)
return 0;
-
- err = get_nodes(nodes, nmask, maxnode, mode);
- if (err)
- return err;
-
- new = mpol_new(mode, nodes);
+ if (mpol_check_policy(mode, nmask))
+ return -EINVAL;
+ new = mpol_new(mode, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
- mode,nodes[0]);
+ mode,nodes_addr(nodes)[0]);
down_write(&mm->mmap_sem);
- vma = check_range(mm, start, end, nodes, flags);
+ vma = check_range(mm, start, end, nmask, flags);
err = PTR_ERR(vma);
if (!IS_ERR(vma))
err = mbind_range(vma, start, end, new);
@@ -441,50 +386,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
}
/* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
- unsigned long maxnode)
+long do_set_mempolicy(int mode, nodemask_t *nodes)
{
- int err;
struct mempolicy *new;
- DECLARE_BITMAP(nodes, MAX_NUMNODES);
- if (mode < 0 || mode > MPOL_MAX)
+ if (contextualize_policy(mode, nodes))
return -EINVAL;
- err = get_nodes(nodes, nmask, maxnode, mode);
- if (err)
- return err;
new = mpol_new(mode, nodes);
if (IS_ERR(new))
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
if (new && new->policy == MPOL_INTERLEAVE)
- current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
+ current->il_next = first_node(new->v.nodes);
return 0;
}
/* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
+static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
int i;
- bitmap_zero(nodes, MAX_NUMNODES);
+ nodes_clear(*nodes);
switch (p->policy) {
case MPOL_BIND:
for (i = 0; p->v.zonelist->zones[i]; i++)
- __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
+ node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+ *nodes);
break;
case MPOL_DEFAULT:
break;
case MPOL_INTERLEAVE:
- bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
+ *nodes = p->v.nodes;
break;
case MPOL_PREFERRED:
/* or use current node instead of online map? */
if (p->v.preferred_node < 0)
- bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
+ *nodes = node_online_map;
else
- __set_bit(p->v.preferred_node, nodes);
+ node_set(p->v.preferred_node, *nodes);
break;
default:
BUG();
@@ -504,37 +444,18 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
return err;
}
-/* Copy a kernel node mask to user space */
-static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
- void *nodes, unsigned nbytes)
-{
- unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-
- if (copy > nbytes) {
- if (copy > PAGE_SIZE)
- return -EINVAL;
- if (clear_user((char __user *)mask + nbytes, copy - nbytes))
- return -EFAULT;
- copy = nbytes;
- }
- return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
-}
-
/* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
- unsigned long __user *nmask,
- unsigned long maxnode,
- unsigned long addr, unsigned long flags)
+long do_get_mempolicy(int *policy, nodemask_t *nmask,
+ unsigned long addr, unsigned long flags)
{
- int err, pval;
+ int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
+ cpuset_update_current_mems_allowed();
if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
- if (nmask != NULL && maxnode < MAX_NUMNODES)
- return -EINVAL;
if (flags & MPOL_F_ADDR) {
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
@@ -557,31 +478,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
err = lookup_node(mm, addr);
if (err < 0)
goto out;
- pval = err;
+ *policy = err;
} else if (pol == current->mempolicy &&
pol->policy == MPOL_INTERLEAVE) {
- pval = current->il_next;
+ *policy = current->il_next;
} else {
err = -EINVAL;
goto out;
}
} else
- pval = pol->policy;
+ *policy = pol->policy;
if (vma) {
up_read(&current->mm->mmap_sem);
vma = NULL;
}
- if (policy && put_user(pval, policy))
- return -EFAULT;
-
err = 0;
- if (nmask) {
- DECLARE_BITMAP(nodes, MAX_NUMNODES);
- get_zonemask(pol, nodes);
- err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
- }
+ if (nmask)
+ get_zonemask(pol, nmask);
out:
if (vma)
@@ -589,6 +504,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
return err;
}
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/* Copy a node mask from user space. */
+static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ unsigned long k;
+ unsigned long nlongs;
+ unsigned long endmask;
+
+ --maxnode;
+ nodes_clear(*nodes);
+ if (maxnode == 0 || !nmask)
+ return 0;
+
+ nlongs = BITS_TO_LONGS(maxnode);
+ if ((maxnode % BITS_PER_LONG) == 0)
+ endmask = ~0UL;
+ else
+ endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+ /* When the user specified more nodes than supported just check
+ if the non supported part is all zero. */
+ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+ if (nlongs > PAGE_SIZE/sizeof(long))
+ return -EINVAL;
+ for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+ unsigned long t;
+ if (get_user(t, nmask + k))
+ return -EFAULT;
+ if (k == nlongs - 1) {
+ if (t & endmask)
+ return -EINVAL;
+ } else if (t)
+ return -EINVAL;
+ }
+ nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+ endmask = ~0UL;
+ }
+
+ if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+ return -EFAULT;
+ nodes_addr(*nodes)[nlongs-1] &= endmask;
+ return 0;
+}
+
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+ nodemask_t *nodes)
+{
+ unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+ const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+
+ if (copy > nbytes) {
+ if (copy > PAGE_SIZE)
+ return -EINVAL;
+ if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+ return -EFAULT;
+ copy = nbytes;
+ }
+ return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+ unsigned long mode,
+ unsigned long __user *nmask, unsigned long maxnode,
+ unsigned flags)
+{
+ nodemask_t nodes;
+ int err;
+
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return err;
+ return do_mbind(start, len, mode, &nodes, flags);
+}
+
+/* Set the process memory policy */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ int err;
+ nodemask_t nodes;
+
+ if (mode < 0 || mode > MPOL_MAX)
+ return -EINVAL;
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return err;
+ return do_set_mempolicy(mode, &nodes);
+}
+
+/* Retrieve NUMA policy */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+ unsigned long __user *nmask,
+ unsigned long maxnode,
+ unsigned long addr, unsigned long flags)
+{
+ int err, pval;
+ nodemask_t nodes;
+
+ if (nmask != NULL && maxnode < MAX_NUMNODES)
+ return -EINVAL;
+
+ err = do_get_mempolicy(&pval, &nodes, addr, flags);
+
+ if (err)
+ return err;
+
+ if (policy && put_user(pval, policy))
+ return -EFAULT;
+
+ if (nmask)
+ err = copy_nodes_to_user(nmask, maxnode, &nodes);
+
+ return err;
+}
+
#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -649,15 +684,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
long err = 0;
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
- DECLARE_BITMAP(bm, MAX_NUMNODES);
+ nodemask_t bm;
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask) {
- err = compat_get_bitmap(bm, nmask, nr_bits);
+ err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
nm = compat_alloc_user_space(alloc_size);
- err |= copy_to_user(nm, bm, alloc_size);
+ err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
}
if (err)
@@ -676,7 +711,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy)
- pol = vma->vm_ops->get_policy(vma, addr);
+ pol = vma->vm_ops->get_policy(vma, addr);
else if (vma->vm_policy &&
vma->vm_policy->policy != MPOL_DEFAULT)
pol = vma->vm_policy;
@@ -687,7 +722,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
}
/* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
+static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
int nd;
@@ -700,7 +735,7 @@ static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempol
case MPOL_BIND:
/* Lower zones don't get a policy applied */
/* Careful: current->mems_allowed might have moved */
- if ((gfp & GFP_ZONEMASK) >= policy_zone)
+ if (gfp_zone(gfp) >= policy_zone)
if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
return policy->v.zonelist;
/*FALL THROUGH*/
@@ -712,7 +747,7 @@ static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempol
nd = 0;
BUG();
}
- return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
+ return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}
/* Do dynamic interleaving for a process */
@@ -722,10 +757,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
struct task_struct *me = current;
nid = me->il_next;
- BUG_ON(nid >= MAX_NUMNODES);
- next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
+ next = next_node(nid, policy->v.nodes);
if (next >= MAX_NUMNODES)
- next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
+ next = first_node(policy->v.nodes);
me->il_next = next;
return nid;
}
@@ -734,30 +768,28 @@ static unsigned interleave_nodes(struct mempolicy *policy)
static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
{
- unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
+ unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target = (unsigned)off % nnodes;
int c;
int nid = -1;
c = 0;
do {
- nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
+ nid = next_node(nid, pol->v.nodes);
c++;
} while (c <= target);
- BUG_ON(nid >= MAX_NUMNODES);
- BUG_ON(!test_bit(nid, pol->v.nodes));
return nid;
}
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+ unsigned nid)
{
struct zonelist *zl;
struct page *page;
- BUG_ON(!node_online(nid));
- zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
+ zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zl->zones[0]) {
zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
@@ -789,7 +821,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
* Should be called with the mm_sem of the vma hold.
*/
struct page *
-alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
@@ -799,8 +831,6 @@ alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned l
unsigned nid;
if (vma) {
unsigned long off;
- BUG_ON(addr >= vma->vm_end);
- BUG_ON(addr < vma->vm_start);
off = vma->vm_pgoff;
off += (addr - vma->vm_start) >> PAGE_SHIFT;
nid = offset_il_node(pol, vma, off);
@@ -832,7 +862,7 @@ alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned l
* 1) it's ok to take cpuset_sem (can WAIT), and
* 2) allocating for current task (not interrupt).
*/
-struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
+struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
@@ -878,7 +908,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_DEFAULT:
return 1;
case MPOL_INTERLEAVE:
- return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
+ return nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
return a->v.preferred_node == b->v.preferred_node;
case MPOL_BIND: {
@@ -1117,7 +1147,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
vma->vm_pgoff,
sz, npol? npol->policy : -1,
- npol ? npol->v.nodes[0] : -1);
+ npol ? nodes_addr(npol->v.nodes)[0] : -1);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1164,14 +1194,75 @@ void __init numa_policy_init(void)
/* Set interleaving policy for system init. This way not all
the data structures allocated at system boot end up in node zero. */
- if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
- MAX_NUMNODES) < 0)
+ if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
printk("numa_policy_init: interleaving failed\n");
}
-/* Reset policy of current process to default.
- * Assumes fs == KERNEL_DS */
+/* Reset policy of current process to default */
void numa_default_policy(void)
{
- sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+ do_set_mempolicy(MPOL_DEFAULT, NULL);
+}
+
+/* Migrate a policy to a different set of nodes */
+static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
+ const nodemask_t *new)
+{
+ nodemask_t tmp;
+
+ if (!pol)
+ return;
+
+ switch (pol->policy) {
+ case MPOL_DEFAULT:
+ break;
+ case MPOL_INTERLEAVE:
+ nodes_remap(tmp, pol->v.nodes, *old, *new);
+ pol->v.nodes = tmp;
+ current->il_next = node_remap(current->il_next, *old, *new);
+ break;
+ case MPOL_PREFERRED:
+ pol->v.preferred_node = node_remap(pol->v.preferred_node,
+ *old, *new);
+ break;
+ case MPOL_BIND: {
+ nodemask_t nodes;
+ struct zone **z;
+ struct zonelist *zonelist;
+
+ nodes_clear(nodes);
+ for (z = pol->v.zonelist->zones; *z; z++)
+ node_set((*z)->zone_pgdat->node_id, nodes);
+ nodes_remap(tmp, nodes, *old, *new);
+ nodes = tmp;
+
+ zonelist = bind_zonelist(&nodes);
+
+ /* If no mem, then zonelist is NULL and we keep old zonelist.
+ * If that old zonelist has no remaining mems_allowed nodes,
+ * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
+ */
+
+ if (zonelist) {
+ /* Good - got mem - substitute new zonelist */
+ kfree(pol->v.zonelist);
+ pol->v.zonelist = zonelist;
+ }
+ break;
+ }
+ default:
+ BUG();
+ break;
+ }
+}
+
+/*
+ * Someone moved this task to different nodes. Fixup mempolicies.
+ *
+ * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
+ * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ */
+void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+{
+ rebind_policy(current->mempolicy, old, new);
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 65f2957b8d5..1a99b80480d 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -112,7 +112,7 @@ EXPORT_SYMBOL(mempool_create_node);
* while this function is running. mempool_alloc() & mempool_free()
* might be called (eg. from IRQ contexts) while this function executes.
*/
-int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask)
+int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
{
void *element;
void **new_elements;
@@ -200,12 +200,12 @@ EXPORT_SYMBOL(mempool_destroy);
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
*/
-void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
+void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
wait_queue_t wait;
- unsigned int gfp_temp;
+ gfp_t gfp_temp;
might_sleep_if(gfp_mask & __GFP_WAIT);
@@ -276,7 +276,7 @@ EXPORT_SYMBOL(mempool_free);
/*
* A commonly used alloc and free fn.
*/
-void *mempool_alloc_slab(unsigned int __nocast gfp_mask, void *pool_data)
+void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
{
kmem_cache_t *mem = (kmem_cache_t *) pool_data;
return kmem_cache_alloc(mem, gfp_mask);
diff --git a/mm/mmap.c b/mm/mmap.c
index fa11d91242e..11ca5927d5f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -155,10 +155,6 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
return -ENOMEM;
}
-EXPORT_SYMBOL(sysctl_overcommit_memory);
-EXPORT_SYMBOL(sysctl_overcommit_ratio);
-EXPORT_SYMBOL(sysctl_max_map_count);
-EXPORT_SYMBOL(vm_committed_space);
EXPORT_SYMBOL(__vm_enough_memory);
/*
@@ -181,26 +177,36 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
}
/*
- * Remove one vm structure and free it.
+ * Unlink a file-based vm structure from its prio_tree, to hide
+ * vma from rmap and vmtruncate before freeing its page tables.
*/
-static void remove_vm_struct(struct vm_area_struct *vma)
+void unlink_file_vma(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
- might_sleep();
if (file) {
struct address_space *mapping = file->f_mapping;
spin_lock(&mapping->i_mmap_lock);
__remove_shared_vm_struct(vma, file, mapping);
spin_unlock(&mapping->i_mmap_lock);
}
+}
+
+/*
+ * Close a vm structure and free it, returning the next.
+ */
+static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+{
+ struct vm_area_struct *next = vma->vm_next;
+
+ might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (file)
- fput(file);
- anon_vma_unlink(vma);
+ if (vma->vm_file)
+ fput(vma->vm_file);
mpol_free(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
+ return next;
}
asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -832,7 +838,7 @@ none:
}
#ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
+void vm_stat_account(struct mm_struct *mm, unsigned long flags,
struct file *file, long pages)
{
const unsigned long stack_flags
@@ -1110,7 +1116,7 @@ munmap_back:
}
out:
mm->total_vm += len >> PAGE_SHIFT;
- __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+ vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
@@ -1475,15 +1481,19 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
mm->total_vm += grow;
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
- __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
return 0;
}
-#ifdef CONFIG_STACK_GROWSUP
+#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
/*
- * vma is the first one with address > vma->vm_end. Have to extend vma.
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
*/
-int expand_stack(struct vm_area_struct * vma, unsigned long address)
+#ifndef CONFIG_IA64
+static inline
+#endif
+int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
int error;
@@ -1521,6 +1531,13 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address)
anon_vma_unlock(vma);
return error;
}
+#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
+
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_upwards(vma, address);
+}
struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
@@ -1603,36 +1620,24 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
}
#endif
-/* Normal function to fix up a mapping
- * This function is the default for when an area has no specific
- * function. This may be used as part of a more specific routine.
- *
- * By the time this function is called, the area struct has been
- * removed from the process mapping list.
- */
-static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
-{
- size_t len = area->vm_end - area->vm_start;
-
- area->vm_mm->total_vm -= len >> PAGE_SHIFT;
- if (area->vm_flags & VM_LOCKED)
- area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
- vm_stat_unaccount(area);
- remove_vm_struct(area);
-}
-
/*
- * Update the VMA and inode share lists.
- *
- * Ok - we have the memory areas we should free on the 'free' list,
+ * Ok - we have the memory areas we should free on the vma list,
* so release them, and do the vma updates.
+ *
+ * Called with the mm semaphore held.
*/
-static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
+ /* Update high watermark before we lower total_vm */
+ update_hiwater_vm(mm);
do {
- struct vm_area_struct *next = vma->vm_next;
- unmap_vma(mm, vma);
- vma = next;
+ long nrpages = vma_pages(vma);
+
+ mm->total_vm -= nrpages;
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm -= nrpages;
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+ vma = remove_vma(vma);
} while (vma);
validate_mm(mm);
}
@@ -1651,14 +1656,13 @@ static void unmap_region(struct mm_struct *mm,
unsigned long nr_accounted = 0;
lru_add_drain();
- spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
- unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
+ update_hiwater_rss(mm);
+ unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
- spin_unlock(&mm->page_table_lock);
}
/*
@@ -1799,7 +1803,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
unmap_region(mm, vma, prev, start, end);
/* Fix up all other VM information */
- unmap_vma_list(mm, vma);
+ remove_vma_list(mm, vma);
return 0;
}
@@ -1821,7 +1825,7 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len)
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_VM
if (unlikely(down_read_trylock(&mm->mmap_sem))) {
WARN_ON(1);
up_read(&mm->mmap_sem);
@@ -1933,34 +1937,21 @@ void exit_mmap(struct mm_struct *mm)
unsigned long end;
lru_add_drain();
-
- spin_lock(&mm->page_table_lock);
-
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);
+ /* Don't update_hiwater_rss(mm) here, do_exit already did */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
+ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
- mm->mmap = mm->mmap_cache = NULL;
- mm->mm_rb = RB_ROOT;
- set_mm_counter(mm, rss, 0);
- mm->total_vm = 0;
- mm->locked_vm = 0;
-
- spin_unlock(&mm->page_table_lock);
-
/*
- * Walk the list again, actually closing and freeing it
- * without holding any MM locks.
+ * Walk the list again, actually closing and freeing it,
+ * with preemption enabled, without holding any MM locks.
*/
- while (vma) {
- struct vm_area_struct *next = vma->vm_next;
- remove_vm_struct(vma);
- vma = next;
- }
+ while (vma)
+ vma = remove_vma(vma);
BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 57577f63b30..653b8571c1e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
do {
if (pte_present(*pte)) {
pte_t ptent;
@@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
lazy_mmu_prot_update(ptent);
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
}
static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma,
change_pud_range(mm, pgd, addr, next, newprot);
} while (pgd++, addr = next, addr != end);
flush_tlb_range(vma, start, end);
- spin_unlock(&mm->page_table_lock);
}
static int
@@ -168,8 +167,8 @@ success:
vma->vm_flags = newflags;
vma->vm_page_prot = newprot;
change_protection(vma, start, end, newprot);
- __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
- __vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+ vm_stat_account(mm, newflags, vma->vm_file, nrpages);
return 0;
fail:
diff --git a/mm/mremap.c b/mm/mremap.c
index a32fed454bd..b535438c363 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,35 +22,7 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte = NULL;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_none_or_clear_bad(pgd))
- goto end;
-
- pud = pud_offset(pgd, addr);
- if (pud_none_or_clear_bad(pud))
- goto end;
-
- pmd = pmd_offset(pud, addr);
- if (pmd_none_or_clear_bad(pmd))
- goto end;
-
- pte = pte_offset_map_nested(pmd, addr);
- if (pte_none(*pte)) {
- pte_unmap_nested(pte);
- pte = NULL;
- }
-end:
- return pte;
-}
-
-static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
@@ -68,35 +40,39 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
if (pmd_none_or_clear_bad(pmd))
return NULL;
- return pte_offset_map(pmd, addr);
+ return pmd;
}
-static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
-
pud = pud_alloc(mm, pgd, addr);
if (!pud)
return NULL;
+
pmd = pmd_alloc(mm, pud, addr);
- if (pmd)
- pte = pte_alloc_map(mm, pmd, addr);
- return pte;
+ if (!pmd)
+ return NULL;
+
+ if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
+ return NULL;
+
+ return pmd;
}
-static int
-move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
- struct vm_area_struct *new_vma, unsigned long new_addr)
+static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+ unsigned long old_addr, unsigned long old_end,
+ struct vm_area_struct *new_vma, pmd_t *new_pmd,
+ unsigned long new_addr)
{
struct address_space *mapping = NULL;
struct mm_struct *mm = vma->vm_mm;
- int error = 0;
- pte_t *src, *dst;
+ pte_t *old_pte, *new_pte, pte;
+ spinlock_t *old_ptl, *new_ptl;
if (vma->vm_file) {
/*
@@ -111,74 +87,69 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
new_vma->vm_truncate_count != vma->vm_truncate_count)
new_vma->vm_truncate_count = 0;
}
- spin_lock(&mm->page_table_lock);
- src = get_one_pte_map_nested(mm, old_addr);
- if (src) {
- /*
- * Look to see whether alloc_one_pte_map needs to perform a
- * memory allocation. If it does then we need to drop the
- * atomic kmap
- */
- dst = get_one_pte_map(mm, new_addr);
- if (unlikely(!dst)) {
- pte_unmap_nested(src);
- if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
- dst = alloc_one_pte_map(mm, new_addr);
- if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
- spin_unlock(&mm->page_table_lock);
- spin_lock(&mapping->i_mmap_lock);
- spin_lock(&mm->page_table_lock);
- }
- src = get_one_pte_map_nested(mm, old_addr);
- }
- /*
- * Since alloc_one_pte_map can drop and re-acquire
- * page_table_lock, we should re-check the src entry...
- */
- if (src) {
- if (dst) {
- pte_t pte;
- pte = ptep_clear_flush(vma, old_addr, src);
- /* ZERO_PAGE can be dependant on virtual addr */
- if (pfn_valid(pte_pfn(pte)) &&
- pte_page(pte) == ZERO_PAGE(old_addr))
- pte = pte_wrprotect(mk_pte(ZERO_PAGE(new_addr), new_vma->vm_page_prot));
- set_pte_at(mm, new_addr, dst, pte);
- } else
- error = -ENOMEM;
- pte_unmap_nested(src);
- }
- if (dst)
- pte_unmap(dst);
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * pte locks because exclusive mmap_sem prevents deadlock.
+ */
+ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_ptl = pte_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock(new_ptl);
+
+ for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
+ new_pte++, new_addr += PAGE_SIZE) {
+ if (pte_none(*old_pte))
+ continue;
+ pte = ptep_clear_flush(vma, old_addr, old_pte);
+ /* ZERO_PAGE can be dependant on virtual addr */
+ pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+ set_pte_at(mm, new_addr, new_pte, pte);
}
- spin_unlock(&mm->page_table_lock);
+
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ pte_unmap_nested(new_pte - 1);
+ pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
- return error;
}
+#define LATENCY_LIMIT (64 * PAGE_SIZE)
+
static unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len)
{
- unsigned long offset;
+ unsigned long extent, next, old_end;
+ pmd_t *old_pmd, *new_pmd;
- flush_cache_range(vma, old_addr, old_addr + len);
+ old_end = old_addr + len;
+ flush_cache_range(vma, old_addr, old_end);
- /*
- * This is not the clever way to do this, but we're taking the
- * easy way out on the assumption that most remappings will be
- * only a few pages.. This also makes error recovery easier.
- */
- for (offset = 0; offset < len; offset += PAGE_SIZE) {
- if (move_one_page(vma, old_addr + offset,
- new_vma, new_addr + offset) < 0)
- break;
+ for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
+ next = (old_addr + PMD_SIZE) & PMD_MASK;
+ if (next - 1 > old_end)
+ next = old_end;
+ extent = next - old_addr;
+ old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+ if (!old_pmd)
+ continue;
+ new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+ if (!new_pmd)
+ break;
+ next = (new_addr + PMD_SIZE) & PMD_MASK;
+ if (extent > next - new_addr)
+ extent = next - new_addr;
+ if (extent > LATENCY_LIMIT)
+ extent = LATENCY_LIMIT;
+ move_ptes(vma, old_pmd, old_addr, old_addr + extent,
+ new_vma, new_pmd, new_addr);
}
- return offset;
+
+ return len + old_addr - old_end; /* how much done */
}
static unsigned long move_vma(struct vm_area_struct *vma,
@@ -191,6 +162,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long new_pgoff;
unsigned long moved_len;
unsigned long excess = 0;
+ unsigned long hiwater_vm;
int split = 0;
/*
@@ -229,17 +201,24 @@ static unsigned long move_vma(struct vm_area_struct *vma,
}
/*
- * if we failed to move page tables we still do total_vm increment
- * since do_munmap() will decrement it by old_len == new_len
+ * If we failed to move page tables we still do total_vm increment
+ * since do_munmap() will decrement it by old_len == new_len.
+ *
+ * Since total_vm is about to be raised artificially high for a
+ * moment, we need to restore high watermark afterwards: if stats
+ * are taken meanwhile, total_vm and hiwater_vm appear too high.
+ * If this were a serious issue, we'd add a flag to do_munmap().
*/
+ hiwater_vm = mm->hiwater_vm;
mm->total_vm += new_len >> PAGE_SHIFT;
- __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
if (do_munmap(mm, old_addr, old_len) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0;
}
+ mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
if (excess) {
@@ -269,6 +248,7 @@ unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
+ struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
@@ -309,7 +289,7 @@ unsigned long do_mremap(unsigned long addr,
if ((addr <= new_addr) && (addr+old_len) > new_addr)
goto out;
- ret = do_munmap(current->mm, new_addr, new_len);
+ ret = do_munmap(mm, new_addr, new_len);
if (ret)
goto out;
}
@@ -320,7 +300,7 @@ unsigned long do_mremap(unsigned long addr,
* do_munmap does all the needed commit accounting
*/
if (old_len >= new_len) {
- ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
+ ret = do_munmap(mm, addr+new_len, old_len - new_len);
if (ret && old_len != new_len)
goto out;
ret = addr;
@@ -333,7 +313,7 @@ unsigned long do_mremap(unsigned long addr,
* Ok, we need to grow.. or relocate.
*/
ret = -EFAULT;
- vma = find_vma(current->mm, addr);
+ vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
goto out;
if (is_vm_hugetlb_page(vma)) {
@@ -349,14 +329,14 @@ unsigned long do_mremap(unsigned long addr,
}
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
- locked = current->mm->locked_vm << PAGE_SHIFT;
+ locked = mm->locked_vm << PAGE_SHIFT;
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
locked += new_len - old_len;
ret = -EAGAIN;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
goto out;
}
- if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) {
+ if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
ret = -ENOMEM;
goto out;
}
@@ -383,11 +363,10 @@ unsigned long do_mremap(unsigned long addr,
vma_adjust(vma, vma->vm_start,
addr + new_len, vma->vm_pgoff, NULL);
- current->mm->total_vm += pages;
- __vm_stat_account(vma->vm_mm, vma->vm_flags,
- vma->vm_file, pages);
+ mm->total_vm += pages;
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
if (vma->vm_flags & VM_LOCKED) {
- current->mm->locked_vm += pages;
+ mm->locked_vm += pages;
make_pages_present(addr + old_len,
addr + new_len);
}
diff --git a/mm/msync.c b/mm/msync.c
index d0f5a1bce7c..1b5b6f662dc 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -17,40 +17,43 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
-/*
- * Called with mm->page_table_lock held to protect against other
- * threads/the swapper from ripping pte's out from under us.
- */
-
-static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end)
{
pte_t *pte;
+ spinlock_t *ptl;
+ int progress = 0;
- pte = pte_offset_map(pmd, addr);
+again:
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
- unsigned long pfn;
struct page *page;
+ if (progress >= 64) {
+ progress = 0;
+ if (need_resched() || need_lockbreak(ptl))
+ break;
+ }
+ progress++;
if (!pte_present(*pte))
continue;
if (!pte_maybe_dirty(*pte))
continue;
- pfn = pte_pfn(*pte);
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (PageReserved(page))
+ page = vm_normal_page(vma, addr, *pte);
+ if (!page)
continue;
-
if (ptep_clear_flush_dirty(vma, addr, pte) ||
page_test_and_clear_dirty(page))
set_page_dirty(page);
+ progress += 3;
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ if (addr != end)
+ goto again;
}
-static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end)
{
pmd_t *pmd;
@@ -61,11 +64,11 @@ static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- sync_pte_range(vma, pmd, addr, next);
+ msync_pte_range(vma, pmd, addr, next);
} while (pmd++, addr = next, addr != end);
}
-static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end)
{
pud_t *pud;
@@ -76,58 +79,33 @@ static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- sync_pmd_range(vma, pud, addr, next);
+ msync_pmd_range(vma, pud, addr, next);
} while (pud++, addr = next, addr != end);
}
-static void sync_page_range(struct vm_area_struct *vma,
+static void msync_page_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
- struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
/* For hugepages we can't go walking the page table normally,
* but that's ok, hugetlbfs is memory based, so we don't need
- * to do anything more on an msync() */
- if (is_vm_hugetlb_page(vma))
+ * to do anything more on an msync().
+ */
+ if (vma->vm_flags & VM_HUGETLB)
return;
BUG_ON(addr >= end);
- pgd = pgd_offset(mm, addr);
+ pgd = pgd_offset(vma->vm_mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- sync_pud_range(vma, pgd, addr, next);
+ msync_pud_range(vma, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
-}
-
-#ifdef CONFIG_PREEMPT
-static inline void filemap_sync(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
-{
- const size_t chunk = 64 * 1024; /* bytes */
- unsigned long next;
-
- do {
- next = addr + chunk;
- if (next > end || next < addr)
- next = end;
- sync_page_range(vma, addr, next);
- cond_resched();
- } while (addr = next, addr != end);
-}
-#else
-static inline void filemap_sync(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
-{
- sync_page_range(vma, addr, end);
}
-#endif
/*
* MS_SYNC syncs the entire file - including mappings.
@@ -150,7 +128,7 @@ static int msync_interval(struct vm_area_struct *vma,
return -EBUSY;
if (file && (vma->vm_flags & VM_SHARED)) {
- filemap_sync(vma, addr, end);
+ msync_page_range(vma, addr, end);
if (flags & MS_SYNC) {
struct address_space *mapping = file->f_mapping;
diff --git a/mm/nommu.c b/mm/nommu.c
index 064d7044289..c1196812876 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -44,10 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int heap_stack_gap = 0;
EXPORT_SYMBOL(mem_map);
-EXPORT_SYMBOL(sysctl_max_map_count);
-EXPORT_SYMBOL(sysctl_overcommit_memory);
-EXPORT_SYMBOL(sysctl_overcommit_ratio);
-EXPORT_SYMBOL(vm_committed_space);
EXPORT_SYMBOL(__vm_enough_memory);
/* list of shareable VMAs */
@@ -157,8 +153,7 @@ void vfree(void *addr)
kfree(addr);
}
-void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask,
- pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
/*
* kmalloc doesn't like __GFP_HIGHMEM for some reason
@@ -932,6 +927,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
realalloc -= kobjsize(vml);
askedalloc -= sizeof(*vml);
kfree(vml);
+
+ update_hiwater_vm(mm);
mm->total_vm -= len >> PAGE_SHIFT;
#ifdef DEBUG
@@ -1048,7 +1045,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL(find_vma);
-struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
{
return NULL;
}
@@ -1079,19 +1077,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
}
-void update_mem_hiwater(struct task_struct *tsk)
-{
- unsigned long rss;
-
- if (likely(tsk->mm)) {
- rss = get_mm_counter(tsk->mm, rss);
- if (tsk->mm->hiwater_rss < rss)
- tsk->mm->hiwater_rss = rss;
- if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
- tsk->mm->hiwater_vm = tsk->mm->total_vm;
- }
-}
-
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen,
int even_cows)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ac3bf33e537..d348b903595 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -263,7 +263,7 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-void out_of_memory(unsigned int __nocast gfp_mask, int order)
+void out_of_memory(gfp_t gfp_mask, int order)
{
struct mm_struct *mm = NULL;
task_t * p;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ae2903339e7..3b21a13d841 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -33,6 +33,7 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
@@ -59,11 +60,13 @@ long nr_swap_pages;
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
* HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we normally
+ * don't need any ZONE_NORMAL reservation
*/
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
EXPORT_SYMBOL(totalram_pages);
-EXPORT_SYMBOL(nr_swap_pages);
/*
* Used by page_zone() to look up the address of the struct zone whose
@@ -72,27 +75,50 @@ EXPORT_SYMBOL(nr_swap_pages);
struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
EXPORT_SYMBOL(zone_table);
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
int min_free_kbytes = 1024;
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
+{
+ int ret = 0;
+ unsigned seq;
+ unsigned long pfn = page_to_pfn(page);
+
+ do {
+ seq = zone_span_seqbegin(zone);
+ if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+ ret = 1;
+ else if (pfn < zone->zone_start_pfn)
+ ret = 1;
+ } while (zone_span_seqretry(zone, seq));
+
+ return ret;
+}
+
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
+#ifdef CONFIG_HOLES_IN_ZONE
+ if (!pfn_valid(page_to_pfn(page)))
+ return 0;
+#endif
+ if (zone != page_zone(page))
+ return 0;
+
+ return 1;
+}
/*
* Temporary debugging check for pages not lying within a given zone.
*/
static int bad_range(struct zone *zone, struct page *page)
{
- if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
- return 1;
- if (page_to_pfn(page) < zone->zone_start_pfn)
- return 1;
-#ifdef CONFIG_HOLES_IN_ZONE
- if (!pfn_valid(page_to_pfn(page)))
+ if (page_outside_zone_boundaries(zone, page))
return 1;
-#endif
- if (zone != page_zone(page))
+ if (!page_is_consistent(zone, page))
return 1;
+
return 0;
}
@@ -101,7 +127,7 @@ static void bad_page(const char *function, struct page *page)
printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
function, current->comm, page);
printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
- (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
+ (int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
page->mapping, page_mapcount(page), page_count(page));
printk(KERN_EMERG "Backtrace:\n");
dump_stack();
@@ -114,17 +140,13 @@ static void bad_page(const char *function, struct page *page)
1 << PG_reclaim |
1 << PG_slab |
1 << PG_swapcache |
- 1 << PG_writeback);
+ 1 << PG_writeback );
set_page_count(page, 0);
reset_page_mapcount(page);
page->mapping = NULL;
add_taint(TAINT_BAD_PAGE);
}
-#ifndef CONFIG_HUGETLB_PAGE
-#define prep_compound_page(page, order) do { } while (0)
-#define destroy_compound_page(page, order) do { } while (0)
-#else
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
@@ -153,7 +175,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
struct page *p = page + i;
SetPageCompound(p);
- p->private = (unsigned long)page;
+ set_page_private(p, (unsigned long)page);
}
}
@@ -173,12 +195,11 @@ static void destroy_compound_page(struct page *page, unsigned long order)
if (!PageCompound(p))
bad_page(__FUNCTION__, page);
- if (p->private != (unsigned long)page)
+ if (page_private(p) != (unsigned long)page)
bad_page(__FUNCTION__, page);
ClearPageCompound(p);
}
}
-#endif /* CONFIG_HUGETLB_PAGE */
/*
* function for dealing with page's order in buddy system.
@@ -186,18 +207,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
* So, we don't need atomic page->flags operations here.
*/
static inline unsigned long page_order(struct page *page) {
- return page->private;
+ return page_private(page);
}
static inline void set_page_order(struct page *page, int order) {
- page->private = order;
+ set_page_private(page, order);
__SetPagePrivate(page);
}
static inline void rmv_page_order(struct page *page)
{
__ClearPagePrivate(page);
- page->private = 0;
+ set_page_private(page, 0);
}
/*
@@ -237,14 +258,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* (a) the buddy is free &&
* (b) the buddy is on the buddy system &&
* (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
*
*/
static inline int page_is_buddy(struct page *page, int order)
{
if (PagePrivate(page) &&
(page_order(page) == order) &&
- !PageReserved(page) &&
page_count(page) == 0)
return 1;
return 0;
@@ -264,7 +284,7 @@ static inline int page_is_buddy(struct page *page, int order)
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
* free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
@@ -314,7 +334,7 @@ static inline void __free_pages_bulk (struct page *page,
zone->free_area[order].nr_free++;
}
-static inline void free_pages_check(const char *function, struct page *page)
+static inline int free_pages_check(const char *function, struct page *page)
{
if ( page_mapcount(page) ||
page->mapping != NULL ||
@@ -327,10 +347,17 @@ static inline void free_pages_check(const char *function, struct page *page)
1 << PG_reclaim |
1 << PG_slab |
1 << PG_swapcache |
- 1 << PG_writeback )))
+ 1 << PG_writeback |
+ 1 << PG_reserved )))
bad_page(function, page);
if (PageDirty(page))
__ClearPageDirty(page);
+ /*
+ * For now, we report if PG_reserved was found set, but do not
+ * clear it, and do not free the page. But we shall soon need
+ * to do more, for when the ZERO_PAGE count wraps negative.
+ */
+ return PageReserved(page);
}
/*
@@ -370,11 +397,10 @@ void __free_pages_ok(struct page *page, unsigned int order)
{
LIST_HEAD(list);
int i;
+ int reserved = 0;
arch_free_page(page, order);
- mod_page_state(pgfree, 1 << order);
-
#ifndef CONFIG_MMU
if (order > 0)
for (i = 1 ; i < (1 << order) ; ++i)
@@ -382,8 +408,12 @@ void __free_pages_ok(struct page *page, unsigned int order)
#endif
for (i = 0 ; i < (1 << order) ; ++i)
- free_pages_check(__FUNCTION__, page + i);
+ reserved += free_pages_check(__FUNCTION__, page + i);
+ if (reserved)
+ return;
+
list_add(&page->lru, &list);
+ mod_page_state(pgfree, 1 << order);
kernel_map_pages(page, 1<<order, 0);
free_pages_bulk(page_zone(page), 1, &list, order);
}
@@ -441,7 +471,7 @@ void set_page_refs(struct page *page, int order)
/*
* This page is about to be returned from the page allocator
*/
-static void prep_new_page(struct page *page, int order)
+static int prep_new_page(struct page *page, int order)
{
if ( page_mapcount(page) ||
page->mapping != NULL ||
@@ -455,15 +485,24 @@ static void prep_new_page(struct page *page, int order)
1 << PG_reclaim |
1 << PG_slab |
1 << PG_swapcache |
- 1 << PG_writeback )))
+ 1 << PG_writeback |
+ 1 << PG_reserved )))
bad_page(__FUNCTION__, page);
+ /*
+ * For now, we report if PG_reserved was found set, but do not
+ * clear it, and do not allocate the page: as a safety net.
+ */
+ if (PageReserved(page))
+ return 1;
+
page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked | 1 << PG_mappedtodisk);
- page->private = 0;
+ set_page_private(page, 0);
set_page_refs(page, order);
kernel_map_pages(page, 1 << order, 1);
+ return 0;
}
/*
@@ -646,11 +685,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
arch_free_page(page, 0);
- kernel_map_pages(page, 1, 0);
- inc_page_state(pgfree);
if (PageAnon(page))
page->mapping = NULL;
- free_pages_check(__FUNCTION__, page);
+ if (free_pages_check(__FUNCTION__, page))
+ return;
+
+ inc_page_state(pgfree);
+ kernel_map_pages(page, 1, 0);
+
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
list_add(&page->lru, &pcp->list);
@@ -671,7 +713,7 @@ void fastcall free_cold_page(struct page *page)
free_hot_cold_page(page, 1);
}
-static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags)
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
{
int i;
@@ -686,15 +728,17 @@ static inline void prep_zero_page(struct page *page, int order, unsigned int __n
* or two.
*/
static struct page *
-buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
+buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
{
unsigned long flags;
- struct page *page = NULL;
+ struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
+again:
if (order == 0) {
struct per_cpu_pages *pcp;
+ page = NULL;
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
if (pcp->count <= pcp->low)
@@ -707,9 +751,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
}
local_irq_restore(flags);
put_cpu();
- }
-
- if (page == NULL) {
+ } else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -718,7 +760,8 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
if (page != NULL) {
BUG_ON(bad_range(zone, page));
mod_page_state_zone(zone, pgalloc, 1 << order);
- prep_new_page(page, order);
+ if (prep_new_page(page, order))
+ goto again;
if (gfp_flags & __GFP_ZERO)
prep_zero_page(page, order, gfp_flags);
@@ -729,20 +772,28 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
return page;
}
+#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
+#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
+#define ALLOC_HARDER 0x10 /* try to alloc harder */
+#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+
/*
* Return 1 if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int can_try_harder, int gfp_high)
+ int classzone_idx, int alloc_flags)
{
/* free_pages my go negative - that's OK */
long min = mark, free_pages = z->free_pages - (1 << order) + 1;
int o;
- if (gfp_high)
+ if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
- if (can_try_harder)
+ if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -760,123 +811,127 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return 1;
}
-static inline int
-should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+/*
+ * get_page_from_freeliest goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, int alloc_flags)
{
- if (!z->reclaim_pages)
- return 0;
- if (gfp_mask & __GFP_NORECLAIM)
- return 0;
- return 1;
+ struct zone **z = zonelist->zones;
+ struct page *page = NULL;
+ int classzone_idx = zone_idx(*z);
+
+ /*
+ * Go through the zonelist once, looking for a zone with enough free.
+ * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ */
+ do {
+ if ((alloc_flags & ALLOC_CPUSET) &&
+ !cpuset_zone_allowed(*z, gfp_mask))
+ continue;
+
+ if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+ unsigned long mark;
+ if (alloc_flags & ALLOC_WMARK_MIN)
+ mark = (*z)->pages_min;
+ else if (alloc_flags & ALLOC_WMARK_LOW)
+ mark = (*z)->pages_low;
+ else
+ mark = (*z)->pages_high;
+ if (!zone_watermark_ok(*z, order, mark,
+ classzone_idx, alloc_flags))
+ continue;
+ }
+
+ page = buffered_rmqueue(*z, order, gfp_mask);
+ if (page) {
+ zone_statistics(zonelist, *z);
+ break;
+ }
+ } while (*(++z) != NULL);
+ return page;
}
/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page * fastcall
-__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
- const int wait = gfp_mask & __GFP_WAIT;
- struct zone **zones, *z;
+ const gfp_t wait = gfp_mask & __GFP_WAIT;
+ struct zone **z;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
- int i;
- int classzone_idx;
int do_retry;
- int can_try_harder;
+ int alloc_flags;
int did_some_progress;
might_sleep_if(wait);
- /*
- * The caller may dip into page reserves a bit more if the caller
- * cannot run direct reclaim, or is the caller has realtime scheduling
- * policy
- */
- can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
-
- zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
+restart:
+ z = zonelist->zones; /* the list of zones suitable for gfp_mask */
- if (unlikely(zones[0] == NULL)) {
+ if (unlikely(*z == NULL)) {
/* Should this ever happen?? */
return NULL;
}
- classzone_idx = zone_idx(zones[0]);
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+ zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+ if (page)
+ goto got_pg;
+
+ do {
+ wakeup_kswapd(*z, order);
+ } while (*(++z));
-restart:
/*
- * Go through the zonelist once, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * OK, we're below the kswapd watermark and have kicked background
+ * reclaim. Now things get more complex, so set up alloc_flags according
+ * to how we want to proceed.
+ *
+ * The caller may dip into page reserves a bit more if the caller
+ * cannot run direct reclaim, or if the caller has realtime scheduling
+ * policy.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- /*
- * If the zone is to attempt early page reclaim then this loop
- * will try to reclaim pages and check the watermark a second
- * time before giving up and falling back to the next zone.
- */
-zone_reclaim_retry:
- if (!zone_watermark_ok(z, order, z->pages_low,
- classzone_idx, 0, 0)) {
- if (!do_reclaim)
- continue;
- else {
- zone_reclaim(z, gfp_mask, order);
- /* Only try reclaim once */
- do_reclaim = 0;
- goto zone_reclaim_retry;
- }
- }
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
-
- for (i = 0; (z = zones[i]) != NULL; i++)
- wakeup_kswapd(z, order);
+ alloc_flags = ALLOC_WMARK_MIN;
+ if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+ alloc_flags |= ALLOC_HARDER;
+ if (gfp_mask & __GFP_HIGH)
+ alloc_flags |= ALLOC_HIGH;
+ if (wait)
+ alloc_flags |= ALLOC_CPUSET;
/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
- * coming from realtime tasks to go deeper into reserves
+ * coming from realtime tasks go deeper into reserves.
*
* This is the last chance, in general, before the goto nopage.
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (wait && !cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+ if (page)
+ goto got_pg;
/* This allocation should allow future memory freeing. */
if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+nofail_alloc:
/* go through the zonelist yet again, ignoring mins */
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
+ page = get_page_from_freelist(gfp_mask, order,
+ zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+ if (page)
+ goto got_pg;
+ if (gfp_mask & __GFP_NOFAIL) {
+ blk_congestion_wait(WRITE, HZ/50);
+ goto nofail_alloc;
}
}
goto nopage;
@@ -894,7 +949,7 @@ rebalance:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zones, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
@@ -902,19 +957,10 @@ rebalance:
cond_resched();
if (likely(did_some_progress)) {
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask, order,
+ zonelist, alloc_flags);
+ if (page)
+ goto got_pg;
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
/*
* Go through the zonelist yet one more time, keep
@@ -922,18 +968,10 @@ rebalance:
* a parallel oom killing, we must fail if we're still
* under heavy pressure.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_high,
- classzone_idx, 0, 0))
- continue;
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+ zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+ if (page)
+ goto got_pg;
out_of_memory(gfp_mask, order);
goto restart;
@@ -966,9 +1004,7 @@ nopage:
dump_stack();
show_mem();
}
- return NULL;
got_pg:
- zone_statistics(zonelist, z);
return page;
}
@@ -977,7 +1013,7 @@ EXPORT_SYMBOL(__alloc_pages);
/*
* Common helper functions.
*/
-fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order)
+fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
struct page * page;
page = alloc_pages(gfp_mask, order);
@@ -988,7 +1024,7 @@ fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned
EXPORT_SYMBOL(__get_free_pages);
-fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask)
+fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
{
struct page * page;
@@ -996,7 +1032,7 @@ fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask)
* get_zeroed_page() returns a 32-bit address, which cannot represent
* a highmem page
*/
- BUG_ON(gfp_mask & __GFP_HIGHMEM);
+ BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
if (page)
@@ -1016,7 +1052,7 @@ void __pagevec_free(struct pagevec *pvec)
fastcall void __free_pages(struct page *page, unsigned int order)
{
- if (!PageReserved(page) && put_page_testzero(page)) {
+ if (put_page_testzero(page)) {
if (order == 0)
free_hot_page(page);
else
@@ -1089,7 +1125,7 @@ static unsigned int nr_free_zone_pages(int offset)
*/
unsigned int nr_free_buffer_pages(void)
{
- return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
+ return nr_free_zone_pages(gfp_zone(GFP_USER));
}
/*
@@ -1097,7 +1133,7 @@ unsigned int nr_free_buffer_pages(void)
*/
unsigned int nr_free_pagecache_pages(void)
{
- return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
+ return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
}
#ifdef CONFIG_HIGHMEM
@@ -1305,12 +1341,9 @@ void show_free_areas(void)
} else
printk("\n");
- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;
- if (!cpu_possible(cpu))
- continue;
-
pageset = zone_pcp(zone, cpu);
for (temperature = 0; temperature < 2; temperature++)
@@ -1419,6 +1452,10 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
zone = pgdat->node_zones + ZONE_NORMAL;
if (zone->present_pages)
zonelist->zones[j++] = zone;
+ case ZONE_DMA32:
+ zone = pgdat->node_zones + ZONE_DMA32;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
case ZONE_DMA:
zone = pgdat->node_zones + ZONE_DMA;
if (zone->present_pages)
@@ -1428,6 +1465,18 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
return j;
}
+static inline int highest_zone(int zone_bits)
+{
+ int res = ZONE_NORMAL;
+ if (zone_bits & (__force int)__GFP_HIGHMEM)
+ res = ZONE_HIGHMEM;
+ if (zone_bits & (__force int)__GFP_DMA32)
+ res = ZONE_DMA32;
+ if (zone_bits & (__force int)__GFP_DMA)
+ res = ZONE_DMA;
+ return res;
+}
+
#ifdef CONFIG_NUMA
#define MAX_NODE_LOAD (num_online_nodes())
static int __initdata node_load[MAX_NUMNODES];
@@ -1524,11 +1573,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
zonelist = pgdat->node_zonelists + i;
for (j = 0; zonelist->zones[j] != NULL; j++);
- k = ZONE_NORMAL;
- if (i & __GFP_HIGHMEM)
- k = ZONE_HIGHMEM;
- if (i & __GFP_DMA)
- k = ZONE_DMA;
+ k = highest_zone(i);
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
zonelist->zones[j] = NULL;
@@ -1549,12 +1594,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
zonelist = pgdat->node_zonelists + i;
j = 0;
- k = ZONE_NORMAL;
- if (i & __GFP_HIGHMEM)
- k = ZONE_HIGHMEM;
- if (i & __GFP_DMA)
- k = ZONE_DMA;
-
+ k = highest_zone(i);
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
@@ -1659,7 +1699,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
-void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn)
{
struct page *page;
@@ -1673,7 +1713,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
continue;
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
- set_page_count(page, 0);
+ set_page_count(page, 1);
reset_page_mapcount(page);
SetPageReserved(page);
INIT_LIST_HEAD(&page->lru);
@@ -1720,14 +1760,13 @@ static int __devinit zone_batchsize(struct zone *zone)
/*
* The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/4 of a meg - there's
- * no point in going beyond the size of L2 cache.
+ * size of the zone. But no more than 1/2 of a meg.
*
* OK, so we don't know how big the cache is. So guess.
*/
batch = zone->present_pages / 1024;
- if (batch * PAGE_SIZE > 256 * 1024)
- batch = (256 * 1024) / PAGE_SIZE;
+ if (batch * PAGE_SIZE > 512 * 1024)
+ batch = (512 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
@@ -1742,7 +1781,8 @@ static int __devinit zone_batchsize(struct zone *zone)
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
*/
- batch = (1 << fls(batch + batch/2)) - 1;
+ batch = (1 << (fls(batch + batch/2)-1)) - 1;
+
return batch;
}
@@ -1750,9 +1790,11 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
struct per_cpu_pages *pcp;
+ memset(p, 0, sizeof(*p));
+
pcp = &p->pcp[0]; /* hot */
pcp->count = 0;
- pcp->low = 2 * batch;
+ pcp->low = 0;
pcp->high = 6 * batch;
pcp->batch = max(1UL, 1 * batch);
INIT_LIST_HEAD(&pcp->list);
@@ -1761,7 +1803,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
pcp->count = 0;
pcp->low = 0;
pcp->high = 2 * batch;
- pcp->batch = max(1UL, 1 * batch);
+ pcp->batch = max(1UL, batch/2);
INIT_LIST_HEAD(&pcp->list);
}
@@ -1841,11 +1883,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
if (process_zones(cpu))
ret = NOTIFY_BAD;
break;
-#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
case CPU_DEAD:
free_zone_pagesets(cpu);
break;
-#endif
default:
break;
}
@@ -1870,6 +1911,60 @@ void __init setup_per_cpu_pageset()
#endif
+static __devinit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+ int i;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+
+ /*
+ * The per-page waitqueue mechanism uses hashed waitqueues
+ * per zone.
+ */
+ zone->wait_table_size = wait_table_size(zone_size_pages);
+ zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
+ zone->wait_table = (wait_queue_head_t *)
+ alloc_bootmem_node(pgdat, zone->wait_table_size
+ * sizeof(wait_queue_head_t));
+
+ for(i = 0; i < zone->wait_table_size; ++i)
+ init_waitqueue_head(zone->wait_table + i);
+}
+
+static __devinit void zone_pcp_init(struct zone *zone)
+{
+ int cpu;
+ unsigned long batch = zone_batchsize(zone);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+ /* Early boot. Slab allocator not functional yet */
+ zone->pageset[cpu] = &boot_pageset[cpu];
+ setup_pageset(&boot_pageset[cpu],0);
+#else
+ setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+ }
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
+ zone->name, zone->present_pages, batch);
+}
+
+static __devinit void init_currently_empty_zone(struct zone *zone,
+ unsigned long zone_start_pfn, unsigned long size)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+
+ zone_wait_table_init(zone, size);
+ pgdat->nr_zones = zone_idx(zone) + 1;
+
+ zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+ zone->zone_start_pfn = zone_start_pfn;
+
+ memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+ zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1879,10 +1974,11 @@ void __init setup_per_cpu_pageset()
static void __init free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
- unsigned long i, j;
- int cpu, nid = pgdat->node_id;
+ unsigned long j;
+ int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
+ pgdat_resize_init(pgdat);
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
@@ -1890,13 +1986,12 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize;
- unsigned long batch;
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
- if (j == ZONE_DMA || j == ZONE_NORMAL)
+ if (j < ZONE_HIGHMEM)
nr_kernel_pages += realsize;
nr_all_pages += realsize;
@@ -1905,24 +2000,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
+ zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
- batch = zone_batchsize(zone);
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
- /* Early boot. Slab allocator not functional yet */
- zone->pageset[cpu] = &boot_pageset[cpu];
- setup_pageset(&boot_pageset[cpu],0);
-#else
- setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
- }
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone_names[j], realsize, batch);
+ zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
zone->nr_scan_active = 0;
@@ -1933,32 +2017,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
if (!size)
continue;
- /*
- * The per-page waitqueue mechanism uses hashed waitqueues
- * per zone.
- */
- zone->wait_table_size = wait_table_size(size);
- zone->wait_table_bits =
- wait_table_bits(zone->wait_table_size);
- zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, zone->wait_table_size
- * sizeof(wait_queue_head_t));
-
- for(i = 0; i < zone->wait_table_size; ++i)
- init_waitqueue_head(zone->wait_table + i);
-
- pgdat->nr_zones = j+1;
-
- zone->zone_mem_map = pfn_to_page(zone_start_pfn);
- zone->zone_start_pfn = zone_start_pfn;
-
- memmap_init(size, nid, j, zone_start_pfn);
-
zonetable_add(zone, nid, j, zone_start_pfn, size);
-
+ init_currently_empty_zone(zone, zone_start_pfn, size);
zone_start_pfn += size;
-
- zone_init_free_lists(pgdat, zone, zone->spanned_pages);
}
}
@@ -2358,7 +2419,7 @@ static void setup_per_zone_lowmem_reserve(void)
* that the pages_{min,low,high} values for each zone are set correctly
* with respect to min_free_kbytes.
*/
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
@@ -2372,13 +2433,18 @@ static void setup_per_zone_pages_min(void)
}
for_each_zone(zone) {
+ unsigned long tmp;
spin_lock_irqsave(&zone->lru_lock, flags);
+ tmp = (pages_min * zone->present_pages) / lowmem_pages;
if (is_highmem(zone)) {
/*
- * Often, highmem doesn't need to reserve any pages.
- * But the pages_min/low/high values are also used for
- * batching up page reclaim activity so we need a
- * decent value here.
+ * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+ * need highmem pages, so cap pages_min to a small
+ * value here.
+ *
+ * The (pages_high-pages_low) and (pages_low-pages_min)
+ * deltas controls asynch page reclaim, and so should
+ * not be capped for highmem.
*/
int min_pages;
@@ -2389,19 +2455,15 @@ static void setup_per_zone_pages_min(void)
min_pages = 128;
zone->pages_min = min_pages;
} else {
- /* if it's a lowmem zone, reserve a number of pages
+ /*
+ * If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->pages_min = (pages_min * zone->present_pages) /
- lowmem_pages;
+ zone->pages_min = tmp;
}
- /*
- * When interpreting these watermarks, just keep in mind that:
- * zone->pages_min == (zone->pages_min * 4) / 4;
- */
- zone->pages_low = (zone->pages_min * 5) / 4;
- zone->pages_high = (zone->pages_min * 6) / 4;
+ zone->pages_low = zone->pages_min + tmp / 4;
+ zone->pages_high = zone->pages_min + tmp / 2;
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 2e605a19ce5..bb2b0d53889 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -19,7 +19,7 @@
#include <linux/writeback.h>
#include <asm/pgtable.h>
-static struct bio *get_swap_bio(unsigned int __nocast gfp_flags, pgoff_t index,
+static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
struct page *page, bio_end_io_t end_io)
{
struct bio *bio;
@@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
unlock_page(page);
goto out;
}
- bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
+ bio = get_swap_bio(GFP_NOIO, page_private(page), page,
+ end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
@@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page)
BUG_ON(!PageLocked(page));
ClearPageUptodate(page);
- bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
+ bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
+ end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
ret = -ENOMEM;
diff --git a/mm/pdflush.c b/mm/pdflush.c
index d6781951267..52822c98c48 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -20,6 +20,7 @@
#include <linux/fs.h> // Needed by writeback.h
#include <linux/writeback.h> // Prototypes pdflush_operation()
#include <linux/kthread.h>
+#include <linux/cpuset.h>
/*
@@ -170,12 +171,24 @@ static int __pdflush(struct pdflush_work *my_work)
static int pdflush(void *dummy)
{
struct pdflush_work my_work;
+ cpumask_t cpus_allowed;
/*
* pdflush can spend a lot of time doing encryption via dm-crypt. We
* don't want to do that at keventd's priority.
*/
set_user_nice(current, 0);
+
+ /*
+ * Some configs put our parent kthread in a limited cpuset,
+ * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL.
+ * Our needs are more modest - cut back to our cpusets cpus_allowed.
+ * This is needed as pdflush's are dynamically created and destroyed.
+ * The boottime pdflush's are easily placed w/o these 2 lines.
+ */
+ cpus_allowed = cpuset_cpus_allowed(current);
+ set_cpus_allowed(current, cpus_allowed);
+
return __pdflush(&my_work);
}
diff --git a/mm/readahead.c b/mm/readahead.c
index d0b50034e24..72e7adbb87c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -254,7 +254,7 @@ out:
*/
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read)
+ pgoff_t offset, unsigned long nr_to_read)
{
struct inode *inode = mapping->host;
struct page *page;
@@ -274,7 +274,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
*/
read_lock_irq(&mapping->tree_lock);
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
- unsigned long page_offset = offset + page_idx;
+ pgoff_t page_offset = offset + page_idx;
if (page_offset > end_index)
break;
@@ -311,7 +311,7 @@ out:
* memory at once.
*/
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read)
+ pgoff_t offset, unsigned long nr_to_read)
{
int ret = 0;
@@ -368,7 +368,7 @@ static inline int check_ra_success(struct file_ra_state *ra,
* request queues.
*/
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read)
+ pgoff_t offset, unsigned long nr_to_read)
{
if (bdi_read_congested(mapping->backing_dev_info))
return -1;
@@ -385,7 +385,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
*/
static int
blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read,
+ pgoff_t offset, unsigned long nr_to_read,
struct file_ra_state *ra, int block)
{
int actual;
@@ -430,14 +430,27 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
return ret;
}
-/*
- * page_cache_readahead is the main function. If performs the adaptive
+/**
+ * page_cache_readahead - generic adaptive readahead
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
+ * @req_size: hint: total size of the read which the caller is performing in
+ * PAGE_CACHE_SIZE units
+ *
+ * page_cache_readahead() is the main function. If performs the adaptive
* readahead window size management and submits the readahead I/O.
+ *
+ * Note that @filp is purely used for passing on to the ->readpage[s]()
+ * handler: it may refer to a different file from @mapping (so we may not use
+ * @filp->f_mapping or @filp->f_dentry->d_inode here).
+ * Also, @ra may not be equal to &@filp->f_ra.
+ *
*/
unsigned long
page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
- struct file *filp, unsigned long offset,
- unsigned long req_size)
+ struct file *filp, pgoff_t offset, unsigned long req_size)
{
unsigned long max, newsize;
int sequential;
diff --git a/mm/rmap.c b/mm/rmap.c
index 450f5241b5a..f853c6def15 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,7 +32,7 @@
* page->flags PG_locked (lock_page)
* mapping->i_mmap_lock
* anon_vma->lock
- * mm->page_table_lock
+ * mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
@@ -225,7 +225,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
/*
* At what user virtual address is page expected in vma? checking that the
- * page matches the vma: currently only used by unuse_process, on anon pages.
+ * page matches the vma: currently only used on anon pages, by unuse_vma;
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
@@ -234,7 +234,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
(void *)page->mapping - PAGE_MAPPING_ANON)
return -EFAULT;
} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
- if (vma->vm_file->f_mapping != page->mapping)
+ if (!vma->vm_file ||
+ vma->vm_file->f_mapping != page->mapping)
return -EFAULT;
} else
return -EFAULT;
@@ -244,37 +245,44 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
/*
* Check that @page is mapped at @address into @mm.
*
- * On success returns with mapped pte and locked mm->page_table_lock.
+ * On success returns with pte mapped and locked.
*/
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
- unsigned long address)
+ unsigned long address, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ spinlock_t *ptl;
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- spin_lock(&mm->page_table_lock);
pgd = pgd_offset(mm, address);
- if (likely(pgd_present(*pgd))) {
- pud = pud_offset(pgd, address);
- if (likely(pud_present(*pud))) {
- pmd = pmd_offset(pud, address);
- if (likely(pmd_present(*pmd))) {
- pte = pte_offset_map(pmd, address);
- if (likely(pte_present(*pte) &&
- page_to_pfn(page) == pte_pfn(*pte)))
- return pte;
- pte_unmap(pte);
- }
- }
+ if (!pgd_present(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return NULL;
+
+ pte = pte_offset_map(pmd, address);
+ /* Make a quick check before getting the lock */
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return NULL;
}
- spin_unlock(&mm->page_table_lock);
- return ERR_PTR(-ENOENT);
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
+ *ptlp = ptl;
+ return pte;
+ }
+ pte_unmap_unlock(pte, ptl);
+ return NULL;
}
/*
@@ -282,34 +290,38 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
* repeatedly from either page_referenced_anon or page_referenced_file.
*/
static int page_referenced_one(struct page *page,
- struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token)
+ struct vm_area_struct *vma, unsigned int *mapcount)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *pte;
+ spinlock_t *ptl;
int referenced = 0;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address);
- if (!IS_ERR(pte)) {
- if (ptep_clear_flush_young(vma, address, pte))
- referenced++;
+ pte = page_check_address(page, mm, address, &ptl);
+ if (!pte)
+ goto out;
- if (mm != current->mm && !ignore_token && has_swap_token(mm))
- referenced++;
+ if (ptep_clear_flush_young(vma, address, pte))
+ referenced++;
- (*mapcount)--;
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
- }
+ /* Pretend the page is referenced if the task has the
+ swap token and is in the middle of a page fault. */
+ if (mm != current->mm && has_swap_token(mm) &&
+ rwsem_is_locked(&mm->mmap_sem))
+ referenced++;
+
+ (*mapcount)--;
+ pte_unmap_unlock(pte, ptl);
out:
return referenced;
}
-static int page_referenced_anon(struct page *page, int ignore_token)
+static int page_referenced_anon(struct page *page)
{
unsigned int mapcount;
struct anon_vma *anon_vma;
@@ -322,8 +334,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
mapcount = page_mapcount(page);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- referenced += page_referenced_one(page, vma, &mapcount,
- ignore_token);
+ referenced += page_referenced_one(page, vma, &mapcount);
if (!mapcount)
break;
}
@@ -342,7 +353,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
*
* This function is only called from page_referenced for object-based pages.
*/
-static int page_referenced_file(struct page *page, int ignore_token)
+static int page_referenced_file(struct page *page)
{
unsigned int mapcount;
struct address_space *mapping = page->mapping;
@@ -380,8 +391,7 @@ static int page_referenced_file(struct page *page, int ignore_token)
referenced++;
break;
}
- referenced += page_referenced_one(page, vma, &mapcount,
- ignore_token);
+ referenced += page_referenced_one(page, vma, &mapcount);
if (!mapcount)
break;
}
@@ -398,13 +408,10 @@ static int page_referenced_file(struct page *page, int ignore_token)
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
*/
-int page_referenced(struct page *page, int is_locked, int ignore_token)
+int page_referenced(struct page *page, int is_locked)
{
int referenced = 0;
- if (!swap_token_default_timeout)
- ignore_token = 1;
-
if (page_test_and_clear_young(page))
referenced++;
@@ -413,15 +420,14 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
if (page_mapped(page) && page->mapping) {
if (PageAnon(page))
- referenced += page_referenced_anon(page, ignore_token);
+ referenced += page_referenced_anon(page);
else if (is_locked)
- referenced += page_referenced_file(page, ignore_token);
+ referenced += page_referenced_file(page);
else if (TestSetPageLocked(page))
referenced++;
else {
if (page->mapping)
- referenced += page_referenced_file(page,
- ignore_token);
+ referenced += page_referenced_file(page);
unlock_page(page);
}
}
@@ -434,15 +440,11 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- BUG_ON(PageReserved(page));
-
- inc_mm_counter(vma->vm_mm, anon_rss);
-
if (atomic_inc_and_test(&page->_mapcount)) {
struct anon_vma *anon_vma = vma->anon_vma;
@@ -461,13 +463,12 @@ void page_add_anon_rmap(struct page *page,
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
*/
void page_add_file_rmap(struct page *page)
{
BUG_ON(PageAnon(page));
- if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
- return;
+ BUG_ON(!pfn_valid(page_to_pfn(page)));
if (atomic_inc_and_test(&page->_mapcount))
inc_page_state(nr_mapped);
@@ -477,12 +478,10 @@ void page_add_file_rmap(struct page *page)
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
*
- * Caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte lock.
*/
void page_remove_rmap(struct page *page)
{
- BUG_ON(PageReserved(page));
-
if (atomic_add_negative(-1, &page->_mapcount)) {
BUG_ON(page_mapcount(page) < 0);
/*
@@ -510,24 +509,23 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
unsigned long address;
pte_t *pte;
pte_t pteval;
+ spinlock_t *ptl;
int ret = SWAP_AGAIN;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address);
- if (IS_ERR(pte))
+ pte = page_check_address(page, mm, address, &ptl);
+ if (!pte)
goto out;
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
* skipped over this mm) then we should reactivate it.
- *
- * Pages belonging to VM_RESERVED regions should not happen here.
*/
- if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
+ if ((vma->vm_flags & VM_LOCKED) ||
ptep_clear_flush_young(vma, address, pte)) {
ret = SWAP_FAIL;
goto out_unmap;
@@ -541,8 +539,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
if (pte_dirty(pteval))
set_page_dirty(page);
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
if (PageAnon(page)) {
- swp_entry_t entry = { .val = page->private };
+ swp_entry_t entry = { .val = page_private(page) };
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
@@ -551,21 +552,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
swap_duplicate(entry);
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
- list_add(&mm->mmlist, &init_mm.mmlist);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
dec_mm_counter(mm, anon_rss);
- }
+ } else
+ dec_mm_counter(mm, file_rss);
- dec_mm_counter(mm, rss);
page_remove_rmap(page);
page_cache_release(page);
out_unmap:
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
out:
return ret;
}
@@ -599,18 +600,12 @@ static void try_to_unmap_cluster(unsigned long cursor,
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte, *original_pte;
+ pte_t *pte;
pte_t pteval;
+ spinlock_t *ptl;
struct page *page;
unsigned long address;
unsigned long end;
- unsigned long pfn;
-
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- spin_lock(&mm->page_table_lock);
address = (vma->vm_start + cursor) & CLUSTER_MASK;
end = address + CLUSTER_SIZE;
@@ -621,36 +616,32 @@ static void try_to_unmap_cluster(unsigned long cursor,
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
- goto out_unlock;
+ return;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
- goto out_unlock;
+ return;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
- goto out_unlock;
-
- for (original_pte = pte = pte_offset_map(pmd, address);
- address < end; pte++, address += PAGE_SIZE) {
+ return;
- if (!pte_present(*pte))
- continue;
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- pfn = pte_pfn(*pte);
- if (!pfn_valid(pfn))
- continue;
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
- page = pfn_to_page(pfn);
- BUG_ON(PageAnon(page));
- if (PageReserved(page))
+ for (; address < end; pte++, address += PAGE_SIZE) {
+ if (!pte_present(*pte))
continue;
+ page = vm_normal_page(vma, address, *pte);
+ BUG_ON(!page || PageAnon(page));
if (ptep_clear_flush_young(vma, address, pte))
continue;
/* Nuke the page table entry. */
- flush_cache_page(vma, address, pfn);
+ flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
@@ -663,13 +654,10 @@ static void try_to_unmap_cluster(unsigned long cursor,
page_remove_rmap(page);
page_cache_release(page);
- dec_mm_counter(mm, rss);
+ dec_mm_counter(mm, file_rss);
(*mapcount)--;
}
-
- pte_unmap(original_pte);
-out_unlock:
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte - 1, ptl);
}
static int try_to_unmap_anon(struct page *page)
@@ -724,7 +712,7 @@ static int try_to_unmap_file(struct page *page)
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
- if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+ if (vma->vm_flags & VM_LOCKED)
continue;
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
@@ -758,7 +746,7 @@ static int try_to_unmap_file(struct page *page)
do {
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
- if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+ if (vma->vm_flags & VM_LOCKED)
continue;
cursor = (unsigned long) vma->vm_private_data;
while ( cursor < max_nl_cursor &&
@@ -780,11 +768,8 @@ static int try_to_unmap_file(struct page *page)
* in locked vmas). Reset cursor on all unreserved nonlinear
* vmas, now forgetting on which ones it had fallen behind.
*/
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.vm_set.list) {
- if (!(vma->vm_flags & VM_RESERVED))
- vma->vm_private_data = NULL;
- }
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+ vma->vm_private_data = NULL;
out:
spin_unlock(&mapping->i_mmap_lock);
return ret;
@@ -806,7 +791,6 @@ int try_to_unmap(struct page *page)
{
int ret;
- BUG_ON(PageReserved(page));
BUG_ON(!PageLocked(page));
if (PageAnon(page))
diff --git a/mm/shmem.c b/mm/shmem.c
index 1f7aeb210c7..dc25565a61e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -71,9 +71,6 @@
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
-/* Keep swapped page count in private field of indirect struct page */
-#define nr_swapped private
-
/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
enum sgp_type {
SGP_QUICK, /* don't try more than file page cache lookup */
@@ -85,7 +82,7 @@ enum sgp_type {
static int shmem_getpage(struct inode *inode, unsigned long idx,
struct page **pagep, enum sgp_type sgp, int *type);
-static inline struct page *shmem_dir_alloc(unsigned int gfp_mask)
+static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
{
/*
* The above definition of ENTRIES_PER_PAGE, and the use of
@@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
entry->val = value;
info->swapped += incdec;
- if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
- kmap_atomic_to_page(entry)->nr_swapped += incdec;
+ if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
+ struct page *page = kmap_atomic_to_page(entry);
+ set_page_private(page, page_private(page) + incdec);
+ }
}
/*
@@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
spin_unlock(&info->lock);
page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
- if (page) {
- page->nr_swapped = 0;
- }
+ if (page)
+ set_page_private(page, 0);
spin_lock(&info->lock);
if (!page) {
@@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode)
diroff = 0;
}
subdir = dir[diroff];
- if (subdir && subdir->nr_swapped) {
+ if (subdir && page_private(subdir)) {
size = limit - idx;
if (size > ENTRIES_PER_PAGE)
size = ENTRIES_PER_PAGE;
@@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode)
nr_swaps_freed += freed;
if (offset)
spin_lock(&info->lock);
- subdir->nr_swapped -= freed;
+ set_page_private(subdir, page_private(subdir) - freed);
if (offset)
spin_unlock(&info->lock);
- BUG_ON(subdir->nr_swapped > offset);
+ BUG_ON(page_private(subdir) > offset);
}
if (offset)
offset = 0;
@@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
dir = shmem_dir_map(subdir);
}
subdir = *dir;
- if (subdir && subdir->nr_swapped) {
+ if (subdir && page_private(subdir)) {
ptr = shmem_swp_map(subdir);
size = limit - idx;
if (size > ENTRIES_PER_PAGE)
@@ -898,7 +896,7 @@ struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
}
static struct page *
-shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info,
+shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
unsigned long idx)
{
struct vm_area_struct pvma;
@@ -921,8 +919,7 @@ shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
}
static inline struct page *
-shmem_alloc_page(unsigned int __nocast gfp,struct shmem_inode_info *info,
- unsigned long idx)
+shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
{
return alloc_page(gfp | __GFP_ZERO);
}
@@ -1202,7 +1199,7 @@ static int shmem_populate(struct vm_area_struct *vma,
page_cache_release(page);
return err;
}
- } else {
+ } else if (vma->vm_flags & VM_NONLINEAR) {
/* No page was found just because we can't read it in
* now (being here implies nonblock != 0), but the page
* may exist, so set the PTE to fault it in later. */
@@ -1507,8 +1504,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
*/
if (!offset)
mark_page_accessed(page);
- } else
+ } else {
page = ZERO_PAGE(0);
+ page_cache_get(page);
+ }
/*
* Ok, we have the page, and it's up-to-date, so
diff --git a/mm/slab.c b/mm/slab.c
index 437d3388054..e5ec26e0c46 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -308,12 +308,12 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define SIZE_L3 (1 + MAX_NUMNODES)
/*
- * This function may be completely optimized away if
+ * This function must be completely optimized away if
* a constant is passed to it. Mostly the same as
* what is in linux/slab.h except it returns an
* index.
*/
-static inline int index_of(const size_t size)
+static __always_inline int index_of(const size_t size)
{
if (__builtin_constant_p(size)) {
int i = 0;
@@ -329,7 +329,8 @@ static inline int index_of(const size_t size)
extern void __bad_size(void);
__bad_size();
}
- }
+ } else
+ BUG();
return 0;
}
@@ -367,7 +368,7 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
* manages a cache.
*/
-struct kmem_cache_s {
+struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
struct array_cache *array[NR_CPUS];
unsigned int batchcount;
@@ -385,7 +386,7 @@ struct kmem_cache_s {
unsigned int gfporder;
/* force GFP flags, e.g. GFP_DMA */
- unsigned int gfpflags;
+ gfp_t gfpflags;
size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
@@ -433,7 +434,7 @@ struct kmem_cache_s {
/* Optimization question: fewer reaps means less
* probability for unnessary cpucache drain/refill cycles.
*
- * OTHO the cpuarrays can contain lots of objects,
+ * OTOH the cpuarrays can contain lots of objects,
* which could lock up otherwise freeable slabs.
*/
#define REAPTIMEOUT_CPUC (2*HZ)
@@ -564,14 +565,29 @@ static void **dbg_userword(kmem_cache_t *cachep, void *objp)
#define BREAK_GFP_ORDER_LO 0
static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
-/* Macros for storing/retrieving the cachep and or slab from the
+/* Functions for storing/retrieving the cachep and or slab from the
* global 'mem_map'. These are used to find the slab an obj belongs to.
* With kfree(), these are used to find the cache which an obj belongs to.
*/
-#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x))
-#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next)
-#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x))
-#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev)
+static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
+{
+ page->lru.next = (struct list_head *)cache;
+}
+
+static inline struct kmem_cache *page_get_cache(struct page *page)
+{
+ return (struct kmem_cache *)page->lru.next;
+}
+
+static inline void page_set_slab(struct page *page, struct slab *slab)
+{
+ page->lru.prev = (struct list_head *)slab;
+}
+
+static inline struct slab *page_get_slab(struct page *page)
+{
+ return (struct slab *)page->lru.prev;
+}
/* These are the default caches for kmalloc. Custom caches can have other sizes. */
struct cache_sizes malloc_sizes[] = {
@@ -639,7 +655,7 @@ static enum {
static DEFINE_PER_CPU(struct work_struct, reap_work);
-static void free_block(kmem_cache_t* cachep, void** objpp, int len);
+static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node);
static void enable_cpucache (kmem_cache_t *cachep);
static void cache_reap (void *unused);
static int __node_shrink(kmem_cache_t *cachep, int node);
@@ -649,8 +665,7 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
return cachep->array[smp_processor_id()];
}
-static inline kmem_cache_t *__find_general_cachep(size_t size,
- unsigned int __nocast gfpflags)
+static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
{
struct cache_sizes *csizep = malloc_sizes;
@@ -674,8 +689,7 @@ static inline kmem_cache_t *__find_general_cachep(size_t size,
return csizep->cs_cachep;
}
-kmem_cache_t *kmem_find_general_cachep(size_t size,
- unsigned int __nocast gfpflags)
+kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
return __find_general_cachep(size, gfpflags);
}
@@ -804,7 +818,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache
if (ac->avail) {
spin_lock(&rl3->list_lock);
- free_block(cachep, ac->entry, ac->avail);
+ free_block(cachep, ac->entry, ac->avail, node);
ac->avail = 0;
spin_unlock(&rl3->list_lock);
}
@@ -925,7 +939,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
/* Free limit for this kmem_list3 */
l3->free_limit -= cachep->batchcount;
if (nc)
- free_block(cachep, nc->entry, nc->avail);
+ free_block(cachep, nc->entry, nc->avail, node);
if (!cpus_empty(mask)) {
spin_unlock(&l3->list_lock);
@@ -934,7 +948,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
if (l3->shared) {
free_block(cachep, l3->shared->entry,
- l3->shared->avail);
+ l3->shared->avail, node);
kfree(l3->shared);
l3->shared = NULL;
}
@@ -1184,18 +1198,14 @@ __initcall(cpucache_init);
* did not request dmaable memory, we might get it, but that
* would be relatively rare and ignorable.
*/
-static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
struct page *page;
void *addr;
int i;
flags |= cachep->gfpflags;
- if (likely(nodeid == -1)) {
- page = alloc_pages(flags, cachep->gfporder);
- } else {
- page = alloc_pages_node(nodeid, flags, cachep->gfporder);
- }
+ page = alloc_pages_node(nodeid, flags, cachep->gfporder);
if (!page)
return NULL;
addr = page_address(page);
@@ -1369,7 +1379,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
/* Print some data about the neighboring objects, if they
* exist:
*/
- struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
+ struct slab *slabp = page_get_slab(virt_to_page(objp));
int objnr;
objnr = (objp-slabp->s_mem)/cachep->objsize;
@@ -1503,6 +1513,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
{
size_t left_over, slab_size, ralign;
kmem_cache_t *cachep = NULL;
+ struct list_head *p;
/*
* Sanity checks... these are all serious usage bugs.
@@ -1517,6 +1528,35 @@ kmem_cache_create (const char *name, size_t size, size_t align,
BUG();
}
+ down(&cache_chain_sem);
+
+ list_for_each(p, &cache_chain) {
+ kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
+ mm_segment_t old_fs = get_fs();
+ char tmp;
+ int res;
+
+ /*
+ * This happens when the module gets unloaded and doesn't
+ * destroy its slab cache and no-one else reuses the vmalloc
+ * area of the module. Print a warning.
+ */
+ set_fs(KERNEL_DS);
+ res = __get_user(tmp, pc->name);
+ set_fs(old_fs);
+ if (res) {
+ printk("SLAB: cache with size %d has lost its name\n",
+ pc->objsize);
+ continue;
+ }
+
+ if (!strcmp(pc->name,name)) {
+ printk("kmem_cache_create: duplicate cache %s\n", name);
+ dump_stack();
+ goto oops;
+ }
+ }
+
#if DEBUG
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
@@ -1593,7 +1633,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
/* Get cache's description obj. */
cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
if (!cachep)
- goto opps;
+ goto oops;
memset(cachep, 0, sizeof(kmem_cache_t));
#if DEBUG
@@ -1687,7 +1727,7 @@ next:
printk("kmem_cache_create: couldn't create cache %s.\n", name);
kmem_cache_free(&cache_cache, cachep);
cachep = NULL;
- goto opps;
+ goto oops;
}
slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
+ sizeof(struct slab), align);
@@ -1782,43 +1822,14 @@ next:
cachep->limit = BOOT_CPUCACHE_ENTRIES;
}
- /* Need the semaphore to access the chain. */
- down(&cache_chain_sem);
- {
- struct list_head *p;
- mm_segment_t old_fs;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- list_for_each(p, &cache_chain) {
- kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
- char tmp;
- /* This happens when the module gets unloaded and doesn't
- destroy its slab cache and noone else reuses the vmalloc
- area of the module. Print a warning. */
- if (__get_user(tmp,pc->name)) {
- printk("SLAB: cache with size %d has lost its name\n",
- pc->objsize);
- continue;
- }
- if (!strcmp(pc->name,name)) {
- printk("kmem_cache_create: duplicate cache %s\n",name);
- up(&cache_chain_sem);
- unlock_cpu_hotplug();
- BUG();
- }
- }
- set_fs(old_fs);
- }
-
/* cache setup completed, link it into the list */
list_add(&cachep->next, &cache_chain);
- up(&cache_chain_sem);
unlock_cpu_hotplug();
-opps:
+oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
+ up(&cache_chain_sem);
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -1882,12 +1893,13 @@ static void do_drain(void *arg)
{
kmem_cache_t *cachep = (kmem_cache_t*)arg;
struct array_cache *ac;
+ int node = numa_node_id();
check_irq_off();
ac = ac_data(cachep);
- spin_lock(&cachep->nodelists[numa_node_id()]->list_lock);
- free_block(cachep, ac->entry, ac->avail);
- spin_unlock(&cachep->nodelists[numa_node_id()]->list_lock);
+ spin_lock(&cachep->nodelists[node]->list_lock);
+ free_block(cachep, ac->entry, ac->avail, node);
+ spin_unlock(&cachep->nodelists[node]->list_lock);
ac->avail = 0;
}
@@ -2046,7 +2058,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
/* Get the memory for a slab management obj. */
static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
- int colour_off, unsigned int __nocast local_flags)
+ int colour_off, gfp_t local_flags)
{
struct slab *slabp;
@@ -2117,7 +2129,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
slabp->free = 0;
}
-static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags)
+static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags)
{
if (flags & SLAB_DMA) {
if (!(cachep->gfpflags & GFP_DMA))
@@ -2137,8 +2149,8 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
i = 1 << cachep->gfporder;
page = virt_to_page(objp);
do {
- SET_PAGE_CACHE(page, cachep);
- SET_PAGE_SLAB(page, slabp);
+ page_set_cache(page, cachep);
+ page_set_slab(page, slabp);
page++;
} while (--i);
}
@@ -2147,12 +2159,12 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
struct slab *slabp;
void *objp;
size_t offset;
- unsigned int local_flags;
+ gfp_t local_flags;
unsigned long ctor_flags;
struct kmem_list3 *l3;
@@ -2268,14 +2280,14 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
kfree_debugcheck(objp);
page = virt_to_page(objp);
- if (GET_PAGE_CACHE(page) != cachep) {
+ if (page_get_cache(page) != cachep) {
printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n",
- GET_PAGE_CACHE(page),cachep);
+ page_get_cache(page),cachep);
printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
- printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name);
+ printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name);
WARN_ON(1);
}
- slabp = GET_PAGE_SLAB(page);
+ slabp = page_get_slab(page);
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
@@ -2354,7 +2366,7 @@ bad:
#define check_slabp(x,y) do { } while(0)
#endif
-static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags)
+static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
{
int batchcount;
struct kmem_list3 *l3;
@@ -2419,6 +2431,7 @@ retry:
next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+ WARN_ON(numa_node_id() != slabp->nodeid);
#endif
slabp->free = next;
}
@@ -2454,7 +2467,7 @@ alloc_done:
}
static inline void
-cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
+cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
{
might_sleep_if(flags & __GFP_WAIT);
#if DEBUG
@@ -2465,7 +2478,7 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
#if DEBUG
static void *
cache_alloc_debugcheck_after(kmem_cache_t *cachep,
- unsigned int __nocast flags, void *objp, void *caller)
+ gfp_t flags, void *objp, void *caller)
{
if (!objp)
return objp;
@@ -2508,16 +2521,12 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
-
-static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
+static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
{
- unsigned long save_flags;
void* objp;
struct array_cache *ac;
- cache_alloc_debugcheck_before(cachep, flags);
-
- local_irq_save(save_flags);
+ check_irq_off();
ac = ac_data(cachep);
if (likely(ac->avail)) {
STATS_INC_ALLOCHIT(cachep);
@@ -2527,6 +2536,18 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast fl
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags);
}
+ return objp;
+}
+
+static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
+{
+ unsigned long save_flags;
+ void* objp;
+
+ cache_alloc_debugcheck_before(cachep, flags);
+
+ local_irq_save(save_flags);
+ objp = ____cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp,
__builtin_return_address(0));
@@ -2538,7 +2559,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast fl
/*
* A interface to enable slab creation on nodeid
*/
-static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
+static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
struct list_head *entry;
struct slab *slabp;
@@ -2608,7 +2629,7 @@ done:
/*
* Caller needs to acquire correct kmem_list's list_lock
*/
-static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
+static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node)
{
int i;
struct kmem_list3 *l3;
@@ -2617,18 +2638,18 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
void *objp = objpp[i];
struct slab *slabp;
unsigned int objnr;
- int nodeid = 0;
- slabp = GET_PAGE_SLAB(virt_to_page(objp));
- nodeid = slabp->nodeid;
- l3 = cachep->nodelists[nodeid];
+ slabp = page_get_slab(virt_to_page(objp));
+ l3 = cachep->nodelists[node];
list_del(&slabp->list);
objnr = (objp - slabp->s_mem) / cachep->objsize;
- check_spinlock_acquired_node(cachep, nodeid);
+ check_spinlock_acquired_node(cachep, node);
check_slabp(cachep, slabp);
-
#if DEBUG
+ /* Verify that the slab belongs to the intended node */
+ WARN_ON(slabp->nodeid != node);
+
if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
printk(KERN_ERR "slab: double free detected in cache "
"'%s', objp %p\n", cachep->name, objp);
@@ -2664,13 +2685,14 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
{
int batchcount;
struct kmem_list3 *l3;
+ int node = numa_node_id();
batchcount = ac->batchcount;
#if DEBUG
BUG_ON(!batchcount || batchcount > ac->avail);
#endif
check_irq_off();
- l3 = cachep->nodelists[numa_node_id()];
+ l3 = cachep->nodelists[node];
spin_lock(&l3->list_lock);
if (l3->shared) {
struct array_cache *shared_array = l3->shared;
@@ -2686,7 +2708,7 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
}
}
- free_block(cachep, ac->entry, batchcount);
+ free_block(cachep, ac->entry, batchcount, node);
free_done:
#if STATS
{
@@ -2733,7 +2755,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
#ifdef CONFIG_NUMA
{
struct slab *slabp;
- slabp = GET_PAGE_SLAB(virt_to_page(objp));
+ slabp = page_get_slab(virt_to_page(objp));
if (unlikely(slabp->nodeid != numa_node_id())) {
struct array_cache *alien = NULL;
int nodeid = slabp->nodeid;
@@ -2751,7 +2773,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
} else {
spin_lock(&(cachep->nodelists[nodeid])->
list_lock);
- free_block(cachep, &objp, 1);
+ free_block(cachep, &objp, 1, nodeid);
spin_unlock(&(cachep->nodelists[nodeid])->
list_lock);
}
@@ -2778,7 +2800,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
* Allocate an object from this cache. The flags are only relevant
* if the cache has no available objects.
*/
-void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
+void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags)
{
return __cache_alloc(cachep, flags);
}
@@ -2819,7 +2841,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
page = virt_to_page(ptr);
if (unlikely(!PageSlab(page)))
goto out;
- if (unlikely(GET_PAGE_CACHE(page) != cachep))
+ if (unlikely(page_get_cache(page) != cachep))
goto out;
return 1;
out:
@@ -2839,12 +2861,12 @@ out:
* New and improved: it will now make sure that the object gets
* put on the correct node list so that there is no false sharing.
*/
-void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
unsigned long save_flags;
void *ptr;
- if (nodeid == numa_node_id() || nodeid == -1)
+ if (nodeid == -1)
return __cache_alloc(cachep, flags);
if (unlikely(!cachep->nodelists[nodeid])) {
@@ -2855,7 +2877,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, i
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
- ptr = __cache_alloc_node(cachep, flags, nodeid);
+ if (nodeid == numa_node_id())
+ ptr = ____cache_alloc(cachep, flags);
+ else
+ ptr = __cache_alloc_node(cachep, flags, nodeid);
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));
@@ -2863,7 +2888,7 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, i
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-void *kmalloc_node(size_t size, unsigned int __nocast flags, int node)
+void *kmalloc_node(size_t size, gfp_t flags, int node)
{
kmem_cache_t *cachep;
@@ -2896,7 +2921,7 @@ EXPORT_SYMBOL(kmalloc_node);
* platforms. For example, on i386, it means that the memory must come
* from the first 16MB.
*/
-void *__kmalloc(size_t size, unsigned int __nocast flags)
+void *__kmalloc(size_t size, gfp_t flags)
{
kmem_cache_t *cachep;
@@ -2985,7 +3010,7 @@ EXPORT_SYMBOL(kmem_cache_free);
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
*/
-void *kzalloc(size_t size, unsigned int __nocast flags)
+void *kzalloc(size_t size, gfp_t flags)
{
void *ret = kmalloc(size, flags);
if (ret)
@@ -3012,7 +3037,7 @@ void kfree(const void *objp)
return;
local_irq_save(flags);
kfree_debugcheck(objp);
- c = GET_PAGE_CACHE(virt_to_page(objp));
+ c = page_get_cache(virt_to_page(objp));
__cache_free(c, (void*)objp);
local_irq_restore(flags);
}
@@ -3079,7 +3104,7 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
if ((nc = cachep->nodelists[node]->shared))
free_block(cachep, nc->entry,
- nc->avail);
+ nc->avail, node);
l3->shared = new;
if (!cachep->nodelists[node]->alien) {
@@ -3160,7 +3185,7 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
if (!ccold)
continue;
spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
- free_block(cachep, ccold->entry, ccold->avail);
+ free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
kfree(ccold);
}
@@ -3240,7 +3265,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
if (tofree > ac->avail) {
tofree = (ac->avail+1)/2;
}
- free_block(cachep, ac->entry, tofree);
+ free_block(cachep, ac->entry, tofree, node);
ac->avail -= tofree;
memmove(ac->entry, &(ac->entry[tofree]),
sizeof(void*)*ac->avail);
@@ -3249,6 +3274,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
/**
* cache_reap - Reclaim memory from caches.
+ * @unused: unused parameter
*
* Called from workqueue/eventd every few seconds.
* Purpose:
@@ -3265,7 +3291,7 @@ static void cache_reap(void *unused)
if (down_trylock(&cache_chain_sem)) {
/* Give up. Setup the next iteration. */
- schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
+ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
return;
}
@@ -3334,7 +3360,7 @@ next:
up(&cache_chain_sem);
drain_remote_pages();
/* Setup the next iteration */
- schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
+ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
}
#ifdef CONFIG_PROC_FS
@@ -3581,7 +3607,7 @@ unsigned int ksize(const void *objp)
if (unlikely(objp == NULL))
return 0;
- return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp)));
+ return obj_reallen(page_get_cache(virt_to_page(objp)));
}
@@ -3591,7 +3617,7 @@ unsigned int ksize(const void *objp)
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*/
-char *kstrdup(const char *s, unsigned int __nocast gfp)
+char *kstrdup(const char *s, gfp_t gfp)
{
size_t len;
char *buf;
diff --git a/mm/sparse.c b/mm/sparse.c
index 347249a4917..72079b538e2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,8 +5,10 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
+#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
#include <asm/dma.h>
/*
@@ -72,6 +74,31 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
}
#endif
+/*
+ * Although written for the SPARSEMEM_EXTREME case, this happens
+ * to also work for the flat array case becase
+ * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
+ */
+int __section_nr(struct mem_section* ms)
+{
+ unsigned long root_nr;
+ struct mem_section* root;
+
+ for (root_nr = 0;
+ root_nr < NR_MEM_SECTIONS;
+ root_nr += SECTIONS_PER_ROOT) {
+ root = __nr_to_section(root_nr);
+
+ if (!root)
+ continue;
+
+ if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
+ break;
+ }
+
+ return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
+}
+
/* Record a memory area against a node. */
void memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -162,6 +189,45 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
return NULL;
}
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+{
+ struct page *page, *ret;
+ unsigned long memmap_size = sizeof(struct page) * nr_pages;
+
+ page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+ if (page)
+ goto got_map_page;
+
+ ret = vmalloc(memmap_size);
+ if (ret)
+ goto got_map_ptr;
+
+ return NULL;
+got_map_page:
+ ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+got_map_ptr:
+ memset(ret, 0, memmap_size);
+
+ return ret;
+}
+
+static int vaddr_in_vmalloc_area(void *addr)
+{
+ if (addr >= (void *)VMALLOC_START &&
+ addr < (void *)VMALLOC_END)
+ return 1;
+ return 0;
+}
+
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+{
+ if (vaddr_in_vmalloc_area(memmap))
+ vfree(memmap);
+ else
+ free_pages((unsigned long)memmap,
+ get_order(sizeof(struct page) * nr_pages));
+}
+
/*
* Allocate the accumulated non-linear sections, allocate a mem_map
* for each and record the physical to section mapping.
@@ -187,14 +253,37 @@ void sparse_init(void)
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
+int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+ int nr_pages)
{
- struct mem_section *ms = __pfn_to_section(start_pfn);
+ unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ struct mem_section *ms;
+ struct page *memmap;
+ unsigned long flags;
+ int ret;
- if (ms->section_mem_map & SECTION_MARKED_PRESENT)
- return -EEXIST;
+ /*
+ * no locking for this, because it does its own
+ * plus, it does a kmalloc
+ */
+ sparse_index_init(section_nr, pgdat->node_id);
+ memmap = __kmalloc_section_memmap(nr_pages);
+
+ pgdat_resize_lock(pgdat, &flags);
+ ms = __pfn_to_section(start_pfn);
+ if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
+ ret = -EEXIST;
+ goto out;
+ }
ms->section_mem_map |= SECTION_MARKED_PRESENT;
- return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
+ ret = sparse_init_one_section(ms, section_nr, memmap);
+
+ if (ret <= 0)
+ __kfree_section_memmap(memmap, nr_pages);
+out:
+ pgdat_resize_unlock(pgdat, &flags);
+ return ret;
}
diff --git a/mm/swap.c b/mm/swap.c
index 7771d2803f6..73d351439ef 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,12 +34,10 @@
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
-#ifdef CONFIG_HUGETLB_PAGE
-
void put_page(struct page *page)
{
if (unlikely(PageCompound(page))) {
- page = (struct page *)page->private;
+ page = (struct page *)page_private(page);
if (put_page_testzero(page)) {
void (*dtor)(struct page *page);
@@ -48,11 +46,10 @@ void put_page(struct page *page)
}
return;
}
- if (!PageReserved(page) && put_page_testzero(page))
+ if (put_page_testzero(page))
__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);
-#endif
/*
* Writeback is about to end against a page which has been marked for immediate
@@ -215,7 +212,7 @@ void release_pages(struct page **pages, int nr, int cold)
struct page *page = pages[i];
struct zone *pagezone;
- if (PageReserved(page) || !put_page_testzero(page))
+ if (!put_page_testzero(page))
continue;
pagezone = page_zone(page);
@@ -259,6 +256,8 @@ void __pagevec_release(struct pagevec *pvec)
pagevec_reinit(pvec);
}
+EXPORT_SYMBOL(__pagevec_release);
+
/*
* pagevec_release() for pages which are known to not be on the LRU
*
@@ -270,7 +269,6 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
struct pagevec pages_to_free;
pagevec_init(&pages_to_free, pvec->cold);
- pages_to_free.cold = pvec->cold;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
@@ -388,6 +386,7 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
return pagevec_count(pvec);
}
+EXPORT_SYMBOL(pagevec_lookup_tag);
#ifdef CONFIG_SMP
/*
@@ -411,7 +410,6 @@ void vm_acct_memory(long pages)
}
preempt_enable();
}
-EXPORT_SYMBOL(vm_acct_memory);
#ifdef CONFIG_HOTPLUG_CPU
static void lru_drain_cache(unsigned int cpu)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index adbc2b426c2..0df9a57b1de 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -40,7 +40,6 @@ struct address_space swapper_space = {
.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
.backing_dev_info = &swap_backing_dev_info,
};
-EXPORT_SYMBOL(swapper_space);
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -68,7 +67,7 @@ void show_swap_cache_info(void)
* but sets SwapCache flag and private instead of mapping and index.
*/
static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
- unsigned int __nocast gfp_mask)
+ gfp_t gfp_mask)
{
int error;
@@ -83,7 +82,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
page_cache_get(page);
SetPageLocked(page);
SetPageSwapCache(page);
- page->private = entry.val;
+ set_page_private(page, entry.val);
total_swapcache_pages++;
pagecache_acct(1);
}
@@ -126,8 +125,8 @@ void __delete_from_swap_cache(struct page *page)
BUG_ON(PageWriteback(page));
BUG_ON(PagePrivate(page));
- radix_tree_delete(&swapper_space.page_tree, page->private);
- page->private = 0;
+ radix_tree_delete(&swapper_space.page_tree, page_private(page));
+ set_page_private(page, 0);
ClearPageSwapCache(page);
total_swapcache_pages--;
pagecache_acct(-1);
@@ -197,7 +196,7 @@ void delete_from_swap_cache(struct page *page)
{
swp_entry_t entry;
- entry.val = page->private;
+ entry.val = page_private(page);
write_lock_irq(&swapper_space.tree_lock);
__delete_from_swap_cache(page);
@@ -259,8 +258,7 @@ static inline void free_swap_cache(struct page *page)
/*
* Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page. Can not do a lock_page,
- * as we are holding the page_table_lock spinlock.
+ * this page if it is the last user of the page.
*/
void free_page_and_swap_cache(struct page *page)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0184f510aac..edafeace301 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -36,8 +36,6 @@ unsigned int nr_swapfiles;
long total_swap_pages;
static int swap_overflow;
-EXPORT_SYMBOL(total_swap_pages);
-
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
@@ -61,7 +59,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
swp_entry_t entry;
down_read(&swap_unplug_sem);
- entry.val = page->private;
+ entry.val = page_private(page);
if (PageSwapCache(page)) {
struct block_device *bdev = swap_info[swp_type(entry)].bdev;
struct backing_dev_info *bdi;
@@ -69,8 +67,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
/*
* If the page is removed from swapcache from under us (with a
* racy try_to_unuse/swapoff) we need an additional reference
- * count to avoid reading garbage from page->private above. If
- * the WARN_ON triggers during a swapoff it maybe the race
+ * count to avoid reading garbage from page_private(page) above.
+ * If the WARN_ON triggers during a swapoff it maybe the race
* condition and it's harmless. However if it triggers without
* swapoff it signals a problem.
*/
@@ -294,7 +292,7 @@ static inline int page_swapcount(struct page *page)
struct swap_info_struct *p;
swp_entry_t entry;
- entry.val = page->private;
+ entry.val = page_private(page);
p = swap_info_get(entry);
if (p) {
/* Subtract the 1 for the swap cache itself */
@@ -339,7 +337,7 @@ int remove_exclusive_swap_page(struct page *page)
if (page_count(page) != 2) /* 2: us + cache */
return 0;
- entry.val = page->private;
+ entry.val = page_private(page);
p = swap_info_get(entry);
if (!p)
return 0;
@@ -398,17 +396,14 @@ void free_swap_and_cache(swp_entry_t entry)
}
/*
- * Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited). We don't know just how many PTEs will
- * share this swap entry, so be cautious and let do_wp_page work out
- * what to do if a write is requested later.
- *
- * vma->vm_mm->page_table_lock is held.
+ * No need to decide whether this PTE shares the swap entry with others,
+ * just let do_wp_page work it out if a write is requested later - to
+ * force COW, vm_page_prot omits write permission from any private vma.
*/
static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, swp_entry_t entry, struct page *page)
{
- inc_mm_counter(vma->vm_mm, rss);
+ inc_mm_counter(vma->vm_mm, anon_rss);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -425,23 +420,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
swp_entry_t entry, struct page *page)
{
- pte_t *pte;
pte_t swp_pte = swp_entry_to_pte(entry);
+ pte_t *pte;
+ spinlock_t *ptl;
+ int found = 0;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
/*
* swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte.
*/
if (unlikely(pte_same(*pte, swp_pte))) {
- unuse_pte(vma, pte, addr, entry, page);
- pte_unmap(pte);
- return 1;
+ unuse_pte(vma, pte++, addr, entry, page);
+ found = 1;
+ break;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
- return 0;
+ pte_unmap_unlock(pte - 1, ptl);
+ return found;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -523,12 +520,10 @@ static int unuse_mm(struct mm_struct *mm,
down_read(&mm->mmap_sem);
lock_page(page);
}
- spin_lock(&mm->page_table_lock);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma && unuse_vma(vma, entry, page))
break;
}
- spin_unlock(&mm->page_table_lock);
up_read(&mm->mmap_sem);
/*
* Currently unuse_mm cannot fail, but leave error handling
@@ -1045,7 +1040,7 @@ int page_queue_congested(struct page *page)
BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
if (PageSwapCache(page)) {
- swp_entry_t entry = { .val = page->private };
+ swp_entry_t entry = { .val = page_private(page) };
struct swap_info_struct *sis;
sis = get_swap_info_struct(swp_type(entry));
@@ -1381,6 +1376,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
error = bd_claim(bdev, sys_swapon);
if (error < 0) {
bdev = NULL;
+ error = -EINVAL;
goto bad_swap;
}
p->old_block_size = block_size(bdev);
diff --git a/mm/thrash.c b/mm/thrash.c
index 11461f7ad83..f4c560b4a2b 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -19,7 +19,7 @@ static unsigned long swap_token_check;
struct mm_struct * swap_token_mm = &init_mm;
#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT 0
+#define SWAP_TOKEN_TIMEOUT (300 * HZ)
/*
* Currently disabled; Needs further code to work at HZ * 300.
*/
@@ -57,14 +57,17 @@ void grab_swap_token(void)
/* We have the token. Let others know we still need it. */
if (has_swap_token(current->mm)) {
current->mm->recent_pagein = 1;
+ if (unlikely(!swap_token_default_timeout))
+ disable_swap_token();
return;
}
if (time_after(jiffies, swap_token_check)) {
- /* Can't get swapout protection if we exceed our RSS limit. */
- // if (current->mm->rss > current->mm->rlimit_rss)
- // return;
+ if (!swap_token_default_timeout) {
+ swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
+ return;
+ }
/* ... or if we recently held the token. */
if (time_before(jiffies, current->mm->swap_token_time))
@@ -95,6 +98,7 @@ void __put_swap_token(struct mm_struct *mm)
{
spin_lock(&swap_token_lock);
if (likely(mm == swap_token_mm)) {
+ mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
swap_token_mm = &init_mm;
swap_token_check = jiffies;
}
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index c13a2161bca..b58abcf44ed 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -31,11 +31,14 @@ static struct vfsmount *shm_mnt;
static int __init init_tmpfs(void)
{
- register_filesystem(&tmpfs_fs_type);
+ BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
+
#ifdef CONFIG_TMPFS
devfs_mk_dir("shm");
#endif
shm_mnt = kern_mount(&tmpfs_fs_type);
+ BUG_ON(IS_ERR(shm_mnt));
+
return 0;
}
module_init(init_tmpfs)
diff --git a/mm/truncate.c b/mm/truncate.c
index 60c8764bfac..9173ab50060 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -13,18 +13,9 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
- block_invalidatepage */
+ do_invalidatepage */
-static int do_invalidatepage(struct page *page, unsigned long offset)
-{
- int (*invalidatepage)(struct page *, unsigned long);
- invalidatepage = page->mapping->a_ops->invalidatepage;
- if (invalidatepage == NULL)
- invalidatepage = block_invalidatepage;
- return (*invalidatepage)(page, offset);
-}
-
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
@@ -291,8 +282,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* Zap the rest of the file in one hit.
*/
unmap_mapping_range(mapping,
- page_index << PAGE_CACHE_SHIFT,
- (end - page_index + 1)
+ (loff_t)page_index<<PAGE_CACHE_SHIFT,
+ (loff_t)(end - page_index + 1)
<< PAGE_CACHE_SHIFT,
0);
did_range_unmap = 1;
@@ -301,7 +292,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* Just zap this page
*/
unmap_mapping_range(mapping,
- page_index << PAGE_CACHE_SHIFT,
+ (loff_t)page_index<<PAGE_CACHE_SHIFT,
PAGE_CACHE_SIZE, 0);
}
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 13c3d82968a..729eb3eec75 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5,6 +5,7 @@
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
+ * Numa awareness, Christoph Lameter, SGI, June 2005
*/
#include <linux/mm.h>
@@ -88,7 +89,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
{
pte_t *pte;
- pte = pte_alloc_kernel(&init_mm, pmd, addr);
+ pte = pte_alloc_kernel(pmd, addr);
if (!pte)
return -ENOMEM;
do {
@@ -146,20 +147,18 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
- spin_lock(&init_mm.page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages);
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&init_mm.page_table_lock);
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end)
+struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
+ unsigned long start, unsigned long end, int node)
{
struct vm_struct **p, *tmp, *area;
unsigned long align = 1;
@@ -178,7 +177,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
addr = ALIGN(start, align);
size = PAGE_ALIGN(size);
- area = kmalloc(sizeof(*area), GFP_KERNEL);
+ area = kmalloc_node(sizeof(*area), GFP_KERNEL, node);
if (unlikely(!area))
return NULL;
@@ -231,6 +230,12 @@ out:
return NULL;
}
+struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+ unsigned long start, unsigned long end)
+{
+ return __get_vm_area_node(size, flags, start, end, -1);
+}
+
/**
* get_vm_area - reserve a contingous kernel virtual area
*
@@ -246,6 +251,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
}
+struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node)
+{
+ return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
+}
+
/* Caller must hold vmlist_lock */
struct vm_struct *__remove_vm_area(void *addr)
{
@@ -342,7 +352,6 @@ void vfree(void *addr)
BUG_ON(in_interrupt());
__vunmap(addr, 1);
}
-
EXPORT_SYMBOL(vfree);
/**
@@ -360,7 +369,6 @@ void vunmap(void *addr)
BUG_ON(in_interrupt());
__vunmap(addr, 0);
}
-
EXPORT_SYMBOL(vunmap);
/**
@@ -392,10 +400,10 @@ void *vmap(struct page **pages, unsigned int count,
return area->addr;
}
-
EXPORT_SYMBOL(vmap);
-void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgprot_t prot)
+void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+ pgprot_t prot, int node)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
@@ -406,9 +414,9 @@ void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgp
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE)
- pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL);
+ pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
else
- pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
+ pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
area->pages = pages;
if (!area->pages) {
remove_vm_area(area->addr);
@@ -418,7 +426,10 @@ void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgp
memset(area->pages, 0, array_size);
for (i = 0; i < area->nr_pages; i++) {
- area->pages[i] = alloc_page(gfp_mask);
+ if (node < 0)
+ area->pages[i] = alloc_page(gfp_mask);
+ else
+ area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
if (unlikely(!area->pages[i])) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
@@ -435,18 +446,25 @@ fail:
return NULL;
}
+void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
+{
+ return __vmalloc_area_node(area, gfp_mask, prot, -1);
+}
+
/**
- * __vmalloc - allocate virtually contiguous memory
+ * __vmalloc_node - allocate virtually contiguous memory
*
* @size: allocation size
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or -1
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
-void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask, pgprot_t prot)
+void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+ int node)
{
struct vm_struct *area;
@@ -454,13 +472,18 @@ void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask, pgprot_t pro
if (!size || (size >> PAGE_SHIFT) > num_physpages)
return NULL;
- area = get_vm_area(size, VM_ALLOC);
+ area = get_vm_area_node(size, VM_ALLOC, node);
if (!area)
return NULL;
- return __vmalloc_area(area, gfp_mask, prot);
+ return __vmalloc_area_node(area, gfp_mask, prot, node);
}
+EXPORT_SYMBOL(__vmalloc_node);
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+{
+ return __vmalloc_node(size, gfp_mask, prot, -1);
+}
EXPORT_SYMBOL(__vmalloc);
/**
@@ -478,9 +501,26 @@ void *vmalloc(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}
-
EXPORT_SYMBOL(vmalloc);
+/**
+ * vmalloc_node - allocate memory on a specific node
+ *
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight cotrol over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+ return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
+}
+EXPORT_SYMBOL(vmalloc_node);
+
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
@@ -515,7 +555,6 @@ void *vmalloc_32(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
}
-
EXPORT_SYMBOL(vmalloc_32);
long vread(char *buf, char *addr, unsigned long count)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0ea71e887bb..b0cd81c32de 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -70,7 +70,7 @@ struct scan_control {
unsigned int priority;
/* This context's GFP mask */
- unsigned int gfp_mask;
+ gfp_t gfp_mask;
int may_writepage;
@@ -186,7 +186,7 @@ EXPORT_SYMBOL(remove_shrinker);
*
* Returns the number of slab objects which we shrunk.
*/
-static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
+static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
unsigned long lru_pages)
{
struct shrinker *shrinker;
@@ -201,13 +201,25 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
+ unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
delta = (4 * scanned) / shrinker->seeks;
- delta *= (*shrinker->shrinker)(0, gfp_mask);
+ delta *= max_pass;
do_div(delta, lru_pages + 1);
shrinker->nr += delta;
- if (shrinker->nr < 0)
- shrinker->nr = LONG_MAX; /* It wrapped! */
+ if (shrinker->nr < 0) {
+ printk(KERN_ERR "%s: nr=%ld\n",
+ __FUNCTION__, shrinker->nr);
+ shrinker->nr = max_pass;
+ }
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+ if (shrinker->nr > max_pass * 2)
+ shrinker->nr = max_pass * 2;
total_scan = shrinker->nr;
shrinker->nr = 0;
@@ -407,7 +419,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
if (PageWriteback(page))
goto keep_locked;
- referenced = page_referenced(page, 1, sc->priority <= 0);
+ referenced = page_referenced(page, 1);
/* In active use or really unfreeable? Activate it. */
if (referenced && page_mapping_inuse(page))
goto activate_locked;
@@ -417,7 +429,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
+ if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!sc->may_swap)
+ goto keep_locked;
if (!add_to_swap(page))
goto activate_locked;
}
@@ -511,14 +525,15 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
* PageDirty _after_ making sure that the page is freeable and
* not in use by anybody. (pagecache + us == 2)
*/
- if (page_count(page) != 2 || PageDirty(page)) {
- write_unlock_irq(&mapping->tree_lock);
- goto keep_locked;
- }
+ if (unlikely(page_count(page) != 2))
+ goto cannot_free;
+ smp_rmb();
+ if (unlikely(PageDirty(page)))
+ goto cannot_free;
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page->private };
+ swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
write_unlock_irq(&mapping->tree_lock);
swap_free(swap);
@@ -538,6 +553,10 @@ free_it:
__pagevec_release_nonlru(&freed_pvec);
continue;
+cannot_free:
+ write_unlock_irq(&mapping->tree_lock);
+ goto keep_locked;
+
activate_locked:
SetPageActive(page);
pgactivate++;
@@ -749,7 +768,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
if (page_mapped(page)) {
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
- page_referenced(page, 0, sc->priority <= 0)) {
+ page_referenced(page, 0)) {
list_add(&page->lru, &l_active);
continue;
}
@@ -921,7 +940,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
+int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
int priority;
int ret = 0;
@@ -953,6 +972,8 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
sc.nr_reclaimed = 0;
sc.priority = priority;
sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+ if (!priority)
+ disable_swap_token();
shrink_caches(zones, &sc);
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
if (reclaim_state) {
@@ -1049,6 +1070,10 @@ loop_again:
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
+ /* The swap token gets in the way of swapout... */
+ if (!priority)
+ disable_swap_token();
+
all_zones_ok = 1;
if (nr_pages == 0) {
@@ -1067,7 +1092,7 @@ loop_again:
continue;
if (!zone_watermark_ok(zone, order,
- zone->pages_high, 0, 0, 0)) {
+ zone->pages_high, 0, 0)) {
end_zone = i;
goto scan;
}
@@ -1104,7 +1129,7 @@ scan:
if (nr_pages == 0) { /* Not software suspend */
if (!zone_watermark_ok(zone, order,
- zone->pages_high, end_zone, 0, 0))
+ zone->pages_high, end_zone, 0))
all_zones_ok = 0;
}
zone->temp_priority = priority;
@@ -1252,7 +1277,7 @@ void wakeup_kswapd(struct zone *zone, int order)
return;
pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0))
+ if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
@@ -1333,7 +1358,7 @@ module_init(kswapd_init)
/*
* Try to free up some pages from this zone through reclaim.
*/
-int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
struct scan_control sc;
int nr_pages = 1 << order;
@@ -1353,6 +1378,7 @@ int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
sc.nr_reclaimed = 0;
/* scan at the highest priority */
sc.priority = 0;
+ disable_swap_token();
if (nr_pages > SWAP_CLUSTER_MAX)
sc.swap_cluster_max = nr_pages;