diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/hugetlb.c | 177 | ||||
-rw-r--r-- | mm/madvise.c | 103 | ||||
-rw-r--r-- | mm/memory.c | 57 | ||||
-rw-r--r-- | mm/mempolicy.c | 110 | ||||
-rw-r--r-- | mm/mmap.c | 57 | ||||
-rw-r--r-- | mm/msync.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 7 | ||||
-rw-r--r-- | mm/page_alloc.c | 423 | ||||
-rw-r--r-- | mm/rmap.c | 21 | ||||
-rw-r--r-- | mm/shmem.c | 143 | ||||
-rw-r--r-- | mm/slab.c | 1 | ||||
-rw-r--r-- | mm/swapfile.c | 55 | ||||
-rw-r--r-- | mm/vmscan.c | 103 |
14 files changed, 918 insertions, 343 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4eb5ae3fbe1..fbd1111ea11 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7,10 +7,14 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/hugetlb.h> #include <linux/sysctl.h> #include <linux/highmem.h> #include <linux/nodemask.h> +#include <linux/pagemap.h> +#include <asm/page.h> +#include <asm/pgtable.h> + +#include <linux/hugetlb.h> const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static unsigned long nr_huge_pages, free_huge_pages; @@ -249,6 +253,72 @@ struct vm_operations_struct hugetlb_vm_ops = { .nopage = hugetlb_nopage, }; +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) +{ + pte_t entry; + + if (vma->vm_flags & VM_WRITE) { + entry = + pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + } else { + entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + } + entry = pte_mkyoung(entry); + entry = pte_mkhuge(entry); + + return entry; +} + +int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pte_t *src_pte, *dst_pte, entry; + struct page *ptepage; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + + while (addr < end) { + dst_pte = huge_pte_alloc(dst, addr); + if (!dst_pte) + goto nomem; + src_pte = huge_pte_offset(src, addr); + BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */ + entry = *src_pte; + ptepage = pte_page(entry); + get_page(ptepage); + add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); + set_huge_pte_at(dst, addr, dst_pte, entry); + addr += HPAGE_SIZE; + } + return 0; + +nomem: + return -ENOMEM; +} + +void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t pte; + struct page *page; + + WARN_ON(!is_vm_hugetlb_page(vma)); + BUG_ON(start & ~HPAGE_MASK); + BUG_ON(end & ~HPAGE_MASK); + + for (address = start; address < end; address += HPAGE_SIZE) { + pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address)); + if (pte_none(pte)) + continue; + page = pte_page(pte); + put_page(page); + } + add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); + flush_tlb_range(vma, start, end); +} + void zap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long length) { @@ -258,3 +328,108 @@ void zap_hugepage_range(struct vm_area_struct *vma, unmap_hugepage_range(vma, start, start + length); spin_unlock(&mm->page_table_lock); } + +int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret = 0; + + WARN_ON(!is_vm_hugetlb_page(vma)); + BUG_ON(vma->vm_start & ~HPAGE_MASK); + BUG_ON(vma->vm_end & ~HPAGE_MASK); + + hugetlb_prefault_arch_hook(mm); + + spin_lock(&mm->page_table_lock); + for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + unsigned long idx; + pte_t *pte = huge_pte_alloc(mm, addr); + struct page *page; + + if (!pte) { + ret = -ENOMEM; + goto out; + } + if (! pte_none(*pte)) + hugetlb_clean_stale_pgtable(pte); + + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + page = find_get_page(mapping, idx); + if (!page) { + /* charge the fs quota first */ + if (hugetlb_get_quota(mapping)) { + ret = -ENOMEM; + goto out; + } + page = alloc_huge_page(); + if (!page) { + hugetlb_put_quota(mapping); + ret = -ENOMEM; + goto out; + } + ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); + if (! ret) { + unlock_page(page); + } else { + hugetlb_put_quota(mapping); + free_huge_page(page); + goto out; + } + } + add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); + } +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, int *length, int i) +{ + unsigned long vpfn, vaddr = *position; + int remainder = *length; + + BUG_ON(!is_vm_hugetlb_page(vma)); + + vpfn = vaddr/PAGE_SIZE; + while (vaddr < vma->vm_end && remainder) { + + if (pages) { + pte_t *pte; + struct page *page; + + /* Some archs (sparc64, sh*) have multiple + * pte_ts to each hugepage. We have to make + * sure we get the first, for the page + * indexing below to work. */ + pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); + + /* hugetlb should be locked, and hence, prefaulted */ + WARN_ON(!pte || pte_none(*pte)); + + page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; + + WARN_ON(!PageCompound(page)); + + get_page(page); + pages[i] = page; + } + + if (vmas) + vmas[i] = vma; + + vaddr += PAGE_SIZE; + ++vpfn; + --remainder; + ++i; + } + + *length = remainder; + *position = vaddr; + + return i; +} diff --git a/mm/madvise.c b/mm/madvise.c index 944b5e52d81..e3108054733 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -8,17 +8,47 @@ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/syscalls.h> +#include <linux/mempolicy.h> #include <linux/hugetlb.h> /* * We can potentially split a vm area into separate * areas, each area with its own behavior. */ -static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, - unsigned long end, int behavior) +static long madvise_behavior(struct vm_area_struct * vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) { struct mm_struct * mm = vma->vm_mm; int error = 0; + pgoff_t pgoff; + int new_flags = vma->vm_flags & ~VM_READHINTMASK; + + switch (behavior) { + case MADV_SEQUENTIAL: + new_flags |= VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags |= VM_RAND_READ; + break; + default: + break; + } + + if (new_flags == vma->vm_flags) { + *prev = vma; + goto success; + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); + if (*prev) { + vma = *prev; + goto success; + } + + *prev = vma; if (start != vma->vm_start) { error = split_vma(mm, vma, start, 1); @@ -36,21 +66,12 @@ static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, * vm_flags is protected by the mmap_sem held in write mode. */ VM_ClearReadHint(vma); - - switch (behavior) { - case MADV_SEQUENTIAL: - vma->vm_flags |= VM_SEQ_READ; - break; - case MADV_RANDOM: - vma->vm_flags |= VM_RAND_READ; - break; - default: - break; - } + vma->vm_flags = new_flags; out: if (error == -ENOMEM) error = -EAGAIN; +success: return error; } @@ -58,6 +79,7 @@ out: * Schedule all required I/O operations. Do not wait for completion. */ static long madvise_willneed(struct vm_area_struct * vma, + struct vm_area_struct ** prev, unsigned long start, unsigned long end) { struct file *file = vma->vm_file; @@ -65,6 +87,7 @@ static long madvise_willneed(struct vm_area_struct * vma, if (!file) return -EBADF; + *prev = vma; start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; @@ -95,8 +118,10 @@ static long madvise_willneed(struct vm_area_struct * vma, * dirty pages is already available as msync(MS_INVALIDATE). */ static long madvise_dontneed(struct vm_area_struct * vma, + struct vm_area_struct ** prev, unsigned long start, unsigned long end) { + *prev = vma; if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) return -EINVAL; @@ -111,8 +136,8 @@ static long madvise_dontneed(struct vm_area_struct * vma, return 0; } -static long madvise_vma(struct vm_area_struct * vma, unsigned long start, - unsigned long end, int behavior) +static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) { long error = -EBADF; @@ -120,15 +145,15 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start, case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: - error = madvise_behavior(vma, start, end, behavior); + error = madvise_behavior(vma, prev, start, end, behavior); break; case MADV_WILLNEED: - error = madvise_willneed(vma, start, end); + error = madvise_willneed(vma, prev, start, end); break; case MADV_DONTNEED: - error = madvise_dontneed(vma, start, end); + error = madvise_dontneed(vma, prev, start, end); break; default: @@ -175,8 +200,8 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start, */ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) { - unsigned long end; - struct vm_area_struct * vma; + unsigned long end, tmp; + struct vm_area_struct * vma, *prev; int unmapped_error = 0; int error = -EINVAL; size_t len; @@ -202,40 +227,42 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) /* * If the interval [start,end) covers some unmapped address * ranges, just ignore them, but return -ENOMEM at the end. + * - different from the way of handling in mlock etc. */ - vma = find_vma(current->mm, start); + vma = find_vma_prev(current->mm, start, &prev); + if (!vma && prev) + vma = prev->vm_next; for (;;) { /* Still start < end. */ error = -ENOMEM; if (!vma) goto out; - /* Here start < vma->vm_end. */ + /* Here start < (end|vma->vm_end). */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; start = vma->vm_start; + if (start >= end) + goto out; } - /* Here vma->vm_start <= start < vma->vm_end. */ - if (end <= vma->vm_end) { - if (start < end) { - error = madvise_vma(vma, start, end, - behavior); - if (error) - goto out; - } - error = unmapped_error; - goto out; - } + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; - /* Here vma->vm_start <= start < vma->vm_end < end. */ - error = madvise_vma(vma, start, vma->vm_end, behavior); + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = madvise_vma(vma, &prev, start, tmp, behavior); if (error) goto out; - start = vma->vm_end; - vma = vma->vm_next; + start = tmp; + if (start < prev->vm_end) + start = prev->vm_end; + error = unmapped_error; + if (start >= end) + goto out; + vma = prev->vm_next; } - out: up_write(¤t->mm->mmap_sem); return error; diff --git a/mm/memory.c b/mm/memory.c index d209f745db7..da91b7bf998 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -840,23 +840,8 @@ check_user_page_readable(struct mm_struct *mm, unsigned long address) { return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; } - EXPORT_SYMBOL(check_user_page_readable); -/* - * Given a physical address, is there a useful struct page pointing to - * it? This may become more complex in the future if we start dealing - * with IO-aperture pages for direct-IO. - */ - -static inline struct page *get_page_map(struct page *page) -{ - if (!pfn_valid(page_to_pfn(page))) - return NULL; - return page; -} - - static inline int untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, unsigned long address) @@ -887,7 +872,6 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, return 0; } - int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -951,21 +935,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } spin_lock(&mm->page_table_lock); do { - struct page *map; + struct page *page; int lookup_write = write; cond_resched_lock(&mm->page_table_lock); - while (!(map = follow_page(mm, start, lookup_write))) { + while (!(page = follow_page(mm, start, lookup_write))) { /* * Shortcut for anonymous pages. We don't want * to force the creation of pages tables for - * insanly big anonymously mapped areas that + * insanely big anonymously mapped areas that * nobody touched so far. This is important * for doing a core dump for these mappings. */ if (!lookup_write && untouched_anonymous_page(mm,vma,start)) { - map = ZERO_PAGE(start); + page = ZERO_PAGE(start); break; } spin_unlock(&mm->page_table_lock); @@ -994,30 +978,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, spin_lock(&mm->page_table_lock); } if (pages) { - pages[i] = get_page_map(map); - if (!pages[i]) { - spin_unlock(&mm->page_table_lock); - while (i--) - page_cache_release(pages[i]); - i = -EFAULT; - goto out; - } - flush_dcache_page(pages[i]); - if (!PageReserved(pages[i])) - page_cache_get(pages[i]); + pages[i] = page; + flush_dcache_page(page); + if (!PageReserved(page)) + page_cache_get(page); } if (vmas) vmas[i] = vma; i++; start += PAGE_SIZE; len--; - } while(len && start < vma->vm_end); + } while (len && start < vma->vm_end); spin_unlock(&mm->page_table_lock); - } while(len); -out: + } while (len); return i; } - EXPORT_SYMBOL(get_user_pages); static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, @@ -1264,7 +1239,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, } old_page = pfn_to_page(pfn); - if (!TestSetPageLocked(old_page)) { + if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { @@ -1711,10 +1686,6 @@ static int do_swap_page(struct mm_struct * mm, } /* The page isn't present yet, go ahead with the fault. */ - - swap_free(entry); - if (vm_swap_full()) - remove_exclusive_swap_page(page); inc_mm_counter(mm, rss); pte = mk_pte(page, vma->vm_page_prot); @@ -1722,12 +1693,16 @@ static int do_swap_page(struct mm_struct * mm, pte = maybe_mkwrite(pte_mkdirty(pte), vma); write_access = 0; } - unlock_page(page); flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + unlock_page(page); + if (write_access) { if (do_wp_page(mm, vma, address, page_table, pmd, pte) == VM_FAULT_OOM) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 08c41da429c..cb41c31e7c8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -238,46 +238,80 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) } /* Ensure all existing pages follow the policy. */ -static int -verify_pages(struct mm_struct *mm, - unsigned long addr, unsigned long end, unsigned long *nodes) +static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, unsigned long *nodes) { - while (addr < end) { - struct page *p; - pte_t *pte; - pmd_t *pmd; - pud_t *pud; - pgd_t *pgd; - pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) { - unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK; - if (next > addr) - break; - addr = next; + pte_t *orig_pte; + pte_t *pte; + + spin_lock(&mm->page_table_lock); + orig_pte = pte = pte_offset_map(pmd, addr); + do { + unsigned long pfn; + unsigned int nid; + + if (!pte_present(*pte)) continue; - } - pud = pud_offset(pgd, addr); - if (pud_none(*pud)) { - addr = (addr + PUD_SIZE) & PUD_MASK; + pfn = pte_pfn(*pte); + if (!pfn_valid(pfn)) continue; - } - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - addr = (addr + PMD_SIZE) & PMD_MASK; + nid = pfn_to_nid(pfn); + if (!test_bit(nid, nodes)) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(orig_pte); + spin_unlock(&mm->page_table_lock); + return addr != end; +} + +static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, unsigned long *nodes) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) continue; - } - p = NULL; - pte = pte_offset_map(pmd, addr); - if (pte_present(*pte)) - p = pte_page(*pte); - pte_unmap(pte); - if (p) { - unsigned nid = page_to_nid(p); - if (!test_bit(nid, nodes)) - return -EIO; - } - addr += PAGE_SIZE; - } + if (check_pte_range(mm, pmd, addr, next, nodes)) + return -EIO; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, unsigned long *nodes) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + if (check_pmd_range(mm, pud, addr, next, nodes)) + return -EIO; + } while (pud++, addr = next, addr != end); + return 0; +} + +static inline int check_pgd_range(struct mm_struct *mm, + unsigned long addr, unsigned long end, unsigned long *nodes) +{ + pgd_t *pgd; + unsigned long next; + + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + if (check_pud_range(mm, pgd, addr, next, nodes)) + return -EIO; + } while (pgd++, addr = next, addr != end); return 0; } @@ -299,7 +333,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT); if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { - err = verify_pages(vma->vm_mm, + err = check_pgd_range(vma->vm_mm, vma->vm_start, vma->vm_end, nodes); if (err) { first = ERR_PTR(err); @@ -721,7 +755,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); page = __alloc_pages(gfp, order, zl); if (page && page_zone(page) == zl->zones[0]) { - zl->zones[0]->pageset[get_cpu()].interleave_hit++; + zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; put_cpu(); } return page; diff --git a/mm/mmap.c b/mm/mmap.c index de54acd9942..da3fa90a0aa 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } - start_addr = addr = mm->free_area_cache; + if (len > mm->cached_hole_size) { + start_addr = addr = mm->free_area_cache; + } else { + start_addr = addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; + } full_search: for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { @@ -1186,7 +1191,9 @@ full_search: * some holes. */ if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = addr = TASK_UNMAPPED_BASE; + addr = TASK_UNMAPPED_BASE; + start_addr = addr; + mm->cached_hole_size = 0; goto full_search; } return -ENOMEM; @@ -1198,19 +1205,22 @@ full_search: mm->free_area_cache = addr + len; return addr; } + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; addr = vma->vm_end; } } #endif -void arch_unmap_area(struct vm_area_struct *area) +void arch_unmap_area(struct mm_struct *mm, unsigned long addr) { /* * Is this a new hole at the lowest possible address? */ - if (area->vm_start >= TASK_UNMAPPED_BASE && - area->vm_start < area->vm_mm->free_area_cache) - area->vm_mm->free_area_cache = area->vm_start; + if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { + mm->free_area_cache = addr; + mm->cached_hole_size = ~0UL; + } } /* @@ -1240,6 +1250,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } + /* check if free_area_cache is useful for us */ + if (len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = mm->mmap_base; + } + /* either no address requested or can't fit in requested address hole */ addr = mm->free_area_cache; @@ -1251,6 +1267,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return (mm->free_area_cache = addr-len); } + if (mm->mmap_base < len) + goto bottomup; + addr = mm->mmap_base-len; do { @@ -1264,38 +1283,45 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, /* remember the address as a hint for next time */ return (mm->free_area_cache = addr); + /* remember the largest hole we saw so far */ + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + /* try just below the current vma->vm_start */ addr = vma->vm_start-len; } while (len < vma->vm_start); +bottomup: /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ - mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->cached_hole_size = ~0UL; + mm->free_area_cache = TASK_UNMAPPED_BASE; addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); /* * Restore the topdown base: */ mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = ~0UL; return addr; } #endif -void arch_unmap_area_topdown(struct vm_area_struct *area) +void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) { /* * Is this a new hole at the highest possible address? */ - if (area->vm_end > area->vm_mm->free_area_cache) - area->vm_mm->free_area_cache = area->vm_end; + if (addr > mm->free_area_cache) + mm->free_area_cache = addr; /* dont allow allocations above current base */ - if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base) - area->vm_mm->free_area_cache = area->vm_mm->mmap_base; + if (mm->free_area_cache > mm->mmap_base) + mm->free_area_cache = mm->mmap_base; } unsigned long @@ -1595,7 +1621,6 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) if (area->vm_flags & VM_LOCKED) area->vm_mm->locked_vm -= len >> PAGE_SHIFT; vm_stat_unaccount(area); - area->vm_mm->unmap_area(area); remove_vm_struct(area); } @@ -1649,6 +1674,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; + unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); do { @@ -1659,6 +1685,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } while (vma && vma->vm_start < end); *insertion_point = vma; tail_vma->vm_next = NULL; + if (mm->unmap_area == arch_unmap_area) + addr = prev ? prev->vm_end : mm->mmap_base; + else + addr = vma ? vma->vm_start : mm->mmap_base; + mm->unmap_area(mm, addr); mm->mmap_cache = NULL; /* Kill the cache. */ } diff --git a/mm/msync.c b/mm/msync.c index 090f426bca7..d0f5a1bce7c 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -34,6 +34,8 @@ static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!pte_present(*pte)) continue; + if (!pte_maybe_dirty(*pte)) + continue; pfn = pte_pfn(*pte); if (!pfn_valid(pfn)) continue; diff --git a/mm/nommu.c b/mm/nommu.c index c53e9c8f6b4..ce74452c02d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1067,7 +1067,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void arch_unmap_area(struct vm_area_struct *area) +void arch_unmap_area(struct mm_struct *mm, unsigned long addr) { } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4bbb1cb1049..59666d905f1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -258,6 +258,10 @@ void out_of_memory(unsigned int __nocast gfp_mask) struct mm_struct *mm = NULL; task_t * p; + printk("oom-killer: gfp_mask=0x%x\n", gfp_mask); + /* print memory stats */ + show_mem(); + read_lock(&tasklist_lock); retry: p = select_bad_process(); @@ -268,12 +272,9 @@ retry: /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { read_unlock(&tasklist_lock); - show_free_areas(); panic("Out of memory and no killable processes...\n"); } - printk("oom-killer: gfp_mask=0x%x\n", gfp_mask); - show_free_areas(); mm = oom_kill_process(p); if (!mm) goto retry; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b1061b1962f..206920796f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page) printk(KERN_EMERG "Backtrace:\n"); dump_stack(); printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); - page->flags &= ~(1 << PG_private | + page->flags &= ~(1 << PG_lru | + 1 << PG_private | 1 << PG_locked | - 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); @@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order) */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapcount(page) || - (page->flags & ( + if ( page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 0 || + (page->flags & ( + 1 << PG_lru | 1 << PG_private | 1 << PG_locked | - 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); @@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, return allocated; } +#ifdef CONFIG_NUMA +/* Called from the slab reaper to drain remote pagesets */ +void drain_remote_pages(void) +{ + struct zone *zone; + int i; + unsigned long flags; + + local_irq_save(flags); + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + /* Do not drain local pagesets */ + if (zone->zone_pgdat->node_id == numa_node_id()) + continue; + + pset = zone->pageset[smp_processor_id()]; + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + if (pcp->count) + pcp->count -= free_pages_bulk(zone, pcp->count, + &pcp->list, 0); + } + } + local_irq_restore(flags); +} +#endif + #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { @@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu) for_each_zone(zone) { struct per_cpu_pageset *pset; - pset = &zone->pageset[cpu]; + pset = zone_pcp(zone, cpu); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) local_irq_save(flags); cpu = smp_processor_id(); - p = &z->pageset[cpu]; + p = zone_pcp(z,cpu); if (pg == orig) { - z->pageset[cpu].numa_hit++; + p->numa_hit++; } else { p->numa_miss++; - zonelist->zones[0]->pageset[cpu].numa_foreign++; + zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; } if (pg == NODE_DATA(numa_node_id())) p->local_node++; @@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (PageAnon(page)) page->mapping = NULL; free_pages_check(__FUNCTION__, page); - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); list_add(&page->lru, &pcp->list); pcp->count++; + if (pcp->count >= pcp->high) + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); local_irq_restore(flags); put_cpu(); } @@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) if (order == 0) { struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, @@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, return 1; } +static inline int +should_reclaim_zone(struct zone *z, unsigned int gfp_mask) +{ + if (!z->reclaim_pages) + return 0; + if (gfp_mask & __GFP_NORECLAIM) + return 0; + return 1; +} + /* * This is the 'heart' of the zoned buddy allocator. */ @@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, classzone_idx = zone_idx(zones[0]); - restart: +restart: /* Go through the zonelist once, looking for a zone with enough free */ for (i = 0; (z = zones[i]) != NULL; i++) { - - if (!zone_watermark_ok(z, order, z->pages_low, - classzone_idx, 0, 0)) - continue; + int do_reclaim = should_reclaim_zone(z, gfp_mask); if (!cpuset_zone_allowed(z)) continue; + /* + * If the zone is to attempt early page reclaim then this loop + * will try to reclaim pages and check the watermark a second + * time before giving up and falling back to the next zone. + */ +zone_reclaim_retry: + if (!zone_watermark_ok(z, order, z->pages_low, + classzone_idx, 0, 0)) { + if (!do_reclaim) + continue; + else { + zone_reclaim(z, gfp_mask, order); + /* Only try reclaim once */ + do_reclaim = 0; + goto zone_reclaim_retry; + } + } + page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; @@ -829,7 +889,7 @@ rebalance: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zones, gfp_mask, order); + did_some_progress = try_to_free_pages(zones, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -905,6 +965,7 @@ nopage: " order:%d, mode:0x%x\n", p->comm, order, gfp_mask); dump_stack(); + show_mem(); } return NULL; got_pg: @@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); } -unsigned long __read_page_state(unsigned offset) +unsigned long __read_page_state(unsigned long offset) { unsigned long ret = 0; int cpu; @@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset) return ret; } -void __mod_page_state(unsigned offset, unsigned long delta) +void __mod_page_state(unsigned long offset, unsigned long delta) { unsigned long flags; void* ptr; @@ -1237,22 +1298,23 @@ void show_free_areas(void) if (!cpu_possible(cpu)) continue; - pageset = zone->pageset + cpu; + pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d\n", + printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", cpu, temperature ? "cold" : "hot", pageset->pcp[temperature].low, pageset->pcp[temperature].high, - pageset->pcp[temperature].batch); + pageset->pcp[temperature].batch, + pageset->pcp[temperature].count); } } get_page_state(&ps); get_zone_counts(&active, &inactive, &free); - printk("\nFree pages: %11ukB (%ukB HighMem)\n", + printk("Free pages: %11ukB (%ukB HighMem)\n", K(nr_free_pages()), K(nr_free_highpages())); @@ -1620,6 +1682,155 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif +static int __devinit zone_batchsize(struct zone *zone) +{ + int batch; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. + * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. + */ + batch = (1 << fls(batch + batch/2)) - 1; + return batch; +} + +inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = max(1UL, 1 * batch); + INIT_LIST_HEAD(&pcp->list); + + pcp = &p->pcp[1]; /* cold*/ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = max(1UL, 1 * batch); + INIT_LIST_HEAD(&pcp->list); +} + +#ifdef CONFIG_NUMA +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * Some NUMA counter updates may also be caught by the boot pagesets. + * These will be discarded when bootup is complete. + */ +static struct per_cpu_pageset + boot_pageset[NR_CPUS] __initdata; + +/* + * Dynamically allocate memory for the + * per cpu pageset array in struct zone. + */ +static int __devinit process_zones(int cpu) +{ + struct zone *zone, *dzone; + + for_each_zone(zone) { + + zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), + GFP_KERNEL, cpu_to_node(cpu)); + if (!zone->pageset[cpu]) + goto bad; + + setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + } + + return 0; +bad: + for_each_zone(dzone) { + if (dzone == zone) + break; + kfree(dzone->pageset[cpu]); + dzone->pageset[cpu] = NULL; + } + return -ENOMEM; +} + +static inline void free_zone_pagesets(int cpu) +{ +#ifdef CONFIG_NUMA + struct zone *zone; + + for_each_zone(zone) { + struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + + zone_pcp(zone, cpu) = NULL; + kfree(pset); + } +#endif +} + +static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + if (process_zones(cpu)) + ret = NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + free_zone_pagesets(cpu); + break; +#endif + default: + break; + } + return ret; +} + +static struct notifier_block pageset_notifier = + { &pageset_cpuup_callback, NULL, 0 }; + +void __init setup_per_cpu_pageset() +{ + int err; + + /* Initialize per_cpu_pageset for cpu 0. + * A cpuup callback will do this for every cpu + * as it comes online + */ + err = process_zones(smp_processor_id()); + BUG_ON(err); + register_cpu_notifier(&pageset_notifier); +} + +#endif + /* * Set up the zone data structures: * - mark all pages reserved @@ -1662,48 +1873,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. - * - * OK, so we don't know how big the cache is. So guess. - */ - batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ - if (batch < 1) - batch = 1; - - /* - * Clamp the batch to a 2^n - 1 value. Having a power - * of 2 value was found to be more likely to have - * suboptimal cache aliasing properties in some cases. - * - * For example if 2 tasks are alternately allocating - * batches of pages, one task can end up with a lot - * of pages of one half of the possible page colors - * and the other with pages of the other colors. - */ - batch = (1 << fls(batch + batch/2)) - 1; + batch = zone_batchsize(zone); for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct per_cpu_pages *pcp; - - pcp = &zone->pageset[cpu].pcp[0]; /* hot */ - pcp->count = 0; - pcp->low = 2 * batch; - pcp->high = 6 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - - pcp = &zone->pageset[cpu].pcp[1]; /* cold */ - pcp->count = 0; - pcp->low = 0; - pcp->high = 2 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); +#ifdef CONFIG_NUMA + /* Early boot. Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); @@ -1713,6 +1892,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->nr_scan_inactive = 0; zone->nr_active = 0; zone->nr_inactive = 0; + atomic_set(&zone->reclaim_in_progress, -1); if (!size) continue; @@ -1853,6 +2033,115 @@ struct seq_operations fragmentation_op = { .show = frag_show, }; +/* + * Output information about zones in @pgdat. + */ +static int zoneinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { + int i; + + if (!zone->present_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + seq_printf(m, + "\n pages free %lu" + "\n min %lu" + "\n low %lu" + "\n high %lu" + "\n active %lu" + "\n inactive %lu" + "\n scanned %lu (a: %lu i: %lu)" + "\n spanned %lu" + "\n present %lu", + zone->free_pages, + zone->pages_min, + zone->pages_low, + zone->pages_high, + zone->nr_active, + zone->nr_inactive, + zone->pages_scanned, + zone->nr_scan_active, zone->nr_scan_inactive, + zone->spanned_pages, + zone->present_pages); + seq_printf(m, + "\n protection: (%lu", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) + seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, + ")" + "\n pagesets"); + for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { + struct per_cpu_pageset *pageset; + int j; + + pageset = zone_pcp(zone, i); + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + if (pageset->pcp[j].count) + break; + } + if (j == ARRAY_SIZE(pageset->pcp)) + continue; + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + seq_printf(m, + "\n cpu: %i pcp: %i" + "\n count: %i" + "\n low: %i" + "\n high: %i" + "\n batch: %i", + i, j, + pageset->pcp[j].count, + pageset->pcp[j].low, + pageset->pcp[j].high, + pageset->pcp[j].batch); + } +#ifdef CONFIG_NUMA + seq_printf(m, + "\n numa_hit: %lu" + "\n numa_miss: %lu" + "\n numa_foreign: %lu" + "\n interleave_hit: %lu" + "\n local_node: %lu" + "\n other_node: %lu", + pageset->numa_hit, + pageset->numa_miss, + pageset->numa_foreign, + pageset->interleave_hit, + pageset->local_node, + pageset->other_node); +#endif + } + seq_printf(m, + "\n all_unreclaimable: %u" + "\n prev_priority: %i" + "\n temp_priority: %i" + "\n start_pfn: %lu", + zone->all_unreclaimable, + zone->prev_priority, + zone->temp_priority, + zone->zone_start_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations zoneinfo_op = { + .start = frag_start, /* iterate over all zones. The same as in + * fragmentation. */ + .next = frag_next, + .stop = frag_stop, + .show = zoneinfo_show, +}; + static char *vmstat_text[] = { "nr_dirty", "nr_writeback", @@ -2058,10 +2347,10 @@ static void setup_per_zone_pages_min(void) min_pages = 128; zone->pages_min = min_pages; } else { - /* if it's a lowmem zone, reserve a number of pages + /* if it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = (pages_min * zone->present_pages) / + zone->pages_min = (pages_min * zone->present_pages) / lowmem_pages; } diff --git a/mm/rmap.c b/mm/rmap.c index 9827409eb7c..89770bd25f3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -539,27 +539,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) goto out_unmap; } - /* - * Don't pull an anonymous page out from under get_user_pages. - * GUP carefully breaks COW and raises page count (while holding - * page_table_lock, as we have here) to make sure that the page - * cannot be freed. If we unmap that page here, a user write - * access to the virtual address will bring back the page, but - * its raised count will (ironically) be taken to mean it's not - * an exclusive swap page, do_wp_page will replace it by a copy - * page, and the user never get to see the data GUP was holding - * the original page for. - * - * This test is also useful for when swapoff (unuse_process) has - * to drop page lock: its reference to the page stops existing - * ptes from being unmapped, so swapoff can make progress. - */ - if (PageSwapCache(page) && - page_count(page) != page_mapcount(page) + 2) { - ret = SWAP_FAIL; - goto out_unmap; - } - /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); pteval = ptep_clear_flush(vma, address, pte); diff --git a/mm/shmem.c b/mm/shmem.c index 61574b81d97..e64fa726a79 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -6,8 +6,8 @@ * 2000-2001 Christoph Rohland * 2000-2001 SAP AG * 2002 Red Hat Inc. - * Copyright (C) 2002-2004 Hugh Dickins. - * Copyright (C) 2002-2004 VERITAS Software Corporation. + * Copyright (C) 2002-2005 Hugh Dickins. + * Copyright (C) 2002-2005 VERITAS Software Corporation. * Copyright (C) 2004 Andi Kleen, SuSE Labs * * Extended attribute support for tmpfs: @@ -194,7 +194,7 @@ static DEFINE_SPINLOCK(shmem_swaplist_lock); static void shmem_free_blocks(struct inode *inode, long pages) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo) { + if (sbinfo->max_blocks) { spin_lock(&sbinfo->stat_lock); sbinfo->free_blocks += pages; inode->i_blocks -= pages*BLOCKS_PER_PAGE; @@ -357,7 +357,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long * page (and perhaps indirect index pages) yet to allocate: * a waste to allocate index if we cannot allocate data. */ - if (sbinfo) { + if (sbinfo->max_blocks) { spin_lock(&sbinfo->stat_lock); if (sbinfo->free_blocks <= 1) { spin_unlock(&sbinfo->stat_lock); @@ -677,8 +677,8 @@ static void shmem_delete_inode(struct inode *inode) spin_unlock(&shmem_swaplist_lock); } } - if (sbinfo) { - BUG_ON(inode->i_blocks); + BUG_ON(inode->i_blocks); + if (sbinfo->max_inodes) { spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; spin_unlock(&sbinfo->stat_lock); @@ -1080,7 +1080,7 @@ repeat: } else { shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo) { + if (sbinfo->max_blocks) { spin_lock(&sbinfo->stat_lock); if (sbinfo->free_blocks == 0 || shmem_acct_block(info->flags)) { @@ -1269,7 +1269,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - if (sbinfo) { + if (sbinfo->max_inodes) { spin_lock(&sbinfo->stat_lock); if (!sbinfo->free_inodes) { spin_unlock(&sbinfo->stat_lock); @@ -1319,7 +1319,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) mpol_shared_policy_init(&info->policy); break; } - } else if (sbinfo) { + } else if (sbinfo->max_inodes) { spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; spin_unlock(&sbinfo->stat_lock); @@ -1328,31 +1328,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) } #ifdef CONFIG_TMPFS - -static int shmem_set_size(struct shmem_sb_info *sbinfo, - unsigned long max_blocks, unsigned long max_inodes) -{ - int error; - unsigned long blocks, inodes; - - spin_lock(&sbinfo->stat_lock); - blocks = sbinfo->max_blocks - sbinfo->free_blocks; - inodes = sbinfo->max_inodes - sbinfo->free_inodes; - error = -EINVAL; - if (max_blocks < blocks) - goto out; - if (max_inodes < inodes) - goto out; - error = 0; - sbinfo->max_blocks = max_blocks; - sbinfo->free_blocks = max_blocks - blocks; - sbinfo->max_inodes = max_inodes; - sbinfo->free_inodes = max_inodes - inodes; -out: - spin_unlock(&sbinfo->stat_lock); - return error; -} - static struct inode_operations shmem_symlink_inode_operations; static struct inode_operations shmem_symlink_inline_operations; @@ -1607,15 +1582,17 @@ static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; buf->f_namelen = NAME_MAX; - if (sbinfo) { - spin_lock(&sbinfo->stat_lock); + spin_lock(&sbinfo->stat_lock); + if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + } + if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; - spin_unlock(&sbinfo->stat_lock); } /* else leave those fields 0 like simple_statfs */ + spin_unlock(&sbinfo->stat_lock); return 0; } @@ -1672,7 +1649,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. */ - if (sbinfo) { + if (sbinfo->max_inodes) { spin_lock(&sbinfo->stat_lock); if (!sbinfo->free_inodes) { spin_unlock(&sbinfo->stat_lock); @@ -1697,7 +1674,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo) { + if (sbinfo->max_inodes) { spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; spin_unlock(&sbinfo->stat_lock); @@ -1921,22 +1898,42 @@ bad_val: static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - unsigned long max_blocks = 0; - unsigned long max_inodes = 0; + unsigned long max_blocks = sbinfo->max_blocks; + unsigned long max_inodes = sbinfo->max_inodes; + unsigned long blocks; + unsigned long inodes; + int error = -EINVAL; + + if (shmem_parse_options(data, NULL, NULL, NULL, + &max_blocks, &max_inodes)) + return error; - if (sbinfo) { - max_blocks = sbinfo->max_blocks; - max_inodes = sbinfo->max_inodes; - } - if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes)) - return -EINVAL; - /* Keep it simple: disallow limited <-> unlimited remount */ - if ((max_blocks || max_inodes) == !sbinfo) - return -EINVAL; - /* But allow the pointless unlimited -> unlimited remount */ - if (!sbinfo) - return 0; - return shmem_set_size(sbinfo, max_blocks, max_inodes); + spin_lock(&sbinfo->stat_lock); + blocks = sbinfo->max_blocks - sbinfo->free_blocks; + inodes = sbinfo->max_inodes - sbinfo->free_inodes; + if (max_blocks < blocks) + goto out; + if (max_inodes < inodes) + goto out; + /* + * Those tests also disallow limited->unlimited while any are in + * use, so i_blocks will always be zero when max_blocks is zero; + * but we must separately disallow unlimited->limited, because + * in that case we have no record of how much is already in use. + */ + if (max_blocks && !sbinfo->max_blocks) + goto out; + if (max_inodes && !sbinfo->max_inodes) + goto out; + + error = 0; + sbinfo->max_blocks = max_blocks; + sbinfo->free_blocks = max_blocks - blocks; + sbinfo->max_inodes = max_inodes; + sbinfo->free_inodes = max_inodes - inodes; +out: + spin_unlock(&sbinfo->stat_lock); + return error; } #endif @@ -1961,11 +1958,11 @@ static int shmem_fill_super(struct super_block *sb, uid_t uid = current->fsuid; gid_t gid = current->fsgid; int err = -ENOMEM; - -#ifdef CONFIG_TMPFS + struct shmem_sb_info *sbinfo; unsigned long blocks = 0; unsigned long inodes = 0; +#ifdef CONFIG_TMPFS /* * Per default we only allow half of the physical ram per * tmpfs instance, limiting inodes to one per page of lowmem; @@ -1976,34 +1973,34 @@ static int shmem_fill_super(struct super_block *sb, inodes = totalram_pages - totalhigh_pages; if (inodes > blocks) inodes = blocks; - - if (shmem_parse_options(data, &mode, - &uid, &gid, &blocks, &inodes)) + if (shmem_parse_options(data, &mode, &uid, &gid, + &blocks, &inodes)) return -EINVAL; } - - if (blocks || inodes) { - struct shmem_sb_info *sbinfo; - sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL); - if (!sbinfo) - return -ENOMEM; - sb->s_fs_info = sbinfo; - spin_lock_init(&sbinfo->stat_lock); - sbinfo->max_blocks = blocks; - sbinfo->free_blocks = blocks; - sbinfo->max_inodes = inodes; - sbinfo->free_inodes = inodes; - } - sb->s_xattr = shmem_xattr_handlers; #else sb->s_flags |= MS_NOUSER; #endif + /* Round up to L1_CACHE_BYTES to resist false sharing */ + sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), + L1_CACHE_BYTES), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + + spin_lock_init(&sbinfo->stat_lock); + sbinfo->max_blocks = blocks; + sbinfo->free_blocks = blocks; + sbinfo->max_inodes = inodes; + sbinfo->free_inodes = inodes; + + sb->s_fs_info = sbinfo; sb->s_maxbytes = SHMEM_MAX_BYTES; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; + sb->s_xattr = shmem_xattr_handlers; + inode = shmem_get_inode(sb, S_IFDIR | mode, 0); if (!inode) goto failed; diff --git a/mm/slab.c b/mm/slab.c index c78d343b3c5..93cbbbb39f4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2851,6 +2851,7 @@ next: } check_irq_on(); up(&cache_chain_sem); + drain_remote_pages(); /* Setup the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); } diff --git a/mm/swapfile.c b/mm/swapfile.c index da48405cd9a..60cd24a5520 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -276,61 +276,37 @@ void swap_free(swp_entry_t entry) } /* - * Check if we're the only user of a swap page, - * when the page is locked. + * How many references to page are currently swapped out? */ -static int exclusive_swap_page(struct page *page) +static inline int page_swapcount(struct page *page) { - int retval = 0; - struct swap_info_struct * p; + int count = 0; + struct swap_info_struct *p; swp_entry_t entry; entry.val = page->private; p = swap_info_get(entry); if (p) { - /* Is the only swap cache user the cache itself? */ - if (p->swap_map[swp_offset(entry)] == 1) { - /* Recheck the page count with the swapcache lock held.. */ - write_lock_irq(&swapper_space.tree_lock); - if (page_count(page) == 2) - retval = 1; - write_unlock_irq(&swapper_space.tree_lock); - } + /* Subtract the 1 for the swap cache itself */ + count = p->swap_map[swp_offset(entry)] - 1; swap_info_put(p); } - return retval; + return count; } /* * We can use this swap cache entry directly * if there are no other references to it. - * - * Here "exclusive_swap_page()" does the real - * work, but we opportunistically check whether - * we need to get all the locks first.. */ int can_share_swap_page(struct page *page) { - int retval = 0; + int count; - if (!PageLocked(page)) - BUG(); - switch (page_count(page)) { - case 3: - if (!PagePrivate(page)) - break; - /* Fallthrough */ - case 2: - if (!PageSwapCache(page)) - break; - retval = exclusive_swap_page(page); - break; - case 1: - if (PageReserved(page)) - break; - retval = 1; - } - return retval; + BUG_ON(!PageLocked(page)); + count = page_mapcount(page); + if (count <= 1 && PageSwapCache(page)) + count += page_swapcount(page); + return count == 1; } /* @@ -529,9 +505,10 @@ static int unuse_mm(struct mm_struct *mm, if (!down_read_trylock(&mm->mmap_sem)) { /* - * Our reference to the page stops try_to_unmap_one from - * unmapping its ptes, so swapoff can make progress. + * Activate page so shrink_cache is unlikely to unmap its + * ptes while lock is dropped, so swapoff can make progress. */ + activate_page(page); unlock_page(page); down_read(&mm->mmap_sem); lock_page(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 269eded9b45..4b8e62a1937 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -74,6 +74,9 @@ struct scan_control { int may_writepage; + /* Can pages be swapped as part of reclaim? */ + int may_swap; + /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. * In this context, it doesn't matter that we scan the @@ -180,17 +183,20 @@ EXPORT_SYMBOL(remove_shrinker); * `lru_pages' represents the number of on-LRU pages in all the zones which * are eligible for the caller's allocation attempt. It is used for balancing * slab reclaim versus page reclaim. + * + * Returns the number of slab objects which we shrunk. */ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, unsigned long lru_pages) { struct shrinker *shrinker; + int ret = 0; if (scanned == 0) scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) - return 0; + return 1; /* Assume we'll be able to shrink next time */ list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; @@ -209,10 +215,14 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, while (total_scan >= SHRINK_BATCH) { long this_scan = SHRINK_BATCH; int shrink_ret; + int nr_before; + nr_before = (*shrinker->shrinker)(0, gfp_mask); shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); if (shrink_ret == -1) break; + if (shrink_ret < nr_before) + ret += nr_before - shrink_ret; mod_page_state(slabs_scanned, this_scan); total_scan -= this_scan; @@ -222,7 +232,7 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, shrinker->nr += total_scan; } up_read(&shrinker_rwsem); - return 0; + return ret; } /* Called without lock on whether page is mapped, so answer is unstable */ @@ -407,7 +417,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) { + if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) { if (!add_to_swap(page)) goto activate_locked; } @@ -890,7 +900,9 @@ shrink_caches(struct zone **zones, struct scan_control *sc) if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ + atomic_inc(&zone->reclaim_in_progress); shrink_zone(zone, sc); + atomic_dec(&zone->reclaim_in_progress); } } @@ -907,8 +919,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. */ -int try_to_free_pages(struct zone **zones, - unsigned int gfp_mask, unsigned int order) +int try_to_free_pages(struct zone **zones, unsigned int gfp_mask) { int priority; int ret = 0; @@ -920,6 +931,7 @@ int try_to_free_pages(struct zone **zones, sc.gfp_mask = gfp_mask; sc.may_writepage = 0; + sc.may_swap = 1; inc_page_state(allocstall); @@ -1020,6 +1032,7 @@ loop_again: total_reclaimed = 0; sc.gfp_mask = GFP_KERNEL; sc.may_writepage = 0; + sc.may_swap = 1; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1079,6 +1092,7 @@ scan: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; + int nr_slab; if (zone->present_pages == 0) continue; @@ -1098,16 +1112,19 @@ scan: sc.nr_reclaimed = 0; sc.priority = priority; sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; + atomic_inc(&zone->reclaim_in_progress); shrink_zone(zone, &sc); + atomic_dec(&zone->reclaim_in_progress); reclaim_state->reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); + nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, + lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_reclaimed += sc.nr_reclaimed; total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; - if (zone->pages_scanned >= (zone->nr_active + - zone->nr_inactive) * 4) + if (nr_slab == 0 && zone->pages_scanned >= + (zone->nr_active + zone->nr_inactive) * 4) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -1309,3 +1326,73 @@ static int __init kswapd_init(void) } module_init(kswapd_init) + + +/* + * Try to free up some pages from this zone through reclaim. + */ +int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order) +{ + struct scan_control sc; + int nr_pages = 1 << order; + int total_reclaimed = 0; + + /* The reclaim may sleep, so don't do it if sleep isn't allowed */ + if (!(gfp_mask & __GFP_WAIT)) + return 0; + if (zone->all_unreclaimable) + return 0; + + sc.gfp_mask = gfp_mask; + sc.may_writepage = 0; + sc.may_swap = 0; + sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_scanned = 0; + sc.nr_reclaimed = 0; + /* scan at the highest priority */ + sc.priority = 0; + + if (nr_pages > SWAP_CLUSTER_MAX) + sc.swap_cluster_max = nr_pages; + else + sc.swap_cluster_max = SWAP_CLUSTER_MAX; + + /* Don't reclaim the zone if there are other reclaimers active */ + if (!atomic_inc_and_test(&zone->reclaim_in_progress)) + goto out; + + shrink_zone(zone, &sc); + total_reclaimed = sc.nr_reclaimed; + + out: + atomic_dec(&zone->reclaim_in_progress); + return total_reclaimed; +} + +asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, + unsigned int state) +{ + struct zone *z; + int i; + + if (node >= MAX_NUMNODES || !node_online(node)) + return -EINVAL; + + /* This will break if we ever add more zones */ + if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM))) + return -EINVAL; + + for (i = 0; i < MAX_NR_ZONES; i++) { + if (!(zone & 1<<i)) + continue; + + z = &NODE_DATA(node)->node_zones[i]; + + if (state) + z->reclaim_pages = 1; + else + z->reclaim_pages = 0; + } + + return 0; +} |