aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig17
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c4
-rw-r--r--mm/fadvise.c16
-rw-r--r--mm/filemap.c56
-rw-r--r--mm/filemap_xip.c13
-rw-r--r--mm/fremap.c5
-rw-r--r--mm/highmem.c4
-rw-r--r--mm/hugetlb.c90
-rw-r--r--mm/internal.h4
-rw-r--r--mm/memory.c243
-rw-r--r--mm/memory_hotplug.c6
-rw-r--r--mm/migrate.c35
-rw-r--r--mm/mmap.c26
-rw-r--r--mm/nommu.c56
-rw-r--r--mm/oom_kill.c7
-rw-r--r--mm/page-writeback.c15
-rw-r--r--mm/page_alloc.c174
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/pagewalk.c131
-rw-r--r--mm/quicklist.c12
-rw-r--r--mm/rmap.c6
-rw-r--r--mm/shmem.c495
-rw-r--r--mm/slab.c69
-rw-r--r--mm/slob.c54
-rw-r--r--mm/slub.c299
-rw-r--r--mm/sparse.c32
-rw-r--r--mm/swap.c10
-rw-r--r--mm/swap_state.c153
-rw-r--r--mm/swapfile.c113
-rw-r--r--mm/tiny-shmem.c12
-rw-r--r--mm/truncate.c10
-rw-r--r--mm/vmalloc.c74
-rw-r--r--mm/vmstat.c61
34 files changed, 1373 insertions, 932 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c070ec0c15b..0016ebd4dcb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -112,18 +112,17 @@ config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
-#
-# SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page
-# and page_to_pfn. The most efficient option where kernel virtual space is
-# not under pressure.
-#
config SPARSEMEM_VMEMMAP_ENABLE
def_bool n
config SPARSEMEM_VMEMMAP
- bool
- depends on SPARSEMEM
- default y if (SPARSEMEM_VMEMMAP_ENABLE)
+ bool "Sparse Memory virtual memmap"
+ depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
+ default y
+ help
+ SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
+ pfn_to_page and page_to_pfn operations. This is the most
+ efficient option when sufficient kernel resources are available.
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
@@ -188,7 +187,7 @@ config BOUNCE
config NR_QUICK
int
depends on QUICKLIST
- default "2" if (SUPERH && !SUPERH64)
+ default "2" if SUPERH
default "1"
config VIRT_TO_BUS
diff --git a/mm/Makefile b/mm/Makefile
index e222cc5a79c..4af5dff3727 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -13,6 +13,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o $(mmu-y)
+obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b0ceb29da4c..e8644b1e552 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -7,7 +7,7 @@
int bdi_init(struct backing_dev_info *bdi)
{
- int i, j;
+ int i;
int err;
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
@@ -21,7 +21,7 @@ int bdi_init(struct backing_dev_info *bdi)
if (err) {
err:
- for (j = 0; j < i; j++)
+ while (i--)
percpu_counter_destroy(&bdi->bdi_stat[i]);
}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0df4c899e97..3c0f1e99f5e 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -49,9 +49,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
goto out;
}
- if (mapping->a_ops->get_xip_page)
- /* no bad return value, but ignore advice */
+ if (mapping->a_ops->get_xip_page) {
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_NOREUSE:
+ case POSIX_FADV_DONTNEED:
+ /* no bad return value, but ignore advice */
+ break;
+ default:
+ ret = -EINVAL;
+ }
goto out;
+ }
/* Careful about overflows. Len == 0 means "as much as possible" */
endbyte = offset + len;
diff --git a/mm/filemap.c b/mm/filemap.c
index 188cf5fd3e8..81fb9bff0d4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -65,7 +65,6 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
- * ->zone.lock
*
* ->i_mutex
* ->i_mmap_lock (truncate->unmap_mapping_range)
@@ -124,6 +123,18 @@ void __remove_from_page_cache(struct page *page)
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
BUG_ON(page_mapped(page));
+
+ /*
+ * Some filesystems seem to re-dirty the page even after
+ * the VM has canceled the dirty bit (eg ext3 journaling).
+ *
+ * Fix it up by doing a final dirty accounting check after
+ * having removed the page entirely.
+ */
+ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ }
}
void remove_from_page_cache(struct page *page)
@@ -173,6 +184,12 @@ static int sync_page(void *word)
return 0;
}
+static int sync_page_killable(void *word)
+{
+ sync_page(word);
+ return fatal_signal_pending(current) ? -EINTR : 0;
+}
+
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
@@ -510,7 +527,7 @@ static inline void wake_up_page(struct page *page, int bit)
__wake_up_bit(page_waitqueue(page), &page->flags, bit);
}
-void fastcall wait_on_page_bit(struct page *page, int bit_nr)
+void wait_on_page_bit(struct page *page, int bit_nr)
{
DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
@@ -534,7 +551,7 @@ EXPORT_SYMBOL(wait_on_page_bit);
* the clear_bit and the read of the waitqueue (to avoid SMP races with a
* parallel wait_on_page_locked()).
*/
-void fastcall unlock_page(struct page *page)
+void unlock_page(struct page *page)
{
smp_mb__before_clear_bit();
if (!TestClearPageLocked(page))
@@ -568,7 +585,7 @@ EXPORT_SYMBOL(end_page_writeback);
* chances are that on the second loop, the block layer's plug list is empty,
* so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
*/
-void fastcall __lock_page(struct page *page)
+void __lock_page(struct page *page)
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
@@ -577,11 +594,19 @@ void fastcall __lock_page(struct page *page)
}
EXPORT_SYMBOL(__lock_page);
+int fastcall __lock_page_killable(struct page *page)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+ return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ sync_page_killable, TASK_KILLABLE);
+}
+
/*
* Variant of lock_page that does not require the caller to hold a reference
* on the page's mapping.
*/
-void fastcall __lock_page_nosync(struct page *page)
+void __lock_page_nosync(struct page *page)
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
@@ -968,7 +993,8 @@ page_ok:
page_not_up_to_date:
/* Get exclusive access to the page ... */
- lock_page(page);
+ if (lock_page_killable(page))
+ goto readpage_eio;
/* Did it get truncated before we got the lock? */
if (!page->mapping) {
@@ -996,7 +1022,8 @@ readpage:
}
if (!PageUptodate(page)) {
- lock_page(page);
+ if (lock_page_killable(page))
+ goto readpage_eio;
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
@@ -1007,15 +1034,16 @@ readpage:
goto find_page;
}
unlock_page(page);
- error = -EIO;
shrink_readahead_size_eio(filp, ra);
- goto readpage_error;
+ goto readpage_eio;
}
unlock_page(page);
}
goto page_ok;
+readpage_eio:
+ error = -EIO;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
desc->error = error;
@@ -1248,7 +1276,7 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
*/
-static int fastcall page_cache_read(struct file * file, pgoff_t offset)
+static int page_cache_read(struct file *file, pgoff_t offset)
{
struct address_space *mapping = file->f_mapping;
struct page *page;
@@ -1721,7 +1749,11 @@ static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes)
const struct iovec *iov = i->iov;
size_t base = i->iov_offset;
- while (bytes) {
+ /*
+ * The !iov->iov_len check ensures we skip over unlikely
+ * zero-length segments.
+ */
+ while (bytes || !iov->iov_len) {
int copy = min(bytes, iov->iov_len - base);
bytes -= copy;
@@ -2239,6 +2271,7 @@ again:
cond_resched();
+ iov_iter_advance(i, copied);
if (unlikely(copied == 0)) {
/*
* If we were unable to copy any data at all, we must
@@ -2252,7 +2285,6 @@ again:
iov_iter_single_seg_count(i));
goto again;
}
- iov_iter_advance(i, copied);
pos += copied;
written += copied;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 32132f3cd64..0420a0292b0 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -25,14 +25,15 @@ static struct page *__xip_sparse_page;
static struct page *xip_sparse_page(void)
{
if (!__xip_sparse_page) {
- unsigned long zeroes = get_zeroed_page(GFP_HIGHUSER);
- if (zeroes) {
+ struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
+
+ if (page) {
static DEFINE_SPINLOCK(xip_alloc_lock);
spin_lock(&xip_alloc_lock);
if (!__xip_sparse_page)
- __xip_sparse_page = virt_to_page(zeroes);
+ __xip_sparse_page = page;
else
- free_page(zeroes);
+ __free_page(page);
spin_unlock(&xip_alloc_lock);
}
}
@@ -314,7 +315,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
fault_in_pages_readable(buf, bytes);
kaddr = kmap_atomic(page, KM_USER0);
copied = bytes -
- __copy_from_user_inatomic_nocache(kaddr, buf, bytes);
+ __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(page);
@@ -430,7 +431,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
else
return PTR_ERR(page);
}
- zero_user_page(page, offset, length, KM_USER0);
+ zero_user(page, offset, length);
return 0;
}
EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/fremap.c b/mm/fremap.c
index 14bd3bf7826..69a37c2bdf8 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -190,10 +190,13 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
*/
if (mapping_cap_account_dirty(mapping)) {
unsigned long addr;
+ struct file *file = vma->vm_file;
flags &= MAP_NONBLOCK;
- addr = mmap_region(vma->vm_file, start, size,
+ get_file(file);
+ addr = mmap_region(file, start, size,
flags, vma->vm_flags, pgoff, 1);
+ fput(file);
if (IS_ERR_VALUE(addr)) {
err = addr;
} else {
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a967bc3515..35d47733cde 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -163,7 +163,7 @@ start:
return vaddr;
}
-void fastcall *kmap_high(struct page *page)
+void *kmap_high(struct page *page)
{
unsigned long vaddr;
@@ -185,7 +185,7 @@ void fastcall *kmap_high(struct page *page)
EXPORT_SYMBOL(kmap_high);
-void fastcall kunmap_high(struct page *page)
+void kunmap_high(struct page *page)
{
unsigned long vaddr;
unsigned long nr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6121b57bbe9..1a5642074e3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,7 +31,7 @@ static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
-int hugetlb_dynamic_pool;
+unsigned long nr_overcommit_huge_pages;
static int hugetlb_next_nid;
/*
@@ -227,22 +227,58 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
unsigned long address)
{
struct page *page;
+ unsigned int nid;
- /* Check if the dynamic pool is enabled */
- if (!hugetlb_dynamic_pool)
+ /*
+ * Assume we will successfully allocate the surplus page to
+ * prevent racing processes from causing the surplus to exceed
+ * overcommit
+ *
+ * This however introduces a different race, where a process B
+ * tries to grow the static hugepage pool while alloc_pages() is
+ * called by process A. B will only examine the per-node
+ * counters in determining if surplus huge pages can be
+ * converted to normal huge pages in adjust_pool_surplus(). A
+ * won't be able to increment the per-node counter, until the
+ * lock is dropped by B, but B doesn't drop hugetlb_lock until
+ * no more huge pages can be converted from surplus to normal
+ * state (and doesn't try to convert again). Thus, we have a
+ * case where a surplus huge page exists, the pool is grown, and
+ * the surplus huge page still exists after, even though it
+ * should just have been converted to a normal huge page. This
+ * does not leak memory, though, as the hugepage will be freed
+ * once it is out of use. It also does not allow the counters to
+ * go out of whack in adjust_pool_surplus() as we don't modify
+ * the node values until we've gotten the hugepage and only the
+ * per-node value is checked there.
+ */
+ spin_lock(&hugetlb_lock);
+ if (surplus_huge_pages >= nr_overcommit_huge_pages) {
+ spin_unlock(&hugetlb_lock);
return NULL;
+ } else {
+ nr_huge_pages++;
+ surplus_huge_pages++;
+ }
+ spin_unlock(&hugetlb_lock);
page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
+
+ spin_lock(&hugetlb_lock);
if (page) {
+ nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
- spin_lock(&hugetlb_lock);
- nr_huge_pages++;
- nr_huge_pages_node[page_to_nid(page)]++;
- surplus_huge_pages++;
- surplus_huge_pages_node[page_to_nid(page)]++;
- spin_unlock(&hugetlb_lock);
+ /*
+ * We incremented the global counters already
+ */
+ nr_huge_pages_node[nid]++;
+ surplus_huge_pages_node[nid]++;
+ } else {
+ nr_huge_pages--;
+ surplus_huge_pages--;
}
+ spin_unlock(&hugetlb_lock);
return page;
}
@@ -382,9 +418,14 @@ static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
if (free_huge_pages > resv_huge_pages)
page = dequeue_huge_page(vma, addr);
spin_unlock(&hugetlb_lock);
- if (!page)
+ if (!page) {
page = alloc_buddy_huge_page(vma, addr);
- return page ? page : ERR_PTR(-VM_FAULT_OOM);
+ if (!page) {
+ hugetlb_put_quota(vma->vm_file->f_mapping, 1);
+ return ERR_PTR(-VM_FAULT_OOM);
+ }
+ }
+ return page;
}
static struct page *alloc_huge_page(struct vm_area_struct *vma,
@@ -481,6 +522,12 @@ static unsigned long set_max_huge_pages(unsigned long count)
* Increase the pool size
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
+ *
+ * We might race with alloc_buddy_huge_page() here and be unable
+ * to convert a surplus huge page to a normal huge page. That is
+ * not critical, though, it just means the overall size of the
+ * pool might be one hugepage larger than it needs to be, but
+ * within all the constraints specified by the sysctls.
*/
spin_lock(&hugetlb_lock);
while (surplus_huge_pages && count > persistent_huge_pages) {
@@ -509,6 +556,14 @@ static unsigned long set_max_huge_pages(unsigned long count)
* to keep enough around to satisfy reservations). Then place
* pages into surplus state as needed so the pool will shrink
* to the desired size as pages become free.
+ *
+ * By placing pages into the surplus state independent of the
+ * overcommit value, we are allowing the surplus pool size to
+ * exceed overcommit. There are few sane options here. Since
+ * alloc_buddy_huge_page() is checking the global counter,
+ * though, we'll note that we're not allowed to exceed surplus
+ * and won't grow the pool anywhere else. Not until one of the
+ * sysctls are changed, or the surplus pages go out of use.
*/
min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
min_count = max(count, min_count);
@@ -644,6 +699,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
dst_pte = huge_pte_alloc(dst, addr);
if (!dst_pte)
goto nomem;
+
+ /* If the pagetables are shared don't copy or take references */
+ if (dst_pte == src_pte)
+ continue;
+
spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
if (!pte_none(*src_pte)) {
@@ -753,6 +813,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(&mm->page_table_lock);
copy_huge_page(new_page, old_page, address, vma);
+ __SetPageUptodate(new_page);
spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -798,6 +859,7 @@ retry:
goto out;
}
clear_huge_page(page, address);
+ __SetPageUptodate(page);
if (vma->vm_flags & VM_SHARED) {
int err;
@@ -907,7 +969,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
- if (!pte || pte_none(*pte)) {
+ if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
int ret;
spin_unlock(&mm->page_table_lock);
@@ -1156,8 +1218,10 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
if (hugetlb_get_quota(inode->i_mapping, chg))
return -ENOSPC;
ret = hugetlb_acct_memory(chg);
- if (ret < 0)
+ if (ret < 0) {
+ hugetlb_put_quota(inode->i_mapping, chg);
return ret;
+ }
region_add(&inode->i_mapping->private_list, from, to);
return 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index 953f941ea86..5a9a6200e03 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v)
*/
static inline void set_page_refcounted(struct page *page)
{
- VM_BUG_ON(PageCompound(page) && PageTail(page));
+ VM_BUG_ON(PageTail(page));
VM_BUG_ON(atomic_read(&page->_count));
set_page_count(page, 1);
}
@@ -34,7 +34,7 @@ static inline void __put_page(struct page *page)
atomic_dec(&page->_count);
}
-extern void fastcall __init __free_pages_bootmem(struct page *page,
+extern void __init __free_pages_bootmem(struct page *page,
unsigned int order);
/*
diff --git a/mm/memory.c b/mm/memory.c
index 4bf0b6d0eb2..7bb70728bb5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,7 +305,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
spin_lock(&mm->page_table_lock);
if (pmd_present(*pmd)) { /* Another has populated it */
pte_lock_deinit(new);
- pte_free(new);
+ pte_free(mm, new);
} else {
mm->nr_ptes++;
inc_zone_page_state(new, NR_PAGETABLE);
@@ -323,7 +323,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
spin_lock(&init_mm.page_table_lock);
if (pmd_present(*pmd)) /* Another has populated it */
- pte_free_kernel(new);
+ pte_free_kernel(&init_mm, new);
else
pmd_populate_kernel(&init_mm, pmd, new);
spin_unlock(&init_mm.page_table_lock);
@@ -392,6 +392,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
return NULL;
}
+#ifdef CONFIG_DEBUG_VM
/*
* Add some anal sanity checks for now. Eventually,
* we should just do "return pfn_to_page(pfn)", but
@@ -402,6 +403,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
print_bad_pte(vma, pte, addr);
return NULL;
}
+#endif
/*
* NOTE! We still have PageReserved() pages in the page
@@ -511,8 +513,7 @@ again:
if (progress >= 32) {
progress = 0;
if (need_resched() ||
- need_lockbreak(src_ptl) ||
- need_lockbreak(dst_ptl))
+ spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
break;
}
if (pte_none(*src_pte)) {
@@ -851,7 +852,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
tlb_finish_mmu(*tlbp, tlb_start, start);
if (need_resched() ||
- (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
+ (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
if (i_mmap_lock) {
*tlbp = NULL;
goto out;
@@ -1108,7 +1109,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL(get_user_pages);
-pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
+pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
+ spinlock_t **ptl)
{
pgd_t * pgd = pgd_offset(mm, addr);
pud_t * pud = pud_alloc(mm, pgd, addr);
@@ -1516,10 +1518,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
memset(kaddr, 0, PAGE_SIZE);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(dst);
- return;
-
- }
- copy_user_highpage(dst, src, va, vma);
+ } else
+ copy_user_highpage(dst, src, va, vma);
}
/*
@@ -1628,6 +1628,7 @@ gotten:
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);
+ __SetPageUptodate(new_page);
/*
* Re-check the pte - we dropped the lock
@@ -1668,6 +1669,9 @@ gotten:
unlock:
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+
/*
* Yes, Virginia, this is actually required to prevent a race
* with clear_page_dirty_for_io() from clearing the page dirty
@@ -1763,8 +1767,7 @@ again:
restart_addr = zap_page_range(vma, start_addr,
end_addr - start_addr, details);
- need_break = need_resched() ||
- need_lockbreak(details->i_mmap_lock);
+ need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
if (restart_addr >= end_addr) {
/* We have now completed this vma: mark it so */
@@ -1906,50 +1909,49 @@ EXPORT_SYMBOL(unmap_mapping_range);
*/
int vmtruncate(struct inode * inode, loff_t offset)
{
- struct address_space *mapping = inode->i_mapping;
- unsigned long limit;
+ if (inode->i_size < offset) {
+ unsigned long limit;
- if (inode->i_size < offset)
- goto do_expand;
- /*
- * truncation of in-use swapfiles is disallowed - it would cause
- * subsequent swapout to scribble on the now-freed blocks.
- */
- if (IS_SWAPFILE(inode))
- goto out_busy;
- i_size_write(inode, offset);
+ limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ if (limit != RLIM_INFINITY && offset > limit)
+ goto out_sig;
+ if (offset > inode->i_sb->s_maxbytes)
+ goto out_big;
+ i_size_write(inode, offset);
+ } else {
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * truncation of in-use swapfiles is disallowed - it would
+ * cause subsequent swapout to scribble on the now-freed
+ * blocks.
+ */
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+ i_size_write(inode, offset);
+
+ /*
+ * unmap_mapping_range is called twice, first simply for
+ * efficiency so that truncate_inode_pages does fewer
+ * single-page unmaps. However after this first call, and
+ * before truncate_inode_pages finishes, it is possible for
+ * private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second
+ * unmap_mapping_range call must be made for correctness.
+ */
+ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(mapping, offset);
+ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+ }
- /*
- * unmap_mapping_range is called twice, first simply for efficiency
- * so that truncate_inode_pages does fewer single-page unmaps. However
- * after this first call, and before truncate_inode_pages finishes,
- * it is possible for private pages to be COWed, which remain after
- * truncate_inode_pages finishes, hence the second unmap_mapping_range
- * call must be made for correctness.
- */
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, offset);
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- goto out_truncate;
-
-do_expand:
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && offset > limit)
- goto out_sig;
- if (offset > inode->i_sb->s_maxbytes)
- goto out_big;
- i_size_write(inode, offset);
-
-out_truncate:
if (inode->i_op && inode->i_op->truncate)
inode->i_op->truncate(inode);
return 0;
+
out_sig:
send_sig(SIGXFSZ, current, 0);
out_big:
return -EFBIG;
-out_busy:
- return -ETXTBSY;
}
EXPORT_SYMBOL(vmtruncate);
@@ -1977,67 +1979,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
return 0;
}
-/**
- * swapin_readahead - swap in pages in hope we need them soon
- * @entry: swap entry of this memory
- * @addr: address to start
- * @vma: user vma this addresses belong to
- *
- * Primitive swap readahead code. We simply read an aligned block of
- * (1 << page_cluster) entries in the swap area. This method is chosen
- * because it doesn't cost us any seek time. We also make sure to queue
- * the 'original' request together with the readahead ones...
- *
- * This has been extended to use the NUMA policies from the mm triggering
- * the readahead.
- *
- * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
- */
-void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
-{
-#ifdef CONFIG_NUMA
- struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
-#endif
- int i, num;
- struct page *new_page;
- unsigned long offset;
-
- /*
- * Get the number of handles we should do readahead io to.
- */
- num = valid_swaphandles(entry, &offset);
- for (i = 0; i < num; offset++, i++) {
- /* Ok, do the async read-ahead now */
- new_page = read_swap_cache_async(swp_entry(swp_type(entry),
- offset), vma, addr);
- if (!new_page)
- break;
- page_cache_release(new_page);
-#ifdef CONFIG_NUMA
- /*
- * Find the next applicable VMA for the NUMA policy.
- */
- addr += PAGE_SIZE;
- if (addr == 0)
- vma = NULL;
- if (vma) {
- if (addr >= vma->vm_end) {
- vma = next_vma;
- next_vma = vma ? vma->vm_next : NULL;
- }
- if (vma && addr < vma->vm_start)
- vma = NULL;
- } else {
- if (next_vma && addr >= next_vma->vm_start) {
- vma = next_vma;
- next_vma = vma->vm_next;
- }
- }
-#endif
- }
- lru_add_drain(); /* Push any new pages onto the LRU now */
-}
-
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -2065,8 +2006,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = lookup_swap_cache(entry);
if (!page) {
grab_swap_token(); /* Contend for token _before_ read-in */
- swapin_readahead(entry, address, vma);
- page = read_swap_cache_async(entry, vma, address);
+ page = swapin_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vma, address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
@@ -2160,6 +2101,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
+ __SetPageUptodate(page);
entry = mk_pte(page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2260,6 +2202,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
}
copy_user_highpage(page, vmf.page, address, vma);
+ __SetPageUptodate(page);
} else {
/*
* If the page will be shareable, see if the backing
@@ -2341,6 +2284,9 @@ out_unlocked:
if (anon)
page_cache_release(vmf.page);
else if (dirty_page) {
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+
set_page_dirty_balance(dirty_page, page_mkwrite);
put_page(dirty_page);
}
@@ -2557,7 +2503,7 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
spin_lock(&mm->page_table_lock);
if (pgd_present(*pgd)) /* Another has populated it */
- pud_free(new);
+ pud_free(mm, new);
else
pgd_populate(mm, pgd, new);
spin_unlock(&mm->page_table_lock);
@@ -2579,12 +2525,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
if (pud_present(*pud)) /* Another has populated it */
- pmd_free(new);
+ pmd_free(mm, new);
else
pud_populate(mm, pud, new);
#else
if (pgd_present(*pud)) /* Another has populated it */
- pmd_free(new);
+ pmd_free(mm, new);
else
pgd_populate(mm, pud, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
@@ -2612,46 +2558,6 @@ int make_pages_present(unsigned long addr, unsigned long end)
return ret == len ? 0 : -1;
}
-/*
- * Map a vmalloc()-space virtual address to the physical page.
- */
-struct page * vmalloc_to_page(void * vmalloc_addr)
-{
- unsigned long addr = (unsigned long) vmalloc_addr;
- struct page *page = NULL;
- pgd_t *pgd = pgd_offset_k(addr);
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep, pte;
-
- if (!pgd_none(*pgd)) {
- pud = pud_offset(pgd, addr);
- if (!pud_none(*pud)) {
- pmd = pmd_offset(pud, addr);
- if (!pmd_none(*pmd)) {
- ptep = pte_offset_map(pmd, addr);
- pte = *ptep;
- if (pte_present(pte))
- page = pte_page(pte);
- pte_unmap(ptep);
- }
- }
- }
- return page;
-}
-
-EXPORT_SYMBOL(vmalloc_to_page);
-
-/*
- * Map a vmalloc()-space virtual address to the physical page frame number.
- */
-unsigned long vmalloc_to_pfn(void * vmalloc_addr)
-{
- return page_to_pfn(vmalloc_to_page(vmalloc_addr));
-}
-
-EXPORT_SYMBOL(vmalloc_to_pfn);
-
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
@@ -2748,3 +2654,34 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
return buf - old_buf;
}
+
+/*
+ * Print the name of a VMA.
+ */
+void print_vma_addr(char *prefix, unsigned long ip)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, ip);
+ if (vma && vma->vm_file) {
+ struct file *f = vma->vm_file;
+ char *buf = (char *)__get_free_page(GFP_KERNEL);
+ if (buf) {
+ char *p, *s;
+
+ p = d_path(f->f_dentry, f->f_vfsmnt, buf, PAGE_SIZE);
+ if (IS_ERR(p))
+ p = "?";
+ s = strrchr(p, '/');
+ if (s)
+ p = s+1;
+ printk("%s%s[%lx+%lx]", prefix, p,
+ vma->vm_start,
+ vma->vm_end - vma->vm_start);
+ free_page((unsigned long)buf);
+ }
+ }
+ up_read(&current->mm->mmap_sem);
+}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9512a544d04..7469c503580 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -481,8 +481,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
-extern void drain_all_local_pages(void);
-
int offline_pages(unsigned long start_pfn,
unsigned long end_pfn, unsigned long timeout)
{
@@ -540,7 +538,7 @@ repeat:
lru_add_drain_all();
flush_scheduled_work();
cond_resched();
- drain_all_local_pages();
+ drain_all_pages();
}
pfn = scan_lru_pages(start_pfn, end_pfn);
@@ -563,7 +561,7 @@ repeat:
flush_scheduled_work();
yield();
/* drain pcp pages , this is synchrouns. */
- drain_all_local_pages();
+ drain_all_pages();
/* check again */
offlined_pages = check_pages_isolated(start_pfn, end_pfn);
if (offlined_pages < 0) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a207e8d17e..857a987e369 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -115,11 +115,6 @@ int putback_lru_pages(struct list_head *l)
return count;
}
-static inline int is_swap_pte(pte_t pte)
-{
- return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
-}
-
/*
* Restore a potential migration pte to a working pte entry
*/
@@ -645,15 +640,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
rcu_read_lock();
rcu_locked = 1;
}
+
/*
- * This is a corner case handling.
- * When a new swap-cache is read into, it is linked to LRU
- * and treated as swapcache but has no rmap yet.
- * Calling try_to_unmap() against a page->mapping==NULL page is
- * BUG. So handle it here.
+ * Corner case handling:
+ * 1. When a new swap-cache page is read into, it is added to the LRU
+ * and treated as swapcache but it has no rmap yet.
+ * Calling try_to_unmap() against a page->mapping==NULL page will
+ * trigger a BUG. So handle it here.
+ * 2. An orphaned page (see truncate_complete_page) might have
+ * fs-private metadata. The page can be picked up due to memory
+ * offlining. Everywhere else except page reclaim, the page is
+ * invisible to the vm, so the page can not be migrated. So try to
+ * free the metadata, so the page can be freed.
*/
- if (!page->mapping)
+ if (!page->mapping) {
+ if (!PageAnon(page) && PagePrivate(page)) {
+ /*
+ * Go direct to try_to_free_buffers() here because
+ * a) that's what try_to_release_page() would do anyway
+ * b) we may be under rcu_read_lock() here, so we can't
+ * use GFP_KERNEL which is what try_to_release_page()
+ * needs to be effective.
+ */
+ try_to_free_buffers(page);
+ }
goto rcu_unlock;
+ }
+
/* Establish migration ptes or remove ptes */
try_to_unmap(page, 1);
diff --git a/mm/mmap.c b/mm/mmap.c
index facc1a75bd4..bb4c963cc53 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -36,6 +36,10 @@
#define arch_mmap_check(addr, len, flags) (0)
#endif
+#ifndef arch_rebalance_pgtables
+#define arch_rebalance_pgtables(addr, len) (addr)
+#endif
+
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
@@ -251,7 +255,8 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
* not page aligned -Ram Gupta
*/
rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
- if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
+ if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
+ (mm->end_data - mm->start_data) > rlim)
goto out;
newbrk = PAGE_ALIGN(brk);
@@ -912,6 +917,9 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
if (!len)
return -EINVAL;
+ if (!(flags & MAP_FIXED))
+ addr = round_hint_to_min(addr);
+
error = arch_mmap_check(addr, len, flags);
if (error)
return error;
@@ -1420,7 +1428,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (addr & ~PAGE_MASK)
return -EINVAL;
- return addr;
+ return arch_rebalance_pgtables(addr, len);
}
EXPORT_SYMBOL(get_unmapped_area);
@@ -1615,6 +1623,12 @@ static inline int expand_downwards(struct vm_area_struct *vma,
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
+
+ address &= PAGE_MASK;
+ error = security_file_mmap(NULL, 0, 0, 0, address, 1);
+ if (error)
+ return error;
+
anon_vma_lock(vma);
/*
@@ -1622,8 +1636,6 @@ static inline int expand_downwards(struct vm_area_struct *vma,
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
- address &= PAGE_MASK;
- error = 0;
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
@@ -1934,6 +1946,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if (is_hugepage_only_range(mm, addr, len))
return -EINVAL;
+ error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
+ if (error)
+ return error;
+
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = arch_mmap_check(addr, len, flags);
@@ -2204,7 +2220,7 @@ int install_special_mapping(struct mm_struct *mm,
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_flags = vm_flags | mm->def_flags;
+ vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
vma->vm_ops = &special_mapping_vmops;
diff --git a/mm/nommu.c b/mm/nommu.c
index 35622c59092..5d8ae086f74 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,6 +10,7 @@
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
* Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
* Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
+ * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org>
*/
#include <linux/module.h>
@@ -167,7 +168,7 @@ EXPORT_SYMBOL(get_user_pages);
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
-void vfree(void *addr)
+void vfree(const void *addr)
{
kfree(addr);
}
@@ -183,13 +184,33 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
}
EXPORT_SYMBOL(__vmalloc);
-struct page * vmalloc_to_page(void *addr)
+void *vmalloc_user(unsigned long size)
+{
+ void *ret;
+
+ ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
+ if (ret) {
+ struct vm_area_struct *vma;
+
+ down_write(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, (unsigned long)ret);
+ if (vma)
+ vma->vm_flags |= VM_USERMAP;
+ up_write(&current->mm->mmap_sem);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(vmalloc_user);
+
+struct page *vmalloc_to_page(const void *addr)
{
return virt_to_page(addr);
}
EXPORT_SYMBOL(vmalloc_to_page);
-unsigned long vmalloc_to_pfn(void *addr)
+unsigned long vmalloc_to_pfn(const void *addr)
{
return page_to_pfn(virt_to_page(addr));
}
@@ -253,10 +274,17 @@ EXPORT_SYMBOL(vmalloc_32);
*
* The resulting memory area is 32bit addressable and zeroed so it can be
* mapped to userspace without leaking data.
+ *
+ * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
+ * remap_vmalloc_range() are permissible.
*/
void *vmalloc_32_user(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+ /*
+ * We'll have to sort out the ZONE_DMA bits for 64-bit,
+ * but for now this can simply use vmalloc_user() directly.
+ */
+ return vmalloc_user(size);
}
EXPORT_SYMBOL(vmalloc_32_user);
@@ -267,7 +295,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_
}
EXPORT_SYMBOL(vmap);
-void vunmap(void *addr)
+void vunmap(const void *addr)
{
BUG();
}
@@ -829,6 +857,9 @@ unsigned long do_mmap_pgoff(struct file *file,
void *result;
int ret;
+ if (!(flags & MAP_FIXED))
+ addr = round_hint_to_min(addr);
+
/* decide whether we should attempt the mapping, and if so what sort of
* mapping */
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1213,6 +1244,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
}
EXPORT_SYMBOL(remap_pfn_range);
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+ unsigned long pgoff)
+{
+ unsigned int size = vma->vm_end - vma->vm_start;
+
+ if (!(vma->vm_flags & VM_USERMAP))
+ return -EINVAL;
+
+ vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
+ vma->vm_end = vma->vm_start + size;
+
+ return 0;
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+
void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 91a081a82f5..c1850bf991c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -125,8 +125,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* Superuser processes are usually more important, so we make it
* less likely that we kill those.
*/
- if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
- p->uid == 0 || p->euid == 0)
+ if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE))
points /= 4;
/*
@@ -135,7 +134,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* tend to only have this flag set on applications they think
* of as important.
*/
- if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
+ if (__capable(p, CAP_SYS_RAWIO))
points /= 4;
/*
@@ -286,7 +285,7 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
* all the memory it needs. That way it should be able to
* exit() and clear out its resources quickly...
*/
- p->time_slice = HZ;
+ p->rt.time_slice = HZ;
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d55cfcae2ef..5e00f1772c2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
int dirty_background_ratio = 5;
/*
+ * free highmem will not be subtracted from the total free memory
+ * for calculating free ratios if vm_highmem_is_dirtyable is true
+ */
+int vm_highmem_is_dirtyable;
+
+/*
* The generator of dirty data starts writeback at this percentage
*/
int vm_dirty_ratio = 10;
@@ -219,7 +225,7 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
*
* dirty -= (dirty/8) * p_{t}
*/
-void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
{
long numerator, denominator;
long dirty = *pdirty;
@@ -287,7 +293,10 @@ static unsigned long determine_dirtyable_memory(void)
x = global_page_state(NR_FREE_PAGES)
+ global_page_state(NR_INACTIVE)
+ global_page_state(NR_ACTIVE);
- x -= highmem_dirtyable_memory(x);
+
+ if (!vm_highmem_is_dirtyable)
+ x -= highmem_dirtyable_memory(x);
+
return x + 1; /* Ensure that we never return 0 */
}
@@ -1067,7 +1076,7 @@ static int __set_page_dirty(struct page *page)
return 0;
}
-int fastcall set_page_dirty(struct page *page)
+int set_page_dirty(struct page *page)
{
int ret = __set_page_dirty(page);
if (ret)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b5a58d476c1..37576b822f0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -537,7 +537,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
/*
* permit the bootmem allocator to evade page validation on high-order frees
*/
-void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
+void __init __free_pages_bootmem(struct page *page, unsigned int order)
{
if (order == 0) {
__ClearPageReserved(page);
@@ -847,8 +847,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
struct page *page = __rmqueue(zone, order, migratetype);
if (unlikely(page == NULL))
break;
+
+ /*
+ * Split buddy pages returned by expand() are received here
+ * in physical page order. The page is added to the callers and
+ * list and the list head then moves forward. From the callers
+ * perspective, the linked list is ordered by page number in
+ * some conditions. This is useful for IO devices that can
+ * merge IO requests if the physical pages are ordered
+ * properly.
+ */
list_add(&page->lru, list);
set_page_private(page, migratetype);
+ list = &page->lru;
}
spin_unlock(&zone->lock);
return i;
@@ -879,31 +890,51 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
}
#endif
-static void __drain_pages(unsigned int cpu)
+/*
+ * Drain pages of the indicated processor.
+ *
+ * The processor must either be the current processor and the
+ * thread pinned to the current processor or a processor that
+ * is not online.
+ */
+static void drain_pages(unsigned int cpu)
{
unsigned long flags;
struct zone *zone;
- int i;
for_each_zone(zone) {
struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
if (!populated_zone(zone))
continue;
pset = zone_pcp(zone, cpu);
- for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
- struct per_cpu_pages *pcp;
-
- pcp = &pset->pcp[i];
- local_irq_save(flags);
- free_pages_bulk(zone, pcp->count, &pcp->list, 0);
- pcp->count = 0;
- local_irq_restore(flags);
- }
+
+ pcp = &pset->pcp;
+ local_irq_save(flags);
+ free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ pcp->count = 0;
+ local_irq_restore(flags);
}
}
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ */
+void drain_local_pages(void *arg)
+{
+ drain_pages(smp_processor_id());
+}
+
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ */
+void drain_all_pages(void)
+{
+ on_each_cpu(drain_local_pages, NULL, 0, 1);
+}
+
#ifdef CONFIG_HIBERNATION
void mark_free_pages(struct zone *zone)
@@ -941,40 +972,9 @@ void mark_free_pages(struct zone *zone)
#endif /* CONFIG_PM */
/*
- * Spill all of this CPU's per-cpu pages back into the buddy allocator.
- */
-void drain_local_pages(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __drain_pages(smp_processor_id());
- local_irq_restore(flags);
-}
-
-void smp_drain_local_pages(void *arg)
-{
- drain_local_pages();
-}
-
-/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
- */
-void drain_all_local_pages(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __drain_pages(smp_processor_id());
- local_irq_restore(flags);
-
- smp_call_function(smp_drain_local_pages, NULL, 0, 1);
-}
-
-/*
* Free a 0-order page
*/
-static void fastcall free_hot_cold_page(struct page *page, int cold)
+static void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
@@ -990,10 +990,13 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
- pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp;
local_irq_save(flags);
__count_vm_event(PGFREE);
- list_add(&page->lru, &pcp->list);
+ if (cold)
+ list_add_tail(&page->lru, &pcp->list);
+ else
+ list_add(&page->lru, &pcp->list);
set_page_private(page, get_pageblock_migratetype(page));
pcp->count++;
if (pcp->count >= pcp->high) {
@@ -1004,12 +1007,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
put_cpu();
}
-void fastcall free_hot_page(struct page *page)
+void free_hot_page(struct page *page)
{
free_hot_cold_page(page, 0);
}
-void fastcall free_cold_page(struct page *page)
+void free_cold_page(struct page *page)
{
free_hot_cold_page(page, 1);
}
@@ -1051,7 +1054,7 @@ again:
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
- pcp = &zone_pcp(zone, cpu)->pcp[cold];
+ pcp = &zone_pcp(zone, cpu)->pcp;
local_irq_save(flags);
if (!pcp->count) {
pcp->count = rmqueue_bulk(zone, 0,
@@ -1061,9 +1064,15 @@ again:
}
/* Find a page of the appropriate migrate type */
- list_for_each_entry(page, &pcp->list, lru)
- if (page_private(page) == migratetype)
- break;
+ if (cold) {
+ list_for_each_entry_reverse(page, &pcp->list, lru)
+ if (page_private(page) == migratetype)
+ break;
+ } else {
+ list_for_each_entry(page, &pcp->list, lru)
+ if (page_private(page) == migratetype)
+ break;
+ }
/* Allocate more to the pcp list if necessary */
if (unlikely(&page->lru == &pcp->list)) {
@@ -1558,7 +1567,7 @@ nofail_alloc:
cond_resched();
if (order != 0)
- drain_all_local_pages();
+ drain_all_pages();
if (likely(did_some_progress)) {
page = get_page_from_freelist(gfp_mask, order,
@@ -1632,7 +1641,7 @@ EXPORT_SYMBOL(__alloc_pages);
/*
* Common helper functions.
*/
-fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
+unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
struct page * page;
page = alloc_pages(gfp_mask, order);
@@ -1643,7 +1652,7 @@ fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
EXPORT_SYMBOL(__get_free_pages);
-fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
+unsigned long get_zeroed_page(gfp_t gfp_mask)
{
struct page * page;
@@ -1669,7 +1678,7 @@ void __pagevec_free(struct pagevec *pvec)
free_hot_cold_page(pvec->pages[i], pvec->cold);
}
-fastcall void __free_pages(struct page *page, unsigned int order)
+void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
@@ -1681,7 +1690,7 @@ fastcall void __free_pages(struct page *page, unsigned int order)
EXPORT_SYMBOL(__free_pages);
-fastcall void free_pages(unsigned long addr, unsigned int order)
+void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
VM_BUG_ON(!virt_addr_valid((void *)addr));
@@ -1790,12 +1799,9 @@ void show_free_areas(void)
pageset = zone_pcp(zone, cpu);
- printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
- "Cold: hi:%5d, btch:%4d usd:%4d\n",
- cpu, pageset->pcp[0].high,
- pageset->pcp[0].batch, pageset->pcp[0].count,
- pageset->pcp[1].high, pageset->pcp[1].batch,
- pageset->pcp[1].count);
+ printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
+ cpu, pageset->pcp.high,
+ pageset->pcp.batch, pageset->pcp.count);
}
}
@@ -1868,6 +1874,8 @@ void show_free_areas(void)
printk("= %lukB\n", K(total));
}
+ printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
+
show_swap_cache_info();
}
@@ -2540,8 +2548,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
}
}
-static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
- struct zone *zone, unsigned long size)
+static void __meminit zone_init_free_lists(struct zone *zone)
{
int order, t;
for_each_migratetype_order(order, t) {
@@ -2555,7 +2562,7 @@ static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int __devinit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
{
int batch;
@@ -2593,17 +2600,11 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
memset(p, 0, sizeof(*p));
- pcp = &p->pcp[0]; /* hot */
+ pcp = &p->pcp;
pcp->count = 0;
pcp->high = 6 * batch;
pcp->batch = max(1UL, 1 * batch);
INIT_LIST_HEAD(&pcp->list);
-
- pcp = &p->pcp[1]; /* cold*/
- pcp->count = 0;
- pcp->high = 2 * batch;
- pcp->batch = max(1UL, batch/2);
- INIT_LIST_HEAD(&pcp->list);
}
/*
@@ -2616,7 +2617,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
{
struct per_cpu_pages *pcp;
- pcp = &p->pcp[0]; /* hot list */
+ pcp = &p->pcp;
pcp->high = high;
pcp->batch = max(1UL, high/4);
if ((high/4) > (PAGE_SHIFT * 8))
@@ -2820,7 +2821,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
- zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+ zone_init_free_lists(zone);
return 0;
}
@@ -3427,7 +3428,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
mem_map = NODE_DATA(0)->node_mem_map;
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= pgdat->node_start_pfn;
+ mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
}
#endif
@@ -3967,10 +3968,23 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
int cpu = (unsigned long)hcpu;
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- local_irq_disable();
- __drain_pages(cpu);
+ drain_pages(cpu);
+
+ /*
+ * Spill the event counters of the dead processor
+ * into the current processors event counters.
+ * This artificially elevates the count of the current
+ * processor.
+ */
vm_events_fold_cpu(cpu);
- local_irq_enable();
+
+ /*
+ * Zero the differential counters of the dead processor
+ * so that the vm statistics are consistent.
+ *
+ * This is only okay since the processor is dead and cannot
+ * race with what we are doing.
+ */
refresh_cpu_vm_stats(cpu);
}
return NOTIFY_OK;
@@ -4469,7 +4483,7 @@ int set_migratetype_isolate(struct page *page)
out:
spin_unlock_irqrestore(&zone->lock, flags);
if (!ret)
- drain_all_local_pages();
+ drain_all_pages();
return ret;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 3b97f685027..065c4480eaf 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -126,7 +126,7 @@ int swap_readpage(struct file *file, struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- ClearPageUptodate(page);
+ BUG_ON(PageUptodate(page));
bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
end_swap_bio_read);
if (bio == NULL) {
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
new file mode 100644
index 00000000000..b4f27d22da9
--- /dev/null
+++ b/mm/pagewalk.c
@@ -0,0 +1,131 @@
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ const struct mm_walk *walk, void *private)
+{
+ pte_t *pte;
+ int err = 0;
+
+ pte = pte_offset_map(pmd, addr);
+ do {
+ err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
+ if (err)
+ break;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ pte_unmap(pte);
+ return err;
+}
+
+static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ const struct mm_walk *walk, void *private)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int err = 0;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, private);
+ if (err)
+ break;
+ continue;
+ }
+ if (walk->pmd_entry)
+ err = walk->pmd_entry(pmd, addr, next, private);
+ if (!err && walk->pte_entry)
+ err = walk_pte_range(pmd, addr, next, walk, private);
+ if (err)
+ break;
+ } while (pmd++, addr = next, addr != end);
+
+ return err;
+}
+
+static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ const struct mm_walk *walk, void *private)
+{
+ pud_t *pud;
+ unsigned long next;
+ int err = 0;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, private);
+ if (err)
+ break;
+ continue;
+ }
+ if (walk->pud_entry)
+ err = walk->pud_entry(pud, addr, next, private);
+ if (!err && (walk->pmd_entry || walk->pte_entry))
+ err = walk_pmd_range(pud, addr, next, walk, private);
+ if (err)
+ break;
+ } while (pud++, addr = next, addr != end);
+
+ return err;
+}
+
+/**
+ * walk_page_range - walk a memory map's page tables with a callback
+ * @mm - memory map to walk
+ * @addr - starting address
+ * @end - ending address
+ * @walk - set of callbacks to invoke for each level of the tree
+ * @private - private data passed to the callback function
+ *
+ * Recursively walk the page table for the memory area in a VMA,
+ * calling supplied callbacks. Callbacks are called in-order (first
+ * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
+ * etc.). If lower-level callbacks are omitted, walking depth is reduced.
+ *
+ * Each callback receives an entry pointer, the start and end of the
+ * associated range, and a caller-supplied private data pointer.
+ *
+ * No locks are taken, but the bottom level iterator will map PTE
+ * directories from highmem if necessary.
+ *
+ * If any callback returns a non-zero value, the walk is aborted and
+ * the return value is propagated back to the caller. Otherwise 0 is returned.
+ */
+int walk_page_range(const struct mm_struct *mm,
+ unsigned long addr, unsigned long end,
+ const struct mm_walk *walk, void *private)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ int err = 0;
+
+ if (addr >= end)
+ return err;
+
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, private);
+ if (err)
+ break;
+ continue;
+ }
+ if (walk->pgd_entry)
+ err = walk->pgd_entry(pgd, addr, next, private);
+ if (!err &&
+ (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
+ err = walk_pud_range(pgd, addr, next, walk, private);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+
+ return err;
+}
diff --git a/mm/quicklist.c b/mm/quicklist.c
index ae8189c2799..3f703f7cb39 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -26,9 +26,17 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
static unsigned long max_pages(unsigned long min_pages)
{
unsigned long node_free_pages, max;
+ struct zone *zones = NODE_DATA(numa_node_id())->node_zones;
+
+ node_free_pages =
+#ifdef CONFIG_ZONE_DMA
+ zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) +
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) +
+#endif
+ zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
- node_free_pages = node_page_state(numa_node_id(),
- NR_FREE_PAGES);
max = node_free_pages / FRACTION_OF_NODE_MEM;
return max(max, min_pages);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index dbc2ca2057a..57ad276900c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,7 +36,6 @@
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
* within inode_lock in __sync_single_inode)
- * zone->lock (within radix tree node alloc)
*/
#include <linux/mm.h>
@@ -284,7 +283,10 @@ static int page_referenced_one(struct page *page,
if (!pte)
goto out;
- if (ptep_clear_flush_young(vma, address, pte))
+ if (vma->vm_flags & VM_LOCKED) {
+ referenced++;
+ *mapcount = 1; /* break early from loop */
+ } else if (ptep_clear_flush_young(vma, address, pte))
referenced++;
/* Pretend the page is referenced if the task has the
diff --git a/mm/shmem.c b/mm/shmem.c
index 51b3d6ccdda..0f246c44a57 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -78,11 +78,10 @@
/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
enum sgp_type {
- SGP_QUICK, /* don't try more than file page cache lookup */
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
+ SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
SGP_WRITE, /* may exceed i_size, may allocate page */
- SGP_FAULT, /* same as SGP_CACHE, return with page locked */
};
static int shmem_getpage(struct inode *inode, unsigned long idx,
@@ -194,7 +193,7 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
};
static LIST_HEAD(shmem_swaplist);
-static DEFINE_SPINLOCK(shmem_swaplist_lock);
+static DEFINE_MUTEX(shmem_swaplist_mutex);
static void shmem_free_blocks(struct inode *inode, long pages)
{
@@ -207,6 +206,31 @@ static void shmem_free_blocks(struct inode *inode, long pages)
}
}
+static int shmem_reserve_inode(struct super_block *sb)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ if (sbinfo->max_inodes) {
+ spin_lock(&sbinfo->stat_lock);
+ if (!sbinfo->free_inodes) {
+ spin_unlock(&sbinfo->stat_lock);
+ return -ENOSPC;
+ }
+ sbinfo->free_inodes--;
+ spin_unlock(&sbinfo->stat_lock);
+ }
+ return 0;
+}
+
+static void shmem_free_inode(struct super_block *sb)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ if (sbinfo->max_inodes) {
+ spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_inodes++;
+ spin_unlock(&sbinfo->stat_lock);
+ }
+}
+
/*
* shmem_recalc_inode - recalculate the size of an inode
*
@@ -731,6 +755,8 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
(void) shmem_getpage(inode,
attr->ia_size>>PAGE_CACHE_SHIFT,
&page, SGP_READ, NULL);
+ if (page)
+ unlock_page(page);
}
/*
* Reset SHMEM_PAGEIN flag so that shmem_truncate can
@@ -762,7 +788,6 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
static void shmem_delete_inode(struct inode *inode)
{
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct shmem_inode_info *info = SHMEM_I(inode);
if (inode->i_op->truncate == shmem_truncate) {
@@ -771,17 +796,13 @@ static void shmem_delete_inode(struct inode *inode)
inode->i_size = 0;
shmem_truncate(inode);
if (!list_empty(&info->swaplist)) {
- spin_lock(&shmem_swaplist_lock);
+ mutex_lock(&shmem_swaplist_mutex);
list_del_init(&info->swaplist);
- spin_unlock(&shmem_swaplist_lock);
+ mutex_unlock(&shmem_swaplist_mutex);
}
}
BUG_ON(inode->i_blocks);
- if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
- }
+ shmem_free_inode(inode->i_sb);
clear_inode(inode);
}
@@ -807,19 +828,22 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
struct page *subdir;
swp_entry_t *ptr;
int offset;
+ int error;
idx = 0;
ptr = info->i_direct;
spin_lock(&info->lock);
+ if (!info->swapped) {
+ list_del_init(&info->swaplist);
+ goto lost2;
+ }
limit = info->next_index;
size = limit;
if (size > SHMEM_NR_DIRECT)
size = SHMEM_NR_DIRECT;
offset = shmem_find_swp(entry, ptr, ptr+size);
- if (offset >= 0) {
- shmem_swp_balance_unmap();
+ if (offset >= 0)
goto found;
- }
if (!info->i_indirect)
goto lost2;
@@ -829,6 +853,14 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
if (unlikely(idx == stage)) {
shmem_dir_unmap(dir-1);
+ if (cond_resched_lock(&info->lock)) {
+ /* check it has not been truncated */
+ if (limit > info->next_index) {
+ limit = info->next_index;
+ if (idx >= limit)
+ goto lost2;
+ }
+ }
dir = shmem_dir_map(info->i_indirect) +
ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
while (!*dir) {
@@ -849,11 +881,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
if (size > ENTRIES_PER_PAGE)
size = ENTRIES_PER_PAGE;
offset = shmem_find_swp(entry, ptr, ptr+size);
+ shmem_swp_unmap(ptr);
if (offset >= 0) {
shmem_dir_unmap(dir);
goto found;
}
- shmem_swp_unmap(ptr);
}
}
lost1:
@@ -863,19 +895,63 @@ lost2:
return 0;
found:
idx += offset;
- inode = &info->vfs_inode;
- if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
- info->flags |= SHMEM_PAGEIN;
- shmem_swp_set(info, ptr + offset, 0);
- }
- shmem_swp_unmap(ptr);
+ inode = igrab(&info->vfs_inode);
spin_unlock(&info->lock);
+
/*
- * Decrement swap count even when the entry is left behind:
- * try_to_unuse will skip over mms, then reincrement count.
+ * Move _head_ to start search for next from here.
+ * But be careful: shmem_delete_inode checks list_empty without taking
+ * mutex, and there's an instant in list_move_tail when info->swaplist
+ * would appear empty, if it were the only one on shmem_swaplist. We
+ * could avoid doing it if inode NULL; or use this minor optimization.
*/
- swap_free(entry);
- return 1;
+ if (shmem_swaplist.next != &info->swaplist)
+ list_move_tail(&shmem_swaplist, &info->swaplist);
+ mutex_unlock(&shmem_swaplist_mutex);
+
+ error = 1;
+ if (!inode)
+ goto out;
+ error = radix_tree_preload(GFP_KERNEL);
+ if (error)
+ goto out;
+ error = 1;
+
+ spin_lock(&info->lock);
+ ptr = shmem_swp_entry(info, idx, NULL);
+ if (ptr && ptr->val == entry.val)
+ error = add_to_page_cache(page, inode->i_mapping,
+ idx, GFP_NOWAIT);
+ if (error == -EEXIST) {
+ struct page *filepage = find_get_page(inode->i_mapping, idx);
+ error = 1;
+ if (filepage) {
+ /*
+ * There might be a more uptodate page coming down
+ * from a stacked writepage: forget our swappage if so.
+ */
+ if (PageUptodate(filepage))
+ error = 0;
+ page_cache_release(filepage);
+ }
+ }
+ if (!error) {
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ info->flags |= SHMEM_PAGEIN;
+ shmem_swp_set(info, ptr, 0);
+ swap_free(entry);
+ error = 1; /* not an error, but entry was found */
+ }
+ if (ptr)
+ shmem_swp_unmap(ptr);
+ spin_unlock(&info->lock);
+ radix_tree_preload_end();
+out:
+ unlock_page(page);
+ page_cache_release(page);
+ iput(inode); /* allows for NULL */
+ return error;
}
/*
@@ -887,20 +963,16 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
struct shmem_inode_info *info;
int found = 0;
- spin_lock(&shmem_swaplist_lock);
+ mutex_lock(&shmem_swaplist_mutex);
list_for_each_safe(p, next, &shmem_swaplist) {
info = list_entry(p, struct shmem_inode_info, swaplist);
- if (!info->swapped)
- list_del_init(&info->swaplist);
- else if (shmem_unuse_inode(info, entry, page)) {
- /* move head to start search for next from here */
- list_move_tail(&shmem_swaplist, &info->swaplist);
- found = 1;
- break;
- }
+ found = shmem_unuse_inode(info, entry, page);
+ cond_resched();
+ if (found)
+ goto out;
}
- spin_unlock(&shmem_swaplist_lock);
- return found;
+ mutex_unlock(&shmem_swaplist_mutex);
+out: return found; /* 0 or 1 or -ENOMEM */
}
/*
@@ -915,54 +987,65 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode;
BUG_ON(!PageLocked(page));
- /*
- * shmem_backing_dev_info's capabilities prevent regular writeback or
- * sync from ever calling shmem_writepage; but a stacking filesystem
- * may use the ->writepage of its underlying filesystem, in which case
- * we want to do nothing when that underlying filesystem is tmpfs
- * (writing out to swap is useful as a response to memory pressure, but
- * of no use to stabilize the data) - just redirty the page, unlock it
- * and claim success in this case. AOP_WRITEPAGE_ACTIVATE, and the
- * page_mapped check below, must be avoided unless we're in reclaim.
- */
- if (!wbc->for_reclaim) {
- set_page_dirty(page);
- unlock_page(page);
- return 0;
- }
- BUG_ON(page_mapped(page));
-
mapping = page->mapping;
index = page->index;
inode = mapping->host;
info = SHMEM_I(inode);
if (info->flags & VM_LOCKED)
goto redirty;
- swap = get_swap_page();
- if (!swap.val)
+ if (!total_swap_pages)
goto redirty;
+ /*
+ * shmem_backing_dev_info's capabilities prevent regular writeback or
+ * sync from ever calling shmem_writepage; but a stacking filesystem
+ * may use the ->writepage of its underlying filesystem, in which case
+ * tmpfs should write out to swap only in response to memory pressure,
+ * and not for pdflush or sync. However, in those cases, we do still
+ * want to check if there's a redundant swappage to be discarded.
+ */
+ if (wbc->for_reclaim)
+ swap = get_swap_page();
+ else
+ swap.val = 0;
+
spin_lock(&info->lock);
- shmem_recalc_inode(inode);
if (index >= info->next_index) {
BUG_ON(!(info->flags & SHMEM_TRUNCATE));
goto unlock;
}
entry = shmem_swp_entry(info, index, NULL);
- BUG_ON(!entry);
- BUG_ON(entry->val);
+ if (entry->val) {
+ /*
+ * The more uptodate page coming down from a stacked
+ * writepage should replace our old swappage.
+ */
+ free_swap_and_cache(*entry);
+ shmem_swp_set(info, entry, 0);
+ }
+ shmem_recalc_inode(inode);
- if (move_to_swap_cache(page, swap) == 0) {
+ if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+ remove_from_page_cache(page);
shmem_swp_set(info, entry, swap.val);
shmem_swp_unmap(entry);
+ if (list_empty(&info->swaplist))
+ inode = igrab(inode);
+ else
+ inode = NULL;
spin_unlock(&info->lock);
- if (list_empty(&info->swaplist)) {
- spin_lock(&shmem_swaplist_lock);
+ swap_duplicate(swap);
+ BUG_ON(page_mapped(page));
+ page_cache_release(page); /* pagecache ref */
+ set_page_dirty(page);
+ unlock_page(page);
+ if (inode) {
+ mutex_lock(&shmem_swaplist_mutex);
/* move instead of add in case we're racing */
list_move_tail(&info->swaplist, &shmem_swaplist);
- spin_unlock(&shmem_swaplist_lock);
+ mutex_unlock(&shmem_swaplist_mutex);
+ iput(inode);
}
- unlock_page(page);
return 0;
}
@@ -972,7 +1055,10 @@ unlock:
swap_free(swap);
redirty:
set_page_dirty(page);
- return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
+ if (wbc->for_reclaim)
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
+ unlock_page(page);
+ return 0;
}
#ifdef CONFIG_NUMA
@@ -1025,53 +1111,33 @@ out:
return err;
}
-static struct page *shmem_swapin_async(struct shared_policy *p,
- swp_entry_t entry, unsigned long idx)
+static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
+ struct shmem_inode_info *info, unsigned long idx)
{
- struct page *page;
struct vm_area_struct pvma;
+ struct page *page;
/* Create a pseudo vma that just contains the policy */
- memset(&pvma, 0, sizeof(struct vm_area_struct));
- pvma.vm_end = PAGE_SIZE;
+ pvma.vm_start = 0;
pvma.vm_pgoff = idx;
- pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
- page = read_swap_cache_async(entry, &pvma, 0);
+ pvma.vm_ops = NULL;
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+ page = swapin_readahead(entry, gfp, &pvma, 0);
mpol_free(pvma.vm_policy);
return page;
}
-static struct page *shmem_swapin(struct shmem_inode_info *info,
- swp_entry_t entry, unsigned long idx)
-{
- struct shared_policy *p = &info->policy;
- int i, num;
- struct page *page;
- unsigned long offset;
-
- num = valid_swaphandles(entry, &offset);
- for (i = 0; i < num; offset++, i++) {
- page = shmem_swapin_async(p,
- swp_entry(swp_type(entry), offset), idx);
- if (!page)
- break;
- page_cache_release(page);
- }
- lru_add_drain(); /* Push any new pages onto the LRU now */
- return shmem_swapin_async(p, entry, idx);
-}
-
-static struct page *
-shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
- unsigned long idx)
+static struct page *shmem_alloc_page(gfp_t gfp,
+ struct shmem_inode_info *info, unsigned long idx)
{
struct vm_area_struct pvma;
struct page *page;
- memset(&pvma, 0, sizeof(struct vm_area_struct));
- pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+ /* Create a pseudo vma that just contains the policy */
+ pvma.vm_start = 0;
pvma.vm_pgoff = idx;
- pvma.vm_end = PAGE_SIZE;
+ pvma.vm_ops = NULL;
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
page = alloc_page_vma(gfp, &pvma, 0);
mpol_free(pvma.vm_policy);
return page;
@@ -1083,15 +1149,14 @@ static inline int shmem_parse_mpol(char *value, int *policy,
return 1;
}
-static inline struct page *
-shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
+static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
+ struct shmem_inode_info *info, unsigned long idx)
{
- swapin_readahead(entry, 0, NULL);
- return read_swap_cache_async(entry, NULL, 0);
+ return swapin_readahead(entry, gfp, NULL, 0);
}
-static inline struct page *
-shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
+static inline struct page *shmem_alloc_page(gfp_t gfp,
+ struct shmem_inode_info *info, unsigned long idx)
{
return alloc_page(gfp);
}
@@ -1114,6 +1179,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
struct page *swappage;
swp_entry_t *entry;
swp_entry_t swap;
+ gfp_t gfp;
int error;
if (idx >= SHMEM_MAX_INDEX)
@@ -1126,7 +1192,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
* Normally, filepage is NULL on entry, and either found
* uptodate immediately, or allocated and zeroed, or read
* in under swappage, which is then assigned to filepage.
- * But shmem_readpage and shmem_write_begin pass in a locked
+ * But shmem_readpage (required for splice) passes in a locked
* filepage, which may be found not uptodate by other callers
* too, and may need to be copied from the swappage read in.
*/
@@ -1136,8 +1202,17 @@ repeat:
if (filepage && PageUptodate(filepage))
goto done;
error = 0;
- if (sgp == SGP_QUICK)
- goto failed;
+ gfp = mapping_gfp_mask(mapping);
+ if (!filepage) {
+ /*
+ * Try to preload while we can wait, to not make a habit of
+ * draining atomic reserves; but don't latch on to this cpu.
+ */
+ error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+ if (error)
+ goto failed;
+ radix_tree_preload_end();
+ }
spin_lock(&info->lock);
shmem_recalc_inode(inode);
@@ -1160,7 +1235,7 @@ repeat:
*type |= VM_FAULT_MAJOR;
}
spin_unlock(&info->lock);
- swappage = shmem_swapin(info, swap, idx);
+ swappage = shmem_swapin(swap, gfp, info, idx);
if (!swappage) {
spin_lock(&info->lock);
entry = shmem_swp_alloc(info, idx, sgp);
@@ -1218,23 +1293,21 @@ repeat:
SetPageUptodate(filepage);
set_page_dirty(filepage);
swap_free(swap);
- } else if (!(error = move_from_swap_cache(
- swappage, idx, mapping))) {
+ } else if (!(error = add_to_page_cache(
+ swappage, mapping, idx, GFP_NOWAIT))) {
info->flags |= SHMEM_PAGEIN;
shmem_swp_set(info, entry, 0);
shmem_swp_unmap(entry);
+ delete_from_swap_cache(swappage);
spin_unlock(&info->lock);
filepage = swappage;
+ set_page_dirty(filepage);
swap_free(swap);
} else {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
unlock_page(swappage);
page_cache_release(swappage);
- if (error == -ENOMEM) {
- /* let kswapd refresh zone for GFP_ATOMICs */
- congestion_wait(WRITE, HZ/50);
- }
goto repeat;
}
} else if (sgp == SGP_READ && !filepage) {
@@ -1272,9 +1345,7 @@ repeat:
if (!filepage) {
spin_unlock(&info->lock);
- filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
- info,
- idx);
+ filepage = shmem_alloc_page(gfp, info, idx);
if (!filepage) {
shmem_unacct_blocks(info->flags, 1);
shmem_free_blocks(inode, 1);
@@ -1291,7 +1362,7 @@ repeat:
shmem_swp_unmap(entry);
}
if (error || swap.val || 0 != add_to_page_cache_lru(
- filepage, mapping, idx, GFP_ATOMIC)) {
+ filepage, mapping, idx, GFP_NOWAIT)) {
spin_unlock(&info->lock);
page_cache_release(filepage);
shmem_unacct_blocks(info->flags, 1);
@@ -1309,14 +1380,11 @@ repeat:
clear_highpage(filepage);
flush_dcache_page(filepage);
SetPageUptodate(filepage);
+ if (sgp == SGP_DIRTY)
+ set_page_dirty(filepage);
}
done:
- if (*pagep != filepage) {
- *pagep = filepage;
- if (sgp != SGP_FAULT)
- unlock_page(filepage);
-
- }
+ *pagep = filepage;
return 0;
failed:
@@ -1336,7 +1404,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
- error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret);
+ error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1399,15 +1467,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
- return NULL;
- }
- sbinfo->free_inodes--;
- spin_unlock(&sbinfo->stat_lock);
- }
+ if (shmem_reserve_inode(sb))
+ return NULL;
inode = new_inode(sb);
if (inode) {
@@ -1451,11 +1512,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
NULL);
break;
}
- } else if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
- }
+ } else
+ shmem_free_inode(sb);
return inode;
}
@@ -1494,123 +1552,30 @@ shmem_write_end(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+
+ unlock_page(page);
set_page_dirty(page);
page_cache_release(page);
- if (pos+copied > inode->i_size)
- i_size_write(inode, pos+copied);
-
return copied;
}
-static ssize_t
-shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
-{
- struct inode *inode = file->f_path.dentry->d_inode;
- loff_t pos;
- unsigned long written;
- ssize_t err;
-
- if ((ssize_t) count < 0)
- return -EINVAL;
-
- if (!access_ok(VERIFY_READ, buf, count))
- return -EFAULT;
-
- mutex_lock(&inode->i_mutex);
-
- pos = *ppos;
- written = 0;
-
- err = generic_write_checks(file, &pos, &count, 0);
- if (err || !count)
- goto out;
-
- err = remove_suid(file->f_path.dentry);
- if (err)
- goto out;
-
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-
- do {
- struct page *page = NULL;
- unsigned long bytes, index, offset;
- char *kaddr;
- int left;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- /*
- * We don't hold page lock across copy from user -
- * what would it guard against? - so no deadlock here.
- * But it still may be a good idea to prefault below.
- */
-
- err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
- if (err)
- break;
-
- left = bytes;
- if (PageHighMem(page)) {
- volatile unsigned char dummy;
- __get_user(dummy, buf);
- __get_user(dummy, buf + bytes - 1);
-
- kaddr = kmap_atomic(page, KM_USER0);
- left = __copy_from_user_inatomic(kaddr + offset,
- buf, bytes);
- kunmap_atomic(kaddr, KM_USER0);
- }
- if (left) {
- kaddr = kmap(page);
- left = __copy_from_user(kaddr + offset, buf, bytes);
- kunmap(page);
- }
-
- written += bytes;
- count -= bytes;
- pos += bytes;
- buf += bytes;
- if (pos > inode->i_size)
- i_size_write(inode, pos);
-
- flush_dcache_page(page);
- set_page_dirty(page);
- mark_page_accessed(page);
- page_cache_release(page);
-
- if (left) {
- pos -= left;
- written -= left;
- err = -EFAULT;
- break;
- }
-
- /*
- * Our dirty pages are not counted in nr_dirty,
- * and we do not attempt to balance dirty pages.
- */
-
- cond_resched();
- } while (count);
-
- *ppos = pos;
- if (written)
- err = written;
-out:
- mutex_unlock(&inode->i_mutex);
- return err;
-}
-
static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
{
struct inode *inode = filp->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
unsigned long index, offset;
+ enum sgp_type sgp = SGP_READ;
+
+ /*
+ * Might this read be for a stacking filesystem? Then when reading
+ * holes of a sparse file, we actually need to allocate those pages,
+ * and even mark them dirty, so it cannot exceed the max_blocks limit.
+ */
+ if (segment_eq(get_fs(), KERNEL_DS))
+ sgp = SGP_DIRTY;
index = *ppos >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
@@ -1629,12 +1594,14 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
break;
}
- desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
+ desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
if (desc->error) {
if (desc->error == -EINVAL)
desc->error = 0;
break;
}
+ if (page)
+ unlock_page(page);
/*
* We must evaluate after, since reads (unlike writes)
@@ -1798,22 +1765,16 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ int ret;
/*
* No ordinary (disk based) filesystem counts links as inodes;
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
*/
- if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
- return -ENOSPC;
- }
- sbinfo->free_inodes--;
- spin_unlock(&sbinfo->stat_lock);
- }
+ ret = shmem_reserve_inode(inode->i_sb);
+ if (ret)
+ goto out;
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1821,21 +1782,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
atomic_inc(&inode->i_count); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
d_instantiate(dentry, inode);
- return 0;
+out:
+ return ret;
}
static int shmem_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
- }
- }
+ if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
+ shmem_free_inode(inode->i_sb);
dir->i_size -= BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1924,6 +1880,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
iput(inode);
return error;
}
+ unlock_page(page);
inode->i_op = &shmem_symlink_inode_operations;
kaddr = kmap_atomic(page, KM_USER0);
memcpy(kaddr, symname, len);
@@ -1951,6 +1908,8 @@ static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
struct page *page = NULL;
int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
+ if (page)
+ unlock_page(page);
return page;
}
@@ -1996,8 +1955,7 @@ static int shmem_xattr_security_get(struct inode *inode, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return security_inode_getsecurity(inode, name, buffer, size,
- -EOPNOTSUPP);
+ return xattr_getsecurity(inode, name, buffer, size);
}
static int shmem_xattr_security_set(struct inode *inode, const char *name,
@@ -2138,7 +2096,7 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
}
if (*rest)
goto bad_val;
- *blocks = size >> PAGE_CACHE_SHIFT;
+ *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
} else if (!strcmp(this_char,"nr_blocks")) {
*blocks = memparse(value,&rest);
if (*rest)
@@ -2375,7 +2333,8 @@ static const struct file_operations shmem_file_operations = {
#ifdef CONFIG_TMPFS
.llseek = generic_file_llseek,
.read = shmem_file_read,
- .write = shmem_file_write,
+ .write = do_sync_write,
+ .aio_write = generic_file_aio_write,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
diff --git a/mm/slab.c b/mm/slab.c
index 202465a193c..40c00dacbe4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -304,11 +304,11 @@ struct kmem_list3 {
/*
* Need this for bootstrapping a per node allocator.
*/
-#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
+#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define CACHE_CACHE 0
-#define SIZE_AC 1
-#define SIZE_L3 (1 + MAX_NUMNODES)
+#define SIZE_AC MAX_NUMNODES
+#define SIZE_L3 (2 * MAX_NUMNODES)
static int drain_freelist(struct kmem_cache *cache,
struct kmem_list3 *l3, int tofree);
@@ -730,8 +730,7 @@ static inline void init_lock_keys(void)
#endif
/*
- * 1. Guard access to the cache-chain.
- * 2. Protect sanity of cpu_online_map against cpu hotplug events
+ * Guard access to the cache-chain.
*/
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
@@ -1331,12 +1330,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
int err = 0;
switch (action) {
- case CPU_LOCK_ACQUIRE:
- mutex_lock(&cache_chain_mutex);
- break;
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
+ mutex_lock(&cache_chain_mutex);
err = cpuup_prepare(cpu);
+ mutex_unlock(&cache_chain_mutex);
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
@@ -1373,9 +1371,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
#endif
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
+ mutex_lock(&cache_chain_mutex);
cpuup_canceled(cpu);
- break;
- case CPU_LOCK_RELEASE:
mutex_unlock(&cache_chain_mutex);
break;
}
@@ -1410,6 +1407,22 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
}
/*
+ * For setting up all the kmem_list3s for cache whose buffer_size is same as
+ * size of kmem_list3.
+ */
+static void __init set_up_list3s(struct kmem_cache *cachep, int index)
+{
+ int node;
+
+ for_each_online_node(node) {
+ cachep->nodelists[node] = &initkmem_list3[index + node];
+ cachep->nodelists[node]->next_reap = jiffies +
+ REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ }
+}
+
+/*
* Initialisation. Called after the page allocator have been initialised and
* before smp_init().
*/
@@ -1432,6 +1445,7 @@ void __init kmem_cache_init(void)
if (i < MAX_NUMNODES)
cache_cache.nodelists[i] = NULL;
}
+ set_up_list3s(&cache_cache, CACHE_CACHE);
/*
* Fragmentation resistance on low memory - only use bigger
@@ -1587,10 +1601,9 @@ void __init kmem_cache_init(void)
{
int nid;
- /* Replace the static kmem_list3 structures for the boot cpu */
- init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
+ for_each_online_node(nid) {
+ init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], nid);
- for_each_node_state(nid, N_NORMAL_MEMORY) {
init_list(malloc_sizes[INDEX_AC].cs_cachep,
&initkmem_list3[SIZE_AC + nid], nid);
@@ -1960,22 +1973,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
}
}
-/*
- * For setting up all the kmem_list3s for cache whose buffer_size is same as
- * size of kmem_list3.
- */
-static void __init set_up_list3s(struct kmem_cache *cachep, int index)
-{
- int node;
-
- for_each_node_state(node, N_NORMAL_MEMORY) {
- cachep->nodelists[node] = &initkmem_list3[index + node];
- cachep->nodelists[node]->next_reap = jiffies +
- REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
- }
-}
-
static void __kmem_cache_destroy(struct kmem_cache *cachep)
{
int i;
@@ -2099,7 +2096,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
g_cpucache_up = PARTIAL_L3;
} else {
int node;
- for_each_node_state(node, N_NORMAL_MEMORY) {
+ for_each_online_node(node) {
cachep->nodelists[node] =
kmalloc_node(sizeof(struct kmem_list3),
GFP_KERNEL, node);
@@ -2170,6 +2167,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* We use cache_chain_mutex to ensure a consistent view of
* cpu_online_map as well. Please see cpuup_callback
*/
+ get_online_cpus();
mutex_lock(&cache_chain_mutex);
list_for_each_entry(pc, &cache_chain, next) {
@@ -2396,6 +2394,7 @@ oops:
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -2547,9 +2546,11 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
int ret;
BUG_ON(!cachep || in_interrupt());
+ get_online_cpus();
mutex_lock(&cache_chain_mutex);
ret = __cache_shrink(cachep);
mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -2575,6 +2576,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
BUG_ON(!cachep || in_interrupt());
/* Find the cache in the chain of caches. */
+ get_online_cpus();
mutex_lock(&cache_chain_mutex);
/*
* the chain is never empty, cache_cache is never destroyed
@@ -2584,6 +2586,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
slab_error(cachep, "Can't free all objects");
list_add(&cachep->next, &cache_chain);
mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
return;
}
@@ -2592,6 +2595,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
__kmem_cache_destroy(cachep);
mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -3815,7 +3819,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
struct array_cache *new_shared;
struct array_cache **new_alien = NULL;
- for_each_node_state(node, N_NORMAL_MEMORY) {
+ for_each_online_node(node) {
if (use_alien_caches) {
new_alien = alloc_alien_cache(node, cachep->limit);
@@ -4105,7 +4109,7 @@ out:
schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
}
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_SLABINFO
static void print_slabinfo_header(struct seq_file *m)
{
@@ -4475,3 +4479,4 @@ size_t ksize(const void *objp)
return obj_size(virt_to_cache(objp));
}
+EXPORT_SYMBOL(ksize);
diff --git a/mm/slob.c b/mm/slob.c
index 08a9bd91a1a..e2c3c0ec546 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -12,10 +12,17 @@
* allocator is as little as 2 bytes, however typically most architectures
* will require 4 bytes on 32-bit and 8 bytes on 64-bit.
*
- * The slob heap is a linked list of pages from alloc_pages(), and
- * within each page, there is a singly-linked list of free blocks (slob_t).
- * The heap is grown on demand and allocation from the heap is currently
- * first-fit.
+ * The slob heap is a set of linked list of pages from alloc_pages(),
+ * and within each page, there is a singly-linked list of free blocks
+ * (slob_t). The heap is grown on demand. To reduce fragmentation,
+ * heap pages are segregated into three lists, with objects less than
+ * 256 bytes, objects less than 1024 bytes, and all other objects.
+ *
+ * Allocation from heap involves first searching for a page with
+ * sufficient free blocks (using a next-fit-like approach) followed by
+ * a first-fit scan of the page. Deallocation inserts objects back
+ * into the free list in address order, so this is effectively an
+ * address-ordered first fit.
*
* Above this is an implementation of kmalloc/kfree. Blocks returned
* from kmalloc are prepended with a 4-byte header with the kmalloc size.
@@ -110,9 +117,13 @@ static inline void free_slob_page(struct slob_page *sp)
}
/*
- * All (partially) free slob pages go on this list.
+ * All partially free slob pages go on these lists.
*/
-static LIST_HEAD(free_slob_pages);
+#define SLOB_BREAK1 256
+#define SLOB_BREAK2 1024
+static LIST_HEAD(free_slob_small);
+static LIST_HEAD(free_slob_medium);
+static LIST_HEAD(free_slob_large);
/*
* slob_page: True for all slob pages (false for bigblock pages)
@@ -140,9 +151,9 @@ static inline int slob_page_free(struct slob_page *sp)
return test_bit(PG_private, &sp->flags);
}
-static inline void set_slob_page_free(struct slob_page *sp)
+static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
{
- list_add(&sp->list, &free_slob_pages);
+ list_add(&sp->list, list);
__set_bit(PG_private, &sp->flags);
}
@@ -294,12 +305,20 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
{
struct slob_page *sp;
struct list_head *prev;
+ struct list_head *slob_list;
slob_t *b = NULL;
unsigned long flags;
+ if (size < SLOB_BREAK1)
+ slob_list = &free_slob_small;
+ else if (size < SLOB_BREAK2)
+ slob_list = &free_slob_medium;
+ else
+ slob_list = &free_slob_large;
+
spin_lock_irqsave(&slob_lock, flags);
/* Iterate through each partially free page, try to find room */
- list_for_each_entry(sp, &free_slob_pages, list) {
+ list_for_each_entry(sp, slob_list, list) {
#ifdef CONFIG_NUMA
/*
* If there's a node specification, search for a partial
@@ -321,16 +340,16 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
/* Improve fragment distribution and reduce our average
* search time by starting our next search here. (see
* Knuth vol 1, sec 2.5, pg 449) */
- if (prev != free_slob_pages.prev &&
- free_slob_pages.next != prev->next)
- list_move_tail(&free_slob_pages, prev->next);
+ if (prev != slob_list->prev &&
+ slob_list->next != prev->next)
+ list_move_tail(slob_list, prev->next);
break;
}
spin_unlock_irqrestore(&slob_lock, flags);
/* Not enough space: must allocate a new page */
if (!b) {
- b = slob_new_page(gfp, 0, node);
+ b = slob_new_page(gfp & ~__GFP_ZERO, 0, node);
if (!b)
return 0;
sp = (struct slob_page *)virt_to_page(b);
@@ -341,7 +360,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
sp->free = b;
INIT_LIST_HEAD(&sp->list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
- set_slob_page_free(sp);
+ set_slob_page_free(sp, slob_list);
b = slob_page_alloc(sp, size, align);
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
@@ -387,7 +406,7 @@ static void slob_free(void *block, int size)
set_slob(b, units,
(void *)((unsigned long)(b +
SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
- set_slob_page_free(sp);
+ set_slob_page_free(sp, &free_slob_small);
goto out;
}
@@ -398,6 +417,10 @@ static void slob_free(void *block, int size)
sp->units += units;
if (b < sp->free) {
+ if (b + units == sp->free) {
+ units += slob_units(sp->free);
+ sp->free = slob_next(sp->free);
+ }
set_slob(b, units, sp->free);
sp->free = b;
} else {
@@ -495,6 +518,7 @@ size_t ksize(const void *block)
else
return sp->page.private;
}
+EXPORT_SYMBOL(ksize);
struct kmem_cache {
unsigned int size, align;
diff --git a/mm/slub.c b/mm/slub.c
index 9acb413858a..3f056677fa8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -172,7 +172,7 @@ static inline void ClearSlabDebug(struct page *page)
* Mininum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
*/
-#define MIN_PARTIAL 2
+#define MIN_PARTIAL 5
/*
* Maximum number of desirable partial slabs.
@@ -247,7 +247,10 @@ static void sysfs_slab_remove(struct kmem_cache *);
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
-static inline void sysfs_slab_remove(struct kmem_cache *s) {}
+static inline void sysfs_slab_remove(struct kmem_cache *s)
+{
+ kfree(s);
+}
#endif
/********************************************************************
@@ -354,22 +357,22 @@ static void print_section(char *text, u8 *addr, unsigned int length)
printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
newline = 0;
}
- printk(" %02x", addr[i]);
+ printk(KERN_CONT " %02x", addr[i]);
offset = i % 16;
ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
if (offset == 15) {
- printk(" %s\n",ascii);
+ printk(KERN_CONT " %s\n", ascii);
newline = 1;
}
}
if (!newline) {
i %= 16;
while (i < 16) {
- printk(" ");
+ printk(KERN_CONT " ");
ascii[i] = ' ';
i++;
}
- printk(" %s\n", ascii);
+ printk(KERN_CONT " %s\n", ascii);
}
}
@@ -529,7 +532,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
if (s->flags & __OBJECT_POISON) {
memset(p, POISON_FREE, s->objsize - 1);
- p[s->objsize -1] = POISON_END;
+ p[s->objsize - 1] = POISON_END;
}
if (s->flags & SLAB_RED_ZONE)
@@ -558,7 +561,7 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
u8 *object, char *what,
- u8* start, unsigned int value, unsigned int bytes)
+ u8 *start, unsigned int value, unsigned int bytes)
{
u8 *fault;
u8 *end;
@@ -692,7 +695,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
(!check_bytes_and_report(s, page, p, "Poison", p,
POISON_FREE, s->objsize - 1) ||
!check_bytes_and_report(s, page, p, "Poison",
- p + s->objsize -1, POISON_END, 1)))
+ p + s->objsize - 1, POISON_END, 1)))
return 0;
/*
* check_pad_bytes cleans up on its own.
@@ -900,8 +903,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
"SLUB <none>: no slab for object 0x%p.\n",
object);
dump_stack();
- }
- else
+ } else
object_err(s, page, object,
"page slab pointer corrupt.");
goto fail;
@@ -947,7 +949,7 @@ static int __init setup_slub_debug(char *str)
/*
* Determine which debug features should be switched on
*/
- for ( ;*str && *str != ','; str++) {
+ for (; *str && *str != ','; str++) {
switch (tolower(*str)) {
case 'f':
slub_debug |= SLAB_DEBUG_FREE;
@@ -966,7 +968,7 @@ static int __init setup_slub_debug(char *str)
break;
default:
printk(KERN_ERR "slub_debug option '%c' "
- "unknown. skipped\n",*str);
+ "unknown. skipped\n", *str);
}
}
@@ -1039,7 +1041,7 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
*/
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
- struct page * page;
+ struct page *page;
int pages = 1 << s->order;
if (s->order)
@@ -1135,7 +1137,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
mod_zone_page_state(page_zone(page),
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
- - pages);
+ -pages);
__free_pages(page, s->order);
}
@@ -1195,19 +1197,15 @@ static __always_inline int slab_trylock(struct page *page)
/*
* Management of partially allocated slabs
*/
-static void add_partial_tail(struct kmem_cache_node *n, struct page *page)
+static void add_partial(struct kmem_cache_node *n,
+ struct page *page, int tail)
{
spin_lock(&n->list_lock);
n->nr_partial++;
- list_add_tail(&page->lru, &n->partial);
- spin_unlock(&n->list_lock);
-}
-
-static void add_partial(struct kmem_cache_node *n, struct page *page)
-{
- spin_lock(&n->list_lock);
- n->nr_partial++;
- list_add(&page->lru, &n->partial);
+ if (tail)
+ list_add_tail(&page->lru, &n->partial);
+ else
+ list_add(&page->lru, &n->partial);
spin_unlock(&n->list_lock);
}
@@ -1292,7 +1290,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
* expensive if we do it every time we are trying to find a slab
* with available objects.
*/
- if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
+ if (!s->remote_node_defrag_ratio ||
+ get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
zonelist = &NODE_DATA(slab_node(current->mempolicy))
@@ -1335,7 +1334,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
*
* On exit the slab lock will have been dropped.
*/
-static void unfreeze_slab(struct kmem_cache *s, struct page *page)
+static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -1343,7 +1342,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
if (page->inuse) {
if (page->freelist)
- add_partial(n, page);
+ add_partial(n, page, tail);
else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
add_full(n, page);
slab_unlock(page);
@@ -1358,7 +1357,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
* partial list stays small. kmem_cache_shrink can
* reclaim empty slabs from the partial list.
*/
- add_partial_tail(n, page);
+ add_partial(n, page, 1);
slab_unlock(page);
} else {
slab_unlock(page);
@@ -1373,6 +1372,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
struct page *page = c->page;
+ int tail = 1;
/*
* Merge cpu freelist into freelist. Typically we get here
* because both freelists are empty. So this is unlikely
@@ -1381,6 +1381,8 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
while (unlikely(c->freelist)) {
void **object;
+ tail = 0; /* Hot objects. Put the slab first */
+
/* Retrieve object from cpu_freelist */
object = c->freelist;
c->freelist = c->freelist[c->offset];
@@ -1391,7 +1393,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
page->inuse--;
}
c->page = NULL;
- unfreeze_slab(s, page);
+ unfreeze_slab(s, page, tail);
}
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -1539,7 +1541,7 @@ debug:
*
* Otherwise we can simply pick the next object from the lockless free list.
*/
-static void __always_inline *slab_alloc(struct kmem_cache *s,
+static __always_inline void *slab_alloc(struct kmem_cache *s,
gfp_t gfpflags, int node, void *addr)
{
void **object;
@@ -1613,7 +1615,7 @@ checks_ok:
* then add it.
*/
if (unlikely(!prior))
- add_partial(get_node(s, page_to_nid(page)), page);
+ add_partial(get_node(s, page_to_nid(page)), page, 1);
out_unlock:
slab_unlock(page);
@@ -1647,7 +1649,7 @@ debug:
* If fastpath is not possible then fall back to __slab_free where we deal
* with all sorts of special processing.
*/
-static void __always_inline slab_free(struct kmem_cache *s,
+static __always_inline void slab_free(struct kmem_cache *s,
struct page *page, void *x, void *addr)
{
void **object = (void *)x;
@@ -1997,6 +1999,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
{
struct page *page;
struct kmem_cache_node *n;
+ unsigned long flags;
BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
@@ -2021,7 +2024,14 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
#endif
init_kmem_cache_node(n);
atomic_long_inc(&n->nr_slabs);
- add_partial(n, page);
+ /*
+ * lockdep requires consistent irq usage for each lock
+ * so even though there cannot be a race this early in
+ * the boot sequence, we still disable irqs.
+ */
+ local_irq_save(flags);
+ add_partial(n, page, 0);
+ local_irq_restore(flags);
return n;
}
@@ -2206,7 +2216,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
s->refcount = 1;
#ifdef CONFIG_NUMA
- s->defrag_ratio = 100;
+ s->remote_node_defrag_ratio = 100;
#endif
if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
goto error;
@@ -2228,7 +2238,7 @@ error:
*/
int kmem_ptr_validate(struct kmem_cache *s, const void *object)
{
- struct page * page;
+ struct page *page;
page = get_object_page(object);
@@ -2322,7 +2332,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (kmem_cache_close(s))
WARN_ON(1);
sysfs_slab_remove(s);
- kfree(s);
} else
up_write(&slub_lock);
}
@@ -2341,7 +2350,7 @@ static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
static int __init setup_slub_min_order(char *str)
{
- get_option (&str, &slub_min_order);
+ get_option(&str, &slub_min_order);
return 1;
}
@@ -2350,7 +2359,7 @@ __setup("slub_min_order=", setup_slub_min_order);
static int __init setup_slub_max_order(char *str)
{
- get_option (&str, &slub_max_order);
+ get_option(&str, &slub_max_order);
return 1;
}
@@ -2359,7 +2368,7 @@ __setup("slub_max_order=", setup_slub_max_order);
static int __init setup_slub_min_objects(char *str)
{
- get_option (&str, &slub_min_objects);
+ get_option(&str, &slub_min_objects);
return 1;
}
@@ -2558,8 +2567,12 @@ size_t ksize(const void *object)
if (unlikely(object == ZERO_SIZE_PTR))
return 0;
- page = get_object_page(object);
+ page = virt_to_head_page(object);
BUG_ON(!page);
+
+ if (unlikely(!PageSlab(page)))
+ return PAGE_SIZE << compound_order(page);
+
s = page->slab;
BUG_ON(!s);
@@ -2601,6 +2614,19 @@ void kfree(const void *x)
}
EXPORT_SYMBOL(kfree);
+static unsigned long count_partial(struct kmem_cache_node *n)
+{
+ unsigned long flags;
+ unsigned long x = 0;
+ struct page *page;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->partial, lru)
+ x += page->inuse;
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return x;
+}
+
/*
* kmem_cache_shrink removes empty slabs from the partial lists and sorts
* the remaining slabs by the number of items in use. The slabs with the
@@ -2927,7 +2953,7 @@ static struct kmem_cache *find_mergeable(size_t size,
* Check if alignment is compatible.
* Courtesy of Adrian Drzewiecki
*/
- if ((s->size & ~(align -1)) != s->size)
+ if ((s->size & ~(align - 1)) != s->size)
continue;
if (s->size - size >= sizeof(void *))
@@ -3036,8 +3062,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata slab_notifier =
- { &slab_cpuup_callback, NULL, 0 };
+static struct notifier_block __cpuinitdata slab_notifier = {
+ &slab_cpuup_callback, NULL, 0
+};
#endif
@@ -3373,7 +3400,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
static int list_locations(struct kmem_cache *s, char *buf,
enum track_item alloc)
{
- int n = 0;
+ int len = 0;
unsigned long i;
struct loc_track t = { 0, 0, NULL };
int node;
@@ -3404,67 +3431,54 @@ static int list_locations(struct kmem_cache *s, char *buf,
for (i = 0; i < t.count; i++) {
struct location *l = &t.loc[i];
- if (n > PAGE_SIZE - 100)
+ if (len > PAGE_SIZE - 100)
break;
- n += sprintf(buf + n, "%7ld ", l->count);
+ len += sprintf(buf + len, "%7ld ", l->count);
if (l->addr)
- n += sprint_symbol(buf + n, (unsigned long)l->addr);
+ len += sprint_symbol(buf + len, (unsigned long)l->addr);
else
- n += sprintf(buf + n, "<not-available>");
+ len += sprintf(buf + len, "<not-available>");
if (l->sum_time != l->min_time) {
unsigned long remainder;
- n += sprintf(buf + n, " age=%ld/%ld/%ld",
+ len += sprintf(buf + len, " age=%ld/%ld/%ld",
l->min_time,
div_long_long_rem(l->sum_time, l->count, &remainder),
l->max_time);
} else
- n += sprintf(buf + n, " age=%ld",
+ len += sprintf(buf + len, " age=%ld",
l->min_time);
if (l->min_pid != l->max_pid)
- n += sprintf(buf + n, " pid=%ld-%ld",
+ len += sprintf(buf + len, " pid=%ld-%ld",
l->min_pid, l->max_pid);
else
- n += sprintf(buf + n, " pid=%ld",
+ len += sprintf(buf + len, " pid=%ld",
l->min_pid);
if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
- n < PAGE_SIZE - 60) {
- n += sprintf(buf + n, " cpus=");
- n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50,
+ len < PAGE_SIZE - 60) {
+ len += sprintf(buf + len, " cpus=");
+ len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
l->cpus);
}
if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
- n < PAGE_SIZE - 60) {
- n += sprintf(buf + n, " nodes=");
- n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50,
+ len < PAGE_SIZE - 60) {
+ len += sprintf(buf + len, " nodes=");
+ len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
l->nodes);
}
- n += sprintf(buf + n, "\n");
+ len += sprintf(buf + len, "\n");
}
free_loc_track(&t);
if (!t.count)
- n += sprintf(buf, "No data\n");
- return n;
-}
-
-static unsigned long count_partial(struct kmem_cache_node *n)
-{
- unsigned long flags;
- unsigned long x = 0;
- struct page *page;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
- x += page->inuse;
- spin_unlock_irqrestore(&n->list_lock, flags);
- return x;
+ len += sprintf(buf, "No data\n");
+ return len;
}
enum slab_stat_type {
@@ -3494,7 +3508,6 @@ static unsigned long slab_objects(struct kmem_cache *s,
for_each_possible_cpu(cpu) {
struct page *page;
- int node;
struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
if (!c)
@@ -3506,8 +3519,6 @@ static unsigned long slab_objects(struct kmem_cache *s,
continue;
if (page) {
if (flags & SO_CPU) {
- int x = 0;
-
if (flags & SO_OBJECTS)
x = page->inuse;
else
@@ -3844,24 +3855,24 @@ static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
SLAB_ATTR_RO(free_calls);
#ifdef CONFIG_NUMA
-static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
+static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->defrag_ratio / 10);
+ return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
}
-static ssize_t defrag_ratio_store(struct kmem_cache *s,
+static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
const char *buf, size_t length)
{
int n = simple_strtoul(buf, NULL, 10);
if (n < 100)
- s->defrag_ratio = n * 10;
+ s->remote_node_defrag_ratio = n * 10;
return length;
}
-SLAB_ATTR(defrag_ratio);
+SLAB_ATTR(remote_node_defrag_ratio);
#endif
-static struct attribute * slab_attrs[] = {
+static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
&object_size_attr.attr,
&objs_per_slab_attr.attr,
@@ -3889,7 +3900,7 @@ static struct attribute * slab_attrs[] = {
&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
- &defrag_ratio_attr.attr,
+ &remote_node_defrag_ratio_attr.attr,
#endif
NULL
};
@@ -3936,6 +3947,13 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return err;
}
+static void kmem_cache_release(struct kobject *kobj)
+{
+ struct kmem_cache *s = to_slab(kobj);
+
+ kfree(s);
+}
+
static struct sysfs_ops slab_sysfs_ops = {
.show = slab_attr_show,
.store = slab_attr_store,
@@ -3943,6 +3961,7 @@ static struct sysfs_ops slab_sysfs_ops = {
static struct kobj_type slab_ktype = {
.sysfs_ops = &slab_sysfs_ops,
+ .release = kmem_cache_release
};
static int uevent_filter(struct kset *kset, struct kobject *kobj)
@@ -3958,7 +3977,7 @@ static struct kset_uevent_ops slab_uevent_ops = {
.filter = uevent_filter,
};
-static decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
+static struct kset *slab_kset;
#define ID_STR_LENGTH 64
@@ -4011,7 +4030,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
* This is typically the case for debug situations. In that
* case we can catch duplicate names easily.
*/
- sysfs_remove_link(&slab_subsys.kobj, s->name);
+ sysfs_remove_link(&slab_kset->kobj, s->name);
name = s->name;
} else {
/*
@@ -4021,12 +4040,12 @@ static int sysfs_slab_add(struct kmem_cache *s)
name = create_unique_id(s);
}
- kobj_set_kset_s(s, slab_subsys);
- kobject_set_name(&s->kobj, name);
- kobject_init(&s->kobj);
- err = kobject_add(&s->kobj);
- if (err)
+ s->kobj.kset = slab_kset;
+ err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
+ if (err) {
+ kobject_put(&s->kobj);
return err;
+ }
err = sysfs_create_group(&s->kobj, &slab_attr_group);
if (err)
@@ -4044,6 +4063,7 @@ static void sysfs_slab_remove(struct kmem_cache *s)
{
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
+ kobject_put(&s->kobj);
}
/*
@@ -4066,9 +4086,8 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
/*
* If we have a leftover link then remove it.
*/
- sysfs_remove_link(&slab_subsys.kobj, name);
- return sysfs_create_link(&slab_subsys.kobj,
- &s->kobj, name);
+ sysfs_remove_link(&slab_kset->kobj, name);
+ return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
}
al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
@@ -4087,8 +4106,8 @@ static int __init slab_sysfs_init(void)
struct kmem_cache *s;
int err;
- err = subsystem_register(&slab_subsys);
- if (err) {
+ slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
+ if (!slab_kset) {
printk(KERN_ERR "Cannot register slab subsystem.\n");
return -ENOSYS;
}
@@ -4119,3 +4138,89 @@ static int __init slab_sysfs_init(void)
__initcall(slab_sysfs_init);
#endif
+
+/*
+ * The /proc/slabinfo ABI
+ */
+#ifdef CONFIG_SLABINFO
+
+ssize_t slabinfo_write(struct file *file, const char __user * buffer,
+ size_t count, loff_t *ppos)
+{
+ return -EINVAL;
+}
+
+
+static void print_slabinfo_header(struct seq_file *m)
+{
+ seq_puts(m, "slabinfo - version: 2.1\n");
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
+ "<objperslab> <pagesperslab>");
+ seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+ seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+ seq_putc(m, '\n');
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t n = *pos;
+
+ down_read(&slub_lock);
+ if (!n)
+ print_slabinfo_header(m);
+
+ return seq_list_start(&slab_caches, *pos);
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &slab_caches, pos);
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ up_read(&slub_lock);
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ unsigned long nr_partials = 0;
+ unsigned long nr_slabs = 0;
+ unsigned long nr_inuse = 0;
+ unsigned long nr_objs;
+ struct kmem_cache *s;
+ int node;
+
+ s = list_entry(p, struct kmem_cache, list);
+
+ for_each_online_node(node) {
+ struct kmem_cache_node *n = get_node(s, node);
+
+ if (!n)
+ continue;
+
+ nr_partials += n->nr_partial;
+ nr_slabs += atomic_long_read(&n->nr_slabs);
+ nr_inuse += count_partial(n);
+ }
+
+ nr_objs = nr_slabs * s->objects;
+ nr_inuse += (nr_slabs - nr_partials) * s->objects;
+
+ seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
+ nr_objs, s->size, s->objects, (1 << s->order));
+ seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
+ seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
+ 0UL);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+const struct seq_operations slabinfo_op = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+#endif /* CONFIG_SLABINFO */
diff --git a/mm/sparse.c b/mm/sparse.c
index e06f514fe04..f6a43c09c32 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -83,6 +83,8 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
return -EEXIST;
section = sparse_index_alloc(nid);
+ if (!section)
+ return -ENOMEM;
/*
* This lock keeps two different sections from
* reallocating for the same index
@@ -235,7 +237,7 @@ static unsigned long *__kmalloc_section_usemap(void)
}
#endif /* CONFIG_MEMORY_HOTPLUG */
-static unsigned long *sparse_early_usemap_alloc(unsigned long pnum)
+static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
{
unsigned long *usemap;
struct mem_section *ms = __nr_to_section(pnum);
@@ -351,17 +353,9 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
return __kmalloc_section_memmap(nr_pages);
}
-static int vaddr_in_vmalloc_area(void *addr)
-{
- if (addr >= (void *)VMALLOC_START &&
- addr < (void *)VMALLOC_END)
- return 1;
- return 0;
-}
-
static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
- if (vaddr_in_vmalloc_area(memmap))
+ if (is_vmalloc_addr(memmap))
vfree(memmap);
else
free_pages((unsigned long)memmap,
@@ -389,9 +383,17 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
* no locking for this, because it does its own
* plus, it does a kmalloc
*/
- sparse_index_init(section_nr, pgdat->node_id);
+ ret = sparse_index_init(section_nr, pgdat->node_id);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
+ if (!memmap)
+ return -ENOMEM;
usemap = __kmalloc_section_usemap();
+ if (!usemap) {
+ __kfree_section_memmap(memmap, nr_pages);
+ return -ENOMEM;
+ }
pgdat_resize_lock(pgdat, &flags);
@@ -401,18 +403,16 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
goto out;
}
- if (!usemap) {
- ret = -ENOMEM;
- goto out;
- }
ms->section_mem_map |= SECTION_MARKED_PRESENT;
ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
out:
pgdat_resize_unlock(pgdat, &flags);
- if (ret <= 0)
+ if (ret <= 0) {
+ kfree(usemap);
__kfree_section_memmap(memmap, nr_pages);
+ }
return ret;
}
#endif
diff --git a/mm/swap.c b/mm/swap.c
index 9ac88323d23..57b7e25a939 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -41,7 +41,7 @@ static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
* This path almost never happens for VM activity - pages are normally
* freed via pagevecs. But it gets used by networking.
*/
-static void fastcall __page_cache_release(struct page *page)
+static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
unsigned long flags;
@@ -165,7 +165,7 @@ int rotate_reclaimable_page(struct page *page)
/*
* FIXME: speed this up?
*/
-void fastcall activate_page(struct page *page)
+void activate_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -186,7 +186,7 @@ void fastcall activate_page(struct page *page)
* inactive,referenced -> active,unreferenced
* active,unreferenced -> active,referenced
*/
-void fastcall mark_page_accessed(struct page *page)
+void mark_page_accessed(struct page *page)
{
if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
activate_page(page);
@@ -202,7 +202,7 @@ EXPORT_SYMBOL(mark_page_accessed);
* lru_cache_add: add a page to the page lists
* @page: the page to add
*/
-void fastcall lru_cache_add(struct page *page)
+void lru_cache_add(struct page *page)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
@@ -212,7 +212,7 @@ void fastcall lru_cache_add(struct page *page)
put_cpu_var(lru_add_pvecs);
}
-void fastcall lru_cache_add_active(struct page *page)
+void lru_cache_add_active(struct page *page)
{
struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b52635601df..ec42f01a8d0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
@@ -51,26 +52,22 @@ static struct {
unsigned long del_total;
unsigned long find_success;
unsigned long find_total;
- unsigned long noent_race;
- unsigned long exist_race;
} swap_cache_info;
void show_swap_cache_info(void)
{
- printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
+ printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n",
swap_cache_info.add_total, swap_cache_info.del_total,
- swap_cache_info.find_success, swap_cache_info.find_total,
- swap_cache_info.noent_race, swap_cache_info.exist_race);
+ swap_cache_info.find_success, swap_cache_info.find_total);
printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
/*
- * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * add_to_swap_cache resembles add_to_page_cache on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
- gfp_t gfp_mask)
+int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
int error;
@@ -88,6 +85,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
set_page_private(page, entry.val);
total_swapcache_pages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
+ INC_CACHE_INFO(add_total);
}
write_unlock_irq(&swapper_space.tree_lock);
radix_tree_preload_end();
@@ -95,31 +93,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
return error;
}
-static int add_to_swap_cache(struct page *page, swp_entry_t entry)
-{
- int error;
-
- BUG_ON(PageLocked(page));
- if (!swap_duplicate(entry)) {
- INC_CACHE_INFO(noent_race);
- return -ENOENT;
- }
- SetPageLocked(page);
- error = __add_to_swap_cache(page, entry, GFP_KERNEL);
- /*
- * Anon pages are already on the LRU, we don't run lru_cache_add here.
- */
- if (error) {
- ClearPageLocked(page);
- swap_free(entry);
- if (error == -EEXIST)
- INC_CACHE_INFO(exist_race);
- return error;
- }
- INC_CACHE_INFO(add_total);
- return 0;
-}
-
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
@@ -152,6 +125,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
int err;
BUG_ON(!PageLocked(page));
+ BUG_ON(!PageUptodate(page));
for (;;) {
entry = get_swap_page();
@@ -169,18 +143,15 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
/*
* Add it to the swap cache and mark it dirty
*/
- err = __add_to_swap_cache(page, entry,
+ err = add_to_swap_cache(page, entry,
gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
switch (err) {
case 0: /* Success */
- SetPageUptodate(page);
SetPageDirty(page);
- INC_CACHE_INFO(add_total);
return 1;
case -EEXIST:
/* Raced with "speculative" read_swap_cache_async */
- INC_CACHE_INFO(exist_race);
swap_free(entry);
continue;
default:
@@ -211,40 +182,6 @@ void delete_from_swap_cache(struct page *page)
page_cache_release(page);
}
-/*
- * Strange swizzling function only for use by shmem_writepage
- */
-int move_to_swap_cache(struct page *page, swp_entry_t entry)
-{
- int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
- if (!err) {
- remove_from_page_cache(page);
- page_cache_release(page); /* pagecache ref */
- if (!swap_duplicate(entry))
- BUG();
- SetPageDirty(page);
- INC_CACHE_INFO(add_total);
- } else if (err == -EEXIST)
- INC_CACHE_INFO(exist_race);
- return err;
-}
-
-/*
- * Strange swizzling function for shmem_getpage (and shmem_unuse)
- */
-int move_from_swap_cache(struct page *page, unsigned long index,
- struct address_space *mapping)
-{
- int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
- if (!err) {
- delete_from_swap_cache(page);
- /* shift page from clean_pages to dirty_pages list */
- ClearPageDirty(page);
- set_page_dirty(page);
- }
- return err;
-}
-
/*
* If we are the only user, then try to free up the swap cache.
*
@@ -317,7 +254,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
* A failure return means that either the page allocation failed or that
* the swap entry is no longer in use.
*/
-struct page *read_swap_cache_async(swp_entry_t entry,
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *found_page, *new_page = NULL;
@@ -337,23 +274,27 @@ struct page *read_swap_cache_async(swp_entry_t entry,
* Get a new page to read into from swap.
*/
if (!new_page) {
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
- vma, addr);
+ new_page = alloc_page_vma(gfp_mask, vma, addr);
if (!new_page)
break; /* Out of memory */
}
/*
+ * Swap entry may have been freed since our caller observed it.
+ */
+ if (!swap_duplicate(entry))
+ break;
+
+ /*
* Associate the page with swap entry in the swap cache.
- * May fail (-ENOENT) if swap entry has been freed since
- * our caller observed it. May fail (-EEXIST) if there
- * is already a page associated with this entry in the
- * swap cache: added by a racing read_swap_cache_async,
- * or by try_to_swap_out (or shmem_writepage) re-using
- * the just freed swap entry for an existing page.
+ * May fail (-EEXIST) if there is already a page associated
+ * with this entry in the swap cache: added by a racing
+ * read_swap_cache_async, or add_to_swap or shmem_writepage
+ * re-using the just freed swap entry for an existing page.
* May fail (-ENOMEM) if radix-tree node allocation failed.
*/
- err = add_to_swap_cache(new_page, entry);
+ SetPageLocked(new_page);
+ err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
if (!err) {
/*
* Initiate read into locked page and return.
@@ -362,9 +303,57 @@ struct page *read_swap_cache_async(swp_entry_t entry,
swap_readpage(NULL, new_page);
return new_page;
}
- } while (err != -ENOENT && err != -ENOMEM);
+ ClearPageLocked(new_page);
+ swap_free(entry);
+ } while (err != -ENOMEM);
if (new_page)
page_cache_release(new_page);
return found_page;
}
+
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @vma: user vma this address belongs to
+ * @addr: target address for mempolicy
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time. We also make sure to queue
+ * the 'original' request together with the readahead ones...
+ *
+ * This has been extended to use the NUMA policies from the mm triggering
+ * the readahead.
+ *
+ * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
+ */
+struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ int nr_pages;
+ struct page *page;
+ unsigned long offset;
+ unsigned long end_offset;
+
+ /*
+ * Get starting offset for readaround, and number of pages to read.
+ * Adjust starting address by readbehind (for NUMA interleave case)?
+ * No, it's very unlikely that swap layout would follow vma layout,
+ * more likely that neighbouring swap pages came from the same node:
+ * so use the same "addr" to choose the same node for each swap read.
+ */
+ nr_pages = valid_swaphandles(entry, &offset);
+ for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
+ /* Ok, do the async read-ahead now */
+ page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
+ gfp_mask, vma, addr);
+ if (!page)
+ break;
+ page_cache_release(page);
+ }
+ lru_add_drain(); /* Push any new pages onto the LRU now */
+ return read_swap_cache_async(entry, gfp_mask, vma, addr);
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f071648e136..eade24da931 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -506,9 +506,19 @@ unsigned int count_swap_pages(int type, int free)
* just let do_wp_page work it out if a write is requested later - to
* force COW, vm_page_prot omits write permission from any private vma.
*/
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct page *page)
{
+ spinlock_t *ptl;
+ pte_t *pte;
+ int found = 1;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
+ found = 0;
+ goto out;
+ }
+
inc_mm_counter(vma->vm_mm, anon_rss);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +530,9 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
* immediately swapped out again after swapon.
*/
activate_page(page);
+out:
+ pte_unmap_unlock(pte, ptl);
+ return found;
}
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -528,22 +541,33 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
{
pte_t swp_pte = swp_entry_to_pte(entry);
pte_t *pte;
- spinlock_t *ptl;
int found = 0;
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ /*
+ * We don't actually need pte lock while scanning for swp_pte: since
+ * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
+ * page table while we're scanning; though it could get zapped, and on
+ * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
+ * of unmatched parts which look like swp_pte, so unuse_pte must
+ * recheck under pte lock. Scanning without pte lock lets it be
+ * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
+ */
+ pte = pte_offset_map(pmd, addr);
do {
/*
* swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte.
*/
if (unlikely(pte_same(*pte, swp_pte))) {
- unuse_pte(vma, pte++, addr, entry, page);
- found = 1;
- break;
+ pte_unmap(pte);
+ found = unuse_pte(vma, pmd, addr, entry, page);
+ if (found)
+ goto out;
+ pte = pte_offset_map(pmd, addr);
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap_unlock(pte - 1, ptl);
+ pte_unmap(pte - 1);
+out:
return found;
}
@@ -730,7 +754,8 @@ static int try_to_unuse(unsigned int type)
*/
swap_map = &si->swap_map[i];
entry = swp_entry(type, i);
- page = read_swap_cache_async(entry, NULL, 0);
+ page = read_swap_cache_async(entry,
+ GFP_HIGHUSER_MOVABLE, NULL, 0);
if (!page) {
/*
* Either swap_duplicate() failed because entry
@@ -789,7 +814,7 @@ static int try_to_unuse(unsigned int type)
atomic_inc(&new_start_mm->mm_users);
atomic_inc(&prev_mm->mm_users);
spin_lock(&mmlist_lock);
- while (*swap_map > 1 && !retval &&
+ while (*swap_map > 1 && !retval && !shmem &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
if (!atomic_inc_not_zero(&mm->mm_users))
@@ -821,6 +846,13 @@ static int try_to_unuse(unsigned int type)
mmput(start_mm);
start_mm = new_start_mm;
}
+ if (shmem) {
+ /* page has already been unlocked and released */
+ if (shmem > 0)
+ continue;
+ retval = shmem;
+ break;
+ }
if (retval) {
unlock_page(page);
page_cache_release(page);
@@ -859,12 +891,6 @@ static int try_to_unuse(unsigned int type)
* read from disk into another page. Splitting into two
* pages would be incorrect if swap supported "shared
* private" pages, but they are handled by tmpfs files.
- *
- * Note shmem_unuse already deleted a swappage from
- * the swap cache, unless the move to filepage failed:
- * in which case it left swappage in cache, lowered its
- * swap count to pass quickly through the loops above,
- * and now we must reincrement count to try again later.
*/
if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
struct writeback_control wbc = {
@@ -875,12 +901,8 @@ static int try_to_unuse(unsigned int type)
lock_page(page);
wait_on_page_writeback(page);
}
- if (PageSwapCache(page)) {
- if (shmem)
- swap_duplicate(entry);
- else
- delete_from_swap_cache(page);
- }
+ if (PageSwapCache(page))
+ delete_from_swap_cache(page);
/*
* So we could skip searching mms once swap count went
@@ -1768,31 +1790,48 @@ get_swap_info_struct(unsigned type)
*/
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
+ struct swap_info_struct *si;
int our_page_cluster = page_cluster;
- int ret = 0, i = 1 << our_page_cluster;
- unsigned long toff;
- struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
+ pgoff_t target, toff;
+ pgoff_t base, end;
+ int nr_pages = 0;
if (!our_page_cluster) /* no readahead */
return 0;
- toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster;
- if (!toff) /* first page is swap header */
- toff++, i--;
- *offset = toff;
+
+ si = &swap_info[swp_type(entry)];
+ target = swp_offset(entry);
+ base = (target >> our_page_cluster) << our_page_cluster;
+ end = base + (1 << our_page_cluster);
+ if (!base) /* first page is swap header */
+ base++;
spin_lock(&swap_lock);
- do {
- /* Don't read-ahead past the end of the swap area */
- if (toff >= swapdev->max)
+ if (end > si->max) /* don't go beyond end of map */
+ end = si->max;
+
+ /* Count contiguous allocated slots above our target */
+ for (toff = target; ++toff < end; nr_pages++) {
+ /* Don't read in free or bad pages */
+ if (!si->swap_map[toff])
+ break;
+ if (si->swap_map[toff] == SWAP_MAP_BAD)
break;
+ }
+ /* Count contiguous allocated slots below our target */
+ for (toff = target; --toff >= base; nr_pages++) {
/* Don't read in free or bad pages */
- if (!swapdev->swap_map[toff])
+ if (!si->swap_map[toff])
break;
- if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
+ if (si->swap_map[toff] == SWAP_MAP_BAD)
break;
- toff++;
- ret++;
- } while (--i);
+ }
spin_unlock(&swap_lock);
- return ret;
+
+ /*
+ * Indicate starting offset, and return number of pages to get:
+ * if only 1, say 0, since there's then no readahead to be done.
+ */
+ *offset = ++toff;
+ return nr_pages? ++nr_pages: 0;
}
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index d436a9c82db..702083638c1 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -121,18 +121,6 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
return 0;
}
-#if 0
-int shmem_mmap(struct file *file, struct vm_area_struct *vma)
-{
- file_accessed(file);
-#ifndef CONFIG_MMU
- return ramfs_nommu_mmap(file, vma);
-#else
- return 0;
-#endif
-}
-#endif /* 0 */
-
#ifndef CONFIG_MMU
unsigned long shmem_get_unmapped_area(struct file *file,
unsigned long addr,
diff --git a/mm/truncate.c b/mm/truncate.c
index cadc15653dd..c35c49e54fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -21,7 +21,7 @@
/**
- * do_invalidatepage - invalidate part of all of a page
+ * do_invalidatepage - invalidate part or all of a page
* @page: the page which is affected
* @offset: the index of the truncation point
*
@@ -48,7 +48,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
- zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0);
+ zero_user_segment(page, partial, PAGE_CACHE_SIZE);
if (PagePrivate(page))
do_invalidatepage(page, partial);
}
@@ -84,7 +84,7 @@ EXPORT_SYMBOL(cancel_dirty_page);
/*
* If truncate cannot remove the fs-private metadata from the page, the page
- * becomes anonymous. It will be left on the LRU and may even be mapped into
+ * becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
*
* We need to bale out if page->mapping is no longer equal to the original
@@ -98,11 +98,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (page->mapping != mapping)
return;
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
-
if (PagePrivate(page))
do_invalidatepage(page, 0);
+ cancel_dirty_page(page, PAGE_CACHE_SIZE);
+
remove_from_page_cache(page);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af77e171e33..0536dde139d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -166,6 +166,44 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
}
EXPORT_SYMBOL_GPL(map_vm_area);
+/*
+ * Map a vmalloc()-space virtual address to the physical page.
+ */
+struct page *vmalloc_to_page(const void *vmalloc_addr)
+{
+ unsigned long addr = (unsigned long) vmalloc_addr;
+ struct page *page = NULL;
+ pgd_t *pgd = pgd_offset_k(addr);
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+
+ if (!pgd_none(*pgd)) {
+ pud = pud_offset(pgd, addr);
+ if (!pud_none(*pud)) {
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_none(*pmd)) {
+ ptep = pte_offset_map(pmd, addr);
+ pte = *ptep;
+ if (pte_present(pte))
+ page = pte_page(pte);
+ pte_unmap(ptep);
+ }
+ }
+ }
+ return page;
+}
+EXPORT_SYMBOL(vmalloc_to_page);
+
+/*
+ * Map a vmalloc()-space virtual address to the physical page frame number.
+ */
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
+{
+ return page_to_pfn(vmalloc_to_page(vmalloc_addr));
+}
+EXPORT_SYMBOL(vmalloc_to_pfn);
+
static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
int node, gfp_t gfp_mask)
@@ -216,6 +254,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl
if (addr > end - size)
goto out;
}
+ if ((size + addr) < addr)
+ goto out;
+ if (addr > end - size)
+ goto out;
found:
area->next = *p;
@@ -268,7 +310,7 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
}
/* Caller must hold vmlist_lock */
-static struct vm_struct *__find_vm_area(void *addr)
+static struct vm_struct *__find_vm_area(const void *addr)
{
struct vm_struct *tmp;
@@ -281,7 +323,7 @@ static struct vm_struct *__find_vm_area(void *addr)
}
/* Caller must hold vmlist_lock */
-static struct vm_struct *__remove_vm_area(void *addr)
+static struct vm_struct *__remove_vm_area(const void *addr)
{
struct vm_struct **p, *tmp;
@@ -310,7 +352,7 @@ found:
* This function returns the found VM area, but using it is NOT safe
* on SMP machines, except for its size or flags.
*/
-struct vm_struct *remove_vm_area(void *addr)
+struct vm_struct *remove_vm_area(const void *addr)
{
struct vm_struct *v;
write_lock(&vmlist_lock);
@@ -319,7 +361,7 @@ struct vm_struct *remove_vm_area(void *addr)
return v;
}
-static void __vunmap(void *addr, int deallocate_pages)
+static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
@@ -346,8 +388,10 @@ static void __vunmap(void *addr, int deallocate_pages)
int i;
for (i = 0; i < area->nr_pages; i++) {
- BUG_ON(!area->pages[i]);
- __free_page(area->pages[i]);
+ struct page *page = area->pages[i];
+
+ BUG_ON(!page);
+ __free_page(page);
}
if (area->flags & VM_VPAGES)
@@ -370,7 +414,7 @@ static void __vunmap(void *addr, int deallocate_pages)
*
* Must not be called in interrupt context.
*/
-void vfree(void *addr)
+void vfree(const void *addr)
{
BUG_ON(in_interrupt());
__vunmap(addr, 1);
@@ -386,7 +430,7 @@ EXPORT_SYMBOL(vfree);
*
* Must not be called in interrupt context.
*/
-void vunmap(void *addr)
+void vunmap(const void *addr)
{
BUG_ON(in_interrupt());
__vunmap(addr, 0);
@@ -423,8 +467,8 @@ void *vmap(struct page **pages, unsigned int count,
}
EXPORT_SYMBOL(vmap);
-void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+ pgprot_t prot, int node)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
@@ -451,15 +495,19 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
for (i = 0; i < area->nr_pages; i++) {
+ struct page *page;
+
if (node < 0)
- area->pages[i] = alloc_page(gfp_mask);
+ page = alloc_page(gfp_mask);
else
- area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
- if (unlikely(!area->pages[i])) {
+ page = alloc_pages_node(node, gfp_mask, 0);
+
+ if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail;
}
+ area->pages[i] = page;
}
if (map_vm_area(area, prot, &pages))
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e8d846f5777..422d960ffcd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -21,21 +21,14 @@ EXPORT_PER_CPU_SYMBOL(vm_event_states);
static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
- int cpu = 0;
+ int cpu;
int i;
memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
- cpu = first_cpu(*cpumask);
- while (cpu < NR_CPUS) {
+ for_each_cpu_mask(cpu, *cpumask) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
- cpu = next_cpu(cpu, *cpumask);
-
- if (cpu < NR_CPUS)
- prefetch(&per_cpu(vm_event_states, cpu));
-
-
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
ret[i] += this->event[i];
}
@@ -284,6 +277,10 @@ EXPORT_SYMBOL(dec_zone_page_state);
/*
* Update the zone counters for one cpu.
*
+ * The cpu specified must be either the current cpu or a processor that
+ * is not online. If it is the current cpu then the execution thread must
+ * be pinned to the current cpu.
+ *
* Note that refresh_cpu_vm_stats strives to only access
* node local memory. The per cpu pagesets on remote zones are placed
* in the memory local to the processor using that pageset. So the
@@ -299,7 +296,7 @@ void refresh_cpu_vm_stats(int cpu)
{
struct zone *zone;
int i;
- unsigned long flags;
+ int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
for_each_zone(zone) {
struct per_cpu_pageset *p;
@@ -311,15 +308,19 @@ void refresh_cpu_vm_stats(int cpu)
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (p->vm_stat_diff[i]) {
+ unsigned long flags;
+ int v;
+
local_irq_save(flags);
- zone_page_state_add(p->vm_stat_diff[i],
- zone, i);
+ v = p->vm_stat_diff[i];
p->vm_stat_diff[i] = 0;
+ local_irq_restore(flags);
+ atomic_long_add(v, &zone->vm_stat[i]);
+ global_diff[i] += v;
#ifdef CONFIG_NUMA
/* 3 seconds idle till flush */
p->expire = 3;
#endif
- local_irq_restore(flags);
}
#ifdef CONFIG_NUMA
/*
@@ -329,7 +330,7 @@ void refresh_cpu_vm_stats(int cpu)
* Check if there are pages remaining in this pageset
* if not then there is nothing to expire.
*/
- if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
+ if (!p->expire || !p->pcp.count)
continue;
/*
@@ -344,13 +345,14 @@ void refresh_cpu_vm_stats(int cpu)
if (p->expire)
continue;
- if (p->pcp[0].count)
- drain_zone_pages(zone, p->pcp + 0);
-
- if (p->pcp[1].count)
- drain_zone_pages(zone, p->pcp + 1);
+ if (p->pcp.count)
+ drain_zone_pages(zone, &p->pcp);
#endif
}
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (global_diff[i])
+ atomic_long_add(global_diff[i], &vm_stat[i]);
}
#endif
@@ -681,20 +683,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n pagesets");
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
- int j;
pageset = zone_pcp(zone, i);
- for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
- seq_printf(m,
- "\n cpu: %i pcp: %i"
- "\n count: %i"
- "\n high: %i"
- "\n batch: %i",
- i, j,
- pageset->pcp[j].count,
- pageset->pcp[j].high,
- pageset->pcp[j].batch);
- }
+ seq_printf(m,
+ "\n cpu: %i"
+ "\n count: %i"
+ "\n high: %i"
+ "\n batch: %i",
+ i,
+ pageset->pcp.count,
+ pageset->pcp.high,
+ pageset->pcp.batch);
#ifdef CONFIG_SMP
seq_printf(m, "\n vm stats threshold: %d",
pageset->stat_threshold);