Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   1
-rw-r--r--  mm/bounce.c         |  25
-rw-r--r--  mm/filemap.c        |   1
-rw-r--r--  mm/fremap.c         |   2
-rw-r--r--  mm/hugetlb.c        |   8
-rw-r--r--  mm/memory.c         |  23
-rw-r--r--  mm/mempolicy.c      |  86
-rw-r--r--  mm/migrate.c        |  13
-rw-r--r--  mm/mmap.c           |   6
-rw-r--r--  mm/nommu.c          |   2
-rw-r--r--  mm/page-writeback.c |   4
-rw-r--r--  mm/page_alloc.c     |  15
-rw-r--r--  mm/page_io.c        |  12
-rw-r--r--  mm/readahead.c      |   1
-rw-r--r--  mm/slab.c           |  14
-rw-r--r--  mm/slub.c           |  61
-rw-r--r--  mm/sparse.c         |  14
-rw-r--r--  mm/vmscan.c         |  69
18 files changed, 259 insertions(+), 98 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index e24d348083c..a7609cbcb00 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,7 @@ config SPLIT_PTLOCK_CPUS
int
default "4096" if ARM && !CPU_CACHE_VIPT
default "4096" if PARISC && !PA20
+ default "4096" if XEN
default "4"
#
diff --git a/mm/bounce.c b/mm/bounce.c
index 179fe38a241..3b549bf31f7 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -140,26 +140,19 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
mempool_free(bvec->bv_page, pool);
}
- bio_endio(bio_orig, bio_orig->bi_size, err);
+ bio_endio(bio_orig, err);
bio_put(bio);
}
-static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
+static void bounce_end_io_write(struct bio *bio, int err)
{
- if (bio->bi_size)
- return 1;
-
bounce_end_io(bio, page_pool, err);
- return 0;
}
-static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
+static void bounce_end_io_write_isa(struct bio *bio, int err)
{
- if (bio->bi_size)
- return 1;
bounce_end_io(bio, isa_page_pool, err);
- return 0;
}
static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
@@ -172,22 +165,14 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
bounce_end_io(bio, pool, err);
}
-static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
+static void bounce_end_io_read(struct bio *bio, int err)
{
- if (bio->bi_size)
- return 1;
-
__bounce_end_io_read(bio, page_pool, err);
- return 0;
}
-static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
+static void bounce_end_io_read_isa(struct bio *bio, int err)
{
- if (bio->bi_size)
- return 1;
-
__bounce_end_io_read(bio, isa_page_pool, err);
- return 0;
}
static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
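The bounce.c changes above track the block layer's new completion convention: bio end_io callbacks are no longer invoked for partial completions, so the old bytes_done/bi_size re-check is gone and the callbacks return void and run exactly once. A minimal sketch of a callback under the new convention (kernel context against this series' block API, not standalone-buildable; the page handling is illustrative):

	static void my_end_io(struct bio *bio, int err)
	{
		struct page *page = bio->bi_io_vec[0].bv_page;

		/* No bi_size check: partial completions never reach us now. */
		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			SetPageError(page);

		unlock_page(page);
		bio_put(bio);	/* nothing to return to the block layer */
	}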
diff --git a/mm/filemap.c b/mm/filemap.c
index 90b657b50f8..15c8413ee92 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1388,6 +1388,7 @@ retry_find:
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(vmf->pgoff >= size)) {
unlock_page(page);
+ page_cache_release(page);
goto outside_data_content;
}
diff --git a/mm/fremap.c b/mm/fremap.c
index c395b1abf08..95bcb5641c7 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -160,7 +160,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
goto out;
- if (!vma->vm_flags & VM_CAN_NONLINEAR)
+ if (!(vma->vm_flags & VM_CAN_NONLINEAR))
goto out;
if (end <= start || start < vma->vm_start || end > vma->vm_end)
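The one-character fremap.c fix above corrects a classic C precedence bug: ! binds tighter than &, so !vma->vm_flags & VM_CAN_NONLINEAR evaluates as (!vma->vm_flags) & VM_CAN_NONLINEAR, which is 0 whenever any flag is set, and the guard never fired. A self-contained demonstration (illustrative flag values, plain userspace C):

	#include <stdio.h>

	#define VM_CAN_NONLINEAR 0x08000000ul	/* illustrative value */

	int main(void)
	{
		unsigned long vm_flags = 0x73ul; /* flags set, NONLINEAR absent */

		/* Buggy form: !vm_flags is 0, so the test is always false. */
		printf("buggy: %lu\n", !vm_flags & VM_CAN_NONLINEAR);

		/* Fixed form: tests the intended bit. */
		printf("fixed: %d\n", !(vm_flags & VM_CAN_NONLINEAR));
		return 0;
	}

With the buggy form the check silently passes for every vma; with the fix, vmas lacking VM_CAN_NONLINEAR are rejected as intended.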
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d7ca59d66c5..eab8c428cc9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -42,7 +42,7 @@ static void clear_huge_page(struct page *page, unsigned long addr)
might_sleep();
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
cond_resched();
- clear_user_highpage(page + i, addr);
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
}
}
@@ -71,8 +71,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
{
int nid;
struct page *page = NULL;
+ struct mempolicy *mpol;
struct zonelist *zonelist = huge_zonelist(vma, address,
- htlb_alloc_mask);
+ htlb_alloc_mask, &mpol);
struct zone **z;
for (z = zonelist->zones; *z; z++) {
@@ -87,6 +88,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
break;
}
}
+ mpol_free(mpol); /* unref if mpol !NULL */
return page;
}
@@ -643,7 +645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(&mm->page_table_lock);
ret = hugetlb_fault(mm, vma, vaddr, 0);
spin_lock(&mm->page_table_lock);
- if (!(ret & VM_FAULT_MAJOR))
+ if (!(ret & VM_FAULT_ERROR))
continue;
remainder = 0;
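huge_zonelist() now returns a possibly reference-counted mempolicy through its new @mpol argument; the hugetlb side owns that reference across the allocation and drops it afterwards, as dequeue_huge_page() above does. The caller pattern, sketched (kernel context):

	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
						htlb_alloc_mask, &mpol);

	/* ... scan zonelist->zones and dequeue a huge page ... */

	mpol_free(mpol);	/* no-op when mpol is NULL, the common case */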
diff --git a/mm/memory.c b/mm/memory.c
index ca8cac11bd2..f82b359b274 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1639,6 +1639,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *old_page, *new_page;
pte_t entry;
int reuse = 0, ret = 0;
+ int page_mkwrite = 0;
struct page *dirty_page = NULL;
old_page = vm_normal_page(vma, address, orig_pte);
@@ -1687,6 +1688,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_cache_release(old_page);
if (!pte_same(*page_table, orig_pte))
goto unlock;
+
+ page_mkwrite = 1;
}
dirty_page = old_page;
get_page(dirty_page);
@@ -1774,7 +1777,7 @@ unlock:
* do_no_page is protected similarly.
*/
wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page);
+ set_page_dirty_balance(dirty_page, page_mkwrite);
put_page(dirty_page);
}
return ret;
@@ -2307,13 +2310,14 @@ oom:
* do not need to flush old virtual caches or the TLB.
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
+ * but allow concurrent faults), and pte neither mapped nor locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
+ pte_t *page_table;
spinlock_t *ptl;
struct page *page;
pte_t entry;
@@ -2321,13 +2325,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
+ int page_mkwrite = 0;
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
- pte_unmap(page_table);
BUG_ON(vma->vm_flags & VM_PFNMAP);
if (likely(vma->vm_ops->fault)) {
@@ -2398,6 +2402,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
anon = 1; /* no anon but release vmf.page */
goto out;
}
+ page_mkwrite = 1;
}
}
@@ -2453,7 +2458,7 @@ out_unlocked:
if (anon)
page_cache_release(vmf.page);
else if (dirty_page) {
- set_page_dirty_balance(dirty_page);
+ set_page_dirty_balance(dirty_page, page_mkwrite);
put_page(dirty_page);
}
@@ -2468,8 +2473,8 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
- return __do_fault(mm, vma, address, page_table, pmd, pgoff,
- flags, orig_pte);
+ pte_unmap(page_table);
+ return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
@@ -2552,9 +2557,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
pgoff = pte_to_pgoff(orig_pte);
-
- return __do_fault(mm, vma, address, page_table, pmd, pgoff,
- flags, orig_pte);
+ return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
/*
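Net effect of the memory.c refactoring: __do_fault() now owns the pte map/lock lifecycle itself, and callers such as do_linear_fault() and do_nonlinear_fault() drop the mapping first, since the ->fault handler may sleep and the old convention of entering with the pte mapped was unsafe. The new calling contract, in brief (kernel context, mirroring do_linear_fault() above):

	/* pte mapped but not locked on entry to the caller ... */
	pte_unmap(page_table);	/* ... and unmapped before the fault */
	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);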
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71b84b45154..3d6ac9505d0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -149,7 +149,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
lower zones etc. Avoid empty zones because the memory allocator
doesn't like them. If you implement node hot removal you
have to fix that. */
- k = policy_zone;
+ k = MAX_NR_ZONES - 1;
while (1) {
for_each_node_mask(nd, *nodes) {
struct zone *z = &NODE_DATA(nd)->node_zones[k];
@@ -955,6 +955,11 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
goto out;
}
+ if (!nodes_subset(new, node_online_map)) {
+ err = -EINVAL;
+ goto out;
+ }
+
err = security_task_movememory(task);
if (err)
goto out;
@@ -1072,21 +1077,37 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
#endif
-/* Return effective policy for a VMA */
+/*
+ * get_vma_policy(@task, @vma, @addr)
+ * @task - task for fallback if vma policy == default
+ * @vma - virtual memory area whose policy is sought
+ * @addr - address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to @task or system default policy, as necessary.
+ * Returned policy has extra reference count if shared, vma,
+ * or some other task's policy [show_numa_map() can pass
+ * @task != current]. It is the caller's responsibility to
+ * free the reference in these cases.
+ */
static struct mempolicy * get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
+ int shared_pol = 0;
if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy)
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
pol = vma->vm_ops->get_policy(vma, addr);
- else if (vma->vm_policy &&
+ shared_pol = 1; /* if pol non-NULL, add ref below */
+ } else if (vma->vm_policy &&
vma->vm_policy->policy != MPOL_DEFAULT)
pol = vma->vm_policy;
}
if (!pol)
pol = &default_policy;
+ else if (!shared_pol && pol != current->mempolicy)
+ mpol_get(pol); /* vma or other task's policy */
return pol;
}
@@ -1202,19 +1223,45 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
}
#ifdef CONFIG_HUGETLBFS
-/* Return a zonelist suitable for a huge page allocation. */
+/*
+ * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
+ * @vma = virtual memory area whose policy is sought
+ * @addr = address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags = for requested zone
+ * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ *
+ * Returns a zonelist suitable for a huge page allocation.
+ * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If it is also a policy for which get_vma_policy() returns an extra
+ * reference, we must hold that reference until after allocation.
+ * In that case, return policy via @mpol so hugetlb allocation can drop
+ * the reference. For non-'BIND referenced policies, we can/do drop the
+ * reference here, so the caller doesn't need to know about the special case
+ * for default and current task policy.
+ */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
- gfp_t gfp_flags)
+ gfp_t gfp_flags, struct mempolicy **mpol)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct zonelist *zl;
+ *mpol = NULL; /* probably no unref needed */
if (pol->policy == MPOL_INTERLEAVE) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+ __mpol_free(pol); /* finished with pol */
return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
}
- return zonelist_policy(GFP_HIGHUSER, pol);
+
+ zl = zonelist_policy(GFP_HIGHUSER, pol);
+ if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
+ if (pol->policy != MPOL_BIND)
+ __mpol_free(pol); /* finished with pol */
+ else
+ *mpol = pol; /* unref needed after allocation */
+ }
+ return zl;
}
#endif
@@ -1259,6 +1306,7 @@ struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct zonelist *zl;
cpuset_update_task_memory_state();
@@ -1268,7 +1316,19 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
return alloc_page_interleave(gfp, 0, nid);
}
- return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
+ zl = zonelist_policy(gfp, pol);
+ if (pol != &default_policy && pol != current->mempolicy) {
+ /*
+ * slow path: ref counted policy -- shared or vma
+ */
+ struct page *page = __alloc_pages(gfp, 0, zl);
+ __mpol_free(pol);
+ return page;
+ }
+ /*
+ * fast path: default or task policy
+ */
+ return __alloc_pages(gfp, 0, zl);
}
/**
@@ -1867,6 +1927,7 @@ int show_numa_map(struct seq_file *m, void *v)
struct numa_maps *md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
+ struct mempolicy *pol;
int n;
char buffer[50];
@@ -1877,8 +1938,13 @@ int show_numa_map(struct seq_file *m, void *v)
if (!md)
return 0;
- mpol_to_str(buffer, sizeof(buffer),
- get_vma_policy(priv->task, vma, vma->vm_start));
+ pol = get_vma_policy(priv->task, vma, vma->vm_start);
+ mpol_to_str(buffer, sizeof(buffer), pol);
+ /*
+ * unref shared or other task's mempolicy
+ */
+ if (pol != &default_policy && pol != current->mempolicy)
+ __mpol_free(pol);
seq_printf(m, "%08lx %s", vma->vm_start, buffer);
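get_vma_policy() takes an extra reference only for shared policies, vma policies, and other tasks' policies, never for the caller's own task policy or the system default, so the fast paths stay refcount-free. Every caller therefore pairs the lookup with the same conditional drop, as alloc_page_vma() and show_numa_map() above do (kernel context):

	struct mempolicy *pol = get_vma_policy(task, vma, addr);

	/* ... use pol for the allocation or for reporting ... */

	if (pol != &default_policy && pol != current->mempolicy)
		__mpol_free(pol);	/* drop the reference taken for us */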
diff --git a/mm/migrate.c b/mm/migrate.c
index 37c73b90200..07f22d4a431 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -611,6 +611,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
int rc = 0;
int *result = NULL;
struct page *newpage = get_new_page(page, private, &result);
+ int rcu_locked = 0;
if (!newpage)
return -ENOMEM;
@@ -636,8 +637,13 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* we cannot notice that anon_vma is freed while we migrates a page.
* This rcu_read_lock() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
+ * File-cache pages may use ->writepage() or lock_page() during
+ * migration, so only anon pages need this care here.
*/
- rcu_read_lock();
+ if (PageAnon(page)) {
+ rcu_read_lock();
+ rcu_locked = 1;
+ }
/*
* This is a corner case handling.
* When a new swap-cache is read into, it is linked to LRU
@@ -656,7 +662,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
if (rc)
remove_migration_ptes(page, page);
rcu_unlock:
- rcu_read_unlock();
+ if (rcu_locked)
+ rcu_read_unlock();
unlock:
@@ -965,7 +972,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
* array. Return various errors if the user did something wrong.
*/
for (i = 0; i < nr_pages; i++) {
- const void *p;
+ const void __user *p;
err = -EFAULT;
if (get_user(p, pages + i))
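Because only anon pages need the anon_vma pinned, unmap_and_move() now takes rcu_read_lock() conditionally and records that it did, since the unlock sits on an exit path shared with the file-backed case. The shape of the pattern (kernel context):

	int rcu_locked = 0;

	if (PageAnon(page)) {
		rcu_read_lock();	/* keep anon_vma from being freed */
		rcu_locked = 1;
	}

	/* ... try_to_unmap(), move_to_new_page(), error paths ... */

	if (rcu_locked)
		rcu_read_unlock();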
diff --git a/mm/mmap.c b/mm/mmap.c
index b6537211b9c..0d40e66c841 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -93,7 +93,7 @@ atomic_t vm_committed_space = ATOMIC_INIT(0);
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
-int __vm_enough_memory(long pages, int cap_sys_admin)
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
unsigned long free, allowed;
@@ -166,7 +166,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
/* Don't let a single process grow too big:
leave 3% of the size of this process for other processes */
- allowed -= current->mm->total_vm / 32;
+ allowed -= mm->total_vm / 32;
/*
* cast `allowed' as a signed long because vm_committed_space
@@ -2077,7 +2077,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
if (__vma && __vma->vm_start < vma->vm_end)
return -ENOMEM;
if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory(vma_pages(vma)))
+ security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
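Threading the mm_struct through __vm_enough_memory() lets accounting be charged to a specific address space rather than implicitly to current->mm, which matters when an LSM checks on behalf of another task. A sketch of how a capability-style hook forwards it under this series (the body is illustrative, not the exact upstream hook):

	int cap_vm_enough_memory(struct mm_struct *mm, long pages)
	{
		int cap_sys_admin = capable(CAP_SYS_ADMIN) ? 1 : 0;

		return __vm_enough_memory(mm, pages, cap_sys_admin);
	}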
diff --git a/mm/nommu.c b/mm/nommu.c
index 9eef6a39855..8ed0cb43118 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1270,7 +1270,7 @@ EXPORT_SYMBOL(get_unmapped_area);
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
-int __vm_enough_memory(long pages, int cap_sys_admin)
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
unsigned long free, allowed;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 63512a9ed57..44720363374 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -274,9 +274,9 @@ static void balance_dirty_pages(struct address_space *mapping)
pdflush_operation(background_writeout, 0);
}
-void set_page_dirty_balance(struct page *page)
+void set_page_dirty_balance(struct page *page, int page_mkwrite)
{
- if (set_page_dirty(page)) {
+ if (set_page_dirty(page) || page_mkwrite) {
struct address_space *mapping = page_mapping(page);
if (mapping)
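A ->page_mkwrite() invocation can leave the filesystem with work to throttle even when set_page_dirty() reports the page was already dirty, hence the new flag forces the balance path in that case. For context, the function with the remainder of its body filled in (reconstructed from the same kernel series; treat as a sketch):

	void set_page_dirty_balance(struct page *page, int page_mkwrite)
	{
		if (set_page_dirty(page) || page_mkwrite) {
			struct address_space *mapping = page_mapping(page);

			if (mapping)
				balance_dirty_pages_ratelimited(mapping);
		}
	}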
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3da85b81dab..1a8c59571cb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1157,6 +1157,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
+ enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */
zonelist_scan:
/*
@@ -1166,6 +1167,18 @@ zonelist_scan:
z = zonelist->zones;
do {
+ /*
+ * In NUMA, this could be a policy zonelist which contains
+ * zones that may not be allowed by the current gfp_mask.
+ * Check the zone is allowed by the current flags
+ */
+ if (unlikely(alloc_should_filter_zonelist(zonelist))) {
+ if (highest_zoneidx == -1)
+ highest_zoneidx = gfp_zone(gfp_mask);
+ if (zone_idx(*z) > highest_zoneidx)
+ continue;
+ }
+
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
@@ -2332,6 +2345,8 @@ static int __cpuinit process_zones(int cpu)
return 0;
bad:
for_each_zone(dzone) {
+ if (!populated_zone(dzone))
+ continue;
if (dzone == zone)
break;
kfree(zone_pcp(dzone, cpu));
diff --git a/mm/page_io.c b/mm/page_io.c
index dbffec0d78c..3b97f685027 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -44,14 +44,11 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
return bio;
}
-static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
+static void end_swap_bio_write(struct bio *bio, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
- if (bio->bi_size)
- return 1;
-
if (!uptodate) {
SetPageError(page);
/*
@@ -71,17 +68,13 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
}
end_page_writeback(page);
bio_put(bio);
- return 0;
}
-int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
+void end_swap_bio_read(struct bio *bio, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
- if (bio->bi_size)
- return 1;
-
if (!uptodate) {
SetPageError(page);
ClearPageUptodate(page);
@@ -94,7 +87,6 @@ int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
}
unlock_page(page);
bio_put(bio);
- return 0;
}
/*
diff --git a/mm/readahead.c b/mm/readahead.c
index 39bf45d4332..be20c9d699d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -15,6 +15,7 @@
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
+#include <linux/pagemap.h>
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
diff --git a/mm/slab.c b/mm/slab.c
index a684778b2b4..6f6abef83a1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -883,6 +883,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
*/
static int use_alien_caches __read_mostly = 1;
+static int numa_platform __read_mostly = 1;
static int __init noaliencache_setup(char *s)
{
use_alien_caches = 0;
@@ -1399,8 +1400,10 @@ void __init kmem_cache_init(void)
int order;
int node;
- if (num_possible_nodes() == 1)
+ if (num_possible_nodes() == 1) {
use_alien_caches = 0;
+ numa_platform = 0;
+ }
for (i = 0; i < NUM_INIT_LISTS; i++) {
kmem_list3_init(&initkmem_list3[i]);
@@ -3558,7 +3561,14 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
check_irq_off();
objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
- if (cache_free_alien(cachep, objp))
+ /*
+ * Skip calling cache_free_alien() when the platform is not numa.
+ * This will avoid cache misses that happen while accessing slabp (which
+ * is per page memory reference) to get nodeid. Instead use a global
+ * variable to skip the call, which is mostly likely to be present in
+ * the cache.
+ */
+ if (numa_platform && cache_free_alien(cachep, objp))
return;
if (likely(ac->avail < ac->limit)) {
diff --git a/mm/slub.c b/mm/slub.c
index 69d02e3e439..addb20a6d67 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -986,7 +986,9 @@ out:
__setup("slub_debug", setup_slub_debug);
-static void kmem_cache_open_debug_check(struct kmem_cache *s)
+static unsigned long kmem_cache_flags(unsigned long objsize,
+ unsigned long flags, const char *name,
+ void (*ctor)(void *, struct kmem_cache *, unsigned long))
{
/*
* The page->offset field is only 16 bit wide. This is an offset
@@ -1000,19 +1002,21 @@ static void kmem_cache_open_debug_check(struct kmem_cache *s)
* Debugging or ctor may create a need to move the free
* pointer. Fail if this happens.
*/
- if (s->objsize >= 65535 * sizeof(void *)) {
- BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON |
+ if (objsize >= 65535 * sizeof(void *)) {
+ BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
- BUG_ON(s->ctor);
- }
- else
+ BUG_ON(ctor);
+ } else {
/*
* Enable debugging if selected on the kernel commandline.
*/
if (slub_debug && (!slub_debug_slabs ||
- strncmp(slub_debug_slabs, s->name,
+ strncmp(slub_debug_slabs, name,
strlen(slub_debug_slabs)) == 0))
- s->flags |= slub_debug;
+ flags |= slub_debug;
+ }
+
+ return flags;
}
#else
static inline void setup_object_debug(struct kmem_cache *s,
@@ -1029,7 +1033,12 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
static inline int check_object(struct kmem_cache *s, struct page *page,
void *object, int active) { return 1; }
static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
-static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {}
+static inline unsigned long kmem_cache_flags(unsigned long objsize,
+ unsigned long flags, const char *name,
+ void (*ctor)(void *, struct kmem_cache *, unsigned long))
+{
+ return flags;
+}
#define slub_debug 0
#endif
/*
@@ -1877,9 +1886,16 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
- page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
+ page = new_slab(kmalloc_caches, gfpflags, node);
BUG_ON(!page);
+ if (page_to_nid(page) != node) {
+ printk(KERN_ERR "SLUB: Unable to allocate memory from "
+ "node %d\n", node);
+ printk(KERN_ERR "SLUB: Allocating a useless per node structure "
+ "in order to be able to continue\n");
+ }
+
n = page->freelist;
BUG_ON(!n);
page->freelist = get_freepointer(kmalloc_caches, n);
@@ -2081,9 +2097,8 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
s->name = name;
s->ctor = ctor;
s->objsize = size;
- s->flags = flags;
s->align = align;
- kmem_cache_open_debug_check(s);
+ s->flags = kmem_cache_flags(size, flags, name, ctor);
if (!calculate_sizes(s))
goto error;
@@ -2653,7 +2668,7 @@ static int slab_unmergeable(struct kmem_cache *s)
}
static struct kmem_cache *find_mergeable(size_t size,
- size_t align, unsigned long flags,
+ size_t align, unsigned long flags, const char *name,
void (*ctor)(void *, struct kmem_cache *, unsigned long))
{
struct kmem_cache *s;
@@ -2667,6 +2682,7 @@ static struct kmem_cache *find_mergeable(size_t size,
size = ALIGN(size, sizeof(void *));
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
+ flags = kmem_cache_flags(size, flags, name, NULL);
list_for_each_entry(s, &slab_caches, list) {
if (slab_unmergeable(s))
@@ -2675,8 +2691,7 @@ static struct kmem_cache *find_mergeable(size_t size,
if (size > s->size)
continue;
- if (((flags | slub_debug) & SLUB_MERGE_SAME) !=
- (s->flags & SLUB_MERGE_SAME))
+ if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
continue;
/*
* Check if alignment is compatible.
@@ -2700,7 +2715,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
struct kmem_cache *s;
down_write(&slub_lock);
- s = find_mergeable(size, align, flags, ctor);
+ s = find_mergeable(size, align, flags, name, ctor);
if (s) {
s->refcount++;
/*
@@ -3112,7 +3127,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
unsigned long flags;
struct page *page;
- if (!atomic_read(&n->nr_slabs))
+ if (!atomic_long_read(&n->nr_slabs))
continue;
spin_lock_irqsave(&n->list_lock, flags);
@@ -3247,7 +3262,7 @@ static unsigned long slab_objects(struct kmem_cache *s,
}
if (flags & SO_FULL) {
- int full_slabs = atomic_read(&n->nr_slabs)
+ int full_slabs = atomic_long_read(&n->nr_slabs)
- per_cpu[node]
- n->nr_partial;
@@ -3283,7 +3298,7 @@ static int any_slab_objects(struct kmem_cache *s)
for_each_node(node) {
struct kmem_cache_node *n = get_node(s, node);
- if (n->nr_partial || atomic_read(&n->nr_slabs))
+ if (n->nr_partial || atomic_long_read(&n->nr_slabs))
return 1;
}
return 0;
@@ -3806,7 +3821,9 @@ static int __init slab_sysfs_init(void)
list_for_each_entry(s, &slab_caches, list) {
err = sysfs_slab_add(s);
- BUG_ON(err);
+ if (err)
+ printk(KERN_ERR "SLUB: Unable to add boot slab %s"
+ " to sysfs\n", s->name);
}
while (alias_list) {
@@ -3814,7 +3831,9 @@ static int __init slab_sysfs_init(void)
alias_list = alias_list->next;
err = sysfs_slab_alias(al->s, al->name);
- BUG_ON(err);
+ if (err)
+ printk(KERN_ERR "SLUB: Unable to add boot slab alias"
+ " %s to sysfs\n", s->name);
kfree(al);
}
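Turning kmem_cache_open_debug_check() into kmem_cache_flags(), a pure function of (objsize, flags, name, ctor), is what lets find_mergeable() evaluate a prospective cache's final debug flags before any kmem_cache exists, instead of OR-ing slub_debug into the comparison blindly. The enabled check, condensed (kernel context):

	flags = kmem_cache_flags(size, flags, name, NULL);

	list_for_each_entry(s, &slab_caches, list) {
		/* both sides now carry their final debug bits */
		if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
			continue;
		/* ... size and alignment compatibility checks ... */
	}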
diff --git a/mm/sparse.c b/mm/sparse.c
index 3047bf06c1f..239f5a720d3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -41,6 +41,15 @@ int page_to_nid(struct page *page)
return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);
+
+static void set_section_nid(unsigned long section_nr, int nid)
+{
+ section_to_node_table[section_nr] = nid;
+}
+#else /* !NODE_NOT_IN_PAGE_FLAGS */
+static inline void set_section_nid(unsigned long section_nr, int nid)
+{
+}
#endif
#ifdef CONFIG_SPARSEMEM_EXTREME
@@ -68,10 +77,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
struct mem_section *section;
int ret = 0;
-#ifdef NODE_NOT_IN_PAGE_FLAGS
- section_to_node_table[section_nr] = nid;
-#endif
-
if (mem_section[root])
return -EEXIST;
@@ -148,6 +153,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
struct mem_section *ms;
sparse_index_init(section, nid);
+ set_section_nid(section, nid);
ms = __nr_to_section(section);
if (!ms->section_mem_map)
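Hiding the NODE_NOT_IN_PAGE_FLAGS table update behind set_section_nid(), with an empty inline stub in the other configuration, keeps #ifdefs out of memory_present() and any future caller. The pattern in isolation (self-contained; config symbol and table names here are hypothetical):

	#ifdef MY_NODE_TABLE	/* stand-in for NODE_NOT_IN_PAGE_FLAGS */
	static int my_section_to_node[64];

	static void my_set_section_nid(unsigned long section_nr, int nid)
	{
		my_section_to_node[section_nr] = nid;
	}
	#else
	static inline void my_set_section_nid(unsigned long section_nr, int nid)
	{
		/* compiled out; callers need no #ifdef of their own */
	}
	#endif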
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d419e10e3da..a6e65d02499 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -271,6 +271,12 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
+/* Request for sync pageout. */
+enum pageout_io {
+ PAGEOUT_IO_ASYNC,
+ PAGEOUT_IO_SYNC,
+};
+
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
@@ -287,7 +293,8 @@ typedef enum {
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping,
+ enum pageout_io sync_writeback)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -346,6 +353,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
+
+ /*
+ * Wait on writeback if requested to. This happens when
+ * direct reclaiming a large contiguous area and the
+ * first attempt to free a range of pages fails.
+ */
+ if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+ wait_on_page_writeback(page);
+
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
@@ -423,7 +439,8 @@ cannot_free:
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct scan_control *sc)
+ struct scan_control *sc,
+ enum pageout_io sync_writeback)
{
LIST_HEAD(ret_pages);
struct pagevec freed_pvec;
@@ -458,8 +475,23 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
- if (PageWriteback(page))
- goto keep_locked;
+ may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+ (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+ if (PageWriteback(page)) {
+ /*
+ * Synchronous reclaim is performed in two passes,
+ * first an asynchronous pass over the list to
+ * start parallel writeback, and a second synchronous
+ * pass to wait for the IO to complete. Wait here
+ * for any page for which writeback has already
+ * started.
+ */
+ if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
+ wait_on_page_writeback(page);
+ else
+ goto keep_locked;
+ }
referenced = page_referenced(page, 1);
/* In active use or really unfreeable? Activate it. */
@@ -478,8 +510,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
#endif /* CONFIG_SWAP */
mapping = page_mapping(page);
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
/*
* The page is mapped into the page tables of one or more
@@ -505,7 +535,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
/* Page is dirty, try to write it out here */
- switch(pageout(page, mapping)) {
+ switch (pageout(page, mapping, sync_writeback)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
@@ -777,6 +807,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
(sc->order > PAGE_ALLOC_COSTLY_ORDER)?
ISOLATE_BOTH : ISOLATE_INACTIVE);
nr_active = clear_active_flags(&page_list);
+ __count_vm_events(PGDEACTIVATE, nr_active);
__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
__mod_zone_page_state(zone, NR_INACTIVE,
@@ -785,7 +816,29 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
spin_unlock_irq(&zone->lru_lock);
nr_scanned += nr_scan;
- nr_freed = shrink_page_list(&page_list, sc);
+ nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+ /*
+ * If we are direct reclaiming for contiguous pages and we do
+ * not reclaim everything in the list, try again and wait
+ * for IO to complete. This will stall high-order allocations
+ * but that should be acceptable to the caller
+ */
+ if (nr_freed < nr_taken && !current_is_kswapd() &&
+ sc->order > PAGE_ALLOC_COSTLY_ORDER) {
+ congestion_wait(WRITE, HZ/10);
+
+ /*
+ * The attempt at page out may have made some
+ * of the pages active, mark them inactive again.
+ */
+ nr_active = clear_active_flags(&page_list);
+ count_vm_events(PGDEACTIVATE, nr_active);
+
+ nr_freed += shrink_page_list(&page_list, sc,
+ PAGEOUT_IO_SYNC);
+ }
+
nr_reclaimed += nr_freed;
local_irq_disable();
if (current_is_kswapd()) {
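The vmscan changes amount to a two-pass protocol for high-order direct reclaim: an asynchronous pass over the isolated list to start writeback everywhere, a short congestion wait, then a synchronous pass that waits on each page's IO. Condensed from shrink_inactive_list() above (kernel context):

	nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);

	if (nr_freed < nr_taken && !current_is_kswapd() &&
	    sc->order > PAGE_ALLOC_COSTLY_ORDER) {
		congestion_wait(WRITE, HZ/10);	/* give IO time to complete */

		/* writeback may have re-activated some pages */
		nr_active = clear_active_flags(&page_list);
		count_vm_events(PGDEACTIVATE, nr_active);

		nr_freed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
	}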