diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 1 | ||||
-rw-r--r-- | mm/bounce.c | 25 | ||||
-rw-r--r-- | mm/filemap.c | 1 | ||||
-rw-r--r-- | mm/fremap.c | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 23 | ||||
-rw-r--r-- | mm/mempolicy.c | 86 | ||||
-rw-r--r-- | mm/migrate.c | 13 | ||||
-rw-r--r-- | mm/mmap.c | 6 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 15 | ||||
-rw-r--r-- | mm/page_io.c | 12 | ||||
-rw-r--r-- | mm/readahead.c | 1 | ||||
-rw-r--r-- | mm/slab.c | 14 | ||||
-rw-r--r-- | mm/slub.c | 61 | ||||
-rw-r--r-- | mm/sparse.c | 14 | ||||
-rw-r--r-- | mm/vmscan.c | 69 |
18 files changed, 259 insertions, 98 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e24d348083c..a7609cbcb00 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,6 +137,7 @@ config SPLIT_PTLOCK_CPUS int default "4096" if ARM && !CPU_CACHE_VIPT default "4096" if PARISC && !PA20 + default "4096" if XEN default "4" # diff --git a/mm/bounce.c b/mm/bounce.c index 179fe38a241..3b549bf31f7 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -140,26 +140,19 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) mempool_free(bvec->bv_page, pool); } - bio_endio(bio_orig, bio_orig->bi_size, err); + bio_endio(bio_orig, err); bio_put(bio); } -static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) +static void bounce_end_io_write(struct bio *bio, int err) { - if (bio->bi_size) - return 1; - bounce_end_io(bio, page_pool, err); - return 0; } -static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) +static void bounce_end_io_write_isa(struct bio *bio, int err) { - if (bio->bi_size) - return 1; bounce_end_io(bio, isa_page_pool, err); - return 0; } static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) @@ -172,22 +165,14 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) bounce_end_io(bio, pool, err); } -static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) +static void bounce_end_io_read(struct bio *bio, int err) { - if (bio->bi_size) - return 1; - __bounce_end_io_read(bio, page_pool, err); - return 0; } -static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) +static void bounce_end_io_read_isa(struct bio *bio, int err) { - if (bio->bi_size) - return 1; - __bounce_end_io_read(bio, isa_page_pool, err); - return 0; } static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, diff --git a/mm/filemap.c b/mm/filemap.c index 90b657b50f8..15c8413ee92 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1388,6 +1388,7 @@ retry_find: size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (unlikely(vmf->pgoff >= size)) { unlock_page(page); + page_cache_release(page); goto outside_data_content; } diff --git a/mm/fremap.c b/mm/fremap.c index c395b1abf08..95bcb5641c7 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -160,7 +160,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) goto out; - if (!vma->vm_flags & VM_CAN_NONLINEAR) + if (!(vma->vm_flags & VM_CAN_NONLINEAR)) goto out; if (end <= start || start < vma->vm_start || end > vma->vm_end) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d7ca59d66c5..eab8c428cc9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -42,7 +42,7 @@ static void clear_huge_page(struct page *page, unsigned long addr) might_sleep(); for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { cond_resched(); - clear_user_highpage(page + i, addr); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); } } @@ -71,8 +71,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, { int nid; struct page *page = NULL; + struct mempolicy *mpol; struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask); + htlb_alloc_mask, &mpol); struct zone **z; for (z = zonelist->zones; *z; z++) { @@ -87,6 +88,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, break; } } + mpol_free(mpol); /* unref if mpol !NULL */ return page; } @@ -643,7 +645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); ret = hugetlb_fault(mm, vma, vaddr, 0); spin_lock(&mm->page_table_lock); - if (!(ret & VM_FAULT_MAJOR)) + if (!(ret & VM_FAULT_ERROR)) continue; remainder = 0; diff --git a/mm/memory.c b/mm/memory.c index ca8cac11bd2..f82b359b274 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1639,6 +1639,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *old_page, *new_page; pte_t entry; int reuse = 0, ret = 0; + int page_mkwrite = 0; struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); @@ -1687,6 +1688,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(old_page); if (!pte_same(*page_table, orig_pte)) goto unlock; + + page_mkwrite = 1; } dirty_page = old_page; get_page(dirty_page); @@ -1774,7 +1777,7 @@ unlock: * do_no_page is protected similarly. */ wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); put_page(dirty_page); } return ret; @@ -2307,13 +2310,14 @@ oom: * do not need to flush old virtual caches or the TLB. * * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. + * but allow concurrent faults), and pte neither mapped nor locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, + unsigned long address, pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { + pte_t *page_table; spinlock_t *ptl; struct page *page; pte_t entry; @@ -2321,13 +2325,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *dirty_page = NULL; struct vm_fault vmf; int ret; + int page_mkwrite = 0; vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; - pte_unmap(page_table); BUG_ON(vma->vm_flags & VM_PFNMAP); if (likely(vma->vm_ops->fault)) { @@ -2398,6 +2402,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, anon = 1; /* no anon but release vmf.page */ goto out; } + page_mkwrite = 1; } } @@ -2453,7 +2458,7 @@ out_unlocked: if (anon) page_cache_release(vmf.page); else if (dirty_page) { - set_page_dirty_balance(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); put_page(dirty_page); } @@ -2468,8 +2473,8 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); - return __do_fault(mm, vma, address, page_table, pmd, pgoff, - flags, orig_pte); + pte_unmap(page_table); + return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } @@ -2552,9 +2557,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } pgoff = pte_to_pgoff(orig_pte); - - return __do_fault(mm, vma, address, page_table, pmd, pgoff, - flags, orig_pte); + return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 71b84b45154..3d6ac9505d0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -149,7 +149,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) lower zones etc. Avoid empty zones because the memory allocator doesn't like them. If you implement node hot removal you have to fix that. */ - k = policy_zone; + k = MAX_NR_ZONES - 1; while (1) { for_each_node_mask(nd, *nodes) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; @@ -955,6 +955,11 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, goto out; } + if (!nodes_subset(new, node_online_map)) { + err = -EINVAL; + goto out; + } + err = security_task_movememory(task); if (err) goto out; @@ -1072,21 +1077,37 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif -/* Return effective policy for a VMA */ +/* + * get_vma_policy(@task, @vma, @addr) + * @task - task for fallback if vma policy == default + * @vma - virtual memory area whose policy is sought + * @addr - address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to @task or system default policy, as necessary. + * Returned policy has extra reference count if shared, vma, + * or some other task's policy [show_numa_maps() can pass + * @task != current]. It is the caller's responsibility to + * free the reference in these cases. + */ static struct mempolicy * get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; + int shared_pol = 0; if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) + if (vma->vm_ops && vma->vm_ops->get_policy) { pol = vma->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy && + shared_pol = 1; /* if pol non-NULL, add ref below */ + } else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; } if (!pol) pol = &default_policy; + else if (!shared_pol && pol != current->mempolicy) + mpol_get(pol); /* vma or other task's policy */ return pol; } @@ -1202,19 +1223,45 @@ static inline unsigned interleave_nid(struct mempolicy *pol, } #ifdef CONFIG_HUGETLBFS -/* Return a zonelist suitable for a huge page allocation. */ +/* + * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * @vma = virtual memory area whose policy is sought + * @addr = address in @vma for shared policy lookup and interleave policy + * @gfp_flags = for requested zone + * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy + * + * Returns a zonelist suitable for a huge page allocation. + * If the effective policy is 'BIND, returns pointer to policy's zonelist. + * If it is also a policy for which get_vma_policy() returns an extra + * reference, we must hold that reference until after allocation. + * In that case, return policy via @mpol so hugetlb allocation can drop + * the reference. For non-'BIND referenced policies, we can/do drop the + * reference here, so the caller doesn't need to know about the special case + * for default and current task policy. + */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags) + gfp_t gfp_flags, struct mempolicy **mpol) { struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; + *mpol = NULL; /* probably no unref needed */ if (pol->policy == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); + __mpol_free(pol); /* finished with pol */ return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags); } - return zonelist_policy(GFP_HIGHUSER, pol); + + zl = zonelist_policy(GFP_HIGHUSER, pol); + if (unlikely(pol != &default_policy && pol != current->mempolicy)) { + if (pol->policy != MPOL_BIND) + __mpol_free(pol); /* finished with pol */ + else + *mpol = pol; /* unref needed after allocation */ + } + return zl; } #endif @@ -1259,6 +1306,7 @@ struct page * alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; cpuset_update_task_memory_state(); @@ -1268,7 +1316,19 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); return alloc_page_interleave(gfp, 0, nid); } - return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); + zl = zonelist_policy(gfp, pol); + if (pol != &default_policy && pol != current->mempolicy) { + /* + * slow path: ref counted policy -- shared or vma + */ + struct page *page = __alloc_pages(gfp, 0, zl); + __mpol_free(pol); + return page; + } + /* + * fast path: default or task policy + */ + return __alloc_pages(gfp, 0, zl); } /** @@ -1867,6 +1927,7 @@ int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; + struct mempolicy *pol; int n; char buffer[50]; @@ -1877,8 +1938,13 @@ int show_numa_map(struct seq_file *m, void *v) if (!md) return 0; - mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(priv->task, vma, vma->vm_start)); + pol = get_vma_policy(priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol); + /* + * unref shared or other task's mempolicy + */ + if (pol != &default_policy && pol != current->mempolicy) + __mpol_free(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/mm/migrate.c b/mm/migrate.c index 37c73b90200..07f22d4a431 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -611,6 +611,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); + int rcu_locked = 0; if (!newpage) return -ENOMEM; @@ -636,8 +637,13 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * we cannot notice that anon_vma is freed while we migrates a page. * This rcu_read_lock() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() + * File Caches may use write_page() or lock_page() in migration, then, + * just care Anon page here. */ - rcu_read_lock(); + if (PageAnon(page)) { + rcu_read_lock(); + rcu_locked = 1; + } /* * This is a corner case handling. * When a new swap-cache is read into, it is linked to LRU @@ -656,7 +662,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (rc) remove_migration_ptes(page, page); rcu_unlock: - rcu_read_unlock(); + if (rcu_locked) + rcu_read_unlock(); unlock: @@ -965,7 +972,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * array. Return various errors if the user did something wrong. */ for (i = 0; i < nr_pages; i++) { - const void *p; + const void __user *p; err = -EFAULT; if (get_user(p, pages + i)) diff --git a/mm/mmap.c b/mm/mmap.c index b6537211b9c..0d40e66c841 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -93,7 +93,7 @@ atomic_t vm_committed_space = ATOMIC_INIT(0); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(long pages, int cap_sys_admin) +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed; @@ -166,7 +166,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) /* Don't let a single process grow too big: leave 3% of the size of this process for other processes */ - allowed -= current->mm->total_vm / 32; + allowed -= mm->total_vm / 32; /* * cast `allowed' as a signed long because vm_committed_space @@ -2077,7 +2077,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) if (__vma && __vma->vm_start < vma->vm_end) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory(vma_pages(vma))) + security_vm_enough_memory_mm(mm, vma_pages(vma))) return -ENOMEM; vma_link(mm, vma, prev, rb_link, rb_parent); return 0; diff --git a/mm/nommu.c b/mm/nommu.c index 9eef6a39855..8ed0cb43118 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1270,7 +1270,7 @@ EXPORT_SYMBOL(get_unmapped_area); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(long pages, int cap_sys_admin) +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 63512a9ed57..44720363374 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -274,9 +274,9 @@ static void balance_dirty_pages(struct address_space *mapping) pdflush_operation(background_writeout, 0); } -void set_page_dirty_balance(struct page *page) +void set_page_dirty_balance(struct page *page, int page_mkwrite) { - if (set_page_dirty(page)) { + if (set_page_dirty(page) || page_mkwrite) { struct address_space *mapping = page_mapping(page); if (mapping) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3da85b81dab..1a8c59571cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1157,6 +1157,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ + enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ zonelist_scan: /* @@ -1166,6 +1167,18 @@ zonelist_scan: z = zonelist->zones; do { + /* + * In NUMA, this could be a policy zonelist which contains + * zones that may not be allowed by the current gfp_mask. + * Check the zone is allowed by the current flags + */ + if (unlikely(alloc_should_filter_zonelist(zonelist))) { + if (highest_zoneidx == -1) + highest_zoneidx = gfp_zone(gfp_mask); + if (zone_idx(*z) > highest_zoneidx) + continue; + } + if (NUMA_BUILD && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; @@ -2332,6 +2345,8 @@ static int __cpuinit process_zones(int cpu) return 0; bad: for_each_zone(dzone) { + if (!populated_zone(dzone)) + continue; if (dzone == zone) break; kfree(zone_pcp(dzone, cpu)); diff --git a/mm/page_io.c b/mm/page_io.c index dbffec0d78c..3b97f685027 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -44,14 +44,11 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, return bio; } -static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err) +static void end_swap_bio_write(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_size) - return 1; - if (!uptodate) { SetPageError(page); /* @@ -71,17 +68,13 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err) } end_page_writeback(page); bio_put(bio); - return 0; } -int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) +void end_swap_bio_read(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_size) - return 1; - if (!uptodate) { SetPageError(page); ClearPageUptodate(page); @@ -94,7 +87,6 @@ int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) } unlock_page(page); bio_put(bio); - return 0; } /* diff --git a/mm/readahead.c b/mm/readahead.c index 39bf45d4332..be20c9d699d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -15,6 +15,7 @@ #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagevec.h> +#include <linux/pagemap.h> void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { diff --git a/mm/slab.c b/mm/slab.c index a684778b2b4..6f6abef83a1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -883,6 +883,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, */ static int use_alien_caches __read_mostly = 1; +static int numa_platform __read_mostly = 1; static int __init noaliencache_setup(char *s) { use_alien_caches = 0; @@ -1399,8 +1400,10 @@ void __init kmem_cache_init(void) int order; int node; - if (num_possible_nodes() == 1) + if (num_possible_nodes() == 1) { use_alien_caches = 0; + numa_platform = 0; + } for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); @@ -3558,7 +3561,14 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (cache_free_alien(cachep, objp)) + /* + * Skip calling cache_free_alien() when the platform is not numa. + * This will avoid cache misses that happen while accessing slabp (which + * is per page memory reference) to get nodeid. Instead use a global + * variable to skip the call, which is mostly likely to be present in + * the cache. + */ + if (numa_platform && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { diff --git a/mm/slub.c b/mm/slub.c index 69d02e3e439..addb20a6d67 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -986,7 +986,9 @@ out: __setup("slub_debug", setup_slub_debug); -static void kmem_cache_open_debug_check(struct kmem_cache *s) +static unsigned long kmem_cache_flags(unsigned long objsize, + unsigned long flags, const char *name, + void (*ctor)(void *, struct kmem_cache *, unsigned long)) { /* * The page->offset field is only 16 bit wide. This is an offset @@ -1000,19 +1002,21 @@ static void kmem_cache_open_debug_check(struct kmem_cache *s) * Debugging or ctor may create a need to move the free * pointer. Fail if this happens. */ - if (s->objsize >= 65535 * sizeof(void *)) { - BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON | + if (objsize >= 65535 * sizeof(void *)) { + BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); - BUG_ON(s->ctor); - } - else + BUG_ON(ctor); + } else { /* * Enable debugging if selected on the kernel commandline. */ if (slub_debug && (!slub_debug_slabs || - strncmp(slub_debug_slabs, s->name, + strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) - s->flags |= slub_debug; + flags |= slub_debug; + } + + return flags; } #else static inline void setup_object_debug(struct kmem_cache *s, @@ -1029,7 +1033,12 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) static inline int check_object(struct kmem_cache *s, struct page *page, void *object, int active) { return 1; } static inline void add_full(struct kmem_cache_node *n, struct page *page) {} -static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {} +static inline unsigned long kmem_cache_flags(unsigned long objsize, + unsigned long flags, const char *name, + void (*ctor)(void *, struct kmem_cache *, unsigned long)) +{ + return flags; +} #define slub_debug 0 #endif /* @@ -1877,9 +1886,16 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); + page = new_slab(kmalloc_caches, gfpflags, node); BUG_ON(!page); + if (page_to_nid(page) != node) { + printk(KERN_ERR "SLUB: Unable to allocate memory from " + "node %d\n", node); + printk(KERN_ERR "SLUB: Allocating a useless per node structure " + "in order to be able to continue\n"); + } + n = page->freelist; BUG_ON(!n); page->freelist = get_freepointer(kmalloc_caches, n); @@ -2081,9 +2097,8 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->name = name; s->ctor = ctor; s->objsize = size; - s->flags = flags; s->align = align; - kmem_cache_open_debug_check(s); + s->flags = kmem_cache_flags(size, flags, name, ctor); if (!calculate_sizes(s)) goto error; @@ -2653,7 +2668,7 @@ static int slab_unmergeable(struct kmem_cache *s) } static struct kmem_cache *find_mergeable(size_t size, - size_t align, unsigned long flags, + size_t align, unsigned long flags, const char *name, void (*ctor)(void *, struct kmem_cache *, unsigned long)) { struct kmem_cache *s; @@ -2667,6 +2682,7 @@ static struct kmem_cache *find_mergeable(size_t size, size = ALIGN(size, sizeof(void *)); align = calculate_alignment(flags, align, size); size = ALIGN(size, align); + flags = kmem_cache_flags(size, flags, name, NULL); list_for_each_entry(s, &slab_caches, list) { if (slab_unmergeable(s)) @@ -2675,8 +2691,7 @@ static struct kmem_cache *find_mergeable(size_t size, if (size > s->size) continue; - if (((flags | slub_debug) & SLUB_MERGE_SAME) != - (s->flags & SLUB_MERGE_SAME)) + if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) continue; /* * Check if alignment is compatible. @@ -2700,7 +2715,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, struct kmem_cache *s; down_write(&slub_lock); - s = find_mergeable(size, align, flags, ctor); + s = find_mergeable(size, align, flags, name, ctor); if (s) { s->refcount++; /* @@ -3112,7 +3127,7 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long flags; struct page *page; - if (!atomic_read(&n->nr_slabs)) + if (!atomic_long_read(&n->nr_slabs)) continue; spin_lock_irqsave(&n->list_lock, flags); @@ -3247,7 +3262,7 @@ static unsigned long slab_objects(struct kmem_cache *s, } if (flags & SO_FULL) { - int full_slabs = atomic_read(&n->nr_slabs) + int full_slabs = atomic_long_read(&n->nr_slabs) - per_cpu[node] - n->nr_partial; @@ -3283,7 +3298,7 @@ static int any_slab_objects(struct kmem_cache *s) for_each_node(node) { struct kmem_cache_node *n = get_node(s, node); - if (n->nr_partial || atomic_read(&n->nr_slabs)) + if (n->nr_partial || atomic_long_read(&n->nr_slabs)) return 1; } return 0; @@ -3806,7 +3821,9 @@ static int __init slab_sysfs_init(void) list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); - BUG_ON(err); + if (err) + printk(KERN_ERR "SLUB: Unable to add boot slab %s" + " to sysfs\n", s->name); } while (alias_list) { @@ -3814,7 +3831,9 @@ static int __init slab_sysfs_init(void) alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); - BUG_ON(err); + if (err) + printk(KERN_ERR "SLUB: Unable to add boot slab alias" + " %s to sysfs\n", s->name); kfree(al); } diff --git a/mm/sparse.c b/mm/sparse.c index 3047bf06c1f..239f5a720d3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -41,6 +41,15 @@ int page_to_nid(struct page *page) return section_to_node_table[page_to_section(page)]; } EXPORT_SYMBOL(page_to_nid); + +static void set_section_nid(unsigned long section_nr, int nid) +{ + section_to_node_table[section_nr] = nid; +} +#else /* !NODE_NOT_IN_PAGE_FLAGS */ +static inline void set_section_nid(unsigned long section_nr, int nid) +{ +} #endif #ifdef CONFIG_SPARSEMEM_EXTREME @@ -68,10 +77,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) struct mem_section *section; int ret = 0; -#ifdef NODE_NOT_IN_PAGE_FLAGS - section_to_node_table[section_nr] = nid; -#endif - if (mem_section[root]) return -EEXIST; @@ -148,6 +153,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) struct mem_section *ms; sparse_index_init(section, nid); + set_section_nid(section, nid); ms = __nr_to_section(section); if (!ms->section_mem_map) diff --git a/mm/vmscan.c b/mm/vmscan.c index d419e10e3da..a6e65d02499 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -271,6 +271,12 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +/* Request for sync pageout. */ +enum pageout_io { + PAGEOUT_IO_ASYNC, + PAGEOUT_IO_SYNC, +}; + /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -287,7 +293,8 @@ typedef enum { * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +static pageout_t pageout(struct page *page, struct address_space *mapping, + enum pageout_io sync_writeback) { /* * If the page is dirty, only perform writeback if that write @@ -346,6 +353,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) ClearPageReclaim(page); return PAGE_ACTIVATE; } + + /* + * Wait on writeback if requested to. This happens when + * direct reclaiming a large contiguous area and the + * first attempt to free a range of pages fails. + */ + if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + wait_on_page_writeback(page); + if (!PageWriteback(page)) { /* synchronous write or broken a_ops? */ ClearPageReclaim(page); @@ -423,7 +439,8 @@ cannot_free: * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc) + struct scan_control *sc, + enum pageout_io sync_writeback) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; @@ -458,8 +475,23 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (page_mapped(page) || PageSwapCache(page)) sc->nr_scanned++; - if (PageWriteback(page)) - goto keep_locked; + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + + if (PageWriteback(page)) { + /* + * Synchronous reclaim is performed in two passes, + * first an asynchronous pass over the list to + * start parallel writeback, and a second synchronous + * pass to wait for the IO to complete. Wait here + * for any page for which writeback has already + * started. + */ + if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + wait_on_page_writeback(page); + else + goto keep_locked; + } referenced = page_referenced(page, 1); /* In active use or really unfreeable? Activate it. */ @@ -478,8 +510,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, #endif /* CONFIG_SWAP */ mapping = page_mapping(page); - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); /* * The page is mapped into the page tables of one or more @@ -505,7 +535,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { + switch (pageout(page, mapping, sync_writeback)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: @@ -777,6 +807,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, (sc->order > PAGE_ALLOC_COSTLY_ORDER)? ISOLATE_BOTH : ISOLATE_INACTIVE); nr_active = clear_active_flags(&page_list); + __count_vm_events(PGDEACTIVATE, nr_active); __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); __mod_zone_page_state(zone, NR_INACTIVE, @@ -785,7 +816,29 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; - nr_freed = shrink_page_list(&page_list, sc); + nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + + /* + * If we are direct reclaiming for contiguous pages and we do + * not reclaim everything in the list, try again and wait + * for IO to complete. This will stall high-order allocations + * but that should be acceptable to the caller + */ + if (nr_freed < nr_taken && !current_is_kswapd() && + sc->order > PAGE_ALLOC_COSTLY_ORDER) { + congestion_wait(WRITE, HZ/10); + + /* + * The attempt at page out may have made some + * of the pages active, mark them inactive again. + */ + nr_active = clear_active_flags(&page_list); + count_vm_events(PGDEACTIVATE, nr_active); + + nr_freed += shrink_page_list(&page_list, sc, + PAGEOUT_IO_SYNC); + } + nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { |