author    Jeff Garzik <jgarzik@pobox.com>    2006-01-26 22:19:57 -0500
committer Jeff Garzik <jgarzik@pobox.com>    2006-01-26 22:19:57 -0500
commit    97309d1a0bbdcb0813ea08574b4473d8e5416012 (patch)
tree      0dcbcf5aa5146147e0ac1d0c4b73b868a67d333b /mm
parent    b4ea75b649417606fd6b38710a2962ec9770e772 (diff)
parent    efd51b5c6798d103e3aa683464aebb2019b62119 (diff)
Merge branch 'upstream-fixes'
Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c         |   1
-rw-r--r--  mm/mempolicy.c       |  99
-rw-r--r--  mm/page-writeback.c  |   7
-rw-r--r--  mm/page_alloc.c      |  17
-rw-r--r--  mm/rmap.c            |   2
-rw-r--r--  mm/slab.c            |  58
-rw-r--r--  mm/swap.c            |  26
-rw-r--r--  mm/swapfile.c        |  17
-rw-r--r--  mm/vmscan.c          | 146
9 files changed, 246 insertions, 127 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a965b6b35f2..44da3d47699 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
* ->zone.lru_lock (follow_page->mark_page_accessed)
+ * ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
* ->inode_lock (page_remove_rmap->set_page_dirty)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3171f884d24..73790188b0e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -185,8 +185,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
}
static void gather_stats(struct page *, void *);
-static void migrate_page_add(struct vm_area_struct *vma,
- struct page *page, struct list_head *pagelist, unsigned long flags);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags);
/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -208,6 +208,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page = vm_normal_page(vma, addr, *pte);
if (!page)
continue;
+ /*
+ * The check for PageReserved here is important to avoid
+ * handling zero pages and other pages that may have been
+ * marked special by the system.
+ *
+ * If the PageReserved would not be checked here then f.e.
+ * the location of the zero page could have an influence
+ * on MPOL_MF_STRICT, zero pages would be counted for
+ * the per node stats, and there would be useless attempts
+ * to put zero pages on the migration list.
+ */
if (PageReserved(page))
continue;
nid = page_to_nid(page);
@@ -216,11 +227,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (flags & MPOL_MF_STATS)
gather_stats(page, private);
- else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- spin_unlock(ptl);
- migrate_page_add(vma, page, private, flags);
- spin_lock(ptl);
- }
+ else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ migrate_page_add(page, private, flags);
else
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -309,6 +317,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
int err;
struct vm_area_struct *first, *vma, *prev;
+ /* Clear the LRU lists so pages can be isolated */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_add_drain_all();
+
first = find_vma(mm, start);
if (!first)
return ERR_PTR(-EFAULT);
@@ -519,51 +531,15 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
* page migration
*/
-/* Check if we are the only process mapping the page in question */
-static inline int single_mm_mapping(struct mm_struct *mm,
- struct address_space *mapping)
-{
- struct vm_area_struct *vma;
- struct prio_tree_iter iter;
- int rc = 1;
-
- spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
- if (mm != vma->vm_mm) {
- rc = 0;
- goto out;
- }
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
- if (mm != vma->vm_mm) {
- rc = 0;
- goto out;
- }
-out:
- spin_unlock(&mapping->i_mmap_lock);
- return rc;
-}
-
-/*
- * Add a page to be migrated to the pagelist
- */
-static void migrate_page_add(struct vm_area_struct *vma,
- struct page *page, struct list_head *pagelist, unsigned long flags)
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
{
/*
- * Avoid migrating a page that is shared by others and not writable.
+ * Avoid migrating a page that is shared with others.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
- mapping_writably_mapped(page->mapping) ||
- single_mm_mapping(vma->vm_mm, page->mapping)) {
- int rc = isolate_lru_page(page);
-
- if (rc == 1)
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+ if (isolate_lru_page(page))
list_add(&page->lru, pagelist);
- /*
- * If the isolate attempt was not successful then we just
- * encountered an unswappable page. Something must be wrong.
- */
- WARN_ON(rc == 0);
}
}
@@ -1000,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
return nid;
}
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+ switch (policy->policy) {
+ case MPOL_INTERLEAVE:
+ return interleave_nodes(policy);
+
+ case MPOL_BIND:
+ /*
+ * Follow bind policy behavior and start allocation at the
+ * first node.
+ */
+ return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+ case MPOL_PREFERRED:
+ if (policy->v.preferred_node >= 0)
+ return policy->v.preferred_node;
+ /* Fall through */
+
+ default:
+ return numa_node_id();
+ }
+}
+
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
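The new slab_node() helper above picks the node to allocate the next slab page from, based on the current task's memory policy. As a rough userspace sketch of the same selection logic (round-robin across the interleave set, the first node for MPOL_BIND, the preferred node when one is set, otherwise the local node) — the struct layout, node numbering and helper names below are invented for illustration, not kernel code:

/* Userspace model of policy-based node selection (illustrative only). */
#include <stdio.h>

enum policy { POL_DEFAULT, POL_PREFERRED, POL_BIND, POL_INTERLEAVE };

struct mempolicy_model {
	enum policy policy;
	int nodes[8];       /* candidate nodes, -1 terminated */
	int next;           /* round-robin cursor for interleave */
	int preferred_node; /* -1 means "use the local node" */
};

static int local_node(void) { return 0; }  /* stand-in for numa_node_id() */

static int model_slab_node(struct mempolicy_model *pol)
{
	switch (pol->policy) {
	case POL_INTERLEAVE: {
		/* hand out the current node, then advance and wrap the cursor */
		int nid = pol->nodes[pol->next];
		if (pol->nodes[++pol->next] < 0)
			pol->next = 0;
		return nid;
	}
	case POL_BIND:
		/* always start at the first node of the bound set */
		return pol->nodes[0];
	case POL_PREFERRED:
		if (pol->preferred_node >= 0)
			return pol->preferred_node;
		/* no preference recorded: fall through to local allocation */
	default:
		return local_node();
	}
}

int main(void)
{
	struct mempolicy_model il = {
		.policy = POL_INTERLEAVE,
		.nodes = { 1, 2, 3, -1 },
	};

	for (int i = 0; i < 6; i++)
		printf("slab node: %d\n", model_slab_node(&il));  /* 1 2 3 1 2 3 */
	return 0;
}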
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5240e426c1f..945559fb63d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -46,7 +46,7 @@
static long ratelimit_pages = 32;
static long total_pages; /* The total number of pages in the machine. */
-static int dirty_exceeded; /* Dirty mem may be over limit */
+static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
/*
* When balance_dirty_pages decides that the caller needs to perform some
@@ -212,7 +212,8 @@ static void balance_dirty_pages(struct address_space *mapping)
if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
break;
- dirty_exceeded = 1;
+ if (!dirty_exceeded)
+ dirty_exceeded = 1;
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
@@ -234,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping)
blk_congestion_wait(WRITE, HZ/10);
}
- if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+ if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded)
dirty_exceeded = 0;
if (writeback_in_progress(bdi))
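Both page-writeback.c hunks target cacheline traffic on SMP: dirty_exceeded moves onto its own cacheline and is only stored to when its value actually changes, so concurrent callers of balance_dirty_pages() mostly read a shared line instead of bouncing it between CPUs. A minimal userspace sketch of that check-before-write idiom, assuming a C11 compiler and pthreads (the alignment, thread count and names are illustrative):

/* Check-before-write on a shared, cacheline-aligned flag (illustrative). */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define CACHELINE 64

/* give the flag its own cacheline so stores don't bounce its neighbours */
static _Alignas(CACHELINE) _Atomic int dirty_exceeded;

static void set_exceeded(int over_limit)
{
	if (over_limit) {
		if (!dirty_exceeded)     /* store only on a transition ...   */
			dirty_exceeded = 1;  /* ... keeping the line shared/clean */
	} else {
		if (dirty_exceeded)
			dirty_exceeded = 0;
	}
}

static void *worker(void *arg)
{
	long over = (long)arg;

	for (int i = 0; i < 1000000; i++)
		set_exceeded((int)over);
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (long i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, (void *)(i & 1));
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	printf("final dirty_exceeded = %d\n", dirty_exceeded);
	return 0;
}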
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e29743a8d..df54e2fc8ee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
mark = (*z)->pages_high;
if (!zone_watermark_ok(*z, order, mark,
classzone_idx, alloc_flags))
- continue;
+ if (!zone_reclaim_mode ||
+ !zone_reclaim(*z, gfp_mask, order))
+ continue;
}
page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
prev_node = local_node;
nodes_clear(used_mask);
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+ int distance = node_distance(local_node, node);
+
+ /*
+ * If another node is sufficiently far away then it is better
+ * to reclaim pages in a zone before going off node.
+ */
+ if (distance > RECLAIM_DISTANCE)
+ zone_reclaim_mode = 1;
+
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
- if (node_distance(local_node, node) !=
- node_distance(local_node, prev_node))
+
+ if (distance != node_distance(local_node, prev_node))
node_load[node] += load;
prev_node = node;
load--;
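Taken together, the page_alloc.c hunks change the zonelist walk: when any node sits farther away than RECLAIM_DISTANCE, zone_reclaim_mode is enabled, and a zone that fails its watermark then gets one shot at local reclaim via zone_reclaim() before the allocator falls back to the next zone. A small standalone model of that decision is sketched below; the watermark check, reclaim stub and numbers are invented stand-ins, not the kernel's:

/* Model of "try local reclaim before falling back off-node" (illustrative). */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_RECLAIM_DISTANCE 20   /* stand-in for RECLAIM_DISTANCE */

struct zone_model {
	const char *name;
	long free_pages;
	long watermark;
	long reclaimable;   /* pages a local reclaim pass could recover */
};

static int zone_reclaim_mode;

static bool watermark_ok(struct zone_model *z, int order)   /* zone_watermark_ok() stand-in */
{
	return z->free_pages - (1L << order) >= z->watermark;
}

static bool model_zone_reclaim(struct zone_model *z, int order)  /* zone_reclaim() stand-in */
{
	long want = 1L << order;

	if (z->reclaimable < want)
		return false;
	z->reclaimable -= want;
	z->free_pages += want;
	return true;
}

static struct zone_model *pick_zone(struct zone_model *zl, int n, int order)
{
	for (int i = 0; i < n; i++) {
		struct zone_model *z = &zl[i];

		if (!watermark_ok(z, order)) {
			/* new behaviour: try reclaiming in this zone before skipping it */
			if (!zone_reclaim_mode || !model_zone_reclaim(z, order))
				continue;
		}
		return z;   /* allocate here */
	}
	return NULL;
}

int main(void)
{
	struct zone_model zl[] = {
		{ "local",  .free_pages =  10, .watermark = 32, .reclaimable = 64 },
		{ "remote", .free_pages = 512, .watermark = 32, .reclaimable =  0 },
	};

	zone_reclaim_mode = (25 > MODEL_RECLAIM_DISTANCE);  /* a distant node exists */
	struct zone_model *z = pick_zone(zl, 2, 0);
	printf("allocated from %s zone\n", z ? z->name : "nowhere");
	return 0;
}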
diff --git a/mm/rmap.c b/mm/rmap.c
index dfbb89f99a1..d85a99d28c0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -33,7 +33,7 @@
* mapping->i_mmap_lock
* anon_vma->lock
* mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed)
+ * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
diff --git a/mm/slab.c b/mm/slab.c
index 9374293a301..6f8495e2185 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
* Further notes from the original documentation:
*
* 11 April '97. Started multi-threading - markhe
- * The global cache-chain is protected by the semaphore 'cache_chain_sem'.
+ * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
* The sem is only needed when accessing/extending the cache-chain, which
* can never happen inside an interrupt (kmem_cache_create(),
* kmem_cache_shrink() and kmem_cache_reap()).
@@ -103,6 +103,8 @@
#include <linux/rcupdate.h>
#include <linux/string.h>
#include <linux/nodemask.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -631,7 +633,7 @@ static kmem_cache_t cache_cache = {
};
/* Guard access to the cache-chain. */
-static struct semaphore cache_chain_sem;
+static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
/*
@@ -772,6 +774,8 @@ static struct array_cache *alloc_arraycache(int node, int entries,
}
#ifdef CONFIG_NUMA
+static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
+
static inline struct array_cache **alloc_alien_cache(int node, int limit)
{
struct array_cache **ac_ptr;
@@ -857,7 +861,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
/* we need to do this right in the beginning since
* alloc_arraycache's are going to use this list.
* kmalloc_node allows us to add the slab to the right
@@ -912,7 +916,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
l3->shared = nc;
}
}
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
break;
case CPU_ONLINE:
start_cpu_timer(cpu);
@@ -921,7 +925,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
case CPU_DEAD:
/* fall thru */
case CPU_UP_CANCELED:
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
@@ -973,13 +977,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
spin_unlock_irq(&cachep->spinlock);
kfree(nc);
}
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
break;
#endif
}
return NOTIFY_OK;
bad:
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
return NOTIFY_BAD;
}
@@ -1047,7 +1051,6 @@ void __init kmem_cache_init(void)
*/
/* 1) create the cache_cache */
- init_MUTEX(&cache_chain_sem);
INIT_LIST_HEAD(&cache_chain);
list_add(&cache_cache.next, &cache_chain);
cache_cache.colour_off = cache_line_size();
@@ -1168,10 +1171,10 @@ void __init kmem_cache_init(void)
/* 6) resize the head arrays to their final sizes */
{
kmem_cache_t *cachep;
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next)
enable_cpucache(cachep);
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
}
/* Done! */
@@ -1590,7 +1593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
BUG();
}
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
list_for_each(p, &cache_chain) {
kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
@@ -1856,7 +1859,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -2044,18 +2047,18 @@ int kmem_cache_destroy(kmem_cache_t *cachep)
lock_cpu_hotplug();
/* Find the cache in the chain of caches. */
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
/*
* the chain is never empty, cache_cache is never destroyed
*/
list_del(&cachep->next);
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
if (__cache_shrink(cachep)) {
slab_error(cachep, "Can't free all objects");
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
list_add(&cachep->next, &cache_chain);
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
unlock_cpu_hotplug();
return 1;
}
@@ -2570,6 +2573,15 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
void *objp;
struct array_cache *ac;
+#ifdef CONFIG_NUMA
+ if (unlikely(current->mempolicy && !in_interrupt())) {
+ int nid = slab_node(current->mempolicy);
+
+ if (nid != numa_node_id())
+ return __cache_alloc_node(cachep, flags, nid);
+ }
+#endif
+
check_irq_off();
ac = ac_data(cachep);
if (likely(ac->avail)) {
@@ -3314,7 +3326,7 @@ static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
* - clear the per-cpu caches for this CPU.
* - return freeable pages to the main free memory pool.
*
- * If we cannot acquire the cache chain semaphore then just give up - we'll
+ * If we cannot acquire the cache chain mutex then just give up - we'll
* try again on the next iteration.
*/
static void cache_reap(void *unused)
@@ -3322,7 +3334,7 @@ static void cache_reap(void *unused)
struct list_head *walk;
struct kmem_list3 *l3;
- if (down_trylock(&cache_chain_sem)) {
+ if (!mutex_trylock(&cache_chain_mutex)) {
/* Give up. Setup the next iteration. */
schedule_delayed_work(&__get_cpu_var(reap_work),
REAPTIMEOUT_CPUC);
@@ -3393,7 +3405,7 @@ static void cache_reap(void *unused)
cond_resched();
}
check_irq_on();
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
drain_remote_pages();
/* Setup the next iteration */
schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
@@ -3429,7 +3441,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
loff_t n = *pos;
struct list_head *p;
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
if (!n)
print_slabinfo_header(m);
p = cache_chain.next;
@@ -3451,7 +3463,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
static void s_stop(struct seq_file *m, void *p)
{
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
}
static int s_show(struct seq_file *m, void *p)
@@ -3603,7 +3615,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
return -EINVAL;
/* Find the cache in the chain of caches. */
- down(&cache_chain_sem);
+ mutex_lock(&cache_chain_mutex);
res = -EINVAL;
list_for_each(p, &cache_chain) {
kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
@@ -3620,7 +3632,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
break;
}
}
- up(&cache_chain_sem);
+ mutex_unlock(&cache_chain_mutex);
if (res >= 0)
res = count;
return res;
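Most of the slab.c diff is a mechanical conversion of cache_chain_sem to the cache_chain_mutex. The one place where behaviour matters is cache_reap(), which only tries to take the lock and, if it is busy, defers the whole pass to the next timer tick instead of blocking. A rough pthread sketch of that trylock-and-defer pattern (names, sleeps and output are invented):

/* Trylock-and-defer pattern similar to cache_reap() (illustrative). */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t chain_lock = PTHREAD_MUTEX_INITIALIZER;

/* periodic maintenance: never block behind the chain lock */
static void reap_once(void)
{
	if (pthread_mutex_trylock(&chain_lock) != 0) {
		/* lock is busy: give up and let the next tick try again */
		printf("reap: lock busy, deferring\n");
		return;
	}
	printf("reap: walking the cache chain\n");
	pthread_mutex_unlock(&chain_lock);
}

static void *creator(void *arg)
{
	(void)arg;
	/* long critical section, e.g. creating or destroying a cache */
	pthread_mutex_lock(&chain_lock);
	usleep(200000);
	pthread_mutex_unlock(&chain_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, creator, NULL);
	usleep(50000);   /* let the creator grab the lock first */
	reap_once();     /* expected: "lock busy, deferring" */
	pthread_join(t, NULL);
	reap_once();     /* expected: "walking the cache chain" */
	return 0;
}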
diff --git a/mm/swap.c b/mm/swap.c
index cbb48e721ab..bc2442a7b0e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -174,6 +174,32 @@ void lru_add_drain(void)
put_cpu();
}
+#ifdef CONFIG_NUMA
+static void lru_add_drain_per_cpu(void *dummy)
+{
+ lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+ return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+}
+
+#else
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+ lru_add_drain();
+ return 0;
+}
+#endif
+
/*
* This path almost never happens for VM activity - pages are normally
* freed via pagevecs. But it gets used by networking.
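The new lru_add_drain_all() exists so that mempolicy page migration can flush the per-CPU pagevecs on every CPU (via schedule_on_each_cpu) before trying to isolate pages; without CONFIG_NUMA it collapses to a plain local drain. The following is only a loose userspace analogue of "fan a drain out to every worker", with invented names and threads standing in for CPUs:

/* Userspace analogue of a per-CPU drain fanned out to every worker. */
#include <pthread.h>
#include <stdio.h>

#define NR_WORKERS 4

/* per-"CPU" pending counter standing in for the per-CPU pagevecs */
static _Thread_local int pending_pages = 3;

static void drain_local(void)
{
	printf("draining %d pending pages on this worker\n", pending_pages);
	pending_pages = 0;
}

static void *drain_on_worker(void *arg)
{
	(void)arg;
	drain_local();
	return NULL;
}

/* analogue of lru_add_drain_all(): returns 0 for success */
static int drain_all(void)
{
	pthread_t t[NR_WORKERS];

	for (int i = 0; i < NR_WORKERS; i++)
		if (pthread_create(&t[i], NULL, drain_on_worker, NULL))
			return -1;
	for (int i = 0; i < NR_WORKERS; i++)
		pthread_join(t[i], NULL);
	return 0;
}

int main(void)
{
	return drain_all();
}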
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 957fef43fa6..f1e69c30d20 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,7 @@
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
+#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
@@ -46,12 +47,12 @@ struct swap_list_t swap_list = {-1, -1};
struct swap_info_struct swap_info[MAX_SWAPFILES];
-static DECLARE_MUTEX(swapon_sem);
+static DEFINE_MUTEX(swapon_mutex);
/*
* We need this because the bdev->unplug_fn can sleep and we cannot
* hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a semaphore.
+ * cannot be turned into a mutex.
*/
static DECLARE_RWSEM(swap_unplug_sem);
@@ -1161,7 +1162,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
up_write(&swap_unplug_sem);
destroy_swap_extents(p);
- down(&swapon_sem);
+ mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
drain_mmlist();
@@ -1180,7 +1181,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
p->swap_map = NULL;
p->flags = 0;
spin_unlock(&swap_lock);
- up(&swapon_sem);
+ mutex_unlock(&swapon_mutex);
vfree(swap_map);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
@@ -1209,7 +1210,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
int i;
loff_t l = *pos;
- down(&swapon_sem);
+ mutex_lock(&swapon_mutex);
for (i = 0; i < nr_swapfiles; i++, ptr++) {
if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
@@ -1238,7 +1239,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
static void swap_stop(struct seq_file *swap, void *v)
{
- up(&swapon_sem);
+ mutex_unlock(&swapon_mutex);
}
static int swap_show(struct seq_file *swap, void *v)
@@ -1540,7 +1541,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
}
- down(&swapon_sem);
+ mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
p->flags = SWP_ACTIVE;
nr_swap_pages += nr_good_pages;
@@ -1566,7 +1567,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
swap_info[prev].next = p - swap_info;
}
spin_unlock(&swap_lock);
- up(&swapon_sem);
+ mutex_unlock(&swapon_mutex);
error = 0;
goto out;
bad_swap:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf903b2d198..2e34b61a70c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -71,6 +71,9 @@ struct scan_control {
int may_writepage;
+ /* Can pages be swapped as part of reclaim? */
+ int may_swap;
+
/* This context's SWAP_CLUSTER_MAX. If freeing memory for
* suspend, we effectively ignore SWAP_CLUSTER_MAX.
* In this context, it doesn't matter that we scan the
@@ -458,6 +461,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
* Try to allocate it some swap space here.
*/
if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!sc->may_swap)
+ goto keep_locked;
if (!add_to_swap(page, GFP_ATOMIC))
goto activate_locked;
}
@@ -586,7 +591,7 @@ static inline void move_to_lru(struct page *page)
}
/*
- * Add isolated pages on the list back to the LRU
+ * Add isolated pages on the list back to the LRU.
*
* returns the number of pages put back.
*/
@@ -760,46 +765,33 @@ next:
return nr_failed + retry;
}
-static void lru_add_drain_per_cpu(void *dummy)
-{
- lru_add_drain();
-}
-
/*
* Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
+ * indicated list with elevated refcount.
*
* Result:
* 0 = page not on LRU list
* 1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
*/
int isolate_lru_page(struct page *page)
{
- int rc = 0;
- struct zone *zone = page_zone(page);
+ int ret = 0;
-redo:
- spin_lock_irq(&zone->lru_lock);
- rc = __isolate_lru_page(page);
- if (rc == 1) {
- if (PageActive(page))
- del_page_from_active_list(zone, page);
- else
- del_page_from_inactive_list(zone, page);
- }
- spin_unlock_irq(&zone->lru_lock);
- if (rc == 0) {
- /*
- * Maybe this page is still waiting for a cpu to drain it
- * from one of the lru lists?
- */
- rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
- if (rc == 0 && PageLRU(page))
- goto redo;
+ if (PageLRU(page)) {
+ struct zone *zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (TestClearPageLRU(page)) {
+ ret = 1;
+ get_page(page);
+ if (PageActive(page))
+ del_page_from_active_list(zone, page);
+ else
+ del_page_from_inactive_list(zone, page);
+ }
+ spin_unlock_irq(&zone->lru_lock);
}
- return rc;
+
+ return ret;
}
#endif
@@ -831,18 +823,20 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
- switch (__isolate_lru_page(page)) {
- case 1:
- /* Succeeded to isolate page */
- list_move(&page->lru, dst);
- nr_taken++;
- break;
- case -ENOENT:
- /* Not possible to isolate */
- list_move(&page->lru, src);
- break;
- default:
+ if (!TestClearPageLRU(page))
BUG();
+ list_del(&page->lru);
+ if (get_page_testone(page)) {
+ /*
+ * It is being freed elsewhere
+ */
+ __put_page(page);
+ SetPageLRU(page);
+ list_add(&page->lru, src);
+ continue;
+ } else {
+ list_add(&page->lru, dst);
+ nr_taken++;
}
}
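The two hunks above replace the old __isolate_lru_page() retry logic with a direct test-and-clear of the LRU bit under zone->lru_lock, taking a page reference when the bit is won. A minimal C11 sketch of that "claim a flag under a lock, then pin with a refcount" shape, using pthreads and stdatomic rather than page flags (all names are invented):

/* "Claim a flag under a lock, then pin with a refcount" (illustrative). */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct page_model {
	atomic_int count;   /* reference count */
	bool on_lru;        /* stands in for PG_lru */
};

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

/* returns true if the caller won the page off the LRU */
static bool model_isolate(struct page_model *page)
{
	bool ret = false;

	pthread_mutex_lock(&lru_lock);
	if (page->on_lru) {
		/* "test and clear": only one caller can win this */
		page->on_lru = false;
		atomic_fetch_add(&page->count, 1);   /* pin the page */
		ret = true;
	}
	pthread_mutex_unlock(&lru_lock);
	return ret;
}

int main(void)
{
	struct page_model page = { .count = 1, .on_lru = true };

	printf("first isolate: %d\n", model_isolate(&page));    /* 1 */
	printf("second isolate: %d\n", model_isolate(&page));   /* 0 */
	printf("refcount now: %d\n", atomic_load(&page.count)); /* 2 */
	return 0;
}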
@@ -1177,6 +1171,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
sc.gfp_mask = gfp_mask;
sc.may_writepage = 0;
+ sc.may_swap = 1;
inc_page_state(allocstall);
@@ -1279,6 +1274,7 @@ loop_again:
total_reclaimed = 0;
sc.gfp_mask = GFP_KERNEL;
sc.may_writepage = 0;
+ sc.may_swap = 1;
sc.nr_mapped = read_page_state(nr_mapped);
inc_page_state(pageoutrun);
@@ -1576,3 +1572,71 @@ static int __init kswapd_init(void)
}
module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Minimum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+ int nr_pages = 1 << order;
+ struct task_struct *p = current;
+ struct reclaim_state reclaim_state;
+ struct scan_control sc = {
+ .gfp_mask = gfp_mask,
+ .may_writepage = 0,
+ .may_swap = 0,
+ .nr_mapped = read_page_state(nr_mapped),
+ .nr_scanned = 0,
+ .nr_reclaimed = 0,
+ .priority = 0
+ };
+
+ if (!(gfp_mask & __GFP_WAIT) ||
+ zone->zone_pgdat->node_id != numa_node_id() ||
+ zone->all_unreclaimable ||
+ atomic_read(&zone->reclaim_in_progress) > 0)
+ return 0;
+
+ if (time_before(jiffies,
+ zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+ return 0;
+
+ disable_swap_token();
+
+ if (nr_pages > SWAP_CLUSTER_MAX)
+ sc.swap_cluster_max = nr_pages;
+ else
+ sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+ cond_resched();
+ p->flags |= PF_MEMALLOC;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+ shrink_zone(zone, &sc);
+ p->reclaim_state = NULL;
+ current->flags &= ~PF_MEMALLOC;
+
+ if (sc.nr_reclaimed == 0)
+ zone->last_unsuccessful_zone_reclaim = jiffies;
+
+ return sc.nr_reclaimed > nr_pages;
+}
+#endif
+
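zone_reclaim() also rate-limits itself: a zone that yielded nothing records the time in last_unsuccessful_zone_reclaim and is not scanned again for ZONE_RECLAIM_INTERVAL (HZ/2). A tiny userspace sketch of that back-off, using a monotonic clock in place of jiffies (interval and names are invented):

/* Back-off timer like last_unsuccessful_zone_reclaim (illustrative). */
#define _POSIX_C_SOURCE 200809L
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define MODEL_RECLAIM_INTERVAL_MS 500   /* stands in for HZ/2 */

static long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000L + ts.tv_nsec / 1000000L;
}

static long last_unsuccessful_ms = -MODEL_RECLAIM_INTERVAL_MS;

/* pretend reclaim that never finds anything */
static int try_reclaim(void)
{
	return 0;
}

static bool zone_reclaim_model(void)
{
	/* skip the scan entirely if a recent attempt found nothing */
	if (now_ms() < last_unsuccessful_ms + MODEL_RECLAIM_INTERVAL_MS) {
		printf("skipping scan: last attempt was too recent\n");
		return false;
	}

	if (try_reclaim() == 0) {
		last_unsuccessful_ms = now_ms();   /* remember the failure */
		return false;
	}
	return true;
}

int main(void)
{
	zone_reclaim_model();   /* scans, fails, records the timestamp */
	zone_reclaim_model();   /* bails out early inside the interval */
	return 0;
}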