author     Steven Whitehouse <swhiteho@redhat.com>    2006-03-31 15:34:58 -0500
committer  Steven Whitehouse <swhiteho@redhat.com>    2006-03-31 15:34:58 -0500
commit     86579dd06deecfa6ac88d5e84e4d63c397cd6f6d (patch)
tree       b4475d3ccde53015ad84a06e4e55e64591171b75 /mm
parent     7ea9ea832212c4a755650f7c7cc1ff0b63292a41 (diff)
parent     a0f067802576d4eb4c65d40b8ee7d6ea3c81dd61 (diff)
Merge branch 'master'
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   10
-rw-r--r--  mm/Makefile         |    4
-rw-r--r--  mm/bootmem.c        |   41
-rw-r--r--  mm/fadvise.c        |   46
-rw-r--r--  mm/filemap.c        |   43
-rw-r--r--  mm/highmem.c        |   26
-rw-r--r--  mm/hugetlb.c        |  286
-rw-r--r--  mm/internal.h       |   34
-rw-r--r--  mm/memory.c         |   21
-rw-r--r--  mm/mempolicy.c      |  151
-rw-r--r--  mm/mempool.c        |   50
-rw-r--r--  mm/migrate.c        |  655
-rw-r--r--  mm/mmap.c           |   16
-rw-r--r--  mm/mmzone.c         |   50
-rw-r--r--  mm/mprotect.c       |   12
-rw-r--r--  mm/msync.c          |  139
-rw-r--r--  mm/nommu.c          |    4
-rw-r--r--  mm/page-writeback.c |   64
-rw-r--r--  mm/page_alloc.c     |  180
-rw-r--r--  mm/readahead.c      |   33
-rw-r--r--  mm/rmap.c           |   14
-rw-r--r--  mm/shmem.c          |    7
-rw-r--r--  mm/slab.c           | 1233
-rw-r--r--  mm/slob.c           |   10
-rw-r--r--  mm/swap.c           |   66
-rw-r--r--  mm/swap_state.c     |    1
-rw-r--r--  mm/swapfile.c       |   59
-rw-r--r--  mm/util.c           |   47
-rw-r--r--  mm/vmscan.c         |  888
29 files changed, 2603 insertions(+), 1587 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80ae640..332f5c29b53 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS
# support for page migration
#
config MIGRATION
- def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
- depends on SWAP
+ bool "Page migration"
+ def_bool y if NUMA
+ depends on SWAP && NUMA
+ help
+ Allows the migration of the physical location of pages of processes
+ while the virtual addresses are not changed. This is useful for
+ example on NUMA systems to put pages nearer to the processors accessing
+ the page.
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa1dcc..0b8f73f2ed1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o \
readahead.o swap.o truncate.o vmscan.o \
- prio_tree.o util.o $(mmu-y)
+ prio_tree.o util.o mmzone.o $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
+obj-$(CONFIG_MIGRATION) += migrate.o
+
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 35c32290f71..d3e3bd2ffce 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so
* dma_get_required_mask(), which uses
* it, can be an inline function */
+static LIST_HEAD(bdata_list);
#ifdef CONFIG_CRASH_DUMP
/*
* If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -52,6 +53,27 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
return mapsize;
}
+/*
+ * link bdata in order
+ */
+static void link_bootmem(bootmem_data_t *bdata)
+{
+ bootmem_data_t *ent;
+ if (list_empty(&bdata_list)) {
+ list_add(&bdata->list, &bdata_list);
+ return;
+ }
+ /* insert in order */
+ list_for_each_entry(ent, &bdata_list, list) {
+ if (bdata->node_boot_start < ent->node_boot_start) {
+ list_add_tail(&bdata->list, &ent->list);
+ return;
+ }
+ }
+ list_add_tail(&bdata->list, &bdata_list);
+ return;
+}
+
/*
* Called once to set up the allocator itself.
@@ -62,13 +84,11 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
bootmem_data_t *bdata = pgdat->bdata;
unsigned long mapsize = ((end - start)+7)/8;
- pgdat->pgdat_next = pgdat_list;
- pgdat_list = pgdat;
-
mapsize = ALIGN(mapsize, sizeof(long));
bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
bdata->node_boot_start = (start << PAGE_SHIFT);
bdata->node_low_pfn = end;
+ link_bootmem(bdata);
/*
* Initially all pages are reserved - setup_arch() has to
@@ -152,7 +172,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
*
* NOTE: This function is _not_ reentrant.
*/
-static void * __init
+void * __init
__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
unsigned long align, unsigned long goal, unsigned long limit)
{
@@ -383,12 +403,11 @@ unsigned long __init free_all_bootmem (void)
void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
{
- pg_data_t *pgdat = pgdat_list;
+ bootmem_data_t *bdata;
void *ptr;
- for_each_pgdat(pgdat)
- if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
- align, goal, 0)))
+ list_for_each_entry(bdata, &bdata_list, list)
+ if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0)))
return(ptr);
/*
@@ -416,11 +435,11 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
{
- pg_data_t *pgdat = pgdat_list;
+ bootmem_data_t *bdata;
void *ptr;
- for_each_pgdat(pgdat)
- if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+ list_for_each_entry(bdata, &bdata_list, list)
+ if ((ptr = __alloc_bootmem_core(bdata, size,
align, goal, LOW32LIMIT)))
return(ptr);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d257c89e770..907c39257ca 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -15,6 +15,7 @@
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/fadvise.h>
+#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <asm/unistd.h>
@@ -22,13 +23,36 @@
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
+ *
+ * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
+ * offsets `offset' and `offset+len' inclusive. Any pages which are currently
+ * under writeout are skipped, whether or not they are dirty.
+ *
+ * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
+ * offsets `offset' and `offset+len'.
+ *
+ * By combining these two operations the application may do several things:
+ *
+ * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
+ * dirty pages at the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
+ * all of the currently dirty pages at the disk, wait until they have been
+ * written.
+ *
+ * It should be noted that none of these operations write out the file's
+ * metadata. So unless the application is strictly performing overwrites of
+ * already-instantiated disk blocks, there are no guarantees here that the data
+ * will be available after a crash.
*/
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
struct file *file = fget(fd);
struct address_space *mapping;
struct backing_dev_info *bdi;
- loff_t endbyte;
+ loff_t endbyte; /* inclusive */
pgoff_t start_index;
pgoff_t end_index;
unsigned long nrpages;
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
endbyte = offset + len;
if (!len || endbyte < len)
endbyte = -1;
+ else
+ endbyte--; /* inclusive */
bdi = mapping->backing_dev_info;
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
/* First and last PARTIAL page! */
start_index = offset >> PAGE_CACHE_SHIFT;
- end_index = (endbyte-1) >> PAGE_CACHE_SHIFT;
+ end_index = endbyte >> PAGE_CACHE_SHIFT;
/* Careful about overflow on the "+1" */
nrpages = end_index - start_index + 1;
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
filemap_flush(mapping);
/* First and last FULL page! */
- start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
- if (end_index > start_index)
- invalidate_mapping_pages(mapping, start_index, end_index-1);
+ if (end_index >= start_index)
+ invalidate_mapping_pages(mapping, start_index,
+ end_index);
+ break;
+ case LINUX_FADV_ASYNC_WRITE:
+ ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
+ break;
+ case LINUX_FADV_WRITE_WAIT:
+ ret = wait_on_page_writeback_range(mapping,
+ offset >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
break;
default:
ret = -EINVAL;
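
The comment block added above describes how the two new advice values are meant to be combined from user space. Below is a minimal sketch of the flush-then-wait sequence it describes, assuming the LINUX_FADV_* constants are visible to user space (e.g. via <linux/fadvise.h>) and can be passed through the ordinary posix_fadvise() wrapper:

#include <fcntl.h>
#include <linux/fadvise.h>	/* assumed to provide LINUX_FADV_* */

/* Push every currently dirty page in the given byte range to disk and
 * wait for it.  As the comment above notes, file metadata is not
 * written, so this only gives durability for overwrites of blocks that
 * are already allocated on disk. */
static int flush_range_and_wait(int fd, off_t offset, off_t len)
{
	int ret;

	/* wait for any writeout that is already in flight */
	ret = posix_fadvise(fd, offset, len, LINUX_FADV_WRITE_WAIT);
	if (ret)
		return ret;
	/* start writeout of everything that is dirty right now */
	ret = posix_fadvise(fd, offset, len, LINUX_FADV_ASYNC_WRITE);
	if (ret)
		return ret;
	/* wait for that writeout to complete */
	return posix_fadvise(fd, offset, len, LINUX_FADV_WRITE_WAIT);
}
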
diff --git a/mm/filemap.c b/mm/filemap.c
index 7624c26fcea..1120338a5d0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,10 @@
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/cpuset.h>
#include "filemap.h"
+#include "internal.h"
+
/*
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
@@ -172,7 +175,7 @@ static int sync_page(void *word)
* dirty pages that lie within the byte offsets <start, end>
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
- * @end: offset in bytes where the range ends
+ * @end: offset in bytes where the range ends (inclusive)
* @sync_mode: enable synchronous operation
*
* If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
@@ -180,8 +183,8 @@ static int sync_page(void *word)
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
*/
-static int __filemap_fdatawrite_range(struct address_space *mapping,
- loff_t start, loff_t end, int sync_mode)
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end, int sync_mode)
{
int ret;
struct writeback_control wbc = {
@@ -210,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_fdatawrite);
-static int filemap_fdatawrite_range(struct address_space *mapping,
- loff_t start, loff_t end)
+static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end)
{
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
@@ -230,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush);
* Wait for writeback to complete against pages indexed by start->end
* inclusive
*/
-static int wait_on_page_writeback_range(struct address_space *mapping,
+int wait_on_page_writeback_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
@@ -365,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_write_and_wait);
+/*
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that `lend' is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (end = -1).
+ */
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
@@ -425,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
return ret;
}
+#ifdef CONFIG_NUMA
+struct page *page_cache_alloc(struct address_space *x)
+{
+ if (cpuset_do_page_mem_spread()) {
+ int n = cpuset_mem_spread_node();
+ return alloc_pages_node(n, mapping_gfp_mask(x), 0);
+ }
+ return alloc_pages(mapping_gfp_mask(x), 0);
+}
+EXPORT_SYMBOL(page_cache_alloc);
+
+struct page *page_cache_alloc_cold(struct address_space *x)
+{
+ if (cpuset_do_page_mem_spread()) {
+ int n = cpuset_mem_spread_node();
+ return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
+ }
+ return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
+}
+EXPORT_SYMBOL(page_cache_alloc_cold);
+#endif
+
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
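
The new kernel-doc wording above makes the inclusive-end convention explicit: `lend` names the last byte to be written, so end-of-file can be expressed without knowing the file size. A small illustrative sketch of what that convention looks like at a call site (not taken from this patch):

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Write back and wait on the whole file: as the comment above points
 * out, -1 is the largest possible inclusive end offset. */
static int sync_whole_mapping(struct address_space *mapping)
{
	return filemap_write_and_wait_range(mapping, 0, -1);
}

/* Write back and wait on a single page-cache page. */
static int sync_one_page(struct address_space *mapping, pgoff_t index)
{
	loff_t start = (loff_t)index << PAGE_CACHE_SHIFT;

	return filemap_write_and_wait_range(mapping, start,
					    start + PAGE_CACHE_SIZE - 1);
}
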
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8bbfa..55885f64af4 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,18 +26,14 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
+#include <linux/blktrace_api.h>
#include <asm/tlbflush.h>
static mempool_t *page_pool, *isa_page_pool;
-static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data)
+static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
{
- return alloc_page(gfp_mask | GFP_DMA);
-}
-
-static void page_pool_free(void *page, void *data)
-{
- __free_page(page);
+ return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
}
/*
@@ -50,11 +46,6 @@ static void page_pool_free(void *page, void *data)
*/
#ifdef CONFIG_HIGHMEM
-static void *page_pool_alloc(gfp_t gfp_mask, void *data)
-{
- return alloc_page(gfp_mask);
-}
-
static int pkmap_count[LAST_PKMAP];
static unsigned int last_pkmap_nr;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
@@ -228,7 +219,7 @@ static __init int init_emergency_pool(void)
if (!i.totalhigh)
return 0;
- page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
+ page_pool = mempool_create_page_pool(POOL_SIZE, 0);
if (!page_pool)
BUG();
printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
@@ -271,7 +262,8 @@ int init_emergency_isa_pool(void)
if (isa_page_pool)
return 0;
- isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL);
+ isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
+ mempool_free_pages, (void *) 0);
if (!isa_page_pool)
BUG();
@@ -336,7 +328,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
bio_put(bio);
}
-static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err)
+static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
{
if (bio->bi_size)
return 1;
@@ -383,7 +375,7 @@ static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int
}
static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
- mempool_t *pool)
+ mempool_t *pool)
{
struct page *page;
struct bio *bio = NULL;
@@ -483,6 +475,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
pool = isa_page_pool;
}
+ blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
/*
* slow path
*/
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d2..ebad6bbb350 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,24 +13,48 @@
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
+#include <linux/mutex.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <linux/hugetlb.h>
+#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
-
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
static DEFINE_SPINLOCK(hugetlb_lock);
+static void clear_huge_page(struct page *page, unsigned long addr)
+{
+ int i;
+
+ might_sleep();
+ for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
+ cond_resched();
+ clear_user_highpage(page + i, addr);
+ }
+}
+
+static void copy_huge_page(struct page *dst, struct page *src,
+ unsigned long addr)
+{
+ int i;
+
+ might_sleep();
+ for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
+ cond_resched();
+ copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+ }
+}
+
static void enqueue_huge_page(struct page *page)
{
int nid = page_to_nid(page);
@@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
return page;
}
-static struct page *alloc_fresh_huge_page(void)
+static void free_huge_page(struct page *page)
+{
+ BUG_ON(page_count(page));
+
+ INIT_LIST_HEAD(&page->lru);
+
+ spin_lock(&hugetlb_lock);
+ enqueue_huge_page(page);
+ spin_unlock(&hugetlb_lock);
+}
+
+static int alloc_fresh_huge_page(void)
{
static int nid = 0;
struct page *page;
page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
- nid = (nid + 1) % num_online_nodes();
+ nid = next_node(nid, node_online_map);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(node_online_map);
if (page) {
+ page[1].lru.next = (void *)free_huge_page; /* dtor */
spin_lock(&hugetlb_lock);
nr_huge_pages++;
nr_huge_pages_node[page_to_nid(page)]++;
spin_unlock(&hugetlb_lock);
+ put_page(page); /* free it into the hugepage allocator */
+ return 1;
}
- return page;
+ return 0;
}
-void free_huge_page(struct page *page)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr)
{
- BUG_ON(page_count(page));
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct page *page;
+ int use_reserve = 0;
+ unsigned long idx;
- INIT_LIST_HEAD(&page->lru);
- page[1].lru.next = NULL; /* reset dtor */
+ spin_lock(&hugetlb_lock);
+
+ if (vma->vm_flags & VM_MAYSHARE) {
+
+ /* idx = radix tree index, i.e. offset into file in
+ * HPAGE_SIZE units */
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+ /* The hugetlbfs specific inode info stores the number
+ * of "guaranteed available" (huge) pages. That is,
+ * the first 'prereserved_hpages' pages of the inode
+ * are either already instantiated, or have been
+ * pre-reserved (by hugetlb_reserve_for_inode()). Here
+ * we're in the process of instantiating the page, so
+ * we use this to determine whether to draw from the
+ * pre-reserved pool or the truly free pool. */
+ if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+ use_reserve = 1;
+ }
+
+ if (!use_reserve) {
+ if (free_huge_pages <= reserved_huge_pages)
+ goto fail;
+ } else {
+ BUG_ON(reserved_huge_pages == 0);
+ reserved_huge_pages--;
+ }
+
+ page = dequeue_huge_page(vma, addr);
+ if (!page)
+ goto fail;
+
+ spin_unlock(&hugetlb_lock);
+ set_page_refcounted(page);
+ return page;
+
+ fail:
+ WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+ spin_unlock(&hugetlb_lock);
+ return NULL;
+}
+
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode. If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+ unsigned long atleast)
+{
+ struct inode *inode = &info->vfs_inode;
+ unsigned long change_in_reserve = 0;
+ int ret = 0;
spin_lock(&hugetlb_lock);
- enqueue_huge_page(page);
+ read_lock_irq(&inode->i_mapping->tree_lock);
+
+ if (info->prereserved_hpages >= atleast)
+ goto out;
+
+ /* Because we always call this on shared mappings, none of the
+ * pages beyond info->prereserved_hpages can have been
+ * instantiated, so we need to reserve all of them now. */
+ change_in_reserve = atleast - info->prereserved_hpages;
+
+ if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ reserved_huge_pages += change_in_reserve;
+ info->prereserved_hpages = atleast;
+
+ out:
+ read_unlock_irq(&inode->i_mapping->tree_lock);
spin_unlock(&hugetlb_lock);
+
+ return ret;
}
-struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool. If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
+ * them.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+ unsigned long atmost)
{
+ struct inode *inode = &info->vfs_inode;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long idx;
+ unsigned long change_in_reserve = 0;
struct page *page;
- int i;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page(vma, addr);
- if (!page) {
- spin_unlock(&hugetlb_lock);
- return NULL;
+ read_lock_irq(&inode->i_mapping->tree_lock);
+
+ if (info->prereserved_hpages <= atmost)
+ goto out;
+
+ /* Count pages which were reserved, but not instantiated, and
+ * which we can now release. */
+ for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+ page = radix_tree_lookup(&mapping->page_tree, idx);
+ if (!page)
+ /* Pages which are already instantiated can't
+ * be unreserved (and in fact have already
+ * been removed from the reserved pool) */
+ change_in_reserve++;
}
+
+ BUG_ON(reserved_huge_pages < change_in_reserve);
+ reserved_huge_pages -= change_in_reserve;
+ info->prereserved_hpages = atmost;
+
+ out:
+ read_unlock_irq(&inode->i_mapping->tree_lock);
spin_unlock(&hugetlb_lock);
- set_page_count(page, 1);
- page[1].lru.next = (void *)free_huge_page; /* set dtor */
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_user_highpage(&page[i], addr);
- return page;
}
static int __init hugetlb_init(void)
{
unsigned long i;
- struct page *page;
if (HPAGE_SHIFT == 0)
return 0;
@@ -123,12 +266,8 @@ static int __init hugetlb_init(void)
INIT_LIST_HEAD(&hugepage_freelists[i]);
for (i = 0; i < max_huge_pages; ++i) {
- page = alloc_fresh_huge_page();
- if (!page)
+ if (!alloc_fresh_huge_page())
break;
- spin_lock(&hugetlb_lock);
- enqueue_huge_page(page);
- spin_unlock(&hugetlb_lock);
}
max_huge_pages = free_huge_pages = nr_huge_pages = i;
printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page)
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1<< PG_writeback);
- set_page_count(&page[i], 0);
}
- set_page_count(page, 1);
+ page[1].lru.next = NULL;
+ set_page_refcounted(page);
__free_pages(page, HUGETLB_PAGE_ORDER);
}
@@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count)
static unsigned long set_max_huge_pages(unsigned long count)
{
while (count > nr_huge_pages) {
- struct page *page = alloc_fresh_huge_page();
- if (!page)
+ if (!alloc_fresh_huge_page())
return nr_huge_pages;
- spin_lock(&hugetlb_lock);
- enqueue_huge_page(page);
- spin_unlock(&hugetlb_lock);
}
if (count >= nr_huge_pages)
return nr_huge_pages;
@@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf)
return sprintf(buf,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
+ "HugePages_Rsvd: %5lu\n"
"Hugepagesize: %5lu kB\n",
nr_huge_pages,
free_huge_pages,
+ reserved_huge_pages,
HPAGE_SIZE/1024);
}
@@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
nid, free_huge_pages_node[nid]);
}
-int is_hugepage_mem_enough(size_t size)
-{
- return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
-
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
@@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte)
{
struct page *old_page, *new_page;
- int i, avoidcopy;
+ int avoidcopy;
old_page = pte_page(pte);
@@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
}
spin_unlock(&mm->page_table_lock);
- for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
- copy_user_highpage(new_page + i, old_page + i,
- address + i*PAGE_SIZE);
+ copy_huge_page(new_page, old_page, address);
spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -442,6 +572,7 @@ retry:
ret = VM_FAULT_OOM;
goto out;
}
+ clear_huge_page(page, address);
if (vma->vm_flags & VM_SHARED) {
int err;
@@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *ptep;
pte_t entry;
int ret;
+ static DEFINE_MUTEX(hugetlb_instantiation_mutex);
ptep = huge_pte_alloc(mm, address);
if (!ptep)
return VM_FAULT_OOM;
+ /*
+ * Serialize hugepage allocation and instantiation, so that we don't
+ * get spurious allocation failures if two CPUs race to instantiate
+ * the same page in the page cache.
+ */
+ mutex_lock(&hugetlb_instantiation_mutex);
entry = *ptep;
- if (pte_none(entry))
- return hugetlb_no_page(mm, vma, address, ptep, write_access);
+ if (pte_none(entry)) {
+ ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
+ mutex_unlock(&hugetlb_instantiation_mutex);
+ return ret;
+ }
ret = VM_FAULT_MINOR;
@@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (write_access && !pte_write(entry))
ret = hugetlb_cow(mm, vma, address, ptep, entry);
spin_unlock(&mm->page_table_lock);
+ mutex_unlock(&hugetlb_instantiation_mutex);
return ret;
}
@@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i)
{
- unsigned long vpfn, vaddr = *position;
+ unsigned long pfn_offset;
+ unsigned long vaddr = *position;
int remainder = *length;
- vpfn = vaddr/PAGE_SIZE;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
@@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
- if (pages) {
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
- get_page(page);
- pages[i] = page;
- }
+ pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
+ page = pte_page(*pte);
+same_page:
+ get_page(page);
+ if (pages)
+ pages[i] = page + pfn_offset;
if (vmas)
vmas[i] = vma;
vaddr += PAGE_SIZE;
- ++vpfn;
+ ++pfn_offset;
--remainder;
++i;
+ if (vaddr < vma->vm_end && remainder &&
+ pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+ /*
+ * We use pfn_offset to avoid touching the pageframes
+ * of this compound page.
+ */
+ goto same_page;
+ }
}
spin_unlock(&mm->page_table_lock);
*length = remainder;
@@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
return i;
}
+
+void hugetlb_change_protection(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end, pgprot_t newprot)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long start = address;
+ pte_t *ptep;
+ pte_t pte;
+
+ BUG_ON(address >= end);
+ flush_cache_range(vma, address, end);
+
+ spin_lock(&mm->page_table_lock);
+ for (; address < end; address += HPAGE_SIZE) {
+ ptep = huge_pte_offset(mm, address);
+ if (!ptep)
+ continue;
+ if (!pte_none(*ptep)) {
+ pte = huge_ptep_get_and_clear(mm, address, ptep);
+ pte = pte_mkhuge(pte_modify(pte, newprot));
+ set_huge_pte_at(mm, address, ptep, pte);
+ lazy_mmu_prot_update(pte);
+ }
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ flush_tlb_range(vma, start, end);
+}
+
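The index computed in alloc_huge_page() above is the key to the reservation check: it converts a faulting user address into the file offset of the huge page, measured in HPAGE_SIZE units, so it can be compared against prereserved_hpages. Here is the same arithmetic written out on its own, as a restatement of the two lines in the hunk above rather than new kernel code:

#include <linux/mm.h>
#include <linux/hugetlb.h>

/* File index (in huge pages) of the huge page that covers 'addr'.
 * Example: with 2MB huge pages, a mapping whose vm_pgoff is 0 that
 * faults 5MB past vm_start yields index 2. */
static unsigned long huge_page_index(struct vm_area_struct *vma,
				     unsigned long addr)
{
	/* offset of the fault within the vma, in huge pages ... */
	unsigned long idx = (addr - vma->vm_start) >> HPAGE_SHIFT;

	/* ... plus the vma's starting offset in the file; vm_pgoff is
	 * kept in PAGE_SIZE units, hence the extra shift */
	idx += vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);

	return idx;
}
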
diff --git a/mm/internal.h b/mm/internal.h
index 17256bb2f4e..d20e3cc4aef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -8,23 +8,33 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+#ifndef __MM_INTERNAL_H
+#define __MM_INTERNAL_H
-static inline void set_page_refs(struct page *page, int order)
+#include <linux/mm.h>
+
+static inline void set_page_count(struct page *page, int v)
+{
+ atomic_set(&page->_count, v);
+}
+
+/*
+ * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * a count of one.
+ */
+static inline void set_page_refcounted(struct page *page)
{
-#ifdef CONFIG_MMU
+ BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
+ BUG_ON(atomic_read(&page->_count));
set_page_count(page, 1);
-#else
- int i;
+}
- /*
- * We need to reference all the pages for this order, otherwise if
- * anyone accesses one of the pages with (get/put) it will be freed.
- * - eg: access_process_vm()
- */
- for (i = 0; i < (1 << order); i++)
- set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
+static inline void __put_page(struct page *page)
+{
+ atomic_dec(&page->_count);
}
extern void fastcall __init __free_pages_bootmem(struct page *page,
unsigned int order);
+
+#endif
diff --git a/mm/memory.c b/mm/memory.c
index 85e80a57db2..8d8f52569f3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
anon_vma_unlink(vma);
unlink_file_vma(vma);
- if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
+ if (is_vm_hugetlb_page(vma)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
} else {
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
* Optimization: gather nearby vmas into one call down
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
- && !is_hugepage_only_range(vma->vm_mm, next->vm_start,
- HPAGE_SIZE)) {
+ && !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
anon_vma_unlink(vma);
@@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
{
unsigned long pfn = pte_pfn(pte);
- if (vma->vm_flags & VM_PFNMAP) {
+ if (unlikely(vma->vm_flags & VM_PFNMAP)) {
unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
@@ -401,8 +400,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
* we should just do "return pfn_to_page(pfn)", but
* in the meantime we check that we get a valid pfn,
* and that the resulting page looks ok.
- *
- * Remove this test eventually!
*/
if (unlikely(!pfn_valid(pfn))) {
print_bad_pte(vma, pte, addr);
@@ -1074,6 +1071,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
if (pages) {
pages[i] = page;
+
+ flush_anon_page(page, start);
flush_dcache_page(page);
}
if (vmas)
@@ -1221,9 +1220,7 @@ out:
* The page has to be a nice clean _individual_ kernel allocation.
* If you allocate a compound page, you need to have marked it as
* such (__GFP_COMP), or manually just split the page up yourself
- * (which is mainly an issue of doing "set_page_count(page, 1)" for
- * each sub-page, and then freeing them one by one when you free
- * them rather than freeing it as a compound page).
+ * (see split_page()).
*
* NOTE! Traditionally this was done with "remap_pfn_range()" which
* took an arbitrary page protection parameter. This doesn't allow
@@ -2357,10 +2354,8 @@ int make_pages_present(unsigned long addr, unsigned long end)
if (!vma)
return -1;
write = (vma->vm_flags & VM_WRITE) != 0;
- if (addr >= end)
- BUG();
- if (end > vma->vm_end)
- BUG();
+ BUG_ON(addr >= end);
+ BUG_ON(end > vma->vm_end);
len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
ret = get_user_pages(current, current->mm, addr,
len, write, 0, NULL, NULL);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b21869a39f0..dec8249e972 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/migrate.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -95,11 +96,8 @@
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
-/* The number of pages to migrate per call to migrate_pages() */
-#define MIGRATE_CHUNK_SIZE 256
-
-static kmem_cache_t *policy_cache;
-static kmem_cache_t *sn_cache;
+static struct kmem_cache *policy_cache;
+static struct kmem_cache *sn_cache;
#define PDprintk(fmt...)
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
struct vm_area_struct *first, *vma, *prev;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- /* Must have swap device for migration */
- if (nr_swap_pages <= 0)
- return ERR_PTR(-ENODEV);
- /*
- * Clear the LRU lists so pages can be isolated.
- * Note that pages may be moved off the LRU after we have
- * drained them. Those pages will fail to migrate like other
- * pages that may be busy.
- */
- lru_add_drain_all();
+ err = migrate_prep();
+ if (err)
+ return ERR_PTR(err);
}
first = find_vma(mm, start);
@@ -431,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
return mpol_check_policy(mode, nodes);
}
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy. Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+ if (p->mempolicy)
+ p->flags |= PF_MEMPOLICY;
+ else
+ p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+ mpol_fix_fork_child_flag(current);
+}
+
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
@@ -443,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
+ mpol_set_task_struct_flag();
if (new && new->policy == MPOL_INTERLEAVE)
current->il_next = first_node(new->v.nodes);
return 0;
@@ -550,92 +573,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
return err;
}
+#ifdef CONFIG_MIGRATION
/*
* page migration
*/
-
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
/*
* Avoid migrating a page that is shared with others.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
- if (isolate_lru_page(page))
- list_add_tail(&page->lru, pagelist);
- }
-}
-
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-static int migrate_pages_to(struct list_head *pagelist,
- struct vm_area_struct *vma, int dest)
-{
- LIST_HEAD(newlist);
- LIST_HEAD(moved);
- LIST_HEAD(failed);
- int err = 0;
- unsigned long offset = 0;
- int nr_pages;
- struct page *page;
- struct list_head *p;
-
-redo:
- nr_pages = 0;
- list_for_each(p, pagelist) {
- if (vma) {
- /*
- * The address passed to alloc_page_vma is used to
- * generate the proper interleave behavior. We fake
- * the address here by an increasing offset in order
- * to get the proper distribution of pages.
- *
- * No decision has been made as to which page
- * a certain old page is moved to so we cannot
- * specify the correct address.
- */
- page = alloc_page_vma(GFP_HIGHUSER, vma,
- offset + vma->vm_start);
- offset += PAGE_SIZE;
- }
- else
- page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
- list_add_tail(&page->lru, &newlist);
- nr_pages++;
- if (nr_pages > MIGRATE_CHUNK_SIZE)
- break;
- }
- err = migrate_pages(pagelist, &newlist, &moved, &failed);
-
- putback_lru_pages(&moved); /* Call release pages instead ?? */
-
- if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
- goto redo;
-out:
- /* Return leftover allocated pages */
- while (!list_empty(&newlist)) {
- page = list_entry(newlist.next, struct page, lru);
- list_del(&page->lru);
- __free_page(page);
- }
- list_splice(&failed, pagelist);
- if (err < 0)
- return err;
-
- /* Calculate number of leftover pages */
- nr_pages = 0;
- list_for_each(p, pagelist)
- nr_pages++;
- return nr_pages;
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
+ isolate_lru_page(page, pagelist);
}
/*
@@ -742,8 +691,23 @@ int do_migrate_pages(struct mm_struct *mm,
if (err < 0)
return err;
return busy;
+
}
+#else
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+}
+
+int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+ return -ENOSYS;
+}
+#endif
+
long do_mbind(unsigned long start, unsigned long len,
unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
@@ -808,6 +772,7 @@ long do_mbind(unsigned long start, unsigned long len,
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
}
+
if (!list_empty(&pagelist))
putback_lru_pages(&pagelist);
@@ -947,7 +912,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
/*
* Check if this process has the right to modify the specified
* process. The right exists if the process has administrative
- * capabilities, superuser priviledges or the same
+ * capabilities, superuser privileges or the same
* userid as the target process.
*/
if ((current->euid != task->suid) && (current->euid != task->uid) &&
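
The new PF_MEMPOLICY task flag exists so that allocation hot paths can decide with a single bit test whether any mempolicy handling is needed at all, instead of dereferencing current->mempolicy on every allocation, as the comment above explains. A schematic sketch of that pattern follows; policy_slow_path() and policy_aware_alloc() are hypothetical names used only for illustration:

#include <linux/sched.h>
#include <linux/slab.h>

static void *policy_slow_path(struct kmem_cache *cachep, gfp_t flags);	/* hypothetical */

static void *policy_aware_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	/* one test of a cache-hot word instead of a pointer chase on
	 * every allocation */
	if (unlikely(current->flags & PF_MEMPOLICY))
		return policy_slow_path(cachep, flags);

	return kmem_cache_alloc(cachep, flags);
}
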
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a99b80480d..fe6e05289cc 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -183,8 +183,8 @@ EXPORT_SYMBOL(mempool_resize);
*/
void mempool_destroy(mempool_t *pool)
{
- if (pool->curr_nr != pool->min_nr)
- BUG(); /* There were outstanding elements */
+ /* Check for outstanding elements */
+ BUG_ON(pool->curr_nr != pool->min_nr);
free_pool(pool);
}
EXPORT_SYMBOL(mempool_destroy);
@@ -278,14 +278,56 @@ EXPORT_SYMBOL(mempool_free);
*/
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
{
- kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+ struct kmem_cache *mem = pool_data;
return kmem_cache_alloc(mem, gfp_mask);
}
EXPORT_SYMBOL(mempool_alloc_slab);
void mempool_free_slab(void *element, void *pool_data)
{
- kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+ struct kmem_cache *mem = pool_data;
kmem_cache_free(mem, element);
}
EXPORT_SYMBOL(mempool_free_slab);
+
+/*
+ * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
+ * specified by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t)(long)pool_data;
+ return kmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kmalloc);
+
+void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t) pool_data;
+ return kzalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kzalloc);
+
+void mempool_kfree(void *element, void *pool_data)
+{
+ kfree(element);
+}
+EXPORT_SYMBOL(mempool_kfree);
+
+/*
+ * A simple mempool-backed page allocator that allocates pages
+ * of the order specified by pool_data.
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
+{
+ int order = (int)(long)pool_data;
+ return alloc_pages(gfp_mask, order);
+}
+EXPORT_SYMBOL(mempool_alloc_pages);
+
+void mempool_free_pages(void *element, void *pool_data)
+{
+ int order = (int)(long)pool_data;
+ __free_pages(element, order);
+}
+EXPORT_SYMBOL(mempool_free_pages);
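
The new helpers give mempool users ready-made alloc/free callbacks instead of each caller writing its own trivial wrappers; the highmem.c hunk above already switches to mempool_create_page_pool() and mempool_alloc_pages()/mempool_free_pages(). A usage sketch for the kmalloc pair, where the pool size and element size are illustrative values only:

#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/slab.h>

#define MY_ELEM_SIZE	256	/* illustrative element size */
#define MY_MIN_NR	4	/* illustrative minimum pool size */

static mempool_t *my_pool;

static int __init my_pool_init(void)
{
	/* a pool that guarantees MY_MIN_NR kmalloc'ed buffers of
	 * MY_ELEM_SIZE bytes; the size travels via pool_data */
	my_pool = mempool_create(MY_MIN_NR, mempool_kmalloc,
				 mempool_kfree,
				 (void *)(long)MY_ELEM_SIZE);
	return my_pool ? 0 : -ENOMEM;
}
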
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 00000000000..09f6e4aa87f
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,655 @@
+/*
+ * Memory Migration functionality - linux/mm/migration.c
+ *
+ * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
+ *
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter <clameter@sgi.com>
+ */
+
+#include <linux/migrate.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h> /* for try_to_release_page(),
+ buffer_heads_over_limit */
+#include <linux/mm_inline.h>
+#include <linux/pagevec.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+#include "internal.h"
+
+/* The maximum number of pages to take off the LRU for migration */
+#define MIGRATE_CHUNK_SIZE 256
+
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+
+/*
+ * Isolate one page from the LRU lists. If successful put it onto
+ * the indicated list with elevated page count.
+ *
+ * Result:
+ * -EBUSY: page not on LRU list
+ * 0: page removed from LRU list and added to the specified list.
+ */
+int isolate_lru_page(struct page *page, struct list_head *pagelist)
+{
+ int ret = -EBUSY;
+
+ if (PageLRU(page)) {
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ if (PageLRU(page)) {
+ ret = 0;
+ get_page(page);
+ ClearPageLRU(page);
+ if (PageActive(page))
+ del_page_from_active_list(zone, page);
+ else
+ del_page_from_inactive_list(zone, page);
+ list_add_tail(&page->lru, pagelist);
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ }
+ return ret;
+}
+
+/*
+ * migrate_prep() needs to be called after we have compiled the list of pages
+ * to be migrated using isolate_lru_page() but before we begin a series of calls
+ * to migrate_pages().
+ */
+int migrate_prep(void)
+{
+ /* Must have swap device for migration */
+ if (nr_swap_pages <= 0)
+ return -ENODEV;
+
+ /*
+ * Clear the LRU lists so pages can be isolated.
+ * Note that pages may be moved off the LRU after we have
+ * drained them. Those pages will fail to migrate like other
+ * pages that may be busy.
+ */
+ lru_add_drain_all();
+
+ return 0;
+}
+
+static inline void move_to_lru(struct page *page)
+{
+ list_del(&page->lru);
+ if (PageActive(page)) {
+ /*
+ * lru_cache_add_active checks that
+ * the PG_active bit is off.
+ */
+ ClearPageActive(page);
+ lru_cache_add_active(page);
+ } else {
+ lru_cache_add(page);
+ }
+ put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU.
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+ struct page *page;
+ struct page *page2;
+ int count = 0;
+
+ list_for_each_entry_safe(page, page2, l, lru) {
+ move_to_lru(page);
+ count++;
+ }
+ return count;
+}
+
+/*
+ * Non migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+ return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ */
+static int swap_page(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+
+ if (page_mapped(page) && mapping)
+ if (try_to_unmap(page, 1) != SWAP_SUCCESS)
+ goto unlock_retry;
+
+ if (PageDirty(page)) {
+ /* Page is dirty, try to write it out here */
+ switch(pageout(page, mapping)) {
+ case PAGE_KEEP:
+ case PAGE_ACTIVATE:
+ goto unlock_retry;
+
+ case PAGE_SUCCESS:
+ goto retry;
+
+ case PAGE_CLEAN:
+ ; /* try to free the page below */
+ }
+ }
+
+ if (PagePrivate(page)) {
+ if (!try_to_release_page(page, GFP_KERNEL) ||
+ (!mapping && page_count(page) == 1))
+ goto unlock_retry;
+ }
+
+ if (remove_mapping(mapping, page)) {
+ /* Success */
+ unlock_page(page);
+ return 0;
+ }
+
+unlock_retry:
+ unlock_page(page);
+
+retry:
+ return -EAGAIN;
+}
+EXPORT_SYMBOL(swap_page);
+
+/*
+ * Remove references for a page and establish the new page with the correct
+ * basic settings to be able to stop accesses to the page.
+ */
+int migrate_page_remove_references(struct page *newpage,
+ struct page *page, int nr_refs)
+{
+ struct address_space *mapping = page_mapping(page);
+ struct page **radix_pointer;
+
+ /*
+ * Avoid doing any of the following work if the page count
+ * indicates that the page is in use or truncate has removed
+ * the page.
+ */
+ if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+ return -EAGAIN;
+
+ /*
+ * Establish swap ptes for anonymous pages or destroy pte
+ * maps for files.
+ *
+ * In order to reestablish file backed mappings the fault handlers
+ * will take the radix tree_lock which may then be used to stop
+ * processes from accessing this page until the new page is ready.
+ *
+ * A process accessing via a swap pte (an anonymous page) will take a
+ * page_lock on the old page which will block the process until the
+ * migration attempt is complete. At that time the PageSwapCache bit
+ * will be examined. If the page was migrated then the PageSwapCache
+ * bit will be clear and the operation to retrieve the page will be
+ * retried which will find the new page in the radix tree. Then a new
+ * direct mapping may be generated based on the radix tree contents.
+ *
+ * If the page was not migrated then the PageSwapCache bit
+ * is still set and the operation may continue.
+ */
+ if (try_to_unmap(page, 1) == SWAP_FAIL)
+ /* A vma has VM_LOCKED set -> permanent failure */
+ return -EPERM;
+
+ /*
+ * Give up if we were unable to remove all mappings.
+ */
+ if (page_mapcount(page))
+ return -EAGAIN;
+
+ write_lock_irq(&mapping->tree_lock);
+
+ radix_pointer = (struct page **)radix_tree_lookup_slot(
+ &mapping->page_tree,
+ page_index(page));
+
+ if (!page_mapping(page) || page_count(page) != nr_refs ||
+ *radix_pointer != page) {
+ write_unlock_irq(&mapping->tree_lock);
+ return 1;
+ }
+
+ /*
+ * Now we know that no one else is looking at the page.
+ *
+ * Certain minimal information about a page must be available
+ * in order for other subsystems to properly handle the page if they
+ * find it through the radix tree update before we are finished
+ * copying the page.
+ */
+ get_page(newpage);
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+ if (PageSwapCache(page)) {
+ SetPageSwapCache(newpage);
+ set_page_private(newpage, page_private(page));
+ }
+
+ *radix_pointer = newpage;
+ __put_page(page);
+ write_unlock_irq(&mapping->tree_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_page_remove_references);
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+ copy_highpage(newpage, page);
+
+ if (PageError(page))
+ SetPageError(newpage);
+ if (PageReferenced(page))
+ SetPageReferenced(newpage);
+ if (PageUptodate(page))
+ SetPageUptodate(newpage);
+ if (PageActive(page))
+ SetPageActive(newpage);
+ if (PageChecked(page))
+ SetPageChecked(newpage);
+ if (PageMappedToDisk(page))
+ SetPageMappedToDisk(newpage);
+
+ if (PageDirty(page)) {
+ clear_page_dirty_for_io(page);
+ set_page_dirty(newpage);
+ }
+
+ ClearPageSwapCache(page);
+ ClearPageActive(page);
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page->mapping = NULL;
+
+ /*
+ * If any waiters have accumulated on the new page then
+ * wake them up.
+ */
+ if (PageWriteback(newpage))
+ end_page_writeback(newpage);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct page *newpage, struct page *page)
+{
+ int rc;
+
+ BUG_ON(PageWriteback(page)); /* Writeback must be complete */
+
+ rc = migrate_page_remove_references(newpage, page, 2);
+
+ if (rc)
+ return rc;
+
+ migrate_page_copy(newpage, page);
+
+ /*
+ * Remove auxiliary swap entries and replace
+ * them with real ptes.
+ *
+ * Note that a real pte entry will allow processes that are not
+ * waiting on the page lock to use the new page via the page tables
+ * before the new page is unlocked.
+ */
+ remove_from_swap(newpage);
+ return 0;
+}
+EXPORT_SYMBOL(migrate_page);
+
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because 'to' has become empty
+ * or no retryable pages exist anymore.
+ *
+ * Return: Number of pages not migrated when "to" ran empty.
+ */
+int migrate_pages(struct list_head *from, struct list_head *to,
+ struct list_head *moved, struct list_head *failed)
+{
+ int retry;
+ int nr_failed = 0;
+ int pass = 0;
+ struct page *page;
+ struct page *page2;
+ int swapwrite = current->flags & PF_SWAPWRITE;
+ int rc;
+
+ if (!swapwrite)
+ current->flags |= PF_SWAPWRITE;
+
+redo:
+ retry = 0;
+
+ list_for_each_entry_safe(page, page2, from, lru) {
+ struct page *newpage = NULL;
+ struct address_space *mapping;
+
+ cond_resched();
+
+ rc = 0;
+ if (page_count(page) == 1)
+ /* page was freed from under us. So we are done. */
+ goto next;
+
+ if (to && list_empty(to))
+ break;
+
+ /*
+ * Skip locked pages during the first two passes to give the
+ * functions holding the lock time to release the page. Later we
+ * use lock_page() to have a higher chance of acquiring the
+ * lock.
+ */
+ rc = -EAGAIN;
+ if (pass > 2)
+ lock_page(page);
+ else
+ if (TestSetPageLocked(page))
+ goto next;
+
+ /*
+ * Only wait on writeback if we have already done a pass where
+ * we may have triggered writeouts for lots of pages.
+ */
+ if (pass > 0) {
+ wait_on_page_writeback(page);
+ } else {
+ if (PageWriteback(page))
+ goto unlock_page;
+ }
+
+ /*
+ * Anonymous pages must have swap cache references otherwise
+ * the information contained in the page maps cannot be
+ * preserved.
+ */
+ if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!add_to_swap(page, GFP_KERNEL)) {
+ rc = -ENOMEM;
+ goto unlock_page;
+ }
+ }
+
+ if (!to) {
+ rc = swap_page(page);
+ goto next;
+ }
+
+ newpage = lru_to_page(to);
+ lock_page(newpage);
+
+ /*
+ * Pages are properly locked and writeback is complete.
+ * Try to migrate the page.
+ */
+ mapping = page_mapping(page);
+ if (!mapping)
+ goto unlock_both;
+
+ if (mapping->a_ops->migratepage) {
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
+ rc = mapping->a_ops->migratepage(newpage, page);
+ goto unlock_both;
+ }
+
+ /*
+ * Default handling if a filesystem does not provide
+ * a migration function. We can only migrate clean
+ * pages so try to write out any dirty pages first.
+ */
+ if (PageDirty(page)) {
+ switch (pageout(page, mapping)) {
+ case PAGE_KEEP:
+ case PAGE_ACTIVATE:
+ goto unlock_both;
+
+ case PAGE_SUCCESS:
+ unlock_page(newpage);
+ goto next;
+
+ case PAGE_CLEAN:
+ ; /* try to migrate the page below */
+ }
+ }
+
+ /*
+ * Buffers are managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+ if (!page_has_buffers(page) ||
+ try_to_release_page(page, GFP_KERNEL)) {
+ rc = migrate_page(newpage, page);
+ goto unlock_both;
+ }
+
+ /*
+ * On early passes with mapped pages simply
+ * retry. There may be a lock held for some
+ * buffers that may go away. Later
+ * swap them out.
+ */
+ if (pass > 4) {
+ /*
+ * Persistently unable to drop buffers..... As a
+ * measure of last resort we fall back to
+ * swap_page().
+ */
+ unlock_page(newpage);
+ newpage = NULL;
+ rc = swap_page(page);
+ goto next;
+ }
+
+unlock_both:
+ unlock_page(newpage);
+
+unlock_page:
+ unlock_page(page);
+
+next:
+ if (rc == -EAGAIN) {
+ retry++;
+ } else if (rc) {
+ /* Permanent failure */
+ list_move(&page->lru, failed);
+ nr_failed++;
+ } else {
+ if (newpage) {
+ /* Successful migration. Return page to LRU */
+ move_to_lru(newpage);
+ }
+ list_move(&page->lru, moved);
+ }
+ }
+ if (retry && pass++ < 10)
+ goto redo;
+
+ if (!swapwrite)
+ current->flags &= ~PF_SWAPWRITE;
+
+ return nr_failed + retry;
+}
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
+ */
+int buffer_migrate_page(struct page *newpage, struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct buffer_head *bh, *head;
+ int rc;
+
+ if (!mapping)
+ return -EAGAIN;
+
+ if (!page_has_buffers(page))
+ return migrate_page(newpage, page);
+
+ head = page_buffers(page);
+
+ rc = migrate_page_remove_references(newpage, page, 3);
+
+ if (rc)
+ return rc;
+
+ bh = head;
+ do {
+ get_bh(bh);
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ ClearPagePrivate(page);
+ set_page_private(newpage, page_private(page));
+ set_page_private(page, 0);
+ put_page(page);
+ get_page(newpage);
+
+ bh = head;
+ do {
+ set_bh_page(bh, newpage, bh_offset(bh));
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ SetPagePrivate(newpage);
+
+ migrate_page_copy(newpage, page);
+
+ bh = head;
+ do {
+ unlock_buffer(bh);
+ put_bh(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return 0;
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+int migrate_pages_to(struct list_head *pagelist,
+ struct vm_area_struct *vma, int dest)
+{
+ LIST_HEAD(newlist);
+ LIST_HEAD(moved);
+ LIST_HEAD(failed);
+ int err = 0;
+ unsigned long offset = 0;
+ int nr_pages;
+ struct page *page;
+ struct list_head *p;
+
+redo:
+ nr_pages = 0;
+ list_for_each(p, pagelist) {
+ if (vma) {
+ /*
+ * The address passed to alloc_page_vma is used to
+ * generate the proper interleave behavior. We fake
+ * the address here by an increasing offset in order
+ * to get the proper distribution of pages.
+ *
+ * No decision has been made as to which page
+ * a certain old page is moved to so we cannot
+ * specify the correct address.
+ */
+ page = alloc_page_vma(GFP_HIGHUSER, vma,
+ offset + vma->vm_start);
+ offset += PAGE_SIZE;
+ }
+ else
+ page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+ list_add_tail(&page->lru, &newlist);
+ nr_pages++;
+ if (nr_pages > MIGRATE_CHUNK_SIZE)
+ break;
+ }
+ err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+ putback_lru_pages(&moved); /* Call release pages instead ?? */
+
+ if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+ goto redo;
+out:
+ /* Return leftover allocated pages */
+ while (!list_empty(&newlist)) {
+ page = list_entry(newlist.next, struct page, lru);
+ list_del(&page->lru);
+ __free_page(page);
+ }
+ list_splice(&failed, pagelist);
+ if (err < 0)
+ return err;
+
+ /* Calculate number of leftover pages */
+ nr_pages = 0;
+ list_for_each(p, pagelist)
+ nr_pages++;
+ return nr_pages;
+}
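
Taken together, the new file expects callers to follow the same sequence mm/mempolicy.c now uses earlier in this diff: prepare, isolate onto a private list, migrate, and put the leftovers back on the LRU. A condensed sketch of that calling convention (the page source and target node are illustrative, and error handling is simplified):

#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/list.h>

/* Move an array of pages to 'dest_node'.  Returns 0, the number of
 * pages that could not be migrated, or a negative error code, following
 * migrate_pages_to()'s return convention. */
static int move_pages_to_node(struct page **pages, int nr, int dest_node)
{
	LIST_HEAD(pagelist);
	int i, ret;

	ret = migrate_prep();		/* check for swap, drain LRU pagevecs */
	if (ret)
		return ret;

	for (i = 0; i < nr; i++)	/* collect candidates off the LRU */
		isolate_lru_page(pages[i], &pagelist);

	/* allocate new pages on dest_node and copy the contents across */
	ret = migrate_pages_to(&pagelist, NULL, dest_node);

	/* whatever could not be moved goes back onto the LRU */
	if (!list_empty(&pagelist))
		putback_lru_pages(&pagelist);

	return ret;
}
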
diff --git a/mm/mmap.c b/mm/mmap.c
index 47556d2b3e9..4f5b5709136 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end);
* If the vma has a ->close operation then the driver probably needs to release
* per-vma resources, so we don't attempt to merge those.
*/
-#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
@@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
const unsigned long stack_flags
= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
-#ifdef CONFIG_HUGETLB
- if (flags & VM_HUGETLB) {
- if (!(flags & VM_DONTCOPY))
- mm->shared_vm += pages;
- return;
- }
-#endif /* CONFIG_HUGETLB */
-
if (file) {
mm->shared_vm += pages;
if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -1048,12 +1040,11 @@ munmap_back:
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
- memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_start = addr;
@@ -1904,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
/*
* create a vma struct for an anonymous mapping
*/
- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
}
- memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_start = addr;
diff --git a/mm/mmzone.c b/mm/mmzone.c
new file mode 100644
index 00000000000..b022370e612
--- /dev/null
+++ b/mm/mmzone.c
@@ -0,0 +1,50 @@
+/*
+ * linux/mm/mmzone.c
+ *
+ * management codes for pgdats and zones.
+ */
+
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+
+struct pglist_data *first_online_pgdat(void)
+{
+ return NODE_DATA(first_online_node);
+}
+
+EXPORT_SYMBOL(first_online_pgdat);
+
+struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
+{
+ int nid = next_online_node(pgdat->node_id);
+
+ if (nid == MAX_NUMNODES)
+ return NULL;
+ return NODE_DATA(nid);
+}
+EXPORT_SYMBOL(next_online_pgdat);
+
+
+/*
+ * next_zone - helper magic for for_each_zone()
+ */
+struct zone *next_zone(struct zone *zone)
+{
+ pg_data_t *pgdat = zone->zone_pgdat;
+
+ if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
+ zone++;
+ else {
+ pgdat = next_online_pgdat(pgdat);
+ if (pgdat)
+ zone = pgdat->node_zones;
+ else
+ zone = NULL;
+ }
+ return zone;
+}
+EXPORT_SYMBOL(next_zone);
+
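Since next_zone() above is the backbone of for_each_zone(), a small user-space sketch of the same two-level walk may help; struct node and struct zone below are simplified stand-ins for pglist_data and the kernel's struct zone, and the "online node" list is hand-built:

#include <stdio.h>

#define MAX_NR_ZONES 3
#define NR_NODES     2

struct zone { const char *name; struct node *node; };
struct node { struct zone zones[MAX_NR_ZONES]; struct node *next_online; };

static struct node nodes[NR_NODES];

static struct zone *first_zone(void)
{
        return &nodes[0].zones[0];
}

static struct zone *next_zone(struct zone *zone)
{
        struct node *node = zone->node;

        if (zone < node->zones + MAX_NR_ZONES - 1)
                return zone + 1;                        /* more zones in this node */
        if (node->next_online)
                return node->next_online->zones;        /* first zone of next node */
        return NULL;                                    /* ran out of nodes */
}

int main(void)
{
        static const char *names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };

        for (int n = 0; n < NR_NODES; n++) {
                nodes[n].next_online = (n + 1 < NR_NODES) ? &nodes[n + 1] : NULL;
                for (int z = 0; z < MAX_NR_ZONES; z++) {
                        nodes[n].zones[z].name = names[z];
                        nodes[n].zones[z].node = &nodes[n];
                }
        }
        for (struct zone *z = first_zone(); z; z = next_zone(z))
                printf("node %ld zone %s\n", (long)(z->node - nodes), z->name);
        return 0;
}
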
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 653b8571c1e..4c14d4289b6 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
* a MAP_NORESERVE private mapping to writable will now reserve.
*/
if (newflags & VM_WRITE) {
- if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
+ if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
charged = nrpages;
if (security_vm_enough_memory(charged))
return -ENOMEM;
@@ -166,7 +166,10 @@ success:
*/
vma->vm_flags = newflags;
vma->vm_page_prot = newprot;
- change_protection(vma, start, end, newprot);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_change_protection(vma, start, end, newprot);
+ else
+ change_protection(vma, start, end, newprot);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
return 0;
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- if (is_vm_hugetlb_page(vma)) {
- error = -EACCES;
- goto out;
- }
-
newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
diff --git a/mm/msync.c b/mm/msync.c
index 3563a56e1a5..bc6c9537636 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -9,20 +9,24 @@
*/
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
+#include <linux/writeback.h>
+#include <linux/file.h>
#include <linux/syscalls.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
-static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end)
{
pte_t *pte;
spinlock_t *ptl;
int progress = 0;
+ unsigned long ret = 0;
again:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -43,58 +47,64 @@ again:
if (!page)
continue;
if (ptep_clear_flush_dirty(vma, addr, pte) ||
- page_test_and_clear_dirty(page))
- set_page_dirty(page);
+ page_test_and_clear_dirty(page))
+ ret += set_page_dirty(page);
progress += 3;
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
if (addr != end)
goto again;
+ return ret;
}
-static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end)
+static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
+ pud_t *pud, unsigned long addr, unsigned long end)
{
pmd_t *pmd;
unsigned long next;
+ unsigned long ret = 0;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- msync_pte_range(vma, pmd, addr, next);
+ ret += msync_pte_range(vma, pmd, addr, next);
} while (pmd++, addr = next, addr != end);
+ return ret;
}
-static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end)
+static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
+ pgd_t *pgd, unsigned long addr, unsigned long end)
{
pud_t *pud;
unsigned long next;
+ unsigned long ret = 0;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- msync_pmd_range(vma, pud, addr, next);
+ ret += msync_pmd_range(vma, pud, addr, next);
} while (pud++, addr = next, addr != end);
+ return ret;
}
-static void msync_page_range(struct vm_area_struct *vma,
+static unsigned long msync_page_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pgd_t *pgd;
unsigned long next;
+ unsigned long ret = 0;
/* For hugepages we can't go walking the page table normally,
* but that's ok, hugetlbfs is memory based, so we don't need
* to do anything more on an msync().
*/
if (vma->vm_flags & VM_HUGETLB)
- return;
+ return 0;
BUG_ON(addr >= end);
pgd = pgd_offset(vma->vm_mm, addr);
@@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- msync_pud_range(vma, pgd, addr, next);
+ ret += msync_pud_range(vma, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
+ return ret;
}
/*
@@ -115,53 +126,31 @@ static void msync_page_range(struct vm_area_struct *vma,
* write out the dirty pages and wait on the writeout and check the result.
* Or the application may run fadvise(FADV_DONTNEED) against the fd to start
* async writeout immediately.
- * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to
+ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
* applications.
*/
-static int msync_interval(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end, int flags)
+static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, int flags,
+ unsigned long *nr_pages_dirtied)
{
- int ret = 0;
struct file *file = vma->vm_file;
if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
return -EBUSY;
- if (file && (vma->vm_flags & VM_SHARED)) {
- msync_page_range(vma, addr, end);
-
- if (flags & MS_SYNC) {
- struct address_space *mapping = file->f_mapping;
- int err;
-
- ret = filemap_fdatawrite(mapping);
- if (file->f_op && file->f_op->fsync) {
- /*
- * We don't take i_mutex here because mmap_sem
- * is already held.
- */
- err = file->f_op->fsync(file,file->f_dentry,1);
- if (err && !ret)
- ret = err;
- }
- err = filemap_fdatawait(mapping);
- if (!ret)
- ret = err;
- }
- }
- return ret;
+ if (file && (vma->vm_flags & VM_SHARED))
+ *nr_pages_dirtied = msync_page_range(vma, addr, end);
+ return 0;
}
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
unsigned long end;
struct vm_area_struct *vma;
- int unmapped_error, error = -EINVAL;
-
- if (flags & MS_SYNC)
- current->flags |= PF_SYNCWRITE;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+ int done = 0;
- down_read(&current->mm->mmap_sem);
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
goto out;
if (start & ~PAGE_MASK)
@@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
* If the interval [start,end) covers some unmapped address ranges,
* just ignore them, but return -ENOMEM at the end.
*/
+ down_read(&current->mm->mmap_sem);
+ if (flags & MS_SYNC)
+ current->flags |= PF_SYNCWRITE;
vma = find_vma(current->mm, start);
- unmapped_error = 0;
- for (;;) {
- /* Still start < end. */
+ if (!vma) {
error = -ENOMEM;
- if (!vma)
- goto out;
+ goto out_unlock;
+ }
+ do {
+ unsigned long nr_pages_dirtied = 0;
+ struct file *file;
+
/* Here start < vma->vm_end. */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
@@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
/* Here vma->vm_start <= start < vma->vm_end. */
if (end <= vma->vm_end) {
if (start < end) {
- error = msync_interval(vma, start, end, flags);
+ error = msync_interval(vma, start, end, flags,
+ &nr_pages_dirtied);
if (error)
- goto out;
+ goto out_unlock;
}
error = unmapped_error;
- goto out;
+ done = 1;
+ } else {
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = msync_interval(vma, start, vma->vm_end, flags,
+ &nr_pages_dirtied);
+ if (error)
+ goto out_unlock;
}
- /* Here vma->vm_start <= start < vma->vm_end < end. */
- error = msync_interval(vma, start, vma->vm_end, flags);
- if (error)
- goto out;
+ file = vma->vm_file;
start = vma->vm_end;
- vma = vma->vm_next;
- }
-out:
- up_read(&current->mm->mmap_sem);
+ if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
+ get_file(file);
+ up_read(&current->mm->mmap_sem);
+ balance_dirty_pages_ratelimited_nr(file->f_mapping,
+ nr_pages_dirtied);
+ fput(file);
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, start);
+ } else if ((flags & MS_SYNC) && file &&
+ (vma->vm_flags & VM_SHARED)) {
+ get_file(file);
+ up_read(&current->mm->mmap_sem);
+ error = do_fsync(file, 0);
+ fput(file);
+ down_read(&current->mm->mmap_sem);
+ if (error)
+ goto out_unlock;
+ vma = find_vma(current->mm, start);
+ } else {
+ vma = vma->vm_next;
+ }
+ } while (vma && !done);
+out_unlock:
current->flags &= ~PF_SYNCWRITE;
+ up_read(&current->mm->mmap_sem);
+out:
return error;
}
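
The rewritten loop above drops mmap_sem before calling do_fsync() or balancing dirty pages and must re-find the vma afterwards, since the address space may have changed while unlocked. A rough user-space sketch of that drop-relock-revalidate pattern, using a pthread rwlock in place of mmap_sem and hypothetical lookup_state()/blocking_io() helpers:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
static int generation;                 /* bumped by (imaginary) concurrent writers */

static int lookup_state(void)  { return generation; }
static void blocking_io(void)  { /* an fsync() call would go here */ }

int main(void)
{
        pthread_rwlock_rdlock(&map_lock);
        int before = lookup_state();

        /* must not block in blocking_io() while holding the read lock */
        pthread_rwlock_unlock(&map_lock);
        blocking_io();
        pthread_rwlock_rdlock(&map_lock);

        /* anything looked up before the unlock is stale: look it up again */
        int after = lookup_state();
        printf("state before=%d after=%d\n", before, after);

        pthread_rwlock_unlock(&map_lock);
        return 0;
}
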
diff --git a/mm/nommu.c b/mm/nommu.c
index 4951f4786f2..db45efac17c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
/*
* kmalloc doesn't like __GFP_HIGHMEM for some reason
*/
- return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM);
+ return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
}
struct page * vmalloc_to_page(void *addr)
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
* - note that this may not return a page-aligned address if the object
* we're allocating is smaller than a page
*/
- base = kmalloc(len, GFP_KERNEL);
+ base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
if (!base)
goto enomem;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 945559fb63d..893d7677579 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40;
* The interval between `kupdate'-style writebacks, in centiseconds
* (hundredths of a second)
*/
-int dirty_writeback_centisecs = 5 * 100;
+int dirty_writeback_interval = 5 * HZ;
/*
* The longest number of centiseconds for which data is allowed to remain dirty
*/
-int dirty_expire_centisecs = 30 * 100;
+int dirty_expire_interval = 30 * HZ;
/*
* Flag that makes the machine dump writes/reads and block dirtyings.
@@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100;
int block_dump;
/*
- * Flag that puts the machine in "laptop mode".
+ * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
+ * a full sync is triggered after this time elapses without any disk activity.
*/
int laptop_mode;
@@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping)
}
/**
- * balance_dirty_pages_ratelimited - balance dirty memory state
+ * balance_dirty_pages_ratelimited_nr - balance dirty memory state
* @mapping: address_space which was dirtied
+ * @nr_pages_dirtied: number of pages which the caller has just dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
@@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping)
* limit we decrease the ratelimiting by a lot, to prevent individual processes
* from overshooting the limit by (ratelimit_pages) each.
*/
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+ unsigned long nr_pages_dirtied)
{
- static DEFINE_PER_CPU(int, ratelimits) = 0;
- long ratelimit;
+ static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
+ unsigned long ratelimit;
+ unsigned long *p;
ratelimit = ratelimit_pages;
if (dirty_exceeded)
@@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
* Check the rate limiting. Also, we do not want to throttle real-time
* tasks in balance_dirty_pages(). Period.
*/
- if (get_cpu_var(ratelimits)++ >= ratelimit) {
- __get_cpu_var(ratelimits) = 0;
- put_cpu_var(ratelimits);
+ preempt_disable();
+ p = &__get_cpu_var(ratelimits);
+ *p += nr_pages_dirtied;
+ if (unlikely(*p >= ratelimit)) {
+ *p = 0;
+ preempt_enable();
balance_dirty_pages(mapping);
return;
}
- put_cpu_var(ratelimits);
+ preempt_enable();
}
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
void throttle_vm_writeout(void)
{
@@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
- * Try to run once per dirty_writeback_centisecs. But if a writeback event
- * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * Try to run once per dirty_writeback_interval. But if a writeback event
+ * takes longer than a dirty_writeback_interval, then leave a
* one-second gap.
*
* older_than_this takes precedence over nr_to_write. So we'll only write back
@@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg)
sync_supers();
get_writeback_state(&wbs);
- oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
+ oldest_jif = jiffies - dirty_expire_interval;
start_jif = jiffies;
- next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
+ next_jif = start_jif + dirty_writeback_interval;
nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0) {
@@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg)
}
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
- if (dirty_writeback_centisecs)
+ if (dirty_writeback_interval)
mod_timer(&wb_timer, next_jif);
}
@@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg)
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length, ppos);
- if (dirty_writeback_centisecs) {
+ proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+ if (dirty_writeback_interval) {
mod_timer(&wb_timer,
- jiffies + (dirty_writeback_centisecs * HZ) / 100);
- } else {
+ jiffies + dirty_writeback_interval);
+ } else {
del_timer(&wb_timer);
}
return 0;
@@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused)
*/
void laptop_io_completion(void)
{
- mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+ mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
}
/*
@@ -544,7 +551,7 @@ void __init page_writeback_init(void)
if (vm_dirty_ratio <= 0)
vm_dirty_ratio = 1;
}
- mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
+ mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
}
@@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page);
*/
int __set_page_dirty_nobuffers(struct page *page)
{
- int ret = 0;
-
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
@@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page)
I_DIRTY_PAGES);
}
}
+ return 1;
}
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page)
return (*spd)(page);
return __set_page_dirty_buffers(page);
}
- if (!PageDirty(page))
- SetPageDirty(page);
+ if (!PageDirty(page)) {
+ if (!TestSetPageDirty(page))
+ return 1;
+ }
return 0;
}
EXPORT_SYMBOL(set_page_dirty);
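
Two pieces above work together: set_page_dirty() now reports whether a page was newly dirtied, and balance_dirty_pages_ratelimited_nr() accumulates those counts per CPU so the expensive balancing step runs only once the ratelimit is crossed. A user-space sketch of that accumulate-then-balance idea, with a thread-local counter standing in for the per-CPU one and made-up numbers:

#include <stdio.h>

static unsigned long ratelimit_pages = 32;
static _Thread_local unsigned long dirtied;   /* plays the per-CPU counter, so no lock needed */

static void balance(void)
{
        printf("balancing dirty memory\n");
}

static void balance_dirty_ratelimited_nr(unsigned long nr_pages_dirtied)
{
        dirtied += nr_pages_dirtied;
        if (dirtied >= ratelimit_pages) {
                dirtied = 0;
                balance();                    /* expensive path, taken rarely */
        }
}

int main(void)
{
        for (int i = 0; i < 100; i++)
                balance_dirty_ratelimited_nr(1);    /* 100 single-page dirtyings */
        balance_dirty_ratelimited_nr(40);           /* one large batch trips it at once */
        return 0;
}
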
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 234bd4895d1..dc523a1f270 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,13 +49,11 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
EXPORT_SYMBOL(node_online_map);
nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
EXPORT_SYMBOL(node_possible_map);
-struct pglist_data *pgdat_list __read_mostly;
unsigned long totalram_pages __read_mostly;
unsigned long totalhigh_pages __read_mostly;
long nr_swap_pages;
int percpu_pagelist_fraction;
-static void fastcall free_hot_cold_page(struct page *page, int cold);
static void __free_pages_ok(struct page *page, unsigned int order);
/*
@@ -190,7 +188,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
- SetPageCompound(p);
+ __SetPageCompound(p);
set_page_private(p, (unsigned long)page);
}
}
@@ -209,10 +207,24 @@ static void destroy_compound_page(struct page *page, unsigned long order)
if (unlikely(!PageCompound(p) |
(page_private(p) != (unsigned long)page)))
bad_page(page);
- ClearPageCompound(p);
+ __ClearPageCompound(p);
}
}
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+{
+ int i;
+
+ BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+ /*
+ * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
+ * and __GFP_HIGHMEM from hard or soft interrupt context.
+ */
+ BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
+ for (i = 0; i < (1 << order); i++)
+ clear_highpage(page + i);
+}
+
/*
* function for dealing with page's order in buddy system.
* zone->lock is already acquired when we use these.
@@ -423,11 +435,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
mutex_debug_check_no_locks_freed(page_address(page),
PAGE_SIZE<<order);
-#ifndef CONFIG_MMU
- for (i = 1 ; i < (1 << order) ; ++i)
- __put_page(page + i);
-#endif
-
for (i = 0 ; i < (1 << order) ; ++i)
reserved += free_pages_check(page + i);
if (reserved)
@@ -448,28 +455,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
if (order == 0) {
__ClearPageReserved(page);
set_page_count(page, 0);
-
- free_hot_cold_page(page, 0);
+ set_page_refcounted(page);
+ __free_page(page);
} else {
- LIST_HEAD(list);
int loop;
+ prefetchw(page);
for (loop = 0; loop < BITS_PER_LONG; loop++) {
struct page *p = &page[loop];
- if (loop + 16 < BITS_PER_LONG)
- prefetchw(p + 16);
+ if (loop + 1 < BITS_PER_LONG)
+ prefetchw(p + 1);
__ClearPageReserved(p);
set_page_count(p, 0);
}
- arch_free_page(page, order);
-
- mod_page_state(pgfree, 1 << order);
-
- list_add(&page->lru, &list);
- kernel_map_pages(page, 1 << order, 0);
- free_pages_bulk(page_zone(page), 1, &list, order);
+ set_page_refcounted(page);
+ __free_pages(page, order);
}
}
@@ -507,7 +509,7 @@ static inline void expand(struct zone *zone, struct page *page,
/*
* This page is about to be returned from the page allocator
*/
-static int prep_new_page(struct page *page, int order)
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
@@ -536,8 +538,15 @@ static int prep_new_page(struct page *page, int order)
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked | 1 << PG_mappedtodisk);
set_page_private(page, 0);
- set_page_refs(page, order);
+ set_page_refcounted(page);
kernel_map_pages(page, 1 << order, 1);
+
+ if (gfp_flags & __GFP_ZERO)
+ prep_zero_page(page, order, gfp_flags);
+
+ if (order && (gfp_flags & __GFP_COMP))
+ prep_compound_page(page, order);
+
return 0;
}
@@ -593,13 +602,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
/*
* Called from the slab reaper to drain pagesets on a particular node that
* belong to the currently executing processor.
+ * Note that this function must be called with the thread pinned to
+ * a single processor.
*/
void drain_node_pages(int nodeid)
{
int i, z;
unsigned long flags;
- local_irq_save(flags);
for (z = 0; z < MAX_NR_ZONES; z++) {
struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
struct per_cpu_pageset *pset;
@@ -609,11 +619,14 @@ void drain_node_pages(int nodeid)
struct per_cpu_pages *pcp;
pcp = &pset->pcp[i];
- free_pages_bulk(zone, pcp->count, &pcp->list, 0);
- pcp->count = 0;
+ if (pcp->count) {
+ local_irq_save(flags);
+ free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ pcp->count = 0;
+ local_irq_restore(flags);
+ }
}
}
- local_irq_restore(flags);
}
#endif
@@ -743,13 +756,22 @@ void fastcall free_cold_page(struct page *page)
free_hot_cold_page(page, 1);
}
-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+/*
+ * split_page takes a non-compound higher-order page, and splits it into
+ * n (1<<order) sub-pages: page[0..n]
+ * Each sub-page must be freed individually.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+void split_page(struct page *page, unsigned int order)
{
int i;
- BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
- for(i = 0; i < (1 << order); i++)
- clear_highpage(page + i);
+ BUG_ON(PageCompound(page));
+ BUG_ON(!page_count(page));
+ for (i = 1; i < (1 << order); i++)
+ set_page_refcounted(page + i);
}
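
What split_page() above changes is only reference counting: the head page of a non-compound higher-order allocation already holds a count, and each tail page gets its own so it can be freed independently. A toy model, where struct page is just a counter and set_page_refcounted() is inlined as an assignment:

#include <assert.h>
#include <stdio.h>

struct page { int count; };

static void split_page(struct page *page, unsigned int order)
{
        assert(page[0].count);                 /* head must already be referenced */
        for (unsigned int i = 1; i < (1u << order); i++)
                page[i].count = 1;             /* give each tail page its own reference */
}

int main(void)
{
        struct page pages[4] = { { .count = 1 } };   /* order-2 block, only the head counted */

        split_page(pages, 2);
        for (int i = 0; i < 4; i++)
                printf("page[%d].count = %d\n", i, pages[i].count);
        return 0;
}
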
/*
@@ -795,14 +817,8 @@ again:
put_cpu();
BUG_ON(bad_range(zone, page));
- if (prep_new_page(page, order))
+ if (prep_new_page(page, order, gfp_flags))
goto again;
-
- if (gfp_flags & __GFP_ZERO)
- prep_zero_page(page, order, gfp_flags);
-
- if (order && (gfp_flags & __GFP_COMP))
- prep_compound_page(page, order);
return page;
failed:
@@ -926,7 +942,8 @@ restart:
goto got_pg;
do {
- wakeup_kswapd(*z, order);
+ if (cpuset_zone_allowed(*z, gfp_mask))
+ wakeup_kswapd(*z, order);
} while (*(++z));
/*
@@ -1183,7 +1200,7 @@ unsigned int nr_free_highpages (void)
pg_data_t *pgdat;
unsigned int pages = 0;
- for_each_pgdat(pgdat)
+ for_each_online_pgdat(pgdat)
pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
return pages;
@@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
{
- int cpu = 0;
+ unsigned cpu;
memset(ret, 0, nr * sizeof(unsigned long));
cpus_and(*cpumask, *cpumask, cpu_online_map);
- cpu = first_cpu(*cpumask);
- while (cpu < NR_CPUS) {
- unsigned long *in, *out, off;
-
- if (!cpu_isset(cpu, *cpumask))
- continue;
+ for_each_cpu_mask(cpu, *cpumask) {
+ unsigned long *in;
+ unsigned long *out;
+ unsigned off;
+ unsigned next_cpu;
in = (unsigned long *)&per_cpu(page_states, cpu);
- cpu = next_cpu(cpu, *cpumask);
-
- if (likely(cpu < NR_CPUS))
- prefetch(&per_cpu(page_states, cpu));
+ next_cpu = next_cpu(cpu, *cpumask);
+ if (likely(next_cpu < NR_CPUS))
+ prefetch(&per_cpu(page_states, next_cpu));
out = (unsigned long *)ret;
for (off = 0; off < nr; off++)
@@ -1327,7 +1342,7 @@ void get_zone_counts(unsigned long *active,
*active = 0;
*inactive = 0;
*free = 0;
- for_each_pgdat(pgdat) {
+ for_each_online_pgdat(pgdat) {
unsigned long l, m, n;
__get_zone_counts(&l, &m, &n, pgdat);
*active += l;
@@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
continue;
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
- set_page_count(page, 1);
+ init_page_count(page);
reset_page_mapcount(page);
SetPageReserved(page);
INIT_LIST_HEAD(&page->lru);
@@ -2013,8 +2028,9 @@ static __meminit void zone_pcp_init(struct zone *zone)
setup_pageset(zone_pcp(zone,cpu), batch);
#endif
}
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone->name, zone->present_pages, batch);
+ if (zone->present_pages)
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
+ zone->name, zone->present_pages, batch);
}
static __meminit void init_currently_empty_zone(struct zone *zone,
@@ -2025,7 +2041,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
zone_wait_table_init(zone, size);
pgdat->nr_zones = zone_idx(zone) + 1;
- zone->zone_mem_map = pfn_to_page(zone_start_pfn);
zone->zone_start_pfn = zone_start_pfn;
memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
@@ -2153,8 +2168,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos)
{
pg_data_t *pgdat;
loff_t node = *pos;
-
- for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
+ for (pgdat = first_online_pgdat();
+ pgdat && node;
+ pgdat = next_online_pgdat(pgdat))
--node;
return pgdat;
@@ -2165,7 +2181,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
pg_data_t *pgdat = (pg_data_t *)arg;
(*pos)++;
- return pgdat->pgdat_next;
+ return next_online_pgdat(pgdat);
}
static void frag_stop(struct seq_file *m, void *arg)
@@ -2466,7 +2482,7 @@ static void setup_per_zone_lowmem_reserve(void)
struct pglist_data *pgdat;
int j, idx;
- for_each_pgdat(pgdat) {
+ for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long present_pages = zone->present_pages;
@@ -2685,8 +2701,7 @@ void *__init alloc_large_system_hash(const char *tablename,
else
numentries <<= (PAGE_SHIFT - scale);
}
- /* rounded up to nearest power of 2 in size */
- numentries = 1UL << (long_log2(numentries) + 1);
+ numentries = roundup_pow_of_two(numentries);
/* limit allocation size to 1/16 total memory by default */
if (max == 0) {
@@ -2729,3 +2744,44 @@ void *__init alloc_large_system_hash(const char *tablename,
return table;
}
+
+#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
+/*
+ * pfn <-> page translation. out-of-line version.
+ * (see asm-generic/memory_model.h)
+ */
+#if defined(CONFIG_FLATMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+ return mem_map + (pfn - ARCH_PFN_OFFSET);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+ return (page - mem_map) + ARCH_PFN_OFFSET;
+}
+#elif defined(CONFIG_DISCONTIGMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+ int nid = arch_pfn_to_nid(pfn);
+ return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+ return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
+}
+#elif defined(CONFIG_SPARSEMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+ return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
+}
+
+unsigned long page_to_pfn(struct page *page)
+{
+ long section_id = page_to_section(page);
+ return page - __section_mem_map_addr(__nr_to_section(section_id));
+}
+#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
+EXPORT_SYMBOL(pfn_to_page);
+EXPORT_SYMBOL(page_to_pfn);
+#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
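
For the FLATMEM variant above, the translation is plain pointer arithmetic against a single mem_map array offset by the architecture's first valid pfn; the sketch below mimics it in user space with made-up values for ARCH_PFN_OFFSET and the map size:

#include <stdio.h>

#define ARCH_PFN_OFFSET 0x100UL          /* hypothetical first valid pfn */
#define NR_PAGES        16

struct page { int dummy; };

static struct page mem_map[NR_PAGES];

static struct page *pfn_to_page(unsigned long pfn)
{
        return mem_map + (pfn - ARCH_PFN_OFFSET);
}

static unsigned long page_to_pfn(struct page *page)
{
        return (unsigned long)(page - mem_map) + ARCH_PFN_OFFSET;
}

int main(void)
{
        unsigned long pfn = ARCH_PFN_OFFSET + 5;
        struct page *page = pfn_to_page(pfn);

        printf("pfn %#lx -> mem_map[%ld] -> pfn %#lx\n",
               pfn, (long)(page - mem_map), page_to_pfn(page));
        return 0;
}
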
diff --git a/mm/readahead.c b/mm/readahead.c
index 9f0b98227b4..ba7db816f4c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -53,13 +53,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
}
+static inline void reset_ahead_window(struct file_ra_state *ra)
+{
+ /*
+ * ... but preserve ahead_start + ahead_size value,
+ * see 'recheck:' label in page_cache_readahead().
+ * Note: We never use ->ahead_size as rvalue without
+ * checking ->ahead_start != 0 first.
+ */
+ ra->ahead_size += ra->ahead_start;
+ ra->ahead_start = 0;
+}
+
static inline void ra_off(struct file_ra_state *ra)
{
ra->start = 0;
ra->flags = 0;
ra->size = 0;
- ra->ahead_start = 0;
- ra->ahead_size = 0;
+ reset_ahead_window(ra);
return;
}
@@ -73,10 +84,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
unsigned long newsize = roundup_pow_of_two(size);
- if (newsize <= max / 64)
- newsize = newsize * newsize;
+ if (newsize <= max / 32)
+ newsize = newsize * 4;
else if (newsize <= max / 4)
- newsize = max / 4;
+ newsize = newsize * 2;
else
newsize = max;
return newsize;
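
The new sizing rule above grows small initial requests 4x, medium ones 2x, and clamps the rest to the maximum, instead of squaring tiny windows. A standalone restatement of that arithmetic, with a simple loop-based roundup_pow_of_two and an assumed 128-page maximum, shows the resulting windows:

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
        unsigned long p = 1;

        while (p < n)
                p <<= 1;
        return p;
}

static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
        unsigned long newsize = roundup_pow_of_two(size);

        if (newsize <= max / 32)
                newsize = newsize * 4;
        else if (newsize <= max / 4)
                newsize = newsize * 2;
        else
                newsize = max;
        return newsize;
}

int main(void)
{
        unsigned long max = 128;                  /* assumed 128-page readahead maximum */

        for (unsigned long size = 1; size <= max; size <<= 2)
                printf("request %3lu pages -> initial window %3lu pages\n",
                       size, get_init_ra_size(size, max));
        return 0;
}
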
@@ -427,8 +438,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
 * congestion. The ahead window will be closed anyway
* in case we failed due to excessive page cache hits.
*/
- ra->ahead_start = 0;
- ra->ahead_size = 0;
+ reset_ahead_window(ra);
}
return ret;
@@ -521,11 +531,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
* If we get here we are doing sequential IO and this was not the first
 * occurrence (i.e. we have an existing window)
*/
-
if (ra->ahead_start == 0) { /* no ahead window yet */
if (!make_ahead_window(mapping, filp, ra, 0))
- goto out;
+ goto recheck;
}
+
/*
* Already have an ahead window, check if we crossed into it.
* If so, shift windows and issue a new ahead window.
@@ -537,11 +547,16 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
ra->start = ra->ahead_start;
ra->size = ra->ahead_size;
make_ahead_window(mapping, filp, ra, 0);
+recheck:
+ /* prev_page shouldn't overrun the ahead window */
+ ra->prev_page = min(ra->prev_page,
+ ra->ahead_start + ra->ahead_size - 1);
}
out:
return ra->prev_page + 1;
}
+EXPORT_SYMBOL_GPL(page_cache_readahead);
/*
* handle_ra_miss() is called when it is known that a page which should have
diff --git a/mm/rmap.c b/mm/rmap.c
index 67f0e20b101..1963e269314 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,13 +56,11 @@
#include <asm/tlbflush.h>
-//#define RMAP_DEBUG /* can be enabled only for debugging */
-
-kmem_cache_t *anon_vma_cachep;
+struct kmem_cache *anon_vma_cachep;
static inline void validate_anon_vma(struct vm_area_struct *find_vma)
{
-#ifdef RMAP_DEBUG
+#ifdef CONFIG_DEBUG_VM
struct anon_vma *anon_vma = find_vma->anon_vma;
struct vm_area_struct *vma;
unsigned int mapcount = 0;
@@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma)
anon_vma_free(anon_vma);
}
-static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
+ unsigned long flags)
{
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
SLAB_CTOR_CONSTRUCTOR) {
@@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page)
void page_remove_rmap(struct page *page)
{
if (atomic_add_negative(-1, &page->_mapcount)) {
- if (page_mapcount(page) < 0) {
+#ifdef CONFIG_DEBUG_VM
+ if (unlikely(page_mapcount(page) < 0)) {
printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
printk (KERN_EMERG " page->flags = %lx\n", page->flags);
printk (KERN_EMERG " page->count = %x\n", page_count(page));
printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
}
-
+#endif
BUG_ON(page_mapcount(page) < 0);
/*
* It would be tidy to reset the PageAnon mapping here,
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c455fbaff7..37eaf42ed2c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -875,7 +875,7 @@ redirty:
}
#ifdef CONFIG_NUMA
-static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
{
char *nodelist = strchr(value, ':');
int err = 1;
@@ -2119,7 +2119,7 @@ failed:
return err;
}
-static kmem_cache_t *shmem_inode_cachep;
+static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
@@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode)
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache *cachep,
+ unsigned long flags)
{
struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
diff --git a/mm/slab.c b/mm/slab.c
index d0bd7f07ab0..4cbf8bb1355 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -50,7 +50,7 @@
* The head array is strictly LIFO and should improve the cache hit rates.
* On SMP, it additionally reduces the spinlock operations.
*
- * The c_cpuarray may not be read with enabled local interrupts -
+ * The c_cpuarray may not be read with enabled local interrupts -
* it's changed with a smp_call_function().
*
* SMP synchronization:
@@ -94,6 +94,7 @@
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compiler.h>
+#include <linux/cpuset.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
@@ -170,15 +171,15 @@
#if DEBUG
# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_HWCACHE_ALIGN | \
- SLAB_NO_REAP | SLAB_CACHE_DMA | \
+ SLAB_CACHE_DMA | \
SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU)
+ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
#else
-# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
+# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU)
+ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
#endif
/*
@@ -203,7 +204,8 @@
typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
-#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2)
+#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
+#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
/* Max number of objs-per-slab for caches which use off-slab slabs.
* Needed to avoid a possible looping condition in cache_grow().
@@ -266,16 +268,17 @@ struct array_cache {
unsigned int batchcount;
unsigned int touched;
spinlock_t lock;
- void *entry[0]; /*
- * Must have this definition in here for the proper
- * alignment of array_cache. Also simplifies accessing
- * the entries.
- * [0] is for gcc 2.95. It should really be [].
- */
+ void *entry[0]; /*
+ * Must have this definition in here for the proper
+ * alignment of array_cache. Also simplifies accessing
+ * the entries.
+ * [0] is for gcc 2.95. It should really be [].
+ */
};
-/* bootstrap: The caches do not work without cpuarrays anymore,
- * but the cpuarrays are allocated from the generic caches...
+/*
+ * bootstrap: The caches do not work without cpuarrays anymore, but the
+ * cpuarrays are allocated from the generic caches...
*/
#define BOOT_CPUCACHE_ENTRIES 1
struct arraycache_init {
@@ -291,13 +294,13 @@ struct kmem_list3 {
struct list_head slabs_full;
struct list_head slabs_free;
unsigned long free_objects;
- unsigned long next_reap;
- int free_touched;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */
spinlock_t list_lock;
struct array_cache *shared; /* shared per node */
struct array_cache **alien; /* on other nodes */
+ unsigned long next_reap; /* updated without locking */
+ int free_touched; /* updated without locking */
};
/*
@@ -310,10 +313,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define SIZE_L3 (1 + MAX_NUMNODES)
/*
- * This function must be completely optimized away if
- * a constant is passed to it. Mostly the same as
- * what is in linux/slab.h except it returns an
- * index.
+ * This function must be completely optimized away if a constant is passed to
+ * it. Mostly the same as what is in linux/slab.h except it returns an index.
*/
static __always_inline int index_of(const size_t size)
{
@@ -351,14 +352,14 @@ static void kmem_list3_init(struct kmem_list3 *parent)
parent->free_touched = 0;
}
-#define MAKE_LIST(cachep, listp, slab, nodeid) \
- do { \
- INIT_LIST_HEAD(listp); \
- list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
+#define MAKE_LIST(cachep, listp, slab, nodeid) \
+ do { \
+ INIT_LIST_HEAD(listp); \
+ list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
} while (0)
-#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
- do { \
+#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
+ do { \
MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
@@ -373,28 +374,30 @@ static void kmem_list3_init(struct kmem_list3 *parent)
struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
struct array_cache *array[NR_CPUS];
+/* 2) Cache tunables. Protected by cache_chain_mutex */
unsigned int batchcount;
unsigned int limit;
unsigned int shared;
+
unsigned int buffer_size;
-/* 2) touched by every alloc & free from the backend */
+/* 3) touched by every alloc & free from the backend */
struct kmem_list3 *nodelists[MAX_NUMNODES];
- unsigned int flags; /* constant flags */
- unsigned int num; /* # of objs per slab */
- spinlock_t spinlock;
-/* 3) cache_grow/shrink */
+ unsigned int flags; /* constant flags */
+ unsigned int num; /* # of objs per slab */
+
+/* 4) cache_grow/shrink */
/* order of pgs per slab (2^n) */
unsigned int gfporder;
/* force GFP flags, e.g. GFP_DMA */
gfp_t gfpflags;
- size_t colour; /* cache colouring range */
+ size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
struct kmem_cache *slabp_cache;
unsigned int slab_size;
- unsigned int dflags; /* dynamic flags */
+ unsigned int dflags; /* dynamic flags */
/* constructor func */
void (*ctor) (void *, struct kmem_cache *, unsigned long);
@@ -402,11 +405,11 @@ struct kmem_cache {
/* de-constructor func */
void (*dtor) (void *, struct kmem_cache *, unsigned long);
-/* 4) cache creation/removal */
+/* 5) cache creation/removal */
const char *name;
struct list_head next;
-/* 5) statistics */
+/* 6) statistics */
#if STATS
unsigned long num_active;
unsigned long num_allocations;
@@ -438,8 +441,9 @@ struct kmem_cache {
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
#define BATCHREFILL_LIMIT 16
-/* Optimization question: fewer reaps means less
- * probability for unnessary cpucache drain/refill cycles.
+/*
+ * Optimization question: fewer reaps means less probability for unnecessary
+ * cpucache drain/refill cycles.
*
* OTOH the cpuarrays can contain lots of objects,
* which could lock up otherwise freeable slabs.
@@ -453,17 +457,19 @@ struct kmem_cache {
#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
#define STATS_INC_GROWN(x) ((x)->grown++)
#define STATS_INC_REAPED(x) ((x)->reaped++)
-#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \
- (x)->high_mark = (x)->num_active; \
- } while (0)
+#define STATS_SET_HIGH(x) \
+ do { \
+ if ((x)->num_active > (x)->high_mark) \
+ (x)->high_mark = (x)->num_active; \
+ } while (0)
#define STATS_INC_ERR(x) ((x)->errors++)
#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
-#define STATS_SET_FREEABLE(x, i) \
- do { if ((x)->max_freeable < i) \
- (x)->max_freeable = i; \
- } while (0)
-
+#define STATS_SET_FREEABLE(x, i) \
+ do { \
+ if ((x)->max_freeable < i) \
+ (x)->max_freeable = i; \
+ } while (0)
#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
@@ -478,9 +484,7 @@ struct kmem_cache {
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
#define STATS_INC_NODEFREES(x) do { } while (0)
-#define STATS_SET_FREEABLE(x, i) \
- do { } while (0)
-
+#define STATS_SET_FREEABLE(x, i) do { } while (0)
#define STATS_INC_ALLOCHIT(x) do { } while (0)
#define STATS_INC_ALLOCMISS(x) do { } while (0)
#define STATS_INC_FREEHIT(x) do { } while (0)
@@ -488,7 +492,8 @@ struct kmem_cache {
#endif
#if DEBUG
-/* Magic nums for obj red zoning.
+/*
+ * Magic nums for obj red zoning.
* Placed in the first word before and the first word after an obj.
*/
#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */
@@ -499,7 +504,8 @@ struct kmem_cache {
#define POISON_FREE 0x6b /* for use-after-free poisoning */
#define POISON_END 0xa5 /* end-byte of poisoning */
-/* memory layout of objects:
+/*
+ * memory layout of objects:
* 0 : objp
* 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
* the end of an object is aligned with the end of the real
@@ -508,7 +514,8 @@ struct kmem_cache {
* redzone word.
* cachep->obj_offset: The real object.
* cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
- * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
+ * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
+ * [BYTES_PER_WORD long]
*/
static int obj_offset(struct kmem_cache *cachep)
{
@@ -552,8 +559,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
/*
- * Maximum size of an obj (in 2^order pages)
- * and absolute limit for the gfp order.
+ * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
+ * order.
*/
#if defined(CONFIG_LARGE_ALLOCS)
#define MAX_OBJ_ORDER 13 /* up to 32Mb */
@@ -573,9 +580,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#define BREAK_GFP_ORDER_LO 0
static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
-/* Functions for storing/retrieving the cachep and or slab from the
- * global 'mem_map'. These are used to find the slab an obj belongs to.
- * With kfree(), these are used to find the cache which an obj belongs to.
+/*
+ * Functions for storing/retrieving the cachep and or slab from the page
+ * allocator. These are used to find the slab an obj belongs to. With kfree(),
+ * these are used to find the cache which an obj belongs to.
*/
static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
{
@@ -584,6 +592,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
static inline struct kmem_cache *page_get_cache(struct page *page)
{
+ if (unlikely(PageCompound(page)))
+ page = (struct page *)page_private(page);
return (struct kmem_cache *)page->lru.next;
}
@@ -594,6 +604,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab)
static inline struct slab *page_get_slab(struct page *page)
{
+ if (unlikely(PageCompound(page)))
+ page = (struct page *)page_private(page);
return (struct slab *)page->lru.prev;
}
@@ -609,7 +621,21 @@ static inline struct slab *virt_to_slab(const void *obj)
return page_get_slab(page);
}
-/* These are the default caches for kmalloc. Custom caches can have other sizes. */
+static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
+ unsigned int idx)
+{
+ return slab->s_mem + cache->buffer_size * idx;
+}
+
+static inline unsigned int obj_to_index(struct kmem_cache *cache,
+ struct slab *slab, void *obj)
+{
+ return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
+}
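
These two helpers just encode the slab's flat object layout: objects sit back to back from s_mem, so index to address is a multiply and address to index is a divide. A user-space mock-up with simplified cache and slab structs:

#include <stdio.h>

struct slab  { char *s_mem; };
struct cache { unsigned int buffer_size; };

static void *index_to_obj(struct cache *cache, struct slab *slab, unsigned int idx)
{
        return slab->s_mem + (size_t)cache->buffer_size * idx;
}

static unsigned int obj_to_index(struct cache *cache, struct slab *slab, void *obj)
{
        return (unsigned int)(((char *)obj - slab->s_mem) / cache->buffer_size);
}

int main(void)
{
        static char storage[8 * 64];                 /* 8 objects of 64 bytes each */
        struct cache cache = { .buffer_size = 64 };
        struct slab slab = { .s_mem = storage };

        void *obj = index_to_obj(&cache, &slab, 5);
        printf("object 5 at offset %ld, index back = %u\n",
               (long)((char *)obj - storage), obj_to_index(&cache, &slab, obj));
        return 0;
}
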
+
+/*
+ * These are the default caches for kmalloc. Custom caches can have other sizes.
+ */
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
@@ -642,8 +668,6 @@ static struct kmem_cache cache_cache = {
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
.buffer_size = sizeof(struct kmem_cache),
- .flags = SLAB_NO_REAP,
- .spinlock = SPIN_LOCK_UNLOCKED,
.name = "kmem_cache",
#if DEBUG
.obj_size = sizeof(struct kmem_cache),
@@ -655,8 +679,8 @@ static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
/*
- * vm_enough_memory() looks at this to determine how many
- * slab-allocated pages are possibly freeable under pressure
+ * vm_enough_memory() looks at this to determine how many slab-allocated pages
+ * are possibly freeable under pressure
*
* SLAB_RECLAIM_ACCOUNT turns this on per-slab
*/
@@ -675,7 +699,8 @@ static enum {
static DEFINE_PER_CPU(struct work_struct, reap_work);
-static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node);
+static void free_block(struct kmem_cache *cachep, void **objpp, int len,
+ int node);
static void enable_cpucache(struct kmem_cache *cachep);
static void cache_reap(void *unused);
static int __node_shrink(struct kmem_cache *cachep, int node);
@@ -685,7 +710,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
return cachep->array[smp_processor_id()];
}
-static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags)
+static inline struct kmem_cache *__find_general_cachep(size_t size,
+ gfp_t gfpflags)
{
struct cache_sizes *csizep = malloc_sizes;
@@ -720,8 +746,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align)
return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
}
-/* Calculate the number of objects and left-over bytes for a given
- buffer size. */
+/*
+ * Calculate the number of objects and left-over bytes for a given buffer size.
+ */
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
size_t align, int flags, size_t *left_over,
unsigned int *num)
@@ -782,7 +809,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
-static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg)
+static void __slab_error(const char *function, struct kmem_cache *cachep,
+ char *msg)
{
printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
function, cachep->name, msg);
@@ -804,7 +832,7 @@ static void init_reap_node(int cpu)
node = next_node(cpu_to_node(cpu), node_online_map);
if (node == MAX_NUMNODES)
- node = 0;
+ node = first_node(node_online_map);
__get_cpu_var(reap_node) = node;
}
@@ -870,8 +898,33 @@ static struct array_cache *alloc_arraycache(int node, int entries,
return nc;
}
+/*
+ * Transfer objects in one arraycache to another.
+ * Locking must be handled by the caller.
+ *
+ * Return the number of entries transferred.
+ */
+static int transfer_objects(struct array_cache *to,
+ struct array_cache *from, unsigned int max)
+{
+ /* Figure out how many entries to transfer */
+ int nr = min(min(from->avail, max), to->limit - to->avail);
+
+ if (!nr)
+ return 0;
+
+ memcpy(to->entry + to->avail, from->entry + from->avail -nr,
+ sizeof(void *) *nr);
+
+ from->avail -= nr;
+ to->avail += nr;
+ to->touched = 1;
+ return nr;
+}
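
transfer_objects() bulk-moves pointers between two array caches with one memcpy, bounded by what the source holds, the caller's max, and the free room in the destination; the __drain_alien_cache() change further down uses it to stuff objects into the remote node's shared array before falling back to free_block(). A simplified user-space version of the same bookkeeping:

#include <stdio.h>
#include <string.h>

struct array_cache {
        unsigned int avail;
        unsigned int limit;
        void *entry[16];
};

static unsigned int min3(unsigned int a, unsigned int b, unsigned int c)
{
        unsigned int m = a < b ? a : b;

        return m < c ? m : c;
}

static int transfer_objects(struct array_cache *to, struct array_cache *from,
                            unsigned int max)
{
        /* limited by what the source has, the caller's max, and free room */
        unsigned int nr = min3(from->avail, max, to->limit - to->avail);

        if (!nr)
                return 0;
        memcpy(to->entry + to->avail, from->entry + from->avail - nr,
               sizeof(void *) * nr);
        from->avail -= nr;
        to->avail += nr;
        return nr;
}

int main(void)
{
        struct array_cache from = { .avail = 10, .limit = 16 };
        struct array_cache to   = { .avail = 12, .limit = 16 };
        int objects[16];

        for (unsigned int i = 0; i < from.avail; i++)
                from.entry[i] = &objects[i];

        int moved = transfer_objects(&to, &from, 8);
        printf("moved %d objects (%u left in source, %u held in destination)\n",
               moved, from.avail, to.avail);
        return 0;
}
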
+
#ifdef CONFIG_NUMA
static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
static struct array_cache **alloc_alien_cache(int node, int limit)
{
@@ -906,10 +959,8 @@ static void free_alien_cache(struct array_cache **ac_ptr)
if (!ac_ptr)
return;
-
for_each_node(i)
kfree(ac_ptr[i]);
-
kfree(ac_ptr);
}
@@ -920,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
if (ac->avail) {
spin_lock(&rl3->list_lock);
+ /*
+ * Stuff objects into the remote nodes shared array first.
+ * That way we could avoid the overhead of putting the objects
+ * into the free lists and getting them back later.
+ */
+ transfer_objects(rl3->shared, ac, ac->limit);
+
free_block(cachep, ac->entry, ac->avail, node);
ac->avail = 0;
spin_unlock(&rl3->list_lock);
@@ -935,15 +993,16 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
if (l3->alien) {
struct array_cache *ac = l3->alien[node];
- if (ac && ac->avail) {
- spin_lock_irq(&ac->lock);
+
+ if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
__drain_alien_cache(cachep, ac, node);
spin_unlock_irq(&ac->lock);
}
}
}
-static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
+static void drain_alien_cache(struct kmem_cache *cachep,
+ struct array_cache **alien)
{
int i = 0;
struct array_cache *ac;
@@ -986,20 +1045,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
mutex_lock(&cache_chain_mutex);
- /* we need to do this right in the beginning since
+ /*
+ * We need to do this right in the beginning since
* alloc_arraycache's are going to use this list.
* kmalloc_node allows us to add the slab to the right
* kmem_list3 and not this cpu's kmem_list3
*/
list_for_each_entry(cachep, &cache_chain, next) {
- /* setup the size64 kmemlist for cpu before we can
+ /*
+ * Set up the size64 kmemlist for cpu before we can
* begin anything. Make sure some other cpu on this
* node has not already allocated this
*/
if (!cachep->nodelists[node]) {
- if (!(l3 = kmalloc_node(memsize,
- GFP_KERNEL, node)))
+ l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+ if (!l3)
goto bad;
kmem_list3_init(l3);
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
@@ -1015,13 +1076,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
spin_lock_irq(&cachep->nodelists[node]->list_lock);
cachep->nodelists[node]->free_limit =
- (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
spin_unlock_irq(&cachep->nodelists[node]->list_lock);
}
- /* Now we can go ahead with allocating the shared array's
- & array cache's */
+ /*
+ * Now we can go ahead with allocating the shared arrays and
+ * array caches
+ */
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
struct array_cache *shared;
@@ -1041,7 +1104,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
if (!alien)
goto bad;
cachep->array[cpu] = nc;
-
l3 = cachep->nodelists[node];
BUG_ON(!l3);
@@ -1061,7 +1123,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
}
#endif
spin_unlock_irq(&l3->list_lock);
-
kfree(shared);
free_alien_cache(alien);
}
@@ -1083,7 +1144,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
/* fall thru */
case CPU_UP_CANCELED:
mutex_lock(&cache_chain_mutex);
-
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
struct array_cache *shared;
@@ -1150,7 +1210,7 @@ free_array_cache:
#endif
}
return NOTIFY_OK;
- bad:
+bad:
mutex_unlock(&cache_chain_mutex);
return NOTIFY_BAD;
}
@@ -1160,7 +1220,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
/*
* swap the static kmem_list3 with kmalloced memory
*/
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid)
+static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+ int nodeid)
{
struct kmem_list3 *ptr;
@@ -1175,8 +1236,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no
local_irq_enable();
}
-/* Initialisation.
- * Called after the gfp() functions have been enabled, and before smp_init().
+/*
+ * Initialisation. Called after the page allocator have been initialised and
+ * before smp_init().
*/
void __init kmem_cache_init(void)
{
@@ -1201,9 +1263,9 @@ void __init kmem_cache_init(void)
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
- * 1) initialize the cache_cache cache: it contains the struct kmem_cache
- * structures of all caches, except cache_cache itself: cache_cache
- * is statically allocated.
+ * 1) initialize the cache_cache cache: it contains the struct
+ * kmem_cache structures of all caches, except cache_cache itself:
+ * cache_cache is statically allocated.
* Initially an __init data area is used for the head array and the
* kmem_list3 structures, it's replaced with a kmalloc allocated
* array at the end of the bootstrap.
@@ -1226,7 +1288,8 @@ void __init kmem_cache_init(void)
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
- cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
+ cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
+ cache_line_size());
for (order = 0; order < MAX_ORDER; order++) {
cache_estimate(order, cache_cache.buffer_size,
@@ -1245,24 +1308,26 @@ void __init kmem_cache_init(void)
sizes = malloc_sizes;
names = cache_names;
- /* Initialize the caches that provide memory for the array cache
- * and the kmem_list3 structures first.
- * Without this, further allocations will bug
+ /*
+ * Initialize the caches that provide memory for the array cache and the
+ * kmem_list3 structures first. Without this, further allocations will
+ * bug.
*/
sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
- sizes[INDEX_AC].cs_size,
- ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS |
- SLAB_PANIC), NULL, NULL);
+ sizes[INDEX_AC].cs_size,
+ ARCH_KMALLOC_MINALIGN,
+ ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+ NULL, NULL);
- if (INDEX_AC != INDEX_L3)
+ if (INDEX_AC != INDEX_L3) {
sizes[INDEX_L3].cs_cachep =
- kmem_cache_create(names[INDEX_L3].name,
- sizes[INDEX_L3].cs_size,
- ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
- NULL);
+ kmem_cache_create(names[INDEX_L3].name,
+ sizes[INDEX_L3].cs_size,
+ ARCH_KMALLOC_MINALIGN,
+ ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+ NULL, NULL);
+ }
while (sizes->cs_size != ULONG_MAX) {
/*
@@ -1272,13 +1337,13 @@ void __init kmem_cache_init(void)
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches.
*/
- if (!sizes->cs_cachep)
+ if (!sizes->cs_cachep) {
sizes->cs_cachep = kmem_cache_create(names->name,
- sizes->cs_size,
- ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS
- | SLAB_PANIC),
- NULL, NULL);
+ sizes->cs_size,
+ ARCH_KMALLOC_MINALIGN,
+ ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+ NULL, NULL);
+ }
/* Inc off-slab bufctl limit until the ceiling is hit. */
if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -1287,13 +1352,11 @@ void __init kmem_cache_init(void)
}
sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
- sizes->cs_size,
- ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS |
- SLAB_CACHE_DMA |
- SLAB_PANIC), NULL,
- NULL);
-
+ sizes->cs_size,
+ ARCH_KMALLOC_MINALIGN,
+ ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
+ SLAB_PANIC,
+ NULL, NULL);
sizes++;
names++;
}
@@ -1345,20 +1408,22 @@ void __init kmem_cache_init(void)
struct kmem_cache *cachep;
mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next)
- enable_cpucache(cachep);
+ enable_cpucache(cachep);
mutex_unlock(&cache_chain_mutex);
}
/* Done! */
g_cpucache_up = FULL;
- /* Register a cpu startup notifier callback
- * that initializes cpu_cache_get for all new cpus
+ /*
+ * Register a cpu startup notifier callback that initializes
+ * cpu_cache_get for all new cpus
*/
register_cpu_notifier(&cpucache_notifier);
- /* The reap timers are started later, with a module init call:
- * That part of the kernel is not yet operational.
+ /*
+ * The reap timers are started later, with a module init call: That part
+ * of the kernel is not yet operational.
*/
}
@@ -1366,16 +1431,13 @@ static int __init cpucache_init(void)
{
int cpu;
- /*
- * Register the timers that return unneeded
- * pages to gfp.
+ /*
+ * Register the timers that return unneeded pages to the page allocator
*/
for_each_online_cpu(cpu)
- start_cpu_timer(cpu);
-
+ start_cpu_timer(cpu);
return 0;
}
-
__initcall(cpucache_init);
/*
@@ -1402,7 +1464,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
atomic_add(i, &slab_reclaim_pages);
add_page_state(nr_slab, i);
while (i--) {
- SetPageSlab(page);
+ __SetPageSlab(page);
page++;
}
return addr;
@@ -1418,8 +1480,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
const unsigned long nr_freed = i;
while (i--) {
- if (!TestClearPageSlab(page))
- BUG();
+ BUG_ON(!PageSlab(page));
+ __ClearPageSlab(page);
page++;
}
sub_page_state(nr_slab, nr_freed);
@@ -1489,9 +1551,8 @@ static void dump_line(char *data, int offset, int limit)
{
int i;
printk(KERN_ERR "%03x:", offset);
- for (i = 0; i < limit; i++) {
+ for (i = 0; i < limit; i++)
printk(" %02x", (unsigned char)data[offset + i]);
- }
printk("\n");
}
#endif
@@ -1505,15 +1566,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
if (cachep->flags & SLAB_RED_ZONE) {
printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
- *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
if (cachep->flags & SLAB_STORE_USER) {
printk(KERN_ERR "Last user: [<%p>]",
- *dbg_userword(cachep, objp));
+ *dbg_userword(cachep, objp));
print_symbol("(%s)",
- (unsigned long)*dbg_userword(cachep, objp));
+ (unsigned long)*dbg_userword(cachep, objp));
printk("\n");
}
realobj = (char *)objp + obj_offset(cachep);
@@ -1546,8 +1607,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Print header */
if (lines == 0) {
printk(KERN_ERR
- "Slab corruption: start=%p, len=%d\n",
- realobj, size);
+ "Slab corruption: start=%p, len=%d\n",
+ realobj, size);
print_objinfo(cachep, objp, 0);
}
/* Hexdump the affected line */
@@ -1568,18 +1629,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
* exist:
*/
struct slab *slabp = virt_to_slab(objp);
- int objnr;
+ unsigned int objnr;
- objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+ objnr = obj_to_index(cachep, slabp, objp);
if (objnr) {
- objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size;
+ objp = index_to_obj(cachep, slabp, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
- objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size;
+ objp = index_to_obj(cachep, slabp, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
printk(KERN_ERR "Next obj: start=%p, len=%d\n",
realobj, size);
@@ -1591,22 +1652,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
#if DEBUG
/**
- * slab_destroy_objs - call the registered destructor for each object in
- * a slab that is to be destroyed.
+ * slab_destroy_objs - destroy a slab and its objects
+ * @cachep: cache pointer being destroyed
+ * @slabp: slab pointer being destroyed
+ *
+ * Call the registered destructor for each object in a slab that is being
+ * destroyed.
*/
static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
{
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = slabp->s_mem + cachep->buffer_size * i;
+ void *objp = index_to_obj(cachep, slabp, i);
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->buffer_size % PAGE_SIZE) == 0
- && OFF_SLAB(cachep))
+ if (cachep->buffer_size % PAGE_SIZE == 0 &&
+ OFF_SLAB(cachep))
kernel_map_pages(virt_to_page(objp),
- cachep->buffer_size / PAGE_SIZE,
- 1);
+ cachep->buffer_size / PAGE_SIZE, 1);
else
check_poison_obj(cachep, objp);
#else
@@ -1631,7 +1695,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
if (cachep->dtor) {
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = slabp->s_mem + cachep->buffer_size * i;
+ void *objp = index_to_obj(cachep, slabp, i);
(cachep->dtor) (objp, cachep, 0);
}
}
@@ -1639,9 +1703,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
#endif
/**
+ * slab_destroy - destroy and release all objects in a slab
+ * @cachep: cache pointer being destroyed
+ * @slabp: slab pointer being destroyed
+ *
* Destroy all the objs in a slab, and release the mem back to the system.
- * Before calling the slab must have been unlinked from the cache.
- * The cache-lock is not held/needed.
+ * Before calling the slab must have been unlinked from the cache. The
+ * cache-lock is not held/needed.
*/
static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
{
@@ -1662,8 +1730,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
}
}
-/* For setting up all the kmem_list3s for cache whose buffer_size is same
- as size of kmem_list3. */
+/*
+ * For setting up all the kmem_list3s for a cache whose buffer_size is the
+ * same as the size of kmem_list3.
+ */
static void set_up_list3s(struct kmem_cache *cachep, int index)
{
int node;
@@ -1689,13 +1759,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
* high order pages for slabs. When the gfp() functions are more friendly
* towards high-order requests, this should be changed.
*/
-static inline size_t calculate_slab_order(struct kmem_cache *cachep,
+static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
size_t left_over = 0;
int gfporder;
- for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
+ for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
unsigned int num;
size_t remainder;
@@ -1730,12 +1800,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
/*
* Acceptable internal fragmentation?
*/
- if ((left_over * 8) <= (PAGE_SIZE << gfporder))
+ if (left_over * 8 <= (PAGE_SIZE << gfporder))
break;
}
return left_over;
}
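
The acceptance test just above caps internal fragmentation at one eighth of the slab: the loop stops at the first page order whose leftover space passes that check. A standalone sketch of the same selection (ignoring the slab-management overhead that the real cache_estimate() accounts for, so the figures are only illustrative):

#include <stdio.h>

/* Pick the smallest page order whose leftover space is at most 1/8 of the
 * slab.  Object size and PAGE_SIZE here are assumed sample values. */
int main(void)
{
    const unsigned long page_size = 4096;    /* assumed PAGE_SIZE */
    const unsigned long obj_size = 1100;     /* hypothetical object size */

    for (int order = 0; order <= 5; order++) {
        unsigned long slab = page_size << order;
        unsigned long num = slab / obj_size;
        unsigned long left_over = slab - num * obj_size;

        printf("order %d: %lu objs, %lu bytes left over%s\n",
               order, num, left_over,
               left_over * 8 <= slab ? "  <-- acceptable" : "");
        if (left_over * 8 <= slab)
            break;
    }
    return 0;
}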
+static void setup_cpu_cache(struct kmem_cache *cachep)
+{
+ if (g_cpucache_up == FULL) {
+ enable_cpucache(cachep);
+ return;
+ }
+ if (g_cpucache_up == NONE) {
+ /*
+ * Note: the first kmem_cache_create must create the cache
+ * that's used by kmalloc(24), otherwise the creation of
+ * further caches will BUG().
+ */
+ cachep->array[smp_processor_id()] = &initarray_generic.cache;
+
+ /*
+ * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
+ * the first cache, then we need to set up all its list3s,
+ * otherwise the creation of further caches will BUG().
+ */
+ set_up_list3s(cachep, SIZE_AC);
+ if (INDEX_AC == INDEX_L3)
+ g_cpucache_up = PARTIAL_L3;
+ else
+ g_cpucache_up = PARTIAL_AC;
+ } else {
+ cachep->array[smp_processor_id()] =
+ kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
+ if (g_cpucache_up == PARTIAL_AC) {
+ set_up_list3s(cachep, SIZE_L3);
+ g_cpucache_up = PARTIAL_L3;
+ } else {
+ int node;
+ for_each_online_node(node) {
+ cachep->nodelists[node] =
+ kmalloc_node(sizeof(struct kmem_list3),
+ GFP_KERNEL, node);
+ BUG_ON(!cachep->nodelists[node]);
+ kmem_list3_init(cachep->nodelists[node]);
+ }
+ }
+ }
+ cachep->nodelists[numa_node_id()]->next_reap =
+ jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+ cpu_cache_get(cachep)->avail = 0;
+ cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
+ cpu_cache_get(cachep)->batchcount = 1;
+ cpu_cache_get(cachep)->touched = 0;
+ cachep->batchcount = 1;
+ cachep->limit = BOOT_CPUCACHE_ENTRIES;
+}
+
/**
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1751,9 +1875,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
* and the @dtor is run before the pages are handed back.
*
* @name must be valid until the cache is destroyed. This implies that
- * the module calling this has to destroy the cache before getting
- * unloaded.
- *
+ * the module calling this has to destroy the cache before getting unloaded.
+ *
* The flags are
*
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -1762,16 +1885,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
* for buffer overruns.
*
- * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
- * memory pressure.
- *
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long),
+ unsigned long flags,
+ void (*ctor)(void*, struct kmem_cache *, unsigned long),
void (*dtor)(void*, struct kmem_cache *, unsigned long))
{
size_t left_over, slab_size, ralign;
@@ -1781,12 +1902,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
/*
* Sanity checks... these are all serious usage bugs.
*/
- if ((!name) ||
- in_interrupt() ||
- (size < BYTES_PER_WORD) ||
+ if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
(size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
- printk(KERN_ERR "%s: Early error in slab %s\n",
- __FUNCTION__, name);
+ printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
+ name);
BUG();
}
@@ -1840,8 +1959,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* above the next power of two: caches with object sizes just above a
* power of two have a significant amount of internal fragmentation.
*/
- if ((size < 4096
- || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
+ if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
@@ -1853,13 +1971,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
BUG_ON(dtor);
/*
- * Always checks flags, a caller might be expecting debug
- * support which isn't available.
+ * Always checks flags, a caller might be expecting debug support which
+ * isn't available.
*/
if (flags & ~CREATE_MASK)
BUG();
- /* Check that size is in terms of words. This is needed to avoid
+ /*
+ * Check that size is in terms of words. This is needed to avoid
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*/
@@ -1868,12 +1987,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
size &= ~(BYTES_PER_WORD - 1);
}
- /* calculate out the final buffer alignment: */
+ /* calculate the final buffer alignment: */
+
/* 1) arch recommendation: can be overridden for debug */
if (flags & SLAB_HWCACHE_ALIGN) {
- /* Default alignment: as specified by the arch code.
- * Except if an object is really small, then squeeze multiple
- * objects into one cacheline.
+ /*
+ * Default alignment: as specified by the arch code. Except if
+ * an object is really small, then squeeze multiple objects into
+ * one cacheline.
*/
ralign = cache_line_size();
while (size <= ralign / 2)
@@ -1893,16 +2014,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (ralign > BYTES_PER_WORD)
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
}
- /* 4) Store it. Note that the debug code below can reduce
+ /*
+ * 4) Store it. Note that the debug code below can reduce
* the alignment to BYTES_PER_WORD.
*/
align = ralign;
/* Get cache's description obj. */
- cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
+ cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
if (!cachep)
goto oops;
- memset(cachep, 0, sizeof(struct kmem_cache));
#if DEBUG
cachep->obj_size = size;
@@ -1978,7 +2099,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep->gfpflags = 0;
if (flags & SLAB_CACHE_DMA)
cachep->gfpflags |= GFP_DMA;
- spin_lock_init(&cachep->spinlock);
cachep->buffer_size = size;
if (flags & CFLGS_OFF_SLAB)
@@ -1988,64 +2108,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep->name = name;
- if (g_cpucache_up == FULL) {
- enable_cpucache(cachep);
- } else {
- if (g_cpucache_up == NONE) {
- /* Note: the first kmem_cache_create must create
- * the cache that's used by kmalloc(24), otherwise
- * the creation of further caches will BUG().
- */
- cachep->array[smp_processor_id()] =
- &initarray_generic.cache;
-
- /* If the cache that's used by
- * kmalloc(sizeof(kmem_list3)) is the first cache,
- * then we need to set up all its list3s, otherwise
- * the creation of further caches will BUG().
- */
- set_up_list3s(cachep, SIZE_AC);
- if (INDEX_AC == INDEX_L3)
- g_cpucache_up = PARTIAL_L3;
- else
- g_cpucache_up = PARTIAL_AC;
- } else {
- cachep->array[smp_processor_id()] =
- kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
-
- if (g_cpucache_up == PARTIAL_AC) {
- set_up_list3s(cachep, SIZE_L3);
- g_cpucache_up = PARTIAL_L3;
- } else {
- int node;
- for_each_online_node(node) {
-
- cachep->nodelists[node] =
- kmalloc_node(sizeof
- (struct kmem_list3),
- GFP_KERNEL, node);
- BUG_ON(!cachep->nodelists[node]);
- kmem_list3_init(cachep->
- nodelists[node]);
- }
- }
- }
- cachep->nodelists[numa_node_id()]->next_reap =
- jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
- BUG_ON(!cpu_cache_get(cachep));
- cpu_cache_get(cachep)->avail = 0;
- cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
- cpu_cache_get(cachep)->batchcount = 1;
- cpu_cache_get(cachep)->touched = 0;
- cachep->batchcount = 1;
- cachep->limit = BOOT_CPUCACHE_ENTRIES;
- }
+ setup_cpu_cache(cachep);
/* cache setup completed, link it into the list */
list_add(&cachep->next, &cache_chain);
- oops:
+oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
@@ -2089,30 +2156,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif
-/*
- * Waits for all CPUs to execute func().
- */
-static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
-{
- check_irq_on();
- preempt_disable();
-
- local_irq_disable();
- func(arg);
- local_irq_enable();
-
- if (smp_call_function(func, arg, 1, 1))
- BUG();
-
- preempt_enable();
-}
-
-static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
- int force, int node);
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+ struct array_cache *ac,
+ int force, int node);
static void do_drain(void *arg)
{
- struct kmem_cache *cachep = (struct kmem_cache *) arg;
+ struct kmem_cache *cachep = arg;
struct array_cache *ac;
int node = numa_node_id();
@@ -2129,14 +2179,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
struct kmem_list3 *l3;
int node;
- smp_call_function_all_cpus(do_drain, cachep);
+ on_each_cpu(do_drain, cachep, 1, 1);
check_irq_on();
for_each_online_node(node) {
l3 = cachep->nodelists[node];
if (l3) {
- spin_lock_irq(&l3->list_lock);
- drain_array_locked(cachep, l3->shared, 1, node);
- spin_unlock_irq(&l3->list_lock);
+ drain_array(cachep, l3, l3->shared, 1, node);
if (l3->alien)
drain_alien_cache(cachep, l3->alien);
}
@@ -2260,16 +2308,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
/* NUMA: free the list3 structures */
for_each_online_node(i) {
- if ((l3 = cachep->nodelists[i])) {
+ l3 = cachep->nodelists[i];
+ if (l3) {
kfree(l3->shared);
free_alien_cache(l3->alien);
kfree(l3);
}
}
kmem_cache_free(&cache_cache, cachep);
-
unlock_cpu_hotplug();
-
return 0;
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2292,7 +2339,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
slabp->inuse = 0;
slabp->colouroff = colour_off;
slabp->s_mem = objp + colour_off;
-
return slabp;
}
@@ -2307,7 +2353,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = slabp->s_mem + cachep->buffer_size * i;
+ void *objp = index_to_obj(cachep, slabp, i);
#if DEBUG
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON)
@@ -2320,9 +2366,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
/*
- * Constructors are not allowed to allocate memory from
- * the same cache which they are a constructor for.
- * Otherwise, deadlock. They must also be threaded.
+ * Constructors are not allowed to allocate memory from the same
+ * cache which they are a constructor for. Otherwise, deadlock.
+ * They must also be threaded.
*/
if (cachep->ctor && !(cachep->flags & SLAB_POISON))
cachep->ctor(objp + obj_offset(cachep), cachep,
@@ -2336,8 +2382,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
slab_error(cachep, "constructor overwrote the"
" start of an object");
}
- if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
- && cachep->flags & SLAB_POISON)
+ if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
+ OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
kernel_map_pages(virt_to_page(objp),
cachep->buffer_size / PAGE_SIZE, 0);
#else
@@ -2352,18 +2398,16 @@ static void cache_init_objs(struct kmem_cache *cachep,
static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
{
- if (flags & SLAB_DMA) {
- if (!(cachep->gfpflags & GFP_DMA))
- BUG();
- } else {
- if (cachep->gfpflags & GFP_DMA)
- BUG();
- }
+ if (flags & SLAB_DMA)
+ BUG_ON(!(cachep->gfpflags & GFP_DMA));
+ else
+ BUG_ON(cachep->gfpflags & GFP_DMA);
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
+static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
+ int nodeid)
{
- void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size);
+ void *objp = index_to_obj(cachep, slabp, slabp->free);
kmem_bufctl_t next;
slabp->inuse++;
@@ -2377,18 +2421,18 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod
return objp;
}
-static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp,
- int nodeid)
+static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
+ void *objp, int nodeid)
{
- unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size;
+ unsigned int objnr = obj_to_index(cachep, slabp, objp);
#if DEBUG
/* Verify that the slab belongs to the intended node */
WARN_ON(slabp->nodeid != nodeid);
- if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
+ if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
printk(KERN_ERR "slab: double free detected in cache "
- "'%s', objp %p\n", cachep->name, objp);
+ "'%s', objp %p\n", cachep->name, objp);
BUG();
}
#endif
@@ -2397,14 +2441,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
slabp->inuse--;
}
-static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp)
+static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
+ void *objp)
{
int i;
struct page *page;
/* Nasty!!!!!! I hope this is OK. */
- i = 1 << cachep->gfporder;
page = virt_to_page(objp);
+
+ i = 1;
+ if (likely(!PageCompound(page)))
+ i <<= cachep->gfporder;
do {
page_set_cache(page, cachep);
page_set_slab(page, slabp);
@@ -2425,8 +2473,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
unsigned long ctor_flags;
struct kmem_list3 *l3;
- /* Be lazy and only check for valid flags here,
- * keeping it out of the critical path in kmem_cache_alloc().
+ /*
+ * Be lazy and only check for valid flags here, keeping it out of the
+ * critical path in kmem_cache_alloc().
*/
if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
BUG();
@@ -2467,14 +2516,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
*/
kmem_flagcheck(cachep, flags);
- /* Get mem for the objs.
- * Attempt to allocate a physical page from 'nodeid',
+ /*
+ * Get mem for the objs. Attempt to allocate a physical page from
+ * 'nodeid'.
*/
- if (!(objp = kmem_getpages(cachep, flags, nodeid)))
+ objp = kmem_getpages(cachep, flags, nodeid);
+ if (!objp)
goto failed;
/* Get slab management. */
- if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
+ slabp = alloc_slabmgmt(cachep, objp, offset, local_flags);
+ if (!slabp)
goto opps1;
slabp->nodeid = nodeid;
@@ -2493,9 +2545,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
l3->free_objects += cachep->num;
spin_unlock(&l3->list_lock);
return 1;
- opps1:
+opps1:
kmem_freepages(cachep, objp);
- failed:
+failed:
if (local_flags & __GFP_WAIT)
local_irq_disable();
return 0;
@@ -2538,8 +2590,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
page = virt_to_page(objp);
if (page_get_cache(page) != cachep) {
- printk(KERN_ERR
- "mismatch in kmem_cache_free: expected cache %p, got %p\n",
+ printk(KERN_ERR "mismatch in kmem_cache_free: expected "
+ "cache %p, got %p\n",
page_get_cache(page), cachep);
printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
@@ -2549,13 +2601,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
slabp = page_get_slab(page);
if (cachep->flags & SLAB_RED_ZONE) {
- if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
- || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
- slab_error(cachep,
- "double free, or memory outside"
- " object was overwritten");
- printk(KERN_ERR
- "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
+ if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
+ *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
+ slab_error(cachep, "double free, or memory outside"
+ " object was overwritten");
+ printk(KERN_ERR "%p: redzone 1:0x%lx, "
+ "redzone 2:0x%lx.\n",
objp, *dbg_redzone1(cachep, objp),
*dbg_redzone2(cachep, objp));
}
@@ -2565,15 +2616,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = caller;
- objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+ objnr = obj_to_index(cachep, slabp, objp);
BUG_ON(objnr >= cachep->num);
- BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size);
+ BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
if (cachep->flags & SLAB_DEBUG_INITIAL) {
- /* Need to call the slab's constructor so the
- * caller can perform a verify of its state (debugging).
- * Called without the cache-lock held.
+ /*
+ * Need to call the slab's constructor so the caller can
+ * perform a verify of its state (debugging). Called without
+ * the cache-lock held.
*/
cachep->ctor(objp + obj_offset(cachep),
cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
@@ -2584,9 +2636,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
*/
cachep->dtor(objp + obj_offset(cachep), cachep, 0);
}
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
+#endif
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
+ if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
store_stackinfo(cachep, objp, (unsigned long)caller);
kernel_map_pages(virt_to_page(objp),
cachep->buffer_size / PAGE_SIZE, 0);
@@ -2612,14 +2667,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
goto bad;
}
if (entries != cachep->num - slabp->inuse) {
- bad:
- printk(KERN_ERR
- "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
- cachep->name, cachep->num, slabp, slabp->inuse);
+bad:
+ printk(KERN_ERR "slab: Internal list corruption detected in "
+ "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
+ cachep->name, cachep->num, slabp, slabp->inuse);
for (i = 0;
i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
i++) {
- if ((i % 16) == 0)
+ if (i % 16 == 0)
printk("\n%03x:", i);
printk(" %02x", ((unsigned char *)slabp)[i]);
}
@@ -2641,12 +2696,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
check_irq_off();
ac = cpu_cache_get(cachep);
- retry:
+retry:
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
- /* if there was little recent activity on this
- * cache, then perform only a partial refill.
- * Otherwise we could generate refill bouncing.
+ /*
+ * If there was little recent activity on this cache, then
+ * perform only a partial refill. Otherwise we could generate
+ * refill bouncing.
*/
batchcount = BATCHREFILL_LIMIT;
}
@@ -2655,20 +2711,10 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
BUG_ON(ac->avail > 0 || !l3);
spin_lock(&l3->list_lock);
- if (l3->shared) {
- struct array_cache *shared_array = l3->shared;
- if (shared_array->avail) {
- if (batchcount > shared_array->avail)
- batchcount = shared_array->avail;
- shared_array->avail -= batchcount;
- ac->avail = batchcount;
- memcpy(ac->entry,
- &(shared_array->entry[shared_array->avail]),
- sizeof(void *) * batchcount);
- shared_array->touched = 1;
- goto alloc_done;
- }
- }
+ /* See if we can refill from the shared array */
+ if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
+ goto alloc_done;
+
while (batchcount > 0) {
struct list_head *entry;
struct slab *slabp;
@@ -2702,29 +2748,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
list_add(&slabp->list, &l3->slabs_partial);
}
- must_grow:
+must_grow:
l3->free_objects -= ac->avail;
- alloc_done:
+alloc_done:
spin_unlock(&l3->list_lock);
if (unlikely(!ac->avail)) {
int x;
x = cache_grow(cachep, flags, numa_node_id());
- // cache_grow can reenable interrupts, then ac could change.
+ /* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
- if (!x && ac->avail == 0) // no objects in sight? abort
+ if (!x && ac->avail == 0) /* no objects in sight? abort */
return NULL;
- if (!ac->avail) // objects refilled by interrupt?
+ if (!ac->avail) /* objects refilled by interrupt? */
goto retry;
}
ac->touched = 1;
return ac->entry[--ac->avail];
}
-static inline void
-cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
+static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
+ gfp_t flags)
{
might_sleep_if(flags & __GFP_WAIT);
#if DEBUG
@@ -2733,8 +2779,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
}
#if DEBUG
-static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags,
- void *objp, void *caller)
+static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
+ gfp_t flags, void *objp, void *caller)
{
if (!objp)
return objp;
@@ -2754,19 +2800,28 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags
*dbg_userword(cachep, objp) = caller;
if (cachep->flags & SLAB_RED_ZONE) {
- if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
- || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
- slab_error(cachep,
- "double free, or memory outside"
- " object was overwritten");
+ if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
+ *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
+ slab_error(cachep, "double free, or memory outside"
+ " object was overwritten");
printk(KERN_ERR
- "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
- objp, *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ "%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
+ objp, *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
*dbg_redzone1(cachep, objp) = RED_ACTIVE;
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ {
+ struct slab *slabp;
+ unsigned objnr;
+
+ slabp = page_get_slab(virt_to_page(objp));
+ objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+ slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
+ }
+#endif
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON) {
unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
@@ -2788,11 +2843,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
struct array_cache *ac;
#ifdef CONFIG_NUMA
- if (unlikely(current->mempolicy && !in_interrupt())) {
- int nid = slab_node(current->mempolicy);
-
- if (nid != numa_node_id())
- return __cache_alloc_node(cachep, flags, nid);
+ if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
+ objp = alternate_node_alloc(cachep, flags);
+ if (objp != NULL)
+ return objp;
}
#endif
@@ -2809,8 +2863,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
return objp;
}
-static __always_inline void *
-__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
+static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
+ gfp_t flags, void *caller)
{
unsigned long save_flags;
void *objp;
@@ -2828,9 +2882,32 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
#ifdef CONFIG_NUMA
/*
+ * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
+ *
+ * If we are in_interrupt, then process context, including cpusets and
+ * mempolicy, may not apply and should not be used for allocation policy.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+ int nid_alloc, nid_here;
+
+ if (in_interrupt())
+ return NULL;
+ nid_alloc = nid_here = numa_node_id();
+ if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+ nid_alloc = cpuset_mem_spread_node();
+ else if (current->mempolicy)
+ nid_alloc = slab_node(current->mempolicy);
+ if (nid_alloc != nid_here)
+ return __cache_alloc_node(cachep, flags, nid_alloc);
+ return NULL;
+}
+
+/*
* An interface to enable slab creation on nodeid
*/
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid)
{
struct list_head *entry;
struct slab *slabp;
@@ -2841,7 +2918,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
l3 = cachep->nodelists[nodeid];
BUG_ON(!l3);
- retry:
+retry:
check_irq_off();
spin_lock(&l3->list_lock);
entry = l3->slabs_partial.next;
@@ -2868,16 +2945,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
/* move slabp to correct slabp list: */
list_del(&slabp->list);
- if (slabp->free == BUFCTL_END) {
+ if (slabp->free == BUFCTL_END)
list_add(&slabp->list, &l3->slabs_full);
- } else {
+ else
list_add(&slabp->list, &l3->slabs_partial);
- }
spin_unlock(&l3->list_lock);
goto done;
- must_grow:
+must_grow:
spin_unlock(&l3->list_lock);
x = cache_grow(cachep, flags, nodeid);
@@ -2885,7 +2961,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
return NULL;
goto retry;
- done:
+done:
return obj;
}
#endif
@@ -2958,7 +3034,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
}
free_block(cachep, ac->entry, batchcount, node);
- free_done:
+free_done:
#if STATS
{
int i = 0;
@@ -2979,16 +3055,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
#endif
spin_unlock(&l3->list_lock);
ac->avail -= batchcount;
- memmove(ac->entry, &(ac->entry[batchcount]),
- sizeof(void *) * ac->avail);
+ memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}
/*
- * __cache_free
- * Release an obj back to its cache. If the obj has a constructed
- * state, it must be in this state _before_ it is released.
- *
- * Called with disabled ints.
+ * Release an obj back to its cache. If the obj has a constructed state, it must
+ * be in this state _before_ it is released. Called with disabled ints.
*/
static inline void __cache_free(struct kmem_cache *cachep, void *objp)
{
@@ -3007,9 +3079,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
if (unlikely(slabp->nodeid != numa_node_id())) {
struct array_cache *alien = NULL;
int nodeid = slabp->nodeid;
- struct kmem_list3 *l3 =
- cachep->nodelists[numa_node_id()];
+ struct kmem_list3 *l3;
+ l3 = cachep->nodelists[numa_node_id()];
STATS_INC_NODEFREES(cachep);
if (l3->alien && l3->alien[nodeid]) {
alien = l3->alien[nodeid];
@@ -3056,6 +3128,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
EXPORT_SYMBOL(kmem_cache_alloc);
/**
+ * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
+ * @cache: The cache to allocate from.
+ * @flags: See kmalloc().
+ *
+ * Allocate an object from this cache and set the allocated memory to zero.
+ * The flags are only relevant if the cache has no available objects.
+ */
+void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
+{
+ void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
+ if (ret)
+ memset(ret, 0, obj_size(cache));
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_zalloc);
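
For readers unfamiliar with the pattern the new helper wraps, a minimal userspace analogue (not the kernel API) of allocate-then-zero looks like this; the object type and names are hypothetical:

#include <stdlib.h>
#include <string.h>

struct conn { int fd; unsigned long flags; };   /* hypothetical object type */

/* Allocate a fixed-size object and clear it before handing it out, so the
 * caller never sees stale contents. */
static void *obj_zalloc(size_t obj_size)
{
    void *ret = malloc(obj_size);
    if (ret)
        memset(ret, 0, obj_size);
    return ret;
}

int main(void)
{
    struct conn *c = obj_zalloc(sizeof(*c));    /* all fields start at zero */
    if (!c)
        return 1;
    free(c);
    return 0;
}

In plain userspace code calloc(1, sizeof(*c)) gives the same guarantee; the kernel helper ties the zeroing to the cache's own object size via obj_size(cache).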
+
+/**
* kmem_ptr_validate - check if an untrusted pointer might
* be a slab entry.
* @cachep: the cache we're checking against
@@ -3093,7 +3182,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
if (unlikely(page_get_cache(page) != cachep))
goto out;
return 1;
- out:
+out:
return 0;
}
@@ -3119,7 +3208,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
local_irq_save(save_flags);
if (nodeid == -1 || nodeid == numa_node_id() ||
- !cachep->nodelists[nodeid])
+ !cachep->nodelists[nodeid])
ptr = ____cache_alloc(cachep, flags);
else
ptr = __cache_alloc_node(cachep, flags, nodeid);
@@ -3148,6 +3237,7 @@ EXPORT_SYMBOL(kmalloc_node);
* kmalloc - allocate memory
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
+ * @caller: return address of the caller, used for debug tracking
*
* kmalloc is the normal method of allocating memory
* in the kernel.
@@ -3181,22 +3271,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
return __cache_alloc(cachep, flags, caller);
}
-#ifndef CONFIG_DEBUG_SLAB
void *__kmalloc(size_t size, gfp_t flags)
{
+#ifndef CONFIG_DEBUG_SLAB
return __do_kmalloc(size, flags, NULL);
+#else
+ return __do_kmalloc(size, flags, __builtin_return_address(0));
+#endif
}
EXPORT_SYMBOL(__kmalloc);
-#else
-
+#ifdef CONFIG_DEBUG_SLAB
void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
{
return __do_kmalloc(size, flags, caller);
}
EXPORT_SYMBOL(__kmalloc_track_caller);
-
#endif
#ifdef CONFIG_SMP
@@ -3220,7 +3311,7 @@ void *__alloc_percpu(size_t size)
* and we have no way of figuring out how to fix the array
* that we have allocated then....
*/
- for_each_cpu(i) {
+ for_each_possible_cpu(i) {
int node = cpu_to_node(i);
if (node_online(node))
@@ -3236,7 +3327,7 @@ void *__alloc_percpu(size_t size)
/* Catch derefs w/o wrappers */
return (void *)(~(unsigned long)pdata);
- unwind_oom:
+unwind_oom:
while (--i >= 0) {
if (!cpu_possible(i))
continue;
@@ -3307,7 +3398,7 @@ void free_percpu(const void *objp)
/*
* We allocate for all cpus so we cannot use for online cpu here.
*/
- for_each_cpu(i)
+ for_each_possible_cpu(i)
kfree(p->ptrs[i]);
kfree(p);
}
@@ -3327,61 +3418,86 @@ const char *kmem_cache_name(struct kmem_cache *cachep)
EXPORT_SYMBOL_GPL(kmem_cache_name);
/*
- * This initializes kmem_list3 for all nodes.
+ * This initializes kmem_list3 or resizes various caches for all nodes.
*/
static int alloc_kmemlist(struct kmem_cache *cachep)
{
int node;
struct kmem_list3 *l3;
- int err = 0;
+ struct array_cache *new_shared;
+ struct array_cache **new_alien;
for_each_online_node(node) {
- struct array_cache *nc = NULL, *new;
- struct array_cache **new_alien = NULL;
-#ifdef CONFIG_NUMA
- if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
+
+ new_alien = alloc_alien_cache(node, cachep->limit);
+ if (!new_alien)
goto fail;
-#endif
- if (!(new = alloc_arraycache(node, (cachep->shared *
- cachep->batchcount),
- 0xbaadf00d)))
+
+ new_shared = alloc_arraycache(node,
+ cachep->shared*cachep->batchcount,
+ 0xbaadf00d);
+ if (!new_shared) {
+ free_alien_cache(new_alien);
goto fail;
- if ((l3 = cachep->nodelists[node])) {
+ }
+
+ l3 = cachep->nodelists[node];
+ if (l3) {
+ struct array_cache *shared = l3->shared;
spin_lock_irq(&l3->list_lock);
- if ((nc = cachep->nodelists[node]->shared))
- free_block(cachep, nc->entry, nc->avail, node);
+ if (shared)
+ free_block(cachep, shared->entry,
+ shared->avail, node);
- l3->shared = new;
- if (!cachep->nodelists[node]->alien) {
+ l3->shared = new_shared;
+ if (!l3->alien) {
l3->alien = new_alien;
new_alien = NULL;
}
l3->free_limit = (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
+ cachep->batchcount + cachep->num;
spin_unlock_irq(&l3->list_lock);
- kfree(nc);
+ kfree(shared);
free_alien_cache(new_alien);
continue;
}
- if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
- GFP_KERNEL, node)))
+ l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+ if (!l3) {
+ free_alien_cache(new_alien);
+ kfree(new_shared);
goto fail;
+ }
kmem_list3_init(l3);
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
- l3->shared = new;
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ l3->shared = new_shared;
l3->alien = new_alien;
l3->free_limit = (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
+ cachep->batchcount + cachep->num;
cachep->nodelists[node] = l3;
}
- return err;
- fail:
- err = -ENOMEM;
- return err;
+ return 0;
+
+fail:
+ if (!cachep->next.next) {
+ /* Cache is not active yet. Roll back what we did */
+ node--;
+ while (node >= 0) {
+ if (cachep->nodelists[node]) {
+ l3 = cachep->nodelists[node];
+
+ kfree(l3->shared);
+ free_alien_cache(l3->alien);
+ kfree(l3);
+ cachep->nodelists[node] = NULL;
+ }
+ node--;
+ }
+ }
+ return -ENOMEM;
}
struct ccupdate_struct {
@@ -3391,7 +3507,7 @@ struct ccupdate_struct {
static void do_ccupdate_local(void *info)
{
- struct ccupdate_struct *new = (struct ccupdate_struct *)info;
+ struct ccupdate_struct *new = info;
struct array_cache *old;
check_irq_off();
@@ -3401,16 +3517,17 @@ static void do_ccupdate_local(void *info)
new->new[smp_processor_id()] = old;
}
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount,
- int shared)
+/* Always called with the cache_chain_mutex held */
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+ int batchcount, int shared)
{
struct ccupdate_struct new;
int i, err;
memset(&new.new, 0, sizeof(new.new));
for_each_online_cpu(i) {
- new.new[i] =
- alloc_arraycache(cpu_to_node(i), limit, batchcount);
+ new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
+ batchcount);
if (!new.new[i]) {
for (i--; i >= 0; i--)
kfree(new.new[i]);
@@ -3419,14 +3536,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
}
new.cachep = cachep;
- smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
+ on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
check_irq_on();
- spin_lock(&cachep->spinlock);
cachep->batchcount = batchcount;
cachep->limit = limit;
cachep->shared = shared;
- spin_unlock(&cachep->spinlock);
for_each_online_cpu(i) {
struct array_cache *ccold = new.new[i];
@@ -3447,15 +3562,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
return 0;
}
+/* Called with cache_chain_mutex held always */
static void enable_cpucache(struct kmem_cache *cachep)
{
int err;
int limit, shared;
- /* The head array serves three purposes:
+ /*
+ * The head array serves three purposes:
* - create a LIFO ordering, i.e. return objects that are cache-warm
* - reduce the number of spinlock operations.
- * - reduce the number of linked list operations on the slab and
+ * - reduce the number of linked list operations on the slab and
* bufctl chains: array operations are cheaper.
* The numbers are guessed, we should auto-tune as described by
* Bonwick.
@@ -3471,7 +3588,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
else
limit = 120;
- /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
+ /*
+ * CPU bound tasks (e.g. network routing) can exhibit cpu bound
* allocation behaviour: Most allocs on one cpu, most free operations
* on another cpu. For these cases, an efficient object passing between
* cpus is necessary. This is provided by a shared array. The array
@@ -3486,9 +3604,9 @@ static void enable_cpucache(struct kmem_cache *cachep)
#endif
#if DEBUG
- /* With debugging enabled, large batchcount lead to excessively
- * long periods with disabled local interrupts. Limit the
- * batchcount
+ /*
+ * With debugging enabled, large batchcounts lead to excessively long
+ * periods with disabled local interrupts. Limit the batchcount
*/
if (limit > 32)
limit = 32;
@@ -3499,23 +3617,32 @@ static void enable_cpucache(struct kmem_cache *cachep)
cachep->name, -err);
}
-static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
- int force, int node)
+/*
+ * Drain an array if it contains any elements, taking the l3 lock only if
+ * necessary. Note that the l3 listlock also protects the array_cache
+ * if drain_array() is used on the shared array.
+ */
+void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+ struct array_cache *ac, int force, int node)
{
int tofree;
- check_spinlock_acquired_node(cachep, node);
+ if (!ac || !ac->avail)
+ return;
if (ac->touched && !force) {
ac->touched = 0;
- } else if (ac->avail) {
- tofree = force ? ac->avail : (ac->limit + 4) / 5;
- if (tofree > ac->avail) {
- tofree = (ac->avail + 1) / 2;
+ } else {
+ spin_lock_irq(&l3->list_lock);
+ if (ac->avail) {
+ tofree = force ? ac->avail : (ac->limit + 4) / 5;
+ if (tofree > ac->avail)
+ tofree = (ac->avail + 1) / 2;
+ free_block(cachep, ac->entry, tofree, node);
+ ac->avail -= tofree;
+ memmove(ac->entry, &(ac->entry[tofree]),
+ sizeof(void *) * ac->avail);
}
- free_block(cachep, ac->entry, tofree, node);
- ac->avail -= tofree;
- memmove(ac->entry, &(ac->entry[tofree]),
- sizeof(void *) * ac->avail);
+ spin_unlock_irq(&l3->list_lock);
}
}
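
The new drain_array() above frees only a slice of the cached objects on each pass unless forced, then compacts the survivors to the front of the array. A standalone sketch of that policy (hypothetical types, locking omitted):

#include <stdio.h>
#include <string.h>

struct obj_cache {
    void *entry[64];
    int avail;
    int limit;
};

/* Unless forced, release roughly a fifth of the cached entries -- never more
 * than about half of what is actually there -- then slide the rest down. */
static void drain_cache(struct obj_cache *ac, int force)
{
    int tofree;

    if (!ac->avail)
        return;
    tofree = force ? ac->avail : (ac->limit + 4) / 5;
    if (tofree > ac->avail)
        tofree = (ac->avail + 1) / 2;

    /* the real code hands entry[0..tofree) to free_block() here */
    ac->avail -= tofree;
    memmove(ac->entry, &ac->entry[tofree], sizeof(void *) * ac->avail);
}

int main(void)
{
    struct obj_cache ac = { .avail = 12, .limit = 30 };

    drain_cache(&ac, 0);
    printf("after drain: %d entries left\n", ac.avail);  /* 12 - 6 = 6 */
    return 0;
}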
@@ -3528,13 +3655,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac
* - clear the per-cpu caches for this CPU.
* - return freeable pages to the main free memory pool.
*
- * If we cannot acquire the cache chain mutex then just give up - we'll
- * try again on the next iteration.
+ * If we cannot acquire the cache chain mutex then just give up - we'll try
+ * again on the next iteration.
*/
static void cache_reap(void *unused)
{
struct list_head *walk;
struct kmem_list3 *l3;
+ int node = numa_node_id();
if (!mutex_trylock(&cache_chain_mutex)) {
/* Give up. Setup the next iteration. */
@@ -3550,65 +3678,72 @@ static void cache_reap(void *unused)
struct slab *slabp;
searchp = list_entry(walk, struct kmem_cache, next);
-
- if (searchp->flags & SLAB_NO_REAP)
- goto next;
-
check_irq_on();
- l3 = searchp->nodelists[numa_node_id()];
+ /*
+ * We only take the l3 lock if absolutely necessary and we
+ * have established with reasonable certainty that
+ * we can do some work if the lock was obtained.
+ */
+ l3 = searchp->nodelists[node];
+
reap_alien(searchp, l3);
- spin_lock_irq(&l3->list_lock);
- drain_array_locked(searchp, cpu_cache_get(searchp), 0,
- numa_node_id());
+ drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
+ /*
+ * These are racy checks but it does not matter
+ * if we skip one check or scan twice.
+ */
if (time_after(l3->next_reap, jiffies))
- goto next_unlock;
+ goto next;
l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
- if (l3->shared)
- drain_array_locked(searchp, l3->shared, 0,
- numa_node_id());
+ drain_array(searchp, l3, l3->shared, 0, node);
if (l3->free_touched) {
l3->free_touched = 0;
- goto next_unlock;
+ goto next;
}
- tofree =
- (l3->free_limit + 5 * searchp->num -
- 1) / (5 * searchp->num);
+ tofree = (l3->free_limit + 5 * searchp->num - 1) /
+ (5 * searchp->num);
do {
+ /*
+ * Do not lock if there are no free blocks.
+ */
+ if (list_empty(&l3->slabs_free))
+ break;
+
+ spin_lock_irq(&l3->list_lock);
p = l3->slabs_free.next;
- if (p == &(l3->slabs_free))
+ if (p == &(l3->slabs_free)) {
+ spin_unlock_irq(&l3->list_lock);
break;
+ }
slabp = list_entry(p, struct slab, list);
BUG_ON(slabp->inuse);
list_del(&slabp->list);
STATS_INC_REAPED(searchp);
- /* Safe to drop the lock. The slab is no longer
- * linked to the cache.
- * searchp cannot disappear, we hold
+ /*
+ * Safe to drop the lock. The slab is no longer linked
+ * to the cache. searchp cannot disappear, we hold
* cache_chain_lock
*/
l3->free_objects -= searchp->num;
spin_unlock_irq(&l3->list_lock);
slab_destroy(searchp, slabp);
- spin_lock_irq(&l3->list_lock);
} while (--tofree > 0);
- next_unlock:
- spin_unlock_irq(&l3->list_lock);
- next:
+next:
cond_resched();
}
check_irq_on();
mutex_unlock(&cache_chain_mutex);
next_reap_node();
- /* Setup the next iteration */
+ /* Set up the next iteration */
schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
}
@@ -3658,8 +3793,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
struct kmem_cache *cachep = p;
++*pos;
- return cachep->next.next == &cache_chain ? NULL
- : list_entry(cachep->next.next, struct kmem_cache, next);
+ return cachep->next.next == &cache_chain ?
+ NULL : list_entry(cachep->next.next, struct kmem_cache, next);
}
static void s_stop(struct seq_file *m, void *p)
@@ -3681,7 +3816,6 @@ static int s_show(struct seq_file *m, void *p)
int node;
struct kmem_list3 *l3;
- spin_lock(&cachep->spinlock);
active_objs = 0;
num_slabs = 0;
for_each_online_node(node) {
@@ -3748,7 +3882,9 @@ static int s_show(struct seq_file *m, void *p)
unsigned long node_frees = cachep->node_frees;
seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
- %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
+ %4lu %4lu %4lu %4lu", allocs, high, grown,
+ reaped, errors, max_freeable, node_allocs,
+ node_frees);
}
/* cpu stats */
{
@@ -3762,7 +3898,6 @@ static int s_show(struct seq_file *m, void *p)
}
#endif
seq_putc(m, '\n');
- spin_unlock(&cachep->spinlock);
return 0;
}
@@ -3820,13 +3955,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
mutex_lock(&cache_chain_mutex);
res = -EINVAL;
list_for_each(p, &cache_chain) {
- struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
- next);
+ struct kmem_cache *cachep;
+ cachep = list_entry(p, struct kmem_cache, next);
if (!strcmp(cachep->name, kbuf)) {
- if (limit < 1 ||
- batchcount < 1 ||
- batchcount > limit || shared < 0) {
+ if (limit < 1 || batchcount < 1 ||
+ batchcount > limit || shared < 0) {
res = 0;
} else {
res = do_tune_cpucache(cachep, limit,
@@ -3840,6 +3974,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
res = count;
return res;
}
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void *leaks_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t n = *pos;
+ struct list_head *p;
+
+ mutex_lock(&cache_chain_mutex);
+ p = cache_chain.next;
+ while (n--) {
+ p = p->next;
+ if (p == &cache_chain)
+ return NULL;
+ }
+ return list_entry(p, struct kmem_cache, next);
+}
+
+static inline int add_caller(unsigned long *n, unsigned long v)
+{
+ unsigned long *p;
+ int l;
+ if (!v)
+ return 1;
+ l = n[1];
+ p = n + 2;
+ while (l) {
+ int i = l/2;
+ unsigned long *q = p + 2 * i;
+ if (*q == v) {
+ q[1]++;
+ return 1;
+ }
+ if (*q > v) {
+ l = i;
+ } else {
+ p = q + 2;
+ l -= i + 1;
+ }
+ }
+ if (++n[1] == n[0])
+ return 0;
+ memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
+ p[0] = v;
+ p[1] = 1;
+ return 1;
+}
+
+static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
+{
+ void *p;
+ int i;
+ if (n[0] == n[1])
+ return;
+ for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
+ if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
+ continue;
+ if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
+ return;
+ }
+}
+
+static void show_symbol(struct seq_file *m, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+ char *modname;
+ const char *name;
+ unsigned long offset, size;
+ char namebuf[KSYM_NAME_LEN+1];
+
+ name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
+
+ if (name) {
+ seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
+ if (modname)
+ seq_printf(m, " [%s]", modname);
+ return;
+ }
+#endif
+ seq_printf(m, "%p", (void *)address);
+}
+
+static int leaks_show(struct seq_file *m, void *p)
+{
+ struct kmem_cache *cachep = p;
+ struct list_head *q;
+ struct slab *slabp;
+ struct kmem_list3 *l3;
+ const char *name;
+ unsigned long *n = m->private;
+ int node;
+ int i;
+
+ if (!(cachep->flags & SLAB_STORE_USER))
+ return 0;
+ if (!(cachep->flags & SLAB_RED_ZONE))
+ return 0;
+
+ /* OK, we can do it */
+
+ n[1] = 0;
+
+ for_each_online_node(node) {
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+
+ check_irq_on();
+ spin_lock_irq(&l3->list_lock);
+
+ list_for_each(q, &l3->slabs_full) {
+ slabp = list_entry(q, struct slab, list);
+ handle_slab(n, cachep, slabp);
+ }
+ list_for_each(q, &l3->slabs_partial) {
+ slabp = list_entry(q, struct slab, list);
+ handle_slab(n, cachep, slabp);
+ }
+ spin_unlock_irq(&l3->list_lock);
+ }
+ name = cachep->name;
+ if (n[0] == n[1]) {
+ /* Increase the buffer size */
+ mutex_unlock(&cache_chain_mutex);
+ m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
+ if (!m->private) {
+ /* Too bad, we are really out */
+ m->private = n;
+ mutex_lock(&cache_chain_mutex);
+ return -ENOMEM;
+ }
+ *(unsigned long *)m->private = n[0] * 2;
+ kfree(n);
+ mutex_lock(&cache_chain_mutex);
+ /* Now make sure this entry will be retried */
+ m->count = m->size;
+ return 0;
+ }
+ for (i = 0; i < n[1]; i++) {
+ seq_printf(m, "%s: %lu ", name, n[2*i+3]);
+ show_symbol(m, n[2*i+2]);
+ seq_putc(m, '\n');
+ }
+ return 0;
+}
+
+struct seq_operations slabstats_op = {
+ .start = leaks_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = leaks_show,
+};
+#endif
#endif
/**
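
The CONFIG_DEBUG_SLAB_LEAK code added above stores its per-caller counters in one flat array: n[0] is the capacity, n[1] the number of distinct callers recorded so far, followed by (address, count) pairs kept sorted so add_caller() can use a binary search plus a memmove insert. A standalone sketch of the same bookkeeping, with hypothetical names and sample addresses:

#include <stdio.h>
#include <string.h>

/* n[0] = capacity (max distinct keys), n[1] = keys in use,
 * n[2..] = sorted (key, count) pairs -- the same layout add_caller() uses. */
static int record_caller(unsigned long *n, unsigned long v)
{
    unsigned long *p = n + 2;
    unsigned long l = n[1];

    while (l) {                      /* binary search for v */
        unsigned long i = l / 2;
        unsigned long *q = p + 2 * i;

        if (*q == v) {
            q[1]++;                  /* already known: bump its count */
            return 1;
        }
        if (*q > v) {
            l = i;
        } else {
            p = q + 2;
            l -= i + 1;
        }
    }
    if (n[1] == n[0])
        return 0;                    /* table full */
    /* shift the larger keys up and insert the new one with count 1 */
    memmove(p + 2, p, (n + 2 + 2 * n[1] - p) * sizeof(*p));
    p[0] = v;
    p[1] = 1;
    n[1]++;
    return 1;
}

int main(void)
{
    unsigned long table[2 + 2 * 8] = { 8, 0 };
    unsigned long samples[] = { 0x400, 0x100, 0x400, 0x250, 0x100, 0x400 };

    for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        record_caller(table, samples[i]);
    for (unsigned long i = 0; i < table[1]; i++)
        printf("%#lx seen %lu times\n",
               table[2 + 2 * i], table[2 + 2 * i + 1]);
    return 0;
}

The sorted layout keeps each lookup to O(log n) per live object, which is what lets leaks_show() print one line per caller rather than one per outstanding allocation.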
diff --git a/mm/slob.c b/mm/slob.c
index a1f42bdc024..9bcc7e2cabf 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
+void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
+{
+ void *ret = kmem_cache_alloc(c, flags);
+ if (ret)
+ memset(ret, 0, c->size);
+
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_zalloc);
+
void kmem_cache_free(struct kmem_cache *c, void *b)
{
if (c->dtor)
diff --git a/mm/swap.c b/mm/swap.c
index b524ea90bdd..88895c249bc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -209,19 +209,18 @@ int lru_add_drain_all(void)
*/
void fastcall __page_cache_release(struct page *page)
{
- unsigned long flags;
- struct zone *zone = page_zone(page);
+ if (PageLRU(page)) {
+ unsigned long flags;
+ struct zone *zone = page_zone(page);
- spin_lock_irqsave(&zone->lru_lock, flags);
- if (TestClearPageLRU(page))
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ BUG_ON(!PageLRU(page));
+ __ClearPageLRU(page);
del_page_from_lru(zone, page);
- if (page_count(page) != 0)
- page = NULL;
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- if (page)
- free_hot_page(page);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ }
+ free_hot_page(page);
}
-
EXPORT_SYMBOL(__page_cache_release);
/*
@@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold)
pagevec_init(&pages_to_free, cold);
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
- struct zone *pagezone;
if (unlikely(PageCompound(page))) {
if (zone) {
@@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold)
if (!put_page_testzero(page))
continue;
- pagezone = page_zone(page);
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
- if (TestClearPageLRU(page))
+ if (PageLRU(page)) {
+ struct zone *pagezone = page_zone(page);
+ if (pagezone != zone) {
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ zone = pagezone;
+ spin_lock_irq(&zone->lru_lock);
+ }
+ BUG_ON(!PageLRU(page));
+ __ClearPageLRU(page);
del_page_from_lru(zone, page);
- if (page_count(page) == 0) {
- if (!pagevec_add(&pages_to_free, page)) {
+ }
+
+ if (!pagevec_add(&pages_to_free, page)) {
+ if (zone) {
spin_unlock_irq(&zone->lru_lock);
- __pagevec_free(&pages_to_free);
- pagevec_reinit(&pages_to_free);
- zone = NULL; /* No lock is held */
+ zone = NULL;
}
- }
+ __pagevec_free(&pages_to_free);
+ pagevec_reinit(&pages_to_free);
+ }
}
if (zone)
spin_unlock_irq(&zone->lru_lock);
@@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec)
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
- if (TestSetPageLRU(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
add_page_to_inactive_list(zone, page);
}
if (zone)
@@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
- if (TestSetPageLRU(page))
- BUG();
- if (TestSetPageActive(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ BUG_ON(PageActive(page));
+ SetPageActive(page);
add_page_to_active_list(zone, page);
}
if (zone)
@@ -510,7 +512,7 @@ long percpu_counter_sum(struct percpu_counter *fbc)
spin_lock(&fbc->lock);
ret = fbc->count;
- for_each_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
long *pcount = per_cpu_ptr(fbc->counters, cpu);
ret += *pcount;
}
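
The reworked release_pages() above collects freeable pages into a pagevec while holding the zone lru_lock and drops the lock before each batched free. A sketch of that batch-and-flush pattern in plain C (hypothetical types; a pthread mutex stands in for the zone lock):

#include <pthread.h>
#include <stdio.h>

#define BATCH 14   /* the kernel's pagevec batches a similarly small number */

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void release_batch(int *batch, int n)
{
    for (int i = 0; i < n; i++)
        printf("releasing item %d\n", batch[i]);
}

static void release_items(const int *items, int nr)
{
    int batch[BATCH];
    int n = 0;
    int locked = 0;

    for (int i = 0; i < nr; i++) {
        if (!locked) {               /* lock only while touching shared state */
            pthread_mutex_lock(&list_lock);
            locked = 1;
        }
        /* ... unlink items[i] from the shared structure here ... */
        batch[n++] = items[i];

        if (n == BATCH) {
            pthread_mutex_unlock(&list_lock);   /* never free under the lock */
            locked = 0;
            release_batch(batch, n);
            n = 0;
        }
    }
    if (locked)
        pthread_mutex_unlock(&list_lock);
    release_batch(batch, n);         /* flush the remainder */
}

int main(void)
{
    int items[20];

    for (int i = 0; i < 20; i++)
        items[i] = i;
    release_items(items, 20);
    return 0;
}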
diff --git a/mm/swap_state.c b/mm/swap_state.c
index db8a3d3e163..d7af296833f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
+#include <linux/migrate.h>
#include <asm/pgtable.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f9cf0d073b..39aa9d12961 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry ";
struct swap_list_t swap_list = {-1, -1};
-struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
@@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
last_in_cluster = offset + SWAPFILE_CLUSTER;
else if (offset == last_in_cluster) {
spin_lock(&swap_lock);
- si->cluster_next = offset-SWAPFILE_CLUSTER-1;
+ si->cluster_next = offset-SWAPFILE_CLUSTER+1;
goto cluster;
}
if (unlikely(--latency_ration < 0)) {
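
The one-line change above fixes where the next scan starts: when a run of SWAPFILE_CLUSTER consecutive free slots ends at offset, its first slot is offset - SWAPFILE_CLUSTER + 1, not offset - SWAPFILE_CLUSTER - 1. A tiny check of the arithmetic, taking SWAPFILE_CLUSTER as 256 (its value in this file):

#include <assert.h>

int main(void)
{
    const unsigned long SWAPFILE_CLUSTER = 256;
    unsigned long last = 1000;               /* hypothetical end of a free run */
    unsigned long first = last - SWAPFILE_CLUSTER + 1;

    /* the run [first, last] holds exactly SWAPFILE_CLUSTER slots ... */
    assert(last - first + 1 == SWAPFILE_CLUSTER);
    /* ... whereas the old "- 1" start would cover two extra slots */
    assert(last - (last - SWAPFILE_CLUSTER - 1) + 1 == SWAPFILE_CLUSTER + 2);
    return 0;
}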
@@ -417,6 +417,61 @@ void free_swap_and_cache(swp_entry_t entry)
}
}
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Find the swap type that corresponds to the given device (if any)
+ *
+ * This is needed for software suspend and is done in such a way that inode
+ * aliasing is allowed.
+ */
+int swap_type_of(dev_t device)
+{
+ int i;
+
+ spin_lock(&swap_lock);
+ for (i = 0; i < nr_swapfiles; i++) {
+ struct inode *inode;
+
+ if (!(swap_info[i].flags & SWP_WRITEOK))
+ continue;
+ if (!device) {
+ spin_unlock(&swap_lock);
+ return i;
+ }
+ inode = swap_info[i].swap_file->f_dentry->d_inode;
+ if (S_ISBLK(inode->i_mode) &&
+ device == MKDEV(imajor(inode), iminor(inode))) {
+ spin_unlock(&swap_lock);
+ return i;
+ }
+ }
+ spin_unlock(&swap_lock);
+ return -ENODEV;
+}
+
+/*
+ * Return either the total number of swap pages of the given type, or the number
+ * of free pages of that type (depending on @free)
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+ unsigned int n = 0;
+
+ if (type < nr_swapfiles) {
+ spin_lock(&swap_lock);
+ if (swap_info[type].flags & SWP_WRITEOK) {
+ n = swap_info[type].pages;
+ if (free)
+ n -= swap_info[type].inuse_pages;
+ }
+ spin_unlock(&swap_lock);
+ }
+ return n;
+}
+#endif
+
/*
* No need to decide whether this PTE shares the swap entry with others,
* just let do_wp_page work it out if a write is requested later - to
diff --git a/mm/util.c b/mm/util.c
index 5f4bb59da63..7368479220b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,20 +1,22 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
+#include <linux/err.h>
+#include <asm/uaccess.h>
/**
- * kzalloc - allocate memory. The memory is set to zero.
+ * __kzalloc - allocate memory. The memory is set to zero.
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
*/
-void *kzalloc(size_t size, gfp_t flags)
+void *__kzalloc(size_t size, gfp_t flags)
{
- void *ret = kmalloc(size, flags);
+ void *ret = ____kmalloc(size, flags);
if (ret)
memset(ret, 0, size);
return ret;
}
-EXPORT_SYMBOL(kzalloc);
+EXPORT_SYMBOL(__kzalloc);
/*
* kstrdup - allocate space for and copy an existing string
@@ -31,9 +33,44 @@ char *kstrdup(const char *s, gfp_t gfp)
return NULL;
len = strlen(s) + 1;
- buf = kmalloc(len, gfp);
+ buf = ____kmalloc(len, gfp);
if (buf)
memcpy(buf, s, len);
return buf;
}
EXPORT_SYMBOL(kstrdup);
+
+/*
+ * strndup_user - duplicate an existing string from user space
+ *
+ * @s: The string to duplicate
+ * @n: Maximum number of bytes to copy, including the trailing NUL.
+ */
+char *strndup_user(const char __user *s, long n)
+{
+ char *p;
+ long length;
+
+ length = strnlen_user(s, n);
+
+ if (!length)
+ return ERR_PTR(-EFAULT);
+
+ if (length > n)
+ return ERR_PTR(-EINVAL);
+
+ p = kmalloc(length, GFP_KERNEL);
+
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(p, s, length)) {
+ kfree(p);
+ return ERR_PTR(-EFAULT);
+ }
+
+ p[length - 1] = '\0';
+
+ return p;
+}
+EXPORT_SYMBOL(strndup_user);
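
strndup_user() above bounds the copy, rejects strings that do not fit in n bytes including the terminator, and always NUL-terminates the result. A userspace analogue of that contract (plain libc calls stand in for strnlen_user()/copy_from_user(); this is a sketch, not the kernel helper):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Duplicate at most n bytes of s, including the terminator; fail with
 * EINVAL if no NUL is found within the limit, always NUL-terminate. */
static char *bounded_strdup(const char *s, size_t n)
{
    const char *nul = memchr(s, '\0', n);
    size_t len;
    char *p;

    if (!nul) {
        errno = EINVAL;
        return NULL;
    }
    len = (size_t)(nul - s);
    p = malloc(len + 1);
    if (!p) {
        errno = ENOMEM;
        return NULL;
    }
    memcpy(p, s, len);
    p[len] = '\0';
    return p;
}

int main(void)
{
    char *ok = bounded_strdup("swapfile", 16);
    char *bad = bounded_strdup("this string is far too long", 8);

    printf("ok=%s, bad=%s\n", ok ? ok : "(null)", bad ? bad : "(null)");
    free(ok);
    free(bad);
    return 0;
}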
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4fe7e3aa02e..acdf001d694 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -33,39 +33,21 @@
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
+#include <linux/delay.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
-/* possible outcome of pageout() */
-typedef enum {
- /* failed to write page out, page is locked */
- PAGE_KEEP,
- /* move page to the active list, page is locked */
- PAGE_ACTIVATE,
- /* page has been sent to the disk successfully, page is unlocked */
- PAGE_SUCCESS,
- /* page is clean and locked */
- PAGE_CLEAN,
-} pageout_t;
+#include "internal.h"
struct scan_control {
- /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
- unsigned long nr_to_scan;
-
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
- /* Incremented by the number of pages reclaimed */
- unsigned long nr_reclaimed;
-
unsigned long nr_mapped; /* From page_state */
- /* Ask shrink_caches, or shrink_zone to scan at this priority */
- unsigned int priority;
-
/* This context's GFP mask */
gfp_t gfp_mask;
@@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker);
*
* Returns the number of slab objects which we shrunk.
*/
-int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+ unsigned long lru_pages)
{
struct shrinker *shrinker;
- int ret = 0;
+ unsigned long ret = 0;
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
@@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping,
}
/*
- * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
+ * pageout is called by shrink_page_list() for each dirty page.
+ * Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+pageout_t pageout(struct page *page, struct address_space *mapping)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
return PAGE_CLEAN;
}
-static int remove_mapping(struct address_space *mapping, struct page *page)
+int remove_mapping(struct address_space *mapping, struct page *page)
{
if (!mapping)
return 0; /* truncate got there first */
@@ -414,14 +398,15 @@ cannot_free:
}
/*
- * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
+ * shrink_page_list() returns the number of reclaimed pages
*/
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static unsigned long shrink_page_list(struct list_head *page_list,
+ struct scan_control *sc)
{
LIST_HEAD(ret_pages);
struct pagevec freed_pvec;
int pgactivate = 0;
- int reclaimed = 0;
+ unsigned long nr_reclaimed = 0;
cond_resched();
@@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
- if (!sc->may_swap)
- goto keep_locked;
+ if (PageAnon(page) && !PageSwapCache(page))
if (!add_to_swap(page, GFP_ATOMIC))
goto activate_locked;
- }
#endif /* CONFIG_SWAP */
mapping = page_mapping(page);
@@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- /*
- * No unmapping if we do not swap
- */
- if (!sc->may_swap)
- goto keep_locked;
-
switch (try_to_unmap(page, 0)) {
case SWAP_FAIL:
goto activate_locked;
@@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
free_it:
unlock_page(page);
- reclaimed++;
+ nr_reclaimed++;
if (!pagevec_add(&freed_pvec, page))
__pagevec_release_nonlru(&freed_pvec);
continue;
@@ -579,483 +555,8 @@ keep:
if (pagevec_count(&freed_pvec))
__pagevec_release_nonlru(&freed_pvec);
mod_page_state(pgactivate, pgactivate);
- sc->nr_reclaimed += reclaimed;
- return reclaimed;
-}
-
-#ifdef CONFIG_MIGRATION
-static inline void move_to_lru(struct page *page)
-{
- list_del(&page->lru);
- if (PageActive(page)) {
- /*
- * lru_cache_add_active checks that
- * the PG_active bit is off.
- */
- ClearPageActive(page);
- lru_cache_add_active(page);
- } else {
- lru_cache_add(page);
- }
- put_page(page);
-}
-
-/*
- * Add isolated pages on the list back to the LRU.
- *
- * returns the number of pages put back.
- */
-int putback_lru_pages(struct list_head *l)
-{
- struct page *page;
- struct page *page2;
- int count = 0;
-
- list_for_each_entry_safe(page, page2, l, lru) {
- move_to_lru(page);
- count++;
- }
- return count;
-}
-
-/*
- * Non migratable page
- */
-int fail_migrate_page(struct page *newpage, struct page *page)
-{
- return -EIO;
-}
-EXPORT_SYMBOL(fail_migrate_page);
-
-/*
- * swapout a single page
- * page is locked upon entry, unlocked on exit
- */
-static int swap_page(struct page *page)
-{
- struct address_space *mapping = page_mapping(page);
-
- if (page_mapped(page) && mapping)
- if (try_to_unmap(page, 1) != SWAP_SUCCESS)
- goto unlock_retry;
-
- if (PageDirty(page)) {
- /* Page is dirty, try to write it out here */
- switch(pageout(page, mapping)) {
- case PAGE_KEEP:
- case PAGE_ACTIVATE:
- goto unlock_retry;
-
- case PAGE_SUCCESS:
- goto retry;
-
- case PAGE_CLEAN:
- ; /* try to free the page below */
- }
- }
-
- if (PagePrivate(page)) {
- if (!try_to_release_page(page, GFP_KERNEL) ||
- (!mapping && page_count(page) == 1))
- goto unlock_retry;
- }
-
- if (remove_mapping(mapping, page)) {
- /* Success */
- unlock_page(page);
- return 0;
- }
-
-unlock_retry:
- unlock_page(page);
-
-retry:
- return -EAGAIN;
+ return nr_reclaimed;
}
-EXPORT_SYMBOL(swap_page);
-
-/*
- * Page migration was first developed in the context of the memory hotplug
- * project. The main authors of the migration code are:
- *
- * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
- * Hirokazu Takahashi <taka@valinux.co.jp>
- * Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
- */
-
-/*
- * Remove references for a page and establish the new page with the correct
- * basic settings to be able to stop accesses to the page.
- */
-int migrate_page_remove_references(struct page *newpage,
- struct page *page, int nr_refs)
-{
- struct address_space *mapping = page_mapping(page);
- struct page **radix_pointer;
-
- /*
- * Avoid doing any of the following work if the page count
- * indicates that the page is in use or truncate has removed
- * the page.
- */
- if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
- return -EAGAIN;
-
- /*
- * Establish swap ptes for anonymous pages or destroy pte
- * maps for files.
- *
- * In order to reestablish file backed mappings the fault handlers
- * will take the radix tree_lock which may then be used to stop
- * processes from accessing this page until the new page is ready.
- *
- * A process accessing via a swap pte (an anonymous page) will take a
- * page_lock on the old page which will block the process until the
- * migration attempt is complete. At that time the PageSwapCache bit
- * will be examined. If the page was migrated then the PageSwapCache
- * bit will be clear and the operation to retrieve the page will be
- * retried which will find the new page in the radix tree. Then a new
- * direct mapping may be generated based on the radix tree contents.
- *
- * If the page was not migrated then the PageSwapCache bit
- * is still set and the operation may continue.
- */
- if (try_to_unmap(page, 1) == SWAP_FAIL)
- /* A vma has VM_LOCKED set -> Permanent failure */
- return -EPERM;
-
- /*
- * Give up if we were unable to remove all mappings.
- */
- if (page_mapcount(page))
- return -EAGAIN;
-
- write_lock_irq(&mapping->tree_lock);
-
- radix_pointer = (struct page **)radix_tree_lookup_slot(
- &mapping->page_tree,
- page_index(page));
-
- if (!page_mapping(page) || page_count(page) != nr_refs ||
- *radix_pointer != page) {
- write_unlock_irq(&mapping->tree_lock);
- return -EAGAIN;
- }
-
- /*
- * Now we know that no one else is looking at the page.
- *
- * Certain minimal information about a page must be available
- * in order for other subsystems to properly handle the page if they
- * find it through the radix tree update before we are finished
- * copying the page.
- */
- get_page(newpage);
- newpage->index = page->index;
- newpage->mapping = page->mapping;
- if (PageSwapCache(page)) {
- SetPageSwapCache(newpage);
- set_page_private(newpage, page_private(page));
- }
-
- *radix_pointer = newpage;
- __put_page(page);
- write_unlock_irq(&mapping->tree_lock);
-
- return 0;
-}
-EXPORT_SYMBOL(migrate_page_remove_references);
-
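The block comment above summarises the core of migrate_page_remove_references(): unmap the page, take the mapping's tree lock, verify that exactly the expected references remain, then point the radix tree slot at the new page. Below is a heavily simplified userspace sketch of that slot swap under a lock; the fixed slot array, the pthread mutex and struct mini_page are hypothetical stand-ins for mapping->page_tree, mapping->tree_lock and struct page, not kernel API.

/*
 * Heavily simplified userspace model of the slot-replacement step:
 * take the "tree lock", verify that only the expected references remain
 * and the slot still points at the old page, then install the new page.
 * slots[], tree_lock and struct mini_page are stand-ins, not kernel code.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_SLOTS 16

struct mini_page {
	int count;		/* stands in for page->_count */
	unsigned long index;
};

static struct mini_page *slots[NR_SLOTS];	/* the "radix tree" */
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* 0 on success, -1 (think -EAGAIN) if the page is still in use elsewhere. */
static int replace_slot(struct mini_page *newpage, struct mini_page *page,
			int expected_refs)
{
	int ret = -1;

	pthread_mutex_lock(&tree_lock);
	if (page->count == expected_refs && slots[page->index] == page) {
		/* carry over the minimal state other lookups rely on */
		newpage->index = page->index;
		newpage->count++;		/* get_page(newpage) */
		slots[page->index] = newpage;
		page->count--;			/* __put_page(page) */
		ret = 0;
	}
	pthread_mutex_unlock(&tree_lock);
	return ret;
}

int main(void)
{
	struct mini_page oldpage = { .count = 2, .index = 3 };	/* isolator + radix ref */
	struct mini_page newpage = { .count = 1, .index = 0 };

	slots[oldpage.index] = &oldpage;
	printf("replace: %s\n", replace_slot(&newpage, &oldpage, 2) ? "-EAGAIN" : "ok");
	printf("slot now holds the new page: %d\n", slots[3] == &newpage);
	return 0;
}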
-/*
- * Copy the page to its new location
- */
-void migrate_page_copy(struct page *newpage, struct page *page)
-{
- copy_highpage(newpage, page);
-
- if (PageError(page))
- SetPageError(newpage);
- if (PageReferenced(page))
- SetPageReferenced(newpage);
- if (PageUptodate(page))
- SetPageUptodate(newpage);
- if (PageActive(page))
- SetPageActive(newpage);
- if (PageChecked(page))
- SetPageChecked(newpage);
- if (PageMappedToDisk(page))
- SetPageMappedToDisk(newpage);
-
- if (PageDirty(page)) {
- clear_page_dirty_for_io(page);
- set_page_dirty(newpage);
- }
-
- ClearPageSwapCache(page);
- ClearPageActive(page);
- ClearPagePrivate(page);
- set_page_private(page, 0);
- page->mapping = NULL;
-
- /*
- * If any waiters have accumulated on the new page then
- * wake them up.
- */
- if (PageWriteback(newpage))
- end_page_writeback(newpage);
-}
-EXPORT_SYMBOL(migrate_page_copy);
-
-/*
- * Common logic to directly migrate a single page suitable for
- * pages that do not use PagePrivate.
- *
- * Pages are locked upon entry and exit.
- */
-int migrate_page(struct page *newpage, struct page *page)
-{
- int rc;
-
- BUG_ON(PageWriteback(page)); /* Writeback must be complete */
-
- rc = migrate_page_remove_references(newpage, page, 2);
-
- if (rc)
- return rc;
-
- migrate_page_copy(newpage, page);
-
- /*
- * Remove auxiliary swap entries and replace
- * them with real ptes.
- *
- * Note that a real pte entry will allow processes that are not
- * waiting on the page lock to use the new page via the page tables
- * before the new page is unlocked.
- */
- remove_from_swap(newpage);
- return 0;
-}
-EXPORT_SYMBOL(migrate_page);
-
-/*
- * migrate_pages
- *
- * Two lists are passed to this function. The first list
- * contains the pages isolated from the LRU to be migrated.
- * The second list contains new pages that the pages isolated
- * can be moved to. If the second list is NULL then all
- * pages are swapped out.
- *
- * The function returns after 10 attempts or if no pages
- * are movable anymore because the "to" list has become empty
- * or no retryable pages exist anymore.
- *
- * Return: Number of pages not migrated when "to" ran empty.
- */
-int migrate_pages(struct list_head *from, struct list_head *to,
- struct list_head *moved, struct list_head *failed)
-{
- int retry;
- int nr_failed = 0;
- int pass = 0;
- struct page *page;
- struct page *page2;
- int swapwrite = current->flags & PF_SWAPWRITE;
- int rc;
-
- if (!swapwrite)
- current->flags |= PF_SWAPWRITE;
-
-redo:
- retry = 0;
-
- list_for_each_entry_safe(page, page2, from, lru) {
- struct page *newpage = NULL;
- struct address_space *mapping;
-
- cond_resched();
-
- rc = 0;
- if (page_count(page) == 1)
- /* page was freed from under us. So we are done. */
- goto next;
-
- if (to && list_empty(to))
- break;
-
- /*
- * Skip locked pages during the first two passes to give the
- * functions holding the lock time to release the page. Later we
- * use lock_page() to have a higher chance of acquiring the
- * lock.
- */
- rc = -EAGAIN;
- if (pass > 2)
- lock_page(page);
- else
- if (TestSetPageLocked(page))
- goto next;
-
- /*
- * Only wait on writeback if we have already done a pass where
- * we may have triggered writeouts for lots of pages.
- */
- if (pass > 0) {
- wait_on_page_writeback(page);
- } else {
- if (PageWriteback(page))
- goto unlock_page;
- }
-
- /*
- * Anonymous pages must have swap cache references otherwise
- * the information contained in the page maps cannot be
- * preserved.
- */
- if (PageAnon(page) && !PageSwapCache(page)) {
- if (!add_to_swap(page, GFP_KERNEL)) {
- rc = -ENOMEM;
- goto unlock_page;
- }
- }
-
- if (!to) {
- rc = swap_page(page);
- goto next;
- }
-
- newpage = lru_to_page(to);
- lock_page(newpage);
-
- /*
- * Pages are properly locked and writeback is complete.
- * Try to migrate the page.
- */
- mapping = page_mapping(page);
- if (!mapping)
- goto unlock_both;
-
- if (mapping->a_ops->migratepage) {
- /*
- * Most pages have a mapping and most filesystems
- * should provide a migration function. Anonymous
- * pages are part of swap space which also has its
- * own migration function. This is the most common
- * path for page migration.
- */
- rc = mapping->a_ops->migratepage(newpage, page);
- goto unlock_both;
- }
-
- /*
- * Default handling if a filesystem does not provide
- * a migration function. We can only migrate clean
- * pages so try to write out any dirty pages first.
- */
- if (PageDirty(page)) {
- switch (pageout(page, mapping)) {
- case PAGE_KEEP:
- case PAGE_ACTIVATE:
- goto unlock_both;
-
- case PAGE_SUCCESS:
- unlock_page(newpage);
- goto next;
-
- case PAGE_CLEAN:
- ; /* try to migrate the page below */
- }
- }
-
- /*
- * Buffers are managed in a filesystem specific way.
- * We must have no buffers or drop them.
- */
- if (!page_has_buffers(page) ||
- try_to_release_page(page, GFP_KERNEL)) {
- rc = migrate_page(newpage, page);
- goto unlock_both;
- }
-
- /*
- * On early passes with mapped pages simply
- * retry. There may be a lock held for some
- * buffers that may go away. Later
- * swap them out.
- */
- if (pass > 4) {
- /*
- * Persistently unable to drop buffers..... As a
- * measure of last resort we fall back to
- * swap_page().
- */
- unlock_page(newpage);
- newpage = NULL;
- rc = swap_page(page);
- goto next;
- }
-
-unlock_both:
- unlock_page(newpage);
-
-unlock_page:
- unlock_page(page);
-
-next:
- if (rc == -EAGAIN) {
- retry++;
- } else if (rc) {
- /* Permanent failure */
- list_move(&page->lru, failed);
- nr_failed++;
- } else {
- if (newpage) {
- /* Successful migration. Return page to LRU */
- move_to_lru(newpage);
- }
- list_move(&page->lru, moved);
- }
- }
- if (retry && pass++ < 10)
- goto redo;
-
- if (!swapwrite)
- current->flags &= ~PF_SWAPWRITE;
-
- return nr_failed + retry;
-}
-
-/*
- * Isolate one page from the LRU lists and put it on the
- * indicated list with elevated refcount.
- *
- * Result:
- * 0 = page not on LRU list
- * 1 = page removed from LRU list and added to the specified list.
- */
-int isolate_lru_page(struct page *page)
-{
- int ret = 0;
-
- if (PageLRU(page)) {
- struct zone *zone = page_zone(page);
- spin_lock_irq(&zone->lru_lock);
- if (TestClearPageLRU(page)) {
- ret = 1;
- get_page(page);
- if (PageActive(page))
- del_page_from_active_list(zone, page);
- else
- del_page_from_inactive_list(zone, page);
- }
- spin_unlock_irq(&zone->lru_lock);
- }
-
- return ret;
-}
-#endif
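The migration code removed above keeps a simple retry discipline: -EAGAIN means "try this page again on a later pass", any other error is permanent, and the list is walked at most 10 times, with the return value counting pages that were neither migrated nor permanently failed. A minimal userspace sketch of that accounting follows; try_one() and struct fake_page are made up for illustration and stand in for the real per-page migration attempt.

/*
 * Userspace sketch of the retry accounting used by the migration code
 * above: -EAGAIN is retried on a later pass, any other error is a
 * permanent failure, and at most 10 passes are made.  struct fake_page
 * and try_one() are invented; the real code works on struct page lists
 * and address_space callbacks.
 */
#include <errno.h>
#include <stdio.h>

struct fake_page {
	int busy_passes;	/* returns -EAGAIN for this many passes */
	int permanent_error;	/* nonzero: fails with -EIO */
	int done;		/* already migrated or permanently failed */
};

static int try_one(struct fake_page *p, int pass)
{
	if (p->permanent_error)
		return -EIO;
	if (pass < p->busy_passes)
		return -EAGAIN;		/* e.g. locked or under writeback */
	return 0;
}

/* Returns the number of pages that could not be migrated. */
static int migrate_all(struct fake_page *pages, int n)
{
	int pass, retry = 0, nr_failed = 0, moved = 0;

	for (pass = 0; pass < 10; pass++) {
		retry = 0;
		for (int i = 0; i < n; i++) {
			int rc;

			if (pages[i].done)
				continue;
			rc = try_one(&pages[i], pass);
			if (rc == -EAGAIN) {
				retry++;		/* look at it again next pass */
			} else if (rc) {
				pages[i].done = 1;	/* permanent failure */
				nr_failed++;
			} else {
				pages[i].done = 1;	/* migrated */
				moved++;
			}
		}
		if (!retry)
			break;
	}
	printf("moved %d, failed %d, still busy %d\n", moved, nr_failed, retry);
	return nr_failed + retry;
}

int main(void)
{
	struct fake_page pages[] = {
		{ .busy_passes = 0 },		/* migrates on the first pass */
		{ .busy_passes = 3 },		/* needs a few passes */
		{ .permanent_error = 1 },	/* e.g. a VM_LOCKED mapping */
		{ .busy_passes = 64 },		/* never settles within 10 passes */
	};

	migrate_all(pages, 4);
	return 0;
}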
/*
* zone->lru_lock is heavily contended. Some of the functions that
@@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page)
*
* returns how many pages were moved onto *@dst.
*/
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
- struct list_head *dst, int *scanned)
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+ struct list_head *src, struct list_head *dst,
+ unsigned long *scanned)
{
- int nr_taken = 0;
+ unsigned long nr_taken = 0;
struct page *page;
- int scan = 0;
+ unsigned long scan;
- while (scan++ < nr_to_scan && !list_empty(src)) {
+ for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+ struct list_head *target;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
- if (!TestClearPageLRU(page))
- BUG();
+ BUG_ON(!PageLRU(page));
+
list_del(&page->lru);
- if (get_page_testone(page)) {
+ target = src;
+ if (likely(get_page_unless_zero(page))) {
/*
- * It is being freed elsewhere
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
*/
- __put_page(page);
- SetPageLRU(page);
- list_add(&page->lru, src);
- continue;
- } else {
- list_add(&page->lru, dst);
+ ClearPageLRU(page);
+ target = dst;
nr_taken++;
- }
+ } /* else it is being freed elsewhere */
+
+ list_add(&page->lru, target);
}
*scanned = scan;
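The new comment in this hunk is the important part: a reference is taken with get_page_unless_zero() before PageLRU is cleared, so a page whose count has already reached zero is left for the freeing path instead of being resurrected onto the isolation list. A minimal C11-atomics sketch of the "take a reference only if the count is still non-zero" pattern, written for userspace rather than against struct page:

/*
 * Userspace sketch of the get_page_unless_zero() idea: take a reference
 * only if the count is not already zero, so an object that is being
 * freed concurrently is simply skipped.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool get_ref_unless_zero(atomic_int *count)
{
	int old = atomic_load(count);

	while (old != 0) {
		/* Try old -> old + 1; on failure 'old' is reloaded and we retry. */
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return true;
	}
	return false;	/* count hit zero: the object is being freed */
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("live page isolated : %d\n", get_ref_unless_zero(&live));   /* 1 */
	printf("dying page skipped : %d\n", !get_ref_unless_zero(&dying)); /* 1 */
	return 0;
}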
@@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
}
/*
- * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
+ * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * of reclaimed pages
*/
-static void shrink_cache(struct zone *zone, struct scan_control *sc)
+static unsigned long shrink_inactive_list(unsigned long max_scan,
+ struct zone *zone, struct scan_control *sc)
{
LIST_HEAD(page_list);
struct pagevec pvec;
- int max_scan = sc->nr_to_scan;
+ unsigned long nr_scanned = 0;
+ unsigned long nr_reclaimed = 0;
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
- while (max_scan > 0) {
+ do {
struct page *page;
- int nr_taken;
- int nr_scan;
- int nr_freed;
+ unsigned long nr_taken;
+ unsigned long nr_scan;
+ unsigned long nr_freed;
nr_taken = isolate_lru_pages(sc->swap_cluster_max,
&zone->inactive_list,
@@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
- if (nr_taken == 0)
- goto done;
-
- max_scan -= nr_scan;
- nr_freed = shrink_list(&page_list, sc);
-
+ nr_scanned += nr_scan;
+ nr_freed = shrink_page_list(&page_list, sc);
+ nr_reclaimed += nr_freed;
local_irq_disable();
if (current_is_kswapd()) {
__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
@@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
__mod_page_state_zone(zone, pgscan_direct, nr_scan);
__mod_page_state_zone(zone, pgsteal, nr_freed);
+ if (nr_taken == 0)
+ goto done;
+
spin_lock(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
while (!list_empty(&page_list)) {
page = lru_to_page(&page_list);
- if (TestSetPageLRU(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
list_del(&page->lru);
if (PageActive(page))
add_page_to_active_list(zone, page);
@@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
spin_lock_irq(&zone->lru_lock);
}
}
- }
- spin_unlock_irq(&zone->lru_lock);
+ } while (nr_scanned < max_scan);
+ spin_unlock(&zone->lru_lock);
done:
+ local_irq_enable();
pagevec_release(&pvec);
+ return nr_reclaimed;
}
/*
@@ -1188,13 +697,12 @@ done:
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/
-static void
-refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+ struct scan_control *sc)
{
- int pgmoved;
+ unsigned long pgmoved;
int pgdeactivate = 0;
- int pgscanned;
- int nr_pages = sc->nr_to_scan;
+ unsigned long pgscanned;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
LIST_HEAD(l_active); /* Pages to go onto the active_list */
@@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
struct pagevec pvec;
int reclaim_mapped = 0;
- if (unlikely(sc->may_swap)) {
+ if (sc->may_swap) {
long mapped_ratio;
long distress;
long swap_tendency;
@@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
while (!list_empty(&l_inactive)) {
page = lru_to_page(&l_inactive);
prefetchw_prev_lru_page(page, &l_inactive, flags);
- if (TestSetPageLRU(page))
- BUG();
- if (!TestClearPageActive(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ BUG_ON(!PageActive(page));
+ ClearPageActive(page);
+
list_move(&page->lru, &zone->inactive_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
@@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
while (!list_empty(&l_active)) {
page = lru_to_page(&l_active);
prefetchw_prev_lru_page(page, &l_active, flags);
- if (TestSetPageLRU(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
pgmoved++;
@@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void
-shrink_zone(struct zone *zone, struct scan_control *sc)
+static unsigned long shrink_zone(int priority, struct zone *zone,
+ struct scan_control *sc)
{
unsigned long nr_active;
unsigned long nr_inactive;
+ unsigned long nr_to_scan;
+ unsigned long nr_reclaimed = 0;
atomic_inc(&zone->reclaim_in_progress);
@@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
* Add one to `nr_to_scan' just to make sure that the kernel will
* slowly sift through the active list.
*/
- zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+ zone->nr_scan_active += (zone->nr_active >> priority) + 1;
nr_active = zone->nr_scan_active;
if (nr_active >= sc->swap_cluster_max)
zone->nr_scan_active = 0;
else
nr_active = 0;
- zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
+ zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
nr_inactive = zone->nr_scan_inactive;
if (nr_inactive >= sc->swap_cluster_max)
zone->nr_scan_inactive = 0;
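The increment above, (zone->nr_active >> priority) + 1 (and its inactive-list twin), is what makes scanning ramp up as priority drops. A tiny userspace program showing the per-priority increments; DEF_PRIORITY is 12 in this kernel and the list size is an arbitrary example value:

/*
 * Illustration of the scan-rate formula: each call defers
 * (nr_on_list >> priority) + 1 pages to be scanned, so pressure grows
 * exponentially as priority falls from DEF_PRIORITY towards zero.
 */
#include <stdio.h>

#define DEF_PRIORITY 12

int main(void)
{
	unsigned long nr_active = 100000;	/* example active-list size */

	for (int priority = DEF_PRIORITY; priority >= 0; priority--)
		printf("priority %2d -> scan increment %lu\n",
		       priority, (nr_active >> priority) + 1);
	return 0;
}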
@@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
while (nr_active || nr_inactive) {
if (nr_active) {
- sc->nr_to_scan = min(nr_active,
+ nr_to_scan = min(nr_active,
(unsigned long)sc->swap_cluster_max);
- nr_active -= sc->nr_to_scan;
- refill_inactive_zone(zone, sc);
+ nr_active -= nr_to_scan;
+ shrink_active_list(nr_to_scan, zone, sc);
}
if (nr_inactive) {
- sc->nr_to_scan = min(nr_inactive,
+ nr_to_scan = min(nr_inactive,
(unsigned long)sc->swap_cluster_max);
- nr_inactive -= sc->nr_to_scan;
- shrink_cache(zone, sc);
+ nr_inactive -= nr_to_scan;
+ nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
+ sc);
}
}
throttle_vm_writeout();
atomic_dec(&zone->reclaim_in_progress);
+ return nr_reclaimed;
}
/*
@@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static void
-shrink_caches(struct zone **zones, struct scan_control *sc)
+static unsigned long shrink_zones(int priority, struct zone **zones,
+ struct scan_control *sc)
{
+ unsigned long nr_reclaimed = 0;
int i;
for (i = 0; zones[i] != NULL; i++) {
@@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
continue;
- zone->temp_priority = sc->priority;
- if (zone->prev_priority > sc->priority)
- zone->prev_priority = sc->priority;
+ zone->temp_priority = priority;
+ if (zone->prev_priority > priority)
+ zone->prev_priority = priority;
- if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- shrink_zone(zone, sc);
+ nr_reclaimed += shrink_zone(priority, zone, sc);
}
+ return nr_reclaimed;
}
/*
@@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
int priority;
int ret = 0;
- int total_scanned = 0, total_reclaimed = 0;
+ unsigned long total_scanned = 0;
+ unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
- struct scan_control sc;
unsigned long lru_pages = 0;
int i;
-
- sc.gfp_mask = gfp_mask;
- sc.may_writepage = !laptop_mode;
- sc.may_swap = 1;
+ struct scan_control sc = {
+ .gfp_mask = gfp_mask,
+ .may_writepage = !laptop_mode,
+ .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .may_swap = 1,
+ };
inc_page_state(allocstall);
@@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = priority;
- sc.swap_cluster_max = SWAP_CLUSTER_MAX;
if (!priority)
disable_swap_token();
- shrink_caches(zones, &sc);
+ nr_reclaimed += shrink_zones(priority, zones, &sc);
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
if (reclaim_state) {
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
total_scanned += sc.nr_scanned;
- total_reclaimed += sc.nr_reclaimed;
- if (total_reclaimed >= sc.swap_cluster_max) {
+ if (nr_reclaimed >= sc.swap_cluster_max) {
ret = 1;
goto out;
}
@@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
- if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
+ if (total_scanned > sc.swap_cluster_max +
+ sc.swap_cluster_max / 2) {
wakeup_pdflush(laptop_mode ? 0 : total_scanned);
sc.may_writepage = 1;
}
@@ -1528,22 +1042,26 @@ out:
* the page allocator fallback scheme to ensure that aging of pages is balanced
* across the zones.
*/
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
+ int order)
{
- int to_free = nr_pages;
+ unsigned long to_free = nr_pages;
int all_zones_ok;
int priority;
int i;
- int total_scanned, total_reclaimed;
+ unsigned long total_scanned;
+ unsigned long nr_reclaimed;
struct reclaim_state *reclaim_state = current->reclaim_state;
- struct scan_control sc;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_swap = 1,
+ .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+ };
loop_again:
total_scanned = 0;
- total_reclaimed = 0;
- sc.gfp_mask = GFP_KERNEL;
- sc.may_writepage = !laptop_mode;
- sc.may_swap = 1;
+ nr_reclaimed = 0;
+ sc.may_writepage = !laptop_mode;
sc.nr_mapped = read_page_state(nr_mapped);
inc_page_state(pageoutrun);
@@ -1624,15 +1142,11 @@ scan:
if (zone->prev_priority > priority)
zone->prev_priority = priority;
sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = priority;
- sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
- shrink_zone(zone, &sc);
+ nr_reclaimed += shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_reclaimed += sc.nr_reclaimed;
+ nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
@@ -1645,10 +1159,10 @@ scan:
* even in laptop mode
*/
if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > total_reclaimed+total_reclaimed/2)
+ total_scanned > nr_reclaimed + nr_reclaimed / 2)
sc.may_writepage = 1;
}
- if (nr_pages && to_free > total_reclaimed)
+ if (nr_pages && to_free > nr_reclaimed)
continue; /* swsusp: need to do more work */
if (all_zones_ok)
break; /* kswapd: all done */
@@ -1665,7 +1179,7 @@ scan:
* matches the direct reclaim path behaviour in terms of impact
* on zone->*_priority.
*/
- if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
+ if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
break;
}
out:
@@ -1679,7 +1193,7 @@ out:
goto loop_again;
}
- return total_reclaimed;
+ return nr_reclaimed;
}
/*
@@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order)
* Try to free `nr_pages' of memory, system-wide. Returns the number of freed
* pages.
*/
-int shrink_all_memory(int nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_pages)
{
pg_data_t *pgdat;
- int nr_to_free = nr_pages;
- int ret = 0;
+ unsigned long nr_to_free = nr_pages;
+ unsigned long ret = 0;
+ unsigned retry = 2;
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
current->reclaim_state = &reclaim_state;
- for_each_pgdat(pgdat) {
- int freed;
+repeat:
+ for_each_online_pgdat(pgdat) {
+ unsigned long freed;
+
freed = balance_pgdat(pgdat, nr_to_free, 0);
ret += freed;
nr_to_free -= freed;
- if (nr_to_free <= 0)
+ if ((long)nr_to_free <= 0)
break;
}
+ if (retry-- && ret < nr_pages) {
+ blk_congestion_wait(WRITE, HZ/5);
+ goto repeat;
+ }
current->reclaim_state = NULL;
return ret;
}
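shrink_all_memory() now does its bookkeeping in unsigned long, so the (long) cast in the loop above is what catches a node freeing more than was requested (the unsigned subtraction wraps). A short userspace sketch of that loop's accounting; free_from_node() is a made-up stand-in for balance_pgdat():

/*
 * Sketch of the loop's bookkeeping.  nr_to_free is unsigned, so when a
 * node frees more than was requested the subtraction wraps; the (long)
 * cast turns that wrap into a "we are done" signal.
 */
#include <stdio.h>

static unsigned long free_from_node(int node, unsigned long want)
{
	(void)node;		/* a real implementation would reclaim from this node */
	return want / 2 + 24;	/* pretend the node over-delivers a little */
}

int main(void)
{
	unsigned long nr_pages = 32, nr_to_free = 32, ret = 0;
	int retry = 2;

repeat:
	for (int node = 0; node < 4; node++) {
		unsigned long freed = free_from_node(node, nr_to_free);

		ret += freed;
		nr_to_free -= freed;		/* may wrap past zero */
		if ((long)nr_to_free <= 0)	/* signed test catches the wrap */
			break;
	}
	if (retry-- && ret < nr_pages) {
		/* the kernel sleeps in blk_congestion_wait() before retrying */
		goto repeat;
	}
	printf("requested %lu, freed %lu\n", nr_pages, ret);
	return 0;
}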
@@ -1808,14 +1329,13 @@ int shrink_all_memory(int nr_pages)
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+ unsigned long action, void *hcpu)
{
pg_data_t *pgdat;
cpumask_t mask;
if (action == CPU_ONLINE) {
- for_each_pgdat(pgdat) {
+ for_each_online_pgdat(pgdat) {
mask = node_to_cpumask(pgdat->node_id);
if (any_online_cpu(mask) != NR_CPUS)
/* One of our CPUs online: restore mask */
@@ -1829,10 +1349,17 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
static int __init kswapd_init(void)
{
pg_data_t *pgdat;
+
swap_setup();
- for_each_pgdat(pgdat)
- pgdat->kswapd
- = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
+ for_each_online_pgdat(pgdat) {
+ pid_t pid;
+
+ pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+ BUG_ON(pid < 0);
+ read_lock(&tasklist_lock);
+ pgdat->kswapd = find_task_by_pid(pid);
+ read_unlock(&tasklist_lock);
+ }
total_memory = nr_free_pagecache_pages();
hotcpu_notifier(cpu_callback, 0);
return 0;
@@ -1874,46 +1401,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
/*
* Try to free up some pages from this zone through reclaim.
*/
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
- int nr_pages;
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
- struct scan_control sc;
- cpumask_t mask;
- int node_id;
-
- if (time_before(jiffies,
- zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
- return 0;
-
- if (!(gfp_mask & __GFP_WAIT) ||
- zone->all_unreclaimable ||
- atomic_read(&zone->reclaim_in_progress) > 0 ||
- (p->flags & PF_MEMALLOC))
- return 0;
-
- node_id = zone->zone_pgdat->node_id;
- mask = node_to_cpumask(node_id);
- if (!cpus_empty(mask) && node_id != numa_node_id())
- return 0;
-
- sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
- sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
- sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = ZONE_RECLAIM_PRIORITY + 1;
- sc.nr_mapped = read_page_state(nr_mapped);
- sc.gfp_mask = gfp_mask;
+ int priority;
+ unsigned long nr_reclaimed = 0;
+ struct scan_control sc = {
+ .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+ .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .nr_mapped = read_page_state(nr_mapped),
+ .swap_cluster_max = max_t(unsigned long, nr_pages,
+ SWAP_CLUSTER_MAX),
+ .gfp_mask = gfp_mask,
+ };
disable_swap_token();
-
- nr_pages = 1 << order;
- if (nr_pages > SWAP_CLUSTER_MAX)
- sc.swap_cluster_max = nr_pages;
- else
- sc.swap_cluster_max = SWAP_CLUSTER_MAX;
-
cond_resched();
/*
* We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1928,17 +1433,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Free memory by calling shrink zone with increasing priorities
* until we have enough memory freed.
*/
+ priority = ZONE_RECLAIM_PRIORITY;
do {
- sc.priority--;
- shrink_zone(zone, &sc);
+ nr_reclaimed += shrink_zone(priority, zone, &sc);
+ priority--;
+ } while (priority >= 0 && nr_reclaimed < nr_pages);
- } while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
-
- if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+ if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
/*
- * shrink_slab does not currently allow us to determine
- * how many pages were freed in the zone. So we just
- * shake the slab and then go offnode for a single allocation.
+ * shrink_slab() does not currently allow us to determine how
+ * many pages were freed in this zone. So we just shake the slab
+ * a bit and then go off node for this particular allocation
+ * despite possibly having freed enough memory to allocate in
+ * this zone. If we freed local memory then the next
+ * allocations will be local again.
*
* shrink_slab will free memory on all zones and may take
* a long time.
@@ -1949,10 +1457,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
- if (sc.nr_reclaimed == 0)
+ if (nr_reclaimed == 0) {
+ /*
+ * We were unable to reclaim enough pages to stay on node. We
+ * now allow off node accesses for a certain time period before
+ * trying again to reclaim pages from the local zone.
+ */
zone->last_unsuccessful_zone_reclaim = jiffies;
+ }
- return sc.nr_reclaimed >= nr_pages;
+ return nr_reclaimed >= nr_pages;
}
-#endif
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+ cpumask_t mask;
+ int node_id;
+
+ /*
+ * Do not reclaim if there was a recent unsuccessful attempt at zone
+ * reclaim. In that case we let allocations go off node for the
+ * zone_reclaim_interval. Otherwise we would scan for each off-node
+ * page allocation.
+ */
+ if (time_before(jiffies,
+ zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+ return 0;
+
+ /*
+ * Avoid concurrent zone reclaims, do not reclaim in a zone that does
+ * not have reclaimable pages and if we should not delay the allocation
+ * then do not scan.
+ */
+ if (!(gfp_mask & __GFP_WAIT) ||
+ zone->all_unreclaimable ||
+ atomic_read(&zone->reclaim_in_progress) > 0 ||
+ (current->flags & PF_MEMALLOC))
+ return 0;
+
+ /*
+ * Only run zone reclaim on the local zone or on zones that do not
+ * have associated processors. This will favor the local processor
+ * over remote processors and spread off node memory allocations
+ * as wide as possible.
+ */
+ node_id = zone->zone_pgdat->node_id;
+ mask = node_to_cpumask(node_id);
+ if (!cpus_empty(mask) && node_id != numa_node_id())
+ return 0;
+ return __zone_reclaim(zone, gfp_mask, order);
+}
+#endif
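The first test in the new zone_reclaim() wrapper is the cheap interval gate: if the last attempt in this zone failed less than zone_reclaim_interval ago, allocations are simply allowed to go off node. A userspace sketch of that gate, with time_before() reimplemented as the usual wrap-safe signed comparison of jiffies; the 30 second interval matches this file, while HZ is taken as 1000 purely for illustration and everything else is a stand-in:

/*
 * Sketch of the interval gate at the top of zone_reclaim(): skip reclaim
 * entirely if the last attempt failed within zone_reclaim_interval.
 */
#include <stdbool.h>
#include <stdio.h>

#define HZ 1000				/* assumed tick rate for this sketch */
static const unsigned long zone_reclaim_interval = 30 * HZ;

static bool time_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;	/* wrap-safe comparison of jiffies */
}

static bool should_try_zone_reclaim(unsigned long jiffies,
				    unsigned long last_unsuccessful)
{
	if (time_before(jiffies, last_unsuccessful + zone_reclaim_interval))
		return false;	/* recently failed: go off node instead */
	return true;		/* gate passed: do the expensive reclaim */
}

int main(void)
{
	unsigned long now = 100 * HZ;

	printf("failed 5s ago  -> try reclaim? %d\n",
	       should_try_zone_reclaim(now, now - 5 * HZ));   /* 0 */
	printf("failed 60s ago -> try reclaim? %d\n",
	       should_try_zone_reclaim(now, now - 60 * HZ));  /* 1 */
	return 0;
}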