aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig14
-rw-r--r--mm/Makefile5
-rw-r--r--mm/allocpercpu.c28
-rw-r--r--mm/backing-dev.c90
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/hugetlb.c251
-rw-r--r--mm/internal.h10
-rw-r--r--mm/kmemleak-test.c6
-rw-r--r--mm/ksm.c1703
-rw-r--r--mm/madvise.c53
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/memory.c213
-rw-r--r--mm/memory_hotplug.c7
-rw-r--r--mm/mempool.c7
-rw-r--r--mm/migrate.c24
-rw-r--r--mm/mlock.c128
-rw-r--r--mm/mmap.c59
-rw-r--r--mm/mmu_context.c58
-rw-r--r--mm/mmu_notifier.c20
-rw-r--r--mm/mprotect.c4
-rw-r--r--mm/mremap.c14
-rw-r--r--mm/nommu.c45
-rw-r--r--mm/oom_kill.c86
-rw-r--r--mm/page-writeback.c43
-rw-r--r--mm/page_alloc.c284
-rw-r--r--mm/page_cgroup.c12
-rw-r--r--mm/percpu.c1420
-rw-r--r--mm/quicklist.c2
-rw-r--r--mm/rmap.c78
-rw-r--r--mm/shmem.c24
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slub.c9
-rw-r--r--mm/sparse-vmemmap.c8
-rw-r--r--mm/sparse.c9
-rw-r--r--mm/swap.c8
-rw-r--r--mm/swap_state.c143
-rw-r--r--mm/swapfile.c4
-rw-r--r--mm/vmalloc.c559
-rw-r--r--mm/vmscan.c213
-rw-r--r--mm/vmstat.c5
40 files changed, 4553 insertions, 1101 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index fe5f674d7a7..71eb0b4cce8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,7 +153,7 @@ config MEMORY_HOTREMOVE
#
config PAGEFLAGS_EXTENDED
def_bool y
- depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM
+ depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
@@ -214,6 +214,18 @@ config HAVE_MLOCKED_PAGE_BIT
config MMU_NOTIFIER
bool
+config KSM
+ bool "Enable KSM for page merging"
+ depends on MMU
+ help
+ Enable Kernel Samepage Merging: KSM periodically scans those areas
+ of an application's address space that an app has advised may be
+ mergeable. When it finds pages of identical content, it replaces
+ the many instances by a single resident page with that content, so
+ saving memory until one or another app needs to modify the content.
+ Recommended for use with KVM, or with other duplicative applications.
+ See Documentation/vm/ksm.txt for more information.
+
config DEFAULT_MMAP_MIN_ADDR
int "Low address space to protect from user allocation"
default 4096
diff --git a/mm/Makefile b/mm/Makefile
index 147a7a7873c..728a9fde49d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o mm_init.o $(mmu-y)
+ page_isolation.o mm_init.o mmu_context.o $(mmu-y)
obj-y += init-mm.o
obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_KSM) += ksm.o
obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
@@ -33,7 +34,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
-ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
obj-$(CONFIG_SMP) += percpu.o
else
obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index dfdee6a4735..df34ceae0c6 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -5,6 +5,8 @@
*/
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/bootmem.h>
+#include <asm/sections.h>
#ifndef cache_line_size
#define cache_line_size() L1_CACHE_BYTES
@@ -147,3 +149,29 @@ void free_percpu(void *__pdata)
kfree(__percpu_disguise(__pdata));
}
EXPORT_SYMBOL_GPL(free_percpu);
+
+/*
+ * Generic percpu area setup.
+ */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+
+EXPORT_SYMBOL(__per_cpu_offset);
+
+void __init setup_per_cpu_areas(void)
+{
+ unsigned long size, i;
+ char *ptr;
+ unsigned long nr_possible_cpus = num_possible_cpus();
+
+ /* Copy section for each CPU (we discard the original) */
+ size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+ ptr = alloc_bootmem_pages(size * nr_possible_cpus);
+
+ for_each_possible_cpu(i) {
+ __per_cpu_offset[i] = ptr - __per_cpu_start;
+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+ ptr += size;
+ }
+}
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca0dac111..3d3accb1f80 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -26,6 +26,12 @@ struct backing_dev_info default_backing_dev_info = {
EXPORT_SYMBOL_GPL(default_backing_dev_info);
static struct class *bdi_class;
+
+/*
+ * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
+ * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * locking.
+ */
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
LIST_HEAD(bdi_pending_list);
@@ -284,9 +290,9 @@ static int bdi_start_fn(void *ptr)
/*
* Add us to the active bdi_list
*/
- spin_lock(&bdi_lock);
- list_add(&bdi->bdi_list, &bdi_list);
- spin_unlock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
+ list_add_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
bdi_task_init(bdi, wb);
@@ -389,7 +395,7 @@ static int bdi_forker_task(void *ptr)
if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
wb_do_writeback(me, 0);
- spin_lock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
/*
* Check if any existing bdi's have dirty data without
@@ -410,7 +416,7 @@ static int bdi_forker_task(void *ptr)
if (list_empty(&bdi_pending_list)) {
unsigned long wait;
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
wait = msecs_to_jiffies(dirty_writeback_interval * 10);
schedule_timeout(wait);
try_to_freeze();
@@ -426,7 +432,7 @@ static int bdi_forker_task(void *ptr)
bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
bdi_list);
list_del_init(&bdi->bdi_list);
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
wb = &bdi->wb;
wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
@@ -445,9 +451,9 @@ static int bdi_forker_task(void *ptr)
* a chance to flush other bdi's to free
* memory.
*/
- spin_lock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
bdi_flush_io(bdi);
}
@@ -456,6 +462,24 @@ static int bdi_forker_task(void *ptr)
return 0;
}
+static void bdi_add_to_pending(struct rcu_head *head)
+{
+ struct backing_dev_info *bdi;
+
+ bdi = container_of(head, struct backing_dev_info, rcu_head);
+ INIT_LIST_HEAD(&bdi->bdi_list);
+
+ spin_lock(&bdi_lock);
+ list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+ spin_unlock(&bdi_lock);
+
+ /*
+ * We are now on the pending list, wake up bdi_forker_task()
+ * to finish the job and add us back to the active bdi_list
+ */
+ wake_up_process(default_backing_dev_info.wb.task);
+}
+
/*
* Add the default flusher task that gets created for any bdi
* that has dirty data pending writeout
@@ -478,16 +502,29 @@ void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
* waiting for previous additions to finish.
*/
if (!test_and_set_bit(BDI_pending, &bdi->state)) {
- list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+ list_del_rcu(&bdi->bdi_list);
/*
- * We are now on the pending list, wake up bdi_forker_task()
- * to finish the job and add us back to the active bdi_list
+ * We must wait for the current RCU period to end before
+ * moving to the pending list. So schedule that operation
+ * from an RCU callback.
*/
- wake_up_process(default_backing_dev_info.wb.task);
+ call_rcu(&bdi->rcu_head, bdi_add_to_pending);
}
}
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+ spin_lock_bh(&bdi_lock);
+ list_del_rcu(&bdi->bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ synchronize_rcu();
+}
+
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
@@ -506,9 +543,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
goto exit;
}
- spin_lock(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_list);
- spin_unlock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
bdi->dev = dev;
@@ -526,9 +563,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
wb->task = NULL;
ret = -ENOMEM;
- spin_lock(&bdi_lock);
- list_del(&bdi->bdi_list);
- spin_unlock(&bdi_lock);
+ bdi_remove_from_list(bdi);
goto exit;
}
}
@@ -565,9 +600,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
/*
* Make sure nobody finds us on the bdi_list anymore
*/
- spin_lock(&bdi_lock);
- list_del(&bdi->bdi_list);
- spin_unlock(&bdi_lock);
+ bdi_remove_from_list(bdi);
/*
* Finally, kill the kernel threads. We don't need to be RCU
@@ -599,6 +632,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->max_ratio = 100;
bdi->max_prop_frac = PROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
+ INIT_RCU_HEAD(&bdi->rcu_head);
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
INIT_LIST_HEAD(&bdi->work_list);
@@ -634,7 +668,19 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
int i;
- WARN_ON(bdi_has_dirty_io(bdi));
+ /*
+ * Splice our entries to the default_backing_dev_info, if this
+ * bdi disappears
+ */
+ if (bdi_has_dirty_io(bdi)) {
+ struct bdi_writeback *dst = &default_backing_dev_info.wb;
+
+ spin_lock(&inode_lock);
+ list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
+ list_splice(&bdi->wb.b_io, &dst->b_io);
+ list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
+ spin_unlock(&inode_lock);
+ }
bdi_unregister(bdi);
diff --git a/mm/filemap.c b/mm/filemap.c
index dd51c68e2b8..bcc7372aebb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -119,6 +119,8 @@ void __remove_from_page_cache(struct page *page)
page->mapping = NULL;
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
+ if (PageSwapBacked(page))
+ __dec_zone_page_state(page, NR_SHMEM);
BUG_ON(page_mapped(page));
/*
@@ -431,6 +433,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
+ if (PageSwapBacked(page))
+ __inc_zone_page_state(page, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b16d6363477..815dbd4a6dc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++;
}
-static struct page *dequeue_huge_page(struct hstate *h)
-{
- int nid;
- struct page *page = NULL;
-
- for (nid = 0; nid < MAX_NUMNODES; ++nid) {
- if (!list_empty(&h->hugepage_freelists[nid])) {
- page = list_entry(h->hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- break;
- }
- }
- return page;
-}
-
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve)
@@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
/*
* Use a helper variable to find the next node and then
- * copy it back to hugetlb_next_nid afterwards:
+ * copy it back to next_nid_to_alloc afterwards:
* otherwise there's a window in which a racer might
* pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
* But we don't need to use a spin_lock here: it really
@@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
* if we just successfully allocated a hugepage so that
* the next caller gets hugepages on the next node.
*/
-static int hstate_next_node(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h)
{
int next_nid;
- next_nid = next_node(h->hugetlb_next_nid, node_online_map);
+ next_nid = next_node(h->next_nid_to_alloc, node_online_map);
if (next_nid == MAX_NUMNODES)
next_nid = first_node(node_online_map);
- h->hugetlb_next_nid = next_nid;
+ h->next_nid_to_alloc = next_nid;
return next_nid;
}
@@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
int next_nid;
int ret = 0;
- start_nid = h->hugetlb_next_nid;
+ start_nid = h->next_nid_to_alloc;
+ next_nid = start_nid;
do {
- page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
+ page = alloc_fresh_huge_page_node(h, next_nid);
if (page)
ret = 1;
- next_nid = hstate_next_node(h);
- } while (!page && h->hugetlb_next_nid != start_nid);
+ next_nid = hstate_next_node_to_alloc(h);
+ } while (!page && next_nid != start_nid);
if (ret)
count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -684,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h)
return ret;
}
+/*
+ * helper for free_pool_huge_page() - find next node
+ * from which to free a huge page
+ */
+static int hstate_next_node_to_free(struct hstate *h)
+{
+ int next_nid;
+ next_nid = next_node(h->next_nid_to_free, node_online_map);
+ if (next_nid == MAX_NUMNODES)
+ next_nid = first_node(node_online_map);
+ h->next_nid_to_free = next_nid;
+ return next_nid;
+}
+
+/*
+ * Free huge page from pool from next node to free.
+ * Attempt to keep persistent huge pages more or less
+ * balanced over allowed nodes.
+ * Called with hugetlb_lock locked.
+ */
+static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+{
+ int start_nid;
+ int next_nid;
+ int ret = 0;
+
+ start_nid = h->next_nid_to_free;
+ next_nid = start_nid;
+
+ do {
+ /*
+ * If we're returning unused surplus pages, only examine
+ * nodes with surplus pages.
+ */
+ if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
+ !list_empty(&h->hugepage_freelists[next_nid])) {
+ struct page *page =
+ list_entry(h->hugepage_freelists[next_nid].next,
+ struct page, lru);
+ list_del(&page->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[next_nid]--;
+ if (acct_surplus) {
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[next_nid]--;
+ }
+ update_and_free_page(h, page);
+ ret = 1;
+ }
+ next_nid = hstate_next_node_to_free(h);
+ } while (!ret && next_nid != start_nid);
+
+ return ret;
+}
+
static struct page *alloc_buddy_huge_page(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
@@ -855,22 +893,13 @@ free:
* When releasing a hugetlb pool reservation, any surplus pages that were
* allocated to satisfy the reservation must be explicitly freed if they were
* never used.
+ * Called with hugetlb_lock held.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
- static int nid = -1;
- struct page *page;
unsigned long nr_pages;
- /*
- * We want to release as many surplus pages as possible, spread
- * evenly across all nodes. Iterate across all nodes until we
- * can no longer free unreserved surplus pages. This occurs when
- * the nodes with surplus pages have no free pages.
- */
- unsigned long remaining_iterations = nr_online_nodes;
-
/* Uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
@@ -880,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h,
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
- while (remaining_iterations-- && nr_pages) {
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
-
- if (!h->surplus_huge_pages_node[nid])
- continue;
-
- if (!list_empty(&h->hugepage_freelists[nid])) {
- page = list_entry(h->hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- update_and_free_page(h, page);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- h->surplus_huge_pages--;
- h->surplus_huge_pages_node[nid]--;
- nr_pages--;
- remaining_iterations = nr_online_nodes;
- }
+ /*
+ * We want to release as many surplus pages as possible, spread
+ * evenly across all nodes. Iterate across all nodes until we
+ * can no longer free unreserved surplus pages. This occurs when
+ * the nodes with surplus pages have no free pages.
+ * free_pool_huge_page() will balance the the frees across the
+ * on-line nodes for us and will handle the hstate accounting.
+ */
+ while (nr_pages--) {
+ if (!free_pool_huge_page(h, 1))
+ break;
}
}
@@ -1008,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
void *addr;
addr = __alloc_bootmem_node_nopanic(
- NODE_DATA(h->hugetlb_next_nid),
+ NODE_DATA(h->next_nid_to_alloc),
huge_page_size(h), huge_page_size(h), 0);
+ hstate_next_node_to_alloc(h);
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -1020,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
m = addr;
goto found;
}
- hstate_next_node(h);
nr_nodes--;
}
return 0;
@@ -1141,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
*/
static int adjust_pool_surplus(struct hstate *h, int delta)
{
- static int prev_nid;
- int nid = prev_nid;
+ int start_nid, next_nid;
int ret = 0;
VM_BUG_ON(delta != -1 && delta != 1);
- do {
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
- /* To shrink on this node, there must be a surplus page */
- if (delta < 0 && !h->surplus_huge_pages_node[nid])
- continue;
- /* Surplus cannot exceed the total number of pages */
- if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+ if (delta < 0)
+ start_nid = h->next_nid_to_alloc;
+ else
+ start_nid = h->next_nid_to_free;
+ next_nid = start_nid;
+
+ do {
+ int nid = next_nid;
+ if (delta < 0) {
+ next_nid = hstate_next_node_to_alloc(h);
+ /*
+ * To shrink on this node, there must be a surplus page
+ */
+ if (!h->surplus_huge_pages_node[nid])
+ continue;
+ }
+ if (delta > 0) {
+ next_nid = hstate_next_node_to_free(h);
+ /*
+ * Surplus cannot exceed the total number of pages
+ */
+ if (h->surplus_huge_pages_node[nid] >=
h->nr_huge_pages_node[nid])
- continue;
+ continue;
+ }
h->surplus_huge_pages += delta;
h->surplus_huge_pages_node[nid] += delta;
ret = 1;
break;
- } while (nid != prev_nid);
+ } while (next_nid != start_nid);
- prev_nid = nid;
return ret;
}
@@ -1227,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
min_count = max(count, min_count);
try_to_free_low(h, min_count);
while (min_count < persistent_huge_pages(h)) {
- struct page *page = dequeue_huge_page(h);
- if (!page)
+ if (!free_pool_huge_page(h, 0))
break;
- update_and_free_page(h, page);
}
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, 1))
@@ -1442,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order)
h->free_huge_pages = 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
- h->hugetlb_next_nid = first_node(node_online_map);
+ h->next_nid_to_alloc = first_node(node_online_map);
+ h->next_nid_to_free = first_node(node_online_map);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
@@ -1985,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
return find_lock_page(mapping, idx);
}
+/*
+ * Return whether there is a pagecache page to back given address within VMA.
+ * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
+ */
+static bool hugetlbfs_pagecache_present(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct address_space *mapping;
+ pgoff_t idx;
+ struct page *page;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ page = find_get_page(mapping, idx);
+ if (page)
+ put_page(page);
+ return page != NULL;
+}
+
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, unsigned int flags)
{
@@ -2180,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
return NULL;
}
-static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
-{
- if (!ptep || write || shared)
- return 0;
- else
- return huge_pte_none(huge_ptep_get(ptep));
-}
-
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i,
- int write)
+ unsigned int flags)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
int remainder = *length;
struct hstate *h = hstate_vma(vma);
- int zeropage_ok = 0;
- int shared = vma->vm_flags & VM_SHARED;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
+ int absent;
struct page *page;
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
- * each hugepage. We have to make * sure we get the
+ * each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
*/
pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
- if (huge_zeropage_ok(pte, write, shared))
- zeropage_ok = 1;
+ absent = !pte || huge_pte_none(huge_ptep_get(pte));
+
+ /*
+ * When coredumping, it suits get_dump_page if we just return
+ * an error where there's an empty slot with no huge pagecache
+ * to back it. This way, we avoid allocating a hugepage, and
+ * the sparse dumpfile avoids allocating disk blocks, but its
+ * huge holes still show up with zeroes where they need to be.
+ */
+ if (absent && (flags & FOLL_DUMP) &&
+ !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+ remainder = 0;
+ break;
+ }
- if (!pte ||
- (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
- (write && !pte_write(huge_ptep_get(pte)))) {
+ if (absent ||
+ ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
int ret;
spin_unlock(&mm->page_table_lock);
- ret = hugetlb_fault(mm, vma, vaddr, write);
+ ret = hugetlb_fault(mm, vma, vaddr,
+ (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
spin_lock(&mm->page_table_lock);
if (!(ret & VM_FAULT_ERROR))
continue;
remainder = 0;
- if (!i)
- i = -EFAULT;
break;
}
@@ -2235,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pte_page(huge_ptep_get(pte));
same_page:
if (pages) {
- if (zeropage_ok)
- pages[i] = ZERO_PAGE(0);
- else
- pages[i] = mem_map_offset(page, pfn_offset);
+ pages[i] = mem_map_offset(page, pfn_offset);
get_page(pages[i]);
}
@@ -2262,7 +2311,7 @@ same_page:
*length = remainder;
*position = vaddr;
- return i;
+ return i ? i : -EFAULT;
}
void hugetlb_change_protection(struct vm_area_struct *vma,
diff --git a/mm/internal.h b/mm/internal.h
index f290c4db528..22ec8d2b0fb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,8 @@ static inline void __put_page(struct page *page)
atomic_dec(&page->_count);
}
+extern unsigned long highest_memmap_pfn;
+
/*
* in mm/vmscan.c:
*/
@@ -46,7 +48,6 @@ extern void putback_lru_page(struct page *page);
/*
* in mm/page_alloc.c
*/
-extern unsigned long highest_memmap_pfn;
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
@@ -250,13 +251,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
-#define GUP_FLAGS_WRITE 0x1
-#define GUP_FLAGS_FORCE 0x2
-#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
-#define GUP_FLAGS_IGNORE_SIGKILL 0x8
-
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int len, int flags,
+ unsigned long start, int len, unsigned int foll_flags,
struct page **pages, struct vm_area_struct **vmas);
#define ZONE_RECLAIM_NOSCAN -2
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index d5292fc6f52..177a5169bbd 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -36,7 +36,7 @@ struct test_node {
};
static LIST_HEAD(test_list);
-static DEFINE_PER_CPU(void *, test_pointer);
+static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
/*
* Some very simple testing. This function needs to be extended for
@@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void)
}
for_each_possible_cpu(i) {
- per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL);
+ per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
pr_info("kmemleak: kmalloc(129) = %p\n",
- per_cpu(test_pointer, i));
+ per_cpu(kmemleak_test_pointer, i));
}
return 0;
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 00000000000..37cc3732509
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,1703 @@
+/*
+ * Memory merging support.
+ *
+ * This code enables dynamic sharing of identical pages found in different
+ * memory areas, even if they are not shared by fork()
+ *
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Authors:
+ * Izik Eidus
+ * Andrea Arcangeli
+ * Chris Wright
+ * Hugh Dickins
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/spinlock.h>
+#include <linux/jhash.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/mmu_notifier.h>
+#include <linux/ksm.h>
+
+#include <asm/tlbflush.h>
+
+/*
+ * A few notes about the KSM scanning process,
+ * to make it easier to understand the data structures below:
+ *
+ * In order to reduce excessive scanning, KSM sorts the memory pages by their
+ * contents into a data structure that holds pointers to the pages' locations.
+ *
+ * Since the contents of the pages may change at any moment, KSM cannot just
+ * insert the pages into a normal sorted tree and expect it to find anything.
+ * Therefore KSM uses two data structures - the stable and the unstable tree.
+ *
+ * The stable tree holds pointers to all the merged pages (ksm pages), sorted
+ * by their contents. Because each such page is write-protected, searching on
+ * this tree is fully assured to be working (except when pages are unmapped),
+ * and therefore this tree is called the stable tree.
+ *
+ * In addition to the stable tree, KSM uses a second data structure called the
+ * unstable tree: this tree holds pointers to pages which have been found to
+ * be "unchanged for a period of time". The unstable tree sorts these pages
+ * by their contents, but since they are not write-protected, KSM cannot rely
+ * upon the unstable tree to work correctly - the unstable tree is liable to
+ * be corrupted as its contents are modified, and so it is called unstable.
+ *
+ * KSM solves this problem by several techniques:
+ *
+ * 1) The unstable tree is flushed every time KSM completes scanning all
+ * memory areas, and then the tree is rebuilt again from the beginning.
+ * 2) KSM will only insert into the unstable tree, pages whose hash value
+ * has not changed since the previous scan of all memory areas.
+ * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
+ * colors of the nodes and not on their contents, assuring that even when
+ * the tree gets "corrupted" it won't get out of balance, so scanning time
+ * remains the same (also, searching and inserting nodes in an rbtree uses
+ * the same algorithm, so we have no overhead when we flush and rebuild).
+ * 4) KSM never flushes the stable tree, which means that even if it were to
+ * take 10 attempts to find a page in the unstable tree, once it is found,
+ * it is secured in the stable tree. (When we scan a new page, we first
+ * compare it against the stable tree, and then against the unstable tree.)
+ */
+
+/**
+ * struct mm_slot - ksm information per mm that is being scanned
+ * @link: link to the mm_slots hash list
+ * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
+ * @rmap_list: head for this mm_slot's list of rmap_items
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node link;
+ struct list_head mm_list;
+ struct list_head rmap_list;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct ksm_scan - cursor for scanning
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ * @rmap_item: the current rmap that we are scanning inside the rmap_list
+ * @seqnr: count of completed full scans (needed when removing unstable node)
+ *
+ * There is only the one ksm_scan instance of this cursor structure.
+ */
+struct ksm_scan {
+ struct mm_slot *mm_slot;
+ unsigned long address;
+ struct rmap_item *rmap_item;
+ unsigned long seqnr;
+};
+
+/**
+ * struct rmap_item - reverse mapping item for virtual addresses
+ * @link: link into mm_slot's rmap_list (rmap_list is per mm)
+ * @mm: the memory structure this rmap_item is pointing into
+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
+ * @oldchecksum: previous checksum of the page at that virtual address
+ * @node: rb_node of this rmap_item in either unstable or stable tree
+ * @next: next rmap_item hanging off the same node of the stable tree
+ * @prev: previous rmap_item hanging off the same node of the stable tree
+ */
+struct rmap_item {
+ struct list_head link;
+ struct mm_struct *mm;
+ unsigned long address; /* + low bits used for flags below */
+ union {
+ unsigned int oldchecksum; /* when unstable */
+ struct rmap_item *next; /* when stable */
+ };
+ union {
+ struct rb_node node; /* when tree node */
+ struct rmap_item *prev; /* in stable list */
+ };
+};
+
+#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
+#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
+#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
+
+/* The stable and unstable tree heads */
+static struct rb_root root_stable_tree = RB_ROOT;
+static struct rb_root root_unstable_tree = RB_ROOT;
+
+#define MM_SLOTS_HASH_HEADS 1024
+static struct hlist_head *mm_slots_hash;
+
+static struct mm_slot ksm_mm_head = {
+ .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
+};
+static struct ksm_scan ksm_scan = {
+ .mm_slot = &ksm_mm_head,
+};
+
+static struct kmem_cache *rmap_item_cache;
+static struct kmem_cache *mm_slot_cache;
+
+/* The number of nodes in the stable tree */
+static unsigned long ksm_pages_shared;
+
+/* The number of page slots additionally sharing those nodes */
+static unsigned long ksm_pages_sharing;
+
+/* The number of nodes in the unstable tree */
+static unsigned long ksm_pages_unshared;
+
+/* The number of rmap_items in use: to calculate pages_volatile */
+static unsigned long ksm_rmap_items;
+
+/* Limit on the number of unswappable pages used */
+static unsigned long ksm_max_kernel_pages = 2000;
+
+/* Number of pages ksmd should scan in one batch */
+static unsigned int ksm_thread_pages_to_scan = 200;
+
+/* Milliseconds ksmd should sleep between batches */
+static unsigned int ksm_thread_sleep_millisecs = 20;
+
+#define KSM_RUN_STOP 0
+#define KSM_RUN_MERGE 1
+#define KSM_RUN_UNMERGE 2
+static unsigned int ksm_run = KSM_RUN_MERGE;
+
+static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
+static DEFINE_MUTEX(ksm_thread_mutex);
+static DEFINE_SPINLOCK(ksm_mmlist_lock);
+
+#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
+ sizeof(struct __struct), __alignof__(struct __struct),\
+ (__flags), NULL)
+
+static int __init ksm_slab_init(void)
+{
+ rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
+ if (!rmap_item_cache)
+ goto out;
+
+ mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
+ if (!mm_slot_cache)
+ goto out_free;
+
+ return 0;
+
+out_free:
+ kmem_cache_destroy(rmap_item_cache);
+out:
+ return -ENOMEM;
+}
+
+static void __init ksm_slab_free(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+ kmem_cache_destroy(rmap_item_cache);
+ mm_slot_cache = NULL;
+}
+
+static inline struct rmap_item *alloc_rmap_item(void)
+{
+ struct rmap_item *rmap_item;
+
+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
+ if (rmap_item)
+ ksm_rmap_items++;
+ return rmap_item;
+}
+
+static inline void free_rmap_item(struct rmap_item *rmap_item)
+{
+ ksm_rmap_items--;
+ rmap_item->mm = NULL; /* debug safety */
+ kmem_cache_free(rmap_item_cache, rmap_item);
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+ if (!mm_slot_cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static int __init mm_slots_hash_init(void)
+{
+ mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!mm_slots_hash)
+ return -ENOMEM;
+ return 0;
+}
+
+static void __init mm_slots_hash_free(void)
+{
+ kfree(mm_slots_hash);
+}
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ struct hlist_head *bucket;
+ struct hlist_node *node;
+
+ bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+ % MM_SLOTS_HASH_HEADS];
+ hlist_for_each_entry(mm_slot, node, bucket, link) {
+ if (mm == mm_slot->mm)
+ return mm_slot;
+ }
+ return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+ struct mm_slot *mm_slot)
+{
+ struct hlist_head *bucket;
+
+ bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+ % MM_SLOTS_HASH_HEADS];
+ mm_slot->mm = mm;
+ INIT_LIST_HEAD(&mm_slot->rmap_list);
+ hlist_add_head(&mm_slot->link, bucket);
+}
+
+static inline int in_stable_tree(struct rmap_item *rmap_item)
+{
+ return rmap_item->address & STABLE_FLAG;
+}
+
+/*
+ * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
+ * page tables after it has passed through ksm_exit() - which, if necessary,
+ * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
+ * a special flag: they can just back out as soon as mm_users goes to zero.
+ * ksm_test_exit() is used throughout to make this test for exit: in some
+ * places for correctness, in some places just to avoid unnecessary work.
+ */
+static inline bool ksm_test_exit(struct mm_struct *mm)
+{
+ return atomic_read(&mm->mm_users) == 0;
+}
+
+/*
+ * We use break_ksm to break COW on a ksm page: it's a stripped down
+ *
+ * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
+ * put_page(page);
+ *
+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * in case the application has unmapped and remapped mm,addr meanwhile.
+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ */
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+{
+ struct page *page;
+ int ret = 0;
+
+ do {
+ cond_resched();
+ page = follow_page(vma, addr, FOLL_GET);
+ if (!page)
+ break;
+ if (PageKsm(page))
+ ret = handle_mm_fault(vma->vm_mm, vma, addr,
+ FAULT_FLAG_WRITE);
+ else
+ ret = VM_FAULT_WRITE;
+ put_page(page);
+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
+ /*
+ * We must loop because handle_mm_fault() may back out if there's
+ * any difficulty e.g. if pte accessed bit gets updated concurrently.
+ *
+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that
+ * COW has been broken, even if the vma does not permit VM_WRITE;
+ * but note that a concurrent fault might break PageKsm for us.
+ *
+ * VM_FAULT_SIGBUS could occur if we race with truncation of the
+ * backing file, which also invalidates anonymous pages: that's
+ * okay, that truncation will have unmapped the PageKsm for us.
+ *
+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting
+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
+ * current task has TIF_MEMDIE set, and will be OOM killed on return
+ * to user; and ksmd, having no mm, would never be chosen for that.
+ *
+ * But if the mm is in a limited mem_cgroup, then the fault may fail
+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
+ * even ksmd can fail in this way - though it's usually breaking ksm
+ * just to undo a merge it made a moment before, so unlikely to oom.
+ *
+ * That's a pity: we might therefore have more kernel pages allocated
+ * than we're counting as nodes in the stable tree; but ksm_do_scan
+ * will retry to break_cow on each pass, so should recover the page
+ * in due course. The important thing is to not let VM_MERGEABLE
+ * be cleared while any such pages might remain in the area.
+ */
+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
+}
+
+static void break_cow(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ if (ksm_test_exit(mm))
+ goto out;
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr)
+ goto out;
+ if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ goto out;
+ break_ksm(vma, addr);
+out:
+ up_read(&mm->mmap_sem);
+}
+
+static struct page *get_mergeable_page(struct rmap_item *rmap_item)
+{
+ struct mm_struct *mm = rmap_item->mm;
+ unsigned long addr = rmap_item->address;
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ down_read(&mm->mmap_sem);
+ if (ksm_test_exit(mm))
+ goto out;
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr)
+ goto out;
+ if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ goto out;
+
+ page = follow_page(vma, addr, FOLL_GET);
+ if (!page)
+ goto out;
+ if (PageAnon(page)) {
+ flush_anon_page(vma, page, addr);
+ flush_dcache_page(page);
+ } else {
+ put_page(page);
+out: page = NULL;
+ }
+ up_read(&mm->mmap_sem);
+ return page;
+}
+
+/*
+ * get_ksm_page: checks if the page at the virtual address in rmap_item
+ * is still PageKsm, in which case we can trust the content of the page,
+ * and it returns the gotten page; but NULL if the page has been zapped.
+ */
+static struct page *get_ksm_page(struct rmap_item *rmap_item)
+{
+ struct page *page;
+
+ page = get_mergeable_page(rmap_item);
+ if (page && !PageKsm(page)) {
+ put_page(page);
+ page = NULL;
+ }
+ return page;
+}
+
+/*
+ * Removing rmap_item from stable or unstable tree.
+ * This function will clean the information from the stable/unstable tree.
+ */
+static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+{
+ if (in_stable_tree(rmap_item)) {
+ struct rmap_item *next_item = rmap_item->next;
+
+ if (rmap_item->address & NODE_FLAG) {
+ if (next_item) {
+ rb_replace_node(&rmap_item->node,
+ &next_item->node,
+ &root_stable_tree);
+ next_item->address |= NODE_FLAG;
+ ksm_pages_sharing--;
+ } else {
+ rb_erase(&rmap_item->node, &root_stable_tree);
+ ksm_pages_shared--;
+ }
+ } else {
+ struct rmap_item *prev_item = rmap_item->prev;
+
+ BUG_ON(prev_item->next != rmap_item);
+ prev_item->next = next_item;
+ if (next_item) {
+ BUG_ON(next_item->prev != rmap_item);
+ next_item->prev = rmap_item->prev;
+ }
+ ksm_pages_sharing--;
+ }
+
+ rmap_item->next = NULL;
+
+ } else if (rmap_item->address & NODE_FLAG) {
+ unsigned char age;
+ /*
+ * Usually ksmd can and must skip the rb_erase, because
+ * root_unstable_tree was already reset to RB_ROOT.
+ * But be careful when an mm is exiting: do the rb_erase
+ * if this rmap_item was inserted by this scan, rather
+ * than left over from before.
+ */
+ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
+ BUG_ON(age > 1);
+ if (!age)
+ rb_erase(&rmap_item->node, &root_unstable_tree);
+ ksm_pages_unshared--;
+ }
+
+ rmap_item->address &= PAGE_MASK;
+
+ cond_resched(); /* we're called from many long loops */
+}
+
+static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
+ struct list_head *cur)
+{
+ struct rmap_item *rmap_item;
+
+ while (cur != &mm_slot->rmap_list) {
+ rmap_item = list_entry(cur, struct rmap_item, link);
+ cur = cur->next;
+ remove_rmap_item_from_tree(rmap_item);
+ list_del(&rmap_item->link);
+ free_rmap_item(rmap_item);
+ }
+}
+
+/*
+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
+ * than check every pte of a given vma, the locking doesn't quite work for
+ * that - an rmap_item is assigned to the stable tree after inserting ksm
+ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
+ * rmap_items from parent to child at fork time (so as not to waste time
+ * if exit comes before the next scan reaches it).
+ *
+ * Similarly, although we'd like to remove rmap_items (so updating counts
+ * and freeing memory) when unmerging an area, it's easier to leave that
+ * to the next pass of ksmd - consider, for example, how ksmd might be
+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
+ */
+static int unmerge_ksm_pages(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ unsigned long addr;
+ int err = 0;
+
+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
+ if (ksm_test_exit(vma->vm_mm))
+ break;
+ if (signal_pending(current))
+ err = -ERESTARTSYS;
+ else
+ err = break_ksm(vma, addr);
+ }
+ return err;
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * Only called through the sysfs control interface:
+ */
+static int unmerge_and_remove_all_rmap_items(void)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
+ struct mm_slot, mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ for (mm_slot = ksm_scan.mm_slot;
+ mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
+ mm = mm_slot->mm;
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (ksm_test_exit(mm))
+ break;
+ if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ continue;
+ err = unmerge_ksm_pages(vma,
+ vma->vm_start, vma->vm_end);
+ if (err)
+ goto error;
+ }
+
+ remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
+ struct mm_slot, mm_list);
+ if (ksm_test_exit(mm)) {
+ hlist_del(&mm_slot->link);
+ list_del(&mm_slot->mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ free_mm_slot(mm_slot);
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ up_read(&mm->mmap_sem);
+ mmdrop(mm);
+ } else {
+ spin_unlock(&ksm_mmlist_lock);
+ up_read(&mm->mmap_sem);
+ }
+ }
+
+ ksm_scan.seqnr = 0;
+ return 0;
+
+error:
+ up_read(&mm->mmap_sem);
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = &ksm_mm_head;
+ spin_unlock(&ksm_mmlist_lock);
+ return err;
+}
+#endif /* CONFIG_SYSFS */
+
+static u32 calc_checksum(struct page *page)
+{
+ u32 checksum;
+ void *addr = kmap_atomic(page, KM_USER0);
+ checksum = jhash2(addr, PAGE_SIZE / 4, 17);
+ kunmap_atomic(addr, KM_USER0);
+ return checksum;
+}
+
+static int memcmp_pages(struct page *page1, struct page *page2)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_atomic(page1, KM_USER0);
+ addr2 = kmap_atomic(page2, KM_USER1);
+ ret = memcmp(addr1, addr2, PAGE_SIZE);
+ kunmap_atomic(addr2, KM_USER1);
+ kunmap_atomic(addr1, KM_USER0);
+ return ret;
+}
+
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+ return !memcmp_pages(page1, page2);
+}
+
+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
+ pte_t *orig_pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr;
+ pte_t *ptep;
+ spinlock_t *ptl;
+ int swapped;
+ int err = -EFAULT;
+
+ addr = page_address_in_vma(page, vma);
+ if (addr == -EFAULT)
+ goto out;
+
+ ptep = page_check_address(page, mm, addr, &ptl, 0);
+ if (!ptep)
+ goto out;
+
+ if (pte_write(*ptep)) {
+ pte_t entry;
+
+ swapped = PageSwapCache(page);
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ /*
+ * Ok this is tricky, when get_user_pages_fast() run it doesnt
+ * take any lock, therefore the check that we are going to make
+ * with the pagecount against the mapcount is racey and
+ * O_DIRECT can happen right after the check.
+ * So we clear the pte and flush the tlb before the check
+ * this assure us that no O_DIRECT can happen after the check
+ * or in the middle of the check.
+ */
+ entry = ptep_clear_flush(vma, addr, ptep);
+ /*
+ * Check that no O_DIRECT or similar I/O is in progress on the
+ * page
+ */
+ if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
+ set_pte_at_notify(mm, addr, ptep, entry);
+ goto out_unlock;
+ }
+ entry = pte_wrprotect(entry);
+ set_pte_at_notify(mm, addr, ptep, entry);
+ }
+ *orig_pte = *ptep;
+ err = 0;
+
+out_unlock:
+ pte_unmap_unlock(ptep, ptl);
+out:
+ return err;
+}
+
+/**
+ * replace_page - replace page in vma by new ksm page
+ * @vma: vma that holds the pte pointing to oldpage
+ * @oldpage: the page we are replacing by newpage
+ * @newpage: the ksm page we replace oldpage by
+ * @orig_pte: the original value of the pte
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
+ struct page *newpage, pte_t orig_pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep;
+ spinlock_t *ptl;
+ unsigned long addr;
+ pgprot_t prot;
+ int err = -EFAULT;
+
+ prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
+
+ addr = page_address_in_vma(oldpage, vma);
+ if (addr == -EFAULT)
+ goto out;
+
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ goto out;
+
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte_same(*ptep, orig_pte)) {
+ pte_unmap_unlock(ptep, ptl);
+ goto out;
+ }
+
+ get_page(newpage);
+ page_add_ksm_rmap(newpage);
+
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
+
+ page_remove_rmap(oldpage);
+ put_page(oldpage);
+
+ pte_unmap_unlock(ptep, ptl);
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * try_to_merge_one_page - take two pages and merge them into one
+ * @vma: the vma that hold the pte pointing into oldpage
+ * @oldpage: the page that we want to replace with newpage
+ * @newpage: the page that we want to map instead of oldpage
+ *
+ * Note:
+ * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
+ * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
+ */
+static int try_to_merge_one_page(struct vm_area_struct *vma,
+ struct page *oldpage,
+ struct page *newpage)
+{
+ pte_t orig_pte = __pte(0);
+ int err = -EFAULT;
+
+ if (!(vma->vm_flags & VM_MERGEABLE))
+ goto out;
+
+ if (!PageAnon(oldpage))
+ goto out;
+
+ get_page(newpage);
+ get_page(oldpage);
+
+ /*
+ * We need the page lock to read a stable PageSwapCache in
+ * write_protect_page(). We use trylock_page() instead of
+ * lock_page() because we don't want to wait here - we
+ * prefer to continue scanning and merging different pages,
+ * then come back to this page when it is unlocked.
+ */
+ if (!trylock_page(oldpage))
+ goto out_putpage;
+ /*
+ * If this anonymous page is mapped only here, its pte may need
+ * to be write-protected. If it's mapped elsewhere, all of its
+ * ptes are necessarily already write-protected. But in either
+ * case, we need to lock and check page_count is not raised.
+ */
+ if (write_protect_page(vma, oldpage, &orig_pte)) {
+ unlock_page(oldpage);
+ goto out_putpage;
+ }
+ unlock_page(oldpage);
+
+ if (pages_identical(oldpage, newpage))
+ err = replace_page(vma, oldpage, newpage, orig_pte);
+
+out_putpage:
+ put_page(oldpage);
+ put_page(newpage);
+out:
+ return err;
+}
+
+/*
+ * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
+ * but no new kernel page is allocated: kpage must already be a ksm page.
+ */
+static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
+ unsigned long addr1,
+ struct page *page1,
+ struct page *kpage)
+{
+ struct vm_area_struct *vma;
+ int err = -EFAULT;
+
+ down_read(&mm1->mmap_sem);
+ if (ksm_test_exit(mm1))
+ goto out;
+
+ vma = find_vma(mm1, addr1);
+ if (!vma || vma->vm_start > addr1)
+ goto out;
+
+ err = try_to_merge_one_page(vma, page1, kpage);
+out:
+ up_read(&mm1->mmap_sem);
+ return err;
+}
+
+/*
+ * try_to_merge_two_pages - take two identical pages and prepare them
+ * to be merged into one page.
+ *
+ * This function returns 0 if we successfully mapped two identical pages
+ * into one page, -EFAULT otherwise.
+ *
+ * Note that this function allocates a new kernel page: if one of the pages
+ * is already a ksm page, try_to_merge_with_ksm_page should be used.
+ */
+static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
+ struct page *page1, struct mm_struct *mm2,
+ unsigned long addr2, struct page *page2)
+{
+ struct vm_area_struct *vma;
+ struct page *kpage;
+ int err = -EFAULT;
+
+ /*
+ * The number of nodes in the stable tree
+ * is the number of kernel pages that we hold.
+ */
+ if (ksm_max_kernel_pages &&
+ ksm_max_kernel_pages <= ksm_pages_shared)
+ return err;
+
+ kpage = alloc_page(GFP_HIGHUSER);
+ if (!kpage)
+ return err;
+
+ down_read(&mm1->mmap_sem);
+ if (ksm_test_exit(mm1)) {
+ up_read(&mm1->mmap_sem);
+ goto out;
+ }
+ vma = find_vma(mm1, addr1);
+ if (!vma || vma->vm_start > addr1) {
+ up_read(&mm1->mmap_sem);
+ goto out;
+ }
+
+ copy_user_highpage(kpage, page1, addr1, vma);
+ err = try_to_merge_one_page(vma, page1, kpage);
+ up_read(&mm1->mmap_sem);
+
+ if (!err) {
+ err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
+ /*
+ * If that fails, we have a ksm page with only one pte
+ * pointing to it: so break it.
+ */
+ if (err)
+ break_cow(mm1, addr1);
+ }
+out:
+ put_page(kpage);
+ return err;
+}
+
+/*
+ * stable_tree_search - search page inside the stable tree
+ * @page: the page that we are searching identical pages to.
+ * @page2: pointer into identical page that we are holding inside the stable
+ * tree that we have found.
+ * @rmap_item: the reverse mapping item
+ *
+ * This function checks if there is a page inside the stable tree
+ * with identical content to the page that we are scanning right now.
+ *
+ * This function return rmap_item pointer to the identical item if found,
+ * NULL otherwise.
+ */
+static struct rmap_item *stable_tree_search(struct page *page,
+ struct page **page2,
+ struct rmap_item *rmap_item)
+{
+ struct rb_node *node = root_stable_tree.rb_node;
+
+ while (node) {
+ struct rmap_item *tree_rmap_item, *next_rmap_item;
+ int ret;
+
+ tree_rmap_item = rb_entry(node, struct rmap_item, node);
+ while (tree_rmap_item) {
+ BUG_ON(!in_stable_tree(tree_rmap_item));
+ cond_resched();
+ page2[0] = get_ksm_page(tree_rmap_item);
+ if (page2[0])
+ break;
+ next_rmap_item = tree_rmap_item->next;
+ remove_rmap_item_from_tree(tree_rmap_item);
+ tree_rmap_item = next_rmap_item;
+ }
+ if (!tree_rmap_item)
+ return NULL;
+
+ ret = memcmp_pages(page, page2[0]);
+
+ if (ret < 0) {
+ put_page(page2[0]);
+ node = node->rb_left;
+ } else if (ret > 0) {
+ put_page(page2[0]);
+ node = node->rb_right;
+ } else {
+ return tree_rmap_item;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * stable_tree_insert - insert rmap_item pointing to new ksm page
+ * into the stable tree.
+ *
+ * @page: the page that we are searching identical page to inside the stable
+ * tree.
+ * @rmap_item: pointer to the reverse mapping item.
+ *
+ * This function returns rmap_item if success, NULL otherwise.
+ */
+static struct rmap_item *stable_tree_insert(struct page *page,
+ struct rmap_item *rmap_item)
+{
+ struct rb_node **new = &root_stable_tree.rb_node;
+ struct rb_node *parent = NULL;
+
+ while (*new) {
+ struct rmap_item *tree_rmap_item, *next_rmap_item;
+ struct page *tree_page;
+ int ret;
+
+ tree_rmap_item = rb_entry(*new, struct rmap_item, node);
+ while (tree_rmap_item) {
+ BUG_ON(!in_stable_tree(tree_rmap_item));
+ cond_resched();
+ tree_page = get_ksm_page(tree_rmap_item);
+ if (tree_page)
+ break;
+ next_rmap_item = tree_rmap_item->next;
+ remove_rmap_item_from_tree(tree_rmap_item);
+ tree_rmap_item = next_rmap_item;
+ }
+ if (!tree_rmap_item)
+ return NULL;
+
+ ret = memcmp_pages(page, tree_page);
+ put_page(tree_page);
+
+ parent = *new;
+ if (ret < 0)
+ new = &parent->rb_left;
+ else if (ret > 0)
+ new = &parent->rb_right;
+ else {
+ /*
+ * It is not a bug that stable_tree_search() didn't
+ * find this node: because at that time our page was
+ * not yet write-protected, so may have changed since.
+ */
+ return NULL;
+ }
+ }
+
+ rmap_item->address |= NODE_FLAG | STABLE_FLAG;
+ rmap_item->next = NULL;
+ rb_link_node(&rmap_item->node, parent, new);
+ rb_insert_color(&rmap_item->node, &root_stable_tree);
+
+ ksm_pages_shared++;
+ return rmap_item;
+}
+
+/*
+ * unstable_tree_search_insert - search and insert items into the unstable tree.
+ *
+ * @page: the page that we are going to search for identical page or to insert
+ * into the unstable tree
+ * @page2: pointer into identical page that was found inside the unstable tree
+ * @rmap_item: the reverse mapping item of page
+ *
+ * This function searches for a page in the unstable tree identical to the
+ * page currently being scanned; and if no identical page is found in the
+ * tree, we insert rmap_item as a new object into the unstable tree.
+ *
+ * This function returns pointer to rmap_item found to be identical
+ * to the currently scanned page, NULL otherwise.
+ *
+ * This function does both searching and inserting, because they share
+ * the same walking algorithm in an rbtree.
+ */
+static struct rmap_item *unstable_tree_search_insert(struct page *page,
+ struct page **page2,
+ struct rmap_item *rmap_item)
+{
+ struct rb_node **new = &root_unstable_tree.rb_node;
+ struct rb_node *parent = NULL;
+
+ while (*new) {
+ struct rmap_item *tree_rmap_item;
+ int ret;
+
+ tree_rmap_item = rb_entry(*new, struct rmap_item, node);
+ page2[0] = get_mergeable_page(tree_rmap_item);
+ if (!page2[0])
+ return NULL;
+
+ /*
+ * Don't substitute an unswappable ksm page
+ * just for one good swappable forked page.
+ */
+ if (page == page2[0]) {
+ put_page(page2[0]);
+ return NULL;
+ }
+
+ ret = memcmp_pages(page, page2[0]);
+
+ parent = *new;
+ if (ret < 0) {
+ put_page(page2[0]);
+ new = &parent->rb_left;
+ } else if (ret > 0) {
+ put_page(page2[0]);
+ new = &parent->rb_right;
+ } else {
+ return tree_rmap_item;
+ }
+ }
+
+ rmap_item->address |= NODE_FLAG;
+ rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
+ rb_link_node(&rmap_item->node, parent, new);
+ rb_insert_color(&rmap_item->node, &root_unstable_tree);
+
+ ksm_pages_unshared++;
+ return NULL;
+}
+
+/*
+ * stable_tree_append - add another rmap_item to the linked list of
+ * rmap_items hanging off a given node of the stable tree, all sharing
+ * the same ksm page.
+ */
+static void stable_tree_append(struct rmap_item *rmap_item,
+ struct rmap_item *tree_rmap_item)
+{
+ rmap_item->next = tree_rmap_item->next;
+ rmap_item->prev = tree_rmap_item;
+
+ if (tree_rmap_item->next)
+ tree_rmap_item->next->prev = rmap_item;
+
+ tree_rmap_item->next = rmap_item;
+ rmap_item->address |= STABLE_FLAG;
+
+ ksm_pages_sharing++;
+}
+
+/*
+ * cmp_and_merge_page - first see if page can be merged into the stable tree;
+ * if not, compare checksum to previous and if it's the same, see if page can
+ * be inserted into the unstable tree, or merged with a page already there and
+ * both transferred to the stable tree.
+ *
+ * @page: the page that we are searching identical page to.
+ * @rmap_item: the reverse mapping into the virtual address of this page
+ */
+static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
+{
+ struct page *page2[1];
+ struct rmap_item *tree_rmap_item;
+ unsigned int checksum;
+ int err;
+
+ if (in_stable_tree(rmap_item))
+ remove_rmap_item_from_tree(rmap_item);
+
+ /* We first start with searching the page inside the stable tree */
+ tree_rmap_item = stable_tree_search(page, page2, rmap_item);
+ if (tree_rmap_item) {
+ if (page == page2[0]) /* forked */
+ err = 0;
+ else
+ err = try_to_merge_with_ksm_page(rmap_item->mm,
+ rmap_item->address,
+ page, page2[0]);
+ put_page(page2[0]);
+
+ if (!err) {
+ /*
+ * The page was successfully merged:
+ * add its rmap_item to the stable tree.
+ */
+ stable_tree_append(rmap_item, tree_rmap_item);
+ }
+ return;
+ }
+
+ /*
+ * A ksm page might have got here by fork, but its other
+ * references have already been removed from the stable tree.
+ * Or it might be left over from a break_ksm which failed
+ * when the mem_cgroup had reached its limit: try again now.
+ */
+ if (PageKsm(page))
+ break_cow(rmap_item->mm, rmap_item->address);
+
+ /*
+ * In case the hash value of the page was changed from the last time we
+ * have calculated it, this page to be changed frequely, therefore we
+ * don't want to insert it to the unstable tree, and we don't want to
+ * waste our time to search if there is something identical to it there.
+ */
+ checksum = calc_checksum(page);
+ if (rmap_item->oldchecksum != checksum) {
+ rmap_item->oldchecksum = checksum;
+ return;
+ }
+
+ tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
+ if (tree_rmap_item) {
+ err = try_to_merge_two_pages(rmap_item->mm,
+ rmap_item->address, page,
+ tree_rmap_item->mm,
+ tree_rmap_item->address, page2[0]);
+ /*
+ * As soon as we merge this page, we want to remove the
+ * rmap_item of the page we have merged with from the unstable
+ * tree, and insert it instead as new node in the stable tree.
+ */
+ if (!err) {
+ rb_erase(&tree_rmap_item->node, &root_unstable_tree);
+ tree_rmap_item->address &= ~NODE_FLAG;
+ ksm_pages_unshared--;
+
+ /*
+ * If we fail to insert the page into the stable tree,
+ * we will have 2 virtual addresses that are pointing
+ * to a ksm page left outside the stable tree,
+ * in which case we need to break_cow on both.
+ */
+ if (stable_tree_insert(page2[0], tree_rmap_item))
+ stable_tree_append(rmap_item, tree_rmap_item);
+ else {
+ break_cow(tree_rmap_item->mm,
+ tree_rmap_item->address);
+ break_cow(rmap_item->mm, rmap_item->address);
+ }
+ }
+
+ put_page(page2[0]);
+ }
+}
+
+static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
+ struct list_head *cur,
+ unsigned long addr)
+{
+ struct rmap_item *rmap_item;
+
+ while (cur != &mm_slot->rmap_list) {
+ rmap_item = list_entry(cur, struct rmap_item, link);
+ if ((rmap_item->address & PAGE_MASK) == addr) {
+ if (!in_stable_tree(rmap_item))
+ remove_rmap_item_from_tree(rmap_item);
+ return rmap_item;
+ }
+ if (rmap_item->address > addr)
+ break;
+ cur = cur->next;
+ remove_rmap_item_from_tree(rmap_item);
+ list_del(&rmap_item->link);
+ free_rmap_item(rmap_item);
+ }
+
+ rmap_item = alloc_rmap_item();
+ if (rmap_item) {
+ /* It has already been zeroed */
+ rmap_item->mm = mm_slot->mm;
+ rmap_item->address = addr;
+ list_add_tail(&rmap_item->link, cur);
+ }
+ return rmap_item;
+}
+
+static struct rmap_item *scan_get_next_rmap_item(struct page **page)
+{
+ struct mm_struct *mm;
+ struct mm_slot *slot;
+ struct vm_area_struct *vma;
+ struct rmap_item *rmap_item;
+
+ if (list_empty(&ksm_mm_head.mm_list))
+ return NULL;
+
+ slot = ksm_scan.mm_slot;
+ if (slot == &ksm_mm_head) {
+ root_unstable_tree = RB_ROOT;
+
+ spin_lock(&ksm_mmlist_lock);
+ slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
+ ksm_scan.mm_slot = slot;
+ spin_unlock(&ksm_mmlist_lock);
+next_mm:
+ ksm_scan.address = 0;
+ ksm_scan.rmap_item = list_entry(&slot->rmap_list,
+ struct rmap_item, link);
+ }
+
+ mm = slot->mm;
+ down_read(&mm->mmap_sem);
+ if (ksm_test_exit(mm))
+ vma = NULL;
+ else
+ vma = find_vma(mm, ksm_scan.address);
+
+ for (; vma; vma = vma->vm_next) {
+ if (!(vma->vm_flags & VM_MERGEABLE))
+ continue;
+ if (ksm_scan.address < vma->vm_start)
+ ksm_scan.address = vma->vm_start;
+ if (!vma->anon_vma)
+ ksm_scan.address = vma->vm_end;
+
+ while (ksm_scan.address < vma->vm_end) {
+ if (ksm_test_exit(mm))
+ break;
+ *page = follow_page(vma, ksm_scan.address, FOLL_GET);
+ if (*page && PageAnon(*page)) {
+ flush_anon_page(vma, *page, ksm_scan.address);
+ flush_dcache_page(*page);
+ rmap_item = get_next_rmap_item(slot,
+ ksm_scan.rmap_item->link.next,
+ ksm_scan.address);
+ if (rmap_item) {
+ ksm_scan.rmap_item = rmap_item;
+ ksm_scan.address += PAGE_SIZE;
+ } else
+ put_page(*page);
+ up_read(&mm->mmap_sem);
+ return rmap_item;
+ }
+ if (*page)
+ put_page(*page);
+ ksm_scan.address += PAGE_SIZE;
+ cond_resched();
+ }
+ }
+
+ if (ksm_test_exit(mm)) {
+ ksm_scan.address = 0;
+ ksm_scan.rmap_item = list_entry(&slot->rmap_list,
+ struct rmap_item, link);
+ }
+ /*
+ * Nuke all the rmap_items that are above this current rmap:
+ * because there were no VM_MERGEABLE vmas with such addresses.
+ */
+ remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(slot->mm_list.next,
+ struct mm_slot, mm_list);
+ if (ksm_scan.address == 0) {
+ /*
+ * We've completed a full scan of all vmas, holding mmap_sem
+ * throughout, and found no VM_MERGEABLE: so do the same as
+ * __ksm_exit does to remove this mm from all our lists now.
+ * This applies either when cleaning up after __ksm_exit
+ * (but beware: we can reach here even before __ksm_exit),
+ * or when all VM_MERGEABLE areas have been unmapped (and
+ * mmap_sem then protects against race with MADV_MERGEABLE).
+ */
+ hlist_del(&slot->link);
+ list_del(&slot->mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ free_mm_slot(slot);
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ up_read(&mm->mmap_sem);
+ mmdrop(mm);
+ } else {
+ spin_unlock(&ksm_mmlist_lock);
+ up_read(&mm->mmap_sem);
+ }
+
+ /* Repeat until we've completed scanning the whole list */
+ slot = ksm_scan.mm_slot;
+ if (slot != &ksm_mm_head)
+ goto next_mm;
+
+ ksm_scan.seqnr++;
+ return NULL;
+}
+
+/**
+ * ksm_do_scan - the ksm scanner main worker function.
+ * @scan_npages - number of pages we want to scan before we return.
+ */
+static void ksm_do_scan(unsigned int scan_npages)
+{
+ struct rmap_item *rmap_item;
+ struct page *page;
+
+ while (scan_npages--) {
+ cond_resched();
+ rmap_item = scan_get_next_rmap_item(&page);
+ if (!rmap_item)
+ return;
+ if (!PageKsm(page) || !in_stable_tree(rmap_item))
+ cmp_and_merge_page(page, rmap_item);
+ else if (page_mapcount(page) == 1) {
+ /*
+ * Replace now-unshared ksm page by ordinary page.
+ */
+ break_cow(rmap_item->mm, rmap_item->address);
+ remove_rmap_item_from_tree(rmap_item);
+ rmap_item->oldchecksum = calc_checksum(page);
+ }
+ put_page(page);
+ }
+}
+
+static int ksmd_should_run(void)
+{
+ return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
+}
+
+static int ksm_scan_thread(void *nothing)
+{
+ set_user_nice(current, 5);
+
+ while (!kthread_should_stop()) {
+ mutex_lock(&ksm_thread_mutex);
+ if (ksmd_should_run())
+ ksm_do_scan(ksm_thread_pages_to_scan);
+ mutex_unlock(&ksm_thread_mutex);
+
+ if (ksmd_should_run()) {
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(ksm_thread_sleep_millisecs));
+ } else {
+ wait_event_interruptible(ksm_thread_wait,
+ ksmd_should_run() || kthread_should_stop());
+ }
+ }
+ return 0;
+}
+
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, int advice, unsigned long *vm_flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int err;
+
+ switch (advice) {
+ case MADV_MERGEABLE:
+ /*
+ * Be somewhat over-protective for now!
+ */
+ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
+ VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+ VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
+ VM_MIXEDMAP | VM_SAO))
+ return 0; /* just ignore the advice */
+
+ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ err = __ksm_enter(mm);
+ if (err)
+ return err;
+ }
+
+ *vm_flags |= VM_MERGEABLE;
+ break;
+
+ case MADV_UNMERGEABLE:
+ if (!(*vm_flags & VM_MERGEABLE))
+ return 0; /* just ignore the advice */
+
+ if (vma->anon_vma) {
+ err = unmerge_ksm_pages(vma, start, end);
+ if (err)
+ return err;
+ }
+
+ *vm_flags &= ~VM_MERGEABLE;
+ break;
+ }
+
+ return 0;
+}
+
+int __ksm_enter(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int needs_wakeup;
+
+ mm_slot = alloc_mm_slot();
+ if (!mm_slot)
+ return -ENOMEM;
+
+ /* Check ksm_run too? Would need tighter locking */
+ needs_wakeup = list_empty(&ksm_mm_head.mm_list);
+
+ spin_lock(&ksm_mmlist_lock);
+ insert_to_mm_slots_hash(mm, mm_slot);
+ /*
+ * Insert just behind the scanning cursor, to let the area settle
+ * down a little; when fork is followed by immediate exec, we don't
+ * want ksmd to waste time setting up and tearing down an rmap_list.
+ */
+ list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ set_bit(MMF_VM_MERGEABLE, &mm->flags);
+ atomic_inc(&mm->mm_count);
+
+ if (needs_wakeup)
+ wake_up_interruptible(&ksm_thread_wait);
+
+ return 0;
+}
+
+void __ksm_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int easy_to_free = 0;
+
+ /*
+ * This process is exiting: if it's straightforward (as is the
+ * case when ksmd was never running), free mm_slot immediately.
+ * But if it's at the cursor or has rmap_items linked to it, use
+ * mmap_sem to synchronize with any break_cows before pagetables
+ * are freed, and leave the mm_slot on the list for ksmd to free.
+ * Beware: ksm may already have noticed it exiting and freed the slot.
+ */
+
+ spin_lock(&ksm_mmlist_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+ if (list_empty(&mm_slot->rmap_list)) {
+ hlist_del(&mm_slot->link);
+ list_del(&mm_slot->mm_list);
+ easy_to_free = 1;
+ } else {
+ list_move(&mm_slot->mm_list,
+ &ksm_scan.mm_slot->mm_list);
+ }
+ }
+ spin_unlock(&ksm_mmlist_lock);
+
+ if (easy_to_free) {
+ free_mm_slot(mm_slot);
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mmdrop(mm);
+ } else if (mm_slot) {
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
+ */
+
+#define KSM_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define KSM_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
+}
+
+static ssize_t sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = strict_strtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ ksm_thread_sleep_millisecs = msecs;
+
+ return count;
+}
+KSM_ATTR(sleep_millisecs);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
+}
+
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long nr_pages;
+
+ err = strict_strtoul(buf, 10, &nr_pages);
+ if (err || nr_pages > UINT_MAX)
+ return -EINVAL;
+
+ ksm_thread_pages_to_scan = nr_pages;
+
+ return count;
+}
+KSM_ATTR(pages_to_scan);
+
+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_run);
+}
+
+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long flags;
+
+ err = strict_strtoul(buf, 10, &flags);
+ if (err || flags > UINT_MAX)
+ return -EINVAL;
+ if (flags > KSM_RUN_UNMERGE)
+ return -EINVAL;
+
+ /*
+ * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
+ * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
+ * breaking COW to free the unswappable pages_shared (but leaves
+ * mm_slots on the list for when ksmd may be set running again).
+ */
+
+ mutex_lock(&ksm_thread_mutex);
+ if (ksm_run != flags) {
+ ksm_run = flags;
+ if (flags & KSM_RUN_UNMERGE) {
+ current->flags |= PF_OOM_ORIGIN;
+ err = unmerge_and_remove_all_rmap_items();
+ current->flags &= ~PF_OOM_ORIGIN;
+ if (err) {
+ ksm_run = KSM_RUN_STOP;
+ count = err;
+ }
+ }
+ }
+ mutex_unlock(&ksm_thread_mutex);
+
+ if (flags & KSM_RUN_MERGE)
+ wake_up_interruptible(&ksm_thread_wait);
+
+ return count;
+}
+KSM_ATTR(run);
+
+static ssize_t max_kernel_pages_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long nr_pages;
+
+ err = strict_strtoul(buf, 10, &nr_pages);
+ if (err)
+ return -EINVAL;
+
+ ksm_max_kernel_pages = nr_pages;
+
+ return count;
+}
+
+static ssize_t max_kernel_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
+}
+KSM_ATTR(max_kernel_pages);
+
+static ssize_t pages_shared_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_pages_shared);
+}
+KSM_ATTR_RO(pages_shared);
+
+static ssize_t pages_sharing_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_pages_sharing);
+}
+KSM_ATTR_RO(pages_sharing);
+
+static ssize_t pages_unshared_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_pages_unshared);
+}
+KSM_ATTR_RO(pages_unshared);
+
+static ssize_t pages_volatile_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ long ksm_pages_volatile;
+
+ ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
+ - ksm_pages_sharing - ksm_pages_unshared;
+ /*
+ * It was not worth any locking to calculate that statistic,
+ * but it might therefore sometimes be negative: conceal that.
+ */
+ if (ksm_pages_volatile < 0)
+ ksm_pages_volatile = 0;
+ return sprintf(buf, "%ld\n", ksm_pages_volatile);
+}
+KSM_ATTR_RO(pages_volatile);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_scan.seqnr);
+}
+KSM_ATTR_RO(full_scans);
+
+static struct attribute *ksm_attrs[] = {
+ &sleep_millisecs_attr.attr,
+ &pages_to_scan_attr.attr,
+ &run_attr.attr,
+ &max_kernel_pages_attr.attr,
+ &pages_shared_attr.attr,
+ &pages_sharing_attr.attr,
+ &pages_unshared_attr.attr,
+ &pages_volatile_attr.attr,
+ &full_scans_attr.attr,
+ NULL,
+};
+
+static struct attribute_group ksm_attr_group = {
+ .attrs = ksm_attrs,
+ .name = "ksm",
+};
+#endif /* CONFIG_SYSFS */
+
+static int __init ksm_init(void)
+{
+ struct task_struct *ksm_thread;
+ int err;
+
+ err = ksm_slab_init();
+ if (err)
+ goto out;
+
+ err = mm_slots_hash_init();
+ if (err)
+ goto out_free1;
+
+ ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
+ if (IS_ERR(ksm_thread)) {
+ printk(KERN_ERR "ksm: creating kthread failed\n");
+ err = PTR_ERR(ksm_thread);
+ goto out_free2;
+ }
+
+#ifdef CONFIG_SYSFS
+ err = sysfs_create_group(mm_kobj, &ksm_attr_group);
+ if (err) {
+ printk(KERN_ERR "ksm: register sysfs failed\n");
+ kthread_stop(ksm_thread);
+ goto out_free2;
+ }
+#endif /* CONFIG_SYSFS */
+
+ return 0;
+
+out_free2:
+ mm_slots_hash_free();
+out_free1:
+ ksm_slab_free();
+out:
+ return err;
+}
+module_init(ksm_init)
diff --git a/mm/madvise.c b/mm/madvise.c
index 76eb4193acd..d9ae2067952 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
#include <linux/mempolicy.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>
+#include <linux/ksm.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -41,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma,
struct mm_struct * mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
- int new_flags = vma->vm_flags;
+ unsigned long new_flags = vma->vm_flags;
switch (behavior) {
case MADV_NORMAL:
@@ -57,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma,
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
+ if (vma->vm_flags & VM_IO) {
+ error = -EINVAL;
+ goto out;
+ }
new_flags &= ~VM_DONTCOPY;
break;
+ case MADV_MERGEABLE:
+ case MADV_UNMERGEABLE:
+ error = ksm_madvise(vma, start, end, behavior, &new_flags);
+ if (error)
+ goto out;
+ break;
}
if (new_flags == vma->vm_flags) {
@@ -211,37 +222,16 @@ static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
- long error;
-
switch (behavior) {
- case MADV_DOFORK:
- if (vma->vm_flags & VM_IO) {
- error = -EINVAL;
- break;
- }
- case MADV_DONTFORK:
- case MADV_NORMAL:
- case MADV_SEQUENTIAL:
- case MADV_RANDOM:
- error = madvise_behavior(vma, prev, start, end, behavior);
- break;
case MADV_REMOVE:
- error = madvise_remove(vma, prev, start, end);
- break;
-
+ return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
- error = madvise_willneed(vma, prev, start, end);
- break;
-
+ return madvise_willneed(vma, prev, start, end);
case MADV_DONTNEED:
- error = madvise_dontneed(vma, prev, start, end);
- break;
-
+ return madvise_dontneed(vma, prev, start, end);
default:
- BUG();
- break;
+ return madvise_behavior(vma, prev, start, end, behavior);
}
- return error;
}
static int
@@ -256,12 +246,17 @@ madvise_behavior_valid(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+#ifdef CONFIG_KSM
+ case MADV_MERGEABLE:
+ case MADV_UNMERGEABLE:
+#endif
return 1;
default:
return 0;
}
}
+
/*
* The madvise(2) system call.
*
@@ -286,6 +281,12 @@ madvise_behavior_valid(int behavior)
* so the kernel can free resources associated with it.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
+ * MADV_DONTFORK - omit this area from child's address space when forking:
+ * typically, to avoid COWing pages pinned by get_user_pages().
+ * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
+ * this area with pages of identical content from other such areas.
+ * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
*
* return values:
* zero - success
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fd4529d86de..9b10d875378 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -648,7 +648,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
int nid = z->zone_pgdat->node_id;
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
- int lru = LRU_FILE * !!file + !!active;
+ int lru = LRU_FILE * file + active;
int ret;
BUG_ON(!mem_cont);
diff --git a/mm/memory.c b/mm/memory.c
index aede2ce3aba..b1443ac07c0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,6 +45,7 @@
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/delayacct.h>
@@ -56,6 +57,7 @@
#include <linux/swapops.h>
#include <linux/elf.h>
+#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -106,6 +108,18 @@ static int __init disable_randmaps(char *s)
}
__setup("norandmaps", disable_randmaps);
+unsigned long zero_pfn __read_mostly;
+unsigned long highest_memmap_pfn __read_mostly;
+
+/*
+ * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
+ */
+static int __init init_zero_pfn(void)
+{
+ zero_pfn = page_to_pfn(ZERO_PAGE(0));
+ return 0;
+}
+core_initcall(init_zero_pfn);
/*
* If a p?d_bad entry is found while walking page tables, report
@@ -442,6 +456,20 @@ static inline int is_cow_mapping(unsigned int flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
+#ifndef is_zero_pfn
+static inline int is_zero_pfn(unsigned long pfn)
+{
+ return pfn == zero_pfn;
+}
+#endif
+
+#ifndef my_zero_pfn
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ return zero_pfn;
+}
+#endif
+
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
@@ -497,7 +525,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (HAVE_PTE_SPECIAL) {
if (likely(!pte_special(pte)))
goto check_pfn;
- if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+ return NULL;
+ if (!is_zero_pfn(pfn))
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
@@ -519,6 +549,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
}
}
+ if (is_zero_pfn(pfn))
+ return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
@@ -596,8 +628,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
- page_dup_rmap(page, vma, addr);
- rss[!!PageAnon(page)]++;
+ page_dup_rmap(page);
+ rss[PageAnon(page)]++;
}
out_set_pte:
@@ -1142,9 +1174,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
+
page = vm_normal_page(vma, address, pte);
- if (unlikely(!page))
- goto bad_page;
+ if (unlikely(!page)) {
+ if ((flags & FOLL_DUMP) ||
+ !is_zero_pfn(pte_pfn(pte)))
+ goto bad_page;
+ page = pte_page(pte);
+ }
if (flags & FOLL_GET)
get_page(page);
@@ -1172,65 +1209,46 @@ no_page:
pte_unmap_unlock(ptep, ptl);
if (!pte_none(pte))
return page;
- /* Fall through to ZERO_PAGE handling */
+
no_page_table:
/*
* When core dumping an enormous anonymous area that nobody
- * has touched so far, we don't want to allocate page tables.
+ * has touched so far, we don't want to allocate unnecessary pages or
+ * page tables. Return error instead of NULL to skip handle_mm_fault,
+ * then get_dump_page() will return NULL to leave a hole in the dump.
+ * But we can only make this optimization where a hole would surely
+ * be zero-filled if handle_mm_fault() actually did handle it.
*/
- if (flags & FOLL_ANON) {
- page = ZERO_PAGE(0);
- if (flags & FOLL_GET)
- get_page(page);
- BUG_ON(flags & FOLL_WRITE);
- }
+ if ((flags & FOLL_DUMP) &&
+ (!vma->vm_ops || !vma->vm_ops->fault))
+ return ERR_PTR(-EFAULT);
return page;
}
-/* Can we do the FOLL_ANON optimization? */
-static inline int use_zero_page(struct vm_area_struct *vma)
-{
- /*
- * We don't want to optimize FOLL_ANON for make_pages_present()
- * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
- * we want to get the page from the page tables to make sure
- * that we serialize and update with any other user of that
- * mapping.
- */
- if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
- return 0;
- /*
- * And if we have a fault routine, it's not an anonymous region.
- */
- return !vma->vm_ops || !vma->vm_ops->fault;
-}
-
-
-
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int nr_pages, int flags,
+ unsigned long start, int nr_pages, unsigned int gup_flags,
struct page **pages, struct vm_area_struct **vmas)
{
int i;
- unsigned int vm_flags = 0;
- int write = !!(flags & GUP_FLAGS_WRITE);
- int force = !!(flags & GUP_FLAGS_FORCE);
- int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
- int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
+ unsigned long vm_flags;
if (nr_pages <= 0)
return 0;
+
+ VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+
/*
* Require read or write permissions.
- * If 'force' is set, we only require the "MAY" flags.
+ * If FOLL_FORCE is set, we only require the "MAY" flags.
*/
- vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
- vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+ vm_flags = (gup_flags & FOLL_WRITE) ?
+ (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= (gup_flags & FOLL_FORCE) ?
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
i = 0;
do {
struct vm_area_struct *vma;
- unsigned int foll_flags;
vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(tsk, start)) {
@@ -1242,7 +1260,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pte_t *pte;
/* user gate pages are read-only */
- if (!ignore && write)
+ if (gup_flags & FOLL_WRITE)
return i ? : -EFAULT;
if (pg > TASK_SIZE)
pgd = pgd_offset_k(pg);
@@ -1276,38 +1294,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (!vma ||
(vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
- (!ignore && !(vm_flags & vma->vm_flags)))
+ !(vm_flags & vma->vm_flags))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &nr_pages, i, write);
+ &start, &nr_pages, i, gup_flags);
continue;
}
- foll_flags = FOLL_TOUCH;
- if (pages)
- foll_flags |= FOLL_GET;
- if (!write && use_zero_page(vma))
- foll_flags |= FOLL_ANON;
-
do {
struct page *page;
+ unsigned int foll_flags = gup_flags;
/*
* If we have a pending SIGKILL, don't keep faulting
- * pages and potentially allocating memory, unless
- * current is handling munlock--e.g., on exit. In
- * that case, we are not allocating memory. Rather,
- * we're only unlocking already resident/mapped pages.
+ * pages and potentially allocating memory.
*/
- if (unlikely(!ignore_sigkill &&
- fatal_signal_pending(current)))
+ if (unlikely(fatal_signal_pending(current)))
return i ? i : -ERESTARTSYS;
- if (write)
- foll_flags |= FOLL_WRITE;
-
cond_resched();
while (!(page = follow_page(vma, start, foll_flags))) {
int ret;
@@ -1418,18 +1424,47 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
{
- int flags = 0;
+ int flags = FOLL_TOUCH;
+ if (pages)
+ flags |= FOLL_GET;
if (write)
- flags |= GUP_FLAGS_WRITE;
+ flags |= FOLL_WRITE;
if (force)
- flags |= GUP_FLAGS_FORCE;
+ flags |= FOLL_FORCE;
return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
}
-
EXPORT_SYMBOL(get_user_pages);
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by page_cache_release() or put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_sem, but after all other threads have been killed.
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ if (__get_user_pages(current, current->mm, addr, 1,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+ return NULL;
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ return page;
+}
+#endif /* CONFIG_ELF_CORE */
+
pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
@@ -1607,7 +1642,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* If we don't have pte special, then we have to use the pfn_valid()
* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
* refcount the page if pfn_valid is true (hence insert_page rather
- * than insert_pfn).
+ * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
+ * without pte special, it would there be refcounted as a normal page.
*/
if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
struct page *page;
@@ -1973,7 +2009,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(old_page)) {
+ if (PageAnon(old_page) && !PageKsm(old_page)) {
if (!trylock_page(old_page)) {
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
@@ -2074,10 +2110,19 @@ gotten:
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- VM_BUG_ON(old_page == ZERO_PAGE(0));
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
- if (!new_page)
- goto oom;
+
+ if (is_zero_pfn(pte_pfn(orig_pte))) {
+ new_page = alloc_zeroed_user_highpage_movable(vma, address);
+ if (!new_page)
+ goto oom;
+ } else {
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!new_page)
+ goto oom;
+ cow_user_page(new_page, old_page, address, vma);
+ }
+ __SetPageUptodate(new_page);
+
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
@@ -2087,8 +2132,6 @@ gotten:
clear_page_mlock(old_page);
unlock_page(old_page);
}
- cow_user_page(new_page, old_page, address, vma);
- __SetPageUptodate(new_page);
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
@@ -2114,9 +2157,14 @@ gotten:
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush_notify(vma, address, page_table);
+ ptep_clear_flush(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
- set_pte_at(mm, address, page_table, entry);
+ /*
+ * We call the notify macro here because, when using secondary
+ * mmu page tables (such as kvm shadow page tables), we want the
+ * new page to be mapped directly into the secondary page table.
+ */
+ set_pte_at_notify(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
if (old_page) {
/*
@@ -2624,6 +2672,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl;
pte_t entry;
+ if (!(flags & FAULT_FLAG_WRITE)) {
+ entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
+ vma->vm_page_prot));
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (!pte_none(*page_table))
+ goto unlock;
+ goto setpte;
+ }
+
/* Allocate our own private page. */
pte_unmap(page_table);
@@ -2638,13 +2696,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto oom_free_page;
entry = mk_pte(page, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto release;
+
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);
+setpte:
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4412a676c8..efe3e0ec2e6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -339,8 +339,11 @@ EXPORT_SYMBOL_GPL(__remove_pages);
void online_page(struct page *page)
{
+ unsigned long pfn = page_to_pfn(page);
+
totalram_pages++;
- num_physpages++;
+ if (pfn >= num_physpages)
+ num_physpages = pfn + 1;
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
@@ -422,6 +425,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
+ zone_pcp_update(zone);
setup_per_zone_wmarks();
calculate_zone_inactive_ratio(zone);
if (onlined_pages) {
@@ -831,7 +835,6 @@ repeat:
zone->present_pages -= offlined_pages;
zone->zone_pgdat->node_present_pages -= offlined_pages;
totalram_pages -= offlined_pages;
- num_physpages -= offlined_pages;
setup_per_zone_wmarks();
calculate_zone_inactive_ratio(zone);
diff --git a/mm/mempool.c b/mm/mempool.c
index 32e75d40050..1a3bc3d4d55 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -308,13 +308,6 @@ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
}
EXPORT_SYMBOL(mempool_kmalloc);
-void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
-{
- size_t size = (size_t)pool_data;
- return kzalloc(size, gfp_mask);
-}
-EXPORT_SYMBOL(mempool_kzalloc);
-
void mempool_kfree(void *element, void *pool_data)
{
kfree(element);
diff --git a/mm/migrate.c b/mm/migrate.c
index 939888f9dda..16052e80aaa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l)
list_for_each_entry_safe(page, page2, l, lru) {
list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
putback_lru_page(page);
count++;
}
@@ -147,7 +149,7 @@ out:
static void remove_file_migration_ptes(struct page *old, struct page *new)
{
struct vm_area_struct *vma;
- struct address_space *mapping = page_mapping(new);
+ struct address_space *mapping = new->mapping;
struct prio_tree_iter iter;
pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -270,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
pslot = radix_tree_lookup_slot(&mapping->page_tree,
page_index(page));
- expected_count = 2 + !!page_has_private(page);
+ expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
(struct page *)radix_tree_deref_slot(pslot) != page) {
spin_unlock_irq(&mapping->tree_lock);
@@ -312,7 +314,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
*/
__dec_zone_page_state(page, NR_FILE_PAGES);
__inc_zone_page_state(newpage, NR_FILE_PAGES);
-
+ if (PageSwapBacked(page)) {
+ __dec_zone_page_state(page, NR_SHMEM);
+ __inc_zone_page_state(newpage, NR_SHMEM);
+ }
spin_unlock_irq(&mapping->tree_lock);
return 0;
@@ -664,13 +669,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* needs to be effective.
*/
try_to_free_buffers(page);
+ goto rcu_unlock;
}
- goto rcu_unlock;
+ goto skip_unmap;
}
/* Establish migration ptes or remove ptes */
try_to_unmap(page, 1);
+skip_unmap:
if (!page_mapped(page))
rc = move_to_new_page(newpage, page);
@@ -693,6 +700,8 @@ unlock:
* restored.
*/
list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
putback_lru_page(page);
}
@@ -737,6 +746,13 @@ int migrate_pages(struct list_head *from,
struct page *page2;
int swapwrite = current->flags & PF_SWAPWRITE;
int rc;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ list_for_each_entry(page, from, lru)
+ __inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ local_irq_restore(flags);
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
diff --git a/mm/mlock.c b/mm/mlock.c
index 45eb650b965..bd6f0e466f6 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -139,49 +139,36 @@ static void munlock_vma_page(struct page *page)
}
/**
- * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
+ * __mlock_vma_pages_range() - mlock a range of pages in the vma.
* @vma: target vma
* @start: start address
* @end: end address
- * @mlock: 0 indicate munlock, otherwise mlock.
*
- * If @mlock == 0, unlock an mlocked range;
- * else mlock the range of pages. This takes care of making the pages present ,
- * too.
+ * This takes care of making the pages present too.
*
* return 0 on success, negative error code on error.
*
* vma->vm_mm->mmap_sem must be held for at least read.
*/
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- int mlock)
+ unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start;
struct page *pages[16]; /* 16 gives a reasonable batch */
int nr_pages = (end - start) / PAGE_SIZE;
int ret = 0;
- int gup_flags = 0;
+ int gup_flags;
VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(end & ~PAGE_MASK);
VM_BUG_ON(start < vma->vm_start);
VM_BUG_ON(end > vma->vm_end);
- VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
- (atomic_read(&mm->mm_users) != 0));
-
- /*
- * mlock: don't page populate if vma has PROT_NONE permission.
- * munlock: always do munlock although the vma has PROT_NONE
- * permission, or SIGKILL is pending.
- */
- if (!mlock)
- gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
- GUP_FLAGS_IGNORE_SIGKILL;
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+ gup_flags = FOLL_TOUCH | FOLL_GET;
if (vma->vm_flags & VM_WRITE)
- gup_flags |= GUP_FLAGS_WRITE;
+ gup_flags |= FOLL_WRITE;
while (nr_pages > 0) {
int i;
@@ -201,51 +188,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
* This can happen for, e.g., VM_NONLINEAR regions before
* a page has been allocated and mapped at a given offset,
* or for addresses that map beyond end of a file.
- * We'll mlock the the pages if/when they get faulted in.
+ * We'll mlock the pages if/when they get faulted in.
*/
if (ret < 0)
break;
- if (ret == 0) {
- /*
- * We know the vma is there, so the only time
- * we cannot get a single page should be an
- * error (ret < 0) case.
- */
- WARN_ON(1);
- break;
- }
lru_add_drain(); /* push cached pages to LRU */
for (i = 0; i < ret; i++) {
struct page *page = pages[i];
- lock_page(page);
- /*
- * Because we lock page here and migration is blocked
- * by the elevated reference, we need only check for
- * page truncation (file-cache only).
- */
if (page->mapping) {
- if (mlock)
+ /*
+ * That preliminary check is mainly to avoid
+ * the pointless overhead of lock_page on the
+ * ZERO_PAGE: which might bounce very badly if
+ * there is contention. However, we're still
+ * dirtying its cacheline with get/put_page:
+ * we'll add another __get_user_pages flag to
+ * avoid it if that case turns out to matter.
+ */
+ lock_page(page);
+ /*
+ * Because we lock page here and migration is
+ * blocked by the elevated reference, we need
+ * only check for file-cache page truncation.
+ */
+ if (page->mapping)
mlock_vma_page(page);
- else
- munlock_vma_page(page);
+ unlock_page(page);
}
- unlock_page(page);
- put_page(page); /* ref from get_user_pages() */
-
- /*
- * here we assume that get_user_pages() has given us
- * a list of virtually contiguous pages.
- */
- addr += PAGE_SIZE; /* for next get_user_pages() */
- nr_pages--;
+ put_page(page); /* ref from get_user_pages() */
}
+
+ addr += ret * PAGE_SIZE;
+ nr_pages -= ret;
ret = 0;
}
- return ret; /* count entire vma as locked_vm */
+ return ret; /* 0 or negative error code */
}
/*
@@ -289,7 +270,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current))) {
- __mlock_vma_pages_range(vma, start, end, 1);
+ __mlock_vma_pages_range(vma, start, end);
/* Hide errors from mmap() and other callers */
return 0;
@@ -310,7 +291,6 @@ no_mlock:
return nr_pages; /* error or pages NOT mlocked */
}
-
/*
* munlock_vma_pages_range() - munlock all pages in the vma range.'
* @vma - vma containing range to be munlock()ed.
@@ -330,10 +310,38 @@ no_mlock:
* free them. This will result in freeing mlocked pages.
*/
void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end)
{
+ unsigned long addr;
+
+ lru_add_drain();
vma->vm_flags &= ~VM_LOCKED;
- __mlock_vma_pages_range(vma, start, end, 0);
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ struct page *page;
+ /*
+ * Although FOLL_DUMP is intended for get_dump_page(),
+ * it just so happens that its special treatment of the
+ * ZERO_PAGE (returning an error instead of doing get_page)
+ * suits munlock very well (and if somehow an abnormal page
+ * has sneaked into the range, we won't oops here: great).
+ */
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+ if (page && !IS_ERR(page)) {
+ lock_page(page);
+ /*
+ * Like in __mlock_vma_pages_range(),
+ * because we lock page here and migration is
+ * blocked by the elevated reference, we need
+ * only check for file-cache page truncation.
+ */
+ if (page->mapping)
+ munlock_vma_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ cond_resched();
+ }
}
/*
@@ -400,18 +408,14 @@ success:
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, __mlock_vma_pages_range will bring it back.
*/
- vma->vm_flags = newflags;
if (lock) {
- ret = __mlock_vma_pages_range(vma, start, end, 1);
-
- if (ret > 0) {
- mm->locked_vm -= ret;
- ret = 0;
- } else
- ret = __mlock_posix_error_return(ret); /* translate if needed */
+ vma->vm_flags = newflags;
+ ret = __mlock_vma_pages_range(vma, start, end);
+ if (ret < 0)
+ ret = __mlock_posix_error_return(ret);
} else {
- __mlock_vma_pages_range(vma, start, end, 0);
+ munlock_vma_pages_range(vma, start, end);
}
out:
diff --git a/mm/mmap.c b/mm/mmap.c
index 8101de490c7..21d4029a07b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,7 +28,7 @@
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -570,9 +570,9 @@ again: remove_next = 1 + (end > next->vm_end);
/*
* When changing only vma->vm_end, we don't really need
- * anon_vma lock: but is that case worth optimizing out?
+ * anon_vma lock.
*/
- if (vma->anon_vma)
+ if (vma->anon_vma && (insert || importer || start != vma->vm_start))
anon_vma = vma->anon_vma;
if (anon_vma) {
spin_lock(&anon_vma->lock);
@@ -656,9 +656,6 @@ again: remove_next = 1 + (end > next->vm_end);
validate_mm(mm);
}
-/* Flags that can be inherited from an existing mapping when merging */
-#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
-
/*
* If the vma has a ->close operation then the driver probably needs to release
* per-vma resources, so we don't attempt to merge those.
@@ -666,7 +663,8 @@ again: remove_next = 1 + (end > next->vm_end);
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS)
+ /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
+ if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
return 0;
if (vma->vm_file != file)
return 0;
@@ -905,7 +903,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
#endif /* CONFIG_PROC_FS */
/*
- * The caller must hold down_write(current->mm->mmap_sem).
+ * The caller must hold down_write(&current->mm->mmap_sem).
*/
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
@@ -951,6 +949,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
+ if (flags & MAP_HUGETLB) {
+ struct user_struct *user = NULL;
+ if (file)
+ return -EINVAL;
+
+ /*
+ * VM_NORESERVE is used because the reservations will be
+ * taken when vm_ops->mmap() is called
+ * A dummy user value is used because we are not locking
+ * memory so no accounting is necessary
+ */
+ len = ALIGN(len, huge_page_size(&default_hstate));
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+ &user, HUGETLB_ANONHUGE_INODE);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ }
+
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
@@ -965,11 +981,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- if (flags & MAP_LOCKED) {
+ if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
- vm_flags |= VM_LOCKED;
- }
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
@@ -1195,21 +1209,21 @@ munmap_back:
goto unmap_and_free_vma;
if (vm_flags & VM_EXECUTABLE)
added_exe_file_vma(mm);
+
+ /* Can addr have changed??
+ *
+ * Answer: Yes, several device drivers can do it in their
+ * f_op->mmap method. -DaveM
+ */
+ addr = vma->vm_start;
+ pgoff = vma->vm_pgoff;
+ vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- */
- addr = vma->vm_start;
- pgoff = vma->vm_pgoff;
- vm_flags = vma->vm_flags;
-
if (vma_wants_writenotify(vma))
vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
@@ -1220,7 +1234,7 @@ munmap_back:
if (correct_wcount)
atomic_inc(&inode->i_writecount);
out:
- perf_counter_mmap(vma);
+ perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
@@ -2111,6 +2125,7 @@ void exit_mmap(struct mm_struct *mm)
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
+
free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
@@ -2308,7 +2323,7 @@ int install_special_mapping(struct mm_struct *mm,
mm->total_vm += len >> PAGE_SHIFT;
- perf_counter_mmap(vma);
+ perf_event_mmap(vma);
return 0;
}
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
new file mode 100644
index 00000000000..ded9081f402
--- /dev/null
+++ b/mm/mmu_context.c
@@ -0,0 +1,58 @@
+/* Copyright (C) 2009 Red Hat, Inc.
+ *
+ * See ../COPYING for licensing terms.
+ */
+
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/sched.h>
+
+#include <asm/mmu_context.h>
+
+/*
+ * use_mm
+ * Makes the calling kernel thread take on the specified
+ * mm context.
+ * Called by the retry thread execute retries within the
+ * iocb issuer's mm context, so that copy_from/to_user
+ * operations work seamlessly for aio.
+ * (Note: this routine is intended to be called only
+ * from a kernel thread context)
+ */
+void use_mm(struct mm_struct *mm)
+{
+ struct mm_struct *active_mm;
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+ active_mm = tsk->active_mm;
+ if (active_mm != mm) {
+ atomic_inc(&mm->mm_count);
+ tsk->active_mm = mm;
+ }
+ tsk->mm = mm;
+ switch_mm(active_mm, mm, tsk);
+ task_unlock(tsk);
+
+ if (active_mm != mm)
+ mmdrop(active_mm);
+}
+
+/*
+ * unuse_mm
+ * Reverses the effect of use_mm, i.e. releases the
+ * specified mm context which was earlier taken on
+ * by the calling kernel thread
+ * (Note: this routine is intended to be called only
+ * from a kernel thread context)
+ */
+void unuse_mm(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+ tsk->mm = NULL;
+ /* active_mm is still 'mm' */
+ enter_lazy_tlb(mm, tsk);
+ task_unlock(tsk);
+}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5f4ef0250be..7e33f2cb3c7 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
return young;
}
+void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
+ pte_t pte)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->change_pte)
+ mn->ops->change_pte(mn, mm, address, pte);
+ /*
+ * Some drivers don't have change_pte,
+ * so we must call invalidate_page in that case.
+ */
+ else if (mn->ops->invalidate_page)
+ mn->ops->invalidate_page(mn, mm, address);
+ }
+ rcu_read_unlock();
+}
+
void __mmu_notifier_invalidate_page(struct mm_struct *mm,
unsigned long address)
{
diff --git a/mm/mprotect.c b/mm/mprotect.c
index d80311baeb2..8bc969d8112 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,7 +23,7 @@
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
goto out;
- perf_counter_mmap(vma);
+ perf_event_mmap(vma);
nstart = tmp;
if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index a39b7b91be4..20a07dba6be 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -11,6 +11,7 @@
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/shm.h>
+#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
@@ -174,6 +175,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long excess = 0;
unsigned long hiwater_vm;
int split = 0;
+ int err;
/*
* We'd prefer to avoid failure later on in do_munmap:
@@ -182,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
+ /*
+ * Advise KSM to break any KSM pages in the area to be moved:
+ * it would be confusing if they were to turn up at the new
+ * location, where they happen to coincide with different KSM
+ * pages recently unmapped. But leave vma->vm_flags as it was,
+ * so KSM can come around to merge on vma and new_vma afterwards.
+ */
+ err = ksm_madvise(vma, old_addr, old_addr + old_len,
+ MADV_UNMERGEABLE, &vm_flags);
+ if (err)
+ return err;
+
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
if (!new_vma)
diff --git a/mm/nommu.c b/mm/nommu.c
index 66e81e7e9fe..1a4473faac4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -33,6 +33,7 @@
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
#include "internal.h"
static inline __attribute__((format(printf, 1, 2)))
@@ -56,8 +57,6 @@ void no_printk(const char *fmt, ...)
no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
#endif
-#include "internal.h"
-
void *high_memory;
struct page *mem_map;
unsigned long max_mapnr;
@@ -170,21 +169,20 @@ unsigned int kobjsize(const void *objp)
}
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int nr_pages, int flags,
+ unsigned long start, int nr_pages, int foll_flags,
struct page **pages, struct vm_area_struct **vmas)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
int i;
- int write = !!(flags & GUP_FLAGS_WRITE);
- int force = !!(flags & GUP_FLAGS_FORCE);
- int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
/* calculate required read or write permissions.
- * - if 'force' is set, we only require the "MAY" flags.
+ * If FOLL_FORCE is set, we only require the "MAY" flags.
*/
- vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
- vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+ vm_flags = (foll_flags & FOLL_WRITE) ?
+ (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= (foll_flags & FOLL_FORCE) ?
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
for (i = 0; i < nr_pages; i++) {
vma = find_vma(mm, start);
@@ -192,8 +190,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
goto finish_or_fault;
/* protect what we can, including chardevs */
- if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
- (!ignore && !(vm_flags & vma->vm_flags)))
+ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+ !(vm_flags & vma->vm_flags))
goto finish_or_fault;
if (pages) {
@@ -212,7 +210,6 @@ finish_or_fault:
return i ? : -EFAULT;
}
-
/*
* get a list of pages in an address range belonging to the specified process
* and indicate the VMA that covers each page
@@ -227,9 +224,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
int flags = 0;
if (write)
- flags |= GUP_FLAGS_WRITE;
+ flags |= FOLL_WRITE;
if (force)
- flags |= GUP_FLAGS_FORCE;
+ flags |= FOLL_FORCE;
return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
}
@@ -627,6 +624,22 @@ static void put_nommu_region(struct vm_region *region)
}
/*
+ * update protection on a vma
+ */
+static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
+{
+#ifdef CONFIG_MPU
+ struct mm_struct *mm = vma->vm_mm;
+ long start = vma->vm_start & PAGE_MASK;
+ while (start < vma->vm_end) {
+ protect_page(mm, start, flags);
+ start += PAGE_SIZE;
+ }
+ update_protections(mm);
+#endif
+}
+
+/*
* add a VMA into a process's mm_struct in the appropriate place in the list
* and tree and add to the address space's page tree also if not an anonymous
* page
@@ -645,6 +658,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
mm->map_count++;
vma->vm_mm = mm;
+ protect_vma(vma, vma->vm_flags);
+
/* add the VMA to the mapping */
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
@@ -707,6 +722,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
kenter("%p", vma);
+ protect_vma(vma, 0);
+
mm->map_count--;
if (mm->mmap_cache == vma)
mm->mmap_cache = NULL;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a7b2460e922..ea2147dabba 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
static DEFINE_SPINLOCK(zone_scan_lock);
/* #define DEBUG */
+/*
+ * Is all threads of the target process nodes overlap ours?
+ */
+static int has_intersects_mems_allowed(struct task_struct *tsk)
+{
+ struct task_struct *t;
+
+ t = tsk;
+ do {
+ if (cpuset_mems_allowed_intersects(current, t))
+ return 1;
+ t = next_thread(t);
+ } while (t != tsk);
+
+ return 0;
+}
+
/**
* badness - calculate a numeric value for how bad this task has been
* @p: task struct of which task we should calculate
@@ -58,6 +75,13 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
unsigned long points, cpu_time, run_time;
struct mm_struct *mm;
struct task_struct *child;
+ int oom_adj = p->signal->oom_adj;
+ struct task_cputime task_time;
+ unsigned long utime;
+ unsigned long stime;
+
+ if (oom_adj == OOM_DISABLE)
+ return 0;
task_lock(p);
mm = p->mm;
@@ -79,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
/*
* swapoff can easily use up all memory, so kill those first.
*/
- if (p->flags & PF_SWAPOFF)
+ if (p->flags & PF_OOM_ORIGIN)
return ULONG_MAX;
/*
@@ -102,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* of seconds. There is no particular reason for this other than
* that it turned out to work very well in practice.
*/
- cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
- >> (SHIFT_HZ + 3);
+ thread_group_cputime(p, &task_time);
+ utime = cputime_to_jiffies(task_time.utime);
+ stime = cputime_to_jiffies(task_time.stime);
+ cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
+
if (uptime >= p->start_time.tv_sec)
run_time = (uptime - p->start_time.tv_sec) >> 10;
@@ -144,19 +171,19 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* because p may have allocated or otherwise mapped memory on
* this node before. However it will be less likely.
*/
- if (!cpuset_mems_allowed_intersects(current, p))
+ if (!has_intersects_mems_allowed(p))
points /= 8;
/*
- * Adjust the score by oomkilladj.
+ * Adjust the score by oom_adj.
*/
- if (p->oomkilladj) {
- if (p->oomkilladj > 0) {
+ if (oom_adj) {
+ if (oom_adj > 0) {
if (!points)
points = 1;
- points <<= p->oomkilladj;
+ points <<= oom_adj;
} else
- points >>= -(p->oomkilladj);
+ points >>= -(oom_adj);
}
#ifdef DEBUG
@@ -200,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
static struct task_struct *select_bad_process(unsigned long *ppoints,
struct mem_cgroup *mem)
{
- struct task_struct *g, *p;
+ struct task_struct *p;
struct task_struct *chosen = NULL;
struct timespec uptime;
*ppoints = 0;
do_posix_clock_monotonic_gettime(&uptime);
- do_each_thread(g, p) {
+ for_each_process(p) {
unsigned long points;
/*
@@ -251,7 +278,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
*ppoints = ULONG_MAX;
}
- if (p->oomkilladj == OOM_DISABLE)
+ if (p->signal->oom_adj == OOM_DISABLE)
continue;
points = badness(p, uptime.tv_sec);
@@ -259,7 +286,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
chosen = p;
*ppoints = points;
}
- } while_each_thread(g, p);
+ }
return chosen;
}
@@ -304,7 +331,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
}
printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
- get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
+ get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
p->comm);
task_unlock(p);
} while_each_thread(g, p);
@@ -346,11 +373,6 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
static int oom_kill_task(struct task_struct *p)
{
- struct mm_struct *mm;
- struct task_struct *g, *q;
-
- mm = p->mm;
-
/* WARNING: mm may not be dereferenced since we did not obtain its
* value from get_task_mm(p). This is OK since all we need to do is
* compare mm to q->mm below.
@@ -359,30 +381,11 @@ static int oom_kill_task(struct task_struct *p)
* change to NULL at any time since we do not hold task_lock(p).
* However, this is of no concern to us.
*/
-
- if (mm == NULL)
+ if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
return 1;
- /*
- * Don't kill the process if any threads are set to OOM_DISABLE
- */
- do_each_thread(g, q) {
- if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
- return 1;
- } while_each_thread(g, q);
-
__oom_kill_task(p, 1);
- /*
- * kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group. Don't let them have access
- * to memory reserves though, otherwise we might deplete all memory.
- */
- do_each_thread(g, q) {
- if (q->mm == mm && !same_thread_group(q, p))
- force_sig(SIGKILL, q);
- } while_each_thread(g, q);
-
return 0;
}
@@ -394,8 +397,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
if (printk_ratelimit()) {
printk(KERN_WARNING "%s invoked oom-killer: "
- "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
- current->comm, gfp_mask, order, current->oomkilladj);
+ "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
+ current->comm, gfp_mask, order,
+ current->signal->oom_adj);
task_lock(current);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 25e7770309b..5f378dd5880 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -315,7 +315,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
- spin_lock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
@@ -327,7 +327,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
ret = -EINVAL;
}
}
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
return ret;
}
@@ -339,14 +339,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
if (max_ratio > 100)
return -EINVAL;
- spin_lock(&bdi_lock);
+ spin_lock_bh(&bdi_lock);
if (bdi->min_ratio > max_ratio) {
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
}
- spin_unlock(&bdi_lock);
+ spin_unlock_bh(&bdi_lock);
return ret;
}
@@ -380,7 +380,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
struct zone *z =
&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
- x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
+ x += zone_page_state(z, NR_FREE_PAGES) +
+ zone_reclaimable_pages(z);
}
/*
* Make sure that the number of highmem pages is never larger
@@ -404,7 +405,7 @@ unsigned long determine_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
+ x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -485,6 +486,7 @@ static void balance_dirty_pages(struct address_space *mapping)
unsigned long bdi_thresh;
unsigned long pages_written = 0;
unsigned long write_chunk = sync_writeback_pages();
+ unsigned long pause = 1;
struct backing_dev_info *bdi = mapping->backing_dev_info;
@@ -561,7 +563,15 @@ static void balance_dirty_pages(struct address_space *mapping)
if (pages_written >= write_chunk)
break; /* We've done our duty */
- schedule_timeout(1);
+ schedule_timeout_interruptible(pause);
+
+ /*
+ * Increase the delay for each loop, up to our previous
+ * default of taking a 100ms nap.
+ */
+ pause <<= 1;
+ if (pause > HZ / 10)
+ pause = HZ / 10;
}
if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -582,16 +592,8 @@ static void balance_dirty_pages(struct address_space *mapping)
if ((laptop_mode && pages_written) ||
(!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
+ global_page_state(NR_UNSTABLE_NFS))
- > background_thresh))) {
- struct writeback_control wbc = {
- .bdi = bdi,
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = nr_writeback,
- };
-
-
- bdi_start_writeback(&wbc);
- }
+ > background_thresh)))
+ bdi_start_writeback(bdi, nr_writeback);
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -604,6 +606,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
}
}
+static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
* @mapping: address_space which was dirtied
@@ -621,7 +625,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
- static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
unsigned long ratelimit;
unsigned long *p;
@@ -634,7 +637,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
* tasks in balance_dirty_pages(). Period.
*/
preempt_disable();
- p = &__get_cpu_var(ratelimits);
+ p = &__get_cpu_var(bdp_ratelimits);
*p += nr_pages_dirtied;
if (unlikely(*p >= ratelimit)) {
*p = 0;
@@ -1019,12 +1022,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
if (wbc->nr_to_write <= 0)
return 0;
- wbc->for_writepages = 1;
if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
else
ret = generic_writepages(mapping, wbc);
- wbc->for_writepages = 0;
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a0de15f4698..5717f27a070 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
+#include <trace/events/kmem.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -71,7 +72,6 @@ EXPORT_SYMBOL(node_states);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
-unsigned long highest_memmap_pfn __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -123,8 +123,8 @@ static char * const zone_names[MAX_NR_ZONES] = {
int min_free_kbytes = 1024;
-unsigned long __meminitdata nr_kernel_pages;
-unsigned long __meminitdata nr_all_pages;
+static unsigned long __meminitdata nr_kernel_pages;
+static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@ -510,7 +510,7 @@ static inline int free_pages_check(struct page *page)
}
/*
- * Frees a list of pages.
+ * Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
@@ -520,22 +520,42 @@ static inline int free_pages_check(struct page *page)
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
-static void free_pages_bulk(struct zone *zone, int count,
- struct list_head *list, int order)
+static void free_pcppages_bulk(struct zone *zone, int count,
+ struct per_cpu_pages *pcp)
{
+ int migratetype = 0;
+ int batch_free = 0;
+
spin_lock(&zone->lock);
zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
zone->pages_scanned = 0;
- __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
- while (count--) {
+ __mod_zone_page_state(zone, NR_FREE_PAGES, count);
+ while (count) {
struct page *page;
+ struct list_head *list;
- VM_BUG_ON(list_empty(list));
- page = list_entry(list->prev, struct page, lru);
- /* have to delete it as __free_one_page list manipulates */
- list_del(&page->lru);
- __free_one_page(page, zone, order, page_private(page));
+ /*
+ * Remove pages from lists in a round-robin fashion. A
+ * batch_free count is maintained that is incremented when an
+ * empty list is encountered. This is so more pages are freed
+ * off fuller lists instead of spinning excessively around empty
+ * lists
+ */
+ do {
+ batch_free++;
+ if (++migratetype == MIGRATE_PCPTYPES)
+ migratetype = 0;
+ list = &pcp->lists[migratetype];
+ } while (list_empty(list));
+
+ do {
+ page = list_entry(list->prev, struct page, lru);
+ /* must delete as __free_one_page list manipulates */
+ list_del(&page->lru);
+ __free_one_page(page, zone, 0, migratetype);
+ trace_mm_page_pcpu_drain(page, 0, migratetype);
+ } while (--count && --batch_free && !list_empty(list));
}
spin_unlock(&zone->lock);
}
@@ -557,7 +577,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
unsigned long flags;
int i;
int bad = 0;
- int wasMlocked = TestClearPageMlocked(page);
+ int wasMlocked = __TestClearPageMlocked(page);
kmemcheck_free_shadow(page, order);
@@ -783,6 +803,17 @@ static int move_freepages_block(struct zone *zone, struct page *page,
return move_freepages(zone, start_page, end_page, migratetype);
}
+static void change_pageblock_range(struct page *pageblock_page,
+ int start_order, int migratetype)
+{
+ int nr_pageblocks = 1 << (start_order - pageblock_order);
+
+ while (nr_pageblocks--) {
+ set_pageblock_migratetype(pageblock_page, migratetype);
+ pageblock_page += pageblock_nr_pages;
+ }
+}
+
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
@@ -836,11 +867,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
list_del(&page->lru);
rmv_page_order(page);
- if (current_order == pageblock_order)
- set_pageblock_migratetype(page,
+ /* Take ownership for orders >= pageblock_order */
+ if (current_order >= pageblock_order)
+ change_pageblock_range(page, current_order,
start_migratetype);
expand(zone, page, order, current_order, area, migratetype);
+
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, migratetype);
+
return page;
}
}
@@ -874,6 +910,7 @@ retry_reserve:
}
}
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
@@ -934,7 +971,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
to_drain = pcp->batch;
else
to_drain = pcp->count;
- free_pages_bulk(zone, to_drain, &pcp->list, 0);
+ free_pcppages_bulk(zone, to_drain, pcp);
pcp->count -= to_drain;
local_irq_restore(flags);
}
@@ -960,7 +997,7 @@ static void drain_pages(unsigned int cpu)
pcp = &pset->pcp;
local_irq_save(flags);
- free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
local_irq_restore(flags);
}
@@ -1026,7 +1063,8 @@ static void free_hot_cold_page(struct page *page, int cold)
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
- int wasMlocked = TestClearPageMlocked(page);
+ int migratetype;
+ int wasMlocked = __TestClearPageMlocked(page);
kmemcheck_free_shadow(page, 0);
@@ -1043,35 +1081,49 @@ static void free_hot_cold_page(struct page *page, int cold)
kernel_map_pages(page, 1, 0);
pcp = &zone_pcp(zone, get_cpu())->pcp;
- set_page_private(page, get_pageblock_migratetype(page));
+ migratetype = get_pageblock_migratetype(page);
+ set_page_private(page, migratetype);
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
__count_vm_event(PGFREE);
+ /*
+ * We only track unmovable, reclaimable and movable on pcp lists.
+ * Free ISOLATE pages back to the allocator because they are being
+ * offlined but treat RESERVE as movable pages so we can get those
+ * areas back if necessary. Otherwise, we may have to free
+ * excessively into the page allocator
+ */
+ if (migratetype >= MIGRATE_PCPTYPES) {
+ if (unlikely(migratetype == MIGRATE_ISOLATE)) {
+ free_one_page(zone, page, 0, migratetype);
+ goto out;
+ }
+ migratetype = MIGRATE_MOVABLE;
+ }
+
if (cold)
- list_add_tail(&page->lru, &pcp->list);
+ list_add_tail(&page->lru, &pcp->lists[migratetype]);
else
- list_add(&page->lru, &pcp->list);
+ list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
- free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ free_pcppages_bulk(zone, pcp->batch, pcp);
pcp->count -= pcp->batch;
}
+
+out:
local_irq_restore(flags);
put_cpu();
}
void free_hot_page(struct page *page)
{
+ trace_mm_page_free_direct(page, 0);
free_hot_cold_page(page, 0);
}
-void free_cold_page(struct page *page)
-{
- free_hot_cold_page(page, 1);
-}
-
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
@@ -1119,35 +1171,23 @@ again:
cpu = get_cpu();
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
+ struct list_head *list;
pcp = &zone_pcp(zone, cpu)->pcp;
+ list = &pcp->lists[migratetype];
local_irq_save(flags);
- if (!pcp->count) {
- pcp->count = rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list,
+ if (list_empty(list)) {
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, list,
migratetype, cold);
- if (unlikely(!pcp->count))
+ if (unlikely(list_empty(list)))
goto failed;
}
- /* Find a page of the appropriate migrate type */
- if (cold) {
- list_for_each_entry_reverse(page, &pcp->list, lru)
- if (page_private(page) == migratetype)
- break;
- } else {
- list_for_each_entry(page, &pcp->list, lru)
- if (page_private(page) == migratetype)
- break;
- }
-
- /* Allocate more to the pcp list if necessary */
- if (unlikely(&page->lru == &pcp->list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list,
- migratetype, cold);
- page = list_entry(pcp->list.next, struct page, lru);
- }
+ if (cold)
+ page = list_entry(list->prev, struct page, lru);
+ else
+ page = list_entry(list->next, struct page, lru);
list_del(&page->lru);
pcp->count--;
@@ -1627,10 +1667,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
-
- /*
- * The task's cpuset might have expanded its set of allowable nodes
- */
p->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
@@ -1765,6 +1801,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
wake_all_kswapd(order, zonelist, high_zoneidx);
+restart:
/*
* OK, we're below the kswapd watermark and have kicked background
* reclaim. Now things get more complex, so set up alloc_flags according
@@ -1772,7 +1809,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
-restart:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -1907,6 +1943,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
+ trace_mm_page_alloc(page, order, gfp_mask, migratetype);
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -1916,44 +1953,41 @@ EXPORT_SYMBOL(__alloc_pages_nodemask);
*/
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
- struct page * page;
+ struct page *page;
+
+ /*
+ * __get_free_pages() returns a 32-bit address, which cannot represent
+ * a highmem page
+ */
+ VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
+
page = alloc_pages(gfp_mask, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}
-
EXPORT_SYMBOL(__get_free_pages);
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
- struct page * page;
-
- /*
- * get_zeroed_page() returns a 32-bit address, which cannot represent
- * a highmem page
- */
- VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
-
- page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
- if (page)
- return (unsigned long) page_address(page);
- return 0;
+ return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
}
-
EXPORT_SYMBOL(get_zeroed_page);
void __pagevec_free(struct pagevec *pvec)
{
int i = pagevec_count(pvec);
- while (--i >= 0)
+ while (--i >= 0) {
+ trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
free_hot_cold_page(pvec->pages[i], pvec->cold);
+ }
}
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
+ trace_mm_page_free_direct(page, order);
if (order == 0)
free_hot_page(page);
else
@@ -2128,23 +2162,28 @@ void show_free_areas(void)
}
}
- printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
- " inactive_file:%lu"
+ printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+ " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu"
- " dirty:%lu writeback:%lu unstable:%lu\n"
- " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
+ " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
+ " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
global_page_state(NR_ACTIVE_ANON),
- global_page_state(NR_ACTIVE_FILE),
global_page_state(NR_INACTIVE_ANON),
+ global_page_state(NR_ISOLATED_ANON),
+ global_page_state(NR_ACTIVE_FILE),
global_page_state(NR_INACTIVE_FILE),
+ global_page_state(NR_ISOLATED_FILE),
global_page_state(NR_UNEVICTABLE),
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
+ nr_blockdev_pages(),
global_page_state(NR_FREE_PAGES),
- global_page_state(NR_SLAB_RECLAIMABLE) +
- global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_page_state(NR_SLAB_RECLAIMABLE),
+ global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED),
+ global_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE));
@@ -2162,7 +2201,21 @@ void show_free_areas(void)
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
" present:%lukB"
+ " mlocked:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " mapped:%lukB"
+ " shmem:%lukB"
+ " slab_reclaimable:%lukB"
+ " slab_unreclaimable:%lukB"
+ " kernel_stack:%lukB"
+ " pagetables:%lukB"
+ " unstable:%lukB"
+ " bounce:%lukB"
+ " writeback_tmp:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
@@ -2176,7 +2229,22 @@ void show_free_areas(void)
K(zone_page_state(zone, NR_ACTIVE_FILE)),
K(zone_page_state(zone, NR_INACTIVE_FILE)),
K(zone_page_state(zone, NR_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ISOLATED_ANON)),
+ K(zone_page_state(zone, NR_ISOLATED_FILE)),
K(zone->present_pages),
+ K(zone_page_state(zone, NR_MLOCK)),
+ K(zone_page_state(zone, NR_FILE_DIRTY)),
+ K(zone_page_state(zone, NR_WRITEBACK)),
+ K(zone_page_state(zone, NR_FILE_MAPPED)),
+ K(zone_page_state(zone, NR_SHMEM)),
+ K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
+ K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
+ zone_page_state(zone, NR_KERNEL_STACK) *
+ THREAD_SIZE / 1024,
+ K(zone_page_state(zone, NR_PAGETABLE)),
+ K(zone_page_state(zone, NR_UNSTABLE_NFS)),
+ K(zone_page_state(zone, NR_BOUNCE)),
+ K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
zone->pages_scanned,
(zone_is_all_unreclaimable(zone) ? "yes" : "no")
);
@@ -2783,7 +2851,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
{
unsigned long start_pfn, pfn, end_pfn;
struct page *page;
- unsigned long reserve, block_migratetype;
+ unsigned long block_migratetype;
+ int reserve;
/* Get the start pfn, end pfn and the number of blocks to reserve */
start_pfn = zone->zone_start_pfn;
@@ -2791,6 +2860,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
pageblock_order;
+ /*
+ * Reserve blocks are generally in place to help high-order atomic
+ * allocations that are short-lived. A min_free_kbytes value that
+ * would result in more than 2 reserve blocks for atomic allocations
+ * is assumed to be in place to help anti-fragmentation for the
+ * future allocation of hugepages at runtime.
+ */
+ reserve = min(2, reserve);
+
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
if (!pfn_valid(pfn))
continue;
@@ -2961,6 +3039,7 @@ static int zone_batchsize(struct zone *zone)
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
struct per_cpu_pages *pcp;
+ int migratetype;
memset(p, 0, sizeof(*p));
@@ -2968,7 +3047,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
pcp->count = 0;
pcp->high = 6 * batch;
pcp->batch = max(1UL, 1 * batch);
- INIT_LIST_HEAD(&pcp->list);
+ for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
+ INIT_LIST_HEAD(&pcp->lists[migratetype]);
}
/*
@@ -3146,6 +3226,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
return 0;
}
+static int __zone_pcp_update(void *data)
+{
+ struct zone *zone = data;
+ int cpu;
+ unsigned long batch = zone_batchsize(zone), flags;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
+
+ pset = zone_pcp(zone, cpu);
+ pcp = &pset->pcp;
+
+ local_irq_save(flags);
+ free_pcppages_bulk(zone, pcp->count, pcp);
+ setup_pageset(pset, batch);
+ local_irq_restore(flags);
+ }
+ return 0;
+}
+
+void zone_pcp_update(struct zone *zone)
+{
+ stop_machine(__zone_pcp_update, zone, NULL);
+}
+
static __meminit void zone_pcp_init(struct zone *zone)
{
int cpu;
@@ -3720,7 +3826,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone_pcp_init(zone);
for_each_lru(l) {
INIT_LIST_HEAD(&zone->lru[l].list);
- zone->lru[l].nr_saved_scan = 0;
+ zone->reclaim_stat.nr_saved_scan[l] = 0;
}
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
@@ -4509,7 +4615,7 @@ void setup_per_zone_wmarks(void)
calculate_totalreserve_pages();
}
-/**
+/*
* The inactive anon list should be small enough that the VM never has to
* do too much work, but large enough that each inactive page has a chance
* to be referenced again before it is swapped out.
@@ -4732,7 +4838,14 @@ void *__init alloc_large_system_hash(const char *tablename,
numentries <<= (PAGE_SHIFT - scale);
/* Make sure we've got at least a 0-order allocation.. */
- if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ if (unlikely(flags & HASH_SMALL)) {
+ /* Makes no sense without HASH_EARLY */
+ WARN_ON(!(flags & HASH_EARLY));
+ if (!(numentries >> *_hash_shift)) {
+ numentries = 1UL << *_hash_shift;
+ BUG_ON(!numentries);
+ }
+ } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
numentries = PAGE_SIZE / bucketsize;
}
numentries = roundup_pow_of_two(numentries);
@@ -4874,13 +4987,16 @@ int set_migratetype_isolate(struct page *page)
struct zone *zone;
unsigned long flags;
int ret = -EBUSY;
+ int zone_idx;
zone = page_zone(page);
+ zone_idx = zone_idx(zone);
spin_lock_irqsave(&zone->lock, flags);
/*
* In future, more migrate types will be able to be isolation target.
*/
- if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+ if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
+ zone_idx != ZONE_MOVABLE)
goto out;
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
move_freepages_block(zone, page, MIGRATE_ISOLATE);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index f22b4ebbd8d..3d535d59482 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -116,10 +116,16 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
nid = page_to_nid(pfn_to_page(pfn));
table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
VM_BUG_ON(!slab_is_available());
- base = kmalloc_node(table_size,
+ if (node_state(nid, N_HIGH_MEMORY)) {
+ base = kmalloc_node(table_size,
GFP_KERNEL | __GFP_NOWARN, nid);
- if (!base)
- base = vmalloc_node(table_size, nid);
+ if (!base)
+ base = vmalloc_node(table_size, nid);
+ } else {
+ base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!base)
+ base = vmalloc(table_size);
+ }
} else {
/*
* We don't have to allocate page_cgroup again, but
diff --git a/mm/percpu.c b/mm/percpu.c
index 3311c8919f3..43d8cacfdaa 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
*
* This is percpu allocator which can handle both static and dynamic
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
- * chunk is consisted of nr_cpu_ids units and the first chunk is used
- * for static percpu variables in the kernel image (special boot time
- * alloc/init handling necessary as these areas need to be brought up
- * before allocation services are running). Unit grows as necessary
- * and all units grow or shrink in unison. When a chunk is filled up,
- * another chunk is allocated. ie. in vmalloc area
+ * chunk is consisted of boot-time determined number of units and the
+ * first chunk is used for static percpu variables in the kernel image
+ * (special boot time alloc/init handling necessary as these areas
+ * need to be brought up before allocation services are running).
+ * Unit grows as necessary and all units grow or shrink in unison.
+ * When a chunk is filled up, another chunk is allocated. ie. in
+ * vmalloc area
*
* c0 c1 c2
* ------------------- ------------------- ------------
@@ -22,11 +23,13 @@
*
* Allocation is done in offset-size areas of single unit space. Ie,
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
- * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
- * percpu base registers pcpu_unit_size apart.
+ * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
+ * cpus. On NUMA, the mapping can be non-linear and even sparse.
+ * Percpu access can be done by configuring percpu base registers
+ * according to cpu to unit mapping and pcpu_unit_size.
*
- * There are usually many small percpu allocations many of them as
- * small as 4 bytes. The allocator organizes chunks into lists
+ * There are usually many small percpu allocations many of them being
+ * as small as 4 bytes. The allocator organizes chunks into lists
* according to free size and tries to allocate from the fullest one.
* Each chunk keeps the maximum contiguous area size hint which is
* guaranteed to be eqaul to or larger than the maximum contiguous
@@ -43,7 +46,7 @@
*
* To use this allocator, arch code should do the followings.
*
- * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
*
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back if they need to be
@@ -55,7 +58,9 @@
#include <linux/bitmap.h>
#include <linux/bootmem.h>
+#include <linux/err.h>
#include <linux/list.h>
+#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -89,25 +94,38 @@ struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
int free_size; /* free bytes in the chunk */
int contig_hint; /* max contiguous size hint */
- struct vm_struct *vm; /* mapped vmalloc region */
+ void *base_addr; /* base address of this chunk */
int map_used; /* # of map entries used */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
+ struct vm_struct **vms; /* mapped vmalloc regions */
bool immutable; /* no [de]population allowed */
- struct page **page; /* points to page array */
- struct page *page_ar[]; /* #cpus * UNIT_PAGES */
+ unsigned long populated[]; /* populated bitmap */
};
static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
-static int pcpu_chunk_size __read_mostly;
+static int pcpu_nr_units __read_mostly;
+static int pcpu_atom_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;
+/* cpus with the lowest and highest unit numbers */
+static unsigned int pcpu_first_unit_cpu __read_mostly;
+static unsigned int pcpu_last_unit_cpu __read_mostly;
+
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
+static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
+
+/* group information, used for vm allocation */
+static int pcpu_nr_groups __read_mostly;
+static const unsigned long *pcpu_group_offsets __read_mostly;
+static const size_t *pcpu_group_sizes __read_mostly;
+
/*
* The first chunk which always exists. Note that unlike other
* chunks, this one can be allocated and mapped in several different
@@ -129,9 +147,9 @@ static int pcpu_reserved_chunk_limit;
* Synchronization rules.
*
* There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
- * protects allocation/reclaim paths, chunks and chunk->page arrays.
- * The latter is a spinlock and protects the index data structures -
- * chunk slots, chunks and area maps in chunks.
+ * protects allocation/reclaim paths, chunks, populated bitmap and
+ * vmalloc mapping. The latter is a spinlock and protects the index
+ * data structures - chunk slots, chunks and area maps in chunks.
*
* During allocation, pcpu_alloc_mutex is kept locked all the time and
* pcpu_lock is grabbed and released as necessary. All actual memory
@@ -178,31 +196,23 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
- return cpu * pcpu_unit_pages + page_idx;
-}
-
-static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
- unsigned int cpu, int page_idx)
-{
- return &chunk->page[pcpu_page_idx(cpu, page_idx)];
+ return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
- return (unsigned long)chunk->vm->addr +
- (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
+ return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
+ (page_idx << PAGE_SHIFT);
}
-static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
- int page_idx)
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
{
- /*
- * Any possible cpu id can be used here, so there's no need to
- * worry about preemption or cpu hotplug.
- */
- return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(),
- page_idx) != NULL;
+ /* must not be used on pre-mapped chunk */
+ WARN_ON(chunk->immutable);
+
+ return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}
/* set the pointer to a chunk in a page struct */
@@ -217,6 +227,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
return (struct pcpu_chunk *)page->index;
}
+static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+ *rs = find_next_zero_bit(chunk->populated, end, *rs);
+ *re = find_next_bit(chunk->populated, end, *rs + 1);
+}
+
+static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+ *rs = find_next_bit(chunk->populated, end, *rs);
+ *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
+}
+
+/*
+ * (Un)populated page region iterators. Iterate over (un)populated
+ * page regions betwen @start and @end in @chunk. @rs and @re should
+ * be integer variables and will be set to start and end page index of
+ * the current region.
+ */
+#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+
+#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+
/**
* pcpu_mem_alloc - allocate memory
* @size: bytes to allocate
@@ -292,10 +330,10 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
*/
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
- void *first_start = pcpu_first_chunk->vm->addr;
+ void *first_start = pcpu_first_chunk->base_addr;
/* is it in the first chunk? */
- if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
+ if (addr >= first_start && addr < first_start + pcpu_unit_size) {
/* is it in the reserved area? */
if (addr < first_start + pcpu_reserved_chunk_limit)
return pcpu_reserved_chunk;
@@ -309,7 +347,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
* space. Note that any possible cpu id can be used here, so
* there's no need to worry about preemption or cpu hotplug.
*/
- addr += raw_smp_processor_id() * pcpu_unit_size;
+ addr += pcpu_unit_offsets[raw_smp_processor_id()];
return pcpu_get_page_chunk(vmalloc_to_page(addr));
}
@@ -558,125 +596,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
}
/**
- * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
* @chunk: chunk of interest
- * @page_start: page index of the first page to unmap
- * @page_end: page index of the last page to unmap + 1
- * @flush_tlb: whether to flush tlb or not
+ * @bitmapp: output parameter for bitmap
+ * @may_alloc: may allocate the array
*
- * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
- * If @flush is true, vcache is flushed before unmapping and tlb
- * after.
+ * Returns pointer to array of pointers to struct page and bitmap,
+ * both of which can be indexed with pcpu_page_idx(). The returned
+ * array is cleared to zero and *@bitmapp is copied from
+ * @chunk->populated. Note that there is only one array and bitmap
+ * and access exclusion is the caller's responsibility.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
+ * Otherwise, don't care.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL on failure.
*/
-static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
- bool flush_tlb)
+static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
+ unsigned long **bitmapp,
+ bool may_alloc)
{
- unsigned int last = nr_cpu_ids - 1;
- unsigned int cpu;
+ static struct page **pages;
+ static unsigned long *bitmap;
+ size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+ size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
+ sizeof(unsigned long);
+
+ if (!pages || !bitmap) {
+ if (may_alloc && !pages)
+ pages = pcpu_mem_alloc(pages_size);
+ if (may_alloc && !bitmap)
+ bitmap = pcpu_mem_alloc(bitmap_size);
+ if (!pages || !bitmap)
+ return NULL;
+ }
- /* unmap must not be done on immutable chunk */
- WARN_ON(chunk->immutable);
+ memset(pages, 0, pages_size);
+ bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
- /*
- * Each flushing trial can be very expensive, issue flush on
- * the whole region at once rather than doing it for each cpu.
- * This could be an overkill but is more scalable.
- */
- flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
+ *bitmapp = bitmap;
+ return pages;
+}
- for_each_possible_cpu(cpu)
- unmap_kernel_range_noflush(
- pcpu_chunk_addr(chunk, cpu, page_start),
- (page_end - page_start) << PAGE_SHIFT);
-
- /* ditto as flush_cache_vunmap() */
- if (flush_tlb)
- flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start and @page_end) in @pages for all units.
+ * The pages were allocated for @chunk.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page = pages[pcpu_page_idx(cpu, i)];
+
+ if (page)
+ __free_page(page);
+ }
+ }
}
/**
- * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
- * @chunk: chunk to depopulate
- * @off: offset to the area to depopulate
- * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
- *
- * For each cpu, depopulate and unmap pages [@page_start,@page_end)
- * from @chunk. If @flush is true, vcache is flushed before unmapping
- * and tlb after.
- *
- * CONTEXT:
- * pcpu_alloc_mutex.
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk. Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
*/
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
- bool flush)
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
{
- int page_start = PFN_DOWN(off);
- int page_end = PFN_UP(off + size);
- int unmap_start = -1;
- int uninitialized_var(unmap_end);
+ const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
unsigned int cpu;
int i;
- for (i = page_start; i < page_end; i++) {
- for_each_possible_cpu(cpu) {
- struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+ *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+ if (!*pagep) {
+ pcpu_free_pages(chunk, pages, populated,
+ page_start, page_end);
+ return -ENOMEM;
+ }
+ }
+ }
+ return 0;
+}
- if (!*pagep)
- continue;
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped. Flush cache. As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu. This could be an overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vunmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+ unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+}
- __free_page(*pagep);
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished. The caller should call
+ * proper pre/post flush functions.
+ */
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
- /*
- * If it's partial depopulation, it might get
- * populated or depopulated again. Mark the
- * page gone.
- */
- *pagep = NULL;
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page;
- unmap_start = unmap_start < 0 ? i : unmap_start;
- unmap_end = i + 1;
+ page = pcpu_chunk_page(chunk, cpu, i);
+ WARN_ON(!page);
+ pages[pcpu_page_idx(cpu, i)] = page;
}
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ page_end - page_start);
}
- if (unmap_start >= 0)
- pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+ for (i = page_start; i < page_end; i++)
+ __clear_bit(i, populated);
+}
+
+/**
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
+ * TLB for the regions. This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_tlb_kernel_range(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+static int __pcpu_map_pages(unsigned long addr, struct page **pages,
+ int nr_pages)
+{
+ return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
+ PAGE_KERNEL, pages);
}
/**
- * pcpu_map - map pages into a pcpu_chunk
+ * pcpu_map_pages - map pages into a pcpu_chunk
* @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @populated: populated bitmap
* @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1
*
- * For each cpu, map pages [@page_start,@page_end) into @chunk.
- * vcache is flushed afterwards.
+ * For each cpu, map pages [@page_start,@page_end) into @chunk. The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting corresponding bits in
+ * @chunk->populated bitmap and whatever is necessary for reverse
+ * lookup (addr -> chunk).
*/
-static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
{
- unsigned int last = nr_cpu_ids - 1;
- unsigned int cpu;
- int err;
-
- /* map must not be done on immutable chunk */
- WARN_ON(chunk->immutable);
+ unsigned int cpu, tcpu;
+ int i, err;
for_each_possible_cpu(cpu) {
- err = map_kernel_range_noflush(
- pcpu_chunk_addr(chunk, cpu, page_start),
- (page_end - page_start) << PAGE_SHIFT,
- PAGE_KERNEL,
- pcpu_chunk_pagep(chunk, cpu, page_start));
+ err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ &pages[pcpu_page_idx(cpu, page_start)],
+ page_end - page_start);
if (err < 0)
- return err;
+ goto err;
+ }
+
+ /* mapping successful, link chunk and mark populated */
+ for (i = page_start; i < page_end; i++) {
+ for_each_possible_cpu(cpu)
+ pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+ chunk);
+ __set_bit(i, populated);
}
- /* flush at once, please read comments in pcpu_unmap() */
- flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
return 0;
+
+err:
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+ page_end - page_start);
+ }
+ return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk. If @flush is true, vcache is flushed before unmapping
+ * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ struct page **pages;
+ unsigned long *populated;
+ int rs, re;
+
+ /* quick path, check whether it's empty already */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ if (rs == page_start && re == page_end)
+ return;
+ break;
+ }
+
+ /* immutable chunks can't be depopulated */
+ WARN_ON(chunk->immutable);
+
+ /*
+ * If control reaches here, there must have been at least one
+ * successful population attempt so the temp pages array must
+ * be available now.
+ */
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+ BUG_ON(!pages);
+
+ /* unmap and free */
+ pcpu_pre_unmap_flush(chunk, page_start, page_end);
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+
+ /* no need to flush tlb, vmalloc will handle it lazily */
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
}
/**
@@ -693,58 +933,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
- const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
- int map_start = -1;
- int uninitialized_var(map_end);
+ int free_end = page_start, unmap_end = page_start;
+ struct page **pages;
+ unsigned long *populated;
unsigned int cpu;
- int i;
+ int rs, re, rc;
- for (i = page_start; i < page_end; i++) {
- if (pcpu_chunk_page_occupied(chunk, i)) {
- if (map_start >= 0) {
- if (pcpu_map(chunk, map_start, map_end))
- goto err;
- map_start = -1;
- }
- continue;
- }
+ /* quick path, check whether all pages are already there */
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
+ if (rs == page_start && re == page_end)
+ goto clear;
+ break;
+ }
- map_start = map_start < 0 ? i : map_start;
- map_end = i + 1;
+ /* need to allocate and map pages, this chunk can't be immutable */
+ WARN_ON(chunk->immutable);
- for_each_possible_cpu(cpu) {
- struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
+ if (!pages)
+ return -ENOMEM;
- *pagep = alloc_pages_node(cpu_to_node(cpu),
- alloc_mask, 0);
- if (!*pagep)
- goto err;
- pcpu_set_page_chunk(*pagep, chunk);
- }
+ /* alloc and map */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_free;
+ free_end = re;
}
- if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
- goto err;
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_map_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_unmap;
+ unmap_end = re;
+ }
+ pcpu_post_map_flush(chunk, page_start, page_end);
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+clear:
for_each_possible_cpu(cpu)
- memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
- size);
-
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
return 0;
-err:
- /* likely under heavy memory pressure, give memory back */
- pcpu_depopulate_chunk(chunk, off, size, true);
- return -ENOMEM;
+
+err_unmap:
+ pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+ pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
+err_free:
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+ return rc;
}
static void free_pcpu_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
- if (chunk->vm)
- free_vm_area(chunk->vm);
+ if (chunk->vms)
+ pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
kfree(chunk);
}
@@ -760,10 +1010,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
- chunk->page = chunk->page_ar;
- chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
- if (!chunk->vm) {
+ chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
+ pcpu_nr_groups, pcpu_atom_size,
+ GFP_KERNEL);
+ if (!chunk->vms) {
free_pcpu_chunk(chunk);
return NULL;
}
@@ -771,6 +1022,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
INIT_LIST_HEAD(&chunk->list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
+ chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
return chunk;
}
@@ -860,7 +1112,8 @@ area_found:
mutex_unlock(&pcpu_alloc_mutex);
- return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+ /* return address relative to base address */
+ return __addr_to_pcpu_ptr(chunk->base_addr + off);
fail_unlock:
spin_unlock_irq(&pcpu_lock);
@@ -938,12 +1191,13 @@ static void pcpu_reclaim(struct work_struct *work)
}
spin_unlock_irq(&pcpu_lock);
- mutex_unlock(&pcpu_alloc_mutex);
list_for_each_entry_safe(chunk, next, &todo, list) {
- pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+ pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
free_pcpu_chunk(chunk);
}
+
+ mutex_unlock(&pcpu_alloc_mutex);
}
/**
@@ -968,7 +1222,7 @@ void free_percpu(void *ptr)
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
- off = addr - chunk->vm->addr;
+ off = addr - chunk->base_addr;
pcpu_free_area(chunk, off);
@@ -987,30 +1241,295 @@ void free_percpu(void *ptr)
}
EXPORT_SYMBOL_GPL(free_percpu);
+static inline size_t pcpu_calc_fc_sizes(size_t static_size,
+ size_t reserved_size,
+ ssize_t *dyn_sizep)
+{
+ size_t size_sum;
+
+ size_sum = PFN_ALIGN(static_size + reserved_size +
+ (*dyn_sizep >= 0 ? *dyn_sizep : 0));
+ if (*dyn_sizep != 0)
+ *dyn_sizep = size_sum - static_size - reserved_size;
+
+ return size_sum;
+}
+
/**
- * pcpu_setup_first_chunk - initialize the first percpu chunk
- * @get_page_fn: callback to fetch page pointer
- * @static_size: the size of static percpu area in bytes
+ * pcpu_alloc_alloc_info - allocate percpu allocation info
+ * @nr_groups: the number of groups
+ * @nr_units: the number of units
+ *
+ * Allocate ai which is large enough for @nr_groups groups containing
+ * @nr_units units. The returned ai's groups[0].cpu_map points to the
+ * cpu_map array which is long enough for @nr_units and filled with
+ * NR_CPUS. It's the caller's responsibility to initialize cpu_map
+ * pointer of other groups.
+ *
+ * RETURNS:
+ * Pointer to the allocated pcpu_alloc_info on success, NULL on
+ * failure.
+ */
+struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
+ int nr_units)
+{
+ struct pcpu_alloc_info *ai;
+ size_t base_size, ai_size;
+ void *ptr;
+ int unit;
+
+ base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
+ __alignof__(ai->groups[0].cpu_map[0]));
+ ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
+
+ ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
+ if (!ptr)
+ return NULL;
+ ai = ptr;
+ ptr += base_size;
+
+ ai->groups[0].cpu_map = ptr;
+
+ for (unit = 0; unit < nr_units; unit++)
+ ai->groups[0].cpu_map[unit] = NR_CPUS;
+
+ ai->nr_groups = nr_groups;
+ ai->__ai_size = PFN_ALIGN(ai_size);
+
+ return ai;
+}
+
+/**
+ * pcpu_free_alloc_info - free percpu allocation info
+ * @ai: pcpu_alloc_info to free
+ *
+ * Free @ai which was allocated by pcpu_alloc_alloc_info().
+ */
+void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
+{
+ free_bootmem(__pa(ai), ai->__ai_size);
+}
+
+/**
+ * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
- * @base_addr: mapped address, NULL for auto
- * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ *
+ * This function determines grouping of units, their mappings to cpus
+ * and other parameters considering needed percpu size, allocation
+ * atom size and distances between CPUs.
+ *
+ * Groups are always mutliples of atom size and CPUs which are of
+ * LOCAL_DISTANCE both ways are grouped together and share space for
+ * units in the same group. The returned configuration is guaranteed
+ * to have CPUs on different nodes on different groups and >=75% usage
+ * of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, pointer to the new allocation_info is returned. On
+ * failure, ERR_PTR value is returned.
+ */
+struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+ size_t reserved_size, ssize_t dyn_size,
+ size_t atom_size,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+ static int group_map[NR_CPUS] __initdata;
+ static int group_cnt[NR_CPUS] __initdata;
+ const size_t static_size = __per_cpu_end - __per_cpu_start;
+ int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
+ size_t size_sum, min_unit_size, alloc_size;
+ int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int last_allocs, group, unit;
+ unsigned int cpu, tcpu;
+ struct pcpu_alloc_info *ai;
+ unsigned int *cpu_map;
+
+ /*
+ * Determine min_unit_size, alloc_size and max_upa such that
+ * alloc_size is multiple of atom_size and is the smallest
+ * which can accomodate 4k aligned segments which are equal to
+ * or larger than min_unit_size.
+ */
+ size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+ alloc_size = roundup(min_unit_size, atom_size);
+ upa = alloc_size / min_unit_size;
+ while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ upa--;
+ max_upa = upa;
+
+ /* group cpus according to their proximity */
+ for_each_possible_cpu(cpu) {
+ group = 0;
+ next_group:
+ for_each_possible_cpu(tcpu) {
+ if (cpu == tcpu)
+ break;
+ if (group_map[tcpu] == group && cpu_distance_fn &&
+ (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+ cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+ group++;
+ nr_groups = max(nr_groups, group + 1);
+ goto next_group;
+ }
+ }
+ group_map[cpu] = group;
+ group_cnt[group]++;
+ group_cnt_max = max(group_cnt_max, group_cnt[group]);
+ }
+
+ /*
+ * Expand unit size until address space usage goes over 75%
+ * and then as much as possible without using more address
+ * space.
+ */
+ last_allocs = INT_MAX;
+ for (upa = max_upa; upa; upa--) {
+ int allocs = 0, wasted = 0;
+
+ if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ continue;
+
+ for (group = 0; group < nr_groups; group++) {
+ int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+ allocs += this_allocs;
+ wasted += this_allocs * upa - group_cnt[group];
+ }
+
+ /*
+ * Don't accept if wastage is over 25%. The
+ * greater-than comparison ensures upa==1 always
+ * passes the following check.
+ */
+ if (wasted > num_possible_cpus() / 3)
+ continue;
+
+ /* and then don't consume more memory */
+ if (allocs > last_allocs)
+ break;
+ last_allocs = allocs;
+ best_upa = upa;
+ }
+ upa = best_upa;
+
+ /* allocate and fill alloc_info */
+ for (group = 0; group < nr_groups; group++)
+ nr_units += roundup(group_cnt[group], upa);
+
+ ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
+ if (!ai)
+ return ERR_PTR(-ENOMEM);
+ cpu_map = ai->groups[0].cpu_map;
+
+ for (group = 0; group < nr_groups; group++) {
+ ai->groups[group].cpu_map = cpu_map;
+ cpu_map += roundup(group_cnt[group], upa);
+ }
+
+ ai->static_size = static_size;
+ ai->reserved_size = reserved_size;
+ ai->dyn_size = dyn_size;
+ ai->unit_size = alloc_size / upa;
+ ai->atom_size = atom_size;
+ ai->alloc_size = alloc_size;
+
+ for (group = 0, unit = 0; group_cnt[group]; group++) {
+ struct pcpu_group_info *gi = &ai->groups[group];
+
+ /*
+ * Initialize base_offset as if all groups are located
+ * back-to-back. The caller should update this to
+ * reflect actual allocation.
+ */
+ gi->base_offset = unit * ai->unit_size;
+
+ for_each_possible_cpu(cpu)
+ if (group_map[cpu] == group)
+ gi->cpu_map[gi->nr_units++] = cpu;
+ gi->nr_units = roundup(gi->nr_units, upa);
+ unit += gi->nr_units;
+ }
+ BUG_ON(unit != nr_units);
+
+ return ai;
+}
+
+/**
+ * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
+ * @lvl: loglevel
+ * @ai: allocation info to dump
+ *
+ * Print out information about @ai using loglevel @lvl.
+ */
+static void pcpu_dump_alloc_info(const char *lvl,
+ const struct pcpu_alloc_info *ai)
+{
+ int group_width = 1, cpu_width = 1, width;
+ char empty_str[] = "--------";
+ int alloc = 0, alloc_end = 0;
+ int group, v;
+ int upa, apl; /* units per alloc, allocs per line */
+
+ v = ai->nr_groups;
+ while (v /= 10)
+ group_width++;
+
+ v = num_possible_cpus();
+ while (v /= 10)
+ cpu_width++;
+ empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
+
+ upa = ai->alloc_size / ai->unit_size;
+ width = upa * (cpu_width + 1) + group_width + 3;
+ apl = rounddown_pow_of_two(max(60 / width, 1));
+
+ printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
+ lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
+ ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
+
+ for (group = 0; group < ai->nr_groups; group++) {
+ const struct pcpu_group_info *gi = &ai->groups[group];
+ int unit = 0, unit_end = 0;
+
+ BUG_ON(gi->nr_units % upa);
+ for (alloc_end += gi->nr_units / upa;
+ alloc < alloc_end; alloc++) {
+ if (!(alloc % apl)) {
+ printk("\n");
+ printk("%spcpu-alloc: ", lvl);
+ }
+ printk("[%0*d] ", group_width, group);
+
+ for (unit_end += upa; unit < unit_end; unit++)
+ if (gi->cpu_map[unit] != NR_CPUS)
+ printk("%0*d ", cpu_width,
+ gi->cpu_map[unit]);
+ else
+ printk("%s ", empty_str);
+ }
+ }
+ printk("\n");
+}
+
+/**
+ * pcpu_setup_first_chunk - initialize the first percpu chunk
+ * @ai: pcpu_alloc_info describing how to percpu area is shaped
+ * @base_addr: mapped address
*
* Initialize the first percpu chunk which contains the kernel static
* perpcu area. This function is to be called from arch percpu area
- * setup path. The first two parameters are mandatory. The rest are
- * optional.
- *
- * @get_page_fn() should return pointer to percpu page given cpu
- * number and page number. It should at least return enough pages to
- * cover the static area. The returned pages for static area should
- * have been initialized with valid data. If @unit_size is specified,
- * it can also return pages after the static area. NULL return
- * indicates end of pages for the cpu. Note that @get_page_fn() must
- * return the same number of pages for all cpus.
- *
- * @reserved_size, if non-zero, specifies the amount of bytes to
+ * setup path.
+ *
+ * @ai contains all information necessary to initialize the first
+ * chunk and prime the dynamic percpu allocator.
+ *
+ * @ai->static_size is the size of static percpu area.
+ *
+ * @ai->reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
* the first chunk such that it's available only through reserved
* percpu allocation. This is primarily used to serve module percpu
@@ -1018,22 +1537,29 @@ EXPORT_SYMBOL_GPL(free_percpu);
* limited offset range for symbol relocations to guarantee module
* percpu symbols fall inside the relocatable range.
*
- * @dyn_size, if non-negative, determines the number of bytes
- * available for dynamic allocation in the first chunk. Specifying
- * non-negative value makes percpu leave alone the area beyond
- * @static_size + @reserved_size + @dyn_size.
+ * @ai->dyn_size determines the number of bytes available for dynamic
+ * allocation in the first chunk. The area between @ai->static_size +
+ * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
*
- * @unit_size, if non-negative, specifies unit size and must be
- * aligned to PAGE_SIZE and equal to or larger than @static_size +
- * @reserved_size + if non-negative, @dyn_size.
+ * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
+ * and equal to or larger than @ai->static_size + @ai->reserved_size +
+ * @ai->dyn_size.
*
- * Non-null @base_addr means that the caller already allocated virtual
- * region for the first chunk and mapped it. percpu must not mess
- * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
- * @populate_pte_fn doesn't make any sense.
+ * @ai->atom_size is the allocation atom size and used as alignment
+ * for vm areas.
*
- * @populate_pte_fn is used to populate the pagetable. NULL means the
- * caller already populated the pagetable.
+ * @ai->alloc_size is the allocation size and always multiple of
+ * @ai->atom_size. This is larger than @ai->atom_size if
+ * @ai->unit_size is larger than @ai->atom_size.
+ *
+ * @ai->nr_groups and @ai->groups describe virtual memory layout of
+ * percpu areas. Units which should be colocated are put into the
+ * same group. Dynamic VM areas will be allocated according to these
+ * groupings. If @ai->nr_groups is zero, a single group containing
+ * all units is assumed.
+ *
+ * The caller should have mapped the first chunk at @base_addr and
+ * copied static data to each unit.
*
* If the first chunk ends up with both reserved and dynamic areas, it
* is served by two chunks - one to serve the core static and reserved
@@ -1043,49 +1569,83 @@ EXPORT_SYMBOL_GPL(free_percpu);
* and available for dynamic allocation like any other chunks.
*
* RETURNS:
- * The determined pcpu_unit_size which can be used to initialize
- * percpu access.
+ * 0 on success, -errno on failure.
*/
-size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
- size_t static_size, size_t reserved_size,
- ssize_t dyn_size, ssize_t unit_size,
- void *base_addr,
- pcpu_populate_pte_fn_t populate_pte_fn)
+int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+ void *base_addr)
{
- static struct vm_struct first_vm;
static int smap[2], dmap[2];
- size_t size_sum = static_size + reserved_size +
- (dyn_size >= 0 ? dyn_size : 0);
+ size_t dyn_size = ai->dyn_size;
+ size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
struct pcpu_chunk *schunk, *dchunk = NULL;
+ unsigned long *group_offsets;
+ size_t *group_sizes;
+ unsigned long *unit_off;
unsigned int cpu;
- int nr_pages;
- int err, i;
+ int *unit_map;
+ int group, unit, i;
- /* santiy checks */
+ /* sanity checks */
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
- BUG_ON(!static_size);
- if (unit_size >= 0) {
- BUG_ON(unit_size < size_sum);
- BUG_ON(unit_size & ~PAGE_MASK);
- BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
- } else
- BUG_ON(base_addr);
- BUG_ON(base_addr && populate_pte_fn);
-
- if (unit_size >= 0)
- pcpu_unit_pages = unit_size >> PAGE_SHIFT;
- else
- pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
- PFN_UP(size_sum));
+ BUG_ON(ai->nr_groups <= 0);
+ BUG_ON(!ai->static_size);
+ BUG_ON(!base_addr);
+ BUG_ON(ai->unit_size < size_sum);
+ BUG_ON(ai->unit_size & ~PAGE_MASK);
+ BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+
+ pcpu_dump_alloc_info(KERN_DEBUG, ai);
+
+ /* process group information and build config tables accordingly */
+ group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
+ group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
+ unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
+ unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+ unit_map[cpu] = NR_CPUS;
+ pcpu_first_unit_cpu = NR_CPUS;
+
+ for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
+ const struct pcpu_group_info *gi = &ai->groups[group];
+
+ group_offsets[group] = gi->base_offset;
+ group_sizes[group] = gi->nr_units * ai->unit_size;
+
+ for (i = 0; i < gi->nr_units; i++) {
+ cpu = gi->cpu_map[i];
+ if (cpu == NR_CPUS)
+ continue;
- pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
- pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
- pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
- + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
+ BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
+ BUG_ON(unit_map[cpu] != NR_CPUS);
- if (dyn_size < 0)
- dyn_size = pcpu_unit_size - static_size - reserved_size;
+ unit_map[cpu] = unit + i;
+ unit_off[cpu] = gi->base_offset + i * ai->unit_size;
+
+ if (pcpu_first_unit_cpu == NR_CPUS)
+ pcpu_first_unit_cpu = cpu;
+ }
+ }
+ pcpu_last_unit_cpu = cpu;
+ pcpu_nr_units = unit;
+
+ for_each_possible_cpu(cpu)
+ BUG_ON(unit_map[cpu] == NR_CPUS);
+
+ pcpu_nr_groups = ai->nr_groups;
+ pcpu_group_offsets = group_offsets;
+ pcpu_group_sizes = group_sizes;
+ pcpu_unit_map = unit_map;
+ pcpu_unit_offsets = unit_off;
+
+ /* determine basic parameters */
+ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
+ pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
+ pcpu_atom_size = ai->atom_size;
+ pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
+ BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
/*
* Allocate chunk slots. The additional last slot is for
@@ -1105,189 +1665,351 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
*/
schunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&schunk->list);
- schunk->vm = &first_vm;
+ schunk->base_addr = base_addr;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
- schunk->page = schunk->page_ar;
+ schunk->immutable = true;
+ bitmap_fill(schunk->populated, pcpu_unit_pages);
- if (reserved_size) {
- schunk->free_size = reserved_size;
+ if (ai->reserved_size) {
+ schunk->free_size = ai->reserved_size;
pcpu_reserved_chunk = schunk;
- pcpu_reserved_chunk_limit = static_size + reserved_size;
+ pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
} else {
schunk->free_size = dyn_size;
dyn_size = 0; /* dynamic area covered */
}
schunk->contig_hint = schunk->free_size;
- schunk->map[schunk->map_used++] = -static_size;
+ schunk->map[schunk->map_used++] = -ai->static_size;
if (schunk->free_size)
schunk->map[schunk->map_used++] = schunk->free_size;
/* init dynamic chunk if necessary */
if (dyn_size) {
- dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
+ dchunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&dchunk->list);
- dchunk->vm = &first_vm;
+ dchunk->base_addr = base_addr;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
- dchunk->page = schunk->page_ar; /* share page map with schunk */
+ dchunk->immutable = true;
+ bitmap_fill(dchunk->populated, pcpu_unit_pages);
dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
dchunk->map[dchunk->map_used++] = dchunk->free_size;
}
- /* allocate vm address */
- first_vm.flags = VM_ALLOC;
- first_vm.size = pcpu_chunk_size;
-
- if (!base_addr)
- vm_area_register_early(&first_vm, PAGE_SIZE);
- else {
- /*
- * Pages already mapped. No need to remap into
- * vmalloc area. In this case the first chunks can't
- * be mapped or unmapped by percpu and are marked
- * immutable.
- */
- first_vm.addr = base_addr;
- schunk->immutable = true;
- if (dchunk)
- dchunk->immutable = true;
- }
-
- /* assign pages */
- nr_pages = -1;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < pcpu_unit_pages; i++) {
- struct page *page = get_page_fn(cpu, i);
-
- if (!page)
- break;
- *pcpu_chunk_pagep(schunk, cpu, i) = page;
- }
-
- BUG_ON(i < PFN_UP(static_size));
-
- if (nr_pages < 0)
- nr_pages = i;
- else
- BUG_ON(nr_pages != i);
- }
-
- /* map them */
- if (populate_pte_fn) {
- for_each_possible_cpu(cpu)
- for (i = 0; i < nr_pages; i++)
- populate_pte_fn(pcpu_chunk_addr(schunk,
- cpu, i));
-
- err = pcpu_map(schunk, 0, nr_pages);
- if (err)
- panic("failed to setup static percpu area, err=%d\n",
- err);
- }
-
/* link the first chunk in */
pcpu_first_chunk = dchunk ?: schunk;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* we're done */
- pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
- return pcpu_unit_size;
+ pcpu_base_addr = base_addr;
+ return 0;
}
-/*
- * Embedding first chunk setup helper.
- */
-static void *pcpue_ptr __initdata;
-static size_t pcpue_size __initdata;
-static size_t pcpue_unit_size __initdata;
+const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
+ [PCPU_FC_AUTO] = "auto",
+ [PCPU_FC_EMBED] = "embed",
+ [PCPU_FC_PAGE] = "page",
+};
-static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
-{
- size_t off = (size_t)pageno << PAGE_SHIFT;
+enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
- if (off >= pcpue_size)
- return NULL;
+static int __init percpu_alloc_setup(char *str)
+{
+ if (0)
+ /* nada */;
+#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
+ else if (!strcmp(str, "embed"))
+ pcpu_chosen_fc = PCPU_FC_EMBED;
+#endif
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+ else if (!strcmp(str, "page"))
+ pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
+ else
+ pr_warning("PERCPU: unknown allocator %s specified\n", str);
- return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
+ return 0;
}
+early_param("percpu_alloc", percpu_alloc_setup);
+#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
+ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
- * @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ * @alloc_fn: function to allocate percpu page
+ * @free_fn: funtion to free percpu page
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
- * as a contiguous area using bootmem allocator and used as-is without
- * being mapped into vmalloc area. This enables the first chunk to
- * piggy back on the linear physical mapping which often uses larger
- * page size.
+ * by calling @alloc_fn and used as-is without being mapped into
+ * vmalloc area. Allocations are always whole multiples of @atom_size
+ * aligned to @atom_size.
+ *
+ * This enables the first chunk to piggy back on the linear physical
+ * mapping which often uses larger page size. Please note that this
+ * can result in very sparse cpu->unit mapping on NUMA machines thus
+ * requiring large vmalloc address space. Don't use this allocator if
+ * vmalloc space is not orders of magnitude larger than distances
+ * between node memory addresses (ie. 32bit NUMA machines).
*
* When @dyn_size is positive, dynamic area might be larger than
- * specified to fill page alignment. Also, when @dyn_size is auto,
- * @dyn_size does not fill the whole first chunk but only what's
- * necessary for page alignment after static and reserved areas.
+ * specified to fill page alignment. When @dyn_size is auto,
+ * @dyn_size is just big enough to fill page alignment after static
+ * and reserved areas.
*
* If the needed size is smaller than the minimum or specified unit
- * size, the leftover is returned to the bootmem allocator.
+ * size, the leftover is returned using @free_fn.
*
* RETURNS:
- * The determined pcpu_unit_size which can be used to initialize
- * percpu access on success, -errno on failure.
+ * 0 on success, -errno on failure.
*/
-ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
- ssize_t dyn_size, ssize_t unit_size)
+int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
+ size_t atom_size,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn)
{
- size_t chunk_size;
- unsigned int cpu;
+ void *base = (void *)ULONG_MAX;
+ void **areas = NULL;
+ struct pcpu_alloc_info *ai;
+ size_t size_sum, areas_size;
+ int group, i, rc;
+
+ ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
+ cpu_distance_fn);
+ if (IS_ERR(ai))
+ return PTR_ERR(ai);
+
+ size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
+ areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
+
+ areas = alloc_bootmem_nopanic(areas_size);
+ if (!areas) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
- /* determine parameters and allocate */
- pcpue_size = PFN_ALIGN(static_size + reserved_size +
- (dyn_size >= 0 ? dyn_size : 0));
- if (dyn_size != 0)
- dyn_size = pcpue_size - static_size - reserved_size;
-
- if (unit_size >= 0) {
- BUG_ON(unit_size < pcpue_size);
- pcpue_unit_size = unit_size;
- } else
- pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
-
- chunk_size = pcpue_unit_size * nr_cpu_ids;
-
- pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS));
- if (!pcpue_ptr) {
- pr_warning("PERCPU: failed to allocate %zu bytes for "
- "embedding\n", chunk_size);
- return -ENOMEM;
+ /* allocate, copy and determine base address */
+ for (group = 0; group < ai->nr_groups; group++) {
+ struct pcpu_group_info *gi = &ai->groups[group];
+ unsigned int cpu = NR_CPUS;
+ void *ptr;
+
+ for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
+ cpu = gi->cpu_map[i];
+ BUG_ON(cpu == NR_CPUS);
+
+ /* allocate space for the whole group */
+ ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
+ if (!ptr) {
+ rc = -ENOMEM;
+ goto out_free_areas;
+ }
+ areas[group] = ptr;
+
+ base = min(ptr, base);
+
+ for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
+ if (gi->cpu_map[i] == NR_CPUS) {
+ /* unused unit, free whole */
+ free_fn(ptr, ai->unit_size);
+ continue;
+ }
+ /* copy and return the unused part */
+ memcpy(ptr, __per_cpu_load, ai->static_size);
+ free_fn(ptr + size_sum, ai->unit_size - size_sum);
+ }
}
- /* return the leftover and copy */
- for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
- void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
+ /* base address is now known, determine group base offsets */
+ for (group = 0; group < ai->nr_groups; group++)
+ ai->groups[group].base_offset = areas[group] - base;
+
+ pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
+ PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
+ ai->dyn_size, ai->unit_size);
+
+ rc = pcpu_setup_first_chunk(ai, base);
+ goto out_free;
+
+out_free_areas:
+ for (group = 0; group < ai->nr_groups; group++)
+ free_fn(areas[group],
+ ai->groups[group].nr_units * ai->unit_size);
+out_free:
+ pcpu_free_alloc_info(ai);
+ if (areas)
+ free_bootmem(__pa(areas), areas_size);
+ return rc;
+}
+#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
+ !CONFIG_HAVE_SETUP_PER_CPU_AREA */
+
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+/**
+ * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
+ * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
+ * @populate_pte_fn: function to populate pte
+ *
+ * This is a helper to ease setting up page-remapped first percpu
+ * chunk and can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * This is the basic allocator. Static percpu area is allocated
+ * page-by-page into vmalloc area.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init pcpu_page_first_chunk(size_t reserved_size,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn,
+ pcpu_fc_populate_pte_fn_t populate_pte_fn)
+{
+ static struct vm_struct vm;
+ struct pcpu_alloc_info *ai;
+ char psize_str[16];
+ int unit_pages;
+ size_t pages_size;
+ struct page **pages;
+ int unit, i, j, rc;
+
+ snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
+
+ ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
+ if (IS_ERR(ai))
+ return PTR_ERR(ai);
+ BUG_ON(ai->nr_groups != 1);
+ BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
+
+ unit_pages = ai->unit_size >> PAGE_SHIFT;
+
+ /* unaligned allocations can't be freed, round up to page size */
+ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
+ sizeof(pages[0]));
+ pages = alloc_bootmem(pages_size);
+
+ /* allocate pages */
+ j = 0;
+ for (unit = 0; unit < num_possible_cpus(); unit++)
+ for (i = 0; i < unit_pages; i++) {
+ unsigned int cpu = ai->groups[0].cpu_map[unit];
+ void *ptr;
+
+ ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
+ if (!ptr) {
+ pr_warning("PERCPU: failed to allocate %s page "
+ "for cpu%u\n", psize_str, cpu);
+ goto enomem;
+ }
+ pages[j++] = virt_to_page(ptr);
+ }
+
+ /* allocate vm area, map the pages and copy static data */
+ vm.flags = VM_ALLOC;
+ vm.size = num_possible_cpus() * ai->unit_size;
+ vm_area_register_early(&vm, PAGE_SIZE);
+
+ for (unit = 0; unit < num_possible_cpus(); unit++) {
+ unsigned long unit_addr =
+ (unsigned long)vm.addr + unit * ai->unit_size;
+
+ for (i = 0; i < unit_pages; i++)
+ populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
+
+ /* pte already populated, the following shouldn't fail */
+ rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
+ unit_pages);
+ if (rc < 0)
+ panic("failed to map percpu area, err=%d\n", rc);
- if (cpu_possible(cpu)) {
- free_bootmem(__pa(ptr + pcpue_size),
- pcpue_unit_size - pcpue_size);
- memcpy(ptr, __per_cpu_load, static_size);
- } else
- free_bootmem(__pa(ptr), pcpue_unit_size);
+ /*
+ * FIXME: Archs with virtual cache should flush local
+ * cache for the linear mapping here - something
+ * equivalent to flush_cache_vmap() on the local cpu.
+ * flush_cache_vmap() can't be used as most supporting
+ * data structures are not set up yet.
+ */
+
+ /* copy static data */
+ memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
}
/* we're ready, commit */
- pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
- pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+ pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
+ unit_pages, psize_str, vm.addr, ai->static_size,
+ ai->reserved_size, ai->dyn_size);
+
+ rc = pcpu_setup_first_chunk(ai, vm.addr);
+ goto out_free_ar;
+
+enomem:
+ while (--j >= 0)
+ free_fn(page_address(pages[j]), PAGE_SIZE);
+ rc = -ENOMEM;
+out_free_ar:
+ free_bootmem(__pa(pages), pages_size);
+ pcpu_free_alloc_info(ai);
+ return rc;
+}
+#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
+
+/*
+ * Generic percpu area setup.
+ *
+ * The embedding helper is used because its behavior closely resembles
+ * the original non-dynamic generic percpu area setup. This is
+ * important because many archs have addressing restrictions and might
+ * fail if the percpu area is located far away from the previous
+ * location. As an added bonus, in non-NUMA cases, embedding is
+ * generally a good idea TLB-wise because percpu area can piggy back
+ * on the physical linear memory mapping which uses large page
+ * mappings on applicable archs.
+ */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
+static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
+ size_t align)
+{
+ return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
+}
- return pcpu_setup_first_chunk(pcpue_get_page, static_size,
- reserved_size, dyn_size,
- pcpue_unit_size, pcpue_ptr, NULL);
+static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
+{
+ free_bootmem(__pa(ptr), size);
+}
+
+void __init setup_per_cpu_areas(void)
+{
+ unsigned long delta;
+ unsigned int cpu;
+ int rc;
+
+ /*
+ * Always reserve area for module percpu variables. That's
+ * what the legacy allocator did.
+ */
+ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+ PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
+ pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
+ if (rc < 0)
+ panic("Failed to initialized percpu areas.");
+
+ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+ for_each_possible_cpu(cpu)
+ __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/quicklist.c b/mm/quicklist.c
index e66d07d1b4f..6eedf7e473d 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -19,7 +19,7 @@
#include <linux/module.h>
#include <linux/quicklist.h>
-DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
+DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
#define FRACTION_OF_NODE_MEM 16
diff --git a/mm/rmap.c b/mm/rmap.c
index 0895b5c7cbf..720fc03a7bc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -710,27 +710,6 @@ void page_add_file_rmap(struct page *page)
}
}
-#ifdef CONFIG_DEBUG_VM
-/**
- * page_dup_rmap - duplicate pte mapping to a page
- * @page: the page to add the mapping to
- * @vma: the vm area being duplicated
- * @address: the user virtual address mapped
- *
- * For copy_page_range only: minimal extract from page_add_file_rmap /
- * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
- * quicker.
- *
- * The caller needs to hold the pte lock.
- */
-void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
-{
- if (PageAnon(page))
- __page_check_anon_rmap(page, vma, address);
- atomic_inc(&page->_mapcount);
-}
-#endif
-
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
@@ -739,34 +718,37 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
*/
void page_remove_rmap(struct page *page)
{
- if (atomic_add_negative(-1, &page->_mapcount)) {
- /*
- * Now that the last pte has gone, s390 must transfer dirty
- * flag from storage key to struct page. We can usually skip
- * this if the page is anon, so about to be freed; but perhaps
- * not if it's in swapcache - there might be another pte slot
- * containing the swap entry, but page not yet written to swap.
- */
- if ((!PageAnon(page) || PageSwapCache(page)) &&
- page_test_dirty(page)) {
- page_clear_dirty(page);
- set_page_dirty(page);
- }
- if (PageAnon(page))
- mem_cgroup_uncharge_page(page);
- __dec_zone_page_state(page,
- PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
- mem_cgroup_update_mapped_file_stat(page, -1);
- /*
- * It would be tidy to reset the PageAnon mapping here,
- * but that might overwrite a racing page_add_anon_rmap
- * which increments mapcount after us but sets mapping
- * before us: so leave the reset to free_hot_cold_page,
- * and remember that it's only reliable while mapped.
- * Leaving it set also helps swapoff to reinstate ptes
- * faster for those pages still in swapcache.
- */
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
+ return;
+
+ /*
+ * Now that the last pte has gone, s390 must transfer dirty
+ * flag from storage key to struct page. We can usually skip
+ * this if the page is anon, so about to be freed; but perhaps
+ * not if it's in swapcache - there might be another pte slot
+ * containing the swap entry, but page not yet written to swap.
+ */
+ if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
+ page_clear_dirty(page);
+ set_page_dirty(page);
}
+ if (PageAnon(page)) {
+ mem_cgroup_uncharge_page(page);
+ __dec_zone_page_state(page, NR_ANON_PAGES);
+ } else {
+ __dec_zone_page_state(page, NR_FILE_MAPPED);
+ }
+ mem_cgroup_update_mapped_file_stat(page, -1);
+ /*
+ * It would be tidy to reset the PageAnon mapping here,
+ * but that might overwrite a racing page_add_anon_rmap
+ * which increments mapcount after us but sets mapping
+ * before us: so leave the reset to free_hot_cold_page,
+ * and remember that it's only reliable while mapped.
+ * Leaving it set also helps swapoff to reinstate ptes
+ * faster for those pages still in swapcache.
+ */
}
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 5a0b3d4055f..b206a7a32e2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -49,7 +49,6 @@ static struct vfsmount *shm_mnt;
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
-#include <linux/vfs.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/swapops.h>
@@ -1097,6 +1096,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
shmem_swp_unmap(entry);
unlock:
spin_unlock(&info->lock);
+ /*
+ * add_to_swap_cache() doesn't return -EEXIST, so we can safely
+ * clear SWAP_HAS_CACHE flag.
+ */
swapcache_free(swap, NULL);
redirty:
set_page_dirty(page);
@@ -2298,8 +2301,7 @@ static void shmem_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
-static int shmem_fill_super(struct super_block *sb,
- void *data, int silent)
+int shmem_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
struct dentry *root;
@@ -2307,17 +2309,14 @@ static int shmem_fill_super(struct super_block *sb,
int err = -ENOMEM;
/* Round up to L1_CACHE_BYTES to resist false sharing */
- sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
+ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
L1_CACHE_BYTES), GFP_KERNEL);
if (!sbinfo)
return -ENOMEM;
- sbinfo->max_blocks = 0;
- sbinfo->max_inodes = 0;
sbinfo->mode = S_IRWXUGO | S_ISVTX;
sbinfo->uid = current_fsuid();
sbinfo->gid = current_fsgid();
- sbinfo->mpol = NULL;
sb->s_fs_info = sbinfo;
#ifdef CONFIG_TMPFS
@@ -2519,7 +2518,7 @@ static struct file_system_type tmpfs_fs_type = {
.kill_sb = kill_litter_super,
};
-static int __init init_tmpfs(void)
+int __init init_tmpfs(void)
{
int error;
@@ -2576,7 +2575,7 @@ static struct file_system_type tmpfs_fs_type = {
.kill_sb = kill_litter_super,
};
-static int __init init_tmpfs(void)
+int __init init_tmpfs(void)
{
BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
@@ -2591,6 +2590,11 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
return 0;
}
+int shmem_lock(struct file *file, int lock, struct user_struct *user)
+{
+ return 0;
+}
+
#define shmem_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
@@ -2687,5 +2691,3 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_ops = &shmem_vm_ops;
return 0;
}
-
-module_init(init_tmpfs)
diff --git a/mm/slab.c b/mm/slab.c
index 7b5d4deacfc..7dfa481c96b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1384,7 +1384,7 @@ void __init kmem_cache_init(void)
* Fragmentation resistance on low memory - only use bigger
* page orders on machines with more than 32MB of memory.
*/
- if (num_physpages > (32 << 20) >> PAGE_SHIFT)
+ if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
slab_break_gfp_order = BREAK_GFP_ORDER_HI;
/* Bootstrap is tricky, because several objects are allocated
diff --git a/mm/slub.c b/mm/slub.c
index 417ed843b25..4996fc71955 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1071,6 +1071,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
}
#define slub_debug 0
+#define disable_higher_order_debug 0
+
static inline unsigned long slabs_node(struct kmem_cache *s, int node)
{ return 0; }
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
@@ -2111,8 +2113,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
*/
#define NR_KMEM_CACHE_CPU 100
-static DEFINE_PER_CPU(struct kmem_cache_cpu,
- kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
+static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
+ kmem_cache_cpu);
static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
@@ -3343,6 +3345,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
{
struct kmem_cache *s;
+ if (WARN_ON(!name))
+ return NULL;
+
down_write(&slub_lock);
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a13ea6401ae..d9714bdcb4a 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -48,8 +48,14 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
/* If the main allocator is up use that, fallback to bootmem. */
if (slab_is_available()) {
- struct page *page = alloc_pages_node(node,
+ struct page *page;
+
+ if (node_state(node, N_HIGH_MEMORY))
+ page = alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO, get_order(size));
+ else
+ page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(size));
if (page)
return page_address(page);
return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index da432d9f0ae..6ce4aab69e9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -62,9 +62,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
unsigned long array_size = SECTIONS_PER_ROOT *
sizeof(struct mem_section);
- if (slab_is_available())
- section = kmalloc_node(array_size, GFP_KERNEL, nid);
- else
+ if (slab_is_available()) {
+ if (node_state(nid, N_HIGH_MEMORY))
+ section = kmalloc_node(array_size, GFP_KERNEL, nid);
+ else
+ section = kmalloc(array_size, GFP_KERNEL);
+ } else
section = alloc_bootmem_node(NODE_DATA(nid), array_size);
if (section)
diff --git a/mm/swap.c b/mm/swap.c
index cb29ae5d33a..308e57d8d7e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
spin_lock(&zone->lru_lock);
}
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int lru = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
list_move_tail(&page->lru, &zone->lru[lru].list);
pgmoved++;
}
@@ -181,7 +181,7 @@ void activate_page(struct page *page)
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
int file = page_is_file_cache(page);
- int lru = LRU_BASE + file;
+ int lru = page_lru_base_type(page);
del_page_from_lru_list(zone, page, lru);
SetPageActive(page);
@@ -189,7 +189,7 @@ void activate_page(struct page *page)
add_page_to_lru_list(zone, page, lru);
__count_vm_event(PGACTIVATE);
- update_page_reclaim_stat(zone, page, !!file, 1);
+ update_page_reclaim_stat(zone, page, file, 1);
}
spin_unlock_irq(&zone->lru_lock);
}
@@ -496,7 +496,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
*/
void __init swap_setup(void)
{
- unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
+ unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
#ifdef CONFIG_SWAP
bdi_init(swapper_space.backing_dev_info);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5ae6b8b78c8..6d1daeb1cb4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -67,10 +67,10 @@ void show_swap_cache_info(void)
}
/*
- * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
+ * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
+static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
int error;
@@ -78,28 +78,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
VM_BUG_ON(PageSwapCache(page));
VM_BUG_ON(!PageSwapBacked(page));
+ page_cache_get(page);
+ SetPageSwapCache(page);
+ set_page_private(page, entry.val);
+
+ spin_lock_irq(&swapper_space.tree_lock);
+ error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
+ if (likely(!error)) {
+ total_swapcache_pages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ INC_CACHE_INFO(add_total);
+ }
+ spin_unlock_irq(&swapper_space.tree_lock);
+
+ if (unlikely(error)) {
+ /*
+ * Only the context which have set SWAP_HAS_CACHE flag
+ * would call add_to_swap_cache().
+ * So add_to_swap_cache() doesn't returns -EEXIST.
+ */
+ VM_BUG_ON(error == -EEXIST);
+ set_page_private(page, 0UL);
+ ClearPageSwapCache(page);
+ page_cache_release(page);
+ }
+
+ return error;
+}
+
+
+int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
+{
+ int error;
+
error = radix_tree_preload(gfp_mask);
if (!error) {
- page_cache_get(page);
- SetPageSwapCache(page);
- set_page_private(page, entry.val);
-
- spin_lock_irq(&swapper_space.tree_lock);
- error = radix_tree_insert(&swapper_space.page_tree,
- entry.val, page);
- if (likely(!error)) {
- total_swapcache_pages++;
- __inc_zone_page_state(page, NR_FILE_PAGES);
- INC_CACHE_INFO(add_total);
- }
- spin_unlock_irq(&swapper_space.tree_lock);
+ error = __add_to_swap_cache(page, entry);
radix_tree_preload_end();
-
- if (unlikely(error)) {
- set_page_private(page, 0UL);
- ClearPageSwapCache(page);
- page_cache_release(page);
- }
}
return error;
}
@@ -137,38 +152,34 @@ int add_to_swap(struct page *page)
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(!PageUptodate(page));
- for (;;) {
- entry = get_swap_page();
- if (!entry.val)
- return 0;
+ entry = get_swap_page();
+ if (!entry.val)
+ return 0;
+ /*
+ * Radix-tree node allocations from PF_MEMALLOC contexts could
+ * completely exhaust the page allocator. __GFP_NOMEMALLOC
+ * stops emergency reserves from being allocated.
+ *
+ * TODO: this could cause a theoretical memory reclaim
+ * deadlock in the swap out path.
+ */
+ /*
+ * Add it to the swap cache and mark it dirty
+ */
+ err = add_to_swap_cache(page, entry,
+ __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
+
+ if (!err) { /* Success */
+ SetPageDirty(page);
+ return 1;
+ } else { /* -ENOMEM radix-tree allocation failure */
/*
- * Radix-tree node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- /*
- * Add it to the swap cache and mark it dirty
+ * add_to_swap_cache() doesn't return -EEXIST, so we can safely
+ * clear SWAP_HAS_CACHE flag.
*/
- err = add_to_swap_cache(page, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
-
- switch (err) {
- case 0: /* Success */
- SetPageDirty(page);
- return 1;
- case -EEXIST:
- /* Raced with "speculative" read_swap_cache_async */
- swapcache_free(entry, NULL);
- continue;
- default:
- /* -ENOMEM radix-tree allocation failure */
- swapcache_free(entry, NULL);
- return 0;
- }
+ swapcache_free(entry, NULL);
+ return 0;
}
}
@@ -290,26 +301,31 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/*
+ * call radix_tree_preload() while we can wait.
+ */
+ err = radix_tree_preload(gfp_mask & GFP_KERNEL);
+ if (err)
+ break;
+
+ /*
* Swap entry may have been freed since our caller observed it.
*/
err = swapcache_prepare(entry);
- if (err == -EEXIST) /* seems racy */
+ if (err == -EEXIST) { /* seems racy */
+ radix_tree_preload_end();
continue;
- if (err) /* swp entry is obsolete ? */
+ }
+ if (err) { /* swp entry is obsolete ? */
+ radix_tree_preload_end();
break;
+ }
- /*
- * Associate the page with swap entry in the swap cache.
- * May fail (-EEXIST) if there is already a page associated
- * with this entry in the swap cache: added by a racing
- * read_swap_cache_async, or add_to_swap or shmem_writepage
- * re-using the just freed swap entry for an existing page.
- * May fail (-ENOMEM) if radix-tree node allocation failed.
- */
+ /* May fail (-ENOMEM) if radix-tree node allocation failed. */
__set_page_locked(new_page);
SetPageSwapBacked(new_page);
- err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
+ err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
+ radix_tree_preload_end();
/*
* Initiate read into locked page and return.
*/
@@ -317,8 +333,13 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
swap_readpage(new_page);
return new_page;
}
+ radix_tree_preload_end();
ClearPageSwapBacked(new_page);
__clear_page_locked(new_page);
+ /*
+ * add_to_swap_cache() doesn't return -EEXIST, so we can safely
+ * clear SWAP_HAS_CACHE flag.
+ */
swapcache_free(entry, NULL);
} while (err != -ENOMEM);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 74f1102e874..f1bf19daadc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1575,9 +1575,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->flags &= ~SWP_WRITEOK;
spin_unlock(&swap_lock);
- current->flags |= PF_SWAPOFF;
+ current->flags |= PF_OOM_ORIGIN;
err = try_to_unuse(type);
- current->flags &= ~PF_SWAPOFF;
+ current->flags &= ~PF_OOM_ORIGIN;
if (err) {
/* re-insert swap space back into swap_list */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f8189a4b3e1..5535da1d696 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -25,7 +25,7 @@
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
-
+#include <linux/highmem.h>
#include <asm/atomic.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
@@ -168,11 +168,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
if (err)
- break;
+ return err;
} while (pgd++, addr = next, addr != end);
- if (unlikely(err))
- return err;
return nr;
}
@@ -265,6 +263,7 @@ struct vmap_area {
static DEFINE_SPINLOCK(vmap_area_lock);
static struct rb_root vmap_area_root = RB_ROOT;
static LIST_HEAD(vmap_area_list);
+static unsigned long vmap_area_pcpu_hole;
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
@@ -431,6 +430,15 @@ static void __free_vmap_area(struct vmap_area *va)
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
+ /*
+ * Track the highest possible candidate for pcpu area
+ * allocation. Areas outside of vmalloc area can be returned
+ * here too, consider only end addresses which fall inside
+ * vmalloc area proper.
+ */
+ if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
+ vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
+
call_rcu(&va->rcu_head, rcu_free_va);
}
@@ -1038,6 +1046,9 @@ void __init vmalloc_init(void)
va->va_end = va->va_start + tmp->size;
__insert_vmap_area(va);
}
+
+ vmap_area_pcpu_hole = VMALLOC_END;
+
vmap_initialized = true;
}
@@ -1122,13 +1133,34 @@ EXPORT_SYMBOL_GPL(map_vm_area);
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
+static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, void *caller)
+{
+ struct vm_struct *tmp, **p;
+
+ vm->flags = flags;
+ vm->addr = (void *)va->va_start;
+ vm->size = va->va_end - va->va_start;
+ vm->caller = caller;
+ va->private = vm;
+ va->flags |= VM_VM_AREA;
+
+ write_lock(&vmlist_lock);
+ for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
+ if (tmp->addr >= vm->addr)
+ break;
+ }
+ vm->next = *p;
+ *p = vm;
+ write_unlock(&vmlist_lock);
+}
+
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long flags, unsigned long start, unsigned long end,
int node, gfp_t gfp_mask, void *caller)
{
static struct vmap_area *va;
struct vm_struct *area;
- struct vm_struct *tmp, **p;
unsigned long align = 1;
BUG_ON(in_interrupt());
@@ -1147,7 +1179,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (unlikely(!size))
return NULL;
- area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
+ area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;
@@ -1162,25 +1194,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}
- area->flags = flags;
- area->addr = (void *)va->va_start;
- area->size = size;
- area->pages = NULL;
- area->nr_pages = 0;
- area->phys_addr = 0;
- area->caller = caller;
- va->private = area;
- va->flags |= VM_VM_AREA;
-
- write_lock(&vmlist_lock);
- for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
- if (tmp->addr >= area->addr)
- break;
- }
- area->next = *p;
- *p = area;
- write_unlock(&vmlist_lock);
-
+ insert_vmalloc_vm(area, va, flags, caller);
return area;
}
@@ -1256,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr)
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->private;
struct vm_struct *tmp, **p;
-
- vmap_debug_free_range(va->va_start, va->va_end);
- free_unmap_vmap_area(va);
- vm->size -= PAGE_SIZE;
-
+ /*
+ * remove from list and disallow access to this vm_struct
+ * before unmap. (address range confliction is maintained by
+ * vmap.)
+ */
write_lock(&vmlist_lock);
for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
;
*p = tmp->next;
write_unlock(&vmlist_lock);
+ vmap_debug_free_range(va->va_start, va->va_end);
+ free_unmap_vmap_area(va);
+ vm->size -= PAGE_SIZE;
+
return vm;
}
return NULL;
@@ -1368,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count,
might_sleep();
- if (count > num_physpages)
+ if (count > totalram_pages)
return NULL;
area = get_vm_area_caller((count << PAGE_SHIFT), flags,
@@ -1475,7 +1493,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
unsigned long real_size = size;
size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > num_physpages)
+ if (!size || (size >> PAGE_SHIFT) > totalram_pages)
return NULL;
area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
@@ -1625,10 +1643,120 @@ void *vmalloc_32_user(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_32_user);
+/*
+ * small helper routine , copy contents to buf from addr.
+ * If the page is not present, fill zero.
+ */
+
+static int aligned_vread(char *buf, char *addr, unsigned long count)
+{
+ struct page *p;
+ int copied = 0;
+
+ while (count) {
+ unsigned long offset, length;
+
+ offset = (unsigned long)addr & ~PAGE_MASK;
+ length = PAGE_SIZE - offset;
+ if (length > count)
+ length = count;
+ p = vmalloc_to_page(addr);
+ /*
+ * To do safe access to this _mapped_ area, we need
+ * lock. But adding lock here means that we need to add
+ * overhead of vmalloc()/vfree() calles for this _debug_
+ * interface, rarely used. Instead of that, we'll use
+ * kmap() and get small overhead in this access function.
+ */
+ if (p) {
+ /*
+ * we can expect USER0 is not used (see vread/vwrite's
+ * function description)
+ */
+ void *map = kmap_atomic(p, KM_USER0);
+ memcpy(buf, map + offset, length);
+ kunmap_atomic(map, KM_USER0);
+ } else
+ memset(buf, 0, length);
+
+ addr += length;
+ buf += length;
+ copied += length;
+ count -= length;
+ }
+ return copied;
+}
+
+static int aligned_vwrite(char *buf, char *addr, unsigned long count)
+{
+ struct page *p;
+ int copied = 0;
+
+ while (count) {
+ unsigned long offset, length;
+
+ offset = (unsigned long)addr & ~PAGE_MASK;
+ length = PAGE_SIZE - offset;
+ if (length > count)
+ length = count;
+ p = vmalloc_to_page(addr);
+ /*
+ * To do safe access to this _mapped_ area, we need
+ * lock. But adding lock here means that we need to add
+ * overhead of vmalloc()/vfree() calles for this _debug_
+ * interface, rarely used. Instead of that, we'll use
+ * kmap() and get small overhead in this access function.
+ */
+ if (p) {
+ /*
+ * we can expect USER0 is not used (see vread/vwrite's
+ * function description)
+ */
+ void *map = kmap_atomic(p, KM_USER0);
+ memcpy(map + offset, buf, length);
+ kunmap_atomic(map, KM_USER0);
+ }
+ addr += length;
+ buf += length;
+ copied += length;
+ count -= length;
+ }
+ return copied;
+}
+
+/**
+ * vread() - read vmalloc area in a safe way.
+ * @buf: buffer for reading data
+ * @addr: vm address.
+ * @count: number of bytes to be read.
+ *
+ * Returns # of bytes which addr and buf should be increased.
+ * (same number to @count). Returns 0 if [addr...addr+count) doesn't
+ * includes any intersect with alive vmalloc area.
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from that area to a given buffer. If the given memory range
+ * of [addr...addr+count) includes some valid address, data is copied to
+ * proper area of @buf. If there are memory holes, they'll be zero-filled.
+ * IOREMAP area is treated as memory hole and no copy is done.
+ *
+ * If [addr...addr+count) doesn't includes any intersects with alive
+ * vm_struct area, returns 0.
+ * @buf should be kernel's buffer. Because this function uses KM_USER0,
+ * the caller should guarantee KM_USER0 is not used.
+ *
+ * Note: In usual ops, vread() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any informaion, as /dev/kmem.
+ *
+ */
+
long vread(char *buf, char *addr, unsigned long count)
{
struct vm_struct *tmp;
char *vaddr, *buf_start = buf;
+ unsigned long buflen = count;
unsigned long n;
/* Don't allow overflow */
@@ -1636,7 +1764,7 @@ long vread(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
read_lock(&vmlist_lock);
- for (tmp = vmlist; tmp; tmp = tmp->next) {
+ for (tmp = vmlist; count && tmp; tmp = tmp->next) {
vaddr = (char *) tmp->addr;
if (addr >= vaddr + tmp->size - PAGE_SIZE)
continue;
@@ -1649,32 +1777,72 @@ long vread(char *buf, char *addr, unsigned long count)
count--;
}
n = vaddr + tmp->size - PAGE_SIZE - addr;
- do {
- if (count == 0)
- goto finished;
- *buf = *addr;
- buf++;
- addr++;
- count--;
- } while (--n > 0);
+ if (n > count)
+ n = count;
+ if (!(tmp->flags & VM_IOREMAP))
+ aligned_vread(buf, addr, n);
+ else /* IOREMAP area is treated as memory hole */
+ memset(buf, 0, n);
+ buf += n;
+ addr += n;
+ count -= n;
}
finished:
read_unlock(&vmlist_lock);
- return buf - buf_start;
+
+ if (buf == buf_start)
+ return 0;
+ /* zero-fill memory holes */
+ if (buf != buf_start + buflen)
+ memset(buf, 0, buflen - (buf - buf_start));
+
+ return buflen;
}
+/**
+ * vwrite() - write vmalloc area in a safe way.
+ * @buf: buffer for source data
+ * @addr: vm address.
+ * @count: number of bytes to be read.
+ *
+ * Returns # of bytes which addr and buf should be incresed.
+ * (same number to @count).
+ * If [addr...addr+count) doesn't includes any intersect with valid
+ * vmalloc area, returns 0.
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from a buffer to the given addr. If specified range of
+ * [addr...addr+count) includes some valid address, data is copied from
+ * proper area of @buf. If there are memory holes, no copy to hole.
+ * IOREMAP area is treated as memory hole and no copy is done.
+ *
+ * If [addr...addr+count) doesn't includes any intersects with alive
+ * vm_struct area, returns 0.
+ * @buf should be kernel's buffer. Because this function uses KM_USER0,
+ * the caller should guarantee KM_USER0 is not used.
+ *
+ * Note: In usual ops, vwrite() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any informaion, as /dev/kmem.
+ *
+ * The caller should guarantee KM_USER1 is not used.
+ */
+
long vwrite(char *buf, char *addr, unsigned long count)
{
struct vm_struct *tmp;
- char *vaddr, *buf_start = buf;
- unsigned long n;
+ char *vaddr;
+ unsigned long n, buflen;
+ int copied = 0;
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
+ buflen = count;
read_lock(&vmlist_lock);
- for (tmp = vmlist; tmp; tmp = tmp->next) {
+ for (tmp = vmlist; count && tmp; tmp = tmp->next) {
vaddr = (char *) tmp->addr;
if (addr >= vaddr + tmp->size - PAGE_SIZE)
continue;
@@ -1686,18 +1854,21 @@ long vwrite(char *buf, char *addr, unsigned long count)
count--;
}
n = vaddr + tmp->size - PAGE_SIZE - addr;
- do {
- if (count == 0)
- goto finished;
- *addr = *buf;
- buf++;
- addr++;
- count--;
- } while (--n > 0);
+ if (n > count)
+ n = count;
+ if (!(tmp->flags & VM_IOREMAP)) {
+ aligned_vwrite(buf, addr, n);
+ copied++;
+ }
+ buf += n;
+ addr += n;
+ count -= n;
}
finished:
read_unlock(&vmlist_lock);
- return buf - buf_start;
+ if (!copied)
+ return 0;
+ return buflen;
}
/**
@@ -1818,6 +1989,286 @@ void free_vm_area(struct vm_struct *area)
}
EXPORT_SYMBOL_GPL(free_vm_area);
+static struct vmap_area *node_to_va(struct rb_node *n)
+{
+ return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
+}
+
+/**
+ * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
+ * @end: target address
+ * @pnext: out arg for the next vmap_area
+ * @pprev: out arg for the previous vmap_area
+ *
+ * Returns: %true if either or both of next and prev are found,
+ * %false if no vmap_area exists
+ *
+ * Find vmap_areas end addresses of which enclose @end. ie. if not
+ * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
+ */
+static bool pvm_find_next_prev(unsigned long end,
+ struct vmap_area **pnext,
+ struct vmap_area **pprev)
+{
+ struct rb_node *n = vmap_area_root.rb_node;
+ struct vmap_area *va = NULL;
+
+ while (n) {
+ va = rb_entry(n, struct vmap_area, rb_node);
+ if (end < va->va_end)
+ n = n->rb_left;
+ else if (end > va->va_end)
+ n = n->rb_right;
+ else
+ break;
+ }
+
+ if (!va)
+ return false;
+
+ if (va->va_end > end) {
+ *pnext = va;
+ *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
+ } else {
+ *pprev = va;
+ *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
+ }
+ return true;
+}
+
+/**
+ * pvm_determine_end - find the highest aligned address between two vmap_areas
+ * @pnext: in/out arg for the next vmap_area
+ * @pprev: in/out arg for the previous vmap_area
+ * @align: alignment
+ *
+ * Returns: determined end address
+ *
+ * Find the highest aligned address between *@pnext and *@pprev below
+ * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
+ * down address is between the end addresses of the two vmap_areas.
+ *
+ * Please note that the address returned by this function may fall
+ * inside *@pnext vmap_area. The caller is responsible for checking
+ * that.
+ */
+static unsigned long pvm_determine_end(struct vmap_area **pnext,
+ struct vmap_area **pprev,
+ unsigned long align)
+{
+ const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+ unsigned long addr;
+
+ if (*pnext)
+ addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
+ else
+ addr = vmalloc_end;
+
+ while (*pprev && (*pprev)->va_end > addr) {
+ *pnext = *pprev;
+ *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
+ }
+
+ return addr;
+}
+
+/**
+ * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
+ * @offsets: array containing offset of each area
+ * @sizes: array containing size of each area
+ * @nr_vms: the number of areas to allocate
+ * @align: alignment, all entries in @offsets and @sizes must be aligned to this
+ * @gfp_mask: allocation mask
+ *
+ * Returns: kmalloc'd vm_struct pointer array pointing to allocated
+ * vm_structs on success, %NULL on failure
+ *
+ * Percpu allocator wants to use congruent vm areas so that it can
+ * maintain the offsets among percpu areas. This function allocates
+ * congruent vmalloc areas for it. These areas tend to be scattered
+ * pretty far, distance between two areas easily going up to
+ * gigabytes. To avoid interacting with regular vmallocs, these areas
+ * are allocated from top.
+ *
+ * Despite its complicated look, this allocator is rather simple. It
+ * does everything top-down and scans areas from the end looking for
+ * matching slot. While scanning, if any of the areas overlaps with
+ * existing vmap_area, the base address is pulled down to fit the
+ * area. Scanning is repeated till all the areas fit and then all
+ * necessary data structres are inserted and the result is returned.
+ */
+struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
+ const size_t *sizes, int nr_vms,
+ size_t align, gfp_t gfp_mask)
+{
+ const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
+ const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+ struct vmap_area **vas, *prev, *next;
+ struct vm_struct **vms;
+ int area, area2, last_area, term_area;
+ unsigned long base, start, end, last_end;
+ bool purged = false;
+
+ gfp_mask &= GFP_RECLAIM_MASK;
+
+ /* verify parameters and allocate data structures */
+ BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
+ for (last_area = 0, area = 0; area < nr_vms; area++) {
+ start = offsets[area];
+ end = start + sizes[area];
+
+ /* is everything aligned properly? */
+ BUG_ON(!IS_ALIGNED(offsets[area], align));
+ BUG_ON(!IS_ALIGNED(sizes[area], align));
+
+ /* detect the area with the highest address */
+ if (start > offsets[last_area])
+ last_area = area;
+
+ for (area2 = 0; area2 < nr_vms; area2++) {
+ unsigned long start2 = offsets[area2];
+ unsigned long end2 = start2 + sizes[area2];
+
+ if (area2 == area)
+ continue;
+
+ BUG_ON(start2 >= start && start2 < end);
+ BUG_ON(end2 <= end && end2 > start);
+ }
+ }
+ last_end = offsets[last_area] + sizes[last_area];
+
+ if (vmalloc_end - vmalloc_start < last_end) {
+ WARN_ON(true);
+ return NULL;
+ }
+
+ vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
+ vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
+ if (!vas || !vms)
+ goto err_free;
+
+ for (area = 0; area < nr_vms; area++) {
+ vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
+ vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
+ if (!vas[area] || !vms[area])
+ goto err_free;
+ }
+retry:
+ spin_lock(&vmap_area_lock);
+
+ /* start scanning - we scan from the top, begin with the last area */
+ area = term_area = last_area;
+ start = offsets[area];
+ end = start + sizes[area];
+
+ if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
+ base = vmalloc_end - last_end;
+ goto found;
+ }
+ base = pvm_determine_end(&next, &prev, align) - end;
+
+ while (true) {
+ BUG_ON(next && next->va_end <= base + end);
+ BUG_ON(prev && prev->va_end > base + end);
+
+ /*
+ * base might have underflowed, add last_end before
+ * comparing.
+ */
+ if (base + last_end < vmalloc_start + last_end) {
+ spin_unlock(&vmap_area_lock);
+ if (!purged) {
+ purge_vmap_area_lazy();
+ purged = true;
+ goto retry;
+ }
+ goto err_free;
+ }
+
+ /*
+ * If next overlaps, move base downwards so that it's
+ * right below next and then recheck.
+ */
+ if (next && next->va_start < base + end) {
+ base = pvm_determine_end(&next, &prev, align) - end;
+ term_area = area;
+ continue;
+ }
+
+ /*
+ * If prev overlaps, shift down next and prev and move
+ * base so that it's right below new next and then
+ * recheck.
+ */
+ if (prev && prev->va_end > base + start) {
+ next = prev;
+ prev = node_to_va(rb_prev(&next->rb_node));
+ base = pvm_determine_end(&next, &prev, align) - end;
+ term_area = area;
+ continue;
+ }
+
+ /*
+ * This area fits, move on to the previous one. If
+ * the previous one is the terminal one, we're done.
+ */
+ area = (area + nr_vms - 1) % nr_vms;
+ if (area == term_area)
+ break;
+ start = offsets[area];
+ end = start + sizes[area];
+ pvm_find_next_prev(base + end, &next, &prev);
+ }
+found:
+ /* we've found a fitting base, insert all va's */
+ for (area = 0; area < nr_vms; area++) {
+ struct vmap_area *va = vas[area];
+
+ va->va_start = base + offsets[area];
+ va->va_end = va->va_start + sizes[area];
+ __insert_vmap_area(va);
+ }
+
+ vmap_area_pcpu_hole = base + offsets[last_area];
+
+ spin_unlock(&vmap_area_lock);
+
+ /* insert all vm's */
+ for (area = 0; area < nr_vms; area++)
+ insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
+ pcpu_get_vm_areas);
+
+ kfree(vas);
+ return vms;
+
+err_free:
+ for (area = 0; area < nr_vms; area++) {
+ if (vas)
+ kfree(vas[area]);
+ if (vms)
+ kfree(vms[area]);
+ }
+ kfree(vas);
+ kfree(vms);
+ return NULL;
+}
+
+/**
+ * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
+ * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
+ * @nr_vms: the number of allocated areas
+ *
+ * Free vm_structs and the array allocated by pcpu_get_vm_areas().
+ */
+void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
+{
+ int i;
+
+ for (i = 0; i < nr_vms; i++)
+ free_vm_area(vms[i]);
+ kfree(vms);
+}
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ba8228e0a80..613e89f471d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -148,8 +148,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
return &zone->reclaim_stat;
}
-static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
- enum lru_list lru)
+static unsigned long zone_nr_lru_pages(struct zone *zone,
+ struct scan_control *sc, enum lru_list lru)
{
if (!scanning_global_lru(sc))
return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
@@ -286,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page)
static inline int is_page_cache_freeable(struct page *page)
{
- return page_count(page) - !!page_has_private(page) == 2;
+ /*
+ * A freeable page cache page is referenced only by the caller
+ * that isolated the page, the page cache radix tree and
+ * optional buffer heads at page->private.
+ */
+ return page_count(page) - page_has_private(page) == 2;
}
static int may_write_to_queue(struct backing_dev_info *bdi)
@@ -361,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
- * See swapfile.c:page_queue_congested().
*/
if (!is_page_cache_freeable(page))
return PAGE_KEEP;
@@ -531,7 +535,7 @@ redo:
* unevictable page on [in]active list.
* We know how to handle that.
*/
- lru = active + page_is_file_cache(page);
+ lru = active + page_lru_base_type(page);
lru_cache_add_lru(page, lru);
} else {
/*
@@ -821,7 +825,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
return ret;
- if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
+ if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
return ret;
/*
@@ -935,6 +939,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
/* Check that we have not crossed a zone boundary. */
if (unlikely(page_zone_id(cursor_page) != zone_id))
continue;
+
+ /*
+ * If we don't have enough swap space, reclaiming of
+ * anon page which don't already have a swap slot is
+ * pointless.
+ */
+ if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
+ !PageSwapCache(cursor_page))
+ continue;
+
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
@@ -961,7 +975,7 @@ static unsigned long isolate_pages_global(unsigned long nr,
if (file)
lru += LRU_FILE;
return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
- mode, !!file);
+ mode, file);
}
/*
@@ -976,7 +990,7 @@ static unsigned long clear_active_flags(struct list_head *page_list,
struct page *page;
list_for_each_entry(page, page_list, lru) {
- lru = page_is_file_cache(page);
+ lru = page_lru_base_type(page);
if (PageActive(page)) {
lru += LRU_ACTIVE;
ClearPageActive(page);
@@ -1034,6 +1048,31 @@ int isolate_lru_page(struct page *page)
}
/*
+ * Are there way too many processes in the direct reclaim path already?
+ */
+static int too_many_isolated(struct zone *zone, int file,
+ struct scan_control *sc)
+{
+ unsigned long inactive, isolated;
+
+ if (current_is_kswapd())
+ return 0;
+
+ if (!scanning_global_lru(sc))
+ return 0;
+
+ if (file) {
+ inactive = zone_page_state(zone, NR_INACTIVE_FILE);
+ isolated = zone_page_state(zone, NR_ISOLATED_FILE);
+ } else {
+ inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+ isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+ }
+
+ return isolated > inactive;
+}
+
+/*
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
*/
@@ -1048,6 +1087,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
int lumpy_reclaim = 0;
+ while (unlikely(too_many_isolated(zone, file, sc))) {
+ congestion_wait(WRITE, HZ/10);
+
+ /* We are about to die and free our memory. Return now. */
+ if (fatal_signal_pending(current))
+ return SWAP_CLUSTER_MAX;
+ }
+
/*
* If we need a large contiguous chunk of memory, or have
* trouble getting a small set of contiguous pages, we
@@ -1072,10 +1119,26 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
unsigned long nr_active;
unsigned int count[NR_LRU_LISTS] = { 0, };
int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
+ unsigned long nr_anon;
+ unsigned long nr_file;
nr_taken = sc->isolate_pages(sc->swap_cluster_max,
&page_list, &nr_scan, sc->order, mode,
zone, sc->mem_cgroup, 0, file);
+
+ if (scanning_global_lru(sc)) {
+ zone->pages_scanned += nr_scan;
+ if (current_is_kswapd())
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone,
+ nr_scan);
+ else
+ __count_zone_vm_events(PGSCAN_DIRECT, zone,
+ nr_scan);
+ }
+
+ if (nr_taken == 0)
+ goto done;
+
nr_active = clear_active_flags(&page_list, count);
__count_vm_events(PGDEACTIVATE, nr_active);
@@ -1088,8 +1151,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
__mod_zone_page_state(zone, NR_INACTIVE_ANON,
-count[LRU_INACTIVE_ANON]);
- if (scanning_global_lru(sc))
- zone->pages_scanned += nr_scan;
+ nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+ nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
@@ -1123,18 +1188,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
}
nr_reclaimed += nr_freed;
+
local_irq_disable();
- if (current_is_kswapd()) {
- __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
+ if (current_is_kswapd())
__count_vm_events(KSWAPD_STEAL, nr_freed);
- } else if (scanning_global_lru(sc))
- __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
-
__count_zone_vm_events(PGSTEAL, zone, nr_freed);
- if (nr_taken == 0)
- goto done;
-
spin_lock(&zone->lru_lock);
/*
* Put back any unfreeable pages.
@@ -1153,8 +1212,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
SetPageLRU(page);
lru = page_lru(page);
add_page_to_lru_list(zone, page, lru);
- if (PageActive(page)) {
- int file = !!page_is_file_cache(page);
+ if (is_active_lru(lru)) {
+ int file = is_file_lru(lru);
reclaim_stat->recent_rotated[file]++;
}
if (!pagevec_add(&pvec, page)) {
@@ -1163,10 +1222,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
spin_lock_irq(&zone->lru_lock);
}
}
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+
} while (nr_scanned < max_scan);
- spin_unlock(&zone->lru_lock);
+
done:
- local_irq_enable();
+ spin_unlock_irq(&zone->lru_lock);
pagevec_release(&pvec);
return nr_reclaimed;
}
@@ -1215,15 +1277,10 @@ static void move_active_pages_to_lru(struct zone *zone,
while (!list_empty(list)) {
page = lru_to_page(list);
- prefetchw_prev_lru_page(page, list, flags);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
- VM_BUG_ON(!PageActive(page));
- if (!is_active_lru(lru))
- ClearPageActive(page); /* we are de-activating */
-
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
pgmoved++;
@@ -1244,7 +1301,7 @@ static void move_active_pages_to_lru(struct zone *zone,
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct scan_control *sc, int priority, int file)
{
- unsigned long pgmoved;
+ unsigned long nr_taken;
unsigned long pgscanned;
unsigned long vm_flags;
LIST_HEAD(l_hold); /* The pages which were snipped off */
@@ -1252,10 +1309,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
LIST_HEAD(l_inactive);
struct page *page;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+ unsigned long nr_rotated = 0;
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
- pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
+ nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
ISOLATE_ACTIVE, zone,
sc->mem_cgroup, 1, file);
/*
@@ -1265,16 +1323,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
if (scanning_global_lru(sc)) {
zone->pages_scanned += pgscanned;
}
- reclaim_stat->recent_scanned[!!file] += pgmoved;
+ reclaim_stat->recent_scanned[file] += nr_taken;
__count_zone_vm_events(PGREFILL, zone, pgscanned);
if (file)
- __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
+ __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
else
- __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
+ __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
spin_unlock_irq(&zone->lru_lock);
- pgmoved = 0; /* count referenced (mapping) mapped pages */
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
@@ -1288,7 +1346,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
/* page_referenced clears PageReferenced */
if (page_mapping_inuse(page) &&
page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
- pgmoved++;
+ nr_rotated++;
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -1304,6 +1362,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
}
}
+ ClearPageActive(page); /* we are de-activating */
list_add(&page->lru, &l_inactive);
}
@@ -1317,13 +1376,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* helps balance scan pressure between file and anonymous pages in
* get_scan_ratio.
*/
- reclaim_stat->recent_rotated[!!file] += pgmoved;
+ reclaim_stat->recent_rotated[file] += nr_rotated;
move_active_pages_to_lru(zone, &l_active,
LRU_ACTIVE + file * LRU_FILE);
move_active_pages_to_lru(zone, &l_inactive,
LRU_BASE + file * LRU_FILE);
-
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
}
@@ -1429,10 +1488,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
unsigned long ap, fp;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
- zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
- file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
- zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);
+ anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+ file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
if (scanning_global_lru(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
@@ -1526,6 +1585,7 @@ static void shrink_zone(int priority, struct zone *zone,
enum lru_list l;
unsigned long nr_reclaimed = sc->nr_reclaimed;
unsigned long swap_cluster_max = sc->swap_cluster_max;
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
int noswap = 0;
/* If we have no swap space, do not bother scanning anon pages. */
@@ -1540,17 +1600,14 @@ static void shrink_zone(int priority, struct zone *zone,
int file = is_file_lru(l);
unsigned long scan;
- scan = zone_nr_pages(zone, sc, l);
+ scan = zone_nr_lru_pages(zone, sc, l);
if (priority || noswap) {
scan >>= priority;
scan = (scan * percent[file]) / 100;
}
- if (scanning_global_lru(sc))
- nr[l] = nr_scan_try_batch(scan,
- &zone->lru[l].nr_saved_scan,
- swap_cluster_max);
- else
- nr[l] = scan;
+ nr[l] = nr_scan_try_batch(scan,
+ &reclaim_stat->nr_saved_scan[l],
+ swap_cluster_max);
}
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1685,7 +1742,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- lru_pages += zone_lru_pages(zone);
+ lru_pages += zone_reclaimable_pages(zone);
}
}
@@ -1902,7 +1959,7 @@ loop_again:
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- lru_pages += zone_lru_pages(zone);
+ lru_pages += zone_reclaimable_pages(zone);
}
/*
@@ -1946,7 +2003,7 @@ loop_again:
if (zone_is_all_unreclaimable(zone))
continue;
if (nr_slab == 0 && zone->pages_scanned >=
- (zone_lru_pages(zone) * 6))
+ (zone_reclaimable_pages(zone) * 6))
zone_set_flag(zone,
ZONE_ALL_UNRECLAIMABLE);
/*
@@ -2113,12 +2170,39 @@ void wakeup_kswapd(struct zone *zone, int order)
wake_up_interruptible(&pgdat->kswapd_wait);
}
-unsigned long global_lru_pages(void)
+/*
+ * The reclaimable count would be mostly accurate.
+ * The less reclaimable pages may be
+ * - mlocked pages, which will be moved to unevictable list when encountered
+ * - mapped pages, which may require several travels to be reclaimed
+ * - dirty pages, which is not "instantly" reclaimable
+ */
+unsigned long global_reclaimable_pages(void)
+{
+ int nr;
+
+ nr = global_page_state(NR_ACTIVE_FILE) +
+ global_page_state(NR_INACTIVE_FILE);
+
+ if (nr_swap_pages > 0)
+ nr += global_page_state(NR_ACTIVE_ANON) +
+ global_page_state(NR_INACTIVE_ANON);
+
+ return nr;
+}
+
+unsigned long zone_reclaimable_pages(struct zone *zone)
{
- return global_page_state(NR_ACTIVE_ANON)
- + global_page_state(NR_ACTIVE_FILE)
- + global_page_state(NR_INACTIVE_ANON)
- + global_page_state(NR_INACTIVE_FILE);
+ int nr;
+
+ nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
+
+ if (nr_swap_pages > 0)
+ nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+
+ return nr;
}
#ifdef CONFIG_HIBERNATION
@@ -2133,6 +2217,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
{
struct zone *zone;
unsigned long nr_reclaimed = 0;
+ struct zone_reclaim_stat *reclaim_stat;
for_each_populated_zone(zone) {
enum lru_list l;
@@ -2149,11 +2234,14 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
l == LRU_ACTIVE_FILE))
continue;
- zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
- if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
+ reclaim_stat = get_reclaim_stat(zone, sc);
+ reclaim_stat->nr_saved_scan[l] +=
+ (lru_pages >> prio) + 1;
+ if (reclaim_stat->nr_saved_scan[l]
+ >= nr_pages || pass > 3) {
unsigned long nr_to_scan;
- zone->lru[l].nr_saved_scan = 0;
+ reclaim_stat->nr_saved_scan[l] = 0;
nr_to_scan = min(nr_pages, lru_pages);
nr_reclaimed += shrink_list(l, nr_to_scan, zone,
sc, prio);
@@ -2190,7 +2278,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
current->reclaim_state = &reclaim_state;
- lru_pages = global_lru_pages();
+ lru_pages = global_reclaimable_pages();
nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
/* If slab caches are huge, it's better to hit them first */
while (nr_slab >= lru_pages) {
@@ -2232,7 +2320,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
reclaim_state.reclaimed_slab = 0;
shrink_slab(sc.nr_scanned, sc.gfp_mask,
- global_lru_pages());
+ global_reclaimable_pages());
sc.nr_reclaimed += reclaim_state.reclaimed_slab;
if (sc.nr_reclaimed >= nr_pages)
goto out;
@@ -2249,7 +2337,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
if (!sc.nr_reclaimed) {
do {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
+ shrink_slab(nr_pages, sc.gfp_mask,
+ global_reclaimable_pages());
sc.nr_reclaimed += reclaim_state.reclaimed_slab;
} while (sc.nr_reclaimed < nr_pages &&
reclaim_state.reclaimed_slab > 0);
@@ -2569,7 +2658,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone)
retry:
ClearPageUnevictable(page);
if (page_evictable(page, NULL)) {
- enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
+ enum lru_list l = page_lru_base_type(page);
__dec_zone_state(zone, NR_UNEVICTABLE);
list_move(&page->lru, &zone->lru[l].list);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 138bed53706..c81321f9fee 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -639,11 +639,14 @@ static const char * const vmstat_text[] = {
"nr_slab_reclaimable",
"nr_slab_unreclaimable",
"nr_page_table_pages",
+ "nr_kernel_stack",
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",
"nr_writeback_temp",
-
+ "nr_isolated_anon",
+ "nr_isolated_file",
+ "nr_shmem",
#ifdef CONFIG_NUMA
"numa_hit",
"numa_miss",