aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorJeff Garzik <jeff@garzik.org>2006-09-27 18:16:47 -0400
committerJeff Garzik <jeff@garzik.org>2006-09-27 18:16:47 -0400
commit3b9f6cb8a1ec791be79c6c7595fea922f12d1e64 (patch)
tree2393a448add846e6c2ed12f68106c3018b72c6a9 /mm
parentc38778c3a9aeadcd1ee319cfc8ea5a9cbf8cdafa (diff)
parenta77c64c1a641950626181b4857abb701d8f38ccc (diff)
Merge branch 'master' into upstream
Diffstat (limited to 'mm')
-rw-r--r--mm/memory.c117
-rw-r--r--mm/mempolicy.c4
-rw-r--r--mm/nommu.c245
-rw-r--r--mm/page_alloc.c749
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slab.c126
-rw-r--r--mm/slob.c3
-rw-r--r--mm/truncate.c25
-rw-r--r--mm/vmalloc.c30
-rw-r--r--mm/vmscan.c30
-rw-r--r--mm/vmstat.c3
11 files changed, 1136 insertions, 200 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 92a3ebd8d79..601159a46ab 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2256,6 +2256,54 @@ oom:
}
/*
+ * do_no_pfn() tries to create a new page mapping for a page without
+ * a struct page backing it
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * It is expected that the ->nopfn handler always returns the same pfn
+ * for a given virtual mapping.
+ *
+ * Mark this `noinline' to prevent it from bloating the main pagefault code.
+ */
+static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
+{
+ spinlock_t *ptl;
+ pte_t entry;
+ unsigned long pfn;
+ int ret = VM_FAULT_MINOR;
+
+ pte_unmap(page_table);
+ BUG_ON(!(vma->vm_flags & VM_PFNMAP));
+ BUG_ON(is_cow_mapping(vma->vm_flags));
+
+ pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+ if (pfn == NOPFN_OOM)
+ return VM_FAULT_OOM;
+ if (pfn == NOPFN_SIGBUS)
+ return VM_FAULT_SIGBUS;
+
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+ /* Only go through if we didn't race with anybody else... */
+ if (pte_none(*page_table)) {
+ entry = pfn_pte(pfn, vma->vm_page_prot);
+ if (write_access)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ set_pte_at(mm, address, page_table, entry);
+ }
+ pte_unmap_unlock(page_table, ptl);
+ return ret;
+}
+
+/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
@@ -2317,11 +2365,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
old_entry = entry = *pte;
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (!vma->vm_ops || !vma->vm_ops->nopage)
- return do_anonymous_page(mm, vma, address,
- pte, pmd, write_access);
- return do_no_page(mm, vma, address,
- pte, pmd, write_access);
+ if (vma->vm_ops) {
+ if (vma->vm_ops->nopage)
+ return do_no_page(mm, vma, address,
+ pte, pmd,
+ write_access);
+ if (unlikely(vma->vm_ops->nopfn))
+ return do_no_pfn(mm, vma, address, pte,
+ pmd, write_access);
+ }
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, write_access);
}
if (pte_file(entry))
return do_file_page(mm, vma, address,
@@ -2550,3 +2604,56 @@ int in_gate_area_no_task(unsigned long addr)
}
#endif /* __HAVE_ARCH_GATE_AREA */
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ struct page *page;
+ void *old_buf = buf;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ down_read(&mm->mmap_sem);
+ /* ignore errors, just check how much was successfully transferred */
+ while (len) {
+ int bytes, ret, offset;
+ void *maddr;
+
+ ret = get_user_pages(tsk, mm, addr, 1,
+ write, 1, &page, &vma);
+ if (ret <= 0)
+ break;
+
+ bytes = len;
+ offset = addr & (PAGE_SIZE-1);
+ if (bytes > PAGE_SIZE-offset)
+ bytes = PAGE_SIZE-offset;
+
+ maddr = kmap(page);
+ if (write) {
+ copy_to_user_page(vma, page, addr,
+ maddr + offset, buf, bytes);
+ set_page_dirty_lock(page);
+ } else {
+ copy_from_user_page(vma, page, addr,
+ buf, maddr + offset, bytes);
+ }
+ kunmap(page);
+ page_cache_release(page);
+ len -= bytes;
+ buf += bytes;
+ addr += bytes;
+ }
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+
+ return buf - old_buf;
+}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 38f89650bc8..cf18f094255 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1136,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
*/
unsigned slab_node(struct mempolicy *policy)
{
- switch (policy->policy) {
+ int pol = policy ? policy->policy : MPOL_DEFAULT;
+
+ switch (pol) {
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
diff --git a/mm/nommu.c b/mm/nommu.c
index d99dea31e44..56454066219 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp)
}
/*
- * The nommu dodgy version :-)
+ * get a list of pages in an address range belonging to the specified process
+ * and indicate the VMA that covers each page
+ * - this is potentially dodgy as we may end up incrementing the page count of a
+ * slab page or a secondary page from a compound page
+ * - don't permit access to VMAs that don't support it, such as I/O mappings
*/
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
{
+ struct vm_area_struct *vma;
+ unsigned long vm_flags;
int i;
- static struct vm_area_struct dummy_vma;
+
+ /* calculate required read or write permissions.
+ * - if 'force' is set, we only require the "MAY" flags.
+ */
+ vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
for (i = 0; i < len; i++) {
+ vma = find_vma(mm, start);
+ if (!vma)
+ goto finish_or_fault;
+
+ /* protect what we can, including chardevs */
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
+ !(vm_flags & vma->vm_flags))
+ goto finish_or_fault;
+
if (pages) {
pages[i] = virt_to_page(start);
if (pages[i])
page_cache_get(pages[i]);
}
if (vmas)
- vmas[i] = &dummy_vma;
+ vmas[i] = vma;
start += PAGE_SIZE;
}
- return(i);
+
+ return i;
+
+finish_or_fault:
+ return i ? : -EFAULT;
}
EXPORT_SYMBOL(get_user_pages);
@@ -286,6 +310,77 @@ static void show_process_blocks(void)
}
#endif /* DEBUG */
+/*
+ * add a VMA into the given mm's VMA list, keeping the list ordered by vm_start
+ * - should be called with mm->mmap_sem held writelocked
+ */
+static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
+{
+ struct vm_list_struct **ppv;
+
+ for (ppv = &mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
+ if ((*ppv)->vma->vm_start > vml->vma->vm_start)
+ break;
+
+ vml->next = *ppv;
+ *ppv = vml;
+}
+
+/*
+ * look up the first VMA in which addr resides, NULL if none
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_list_struct *loop, *vml;
+
+ /* search the vm_start ordered list */
+ vml = NULL;
+ for (loop = mm->context.vmlist; loop; loop = loop->next) {
+ if (loop->vma->vm_start > addr)
+ break;
+ vml = loop;
+ }
+
+ if (vml && vml->vma->vm_end > addr)
+ return vml->vma;
+
+ return NULL;
+}
+EXPORT_SYMBOL(find_vma);
+
+/*
+ * find a VMA
+ * - we don't extend stack VMAs under NOMMU conditions
+ */
+struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ return find_vma(mm, addr);
+}
+
+/*
+ * look up the first VMA that exactly matches addr
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct vm_list_struct *vml;
+
+ /* search the vm_start ordered list */
+ for (vml = mm->context.vmlist; vml; vml = vml->next) {
+ if (vml->vma->vm_start == addr)
+ return vml->vma;
+ if (vml->vma->vm_start > addr)
+ break;
+ }
+
+ return NULL;
+}
+
+/*
+ * find a VMA in the global tree
+ */
static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
{
struct vm_area_struct *vma;
@@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
return NULL;
}
+/*
+ * add a VMA in the global tree
+ */
static void add_nommu_vma(struct vm_area_struct *vma)
{
struct vm_area_struct *pvma;
@@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma)
rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
}
+/*
+ * delete a VMA from the global list
+ */
static void delete_nommu_vma(struct vm_area_struct *vma)
{
struct address_space *mapping;
@@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file,
realalloc += kobjsize(vml);
askedalloc += sizeof(*vml);
- vml->next = current->mm->context.vmlist;
- current->mm->context.vmlist = vml;
+ add_vma_to_mm(current->mm, vml);
up_write(&nommu_vma_sem);
@@ -908,6 +1008,11 @@ static void put_vma(struct vm_area_struct *vma)
}
}
+/*
+ * release a mapping
+ * - under NOMMU conditions the parameters must match exactly to the mapping to
+ * be removed
+ */
int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
{
struct vm_list_struct *vml, **parent;
@@ -917,10 +1022,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
printk("do_munmap:\n");
#endif
- for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next)
+ for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
+ if ((*parent)->vma->vm_start > addr)
+ break;
if ((*parent)->vma->vm_start == addr &&
((len == 0) || ((*parent)->vma->vm_end == end)))
goto found;
+ }
printk("munmap of non-mmaped memory by process %d (%s): %p\n",
current->pid, current->comm, (void *) addr);
@@ -946,7 +1054,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
return 0;
}
-/* Release all mmaps. */
+asmlinkage long sys_munmap(unsigned long addr, size_t len)
+{
+ int ret;
+ struct mm_struct *mm = current->mm;
+
+ down_write(&mm->mmap_sem);
+ ret = do_munmap(mm, addr, len);
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+/*
+ * Release all mappings
+ */
void exit_mmap(struct mm_struct * mm)
{
struct vm_list_struct *tmp;
@@ -973,37 +1094,26 @@ void exit_mmap(struct mm_struct * mm)
}
}
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
-{
- int ret;
- struct mm_struct *mm = current->mm;
-
- down_write(&mm->mmap_sem);
- ret = do_munmap(mm, addr, len);
- up_write(&mm->mmap_sem);
- return ret;
-}
-
unsigned long do_brk(unsigned long addr, unsigned long len)
{
return -ENOMEM;
}
/*
- * Expand (or shrink) an existing mapping, potentially moving it at the
- * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ * expand (or shrink) an existing mapping, potentially moving it at the same
+ * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
*
- * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
- * This option implies MREMAP_MAYMOVE.
+ * under NOMMU conditions, we only permit changing a mapping's size, and only
+ * as long as it stays within the hole allocated by the kmalloc() call in
+ * do_mmap_pgoff() and the block is not shareable
*
- * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the
- * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable
+ * MREMAP_FIXED is not supported under NOMMU conditions
*/
unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
- struct vm_list_struct *vml = NULL;
+ struct vm_area_struct *vma;
/* insanity checks first */
if (new_len == 0)
@@ -1012,58 +1122,46 @@ unsigned long do_mremap(unsigned long addr,
if (flags & MREMAP_FIXED && new_addr != addr)
return (unsigned long) -EINVAL;
- for (vml = current->mm->context.vmlist; vml; vml = vml->next)
- if (vml->vma->vm_start == addr)
- goto found;
-
- return (unsigned long) -EINVAL;
+ vma = find_vma_exact(current->mm, addr);
+ if (!vma)
+ return (unsigned long) -EINVAL;
- found:
- if (vml->vma->vm_end != vml->vma->vm_start + old_len)
+ if (vma->vm_end != vma->vm_start + old_len)
return (unsigned long) -EFAULT;
- if (vml->vma->vm_flags & VM_MAYSHARE)
+ if (vma->vm_flags & VM_MAYSHARE)
return (unsigned long) -EPERM;
if (new_len > kobjsize((void *) addr))
return (unsigned long) -ENOMEM;
/* all checks complete - do it */
- vml->vma->vm_end = vml->vma->vm_start + new_len;
+ vma->vm_end = vma->vm_start + new_len;
askedalloc -= old_len;
askedalloc += new_len;
- return vml->vma->vm_start;
+ return vma->vm_start;
}
-/*
- * Look up the first VMA which satisfies addr < vm_end, NULL if none
- */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+asmlinkage unsigned long sys_mremap(unsigned long addr,
+ unsigned long old_len, unsigned long new_len,
+ unsigned long flags, unsigned long new_addr)
{
- struct vm_list_struct *vml;
-
- for (vml = mm->context.vmlist; vml; vml = vml->next)
- if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
- return vml->vma;
+ unsigned long ret;
- return NULL;
+ down_write(&current->mm->mmap_sem);
+ ret = do_mremap(addr, old_len, new_len, flags, new_addr);
+ up_write(&current->mm->mmap_sem);
+ return ret;
}
-EXPORT_SYMBOL(find_vma);
-
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int foll_flags)
{
return NULL;
}
-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
-{
- return NULL;
-}
-
int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
unsigned long to, unsigned long size, pgprot_t prot)
{
@@ -1206,3 +1304,44 @@ struct page *filemap_nopage(struct vm_area_struct *area,
BUG();
return NULL;
}
+
+/*
+ * Access another process' address space.
+ * - source/target buffer must be kernel space
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+
+ if (addr + len < addr)
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ down_read(&mm->mmap_sem);
+
+ /* the access must start within one of the target process's mappings */
+ vma = find_vma(mm, addr);
+ if (vma) {
+ /* don't overrun this mapping */
+ if (addr + len >= vma->vm_end)
+ len = vma->vm_end - addr;
+
+ /* only read or write mappings where it is permitted */
+ if (write && vma->vm_flags & VM_MAYWRITE)
+ len -= copy_to_user((void *) addr, buf, len);
+ else if (!write && vma->vm_flags & VM_MAYREAD)
+ len -= copy_from_user(buf, (void *) addr, len);
+ else
+ len = 0;
+ } else {
+ len = 0;
+ }
+
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ return len;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9810f0a60db..4f59d90b81e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,8 @@
#include <linux/vmalloc.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -102,6 +104,38 @@ int min_free_kbytes = 1024;
unsigned long __meminitdata nr_kernel_pages;
unsigned long __meminitdata nr_all_pages;
+static unsigned long __initdata dma_reserve;
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ /*
+ * MAX_ACTIVE_REGIONS determines the maximum number of distinct
+ * ranges of memory (RAM) that may be registered with add_active_range().
+ * Ranges passed to add_active_range() will be merged if possible
+ * so the number of times add_active_range() can be called is
+ * related to the number of nodes and the number of holes
+ */
+ #ifdef CONFIG_MAX_ACTIVE_REGIONS
+ /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
+ #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+ #else
+ #if MAX_NUMNODES >= 32
+ /* If there can be many nodes, allow up to 50 holes per node */
+ #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
+ #else
+ /* By default, allow up to 256 distinct regions */
+ #define MAX_ACTIVE_REGIONS 256
+ #endif
+ #endif
+
+ struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+ int __initdata nr_nodemap_entries;
+ unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+ unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+ unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
+ unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
+#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -908,7 +942,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
*/
do {
zone = *z;
- if (unlikely((gfp_mask & __GFP_THISNODE) &&
+ if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
break;
if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1222,14 +1256,12 @@ unsigned int nr_free_pagecache_pages(void)
{
return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
}
-#ifdef CONFIG_NUMA
-static void show_node(struct zone *zone)
+
+static inline void show_node(struct zone *zone)
{
- printk("Node %ld ", zone_to_nid(zone));
+ if (NUMA_BUILD)
+ printk("Node %ld ", zone_to_nid(zone));
}
-#else
-#define show_node(zone) do { } while (0)
-#endif
void si_meminfo(struct sysinfo *val)
{
@@ -1271,34 +1303,30 @@ void si_meminfo_node(struct sysinfo *val, int nid)
*/
void show_free_areas(void)
{
- int cpu, temperature;
+ int cpu;
unsigned long active;
unsigned long inactive;
unsigned long free;
struct zone *zone;
for_each_zone(zone) {
- show_node(zone);
- printk("%s per-cpu:", zone->name);
-
- if (!populated_zone(zone)) {
- printk(" empty\n");
+ if (!populated_zone(zone))
continue;
- } else
- printk("\n");
+
+ show_node(zone);
+ printk("%s per-cpu:\n", zone->name);
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;
pageset = zone_pcp(zone, cpu);
- for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: high %d, batch %d used:%d\n",
- cpu,
- temperature ? "cold" : "hot",
- pageset->pcp[temperature].high,
- pageset->pcp[temperature].batch,
- pageset->pcp[temperature].count);
+ printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
+ "Cold: hi:%5d, btch:%4d usd:%4d\n",
+ cpu, pageset->pcp[0].high,
+ pageset->pcp[0].batch, pageset->pcp[0].count,
+ pageset->pcp[1].high, pageset->pcp[1].batch,
+ pageset->pcp[1].count);
}
}
@@ -1320,6 +1348,9 @@ void show_free_areas(void)
for_each_zone(zone) {
int i;
+ if (!populated_zone(zone))
+ continue;
+
show_node(zone);
printk("%s"
" free:%lukB"
@@ -1352,12 +1383,11 @@ void show_free_areas(void)
for_each_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ if (!populated_zone(zone))
+ continue;
+
show_node(zone);
printk("%s: ", zone->name);
- if (!populated_zone(zone)) {
- printk("empty\n");
- continue;
- }
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
@@ -1561,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy)
void __meminit build_all_zonelists(void)
{
if (system_state == SYSTEM_BOOTING) {
- __build_all_zonelists(0);
+ __build_all_zonelists(NULL);
cpuset_init_current_mems_allowed();
} else {
/* we have to stop all cpus to guaranntee there is no user
@@ -1642,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
-static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long *zholes_size)
-{
- unsigned long realtotalpages, totalpages = 0;
- enum zone_type i;
-
- for (i = 0; i < MAX_NR_ZONES; i++)
- totalpages += zones_size[i];
- pgdat->node_spanned_pages = totalpages;
-
- realtotalpages = totalpages;
- if (zholes_size)
- for (i = 0; i < MAX_NR_ZONES; i++)
- realtotalpages -= zholes_size[i];
- pgdat->node_present_pages = realtotalpages;
- printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
-}
-
-
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
@@ -1818,6 +1829,9 @@ static int __cpuinit process_zones(int cpu)
for_each_zone(zone) {
+ if (!populated_zone(zone))
+ continue;
+
zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
GFP_KERNEL, cpu_to_node(cpu));
if (!zone_pcp(zone, cpu))
@@ -1977,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone,
return 0;
}
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * Basic iterator support. Return the first range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns first region regardless of node
+ */
+static int __init first_active_region_index_in_nid(int nid)
+{
+ int i;
+
+ for (i = 0; i < nr_nodemap_entries; i++)
+ if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+ return i;
+
+ return -1;
+}
+
+/*
+ * Basic iterator support. Return the next active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
+ */
+static int __init next_active_region_index_in_nid(int index, int nid)
+{
+ for (index = index + 1; index < nr_nodemap_entries; index++)
+ if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+ return index;
+
+ return -1;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ * Architectures may implement their own version but if add_active_range()
+ * was used and there are no special requirements, this is a convenient
+ * alternative
+ */
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+ int i;
+
+ for (i = 0; i < nr_nodemap_entries; i++) {
+ unsigned long start_pfn = early_node_map[i].start_pfn;
+ unsigned long end_pfn = early_node_map[i].end_pfn;
+
+ if (start_pfn <= pfn && pfn < end_pfn)
+ return early_node_map[i].nid;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
+/* Basic iterator support to walk early_node_map[] */
+#define for_each_active_range_index_in_nid(i, nid) \
+ for (i = first_active_region_index_in_nid(nid); i != -1; \
+ i = next_active_region_index_in_nid(i, nid))
+
+/**
+ * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
+ * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ *
+ * If an architecture guarantees that all ranges registered with
+ * add_active_range() contain no holes and may be freed, this
+ * function may be used instead of calling free_bootmem() manually.
+ */
+void __init free_bootmem_with_active_regions(int nid,
+ unsigned long max_low_pfn)
+{
+ int i;
+
+ for_each_active_range_index_in_nid(i, nid) {
+ unsigned long size_pages = 0;
+ unsigned long end_pfn = early_node_map[i].end_pfn;
+
+ if (early_node_map[i].start_pfn >= max_low_pfn)
+ continue;
+
+ if (end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
+
+ size_pages = end_pfn - early_node_map[i].start_pfn;
+ free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+ PFN_PHYS(early_node_map[i].start_pfn),
+ size_pages << PAGE_SHIFT);
+ }
+}
+
+/**
+ * sparse_memory_present_with_active_regions - Call memory_present for each active range
+ * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
+ *
+ * If an architecture guarantees that all ranges registered with
+ * add_active_range() contain no holes and may be freed, this
+ * function may be used instead of calling memory_present() manually.
+ */
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+ int i;
+
+ for_each_active_range_index_in_nid(i, nid)
+ memory_present(early_node_map[i].nid,
+ early_node_map[i].start_pfn,
+ early_node_map[i].end_pfn);
+}
+
+/**
+ * push_node_boundaries - Push node boundaries to at least the requested boundary
+ * @nid: The nid of the node to push the boundary for
+ * @start_pfn: The start pfn of the node
+ * @end_pfn: The end pfn of the node
+ *
+ * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
+ * time. Specifically, on x86_64, SRAT will report ranges that can potentially
+ * be hotplugged even though no physical memory exists. This function allows
+ * an arch to push out the node boundaries so mem_map is allocated that can
+ * be used later.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+void __init push_node_boundaries(unsigned int nid,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
+ nid, start_pfn, end_pfn);
+
+ /* Initialise the boundary for this node if necessary */
+ if (node_boundary_end_pfn[nid] == 0)
+ node_boundary_start_pfn[nid] = -1UL;
+
+ /* Update the boundaries */
+ if (node_boundary_start_pfn[nid] > start_pfn)
+ node_boundary_start_pfn[nid] = start_pfn;
+ if (node_boundary_end_pfn[nid] < end_pfn)
+ node_boundary_end_pfn[nid] = end_pfn;
+}
+
+/* If necessary, push the node boundary out for reserve hotadd */
+static void __init account_node_boundary(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
+ nid, *start_pfn, *end_pfn);
+
+ /* Return if boundary information has not been provided */
+ if (node_boundary_end_pfn[nid] == 0)
+ return;
+
+ /* Check the boundaries and update if necessary */
+ if (node_boundary_start_pfn[nid] < *start_pfn)
+ *start_pfn = node_boundary_start_pfn[nid];
+ if (node_boundary_end_pfn[nid] > *end_pfn)
+ *end_pfn = node_boundary_end_pfn[nid];
+}
+#else
+void __init push_node_boundaries(unsigned int nid,
+ unsigned long start_pfn, unsigned long end_pfn) {}
+
+static void __init account_node_boundary(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn) {}
+#endif
+
+
+/**
+ * get_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn
+ *
+ * It returns the start and end page frame of a node based on information
+ * provided by an arch calling add_active_range(). If called for a node
+ * with no available memory, a warning is printed and the start and end
+ * PFNs will be 0
+ */
+void __init get_pfn_range_for_nid(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ int i;
+ *start_pfn = -1UL;
+ *end_pfn = 0;
+
+ for_each_active_range_index_in_nid(i, nid) {
+ *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
+ *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+ }
+
+ if (*start_pfn == -1UL) {
+ printk(KERN_WARNING "Node %u active with no memory\n", nid);
+ *start_pfn = 0;
+ }
+
+ /* Push the node boundaries out if requested */
+ account_node_boundary(nid, start_pfn, end_pfn);
+}
+
+/*
+ * Return the number of pages a zone spans in a node, including holes
+ * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
+ */
+unsigned long __init zone_spanned_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *ignored)
+{
+ unsigned long node_start_pfn, node_end_pfn;
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ /* Get the start and end of the node and zone */
+ get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+ /* Check that this node has pages within the zone's required range */
+ if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ return 0;
+
+ /* Move the zone boundaries inside the node if necessary */
+ zone_end_pfn = min(zone_end_pfn, node_end_pfn);
+ zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+
+ /* Return the spanned pages */
+ return zone_end_pfn - zone_start_pfn;
+}
+
+/*
+ * Return the number of pages in holes between range_start_pfn and
+ * range_end_pfn on a node. If nid is MAX_NUMNODES, holes on all nodes count
+ */
+unsigned long __init __absent_pages_in_range(int nid,
+ unsigned long range_start_pfn,
+ unsigned long range_end_pfn)
+{
+ int i = 0;
+ unsigned long prev_end_pfn = 0, hole_pages = 0;
+ unsigned long start_pfn;
+
+ /* Find the first active range of pfns in the node; none means no holes */
+ i = first_active_region_index_in_nid(nid);
+ if (i == -1)
+ return 0;
+
+ prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
+
+ /* Account for the hole before physical memory, clamped to the range */
+ if (early_node_map[i].start_pfn > range_start_pfn)
+ hole_pages = prev_end_pfn - range_start_pfn;
+
+ /* Find all holes for the zone within the node */
+ for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
+
+ /* No need to continue if prev_end_pfn is outside the zone */
+ if (prev_end_pfn >= range_end_pfn)
+ break;
+
+ /* Make sure the end of the zone is not within the hole */
+ start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
+ prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+
+ /* Update the hole size count and move on */
+ if (start_pfn > range_start_pfn) {
+ BUG_ON(prev_end_pfn > start_pfn);
+ hole_pages += start_pfn - prev_end_pfn;
+ }
+ prev_end_pfn = early_node_map[i].end_pfn;
+ }
+
+ /* Add (do not overwrite) the hole past physical memory on this node */
+ if (range_end_pfn > prev_end_pfn)
+ hole_pages += range_end_pfn -
+ max(range_start_pfn, prev_end_pfn);
+
+ return hole_pages;
+}
+
+/**
+ * absent_pages_in_range - Return number of page frames in holes within a range
+ * @start_pfn: The start PFN to start searching for holes
+ * @end_pfn: The end PFN to stop searching for holes
+ *
+ * It returns the number of pages frames in memory holes within a range
+ */
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+/* Return the number of page frames in holes in a zone on a node */
+unsigned long __init zone_absent_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *ignored)
+{
+ unsigned long node_start_pfn, node_end_pfn;
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+ zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
+ node_start_pfn);
+ zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
+ node_end_pfn);
+
+ return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+}
+
+/* Return the zone index a PFN is in */
+int memmap_zone_idx(struct page *lmem_map)
+{
+ int i;
+ unsigned long phys_addr = virt_to_phys(lmem_map);
+ unsigned long pfn = phys_addr >> PAGE_SHIFT;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ if (pfn < arch_zone_highest_possible_pfn[i])
+ break;
+
+ return i;
+}
+#else
+static inline unsigned long zone_spanned_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *zones_size)
+{
+ return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *zholes_size)
+{
+ if (!zholes_size)
+ return 0;
+
+ return zholes_size[zone_type];
+}
+
+static inline int memmap_zone_idx(struct page *lmem_map)
+{
+ return MAX_NR_ZONES;
+}
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+{
+ unsigned long realtotalpages, totalpages = 0;
+ enum zone_type i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
+ zones_size);
+ pgdat->node_spanned_pages = totalpages;
+
+ realtotalpages = totalpages;
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ realtotalpages -=
+ zone_absent_pages_in_node(pgdat->node_id, i,
+ zholes_size);
+ pgdat->node_present_pages = realtotalpages;
+ printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+ realtotalpages);
+}
+
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1998,11 +2372,34 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize;
+ unsigned long size, realsize, memmap_pages;
- realsize = size = zones_size[j];
- if (zholes_size)
- realsize -= zholes_size[j];
+ size = zone_spanned_pages_in_node(nid, j, zones_size);
+ realsize = size - zone_absent_pages_in_node(nid, j,
+ zholes_size);
+
+ /*
+ * Adjust realsize so that it accounts for how much memory
+ * is used by this zone for memmap. This affects the watermark
+ * and per-cpu initialisations
+ */
+ memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
+ if (realsize >= memmap_pages) {
+ realsize -= memmap_pages;
+ printk(KERN_DEBUG
+ " %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
+ } else
+ printk(KERN_WARNING
+ " %s zone: %lu pages exceeds realsize %lu\n",
+ zone_names[j], memmap_pages, realsize);
+
+ /* Account for reserved DMA pages */
+ if (j == ZONE_DMA && realsize > dma_reserve) {
+ realsize -= dma_reserve;
+ printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
+ dma_reserve);
+ }
if (!is_highmem_idx(j))
nr_kernel_pages += realsize;
@@ -2011,6 +2408,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
zone->spanned_pages = size;
zone->present_pages = realsize;
#ifdef CONFIG_NUMA
+ zone->node = nid;
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
/ 100;
zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
@@ -2073,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
- if (pgdat == NODE_DATA(0))
+ if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+ mem_map -= pgdat->node_start_pfn;
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+ }
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
@@ -2085,13 +2488,255 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
{
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
- calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+ calculate_node_totalpages(pgdat, zones_size, zholes_size);
alloc_node_mem_map(pgdat);
free_area_init_core(pgdat, zones_size, zholes_size);
}
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/**
+ * add_active_range - Register a range of PFNs backed by physical memory
+ * @nid: The node ID the range resides on
+ * @start_pfn: The start PFN of the available physical memory
+ * @end_pfn: The end PFN of the available physical memory
+ *
+ * These ranges are stored in an early_node_map[] and later used by
+ * free_area_init_nodes() to calculate zone sizes and holes. If the
+ * range spans a memory hole, it is up to the architecture to ensure
+ * the memory is not freed by the bootmem allocator. If possible
+ * the range being registered will be merged with existing ranges.
+ *
+ * The new range is merged into the first entry of the same node that it
+ * overlaps or touches; otherwise it is appended. A range already covered
+ * by an entry is ignored. When the table already holds MAX_ACTIVE_REGIONS
+ * entries the range is dropped after a KERN_CRIT message.
+ */
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+						unsigned long end_pfn)
+{
+	int i;
+
+	printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
+			"%d entries of %d used\n",
+			nid, start_pfn, end_pfn,
+			nr_nodemap_entries, MAX_ACTIVE_REGIONS);
+
+	/* Merge with existing active regions if possible */
+	for (i = 0; i < nr_nodemap_entries; i++) {
+		if (early_node_map[i].nid != nid)
+			continue;
+
+		/* Skip if an existing region covers this new one */
+		if (start_pfn >= early_node_map[i].start_pfn &&
+				end_pfn <= early_node_map[i].end_pfn)
+			return;
+
+		/* Merge forward if suitable */
+		if (start_pfn <= early_node_map[i].end_pfn &&
+				end_pfn > early_node_map[i].end_pfn) {
+			/*
+			 * The new range may also start below the existing
+			 * entry (it is a superset of it). Grow downwards as
+			 * well so the leading pages are not silently lost.
+			 */
+			if (start_pfn < early_node_map[i].start_pfn)
+				early_node_map[i].start_pfn = start_pfn;
+			early_node_map[i].end_pfn = end_pfn;
+			return;
+		}
+
+		/*
+		 * Merge backward if suitable: the new range starts below the
+		 * entry and reaches at least its start.
+		 */
+		if (start_pfn < early_node_map[i].start_pfn &&
+				end_pfn >= early_node_map[i].start_pfn) {
+			early_node_map[i].start_pfn = start_pfn;
+			return;
+		}
+	}
+
+	/* Check that early_node_map is large enough (i == nr_nodemap_entries) */
+	if (i >= MAX_ACTIVE_REGIONS) {
+		printk(KERN_CRIT "More than %d memory regions, truncating\n",
+							MAX_ACTIVE_REGIONS);
+		return;
+	}
+
+	/* No merge candidate: append as a new entry */
+	early_node_map[i].nid = nid;
+	early_node_map[i].start_pfn = start_pfn;
+	early_node_map[i].end_pfn = end_pfn;
+	nr_nodemap_entries = i + 1;
+}
+
+/**
+ * shrink_active_range - Shrink an existing registered range of PFNs
+ * @nid: The node id the range is on that should be shrunk
+ * @old_end_pfn: The old end PFN of the range
+ * @new_end_pfn: The new end PFN of the range
+ *
+ * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
+ * The map is kept at the end physical page range that has already been
+ * registered with add_active_range(). This function allows an arch to shrink
+ * an existing registered range.
+ *
+ * Only the first entry visited for @nid whose end_pfn equals @old_end_pfn is
+ * changed. If no entry matches, nothing is modified and nothing is reported.
+ */
+void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+ unsigned long new_end_pfn)
+{
+ int i;
+
+ /* Find the old active region end and shrink */
+ for_each_active_range_index_in_nid(i, nid)
+ if (early_node_map[i].end_pfn == old_end_pfn) {
+ early_node_map[i].end_pfn = new_end_pfn;
+ break;
+ }
+}
+
+/**
+ * remove_all_active_ranges - Remove all currently registered regions
+ *
+ * During discovery, it may be found that a table like SRAT is invalid
+ * and an alternative discovery method must be used. This function removes
+ * all currently registered regions by zeroing early_node_map[] and
+ * resetting the entry count. With CONFIG_MEMORY_HOTPLUG_RESERVE the
+ * per-node boundary arrays are zeroed as well.
+ */
+void __init remove_all_active_ranges(void)
+{
+	memset(early_node_map, 0, sizeof(early_node_map));
+	nr_nodemap_entries = 0;
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+	memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
+	memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
+#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+}
+
+/*
+ * Compare two node_active_regions by start_pfn.
+ *
+ * Comparator for sort(): returns 1 if a starts above b, -1 if it starts
+ * below, and 0 when both have the same start_pfn. Only start_pfn is
+ * examined.
+ */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+	/* Keep the const qualifier: the comparator only reads the entries */
+	const struct node_active_region *arange = a;
+	const struct node_active_region *brange = b;
+
+	/* Done this way to avoid overflows */
+	if (arange->start_pfn > brange->start_pfn)
+		return 1;
+	if (arange->start_pfn < brange->start_pfn)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Sort the node_map by start_pfn.
+ *
+ * Sorts the first nr_nodemap_entries elements of early_node_map[] in place
+ * using cmp_node_active_region(). NULL is passed as sort()'s last argument.
+ */
+static void __init sort_node_map(void)
+{
+ sort(early_node_map, (size_t)nr_nodemap_entries,
+ sizeof(struct node_active_region),
+ cmp_node_active_region, NULL);
+}
+
+/*
+ * Find the lowest pfn for a node. This depends on a sorted early_node_map.
+ *
+ * Returns the start_pfn of the first entry that
+ * for_each_active_range_index_in_nid() yields for nid. If it yields none,
+ * a KERN_WARNING is printed and 0 is returned, so callers cannot tell
+ * "no range registered" from a range that really starts at PFN 0.
+ */
+unsigned long __init find_min_pfn_for_node(unsigned long nid)
+{
+ int i;
+
+ /* Assuming a sorted map, the first range found has the starting pfn */
+ for_each_active_range_index_in_nid(i, nid)
+ return early_node_map[i].start_pfn;
+
+ printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+ return 0;
+}
+
+/**
+ * find_min_pfn_with_active_regions - Find the minimum PFN registered
+ *
+ * It returns the minimum PFN based on information provided via
+ * add_active_range()
+ *
+ * The lookup is delegated to find_min_pfn_for_node() with MAX_NUMNODES as
+ * the node id, so the result is only the true minimum if early_node_map[]
+ * has already been sorted by start_pfn.
+ *
+ * NOTE(review): presumably for_each_active_range_index_in_nid() treats
+ * MAX_NUMNODES as "match any node" - confirm against the macro definition.
+ */
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+ return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+/**
+ * find_max_pfn_with_active_regions - Find the maximum PFN registered
+ *
+ * It returns the maximum PFN based on information provided via
+ * add_active_range()
+ *
+ * The value is the largest end_pfn over all nr_nodemap_entries entries of
+ * early_node_map[], regardless of node. Unlike the minimum lookup it does
+ * not depend on the map being sorted. Returns 0 when no range is registered.
+ */
+unsigned long __init find_max_pfn_with_active_regions(void)
+{
+ int i;
+ unsigned long max_pfn = 0;
+
+ for (i = 0; i < nr_nodemap_entries; i++)
+ max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+
+ return max_pfn;
+}
+
+/**
+ * free_area_init_nodes - Initialise all pg_data_t and zone data
+ * @max_zone_pfn: an array of max PFNs for each zone, indexed by zone
+ *
+ * This will call free_area_init_node() for each online node in the system.
+ * Using the page ranges provided by add_active_range(), the size of each
+ * zone in each node and their holes is calculated. If the maximum PFN
+ * between two adjacent zones match, it is assumed that the zone is empty.
+ * For example, if max_zone_pfn[ZONE_DMA] == max_zone_pfn[ZONE_DMA32], it is
+ * assumed that ZONE_DMA32 has no pages. It is also assumed that a zone
+ * starts where the previous one ended. For example, ZONE_DMA32 starts
+ * at max_zone_pfn[ZONE_DMA].
+ */
+void __init free_area_init_nodes(unsigned long *max_zone_pfn)
+{
+	unsigned long nid;
+	enum zone_type i;
+
+	/*
+	 * Sort early_node_map first. Regions can be registered in any order,
+	 * and find_min_pfn_with_active_regions() and find_min_pfn_for_node()
+	 * below both rely on the map being ordered by start_pfn.
+	 */
+	sort_node_map();
+
+	/* Record where the zone boundaries are */
+	memset(arch_zone_lowest_possible_pfn, 0,
+				sizeof(arch_zone_lowest_possible_pfn));
+	memset(arch_zone_highest_possible_pfn, 0,
+				sizeof(arch_zone_highest_possible_pfn));
+	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
+	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
+	for (i = 1; i < MAX_NR_ZONES; i++) {
+		/* Each zone starts where the previous one ended */
+		arch_zone_lowest_possible_pfn[i] =
+			arch_zone_highest_possible_pfn[i-1];
+		/* A max below the start yields an empty zone, not a negative one */
+		arch_zone_highest_possible_pfn[i] =
+			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
+	}
+
+	/* Print out the zone ranges */
+	printk("Zone PFN ranges:\n");
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		printk("  %-8s %8lu -> %8lu\n",
+				zone_names[i],
+				arch_zone_lowest_possible_pfn[i],
+				arch_zone_highest_possible_pfn[i]);
+
+	/* Print out the early_node_map[] */
+	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
+	for (i = 0; i < nr_nodemap_entries; i++)
+		printk("  %3d: %8lu -> %8lu\n", early_node_map[i].nid,
+						early_node_map[i].start_pfn,
+						early_node_map[i].end_pfn);
+
+	/* Initialise every node */
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		free_area_init_node(nid, pgdat, NULL,
+				find_min_pfn_for_node(nid), NULL);
+	}
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+/**
+ * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA
+ * @new_dma_reserve: The number of pages to mark reserved
+ *
+ * The per-cpu batchsize and zone watermarks are determined by present_pages.
+ * In the DMA zone, a significant percentage may be consumed by kernel image
+ * and other unfreeable allocations which can skew the watermarks badly. This
+ * function may optionally be used to account for unfreeable pages in
+ * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize
+ *
+ * The value is only stored in dma_reserve here; each call overwrites the
+ * previous value.
+ */
+void __init set_dma_reserve(unsigned long new_dma_reserve)
+{
+ dma_reserve = new_dma_reserve;
+}
+
#ifndef CONFIG_NEED_MULTIPLE_NODES
static bootmem_data_t contig_bootmem_data;
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
diff --git a/mm/shmem.c b/mm/shmem.c
index 8631be45b40..eda907c3a86 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1351,7 +1351,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
inode->i_mode = mode;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
- inode->i_blksize = PAGE_CACHE_SIZE;
inode->i_blocks = 0;
inode->i_mapping->a_ops = &shmem_aops;
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -2157,8 +2156,7 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
- if (kmem_cache_destroy(shmem_inode_cachep))
- printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
+ kmem_cache_destroy(shmem_inode_cachep);
}
static const struct address_space_operations shmem_aops = {
diff --git a/mm/slab.c b/mm/slab.c
index 7a48eb1a60c..792bfe320a8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -972,7 +972,39 @@ static int transfer_objects(struct array_cache *to,
return nr;
}
-#ifdef CONFIG_NUMA
+#ifndef CONFIG_NUMA
+
+/*
+ * !CONFIG_NUMA stand-ins for the alien-cache and node-aware allocation
+ * helpers. They let the callers compile unchanged: the macros expand to
+ * empty statements and the functions do no work.
+ */
+#define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
+
+/*
+ * Returns the BAD_ALIEN_MAGIC constant cast to a pointer rather than a real
+ * array; node and limit are ignored.
+ * NOTE(review): presumably a poison value that must never be dereferenced -
+ * confirm against the BAD_ALIEN_MAGIC definition.
+ */
+static inline struct array_cache **alloc_alien_cache(int node, int limit)
+{
+ return (struct array_cache **)BAD_ALIEN_MAGIC;
+}
+
+/* Nothing was allocated by alloc_alien_cache(), so nothing is freed */
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
+
+/* Always returns 0 */
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+ return 0;
+}
+
+/* Always returns NULL */
+static inline void *alternate_node_alloc(struct kmem_cache *cachep,
+ gfp_t flags)
+{
+ return NULL;
+}
+
+/* Always returns NULL */
+static inline void *__cache_alloc_node(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid)
+{
+ return NULL;
+}
+
+#else /* CONFIG_NUMA */
+
static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
@@ -1101,26 +1133,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
}
return 1;
}
-
-#else
-
-#define drain_alien_cache(cachep, alien) do { } while (0)
-#define reap_alien(cachep, l3) do { } while (0)
-
-static inline struct array_cache **alloc_alien_cache(int node, int limit)
-{
- return (struct array_cache **)BAD_ALIEN_MAGIC;
-}
-
-static inline void free_alien_cache(struct array_cache **ac_ptr)
-{
-}
-
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
-{
- return 0;
-}
-
#endif
static int __cpuinit cpuup_callback(struct notifier_block *nfb,
@@ -1564,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
*/
flags |= __GFP_COMP;
#endif
- flags |= cachep->gfpflags;
+
+ /*
+ * Under NUMA we want memory on the indicated node. We will handle
+ * the needed fallback ourselves since we want to serve from our
+ * per node object lists first for other nodes.
+ */
+ flags |= cachep->gfpflags | GFP_THISNODE;
page = alloc_pages_node(nodeid, flags, cachep->gfporder);
if (!page)
@@ -2442,7 +2460,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
* @cachep: the cache to destroy
*
* Remove a struct kmem_cache object from the slab cache.
- * Returns 0 on success.
*
* It is expected this function will be called by a module when it is
* unloaded. This will remove the cache completely, and avoid a duplicate
@@ -2454,7 +2471,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
* The caller must guarantee that no one will allocate memory from the cache
* during the kmem_cache_destroy().
*/
-int kmem_cache_destroy(struct kmem_cache *cachep)
+void kmem_cache_destroy(struct kmem_cache *cachep)
{
BUG_ON(!cachep || in_interrupt());
@@ -2475,7 +2492,7 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
list_add(&cachep->next, &cache_chain);
mutex_unlock(&cache_chain_mutex);
unlock_cpu_hotplug();
- return 1;
+ return;
}
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
@@ -2483,7 +2500,6 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
__kmem_cache_destroy(cachep);
unlock_cpu_hotplug();
- return 0;
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -3030,14 +3046,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
void *objp;
struct array_cache *ac;
-#ifdef CONFIG_NUMA
- if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
- objp = alternate_node_alloc(cachep, flags);
- if (objp != NULL)
- return objp;
- }
-#endif
-
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
@@ -3055,12 +3063,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
gfp_t flags, void *caller)
{
unsigned long save_flags;
- void *objp;
+ void *objp = NULL;
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
- objp = ____cache_alloc(cachep, flags);
+
+ if (unlikely(NUMA_BUILD &&
+ current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
+ objp = alternate_node_alloc(cachep, flags);
+
+ if (!objp)
+ objp = ____cache_alloc(cachep, flags);
+ /*
+ * We may just have run out of memory on the local node.
+ * __cache_alloc_node() knows how to locate memory on other nodes
+ */
+ if (NUMA_BUILD && !objp)
+ objp = __cache_alloc_node(cachep, flags, numa_node_id());
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp,
caller);
@@ -3079,7 +3099,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
{
int nid_alloc, nid_here;
- if (in_interrupt())
+ if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
nid_alloc = nid_here = numa_node_id();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
@@ -3092,6 +3112,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
}
/*
+ * Fallback function if there was no memory available and no objects on a
+ * certain node and we are allowed to fall back. We mimic the behavior of
+ * the page allocator. We fall back according to a zonelist determined by
+ * the policy layer while obeying cpuset constraints.
+ *
+ * Walks the zonelist selected by slab_node(current->mempolicy) and
+ * gfp_zone(flags) and returns the first object obtained from
+ * __cache_alloc_node(), or NULL if every candidate zone fails. Zones above
+ * ZONE_NORMAL and zones rejected by cpuset_zone_allowed() are skipped.
+ */
+void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
+{
+ struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
+ ->node_zonelists[gfp_zone(flags)];
+ struct zone **z;
+ void *obj = NULL;
+
+ /*
+ * __GFP_THISNODE is added for each attempt so that a failing
+ * __cache_alloc_node() returns NULL instead of calling back into
+ * fallback_alloc().
+ */
+ for (z = zonelist->zones; *z && !obj; z++)
+ if (zone_idx(*z) <= ZONE_NORMAL &&
+ cpuset_zone_allowed(*z, flags))
+ obj = __cache_alloc_node(cache,
+ flags | __GFP_THISNODE,
+ zone_to_nid(*z));
+ return obj;
+}
+
+/*
* An interface to enable slab creation on nodeid
*/
static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
@@ -3144,11 +3186,15 @@ retry:
must_grow:
spin_unlock(&l3->list_lock);
x = cache_grow(cachep, flags, nodeid);
+ if (x)
+ goto retry;
- if (!x)
- return NULL;
+ if (!(flags & __GFP_THISNODE))
+ /* Unable to grow the cache. Fall back to other nodes. */
+ return fallback_alloc(cachep, flags);
+
+ return NULL;
- goto retry;
done:
return obj;
}
diff --git a/mm/slob.c b/mm/slob.c
index 20188627347..542394184a5 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -270,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
}
EXPORT_SYMBOL(kmem_cache_create);
-int kmem_cache_destroy(struct kmem_cache *c)
+void kmem_cache_destroy(struct kmem_cache *c)
{
slob_free(c, sizeof(struct kmem_cache));
- return 0;
}
EXPORT_SYMBOL(kmem_cache_destroy);
diff --git a/mm/truncate.c b/mm/truncate.c
index c6ab55ec688..a654928323d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
@@ -52,36 +53,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
/*
* This is for invalidate_inode_pages(). That function can be called at
* any time, and is not supposed to throw away dirty pages. But pages can
- * be marked dirty at any time too. So we re-check the dirtiness inside
- * ->tree_lock. That provides exclusion against the __set_page_dirty
- * functions.
+ * be marked dirty at any time too, so use remove_mapping which safely
+ * discards clean, unused pages.
*
* Returns non-zero if the page was successfully invalidated.
*/
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
+ int ret;
+
if (page->mapping != mapping)
return 0;
if (PagePrivate(page) && !try_to_release_page(page, 0))
return 0;
- write_lock_irq(&mapping->tree_lock);
- if (PageDirty(page))
- goto failed;
- if (page_count(page) != 2) /* caller's ref + pagecache ref */
- goto failed;
-
- BUG_ON(PagePrivate(page));
- __remove_from_page_cache(page);
- write_unlock_irq(&mapping->tree_lock);
+ ret = remove_mapping(mapping, page);
+ /*
+ * Only a page that really left the page cache may lose its uptodate
+ * bit. If remove_mapping() refused (e.g. someone else still holds a
+ * reference) the page stays in the mapping and must keep its state.
+ */
+ if (ret)
ClearPageUptodate(page);
- page_cache_release(page); /* pagecache ref */
- return 1;
-failed:
- write_unlock_irq(&mapping->tree_lock);
- return 0;
+
+ return ret;
}
/**
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9aad8b0cc6e..1ac191ce564 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -241,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
/**
* get_vm_area - reserve a contiguous kernel virtual area
- *
* @size: size of the area
* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
*
@@ -273,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr)
}
/* Caller must hold vmlist_lock */
-struct vm_struct *__remove_vm_area(void *addr)
+static struct vm_struct *__remove_vm_area(void *addr)
{
struct vm_struct **p, *tmp;
@@ -296,7 +295,6 @@ found:
/**
* remove_vm_area - find and remove a contiguous kernel virtual area
- *
* @addr: base address
*
* Search for the kernel VM area starting at @addr, and remove it.
@@ -355,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages)
/**
* vfree - release memory allocated by vmalloc()
- *
* @addr: memory base address
*
* Free the virtually contiguous memory area starting at @addr, as
@@ -373,7 +370,6 @@ EXPORT_SYMBOL(vfree);
/**
* vunmap - release virtual mapping obtained by vmap()
- *
* @addr: memory base address
*
* Free the virtually contiguous memory area starting at @addr,
@@ -390,7 +386,6 @@ EXPORT_SYMBOL(vunmap);
/**
* vmap - map an array of pages into virtually contiguous space
- *
* @pages: array of page pointers
* @count: number of pages to map
* @flags: vm_area->flags
@@ -471,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
/**
* __vmalloc_node - allocate virtually contiguous memory
- *
* @size: allocation size
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
@@ -505,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc);
/**
* vmalloc - allocate virtually contiguous memory
- *
* @size: allocation size
- *
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
@@ -521,11 +513,11 @@ void *vmalloc(unsigned long size)
EXPORT_SYMBOL(vmalloc);
/**
- * vmalloc_user - allocate virtually contiguous memory which has
- * been zeroed so it can be mapped to userspace without
- * leaking data.
+ * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
+ * @size: allocation size
*
- * @size: allocation size
+ * The resulting memory area is zeroed so it can be mapped to userspace
+ * without leaking data.
*/
void *vmalloc_user(unsigned long size)
{
@@ -544,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user);
/**
* vmalloc_node - allocate memory on a specific node
- *
* @size: allocation size
* @node: numa node
*
@@ -566,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node);
/**
* vmalloc_exec - allocate virtually contiguous, executable memory
- *
* @size: allocation size
*
* Kernel-internal function to allocate enough pages to cover @size
@@ -584,7 +574,6 @@ void *vmalloc_exec(unsigned long size)
/**
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
- *
* @size: allocation size
*
* Allocate enough 32bit PA addressable pages to cover @size from the
@@ -597,11 +586,11 @@ void *vmalloc_32(unsigned long size)
EXPORT_SYMBOL(vmalloc_32);
/**
- * vmalloc_32_user - allocate virtually contiguous memory (32bit
- * addressable) which is zeroed so it can be
- * mapped to userspace without leaking data.
- *
+ * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
* @size: allocation size
+ *
+ * The resulting memory area is 32bit addressable and zeroed so it can be
+ * mapped to userspace without leaking data.
*/
void *vmalloc_32_user(unsigned long size)
{
@@ -695,7 +684,6 @@ finished:
/**
* remap_vmalloc_range - map vmalloc pages to userspace
- *
* @vma: vma to cover (map full range of vma)
* @addr: vmalloc memory
* @pgoff: number of pages into addr before first page to map
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 87779dda4ec..eca70310adb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
+#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
@@ -370,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
-
+ inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -383,11 +384,30 @@ int remove_mapping(struct address_space *mapping, struct page *page)
BUG_ON(mapping != page_mapping(page));
write_lock_irq(&mapping->tree_lock);
-
/*
- * The non-racy check for busy page. It is critical to check
- * PageDirty _after_ making sure that the page is freeable and
- * not in use by anybody. (pagecache + us == 2)
+ * The non racy check for a busy page.
+ *
+ * Must be careful with the order of the tests. When someone has
+ * a ref to the page, it may be possible that they dirty it then
+ * drop the reference. So if PageDirty is tested before page_count
+ * here, then the following race may occur:
+ *
+ * get_user_pages(&page);
+ * [user mapping goes away]
+ * write_to(page);
+ * !PageDirty(page) [good]
+ * SetPageDirty(page);
+ * put_page(page);
+ * !page_count(page) [good, discard it]
+ *
+ * [oops, our write_to data is lost]
+ *
+ * Reversing the order of the tests ensures such a situation cannot
+ * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+ * load is not satisfied before that of page->_count.
+ *
+ * Note that if SetPageDirty is always performed via set_page_dirty,
+ * and thus under tree_lock, then this ordering is not required.
*/
if (unlikely(page_count(page) != 2))
goto cannot_free;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 490d8c1a0de..a2b6a9f96e5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -371,7 +371,7 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z)
__inc_zone_state(z, NUMA_MISS);
__inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
}
- if (z->zone_pgdat == NODE_DATA(numa_node_id()))
+ if (z->node == numa_node_id())
__inc_zone_state(z, NUMA_LOCAL);
else
__inc_zone_state(z, NUMA_OTHER);
@@ -465,6 +465,7 @@ static char *vmstat_text[] = {
"nr_writeback",
"nr_unstable",
"nr_bounce",
+ "nr_vmscan_write",
#ifdef CONFIG_NUMA
"numa_hit",