aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/mempolicy.c84
-rw-r--r--mm/mempool.c4
-rw-r--r--mm/mmap.c3
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/oom_kill.c64
-rw-r--r--mm/page_alloc.c4
-rw-r--r--mm/percpu.c35
-rw-r--r--mm/rmap.c1
-rw-r--r--mm/vmscan.c9
10 files changed, 136 insertions, 81 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c948d4ca8bd..fe5f674d7a7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR
For most ia64, ppc64 and x86 users with lots of address space
a value of 65536 is reasonable and should cause no problems.
On arm and other archs it should not be higher than 32768.
- Programs which use vm86 functionality would either need additional
- permissions from either the LSM or the capabilities module or have
- this protection disabled.
+ Programs which use vm86 functionality or have some need to map
+ this low address space will need CAP_SYS_RAWIO or disable this
+ protection by setting the value to 0.
This value can be changed after boot using the
/proc/sys/vm/mmap_min_addr tunable.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e08e2c4da63..7dd9d9f8069 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
* Must be called holding task's alloc_lock to protect task's mems_allowed
* and mempolicy. May also be called holding the mmap_semaphore for write.
*/
-static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_set_nodemask(struct mempolicy *pol,
+ const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
- nodemask_t cpuset_context_nmask;
int ret;
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
if (pol == NULL)
return 0;
+ /* Check N_HIGH_MEMORY */
+ nodes_and(nsc->mask1,
+ cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
VM_BUG_ON(!nodes);
if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
nodes = NULL; /* explicit local allocation */
else {
if (pol->flags & MPOL_F_RELATIVE_NODES)
- mpol_relative_nodemask(&cpuset_context_nmask, nodes,
- &cpuset_current_mems_allowed);
+ mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
else
- nodes_and(cpuset_context_nmask, *nodes,
- cpuset_current_mems_allowed);
+ nodes_and(nsc->mask2, *nodes, nsc->mask1);
+
if (mpol_store_user_nodemask(pol))
pol->w.user_nodemask = *nodes;
else
@@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
cpuset_current_mems_allowed;
}
- ret = mpol_ops[pol->mode].create(pol,
- nodes ? &cpuset_context_nmask : NULL);
+ if (nodes)
+ ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+ else
+ ret = mpol_ops[pol->mode].create(pol, NULL);
return ret;
}
@@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
{
struct mempolicy *new, *old;
struct mm_struct *mm = current->mm;
+ NODEMASK_SCRATCH(scratch);
int ret;
- new = mpol_new(mode, flags, nodes);
- if (IS_ERR(new))
- return PTR_ERR(new);
+ if (!scratch)
+ return -ENOMEM;
+ new = mpol_new(mode, flags, nodes);
+ if (IS_ERR(new)) {
+ ret = PTR_ERR(new);
+ goto out;
+ }
/*
* prevent changing our mempolicy while show_numa_maps()
* is using it.
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
if (mm)
down_write(&mm->mmap_sem);
task_lock(current);
- ret = mpol_set_nodemask(new, nodes);
+ ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
task_unlock(current);
if (mm)
up_write(&mm->mmap_sem);
mpol_put(new);
- return ret;
+ goto out;
}
old = current->mempolicy;
current->mempolicy = new;
@@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
up_write(&mm->mmap_sem);
mpol_put(old);
- return 0;
+ ret = 0;
+out:
+ NODEMASK_SCRATCH_FREE(scratch);
+ return ret;
}
/*
@@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
return err;
}
- down_write(&mm->mmap_sem);
- task_lock(current);
- err = mpol_set_nodemask(new, nmask);
- task_unlock(current);
+ {
+ NODEMASK_SCRATCH(scratch);
+ if (scratch) {
+ down_write(&mm->mmap_sem);
+ task_lock(current);
+ err = mpol_set_nodemask(new, nmask, scratch);
+ task_unlock(current);
+ if (err)
+ up_write(&mm->mmap_sem);
+ } else
+ err = -ENOMEM;
+ NODEMASK_SCRATCH_FREE(scratch);
+ }
if (err) {
- up_write(&mm->mmap_sem);
mpol_put(new);
return err;
}
@@ -1891,6 +1911,7 @@ restart:
* Install non-NULL @mpol in inode's shared policy rb-tree.
* On entry, the current task has a reference on a non-NULL @mpol.
* This must be released on exit.
+ * This is called at get_inode() calls and we can use GFP_KERNEL.
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
@@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
if (mpol) {
struct vm_area_struct pvma;
struct mempolicy *new;
+ NODEMASK_SCRATCH(scratch);
+ if (!scratch)
+ return;
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
if (IS_ERR(new)) {
mpol_put(mpol); /* drop our ref on sb mpol */
+ NODEMASK_SCRATCH_FREE(scratch);
return; /* no valid nodemask intersection */
}
task_lock(current);
- ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
+ ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
task_unlock(current);
mpol_put(mpol); /* drop our ref on sb mpol */
if (ret) {
+ NODEMASK_SCRATCH_FREE(scratch);
mpol_put(new);
return;
}
@@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
mpol_put(new); /* drop initial ref */
+ NODEMASK_SCRATCH_FREE(scratch);
}
}
@@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
err = 1;
else {
int ret;
-
- task_lock(current);
- ret = mpol_set_nodemask(new, &nodes);
- task_unlock(current);
- if (ret)
+ NODEMASK_SCRATCH(scratch);
+ if (scratch) {
+ task_lock(current);
+ ret = mpol_set_nodemask(new, &nodes, scratch);
+ task_unlock(current);
+ } else
+ ret = -ENOMEM;
+ NODEMASK_SCRATCH_FREE(scratch);
+ if (ret) {
err = 1;
- else if (no_context) {
+ mpol_put(new);
+ } else if (no_context) {
/* save for contextualization */
new->w.user_nodemask = nodes;
}
diff --git a/mm/mempool.c b/mm/mempool.c
index a46eb1b4bb6..32e75d40050 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab);
*/
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
{
- size_t size = (size_t)(long)pool_data;
+ size_t size = (size_t)pool_data;
return kmalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kmalloc);
void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
{
- size_t size = (size_t) pool_data;
+ size_t size = (size_t)pool_data;
return kzalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kzalloc);
diff --git a/mm/mmap.c b/mm/mmap.c
index 34579b23ebd..8101de490c7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
struct percpu_counter vm_committed_as;
-/* amount of vm to protect from userspace access */
-unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
-
/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 0 means there is enough memory for the allocation to
diff --git a/mm/nommu.c b/mm/nommu.c
index 53cab10fece..4bde489ec43 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
int heap_stack_gap = 0;
-/* amount of vm to protect from userspace access */
-unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
-
atomic_long_t mmap_pages_allocated;
EXPORT_SYMBOL(mem_map);
@@ -922,6 +919,10 @@ static int validate_mmap_request(struct file *file,
if (!file->f_op->read)
capabilities &= ~BDI_CAP_MAP_COPY;
+ /* The file shall have been opened with read permission. */
+ if (!(file->f_mode & FMODE_READ))
+ return -EACCES;
+
if (flags & MAP_SHARED) {
/* do checks for writing, appending and locking */
if ((prot & PROT_WRITE) &&
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 175a67a78a9..a7b2460e922 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
unsigned long points, cpu_time, run_time;
struct mm_struct *mm;
struct task_struct *child;
- int oom_adj;
task_lock(p);
mm = p->mm;
@@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
task_unlock(p);
return 0;
}
- oom_adj = mm->oom_adj;
- if (oom_adj == OOM_DISABLE) {
- task_unlock(p);
- return 0;
- }
/*
* The memory size of the process is the basis for the badness.
@@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
points /= 8;
/*
- * Adjust the score by oom_adj.
+ * Adjust the score by oomkilladj.
*/
- if (oom_adj) {
- if (oom_adj > 0) {
+ if (p->oomkilladj) {
+ if (p->oomkilladj > 0) {
if (!points)
points = 1;
- points <<= oom_adj;
+ points <<= p->oomkilladj;
} else
- points >>= -(oom_adj);
+ points >>= -(p->oomkilladj);
}
#ifdef DEBUG
@@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
*ppoints = ULONG_MAX;
}
+ if (p->oomkilladj == OOM_DISABLE)
+ continue;
+
points = badness(p, uptime.tv_sec);
- if (points > *ppoints) {
+ if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
}
@@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
}
printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
- get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
+ get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
+ p->comm);
task_unlock(p);
} while_each_thread(g, p);
}
@@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
return;
}
- if (!p->mm)
+ if (!p->mm) {
+ WARN_ON(1);
+ printk(KERN_WARNING "tried to kill an mm-less task!\n");
return;
+ }
if (verbose)
printk(KERN_ERR "Killed process %d (%s)\n",
@@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p)
struct mm_struct *mm;
struct task_struct *g, *q;
- task_lock(p);
mm = p->mm;
- if (!mm || mm->oom_adj == OOM_DISABLE) {
- task_unlock(p);
+
+ /* WARNING: mm may not be dereferenced since we did not obtain its
+ * value from get_task_mm(p). This is OK since all we need to do is
+ * compare mm to q->mm below.
+ *
+ * Furthermore, even if mm contains a non-NULL value, p->mm may
+ * change to NULL at any time since we do not hold task_lock(p).
+ * However, this is of no concern to us.
+ */
+
+ if (mm == NULL)
return 1;
- }
- task_unlock(p);
+
+ /*
+ * Don't kill the process if any threads are set to OOM_DISABLE
+ */
+ do_each_thread(g, q) {
+ if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
+ return 1;
+ } while_each_thread(g, q);
+
__oom_kill_task(p, 1);
/*
@@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
struct task_struct *c;
if (printk_ratelimit()) {
- task_lock(current);
printk(KERN_WARNING "%s invoked oom-killer: "
- "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
- current->comm, gfp_mask, order,
- current->mm ? current->mm->oom_adj : OOM_DISABLE);
+ "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+ current->comm, gfp_mask, order, current->oomkilladj);
+ task_lock(current);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
dump_stack();
@@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
- * if its mm is still attached.
*/
- if (p->mm && (p->flags & PF_EXITING)) {
+ if (p->flags & PF_EXITING) {
__oom_kill_task(p, 0);
return 0;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d052abbe306..5cc986eb9f6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2544,7 +2544,6 @@ static void build_zonelists(pg_data_t *pgdat)
prev_node = local_node;
nodes_clear(used_mask);
- memset(node_load, 0, sizeof(node_load));
memset(node_order, 0, sizeof(node_order));
j = 0;
@@ -2653,6 +2652,9 @@ static int __build_all_zonelists(void *dummy)
{
int nid;
+#ifdef CONFIG_NUMA
+ memset(node_load, 0, sizeof(node_load));
+#endif
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
diff --git a/mm/percpu.c b/mm/percpu.c
index b70f2acd885..5fe37842e0e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,12 @@
*
* This is percpu allocator which can handle both static and dynamic
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
- * chunk is consisted of num_possible_cpus() units and the first chunk
- * is used for static percpu variables in the kernel image (special
- * boot time alloc/init handling necessary as these areas need to be
- * brought up before allocation services are running). Unit grows as
- * necessary and all units grow or shrink in unison. When a chunk is
- * filled up, another chunk is allocated. ie. in vmalloc area
+ * chunk is consisted of nr_cpu_ids units and the first chunk is used
+ * for static percpu variables in the kernel image (special boot time
+ * alloc/init handling necessary as these areas need to be brought up
+ * before allocation services are running). Unit grows as necessary
+ * and all units grow or shrink in unison. When a chunk is filled up,
+ * another chunk is allocated. ie. in vmalloc area
*
* c0 c1 c2
* ------------------- ------------------- ------------
@@ -558,7 +558,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
bool flush_tlb)
{
- unsigned int last = num_possible_cpus() - 1;
+ unsigned int last = nr_cpu_ids - 1;
unsigned int cpu;
/* unmap must not be done on immutable chunk */
@@ -643,7 +643,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
*/
static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
{
- unsigned int last = num_possible_cpus() - 1;
+ unsigned int last = nr_cpu_ids - 1;
unsigned int cpu;
int err;
@@ -749,7 +749,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
chunk->map[chunk->map_used++] = pcpu_unit_size;
chunk->page = chunk->page_ar;
- chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+ chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
@@ -1067,9 +1067,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
PFN_UP(size_sum));
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
- pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+ pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
- + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
+ + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
if (dyn_size < 0)
dyn_size = pcpu_unit_size - static_size - reserved_size;
@@ -1248,7 +1248,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
} else
pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
- chunk_size = pcpue_unit_size * num_possible_cpus();
+ chunk_size = pcpue_unit_size * nr_cpu_ids;
pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
__pa(MAX_DMA_ADDRESS));
@@ -1259,12 +1259,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
}
/* return the leftover and copy */
- for_each_possible_cpu(cpu) {
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
- free_bootmem(__pa(ptr + pcpue_size),
- pcpue_unit_size - pcpue_size);
- memcpy(ptr, __per_cpu_load, static_size);
+ if (cpu_possible(cpu)) {
+ free_bootmem(__pa(ptr + pcpue_size),
+ pcpue_unit_size - pcpue_size);
+ memcpy(ptr, __per_cpu_load, static_size);
+ } else
+ free_bootmem(__pa(ptr), pcpue_unit_size);
}
/* we're ready, commit */
diff --git a/mm/rmap.c b/mm/rmap.c
index 836c6c63e1f..0895b5c7cbf 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -358,6 +358,7 @@ static int page_referenced_one(struct page *page,
*/
if (vma->vm_flags & VM_LOCKED) {
*mapcount = 1; /* break early from loop */
+ *vm_flags |= VM_LOCKED;
goto out_unmap;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dea7abd3109..94e86dd6954 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -630,9 +630,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
referenced = page_referenced(page, 1,
sc->mem_cgroup, &vm_flags);
- /* In active use or really unfreeable? Activate it. */
+ /*
+ * In active use or really unfreeable? Activate it.
+ * If page which have PG_mlocked lost isoltation race,
+ * try_to_unmap moves it to unevictable list
+ */
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
- referenced && page_mapping_inuse(page))
+ referenced && page_mapping_inuse(page)
+ && !(vm_flags & VM_LOCKED))
goto activate_locked;
/*