aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c219
-rw-r--r--mm/hugetlb.c8
-rw-r--r--mm/memcontrol.c190
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/migrate.c9
-rw-r--r--mm/mmap.c24
-rw-r--r--mm/nommu.c23
-rw-r--r--mm/page-writeback.c77
-rw-r--r--mm/page_alloc.c47
-rw-r--r--mm/readahead.c8
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/slab.c14
-rw-r--r--mm/slob.c3
-rw-r--r--mm/slub.c537
-rw-r--r--mm/sparse.c19
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/swapfile.c6
-rw-r--r--mm/vmalloc.c3
-rw-r--r--mm/vmscan.c11
-rw-r--r--mm/vmstat.c5
20 files changed, 838 insertions, 371 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e8644b1e552..7c4f9e09709 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -4,12 +4,229 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/device.h>
+
+
+static struct class *bdi_class;
+
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bdi_debug_root;
+
+static void bdi_debug_init(void)
+{
+ bdi_debug_root = debugfs_create_dir("bdi", NULL);
+}
+
+static int bdi_debug_stats_show(struct seq_file *m, void *v)
+{
+ struct backing_dev_info *bdi = m->private;
+ long background_thresh;
+ long dirty_thresh;
+ long bdi_thresh;
+
+ get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+ seq_printf(m,
+ "BdiWriteback: %8lu kB\n"
+ "BdiReclaimable: %8lu kB\n"
+ "BdiDirtyThresh: %8lu kB\n"
+ "DirtyThresh: %8lu kB\n"
+ "BackgroundThresh: %8lu kB\n",
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
+ (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+ K(bdi_thresh),
+ K(dirty_thresh),
+ K(background_thresh));
+#undef K
+
+ return 0;
+}
+
+static int bdi_debug_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, bdi_debug_stats_show, inode->i_private);
+}
+
+static const struct file_operations bdi_debug_stats_fops = {
+ .open = bdi_debug_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+{
+ bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
+ bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
+ bdi, &bdi_debug_stats_fops);
+}
+
+static void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+ debugfs_remove(bdi->debug_stats);
+ debugfs_remove(bdi->debug_dir);
+}
+#else
+static inline void bdi_debug_init(void)
+{
+}
+static inline void bdi_debug_register(struct backing_dev_info *bdi,
+ const char *name)
+{
+}
+static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+}
+#endif
+
+static ssize_t read_ahead_kb_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ char *end;
+ unsigned long read_ahead_kb;
+ ssize_t ret = -EINVAL;
+
+ read_ahead_kb = simple_strtoul(buf, &end, 10);
+ if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+ bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
+ ret = count;
+ }
+ return ret;
+}
+
+#define K(pages) ((pages) << (PAGE_SHIFT - 10))
+
+#define BDI_SHOW(name, expr) \
+static ssize_t name##_show(struct device *dev, \
+ struct device_attribute *attr, char *page) \
+{ \
+ struct backing_dev_info *bdi = dev_get_drvdata(dev); \
+ \
+ return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
+}
+
+BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
+
+static ssize_t min_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ char *end;
+ unsigned int ratio;
+ ssize_t ret = -EINVAL;
+
+ ratio = simple_strtoul(buf, &end, 10);
+ if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+ ret = bdi_set_min_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
+ }
+ return ret;
+}
+BDI_SHOW(min_ratio, bdi->min_ratio)
+
+static ssize_t max_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ char *end;
+ unsigned int ratio;
+ ssize_t ret = -EINVAL;
+
+ ratio = simple_strtoul(buf, &end, 10);
+ if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+ ret = bdi_set_max_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
+ }
+ return ret;
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
+
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
+static struct device_attribute bdi_dev_attrs[] = {
+ __ATTR_RW(read_ahead_kb),
+ __ATTR_RW(min_ratio),
+ __ATTR_RW(max_ratio),
+ __ATTR_NULL,
+};
+
+static __init int bdi_class_init(void)
+{
+ bdi_class = class_create(THIS_MODULE, "bdi");
+ bdi_class->dev_attrs = bdi_dev_attrs;
+ bdi_debug_init();
+ return 0;
+}
+
+postcore_initcall(bdi_class_init);
+
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+ const char *fmt, ...)
+{
+ char *name;
+ va_list args;
+ int ret = 0;
+ struct device *dev;
+
+ va_start(args, fmt);
+ name = kvasprintf(GFP_KERNEL, fmt, args);
+ va_end(args);
+
+ if (!name)
+ return -ENOMEM;
+
+ dev = device_create(bdi_class, parent, MKDEV(0, 0), name);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto exit;
+ }
+
+ bdi->dev = dev;
+ dev_set_drvdata(bdi->dev, bdi);
+ bdi_debug_register(bdi, name);
+
+exit:
+ kfree(name);
+ return ret;
+}
+EXPORT_SYMBOL(bdi_register);
+
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+{
+ return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
+
+void bdi_unregister(struct backing_dev_info *bdi)
+{
+ if (bdi->dev) {
+ bdi_debug_unregister(bdi);
+ device_unregister(bdi->dev);
+ bdi->dev = NULL;
+ }
+}
+EXPORT_SYMBOL(bdi_unregister);
int bdi_init(struct backing_dev_info *bdi)
{
int i;
int err;
+ bdi->dev = NULL;
+
+ bdi->min_ratio = 0;
+ bdi->max_ratio = 100;
+ bdi->max_prop_frac = PROP_FRAC_BASE;
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
if (err)
@@ -33,6 +250,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
int i;
+ bdi_unregister(bdi);
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index df28c1773fb..bbf953eeb58 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -199,12 +199,13 @@ static struct page *alloc_fresh_huge_page_node(int nid)
struct page *page;
page = alloc_pages_node(nid,
- htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
+ htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ __GFP_REPEAT|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
if (page) {
if (arch_prepare_hugepage(page)) {
__free_pages(page, HUGETLB_PAGE_ORDER);
- return 0;
+ return NULL;
}
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
@@ -294,7 +295,8 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
}
spin_unlock(&hugetlb_lock);
- page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
+ page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+ __GFP_REPEAT|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
spin_lock(&hugetlb_lock);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2e0bfc93484..e46451e1d9b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -26,15 +26,18 @@
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys;
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
+static struct kmem_cache *page_cgroup_cache;
/*
* Statistics for memory cgroup.
@@ -45,6 +48,8 @@ enum mem_cgroup_stat_index {
*/
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */
+ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
+ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_NSTATS,
};
@@ -196,6 +201,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
else
__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
+
+ if (charge)
+ __mem_cgroup_stat_add_safe(stat,
+ MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
+ else
+ __mem_cgroup_stat_add_safe(stat,
+ MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
}
static struct mem_cgroup_per_zone *
@@ -236,26 +248,12 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
css);
}
-static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
struct mem_cgroup, css);
}
-void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
-{
- struct mem_cgroup *mem;
-
- mem = mem_cgroup_from_task(p);
- css_get(&mem->css);
- mm->mem_cgroup = mem;
-}
-
-void mm_free_cgroup(struct mm_struct *mm)
-{
- css_put(&mm->mem_cgroup->css);
-}
-
static inline int page_cgroup_locked(struct page *page)
{
return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
@@ -287,10 +285,10 @@ static void unlock_page_cgroup(struct page *page)
bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
-static void __mem_cgroup_remove_list(struct page_cgroup *pc)
+static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
+ struct page_cgroup *pc)
{
int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
- struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
if (from)
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
@@ -301,10 +299,10 @@ static void __mem_cgroup_remove_list(struct page_cgroup *pc)
list_del_init(&pc->lru);
}
-static void __mem_cgroup_add_list(struct page_cgroup *pc)
+static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
+ struct page_cgroup *pc)
{
int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
- struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
if (!to) {
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
@@ -476,6 +474,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
+ BUG_ON(!mem_cont);
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
if (active)
src = &mz->active_list;
@@ -560,7 +559,7 @@ retry:
}
unlock_page_cgroup(page);
- pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
+ pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
if (pc == NULL)
goto err;
@@ -574,7 +573,7 @@ retry:
mm = &init_mm;
rcu_read_lock();
- mem = rcu_dereference(mm->mem_cgroup);
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
/*
* For every charge from the cgroup, increment reference count
*/
@@ -602,7 +601,6 @@ retry:
mem_cgroup_out_of_memory(mem, gfp_mask);
goto out;
}
- congestion_wait(WRITE, HZ/10);
}
pc->ref_cnt = 1;
@@ -610,7 +608,7 @@ retry:
pc->page = page;
pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
- pc->flags |= PAGE_CGROUP_FLAG_CACHE;
+ pc->flags = PAGE_CGROUP_FLAG_CACHE;
lock_page_cgroup(page);
if (page_get_page_cgroup(page)) {
@@ -622,14 +620,14 @@ retry:
*/
res_counter_uncharge(&mem->res, PAGE_SIZE);
css_put(&mem->css);
- kfree(pc);
+ kmem_cache_free(page_cgroup_cache, pc);
goto retry;
}
page_assign_page_cgroup(page, pc);
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_add_list(pc);
+ __mem_cgroup_add_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(page);
@@ -637,7 +635,7 @@ done:
return 0;
out:
css_put(&mem->css);
- kfree(pc);
+ kmem_cache_free(page_cgroup_cache, pc);
err:
return -ENOMEM;
}
@@ -685,7 +683,7 @@ void mem_cgroup_uncharge_page(struct page *page)
if (--(pc->ref_cnt) == 0) {
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_remove_list(pc);
+ __mem_cgroup_remove_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
page_assign_page_cgroup(page, NULL);
@@ -695,7 +693,7 @@ void mem_cgroup_uncharge_page(struct page *page)
res_counter_uncharge(&mem->res, PAGE_SIZE);
css_put(&mem->css);
- kfree(pc);
+ kmem_cache_free(page_cgroup_cache, pc);
return;
}
@@ -747,7 +745,7 @@ void mem_cgroup_page_migration(struct page *page, struct page *newpage)
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_remove_list(pc);
+ __mem_cgroup_remove_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
page_assign_page_cgroup(page, NULL);
@@ -759,7 +757,7 @@ void mem_cgroup_page_migration(struct page *page, struct page *newpage)
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_add_list(pc);
+ __mem_cgroup_add_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(newpage);
@@ -853,13 +851,10 @@ static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
return 0;
}
-static ssize_t mem_cgroup_read(struct cgroup *cont,
- struct cftype *cft, struct file *file,
- char __user *userbuf, size_t nbytes, loff_t *ppos)
+static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
- return res_counter_read(&mem_cgroup_from_cont(cont)->res,
- cft->private, userbuf, nbytes, ppos,
- NULL);
+ return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
+ cft->private);
}
static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
@@ -871,27 +866,25 @@ static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
mem_cgroup_write_strategy);
}
-static ssize_t mem_force_empty_write(struct cgroup *cont,
- struct cftype *cft, struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *ppos)
+static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
- int ret = mem_cgroup_force_empty(mem);
- if (!ret)
- ret = nbytes;
- return ret;
+ struct mem_cgroup *mem;
+
+ mem = mem_cgroup_from_cont(cont);
+ switch (event) {
+ case RES_MAX_USAGE:
+ res_counter_reset_max(&mem->res);
+ break;
+ case RES_FAILCNT:
+ res_counter_reset_failcnt(&mem->res);
+ break;
+ }
+ return 0;
}
-/*
- * Note: This should be removed if cgroup supports write-only file.
- */
-static ssize_t mem_force_empty_read(struct cgroup *cont,
- struct cftype *cft,
- struct file *file, char __user *userbuf,
- size_t nbytes, loff_t *ppos)
+static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
{
- return -EINVAL;
+ return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
}
static const struct mem_cgroup_stat_desc {
@@ -900,11 +893,13 @@ static const struct mem_cgroup_stat_desc {
} mem_cgroup_stat_desc[] = {
[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
+ [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
+ [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
};
-static int mem_control_stat_show(struct seq_file *m, void *arg)
+static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
+ struct cgroup_map_cb *cb)
{
- struct cgroup *cont = m->private;
struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
struct mem_cgroup_stat *stat = &mem_cont->stat;
int i;
@@ -914,8 +909,7 @@ static int mem_control_stat_show(struct seq_file *m, void *arg)
val = mem_cgroup_read_stat(stat, i);
val *= mem_cgroup_stat_desc[i].unit;
- seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
- (long long)val);
+ cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
}
/* showing # of active pages */
{
@@ -925,52 +919,43 @@ static int mem_control_stat_show(struct seq_file *m, void *arg)
MEM_CGROUP_ZSTAT_INACTIVE);
active = mem_cgroup_get_all_zonestat(mem_cont,
MEM_CGROUP_ZSTAT_ACTIVE);
- seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
- seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
+ cb->fill(cb, "active", (active) * PAGE_SIZE);
+ cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
}
return 0;
}
-static const struct file_operations mem_control_stat_file_operations = {
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int mem_control_stat_open(struct inode *unused, struct file *file)
-{
- /* XXX __d_cont */
- struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
-
- file->f_op = &mem_control_stat_file_operations;
- return single_open(file, mem_control_stat_show, cont);
-}
-
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = RES_USAGE,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read,
+ },
+ {
+ .name = "max_usage_in_bytes",
+ .private = RES_MAX_USAGE,
+ .trigger = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read,
},
{
.name = "limit_in_bytes",
.private = RES_LIMIT,
.write = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read,
},
{
.name = "failcnt",
.private = RES_FAILCNT,
- .read = mem_cgroup_read,
+ .trigger = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read,
},
{
.name = "force_empty",
- .write = mem_force_empty_write,
- .read = mem_force_empty_read,
+ .trigger = mem_force_empty_write,
},
{
.name = "stat",
- .open = mem_control_stat_open,
+ .read_map = mem_control_stat_show,
},
};
@@ -1010,6 +995,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
kfree(mem->info.nodeinfo[node]);
}
+static struct mem_cgroup *mem_cgroup_alloc(void)
+{
+ struct mem_cgroup *mem;
+
+ if (sizeof(*mem) < PAGE_SIZE)
+ mem = kmalloc(sizeof(*mem), GFP_KERNEL);
+ else
+ mem = vmalloc(sizeof(*mem));
+
+ if (mem)
+ memset(mem, 0, sizeof(*mem));
+ return mem;
+}
+
+static void mem_cgroup_free(struct mem_cgroup *mem)
+{
+ if (sizeof(*mem) < PAGE_SIZE)
+ kfree(mem);
+ else
+ vfree(mem);
+}
+
+
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
@@ -1018,17 +1026,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
if (unlikely((cont->parent) == NULL)) {
mem = &init_mem_cgroup;
- init_mm.mem_cgroup = mem;
- } else
- mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
-
- if (mem == NULL)
- return ERR_PTR(-ENOMEM);
+ page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
+ } else {
+ mem = mem_cgroup_alloc();
+ if (!mem)
+ return ERR_PTR(-ENOMEM);
+ }
res_counter_init(&mem->res);
- memset(&mem->info, 0, sizeof(mem->info));
-
for_each_node_state(node, N_POSSIBLE)
if (alloc_mem_cgroup_per_zone_info(mem, node))
goto free_out;
@@ -1038,7 +1044,7 @@ free_out:
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
if (cont->parent != NULL)
- kfree(mem);
+ mem_cgroup_free(mem);
return ERR_PTR(-ENOMEM);
}
@@ -1058,7 +1064,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
- kfree(mem_cgroup_from_cont(cont));
+ mem_cgroup_free(mem_cgroup_from_cont(cont));
}
static int mem_cgroup_populate(struct cgroup_subsys *ss,
@@ -1098,10 +1104,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
if (!thread_group_leader(p))
goto out;
- css_get(&mem->css);
- rcu_assign_pointer(mm->mem_cgroup, mem);
- css_put(&old_mem->css);
-
out:
mmput(mm);
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c4ba85c8cb0..b17dca7249f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,8 @@
#include <asm/tlbflush.h>
+#include "internal.h"
+
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
diff --git a/mm/migrate.c b/mm/migrate.c
index 4e0eccca5e2..449d77d409f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -383,7 +383,14 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
if (PageDirty(page)) {
clear_page_dirty_for_io(page);
- set_page_dirty(newpage);
+ /*
+ * Want to mark the page and the radix tree as dirty, and
+ * redo the accounting that clear_page_dirty_for_io undid,
+ * but we can't use set_page_dirty because that function
+ * is actually a signal that all of the page has become dirty.
+ * Wheras only part of our page may be dirty.
+ */
+ __set_page_dirty_nobuffers(newpage);
}
#ifdef CONFIG_SWAP
diff --git a/mm/mmap.c b/mm/mmap.c
index 677d184b0d4..fac66337da2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -230,8 +230,11 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file)
+ if (vma->vm_file) {
fput(vma->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(vma->vm_mm);
+ }
mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
return next;
@@ -623,8 +626,11 @@ again: remove_next = 1 + (end > next->vm_end);
spin_unlock(&mapping->i_mmap_lock);
if (remove_next) {
- if (file)
+ if (file) {
fput(file);
+ if (next->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ }
mm->map_count--;
mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
@@ -1154,6 +1160,8 @@ munmap_back:
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
+ if (vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
@@ -1185,6 +1193,8 @@ munmap_back:
mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
fput(file);
+ if (vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
} else {
vma_link(mm, vma, prev, rb_link, rb_parent);
file = vma->vm_file;
@@ -1817,8 +1827,11 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
}
vma_set_policy(new, pol);
- if (new->vm_file)
+ if (new->vm_file) {
get_file(new->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ }
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
@@ -2135,8 +2148,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
- if (new_vma->vm_file)
+ if (new_vma->vm_file) {
get_file(new_vma->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ }
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
diff --git a/mm/nommu.c b/mm/nommu.c
index 1d32fe89d57..ef8c62cec69 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -966,8 +966,13 @@ unsigned long do_mmap_pgoff(struct file *file,
INIT_LIST_HEAD(&vma->anon_vma_node);
atomic_set(&vma->vm_usage, 1);
- if (file)
+ if (file) {
get_file(file);
+ if (vm_flags & VM_EXECUTABLE) {
+ added_exe_file_vma(current->mm);
+ vma->vm_mm = current->mm;
+ }
+ }
vma->vm_file = file;
vma->vm_flags = vm_flags;
vma->vm_start = addr;
@@ -1022,8 +1027,11 @@ unsigned long do_mmap_pgoff(struct file *file,
up_write(&nommu_vma_sem);
kfree(vml);
if (vma) {
- if (vma->vm_file)
+ if (vma->vm_file) {
fput(vma->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(vma->vm_mm);
+ }
kfree(vma);
}
return ret;
@@ -1053,7 +1061,7 @@ EXPORT_SYMBOL(do_mmap_pgoff);
/*
* handle mapping disposal for uClinux
*/
-static void put_vma(struct vm_area_struct *vma)
+static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
if (vma) {
down_write(&nommu_vma_sem);
@@ -1075,8 +1083,11 @@ static void put_vma(struct vm_area_struct *vma)
realalloc -= kobjsize(vma);
askedalloc -= sizeof(*vma);
- if (vma->vm_file)
+ if (vma->vm_file) {
fput(vma->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ }
kfree(vma);
}
@@ -1113,7 +1124,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
found:
vml = *parent;
- put_vma(vml->vma);
+ put_vma(mm, vml->vma);
*parent = vml->next;
realalloc -= kobjsize(vml);
@@ -1158,7 +1169,7 @@ void exit_mmap(struct mm_struct * mm)
while ((tmp = mm->context.vmlist)) {
mm->context.vmlist = tmp->next;
- put_vma(tmp->vma);
+ put_vma(mm, tmp->vma);
realalloc -= kobjsize(tmp);
askedalloc -= sizeof(*tmp);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5e00f1772c2..789b6adbef3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -164,9 +164,20 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
*/
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
- __prop_inc_percpu(&vm_completions, &bdi->completions);
+ __prop_inc_percpu_max(&vm_completions, &bdi->completions,
+ bdi->max_prop_frac);
}
+void bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __bdi_writeout_inc(bdi);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(bdi_writeout_inc);
+
static inline void task_dirty_inc(struct task_struct *tsk)
{
prop_inc_single(&vm_dirties, &tsk->dirties);
@@ -200,7 +211,8 @@ clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
avail_dirty = dirty -
(global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_WRITEBACK) +
- global_page_state(NR_UNSTABLE_NFS));
+ global_page_state(NR_UNSTABLE_NFS) +
+ global_page_state(NR_WRITEBACK_TEMP));
if (avail_dirty < 0)
avail_dirty = 0;
@@ -243,6 +255,55 @@ static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
}
/*
+ *
+ */
+static DEFINE_SPINLOCK(bdi_lock);
+static unsigned int bdi_min_ratio;
+
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+ int ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bdi_lock, flags);
+ if (min_ratio > bdi->max_ratio) {
+ ret = -EINVAL;
+ } else {
+ min_ratio -= bdi->min_ratio;
+ if (bdi_min_ratio + min_ratio < 100) {
+ bdi_min_ratio += min_ratio;
+ bdi->min_ratio += min_ratio;
+ } else {
+ ret = -EINVAL;
+ }
+ }
+ spin_unlock_irqrestore(&bdi_lock, flags);
+
+ return ret;
+}
+
+int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (max_ratio > 100)
+ return -EINVAL;
+
+ spin_lock_irqsave(&bdi_lock, flags);
+ if (bdi->min_ratio > max_ratio) {
+ ret = -EINVAL;
+ } else {
+ bdi->max_ratio = max_ratio;
+ bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
+ }
+ spin_unlock_irqrestore(&bdi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_ratio);
+
+/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
*
@@ -300,7 +361,7 @@ static unsigned long determine_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
-static void
+void
get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
struct backing_dev_info *bdi)
{
@@ -330,7 +391,7 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
*pdirty = dirty;
if (bdi) {
- u64 bdi_dirty = dirty;
+ u64 bdi_dirty;
long numerator, denominator;
/*
@@ -338,8 +399,12 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
*/
bdi_writeout_fraction(bdi, &numerator, &denominator);
+ bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
bdi_dirty *= numerator;
do_div(bdi_dirty, denominator);
+ bdi_dirty += (dirty * bdi->min_ratio) / 100;
+ if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+ bdi_dirty = dirty * bdi->max_ratio / 100;
*pbdi_dirty = bdi_dirty;
clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
@@ -1192,7 +1257,7 @@ int test_clear_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_writeback_dirty(bdi)) {
+ if (bdi_cap_account_writeback(bdi)) {
__dec_bdi_stat(bdi, BDI_WRITEBACK);
__bdi_writeout_inc(bdi);
}
@@ -1221,7 +1286,7 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_writeback_dirty(bdi))
+ if (bdi_cap_account_writeback(bdi))
__inc_bdi_stat(bdi, BDI_WRITEBACK);
}
if (!PageDirty(page))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d1cf4f05dcd..bdd5c432c42 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -45,6 +45,7 @@
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/memcontrol.h>
+#include <linux/debugobjects.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -532,8 +533,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
if (reserved)
return;
- if (!PageHighMem(page))
+ if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
+ debug_check_no_obj_freed(page_address(page),
+ PAGE_SIZE << order);
+ }
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
@@ -995,8 +999,10 @@ static void free_hot_cold_page(struct page *page, int cold)
if (free_pages_check(page))
return;
- if (!PageHighMem(page))
+ if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+ debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
+ }
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
@@ -1461,7 +1467,8 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
struct task_struct *p = current;
int do_retry;
int alloc_flags;
- int did_some_progress;
+ unsigned long did_some_progress;
+ unsigned long pages_reclaimed = 0;
might_sleep_if(wait);
@@ -1611,14 +1618,26 @@ nofail_alloc:
* Don't let big-order allocations loop unless the caller explicitly
* requests that. Wait for some write requests to complete then retry.
*
- * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
- * <= 3, but that may not be true in other implementations.
+ * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+ * means __GFP_NOFAIL, but that may not be true in other
+ * implementations.
+ *
+ * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+ * specified, then we retry until we no longer reclaim any pages
+ * (above), or we've reclaimed an order of pages at least as
+ * large as the allocation's order. In both cases, if the
+ * allocation still fails, we stop retrying.
*/
+ pages_reclaimed += did_some_progress;
do_retry = 0;
if (!(gfp_mask & __GFP_NORETRY)) {
- if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
- (gfp_mask & __GFP_REPEAT))
+ if (order <= PAGE_ALLOC_COSTLY_ORDER) {
do_retry = 1;
+ } else {
+ if (gfp_mask & __GFP_REPEAT &&
+ pages_reclaimed < (1 << order))
+ do_retry = 1;
+ }
if (gfp_mask & __GFP_NOFAIL)
do_retry = 1;
}
@@ -2524,7 +2543,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
+ struct zone *z;
+ z = &NODE_DATA(nid)->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
@@ -2542,7 +2563,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
init_page_count(page);
reset_page_mapcount(page);
SetPageReserved(page);
-
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -2551,8 +2571,15 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* kernel allocations are made. Later some blocks near
* the start are marked MIGRATE_RESERVE by
* setup_zone_migrate_reserve()
+ *
+ * bitmap is created for zone's valid pfn range. but memmap
+ * can be created for invalid pages (for alignment)
+ * check here not to call set_pageblock_migratetype() against
+ * pfn out of zone.
*/
- if ((pfn & (pageblock_nr_pages-1)))
+ if ((z->zone_start_pfn <= pfn)
+ && (pfn < z->zone_start_pfn + z->spanned_pages)
+ && !(pfn & (pageblock_nr_pages - 1)))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
INIT_LIST_HEAD(&page->lru);
@@ -4464,6 +4491,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
+ VM_BUG_ON(pfn < zone->zone_start_pfn);
+ VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
if (flags & value)
diff --git a/mm/readahead.c b/mm/readahead.c
index 8762e898897..d8723a5f649 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -235,7 +235,13 @@ unsigned long max_sane_readahead(unsigned long nr)
static int __init readahead_init(void)
{
- return bdi_init(&default_backing_dev_info);
+ int err;
+
+ err = bdi_init(&default_backing_dev_info);
+ if (!err)
+ bdi_register(&default_backing_dev_info, NULL, "default");
+
+ return err;
}
subsys_initcall(readahead_init);
diff --git a/mm/shmem.c b/mm/shmem.c
index e6d9298aa22..e2a6ae1a44e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -201,7 +201,7 @@ static struct vm_operations_struct shmem_vm_ops;
static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
.ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
.unplug_io_fn = default_unplug_io_fn,
};
diff --git a/mm/slab.c b/mm/slab.c
index 39d20f8a079..06236e4ddc1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -110,6 +110,7 @@
#include <linux/fault-inject.h>
#include <linux/rtmutex.h>
#include <linux/reciprocal_div.h>
+#include <linux/debugobjects.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -174,12 +175,14 @@
SLAB_CACHE_DMA | \
SLAB_STORE_USER | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
+ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
+ SLAB_DEBUG_OBJECTS)
#else
# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
SLAB_CACHE_DMA | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
+ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
+ SLAB_DEBUG_OBJECTS)
#endif
/*
@@ -858,7 +861,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
-#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
+#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
static void __slab_error(const char *function, struct kmem_cache *cachep,
char *msg)
@@ -2153,7 +2156,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
size > KMALLOC_MAX_SIZE) {
- printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
+ printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
name);
BUG();
}
@@ -3760,6 +3763,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
local_irq_save(flags);
debug_check_no_locks_freed(objp, obj_size(cachep));
+ if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, obj_size(cachep));
__cache_free(cachep, objp);
local_irq_restore(flags);
}
@@ -3785,6 +3790,7 @@ void kfree(const void *objp)
kfree_debugcheck(objp);
c = virt_to_cache(objp);
debug_check_no_locks_freed(objp, obj_size(c));
+ debug_check_no_obj_freed(objp, obj_size(c));
__cache_free(c, (void *)objp);
local_irq_restore(flags);
}
diff --git a/mm/slob.c b/mm/slob.c
index e2c3c0ec546..6038cbadf79 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -533,7 +533,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
{
struct kmem_cache *c;
- c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
+ c = slob_alloc(sizeof(struct kmem_cache),
+ flags, ARCH_KMALLOC_MINALIGN, -1);
if (c) {
c->name = name;
diff --git a/mm/slub.c b/mm/slub.c
index 38914bc64ac..d379b782fc8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -19,8 +19,10 @@
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
+#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
+#include <linux/math64.h>
/*
* Lock order:
@@ -149,25 +151,6 @@ static inline void ClearSlabDebug(struct page *page)
/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST
-#if PAGE_SHIFT <= 12
-
-/*
- * Small page size. Make sure that we do not fragment memory
- */
-#define DEFAULT_MAX_ORDER 1
-#define DEFAULT_MIN_OBJECTS 4
-
-#else
-
-/*
- * Large page machines are customarily able to handle larger
- * page orders.
- */
-#define DEFAULT_MAX_ORDER 2
-#define DEFAULT_MIN_OBJECTS 8
-
-#endif
-
/*
* Mininum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -204,8 +187,6 @@ static inline void ClearSlabDebug(struct page *page)
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000 /* Poison object */
#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
-#define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */
-#define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */
static int kmem_size = sizeof(struct kmem_cache);
@@ -236,7 +217,7 @@ struct track {
enum track_item { TRACK_ALLOC, TRACK_FREE };
-#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
+#ifdef CONFIG_SLUB_DEBUG
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
@@ -296,7 +277,7 @@ static inline int check_valid_pointer(struct kmem_cache *s,
return 1;
base = page_address(page);
- if (object < base || object >= base + s->objects * s->size ||
+ if (object < base || object >= base + page->objects * s->size ||
(object - base) % s->size) {
return 0;
}
@@ -322,8 +303,8 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
}
/* Loop over all objects in a slab */
-#define for_each_object(__p, __s, __addr) \
- for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
+#define for_each_object(__p, __s, __addr, __objects) \
+ for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
__p += (__s)->size)
/* Scan freelist */
@@ -336,6 +317,26 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
return (p - addr) / s->size;
}
+static inline struct kmem_cache_order_objects oo_make(int order,
+ unsigned long size)
+{
+ struct kmem_cache_order_objects x = {
+ (order << 16) + (PAGE_SIZE << order) / size
+ };
+
+ return x;
+}
+
+static inline int oo_order(struct kmem_cache_order_objects x)
+{
+ return x.x >> 16;
+}
+
+static inline int oo_objects(struct kmem_cache_order_objects x)
+{
+ return x.x & ((1 << 16) - 1);
+}
+
#ifdef CONFIG_SLUB_DEBUG
/*
* Debug settings:
@@ -446,8 +447,8 @@ static void print_tracking(struct kmem_cache *s, void *object)
static void print_page_info(struct page *page)
{
- printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
- page, page->inuse, page->freelist, page->flags);
+ printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
+ page, page->objects, page->inuse, page->freelist, page->flags);
}
@@ -647,6 +648,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
p + off, POISON_INUSE, s->size - off);
}
+/* Check the pad bytes at the end of a slab page */
static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
u8 *start;
@@ -659,20 +661,20 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
return 1;
start = page_address(page);
- end = start + (PAGE_SIZE << s->order);
- length = s->objects * s->size;
- remainder = end - (start + length);
+ length = (PAGE_SIZE << compound_order(page));
+ end = start + length;
+ remainder = length % s->size;
if (!remainder)
return 1;
- fault = check_bytes(start + length, POISON_INUSE, remainder);
+ fault = check_bytes(end - remainder, POISON_INUSE, remainder);
if (!fault)
return 1;
while (end > fault && end[-1] == POISON_INUSE)
end--;
slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
- print_section("Padding", start, length);
+ print_section("Padding", end - remainder, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, start, end);
return 0;
@@ -734,15 +736,24 @@ static int check_object(struct kmem_cache *s, struct page *page,
static int check_slab(struct kmem_cache *s, struct page *page)
{
+ int maxobj;
+
VM_BUG_ON(!irqs_disabled());
if (!PageSlab(page)) {
slab_err(s, page, "Not a valid slab page");
return 0;
}
- if (page->inuse > s->objects) {
+
+ maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
+ if (page->objects > maxobj) {
+ slab_err(s, page, "objects %u > max %u",
+ s->name, page->objects, maxobj);
+ return 0;
+ }
+ if (page->inuse > page->objects) {
slab_err(s, page, "inuse %u > max %u",
- s->name, page->inuse, s->objects);
+ s->name, page->inuse, page->objects);
return 0;
}
/* Slab_pad_check fixes things up after itself */
@@ -759,8 +770,9 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
int nr = 0;
void *fp = page->freelist;
void *object = NULL;
+ unsigned long max_objects;
- while (fp && nr <= s->objects) {
+ while (fp && nr <= page->objects) {
if (fp == search)
return 1;
if (!check_valid_pointer(s, page, fp)) {
@@ -772,7 +784,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
} else {
slab_err(s, page, "Freepointer corrupt");
page->freelist = NULL;
- page->inuse = s->objects;
+ page->inuse = page->objects;
slab_fix(s, "Freelist cleared");
return 0;
}
@@ -783,16 +795,27 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
nr++;
}
- if (page->inuse != s->objects - nr) {
+ max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
+ if (max_objects > 65535)
+ max_objects = 65535;
+
+ if (page->objects != max_objects) {
+ slab_err(s, page, "Wrong number of objects. Found %d but "
+ "should be %d", page->objects, max_objects);
+ page->objects = max_objects;
+ slab_fix(s, "Number of objects adjusted.");
+ }
+ if (page->inuse != page->objects - nr) {
slab_err(s, page, "Wrong object count. Counter is %d but "
- "counted were %d", page->inuse, s->objects - nr);
- page->inuse = s->objects - nr;
+ "counted were %d", page->inuse, page->objects - nr);
+ page->inuse = page->objects - nr;
slab_fix(s, "Object count adjusted.");
}
return search == NULL;
}
-static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc)
+static void trace(struct kmem_cache *s, struct page *page, void *object,
+ int alloc)
{
if (s->flags & SLAB_TRACE) {
printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
@@ -840,7 +863,7 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node)
return atomic_long_read(&n->nr_slabs);
}
-static inline void inc_slabs_node(struct kmem_cache *s, int node)
+static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
{
struct kmem_cache_node *n = get_node(s, node);
@@ -850,14 +873,17 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node)
* dilemma by deferring the increment of the count during
* bootstrap (see early_kmem_cache_node_alloc).
*/
- if (!NUMA_BUILD || n)
+ if (!NUMA_BUILD || n) {
atomic_long_inc(&n->nr_slabs);
+ atomic_long_add(objects, &n->total_objects);
+ }
}
-static inline void dec_slabs_node(struct kmem_cache *s, int node)
+static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
{
struct kmem_cache_node *n = get_node(s, node);
atomic_long_dec(&n->nr_slabs);
+ atomic_long_sub(objects, &n->total_objects);
}
/* Object debug checks for alloc/free paths */
@@ -905,7 +931,7 @@ bad:
* as used avoids touching the remaining objects.
*/
slab_fix(s, "Marking all objects used");
- page->inuse = s->objects;
+ page->inuse = page->objects;
page->freelist = NULL;
}
return 0;
@@ -1055,31 +1081,52 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
static inline unsigned long slabs_node(struct kmem_cache *s, int node)
{ return 0; }
-static inline void inc_slabs_node(struct kmem_cache *s, int node) {}
-static inline void dec_slabs_node(struct kmem_cache *s, int node) {}
+static inline void inc_slabs_node(struct kmem_cache *s, int node,
+ int objects) {}
+static inline void dec_slabs_node(struct kmem_cache *s, int node,
+ int objects) {}
#endif
+
/*
* Slab allocation and freeing
*/
+static inline struct page *alloc_slab_page(gfp_t flags, int node,
+ struct kmem_cache_order_objects oo)
+{
+ int order = oo_order(oo);
+
+ if (node == -1)
+ return alloc_pages(flags, order);
+ else
+ return alloc_pages_node(node, flags, order);
+}
+
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
- int pages = 1 << s->order;
+ struct kmem_cache_order_objects oo = s->oo;
flags |= s->allocflags;
- if (node == -1)
- page = alloc_pages(flags, s->order);
- else
- page = alloc_pages_node(node, flags, s->order);
-
- if (!page)
- return NULL;
+ page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node,
+ oo);
+ if (unlikely(!page)) {
+ oo = s->min;
+ /*
+ * Allocation may have failed due to fragmentation.
+ * Try a lower order alloc if possible
+ */
+ page = alloc_slab_page(flags, node, oo);
+ if (!page)
+ return NULL;
+ stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
+ }
+ page->objects = oo_objects(oo);
mod_zone_page_state(page_zone(page),
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
- pages);
+ 1 << oo_order(oo));
return page;
}
@@ -1106,7 +1153,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!page)
goto out;
- inc_slabs_node(s, page_to_nid(page));
+ inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
page->flags |= 1 << PG_slab;
if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
@@ -1116,10 +1163,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
start = page_address(page);
if (unlikely(s->flags & SLAB_POISON))
- memset(start, POISON_INUSE, PAGE_SIZE << s->order);
+ memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
last = start;
- for_each_object(p, s, start) {
+ for_each_object(p, s, start, page->objects) {
setup_object(s, page, last);
set_freepointer(s, last, p);
last = p;
@@ -1135,13 +1182,15 @@ out:
static void __free_slab(struct kmem_cache *s, struct page *page)
{
- int pages = 1 << s->order;
+ int order = compound_order(page);
+ int pages = 1 << order;
if (unlikely(SlabDebug(page))) {
void *p;
slab_pad_check(s, page);
- for_each_object(p, s, page_address(page))
+ for_each_object(p, s, page_address(page),
+ page->objects)
check_object(s, page, p, 0);
ClearSlabDebug(page);
}
@@ -1153,7 +1202,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlab(page);
reset_page_mapcount(page);
- __free_pages(page, s->order);
+ __free_pages(page, order);
}
static void rcu_free_slab(struct rcu_head *h)
@@ -1179,7 +1228,7 @@ static void free_slab(struct kmem_cache *s, struct page *page)
static void discard_slab(struct kmem_cache *s, struct page *page)
{
- dec_slabs_node(s, page_to_nid(page));
+ dec_slabs_node(s, page_to_nid(page), page->objects);
free_slab(s, page);
}
@@ -1219,8 +1268,7 @@ static void add_partial(struct kmem_cache_node *n,
spin_unlock(&n->list_lock);
}
-static void remove_partial(struct kmem_cache *s,
- struct page *page)
+static void remove_partial(struct kmem_cache *s, struct page *page)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -1235,7 +1283,8 @@ static void remove_partial(struct kmem_cache *s,
*
* Must hold list_lock.
*/
-static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page)
+static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
+ struct page *page)
{
if (slab_trylock(page)) {
list_del(&page->lru);
@@ -1372,8 +1421,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
* so that the others get filled first. That way the
* size of the partial list stays small.
*
- * kmem_cache_shrink can reclaim any empty slabs from the
- * partial list.
+ * kmem_cache_shrink can reclaim any empty slabs from
+ * the partial list.
*/
add_partial(n, page, 1);
slab_unlock(page);
@@ -1515,7 +1564,7 @@ load_freelist:
goto debug;
c->freelist = object[c->offset];
- c->page->inuse = s->objects;
+ c->page->inuse = c->page->objects;
c->page->freelist = NULL;
c->node = page_to_nid(c->page);
unlock_out:
@@ -1552,27 +1601,6 @@ new_slab:
c->page = new;
goto load_freelist;
}
-
- /*
- * No memory available.
- *
- * If the slab uses higher order allocs but the object is
- * smaller than a page size then we can fallback in emergencies
- * to the page allocator via kmalloc_large. The page allocator may
- * have failed to obtain a higher order page and we can try to
- * allocate a single page if the object fits into a single page.
- * That is only possible if certain conditions are met that are being
- * checked when a slab is created.
- */
- if (!(gfpflags & __GFP_NORETRY) &&
- (s->flags & __PAGE_ALLOC_FALLBACK)) {
- if (gfpflags & __GFP_WAIT)
- local_irq_enable();
- object = kmalloc_large(s->objsize, gfpflags);
- if (gfpflags & __GFP_WAIT)
- local_irq_disable();
- return object;
- }
return NULL;
debug:
if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1722,6 +1750,8 @@ static __always_inline void slab_free(struct kmem_cache *s,
local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
debug_check_no_locks_freed(object, c->objsize);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(object, s->objsize);
if (likely(page == c->page && c->node >= 0)) {
object[c->offset] = c->freelist;
c->freelist = object;
@@ -1773,8 +1803,8 @@ static struct page *get_object_page(const void *x)
* take the list_lock.
*/
static int slub_min_order;
-static int slub_max_order = DEFAULT_MAX_ORDER;
-static int slub_min_objects = DEFAULT_MIN_OBJECTS;
+static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
+static int slub_min_objects;
/*
* Merge control. If this is set then no merging of slab caches will occur.
@@ -1789,7 +1819,7 @@ static int slub_nomerge;
* system components. Generally order 0 allocations should be preferred since
* order 0 does not cause fragmentation in the page allocator. Larger objects
* be problematic to put into order 0 slabs because there may be too much
- * unused space left. We go to a higher order if more than 1/8th of the slab
+ * unused space left. We go to a higher order if more than 1/16th of the slab
* would be wasted.
*
* In order to reach satisfactory performance we must ensure that a minimum
@@ -1814,6 +1844,9 @@ static inline int slab_order(int size, int min_objects,
int rem;
int min_order = slub_min_order;
+ if ((PAGE_SIZE << min_order) / size > 65535)
+ return get_order(size * 65535) - 1;
+
for (order = max(min_order,
fls(min_objects * size - 1) - PAGE_SHIFT);
order <= max_order; order++) {
@@ -1848,8 +1881,10 @@ static inline int calculate_order(int size)
* we reduce the minimum objects required in a slab.
*/
min_objects = slub_min_objects;
+ if (!min_objects)
+ min_objects = 4 * (fls(nr_cpu_ids) + 1);
while (min_objects > 1) {
- fraction = 8;
+ fraction = 16;
while (fraction >= 4) {
order = slab_order(size, min_objects,
slub_max_order, fraction);
@@ -2091,7 +2126,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
init_tracking(kmalloc_caches, n);
#endif
init_kmem_cache_node(n);
- inc_slabs_node(kmalloc_caches, node);
+ inc_slabs_node(kmalloc_caches, node, page->objects);
/*
* lockdep requires consistent irq usage for each lock
@@ -2167,11 +2202,12 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
* calculate_sizes() determines the order and the distribution of data within
* a slab object.
*/
-static int calculate_sizes(struct kmem_cache *s)
+static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
unsigned long flags = s->flags;
unsigned long size = s->objsize;
unsigned long align = s->align;
+ int order;
/*
* Round up object size to the next word boundary. We can only
@@ -2255,26 +2291,16 @@ static int calculate_sizes(struct kmem_cache *s)
*/
size = ALIGN(size, align);
s->size = size;
+ if (forced_order >= 0)
+ order = forced_order;
+ else
+ order = calculate_order(size);
- if ((flags & __KMALLOC_CACHE) &&
- PAGE_SIZE / size < slub_min_objects) {
- /*
- * Kmalloc cache that would not have enough objects in
- * an order 0 page. Kmalloc slabs can fallback to
- * page allocator order 0 allocs so take a reasonably large
- * order that will allows us a good number of objects.
- */
- s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER);
- s->flags |= __PAGE_ALLOC_FALLBACK;
- s->allocflags |= __GFP_NOWARN;
- } else
- s->order = calculate_order(size);
-
- if (s->order < 0)
+ if (order < 0)
return 0;
s->allocflags = 0;
- if (s->order)
+ if (order)
s->allocflags |= __GFP_COMP;
if (s->flags & SLAB_CACHE_DMA)
@@ -2286,9 +2312,12 @@ static int calculate_sizes(struct kmem_cache *s)
/*
* Determine the number of objects per slab
*/
- s->objects = (PAGE_SIZE << s->order) / size;
+ s->oo = oo_make(order, size);
+ s->min = oo_make(get_order(size), size);
+ if (oo_objects(s->oo) > oo_objects(s->max))
+ s->max = s->oo;
- return !!s->objects;
+ return !!oo_objects(s->oo);
}
@@ -2304,7 +2333,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
s->align = align;
s->flags = kmem_cache_flags(size, flags, name, ctor);
- if (!calculate_sizes(s))
+ if (!calculate_sizes(s, -1))
goto error;
s->refcount = 1;
@@ -2321,7 +2350,7 @@ error:
if (flags & SLAB_PANIC)
panic("Cannot create slab %s size=%lu realsize=%u "
"order=%u offset=%u flags=%lx\n",
- s->name, (unsigned long)size, s->size, s->order,
+ s->name, (unsigned long)size, s->size, oo_order(s->oo),
s->offset, flags);
return 0;
}
@@ -2367,26 +2396,52 @@ const char *kmem_cache_name(struct kmem_cache *s)
}
EXPORT_SYMBOL(kmem_cache_name);
+static void list_slab_objects(struct kmem_cache *s, struct page *page,
+ const char *text)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ void *addr = page_address(page);
+ void *p;
+ DECLARE_BITMAP(map, page->objects);
+
+ bitmap_zero(map, page->objects);
+ slab_err(s, page, "%s", text);
+ slab_lock(page);
+ for_each_free_object(p, s, page->freelist)
+ set_bit(slab_index(p, s, addr), map);
+
+ for_each_object(p, s, addr, page->objects) {
+
+ if (!test_bit(slab_index(p, s, addr), map)) {
+ printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
+ p, p - addr);
+ print_tracking(s, p);
+ }
+ }
+ slab_unlock(page);
+#endif
+}
+
/*
- * Attempt to free all slabs on a node. Return the number of slabs we
- * were unable to free.
+ * Attempt to free all partial slabs on a node.
*/
-static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
- struct list_head *list)
+static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
- int slabs_inuse = 0;
unsigned long flags;
struct page *page, *h;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry_safe(page, h, list, lru)
+ list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
list_del(&page->lru);
discard_slab(s, page);
- } else
- slabs_inuse++;
+ n->nr_partial--;
+ } else {
+ list_slab_objects(s, page,
+ "Objects remaining on kmem_cache_close()");
+ }
+ }
spin_unlock_irqrestore(&n->list_lock, flags);
- return slabs_inuse;
}
/*
@@ -2403,8 +2458,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
- n->nr_partial -= free_list(s, n, &n->partial);
- if (slabs_node(s, node))
+ free_partial(s, n);
+ if (n->nr_partial || slabs_node(s, node))
return 1;
}
free_kmem_cache_nodes(s);
@@ -2422,8 +2477,11 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (!s->refcount) {
list_del(&s->list);
up_write(&slub_lock);
- if (kmem_cache_close(s))
- WARN_ON(1);
+ if (kmem_cache_close(s)) {
+ printk(KERN_ERR "SLUB %s: %s called for cache that "
+ "still has objects.\n", s->name, __func__);
+ dump_stack();
+ }
sysfs_slab_remove(s);
} else
up_write(&slub_lock);
@@ -2482,7 +2540,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
down_write(&slub_lock);
if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
- flags | __KMALLOC_CACHE, NULL))
+ flags, NULL))
goto panic;
list_add(&s->list, &slab_caches);
@@ -2730,8 +2788,9 @@ int kmem_cache_shrink(struct kmem_cache *s)
struct kmem_cache_node *n;
struct page *page;
struct page *t;
+ int objects = oo_objects(s->max);
struct list_head *slabs_by_inuse =
- kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
+ kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
unsigned long flags;
if (!slabs_by_inuse)
@@ -2744,7 +2803,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
if (!n->nr_partial)
continue;
- for (i = 0; i < s->objects; i++)
+ for (i = 0; i < objects; i++)
INIT_LIST_HEAD(slabs_by_inuse + i);
spin_lock_irqsave(&n->list_lock, flags);
@@ -2776,7 +2835,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
* Rebuild the partial list with the slabs filled up most
* first and the least used slabs at the end.
*/
- for (i = s->objects - 1; i >= 0; i--)
+ for (i = objects - 1; i >= 0; i--)
list_splice(slabs_by_inuse + i, n->partial.prev);
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -2851,7 +2910,7 @@ static int slab_mem_going_online_callback(void *arg)
return 0;
/*
- * We are bringing a node online. No memory is availabe yet. We must
+ * We are bringing a node online. No memory is available yet. We must
* allocate a kmem_cache_node structure in order to bring the node
* online.
*/
@@ -2924,7 +2983,7 @@ void __init kmem_cache_init(void)
kmalloc_caches[0].refcount = -1;
caches++;
- hotplug_memory_notifier(slab_memory_callback, 1);
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif
/* Able to allocate the per node structures */
@@ -2997,9 +3056,6 @@ static int slab_unmergeable(struct kmem_cache *s)
if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
return 1;
- if ((s->flags & __PAGE_ALLOC_FALLBACK))
- return 1;
-
if (s->ctor)
return 1;
@@ -3191,8 +3247,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return slab_alloc(s, gfpflags, node, caller);
}
-#if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO)
-static unsigned long count_partial(struct kmem_cache_node *n)
+#ifdef CONFIG_SLUB_DEBUG
+static unsigned long count_partial(struct kmem_cache_node *n,
+ int (*get_count)(struct page *))
{
unsigned long flags;
unsigned long x = 0;
@@ -3200,13 +3257,26 @@ static unsigned long count_partial(struct kmem_cache_node *n)
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru)
- x += page->inuse;
+ x += get_count(page);
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
-#endif
-#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
+static int count_inuse(struct page *page)
+{
+ return page->inuse;
+}
+
+static int count_total(struct page *page)
+{
+ return page->objects;
+}
+
+static int count_free(struct page *page)
+{
+ return page->objects - page->inuse;
+}
+
static int validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
@@ -3218,7 +3288,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
return 0;
/* Now we know that a valid freelist exists */
- bitmap_zero(map, s->objects);
+ bitmap_zero(map, page->objects);
for_each_free_object(p, s, page->freelist) {
set_bit(slab_index(p, s, addr), map);
@@ -3226,7 +3296,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
return 0;
}
- for_each_object(p, s, addr)
+ for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
if (!check_object(s, page, p, 1))
return 0;
@@ -3292,7 +3362,7 @@ static long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
- unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) *
+ unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
sizeof(unsigned long), GFP_KERNEL);
if (!map)
@@ -3495,14 +3565,14 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
struct page *page, enum track_item alloc)
{
void *addr = page_address(page);
- DECLARE_BITMAP(map, s->objects);
+ DECLARE_BITMAP(map, page->objects);
void *p;
- bitmap_zero(map, s->objects);
+ bitmap_zero(map, page->objects);
for_each_free_object(p, s, page->freelist)
set_bit(slab_index(p, s, addr), map);
- for_each_object(p, s, addr)
+ for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
add_location(t, s, get_track(s, p, alloc));
}
@@ -3551,12 +3621,10 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf + len, "<not-available>");
if (l->sum_time != l->min_time) {
- unsigned long remainder;
-
len += sprintf(buf + len, " age=%ld/%ld/%ld",
- l->min_time,
- div_long_long_rem(l->sum_time, l->count, &remainder),
- l->max_time);
+ l->min_time,
+ (long)div_u64(l->sum_time, l->count),
+ l->max_time);
} else
len += sprintf(buf + len, " age=%ld",
l->min_time);
@@ -3592,22 +3660,23 @@ static int list_locations(struct kmem_cache *s, char *buf,
}
enum slab_stat_type {
- SL_FULL,
- SL_PARTIAL,
- SL_CPU,
- SL_OBJECTS
+ SL_ALL, /* All slabs */
+ SL_PARTIAL, /* Only partially allocated slabs */
+ SL_CPU, /* Only slabs used for cpu caches */
+ SL_OBJECTS, /* Determine allocated objects not slabs */
+ SL_TOTAL /* Determine object capacity not slabs */
};
-#define SO_FULL (1 << SL_FULL)
+#define SO_ALL (1 << SL_ALL)
#define SO_PARTIAL (1 << SL_PARTIAL)
#define SO_CPU (1 << SL_CPU)
#define SO_OBJECTS (1 << SL_OBJECTS)
+#define SO_TOTAL (1 << SL_TOTAL)
static ssize_t show_slab_objects(struct kmem_cache *s,
char *buf, unsigned long flags)
{
unsigned long total = 0;
- int cpu;
int node;
int x;
unsigned long *nodes;
@@ -3618,56 +3687,60 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
return -ENOMEM;
per_cpu = nodes + nr_node_ids;
- for_each_possible_cpu(cpu) {
- struct page *page;
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ if (flags & SO_CPU) {
+ int cpu;
- if (!c)
- continue;
+ for_each_possible_cpu(cpu) {
+ struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
- page = c->page;
- node = c->node;
- if (node < 0)
- continue;
- if (page) {
- if (flags & SO_CPU) {
- if (flags & SO_OBJECTS)
- x = page->inuse;
+ if (!c || c->node < 0)
+ continue;
+
+ if (c->page) {
+ if (flags & SO_TOTAL)
+ x = c->page->objects;
+ else if (flags & SO_OBJECTS)
+ x = c->page->inuse;
else
x = 1;
+
total += x;
- nodes[node] += x;
+ nodes[c->node] += x;
}
- per_cpu[node]++;
+ per_cpu[c->node]++;
}
}
- for_each_node_state(node, N_NORMAL_MEMORY) {
- struct kmem_cache_node *n = get_node(s, node);
+ if (flags & SO_ALL) {
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
+
+ if (flags & SO_TOTAL)
+ x = atomic_long_read(&n->total_objects);
+ else if (flags & SO_OBJECTS)
+ x = atomic_long_read(&n->total_objects) -
+ count_partial(n, count_free);
- if (flags & SO_PARTIAL) {
- if (flags & SO_OBJECTS)
- x = count_partial(n);
else
- x = n->nr_partial;
+ x = atomic_long_read(&n->nr_slabs);
total += x;
nodes[node] += x;
}
- if (flags & SO_FULL) {
- int full_slabs = atomic_long_read(&n->nr_slabs)
- - per_cpu[node]
- - n->nr_partial;
+ } else if (flags & SO_PARTIAL) {
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
- if (flags & SO_OBJECTS)
- x = full_slabs * s->objects;
+ if (flags & SO_TOTAL)
+ x = count_partial(n, count_total);
+ else if (flags & SO_OBJECTS)
+ x = count_partial(n, count_inuse);
else
- x = full_slabs;
+ x = n->nr_partial;
total += x;
nodes[node] += x;
}
}
-
x = sprintf(buf, "%lu", total);
#ifdef CONFIG_NUMA
for_each_node_state(node, N_NORMAL_MEMORY)
@@ -3682,14 +3755,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
static int any_slab_objects(struct kmem_cache *s)
{
int node;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
-
- if (c && c->page)
- return 1;
- }
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3697,7 +3762,7 @@ static int any_slab_objects(struct kmem_cache *s)
if (!n)
continue;
- if (n->nr_partial || atomic_long_read(&n->nr_slabs))
+ if (atomic_read(&n->total_objects))
return 1;
}
return 0;
@@ -3739,15 +3804,32 @@ SLAB_ATTR_RO(object_size);
static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->objects);
+ return sprintf(buf, "%d\n", oo_objects(s->oo));
}
SLAB_ATTR_RO(objs_per_slab);
+static ssize_t order_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ unsigned long order;
+ int err;
+
+ err = strict_strtoul(buf, 10, &order);
+ if (err)
+ return err;
+
+ if (order > slub_max_order || order < slub_min_order)
+ return -EINVAL;
+
+ calculate_sizes(s, order);
+ return length;
+}
+
static ssize_t order_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->order);
+ return sprintf(buf, "%d\n", oo_order(s->oo));
}
-SLAB_ATTR_RO(order);
+SLAB_ATTR(order);
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
@@ -3768,7 +3850,7 @@ SLAB_ATTR_RO(aliases);
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
- return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
+ return show_slab_objects(s, buf, SO_ALL);
}
SLAB_ATTR_RO(slabs);
@@ -3786,10 +3868,22 @@ SLAB_ATTR_RO(cpu_slabs);
static ssize_t objects_show(struct kmem_cache *s, char *buf)
{
- return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
+ return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects);
+static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
+}
+SLAB_ATTR_RO(objects_partial);
+
+static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
+}
+SLAB_ATTR_RO(total_objects);
+
static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
@@ -3869,7 +3963,7 @@ static ssize_t red_zone_store(struct kmem_cache *s,
s->flags &= ~SLAB_RED_ZONE;
if (buf[0] == '1')
s->flags |= SLAB_RED_ZONE;
- calculate_sizes(s);
+ calculate_sizes(s, -1);
return length;
}
SLAB_ATTR(red_zone);
@@ -3888,7 +3982,7 @@ static ssize_t poison_store(struct kmem_cache *s,
s->flags &= ~SLAB_POISON;
if (buf[0] == '1')
s->flags |= SLAB_POISON;
- calculate_sizes(s);
+ calculate_sizes(s, -1);
return length;
}
SLAB_ATTR(poison);
@@ -3907,7 +4001,7 @@ static ssize_t store_user_store(struct kmem_cache *s,
s->flags &= ~SLAB_STORE_USER;
if (buf[0] == '1')
s->flags |= SLAB_STORE_USER;
- calculate_sizes(s);
+ calculate_sizes(s, -1);
return length;
}
SLAB_ATTR(store_user);
@@ -3975,10 +4069,16 @@ static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- int n = simple_strtoul(buf, NULL, 10);
+ unsigned long ratio;
+ int err;
+
+ err = strict_strtoul(buf, 10, &ratio);
+ if (err)
+ return err;
+
+ if (ratio < 100)
+ s->remote_node_defrag_ratio = ratio * 10;
- if (n < 100)
- s->remote_node_defrag_ratio = n * 10;
return length;
}
SLAB_ATTR(remote_node_defrag_ratio);
@@ -4038,7 +4138,7 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
-
+STAT_ATTR(ORDER_FALLBACK, order_fallback);
#endif
static struct attribute *slab_attrs[] = {
@@ -4047,6 +4147,8 @@ static struct attribute *slab_attrs[] = {
&objs_per_slab_attr.attr,
&order_attr.attr,
&objects_attr.attr,
+ &objects_partial_attr.attr,
+ &total_objects_attr.attr,
&slabs_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
@@ -4089,6 +4191,7 @@ static struct attribute *slab_attrs[] = {
&deactivate_to_head_attr.attr,
&deactivate_to_tail_attr.attr,
&deactivate_remote_frees_attr.attr,
+ &order_fallback_attr.attr,
#endif
NULL
};
@@ -4332,8 +4435,8 @@ __initcall(slab_sysfs_init);
*/
#ifdef CONFIG_SLABINFO
-ssize_t slabinfo_write(struct file *file, const char __user * buffer,
- size_t count, loff_t *ppos)
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
{
return -EINVAL;
}
@@ -4375,7 +4478,8 @@ static int s_show(struct seq_file *m, void *p)
unsigned long nr_partials = 0;
unsigned long nr_slabs = 0;
unsigned long nr_inuse = 0;
- unsigned long nr_objs;
+ unsigned long nr_objs = 0;
+ unsigned long nr_free = 0;
struct kmem_cache *s;
int node;
@@ -4389,14 +4493,15 @@ static int s_show(struct seq_file *m, void *p)
nr_partials += n->nr_partial;
nr_slabs += atomic_long_read(&n->nr_slabs);
- nr_inuse += count_partial(n);
+ nr_objs += atomic_long_read(&n->total_objects);
+ nr_free += count_partial(n, count_free);
}
- nr_objs = nr_slabs * s->objects;
- nr_inuse += (nr_slabs - nr_partials) * s->objects;
+ nr_inuse = nr_objs - nr_free;
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
- nr_objs, s->size, s->objects, (1 << s->order));
+ nr_objs, s->size, oo_objects(s->oo),
+ (1 << oo_order(s->oo)));
seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
0UL);
diff --git a/mm/sparse.c b/mm/sparse.c
index dff71f173ae..36511c7b5e2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -250,29 +250,18 @@ static unsigned long *__kmalloc_section_usemap(void)
static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
{
- unsigned long *usemap, section_nr;
+ unsigned long *usemap;
struct mem_section *ms = __nr_to_section(pnum);
int nid = sparse_early_nid(ms);
- struct pglist_data *pgdat = NODE_DATA(nid);
- /*
- * Usemap's page can't be freed until freeing other sections
- * which use it. And, Pgdat has same feature.
- * If section A has pgdat and section B has usemap for other
- * sections (includes section A), both sections can't be removed,
- * because there is the dependency each other.
- * To solve above issue, this collects all usemap on the same section
- * which has pgdat.
- */
- section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
- usemap = alloc_bootmem_section(usemap_size(), section_nr);
+ usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
if (usemap)
return usemap;
/* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
nid = 0;
- printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
+ printk(KERN_WARNING "%s: allocation failed\n", __func__);
return NULL;
}
@@ -302,7 +291,7 @@ struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
return map;
printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __FUNCTION__);
+ "some memory will not be available.\n", __func__);
ms->section_mem_map = 0;
return NULL;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 50757ee3f9f..d8aadaf2a0b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
};
static struct backing_dev_info swap_backing_dev_info = {
- .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
.unplug_io_fn = swap_unplug_io_fn,
};
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 67051be7083..bd1bb592030 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1426,11 +1426,7 @@ static const struct file_operations proc_swaps_operations = {
static int __init procswaps_init(void)
{
- struct proc_dir_entry *entry;
-
- entry = create_proc_entry("swaps", 0, NULL);
- if (entry)
- entry->proc_fops = &proc_swaps_operations;
+ proc_create("swaps", 0, NULL, &proc_swaps_operations);
return 0;
}
__initcall(procswaps_init);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e33e0ae69ad..6e45b0f3d12 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -15,6 +15,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
+#include <linux/debugobjects.h>
#include <linux/vmalloc.h>
#include <linux/kallsyms.h>
@@ -394,6 +395,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
}
debug_check_no_locks_freed(addr, area->size);
+ debug_check_no_obj_freed(addr, area->size);
if (deallocate_pages) {
int i;
@@ -545,6 +547,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @node: node to use for allocation or -1
+ * @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eceac9f9032..9a29901ad3b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -191,7 +191,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
shrinker->nr += delta;
if (shrinker->nr < 0) {
printk(KERN_ERR "%s: nr=%ld\n",
- __FUNCTION__, shrinker->nr);
+ __func__, shrinker->nr);
shrinker->nr = max_pass;
}
@@ -339,7 +339,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
if (PagePrivate(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
- printk("%s: orphaned page\n", __FUNCTION__);
+ printk("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
}
}
@@ -1299,6 +1299,9 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
* hope that some of these pages can be written. But if the allocating task
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
+ *
+ * returns: 0, if no pages reclaimed
+ * else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
@@ -1347,7 +1350,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
}
total_scanned += sc->nr_scanned;
if (nr_reclaimed >= sc->swap_cluster_max) {
- ret = 1;
+ ret = nr_reclaimed;
goto out;
}
@@ -1370,7 +1373,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
}
/* top priority shrink_caches still had more to do? don't OOM, then */
if (!sc->all_unreclaimable && scan_global_lru(sc))
- ret = 1;
+ ret = nr_reclaimed;
out:
/*
* Now that we've scanned all the zones at this priority level, note
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ec6035eda93..1a32130b958 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -548,6 +548,10 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
+ /* check memoryless node */
+ if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
+ return 0;
+
seq_printf(m, "Page block order: %d\n", pageblock_order);
seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
seq_putc(m, '\n');
@@ -608,6 +612,7 @@ static const char * const vmstat_text[] = {
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",
+ "nr_writeback_temp",
#ifdef CONFIG_NUMA
"numa_hit",