From 6047a007d0f6b7395cd158f3bdda34ab39a48821 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 14 Jan 2009 12:22:25 +0200 Subject: SLUB: Use ->objsize from struct kmem_cache_cpu in slab_free() There's no reason to use ->objsize from struct kmem_cache in slab_free() for the SLAB_DEBUG_OBJECTS case. All it does is generate extra cache pressure as we try very hard not to touch struct kmem_cache in the fast-path. Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6392ae5cc6b..f21e25ad453 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1724,7 +1724,7 @@ static __always_inline void slab_free(struct kmem_cache *s, c = get_cpu_slab(s, smp_processor_id()); debug_check_no_locks_freed(object, c->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + debug_check_no_obj_freed(object, c->objsize); if (likely(page == c->page && c->node >= 0)) { object[c->offset] = c->freelist; c->freelist = object; -- cgit v1.2.3 From 6e9ed0cc4b963fde66ab47d9fb19147631e44555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9rico=20Wang?= Date: Mon, 19 Jan 2009 02:00:38 +0800 Subject: slob: clean up the code - Use NULL instead of plain 0; - Rename slob_page() to is_slob_page(); - Define slob_page() to convert void* to struct slob_page*; - Rename slob_new_page() to slob_new_pages(); - Define slob_free_pages() accordingly. Compile tests only. Signed-off-by: WANG Cong Signed-off-by: Matt Mackall Cc: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slob.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed..c9cd31d27e6 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -126,9 +126,9 @@ static LIST_HEAD(free_slob_medium); static LIST_HEAD(free_slob_large); /* - * slob_page: True for all slob pages (false for bigblock pages) + * is_slob_page: True for all slob pages (false for bigblock pages) */ -static inline int slob_page(struct slob_page *sp) +static inline int is_slob_page(struct slob_page *sp) { return PageSlobPage((struct page *)sp); } @@ -143,6 +143,11 @@ static inline void clear_slob_page(struct slob_page *sp) __ClearPageSlobPage((struct page *)sp); } +static inline struct slob_page *slob_page(const void *addr) +{ + return (struct slob_page *)virt_to_page(addr); +} + /* * slob_page_free: true for pages on free_slob_pages list. */ @@ -230,7 +235,7 @@ static int slob_last(slob_t *s) return !((unsigned long)slob_next(s) & ~PAGE_MASK); } -static void *slob_new_page(gfp_t gfp, int order, int node) +static void *slob_new_pages(gfp_t gfp, int order, int node) { void *page; @@ -247,12 +252,17 @@ static void *slob_new_page(gfp_t gfp, int order, int node) return page_address(page); } +static void slob_free_pages(void *b, int order) +{ + free_pages((unsigned long)b, order); +} + /* * Allocate a slob block within a given slob_page sp. */ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) { - slob_t *prev, *cur, *aligned = 0; + slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { @@ -349,10 +359,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) /* Not enough space: must allocate a new page */ if (!b) { - b = slob_new_page(gfp & ~__GFP_ZERO, 0, node); + b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); if (!b) - return 0; - sp = (struct slob_page *)virt_to_page(b); + return NULL; + sp = slob_page(b); set_slob_page(sp); spin_lock_irqsave(&slob_lock, flags); @@ -384,7 +394,7 @@ static void slob_free(void *block, int size) return; BUG_ON(!size); - sp = (struct slob_page *)virt_to_page(block); + sp = slob_page(block); units = SLOB_UNITS(size); spin_lock_irqsave(&slob_lock, flags); @@ -476,7 +486,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) } else { void *ret; - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); if (ret) { struct page *page; page = virt_to_page(ret); @@ -494,8 +504,8 @@ void kfree(const void *block) if (unlikely(ZERO_OR_NULL_PTR(block))) return; - sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) { + sp = slob_page(block); + if (is_slob_page(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); @@ -513,8 +523,8 @@ size_t ksize(const void *block) if (unlikely(block == ZERO_SIZE_PTR)) return 0; - sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) { + sp = slob_page(block); + if (is_slob_page(sp)) { int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; @@ -572,7 +582,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) b = slob_alloc(c->size, flags, c->align, node); else - b = slob_new_page(flags, get_order(c->size), node); + b = slob_new_pages(flags, get_order(c->size), node); if (c->ctor) c->ctor(b); @@ -586,7 +596,7 @@ static void __kmem_cache_free(void *b, int size) if (size < PAGE_SIZE) slob_free(b, size); else - free_pages((unsigned long)b, get_order(size)); + slob_free_pages(b, get_order(size)); } static void kmem_rcu_free(struct rcu_head *head) -- cgit v1.2.3 From ffadd4d0feb5376c82dc3a4104731b7ce2794edc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 17 Feb 2009 12:05:07 -0500 Subject: SLUB: Introduce and use SLUB_MAX_SIZE and SLUB_PAGE_SHIFT constants As a preparational patch to bump up page allocator pass-through threshold, introduce two new constants SLUB_MAX_SIZE and SLUB_PAGE_SHIFT and convert mm/slub.c to use them. Reported-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a2..5a5e7f5bf79 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2475,7 +2475,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2537,7 +2537,7 @@ panic: } #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2658,7 +2658,7 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2686,7 +2686,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, flags, node); s = get_slab(size, flags); @@ -2985,7 +2985,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -3022,7 +3022,7 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); @@ -3222,7 +3222,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3238,7 +3238,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); -- cgit v1.2.3 From e8120ff1ffc51102ead1f4c98a3fd5d26fefc722 Mon Sep 17 00:00:00 2001 From: Zhang Yanmin Date: Thu, 12 Feb 2009 18:00:17 +0200 Subject: SLUB: Fix default slab order for big object sizes The default order of kmalloc-8192 on 2*4 stoakley is an issue of calculate_order. slab_size order name ------------------------------------------------- 4096 3 sgpool-128 8192 2 kmalloc-8192 16384 3 kmalloc-16384 kmalloc-8192's default order is smaller than sgpool-128's. On 4*4 tigerton machine, a similiar issue appears on another kmem_cache. Function calculate_order uses 'min_objects /= 2;' to shrink. Plus size calculation/checking in slab_order, sometimes above issue appear. Below patch against 2.6.29-rc2 fixes it. I checked the default orders of all kmem_cache and they don't become smaller than before. So the patch wouldn't hurt performance. Signed-off-by Zhang Yanmin Signed-off-by: Pekka Enberg --- mm/slub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5a5e7f5bf79..c01a7a3001d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1844,6 +1844,7 @@ static inline int calculate_order(int size) int order; int min_objects; int fraction; + int max_objects; /* * Attempt to find best configuration for a slab. This @@ -1856,6 +1857,9 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); + max_objects = (PAGE_SIZE << slub_max_order)/size; + min_objects = min(min_objects, max_objects); + while (min_objects > 1) { fraction = 16; while (fraction >= 4) { @@ -1865,7 +1869,7 @@ static inline int calculate_order(int size) return order; fraction /= 2; } - min_objects /= 2; + min_objects --; } /* -- cgit v1.2.3 From 3b89d7d881a1dbb4da158f7eb5d6b3ceefc72810 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 22 Feb 2009 17:40:07 -0800 Subject: slub: move min_partial to struct kmem_cache Although it allows for better cacheline use, it is unnecessary to save a copy of the cache's min_partial value in each kmem_cache_node. Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a2..4fff385b17a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1335,7 +1335,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) n = get_node(s, zone_to_nid(zone)); if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > n->min_partial) { + n->nr_partial > s->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1387,7 +1387,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < n->min_partial) { + if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs @@ -1928,17 +1928,6 @@ static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; - - /* - * The larger the object size is, the more pages we want on the partial - * list to avoid pounding the page allocator excessively. - */ - n->min_partial = ilog2(s->size); - if (n->min_partial < MIN_PARTIAL) - n->min_partial = MIN_PARTIAL; - else if (n->min_partial > MAX_PARTIAL) - n->min_partial = MAX_PARTIAL; - spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG @@ -2181,6 +2170,15 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } #endif +static void calculate_min_partial(struct kmem_cache *s, unsigned long min) +{ + if (min < MIN_PARTIAL) + min = MIN_PARTIAL; + else if (min > MAX_PARTIAL) + min = MAX_PARTIAL; + s->min_partial = min; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -2319,6 +2317,11 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (!calculate_sizes(s, -1)) goto error; + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. + */ + calculate_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; -- cgit v1.2.3 From 73d342b169db700b5a6ad626fe4b86911efec8db Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Sun, 22 Feb 2009 17:40:09 -0800 Subject: slub: add min_partial sysfs tunable Now that a cache's min_partial has been moved to struct kmem_cache, it's possible to easily tune it from userspace by adding a sysfs attribute. It may not be desirable to keep a large number of partial slabs around if a cache is used infrequently and memory, especially when constrained by a cgroup, is scarce. It's better to allow userspace to set the minimum policy per cache instead of relying explicitly on kmem_cache_shrink(). The memory savings from simply moving min_partial from struct kmem_cache_node to struct kmem_cache is obviously not significant (unless maybe you're from SGI or something), at the largest it's # allocated caches * (MAX_NUMNODES - 1) * sizeof(unsigned long) The true savings occurs when userspace reduces the number of partial slabs that would otherwise be wasted, especially on machines with a large number of nodes (ia64 with CONFIG_NODES_SHIFT at 10 for default?). As well as the kernel estimates ideal values for n->min_partial and ensures it's within a sane range, userspace has no other input other than writing to /sys/kernel/slab/cache/shrink. There simply isn't any better heuristic to add when calculating the partial values for a better estimate that works for all possible caches. And since it's currently a static value, the user really has no way of reclaiming that wasted space, which can be significant when constrained by a cgroup (either cpusets or, later, memory controller slab limits) without shrinking it entirely. This also allows the user to specify that increased fragmentation and more partial slabs are actually desired to avoid the cost of allocating new slabs at runtime for specific caches. There's also no reason why this should be a per-struct kmem_cache_node value in the first place. You could argue that a machine would have such node size asymmetries that it should be specified on a per-node basis, but we know nobody is doing that right now since it's a purely static value at the moment and there's no convenient way to tune that via slub's sysfs interface. Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 4fff385b17a..a3e2d552ff4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3838,6 +3838,26 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR(order); +static ssize_t min_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%lu\n", s->min_partial); +} + +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long min; + int err; + + err = strict_strtoul(buf, 10, &min); + if (err) + return err; + + calculate_min_partial(s, min); + return length; +} +SLAB_ATTR(min_partial); + static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (s->ctor) { @@ -4153,6 +4173,7 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &min_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &total_objects_attr.attr, -- cgit v1.2.3 From c0bdb232b23b51c23e551041510ad6bea5ce5a92 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 25 Feb 2009 09:16:35 +0200 Subject: slub: rename calculate_min_partial() to set_min_partial() As suggested by Christoph Lameter, rename calculate_min_partial() to set_min_partial() as the function doesn't really do any calculations. Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a3e2d552ff4..77268d18e78 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2170,7 +2170,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } #endif -static void calculate_min_partial(struct kmem_cache *s, unsigned long min) +static void set_min_partial(struct kmem_cache *s, unsigned long min) { if (min < MIN_PARTIAL) min = MIN_PARTIAL; @@ -2321,7 +2321,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ - calculate_min_partial(s, ilog2(s->size)); + set_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -3853,7 +3853,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, if (err) return err; - calculate_min_partial(s, min); + set_min_partial(s, min); return length; } SLAB_ATTR(min_partial); -- cgit v1.2.3 From 1a00df4a2cc001dd9f45890e690548c24b2fa2d9 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 7 Mar 2009 00:36:21 +0900 Subject: slub: use get_track() Use get_track() in set_track() Signed-off-by: Akinobu Mita Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Pekka Enberg --- mm/slub.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f21e25ad453..e150b5c0424 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -374,14 +374,8 @@ static struct track *get_track(struct kmem_cache *s, void *object, static void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { - struct track *p; - - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + struct track *p = get_track(s, object, alloc); - p += alloc; if (addr) { p->addr = addr; p->cpu = smp_processor_id(); -- cgit v1.2.3 From 6fb8f424393025674fde7869b59f485d1e352182 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 16 Mar 2009 21:00:28 +1100 Subject: slob: fix lockup in slob_free() Don't hold SLOB lock when freeing the page. Reduces lock hold width. See the following thread for discussion of the bug: http://marc.info/?l=linux-kernel&m=123709983214143&w=2 Reported-by: Ingo Molnar Acked-by: Matt Mackall Signed-off-by: Nick Piggin Signed-off-by: Pekka Enberg --- mm/slob.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed..f901653707a 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -393,10 +393,11 @@ static void slob_free(void *block, int size) /* Go directly to page allocator. Do not pass slob allocator */ if (slob_page_free(sp)) clear_slob_page_free(sp); + spin_unlock_irqrestore(&slob_lock, flags); clear_slob_page(sp); free_slob_page(sp); free_page((unsigned long)b); - goto out; + return; } if (!slob_page_free(sp)) { -- cgit v1.2.3