From 2d3854a37e8b767a51aba38ed6d22817b0631e33 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Wed, 5 Nov 2008 13:39:10 +1100
Subject: cpumask: introduce new API, without changing anything

Impact: introduce new APIs

We want to deprecate cpumasks on the stack, as we are headed for
ginormous numbers of CPUs.  Eventually, we want to head towards an
undefined 'struct cpumask' so they can never be declared on stack.

1) New cpumask functions which take pointers instead of copies.
   (cpus_* -> cpumask_*)

2) Several new helpers to reduce requirements for temporary cpumasks
   (cpumask_first_and, cpumask_next_and, cpumask_any_and)

3) Helpers for declaring cpumasks on or offstack for large NR_CPUS
   (cpumask_var_t, alloc_cpumask_var and free_cpumask_var)

4) 'struct cpumask' for explicitness and to mark new-style code.

5) Make iterator functions stop at nr_cpu_ids (a runtime constant),
   not NR_CPUS, for time efficiency and for smaller dynamic allocations
   in future.

6) cpumask_copy() so we can allocate less than a full cpumask eventually
   (for alloc_cpumask_var), and so we can eliminate the 'struct cpumask'
   definition eventually.

7) work_on_cpu() helper for doing a task on a CPU, rather than saving the
   old cpumask for the current thread and manipulating it.

8) smp_call_function_many() which is smp_call_function_mask() except
   taking a cpumask pointer.

Note that this patch simply introduces the new functions and leaves
the obsolescent ones in place.  This is to simplify the transition
patches.

Signed-off-by: Rusty Russell
Signed-off-by: Ingo Molnar
---
 lib/cpumask.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
(limited to 'lib')

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 5f97dc25ef9..5ceb4211c83 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -2,6 +2,7 @@
 #include <linux/bitops.h>
 #include <linux/cpumask.h>
 #include <linux/module.h>
+#include <linux/bootmem.h>

 int __first_cpu(const cpumask_t *srcp)
 {
@@ -35,3 +36,75 @@ int __any_online_cpu(const cpumask_t *mask)
 	return cpu;
 }
 EXPORT_SYMBOL(__any_online_cpu);
+
+/**
+ * cpumask_next_and - get the next cpu in *src1p & *src2p
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @src1p: the first cpumask pointer
+ * @src2p: the second cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus set in both.
+ */
+int cpumask_next_and(int n, const struct cpumask *src1p,
+		     const struct cpumask *src2p)
+{
+	while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
+		if (cpumask_test_cpu(n, src2p))
+			break;
+	return n;
+}
+EXPORT_SYMBOL(cpumask_next_and);
+
+/**
+ * cpumask_any_but - return a "random" cpu in a cpumask, but not this one.
+ * @mask: the cpumask to search
+ * @cpu: the cpu to ignore.
+ *
+ * Often used to find any cpu but smp_processor_id() in a mask.
+ * Returns >= nr_cpu_ids if no cpus set.
+ */
+int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
+{
+	unsigned int i;
+
+	for_each_cpu(i, mask)
+		if (i != cpu)
+			break;
+	return i;
+}
+
+/* These are not inline because of header tangles. */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	if (likely(slab_is_available()))
+		*mask = kmalloc(cpumask_size(), flags);
+	else {
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+		printk(KERN_ERR
+			"=> alloc_cpumask_var: kmalloc not available!\n");
+		dump_stack();
+#endif
+		*mask = NULL;
+	}
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	if (!*mask) {
+		printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
+		dump_stack();
+	}
+#endif
+	return *mask != NULL;
+}
+EXPORT_SYMBOL(alloc_cpumask_var);
+
+void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
+{
+	*mask = alloc_bootmem(cpumask_size());
+}
+
+void free_cpumask_var(cpumask_var_t mask)
+{
+	kfree(mask);
+}
+EXPORT_SYMBOL(free_cpumask_var);
+#endif
-- cgit v1.2.3

From cd83e42c6b0413dcbb548c2ead799111ff7e6a13 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Fri, 7 Nov 2008 11:12:29 +1100
Subject: cpumask: new API, v2

- add cpumask_of()
- add free_bootmem_cpumask_var()

Signed-off-by: Rusty Russell
Signed-off-by: Ingo Molnar
---
 lib/cpumask.c | 5 +++++
 1 file changed, 5 insertions(+)
(limited to 'lib')

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 5ceb4211c83..2ebc3a9a746 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -107,4 +107,9 @@ void free_cpumask_var(cpumask_var_t mask)
 	kfree(mask);
 }
 EXPORT_SYMBOL(free_cpumask_var);
+
+void free_bootmem_cpumask_var(cpumask_var_t mask)
+{
+	free_bootmem((unsigned long)mask, cpumask_size());
+}
 #endif
-- cgit v1.2.3

From 984f2f377fdfd098f5ae58d09ee04d5e29e6112b Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Sat, 8 Nov 2008 20:24:19 +1100
Subject: cpumask: introduce new API, without changing anything, v3

Impact: cleanup

Clean up based on feedback from Andrew Morton and others:

- change to inline functions instead of macros
- add __init to bootmem method
- add a missing debug check

Signed-off-by: Rusty Russell
Signed-off-by: Ingo Molnar
---
 lib/cpumask.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
(limited to 'lib')

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 2ebc3a9a746..8d03f22c6ce 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -67,6 +67,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 {
 	unsigned int i;

+	cpumask_check(cpu);
 	for_each_cpu(i, mask)
 		if (i != cpu)
 			break;
@@ -108,7 +109,7 @@ void free_cpumask_var(cpumask_var_t mask)
 }
 EXPORT_SYMBOL(free_cpumask_var);

-void free_bootmem_cpumask_var(cpumask_var_t mask)
+void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 {
 	free_bootmem((unsigned long)mask, cpumask_size());
 }
-- cgit v1.2.3

From 1e74f3000b86969de421ca0da08f42e7d21cbd99 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Mon, 17 Nov 2008 16:24:34 +0900
Subject: swiotlb: use coherent_dma_mask in alloc_coherent

Impact: fix DMA buffer allocation coherency bug in certain configs

This patch fixes swiotlb to use dev->coherent_dma_mask in
swiotlb_alloc_coherent().

coherent_dma_mask is a subset of dma_mask (equal to it most of the
time), enumerating the address range that a given device is able to
DMA to/from in a cache-coherent way.

Currently, however, swiotlb implicitly uses dev->dma_mask in
alloc_coherent() via address_needs_mapping(), although alloc_coherent
is really supposed to use coherent_dma_mask.

This bug could break drivers that use a smaller coherent_dma_mask than
dma_mask (though the current code works for the majority, which use the
same mask for coherent_dma_mask and dma_mask).
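To make the two masks concrete, here is a minimal sketch (not part of
this patch; the probe function and its device are hypothetical) of how
a driver of this era could legitimately end up with a coherent_dma_mask
narrower than its dma_mask, which is exactly the configuration the old
address_needs_mapping() check handled wrongly:

    #include <linux/pci.h>
    #include <linux/dma-mapping.h>

    static int example_probe(struct pci_dev *pdev)
    {
            int err;

            /* Streaming DMA (map_single/map_sg) may use 64-bit addresses... */
            err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
            if (err)
                    return err;

            /* ...but coherent allocations must stay below 4GB.  This sets
             * pdev->dev.coherent_dma_mask, which swiotlb_alloc_coherent()
             * honours only after this patch. */
            err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
            if (err)
                    return err;

            return 0;
    }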
Signed-off-by: FUJITA Tomonori
Cc: tony.luck@intel.com
Signed-off-by: Ingo Molnar
---
 lib/swiotlb.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 78330c37a61..5f6c629a924 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -467,9 +467,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	dma_addr_t dev_addr;
 	void *ret;
 	int order = get_order(size);
+	u64 dma_mask = DMA_32BIT_MASK;
+
+	if (hwdev && hwdev->coherent_dma_mask)
+		dma_mask = hwdev->coherent_dma_mask;

 	ret = (void *)__get_free_pages(flags, order);
-	if (ret && address_needs_mapping(hwdev, virt_to_bus(ret), size)) {
+	if (ret && !is_buffer_dma_capable(dma_mask, virt_to_bus(ret), size)) {
 		/*
 		 * The allocated memory isn't reachable by the device.
 		 * Fall back on swiotlb_map_single().
@@ -493,9 +497,9 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	dev_addr = virt_to_bus(ret);

 	/* Confirm address can be DMA'd by device */
-	if (address_needs_mapping(hwdev, dev_addr, size)) {
+	if (!is_buffer_dma_capable(dma_mask, dev_addr, size)) {
 		printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
-		       (unsigned long long)*hwdev->dma_mask,
+		       (unsigned long long)dma_mask,
 		       (unsigned long long)dev_addr);

 		/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
-- cgit v1.2.3

From f652c521e0bec2e70cf123f47e80117a7e6ed139 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Wed, 19 Nov 2008 15:36:19 -0800
Subject: lib/scatterlist.c: fix kunmap() argument in sg_miter_stop()

kunmap() takes as its argument the struct page that originally got
kmap()'d; however, sg_miter_stop() passed it the kernel virtual address
instead, resulting in weird stuff.

Somehow I ended up fixing this bug by accident while looking for a bug
in the same area.

Reported-by: kerneloops.org
Acked-by: Tejun Heo
Signed-off-by: Arjan van de Ven
Cc: Hugh Dickins
Cc: [2.6.27.x]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/scatterlist.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'lib')

diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 8d2688ff135..b7b449dafbe 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -395,7 +395,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
 			WARN_ON(!irqs_disabled());
 			kunmap_atomic(miter->addr, KM_BIO_SRC_IRQ);
 		} else
-			kunmap(miter->addr);
+			kunmap(miter->page);

 		miter->page = NULL;
 		miter->addr = NULL;
-- cgit v1.2.3

From 6ff2d39b91aec3dcae951afa982059e3dd9b49dc Mon Sep 17 00:00:00 2001
From: Manfred Spraul
Date: Mon, 1 Dec 2008 13:14:02 -0800
Subject: lib/idr.c: fix rcu related race with idr_find

2nd part of the fixes needed for
http://bugzilla.kernel.org/show_bug.cgi?id=11796.

When the idr tree was either grown or shrunk, the updates to the number
of layers and the top pointer were not atomic.  This race caused
crashes.

The attached patch fixes that by replicating the layers counter in each
layer, thus idr_find doesn't need idp->layers anymore.
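Put differently: a reader no longer needs idp->top and idp->layers to
be updated together, because the depth now travels with the node that
it describes.  The resulting lookup pattern, condensed from the
idr_find() hunk in the diff below (sketch only, bound checks trimmed):

    struct idr_layer *p;
    int n;

    p = rcu_dereference(idp->top);      /* one racy read is enough... */
    if (!p)
            return NULL;
    n = (p->layer + 1) * IDR_BITS;      /* ...its depth comes with it */

    while (n > 0 && p) {
            n -= IDR_BITS;
            p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
    }
    return (void *)p;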
Signed-off-by: Manfred Spraul
Cc: Clement Calmels
Cc: Nadia Derbey
Cc: Pierre Peiffer
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/idr.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)
(limited to 'lib')

diff --git a/lib/idr.c b/lib/idr.c
index e728c7fccc4..7a785a0c2ea 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -185,6 +185,7 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa)
 			new = get_from_free_list(idp);
 			if (!new)
 				return -1;
+			new->layer = l-1;
 			rcu_assign_pointer(p->ary[m], new);
 			p->count++;
 		}
@@ -210,6 +211,7 @@ build_up:
 	if (unlikely(!p)) {
 		if (!(p = get_from_free_list(idp)))
 			return -1;
+		p->layer = 0;
 		layers = 1;
 	}
 	/*
@@ -237,6 +239,7 @@ build_up:
 		}
 		new->ary[0] = p;
 		new->count = 1;
+		new->layer = layers-1;
 		if (p->bitmap == IDR_FULL)
 			__set_bit(0, &new->bitmap);
 		p = new;
@@ -493,17 +496,21 @@ void *idr_find(struct idr *idp, int id)
 	int n;
 	struct idr_layer *p;

-	n = idp->layers * IDR_BITS;
 	p = rcu_dereference(idp->top);
+	if (!p)
+		return NULL;
+	n = (p->layer+1) * IDR_BITS;

 	/* Mask off upper bits we don't use for the search. */
 	id &= MAX_ID_MASK;

 	if (id >= (1 << n))
 		return NULL;
+	BUG_ON(n == 0);

 	while (n > 0 && p) {
 		n -= IDR_BITS;
+		BUG_ON(n != p->layer*IDR_BITS);
 		p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
 	}
 	return((void *)p);
@@ -582,8 +589,11 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
 	int n;
 	struct idr_layer *p, *old_p;

-	n = idp->layers * IDR_BITS;
 	p = idp->top;
+	if (!p)
+		return ERR_PTR(-EINVAL);
+
+	n = (p->layer+1) * IDR_BITS;

 	id &= MAX_ID_MASK;
-- cgit v1.2.3

From fd3d664fef97cf01f8e28fe0b024ad52f3bbc1bc Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 9 Dec 2008 13:14:11 -0800
Subject: percpu_counter: fix CPU unplug race in percpu_counter_destroy()

We should first delete the counter from the percpu_counters list before
freeing memory, or a percpu_counter_hotcpu_callback() could dereference
a NULL pointer.

Signed-off-by: Eric Dumazet
Acked-by: David S. Miller
Cc: Peter Zijlstra
Cc: Mingming Cao
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/percpu_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'lib')

diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index a8663890a88..71b265c330c 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -104,13 +104,13 @@ void percpu_counter_destroy(struct percpu_counter *fbc)
 	if (!fbc->counters)
 		return;

-	free_percpu(fbc->counters);
-	fbc->counters = NULL;
 #ifdef CONFIG_HOTPLUG_CPU
 	mutex_lock(&percpu_counters_lock);
 	list_del(&fbc->list);
 	mutex_unlock(&percpu_counters_lock);
 #endif
+	free_percpu(fbc->counters);
+	fbc->counters = NULL;
 }
 EXPORT_SYMBOL(percpu_counter_destroy);
-- cgit v1.2.3

From 71c5576fbd809f2015f4eddf72e501e298720cf3 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 9 Dec 2008 13:14:13 -0800
Subject: revert "percpu counter: clean up percpu_counter_sum_and_set()"

Revert commit 1f7c14c62ce63805f9574664a6c6de3633d4a354

    Author: Mingming Cao
    Date:   Thu Oct 9 12:50:59 2008 -0400
    percpu counter: clean up percpu_counter_sum_and_set()

Before this patch we had the following:

percpu_counter_sum(): return the percpu_counter's value

percpu_counter_sum_and_set(): return the percpu_counter's value,
copying that value into the central value and zeroing the per-cpu
counters before returning.

After this patch, percpu_counter_sum_and_set() has gone, and
percpu_counter_sum() gets the old percpu_counter_sum_and_set()
functionality.
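Side by side, the two behaviours are roughly (a sketch, simplified from
the code changed below):

    /* percpu_counter_sum(): read-only summation. */
    ret = fbc->count;
    for_each_online_cpu(cpu)
            ret += *per_cpu_ptr(fbc->counters, cpu);

    /* percpu_counter_sum_and_set(): additionally folds the per-cpu
     * deltas into the central count, zeroing them as it goes. */
    ret = fbc->count;
    for_each_online_cpu(cpu) {
            s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
            ret += *pcount;
            *pcount = 0;    /* this store is the problem, see below */
    }
    fbc->count = ret;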
Problem is, as Eric points out, the old percpu_counter_sum_and_set()
functionality was racy and wrong.  It zeroes out counters on "other"
cpus, without holding any locks that would prevent races against
updates from those other CPUs.

This patch reverts 1f7c14c62ce63805f9574664a6c6de3633d4a354.  This
means that percpu_counter_sum_and_set() still has the race, but
percpu_counter_sum() does not.

Note that this is not a simple revert - ext4 has since started using
percpu_counter_sum() for its dirty_blocks counter as well.

Note that this revert patch changes percpu_counter_sum() semantics.

Before the patch, a call to percpu_counter_sum() will bring the
counter's central counter mostly up-to-date, so a following
percpu_counter_read() will return a close value.

After this patch, a call to percpu_counter_sum() will leave the
counter's central accumulator unaltered, so a subsequent call to
percpu_counter_read() can now return a significantly inaccurate result.

If there is any code in the tree which was introduced after
e8ced39d5e8911c662d4d69a342b9d053eaaac4e was merged, and which depends
upon the new percpu_counter_sum() semantics, that code will break.

Reported-by: Eric Dumazet
Cc: "David S. Miller"
Cc: Peter Zijlstra
Cc: Mingming Cao
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/percpu_counter.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)
(limited to 'lib')

diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 71b265c330c..dba1530a5b2 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-s64 __percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
 {
 	s64 ret;
 	int cpu;
@@ -62,9 +62,11 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
-		*pcount = 0;
+		if (set)
+			*pcount = 0;
 	}
-	fbc->count = ret;
+	if (set)
+		fbc->count = ret;

 	spin_unlock(&fbc->lock);
 	return ret;
-- cgit v1.2.3

From 02d211688727ad02bb4555b1aa8ae2de16b21b39 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 9 Dec 2008 13:14:14 -0800
Subject: revert "percpu_counter: new function percpu_counter_sum_and_set"

Revert commit e8ced39d5e8911c662d4d69a342b9d053eaaac4e

    Author: Mingming Cao
    Date:   Fri Jul 11 19:27:31 2008 -0400
    percpu_counter: new function percpu_counter_sum_and_set

As described in revert "percpu counter: clean up
percpu_counter_sum_and_set()", the new percpu_counter_sum_and_set() is
racy against updates to the cpu-local accumulators on other CPUs.
Revert that change.

This means that ext4 will be slow again.  But correct.

Reported-by: Eric Dumazet
Cc: "David S. Miller"
Cc: Peter Zijlstra
Cc: Mingming Cao
Cc:
Cc: [2.6.27.x]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/percpu_counter.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)
(limited to 'lib')

diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index dba1530a5b2..b255b939bc1 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
  * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive()
 */
-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
+s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
 	s64 ret;
 	int cpu;
@@ -62,12 +62,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
-		if (set)
-			*pcount = 0;
 	}
-	if (set)
-		fbc->count = ret;
-
 	spin_unlock(&fbc->lock);
 	return ret;
 }
-- cgit v1.2.3

From 711a49a07f84f914aac26a52143f6e7526571143 Mon Sep 17 00:00:00 2001
From: Manfred Spraul
Date: Wed, 10 Dec 2008 18:17:06 +0100
Subject: lib/idr.c: Fix bug introduced by RCU fix

The last patch to lib/idr.c caused a bug if idr_get_new_above() was
called on an empty idr.

Usually, nodes stay on the same layer.  New layers are added to the top
of the tree.  The exception is idr_get_new_above() on an empty tree: in
this case, the new root node is first added on layer 0, then moved
upwards.  p->layer was not updated.

As usual: you shall never rely on the source code comments; they will
only mislead you.

Signed-off-by: Manfred Spraul
Signed-off-by: Linus Torvalds
---
 lib/idr.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
(limited to 'lib')

diff --git a/lib/idr.c b/lib/idr.c
index 7a785a0c2ea..1c4f9281f41 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -220,8 +220,14 @@ build_up:
 	 */
 	while ((layers < (MAX_LEVEL - 1)) && (id >= (1 << (layers*IDR_BITS)))) {
 		layers++;
-		if (!p->count)
+		if (!p->count) {
+			/* special case: if the tree is currently empty,
+			 * then we grow the tree by moving the top node
+			 * upwards.
+			 */
+			p->layer++;
 			continue;
+		}
 		if (!(new = get_from_free_list(idp))) {
 			/*
 			 * The allocation failed.  If we built part of
-- cgit v1.2.3

From 1c93ca09863a544cec24fc8e33491f645df80e59 Mon Sep 17 00:00:00 2001
From: Johann Felix Soden
Date: Thu, 30 Oct 2008 22:44:39 +0100
Subject: driver core: fix using 'ret' variable in unregister_dynamic_debug_module

The 'ret' variable is assigned, but not used in the return statement.
Fix this.

Signed-off-by: Johann Felix Soden
Acked-by: Jason Baron
Signed-off-by: Greg Kroah-Hartman
---
 lib/dynamic_printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'lib')

diff --git a/lib/dynamic_printk.c b/lib/dynamic_printk.c
index d83660fd6fd..d0fd0e4ed9a 100644
--- a/lib/dynamic_printk.c
+++ b/lib/dynamic_printk.c
@@ -135,7 +135,7 @@ int unregister_dynamic_debug_module(char *mod_name)
 	nr_entries--;
 out:
 	up(&debug_list_mutex);
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(unregister_dynamic_debug_module);
-- cgit v1.2.3

From aa6f3c640781c8ac213a4ed3011dcced36f899e3 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann
Date: Sun, 30 Nov 2008 14:01:26 +0100
Subject: driver core: add newlines to debugging enabled/disabled messages

Both messages are missing the newline and thus dmesg output gets
scrambled.
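For reference (an illustrative sketch; the messages and module name are
hypothetical): printk() only completes a log line on '\n', so a message
without one is continued by whatever gets printed next, e.g.

    printk(KERN_DEBUG "debugging enabled for module %s", mod_name);
    /* ...some unrelated message later... */
    printk(KERN_INFO "eth0: link up\n");

ends up in dmesg run together as a single line:

    debugging enabled for module foo eth0: link up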
Signed-off-by: Marcel Holtmann
Signed-off-by: Greg Kroah-Hartman
---
 lib/dynamic_printk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'lib')

diff --git a/lib/dynamic_printk.c b/lib/dynamic_printk.c
index d0fd0e4ed9a..8e30295e856 100644
--- a/lib/dynamic_printk.c
+++ b/lib/dynamic_printk.c
@@ -289,7 +289,7 @@ static ssize_t pr_debug_write(struct file *file, const char __user *buf,
 					dynamic_enabled = DYNAMIC_ENABLED_SOME;
 				err = 0;
 				printk(KERN_DEBUG
-					"debugging enabled for module %s",
+					"debugging enabled for module %s\n",
 					elem->name);
 			} else if (!value && (elem->enable == 1)) {
 				elem->enable = 0;
@@ -309,7 +309,7 @@ static ssize_t pr_debug_write(struct file *file, const char __user *buf,
 				err = 0;
 				printk(KERN_DEBUG
 					"debugging disabled for module "
-					"%s", elem->name);
+					"%s\n", elem->name);
 			}
 		}
 	}
-- cgit v1.2.3
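Returning to the cpumask series at the top of this log, the calling
convention those patches are driving at looks like this (a minimal
usage sketch against the API introduced above, not code from any of
these commits; error handling shortened):

    #include <linux/cpumask.h>
    #include <linux/slab.h>

    static int example(void)
    {
            cpumask_var_t mask;
            int cpu;

            /* Stack storage for small NR_CPUS, kmalloc for large
             * (CONFIG_CPUMASK_OFFSTACK); either way the mask can no
             * longer blow the stack, but allocation may now fail. */
            if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                    return -ENOMEM;

            cpumask_copy(mask, &cpu_online_map);

            /* Caller is assumed to have preemption disabled here. */
            cpu = cpumask_any_but(mask, smp_processor_id());

            free_cpumask_var(mask);
            return cpu < nr_cpu_ids ? 0 : -ENODEV;
    }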