From bd95b88d9e51fcbf392a7e90338a8fcc3499cbd6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Oct 2008 09:31:27 -0400 Subject: ftrace: release functions from hash The x86 architecture uses a static recording of mcount caller locations and is not affected by this patch. For architectures still using the dynamic ftrace daemon, this patch is critical. It removes the race between the recording of a function that calls mcount, the unloading of a module, and the ftrace daemon updating the call sites. This patch adds the releasing of the hash functions that the daemon uses to update the mcount call sites. When a module is unloaded, not only are the replaced call site table update, but now so is the hash recorded functions that the ftrace daemon will use. Again, architectures that implement MCOUNT_RECORD are not affected by this (which currently only x86 has). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4dda4f60a2a..1f54a94189f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -164,10 +164,14 @@ static DEFINE_SPINLOCK(ftrace_hash_lock); #define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) #define ftrace_hash_unlock(flags) \ spin_unlock_irqrestore(&ftrace_hash_lock, flags) +static void ftrace_release_hash(unsigned long start, unsigned long end); #else /* This is protected via the ftrace_lock with MCOUNT_RECORD. */ #define ftrace_hash_lock(flags) do { (void)(flags); } while (0) #define ftrace_hash_unlock(flags) do { } while(0) +static inline void ftrace_release_hash(unsigned long start, unsigned long end) +{ +} #endif /* @@ -347,6 +351,7 @@ void ftrace_release(void *start, unsigned long size) } spin_unlock(&ftrace_lock); + ftrace_release_hash(s, e); } static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) @@ -1659,6 +1664,44 @@ void __init ftrace_init(void) ftrace_disabled = 1; } #else /* CONFIG_FTRACE_MCOUNT_RECORD */ + +static void ftrace_release_hash(unsigned long start, unsigned long end) +{ + struct dyn_ftrace *rec; + struct hlist_node *t, *n; + struct hlist_head *head, temp_list; + unsigned long flags; + int i, cpu; + + preempt_disable_notrace(); + + /* disable incase we call something that calls mcount */ + cpu = raw_smp_processor_id(); + per_cpu(ftrace_shutdown_disable_cpu, cpu)++; + + ftrace_hash_lock(flags); + + for (i = 0; i < FTRACE_HASHSIZE; i++) { + INIT_HLIST_HEAD(&temp_list); + head = &ftrace_hash[i]; + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry_safe(rec, t, n, head, node) { + if (rec->flags & FTRACE_FL_FREE) + continue; + + if ((rec->ip >= start) && (rec->ip < end)) + ftrace_free_rec(rec); + } + } + + ftrace_hash_unlock(flags); + + per_cpu(ftrace_shutdown_disable_cpu, cpu)--; + preempt_enable_notrace(); + +} + static int ftraced(void *ignore) { unsigned long usecs; -- cgit v1.2.3 From c2db8054c1eaf99983d8deee347876b01c26c2cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Oct 2008 19:06:11 -0400 Subject: ftrace: fix depends A lot of tracers have HAVE_FTRACE as a dependent config where it really should not. The HAVE_FTRACE is a misnomer (soon to be fixed) and describes if the architecture has the function tracer (mcount) implemented. The ftrace infrastructure is implemented in all archs. 
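To make the distinction concrete: "having the function tracer" means the architecture's mcount entry point hands every traced function call to the generic tracer callback (in this series that callback ends up being ftrace_trace_function / function_trace_call(ip, parent_ip)). A rough C-level sketch of that arch-side hook follows; it is illustrative only, real architectures implement this in assembly and the names below are made up:

        #include <linux/ftrace.h>

        /* illustrative: the generic tracer installs its callback here */
        static void (*trace_callback)(unsigned long ip, unsigned long parent_ip);

        /* conceptually called at the entry of every traced kernel function */
        static void notrace mcount_hook_sketch(unsigned long ip,
                                               unsigned long parent_ip)
        {
                /* hand the call site and its caller to the active tracer */
                if (trace_callback)
                        trace_callback(ip, parent_ip);
        }

The rest of the tracing infrastructure (ring buffer, tracers, debugfs interface) is architecture independent, which is why these tracers should not have depended on HAVE_FTRACE in the first place.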
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1cb3e1f616a..5866edbc2ed 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,7 +49,6 @@ config IRQSOFF_TRACER default n depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME - depends on HAVE_FTRACE depends on DEBUG_KERNEL select TRACE_IRQFLAGS select TRACING @@ -73,7 +72,6 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT - depends on HAVE_FTRACE depends on DEBUG_KERNEL select TRACING select TRACER_MAX_TRACE @@ -101,7 +99,6 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - depends on HAVE_FTRACE depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER @@ -112,7 +109,6 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" - depends on HAVE_FTRACE depends on DEBUG_KERNEL select TRACING select MARKERS @@ -122,7 +118,6 @@ config CONTEXT_SWITCH_TRACER config BOOT_TRACER bool "Trace boot initcalls" - depends on HAVE_FTRACE depends on DEBUG_KERNEL select TRACING help -- cgit v1.2.3 From 606576ce816603d9fe1fb453a88bc6eea16ca709 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Oct 2008 19:06:12 -0400 Subject: ftrace: rename FTRACE to FUNCTION_TRACER Due to confusion between the ftrace infrastructure and the gcc profiling tracer "ftrace", this patch renames the config options from FTRACE to FUNCTION_TRACER. The other two names that are offspring from FTRACE DYNAMIC_FTRACE and FTRACE_MCOUNT_RECORD will stay the same. This patch was generated mostly by script, and partially by hand. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/Makefile | 4 ++-- kernel/sysctl.c | 2 +- kernel/trace/Kconfig | 17 +++++++++-------- kernel/trace/Makefile | 6 +++--- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_irqsoff.c | 4 ++-- kernel/trace/trace_sched_wakeup.c | 4 ++-- kernel/trace/trace_selftest.c | 4 ++-- 9 files changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 8f9ce7ec21b..85f588a9d0b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -13,7 +13,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ CFLAGS_REMOVE_sched.o = -mno-spe -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg CFLAGS_REMOVE_lockdep_proc.o = -pg @@ -86,7 +86,7 @@ obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o -obj-$(CONFIG_FTRACE) += trace/ +obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e4d6a..619eb9f3acd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -464,7 +464,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER { .ctl_name = CTL_UNNUMBERED, .procname = "ftrace_enabled", diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5866edbc2ed..3533c583df4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,11 +1,12 @@ # -# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# Architectures that offer an 
FUNCTION_TRACER implementation should +# select HAVE_FUNCTION_TRACER: # config NOP_TRACER bool -config HAVE_FTRACE +config HAVE_FUNCTION_TRACER bool select NOP_TRACER @@ -28,9 +29,9 @@ config TRACING select STACKTRACE select TRACEPOINTS -config FTRACE +config FUNCTION_TRACER bool "Kernel Function Tracer" - depends on HAVE_FTRACE + depends on HAVE_FUNCTION_TRACER depends on DEBUG_KERNEL select FRAME_POINTER select TRACING @@ -136,9 +137,9 @@ config BOOT_TRACER config STACK_TRACER bool "Trace max stack" - depends on HAVE_FTRACE + depends on HAVE_FUNCTION_TRACER depends on DEBUG_KERNEL - select FTRACE + select FUNCTION_TRACER select STACKTRACE help This special tracer records the maximum stack footprint of the @@ -155,7 +156,7 @@ config STACK_TRACER config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" - depends on FTRACE + depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE depends on DEBUG_KERNEL default y @@ -165,7 +166,7 @@ config DYNAMIC_FTRACE with a No-Op instruction) as they are called. A table is created to dynamically enable them again. - This way a CONFIG_FTRACE kernel is slightly larger, but otherwise + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise has native performance as long as no tracing is active. The changes to the code are done by a kernel thread that diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index a85dfba88ba..c8228b1a49e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,7 +1,7 @@ # Do not instrument the tracer itself: -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) @@ -10,13 +10,13 @@ CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif -obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o -obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d345d649d07..aeb2f2505bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -851,7 +851,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) preempt_enable_notrace(); } -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER static void function_trace_call(unsigned long ip, unsigned long parent_ip) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f1f99572cde..6889ca48f1f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -335,7 +335,7 @@ void update_max_tr_single(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER void tracing_start_function_trace(void); void tracing_stop_function_trace(void); #else diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a7db7f040ae..9c74071c10e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -63,7 +63,7 @@ irq_trace(void) */ static __cacheline_aligned_in_smp unsigned long max_sequence; -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* * irqsoff uses its own tracer function to keep the overhead down: */ @@ -104,7 +104,7 @@ static struct ftrace_ops trace_ops 
__read_mostly = { .func = irqsoff_tracer_call, }; -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ /* * Should this new latency be reported/recorded? diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fe4a252c236..3ae93f16b56 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,7 +31,7 @@ static raw_spinlock_t wakeup_lock = static void __wakeup_reset(struct trace_array *tr); -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* * irqsoff uses its own tracer function to keep the overhead down: */ @@ -96,7 +96,7 @@ static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, }; -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ /* * Should this new latency be reported/recorded? diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 09cf230d7ec..95815d26a04 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -70,7 +70,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) return ret; } -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -226,7 +226,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) return ret; } -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_IRQSOFF_TRACER int -- cgit v1.2.3 From 3ce83aea86bf46fd1bff59d2e6d16f48fdce22fc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Oct 2008 19:06:13 -0400 Subject: ftrace: rename the ftrace tracer to function To avoid further confusion between the ftrace infrastructure and the function tracer. This patch renames the "ftrace" function tracer to "function". Now in available_tracers, instead of "ftrace" there will be "function". This makes more sense, since people will not know exactly what the "ftrace" tracer does. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index e90eb0c2c56..0f85a64003d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -64,7 +64,7 @@ static void function_trace_ctrl_update(struct trace_array *tr) static struct tracer function_trace __read_mostly = { - .name = "ftrace", + .name = "function", .init = function_trace_init, .reset = function_trace_reset, .ctrl_update = function_trace_ctrl_update, -- cgit v1.2.3 From 81520a1b0649d0701205b818714a8c1e1cfbbb5b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Oct 2008 21:24:18 -0400 Subject: ftrace: stack tracer only record when on stack The stack trace API does not record if the stack is not on the current task's stack. That is, if the stack is the interrupt stack or NMI stack, the output does not show. Also, the size of those stacks are not consistent with the size of the thread stack, this makes the calculation of the stack size usually bogus. This all confuses the stack tracer. I unfortunately do not have time to fix all these problems, but this patch does record the worst stack when the stack pointer is on the tasks stack (instead of bogus numbers). The patch simply returns if the stack pointer is not on the task's stack. 
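The check relies on object_is_on_stack(), which boils down to a bounds test against the current task's thread stack. A simplified sketch of that kind of check, not the exact kernel helper:

        #include <linux/sched.h>

        /* simplified sketch of what object_is_on_stack() tests */
        static inline int on_task_stack_sketch(const void *obj)
        {
                void *stack = task_stack_page(current);   /* base of thread stack */

                /* true only if obj lies inside [stack, stack + THREAD_SIZE) */
                return obj >= stack && obj < stack + THREAD_SIZE;
        }

Interrupt and NMI stacks are separate allocations, so an address taken there fails the test and the tracer bails out, which is exactly the behaviour this patch wants.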
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 74c5d9a3afa..be682b62fe5 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -44,6 +44,10 @@ static inline void check_stack(void) if (this_size <= max_stack_size) return; + /* we do not handle interrupt stacks yet */ + if (!object_is_on_stack(&this_size)) + return; + raw_local_irq_save(flags); __raw_spin_lock(&max_stack_lock); -- cgit v1.2.3 From 3786fc710c32b61464c322e5cd0c3d1d34ae72d0 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Tue, 21 Oct 2008 19:49:09 -0400 Subject: irq: make variable static This variable is only used in the source file, so make it static. Signed-off-by: Roel Kluin Signed-off-by: Ingo Molnar --- kernel/irq/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index fac014a81b2..4d161c70ba5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -220,7 +220,7 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) } } -void register_default_affinity_proc(void) +static void register_default_affinity_proc(void) { #ifdef CONFIG_SMP proc_create("irq/default_smp_affinity", 0600, NULL, -- cgit v1.2.3 From 17d80fd07d35ae1d231b3378ee4f00ace54f9d31 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Oct 2008 16:31:18 +0200 Subject: tracing: create tracers menu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We seem to have plenty tracers, lets create a menu and not clutter the already cluttered debug menu more. Signed-off-by: Peter Zijlstra Acked-by: Frédéric Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3533c583df4..bc535cb91de 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -29,6 +29,8 @@ config TRACING select STACKTRACE select TRACEPOINTS +menu "Tracers" + config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER @@ -191,3 +193,5 @@ config FTRACE_STARTUP_TEST a series of tests are made to verify that the tracer is functioning properly. It will do tests on all the configured tracers of ftrace. + +endmenu -- cgit v1.2.3 From 4ce72a2c063a7fa8e42a9435440ae3364115a58d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 22 Oct 2008 15:25:26 +0800 Subject: sched: add CONFIG_SMP consistency a patch from Henrik Austad did this: >> Do not declare select_task_rq as part of sched_class when CONFIG_SMP is >> not set. Peter observed: > While a proper cleanup, could you do it by re-arranging the methods so > as to not create an additional ifdef? Do not declare select_task_rq and some other methods as part of sched_class when CONFIG_SMP is not set. Also gather those methods to avoid CONFIG_SMP mess. 
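The pattern applied here is simply to keep every CONFIG_SMP-only method of the ops structure in one conditional block, so each initializer needs a single #ifdef instead of several scattered ones. A generic illustration of the shape (member names are made up; the real change is in the diff below):

        /* illustrative ops structure with the SMP-only hooks grouped */
        struct demo_sched_ops {
                void (*enqueue)(void);          /* always built */
                void (*dequeue)(void);

        #ifdef CONFIG_SMP
                /* every SMP-only method lives in this single block */
                int  (*select_rq)(void);
                void (*load_balance)(void);
                void (*move_one_task)(void);
        #endif

                void (*set_curr)(void);         /* always built */
        };

With one block per structure, adding or removing an SMP-only method touches a single #ifdef rather than sprinkling them through every initializer.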
Idea-by: Henrik Austad Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Acked-by: Henrik Austad Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- kernel/sched_idletask.c | 5 ++--- kernel/sched_rt.c | 5 ++--- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a0aa38b10fd..8de48a5da35 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1593,9 +1593,6 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_fair, -#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_wakeup, @@ -1603,6 +1600,8 @@ static const struct sched_class fair_sched_class = { .put_prev_task = put_prev_task_fair, #ifdef CONFIG_SMP + .select_task_rq = select_task_rq_fair, + .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, #endif diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index dec4ccabe2f..8a21a2e28c1 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -105,9 +105,6 @@ static const struct sched_class idle_sched_class = { /* dequeue is not valid, we print a debug message there: */ .dequeue_task = dequeue_task_idle, -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_idle, -#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_curr_idle, @@ -115,6 +112,8 @@ static const struct sched_class idle_sched_class = { .put_prev_task = put_prev_task_idle, #ifdef CONFIG_SMP + .select_task_rq = select_task_rq_idle, + .load_balance = load_balance_idle, .move_one_task = move_one_task_idle, #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index cdf5740ab03..c9aa5bede22 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1502,9 +1502,6 @@ static const struct sched_class rt_sched_class = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_rt, -#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_curr_rt, @@ -1512,6 +1509,8 @@ static const struct sched_class rt_sched_class = { .put_prev_task = put_prev_task_rt, #ifdef CONFIG_SMP + .select_task_rq = select_task_rq_rt, + .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, -- cgit v1.2.3 From 6ae2a0765ab764da11cc305058ee5333810228f4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Oct 2008 10:22:06 -0400 Subject: ring-buffer: fix free page The pages of a buffer was originally pointing to the page struct, it now points to the page address. The freeing of the page still uses the page frame free "__free_page" instead of the correct free_page to the address. 
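The two APIs differ only in what they take: __free_page() frees by struct page pointer, free_page() frees by virtual address, and each must be paired with the matching allocator. A short illustrative sketch of the pairing (not taken from the ring buffer code):

        #include <linux/gfp.h>

        static void page_api_sketch(void)
        {
                struct page *pg    = alloc_page(GFP_KERNEL);      /* struct page *   */
                unsigned long addr = __get_free_page(GFP_KERNEL); /* virtual address */

                if (pg)
                        __free_page(pg);   /* pairs with alloc_page()       */
                if (addr)
                        free_page(addr);   /* pairs with __get_free_page()  */
        }

Since buffer_page->page now stores the address form, free_page((unsigned long)bpage->page) is the matching free.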
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 94af1fe56bb..091aeefe321 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -130,7 +130,7 @@ struct buffer_page { static inline void free_buffer_page(struct buffer_page *bpage) { if (bpage->page) - __free_page(bpage->page); + free_page((unsigned long)bpage->page); kfree(bpage); } -- cgit v1.2.3 From 593eb8a2d63e95772a5f22d746f18a997c5ee463 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:32:59 -0400 Subject: ftrace: return error on failed modified text. Have the ftrace_modify_code return error values: -EFAULT on error of reading the address -EINVAL if what is read does not match what it expected -EPERM if the write fails to update after a successful match. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1f54a94189f..b2de8de7735 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -596,22 +596,22 @@ ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; unsigned char *nop, *call; - int failed; + int ret; ip = rec->ip; nop = ftrace_nop_replace(); call = ftrace_call_replace(ip, mcount_addr); - failed = ftrace_modify_code(ip, call, nop); - if (failed) { - switch (failed) { - case 1: + ret = ftrace_modify_code(ip, call, nop); + if (ret) { + switch (ret) { + case -EFAULT: WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); print_ip_sym(ip); break; - case 2: + case -EINVAL: WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); @@ -620,6 +620,15 @@ ftrace_code_disable(struct dyn_ftrace *rec) print_ip_ins(" replace: ", nop); printk(KERN_CONT "\n"); break; + case -EPERM: + WARN_ON_ONCE(1); + pr_info("ftrace faulted on writing "); + print_ip_sym(ip); + break; + default: + WARN_ON_ONCE(1); + pr_info("ftrace faulted on unknown error "); + print_ip_sym(ip); } rec->flags |= FTRACE_FL_FAILED; -- cgit v1.2.3 From 81adbdc029ecc416d56563e7f159100181dd711d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:33:02 -0400 Subject: ftrace: only have ftrace_kill atomic When an anomaly is detected, we need a way to completely disable ftrace. Right now we have two functions: ftrace_kill and ftrace_kill_atomic. The ftrace_kill tries to do it in a "nice" way by converting everything back to a nop. The "nice" way is dangerous itself, so this patch removes it and only has the "atomic" version, which is all that is needed. 
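A hedged usage sketch of the surviving API from an anomaly path (the caller below is hypothetical; ftrace_dump() is switched to calling ftrace_kill() in this very patch):

        #include <linux/ftrace.h>

        /* hypothetical caller: something detected corrupted tracer state */
        static void trace_anomaly_sketch(void)
        {
                /* hard-stop ftrace: no kstop_machine, no code patching,
                 * so this is safe even from panic/atomic context */
                ftrace_kill();
        }

Because it only flips ftrace_disabled/ftrace_enabled and clears the trace function, it can be called from any context, which is all that panic-time users need.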
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 42 ++---------------------------------------- kernel/trace/trace.c | 2 +- 2 files changed, 3 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b2de8de7735..93245ae046e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1549,22 +1549,6 @@ int ftrace_force_update(void) return ret; } -static void ftrace_force_shutdown(void) -{ - struct task_struct *task; - int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; - - mutex_lock(&ftraced_lock); - task = ftraced_task; - ftraced_task = NULL; - ftraced_suspend = -1; - ftrace_run_update_code(command); - mutex_unlock(&ftraced_lock); - - if (task) - kthread_stop(task); -} - static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1795,17 +1779,16 @@ core_initcall(ftrace_dynamic_init); # define ftrace_shutdown() do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) -# define ftrace_force_shutdown() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** - * ftrace_kill_atomic - kill ftrace from critical sections + * ftrace_kill - kill ftrace * * This function should be used by panic code. It stops ftrace * but in a not so nice way. If you need to simply kill ftrace * from a non-atomic section, use ftrace_kill. */ -void ftrace_kill_atomic(void) +void ftrace_kill(void) { ftrace_disabled = 1; ftrace_enabled = 0; @@ -1815,27 +1798,6 @@ void ftrace_kill_atomic(void) clear_ftrace_function(); } -/** - * ftrace_kill - totally shutdown ftrace - * - * This is a safety measure. If something was detected that seems - * wrong, calling this function will keep ftrace from doing - * any more modifications, and updates. - * used when something went wrong. - */ -void ftrace_kill(void) -{ - mutex_lock(&ftrace_sysctl_lock); - ftrace_disabled = 1; - ftrace_enabled = 0; - - clear_ftrace_function(); - mutex_unlock(&ftrace_sysctl_lock); - - /* Try to totally disable ftrace */ - ftrace_force_shutdown(); -} - /** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aeb2f2505bc..333a5162149 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3097,7 +3097,7 @@ void ftrace_dump(void) dump_ran = 1; /* No turning back! */ - ftrace_kill_atomic(); + ftrace_kill(); for_each_tracing_cpu(cpu) { atomic_inc(&global_trace.data[cpu]->disabled); -- cgit v1.2.3 From 6912896e994ddaf06cc0f6d3f2098bc4b59bdd84 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:33:03 -0400 Subject: ftrace: add ftrace warn on to disable ftrace Add ftrace warn on to disable ftrace as well as report a warning. 
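At call sites the macro collapses the open-coded warn-then-disable sequence into one statement. A condensed sketch of the before/after shape ("anomaly" is a placeholder condition; the real hunks are below):

        /* condensed shape of the call-site change */
        static void sanity_check_sketch(int anomaly)
        {
                /* before: each site warned and then disabled ftrace by hand
                 *     WARN_ON_ONCE(1);
                 *     ftrace_disabled = 1;
                 *     ftrace_enabled = 0;
                 */

                /* after: one statement reports and shuts the tracer down */
                FTRACE_WARN_ON_ONCE(anomaly);
        }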
[ Thanks to Andrew Morton for suggesting using the WARN_ON return value ] Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 93245ae046e..b08996ca561 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,6 +32,18 @@ #include "trace.h" +#define FTRACE_WARN_ON(cond) \ + do { \ + if (WARN_ON(cond)) \ + ftrace_kill(); \ + } while (0) + +#define FTRACE_WARN_ON_ONCE(cond) \ + do { \ + if (WARN_ON_ONCE(cond)) \ + ftrace_kill(); \ + } while (0) + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; @@ -363,10 +375,8 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) rec = ftrace_free_records; if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); ftrace_free_records = NULL; - ftrace_disabled = 1; - ftrace_enabled = 0; return NULL; } @@ -415,7 +425,7 @@ ftrace_record_ip(unsigned long ip) key = hash_long(ip, FTRACE_HASHBITS); - WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + FTRACE_WARN_ON_ONCE(key >= FTRACE_HASHSIZE); if (ftrace_ip_in_hash(ip, key)) goto out; @@ -607,12 +617,12 @@ ftrace_code_disable(struct dyn_ftrace *rec) if (ret) { switch (ret) { case -EFAULT: - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); print_ip_sym(ip); break; case -EINVAL: - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); print_ip_ins(" expected: ", call); @@ -621,12 +631,12 @@ ftrace_code_disable(struct dyn_ftrace *rec) printk(KERN_CONT "\n"); break; case -EPERM: - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); print_ip_sym(ip); break; default: - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } @@ -1722,8 +1732,7 @@ static int ftraced(void *ignore) ftrace_update_cnt != 1 ? "s" : "", ftrace_update_tot_cnt, usecs, usecs != 1 ? "s" : ""); - ftrace_disabled = 1; - WARN_ON_ONCE(1); + FTRACE_WARN_ON_ONCE(1); } } mutex_unlock(&ftraced_lock); -- cgit v1.2.3 From cb7be3b2fc2cf089ee52b16f0fd9ebb29e9944e1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:33:05 -0400 Subject: ftrace: remove daemon The ftrace daemon is complex and error prone. This patch strips it out of the code. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 322 ++++-------------------------------------- kernel/trace/trace_selftest.c | 14 -- 2 files changed, 28 insertions(+), 308 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b08996ca561..e758cab0836 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -165,25 +165,8 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) } #ifdef CONFIG_DYNAMIC_FTRACE - #ifndef CONFIG_FTRACE_MCOUNT_RECORD -/* - * The hash lock is only needed when the recording of the mcount - * callers are dynamic. That is, by the caller themselves and - * not recorded via the compilation. 
- */ -static DEFINE_SPINLOCK(ftrace_hash_lock); -#define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) -#define ftrace_hash_unlock(flags) \ - spin_unlock_irqrestore(&ftrace_hash_lock, flags) -static void ftrace_release_hash(unsigned long start, unsigned long end); -#else -/* This is protected via the ftrace_lock with MCOUNT_RECORD. */ -#define ftrace_hash_lock(flags) do { (void)(flags); } while (0) -#define ftrace_hash_unlock(flags) do { } while(0) -static inline void ftrace_release_hash(unsigned long start, unsigned long end) -{ -} +# error Dynamic ftrace depends on MCOUNT_RECORD #endif /* @@ -194,8 +177,6 @@ static inline void ftrace_release_hash(unsigned long start, unsigned long end) */ static unsigned long mcount_addr = MCOUNT_ADDR; -static struct task_struct *ftraced_task; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -212,7 +193,6 @@ static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); -static DEFINE_MUTEX(ftraced_lock); static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { @@ -230,10 +210,6 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static int ftraced_trigger; -static int ftraced_suspend; -static int ftraced_stop; - static int ftrace_record_suspend; static struct dyn_ftrace *ftrace_free_records; @@ -398,7 +374,6 @@ static void ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *node; - unsigned long flags; unsigned long key; int resched; int cpu; @@ -430,24 +405,18 @@ ftrace_record_ip(unsigned long ip) if (ftrace_ip_in_hash(ip, key)) goto out; - ftrace_hash_lock(flags); - /* This ip may have hit the hash before the lock */ if (ftrace_ip_in_hash(ip, key)) - goto out_unlock; + goto out; node = ftrace_alloc_dyn_node(ip); if (!node) - goto out_unlock; + goto out; node->ip = ip; ftrace_add_hash(node, key); - ftraced_trigger = 1; - - out_unlock: - ftrace_hash_unlock(flags); out: per_cpu(ftrace_shutdown_disable_cpu, cpu)--; @@ -647,7 +616,7 @@ ftrace_code_disable(struct dyn_ftrace *rec) return 1; } -static int __ftrace_update_code(void *ignore); +static int ftrace_update_code(void *ignore); static int __ftrace_modify_code(void *data) { @@ -659,7 +628,7 @@ static int __ftrace_modify_code(void *data) * Update any recorded ips now that we have the * machine stopped */ - __ftrace_update_code(NULL); + ftrace_update_code(NULL); ftrace_replace_code(1); tracing_on = 1; } else if (*command & FTRACE_DISABLE_CALLS) { @@ -686,26 +655,9 @@ static void ftrace_run_update_code(int command) stop_machine(__ftrace_modify_code, &command, NULL); } -void ftrace_disable_daemon(void) -{ - /* Stop the daemon from calling kstop_machine */ - mutex_lock(&ftraced_lock); - ftraced_stop = 1; - mutex_unlock(&ftraced_lock); - - ftrace_force_update(); -} - -void ftrace_enable_daemon(void) -{ - mutex_lock(&ftraced_lock); - ftraced_stop = 0; - mutex_unlock(&ftraced_lock); - - ftrace_force_update(); -} - static ftrace_func_t saved_ftrace_func; +static int ftrace_start; +static DEFINE_MUTEX(ftrace_start_lock); static void ftrace_startup(void) { @@ -714,9 +666,9 @@ static void ftrace_startup(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftraced_lock); - ftraced_suspend++; - if (ftraced_suspend == 1) + mutex_lock(&ftrace_start_lock); + ftrace_start++; + if (ftrace_start == 1) command |= FTRACE_ENABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -729,7 +681,7 @@ static void ftrace_startup(void) 
ftrace_run_update_code(command); out: - mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_start_lock); } static void ftrace_shutdown(void) @@ -739,9 +691,9 @@ static void ftrace_shutdown(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftraced_lock); - ftraced_suspend--; - if (!ftraced_suspend) + mutex_lock(&ftrace_start_lock); + ftrace_start--; + if (!ftrace_start) command |= FTRACE_DISABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -754,7 +706,7 @@ static void ftrace_shutdown(void) ftrace_run_update_code(command); out: - mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_start_lock); } static void ftrace_startup_sysctl(void) @@ -764,15 +716,15 @@ static void ftrace_startup_sysctl(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftraced_lock); + mutex_lock(&ftrace_start_lock); /* Force update next time */ saved_ftrace_func = NULL; - /* ftraced_suspend is true if we want ftrace running */ - if (ftraced_suspend) + /* ftrace_start is true if we want ftrace running */ + if (ftrace_start) command |= FTRACE_ENABLE_CALLS; ftrace_run_update_code(command); - mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_start_lock); } static void ftrace_shutdown_sysctl(void) @@ -782,20 +734,20 @@ static void ftrace_shutdown_sysctl(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftraced_lock); - /* ftraced_suspend is true if ftrace is running */ - if (ftraced_suspend) + mutex_lock(&ftrace_start_lock); + /* ftrace_start is true if ftrace is running */ + if (ftrace_start) command |= FTRACE_DISABLE_CALLS; ftrace_run_update_code(command); - mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_start_lock); } static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int __ftrace_update_code(void *ignore) +static int ftrace_update_code(void *ignore) { int i, save_ftrace_enabled; cycle_t start, stop; @@ -869,7 +821,6 @@ static int __ftrace_update_code(void *ignore) stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; - ftraced_trigger = 0; ftrace_enabled = save_ftrace_enabled; ftrace_record_suspend--; @@ -877,17 +828,6 @@ static int __ftrace_update_code(void *ignore) return 0; } -static int ftrace_update_code(void) -{ - if (unlikely(ftrace_disabled) || - !ftrace_enabled || !ftraced_trigger) - return 0; - - stop_machine(__ftrace_update_code, NULL, NULL); - - return 1; -} - static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) { struct ftrace_page *pg; @@ -1425,10 +1365,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) } mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - if (iter->filtered && ftraced_suspend && ftrace_enabled) + mutex_lock(&ftrace_start_lock); + if (iter->filtered && ftrace_start && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); - mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_start_lock); mutex_unlock(&ftrace_sysctl_lock); kfree(iter); @@ -1448,55 +1388,6 @@ ftrace_notrace_release(struct inode *inode, struct file *file) return ftrace_regex_release(inode, file, 0); } -static ssize_t -ftraced_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - /* don't worry about races */ - char *buf = ftraced_stop ? 
"disabled\n" : "enabled\n"; - int r = strlen(buf); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -ftraced_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - long val; - int ret; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - if (strncmp(buf, "enable", 6) == 0) - val = 1; - else if (strncmp(buf, "disable", 7) == 0) - val = 0; - else { - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) - return ret; - - val = !!val; - } - - if (val) - ftrace_enable_daemon(); - else - ftrace_disable_daemon(); - - filp->f_pos += cnt; - - return cnt; -} - static struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -1527,38 +1418,6 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; -static struct file_operations ftraced_fops = { - .open = tracing_open_generic, - .read = ftraced_read, - .write = ftraced_write, -}; - -/** - * ftrace_force_update - force an update to all recording ftrace functions - */ -int ftrace_force_update(void) -{ - int ret = 0; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - - /* - * If ftraced_trigger is not set, then there is nothing - * to update. - */ - if (ftraced_trigger && !ftrace_update_code()) - ret = -EBUSY; - - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - return ret; -} - static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1589,17 +1448,11 @@ static __init int ftrace_init_debugfs(void) pr_warning("Could not create debugfs " "'set_ftrace_notrace' entry\n"); - entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, - NULL, &ftraced_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'ftraced_enabled' entry\n"); return 0; } fs_initcall(ftrace_init_debugfs); -#ifdef CONFIG_FTRACE_MCOUNT_RECORD static int ftrace_convert_nops(unsigned long *start, unsigned long *end) { @@ -1619,7 +1472,7 @@ static int ftrace_convert_nops(unsigned long *start, /* p is ignored */ local_irq_save(flags); - __ftrace_update_code(p); + ftrace_update_code(p); local_irq_restore(flags); return 0; @@ -1666,122 +1519,6 @@ void __init ftrace_init(void) failed: ftrace_disabled = 1; } -#else /* CONFIG_FTRACE_MCOUNT_RECORD */ - -static void ftrace_release_hash(unsigned long start, unsigned long end) -{ - struct dyn_ftrace *rec; - struct hlist_node *t, *n; - struct hlist_head *head, temp_list; - unsigned long flags; - int i, cpu; - - preempt_disable_notrace(); - - /* disable incase we call something that calls mcount */ - cpu = raw_smp_processor_id(); - per_cpu(ftrace_shutdown_disable_cpu, cpu)++; - - ftrace_hash_lock(flags); - - for (i = 0; i < FTRACE_HASHSIZE; i++) { - INIT_HLIST_HEAD(&temp_list); - head = &ftrace_hash[i]; - - /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry_safe(rec, t, n, head, node) { - if (rec->flags & FTRACE_FL_FREE) - continue; - - if ((rec->ip >= start) && (rec->ip < end)) - ftrace_free_rec(rec); - } - } - - ftrace_hash_unlock(flags); - - per_cpu(ftrace_shutdown_disable_cpu, cpu)--; - preempt_enable_notrace(); - -} - -static int ftraced(void *ignore) -{ - unsigned long usecs; - - while (!kthread_should_stop()) { - - set_current_state(TASK_INTERRUPTIBLE); - - /* check once a second */ - schedule_timeout(HZ); - - if (unlikely(ftrace_disabled)) - continue; - - 
mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - if (!ftraced_suspend && !ftraced_stop && - ftrace_update_code()) { - usecs = nsecs_to_usecs(ftrace_update_time); - if (ftrace_update_tot_cnt > 100000) { - ftrace_update_tot_cnt = 0; - pr_info("hm, dftrace overflow: %lu change%s" - " (%lu total) in %lu usec%s\n", - ftrace_update_cnt, - ftrace_update_cnt != 1 ? "s" : "", - ftrace_update_tot_cnt, - usecs, usecs != 1 ? "s" : ""); - FTRACE_WARN_ON_ONCE(1); - } - } - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - ftrace_shutdown_replenish(); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -static int __init ftrace_dynamic_init(void) -{ - struct task_struct *p; - unsigned long addr; - int ret; - - addr = (unsigned long)ftrace_record_ip; - - stop_machine(ftrace_dyn_arch_init, &addr, NULL); - - /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) { - ret = (int)addr; - goto failed; - } - - ret = ftrace_dyn_table_alloc(NR_TO_INIT); - if (ret) - goto failed; - - p = kthread_run(ftraced, NULL, "ftraced"); - if (IS_ERR(p)) { - ret = -1; - goto failed; - } - - last_ftrace_enabled = ftrace_enabled = 1; - ftraced_task = p; - - return 0; - - failed: - ftrace_disabled = 1; - return ret; -} - -core_initcall(ftrace_dynamic_init); -#endif /* CONFIG_FTRACE_MCOUNT_RECORD */ #else # define ftrace_startup() do { } while (0) @@ -1801,9 +1538,6 @@ void ftrace_kill(void) { ftrace_disabled = 1; ftrace_enabled = 0; -#ifdef CONFIG_DYNAMIC_FTRACE - ftraced_suspend = -1; -#endif clear_ftrace_function(); } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 95815d26a04..90bc752a758 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -99,13 +99,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* passed in by parameter to fool gcc from optimizing */ func(); - /* update the records */ - ret = ftrace_force_update(); - if (ret) { - printk(KERN_CONT ".. ftraced failed .. "); - return ret; - } - /* * Some archs *cough*PowerPC*cough* add charachters to the * start of the function names. We simply put a '*' to @@ -183,13 +176,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* make sure msleep has been recorded */ msleep(1); - /* force the recorded functions to be traced */ - ret = ftrace_force_update(); - if (ret) { - printk(KERN_CONT ".. ftraced failed .. "); - return ret; - } - /* start the tracing */ ftrace_enabled = 1; tracer_enabled = 1; -- cgit v1.2.3 From 4d296c24326783bff1282ac72f310d8bac8df413 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:33:06 -0400 Subject: ftrace: remove mcount set The arch dependent function ftrace_mcount_set was only used by the daemon start up code. This patch removes it. 
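With the mcount-set commands gone, the command bitmask handed to __ftrace_modify_code() only ever asks for call-site and trace-function updates. A hedged usage sketch of how such an update is driven (the wrapper name is illustrative; ftrace_run_update_code() in this file is the real thing):

        #include <linux/stop_machine.h>

        static void run_update_sketch(void)
        {
                int command = FTRACE_ENABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;

                /* patch mcount call sites with all CPUs stopped, as ftrace does */
                stop_machine(__ftrace_modify_code, &command, NULL);
        }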
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e758cab0836..226fd9132d5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -620,7 +620,6 @@ static int ftrace_update_code(void *ignore); static int __ftrace_modify_code(void *data) { - unsigned long addr; int *command = data; if (*command & FTRACE_ENABLE_CALLS) { @@ -639,14 +638,6 @@ static int __ftrace_modify_code(void *data) if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); - if (*command & FTRACE_ENABLE_MCOUNT) { - addr = (unsigned long)ftrace_record_ip; - ftrace_mcount_set(&addr); - } else if (*command & FTRACE_DISABLE_MCOUNT) { - addr = (unsigned long)ftrace_stub; - ftrace_mcount_set(&addr); - } - return 0; } -- cgit v1.2.3 From 08f5ac906d2c0faf96d608c54a0b03177376da8d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 09:33:07 -0400 Subject: ftrace: remove ftrace hash The ftrace hash was used by the ftrace_daemon code. The record ip function would place the calling address (ip) into the hash. The daemon would later read the hash and modify that code. The hash complicates the code. This patch removes it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 243 ++++++++------------------------------------------ kernel/trace/trace.c | 3 - 2 files changed, 35 insertions(+), 211 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 226fd9132d5..07762c08a94 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -189,9 +188,7 @@ static int ftrace_filtered; static int tracing_on; static int frozen_record_count; -static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; - -static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); +static LIST_HEAD(ftrace_new_addrs); static DEFINE_MUTEX(ftrace_regex_lock); @@ -210,8 +207,6 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static int ftrace_record_suspend; - static struct dyn_ftrace *ftrace_free_records; @@ -242,72 +237,6 @@ static inline int record_frozen(struct dyn_ftrace *rec) # define record_frozen(rec) ({ 0; }) #endif /* CONFIG_KPROBES */ -int skip_trace(unsigned long ip) -{ - unsigned long fl; - struct dyn_ftrace *rec; - struct hlist_node *t; - struct hlist_head *head; - - if (frozen_record_count == 0) - return 0; - - head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)]; - hlist_for_each_entry_rcu(rec, t, head, node) { - if (rec->ip == ip) { - if (record_frozen(rec)) { - if (rec->flags & FTRACE_FL_FAILED) - return 1; - - if (!(rec->flags & FTRACE_FL_CONVERTED)) - return 1; - - if (!tracing_on || !ftrace_enabled) - return 1; - - if (ftrace_filtered) { - fl = rec->flags & (FTRACE_FL_FILTER | - FTRACE_FL_NOTRACE); - if (!fl || (fl & FTRACE_FL_NOTRACE)) - return 1; - } - } - break; - } - } - - return 0; -} - -static inline int -ftrace_ip_in_hash(unsigned long ip, unsigned long key) -{ - struct dyn_ftrace *p; - struct hlist_node *t; - int found = 0; - - hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) { - if (p->ip == ip) { - found = 1; - break; - } - } - - return found; -} - -static inline void -ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) -{ - hlist_add_head_rcu(&node->node, &ftrace_hash[key]); -} - -/* called from 
kstop_machine */ -static inline void ftrace_del_hash(struct dyn_ftrace *node) -{ - hlist_del(&node->node); -} - static void ftrace_free_rec(struct dyn_ftrace *rec) { rec->ip = (unsigned long)ftrace_free_records; @@ -362,69 +291,36 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) } if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) - return NULL; + if (!ftrace_pages->next) { + /* allocate another page */ + ftrace_pages->next = + (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages->next) + return NULL; + } ftrace_pages = ftrace_pages->next; } return &ftrace_pages->records[ftrace_pages->index++]; } -static void +static struct dyn_ftrace * ftrace_record_ip(unsigned long ip) { - struct dyn_ftrace *node; - unsigned long key; - int resched; - int cpu; + struct dyn_ftrace *rec; if (!ftrace_enabled || ftrace_disabled) - return; - - resched = need_resched(); - preempt_disable_notrace(); - - /* - * We simply need to protect against recursion. - * Use the the raw version of smp_processor_id and not - * __get_cpu_var which can call debug hooks that can - * cause a recursive crash here. - */ - cpu = raw_smp_processor_id(); - per_cpu(ftrace_shutdown_disable_cpu, cpu)++; - if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1) - goto out; - - if (unlikely(ftrace_record_suspend)) - goto out; - - key = hash_long(ip, FTRACE_HASHBITS); - - FTRACE_WARN_ON_ONCE(key >= FTRACE_HASHSIZE); - - if (ftrace_ip_in_hash(ip, key)) - goto out; - - /* This ip may have hit the hash before the lock */ - if (ftrace_ip_in_hash(ip, key)) - goto out; - - node = ftrace_alloc_dyn_node(ip); - if (!node) - goto out; + return NULL; - node->ip = ip; + rec = ftrace_alloc_dyn_node(ip); + if (!rec) + return NULL; - ftrace_add_hash(node, key); + rec->ip = ip; - out: - per_cpu(ftrace_shutdown_disable_cpu, cpu)--; + list_add(&rec->list, &ftrace_new_addrs); - /* prevent recursion with scheduler */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + return rec; } #define FTRACE_ADDR ((long)(ftrace_caller)) @@ -543,7 +439,6 @@ static void ftrace_replace_code(int enable) rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { - ftrace_del_hash(rec); ftrace_free_rec(rec); } } @@ -551,15 +446,6 @@ static void ftrace_replace_code(int enable) } } -static void ftrace_shutdown_replenish(void) -{ - if (ftrace_pages->next) - return; - - /* allocate another page */ - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); -} - static void print_ip_ins(const char *fmt, unsigned char *p) { int i; @@ -616,18 +502,11 @@ ftrace_code_disable(struct dyn_ftrace *rec) return 1; } -static int ftrace_update_code(void *ignore); - static int __ftrace_modify_code(void *data) { int *command = data; if (*command & FTRACE_ENABLE_CALLS) { - /* - * Update any recorded ips now that we have the - * machine stopped - */ - ftrace_update_code(NULL); ftrace_replace_code(1); tracing_on = 1; } else if (*command & FTRACE_DISABLE_CALLS) { @@ -738,84 +617,34 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ftrace_update_code(void *ignore) +static int ftrace_update_code(void) { - int i, save_ftrace_enabled; + struct dyn_ftrace *p, *t; cycle_t start, stop; - struct dyn_ftrace *p; - struct hlist_node *t, *n; - struct hlist_head *head, temp_list; - - /* Don't be recording funcs now */ - ftrace_record_suspend++; - save_ftrace_enabled = ftrace_enabled; - ftrace_enabled = 0; start = 
ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - /* No locks needed, the machine is stopped! */ - for (i = 0; i < FTRACE_HASHSIZE; i++) { - INIT_HLIST_HEAD(&temp_list); - head = &ftrace_hash[i]; + list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { - /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry_safe(p, t, n, head, node) { - /* Skip over failed records which have not been - * freed. */ - if (p->flags & FTRACE_FL_FAILED) - continue; + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; - /* Unconverted records are always at the head of the - * hash bucket. Once we encounter a converted record, - * simply skip over to the next bucket. Saves ftraced - * some processor cycles (ftrace does its bid for - * global warming :-p ). */ - if (p->flags & (FTRACE_FL_CONVERTED)) - break; + list_del_init(&p->list); - /* Ignore updates to this record's mcount site. - * Reintroduce this record at the head of this - * bucket to attempt to "convert" it again if - * the kprobe on it is unregistered before the - * next run. */ - if (get_kprobe((void *)p->ip)) { - ftrace_del_hash(p); - INIT_HLIST_NODE(&p->node); - hlist_add_head(&p->node, &temp_list); - freeze_record(p); - continue; - } else { - unfreeze_record(p); - } - - /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(p)) { - p->flags |= FTRACE_FL_CONVERTED; - ftrace_update_cnt++; - } else { - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(p->ip)) { - ftrace_del_hash(p); - ftrace_free_rec(p); - } - } - } - - hlist_for_each_entry_safe(p, t, n, &temp_list, node) { - hlist_del(&p->node); - INIT_HLIST_NODE(&p->node); - hlist_add_head(&p->node, head); - } + /* convert record (i.e, patch mcount-call with NOP) */ + if (ftrace_code_disable(p)) { + p->flags |= FTRACE_FL_CONVERTED; + ftrace_update_cnt++; + } else + ftrace_free_rec(p); } stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; - ftrace_enabled = save_ftrace_enabled; - ftrace_record_suspend--; - return 0; } @@ -847,7 +676,7 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) pg = ftrace_pages = ftrace_pages_start; cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld hash entries in %d pages\n", + pr_info("ftrace: allocating %ld entries in %d pages\n", num_to_init, cnt); for (i = 0; i < cnt; i++) { @@ -1451,20 +1280,18 @@ static int ftrace_convert_nops(unsigned long *start, unsigned long addr; unsigned long flags; + mutex_lock(&ftrace_start_lock); p = start; while (p < end) { addr = ftrace_call_adjust(*p++); - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); ftrace_record_ip(addr); - spin_unlock(&ftrace_lock); - ftrace_shutdown_replenish(); } - /* p is ignored */ + /* disable interrupts to prevent kstop machine */ local_irq_save(flags); - ftrace_update_code(p); + ftrace_update_code(); local_irq_restore(flags); + mutex_unlock(&ftrace_start_lock); return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 333a5162149..06951e22944 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -865,9 +865,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!ftrace_function_enabled)) return; - if (skip_trace(ip)) - return; - pc = preempt_count(); resched = need_resched(); preempt_disable_notrace(); -- cgit v1.2.3 From 66b0de3569b00f61978782b9f97aa4803dbec0fb Mon Sep 17 00:00:00 2001 From: 
Ingo Molnar Date: Thu, 23 Oct 2008 16:11:03 +0200 Subject: ftrace: fix build failure fix: kernel/trace/ftrace.c: In function 'ftrace_release': kernel/trace/ftrace.c:271: error: implicit declaration of function 'ftrace_release_hash' release_hash is not needed without dftraced. Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 07762c08a94..27212321eb0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -267,8 +267,6 @@ void ftrace_release(void *start, unsigned long size) } } spin_unlock(&ftrace_lock); - - ftrace_release_hash(s, e); } static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) -- cgit v1.2.3 From acff181d3574244e651913df77332e897b88bff4 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Tue, 21 Oct 2008 19:49:09 -0400 Subject: printk: remove unused code from kernel/printk.c both log_buf_copy() and log_buf_len are unused. Signed-off-by: Ingo Molnar --- kernel/printk.c | 39 --------------------------------------- 1 file changed, 39 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 6341af77eb6..f492f1583d7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -232,45 +232,6 @@ static inline void boot_delay_msec(void) } #endif -/* - * Return the number of unread characters in the log buffer. - */ -static int log_buf_get_len(void) -{ - return logged_chars; -} - -/* - * Copy a range of characters from the log buffer. - */ -int log_buf_copy(char *dest, int idx, int len) -{ - int ret, max; - bool took_lock = false; - - if (!oops_in_progress) { - spin_lock_irq(&logbuf_lock); - took_lock = true; - } - - max = log_buf_get_len(); - if (idx < 0 || idx >= max) { - ret = -1; - } else { - if (len > max) - len = max; - ret = len; - idx += (log_end - max); - while (len-- > 0) - dest[len] = LOG_BUF(idx + len); - } - - if (took_lock) - spin_unlock_irq(&logbuf_lock); - - return ret; -} - /* * Commands to do_syslog: * -- cgit v1.2.3 From bea92112415635ecb7e681355834413c7c048f67 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 22 Oct 2008 19:31:11 +0900 Subject: kernel/resource: fix reserve_region_with_split() section mismatch Impact: cleanup, small kernel text size reduction, no functionality changed reserve_region_with_split() calls in to __reserve_region_with_split(), which is an __init function. The only caller of reserve_region_with_split() is an __init function, so make it __init too. 
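The underlying rule: code in .init.text is discarded after boot, so a function that survives boot must not call into it, and modpost flags such calls as section mismatches. A minimal illustration with made-up names:

        #include <linux/init.h>

        static void __init init_only_helper(void)   /* placed in .init.text */
        {
        }

        /* BAD: regular .text calling code that is freed after boot
         *      -> modpost reports a section mismatch */
        void runtime_caller(void)
        {
                init_only_helper();
        }

        /* GOOD: when every caller runs at boot, mark the caller __init too */
        void __init boot_caller(void)
        {
                init_only_helper();
        }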
Signed-off-by: Paul Mundt Signed-off-by: Ingo Molnar --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 4089d12af6e..7fec0e42723 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -571,7 +571,7 @@ static void __init __reserve_region_with_split(struct resource *root, } -void reserve_region_with_split(struct resource *root, +void __init reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name) { -- cgit v1.2.3 From 01c8c57d668d94f1036d9ab11a22aa24ca16a35d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Oct 2008 11:06:12 +0200 Subject: sched: fix a find_busiest_group buglet In one of the group load balancer patches: commit 408ed066b11cf9ee4536573b4269ee3613bd735e Author: Peter Zijlstra Date: Fri Jun 27 13:41:28 2008 +0200 Subject: sched: hierarchical load vs find_busiest_group The following change: - if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= + if (max_load - this_load + 2*busiest_load_per_task >= busiest_load_per_task * imbn) { made the condition always true, because imbn is [1,2]. Therefore, remove the 2*, and give the it a fair chance. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6625c3c4b10..12bc367d924 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3344,7 +3344,7 @@ small_imbalance: } else this_load_per_task = cpu_avg_load_per_task(this_cpu); - if (max_load - this_load + 2*busiest_load_per_task >= + if (max_load - this_load + busiest_load_per_task >= busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; -- cgit v1.2.3 From 1af5f730fc1bf7c62ec9fb2d307206e18bf40a69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Oct 2008 11:06:13 +0200 Subject: sched: more accurate min_vruntime accounting Mike noticed the current min_vruntime tracking can go wrong and skip the current task. If the only remaining task in the tree is a nice 19 task with huge vruntime, new tasks will be inserted too far to the right too, causing some interactibity issues. min_vruntime can only change due to the leftmost entry disappearing (dequeue_entity()), or by the leftmost entry being incremented past the next entry, which elects a new leftmost (__update_curr()) Due to the current entry not being part of the actual tree, we have to compare the leftmost tree entry with the current entry, and take the leftmost of these two. So create a update_min_vruntime() function that takes computes the leftmost vruntime in the system (either tree of current) and increases the cfs_rq->min_vruntime if the computed value is larger than the previously found min_vruntime. And call this from the two sites we've identified that can change min_vruntime. 
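Restated, the helper takes the smallest vruntime among 'current' and the leftmost tree entry, then only lets cfs_rq->min_vruntime move forward. A standalone sketch of that logic with plain u64 values (illustrative; the kernel version operates on sched entities, see the diff below):

        #include <linux/kernel.h>
        #include <linux/types.h>

        /* NULL means "not present" (no current task / empty tree) */
        static u64 min_vruntime_sketch(u64 min_vruntime,
                                       const u64 *curr, const u64 *leftmost)
        {
                u64 vruntime = min_vruntime;

                if (curr)
                        vruntime = *curr;

                if (leftmost)
                        vruntime = curr ? min(vruntime, *leftmost) : *leftmost;

                /* monotonic: min_vruntime may only move forward */
                return max(min_vruntime, vruntime);
        }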
Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 42d211f08c9..b27ccc52f6a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -223,6 +223,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) return se->vruntime - cfs_rq->min_vruntime; } +static void update_min_vruntime(struct cfs_rq *cfs_rq) +{ + u64 vruntime = cfs_rq->min_vruntime; + + if (cfs_rq->curr) + vruntime = cfs_rq->curr->vruntime; + + if (cfs_rq->rb_leftmost) { + struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, + struct sched_entity, + run_node); + + if (vruntime == cfs_rq->min_vruntime) + vruntime = se->vruntime; + else + vruntime = min_vruntime(vruntime, se->vruntime); + } + + cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +} + /* * Enqueue an entity into the rb-tree: */ @@ -256,15 +277,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * Maintain a cache of leftmost tree entries (it is frequently * used): */ - if (leftmost) { + if (leftmost) cfs_rq->rb_leftmost = &se->run_node; - /* - * maintain cfs_rq->min_vruntime to be a monotonic increasing - * value tracking the leftmost vruntime in the tree. - */ - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, se->vruntime); - } rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -274,18 +288,9 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) { struct rb_node *next_node; - struct sched_entity *next; next_node = rb_next(&se->run_node); cfs_rq->rb_leftmost = next_node; - - if (next_node) { - next = rb_entry(next_node, - struct sched_entity, run_node); - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, - next->vruntime); - } } if (cfs_rq->next == se) @@ -424,6 +429,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; + update_min_vruntime(cfs_rq); } static void update_curr(struct cfs_rq *cfs_rq) @@ -613,13 +619,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { - u64 vruntime; - - if (first_fair(cfs_rq)) { - vruntime = min_vruntime(cfs_rq->min_vruntime, - __pick_next_entity(cfs_rq)->vruntime); - } else - vruntime = cfs_rq->min_vruntime; + u64 vruntime = cfs_rq->min_vruntime; /* * The 'current' period is already promised to the current tasks, @@ -696,6 +696,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); + update_min_vruntime(cfs_rq); } /* -- cgit v1.2.3 From 0d13033bc9257fe65c1aa25e84568b1608da0901 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 24 Oct 2008 11:06:14 +0200 Subject: sched: weaken sync hint Mysql+oltp and pgsql+oltp peaks are still shifted right. The below puts the peaks back to 1 client/server pair per core. Use the avg_overlap information to weaken the sync hint. 
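Read as plain logic, the change amounts to: keep the sync hint only when both tasks' recent wake/run overlap is short. A stand-alone sketch of that decision, with invented names and an arbitrary constant standing in for sysctl_sched_migration_cost:

/* Toy model of weakening a sync-wakeup hint with avg_overlap; not the kernel code. */
#include <stdio.h>

#define MIGRATION_COST 500000ULL   /* ns, stand-in for sysctl_sched_migration_cost */

static int weaken_sync_hint(int sync,
                            unsigned long long curr_avg_overlap,
                            unsigned long long wakee_avg_overlap)
{
        /* Drop the hint when either task tends to keep running long after
         * waking the other: the pair is not really behaving synchronously. */
        if (sync && (curr_avg_overlap > MIGRATION_COST ||
                     wakee_avg_overlap > MIGRATION_COST))
                sync = 0;

        return sync;
}

int main(void)
{
        printf("%d\n", weaken_sync_hint(1, 100000ULL, 200000ULL)); /* stays 1 */
        printf("%d\n", weaken_sync_hint(1, 900000ULL, 200000ULL)); /* weakened to 0 */
        return 0;
}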
Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b27ccc52f6a..b71ee2c6229 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1123,10 +1123,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; - if (!sync && sched_feat(SYNC_WAKEUPS) && - curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - sync = 1; + if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || + p->se.avg_overlap > sysctl_sched_migration_cost)) + sync = 0; /* * If sync wakeup then subtract the (maximum possible) -- cgit v1.2.3 From 464b75273f64be7c81fee975bd6ca9593df3427b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Oct 2008 11:06:15 +0200 Subject: sched: re-instate vruntime based wakeup preemption The advantage is that vruntime based wakeup preemption has a better conceptual model. Here wakeup_gran = 0 means: preempt when 'fair'. Therefore wakeup_gran is the granularity of unfairness we allow in order to make progress. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b71ee2c6229..7af17e04a6d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -143,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } +/* return depth at which a sched entity is present in the hierarchy */ +static inline int depth_se(struct sched_entity *se) +{ + int depth = 0; + + for_each_sched_entity(se) + depth++; + + return depth; +} + +static void +find_matching_se(struct sched_entity **se, struct sched_entity **pse) +{ + int se_depth, pse_depth; + + /* + * preemption test can be made between sibling entities who are in the + * same cfs_rq i.e who have a common parent. Walk up the hierarchy of + * both tasks until we find their ancestors who are siblings of common + * parent. + */ + + /* First walk up until both entities are at same depth */ + se_depth = depth_se(*se); + pse_depth = depth_se(*pse); + + while (se_depth > pse_depth) { + se_depth--; + *se = parent_entity(*se); + } + + while (pse_depth > se_depth) { + pse_depth--; + *pse = parent_entity(*pse); + } + + while (!is_same_group(*se, *pse)) { + *se = parent_entity(*se); + *pse = parent_entity(*pse); + } +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) @@ -193,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return NULL; } +static inline void +find_matching_se(struct sched_entity **se, struct sched_entity **pse) +{ +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -1244,12 +1292,41 @@ static unsigned long wakeup_gran(struct sched_entity *se) * More easily preempt - nice tasks, while not making it harder for * + nice tasks. */ - if (sched_feat(ASYM_GRAN)) - gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); + if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD) + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); return gran; } +/* + * Should 'se' preempt 'curr'. 
+ * + * |s1 + * |s2 + * |s3 + * g + * |<--->|c + * + * w(c, s1) = -1 + * w(c, s2) = 0 + * w(c, s3) = 1 + * + */ +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +{ + s64 gran, vdiff = curr->vruntime - se->vruntime; + + if (vdiff <= 0) + return -1; + + gran = wakeup_gran(curr); + if (vdiff > gran) + return 1; + + return 0; +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1258,7 +1335,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1296,9 +1372,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; } - delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; - if (delta_exec > wakeup_gran(pse)) - resched_task(curr); + find_matching_se(&se, &pse); + + while (se) { + BUG_ON(!pse); + + if (wakeup_preempt_entity(se, pse) == 1) { + resched_task(curr); + break; + } + + se = parent_entity(se); + pse = parent_entity(pse); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.2.3 From 3f3a490480d8ab96e0fe30a41f80f14e6a0c579d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Oct 2008 11:06:16 +0200 Subject: sched: virtual time buddy preemption Since we moved wakeup preemption back to virtual time, it makes sense to move the buddy stuff back as well. The purpose of the buddy scheduling is to allow a quickly scheduling pair of tasks to run away from the group as far as a regular busy task would be allowed under wakeup preemption. This has the advantage that the pair can ping-pong for a while, enjoying cache-hotness. Without buddy scheduling other tasks would interleave destroying the cache. Also, it saves a word in cfs_rq. 
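With this scheme the buddy is only chosen over the leftmost entity when doing so stays within the allowed unfairness, i.e. when the three-way vruntime comparison does not say the leftmost entity must preempt. A stand-alone model of that comparison (invented constants, not the kernel implementation):

/* Toy model of the vruntime-based preemption test and buddy pick. */
#include <stdio.h>
#include <stdint.h>

typedef int64_t s64;

#define WAKEUP_GRAN 1000000LL   /* stand-in for the scaled wakeup granularity, ns */

/*
 * -1: 'se' should not preempt 'curr'; 1: it should;
 *  0: within the granularity, a buddy may still be picked.
 */
static int wakeup_preempt_entity(s64 curr_vruntime, s64 se_vruntime)
{
        s64 vdiff = curr_vruntime - se_vruntime;

        if (vdiff <= 0)
                return -1;
        if (vdiff > WAKEUP_GRAN)
                return 1;
        return 0;
}

/* Pick the buddy only if the leftmost entity would not be unfairly delayed. */
static s64 pick_next(s64 leftmost, int have_next, s64 next)
{
        if (have_next && wakeup_preempt_entity(next, leftmost) != 1)
                return next;
        return leftmost;
}

int main(void)
{
        /* buddy runs: it is within the granularity of the leftmost entity */
        printf("%lld\n", (long long)pick_next(100, 1, 100 + WAKEUP_GRAN / 2));
        /* leftmost runs: the buddy has already run too far ahead */
        printf("%lld\n", (long long)pick_next(100, 1, 100 + 2 * WAKEUP_GRAN));
        return 0;
}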
Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - kernel/sched_fair.c | 10 ++++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 12bc367d924..e8819bc6f46 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -386,7 +386,6 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; - u64 pair_start; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7af17e04a6d..ce514afd78f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -791,16 +791,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + static struct sched_entity * pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rq *rq = rq_of(cfs_rq); - u64 pair_slice = rq->clock - cfs_rq->pair_start; - - if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) { - cfs_rq->pair_start = rq->clock; + if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1) return se; - } return cfs_rq->next; } -- cgit v1.2.3 From f17845e5d97ead8fbdadfd40039e058ec7cf4a42 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Oct 2008 12:47:10 +0200 Subject: ftrace: warning in kernel/trace/ftrace.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: kernel/trace/ftrace.c:189: warning: ‘frozen_record_count’ defined but not used triggers because frozen_record_count is only used in the KCONFIG_MARKERS case. Move the variable it there. Alas, this frozen-record facility seems to have little use. The frozen_record_count variable is not used by anything, nor the flags. So this section might need a bit of dead-code-removal care as well. Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 27212321eb0..7618c528756 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -186,7 +186,6 @@ enum { static int ftrace_filtered; static int tracing_on; -static int frozen_record_count; static LIST_HEAD(ftrace_new_addrs); @@ -211,6 +210,9 @@ static struct dyn_ftrace *ftrace_free_records; #ifdef CONFIG_KPROBES + +static int frozen_record_count; + static inline void freeze_record(struct dyn_ftrace *rec) { if (!(rec->flags & FTRACE_FL_FROZEN)) { @@ -1443,3 +1445,4 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_sysctl_lock); return ret; } + -- cgit v1.2.3 From e2862c9470beb842d3f1c1965b03a2112114c160 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 27 Oct 2008 17:43:28 +1100 Subject: trace: fix printk warning for u64 A powerpc ppc64_defconfig build produces these warnings: kernel/trace/ring_buffer.c: In function 'rb_add_time_stamp': kernel/trace/ring_buffer.c:969: warning: format '%llu' expects type 'long long unsigned int', but argument 2 has type 'u64' kernel/trace/ring_buffer.c:969: warning: format '%llu' expects type 'long long unsigned int', but argument 3 has type 'u64' kernel/trace/ring_buffer.c:969: warning: format '%llu' expects type 'long long unsigned int', but argument 4 has type 'u64' Just cast the u64s to unsigned long long like we do everywhere else. 
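The same portability rule applies outside the kernel: uint64_t may be long or long long depending on the platform, so an explicit cast keeps %llu honest. A minimal stand-alone example:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t delta = 1ULL << 60;

        /* Cast to the type %llu actually expects instead of guessing the typedef. */
        printf("delta = %llu\n", (unsigned long long)delta);
        return 0;
}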
Signed-off-by: Stephen Rothwell Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 091aeefe321..cedf4e26828 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -966,7 +966,9 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(*delta > (1ULL << 59) && !once++)) { printk(KERN_WARNING "Delta way too big! %llu" " ts=%llu write stamp = %llu\n", - *delta, *ts, cpu_buffer->write_stamp); + (unsigned long long)*delta, + (unsigned long long)*ts, + (unsigned long long)cpu_buffer->write_stamp); WARN_ON(1); } -- cgit v1.2.3 From f66af459a931f25807e1df7915b2b66bb5978d82 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 22 Oct 2008 19:14:55 +0200 Subject: tracepoint: check if the probe has been registered Impact: fix kernel crash that can trigger during tracing If we try to remove a probe that has not been already registered, the tracepoint_entry_remove_probe() function will dereference a NULL pointer. Check the probe before removing it to avoid crashes. Signed-off-by: Frederic Weisbecker Acked-by: Mathieu Desnoyers Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f2b7c28a470..af8c8566488 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -131,6 +131,9 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) old = entry->funcs; + if (!old) + return NULL; + debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ for (nr_probes = 0; old[nr_probes]; nr_probes++) { @@ -388,6 +391,11 @@ int tracepoint_probe_unregister(const char *name, void *probe) if (entry->rcu_pending) rcu_barrier_sched(); old = tracepoint_entry_remove_probe(entry, probe); + if (!old) { + printk(KERN_WARNING "Warning: Trying to unregister a probe" + "that doesn't exist\n"); + goto end; + } mutex_unlock(&tracepoints_mutex); tracepoint_update_probes(); /* may update entry */ mutex_lock(&tracepoints_mutex); -- cgit v1.2.3 From ea31e72d753e5817a97de552f152d0cb55c7defc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 22 Oct 2008 19:26:23 +0200 Subject: tracing/ftrace: make boot tracer select the sched_switch tracer Impact: build fix If the boot tracer is selected but not the sched_switch, there will be a build failure: kernel/built-in.o: In function `boot_trace_init': trace_boot.c:(.text+0x5ee38): undefined reference to `sched_switch_trace' kernel/built-in.o: In function `disable_boot_trace': (.text+0x5eee1): undefined reference to `tracing_stop_cmdline_record' kernel/built-in.o: In function `enable_boot_trace': (.text+0x5ef11): undefined reference to `tracing_start_cmdline_record' This patch fixes it. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bc535cb91de..e0cea282e0c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -123,6 +123,7 @@ config BOOT_TRACER bool "Trace boot initcalls" depends on DEBUG_KERNEL select TRACING + select CONTEXT_SWITCH_TRACER help This tracer helps developers to optimize boot times: it records the timings of the initcalls and traces key events and the identity -- cgit v1.2.3 From 21798a84ab383cdac0e7ee3368e0792b718b867d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 28 Oct 2008 09:43:26 +0100 Subject: tracing: fix a build error on alpha Impact: build fix on Alpha When tracing is enabled, some arch have included on their but others like alpha or m68k don't. Build error on alpha: kernel/trace/trace.c: In function 'tracing_cpumask_write': kernel/trace/trace.c:2145: error: implicit declaration of function 'raw_local_irq_disable' kernel/trace/trace.c:2162: error: implicit declaration of function 'raw_local_irq_enable' Tested on Alpha through a cross-compiler (should correct a similar issue on m68k). Reported-by: Alexey Dobriyan Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06951e22944..bc577dcc0e4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -34,6 +34,7 @@ #include #include +#include #include "trace.h" -- cgit v1.2.3 From 46fec7ac40e452a2ea5e63648d98b6bb2b5898f9 Mon Sep 17 00:00:00 2001 From: qinghuang feng Date: Tue, 28 Oct 2008 17:24:28 +0800 Subject: lockdep: minor fix for debug_show_all_locks() When we failed to get tasklist_lock eventually (count equals 0), we should only print " ignoring it.\n", and not print " locked it.\n" needlessly. Signed-off-by: Qinghuang Feng Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index dbda475b13b..11832acdde7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3417,9 +3417,10 @@ retry: } printk(" ignoring it.\n"); unlock = 0; + } else { + if (count != 10) + printk(KERN_CONT " locked it.\n"); } - if (count != 10) - printk(" locked it.\n"); do_each_thread(g, p) { /* -- cgit v1.2.3 From 6afe40b4dace385d7ba2faf24b352f066f3b71bf Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 28 Oct 2008 11:14:58 +0100 Subject: lockdep: fix irqs on/off ip tracing Impact: fix lockdep lock-api-caller output when irqsoff tracing is enabled 81d68a96 "ftrace: trace irq disabled critical timings" added wrappers around trace_hardirqs_on/off_caller. However these functions use __builtin_return_address(0) to figure out which function actually disabled or enabled irqs. The result is that we save the ips of trace_hardirqs_on/off instead of the real caller. Not very helpful. However since the patch from Steven the ip already gets passed. So use that and get rid of __builtin_return_address(0) in these two functions. 
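The underlying pitfall is easy to reproduce in user space: inside a helper, __builtin_return_address(0) only points back into the immediate caller (the wrapper), whereas an ip forwarded by the wrapper identifies the real call site. A GCC-specific illustrative sketch, not the lockdep code:

#include <stdio.h>

static void __attribute__((noinline)) record_caller(unsigned long ip)
{
        unsigned long immediate = (unsigned long)__builtin_return_address(0);

        printf("passed-in ip:   %#lx\n", ip);        /* the real call site */
        printf("return address: %#lx\n", immediate); /* points into the wrapper */
}

static void __attribute__((noinline)) wrapper(void)
{
        /* Forward our own return address, i.e. the address of wrapper()'s caller. */
        record_caller((unsigned long)__builtin_return_address(0));
}

int main(void)
{
        wrapper();
        return 0;
}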
Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 11832acdde7..06e157119d2 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2169,12 +2169,11 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on_caller(unsigned long a0) +void trace_hardirqs_on_caller(unsigned long ip) { struct task_struct *curr = current; - unsigned long ip; - time_hardirqs_on(CALLER_ADDR0, a0); + time_hardirqs_on(CALLER_ADDR0, ip); if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2188,7 +2187,6 @@ void trace_hardirqs_on_caller(unsigned long a0) } /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; - ip = (unsigned long) __builtin_return_address(0); if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2224,11 +2222,11 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off_caller(unsigned long a0) +void trace_hardirqs_off_caller(unsigned long ip) { struct task_struct *curr = current; - time_hardirqs_off(CALLER_ADDR0, a0); + time_hardirqs_off(CALLER_ADDR0, ip); if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2241,7 +2239,7 @@ void trace_hardirqs_off_caller(unsigned long a0) * We have done an ON -> OFF transition: */ curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = _RET_IP_; + curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_off_events); } else -- cgit v1.2.3 From 60063a66236c15f5613f91390631e06718689782 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2008 10:44:24 -0400 Subject: ftrace: fix current_tracer error return The commit (in linux-tip) c2931e05ec5965597cbfb79ad332d4a29aeceb23 ( ftrace: return an error when setting a nonexistent tracer ) added useful code that would error when a bad tracer was written into the current_tracer file. But this had a bug if the amount written was more than the amount read by that code. The first iteration would set the tracer correctly, but since it did not consume the rest of what was written (usually whitespace), the userspace utility would continue to write what was not consumed. This second iteration would fail to find a tracer and return -EINVAL. Funny thing is that the tracer would have already been set. This patch just consumes all the data that is written to the file. 
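The contract being restored is generic: a write handler that has logically handled the request should report the whole count as consumed, even if it only parsed a prefix, or the writer will loop and resubmit the leftover bytes. A stand-alone sketch of that contract with a made-up handler, not the trace code:

#include <stdio.h>
#include <string.h>

#define NAME_MAX_LEN 16

/* Parses at most NAME_MAX_LEN bytes but reports the whole count as consumed,
 * so the caller does not resubmit trailing whitespace. */
static long set_name_write(const char *buf, size_t cnt, char *name_out)
{
        size_t ret = cnt;            /* report everything as consumed */
        size_t copy = cnt;

        if (copy > NAME_MAX_LEN)
                copy = NAME_MAX_LEN; /* only this much is actually parsed */

        memcpy(name_out, buf, copy);
        name_out[copy] = '\0';

        /* strip trailing newline/space from the parsed part */
        while (copy && (name_out[copy - 1] == '\n' || name_out[copy - 1] == ' '))
                name_out[--copy] = '\0';

        return (long)ret;
}

int main(void)
{
        char name[NAME_MAX_LEN + 1];
        const char *input = "nop\n";
        long consumed = set_name_write(input, strlen(input), name);

        printf("parsed '%s', consumed %ld of %zu bytes\n",
               name, consumed, strlen(input));
        return 0;
}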
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bc577dcc0e4..a610ca77155 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2377,9 +2377,10 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, int i; size_t ret; + ret = cnt; + if (cnt > max_tracer_type_len) cnt = max_tracer_type_len; - ret = cnt; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -2412,8 +2413,8 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&trace_types_lock); - if (ret == cnt) - filp->f_pos += cnt; + if (ret > 0) + filp->f_pos += ret; return ret; } -- cgit v1.2.3 From 0b6e4d56bf71866a2b58daa8323cf747988ce7e4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 28 Oct 2008 20:17:38 +0100 Subject: ftrace: perform an initialization for ftrace to enable it Impact: corrects a bug which made the non-dyn function tracer not functional With latest git, the non-dynamic function tracer didn't get any trace. The problem was the fact that ftrace_enabled wasn't initialized to 1 because ftrace hasn't any init function when DYNAMIC_FTRACE is disabled. So when a tracer tries to register an ftrace_ops struct, __register_ftrace_function failed to set the hook. This patch corrects it by setting an init function to initialize ftrace during the boot. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7618c528756..4a39d24568c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1339,6 +1339,14 @@ void __init ftrace_init(void) } #else + +static int __init ftrace_nodyn_init(void) +{ + ftrace_enabled = 1; + return 0; +} +device_initcall(ftrace_nodyn_init); + # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) # define ftrace_startup_sysctl() do { } while (0) -- cgit v1.2.3 From d68612b257b5f4ea2e6535859c5a26b10011a9df Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 28 Oct 2008 11:45:42 -0700 Subject: resources: fix x86info results ioremap.c:226 __ioremap_caller+0xf2/0x2d6() WARNINGs Impact: avoid false-positive WARN_ON() Andi Kleen reported: > When running x86info on a 2.6.27-git8 system I get > > resource map sanity check conflict: 0x9e000 0x9efff 0x10000 0x9e7ff System RAM > ------------[ cut here ]------------ > WARNING: at /home/lsrc/linux/arch/x86/mm/ioremap.c:226 __ioremap_caller+0xf2/0x2d6() > ... Some of the pages below the 1MB ISA addresses will be shared typically by both BIOS and system usable RAM. For example: BIOS-e820: 0000000000000000 - 000000000009f800 (usable) BIOS-e820: 000000000009f800 - 00000000000a0000 (reserved) x86info reads the low physical address using /dev/mem, which internally uses ioremap() for accessing non RAM pages. ioremap() of such low pages conflicts with multiple resource entities leading to the above warning. Change the iomem_map_sanity_check() to allow mapping a page spanning multiple resource entities (minimum granularity that one can map is a page anyhow). 
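The essence of the change is to compare page frame numbers rather than byte addresses, so a mapping that byte-wise sticks out past a resource boundary inside the same page still counts as contained. A stand-alone sketch assuming 4 KiB pages, not the kernel implementation:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12                      /* assume 4 KiB pages */
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

struct range { uint64_t start, end; };     /* inclusive, like struct resource */

/* PFN-granular containment test: contained if the mapped pages are covered. */
static int contains_by_pfn(struct range res, uint64_t addr, uint64_t size)
{
        return PFN_DOWN(res.start) <= PFN_DOWN(addr) &&
               PFN_DOWN(res.end) >= PFN_DOWN(addr + size - 1);
}

int main(void)
{
        /* RAM resource ends mid-page at 0x9e7ff; the mapping covers 0x9e000..0x9efff. */
        struct range ram = { 0x10000, 0x9e7ff };
        uint64_t addr = 0x9e000, size = 0x1000;

        /* Byte-wise the mapping extends past ram.end, but it stays within the
         * same page frame, so the PFN check accepts it (prints 1). */
        printf("contained: %d\n", contains_by_pfn(ram, addr, size));
        return 0;
}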
Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/resource.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 7fec0e42723..6aac5c60b25 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -849,7 +850,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) continue; if (p->end < addr) continue; - if (p->start <= addr && (p->end >= addr + size - 1)) + if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && + PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) continue; printk(KERN_WARNING "resource map sanity check conflict: " "0x%llx 0x%llx 0x%llx 0x%llx %s\n", -- cgit v1.2.3 From f3384b28a05624783b53836ccfed95ecde66a7ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Oct 2008 11:15:57 -0400 Subject: ftrace: fix trace_nop config select Impact: build fix on non-function-tracing architectures The trace_nop is the tracer that is defined when no tracer is set in the ftrace infrastructure. The trace_nop was mistakenly selected by HAVE_FTRACE due to the confusion between ftrace infrastructure and the ftrace function tracer (which has been solved by renaming the function tracer). This patch changes the select to the appropriate TRACING. This patch should fix compile errors on architectures that do not define the FUNCTION_TRACER. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e0cea282e0c..b58f43bec36 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -8,7 +8,6 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool - select NOP_TRACER config HAVE_DYNAMIC_FTRACE bool @@ -28,6 +27,7 @@ config TRACING select RING_BUFFER select STACKTRACE select TRACEPOINTS + select NOP_TRACER menu "Tracers" -- cgit v1.2.3 From a9cf4ddb3b2ce03c3027929b22a920aeff933009 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 30 Oct 2008 15:23:34 +0800 Subject: sched: change sched_debug's mode to 0444 Impact: change /proc/sched_debug from rw-r--r-- to r--r--r-- /proc/sched_debug is read-only. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ad958c1ec70..5ae17762ec3 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -319,7 +319,7 @@ static int __init init_sched_debug_procfs(void) { struct proc_dir_entry *pe; - pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops); + pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops); if (!pe) return -ENOMEM; return 0; -- cgit v1.2.3 From 7ccb97437bcc818d0ba6067513475f6ee8177a15 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 14:00:51 -0700 Subject: freezer_cg: fix improper BUG_ON() causing oops The BUG_ON() should be protected by freezer->lock, otherwise it can be triggered easily when a task has been unfrozen but the corresponding cgroup hasn't been changed to FROZEN state.
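The general pattern behind the fix: sample and assert shared state only while holding the lock that guards its transitions, otherwise the assertion can race with a legal state change. A small pthread-based sketch of that pattern (illustrative names only, compile with -pthread):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

enum state { THAWED, FREEZING, FROZEN };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static enum state shared_state = FREEZING;

static void on_fork_event(void)
{
        /* Asserting shared_state != FROZEN before taking the lock could race
         * with a concurrent FREEZING -> FROZEN transition made under the lock. */
        pthread_mutex_lock(&lock);
        assert(shared_state != FROZEN);   /* now checked against a stable value */
        if (shared_state == FREEZING)
                printf("freezing the new task too\n");
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        on_fork_event();
        return 0;
}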
Signed-off-by: Li Zefan Acked-by: Cedric Le Goater Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e9505695449..7f54d1c4295 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -190,8 +190,9 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) freezer = task_freezer(task); task_unlock(task); - BUG_ON(freezer->state == CGROUP_FROZEN); spin_lock_irq(&freezer->lock); + BUG_ON(freezer->state == CGROUP_FROZEN); + /* Locking avoids race with FREEZING -> THAWED transitions. */ if (freezer->state == CGROUP_FREEZING) freeze_task(task, true); -- cgit v1.2.3 From 80a6a2cf3bebcf20285cf05373b9c5ec96816577 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 14:00:52 -0700 Subject: freezer_cg: remove redundant check in freezer_can_attach() It is sufficient to check if @task is frozen, and no need to check if the original freezer is frozen. Signed-off-by: Li Zefan Acked-by: Cedric Le Goater Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 7f54d1c4295..e9c856a265c 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -162,9 +162,13 @@ static int freezer_can_attach(struct cgroup_subsys *ss, struct task_struct *task) { struct freezer *freezer; - int retval; - /* Anything frozen can't move or be moved to/from */ + /* + * Anything frozen can't move or be moved to/from. + * + * Since orig_freezer->state == FROZEN means that @task has been + * frozen, so it's sufficient to check the latter condition. + */ if (is_task_frozen_enough(task)) return -EBUSY; @@ -173,13 +177,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state == CGROUP_FROZEN) return -EBUSY; - retval = 0; - task_lock(task); - freezer = task_freezer(task); - if (freezer->state == CGROUP_FROZEN) - retval = -EBUSY; - task_unlock(task); - return retval; + return 0; } static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) -- cgit v1.2.3 From 00c2e63c31d0f431952ff2a671c5c6997dd4f8b2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 14:00:53 -0700 Subject: freezer_cg: use thaw_process() in unfreeze_cgroup() Don't duplicate the implementation of thaw_process(). [akpm@linux-foundation.org: make __thaw_process() static] Signed-off-by: Li Zefan Cc: Cedric Le Goater Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 15 ++++----------- kernel/freezer.c | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e9c856a265c..5e6d26b66e8 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -275,25 +275,18 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) return num_cant_freeze_now ? 
-EBUSY : 0; } -static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { - int do_wake; - - task_lock(task); - do_wake = __thaw_process(task); - task_unlock(task); - if (do_wake) - wake_up_process(task); + thaw_process(task); } cgroup_iter_end(cgroup, &it); - freezer->state = CGROUP_THAWED; - return 0; + freezer->state = CGROUP_THAWED; } static int freezer_change_state(struct cgroup *cgroup, @@ -320,7 +313,7 @@ static int freezer_change_state(struct cgroup *cgroup, } /* state == FREEZING and goal_state == THAWED, so unfreeze */ case CGROUP_FROZEN: - retval = unfreeze_cgroup(cgroup, freezer); + unfreeze_cgroup(cgroup, freezer); break; default: break; diff --git a/kernel/freezer.c b/kernel/freezer.c index ba6248b323e..2f4936cf708 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -121,16 +121,7 @@ void cancel_freezing(struct task_struct *p) } } -/* - * Wake up a frozen process - * - * task_lock() is needed to prevent the race with refrigerator() which may - * occur if the freezing of tasks fails. Namely, without the lock, if the - * freezing of tasks failed, thaw_tasks() might have run before a task in - * refrigerator() could call frozen_process(), in which case the task would be - * frozen and no one would thaw it. - */ -int __thaw_process(struct task_struct *p) +static int __thaw_process(struct task_struct *p) { if (frozen(p)) { p->flags &= ~PF_FROZEN; @@ -140,6 +131,15 @@ int __thaw_process(struct task_struct *p) return 0; } +/* + * Wake up a frozen process + * + * task_lock() is needed to prevent the race with refrigerator() which may + * occur if the freezing of tasks fails. Namely, without the lock, if the + * freezing of tasks failed, thaw_tasks() might have run before a task in + * refrigerator() could call frozen_process(), in which case the task would be + * frozen and no one would thaw it. + */ int thaw_process(struct task_struct *p) { task_lock(p); -- cgit v1.2.3 From 51308ee59dee1136ed599d875ea8968d7be55c91 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 14:00:54 -0700 Subject: freezer_cg: simplify freezer_change_state() Just call unfreeze_cgroup() if goal_state == THAWED, and call try_to_freeze_cgroup() if goal_state == FROZEN. No behavior has been changed. 
Signed-off-by: Li Zefan Acked-by: Cedric Le Goater Acked-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 5e6d26b66e8..7fa476f01d0 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -296,27 +296,22 @@ static int freezer_change_state(struct cgroup *cgroup, int retval = 0; freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + update_freezer_state(cgroup, freezer); if (goal_state == freezer->state) goto out; - switch (freezer->state) { + + switch (goal_state) { case CGROUP_THAWED: - retval = try_to_freeze_cgroup(cgroup, freezer); + unfreeze_cgroup(cgroup, freezer); break; - case CGROUP_FREEZING: - if (goal_state == CGROUP_FROZEN) { - /* Userspace is retrying after - * "/bin/echo FROZEN > freezer.state" returned -EBUSY */ - retval = try_to_freeze_cgroup(cgroup, freezer); - break; - } - /* state == FREEZING and goal_state == THAWED, so unfreeze */ case CGROUP_FROZEN: - unfreeze_cgroup(cgroup, freezer); + retval = try_to_freeze_cgroup(cgroup, freezer); break; default: - break; + BUG(); } out: spin_unlock_irq(&freezer->lock); -- cgit v1.2.3 From ce05fcc30ea41c85f9d50bee1ce289f7cb7fb223 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 29 Oct 2008 14:01:07 -0700 Subject: kernel/profile: fix profile_init() section mismatch profile_init() calls in to alloc_bootmem() on early initialization. While alloc_bootmem() is __init, the reference itself is safe in that it is tucked below a !slab_is_available() check. So, flag profile_init() as __ref. Signed-off-by: Paul Mundt Cc: Dave Hansen Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index a9e422df6bf..9830a037d8d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -102,7 +102,7 @@ int profile_setup(char *str) __setup("profile=", profile_setup); -int profile_init(void) +int __ref profile_init(void) { int buffer_bytes; if (!prof_on) -- cgit v1.2.3 From d25141a818383b3c3b09f065698c544a7a0ec6e7 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 29 Oct 2008 14:01:11 -0700 Subject: 'kill sig -1' must only apply to caller's namespace Currently "kill -1" kills processes in all namespaces and breaks the isolation of namespaces. Earlier attempt to fix this was discussed at: http://lkml.org/lkml/2008/7/23/148 As suggested by Oleg Nesterov in that thread, use "task_pid_vnr() > 1" check since task_pid_vnr() returns 0 if process is outside the caller's namespace. Signed-off-by: Sukadev Bhattiprolu Acked-by: Eric W. 
Biederman Tested-by: Daniel Hokka Zakrisson Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 105217da5c8..4530fc65445 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1144,7 +1144,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) struct task_struct * p; for_each_process(p) { - if (p->pid > 1 && !same_thread_group(p, current)) { + if (task_pid_vnr(p) > 1 && + !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p); ++count; if (err != -EPERM) -- cgit v1.2.3 From 9244489a7b69fe0746dc7cb3957f02e05bd1ceb0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Oct 2008 09:42:59 -0400 Subject: ftrace: handle archs that do not support irqs_disabled_flags Impact: build fix on non-lockdep architectures Some architectures do not support a way to read the irq flags that is set from "local_irq_save(flags)" to determine if interrupts were disabled or enabled. Ftrace uses this information to display to the user if the trace occurred with interrupts enabled or disabled. Besides the fact that those archs that do not support this will fail to compile, unless they fix it, we do not want to have the trace simply say interrupts were not disabled or they were enabled, without knowing the real answer. This patch adds a 'X' in the output to let the user know that the architecture they are running on does not support a way for the tracer to determine if interrupts were enabled or disabled. It also lets those same archs compile with tracing enabled. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 7 ++++++- kernel/trace/trace.h | 20 +++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a610ca77155..8a499e2adae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -656,7 +656,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; entry->flags = +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | +#else + TRACE_FLAG_IRQS_NOSUPPORT | +#endif ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); @@ -1244,7 +1248,8 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); trace_seq_printf(s, "%3d", cpu); trace_seq_printf(s, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6889ca48f1f..8465ad05270 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -120,18 +120,20 @@ struct trace_boot { /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. 
These are: - * IRQS_OFF - interrupts were disabled - * NEED_RESCED - reschedule is requested - * HARDIRQ - inside an interrupt handler - * SOFTIRQ - inside a softirq handler - * CONT - multiple entries hold the trace item + * IRQS_OFF - interrupts were disabled + * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags + * NEED_RESCED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + * CONT - multiple entries hold the trace item */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_NEED_RESCHED = 0x02, - TRACE_FLAG_HARDIRQ = 0x04, - TRACE_FLAG_SOFTIRQ = 0x08, - TRACE_FLAG_CONT = 0x10, + TRACE_FLAG_IRQS_NOSUPPORT = 0x02, + TRACE_FLAG_NEED_RESCHED = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_CONT = 0x20, }; #define TRACE_BUF_SIZE 1024 -- cgit v1.2.3 From 42c0202363194007a1ac377d047a95aa39246eb0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 1 Nov 2008 09:53:58 -0700 Subject: reserve_region_with_split: Fix GFP_KERNEL usage under spinlock This one apparently doesn't generate any warnings, because the function is only used during system bootup, when the warnings are disabled. But it's still very wrong. The __reserve_region_with_split() function is called with the resource_lock held for writing, so it must only ever do GFP_ATOMIC allocations. Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 6aac5c60b25..4337063663e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -523,7 +523,7 @@ static void __init __reserve_region_with_split(struct resource *root, { struct resource *parent = root; struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); if (!res) return; -- cgit v1.2.3 From 28959742c14918f153c1de641bb12b4ea44315a1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Nov 2008 18:20:09 +0000 Subject: PM_TEST_SUSPEND should depend on RTC_CLASS, not RTC_LIB Insufficient dependency - we really want CONFIG_RTC_CLASS=y there. That will give us CONFIG_RTC_LIB=y, so the old dependency can be simply replaced. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index dcd165f92a8..23bd4daeb96 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -96,7 +96,7 @@ config SUSPEND config PM_TEST_SUSPEND bool "Test suspend/resume and wakealarm during bootup" - depends on SUSPEND && PM_DEBUG && RTC_LIB=y + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y ---help--- This option will let you suspend your machine during bootup, and make it wake up a few seconds later using an RTC wakeup alarm. -- cgit v1.2.3 From c2c80529460095035752bf0ecc1af82c1e0f6e0f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 31 Oct 2008 19:50:41 +0000 Subject: tracing, alpha: undefined reference to `save_stack_trace' Impact: build fix on !stacktrace architectures only select STACKTRACE on architectures that have STACKTRACE_SUPPORT ... since we also need to ifdef out the guts of ftrace_trace_stack(). We also want to disallow setting TRACE_ITER_STACKTRACE in trace_flags on such configs, but that can wait. 
Signed-off-by: Al Viro Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 2 +- kernel/trace/trace.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b58f43bec36..33dbefd471e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -25,7 +25,7 @@ config TRACING bool select DEBUG_FS select RING_BUFFER - select STACKTRACE + select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS select NOP_TRACER diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a499e2adae..85bee775a03 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -705,6 +705,7 @@ static void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { +#ifdef CONFIG_STACKTRACE struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; @@ -730,6 +731,7 @@ static void ftrace_trace_stack(struct trace_array *tr, save_stack_trace(&trace); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +#endif } void __trace_stack(struct trace_array *tr, -- cgit v1.2.3 From b3aa557722b3d5858f14ca559e03461c24125aaf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 15:44:07 -0400 Subject: ftrace: use kretprobe trampoline name to test in output Impact: ia64+tracing build fix When a function is kprobed, the return address is set to the kprobe_trampoline, or something similar. This caused the output of the trace to look confusing when the parent seemed to be this "kprobe_trampoline" function. To fix this, Abhishek Sagar added a test of the instruction pointer of the parent to see if it matched the kprobe_trampoline. If it did, the output would print a "[unknown/kretprobe'd]" instead. Unfortunately, not all archs do this the same way, and the trampoline function may not be exported, which causes failures in builds. This patch will compare the name instead of the pointer to see if it matches. This prevents us from depending on a function from being exported, and should work on all archs. The worst that can happen is that an arch might use a different name and then we go back to the confusing output. At least the arch will still build. 
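A stand-alone sketch of the name-based test (simplified, using strcmp rather than the patch's exact comparison):

#include <stdio.h>
#include <string.h>

/* If a resolved symbol name is the kretprobe trampoline, substitute a label. */
static const char *kretprobed(const char *name)
{
        static const char tramp_name[] = "kretprobe_trampoline";

        if (strcmp(name, tramp_name) == 0)
                return "[unknown/kretprobe'd]";
        return name;
}

int main(void)
{
        printf("%s\n", kretprobed("schedule"));
        printf("%s\n", kretprobed("kretprobe_trampoline"));
        return 0;
}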
Reported-by: Abhishek Sagar Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Tested-by: Abhishek Sagar Acked-by: Abhishek Sagar --- kernel/trace/trace.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 85bee775a03..9f3b478f917 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1088,17 +1088,20 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&trace_types_lock); } -#define KRETPROBE_MSG "[unknown/kretprobe'd]" - #ifdef CONFIG_KRETPROBES -static inline int kretprobed(unsigned long addr) +static inline const char *kretprobed(const char *name) { - return addr == (unsigned long)kretprobe_trampoline; + static const char tramp_name[] = "kretprobe_trampoline"; + int size = sizeof(tramp_name); + + if (strncmp(tramp_name, name, size) == 0) + return "[unknown/kretprobe'd]"; + return name; } #else -static inline int kretprobed(unsigned long addr) +static inline const char *kretprobed(const char *name) { - return 0; + return name; } #endif /* CONFIG_KRETPROBES */ @@ -1107,10 +1110,13 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; + const char *name; kallsyms_lookup(address, NULL, NULL, NULL, str); - return trace_seq_printf(s, fmt, str); + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); #endif return 1; } @@ -1121,9 +1127,12 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; + const char *name; sprint_symbol(str, address); - return trace_seq_printf(s, fmt, str); + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); #endif return 1; } @@ -1377,10 +1386,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_puts(s, " ("); - if (kretprobed(field->parent_ip)) - trace_seq_puts(s, KRETPROBE_MSG); - else - seq_print_ip_sym(s, field->parent_ip, sym_flags); + seq_print_ip_sym(s, field->parent_ip, sym_flags); trace_seq_puts(s, ")\n"); break; } @@ -1496,12 +1502,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) ret = trace_seq_printf(s, " <-"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (kretprobed(field->parent_ip)) - ret = trace_seq_puts(s, KRETPROBE_MSG); - else - ret = seq_print_ip_sym(s, - field->parent_ip, - sym_flags); + ret = seq_print_ip_sym(s, + field->parent_ip, + sym_flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } -- cgit v1.2.3 From 818e3dd30a4ff34fff6d90e87ae59c73f6a53691 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 09:58:35 -0400 Subject: tracing, ring-buffer: add paranoid checks for loops While writing a new tracer, I had a bug where I caused the ring-buffer to recurse in a bad way. The bug was with the tracer I was writing and not the ring-buffer itself. But it took a long time to find the problem. This patch adds paranoid checks into the ring-buffer infrastructure that will catch bugs of this nature. Note: I put the bug back in the tracer and this patch showed the error nicely and prevented the lockup. 
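The same defensive pattern applies to any retry loop that is expected to converge within a handful of iterations: count the attempts and bail out loudly instead of spinning forever. A stand-alone sketch (invented helper, not the ring-buffer code):

#include <stdio.h>

#define MAX_RETRIES 1000

/* Stand-in for work that may need retrying; succeeds on the third attempt
 * to keep the example deterministic. */
static int do_attempt(int n)
{
        return n >= 3;
}

static int retry_until_done(void)
{
        int nr_loops = 0;

        for (;;) {
                if (++nr_loops > MAX_RETRIES) {
                        /* analogous to RB_WARN_ON() followed by bailing out */
                        fprintf(stderr, "warning: retry loop did not converge\n");
                        return -1;
                }
                if (do_attempt(nr_loops))
                        return nr_loops;
        }
}

int main(void)
{
        printf("succeeded after %d attempts\n", retry_until_done());
        return 0;
}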
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cedf4e26828..3f338063864 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1022,8 +1022,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; u64 ts, delta; int commit = 0; + int nr_loops = 0; again: + /* + * We allow for interrupts to reenter here and do a trace. + * If one does, it will cause this original code to loop + * back here. Even with heavy interrupts happening, this + * should only happen a few times in a row. If this happens + * 1000 times in a row, there must be either an interrupt + * storm or we have something buggy. + * Bail! + */ + if (unlikely(++nr_loops > 1000)) { + RB_WARN_ON(cpu_buffer, 1); + return NULL; + } + ts = ring_buffer_time_stamp(cpu_buffer->cpu); /* @@ -1532,10 +1547,23 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long flags; + int nr_loops = 0; spin_lock_irqsave(&cpu_buffer->lock, flags); again: + /* + * This should normally only loop twice. But because the + * start of the reader inserts an empty page, it causes + * a case where we will loop three times. There should be no + * reason to loop four times (that I know of). + */ + if (unlikely(++nr_loops > 3)) { + RB_WARN_ON(cpu_buffer, 1); + reader = NULL; + goto out; + } + reader = cpu_buffer->reader_page; /* If there's more to read, return this page */ @@ -1665,6 +1693,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; struct buffer_page *reader; + int nr_loops = 0; if (!cpu_isset(cpu, buffer->cpumask)) return NULL; @@ -1672,6 +1701,19 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) cpu_buffer = buffer->buffers[cpu]; again: + /* + * We repeat when a timestamp is encountered. It is possible + * to get multiple timestamps from an interrupt entering just + * as one timestamp is about to be written. The max times + * that this can happen is the number of nested interrupts we + * can have. Nesting 10 deep of interrupts is clearly + * an anomaly. + */ + if (unlikely(++nr_loops > 10)) { + RB_WARN_ON(cpu_buffer, 1); + return NULL; + } + reader = rb_get_reader_page(cpu_buffer); if (!reader) return NULL; @@ -1722,6 +1764,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; + int nr_loops = 0; if (ring_buffer_iter_empty(iter)) return NULL; @@ -1730,6 +1773,19 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) buffer = cpu_buffer->buffer; again: + /* + * We repeat when a timestamp is encountered. It is possible + * to get multiple timestamps from an interrupt entering just + * as one timestamp is about to be written. The max times + * that this can happen is the number of nested interrupts we + * can have. Nesting 10 deep of interrupts is clearly + * an anomaly. 
+ */ + if (unlikely(++nr_loops > 10)) { + RB_WARN_ON(cpu_buffer, 1); + return NULL; + } + if (rb_per_cpu_empty(cpu_buffer)) return NULL; -- cgit v1.2.3 From 072ba49838b42c873c496d72c91bb237914cf3b6 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sun, 26 Oct 2008 15:26:57 -0700 Subject: ftrace: fix breakage in bin_fmt results In 777e208d40d0953efc6fb4ab58590da3f7d8f02d we changed from outputting field->cpu (a char) to iter->cpu (unsigned int), increasing the resulting structure size by 3 bytes. Signed-off-by: Eric Anholt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3b478f917..974973e39e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1755,7 +1755,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, iter->cpu); + SEQ_PUT_FIELD_RET(s, entry->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); switch (entry->type) { -- cgit v1.2.3 From f4b6755fb37595da3630d1d6fc130ea6888cd48f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Nov 2008 21:25:07 +0100 Subject: sched: cleanup fair task selection Impact: cleanup Clean up task selection Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ce514afd78f..6167336a237 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -347,17 +347,17 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rb_leftmost; -} - static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) { - return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); + struct rb_node *left = cfs_rq->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct sched_entity, run_node); } -static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -794,28 +794,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); -static struct sched_entity * -pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) +static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { + struct sched_entity *se = __pick_next_entity(cfs_rq); + if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1) return se; return cfs_rq->next; } -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = NULL; - - if (first_fair(cfs_rq)) { - se = __pick_next_entity(cfs_rq); - se = pick_next(cfs_rq, se); - set_next_entity(cfs_rq, se); - } - - return se; -} - static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* @@ -1396,6 +1384,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) do { se = pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.2.3 From d95f98d0691d3aba5e35850011946a08c9b36428 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 
4 Nov 2008 21:25:08 +0100 Subject: sched: fix fair preempt check Impact: fix cross-class preemption Inter-class wakeup preemptions should go on class order. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6167336a237..ebd6de8d17f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1329,6 +1329,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; } + if (unlikely(p->sched_class != &fair_sched_class)) + return; + if (unlikely(se == pse)) return; -- cgit v1.2.3 From 4793241be408b3926ee00c704d7da3b3faf3a05f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Nov 2008 21:25:09 +0100 Subject: sched: backward looking buddy Impact: improve/change/fix wakeup-buddy scheduling Currently we only have a forward looking buddy, that is, we prefer to schedule to the task we last woke up, under the presumption that its going to consume the data we just produced, and therefore will have cache hot benefits. This allows co-waking producer/consumer task pairs to run ahead of the pack for a little while, keeping their cache warm. Without this, we would interleave all pairs, utterly trashing the cache. This patch introduces a backward looking buddy, that is, suppose that in the above scenario, the consumer preempts the producer before it can go to sleep, we will therefore miss the wakeup from consumer to producer (its already running, after all), breaking the cycle and reverting to the cache-trashing interleaved schedule pattern. The backward buddy will try to schedule back to the task that woke us up in case the forward buddy is not available, under the assumption that the last task will be the one with the most cache hot task around barring current. This will basically allow a task to continue after it got preempted. In order to avoid starvation, we allow either buddy to get wakeup_gran ahead of the pack. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- kernel/sched_fair.c | 32 +++++++++++++++++++++++++------- kernel/sched_features.h | 1 + 3 files changed, 30 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc6f46..82cc839c921 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -397,7 +397,7 @@ struct cfs_rq { * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). 
*/ - struct sched_entity *curr, *next; + struct sched_entity *curr, *next, *last; unsigned long nr_spread_over; @@ -1805,7 +1805,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) + if (sched_feat(CACHE_HOT_BUDDY) && + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) return 1; if (p->sched_class != &fair_sched_class) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ebd6de8d17f..a6b1db8a0bd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -341,9 +341,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->rb_leftmost = next_node; } - if (cfs_rq->next == se) - cfs_rq->next = NULL; - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } @@ -741,6 +738,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) #endif } + if (cfs_rq->last == se) + cfs_rq->last = NULL; + + if (cfs_rq->next == se) + cfs_rq->next = NULL; + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); @@ -798,10 +801,13 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); - if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1) - return se; + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) + return cfs_rq->next; - return cfs_rq->next; + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) + return cfs_rq->last; + + return se; } static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) @@ -1319,10 +1325,11 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; if (unlikely(rt_prio(p->prio))) { + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + update_rq_clock(rq); update_curr(cfs_rq); resched_task(curr); @@ -1335,6 +1342,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (unlikely(se == pse)) return; + /* + * Only set the backward buddy when the current task is still on the + * rq. This can happen when a wakeup gets interleaved with schedule on + * the ->pre_schedule() or idle_balance() point, either of which can + * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, for + * obvious reasons its a bad idea to schedule back to the idle thread. + */ + if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) + cfs_rq_of(se)->last = se; cfs_rq_of(pse)->next = pse; /* diff --git a/kernel/sched_features.h b/kernel/sched_features.h index fda01621829..da5d93b5d2c 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -12,3 +12,4 @@ SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_OVERLAP, 0) +SCHED_FEAT(LAST_BUDDY, 1) -- cgit v1.2.3 From 02479099c286894644f8e96c6bbb535ab64662fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Nov 2008 21:25:10 +0100 Subject: sched: fix buddies for group scheduling Impact: scheduling order fix for group scheduling For each level in the hierarchy, set the buddy to point to the right entity. Therefore, when we do the hierarchical schedule, we have a fair chance of ending up where we meant to. 
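As a rough userspace sketch of the per-level buddy update this patch introduces (the struct layout and names below are simplified stand-ins, not the kernel's own types):

#include <stdio.h>
#include <stddef.h>

/* Simplified stand-ins for sched_entity and cfs_rq. */
struct cfs_rq;
struct sched_entity {
	struct sched_entity *parent;   /* group entity one level up, NULL at the top */
	struct cfs_rq *cfs_rq;         /* queue this entity is enqueued on */
	const char *name;
};
struct cfs_rq {
	struct sched_entity *next;     /* forward-looking buddy */
	struct sched_entity *last;     /* backward-looking buddy */
};

/* Walk every level of the hierarchy, in the spirit of for_each_sched_entity(). */
static void set_next_buddy(struct sched_entity *se)
{
	for (; se; se = se->parent)
		se->cfs_rq->next = se;
}

int main(void)
{
	struct cfs_rq root_rq = {0}, group_rq = {0};
	struct sched_entity group = { NULL, &root_rq, "group" };
	struct sched_entity task  = { &group, &group_rq, "task" };

	/* Setting the buddy for the task also sets it for its group entity,
	 * so the hierarchical pick ends up where we meant to. */
	set_next_buddy(&task);
	printf("group_rq.next = %s, root_rq.next = %s\n",
	       group_rq.next->name, root_rq.next->name);
	return 0;
}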
Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a6b1db8a0bd..51aa3e102ac 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1319,6 +1319,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) return 0; } +static void set_last_buddy(struct sched_entity *se) +{ + for_each_sched_entity(se) + cfs_rq_of(se)->last = se; +} + +static void set_next_buddy(struct sched_entity *se) +{ + for_each_sched_entity(se) + cfs_rq_of(se)->next = se; +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1352,8 +1364,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) * obvious reasons its a bad idea to schedule back to the idle thread. */ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) - cfs_rq_of(se)->last = se; - cfs_rq_of(pse)->next = pse; + set_last_buddy(se); + set_next_buddy(pse); /* * We can come here with TIF_NEED_RESCHED already set from new task -- cgit v1.2.3 From 561920a0d2bb6d63343e83acfd784c0a77bd28d1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 30 Oct 2008 18:28:41 +0100 Subject: generic-ipi: fix the smp_mb() placement smp_mb() is needed (to make the memory operations visible globally) before sending the ipi on the sender and the receiver (on Alpha atleast) needs smp_read_barrier_depends() in the handler before reading the call_single_queue list in a lock-free fashion. On x86, x2apic mode register accesses for sending IPI's don't have serializing semantics. So the need for smp_mb() before sending the IPI becomes more critical in x2apic mode. Remove the unnecessary smp_mb() in csd_flag_wait(), as the presence of that smp_mb() doesn't mean anything on the sender, when the ipi receiver is not doing any thing special (like memory fence) after clearing the CSD_FLAG_WAIT. Signed-off-by: Suresh Siddha Signed-off-by: Jens Axboe --- kernel/smp.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index f362a855377..75c8dde58c5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data) { /* Wait for response */ do { - /* - * We need to see the flags store in the IPI handler - */ - smp_mb(); if (!(data->flags & CSD_FLAG_WAIT)) break; cpu_relax(); @@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data) list_add_tail(&data->list, &dst->list); spin_unlock_irqrestore(&dst->lock, flags); + /* + * Make the list addition visible before sending the ipi. 
+ */ + smp_mb(); + if (ipi) arch_send_call_function_single_ipi(cpu); @@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void) * Need to see other stores to list head for checking whether * list is empty without holding q->lock */ - smp_mb(); + smp_read_barrier_depends(); while (!list_empty(&q->list)) { unsigned int data_flags; @@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void) /* * See comment on outer loop */ - smp_mb(); + smp_read_barrier_depends(); } } @@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, list_add_tail_rcu(&data->csd.list, &call_function_queue); spin_unlock_irqrestore(&call_function_lock, flags); + /* + * Make the list addition visible before sending the ipi. + */ + smp_mb(); + /* Send a message to all CPUs in the map */ arch_send_call_function_ipi(mask); -- cgit v1.2.3 From 9c133c469d38043d5aadaa03f2fb840d88d1cf4f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 6 Nov 2008 08:42:48 +0100 Subject: Add round_jiffies_up and related routines This patch (as1158b) adds round_jiffies_up() and friends. These routines work like the analogous round_jiffies() functions, except that they will never round down. The new routines will be useful for timeouts where we don't care exactly when the timer expires, provided it doesn't expire too soon. Signed-off-by: Alan Stern Signed-off-by: Jens Axboe --- kernel/timer.c | 129 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 99 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 56becf373c5..dbd50fabe4c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base) tbase_get_deferrable(timer->base)); } -/** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies(unsigned long j, int cpu) +static unsigned long round_jiffies_common(unsigned long j, int cpu, + bool force_up) { int rem; unsigned long original = j; @@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu) * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. + * But never round down if @force_up is set. 
*/ - if (rem < HZ/4) /* round down */ + if (rem < HZ/4 && !force_up) /* round down */ j = j - rem; else /* round up */ j = j - rem + HZ; @@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu) return original; return j; } + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, false); +} EXPORT_SYMBOL_GPL(__round_jiffies); /** @@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies); */ unsigned long __round_jiffies_relative(unsigned long j, int cpu) { - /* - * In theory the following code can skip a jiffy in case jiffies - * increments right between the addition and the later subtraction. - * However since the entire point of this function is to use approximate - * timeouts, it's entirely ok to not handle that. - */ - return __round_jiffies(j + jiffies, cpu) - jiffies; + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, false) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_relative); @@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative); */ unsigned long round_jiffies(unsigned long j) { - return __round_jiffies(j, raw_smp_processor_id()); + return round_jiffies_common(j, raw_smp_processor_id(), false); } EXPORT_SYMBOL_GPL(round_jiffies); @@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j) } EXPORT_SYMBOL_GPL(round_jiffies_relative); +/** + * __round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, true); +} +EXPORT_SYMBOL_GPL(__round_jiffies_up); + +/** + * __round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. 
+ */ +unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, true) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); + +/** + * round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * This is the same as round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), true); +} +EXPORT_SYMBOL_GPL(round_jiffies_up); + +/** + * round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * This is the same as round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up_relative(unsigned long j) +{ + return __round_jiffies_up_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_up_relative); + static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) -- cgit v1.2.3 From 2d3854a37e8b767a51aba38ed6d22817b0631e33 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 5 Nov 2008 13:39:10 +1100 Subject: cpumask: introduce new API, without changing anything Impact: introduce new APIs We want to deprecate cpumasks on the stack, as we are headed for gynormous numbers of CPUs. Eventually, we want to head towards an undefined 'struct cpumask' so they can never be declared on stack. 1) New cpumask functions which take pointers instead of copies. (cpus_* -> cpumask_*) 2) Several new helpers to reduce requirements for temporary cpumasks (cpumask_first_and, cpumask_next_and, cpumask_any_and) 3) Helpers for declaring cpumasks on or offstack for large NR_CPUS (cpumask_var_t, alloc_cpumask_var and free_cpumask_var) 4) 'struct cpumask' for explicitness and to mark new-style code. 5) Make iterator functions stop at nr_cpu_ids (a runtime constant), not NR_CPUS for time efficiency and for smaller dynamic allocations in future. 6) cpumask_copy() so we can allocate less than a full cpumask eventually (for alloc_cpumask_var), and so we can eliminate the 'struct cpumask' definition eventually. 7) work_on_cpu() helper for doing task on a CPU, rather than saving old cpumask for current thread and manipulating it. 8) smp_call_function_many() which is smp_call_function_mask() except taking a cpumask pointer. Note that this patch simply introduces the new functions and leaves the obsolescent ones in place. This is to simplify the transition patches. 
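A rough userspace illustration of the off-stack idea behind cpumask_var_t and the *_and helpers; the bitmap layout and helper names below are assumptions made for this sketch, not the kernel API itself:

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <stdbool.h>

#define NR_CPUS 4096                                /* pretend: a huge build */
#define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))
#define MASK_LONGS ((NR_CPUS + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Off-stack mask: callers hold only a pointer, the bitmap itself is
 * heap-allocated, so nothing of O(NR_CPUS) size lands on the stack. */
typedef unsigned long *cpumask_var_t;

static bool alloc_cpumask_var_sketch(cpumask_var_t *mask)
{
	*mask = calloc(MASK_LONGS, sizeof(unsigned long));
	return *mask != NULL;
}

static void free_cpumask_var_sketch(cpumask_var_t mask)
{
	free(mask);
}

static void cpumask_set_cpu_sketch(unsigned int cpu, cpumask_var_t mask)
{
	mask[cpu / BITS_PER_LONG] |= 1UL << (cpu % BITS_PER_LONG);
}

/* In the spirit of cpumask_first_and(): first bit set in both masks, or NR_CPUS. */
static unsigned int cpumask_first_and_sketch(const unsigned long *a,
					     const unsigned long *b)
{
	for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
		if ((a[cpu / BITS_PER_LONG] & b[cpu / BITS_PER_LONG]) &
		    (1UL << (cpu % BITS_PER_LONG)))
			return cpu;
	return NR_CPUS;
}

int main(void)
{
	cpumask_var_t online, allowed;

	if (!alloc_cpumask_var_sketch(&online) ||
	    !alloc_cpumask_var_sketch(&allowed))
		return 1;

	cpumask_set_cpu_sketch(2, online);
	cpumask_set_cpu_sketch(2, allowed);
	cpumask_set_cpu_sketch(7, allowed);

	/* One helper instead of a temporary cpus_and() copy on the stack. */
	printf("first cpu in both masks: %u\n",
	       cpumask_first_and_sketch(online, allowed));

	free_cpumask_var_sketch(online);
	free_cpumask_var_sketch(allowed);
	return 0;
}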
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/cpu.c | 3 +++ kernel/workqueue.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 86d49045dae..5a732c5ef08 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { #endif }; EXPORT_SYMBOL_GPL(cpu_bit_bitmap); + +const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; +EXPORT_SYMBOL(cpu_all_bits); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f928f2a87b9..d4dc69ddebd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -970,6 +970,51 @@ undo: return ret; } +#ifdef CONFIG_SMP +struct work_for_cpu { + struct work_struct work; + long (*fn)(void *); + void *arg; + long ret; +}; + +static void do_work_for_cpu(struct work_struct *w) +{ + struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); + + wfc->ret = wfc->fn(wfc->arg); +} + +/** + * work_on_cpu - run a function in user context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function arg + * + * This will return -EINVAL in the cpu is not online, or the return value + * of @fn otherwise. + */ +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +{ + struct work_for_cpu wfc; + + INIT_WORK(&wfc.work, do_work_for_cpu); + wfc.fn = fn; + wfc.arg = arg; + get_online_cpus(); + if (unlikely(!cpu_online(cpu))) + wfc.ret = -EINVAL; + else { + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); + } + put_online_cpus(); + + return wfc.ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu); +#endif /* CONFIG_SMP */ + void __init init_workqueues(void) { cpu_populated_map = cpu_online_map; -- cgit v1.2.3 From 24eb089950ce44603b30a3145a2c8520e2b55bb1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 6 Nov 2008 12:53:32 -0800 Subject: cgroups: fix invalid cgrp->dentry before cgroup has been completely removed This fixes an oops when reading /proc/sched_debug. A cgroup won't be removed completely until finishing cgroup_diput(), so we shouldn't invalidate cgrp->dentry in cgroup_rmdir(). Otherwise, when a group is being removed while cgroup_path() gets called, we may trigger NULL dereference BUG. The bug can be reproduced: # cat test.sh #!/bin/sh mount -t cgroup -o cpu xxx /mnt for (( ; ; )) { mkdir /mnt/sub rmdir /mnt/sub } # ./test.sh & # cat /proc/sched_debug BUG: unable to handle kernel NULL pointer dereference at 00000038 IP: [] cgroup_path+0x39/0x90 ... Call Trace: [] ? print_cfs_rq+0x6e/0x75d [] ? sched_debug_show+0x72d/0xc1e ... 
Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: Peter Zijlstra Cc: Ingo Molnar Cc: [2.6.26.x, 2.6.27.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 35eebd5510c..358e77564e6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2497,7 +2497,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) list_del(&cgrp->sibling); spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); - cgrp->dentry = NULL; spin_unlock(&d->d_lock); cgroup_d_remove_dir(d); -- cgit v1.2.3 From f29c9b1ccb52904ee442a933cf3dee628f9f4e62 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 6 Nov 2008 09:45:16 +0800 Subject: sched: fix a bug in sched domain degenerate Impact: re-add incorrectly eliminated sched domain layers (1) on i386 with SCHED_SMT and SCHED_MC enabled # mount -t cgroup -o cpuset xxx /mnt # echo 0 > /mnt/cpuset.sched_load_balance # mkdir /mnt/0 # echo 0 > /mnt/0/cpuset.cpus # dmesg CPU0 attaching sched-domain: domain 0: span 0 level CPU groups: 0 (2) on i386 with SCHED_MC enabled but SCHED_SMT disabled # same with (1) # dmesg CPU0 attaching NULL sched-domain. The bug is that some sched domains may be skipped unintentionally when degenerating (optimizing) sched domains. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 82cc839c921..4c7e2bcdfa8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6877,15 +6877,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct sched_domain *tmp; /* Remove the sched domains which do not contribute to scheduling. */ - for (tmp = sd; tmp; tmp = tmp->parent) { + for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; if (!parent) break; + if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; - } + } else + tmp = tmp->parent; } if (sd && sd_degenerate(sd)) { -- cgit v1.2.3 From ca3273f9646694e0419cfb9d6c12deb1c9aff27c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Nov 2008 14:47:21 +0800 Subject: sched: fix memory leak in a failure path Impact: fix rare memory leak in the sched-domains manual reconfiguration code In the failure path, rd is not attached to a sched domain, so it causes a leak. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 4c7e2bcdfa8..57c933ffbee 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7676,6 +7676,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, error: free_sched_groups(cpu_map, tmpmask); SCHED_CPUMASK_FREE((void *)allmasks); + kfree(rd); return -ENOMEM; #endif } -- cgit v1.2.3 From 5ac5c4d604bf894ef672a7971d03fefdc7ea7e49 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Nov 2008 10:46:32 +0100 Subject: sched: clean up debug info Impact: clean up and fix debug info printout While looking over the sched_debug code I noticed that we printed the rq schedstats for every cfs_rq, ammend this. Also change nr_spead_over into an int, and fix a little buglet in min_vruntime printing. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_debug.c | 41 +++++++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 57c933ffbee..f3149244e32 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -399,7 +399,7 @@ struct cfs_rq { */ struct sched_entity *curr, *next, *last; - unsigned long nr_spread_over; + unsigned int nr_spread_over; #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ae17762ec3..48ecc51e770 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -144,7 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; - min_vruntime = rq->cfs.min_vruntime; + min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", @@ -161,26 +161,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - - P(yld_exp_empty); - P(yld_act_empty); - P(yld_both_empty); - P(yld_count); - P(sched_switch); - P(sched_count); - P(sched_goidle); - - P(ttwu_count); - P(ttwu_local); - - P(bkl_count); - -#undef P -#endif - SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP @@ -260,6 +242,25 @@ static void print_cpu(struct seq_file *m, int cpu) #undef P #undef PN +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + + P(yld_exp_empty); + P(yld_act_empty); + P(yld_both_empty); + P(yld_count); + + P(sched_switch); + P(sched_count); + P(sched_goidle); + + P(ttwu_count); + P(ttwu_local); + + P(bkl_count); + +#undef P +#endif print_cfs_stats(m, cpu); print_rt_stats(m, cpu); -- cgit v1.2.3 From ee5f80a993539490a07477ff2526bf62c503fbb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Nov 2008 11:06:00 +0100 Subject: irq: call __irq_enter() before calling the tick_idle_check Impact: avoid spurious ksoftirqd wakeups The tick idle check which is called from irq_enter() was run before the call to __irq_enter() which did not set the in_interrupt() bits in preempt_count. That way the raise of a softirq woke up softirqd for nothing as the softirq was handled on return from interrupt. Call __irq_enter() before calling into the tick idle check code. 
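A toy userspace model of why the ordering matters: raising a softirq while the hardirq bits are not yet accounted in preempt_count looks like a raise from task context and triggers the needless wakeup. Everything below is illustrative, not the real softirq code:

#include <stdio.h>
#include <stdbool.h>

/* Toy model: the only thing that matters here is whether the hardirq bits
 * are already accounted in preempt_count when a softirq gets raised. */
static unsigned int preempt_count;
#define HARDIRQ_OFFSET 0x10000

static bool in_interrupt(void) { return preempt_count != 0; }
static void __irq_enter(void)  { preempt_count += HARDIRQ_OFFSET; }
static void __irq_exit(void)   { preempt_count -= HARDIRQ_OFFSET; }

static void raise_softirq(void)
{
	/* Raised from "interrupt context": handled on irq exit, no wakeup.
	 * Raised from "task context": ksoftirqd must be woken up. */
	if (!in_interrupt())
		printf("  -> spurious ksoftirqd wakeup\n");
	else
		printf("  -> handled on return from interrupt, no wakeup\n");
}

static void tick_check_idle(void) { raise_softirq(); }

int main(void)
{
	printf("old order (tick check before __irq_enter):\n");
	tick_check_idle();
	__irq_enter();
	__irq_exit();

	printf("new order (__irq_enter first):\n");
	__irq_enter();
	tick_check_idle();
	__irq_exit();
	return 0;
}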
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/softirq.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 7110daeb9a9..e7c69a720d6 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -269,10 +269,11 @@ void irq_enter(void) { int cpu = smp_processor_id(); - if (idle_cpu(cpu) && !in_interrupt()) + if (idle_cpu(cpu) && !in_interrupt()) { + __irq_enter(); tick_check_idle(cpu); - - __irq_enter(); + } else + __irq_enter(); } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -- cgit v1.2.3 From ae99286b4f1be7788f2d6947c66a91dbd6351eec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Nov 2008 13:20:23 +0100 Subject: nohz: disable tick_nohz_kick_tick() for now Impact: nohz powersavings and wakeup regression commit fb02fbc14d17837b4b7b02dbb36142c16a7bf208 (NOHZ: restart tick device from irq_enter()) causes a serious wakeup regression. While the patch is correct it does not take into account that spurious wakeups happen on x86. A fix for this issue is available, but we just revert to the .27 behaviour and let long running softirqs screw themself. Disable it for now. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5bbb1044f84..342fc9ccab4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -568,6 +568,9 @@ static void tick_nohz_switch_to_nohz(void) */ static void tick_nohz_kick_tick(int cpu) { +#if 0 + /* Switch back to 2.6.27 behaviour */ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t delta, now; @@ -584,6 +587,7 @@ static void tick_nohz_kick_tick(int cpu) return; tick_nohz_restart(ts, now); +#endif } #else -- cgit v1.2.3 From bf5e6519b85b3853f2d0bb4f17a4e2eaeffeb574 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Nov 2008 21:46:00 -0500 Subject: ftrace: disable tracing on resize Impact: fix for bug on resize This patch addresses the bug found here: http://bugzilla.kernel.org/show_bug.cgi?id=11996 When ftrace converted to the new unified trace buffer, the resizing of the buffer was not protected as much as it was originally. If tracing is performed while the resize occurs, then the buffer can be corrupted. This patch disables all ftrace buffer modifications before a resize takes place. 
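A hedged userspace sketch of the same guard pattern, assuming a simplified buffer type: writers check a disabled count, and the resizer bumps it around the reallocation:

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct buffer {
	atomic_int disabled;   /* writers skip the buffer while this is > 0 */
	size_t size;
	char *data;
};

static int buffer_write(struct buffer *b, const char *msg)
{
	if (atomic_load(&b->disabled))
		return -1;                      /* write dropped, buffer not corrupted */
	/* ... append msg to b->data here ... */
	(void)msg;
	return 0;
}

static int buffer_resize(struct buffer *b, size_t new_size)
{
	char *tmp;

	atomic_fetch_add(&b->disabled, 1);      /* stop writers first */
	tmp = realloc(b->data, new_size);
	if (tmp) {
		b->data = tmp;
		b->size = new_size;
	}
	atomic_fetch_sub(&b->disabled, 1);      /* re-enable writers */
	return tmp ? 0 : -1;
}

int main(void)
{
	struct buffer b = { .size = 16 };

	atomic_init(&b.disabled, 0);
	b.data = malloc(b.size);
	buffer_write(&b, "before resize");
	buffer_resize(&b, 64);
	printf("resized to %zu, writes %s\n", b.size,
	       buffer_write(&b, "after resize") == 0 ? "enabled" : "disabled");
	free(b.data);
	return 0;
}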
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3b478f917..abfa8103d04 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2676,7 +2676,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int ret; + int ret, cpu; struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) @@ -2704,6 +2704,14 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, goto out; } + /* disable all cpu buffers */ + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_inc(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_inc(&max_tr.data[cpu]->disabled); + } + if (val != global_trace.entries) { ret = ring_buffer_resize(global_trace.buffer, val); if (ret < 0) { @@ -2735,6 +2743,13 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) cnt = -ENOMEM; out: + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_dec(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_dec(&max_tr.data[cpu]->disabled); + } + max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From 4143c5cb36331155a1823af8b3a8c761a59fed71 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Nov 2008 21:46:01 -0500 Subject: ring-buffer: prevent infinite looping on time stamping Impact: removal of unnecessary looping The lockless part of the ring buffer allows for reentry into the code from interrupts. A timestamp is taken, a test is preformed and if it detects that an interrupt occurred that did tracing, it tries again. The problem arises if the timestamp code itself causes a trace. The detection will detect this and loop again. The difference between this and an interrupt doing tracing, is that this will fail every time, and cause an infinite loop. Currently, we test if the loop happens 1000 times, and if so, it will produce a warning and disable the ring buffer. The problem with this approach is that it makes it difficult to perform some types of tracing (tracing the timestamp code itself). Each trace entry has a delta timestamp from the previous entry. If a trace entry is reserved but and interrupt occurs and traces before the previous entry is commited, the delta timestamp for that entry will be zero. This actually makes sense in terms of tracing, because the interrupt entry happened before the preempted entry was commited, so one may consider the two happening at the same time. The order is still preserved in the buffer. With this idea, instead of trying to get a new timestamp if an interrupt made it in between the timestamp and the test, the entry could simply make the delta zero and continue. This will prevent interrupts or tracers in the timer code from causing the above loop. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3f338063864..2f76193c348 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1060,7 +1060,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, /* Did the write stamp get updated already? 
*/ if (unlikely(ts < cpu_buffer->write_stamp)) - goto again; + delta = 0; if (test_time_stamp(delta)) { -- cgit v1.2.3 From ad474caca3e2a0550b7ce0706527ad5ab389a4d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 10 Nov 2008 15:39:30 +0100 Subject: fix for account_group_exec_runtime(), make sure ->signal can't be freed under rq->lock Impact: fix hang/crash on ia64 under high load This is ugly, but the simplest patch by far. Unlike other similar routines, account_group_exec_runtime() could be called "implicitly" from within scheduler after exit_notify(). This means we can race with the parent doing release_task(), we can't just check ->signal != NULL. Change __exit_signal() to do spin_unlock_wait(&task_rq(tsk)->lock) before __cleanup_signal() to make sure ->signal can't be freed under task_rq(tsk)->lock. Note that task_rq_unlock_wait() doesn't care about the case when tsk changes cpu/rq under us, this should be OK. Thanks to Ingo who nacked my previous buggy patch. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar Reported-by: Doug Chapman --- kernel/exit.c | 5 +++++ kernel/sched.c | 8 ++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 80137a5d946..ae2b92be5fa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -141,6 +141,11 @@ static void __exit_signal(struct task_struct *tsk) if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); + /* + * Make sure ->signal can't go away under rq->lock, + * see account_group_exec_runtime(). + */ + task_rq_unlock_wait(tsk); __cleanup_signal(sig); } } diff --git a/kernel/sched.c b/kernel/sched.c index f3149244e32..50a21f96467 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -969,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void task_rq_unlock_wait(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + smp_mb(); /* spin-unlock-wait is not a full memory barrier */ + spin_unlock_wait(&rq->lock); +} + static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { -- cgit v1.2.3 From 5d5254f0d3b9bebc47d97e357374c0ad0c291a7d Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Sat, 25 Oct 2008 10:22:38 +0530 Subject: timers: handle HRTIMER_CB_IRQSAFE_UNLOCKED correctly from softirq context Impact: fix incorrect locking triggered during hotplug-intense stress-tests While migrating the the CB_IRQSAFE_UNLOCKED timers during a cpu-offline, we queue them on the cb_pending list, so that they won't go stale. Thus, when the callbacks of the timers run from the softirq context, they could run into potential deadlocks, since these callbacks assume that they're running with irq's disabled, thereby annoying lockdep! Fix this by emulating hardirq context while running these callbacks from the hrtimer softirq. ================================= [ INFO: inconsistent lock state ] 2.6.27 #2 -------------------------------- inconsistent {in-hardirq-W} -> {hardirq-on-W} usage. 
ksoftirqd/0/4 [HC0[0]:SC1[1]:HE1:SE0] takes: (&rq->lock){++..}, at: [] sched_rt_period_timer+0x9e/0x1fc {in-hardirq-W} state was registered at: [] __lock_acquire+0x549/0x121e [] native_sched_clock+0x88/0x99 [] clocksource_get_next+0x39/0x3f [] update_wall_time+0x616/0x7df [] lock_acquire+0x5a/0x74 [] scheduler_tick+0x3a/0x18d [] _spin_lock+0x1c/0x45 [] scheduler_tick+0x3a/0x18d [] scheduler_tick+0x3a/0x18d [] update_process_times+0x3a/0x44 [] tick_periodic+0x63/0x6d [] tick_handle_periodic+0x14/0x5e [] timer_interrupt+0x44/0x4a [] handle_IRQ_event+0x13/0x3d [] handle_level_irq+0x79/0xbd [] do_IRQ+0x69/0x7d [] common_interrupt+0x28/0x30 [] aac_probe_one+0x1a3/0x3f3 [] _spin_unlock_irqrestore+0x36/0x39 [] setup_irq+0x1be/0x1f9 [] start_kernel+0x259/0x2c5 [] 0xffffffff irq event stamp: 50102 hardirqs last enabled at (50102): [] _spin_unlock_irq+0x20/0x23 hardirqs last disabled at (50101): [] _spin_lock_irq+0xa/0x4b softirqs last enabled at (50088): [] do_softirq+0x37/0x4d softirqs last disabled at (50099): [] do_softirq+0x37/0x4d other info that might help us debug this: no locks held by ksoftirqd/0/4. stack backtrace: Pid: 4, comm: ksoftirqd/0 Not tainted 2.6.27 #2 [] print_usage_bug+0x13e/0x147 [] mark_lock+0x493/0x797 [] __lock_acquire+0x5be/0x121e [] lock_acquire+0x5a/0x74 [] sched_rt_period_timer+0x9e/0x1fc [] _spin_lock+0x1c/0x45 [] sched_rt_period_timer+0x9e/0x1fc [] sched_rt_period_timer+0x9e/0x1fc [] finish_task_switch+0x41/0xbd [] native_sched_clock+0x88/0x99 [] sched_rt_period_timer+0x0/0x1fc [] run_hrtimer_pending+0x54/0xe5 [] sched_rt_period_timer+0x0/0x1fc [] __do_softirq+0x7b/0xef [] do_softirq+0x37/0x4d [] ksoftirqd+0x56/0xc5 [] ksoftirqd+0x0/0xc5 [] kthread+0x38/0x5d [] kthread+0x0/0x5d [] kernel_thread_helper+0x7/0x10 ======================= Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Acked-by: "Paul E. McKenney" Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 2b465dfde42..95d3949f2ae 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1209,6 +1209,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) enum hrtimer_restart (*fn)(struct hrtimer *); struct hrtimer *timer; int restart; + int emulate_hardirq_ctx = 0; timer = list_entry(cpu_base->cb_pending.next, struct hrtimer, cb_entry); @@ -1217,10 +1218,24 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) timer_stats_account_hrtimer(timer); fn = timer->function; + /* + * A timer might have been added to the cb_pending list + * when it was migrated during a cpu-offline operation. + * Emulate hardirq context for such timers. + */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || + timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) + emulate_hardirq_ctx = 1; + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); spin_unlock_irq(&cpu_base->lock); - restart = fn(timer); + if (unlikely(emulate_hardirq_ctx)) { + local_irq_disable(); + restart = fn(timer); + local_irq_enable(); + } else + restart = fn(timer); spin_lock_irq(&cpu_base->lock); -- cgit v1.2.3 From 2002c69595a092518107f7e3c1294c9710bc92ae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Nov 2008 11:52:33 +0100 Subject: sched: release buddies on yield Clear buddies on yield, so that the buddy rules don't schedule them despite them being placed right-most. This fixed a performance regression with yield-happy binary JVMs. 
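A toy sketch of the problem being fixed, with simplified picking logic (the real pick_next_entity() also consults wakeup_preempt_entity()): if the queue's buddy pointers still reference the yielding entity, the buddy preference keeps selecting it even after it was placed right-most.

#include <stdio.h>
#include <stddef.h>

struct sched_entity { const char *name; };

struct cfs_rq {
	struct sched_entity *leftmost;   /* what plain fair ordering would pick */
	struct sched_entity *next;       /* forward buddy */
	struct sched_entity *last;       /* backward buddy */
};

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->last == se)
		cfs_rq->last = NULL;
	if (cfs_rq->next == se)
		cfs_rq->next = NULL;
}

/* Buddy preference wins over left-most ordering; loosely modelled on
 * pick_next_entity(), without the wakeup_preempt_entity() check. */
static struct sched_entity *pick_next(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->next)
		return cfs_rq->next;
	if (cfs_rq->last)
		return cfs_rq->last;
	return cfs_rq->leftmost;
}

int main(void)
{
	struct sched_entity yielder = { "yielder" }, other = { "other" };
	struct cfs_rq rq = { .leftmost = &other, .next = &yielder };

	/* Without clearing, the yielder is still preferred despite yielding. */
	printf("without clear: %s\n", pick_next(&rq)->name);

	clear_buddies(&rq, &yielder);
	printf("with clear:    %s\n", pick_next(&rq)->name);
	return 0;
}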
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Tested-by: Lin Ming --- kernel/sched_fair.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 51aa3e102ac..98345e45b05 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -716,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) __enqueue_entity(cfs_rq, se); } +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (cfs_rq->last == se) + cfs_rq->last = NULL; + + if (cfs_rq->next == se) + cfs_rq->next = NULL; +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -738,11 +747,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) #endif } - if (cfs_rq->last == se) - cfs_rq->last = NULL; - - if (cfs_rq->next == se) - cfs_rq->next = NULL; + clear_buddies(cfs_rq, se); if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); @@ -977,6 +982,8 @@ static void yield_task_fair(struct rq *rq) if (unlikely(cfs_rq->nr_running == 1)) return; + clear_buddies(cfs_rq, se); + if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { update_rq_clock(rq); /* -- cgit v1.2.3 From a358324466b171e145df20bdb74fe81759906de6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Nov 2008 15:01:42 -0500 Subject: ring-buffer: buffer record on/off switch Impact: enable/disable ring buffer recording API added Several kernel developers have requested that there be a way to stop recording into the ring buffers with a simple switch that can also be enabled from userspace. This patch addes a new kernel API to the ring buffers called: tracing_on() tracing_off() When tracing_off() is called, all ring buffers will not be able to record into their buffers. tracing_on() will enable the ring buffers again. These two act like an on/off switch. That is, there is no counting of the number of times tracing_off or tracing_on has been called. A new file is added to the debugfs/tracing directory called tracing_on This allows for userspace applications to also flip the switch. echo 0 > debugfs/tracing/tracing_on disables the tracing. echo 1 > /debugfs/tracing/tracing_on enables it. Note, this does not disable or enable any tracers. It only sets or clears a flag that needs to be set in order for the ring buffers to write to their buffers. It is a global flag, and affects all ring buffers. The buffers start out with tracing_on enabled. There are now three flags that control recording into the buffers: tracing_on: which affects all ring buffer tracers. buffer->record_disabled: which affects an allocated buffer, which may be set if an anomaly is detected, and tracing is disabled. cpu_buffer->record_disabled: which is set by tracing_stop() or if an anomaly is detected. tracing_start can not reenable this if an anomaly occurred. The userspace debugfs/tracing/tracing_enabled is implemented with tracing_stop() but the user space code can not enable it if the kernel called tracing_stop(). Userspace can enable the tracing_on even if the kernel disabled it. It is just a switch used to stop tracing if a condition was hit. tracing_on is not for protecting critical areas in the kernel nor is it for stopping tracing if an anomaly occurred. This is because userspace can reenable it at any time. Side effect: With this patch, I discovered a dead variable in ftrace.c called tracing_on. This patch removes it. 
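A small userspace sketch of the switch semantics described above, assuming a single global flag checked at every record attempt; there is no counting, the last call wins:

#include <stdio.h>
#include <stdbool.h>

/* One global flag, not a counter: tracing_on()/tracing_off() simply set
 * or clear it, and every record attempt tests it. */
static volatile bool ring_buffers_off;

static void tracing_on(void)  { ring_buffers_off = false; }
static void tracing_off(void) { ring_buffers_off = true;  }

static bool ring_buffer_record(const char *event)
{
	if (ring_buffers_off)
		return false;          /* recording refused, nothing stored */
	printf("recorded: %s\n", event);
	return true;
}

int main(void)
{
	ring_buffer_record("before switch");
	tracing_off();                 /* e.g. echo 0 > tracing_on */
	ring_buffer_record("while off");
	tracing_on();                  /* e.g. echo 1 > tracing_on */
	ring_buffer_record("after re-enable");
	return 0;
}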
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 +--- kernel/trace/ring_buffer.c | 101 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4a39d24568c..14fa52297b2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -185,7 +185,6 @@ enum { }; static int ftrace_filtered; -static int tracing_on; static LIST_HEAD(ftrace_new_addrs); @@ -506,13 +505,10 @@ static int __ftrace_modify_code(void *data) { int *command = data; - if (*command & FTRACE_ENABLE_CALLS) { + if (*command & FTRACE_ENABLE_CALLS) ftrace_replace_code(1); - tracing_on = 1; - } else if (*command & FTRACE_DISABLE_CALLS) { + else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - tracing_on = 0; - } if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2f76193c348..b08ee9f00c8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -16,6 +16,35 @@ #include #include +#include "trace.h" + +/* Global flag to disable all recording to ring buffers */ +static int ring_buffers_off __read_mostly; + +/** + * tracing_on - enable all tracing buffers + * + * This function enables all tracing buffers that may have been + * disabled with tracing_off. + */ +void tracing_on(void) +{ + ring_buffers_off = 0; +} + +/** + * tracing_off - turn off all tracing buffers + * + * This function stops all tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. + */ +void tracing_off(void) +{ + ring_buffers_off = 1; +} + /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 @@ -1133,6 +1162,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; int cpu, resched; + if (ring_buffers_off) + return NULL; + if (atomic_read(&buffer->record_disabled)) return NULL; @@ -1249,6 +1281,9 @@ int ring_buffer_write(struct ring_buffer *buffer, int ret = -EBUSY; int cpu, resched; + if (ring_buffers_off) + return -EBUSY; + if (atomic_read(&buffer->record_disabled)) return -EBUSY; @@ -2070,3 +2105,69 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return 0; } +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int *p = filp->private_data; + char buf[64]; + int r; + + /* !ring_buffers_off == tracing_on */ + r = sprintf(buf, "%d\n", !*p); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int *p = filp->private_data; + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + /* !ring_buffers_off == tracing_on */ + *p = !val; + + (*ppos)++; + + return cnt; +} + +static struct file_operations rb_simple_fops = { + .open = tracing_open_generic, + .read = rb_simple_read, + .write = rb_simple_write, +}; + + +static __init int rb_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("tracing_on", 0644, d_tracer, + &ring_buffers_off, 
&rb_simple_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_on' entry\n"); + + return 0; +} + +fs_initcall(rb_init_debugfs); -- cgit v1.2.3 From 621a0d5207c18012cb39932f2d9830a11a6cb03d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Nov 2008 09:36:35 +0100 Subject: hrtimer: clean up unused callback modes Impact: cleanup git grep HRTIMER_CB_IRQSAFE revealed half the callback modes are actually unused. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 95d3949f2ae..47e63349d1b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -664,14 +664,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, /* Timer is expired, act upon the callback mode */ switch(timer->cb_mode) { - case HRTIMER_CB_IRQSAFE_NO_RESTART: - debug_hrtimer_deactivate(timer); - /* - * We can call the callback from here. No restart - * happens, so no danger of recursion - */ - BUG_ON(timer->function(timer) != HRTIMER_NORESTART); - return 1; case HRTIMER_CB_IRQSAFE_PERCPU: case HRTIMER_CB_IRQSAFE_UNLOCKED: /* @@ -683,7 +675,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ debug_hrtimer_deactivate(timer); return 1; - case HRTIMER_CB_IRQSAFE: case HRTIMER_CB_SOFTIRQ: /* * Move everything else into the softirq pending list ! -- cgit v1.2.3 From 47e74f2ba8fbf9fb1378e2524e6cfdc2fb37f160 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 00:01:27 -0500 Subject: ring-buffer: no preempt for sched_clock() Impact: disable preemption when calling sched_clock() The ring_buffer_time_stamp still uses sched_clock as its counter. But it is a bug to call it with preemption enabled. This requirement should not be pushed to the ring_buffer_time_stamp callers, so the ring_buffer_time_stamp needs to disable preemption when calling sched_clock. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b08ee9f00c8..231db209fa8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -51,8 +51,14 @@ void tracing_off(void) /* FIXME!!! */ u64 ring_buffer_time_stamp(int cpu) { + u64 time; + + preempt_disable_notrace(); /* shift to debug/test normalization and TIME_EXTENTS */ - return sched_clock() << DEBUG_SHIFT; + time = sched_clock() << DEBUG_SHIFT; + preempt_enable_notrace(); + + return time; } void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) -- cgit v1.2.3 From a2d477778e82a60a0b7114cefdb70aa43af28782 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 12 Nov 2008 16:19:00 +0530 Subject: sched: fix stale value in average load per task Impact: fix load balancer load average calculation accuracy cpu_avg_load_per_task() returns a stale value when nr_running is 0. It returns an older stale (caculated when nr_running was non zero) value. This patch returns and sets rq->avg_load_per_task to zero when nr_running is 0. Compile and boot tested on a x86_64 box. 
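The fix amounts to recomputing the cached average on the nr_running == 0 path instead of returning whatever was stored last; a tiny standalone sketch of that guard, with simplified field names:

#include <stdio.h>

struct rq {
	unsigned long nr_running;
	unsigned long load_weight;
	unsigned long avg_load_per_task;   /* cached value, may go stale */
};

static unsigned long cpu_avg_load_per_task(struct rq *rq)
{
	if (rq->nr_running)
		rq->avg_load_per_task = rq->load_weight / rq->nr_running;
	else
		rq->avg_load_per_task = 0;   /* the fix: don't return stale data */
	return rq->avg_load_per_task;
}

int main(void)
{
	struct rq rq = { .nr_running = 2, .load_weight = 2048 };

	printf("busy: %lu\n", cpu_avg_load_per_task(&rq));   /* 1024 */
	rq.nr_running = 0;
	printf("idle: %lu\n", cpu_avg_load_per_task(&rq));   /* 0, not the stale 1024 */
	return 0;
}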
Signed-off-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 50a21f96467..3bafbe350f4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1456,6 +1456,8 @@ static unsigned long cpu_avg_load_per_task(int cpu) if (rq->nr_running) rq->avg_load_per_task = rq->load.weight / rq->nr_running; + else + rq->avg_load_per_task = 0; return rq->avg_load_per_task; } -- cgit v1.2.3 From 5cbd54ef470d880fc37fbe4b21eb514806d51e0d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2008 20:05:50 +0100 Subject: sched: fix init_idle()'s use of sched_clock() Maciej Rutecki reported: > I have this bug during suspend to disk: > > [ 188.592151] Enabling non-boot CPUs ... > [ 188.592151] SMP alternatives: switching to SMP code > [ 188.666058] BUG: using smp_processor_id() in preemptible > [00000000] > code: suspend_to_disk/2934 > [ 188.666064] caller is native_sched_clock+0x2b/0x80 Which, as noted by Linus, was caused by me, via: 7cbaef9c "sched: optimize sched_clock() a bit" Move the rq locking a bit earlier in the initialization sequence, that will make the sched_clock() call in init_idle() non-preemptible. Reported-by: Maciej Rutecki Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3bafbe350f4..c94baf2969e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5870,6 +5870,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + spin_lock_irqsave(&rq->lock, flags); + __sched_fork(idle); idle->se.exec_start = sched_clock(); @@ -5877,7 +5879,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->cpus_allowed = cpumask_of_cpu(cpu); __set_task_cpu(idle, cpu); - spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; -- cgit v1.2.3 From 687446760bd008df96655cb8c5900f8e48a7118c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 12 Nov 2008 13:26:49 -0800 Subject: freezer_cg: remove task_lock from freezer_fork() In theory the task can be moved to another cgroup and the freezer will be freed right after task_lock is dropped, so the lock results in zero protection. But in the case of freezer_fork() no lock is needed, since the task is not in tasklist yet so it won't be moved to another cgroup, so task->cgroups won't be changed or invalidated. Signed-off-by: Li Zefan Cc: Matt Helsley Cc: Cedric Le Goater Cc: "Serge E. Hallyn" Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 7fa476f01d0..66059071040 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -184,9 +184,13 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) { struct freezer *freezer; - task_lock(task); + /* + * No lock is needed, since the task isn't on tasklist yet, + * so it can't be moved to another cgroup, which means the + * freezer won't be removed and will be valid during this + * function call. 
+ */ freezer = task_freezer(task); - task_unlock(task); spin_lock_irq(&freezer->lock); BUG_ON(freezer->state == CGROUP_FROZEN); -- cgit v1.2.3 From 3b1b3f6e57064aa8f91c290fe51cda4c74642902 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 12 Nov 2008 13:26:50 -0800 Subject: freezer_cg: disable writing freezer.state of root cgroup With this change, control file 'freezer.state' doesn't exist in root cgroup, making root cgroup unfreezable. I think it's reasonable to disallow freeze tasks in the root cgroup. And then we can avoid fork overhead when freezer subsystem is compiled but not used. Also make writing invalid value to freezer.state returns EINVAL rather than EIO. This is more consistent with other cgroup subsystem. Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: Cedric Le Goater Cc: Paul Menage Cc: Matt Helsley Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 66059071040..fb249e2bcad 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -192,6 +192,13 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) */ freezer = task_freezer(task); + /* + * The root cgroup is non-freezable, so we can skip the + * following check. + */ + if (!freezer->css.cgroup->parent) + return; + spin_lock_irq(&freezer->lock); BUG_ON(freezer->state == CGROUP_FROZEN); @@ -335,7 +342,7 @@ static int freezer_write(struct cgroup *cgroup, else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) goal_state = CGROUP_FROZEN; else - return -EIO; + return -EINVAL; if (!cgroup_lock_live_group(cgroup)) return -ENODEV; @@ -354,6 +361,8 @@ static struct cftype files[] = { static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) { + if (!cgroup->parent) + return 0; return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); } -- cgit v1.2.3 From a189d0350f387786b1fb5a5d19e3a5ab0bc0cceb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 12 Nov 2008 13:26:51 -0800 Subject: kprobes: disable preempt for module_text_address() and kernel_text_address() __register_kprobe() can be preempted after checking probing address but before module_text_address() or try_module_get(), and in this interval the module can be unloaded. In that case, try_module_get(probed_mod) will access to invalid address, or kprobe will probe invalid address. This patch uses preempt_disable() to protect it and uses __module_text_address() and __kernel_text_address(). Signed-off-by: Lai Jiangshan Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Hiroshi Shimamoto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8b57a2597f2..f83c5e42fb0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; p->addr = addr; - if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr)) + preempt_disable(); + if (!__kernel_text_address((unsigned long) p->addr) || + in_kprobes_functions((unsigned long) p->addr)) { + preempt_enable(); return -EINVAL; + } p->mod_refcounted = 0; /* * Check if are we probing a module. 
*/ - probed_mod = module_text_address((unsigned long) p->addr); + probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { - struct module *calling_mod = module_text_address(called_from); + struct module *calling_mod; + calling_mod = __module_text_address(called_from); /* * We must allow modules to probe themself and in this case * avoid incrementing the module refcount, so as to allow * unloading of self probing modules. */ if (calling_mod && calling_mod != probed_mod) { - if (unlikely(!try_module_get(probed_mod))) + if (unlikely(!try_module_get(probed_mod))) { + preempt_enable(); return -EINVAL; + } p->mod_refcounted = 1; } else probed_mod = NULL; } + preempt_enable(); p->nmissed = 0; INIT_LIST_HEAD(&p->list); @@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) struct kprobe *old_p; if (p->mod_refcounted) { + /* + * Since we've already incremented refcount, + * we don't need to disable preemption. + */ mod = module_text_address((unsigned long)p->addr); if (mod) module_put(mod); -- cgit v1.2.3 From 7e036d040a28bf95255d7eb9faf0ffbba3677e99 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 12 Nov 2008 13:26:57 -0800 Subject: kernel/kprobes.c: don't pad kretprobe_table_locks[] on uniprocessor builds We only need the cacheline padding on SMP kernels. Saves 6k: text data bss dec hex filename 5713 388 8840 14941 3a5d kernel/kprobes.o 5713 388 2632 8733 221d kernel/kprobes.o Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f83c5e42fb0..9f8a3f25259 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -72,7 +72,7 @@ static bool kprobe_enabled; DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned; + spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) -- cgit v1.2.3 From ee51a1de7e3837577412be269e0100038068e691 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Nov 2008 14:58:31 +0100 Subject: tracing: fix mmiotrace resizing crash Pekka reported a crash when resizing the mmiotrace tracer (if only mmiotrace is enabled). This happens because in that case we do not allocate the max buffer, but we try to use it. Make ring_buffer_resize() idempotent against NULL buffers. 
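A minimal sketch of the idempotence guard, assuming a userspace stand-in for the buffer type; the real function goes on to resize the per-cpu page lists:

#include <stdio.h>
#include <stddef.h>

struct ring_buffer {
	size_t pages;
};

/* Resizing a buffer that was never allocated is reported as success so
 * that callers which only allocate some of their buffers keep working. */
static size_t ring_buffer_resize_sketch(struct ring_buffer *buffer, size_t size)
{
	if (!buffer)
		return size;     /* always "succeed" on a non-existent buffer */

	buffer->pages = (size + 4095) / 4096;   /* illustrative page rounding */
	return buffer->pages * 4096;
}

int main(void)
{
	struct ring_buffer rb = { .pages = 1 };

	printf("real buffer: %zu bytes\n", ring_buffer_resize_sketch(&rb, 8192));
	printf("NULL buffer: %zu bytes\n", ring_buffer_resize_sketch(NULL, 8192));
	return 0;
}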
Reported-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 231db209fa8..036456cbb4f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -538,6 +538,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) LIST_HEAD(pages); int i, cpu; + /* + * Always succeed at resizing a non-existent buffer: + */ + if (!buffer) + return size; + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); size *= BUF_PAGE_SIZE; buffer_size = buffer->pages * BUF_PAGE_SIZE; -- cgit v1.2.3 From 8141c7f3e7aee618312fa1c15109e1219de784a7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Nov 2008 10:20:36 -0800 Subject: Move "exit_robust_list" into mm_release() We don't want to get rid of the futexes just at exit() time, we want to drop them when doing an execve() too, since that gets rid of the previous VM image too. Doing it at mm_release() time means that we automatically always do it when we disassociate a VM map from the task. Reported-by: pageexec@freemail.hu Cc: Andrew Morton Cc: Nick Piggin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Brad Spengler Cc: Alex Efros Cc: Peter Zijlstra Cc: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 --------- kernel/fork.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ae2b92be5fa..2d8be7ebb0f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include /* for audit_free() */ #include @@ -1059,14 +1058,6 @@ NORET_TYPE void do_exit(long code) exit_itimers(tsk->signal); } acct_collect(code, group_dead); -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); -#endif -#endif if (group_dead) tty_audit_exit(); if (unlikely(tsk->audit_context)) diff --git a/kernel/fork.c b/kernel/fork.c index f6083561dfe..2a372a0e206 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -519,6 +520,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { struct completion *vfork_done = tsk->vfork_done; + /* Get rid of any futexes when releasing the mm */ +#ifdef CONFIG_FUTEX + if (unlikely(tsk->robust_list)) + exit_robust_list(tsk); +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) + compat_exit_robust_list(tsk); +#endif +#endif + /* Get rid of any cached register state */ deactivate_mm(tsk, mm); -- cgit v1.2.3 From 8f7b0ba1c853919b85b54774775f567f30006107 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Nov 2008 01:15:43 +0000 Subject: Fix inotify watch removal/umount races Inotify watch removals suck violently. To kick the watch out we need (in this order) inode->inotify_mutex and ih->mutex. That's fine if we have a hold on inode; however, for all other cases we need to make damn sure we don't race with umount. We can *NOT* just grab a reference to a watch - inotify_unmount_inodes() will happily sail past it and we'll end with reference to inode potentially outliving its superblock. Ideally we just want to grab an active reference to superblock if we can; that will make sure we won't go into inotify_umount_inodes() until we are done. Cleanup is just deactivate_super(). 
However, that leaves a messy case - what if we *are* racing with umount() and active references to superblock can't be acquired anymore? We can bump ->s_count, grab ->s_umount, which will almost certainly wait until the superblock is shut down and the watch in question is pining for fjords. That's fine, but there is a problem - we might have hit the window between ->s_active getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock is past the point of no return and is heading for shutdown) and the moment when deactivate_super() acquires ->s_umount. We could just do drop_super() yield() and retry, but that's rather antisocial and this stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having found that we'd got there first (i.e. that ->s_root is non-NULL) we know that we won't race with inotify_umount_inodes(). So we could grab a reference to watch and do the rest as above, just with drop_super() instead of deactivate_super(), right? Wrong. We had to drop ih->mutex before we could grab ->s_umount. So the watch could've been gone already. That still can be dealt with - we need to save watch->wd, do idr_find() and compare its result with our pointer. If they match, we either have the damn thing still alive or we'd lost not one but two races at once, the watch had been killed and a new one got created with the same ->wd at the same address. That couldn't have happened in inotify_destroy(), but inotify_rm_wd() could run into that. Still, "new one got created" is not a problem - we have every right to kill it or leave it alone, whatever's more convenient. So we can use idr_find(...) == watch && watch->inode->i_sb == sb as "grab it and kill it" check. If it's been our original watch, we are fine, if it's a newcomer - nevermind, just pretend that we'd won the race and kill the fscker anyway; we are safe since we know that its superblock won't be going away. And yes, this is far beyond mere "not very pretty"; so's the entire concept of inotify to start with. Signed-off-by: Al Viro Acked-by: Greg KH Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 91 +++++++++++++++++++++++++++++++--------------------- kernel/auditfilter.c | 14 +++++--- 2 files changed, 63 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 8ba0e0d934f..8b509441f49 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -24,6 +24,7 @@ struct audit_chunk { struct list_head trees; /* with root here */ int dead; int count; + atomic_long_t refs; struct rcu_head head; struct node { struct list_head list; @@ -56,7 +57,8 @@ static LIST_HEAD(prune_list); * tree is refcounted; one reference for "some rules on rules_list refer to * it", one for each chunk with pointer to it. * - * chunk is refcounted by embedded inotify_watch. + * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount + * of watch contributes 1 to .refs). * * node.index allows to get from node.list to containing chunk. 
* MSB of that sucker is stolen to mark taggings that we might have to @@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count) INIT_LIST_HEAD(&chunk->hash); INIT_LIST_HEAD(&chunk->trees); chunk->count = count; + atomic_long_set(&chunk->refs, 1); for (i = 0; i < count; i++) { INIT_LIST_HEAD(&chunk->owners[i].list); chunk->owners[i].index = i; @@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count) return chunk; } -static void __free_chunk(struct rcu_head *rcu) +static void free_chunk(struct audit_chunk *chunk) { - struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); int i; for (i = 0; i < chunk->count; i++) { @@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu) kfree(chunk); } -static inline void free_chunk(struct audit_chunk *chunk) +void audit_put_chunk(struct audit_chunk *chunk) { - call_rcu(&chunk->head, __free_chunk); + if (atomic_long_dec_and_test(&chunk->refs)) + free_chunk(chunk); } -void audit_put_chunk(struct audit_chunk *chunk) +static void __put_chunk(struct rcu_head *rcu) { - put_inotify_watch(&chunk->watch); + struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); + audit_put_chunk(chunk); } enum {HASH_SIZE = 128}; @@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) list_for_each_entry_rcu(p, list, hash) { if (p->watch.inode == inode) { - get_inotify_watch(&p->watch); + atomic_long_inc(&p->refs); return p; } } @@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) /* tagging and untagging inodes with trees */ -static void untag_chunk(struct audit_chunk *chunk, struct node *p) +static struct audit_chunk *find_chunk(struct node *p) +{ + int index = p->index & ~(1U<<31); + p -= index; + return container_of(p, struct audit_chunk, owners[0]); +} + +static void untag_chunk(struct node *p) { + struct audit_chunk *chunk = find_chunk(p); struct audit_chunk *new; struct audit_tree *owner; int size = chunk->count - 1; int i, j; + if (!pin_inotify_watch(&chunk->watch)) { + /* + * Filesystem is shutting down; all watches are getting + * evicted, just take it off the node list for this + * tree and let the eviction logics take care of the + * rest. + */ + owner = p->owner; + if (owner->root == chunk) { + list_del_init(&owner->same_root); + owner->root = NULL; + } + list_del_init(&p->list); + p->owner = NULL; + put_tree(owner); + return; + } + + spin_unlock(&hash_lock); + + /* + * pin_inotify_watch() succeeded, so the watch won't go away + * from under us. 
+ */ mutex_lock(&chunk->watch.inode->inotify_mutex); if (chunk->dead) { mutex_unlock(&chunk->watch.inode->inotify_mutex); - return; + goto out; } owner = p->owner; @@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p) inotify_evict_watch(&chunk->watch); mutex_unlock(&chunk->watch.inode->inotify_mutex); put_inotify_watch(&chunk->watch); - return; + goto out; } new = alloc_chunk(size); @@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p) inotify_evict_watch(&chunk->watch); mutex_unlock(&chunk->watch.inode->inotify_mutex); put_inotify_watch(&chunk->watch); - return; + goto out; Fallback: // do the best we can @@ -277,6 +313,9 @@ Fallback: put_tree(owner); spin_unlock(&hash_lock); mutex_unlock(&chunk->watch.inode->inotify_mutex); +out: + unpin_inotify_watch(&chunk->watch); + spin_lock(&hash_lock); } static int create_chunk(struct inode *inode, struct audit_tree *tree) @@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static struct audit_chunk *find_chunk(struct node *p) -{ - int index = p->index & ~(1U<<31); - p -= index; - return container_of(p, struct audit_chunk, owners[0]); -} - static void kill_rules(struct audit_tree *tree) { struct audit_krule *rule, *next; @@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim) spin_lock(&hash_lock); while (!list_empty(&victim->chunks)) { struct node *p; - struct audit_chunk *chunk; p = list_entry(victim->chunks.next, struct node, list); - chunk = find_chunk(p); - get_inotify_watch(&chunk->watch); - spin_unlock(&hash_lock); - - untag_chunk(chunk, p); - put_inotify_watch(&chunk->watch); - spin_lock(&hash_lock); + untag_chunk(p); } spin_unlock(&hash_lock); put_tree(victim); @@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree) while (!list_empty(&tree->chunks)) { struct node *node; - struct audit_chunk *chunk; node = list_entry(tree->chunks.next, struct node, list); @@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree) if (!(node->index & (1U<<31))) break; - chunk = find_chunk(node); - get_inotify_watch(&chunk->watch); - spin_unlock(&hash_lock); - - untag_chunk(chunk, node); - - put_inotify_watch(&chunk->watch); - spin_lock(&hash_lock); + untag_chunk(node); } if (!tree->root && !tree->goner) { tree->goner = 1; @@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, static void destroy_watch(struct inotify_watch *watch) { struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); - free_chunk(chunk); + call_rcu(&chunk->head, __put_chunk); } static const struct inotify_operations rtree_inotify_ops = { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b7d354e2b0e..9fd85a4640a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list) list_for_each_entry_safe(p, n, in_list, ilist) { list_del(&p->ilist); inotify_rm_watch(audit_ih, &p->wdata); - /* the put matching the get in audit_do_del_rule() */ - put_inotify_watch(&p->wdata); + /* the unpin matching the pin in audit_do_del_rule() */ + unpin_inotify_watch(&p->wdata); } } @@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry, /* Put parent on the inotify un-registration * list. Grab a reference before releasing * audit_filter_mutex, to be released in - * audit_inotify_unregister(). 
*/ - list_add(&parent->ilist, &inotify_list); - get_inotify_watch(&parent->wdata); + * audit_inotify_unregister(). + * If filesystem is going away, just leave + * the sucker alone, eviction will take + * care of it. + */ + if (pin_inotify_watch(&parent->wdata)) + list_add(&parent->ilist, &inotify_list); } } } -- cgit v1.2.3 From 29d7b90c15035741d15421b36000509212b3e135 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 16 Nov 2008 08:07:15 +0100 Subject: sched: fix kernel warning on /proc/sched_debug access Luis Henriques reported that with CONFIG_PREEMPT=y + CONFIG_PREEMPT_DEBUG=y + CONFIG_SCHED_DEBUG=y + CONFIG_LATENCYTOP=y enabled, the following warning triggers when using latencytop: > [ 775.663239] BUG: using smp_processor_id() in preemptible [00000000] code: latencytop/6585 > [ 775.663303] caller is native_sched_clock+0x3a/0x80 > [ 775.663314] Pid: 6585, comm: latencytop Tainted: G W 2.6.28-rc4-00355-g9c7c354 #1 > [ 775.663322] Call Trace: > [ 775.663343] [] debug_smp_processor_id+0xe4/0xf0 > [ 775.663356] [] native_sched_clock+0x3a/0x80 > [ 775.663368] [] sched_clock+0x9/0x10 > [ 775.663381] [] proc_sched_show_task+0x8bd/0x10e0 > [ 775.663395] [] sched_show+0x3e/0x80 > [ 775.663408] [] seq_read+0xdb/0x350 > [ 775.663421] [] ? security_file_permission+0x16/0x20 > [ 775.663435] [] vfs_read+0xc8/0x170 > [ 775.663447] [] sys_read+0x55/0x90 > [ 775.663460] [] system_call_fastpath+0x16/0x1b > ... This breakage was caused by me via: 7cbaef9: sched: optimize sched_clock() a bit Change the calls to cpu_clock(). Reported-by: Luis Henriques --- kernel/sched_debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 48ecc51e770..26ed8e3d1c1 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -423,10 +423,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #undef __P { + unsigned int this_cpu = raw_smp_processor_id(); u64 t0, t1; - t0 = sched_clock(); - t1 = sched_clock(); + t0 = cpu_clock(this_cpu); + t1 = cpu_clock(this_cpu); SEQ_printf(m, "%-35s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } -- cgit v1.2.3 From 5821e1b74f0d08952cb5da4bfd2d9a388d8df58e Mon Sep 17 00:00:00 2001 From: walimis Date: Sat, 15 Nov 2008 15:19:06 +0800 Subject: function tracing: fix wrong pos computing when read buffer has been fulfilled Impact: make output of available_filter_functions complete phenomenon: The first value of dyn_ftrace_total_info is not equal with `cat available_filter_functions | wc -l`, but they should be equal. root cause: When printing functions with seq_printf in t_show, if the read buffer is just overflowed by current function record, then this function won't be printed to user space through read buffer, it will just be dropped. So we can't see this function printing. So, every time the last function to fill the read buffer, if overflowed, will be dropped. This also applies to set_ftrace_filter if set_ftrace_filter has more bytes than read buffer. fix: Through checking return value of seq_printf, if less than 0, we know this function doesn't be printed. Then we decrease position to force this function to be printed next time, in next read buffer. Another little fix is to show correct allocating pages count. 
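The same idea can be sketched in plain userspace C, with invented names (records[], fill_buffer()) standing in for the seq_file machinery: when a record does not fit in the output buffer, the position is not advanced, so the same record is emitted again on the next read instead of being silently dropped.

    /*
     * Userspace sketch of the "do not drop the record that overflowed the
     * read buffer" idea; this is not the seq_file API.
     */
    #include <stdio.h>
    #include <string.h>

    static const char *records[] = { "alpha", "beta", "gamma", "delta", "epsilon" };
    #define NRECORDS (sizeof(records) / sizeof(records[0]))

    /* Fill buf with as many records as fit, starting at *pos; advance *pos. */
    static size_t fill_buffer(char *buf, size_t buflen, size_t *pos)
    {
            size_t used = 0;

            while (*pos < NRECORDS) {
                    int n = snprintf(buf + used, buflen - used, "%s\n", records[*pos]);

                    if (n < 0 || (size_t)n >= buflen - used) {
                            /* Did not fit: erase the partial write, keep *pos. */
                            buf[used] = '\0';
                            break;  /* this record is retried on the next read */
                    }
                    used += n;
                    (*pos)++;
            }
            return used;
    }

    int main(void)
    {
            char buf[16];   /* deliberately tiny to force overflows */
            size_t pos = 0;

            while (pos < NRECORDS) {
                    size_t n = fill_buffer(buf, sizeof(buf), &pos);

                    printf("read chunk (%zu bytes):\n%s", n, buf);
                    if (n == 0)
                            break;  /* buffer too small for even one record */
            }
            return 0;
    }
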
Signed-off-by: walimis Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 14fa52297b2..e60205722d0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -673,7 +673,7 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) cnt = num_to_init / ENTRIES_PER_PAGE; pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt); + num_to_init, cnt + 1); for (i = 0; i < cnt; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); @@ -753,13 +753,11 @@ static void *t_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l = -1; - if (*pos != iter->pos) { - for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) - ; - } else { - l = *pos; - p = t_next(m, p, &l); - } + if (*pos > iter->pos) + *pos = iter->pos; + + l = *pos; + p = t_next(m, p, &l); return p; } @@ -770,15 +768,21 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { + struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; + int ret = 0; if (!rec) return 0; kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - seq_printf(m, "%s\n", str); + ret = seq_printf(m, "%s\n", str); + if (ret < 0) { + iter->pos--; + iter->idx--; + } return 0; } @@ -804,7 +808,7 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; - iter->pos = -1; + iter->pos = 0; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -891,7 +895,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->pos = -1; + iter->pos = 0; iter->flags = enable ? FTRACE_ITER_FILTER : FTRACE_ITER_NOTRACE; -- cgit v1.2.3 From e14c8bf86350f6c39186a139c5c584a6111b2f01 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 17 Nov 2008 08:22:18 +1030 Subject: stop_machine: fix race with return value (fixes Bug #11989) Bug #11989: Suspend failure on NForce4-based boards due to chanes in stop_machine We should not access active.fnret outside the lock; in theory the next stop_machine could overwrite it. Signed-off-by: Rusty Russell Tested-by: "Rafael J. Wysocki" Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 9bc4c00872c..24e8ceacc38 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -112,7 +112,7 @@ static int chill(void *unused) int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { struct work_struct *sm_work; - int i; + int i, ret; /* Set up initial state. */ mutex_lock(&lock); @@ -137,8 +137,9 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) /* This will release the thread on our CPU. 
*/ put_cpu(); flush_workqueue(stop_machine_wq); + ret = active.fnret; mutex_unlock(&lock); - return active.fnret; + return ret; } int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) -- cgit v1.2.3 From ad133ba3dc283300e5b62b5b7211d2f39fbf6ee7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Nov 2008 15:39:47 +0100 Subject: sched, signals: fix the racy usage of ->signal in account_group_xxx/run_posix_cpu_timers Impact: fix potential NULL dereference Contrary to ad474caca3e2a0550b7ce0706527ad5ab389a4d4 changelog, other acct_group_xxx() helpers can be called after exit_notify() by timer tick. Thanks to Roland for pointing out this. Somehow I missed this simple fact when I read the original patch, and I am afraid I confused Frank during the discussion. Sorry. Fortunately, these helpers work with current, we can check ->exit_state to ensure that ->signal can't go away under us. Also, add the comment and compiler barrier to account_group_exec_runtime(), to make sure we load ->signal only once. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 7 +++++-- kernel/sched_stats.h | 15 +++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 153dcb2639c..895337b16a2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1308,9 +1308,10 @@ static inline int task_cputime_expired(const struct task_cputime *sample, */ static inline int fastpath_timer_check(struct task_struct *tsk) { - struct signal_struct *sig = tsk->signal; + struct signal_struct *sig; - if (unlikely(!sig)) + /* tsk == current, ensure it is safe to use ->signal/sighand */ + if (unlikely(tsk->exit_state)) return 0; if (!task_cputime_zero(&tsk->cputime_expires)) { @@ -1323,6 +1324,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) return 1; } + + sig = tsk->signal; if (!task_cputime_zero(&sig->cputime_expires)) { struct task_cputime group_sample; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index ee71bec1da6..7dbf72a2b02 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -298,9 +298,11 @@ static inline void account_group_user_time(struct task_struct *tsk, { struct signal_struct *sig; - sig = tsk->signal; - if (unlikely(!sig)) + /* tsk == current, ensure it is safe to use ->signal */ + if (unlikely(tsk->exit_state)) return; + + sig = tsk->signal; if (sig->cputime.totals) { struct task_cputime *times; @@ -325,9 +327,11 @@ static inline void account_group_system_time(struct task_struct *tsk, { struct signal_struct *sig; - sig = tsk->signal; - if (unlikely(!sig)) + /* tsk == current, ensure it is safe to use ->signal */ + if (unlikely(tsk->exit_state)) return; + + sig = tsk->signal; if (sig->cputime.totals) { struct task_cputime *times; @@ -353,8 +357,11 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, struct signal_struct *sig; sig = tsk->signal; + /* see __exit_signal()->task_rq_unlock_wait() */ + barrier(); if (unlikely(!sig)) return; + if (sig->cputime.totals) { struct task_cputime *times; -- cgit v1.2.3 From 65ecc14a30ad21bed9aabdfd6a2ae1a1aaaa6a00 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Sat, 15 Nov 2008 12:02:34 -0600 Subject: Remove -mno-spe flags as they dont belong For some unknown reason at Steven Rostedt added in disabling of the SPE instruction generation for e500 based PPC cores in commit 
6ec562328fda585be2d7f472cfac99d3b44d362a. We are removing it because: 1. It generates e500 kernels that don't work 2. its not the correct set of flags to do this 3. we handle this in the arch/powerpc/Makefile already 4. its unknown in talking to Steven why he did this Signed-off-by: Kumar Gala Tested-and-Acked-by: Steven Rostedt Signed-off-by: Linus Torvalds --- kernel/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d8..19fad003b19 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -mno-spe - ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg @@ -21,7 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_sched.o = -mno-spe -pg +CFLAGS_REMOVE_sched.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o -- cgit v1.2.3 From 700018e0a77b4113172257fcdaa1c58e27a5074f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Nov 2008 14:02:03 +0800 Subject: cpuset: fix regression when failed to generate sched domains Impact: properly rebuild sched-domains on kmalloc() failure When cpuset failed to generate sched domains due to kmalloc() failure, the scheduler should fallback to the single partition 'fallback_doms' and rebuild sched domains, but now it only destroys but not rebuilds sched domains. The regression was introduced by: | commit dfb512ec4834116124da61d6c1ee10fd0aa32bd6 | Author: Max Krasnyansky | Date: Fri Aug 29 13:11:41 2008 -0700 | | sched: arch_reinit_sched_domains() must destroy domains to force rebuild After the above commit, partition_sched_domains(0, NULL, NULL) will only destroy sched domains and partition_sched_domains(1, NULL, NULL) will create the default sched domain. Signed-off-by: Li Zefan Cc: Max Krasnyansky Cc: Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 12 ++++++++---- kernel/sched.c | 13 +++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e00526f52e..81fc6791a29 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -587,7 +587,6 @@ static int generate_sched_domains(cpumask_t **domains, int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - ndoms = 0; doms = NULL; dattr = NULL; csa = NULL; @@ -674,10 +673,8 @@ restart: * Convert to and populate cpu masks. */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); - if (!doms) { - ndoms = 0; + if (!doms) goto done; - } /* * The rest of the code, including the scheduler, can deal with @@ -732,6 +729,13 @@ restart: done: kfree(csa); + /* + * Fallback to the default domain if kmalloc() failed. + * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + *domains = doms; *attributes = dattr; return ndoms; diff --git a/kernel/sched.c b/kernel/sched.c index c94baf2969e..9b1e79371c2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7789,13 +7789,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * The passed in 'doms_new' should be kmalloc'd. This routine takes * ownership of it and will kfree it when done with it. 
If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL, - * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms', it also forces the domains to be rebuilt. + * failed the kmalloc call, then it can pass in doms_new == NULL && + * ndoms_new == 1, and partition_sched_domains() will fallback to + * the single partition 'fallback_doms', it also forces the domains + * to be rebuilt. * - * If doms_new==NULL it will be replaced with cpu_online_map. - * ndoms_new==0 is a special case for destroying existing domains. - * It will not create the default domain. + * If doms_new == NULL it will be replaced with cpu_online_map. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. * * Call with hotplug lock held */ -- cgit v1.2.3 From e270219f4372b58bd3eeac12bd9f7edc592b8f6b Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 18 Nov 2008 10:15:24 +0600 Subject: kernel/profile.c: fix section mismatch warning Impact: fix section mismatch warning in kernel/profile.c Here, profile_nop function has been called from a non-init function create_hash_tables(void). Which generetes a section mismatch warning. Previously, create_hash_tables(void) was a init function. So, removing __init from create_hash_tables(void) requires profile_nop to be non-init. This patch makes profile_nop function inline and fixes the following warning: WARNING: vmlinux.o(.text+0x6ebb6): Section mismatch in reference from the function create_hash_tables() to the function .init.text:profile_nop() The function create_hash_tables() references the function __init profile_nop(). This is often because create_hash_tables lacks a __init annotation or the annotation of profile_nop is wrong. Signed-off-by: Rakib Mullick Signed-off-by: Ingo Molnar --- kernel/profile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 9830a037d8d..5b7d1ac7124 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = { }; #ifdef CONFIG_SMP -static void __init profile_nop(void *unused) +static inline void profile_nop(void *unused) { } -- cgit v1.2.3 From 98ba4031ab2adc8b394295e68aa4c8fe9d5060db Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 14 Nov 2008 10:44:59 +0100 Subject: relay: fix cpu offline problem relay_open() will close allocated buffers when failed. but if cpu offlined, some buffer will not be closed. this patch fixed it. and did cleanup for relay_reset() too. 
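A small userspace sketch of the cleanup pattern the patch switches to, with invented names (bufs[], free_all_buffers()) rather than the relay API: walk every possible slot and free whatever was actually allocated, instead of walking only the currently online slots or stopping at the first empty entry, so a buffer whose CPU has since gone offline is not leaked.

    /* Illustrative only; not the relay API. */
    #include <stdio.h>
    #include <stdlib.h>

    #define NR_POSSIBLE_CPUS 8

    static char *bufs[NR_POSSIBLE_CPUS];

    static void free_all_buffers(void)
    {
            int cpu;

            /* Visit every possible slot; skip holes instead of breaking on them. */
            for (cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++) {
                    if (bufs[cpu]) {
                            free(bufs[cpu]);
                            bufs[cpu] = NULL;
                    }
            }
    }

    int main(void)
    {
            /* Simulate buffers allocated for CPUs 0, 1 and 5; 2-4 "offline". */
            bufs[0] = malloc(64);
            bufs[1] = malloc(64);
            bufs[5] = malloc(64);

            /*
             * A loop that did "if (!bufs[cpu]) break;" would stop at slot 2
             * and leak the buffer in slot 5; checking each slot does not.
             */
            free_all_buffers();
            printf("all allocated buffers released\n");
            return 0;
    }
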
Signed-off-by: Lai Jiangshan Signed-off-by: Jens Axboe --- kernel/relay.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 8d13a7855c0..32b0befdcb6 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -400,7 +400,7 @@ void relay_reset(struct rchan *chan) } mutex_lock(&relay_channels_mutex); - for_each_online_cpu(i) + for_each_possible_cpu(i) if (chan->buf[i]) __relay_reset(chan->buf[i], 0); mutex_unlock(&relay_channels_mutex); @@ -611,10 +611,9 @@ struct rchan *relay_open(const char *base_filename, return chan; free_bufs: - for_each_online_cpu(i) { - if (!chan->buf[i]) - break; - relay_close_buf(chan->buf[i]); + for_each_possible_cpu(i) { + if (chan->buf[i]) + relay_close_buf(chan->buf[i]); } kref_put(&chan->kref, relay_destroy_channel); -- cgit v1.2.3 From a6a0c4ca7edb378a8a7332501f097089cb1051c4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 18 Nov 2008 06:56:51 -0800 Subject: suspend: use WARN not WARN_ON to print the message By using WARN(), kerneloops.org can collect which component is causing the delay and make statistics about that. suspend_test_finish() is currently the number 2 item but unless we can collect who's causing it we're not going to be able to fix the hot topic ones.. Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 19122cf6d82..b8f7ce9473e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -174,7 +174,7 @@ static void suspend_test_finish(const char *label) * has some performance issues. The stack dump of a WARN_ON * is more likely to get the right attention than a printk... */ - WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); } #else -- cgit v1.2.3 From de11defebf00007677fb7ee91d9b089b78786fbb Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 19 Nov 2008 15:36:14 -0800 Subject: reintroduce accept4 Introduce a new accept4() system call. The addition of this system call matches analogous changes in 2.6.27 (dup3(), evenfd2(), signalfd4(), inotify_init1(), epoll_create1(), pipe2()) which added new system calls that differed from analogous traditional system calls in adding a flags argument that can be used to access additional functionality. The accept4() system call is exactly the same as accept(), except that it adds a flags bit-mask argument. Two flags are initially implemented. (Most of the new system calls in 2.6.27 also had both of these flags.) SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled for the new file descriptor returned by accept4(). This is a useful security feature to avoid leaking information in a multithreaded program where one thread is doing an accept() at the same time as another thread is doing a fork() plus exec(). More details here: http://udrepper.livejournal.com/20407.html "Secure File Descriptor Handling", Ulrich Drepper). The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag to be enabled on the new open file description created by accept4(). (This flag is merely a convenience, saving the use of additional calls fcntl(F_GETFL) and fcntl (F_SETFL) to achieve the same result. Here's a test program. Works on x86-32. Should work on x86-64, but I (mtk) don't have a system to hand to test with. 
It tests accept4() with each of the four possible combinations of SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies that the appropriate flags are set on the file descriptor/open file description returned by accept4(). I tested Ulrich's patch in this thread by applying against 2.6.28-rc2, and it passes according to my test program. /* test_accept4.c Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk Licensed under the GNU GPLv2 or later. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #define PORT_NUM 33333 #define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0) /**********************************************************************/ /* The following is what we need until glibc gets a wrapper for accept4() */ /* Flags for socket(), socketpair(), accept4() */ #ifndef SOCK_CLOEXEC #define SOCK_CLOEXEC O_CLOEXEC #endif #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK #endif #ifdef __x86_64__ #define SYS_accept4 288 #elif __i386__ #define USE_SOCKETCALL 1 #define SYS_ACCEPT4 18 #else #error "Sorry -- don't know the syscall # on this architecture" #endif static int accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags) { printf("Calling accept4(): flags = %x", flags); if (flags != 0) { printf(" ("); if (flags & SOCK_CLOEXEC) printf("SOCK_CLOEXEC"); if ((flags & SOCK_CLOEXEC) && (flags & SOCK_NONBLOCK)) printf(" "); if (flags & SOCK_NONBLOCK) printf("SOCK_NONBLOCK"); printf(")"); } printf("\n"); #if USE_SOCKETCALL long args[6]; args[0] = fd; args[1] = (long) sockaddr; args[2] = (long) addrlen; args[3] = flags; return syscall(SYS_socketcall, SYS_ACCEPT4, args); #else return syscall(SYS_accept4, fd, sockaddr, addrlen, flags); #endif } /**********************************************************************/ static int do_test(int lfd, struct sockaddr_in *conn_addr, int closeonexec_flag, int nonblock_flag) { int connfd, acceptfd; int fdf, flf, fdf_pass, flf_pass; struct sockaddr_in claddr; socklen_t addrlen; printf("=======================================\n"); connfd = socket(AF_INET, SOCK_STREAM, 0); if (connfd == -1) die("socket"); if (connect(connfd, (struct sockaddr *) conn_addr, sizeof(struct sockaddr_in)) == -1) die("connect"); addrlen = sizeof(struct sockaddr_in); acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen, closeonexec_flag | nonblock_flag); if (acceptfd == -1) { perror("accept4()"); close(connfd); return 0; } fdf = fcntl(acceptfd, F_GETFD); if (fdf == -1) die("fcntl:F_GETFD"); fdf_pass = ((fdf & FD_CLOEXEC) != 0) == ((closeonexec_flag & SOCK_CLOEXEC) != 0); printf("Close-on-exec flag is %sset (%s); ", (fdf & FD_CLOEXEC) ? "" : "not ", fdf_pass ? "OK" : "failed"); flf = fcntl(acceptfd, F_GETFL); if (flf == -1) die("fcntl:F_GETFD"); flf_pass = ((flf & O_NONBLOCK) != 0) == ((nonblock_flag & SOCK_NONBLOCK) !=0); printf("nonblock flag is %sset (%s)\n", (flf & O_NONBLOCK) ? "" : "not ", flf_pass ? "OK" : "failed"); close(acceptfd); close(connfd); printf("Test result: %s\n", (fdf_pass && flf_pass) ? 
"PASS" : "FAIL"); return fdf_pass && flf_pass; } static int create_listening_socket(int port_num) { struct sockaddr_in svaddr; int lfd; int optval; memset(&svaddr, 0, sizeof(struct sockaddr_in)); svaddr.sin_family = AF_INET; svaddr.sin_addr.s_addr = htonl(INADDR_ANY); svaddr.sin_port = htons(port_num); lfd = socket(AF_INET, SOCK_STREAM, 0); if (lfd == -1) die("socket"); optval = 1; if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)) == -1) die("setsockopt"); if (bind(lfd, (struct sockaddr *) &svaddr, sizeof(struct sockaddr_in)) == -1) die("bind"); if (listen(lfd, 5) == -1) die("listen"); return lfd; } int main(int argc, char *argv[]) { struct sockaddr_in conn_addr; int lfd; int port_num; int passed; passed = 1; port_num = (argc > 1) ? atoi(argv[1]) : PORT_NUM; memset(&conn_addr, 0, sizeof(struct sockaddr_in)); conn_addr.sin_family = AF_INET; conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); conn_addr.sin_port = htons(port_num); lfd = create_listening_socket(port_num); if (!do_test(lfd, &conn_addr, 0, 0)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, 0)) passed = 0; if (!do_test(lfd, &conn_addr, 0, SOCK_NONBLOCK)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, SOCK_NONBLOCK)) passed = 0; close(lfd); exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); } [mtk.manpages@gmail.com: rewrote changelog, updated test program] Signed-off-by: Ulrich Drepper Tested-by: Michael Kerrisk Acked-by: Michael Kerrisk Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a77b27b11b0..e14a2328170 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -31,7 +31,7 @@ cond_syscall(sys_socketpair); cond_syscall(sys_bind); cond_syscall(sys_listen); cond_syscall(sys_accept); -cond_syscall(sys_paccept); +cond_syscall(sys_accept4); cond_syscall(sys_connect); cond_syscall(sys_getsockname); cond_syscall(sys_getpeername); -- cgit v1.2.3 From f481891fdc49d3d1b8a9674a1825d183069a805f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 19 Nov 2008 15:36:30 -0800 Subject: cpuset: update top cpuset's mems after adding a node After adding a node into the machine, top cpuset's mems isn't updated. By reviewing the code, we found that the update function cpuset_track_online_nodes() was invoked after node_states[N_ONLINE] changes. It is wrong because N_ONLINE just means node has pgdat, and if node has/added memory, we use N_HIGH_MEMORY. So, We should invoke the update function after node_states[N_HIGH_MEMORY] changes, just like its commit says. This patch fixes it. And we use notifier of memory hotplug instead of direct calling of cpuset_track_online_nodes(). Signed-off-by: Miao Xie Acked-by: Yasunori Goto Cc: David Rientjes Cc: Paul Menage Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 81fc6791a29..da7ff6137f3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -2015,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. * See also the previous routine cpuset_track_online_cpus(). 
*/ -void cpuset_track_online_nodes(void) +static int cpuset_track_online_nodes(struct notifier_block *self, + unsigned long action, void *arg) { cgroup_lock(); - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - scan_for_empty_cpusets(&top_cpuset); + switch (action) { + case MEM_ONLINE: + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + break; + case MEM_OFFLINE: + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + scan_for_empty_cpusets(&top_cpuset); + break; + default: + break; + } cgroup_unlock(); + return NOTIFY_OK; } #endif @@ -2036,6 +2048,7 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); + hotplug_memory_notifier(cpuset_track_online_nodes, 10); } /** -- cgit v1.2.3 From 3fa59dfbc3b223f02c26593be69ce6fc9a940405 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 19 Nov 2008 15:36:34 -0800 Subject: cgroup: fix potential deadlock in pre_destroy As Balbir pointed out, memcg's pre_destroy handler has potential deadlock. It has following lock sequence. cgroup_mutex (cgroup_rmdir) -> pre_destroy -> mem_cgroup_pre_destroy-> force_empty -> cpu_hotplug.lock. (lru_add_drain_all-> schedule_work-> get_online_cpus) But, cpuset has following. cpu_hotplug.lock (call notifier) -> cgroup_mutex. (within notifier) Then, this lock sequence should be fixed. Considering how pre_destroy works, it's not necessary to holding cgroup_mutex() while calling it. As a side effect, we don't have to wait at this mutex while memcg's force_empty works.(it can be long when there are tons of pages.) Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Li Zefan Cc: Paul Menage Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 358e77564e6..1a06be61dcd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2472,10 +2472,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_unlock(&cgroup_mutex); return -EBUSY; } - - parent = cgrp->parent; - root = cgrp->root; - sb = root->sb; + mutex_unlock(&cgroup_mutex); /* * Call pre_destroy handlers of subsys. Notify subsystems @@ -2483,7 +2480,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) */ cgroup_call_pre_destroy(cgrp); - if (cgroup_has_css_refs(cgrp)) { + mutex_lock(&cgroup_mutex); + parent = cgrp->parent; + root = cgrp->root; + sb = root->sb; + + if (atomic_read(&cgrp->count) + || !list_empty(&cgrp->children) + || cgroup_has_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } -- cgit v1.2.3 From 966c8c12dc9e77f931e2281ba25d2f0244b06949 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 19 Nov 2008 15:36:36 -0800 Subject: sprint_symbol(): use less stack sprint_symbol(), itself used when dumping stacks, has been wasting 128 bytes of stack: lookup the symbol directly into the buffer supplied by the caller, instead of using a locally declared namebuf. I believe the name != buffer strcpy() is obsolete: the design here dates from when module symbol lookup pointed into a supposedly const but sadly volatile table; nowadays it copies, but an uncalled strcpy() looks better here than the risk of a recursive BUG_ON(). 
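A userspace sketch of that pattern, with an invented lookup_symbol() standing in for kallsyms_lookup(): the caller's buffer doubles as the lookup scratch space and the "+offset/size" decoration is appended in place, so no 128-byte local array is needed.

    /* Illustrative only; the lookup helper and sizes are made up. */
    #include <stdio.h>
    #include <string.h>

    #define NAME_LEN 128

    /* Pretend lookup: writes the symbol name into buf and returns it. */
    static const char *lookup_symbol(unsigned long addr, unsigned long *offset,
                                     unsigned long *size, char *buf)
    {
            snprintf(buf, NAME_LEN, "demo_function");       /* stand-in result */
            *offset = addr & 0xff;
            *size = 0x200;
            return buf;
    }

    static int sprint_demo_symbol(char *buffer, unsigned long address)
    {
            unsigned long offset, size;
            const char *name;
            int len;

            /* The caller's buffer is used directly as the lookup scratch space. */
            name = lookup_symbol(address, &offset, &size, buffer);
            if (!name)
                    return sprintf(buffer, "0x%lx", address);

            /* Defensive copy, mirroring the kernel helper; normally name == buffer. */
            if (name != buffer)
                    strcpy(buffer, name);
            len = strlen(buffer);

            /* Append "+offset/size" after the name, still in the same buffer. */
            len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
            return len;
    }

    int main(void)
    {
            char buf[NAME_LEN + 32];

            sprint_demo_symbol(buf, 0xc0123456UL);
            printf("%s\n", buf);
            return 0;
    }
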
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5072cf1685a..7b8b0f21a5b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -304,17 +304,24 @@ int sprint_symbol(char *buffer, unsigned long address) char *modname; const char *name; unsigned long offset, size; - char namebuf[KSYM_NAME_LEN]; + int len; - name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); + name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) return sprintf(buffer, "0x%lx", address); + if (name != buffer) + strcpy(buffer, name); + len = strlen(buffer); + buffer += len; + if (modname) - return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, - size, modname); + len += sprintf(buffer, "+%#lx/%#lx [%s]", + offset, size, modname); else - return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); + len += sprintf(buffer, "+%#lx/%#lx", offset, size); + + return len; } /* Look up a kernel symbol and print it to the kernel messages. */ -- cgit v1.2.3 From 33d283bef23132c48195eafc21449f8ba88fce6b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Nov 2008 15:36:48 -0800 Subject: cgroups: fix a serious bug in cgroupstats Try this, and you'll get oops immediately: # cd Documentation/accounting/ # gcc -o getdelays getdelays.c # mount -t cgroup -o debug xxx /mnt # ./getdelays -C /mnt/tasks Because a normal file's dentry->d_fsdata is a pointer to struct cftype, not struct cgroup. After the patch, it returns EINVAL if we try to get cgroupstats from a normal file. Cc: Balbir Singh Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1a06be61dcd..fe00b3b983a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2039,10 +2039,13 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) struct cgroup *cgrp; struct cgroup_iter it; struct task_struct *tsk; + /* - * Validate dentry by checking the superblock operations + * Validate dentry by checking the superblock operations, + * and make sure it's a directory. */ - if (dentry->d_sb->s_op != &cgroup_ops) + if (dentry->d_sb->s_op != &cgroup_ops || + !S_ISDIR(dentry->d_inode->i_mode)) goto err; ret = 0; -- cgit v1.2.3
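
The defensive check that fix adds on the kernel side can be illustrated from userspace with plain POSIX calls (this does not talk to cgroupstats, and the helper name is invented): verify that a path really is a directory, as getdelays -C expects of a cgroup, before acting on it, so a control file such as "tasks" is rejected instead of being misinterpreted.

    #include <stdio.h>
    #include <sys/stat.h>

    static int looks_like_cgroup_dir(const char *path)
    {
            struct stat st;

            if (stat(path, &st) != 0) {
                    perror(path);
                    return 0;
            }
            /* A control file such as "tasks" fails this test; a cgroup passes. */
            return S_ISDIR(st.st_mode);
    }

    int main(int argc, char *argv[])
    {
            const char *path = argc > 1 ? argv[1] : "/mnt";

            if (!looks_like_cgroup_dir(path)) {
                    fprintf(stderr, "%s: not a cgroup directory\n", path);
                    return 1;
            }
            printf("%s looks like a cgroup directory\n", path);
            return 0;
    }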