From 39a2eddb9b62959dc55c6978b5eaeb3dd57c5ff2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 May 2009 14:35:54 -0400 Subject: genirq: fix comment to say IRQ_WAKE_THREAD Trying to implement a driver to use threaded irqs, I was confused when the return value to use that was described in the comment above request_threaded_irq was not defined. Turns out that the enum is IRQ_WAKE_THREAD where as the comment said IRQ_THREAD_WAKE. [Impact: do not confuse developers with wrong comments ] Signed-off-by: Steven Rostedt LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2734eca5924..eb47f8b8055 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -851,7 +851,7 @@ EXPORT_SYMBOL(free_irq); * still called in hard interrupt context and has to check * whether the interrupt originates from the device. If yes it * needs to disable the interrupt on the device and return - * IRQ_THREAD_WAKE which will wake up the handler thread and run + * IRQ_WAKE_THREAD which will wake up the handler thread and run * @thread_fn. This split handler design is necessary to support * shared interrupts. * -- cgit v1.2.3 From f2e21c9610991e95621a81407cdbab881226419b Mon Sep 17 00:00:00 2001 From: Eero Nurkkala Date: Mon, 25 May 2009 09:57:37 +0300 Subject: NOHZ: Properly feed cpufreq ondemand governor A call from irq_exit() may occasionally pause the timing info for cpufreq ondemand governor. This results in the cpufreq ondemand governor to fail to calculate the system load properly. Thus, relocate the checks for this particular case to keep the governor always functional. Signed-off-by: Eero Nurkkala Reported-by: Tero Kristo Acked-by: Rik van Riel Acked-by: Venkatesh Pallipadi Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d3f1ef4d5cb..a3562ce5498 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle) cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); + + /* + * Call to tick_nohz_start_idle stops the last_update_time from being + * updated. Thus, it must not be called in the event we are called from + * irq_exit() with the prior state different than idle. + */ + if (!inidle && !ts->inidle) + goto end; + now = tick_nohz_start_idle(ts); /* @@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - if (!inidle && !ts->inidle) - goto end; - ts->inidle = 1; if (need_resched()) -- cgit v1.2.3 From 87847b8f26cc7176ec9b239898dc7ce47a94e1a6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 13 Jun 2009 17:06:50 +1000 Subject: perf_counter: Fix atomic_set vs. atomic64_t type mismatch Using atomic_set on an atomic64_t variable gives a compiler warning on powerpc, and won't give the desired result at runtime. This fixes an instance of this error in the perf_counter code. 
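For illustration only (this snippet is not part of the patch): hwc->period_left is declared atomic64_t in struct hw_perf_counter, so the 64-bit accessors must be used on it.

	atomic64_t period_left;				/* as in struct hw_perf_counter */

	atomic64_set(&period_left, 0);			/* full 64-bit store */
	s64 left = atomic64_read(&period_left);	/* full 64-bit load  */
	/* atomic_set() takes an atomic_t and stores only 32 bits, hence the
	 * powerpc warning and the wrong value at runtime. */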
Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18995.20490.979429.244883@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 29b685f551a..8d14a733f22 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1283,7 +1283,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) if (!interrupts) { perf_disable(); counter->pmu->disable(counter); - atomic_set(&hwc->period_left, 0); + atomic64_set(&hwc->period_left, 0); counter->pmu->enable(counter); perf_enable(); } -- cgit v1.2.3 From d5e8da6449d4ef4bac35ea9b9719a2cda02e7b39 Mon Sep 17 00:00:00 2001 From: Marti Raudsepp Date: Sat, 13 Jun 2009 02:35:01 +0300 Subject: perf_counter: Fix stack corruption in perf_read_hw With PERF_FORMAT_ID, perf_read_hw now needs space for up to 4 values. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8d14a733f22..e914daff03b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1553,7 +1553,7 @@ static int perf_release(struct inode *inode, struct file *file) static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 values[3]; + u64 values[4]; int n; /* -- cgit v1.2.3 From 75f937f24bd9c003dcb9d7d5509f23459f1f6000 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2009 15:05:12 +0200 Subject: perf_counter: Fix ctx->mutex vs counter->mutex inversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simon triggered a lockdep inversion report about us taking ctx->mutex vs counter->mutex in inverse orders. Fix that up. 
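A sketch of the lock order the patch settles on (illustrative, not a hunk from the diff below): the context mutex is always the outer lock and the per-counter child_mutex nests inside it.

	mutex_lock(&ctx->mutex);		/* per-context, taken first   */
	mutex_lock(&counter->child_mutex);	/* per-counter, nested inside */
	/* ... walk the group leader, its siblings and inherited children ... */
	mutex_unlock(&counter->child_mutex);
	mutex_unlock(&ctx->mutex);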
Reported-by: Simon Holm Thøgersen Tested-by: Simon Holm Thøgersen Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e914daff03b..109a9572385 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1620,22 +1620,6 @@ static void perf_counter_reset(struct perf_counter *counter) perf_counter_update_userpage(counter); } -static void perf_counter_for_each_sibling(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - counter = counter->group_leader; - - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, list_entry) - func(sibling); - mutex_unlock(&ctx->mutex); -} - /* * Holding the top-level counter's child_mutex means that any * descendant process that has inherited this counter will block @@ -1658,14 +1642,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter, static void perf_counter_for_each(struct perf_counter *counter, void (*func)(struct perf_counter *)) { - struct perf_counter *child; + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *sibling; - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - perf_counter_for_each_sibling(counter, func); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_for_each_sibling(child, func); - mutex_unlock(&counter->child_mutex); + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + counter = counter->group_leader; + + perf_counter_for_each_child(counter, func); + func(counter); + list_for_each_entry(sibling, &counter->sibling_list, list_entry) + perf_counter_for_each_child(counter, func); + mutex_unlock(&ctx->mutex); } static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) -- cgit v1.2.3 From 3f237a79ddeea34dda67e9eedece3a22918df75e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:15:30 +0930 Subject: cpumask: use new operators in kernel/trace Signed-off-by: Rusty Russell LKML-Reference: <200906122115.30787.rusty@rustcorp.com.au> Signed-off-by: Steven Rostedt --- kernel/trace/kmemtrace.c | 2 +- kernel/trace/ring_buffer.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 86cdf671d7e..1edaa9516e8 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -186,7 +186,7 @@ static int kmem_trace_init(struct trace_array *tr) int cpu; kmemtrace_array = tr; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); kmemtrace_start_probes(); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2e642b2b725..9c31c9f6b93 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3105,7 +3105,7 @@ static int rb_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (cpu_isset(cpu, *buffer->cpumask)) + if (cpumask_test_cpu(cpu, buffer->cpumask)) return NOTIFY_OK; buffer->buffers[cpu] = @@ -3116,7 +3116,7 @@ static int rb_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } smp_wmb(); 
- cpu_set(cpu, *buffer->cpumask); + cpumask_set_cpu(cpu, buffer->cpumask); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: -- cgit v1.2.3 From 215368e8e59023d6a0abdda896923018d74fdf7f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 15 Jun 2009 10:56:42 +0800 Subject: tracing: fix a typo in tracing_cpumask_write() It's tracing_cpumask_new that should be kfree()ed. This causes tracing_cpumask to be freed due to the typo: # echo z > tracing_cpumask bash: echo: write error: Invalid argument And subsequent reads/writes to tracing_cpuamsk will access this already-freed tracing_cpumask, thus may lead to crash. [ Impact: fix leak and crash when writing invalid val to tracing_cpumask ] Acked-by: Frederic Weisbecker Signed-off-by: Li Zefan LKML-Reference: <4A35B86A.7070608@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8acd9b81a5d..7355a38e18b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2191,11 +2191,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; - mutex_lock(&tracing_cpumask_update_lock); err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); if (err) goto err_unlock; + mutex_lock(&tracing_cpumask_update_lock); + local_irq_disable(); __raw_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { @@ -2223,8 +2224,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, return count; err_unlock: - mutex_unlock(&tracing_cpumask_update_lock); - free_cpumask_var(tracing_cpumask); + free_cpumask_var(tracing_cpumask_new); return err; } -- cgit v1.2.3 From e4f2d10f479d18198ebafcb5e124cc3dd8b8817a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 15 Jun 2009 10:57:28 +0800 Subject: tracing: replace a GFP_ATOMIC with GFP_KERNEL allocation Atomic allocation is not needed here. [ Impact: clean up of memory alloction type ] Acked-by: Frederic Weisbecker Signed-off-by: Li Zefan LKML-Reference: <4A35B898.2050607@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7355a38e18b..0f57f1b443b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3627,7 +3627,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, struct trace_seq *s; unsigned long cnt; - s = kmalloc(sizeof(*s), GFP_ATOMIC); + s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return ENOMEM; -- cgit v1.2.3 From 5e4904cb633177046bee5d26946a7ac918e642fc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 15 Jun 2009 10:58:39 +0800 Subject: tracing/filters: operand can be negative This should be a bug: # cat format name: foo_bar ID: 71 format: ... 
field:int bar; offset:24; size:4; # echo 'bar < 0' > filter # echo 'bar < -1' > filter bash: echo: write error: Invalid argument [ Impact: fix to allow negative operand in filter expr ] Acked-by: Frederic Weisbecker Signed-off-by: Li Zefan LKML-Reference: <4A35B8DF.60400@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index db6e54bdb59..1d819230484 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -546,6 +546,7 @@ static int filter_add_pred(struct filter_parse_state *ps, filter_pred_fn_t fn; unsigned long long val; int string_type; + int ret; pred->fn = filter_pred_none; @@ -581,7 +582,11 @@ static int filter_add_pred(struct filter_parse_state *ps, pred->not = 1; return filter_add_pred_fn(ps, call, pred, fn); } else { - if (strict_strtoull(pred->str_val, 0, &val)) { + if (field->is_signed) + ret = strict_strtoll(pred->str_val, 0, &val); + else + ret = strict_strtoull(pred->str_val, 0, &val); + if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; } -- cgit v1.2.3
From 0ac2058f686a19fe8ab25c4f3104fc1580dce7cf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 15 Jun 2009 10:59:17 +0800 Subject: tracing/filters: strloc should be unsigned short I forgot to update the filter code accordingly in "tracing/events: change the type of __str_loc_item to unsigned short" (commit b0aae68cc5508f3c2fbf728988c954db4c8b8a53) It can cause a system crash: # echo 1 > tracing/events/irq/irq_handler_entry/enable # echo 'name == eth0' > tracing/events/irq/irq_handler_entry/filter [ Impact: fix crash while filtering on __string() field ] Acked-by: Frederic Weisbecker Signed-off-by: Li Zefan LKML-Reference: <4A35B905.3090500@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1d819230484..b24ab0e6ea7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -178,7 +178,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, static int filter_pred_strloc(struct filter_pred *pred, void *event, int val1, int val2) { - int str_loc = *(int *)(event + pred->offset); + unsigned short str_loc = *(unsigned short *)(event + pred->offset); char *addr = (char *)(event + str_loc); int cmp, match; -- cgit v1.2.3
From c7b0930857e2278f2e7714db6294e94c57f623b0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Jun 2009 11:12:00 -0400 Subject: ring-buffer: prevent adding write in discarded area This is a very tight race where an interrupt could come in and not have enough data to put into the end of a buffer page, so that it fails to write and needs to go to the next page. But if this happened when another writer was about to reserve its data, and that writer has smaller data to reserve, then it could succeed even though the interrupt moved the tail page. To prevent that, if we fail to store data, and by subtracting the amount we reserved we still have room for smaller data, we need to fill that space with "discarded" data.
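A worked example of the behaviour added below (the page size is only illustrative; RB_EVNT_MIN_SIZE is the 8-byte minimum event size introduced by this patch):

	Assume BUF_PAGE_SIZE = 4080.
	- tail = 4000, reserved length = 96: the write runs past the page, so the
	  writer moves to the next page. Pulling the write index back by 96 would
	  leave 80 usable bytes at the end of this page, enough for a writer with a
	  smaller reservation to succeed there even though the tail page has already
	  moved on. Instead, a padding ("discarded") event covering offsets
	  4000..4079 is written and the write index is left at the end of the page.
	- tail = 4076, reserved length = 96: only 4 bytes remain, less than
	  RB_EVNT_MIN_SIZE, so no event can fit there; the remainder is marked as
	  padding and the write index is simply pulled back by the reserved length.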
[ Impact: prevent race were buffer data may be lost ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 68 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c31c9f6b93..dbc0f93396a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -205,6 +205,7 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -1170,6 +1171,59 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } +static inline void +rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + unsigned long tail, unsigned long length) +{ + struct ring_buffer_event *event; + + /* + * Only the event that crossed the page boundary + * must fill the old tail_page with padding. + */ + if (tail >= BUF_PAGE_SIZE) { + local_sub(length, &tail_page->write); + return; + } + + event = __rb_page_index(tail_page, tail); + + /* + * If this event is bigger than the minimum size, then + * we need to be careful that we don't subtract the + * write counter enough to allow another writer to slip + * in on this page. + * We put in a discarded commit instead, to make sure + * that this space is not used again. + * + * If we are less than the minimum size, we don't need to + * worry about it. + */ + if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + /* No room for any events */ + + /* Mark the rest of the page with padding */ + rb_event_set_padding(event); + + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); + return; + } + + /* Put in a discarded event */ + event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + event->time_delta = 1; + /* Account for this as an entry */ + local_inc(&tail_page->entries); + local_inc(&cpu_buffer->entries); + + /* Set write to end of buffer */ + length = (tail + length) - BUF_PAGE_SIZE; + local_sub(length, &tail_page->write); +} static struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, @@ -1264,17 +1318,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page->page->time_stamp = *ts; } - /* - * The actual tail page has moved forward. 
- */ - if (tail < BUF_PAGE_SIZE) { - /* Mark the rest of the page with padding */ - event = __rb_page_index(tail_page, tail); - rb_event_set_padding(event); - } - - /* Set the write back to the previous setting */ - local_sub(length, &tail_page->write); + rb_reset_tail(cpu_buffer, tail_page, tail, length); /* * If this was a commit entry that failed, @@ -1293,7 +1337,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_reset: /* reset write */ - local_sub(length, &tail_page->write); + rb_reset_tail(cpu_buffer, tail_page, tail, length); if (likely(lock_taken)) __raw_spin_unlock(&cpu_buffer->lock); -- cgit v1.2.3 From 9086c7b90abbf4ec29543e8f2424e3ecd14e955d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Jun 2009 11:46:09 -0400 Subject: ring-buffer: have benchmark test handle discarded events With the addition of commit: c7b0930857e2278f2e7714db6294e94c57f623b0 ring-buffer: prevent adding write in discarded area The ring buffer may now add discarded events when a write passes the end of a buffer page. Before, a discarded event was only added when the tracer deliberately created one. The ring buffer benchmark test does not handle discarded events when it reads the buffer and fails when it encounters one. Also fix the increment for large data entries (luckily, the test did not add any yet). [ Impact: fix false failure of ring buffer self test ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 8d68e149a8b..cf6b0f50134 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -102,8 +102,10 @@ static enum event_status read_page(int cpu) event = (void *)&rpage->data[i]; switch (event->type_len) { case RINGBUF_TYPE_PADDING: - /* We don't expect any padding */ - KILL_TEST(); + /* failed writes may be discarded events */ + if (!event->time_delta) + KILL_TEST(); + inc = event->array[0] + 4; break; case RINGBUF_TYPE_TIME_EXTEND: inc = 8; @@ -119,7 +121,7 @@ static enum event_status read_page(int cpu) KILL_TEST(); break; } - inc = event->array[0]; + inc = event->array[0] + 4; break; default: entry = ring_buffer_event_data(event); -- cgit v1.2.3 From 263294f3e1e883b9dcbf0c09a54b644918f7729d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Jun 2009 11:50:18 -0400 Subject: ring-buffer: remove unused variable Fix the compiler error: kernel/trace/ring_buffer.c: In function 'rb_move_tail': kernel/trace/ring_buffer.c:1236: warning: unused variable 'event' Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dbc0f93396a..e857e944398 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1233,7 +1233,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, { struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; - struct ring_buffer_event *event; bool lock_taken = false; unsigned long flags; -- cgit v1.2.3 From fa7439531dee58277748c819785a44d3203c4b51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Jun 2009 12:37:57 -0400 Subject: ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. 
The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 154 ++++++++++++++++++++++----------------------- 1 file changed, 75 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e857e944398..ed3559944fc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -415,6 +415,8 @@ struct ring_buffer_per_cpu { unsigned long overrun; unsigned long read; local_t entries; + local_t committing; + local_t commits; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -1015,8 +1017,8 @@ rb_event_index(struct ring_buffer_event *event) } static inline int -rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; unsigned long index; @@ -1028,31 +1030,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_commit_index(cpu_buffer) == index; } -static void -rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - while (cpu_buffer->commit_page->page != (void *)addr) { - if (RB_WARN_ON(cpu_buffer, - cpu_buffer->commit_page == cpu_buffer->tail_page)) - return; - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - } - - /* Now set the commit to the event's index */ - local_set(&cpu_buffer->commit_page->page->commit, index); -} - static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { @@ -1319,15 +1296,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_tail(cpu_buffer, tail_page, tail, length); - /* - * If this was a commit entry that failed, - * increment that too - */ - if (tail_page == cpu_buffer->commit_page && - tail == rb_commit_index(cpu_buffer)) { - 
rb_set_commit_to_write(cpu_buffer); - } - __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); @@ -1377,11 +1345,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, local_inc(&tail_page->entries); /* - * If this is a commit and the tail is zero, then update - * this page's time stamp. + * If this is the first commit on the page, then update + * its timestamp. */ - if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->page->time_stamp = *ts; + if (!tail) + tail_page->page->time_stamp = *ts; return event; } @@ -1450,16 +1418,16 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return -EAGAIN; /* Only a commited time event can update the write stamp */ - if (rb_is_commit(cpu_buffer, event)) { + if (rb_event_is_commit(cpu_buffer, event)) { /* - * If this is the first on the page, then we need to - * update the page itself, and just put in a zero. + * If this is the first on the page, then it was + * updated with the page itself. Try to discard it + * and if we can't just make it zero. */ if (rb_event_index(event)) { event->time_delta = *delta & TS_MASK; event->array[0] = *delta >> TS_SHIFT; } else { - cpu_buffer->commit_page->page->time_stamp = *ts; /* try to discard, since we do not need this */ if (!rb_try_to_discard(cpu_buffer, event)) { /* nope, just zero it */ @@ -1485,6 +1453,44 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return ret; } +static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->committing); + local_inc(&cpu_buffer->commits); +} + +static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long commits; + + if (RB_WARN_ON(cpu_buffer, + !local_read(&cpu_buffer->committing))) + return; + + again: + commits = local_read(&cpu_buffer->commits); + /* synchronize with interrupts */ + barrier(); + if (local_read(&cpu_buffer->committing) == 1) + rb_set_commit_to_write(cpu_buffer); + + local_dec(&cpu_buffer->committing); + + /* synchronize with interrupts */ + barrier(); + + /* + * Need to account for interrupts coming in between the + * updating of the commit page and the clearing of the + * committing counter. + */ + if (unlikely(local_read(&cpu_buffer->commits) != commits) && + !local_read(&cpu_buffer->committing)) { + local_inc(&cpu_buffer->committing); + goto again; + } +} + static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) @@ -1494,6 +1500,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, int commit = 0; int nr_loops = 0; + rb_start_commit(cpu_buffer); + length = rb_calculate_event_length(length); again: /* @@ -1506,7 +1514,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * Bail! */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) - return NULL; + goto out_fail; ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); @@ -1537,7 +1545,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); if (commit == -EBUSY) - return NULL; + goto out_fail; if (commit == -EAGAIN) goto again; @@ -1551,28 +1559,19 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; - if (!event) { - if (unlikely(commit)) - /* - * Ouch! We needed a timestamp and it was commited. But - * we didn't get our event reserved. 
- */ - rb_set_commit_to_write(cpu_buffer); - return NULL; - } + if (!event) + goto out_fail; - /* - * If the timestamp was commited, make the commit our entry - * now so that we will update it when needed. - */ - if (unlikely(commit)) - rb_set_commit_event(cpu_buffer, event); - else if (!rb_is_commit(cpu_buffer, event)) + if (!rb_event_is_commit(cpu_buffer, event)) delta = 0; event->time_delta = delta; return event; + + out_fail: + rb_end_commit(cpu_buffer); + return NULL; } #define TRACE_RECURSIVE_DEPTH 16 @@ -1682,13 +1681,14 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, { local_inc(&cpu_buffer->entries); - /* Only process further if we own the commit */ - if (!rb_is_commit(cpu_buffer, event)) - return; - - cpu_buffer->write_stamp += event->time_delta; + /* + * The event first in the commit queue updates the + * time stamp. + */ + if (rb_event_is_commit(cpu_buffer, event)) + cpu_buffer->write_stamp += event->time_delta; - rb_set_commit_to_write(cpu_buffer); + rb_end_commit(cpu_buffer); } /** @@ -1777,15 +1777,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, /* The event is discarded regardless */ rb_event_discard(event); + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* * This must only be called if the event has not been * committed yet. Thus we can assume that preemption * is still disabled. */ - RB_WARN_ON(buffer, preemptible()); - - cpu = smp_processor_id(); - cpu_buffer = buffer->buffers[cpu]; + RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); if (!rb_try_to_discard(cpu_buffer, event)) goto out; @@ -1796,13 +1796,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ local_inc(&cpu_buffer->entries); out: - /* - * If a write came in and pushed the tail page - * we still need to update the commit pointer - * if we were the commit. 
- */ - if (rb_is_commit(cpu_buffer, event)) - rb_set_commit_to_write(cpu_buffer); + rb_end_commit(cpu_buffer); trace_recursive_unlock(); @@ -2720,6 +2714,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->overrun = 0; cpu_buffer->read = 0; local_set(&cpu_buffer->entries, 0); + local_set(&cpu_buffer->committing, 0); + local_set(&cpu_buffer->commits, 0); cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; -- cgit v1.2.3 From 57be88878e7aa38750384704d811485a607bbda4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 16 Jun 2009 16:39:12 +0800 Subject: tracing/filters: free filter_string in destroy_preds() filter->filter_string is not freed when unloading a module: # insmod trace-events-sample.ko # echo "bar < 100" > /mnt/tracing/events/sample/foo_bar/filter # rmmod trace-events-sample.ko [ Impact: fix memory leak when unloading module ] Signed-off-by: Li Zefan LKML-Reference: <4A375A30.9060802@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b24ab0e6ea7..d9f01c1a042 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -381,6 +381,7 @@ void destroy_preds(struct ftrace_event_call *call) filter_free_pred(filter->preds[i]); } kfree(filter->preds); + kfree(filter->filter_string); kfree(filter); call->filter = NULL; } -- cgit v1.2.3 From 00e95830a4d6e49f764fdb19896a89199bc0aa3b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 16 Jun 2009 16:39:41 +0800 Subject: tracing/filters: fix race between filter setting and module unload Module unload is protected by event_mutex, while setting filter is protected by filter_mutex. This leads to the race: echo 'bar == 0 || bar == 10' \ | > sample/filter | | insmod sample.ko add_pred("bar == 0") | -> n_preds == 1 | add_pred("bar == 100") | -> n_preds == 2 | | rmmod sample.ko | insmod sample.ko add_pred("&&") | -> n_preds == 1 (should be 3) | Now event->filter->preds is corrupted. An then when filter_match_preds() is called, the WARN_ON() in it will be triggered. To avoid the race, we remove filter_mutex, and replace it with event_mutex. 
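A rough sketch of the serialization this results in (illustrative only, not a hunk from the patch):

	/* filter write path, after this patch */
	mutex_lock(&event_mutex);
	filter_free_subsystem_preds(system);
	/* ... parse the expression and install the new predicates ... */
	mutex_unlock(&event_mutex);

	/* module load/unload: the trace-event module notifier already adds and
	 * removes events under event_mutex, so the interleaving shown in the
	 * diagram above can no longer happen. */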
[ Impact: prevent corruption of filters by module removing and loading ] Signed-off-by: Li Zefan LKML-Reference: <4A375A4D.6000205@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d9f01c1a042..936c621bbf4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -27,8 +27,6 @@ #include "trace.h" #include "trace_output.h" -static DEFINE_MUTEX(filter_mutex); - enum filter_op_ids { OP_OR, @@ -294,12 +292,12 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { struct event_filter *filter = call->filter; - mutex_lock(&filter_mutex); + mutex_lock(&event_mutex); if (filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); } void print_subsystem_event_filter(struct event_subsystem *system, @@ -307,12 +305,12 @@ void print_subsystem_event_filter(struct event_subsystem *system, { struct event_filter *filter = system->filter; - mutex_lock(&filter_mutex); + mutex_lock(&event_mutex); if (filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); } static struct ftrace_event_field * @@ -434,7 +432,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) filter->n_preds = 0; } - mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -444,7 +441,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) remove_filter_string(call->filter); } } - mutex_unlock(&event_mutex); } static int filter_add_pred_fn(struct filter_parse_state *ps, @@ -631,7 +627,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, filter->preds[filter->n_preds] = pred; filter->n_preds++; - mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -642,14 +637,12 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, err = filter_add_pred(ps, call, pred); if (err) { - mutex_unlock(&event_mutex); filter_free_subsystem_preds(system); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); goto out; } replace_filter_string(call->filter, filter_string); } - mutex_unlock(&event_mutex); out: return err; } @@ -1076,12 +1069,12 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) struct filter_parse_state *ps; - mutex_lock(&filter_mutex); + mutex_lock(&event_mutex); if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); return 0; } @@ -1109,7 +1102,7 @@ out: postfix_clear(ps); kfree(ps); out_unlock: - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); return err; } @@ -1121,12 +1114,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, struct filter_parse_state *ps; - mutex_lock(&filter_mutex); + mutex_lock(&event_mutex); if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); - mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); return 0; } @@ -1154,7 +1147,7 @@ out: postfix_clear(ps); kfree(ps); out_unlock: - 
mutex_unlock(&filter_mutex); + mutex_unlock(&event_mutex); return err; } -- cgit v1.2.3 From 22f470f8daea64bc03be1fe30c8c5df382295386 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Jun 2009 09:29:58 -0400 Subject: ring-buffer: use BUF_PAGE_HDR_SIZE in calculating index The index of the event is found by masking PAGE_MASK to it and subtracting the header size. Currently the header size is calculate by PAGE_SIZE - BUF_PAGE_SIZE, when we already have a macro BUF_PAGE_HDR_SIZE to define it. If we want to change BUF_PAGE_SIZE to something less than filling the rest of the page (this is done for debugging), then we break the algorithm to find the index. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ed3559944fc..6b17a11e42a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1013,7 +1013,7 @@ rb_event_index(struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; - return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); + return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } static inline int -- cgit v1.2.3 From c6a9d7b55e2df63de012a9a285bf2a0bee8e4d59 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Jun 2009 09:49:15 -0400 Subject: ring-buffer: remove useless warn on check A check if "write > BUF_PAGE_SIZE" is done right after a if (write > BUF_PAGE_SIZE) return ...; Thus the check is actually testing the compiler and not the kernel. This is useless, remove it. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6b17a11e42a..6cf340e1a4a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1334,9 +1334,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ - if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) - return NULL; - event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); -- cgit v1.2.3 From fd5e1b5dbaa8b4aacc0e251d74182eda37062194 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 15 Jun 2009 13:34:19 +0800 Subject: sched: Remove unneeded __ref tag Those two functions no longer call alloc_bootmmem_cpumask_var(), so no need to tag them with __init_refok. Signed-off-by: Li Zefan Acked-by: Pekka Enberg LKML-Reference: <4A35DD5B.9050106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_cpupri.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8fb88a906aa..00567959ab1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7828,7 +7828,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { gfp_t gfp = GFP_KERNEL; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 7deffc9f0e5..e6c251790dd 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -152,7 +152,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. 
*/ -int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) +int cpupri_init(struct cpupri *cp, bool bootmem) { gfp_t gfp = GFP_KERNEL; int i; -- cgit v1.2.3 From 348ec61e6268c3cd7ee75cfa50e408184a941506 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Wed, 17 Jun 2009 22:20:55 +0900 Subject: sched: Hide runqueues from direct refer at source code level There are some points which refer the per-cpu value "runqueues" directly. sched.c provides nice abstraction, such as cpu_rq() and this_rq(), so we should use these macros when looking runqueues. Signed-off-by: Hitoshi Mitake LKML-Reference: <20090617.222055.374768827975756908.mitake@dcl.info.waseda.ac.jp> Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 467ca72f165..70c7e0b7994 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, spread, rq0_min_vruntime, spread0; - struct rq *rq = &per_cpu(runqueues, cpu); + struct rq *rq = cpu_rq(cpu); struct sched_entity *last; unsigned long flags; @@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) if (last) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; - rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; + rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); @@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) static void print_cpu(struct seq_file *m, int cpu) { - struct rq *rq = &per_cpu(runqueues, cpu); + struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_X86 { -- cgit v1.2.3 From 3104bf03a923c72043a9c5009d9cd56724304916 Mon Sep 17 00:00:00 2001 From: Christian Engelmayer Date: Tue, 16 Jun 2009 10:35:12 +0200 Subject: sched: Fix out of scope variable access in sched_slice() Access to local variable lw is aliased by usage of pointer load. Access to pointer load in calc_delta_mine() happens when lw is already out of scope. [ Reported by static code analysis. ] Signed-off-by: Christian Engelmayer LKML-Reference: <20090616103512.0c846e51@frequentis.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5f9650e8fe7..ba7fd6e9556 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -430,12 +430,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *load; + struct load_weight lw; cfs_rq = cfs_rq_of(se); load = &cfs_rq->load; if (unlikely(!se->on_rq)) { - struct load_weight lw = cfs_rq->load; + lw = cfs_rq->load; update_load_add(&lw, se->load.weight); load = &lw; -- cgit v1.2.3 From 0dcd4d6c3e5b17ccf88d41cb354bb4d57cb18cbf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Jun 2009 14:03:44 -0400 Subject: ring-buffer: remove useless compile check for buffer_page size The original version of the ring buffer had a hack to map the page struct that held the pages of the buffer to also be the structure that the ring buffer would keep the pages in a link list. This overlap of the page struct was very dangerous and that hack was removed a while ago. 
But there was a check to make sure the buffer_page never became bigger than the page struct, and would fail the compile if it did. The check was only meaningful when we had the hack. Now that we have separate allocated descriptors for the buffer pages, we can remove this check. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6cf340e1a4a..162da2305cb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -620,12 +620,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -/* - * Causes compile errors if the struct buffer_page gets bigger - * than the struct page. - */ -extern int ring_buffer_page_too_big(void); - #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu); @@ -648,11 +642,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, int bsize; int cpu; - /* Paranoid! Optimizes out when all is well */ - if (sizeof(struct buffer_page) > sizeof(struct page)) - ring_buffer_page_too_big(); - - /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); -- cgit v1.2.3 From 5f78abeebbf0a80975d719e11374535ca15396cb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Jun 2009 14:11:10 -0400 Subject: ring-buffer: check for less than two in size allocation The ring buffer must have at least two pages allocated for the reader page swap to work. The page count check will miss the case of a zero size passed in. Even though a zero size ring buffer would probably fail an allocation, making the min size check for less than two instead of equal to one makes the code a bit more robust. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 162da2305cb..2e99dba6dc4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -657,8 +657,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, buffer->reader_lock_key = key; /* need at least two pages */ - if (buffer->pages == 1) - buffer->pages++; + if (buffer->pages < 2) + buffer->pages = 2; /* * In case of non-hotplug cpu, if the ring-buffer is allocated -- cgit v1.2.3 From d47882078f05c2cb46b85f1e12a58ed9315b9d63 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Jun 2009 00:39:43 -0400 Subject: ring-buffer: add locks around rb_per_cpu_empty The checking of whether the buffer is empty or not needs to be serialized among the readers. Add the reader spin lock around it. 
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2e99dba6dc4..969f7cbe8e9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2756,12 +2756,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset); int ring_buffer_empty(struct ring_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; int cpu; + int ret; /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - if (!rb_per_cpu_empty(cpu_buffer)) + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + ret = rb_per_cpu_empty(cpu_buffer); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (!ret) return 0; } @@ -2777,14 +2782,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; cpu_buffer = buffer->buffers[cpu]; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); ret = rb_per_cpu_empty(cpu_buffer); - + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return ret; } -- cgit v1.2.3 From 8d707e8eb4de4b930573155ab4df4b3270ee25dd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Jun 2009 21:22:48 -0400 Subject: ring-buffer: do not grab locks in nmi If ftrace_dump_on_oops is set, and an NMI detects a lockup, then it will need to read from the ring buffer. But the read side of the ring buffer still takes locks. This patch adds a check on the read side that if it is in an NMI, then it will disable the ring buffer and not take any locks. Reads can still happen on a disabled ring buffer. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 59 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 969f7cbe8e9..589b3eedfa6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2466,6 +2466,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); +static inline int rb_ok_to_lock(void) +{ + /* + * If an NMI die dumps out the content of the ring buffer + * do not grab locks. We also permanently disable the ring + * buffer too. A one time deal is all you get from reading + * the ring buffer from an NMI. 
+ */ + if (likely(!in_nmi() && !oops_in_progress)) + return 1; + + tracing_off_permanent(); + return 0; +} + /** * ring_buffer_peek - peek at the next event to be read * @buffer: The ring buffer to read @@ -2481,14 +2496,20 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; + int dolock; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; + dolock = rb_ok_to_lock(); again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); @@ -2540,6 +2561,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; unsigned long flags; + int dolock; + + dolock = rb_ok_to_lock(); again: /* might be called in atomic */ @@ -2549,7 +2573,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) goto out; cpu_buffer = buffer->buffers[cpu]; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); if (!event) @@ -2558,7 +2584,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) rb_advance_reader(cpu_buffer); out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); out: preempt_enable(); @@ -2757,15 +2785,23 @@ int ring_buffer_empty(struct ring_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; + int dolock; int cpu; int ret; + dolock = rb_ok_to_lock(); + /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); + if (!ret) return 0; } @@ -2783,15 +2819,22 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; + int dolock; int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; + dolock = rb_ok_to_lock(); + cpu_buffer = buffer->buffers[cpu]; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); return ret; } -- cgit v1.2.3 From 4b221f0313f0f7f1f7aa0a1fd16ad400840def26 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Jun 2009 17:01:09 -0400 Subject: ring-buffer: have benchmark test print to trace buffer Currently the output of the ring buffer benchmark/test prints to the console. This test runs for ten seconds every ten seconds and ouputs the result after every iteration. This needlessly fills up the logs. 
This patch makes the ring buffer benchmark/test print to the ftrace buffer using trace_printk. To view the test results, you must examine the debug/tracing/trace file. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 37 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index cf6b0f50134..573d3cc762c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -203,7 +203,7 @@ static void ring_buffer_producer(void) * Hammer the buffer for 10 secs (this may * make the system stall) */ - pr_info("Starting ring buffer hammer\n"); + trace_printk("Starting ring buffer hammer\n"); do_gettimeofday(&start_tv); do { struct ring_buffer_event *event; @@ -239,7 +239,7 @@ static void ring_buffer_producer(void) #endif } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); - pr_info("End ring buffer hammer\n"); + trace_printk("End ring buffer hammer\n"); if (consumer) { /* Init both completions here to avoid races */ @@ -262,49 +262,50 @@ static void ring_buffer_producer(void) overruns = ring_buffer_overruns(buffer); if (kill_test) - pr_info("ERROR!\n"); - pr_info("Time: %lld (usecs)\n", time); - pr_info("Overruns: %lld\n", overruns); + trace_printk("ERROR!\n"); + trace_printk("Time: %lld (usecs)\n", time); + trace_printk("Overruns: %lld\n", overruns); if (disable_reader) - pr_info("Read: (reader disabled)\n"); + trace_printk("Read: (reader disabled)\n"); else - pr_info("Read: %ld (by %s)\n", read, + trace_printk("Read: %ld (by %s)\n", read, read_events ? "events" : "pages"); - pr_info("Entries: %lld\n", entries); - pr_info("Total: %lld\n", entries + overruns + read); - pr_info("Missed: %ld\n", missed); - pr_info("Hit: %ld\n", hit); + trace_printk("Entries: %lld\n", entries); + trace_printk("Total: %lld\n", entries + overruns + read); + trace_printk("Missed: %ld\n", missed); + trace_printk("Hit: %ld\n", hit); /* Convert time from usecs to millisecs */ do_div(time, USEC_PER_MSEC); if (time) hit /= (long)time; else - pr_info("TIME IS ZERO??\n"); + trace_printk("TIME IS ZERO??\n"); - pr_info("Entries per millisec: %ld\n", hit); + trace_printk("Entries per millisec: %ld\n", hit); if (hit) { /* Calculate the average time in nanosecs */ avg = NSEC_PER_MSEC / hit; - pr_info("%ld ns per entry\n", avg); + trace_printk("%ld ns per entry\n", avg); } if (missed) { if (time) missed /= (long)time; - pr_info("Total iterations per millisec: %ld\n", hit + missed); + trace_printk("Total iterations per millisec: %ld\n", + hit + missed); /* it is possible that hit + missed will overflow and be zero */ if (!(hit + missed)) { - pr_info("hit + missed overflowed and totalled zero!\n"); + trace_printk("hit + missed overflowed and totalled zero!\n"); hit--; /* make it non zero */ } /* Caculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); - pr_info("%ld ns per entry\n", avg); + trace_printk("%ld ns per entry\n", avg); } } @@ -355,7 +356,7 @@ static int ring_buffer_producer_thread(void *arg) ring_buffer_producer(); - pr_info("Sleeping for 10 secs\n"); + trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ * SLEEP_TIME); __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From 43a21ea81a2400992561146327c4785ce7f7be38 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 19:39:37 +0100 Subject: perf_counter: Add event overlow handling 
Alternative method of mmap() data output handling that provides better overflow management and a more reliable data stream. Unlike the previous method, that didn't have any user->kernel feedback and relied on userspace keeping up, this method relies on userspace writing its last read position into the control page. It will ensure new output doesn't overwrite not-yet read events, new events for which there is no space left are lost and the overflow counter is incremented, providing exact event loss numbers. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 185 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 130 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 109a9572385..7e9108efd30 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1794,6 +1794,12 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct perf_mmap_data *data; int ret = VM_FAULT_SIGBUS; + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + rcu_read_lock(); data = rcu_dereference(counter->data); if (!data) @@ -1807,9 +1813,16 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if ((unsigned)nr > data->nr_pages) goto unlock; + if (vmf->flags & FAULT_FLAG_WRITE) + goto unlock; + vmf->page = virt_to_page(data->data_pages[nr]); } + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + ret = 0; unlock: rcu_read_unlock(); @@ -1862,6 +1875,14 @@ fail: return -ENOMEM; } +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page(addr); + + page->mapping = NULL; + __free_page(page); +} + static void __perf_mmap_data_free(struct rcu_head *rcu_head) { struct perf_mmap_data *data; @@ -1869,9 +1890,10 @@ static void __perf_mmap_data_free(struct rcu_head *rcu_head) data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - free_page((unsigned long)data->user_page); + perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) - free_page((unsigned long)data->data_pages[i]); + perf_mmap_free_page((unsigned long)data->data_pages[i]); + kfree(data); } @@ -1908,9 +1930,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) } static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) @@ -1924,7 +1947,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) long user_extra, extra; int ret = 0; - if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) + if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; vma_size = vma->vm_end - vma->vm_start; @@ -1983,10 +2006,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; counter->data->nr_locked = extra; + if (vma->vm_flags & VM_WRITE) + counter->data->writable = 1; + unlock: mutex_unlock(&counter->mmap_mutex); - vma->vm_flags &= ~VM_MAYWRITE; vma->vm_flags |= VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; @@ -2163,11 +2188,38 @@ struct perf_output_handle { unsigned long head; unsigned 
long offset; int nmi; - int overflow; + int sample; int locked; unsigned long flags; }; +static bool perf_output_space(struct perf_mmap_data *data, + unsigned int offset, unsigned int head) +{ + unsigned long tail; + unsigned long mask; + + if (!data->writable) + return true; + + mask = (data->nr_pages << PAGE_SHIFT) - 1; + /* + * Userspace could choose to issue a mb() before updating the tail + * pointer. So that all reads will be completed before the write is + * issued. + */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->data->poll, POLL_IN); @@ -2258,12 +2310,57 @@ out: local_irq_restore(handle->flags); } +static void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, - int nmi, int overflow) + int nmi, int sample) { struct perf_mmap_data *data; unsigned int offset, head; + int have_lost; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; /* * For inherited counters we send all the output towards the parent. 
@@ -2276,19 +2373,25 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; - handle->data = data; - handle->counter = counter; - handle->nmi = nmi; - handle->overflow = overflow; + handle->data = data; + handle->counter = counter; + handle->nmi = nmi; + handle->sample = sample; if (!data->nr_pages) goto fail; + have_lost = atomic_read(&data->lost); + if (have_lost) + size += sizeof(lost_event); + perf_output_lock(handle); do { offset = head = atomic_long_read(&data->head); head += size; + if (unlikely(!perf_output_space(data, offset, head))) + goto fail; } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); handle->offset = offset; @@ -2297,55 +2400,27 @@ static int perf_output_begin(struct perf_output_handle *handle, if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) atomic_set(&data->wakeup, 1); + if (have_lost) { + lost_event.header.type = PERF_EVENT_LOST; + lost_event.header.misc = 0; + lost_event.header.size = sizeof(lost_event); + lost_event.id = counter->id; + lost_event.lost = atomic_xchg(&data->lost, 0); + + perf_output_put(handle, lost_event); + } + return 0; fail: - perf_output_wakeup(handle); + atomic_inc(&data->lost); + perf_output_unlock(handle); out: rcu_read_unlock(); return -ENOSPC; } -static void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - unsigned int pages_mask; - unsigned int offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - - do { - unsigned int page_offset; - int nr; - - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); - - memcpy(pages[nr] + page_offset, buf, size); - - len -= size; - buf += size; - offset += size; - } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); -} - -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - static void perf_output_end(struct perf_output_handle *handle) { struct perf_counter *counter = handle->counter; @@ -2353,7 +2428,7 @@ static void perf_output_end(struct perf_output_handle *handle) int wakeup_events = counter->attr.wakeup_events; - if (handle->overflow && wakeup_events) { + if (handle->sample && wakeup_events) { int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &data->events); @@ -2958,7 +3033,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) } /* - * Generic counter overflow handling. + * Generic counter overflow handling, sampling. */ int perf_counter_overflow(struct perf_counter *counter, int nmi, -- cgit v1.2.3 From eb4a03780d4c4464ef2ad86d80cca3f3284fe81d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 18 Jun 2009 12:53:21 -0400 Subject: function-graph: disable when both x86_32 and optimize for size are configured On x86_32, when optimize for size is set, gcc may align the frame pointer and make a copy of the the return address inside the stack frame. The return address that is located in the stack frame may not be the one used to return to the calling function. This will break the function graph tracer. The function graph tracer replaces the return address with a jump to a hook function that can trace the exit of the function. 
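In rough terms, the entry side of that mechanism looks like the hedged sketch below. It is simplified and illustrative only: graph_entry_hook and ret_slot are made-up names, while ftrace_push_return_trace() (in its three-argument form current at this commit, before the frame-pointer test added later in this series) and the return_to_handler trampoline are the real kernel symbols the arch code uses.

#include <linux/ftrace.h>
#include <linux/errno.h>

extern void return_to_handler(void);	/* arch trampoline that invokes the exit hook */

/* Hedged sketch of the entry hook; the real code is the arch mcount path. */
static void graph_entry_hook(unsigned long *ret_slot, unsigned long func)
{
	unsigned long orig_ret = *ret_slot;	/* where the function really returns to */
	int depth;

	/* remember the original return address on the per-task ret_stack */
	if (ftrace_push_return_trace(orig_ret, func, &depth) == -EBUSY)
		return;

	/* divert the return through the tracer's exit trampoline */
	*ret_slot = (unsigned long)&return_to_handler;
}

Here ret_slot is meant to be the address of the word on the stack that holds the caller's return address.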
If it only replaces a copy, then the hook will not be called when the function returns. Worse yet, when the parent function returns, the function graph tracer will return back to the location of the child function, which will easily crash the kernel with weird results. To see the problem, when i386 is compiled with -Os we get: c106be03: 57 push %edi c106be04: 8d 7c 24 08 lea 0x8(%esp),%edi c106be08: 83 e4 e0 and $0xffffffe0,%esp c106be0b: ff 77 fc pushl 0xfffffffc(%edi) c106be0e: 55 push %ebp c106be0f: 89 e5 mov %esp,%ebp c106be11: 57 push %edi c106be12: 56 push %esi c106be13: 53 push %ebx c106be14: 81 ec 8c 00 00 00 sub $0x8c,%esp c106be1a: e8 f5 57 fb ff call c1021614 When it is compiled with -O2 instead we get: c10896f0: 55 push %ebp c10896f1: 89 e5 mov %esp,%ebp c10896f3: 83 ec 28 sub $0x28,%esp c10896f6: 89 5d f4 mov %ebx,0xfffffff4(%ebp) c10896f9: 89 75 f8 mov %esi,0xfffffff8(%ebp) c10896fc: 89 7d fc mov %edi,0xfffffffc(%ebp) c10896ff: e8 d0 08 fa ff call c1029fd4 The -Os compile aligns the stack pointer, then sets up the frame pointer (%ebp), and copies the return address back into the stack frame. The change to the return address in mcount is done to the copy and not to the real placeholder of the return address. The -O2 compile sets up the frame pointer first; this makes the change to the return address by mcount affect where the function will jump on exit. Reported-by: Jake Edge Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4a13e5a01ce..1eac85253ce 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -121,6 +121,7 @@ config FUNCTION_GRAPH_TRACER bool "Kernel Function Graph Tracer" depends on HAVE_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER + depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE default y help Enable the kernel to trace a function at both its return -- cgit v1.2.3 From 71e308a239c098673570d0b417d42262bb535909 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 18 Jun 2009 12:45:08 -0400 Subject: function-graph: add stack frame test In case gcc does something funny with the stack frames, or the return-from-function code, we would like to detect that. An arch may implement passing of a variable that is unique to the function, saved on entering the function, and tested when exiting the function. Usually the frame pointer can be used for this purpose. This patch also implements this for x86, where it passes in the stack frame of the parent function and tests that frame on exit. There was a case in x86_32 with optimize for size (-Os) where, for a few functions, gcc would align the stack frame and place a copy of the return address into it. The function graph tracer modified the copy and not the actual return address. On return from the function, it did not go to the tracer hook, but returned to the parent. This broke the function graph tracer, because the return of the parent (where gcc did not do this funky manipulation) returned to the location that the child function was supposed to. This caused strange kernel crashes. This test detected the problem and pointed out where the issue was. This modifies the parameters of one of the functions that the arch specific code calls, so it includes changes to arch code to accommodate the new prototype. Note, I notice that the parisc arch implements its own push_return_trace.
This is now a generic function and the ftrace_push_return_trace should be used instead. This patch does not touch that code. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Frederic Weisbecker Cc: Helge Deller Cc: Kyle McMartin Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 7 +++++++ kernel/trace/trace_functions_graph.c | 36 ++++++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1eac85253ce..b17ed8787de 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER config HAVE_FUNCTION_GRAPH_TRACER bool +config HAVE_FUNCTION_GRAPH_FP_TEST + bool + help + An arch may pass in a unique value (frame pointer) to both the + entering and exiting of a function. On exit, the value is compared + and if it does not match, then it will panic the kernel. + config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8b592418d8b..d2249abafb5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = { /* Add a function return address to the trace stack on thread info.*/ int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, + unsigned long frame_pointer) { unsigned long long calltime; int index; @@ -85,6 +86,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; + current->ret_stack[index].fp = frame_pointer; *depth = index; return 0; @@ -92,7 +94,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) /* Retrieve a function return address to the trace stack on thread info.*/ static void -ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, + unsigned long frame_pointer) { int index; @@ -106,6 +109,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) return; } +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST + /* + * The arch may choose to record the frame pointer used + * and check it here to make sure that it is what we expect it + * to be. If gcc does not set the place holder of the return + * address in the frame pointer, and does a copy instead, then + * the function graph trace will fail. This test detects this + * case. + * + * Currently, x86_32 with optimize for size (-Os) makes the latest + * gcc do the above. + */ + if (unlikely(current->ret_stack[index].fp != frame_pointer)) { + ftrace_graph_stop(); + WARN(1, "Bad frame pointer: expected %lx, received %lx\n" + " from func %pF return to %lx\n", + current->ret_stack[index].fp, + frame_pointer, + (void *)current->ret_stack[index].func, + current->ret_stack[index].ret); + *ret = (unsigned long)panic; + return; + } +#endif + *ret = current->ret_stack[index].ret; trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; @@ -117,12 +145,12 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) * Send the trace to the ring-buffer. * @return the original return address. 
*/ -unsigned long ftrace_return_to_handler(void) +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) { struct ftrace_graph_ret trace; unsigned long ret; - ftrace_pop_return_trace(&trace, &ret); + ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); barrier(); -- cgit v1.2.3 From e5289d4a181fb6c0b7a7607649af2ffdc491335c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Jun 2009 13:22:51 +0200 Subject: perf_counter: Simplify and fix task migration counting The task migrations counter was causing rare and hard to decypher memory corruptions under load. After a day of debugging and bisection we found that the problem was introduced with: 3f731ca: perf_counter: Fix cpu migration counter Turning them off fixes the crashes. Incidentally, the whole perf_counter_task_migration() logic can be done simpler as well, by injecting a proper sw-counter event. This cleanup also fixed the crashes. The precise failure mode is not completely clear yet, but we are clearly not unhappy about having a fix ;-) Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 23 +---------------------- kernel/sched.c | 3 ++- 2 files changed, 3 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7e9108efd30..8d4f0dd41c2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -124,7 +124,7 @@ void perf_enable(void) static void get_ctx(struct perf_counter_context *ctx) { - atomic_inc(&ctx->refcount); + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } static void free_ctx(struct rcu_head *head) @@ -3467,27 +3467,6 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: cpu migrations - */ -void perf_counter_task_migration(struct task_struct *task, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx; - - perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); - - ctx = perf_pin_task_context(task); - if (ctx) { - perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); - perf_unpin_context(ctx); - } -} - #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { diff --git a/kernel/sched.c b/kernel/sched.c index 8fb88a906aa..f46540b359c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1978,7 +1978,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_counter_task_migration(p, new_cpu); + perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; -- cgit v1.2.3 From b49a9e7e72103ea91946453c19703a4dfa1994fe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Jun 2009 17:39:33 +0200 Subject: perf_counter: Close race in perf_lock_task_context() perf_lock_task_context() is buggy because it can return a dead context. the RCU read lock in perf_lock_task_context() only guarantees the memory won't get freed, it doesn't guarantee the object is valid (in our case refcount > 0). Therefore we can return a locked object that can get freed the moment we release the rcu read lock. 
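For contrast, the generally safe shape of such a lookup is sketched below. This is a hedged, generic illustration rather than the actual perf_counter code: struct obj, obj_get_rcu and slot are invented names, while rcu_read_lock(), rcu_dereference() and atomic_inc_not_zero() are the real primitives involved.

#include <linux/rcupdate.h>
#include <asm/atomic.h>

struct obj {
	atomic_t refcount;
	/* payload ... */
};

/* Return a referenced object or NULL; the caller must drop the reference. */
static struct obj *obj_get_rcu(struct obj **slot)
{
	struct obj *o;

	rcu_read_lock();
	o = rcu_dereference(*slot);
	/*
	 * RCU only keeps the memory from being reused; the object may
	 * already be dead (refcount == 0), so only accept it if a
	 * reference can be taken before the count reaches zero.
	 */
	if (o && !atomic_inc_not_zero(&o->refcount))
		o = NULL;
	rcu_read_unlock();

	return o;
}

The fix below applies this same idea inside perf_lock_task_context().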
perf_pin_task_context() then increases the refcount and does an unlock on freed memory. That increased refcount will cause a double free, in case it started out with 0. Ammend this by including the get_ctx() functionality in perf_lock_task_context() (all users already did this later anyway), and return a NULL context when the found one is already dead. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8d4f0dd41c2..adb6ae506d5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -175,6 +175,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } } rcu_read_unlock(); return ctx; @@ -193,7 +198,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; - get_ctx(ctx); spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; @@ -1459,11 +1463,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) put_ctx(parent_ctx); ctx->parent_ctx = NULL; /* no longer a clone */ } - /* - * Get an extra reference before dropping the lock so that - * this context won't get freed if the task exits. - */ - get_ctx(ctx); spin_unlock_irqrestore(&ctx->lock, flags); } -- cgit v1.2.3 From befca96779b0259ac8fad0183e748a62935c39cb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 18 Jun 2009 16:49:11 -0700 Subject: ptrace: wait_task_zombie: do not account traced sub-threads The bug is ancient. If we trace the sub-thread of our natural child and this sub-thread exits, we update parent->signal->cxxx fields. But we should not do this until the whole thread-group exits, otherwise we account this thread (and all other live threads) twice. Add the task_detached() check. No need to check thread_group_empty(), wait_consider_task()->delay_group_leader() already did this. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Acked-by: Roland McGrath Cc: Stanislaw Gruszka Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 13ae64001fe..628d41f0dd5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1197,8 +1197,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) } traced = ptrace_reparented(p); - - if (likely(!traced)) { + /* + * It can be ptraced but not reparented, check + * !task_detached() to filter out sub-threads. + */ + if (likely(!traced) && likely(!task_detached(p))) { struct signal_struct *psig; struct signal_struct *sig; -- cgit v1.2.3 From c85a17e22695969aa24a7ffa40cf26d6e6fcfd50 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Jun 2009 05:45:14 +0200 Subject: tracing/urgent: fix unbalanced ftrace_start_up Perfcounter reports the following stats for a wide system profiling: # # (2364 samples) # # Overhead Symbol # ........ ...... # 15.40% [k] mwait_idle_with_hints 8.29% [k] read_hpet 5.75% [k] ftrace_caller 3.60% [k] ftrace_call [...] 
This snapshot has been taken while neither the function tracer nor the function graph tracer was running. With dynamic ftrace, such results show a wrong ftrace behaviour because all calls to ftrace_caller or ftrace_graph_caller (the patched calls to mcount) are supposed to be patched into nops if none of those tracers are running. The problem occurs after the first run of the function tracer. Once we launch it a second time, the callsites will never be nopped back, unless you set custom filters. For example it happens during the self tests at boot time. The function tracer selftest runs, and then the dynamic tracing is tested too. After that, the callsites are left un-nopped. This is because the reset callback of the function tracer tries to unregister two ftrace callbacks at once: the common function tracer and the function tracer with stack backtrace, regardless of which one is currently in use. It then creates an imbalance in the ftrace_start_up value, which is expected to be zero when the last ftrace callback is unregistered. When it reaches zero, FTRACE_DISABLE_CALLS is set on the next ftrace command, triggering the patching into nops. But since it becomes unbalanced, i.e. becomes lower than zero, if the kernel functions are patched again (as on every further function tracer run), they won't ever be nopped back. Note that ftrace_call and ftrace_graph_call are still patched back to ftrace_stub in the off case, but not the callers of ftrace_call and ftrace_graph_caller. This means the tracing is well deactivated, but we waste a useless call into every kernel function. This patch just unregisters the right ftrace_ops for the function tracer on its reset callback and ignores the other one, which is not registered, fixing the imbalance. The problem also exists in .30. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: stable@kernel.org --- kernel/trace/trace_functions.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c9a0b7df44f..90f13476483 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -193,9 +193,11 @@ static void tracing_start_function_trace(void) static void tracing_stop_function_trace(void) { ftrace_function_enabled = 0; - /* OK if they are not registered */ - unregister_ftrace_function(&trace_stack_ops); - unregister_ftrace_function(&trace_ops); + + if (func_flags.val & TRACE_FUNC_OPT_STACK) + unregister_ftrace_function(&trace_stack_ops); + else + unregister_ftrace_function(&trace_ops); } static int func_set_flag(u32 old_flags, u32 bit, int set) -- cgit v1.2.3 From 9ea1a153a4fb435c22e9988784bb476671286112 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Jun 2009 06:52:21 +0200 Subject: tracing/urgent: warn in case of ftrace_start_up imbalance Prevent further ftrace_start_up imbalances so that we avoid future nop patching omissions with dynamic ftrace. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/ftrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bb60732ade0..3718d55fb4c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1224,6 +1224,13 @@ static void ftrace_shutdown(int command) return; ftrace_start_up--; + /* + * Just warn in case of unbalance, no need to kill ftrace, it's not + * critical but the ftrace_call callers may be never nopped again after + * further ftrace uses.
+ */ + WARN_ON_ONCE(ftrace_start_up < 0); + if (!ftrace_start_up) command |= FTRACE_DISABLE_CALLS; -- cgit v1.2.3 From 92bf309a9cd5fedd6c8eefbce0b9a95ada82d0a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Jun 2009 18:11:53 +0200 Subject: perf_counter: Push perf_sample_data through the swcounter code Push the perf_sample_data further outwards to the swcounter interface, to abstract it away some more. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 55 +++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index adb6ae506d5..1a933a221ea 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3171,20 +3171,15 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) } static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) + int nmi, struct perf_sample_data *data) { - struct perf_sample_data data = { - .regs = regs, - .addr = addr, - .period = counter->hw.last_period, - }; + data->period = counter->hw.last_period; perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, &data)) + if (perf_counter_overflow(counter, nmi, data)) /* soft-disable the counter */ ; - } static int perf_swcounter_is_counting(struct perf_counter *counter) @@ -3249,18 +3244,18 @@ static int perf_swcounter_match(struct perf_counter *counter, } static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct pt_regs *regs, u64 addr) + int nmi, struct perf_sample_data *data) { int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.sample_period && !neg && regs) - perf_swcounter_overflow(counter, nmi, regs, addr); + if (counter->hw.sample_period && !neg && data->regs) + perf_swcounter_overflow(counter, nmi, data); } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_type_id type, u32 event, - u64 nr, int nmi, struct pt_regs *regs, - u64 addr) + enum perf_type_id type, + u32 event, u64 nr, int nmi, + struct perf_sample_data *data) { struct perf_counter *counter; @@ -3269,8 +3264,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, regs, addr); + if (perf_swcounter_match(counter, type, event, data->regs)) + perf_swcounter_add(counter, nr, nmi, data); } rcu_read_unlock(); } @@ -3289,9 +3284,9 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) return &cpuctx->recursion[0]; } -static void __perf_swcounter_event(enum perf_type_id type, u32 event, - u64 nr, int nmi, struct pt_regs *regs, - u64 addr) +static void do_perf_swcounter_event(enum perf_type_id type, u32 event, + u64 nr, int nmi, + struct perf_sample_data *data) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); @@ -3304,7 +3299,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event, barrier(); perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, regs, addr); + nr, nmi, data); rcu_read_lock(); /* * doesn't really matter which of the child contexts the @@ -3312,7 +3307,7 @@ static void __perf_swcounter_event(enum perf_type_id type, u32 event, 
*/ ctx = rcu_dereference(current->perf_counter_ctxp); if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); + perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data); rcu_read_unlock(); barrier(); @@ -3325,7 +3320,12 @@ out: void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); + struct perf_sample_data data = { + .regs = regs, + .addr = addr, + }; + + do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data); } static void perf_swcounter_read(struct perf_counter *counter) @@ -3469,12 +3469,15 @@ static const struct pmu perf_ops_task_clock = { #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { - struct pt_regs *regs = get_irq_regs(); + struct perf_sample_data data = { + .regs = get_irq_regs(); + .addr = 0, + }; - if (!regs) - regs = task_pt_regs(current); + if (!data.regs) + data.regs = task_pt_regs(current); - __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); -- cgit v1.2.3 From 31950eb66ff47c946fd9c65c2f8c94b6b7ba13fc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 22 Jun 2009 21:18:12 -0700 Subject: mm/init: cpu_hotplug_init() must be initialized before SLAB SLAB uses get/put_online_cpus() which use a mutex which is itself only initialized when cpu_hotplug_init() is called. Currently we hang suring boot in SLAB due to doing that too late. Reported by James Bottomley and Sachin Sant (and possibly others). Debugged by Benjamin Herrenschmidt. This just removes the dynamic initialization of the data structures, and replaces it with a static one, avoiding this dependency entirely, and removing one unnecessary special initcall. Tested-by: Sachin Sant Tested-by: James Bottomley Tested-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- kernel/cpu.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 395b6974dc8..8ce10043e4a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -34,14 +34,11 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; -} cpu_hotplug; - -void __init cpu_hotplug_init(void) -{ - cpu_hotplug.active_writer = NULL; - mutex_init(&cpu_hotplug.lock); - cpu_hotplug.refcount = 0; -} +} cpu_hotplug = { + .active_writer = NULL, + .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), + .refcount = 0, +}; #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From bfdb4d9f0f611687d71cf6a460efc9e755f4a462 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Tue, 23 Jun 2009 10:00:58 +0530 Subject: timers: Fix timer_migration interface which accepts any number as input Poornima Nayek reported: | Timer migration interface /proc/sys/kernel/timer_migration in | 2.6.30-git9 accepts any numerical value as input. | | Steps to reproduce: | 1. echo -6666666 > /proc/sys/kernel/timer_migration | 2. cat /proc/sys/kernel/timer_migration | -6666666 | | 1. echo 44444444444444444444444444444444444444444444444444444444444 > /proc/sys/kernel/timer_migration | 2. cat /proc/sys/kernel/timer_migration | -1357789412 | | Expected behavior: Should 'echo: write error: Invalid argument' while | setting any value other then 0 & 1 Restrict valid values to 0 and 1. 
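The mechanism used for that is sketched below as a hedged, illustrative table entry mirroring the change in the diff that follows; example_kern_table and example_timer_migration are made-up names standing in for the real kernel table and the sysctl_timer_migration knob, while proc_dointvec_minmax, sysctl_intvec, extra1 and extra2 are the real sysctl interfaces.

#include <linux/sysctl.h>

static int zero;
static int one = 1;

static unsigned int example_timer_migration = 1;	/* stands in for sysctl_timer_migration */

static struct ctl_table example_kern_table[] = {
	{
		.procname	= "timer_migration",
		.data		= &example_timer_migration,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		/* rejects writes outside [*extra1, *extra2] with -EINVAL */
		.proc_handler	= &proc_dointvec_minmax,
		.strategy	= &sysctl_intvec,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{ }
};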
Reported-by: Poornima Nayak Tested-by: Poornima Nayak Signed-off-by: Arun R Bharadwaj Cc: poornima nayak Cc: Arun Bharadwaj LKML-Reference: <20090623043058.GA3249@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 62e4ff9968b..c428ba161db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = { .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, }, #endif { -- cgit v1.2.3 From 35aa901c0b66cb3c2eeee23f13624014825a44a8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:33 -0400 Subject: Audit: fix audit watch use after free When an audit watch is added to a parent the temporary watch inside the original krule from userspace is freed. Yet the original watch is used after the real watch was created in audit_add_rules() Signed-off-by: Eric Paris --- kernel/auditfilter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 713098ee5a0..19c0a0a2ced 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1320,6 +1320,8 @@ static inline int audit_add_rule(struct audit_entry *entry) mutex_unlock(&audit_filter_mutex); goto error; } + /* entry->rule.watch may have changed during audit_add_watch() */ + watch = entry->rule.watch; h = audit_hash_ino((u32)watch->ino); list = &audit_inode_hash[h]; } -- cgit v1.2.3 From b87ce6e4187c24b06483c8266822ce5e6b7fa7f3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:34 -0400 Subject: Audit: better estimation of execve record length The audit execve record splitting code estimates the length of the message generated. But it forgot to include the "" that wrap each string in its estimation. This means that execve messages with lots of tiny (1-2 byte) arguments could still cause records greater than 8k to be emitted. Simply fix the estimate. Signed-off-by: Eric Paris --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7d6ac7c1f41..b14d234b85f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1024,8 +1024,8 @@ static int audit_log_single_execve_arg(struct audit_context *context, { char arg_num_len_buf[12]; const char __user *tmp_p = p; - /* how many digits are in arg_num? 3 is the length of " a=" */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; + /* how many digits are in arg_num? 
5 is the length of ' a=""' */ + size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; size_t len, len_left, to_send; size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; unsigned int i, has_cntl = 0, too_long = 0; -- cgit v1.2.3 From e85188f424c8eec7f311deed9a70bec57aeed741 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:34 -0400 Subject: Audit: dereferencing krule as if it were an audit_watch audit_update_watch() runs all of the rules for a given watch and duplicates them, attaches a new watch to them, and then when it finishes that process and has called free on all of the old rules (ok maybe still inside the rcu grace period) it proceeds to use the last element from list_for_each_entry_safe() as if it were a krule rather than being the audit_watch which was anchoring the list to output a message about audit rules changing. This patch unfies the audit message from two different places into a helper function and calls it from the correct location in audit_update_rules(). We will now get an audit message about the config changing for each rule (with each rules filterkey) rather than the previous garbage. Signed-off-by: Eric Paris --- kernel/auditfilter.c | 58 ++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 19c0a0a2ced..e7466dd145c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -977,6 +977,27 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, return entry; } +static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) +{ + if (audit_enabled) { + struct audit_buffer *ab; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "auid=%u ses=%u op=", + audit_get_loginuid(current), + audit_get_sessionid(current)); + audit_log_string(ab, op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, w->path); + if (r->filterkey) { + audit_log_format(ab, " key="); + audit_log_untrustedstring(ab, r->filterkey); + } else + audit_log_format(ab, " key=(null)"); + audit_log_format(ab, " list=%d res=1", r->listnr); + audit_log_end(ab); + } +} + /* Update inode info in audit rules based on filesystem event. 
*/ static void audit_update_watch(struct audit_parent *parent, const char *dname, dev_t dev, @@ -1023,24 +1044,11 @@ static void audit_update_watch(struct audit_parent *parent, &nentry->rule.list); } + audit_watch_log_rule_change(r, owatch, "updated rules"); + call_rcu(&oentry->rcu, audit_free_rule_rcu); } - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, - AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_format(ab, - " op=updated rules specifying path="); - audit_log_untrustedstring(ab, owatch->path); - audit_log_format(ab, " with dev=%u ino=%lu\n", - dev, ino); - audit_log_format(ab, " list=%d res=1", r->listnr); - audit_log_end(ab); - } audit_remove_watch(owatch); goto add_watch_to_parent; /* event applies to a single watch */ } @@ -1065,25 +1073,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, - AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_format(ab, " op=remove rule path="); - audit_log_untrustedstring(ab, w->path); - if (r->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, - r->filterkey); - } else - audit_log_format(ab, " key=(null)"); - audit_log_format(ab, " list=%d res=1", - r->listnr); - audit_log_end(ab); - } + audit_watch_log_rule_change(r, w, "remove rule"); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); -- cgit v1.2.3 From 038cbcf65fd6a30c79e3917690b8c46321a27915 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:35 -0400 Subject: Audit: unify the printk of an skb when auditd not around Remove code duplication of skb printk when auditd is not around in userspace to deal with this message. Signed-off-by: Eric Paris --- kernel/audit.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 9442c3533ba..f7ab4a479cd 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -375,6 +375,25 @@ static void audit_hold_skb(struct sk_buff *skb) kfree_skb(skb); } +/* + * For one reason or another this nlh isn't getting delivered to the userspace + * audit daemon, just send it to printk. 
+ */ +static void audit_printk_skb(struct sk_buff *skb) +{ + struct nlmsghdr *nlh = nlmsg_hdr(skb); + char *data = NLMSG_DATA(nlh); + + if (nlh->nlmsg_type != AUDIT_EOE) { + if (printk_ratelimit()) + printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data); + else + audit_log_lost("printk limit exceeded\n"); + } + + audit_hold_skb(skb); +} + static void kauditd_send_skb(struct sk_buff *skb) { int err; @@ -427,14 +446,8 @@ static int kauditd_thread(void *dummy) if (skb) { if (audit_pid) kauditd_send_skb(skb); - else { - if (printk_ratelimit()) - printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); - else - audit_log_lost("printk limit exceeded\n"); - - audit_hold_skb(skb); - } + else + audit_printk_skb(skb); } else { DECLARE_WAITQUEUE(wait, current); set_current_state(TASK_INTERRUPTIBLE); @@ -1475,15 +1488,7 @@ void audit_log_end(struct audit_buffer *ab) skb_queue_tail(&audit_skb_queue, ab->skb); wake_up_interruptible(&kauditd_wait); } else { - if (nlh->nlmsg_type != AUDIT_EOE) { - if (printk_ratelimit()) { - printk(KERN_NOTICE "type=%d %s\n", - nlh->nlmsg_type, - ab->skb->data + NLMSG_SPACE(0)); - } else - audit_log_lost("printk limit exceeded\n"); - } - audit_hold_skb(ab->skb); + audit_printk_skb(ab->skb); } ab->skb = NULL; } -- cgit v1.2.3 From ee080e6ce93d5993390bccf68c1df5efd9351276 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:35 -0400 Subject: Audit: cleanup netlink mesg handling The audit handling of netlink messages is all over the place. Clean things up, use predetermined macros, generally make it more readable. Signed-off-by: Eric Paris --- kernel/audit.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f7ab4a479cd..01082a1d2bc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -528,22 +528,20 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, { struct sk_buff *skb; struct nlmsghdr *nlh; - int len = NLMSG_SPACE(size); void *data; int flags = multi ? NLM_F_MULTI : 0; int t = done ? NLMSG_DONE : type; - skb = alloc_skb(len, GFP_KERNEL); + skb = nlmsg_new(size, GFP_KERNEL); if (!skb) return NULL; - nlh = NLMSG_PUT(skb, pid, seq, t, size); - nlh->nlmsg_flags = flags; - data = NLMSG_DATA(nlh); + nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); + data = NLMSG_DATA(nlh); memcpy(data, payload, size); return skb; -nlmsg_failure: /* Used by NLMSG_PUT */ +nlmsg_failure: /* Used by NLMSG_NEW */ if (skb) kfree_skb(skb); return NULL; @@ -1083,18 +1081,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, goto err; } - ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); - if (!ab->skb) - goto err; - ab->ctx = ctx; ab->gfp_mask = gfp_mask; - nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); - nlh->nlmsg_type = type; - nlh->nlmsg_flags = 0; - nlh->nlmsg_pid = 0; - nlh->nlmsg_seq = 0; + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); + if (!ab->skb) + goto nlmsg_failure; + + nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); + return ab; + +nlmsg_failure: /* Used by NLMSG_NEW */ + kfree_skb(ab->skb); + ab->skb = NULL; err: audit_buffer_free(ab); return NULL; -- cgit v1.2.3 From ea7ae60bfe39aeedfb29571c47280bf0067ee5f3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:35 -0400 Subject: Audit: clean up audit_receive_skb audit_receive_skb is hard to clearly parse what it is doing to the netlink message. Clean the function up so it is easy and clear to see what is going on. 
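The resulting shape is the standard netlink receive walk, pulled out of the diff below as a standalone, hedged sketch; example_receive_skb and example_receive_msg are illustrative names, the NLMSG_OK/NLMSG_NEXT macros and netlink_ack() are the real interfaces.

#include <linux/skbuff.h>
#include <linux/netlink.h>

/* per-message handler, illustrative only */
static int example_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh);

static void example_receive_skb(struct sk_buff *skb)
{
	struct nlmsghdr *nlh = nlmsg_hdr(skb);
	/* len must be signed: NLMSG_NEXT can take it below 0 on unaligned lengths */
	int len = skb->len;
	int err;

	while (NLMSG_OK(nlh, len)) {
		err = example_receive_msg(skb, nlh);
		/* ack on error, or when the sender explicitly asked for one */
		if (err || (nlh->nlmsg_flags & NLM_F_ACK))
			netlink_ack(skb, nlh, err);

		nlh = NLMSG_NEXT(nlh, len);
	}
}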
Signed-off-by: Eric Paris --- kernel/audit.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 01082a1d2bc..ce77e81a0e7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -937,28 +937,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* - * Get message from skb (based on rtnetlink_rcv_skb). Each message is - * processed by audit_receive_msg. Malformed skbs with wrong length are - * discarded silently. + * Get message from skb. Each message is processed by audit_receive_msg. + * Malformed skbs with wrong length are discarded silently. */ static void audit_receive_skb(struct sk_buff *skb) { - int err; - struct nlmsghdr *nlh; - u32 rlen; + struct nlmsghdr *nlh; + /* + * len MUST be signed for NLMSG_NEXT to be able to dec it below 0 + * if the nlmsg_len was not aligned + */ + int len; + int err; - while (skb->len >= NLMSG_SPACE(0)) { - nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if ((err = audit_receive_msg(skb, nlh))) { + nlh = nlmsg_hdr(skb); + len = skb->len; + + while (NLMSG_OK(nlh, len)) { + err = audit_receive_msg(skb, nlh); + /* if err or if this message says it wants a response */ + if (err || (nlh->nlmsg_flags & NLM_F_ACK)) netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); + + nlh = NLMSG_NEXT(nlh, len); } } -- cgit v1.2.3 From cfcad62c74abfef83762dc05a556d21bdf3980a2 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:36 -0400 Subject: audit: seperate audit inode watches into a subfile In preparation for converting audit to use fsnotify instead of inotify we seperate the inode watching code into it's own file. This is similar to how the audit tree watching code is already seperated into audit_tree.c Signed-off-by: Eric Paris --- kernel/Makefile | 2 +- kernel/audit.c | 16 -- kernel/audit.h | 39 ++-- kernel/audit_watch.c | 534 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/auditfilter.c | 481 ++-------------------------------------------- kernel/auditsc.c | 6 +- 6 files changed, 572 insertions(+), 506 deletions(-) create mode 100644 kernel/audit_watch.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0a32cb21ec9..da750010a6f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,7 +70,7 @@ obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o -obj-$(CONFIG_AUDITSYSCALL) += auditsc.o +obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/kernel/audit.c b/kernel/audit.c index ce77e81a0e7..e07ad2340db 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; -/* Inotify handle. 
*/ -struct inotify_handle *audit_ih; - /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -971,13 +968,6 @@ static void audit_receive(struct sk_buff *skb) mutex_unlock(&audit_cmd_mutex); } -#ifdef CONFIG_AUDITSYSCALL -static const struct inotify_operations audit_inotify_ops = { - .handle_event = audit_handle_ievent, - .destroy_watch = audit_free_parent, -}; -#endif - /* Initialize audit support at boot time. */ static int __init audit_init(void) { @@ -1003,12 +993,6 @@ static int __init audit_init(void) audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); -#ifdef CONFIG_AUDITSYSCALL - audit_ih = inotify_init(&audit_inotify_ops); - if (IS_ERR(audit_ih)) - audit_panic("cannot initialize inotify handle"); -#endif - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); diff --git a/kernel/audit.h b/kernel/audit.h index 16f18cac661..704d5b01d9f 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -53,18 +53,7 @@ enum audit_state { }; /* Rule lists */ -struct audit_parent; - -struct audit_watch { - atomic_t count; /* reference count */ - char *path; /* insertion path */ - dev_t dev; /* associated superblock device */ - unsigned long ino; /* associated inode number */ - struct audit_parent *parent; /* associated parent */ - struct list_head wlist; /* entry in parent->watches list */ - struct list_head rules; /* associated rules */ -}; - +struct audit_watch; struct audit_tree; struct audit_chunk; @@ -108,19 +97,31 @@ struct audit_netlink_list { int audit_send_list(void *); -struct inotify_watch; -/* Inotify handle */ -extern struct inotify_handle *audit_ih; - -extern void audit_free_parent(struct inotify_watch *); -extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, - const char *, struct inode *); extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; +/* audit watch functions */ +extern unsigned long audit_watch_inode(struct audit_watch *watch); +extern dev_t audit_watch_dev(struct audit_watch *watch); +extern void audit_put_watch(struct audit_watch *watch); +extern void audit_get_watch(struct audit_watch *watch); +extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); +extern int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw); +extern void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw); +extern int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, + struct nameidata *ndw); +extern void audit_remove_watch(struct audit_watch *watch); +extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list); +extern void audit_inotify_unregister(struct list_head *in_list); +extern char *audit_watch_path(struct audit_watch *watch); +extern struct list_head *audit_watch_rules(struct audit_watch *watch); + +extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch *watch); + #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); extern void audit_put_chunk(struct audit_chunk *); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c new file mode 100644 index 00000000000..da8be6d39c1 --- /dev/null +++ b/kernel/audit_watch.c @@ -0,0 +1,534 @@ +/* audit_watch.c -- watching inodes + * + * Copyright 2003-2009 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. 
+ * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "audit.h" + +/* + * Reference counting: + * + * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * event. Each audit_watch holds a reference to its associated parent. + * + * audit_watch: if added to lists, lifetime is from audit_init_watch() to + * audit_remove_watch(). Additionally, an audit_watch may exist + * temporarily to assist in searching existing filter data. Each + * audit_krule holds a reference to its associated watch. + */ + +struct audit_watch { + atomic_t count; /* reference count */ + char *path; /* insertion path */ + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + struct audit_parent *parent; /* associated parent */ + struct list_head wlist; /* entry in parent->watches list */ + struct list_head rules; /* associated rules */ +}; + +struct audit_parent { + struct list_head ilist; /* entry in inotify registration list */ + struct list_head watches; /* associated watches */ + struct inotify_watch wdata; /* inotify watch data */ + unsigned flags; /* status flags */ +}; + +/* Inotify handle. */ +struct inotify_handle *audit_ih; + +/* + * audit_parent status flags: + * + * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to + * a filesystem event to ensure we're adding audit watches to a valid parent. + * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot + * receive them while we have nameidata, but must be used for IN_MOVE_SELF which + * we can receive while holding nameidata. + */ +#define AUDIT_PARENT_INVALID 0x001 + +/* Inotify events we care about. 
*/ +#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF + +static void audit_free_parent(struct inotify_watch *i_watch) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} + +void audit_get_watch(struct audit_watch *watch) +{ + atomic_inc(&watch->count); +} + +void audit_put_watch(struct audit_watch *watch) +{ + if (atomic_dec_and_test(&watch->count)) { + WARN_ON(watch->parent); + WARN_ON(!list_empty(&watch->rules)); + kfree(watch->path); + kfree(watch); + } +} + +void audit_remove_watch(struct audit_watch *watch) +{ + list_del(&watch->wlist); + put_inotify_watch(&watch->parent->wdata); + watch->parent = NULL; + audit_put_watch(watch); /* match initial get */ +} + +char *audit_watch_path(struct audit_watch *watch) +{ + return watch->path; +} + +struct list_head *audit_watch_rules(struct audit_watch *watch) +{ + return &watch->rules; +} + +unsigned long audit_watch_inode(struct audit_watch *watch) +{ + return watch->ino; +} + +dev_t audit_watch_dev(struct audit_watch *watch) +{ + return watch->dev; +} + +/* Initialize a parent watch entry. */ +static struct audit_parent *audit_init_parent(struct nameidata *ndp) +{ + struct audit_parent *parent; + s32 wd; + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (unlikely(!parent)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&parent->watches); + parent->flags = 0; + + inotify_init_watch(&parent->wdata); + /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ + get_inotify_watch(&parent->wdata); + wd = inotify_add_watch(audit_ih, &parent->wdata, + ndp->path.dentry->d_inode, AUDIT_IN_WATCH); + if (wd < 0) { + audit_free_parent(&parent->wdata); + return ERR_PTR(wd); + } + + return parent; +} + +/* Initialize a watch entry. */ +static struct audit_watch *audit_init_watch(char *path) +{ + struct audit_watch *watch; + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (unlikely(!watch)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&watch->rules); + atomic_set(&watch->count, 1); + watch->path = path; + watch->dev = (dev_t)-1; + watch->ino = (unsigned long)-1; + + return watch; +} + +/* Translate a watch string to kernel respresentation. */ +int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) +{ + struct audit_watch *watch; + + if (!audit_ih) + return -EOPNOTSUPP; + + if (path[0] != '/' || path[len-1] == '/' || + krule->listnr != AUDIT_FILTER_EXIT || + op != Audit_equal || + krule->inode_f || krule->watch || krule->tree) + return -EINVAL; + + watch = audit_init_watch(path); + if (IS_ERR(watch)) + return PTR_ERR(watch); + + audit_get_watch(watch); + krule->watch = watch; + + return 0; +} + +/* Duplicate the given audit watch. The new watch's rules list is initialized + * to an empty list and wlist is undefined. 
*/ +static struct audit_watch *audit_dupe_watch(struct audit_watch *old) +{ + char *path; + struct audit_watch *new; + + path = kstrdup(old->path, GFP_KERNEL); + if (unlikely(!path)) + return ERR_PTR(-ENOMEM); + + new = audit_init_watch(path); + if (IS_ERR(new)) { + kfree(path); + goto out; + } + + new->dev = old->dev; + new->ino = old->ino; + get_inotify_watch(&old->parent->wdata); + new->parent = old->parent; + +out: + return new; +} + +static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) +{ + if (audit_enabled) { + struct audit_buffer *ab; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "auid=%u ses=%u op=", + audit_get_loginuid(current), + audit_get_sessionid(current)); + audit_log_string(ab, op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, w->path); + if (r->filterkey) { + audit_log_format(ab, " key="); + audit_log_untrustedstring(ab, r->filterkey); + } else + audit_log_format(ab, " key=(null)"); + audit_log_format(ab, " list=%d res=1", r->listnr); + audit_log_end(ab); + } +} + +/* Update inode info in audit rules based on filesystem event. */ +static void audit_update_watch(struct audit_parent *parent, + const char *dname, dev_t dev, + unsigned long ino, unsigned invalidating) +{ + struct audit_watch *owatch, *nwatch, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *oentry, *nentry; + + mutex_lock(&audit_filter_mutex); + list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { + if (audit_compare_dname_path(dname, owatch->path, NULL)) + continue; + + /* If the update involves invalidating rules, do the inode-based + * filtering now, so we don't omit records. */ + if (invalidating && current->audit_context) + audit_filter_inodes(current, current->audit_context); + + nwatch = audit_dupe_watch(owatch); + if (IS_ERR(nwatch)) { + mutex_unlock(&audit_filter_mutex); + audit_panic("error updating watch, skipping"); + return; + } + nwatch->dev = dev; + nwatch->ino = ino; + + list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { + + oentry = container_of(r, struct audit_entry, rule); + list_del(&oentry->rule.rlist); + list_del_rcu(&oentry->list); + + nentry = audit_dupe_rule(&oentry->rule, nwatch); + if (IS_ERR(nentry)) { + list_del(&oentry->rule.list); + audit_panic("error updating watch, removing"); + } else { + int h = audit_hash_ino((u32)ino); + list_add(&nentry->rule.rlist, &nwatch->rules); + list_add_rcu(&nentry->list, &audit_inode_hash[h]); + list_replace(&oentry->rule.list, + &nentry->rule.list); + } + + audit_watch_log_rule_change(r, owatch, "updated rules"); + + call_rcu(&oentry->rcu, audit_free_rule_rcu); + } + + audit_remove_watch(owatch); + goto add_watch_to_parent; /* event applies to a single watch */ + } + mutex_unlock(&audit_filter_mutex); + return; + +add_watch_to_parent: + list_add(&nwatch->wlist, &parent->watches); + mutex_unlock(&audit_filter_mutex); + return; +} + +/* Remove all watches & rules associated with a parent that is going away. 
*/ +static void audit_remove_parent_watches(struct audit_parent *parent) +{ + struct audit_watch *w, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *e; + + mutex_lock(&audit_filter_mutex); + parent->flags |= AUDIT_PARENT_INVALID; + list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { + list_for_each_entry_safe(r, nextr, &w->rules, rlist) { + e = container_of(r, struct audit_entry, rule); + audit_watch_log_rule_change(r, w, "remove rule"); + list_del(&r->rlist); + list_del(&r->list); + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + } + audit_remove_watch(w); + } + mutex_unlock(&audit_filter_mutex); +} + +/* Unregister inotify watches for parents on in_list. + * Generates an IN_IGNORED event. */ +void audit_inotify_unregister(struct list_head *in_list) +{ + struct audit_parent *p, *n; + + list_for_each_entry_safe(p, n, in_list, ilist) { + list_del(&p->ilist); + inotify_rm_watch(audit_ih, &p->wdata); + /* the unpin matching the pin in audit_do_del_rule() */ + unpin_inotify_watch(&p->wdata); + } +} + +/* Get path information necessary for adding watches. */ +int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +{ + struct nameidata *ndparent, *ndwatch; + int err; + + ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); + if (unlikely(!ndparent)) + return -ENOMEM; + + ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); + if (unlikely(!ndwatch)) { + kfree(ndparent); + return -ENOMEM; + } + + err = path_lookup(path, LOOKUP_PARENT, ndparent); + if (err) { + kfree(ndparent); + kfree(ndwatch); + return err; + } + + err = path_lookup(path, 0, ndwatch); + if (err) { + kfree(ndwatch); + ndwatch = NULL; + } + + *ndp = ndparent; + *ndw = ndwatch; + + return 0; +} + +/* Release resources used for watch path information. */ +void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) +{ + if (ndp) { + path_put(&ndp->path); + kfree(ndp); + } + if (ndw) { + path_put(&ndw->path); + kfree(ndw); + } +} + +/* Associate the given rule with an existing parent inotify_watch. + * Caller must hold audit_filter_mutex. */ +static void audit_add_to_parent(struct audit_krule *krule, + struct audit_parent *parent) +{ + struct audit_watch *w, *watch = krule->watch; + int watch_found = 0; + + list_for_each_entry(w, &parent->watches, wlist) { + if (strcmp(watch->path, w->path)) + continue; + + watch_found = 1; + + /* put krule's and initial refs to temporary watch */ + audit_put_watch(watch); + audit_put_watch(watch); + + audit_get_watch(w); + krule->watch = watch = w; + break; + } + + if (!watch_found) { + get_inotify_watch(&parent->wdata); + watch->parent = parent; + + list_add(&watch->wlist, &parent->watches); + } + list_add(&krule->rlist, &watch->rules); +} + +/* Find a matching watch entry, or add this one. + * Caller must hold audit_filter_mutex. */ +int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, + struct nameidata *ndw) +{ + struct audit_watch *watch = krule->watch; + struct inotify_watch *i_watch; + struct audit_parent *parent; + int ret = 0; + + /* update watch filter fields */ + if (ndw) { + watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->path.dentry->d_inode->i_ino; + } + + /* The audit_filter_mutex must not be held during inotify calls because + * we hold it during inotify event callback processing. If an existing + * inotify watch is found, inotify_find_watch() grabs a reference before + * returning. 
+ */ + mutex_unlock(&audit_filter_mutex); + + if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, + &i_watch) < 0) { + parent = audit_init_parent(ndp); + if (IS_ERR(parent)) { + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + return PTR_ERR(parent); + } + } else + parent = container_of(i_watch, struct audit_parent, wdata); + + mutex_lock(&audit_filter_mutex); + + /* parent was moved before we took audit_filter_mutex */ + if (parent->flags & AUDIT_PARENT_INVALID) + ret = -ENOENT; + else + audit_add_to_parent(krule, parent); + + /* match get in audit_init_parent or inotify_find_watch */ + put_inotify_watch(&parent->wdata); + return ret; +} + +void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) +{ + struct audit_watch *watch = krule->watch; + struct audit_parent *parent = watch->parent; + + list_del(&krule->rlist); + + if (list_empty(&watch->rules)) { + audit_remove_watch(watch); + + if (list_empty(&parent->watches)) { + /* Put parent on the inotify un-registration + * list. Grab a reference before releasing + * audit_filter_mutex, to be released in + * audit_inotify_unregister(). + * If filesystem is going away, just leave + * the sucker alone, eviction will take + * care of it. */ + if (pin_inotify_watch(&parent->wdata)) + list_add(&parent->ilist, list); + } + } +} + +/* Update watch data in audit rules based on inotify events. */ +static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, + u32 cookie, const char *dname, struct inode *inode) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + + if (mask & (IN_CREATE|IN_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, + inode->i_ino, 0); + else if (mask & (IN_DELETE|IN_MOVED_FROM)) + audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + /* inotify automatically removes the watch and sends IN_IGNORED */ + else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) + audit_remove_parent_watches(parent); + /* inotify does not remove the watch, so remove it manually */ + else if(mask & IN_MOVE_SELF) { + audit_remove_parent_watches(parent); + inotify_remove_watch_locked(audit_ih, i_watch); + } else if (mask & IN_IGNORED) + put_inotify_watch(i_watch); +} + +static const struct inotify_operations audit_inotify_ops = { + .handle_event = audit_handle_ievent, + .destroy_watch = audit_free_parent, +}; + +static int __init audit_watch_init(void) +{ + audit_ih = inotify_init(&audit_inotify_ops); + if (IS_ERR(audit_ih)) + audit_panic("cannot initialize inotify handle"); + return 0; +} +subsys_initcall(audit_watch_init); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e7466dd145c..9d4c93437de 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include "audit.h" @@ -44,36 +43,6 @@ * be written directly provided audit_filter_mutex is held. */ -/* - * Reference counting: - * - * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED - * event. Each audit_watch holds a reference to its associated parent. - * - * audit_watch: if added to lists, lifetime is from audit_init_watch() to - * audit_remove_watch(). Additionally, an audit_watch may exist - * temporarily to assist in searching existing filter data. Each - * audit_krule holds a reference to its associated watch. 
- */ - -struct audit_parent { - struct list_head ilist; /* entry in inotify registration list */ - struct list_head watches; /* associated watches */ - struct inotify_watch wdata; /* inotify watch data */ - unsigned flags; /* status flags */ -}; - -/* - * audit_parent status flags: - * - * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to - * a filesystem event to ensure we're adding audit watches to a valid parent. - * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot - * receive them while we have nameidata, but must be used for IN_MOVE_SELF which - * we can receive while holding nameidata. - */ -#define AUDIT_PARENT_INVALID 0x001 - /* Audit filter lists, defined in */ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[0]), @@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { DEFINE_MUTEX(audit_filter_mutex); -/* Inotify events we care about. */ -#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF - -void audit_free_parent(struct inotify_watch *i_watch) -{ - struct audit_parent *parent; - - parent = container_of(i_watch, struct audit_parent, wdata); - WARN_ON(!list_empty(&parent->watches)); - kfree(parent); -} - -static inline void audit_get_watch(struct audit_watch *watch) -{ - atomic_inc(&watch->count); -} - -static void audit_put_watch(struct audit_watch *watch) -{ - if (atomic_dec_and_test(&watch->count)) { - WARN_ON(watch->parent); - WARN_ON(!list_empty(&watch->rules)); - kfree(watch->path); - kfree(watch); - } -} - -static void audit_remove_watch(struct audit_watch *watch) -{ - list_del(&watch->wlist); - put_inotify_watch(&watch->parent->wdata); - watch->parent = NULL; - audit_put_watch(watch); /* match initial get */ -} - static inline void audit_free_rule(struct audit_entry *e) { int i; @@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head) audit_free_rule(e); } -/* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) -{ - struct audit_parent *parent; - s32 wd; - - parent = kzalloc(sizeof(*parent), GFP_KERNEL); - if (unlikely(!parent)) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&parent->watches); - parent->flags = 0; - - inotify_init_watch(&parent->wdata); - /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ - get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, - ndp->path.dentry->d_inode, AUDIT_IN_WATCH); - if (wd < 0) { - audit_free_parent(&parent->wdata); - return ERR_PTR(wd); - } - - return parent; -} - -/* Initialize a watch entry. */ -static struct audit_watch *audit_init_watch(char *path) -{ - struct audit_watch *watch; - - watch = kzalloc(sizeof(*watch), GFP_KERNEL); - if (unlikely(!watch)) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&watch->rules); - atomic_set(&watch->count, 1); - watch->path = path; - watch->dev = (dev_t)-1; - watch->ino = (unsigned long)-1; - - return watch; -} - /* Initialize an audit filterlist entry. */ static inline struct audit_entry *audit_init_entry(u32 field_count) { @@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule, return 0; } -/* Translate a watch string to kernel respresentation. 
*/ -static int audit_to_watch(struct audit_krule *krule, char *path, int len, - u32 op) -{ - struct audit_watch *watch; - - if (!audit_ih) - return -EOPNOTSUPP; - - if (path[0] != '/' || path[len-1] == '/' || - krule->listnr != AUDIT_FILTER_EXIT || - op != Audit_equal || - krule->inode_f || krule->watch || krule->tree) - return -EINVAL; - - watch = audit_init_watch(path); - if (IS_ERR(watch)) - return PTR_ERR(watch); - - audit_get_watch(watch); - krule->watch = watch; - - return 0; -} - static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) @@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) break; case AUDIT_WATCH: data->buflen += data->values[i] = - audit_pack_string(&bufp, krule->watch->path); + audit_pack_string(&bufp, + audit_watch_path(krule->watch)); break; case AUDIT_DIR: data->buflen += data->values[i] = @@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 1; break; case AUDIT_WATCH: - if (strcmp(a->watch->path, b->watch->path)) + if (strcmp(audit_watch_path(a->watch), + audit_watch_path(b->watch))) return 1; break; case AUDIT_DIR: @@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 0; } -/* Duplicate the given audit watch. The new watch's rules list is initialized - * to an empty list and wlist is undefined. */ -static struct audit_watch *audit_dupe_watch(struct audit_watch *old) -{ - char *path; - struct audit_watch *new; - - path = kstrdup(old->path, GFP_KERNEL); - if (unlikely(!path)) - return ERR_PTR(-ENOMEM); - - new = audit_init_watch(path); - if (IS_ERR(new)) { - kfree(path); - goto out; - } - - new->dev = old->dev; - new->ino = old->ino; - get_inotify_watch(&old->parent->wdata); - new->parent = old->parent; - -out: - return new; -} - /* Duplicate LSM field information. The lsm_rule is opaque, so must be * re-initialized. */ static inline int audit_dupe_lsm_field(struct audit_field *df, @@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from * the initial copy. */ -static struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch) +struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch *watch) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -977,127 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, return entry; } -static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) -{ - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u op=", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_string(ab, op); - audit_log_format(ab, " path="); - audit_log_untrustedstring(ab, w->path); - if (r->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, r->filterkey); - } else - audit_log_format(ab, " key=(null)"); - audit_log_format(ab, " list=%d res=1", r->listnr); - audit_log_end(ab); - } -} - -/* Update inode info in audit rules based on filesystem event. 
*/ -static void audit_update_watch(struct audit_parent *parent, - const char *dname, dev_t dev, - unsigned long ino, unsigned invalidating) -{ - struct audit_watch *owatch, *nwatch, *nextw; - struct audit_krule *r, *nextr; - struct audit_entry *oentry, *nentry; - - mutex_lock(&audit_filter_mutex); - list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, NULL)) - continue; - - /* If the update involves invalidating rules, do the inode-based - * filtering now, so we don't omit records. */ - if (invalidating && current->audit_context) - audit_filter_inodes(current, current->audit_context); - - nwatch = audit_dupe_watch(owatch); - if (IS_ERR(nwatch)) { - mutex_unlock(&audit_filter_mutex); - audit_panic("error updating watch, skipping"); - return; - } - nwatch->dev = dev; - nwatch->ino = ino; - - list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { - - oentry = container_of(r, struct audit_entry, rule); - list_del(&oentry->rule.rlist); - list_del_rcu(&oentry->list); - - nentry = audit_dupe_rule(&oentry->rule, nwatch); - if (IS_ERR(nentry)) { - list_del(&oentry->rule.list); - audit_panic("error updating watch, removing"); - } else { - int h = audit_hash_ino((u32)ino); - list_add(&nentry->rule.rlist, &nwatch->rules); - list_add_rcu(&nentry->list, &audit_inode_hash[h]); - list_replace(&oentry->rule.list, - &nentry->rule.list); - } - - audit_watch_log_rule_change(r, owatch, "updated rules"); - - call_rcu(&oentry->rcu, audit_free_rule_rcu); - } - - audit_remove_watch(owatch); - goto add_watch_to_parent; /* event applies to a single watch */ - } - mutex_unlock(&audit_filter_mutex); - return; - -add_watch_to_parent: - list_add(&nwatch->wlist, &parent->watches); - mutex_unlock(&audit_filter_mutex); - return; -} - -/* Remove all watches & rules associated with a parent that is going away. */ -static void audit_remove_parent_watches(struct audit_parent *parent) -{ - struct audit_watch *w, *nextw; - struct audit_krule *r, *nextr; - struct audit_entry *e; - - mutex_lock(&audit_filter_mutex); - parent->flags |= AUDIT_PARENT_INVALID; - list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { - list_for_each_entry_safe(r, nextr, &w->rules, rlist) { - e = container_of(r, struct audit_entry, rule); - audit_watch_log_rule_change(r, w, "remove rule"); - list_del(&r->rlist); - list_del(&r->list); - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule_rcu); - } - audit_remove_watch(w); - } - mutex_unlock(&audit_filter_mutex); -} - -/* Unregister inotify watches for parents on in_list. - * Generates an IN_IGNORED event. */ -static void audit_inotify_unregister(struct list_head *in_list) -{ - struct audit_parent *p, *n; - - list_for_each_entry_safe(p, n, in_list, ilist) { - list_del(&p->ilist); - inotify_rm_watch(audit_ih, &p->wdata); - /* the unpin matching the pin in audit_do_del_rule() */ - unpin_inotify_watch(&p->wdata); - } -} - /* Find an existing audit rule. * Caller must hold audit_filter_mutex to prevent stale rule data. */ static struct audit_entry *audit_find_rule(struct audit_entry *entry, @@ -1135,134 +855,6 @@ out: return found; } -/* Get path information necessary for adding watches. 
*/ -static int audit_get_nd(char *path, struct nameidata **ndp, - struct nameidata **ndw) -{ - struct nameidata *ndparent, *ndwatch; - int err; - - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; - - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; - } - - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); - return err; - } - - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; - } - - *ndp = ndparent; - *ndw = ndwatch; - - return 0; -} - -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - -/* Associate the given rule with an existing parent inotify_watch. - * Caller must hold audit_filter_mutex. */ -static void audit_add_to_parent(struct audit_krule *krule, - struct audit_parent *parent) -{ - struct audit_watch *w, *watch = krule->watch; - int watch_found = 0; - - list_for_each_entry(w, &parent->watches, wlist) { - if (strcmp(watch->path, w->path)) - continue; - - watch_found = 1; - - /* put krule's and initial refs to temporary watch */ - audit_put_watch(watch); - audit_put_watch(watch); - - audit_get_watch(w); - krule->watch = watch = w; - break; - } - - if (!watch_found) { - get_inotify_watch(&parent->wdata); - watch->parent = parent; - - list_add(&watch->wlist, &parent->watches); - } - list_add(&krule->rlist, &watch->rules); -} - -/* Find a matching watch entry, or add this one. - * Caller must hold audit_filter_mutex. */ -static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, - struct nameidata *ndw) -{ - struct audit_watch *watch = krule->watch; - struct inotify_watch *i_watch; - struct audit_parent *parent; - int ret = 0; - - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } - - /* The audit_filter_mutex must not be held during inotify calls because - * we hold it during inotify event callback processing. If an existing - * inotify watch is found, inotify_find_watch() grabs a reference before - * returning. - */ - mutex_unlock(&audit_filter_mutex); - - if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, - &i_watch) < 0) { - parent = audit_init_parent(ndp); - if (IS_ERR(parent)) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - return PTR_ERR(parent); - } - } else - parent = container_of(i_watch, struct audit_parent, wdata); - - mutex_lock(&audit_filter_mutex); - - /* parent was moved before we took audit_filter_mutex */ - if (parent->flags & AUDIT_PARENT_INVALID) - ret = -ENOENT; - else - audit_add_to_parent(krule, parent); - - /* match get in audit_init_parent or inotify_find_watch */ - put_inotify_watch(&parent->wdata); - return ret; -} - static u64 prio_low = ~0ULL/2; static u64 prio_high = ~0ULL/2 - 1; @@ -1297,7 +889,7 @@ static inline int audit_add_rule(struct audit_entry *entry) /* Avoid calling path_lookup under audit_filter_mutex. 
*/ if (watch) { - err = audit_get_nd(watch->path, &ndp, &ndw); + err = audit_get_nd(audit_watch_path(watch), &ndp, &ndw); if (err) goto error; } @@ -1312,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry) } /* entry->rule.watch may have changed during audit_add_watch() */ watch = entry->rule.watch; - h = audit_hash_ino((u32)watch->ino); + h = audit_hash_ino((u32)audit_watch_inode(watch)); list = &audit_inode_hash[h]; } if (tree) { @@ -1364,7 +956,7 @@ error: static inline int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_watch *watch, *tmp_watch = entry->rule.watch; + struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; LIST_HEAD(inotify_list); @@ -1386,29 +978,8 @@ static inline int audit_del_rule(struct audit_entry *entry) goto out; } - watch = e->rule.watch; - if (watch) { - struct audit_parent *parent = watch->parent; - - list_del(&e->rule.rlist); - - if (list_empty(&watch->rules)) { - audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - /* Put parent on the inotify un-registration - * list. Grab a reference before releasing - * audit_filter_mutex, to be released in - * audit_inotify_unregister(). - * If filesystem is going away, just leave - * the sucker alone, eviction will take - * care of it. - */ - if (pin_inotify_watch(&parent->wdata)) - list_add(&parent->ilist, &inotify_list); - } - } - } + if (e->rule.watch) + audit_remove_watch_rule(&e->rule, &inotify_list); if (e->rule.tree) audit_remove_tree_rule(&e->rule); @@ -1430,8 +1001,8 @@ static inline int audit_del_rule(struct audit_entry *entry) audit_inotify_unregister(&inotify_list); out: - if (tmp_watch) - audit_put_watch(tmp_watch); /* match initial get */ + if (watch) + audit_put_watch(watch); /* match initial get */ if (tree) audit_put_tree(tree); /* that's the temporary one */ @@ -1785,7 +1356,7 @@ static int update_lsm_rule(struct audit_krule *r) list_del(&r->list); } else { if (watch) { - list_add(&nentry->rule.rlist, &watch->rules); + list_add(&nentry->rule.rlist, audit_watch_rules(watch)); list_del(&r->rlist); } else if (tree) list_replace_init(&r->rlist, &nentry->rule.rlist); @@ -1821,27 +1392,3 @@ int audit_update_lsm_rules(void) return err; } - -/* Update watch data in audit rules based on inotify events. 
*/ -void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) -{ - struct audit_parent *parent; - - parent = container_of(i_watch, struct audit_parent, wdata); - - if (mask & (IN_CREATE|IN_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, - inode->i_ino, 0); - else if (mask & (IN_DELETE|IN_MOVED_FROM)) - audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); - /* inotify automatically removes the watch and sends IN_IGNORED */ - else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) - audit_remove_parent_watches(parent); - /* inotify does not remove the watch, so remove it manually */ - else if(mask & IN_MOVE_SELF) { - audit_remove_parent_watches(parent); - inotify_remove_watch_locked(audit_ih, i_watch); - } else if (mask & IN_IGNORED) - put_inotify_watch(i_watch); -} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b14d234b85f..0b862cac6ca 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -548,9 +548,9 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_WATCH: - if (name && rule->watch->ino != (unsigned long)-1) - result = (name->dev == rule->watch->dev && - name->ino == rule->watch->ino); + if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) + result = (name->dev == audit_watch_dev(rule->watch) && + name->ino == audit_watch_inode(rule->watch)); break; case AUDIT_DIR: if (ctx) -- cgit v1.2.3 From 35fe4d0b1b12286a81938e9c5fdfaf639ac0ce5b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:36 -0400 Subject: Audit: move audit_get_nd completely into audit_watch audit_get_nd() is only used by audit_watch and could be more cleanly implemented by having the audit watch functions call it when needed rather than making the generic audit rule parsing code deal with those objects. Signed-off-by: Eric Paris --- kernel/audit.h | 5 +---- kernel/audit_watch.c | 27 ++++++++++++++++++++------- kernel/auditfilter.c | 15 ++------------- 3 files changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 704d5b01d9f..bb1c0d69db0 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -109,10 +109,7 @@ extern dev_t audit_watch_dev(struct audit_watch *watch); extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); -extern int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw); -extern void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw); -extern int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, - struct nameidata *ndw); +extern int audit_add_watch(struct audit_krule *krule); extern void audit_remove_watch(struct audit_watch *watch); extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list); extern void audit_inotify_unregister(struct list_head *in_list); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index da8be6d39c1..b49ab019fdf 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -345,7 +345,7 @@ void audit_inotify_unregister(struct list_head *in_list) } /* Get path information necessary for adding watches. 
*/ -int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) { struct nameidata *ndparent, *ndwatch; int err; @@ -380,7 +380,7 @@ int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) } /* Release resources used for watch path information. */ -void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) +static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) { if (ndp) { path_put(&ndp->path); @@ -426,14 +426,24 @@ static void audit_add_to_parent(struct audit_krule *krule, /* Find a matching watch entry, or add this one. * Caller must hold audit_filter_mutex. */ -int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, - struct nameidata *ndw) +int audit_add_watch(struct audit_krule *krule) { struct audit_watch *watch = krule->watch; struct inotify_watch *i_watch; struct audit_parent *parent; + struct nameidata *ndp = NULL, *ndw = NULL; int ret = 0; + mutex_unlock(&audit_filter_mutex); + + /* Avoid calling path_lookup under audit_filter_mutex. */ + ret = audit_get_nd(watch->path, &ndp, &ndw); + if (ret) { + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + goto error; + } + /* update watch filter fields */ if (ndw) { watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; @@ -445,15 +455,14 @@ int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, * inotify watch is found, inotify_find_watch() grabs a reference before * returning. */ - mutex_unlock(&audit_filter_mutex); - if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, &i_watch) < 0) { parent = audit_init_parent(ndp); if (IS_ERR(parent)) { /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - return PTR_ERR(parent); + ret = PTR_ERR(parent); + goto error; } } else parent = container_of(i_watch, struct audit_parent, wdata); @@ -468,7 +477,11 @@ int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, /* match get in audit_init_parent or inotify_find_watch */ put_inotify_watch(&parent->wdata); + +error: + audit_put_nd(ndp, ndw); /* NULL args OK */ return ret; + } void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9d4c93437de..21b623595aa 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -864,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_entry *e; struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; - struct nameidata *ndp = NULL, *ndw = NULL; struct list_head *list; int h, err; #ifdef CONFIG_AUDITSYSCALL @@ -878,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry) mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); - mutex_unlock(&audit_filter_mutex); if (e) { + mutex_unlock(&audit_filter_mutex); err = -EEXIST; /* normally audit_add_tree_rule() will free it on failure */ if (tree) @@ -887,17 +886,9 @@ static inline int audit_add_rule(struct audit_entry *entry) goto error; } - /* Avoid calling path_lookup under audit_filter_mutex. 
*/ - if (watch) { - err = audit_get_nd(audit_watch_path(watch), &ndp, &ndw); - if (err) - goto error; - } - - mutex_lock(&audit_filter_mutex); if (watch) { /* audit_filter_mutex is dropped and re-taken during this call */ - err = audit_add_watch(&entry->rule, ndp, ndw); + err = audit_add_watch(&entry->rule); if (err) { mutex_unlock(&audit_filter_mutex); goto error; @@ -942,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - audit_put_nd(ndp, ndw); /* NULL args OK */ return 0; error: - audit_put_nd(ndp, ndw); /* NULL args OK */ if (watch) audit_put_watch(watch); /* tmp watch, matches initial get */ return err; -- cgit v1.2.3 From 9d9609851003ebed15957f0f2ce18492739ee124 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 14:31:37 -0400 Subject: Audit: clean up all op= output to include string quoting A number of places in the audit system we send an op= followed by a string that includes spaces. Somehow this works but it's just wrong. This patch moves all of those that I could find to be quoted. Example: Change From: type=CONFIG_CHANGE msg=audit(1244666690.117:31): auid=0 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 op=remove rule key="number2" list=4 res=0 Change To: type=CONFIG_CHANGE msg=audit(1244666690.117:31): auid=0 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 op="remove rule" key="number2" list=4 res=0 Signed-off-by: Eric Paris --- kernel/audit.c | 9 +++++++++ kernel/audit_tree.c | 10 ++++------ kernel/audit_watch.c | 6 +----- kernel/auditfilter.c | 12 +++++------- kernel/auditsc.c | 8 ++------ 5 files changed, 21 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e07ad2340db..6194c50e203 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1450,6 +1450,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, kfree(pathname); } +void audit_log_key(struct audit_buffer *ab, char *key) +{ + audit_log_format(ab, " key="); + if (key) + audit_log_untrustedstring(ab, key); + else + audit_log_format(ab, "(null)"); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 1f6396d7668..3ff0731284a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -441,13 +441,11 @@ static void kill_rules(struct audit_tree *tree) if (rule->tree) { /* not a half-baked one */ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op=remove rule dir="); + audit_log_format(ab, "op="); + audit_log_string(ab, "remove rule"); + audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); - if (rule->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, rule->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=1", rule->listnr); audit_log_end(ab); rule->tree = NULL; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index b49ab019fdf..0e96dbc60ea 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -234,11 +234,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc audit_log_string(ab, op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, w->path); - if (r->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, r->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, r->filterkey); 
audit_log_format(ab, " list=%d res=1", r->listnr); audit_log_end(ab); } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 21b623595aa..a70604047f3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1079,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, security_release_secctx(ctx, len); } } - audit_log_format(ab, " op=%s rule key=", action); - if (rule->filterkey) - audit_log_untrustedstring(ab, rule->filterkey); - else - audit_log_format(ab, "(null)"); + audit_log_format(ab, " op="); + audit_log_string(ab, action); + audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=%d", rule->listnr, res); audit_log_end(ab); } @@ -1147,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return PTR_ERR(entry); err = audit_add_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "add", + audit_log_rule_change(loginuid, sessionid, sid, "add rule", &entry->rule, !err); if (err) @@ -1163,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return PTR_ERR(entry); err = audit_del_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "remove", + audit_log_rule_change(loginuid, sessionid, sid, "remove rule", &entry->rule, !err); audit_free_rule(entry); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 0b862cac6ca..2de95d1582b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1137,7 +1137,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (has_cntl) audit_log_n_hex(*ab, buf, to_send); else - audit_log_format(*ab, "\"%s\"", buf); + audit_log_string(*ab, buf); p += to_send; len_left -= to_send; @@ -1372,11 +1372,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_task_info(ab, tsk); - if (context->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, context->filterkey); - } else - audit_log_format(ab, " key=(null)"); + audit_log_key(ab, context->filterkey); audit_log_end(ab); for (aux = context->aux; aux; aux = aux->next) { -- cgit v1.2.3 From 916d75761c971b6e630a26bd4ba472e90ac9a4b9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 24 Jun 2009 00:02:38 -0400 Subject: Fix rule eviction order for AUDIT_DIR If syscall removes the root of subtree being watched, we definitely do not want the rules refering that subtree to be destroyed without the syscall in question having a chance to match them. Signed-off-by: Al Viro --- kernel/audit.c | 17 +--------------- kernel/audit.h | 7 +++++-- kernel/audit_tree.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++----- kernel/auditsc.c | 15 ++++++++++++++ 4 files changed, 72 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6194c50e203..defc2e6f1e3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -133,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); /* Serialize requests from userspace. */ -static DEFINE_MUTEX(audit_cmd_mutex); +DEFINE_MUTEX(audit_cmd_mutex); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. 
Since printk uses a 1024 byte buffer, this buffer @@ -505,21 +505,6 @@ int audit_send_list(void *_dest) return 0; } -#ifdef CONFIG_AUDIT_TREE -static int prune_tree_thread(void *unused) -{ - mutex_lock(&audit_cmd_mutex); - audit_prune_trees(); - mutex_unlock(&audit_cmd_mutex); - return 0; -} - -void audit_schedule_prune(void) -{ - kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); -} -#endif - struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { diff --git a/kernel/audit.h b/kernel/audit.h index bb1c0d69db0..208687be4f3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -128,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *); extern int audit_remove_tree_rule(struct audit_krule *); extern void audit_trim_trees(void); extern int audit_tag_tree(char *old, char *new); -extern void audit_schedule_prune(void); -extern void audit_prune_trees(void); extern const char *audit_tree_path(struct audit_tree *); extern void audit_put_tree(struct audit_tree *); +extern void audit_kill_trees(struct list_head *); #else #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL @@ -140,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *); #define audit_put_tree(tree) (void)0 #define audit_tag_tree(old, new) -EINVAL #define audit_tree_path(rule) "" /* never called */ +#define audit_kill_trees(list) BUG() #endif extern char *audit_unpack_string(void **, size_t *, size_t); @@ -158,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } extern void audit_filter_inodes(struct task_struct *, struct audit_context *); +extern struct list_head *audit_killed_trees(void); #else #define audit_signal_info(s,t) AUDIT_DISABLED #define audit_filter_inodes(t,c) AUDIT_DISABLED #endif + +extern struct mutex audit_cmd_mutex; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 3ff0731284a..2451dc6f328 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -2,6 +2,7 @@ #include #include #include +#include struct audit_tree; struct audit_chunk; @@ -517,6 +518,8 @@ static void trim_marked(struct audit_tree *tree) } } +static void audit_schedule_prune(void); + /* called with audit_filter_mutex */ int audit_remove_tree_rule(struct audit_krule *rule) { @@ -822,10 +825,11 @@ int audit_tag_tree(char *old, char *new) /* * That gets run when evict_chunk() ends up needing to kill audit_tree. - * Runs from a separate thread, with audit_cmd_mutex held. + * Runs from a separate thread. */ -void audit_prune_trees(void) +static int prune_tree_thread(void *unused) { + mutex_lock(&audit_cmd_mutex); mutex_lock(&audit_filter_mutex); while (!list_empty(&prune_list)) { @@ -842,6 +846,40 @@ void audit_prune_trees(void) } mutex_unlock(&audit_filter_mutex); + mutex_unlock(&audit_cmd_mutex); + return 0; +} + +static void audit_schedule_prune(void) +{ + kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); +} + +/* + * ... and that one is done if evict_chunk() decides to delay until the end + * of syscall. Runs synchronously. 
+ */ +void audit_kill_trees(struct list_head *list) +{ + mutex_lock(&audit_cmd_mutex); + mutex_lock(&audit_filter_mutex); + + while (!list_empty(list)) { + struct audit_tree *victim; + + victim = list_entry(list->next, struct audit_tree, list); + kill_rules(victim); + list_del_init(&victim->list); + + mutex_unlock(&audit_filter_mutex); + + prune_one(victim); + + mutex_lock(&audit_filter_mutex); + } + + mutex_unlock(&audit_filter_mutex); + mutex_unlock(&audit_cmd_mutex); } /* @@ -852,6 +890,8 @@ void audit_prune_trees(void) static void evict_chunk(struct audit_chunk *chunk) { struct audit_tree *owner; + struct list_head *postponed = audit_killed_trees(); + int need_prune = 0; int n; if (chunk->dead) @@ -867,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk) owner->root = NULL; list_del_init(&owner->same_root); spin_unlock(&hash_lock); - kill_rules(owner); - list_move(&owner->list, &prune_list); - audit_schedule_prune(); + if (!postponed) { + kill_rules(owner); + list_move(&owner->list, &prune_list); + need_prune = 1; + } else { + list_move(&owner->list, postponed); + } spin_lock(&hash_lock); } list_del_rcu(&chunk->hash); for (n = 0; n < chunk->count; n++) list_del_init(&chunk->owners[n].list); spin_unlock(&hash_lock); + if (need_prune) + audit_schedule_prune(); mutex_unlock(&audit_filter_mutex); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2de95d1582b..68d3c6a0ecd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -199,6 +199,7 @@ struct audit_context { struct audit_tree_refs *trees, *first_trees; int tree_count; + struct list_head killed_trees; int type; union { @@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) return NULL; audit_zero_context(context, state); + INIT_LIST_HEAD(&context->killed_trees); return context; } @@ -1545,6 +1547,8 @@ void audit_free(struct task_struct *tsk) /* that can happen only if we are called from do_exit() */ if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); + if (!list_empty(&context->killed_trees)) + audit_kill_trees(&context->killed_trees); audit_free_context(context); } @@ -1688,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; + if (!list_empty(&context->killed_trees)) + audit_kill_trees(&context->killed_trees); + if (context->previous) { struct audit_context *new_context = context->previous; context->previous = NULL; @@ -2521,3 +2528,11 @@ void audit_core_dumps(long signr) audit_log_format(ab, " sig=%ld", signr); audit_log_end(ab); } + +struct list_head *audit_killed_trees(void) +{ + struct audit_context *ctx = current->audit_context; + if (likely(!ctx || !ctx->in_syscall)) + return NULL; + return &ctx->killed_trees; +} -- cgit v1.2.3 From e1c7e2a6e67fe9db19dd15e71614526a31b5fdb1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:52:29 +0800 Subject: tracing/events: Don't increment @pos in s_start() While testing syscall tracepoints posted by Jason, I found 3 entries were missing when reading available_events. The output size of available_events is < 4 pages, which means we lost 1 entry per page. The cause is, it's wrong to increment @pos in s_start(). 
Actually there's another bug here -- reading available_events/set_events can race with module unload: # cat available_events | s_start() | s_stop() | | # rmmod foo.ko s_start() | call = list_entry(m->private) | @call might be freed and accessing it will lead to a crash. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4186DD.6090405@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b..53c8fd376a8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + struct ftrace_event_call *call = NULL; + loff_t l; + mutex_lock(&event_mutex); - if (*pos == 0) - m->private = ftrace_events.next; - return t_next(m, NULL, pos); + + m->private = ftrace_events.next; + for (l = 0; l <= *pos; ) { + call = t_next(m, NULL, &l); + if (!call) + break; + } + return call; } static void * @@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { + struct ftrace_event_call *call = NULL; + loff_t l; + mutex_lock(&event_mutex); - if (*pos == 0) - m->private = ftrace_events.next; - return s_next(m, NULL, pos); + + m->private = ftrace_events.next; + for (l = 0; l <= *pos; ) { + call = s_next(m, NULL, &l); + if (!call) + break; + } + return call; } static int t_show(struct seq_file *m, void *v) -- cgit v1.2.3 From c8961ec6da22ea010bf4470a8e0fb3fdad0f11c4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:52:58 +0800 Subject: tracing_bprintk: Don't increment @pos in t_start() It's wrong to increment @pos in t_start(), otherwise we'll lose some entries when reading printk_formats, if the output is larger than PAGE_SIZE.
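The seq_file contract these fixes rely on: ->start() returns the element for the current *pos without advancing it, and only ->next() bumps *pos. A minimal sketch of that pattern over a static array (the ex_* names are illustrative, not part of the patch):

	#include <linux/kernel.h>
	#include <linux/seq_file.h>

	static const char *ex_items[] = { "alpha", "beta", "gamma" };

	static void *ex_start(struct seq_file *m, loff_t *pos)
	{
		/* Return the entry at *pos, or NULL past the end; never advance *pos here. */
		return *pos < ARRAY_SIZE(ex_items) ? (void *)&ex_items[*pos] : NULL;
	}

	static void *ex_next(struct seq_file *m, void *v, loff_t *pos)
	{
		/* Advancing the position is ->next()'s job alone. */
		(*pos)++;
		return ex_start(m, pos);
	}

	static void ex_stop(struct seq_file *m, void *v)
	{
	}

	static int ex_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "%s\n", *(const char **)v);
		return 0;
	}

	static const struct seq_operations ex_seq_ops = {
		.start	= ex_start,
		.next	= ex_next,
		.stop	= ex_stop,
		.show	= ex_show,
	};

This is the same shape the trace_printk fix below settles on: start() derives the record purely from *pos, so a second call to start() (when the output spans more than one page) lands on the right entry instead of skipping one.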
Reported-by: Lai Jiangshan Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4186FA.1020106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_printk.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 9bece9687b6..7b627811082 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) EXPORT_SYMBOL_GPL(__ftrace_vprintk); static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +t_start(struct seq_file *m, loff_t *pos) { - const char **fmt = m->private; - const char **next = fmt; - - (*pos)++; + const char **fmt = __start___trace_bprintk_fmt + *pos; if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) return NULL; - - next = fmt; - m->private = ++next; - return fmt; } -static void *t_start(struct seq_file *m, loff_t *pos) +static void *t_next(struct seq_file *m, void * v, loff_t *pos) { - return t_next(m, NULL, pos); + (*pos)++; + return t_start(m, pos); } static int t_show(struct seq_file *m, void *v) @@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = { static int ftrace_formats_open(struct inode *inode, struct file *file) { - int ret; - - ret = seq_open(file, &show_format_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = __start___trace_bprintk_fmt; - } - return ret; + return seq_open(file, &show_format_seq_ops); } static const struct file_operations ftrace_formats_fops = { -- cgit v1.2.3 From 2961bf345fd1b736c3db46cad0f69855f67fbe9c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:53:26 +0800 Subject: trace_stat: Don't increment @pos in seq start() It's wrong to increment @pos in stat_seq_start(). It causes some stat entries lost when reading stat file, if the output of the file is larger than PAGE_SIZE. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A418716.90209@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index c00643733f4..e66f5e49334 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -199,17 +199,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) { - (*pos)++; + if (!*pos && session->ts->stat_headers) return SEQ_START_TOKEN; - } node = rb_first(&session->stat_root); for (i = 0; node && i < *pos; i++) node = rb_next(node); - (*pos)++; - return node; } -- cgit v1.2.3 From f129e965bef40c6153e4fe505f1e408286213424 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:53:44 +0800 Subject: tracing: Reset iterator in t_start() The iterator is m->private, but it's not reset to trace_types in t_start(). If the output is larger than PAGE_SIZE and t_start() is called the 2nd time, things will go wrong. 
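The same cure applies to list-backed iterators: rewind to the list head on every ->start() call and walk forward *pos entries under the lock, instead of trusting whatever m->private held after the previous read. A rough sketch with an illustrative singly linked list (the ex_* names are made up for the example):

	#include <linux/mutex.h>
	#include <linux/seq_file.h>

	struct ex_node {
		struct ex_node *next;
		const char *name;
	};

	static struct ex_node *ex_head;		/* list head, protected by ex_lock */
	static DEFINE_MUTEX(ex_lock);

	static void *ex_start(struct seq_file *m, loff_t *pos)
	{
		struct ex_node *n;
		loff_t l = 0;

		mutex_lock(&ex_lock);
		/* Restart from the head every time; no state cached across reads. */
		for (n = ex_head; n && l < *pos; n = n->next)
			l++;
		return n;
	}

	static void *ex_next(struct seq_file *m, void *v, loff_t *pos)
	{
		struct ex_node *n = v;

		(*pos)++;
		return n ? n->next : NULL;
	}

	static void ex_stop(struct seq_file *m, void *v)
	{
		mutex_unlock(&ex_lock);
	}

Because nothing cached from an earlier read is dereferenced, the stale-pointer problem described for available_events above cannot occur here either. The cost is an O(*pos) rescan per start() call, which is acceptable for listings of this size.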
Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A418728.5020506@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 076fa6f0ee4..3bb31006b5c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2053,25 +2053,23 @@ static int tracing_open(struct inode *inode, struct file *file) static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct tracer *t = m->private; + struct tracer *t = v; (*pos)++; if (t) t = t->next; - m->private = t; - return t; } static void *t_start(struct seq_file *m, loff_t *pos) { - struct tracer *t = m->private; + struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (; t && l < *pos; t = t_next(m, t, &l)) + for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) ; return t; @@ -2107,18 +2105,10 @@ static struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { - int ret; - if (tracing_disabled) return -ENODEV; - ret = seq_open(file, &show_traces_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = trace_types; - } - - return ret; + return seq_open(file, &show_traces_seq_ops); } static ssize_t -- cgit v1.2.3 From 85951842a1020669f0a9eb0f0d1853b41341f097 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:54:00 +0800 Subject: ftrace: Don't increment @pos in g_start() It's wrong to increment @pos in g_start(). It causes some entries lost when reading set_graph_function, if the output of the file is larger than PAGE_SIZE. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A418738.7090401@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3718d55fb4c..cde74b9973b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2500,32 +2500,31 @@ int ftrace_graph_count; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * -g_next(struct seq_file *m, void *v, loff_t *pos) +__g_next(struct seq_file *m, loff_t *pos) { unsigned long *array = m->private; - int index = *pos; - - (*pos)++; - if (index >= ftrace_graph_count) + if (*pos >= ftrace_graph_count) return NULL; + return &array[*pos]; +} - return &array[index]; +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __g_next(m, pos); } static void *g_start(struct seq_file *m, loff_t *pos) { - void *p = NULL; - mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ if (!ftrace_graph_count && !*pos) return (void *)1; - p = g_next(m, p, pos); - - return p; + return __g_next(m, pos); } static void g_stop(struct seq_file *m, void *p) -- cgit v1.2.3 From 694ce0a544fba37a60025a6803ee6265be8a2a22 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:54:19 +0800 Subject: ftrace: Don't manipulate @pos in t_start() It's rather confusing that in t_start(), in some cases @pos is incremented, and in some cases it's decremented and then incremented. This patch rewrites t_start() in a much more general way. 
Thus we fix a bug: if ftrace_filtered == 1, functions that have tracer hooks won't be printed, because the branch is always unreachable: static void *t_start(...) { ... if (!p) return t_hash_start(m, pos); return p; } Before: # echo 'sys_open' > /mnt/tracing/set_ftrace_filter # echo 'sys_write:traceon:4' >> /mnt/tracing/set_ftrace_filter sys_open After: # echo 'sys_open' > /mnt/tracing/set_ftrace_filter # echo 'sys_write:traceon:4' >> /mnt/tracing/set_ftrace_filter sys_open sys_write:traceon:count=4 Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A41874B.4090507@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cde74b9973b..dc810208edd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1467,8 +1467,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) iter->pg = iter->pg->next; iter->idx = 0; goto retry; - } else { - iter->idx = -1; } } else { rec = &iter->pg->records[iter->idx++]; @@ -1497,6 +1495,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; + loff_t l; mutex_lock(&ftrace_lock); /* @@ -1508,23 +1507,21 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; - (*pos)++; return iter; } if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); - if (*pos > 0) { - if (iter->idx < 0) - return p; - (*pos)--; - iter->idx--; + iter->pg = ftrace_pages_start; + iter->idx = 0; + for (l = 0; l <= *pos; ) { + p = t_next(m, p, &l); + if (!p) + break; } - p = t_next(m, p, pos); - - if (!p) + if (!p && iter->flags & FTRACE_ITER_FILTER) return t_hash_start(m, pos); return p; -- cgit v1.2.3 From d82d62444f87e5993af2fa82ed636b2206e052ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:54:54 +0800 Subject: ftrace: Fix t_hash_start() When the output of set_ftrace_filter is larger than PAGE_SIZE, t_hash_start() will be called the 2nd time, and then we start from the head of a hlist, which is wrong and causes some entries to be output twice. Worse, if the hlist is large enough, reading set_ftrace_filter won't stop but will spin in a dead loop.
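The hash listing needs the same treatment as the linear one: on every start() call, walk the buckets from the beginning up to *pos rather than resuming from a remembered hlist position. A sketch of that walk over an illustrative open-hash table (the ex_* names are not from the kernel):

	#include <linux/list.h>
	#include <linux/seq_file.h>

	struct ex_entry {
		struct hlist_node node;
		unsigned long ip;
	};

	#define EX_BUCKETS 16
	static struct hlist_head ex_table[EX_BUCKETS];

	static void *ex_hash_start(struct seq_file *m, loff_t *pos)
	{
		struct hlist_node *n;
		loff_t l = 0;
		int i;

		/* O(*pos) rescan: dull, but immune to being called twice per read. */
		for (i = 0; i < EX_BUCKETS; i++) {
			for (n = ex_table[i].first; n; n = n->next) {
				if (l++ == *pos)
					return hlist_entry(n, struct ex_entry, node);
			}
		}
		return NULL;	/* walked past the last entry: end of output */
	}

The subtlety the patch also handles is the switch between the two listing phases: when the iterator first crosses from the plain function list into the hash list, *pos still counts from the start of the whole file, so it is zeroed once at that transition before the hash walk begins.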
Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A41876E.2060407@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dc810208edd..71a52c17214 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1417,10 +1417,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; + loff_t l; + + if (!(iter->flags & FTRACE_ITER_HASH)) + *pos = 0; iter->flags |= FTRACE_ITER_HASH; - return t_hash_next(m, p, pos); + iter->hidx = 0; + for (l = 0; l <= *pos; ) { + p = t_hash_next(m, p, &l); + if (!p) + break; + } + return p; } static int t_hash_show(struct seq_file *m, void *v) -- cgit v1.2.3 From 507e123151149e578c9aae33eb876c49824da5f8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 Jun 2009 17:38:15 +0200 Subject: timer stats: Optimize by adding quick check to avoid function calls When the kernel is configured with CONFIG_TIMER_STATS but timer stats are runtime disabled we still get calls to __timer_stats_timer_set_start_info which initializes some fields in the corresponding struct timer_list. So add some quick checks in the the timer stats setup functions to avoid function calls to __timer_stats_timer_set_start_info when timer stats are disabled. In an artificial workload that does nothing but playing ping pong with a single tcp packet via loopback this decreases cpu consumption by 1 - 1.5%. This is part of a modified function trace output on SLES11: perl-2497 [00] 28630647177732388 [+ 125]: sk_reset_timer <-tcp_v4_rcv perl-2497 [00] 28630647177732513 [+ 125]: mod_timer <-sk_reset_timer perl-2497 [00] 28630647177732638 [+ 125]: __timer_stats_timer_set_start_info <-mod_timer perl-2497 [00] 28630647177732763 [+ 125]: __mod_timer <-mod_timer perl-2497 [00] 28630647177732888 [+ 125]: __timer_stats_timer_set_start_info <-__mod_timer perl-2497 [00] 28630647177733013 [+ 93]: lock_timer_base <-__mod_timer Signed-off-by: Heiko Carstens Cc: Andrew Morton Cc: Martin Schwidefsky Cc: Mustafa Mesanovic Cc: Arjan van de Ven LKML-Reference: <20090623153811.GA4641@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/time/timer_stats.c | 16 ++++++++-------- kernel/timer.c | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index c994530d166..4cde8b9c716 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex); /* * Collection status, active/inactive: */ -static int __read_mostly active; +int __read_mostly timer_stats_active; /* * Beginning/end timestamps of measurement: @@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, struct entry *entry, input; unsigned long flags; - if (likely(!active)) + if (likely(!timer_stats_active)) return; lock = &per_cpu(lookup_lock, raw_smp_processor_id()); @@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.timer_flag = timer_flag; spin_lock_irqsave(lock, flags); - if (!active) + if (!timer_stats_active) goto out_unlock; entry = tstat_lookup(&input, comm); @@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v) /* * If still active then calculate up to now: */ - if (active) + if (timer_stats_active) time_stop = 
ktime_get(); time = ktime_sub(time_stop, time_start); @@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf, mutex_lock(&show_mutex); switch (ctl[0]) { case '0': - if (active) { - active = 0; + if (timer_stats_active) { + timer_stats_active = 0; time_stop = ktime_get(); sync_access(); } break; case '1': - if (!active) { + if (!timer_stats_active) { reset_entries(); time_start = ktime_get(); smp_mb(); - active = 1; + timer_stats_active = 1; } break; default: diff --git a/kernel/timer.c b/kernel/timer.c index 54d3912f8ca..0b36b9e5cc8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer) { unsigned int flag = 0; + if (likely(!timer->start_site)) + return; if (unlikely(tbase_get_deferrable(timer->base))) flag |= TIMER_STATS_FLAG_DEFERRABLE; -- cgit v1.2.3 From 9d612beff5089b89a295a2331883a8ce3fff08c1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 17:33:15 +0800 Subject: tracing: Fix trace_buf_size boot option We should be able to specify [KMG] when setting trace_buf_size boot option, as documented in kernel-parameters.txt Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A41F2DB.4020102@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3bb31006b5c..3aa0a0dfdfa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -284,13 +284,12 @@ void trace_wake_up(void) static int __init set_buf_size(char *str) { unsigned long buf_size; - int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &buf_size); + buf_size = memparse(str, &str); /* nr_entries can not be zero */ - if (ret < 0 || buf_size == 0) + if (buf_size == 0) return 0; trace_buf_size = buf_size; return 1; -- cgit v1.2.3 From d0725992c8a6fb63a16bc9e8b2a50094cc4db3cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Jun 2009 23:15:43 +0200 Subject: futex: Fix the write access fault problem for real commit 64d1304a64 (futex: setup writeable mapping for futex ops which modify user space data) did address only half of the problem of write access faults. The patch was made on two wrong assumptions: 1) access_ok(VERIFY_WRITE,...) would actually check write access. On x86 it does _NOT_. It's a pure address range check. 2) a RW mapped region can not go away under us. That's wrong as well. Nobody can prevent another thread to call mprotect(PROT_READ) on that region where the futex resides. If that call hits between the get_user_pages_fast() verification and the actual write access in the atomic region we are toast again. The solution is to not rely on access_ok and get_user() for any write access related fault on private and shared futexes. Instead we need to fault it in with verification of write access. There is no generic non destructive write mechanism which would fault the user page in trough a #PF, but as we already know that we will fault we can as well call get_user_pages() directly and avoid the #PF overhead. If get_user_pages() returns -EFAULT we know that we can not fix it anymore and need to bail out to user space. Remove a bunch of confusing comments on this issue as well. 
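In short, the pattern the patch converges on is: do the user-space store inside a pagefault-disabled section, and when that faults, drop the hash-bucket locks and fault the page in with write permission via get_user_pages() before retrying. A stripped-down sketch of that loop (locking and the real atomic futex op omitted; the ex_* names are illustrative):

	#include <linux/mm.h>
	#include <linux/sched.h>
	#include <linux/uaccess.h>

	static int ex_fault_in_writeable(u32 __user *uaddr)
	{
		/* One page, write = 1: the fault is resolved with write access verified. */
		int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
					 1, 1, 0, NULL, NULL);

		return ret < 0 ? ret : 0;
	}

	static int ex_user_store(u32 __user *uaddr, u32 newval)
	{
		int err;

	retry:
		pagefault_disable();
		err = __put_user(newval, uaddr);	/* stand-in for the real atomic futex op */
		pagefault_enable();

		if (err) {
			/* The real code drops hb->lock here before the fault-in can sleep. */
			if (ex_fault_in_writeable(uaddr))
				return -EFAULT;
			goto retry;
		}
		return 0;
	}

Unlike the old get_user() based fallback, this fault-in path can fail permanently (for example after an mprotect(PROT_READ) on the range), in which case the operation bails out to user space with -EFAULT instead of looping.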
Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/futex.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 80b5ce71659..1c337112335 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -284,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } +/* + * fault_in_user_writeable - fault in user address and verify RW access + * @uaddr: pointer to faulting user space address + * + * Slow path to fixup the fault we just took in the atomic write + * access to @uaddr. + * + * We have no generic implementation of a non destructive write to the + * user address. We know that we faulted in the atomic pagefault + * disabled section so we can as well avoid the #PF overhead by + * calling get_user_pages() right away. + */ +static int fault_in_user_writeable(u32 __user *uaddr) +{ + int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, + sizeof(*uaddr), 1, 0, NULL, NULL); + return ret < 0 ? ret : 0; +} + /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in @@ -896,7 +915,6 @@ retry: retry_private: op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - u32 dummy; double_unlock_hb(hb1, hb2); @@ -914,7 +932,7 @@ retry_private: goto out_put_keys; } - ret = get_user(dummy, uaddr2); + ret = fault_in_user_writeable(uaddr2); if (ret) goto out_put_keys; @@ -1204,7 +1222,7 @@ retry_private: double_unlock_hb(hb1, hb2); put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - ret = get_user(curval2, uaddr2); + ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; goto out; @@ -1482,7 +1500,7 @@ retry: handle_fault: spin_unlock(q->lock_ptr); - ret = get_user(uval, uaddr); + ret = fault_in_user_writeable(uaddr); spin_lock(q->lock_ptr); @@ -1807,7 +1825,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, { struct hrtimer_sleeper timeout, *to = NULL; struct futex_hash_bucket *hb; - u32 uval; struct futex_q q; int res, ret; @@ -1909,16 +1926,9 @@ out: return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: - /* - * We have to r/w *(int __user *)uaddr, and we have to modify it - * atomically. Therefore, if we continue to fault after get_user() - * below, we need to handle the fault ourselves, while still holding - * the mmap_sem. This can occur if the uaddr is under contention as - * we have to drop the mmap_sem in order to call get_user(). - */ queue_unlock(&q, hb); - ret = get_user(uval, uaddr); + ret = fault_in_user_writeable(uaddr); if (ret) goto out_put_key; @@ -2013,17 +2023,10 @@ out: return ret; pi_faulted: - /* - * We have to r/w *(int __user *)uaddr, and we have to modify it - * atomically. Therefore, if we continue to fault after get_user() - * below, we need to handle the fault ourselves, while still holding - * the mmap_sem. This can occur if the uaddr is under contention as - * we have to drop the mmap_sem in order to call get_user(). 
- */ spin_unlock(&hb->lock); put_futex_key(fshared, &key); - ret = get_user(uval, uaddr); + ret = fault_in_user_writeable(uaddr); if (!ret) goto retry; -- cgit v1.2.3 From 3a6a6c16be78472a52f6dd7d88913373b42ad0f7 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 24 Jun 2009 16:09:01 -0400 Subject: audit: inode watches depend on CONFIG_AUDIT not CONFIG_AUDIT_SYSCALL Even though one cannot make use of the audit watch code without CONFIG_AUDIT_SYSCALL, the spaghetti nature of the audit code means that the audit rule filtering requires that it at least be compiled. Thus, build the audit_watch code when we build auditfilter, like it was before cfcad62c74abfef83762dc05a556d21bdf3980a2. Clearly this is a point of potential future cleanup. Reported-by: Frans Pop Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index da750010a6f..780c8dcf451 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,8 +69,8 @@ obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o -obj-$(CONFIG_AUDIT) += audit.o auditfilter.o -obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o +obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o +obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o -- cgit v1.2.3 From 00e54d087afb3867b0b461aef6c1ff433d0df564 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 25 Jun 2009 14:05:27 +0800 Subject: ftrace: Remove duplicate newline Before: # echo 'sys_open:traceon:' > set_ftrace_filter # echo 'sys_close:traceoff:5' > set_ftrace_filter # cat set_ftrace_filter #### all functions enabled #### sys_open:traceon:unlimited sys_close:traceoff:count=0 After: # cat set_ftrace_filter #### all functions enabled #### sys_open:traceon:unlimited sys_close:traceoff:count=0 Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4313A7.7030105@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 90f13476483..7402144bff2 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -302,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, if (count == -1) seq_printf(m, ":unlimited\n"); else - seq_printf(m, ":count=%ld", count); - seq_putc(m, '\n'); + seq_printf(m, ":count=%ld\n", count); return 0; } -- cgit v1.2.3 From 1155de47cd66d0c496d5a6fb2223e980ef1285b2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 25 Jun 2009 14:30:12 +0900 Subject: ring-buffer: Make it generally available In hunting down the cause of the hwlat_detector ring buffer spew in my failed -next builds, it became obvious that folks are now treating ring_buffer as something generic and independent of tracing, and thus suitable for public driver consumption. Given that there are only a few minor areas in ring_buffer that have any reliance on CONFIG_TRACING or CONFIG_FUNCTION_TRACER, provide stubs for those and make it generally available.
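As a rough illustration of what "generally available" buys a driver, the following is a hedged sketch of a module using the ring_buffer API of this period (ring_buffer_alloc(), ring_buffer_write(), ring_buffer_consume()); the module itself is hypothetical, and exact signatures should be checked against the current include/linux/ring_buffer.h before reuse.

    /* Hypothetical demo module; function names follow the ring_buffer API of
     * this era, so treat this as a sketch rather than a drop-in driver. */
    #include <linux/module.h>
    #include <linux/ring_buffer.h>
    #include <linux/smp.h>

    static struct ring_buffer *rb;

    static int __init rb_demo_init(void)
    {
            struct ring_buffer_event *event;
            u64 sample = 42, ts;
            int cpu;

            rb = ring_buffer_alloc(4 * PAGE_SIZE, RB_FL_OVERWRITE);
            if (!rb)
                    return -ENOMEM;

            /* Pin the CPU so the producer and consumer agree on the
             * per-CPU buffer; a real driver would iterate online CPUs. */
            cpu = get_cpu();

            /* Producer: copy one sample into the current CPU's buffer. */
            ring_buffer_write(rb, sizeof(sample), &sample);

            /* Consumer: pull it back out of the same per-CPU buffer. */
            event = ring_buffer_consume(rb, cpu, &ts);
            put_cpu();

            if (event)
                    pr_info("rb_demo: read %llu\n",
                            (unsigned long long)*(u64 *)ring_buffer_event_data(event));
            return 0;
    }

    static void __exit rb_demo_exit(void)
    {
            ring_buffer_free(rb);
    }

    module_init(rb_demo_init);
    module_exit(rb_demo_exit);
    MODULE_LICENSE("GPL");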
Signed-off-by: Paul Mundt Cc: Jon Masters Cc: Steven Rostedt LKML-Reference: <20090625053012.GB19944@linux-sh.org> Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/trace/ring_buffer.c | 11 +++++++++++ kernel/trace/trace.h | 7 +++++++ 3 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0a32cb21ec9..0630e293cd4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -96,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 04dac263825..bf27bb7a63e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1563,6 +1563,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +#ifdef CONFIG_TRACING + #define TRACE_RECURSIVE_DEPTH 16 static int trace_recursive_lock(void) @@ -1593,6 +1595,13 @@ static void trace_recursive_unlock(void) current->trace_recursion--; } +#else + +#define trace_recursive_lock() (0) +#define trace_recursive_unlock() do { } while (0) + +#endif + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -3104,6 +3113,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +#ifdef CONFIG_TRACING static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3171,6 +3181,7 @@ static __init int rb_init_debugfs(void) } fs_initcall(rb_init_debugfs); +#endif #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6e735d4771f..3548ae5cc78 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -597,6 +597,7 @@ print_graph_function(struct trace_iterator *iter) extern struct pid *ftrace_pid_trace; +#ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { if (!ftrace_pid_trace) @@ -604,6 +605,12 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } +#else +static inline int ftrace_trace_task(struct task_struct *task) +{ + return 1; +} +#endif /* * trace_iterator_flags is an enumeration that defines bit -- cgit v1.2.3 From aa715284b4d28cabde6c25c568d769a6be712bc8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Jun 2009 14:27:58 +0200 Subject: futex: request only one page from get_user_pages() Yanmin noticed that fault_in_user_writeable() requests 4 pages instead of one. That's the result of blindly trusting Linus' proposal :) I even looked up the prototype to verify the correctness: the argument in question is confusingly enough named "len" while in reality it means number of pages. Pointed-out-by: Yanmin Zhang Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 1c337112335..794c862125f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -299,7 +299,7 @@ void put_futex_key(int fshared, union futex_key *key) static int fault_in_user_writeable(u32 __user *uaddr) { int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, - sizeof(*uaddr), 1, 0, NULL, NULL); + 1, 1, 0, NULL, NULL); return ret < 0 ? 
ret : 0; } -- cgit v1.2.3 From 5211a242d0cbdded372aee59da18f80552b0a80a Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Wed, 24 Jun 2009 14:32:11 -0700 Subject: x86: Add sysctl to allow panic on IOCK NMI error This patch introduces a new sysctl: /proc/sys/kernel/panic_on_io_nmi, which defaults to 0 (off). When enabled, the kernel panics when it receives an NMI caused by an IO error. The IO-error-triggered NMI indicates a serious system condition, which could result in IO data corruption. Rather than continuing, panicking and dumping might be a better choice, so one can figure out what's causing the IO error. This could be especially important to companies running IO-intensive applications where corruption must be avoided, e.g. a bank's databases. [ SuSE has been shipping it for a while, it was done at the request of a large database vendor, for their users. ] Signed-off-by: Kurt Garloff Signed-off-by: Roberto Angelino Signed-off-by: Greg Kroah-Hartman Cc: "Eric W. Biederman" LKML-Reference: <20090624213211.GA11291@kroah.com> Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 62e4ff9968b..fba42eda8de 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -743,6 +743,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", -- cgit v1.2.3 From 0296e4254f3318e0dcad9706fa1daf8e5addc1e9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 26 Jun 2009 11:15:37 +0800 Subject: ftrace: Fix the output of profile The first entry of the ftrace profile was always skipped when reading trace_stat/functionX. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A443D59.4080307@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 71a52c17214..f3716bf04df 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -291,7 +291,9 @@ function_stat_next(void *v, int idx) pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); again: - rec++; + if (idx != 0) + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { pg = pg->next; if (!pg) -- cgit v1.2.3
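The pattern behind this last fix is easy to reproduce outside the kernel. The sketch below (plain userspace C with illustrative names, not kernel code) shows why a "next" callback that is also used to locate record 0 must not advance unconditionally: without the idx check, the first record is silently dropped.

    /* Illustrative sketch of the off-by-one fixed above. */
    #include <stdio.h>

    static int records[] = { 10, 20, 30 };
    #define NRECS (sizeof(records) / sizeof(records[0]))

    static int *stat_next(int *rec, int idx)
    {
            /* The fix: only step forward when this is not the initial lookup. */
            if (idx != 0)
                    rec++;
            if (rec >= records + NRECS)
                    return NULL;
            return rec;
    }

    int main(void)
    {
            int idx = 0;

            for (int *rec = stat_next(records, idx); rec; rec = stat_next(rec, ++idx))
                    printf("%d\n", *rec);   /* prints 10 20 30, not 20 30 */
            return 0;
    }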