From 71a851b4d2a815adcfac09c1adda7ef6811fde66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 09:06:56 +0200 Subject: perf_counter: Stop open coding unclone_ctx Instead of open coding the unclone context thingy, put it in a common function. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50da234..8bf997d86bf 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -146,6 +146,14 @@ static void put_ctx(struct perf_counter_context *ctx) } } +static void unclone_ctx(struct perf_counter_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + /* * Get the perf_counter_context for a task and lock it. * This has to cope with with the fact that until it is locked, @@ -1463,10 +1471,8 @@ static void perf_counter_enable_on_exec(struct task_struct *task) /* * Unclone this context if we enabled any counter. 
*/ - if (enabled && ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } + if (enabled) + unclone_ctx(ctx); spin_unlock(&ctx->lock); @@ -1526,7 +1532,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx, static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { - struct perf_counter_context *parent_ctx; struct perf_counter_context *ctx; struct perf_cpu_context *cpuctx; struct task_struct *task; @@ -1586,11 +1591,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) retry: ctx = perf_lock_task_context(task, &flags); if (ctx) { - parent_ctx = ctx->parent_ctx; - if (parent_ctx) { - put_ctx(parent_ctx); - ctx->parent_ctx = NULL; /* no longer a clone */ - } + unclone_ctx(ctx); spin_unlock_irqrestore(&ctx->lock, flags); } @@ -4255,15 +4256,12 @@ void perf_counter_exit_task(struct task_struct *child) */ spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; - if (child_ctx->parent_ctx) { - /* - * This context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the counters from it. - */ - put_ctx(child_ctx->parent_ctx); - child_ctx->parent_ctx = NULL; - } + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the counters from it. + */ + unclone_ctx(child_ctx); spin_unlock(&child_ctx->lock); local_irq_restore(flags); -- cgit v1.2.3 From d4d7d0b9545721d3cabb19d15163bbc66b797707 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 6 Jul 2009 09:31:33 +0100 Subject: perf_counter: Fix the tracepoint channel to perfcounters Fix a missed rename in EVENT_PROFILE support so that it gets built and allows tracepoint tracing from the 'perf' tool. Fix a typo in the (never before built & enabled) portion in perf_counter.c as well, and update that code to the attr.config changes as well. 
Signed-off-by: Chris Wilson Cc: Ben Gamari Cc: Jason Baron Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt LKML-Reference: <1246869094-21237-1-git-send-email-chris@chris-wilson.co.uk> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50da234..c6c38fb7766 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3671,7 +3671,7 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id) { struct perf_sample_data data = { - .regs = get_irq_regs(); + .regs = get_irq_regs(), .addr = 0, }; @@ -3687,16 +3687,12 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->attr)); + ftrace_profile_disable(counter->attr.config); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->attr); - int ret; - - ret = ftrace_profile_enable(event_id); - if (ret) + if (ftrace_profile_enable(counter->attr.config)) return NULL; counter->destroy = tp_perf_counter_destroy; -- cgit v1.2.3 From 54fdc5816631b43ba55fc3206d7add2d85850bc6 Mon Sep 17 00:00:00 2001 From: Fabio Checconi Date: Thu, 16 Jul 2009 12:32:27 +0200 Subject: sched: Account for vruntime wrapping I spotted two sites that didn't take vruntime wrap-around into account. Fix these by creating a comparison helper that does do so. 
Signed-off-by: Fabio Checconi Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7c248dc30f4..9ffb2b2ceba 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } +static inline int entity_before(struct sched_entity *a, + struct sched_entity *b) +{ + return (s64)(a->vruntime - b->vruntime) < 0; +} + static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return se->vruntime - cfs_rq->min_vruntime; @@ -1017,7 +1023,7 @@ static void yield_task_fair(struct rq *rq) /* * Already in the rightmost position? */ - if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) + if (unlikely(!rightmost || entity_before(rightmost, se))) return; /* @@ -1713,7 +1719,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) /* 'curr' will be NULL if the child belongs to a different group */ if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && curr->vruntime < se->vruntime) { + curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. -- cgit v1.2.3 From 413ee3b48ab582ffea33e7e140c7a2c5ea657e9a Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 16 Jul 2009 15:15:52 +0200 Subject: perf_counter: Make sure we dont leak kernel memory to userspace There are a few places we are leaking tiny amounts of kernel memory to userspace. This happens when writing out strings because we always align the end to 64 bits. To avoid this we should always use an appropriately sized temporary buffer and ensure it is zeroed. 
Since d_path assembles the string from the end of the buffer backwards, we need to add 64 bits after the buffer to allow for alignment. We also need to copy arch_vma_name to the temporary buffer, because if we use it directly we may end up copying to userspace a number of bytes after the end of the string constant. Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090716104817.273972048@samba.org> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c6c38fb7766..f7a8ab9576e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2968,8 +2968,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; unsigned int size; - char *comm = comm_event->task->comm; + char comm[TASK_COMM_LEN]; + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -3088,8 +3090,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) char *buf = NULL; const char *name; + memset(tmp, 0, sizeof(tmp)); + if (file) { - buf = kzalloc(PATH_MAX, GFP_KERNEL); + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. 
+ */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); if (!buf) { name = strncpy(tmp, "//enomem", sizeof(tmp)); goto got_name; @@ -3100,9 +3109,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } } else { - name = arch_vma_name(mmap_event->vma); - if (name) + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); goto got_name; + } if (!vma->vm_mm) { name = strncpy(tmp, "[vdso]", sizeof(tmp)); -- cgit v1.2.3 From ed900c054b541254f0ce5cedaf75206e29bd614e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: perf_counter: Log vfork as a fork event Right now we don't output vfork events. Even though we should always see an exec after a vfork, we may get perfcounter samples between the vfork and exec. These samples can lead to some confusion when parsing perfcounter data. To keep things consistent we should always log a fork event. It will result in a little more log data, but is less confusing to trace parsing tools. 
Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090716104817.589309391@samba.org> Signed-off-by: Ingo Molnar --- kernel/fork.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 467746b3f0a..4812d60b29f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1408,14 +1408,11 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); - } else if (!(clone_flags & CLONE_VM)) { - /* - * vfork will do an exec which will call - * set_task_comm() - */ - perf_counter_fork(p); } + if (!(clone_flags & CLONE_THREAD)) + perf_counter_fork(p); + audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); -- cgit v1.2.3 From e5d490b252423605a77c54b2e35b10ea663763df Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Jul 2009 12:23:11 +0100 Subject: profile: Suppress warning about large allocations when profile=1 is specified When profile= is used, a large buffer is allocated early at boot. This can be larger than what the page allocator can provide so it prints a warning. However, the caller is able to handle the situation so this patch suppresses the warning. 
Signed-off-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Cc: Linux Memory Management List Cc: Heinz Diehl Cc: David Miller Cc: Arnaldo Carvalho de Melo Cc: Mel Gorman Cc: Andrew Morton LKML-Reference: <1247656992-19846-3-git-send-email-mel@csn.ul.ie> Signed-off-by: Ingo Molnar --- kernel/profile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 69911b5745e..419250ebec4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -117,11 +117,12 @@ int __ref profile_init(void) cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; - prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + prof_buffer = alloc_pages_exact(buffer_bytes, + GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; -- cgit v1.2.3 From a468d389349a7560249b355cdb6d2097ea1616c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2009 14:15:46 +0200 Subject: sched: fix load average accounting vs. cpu hotplug The new load average code clears rq->calc_load_active on CPU_ONLINE. That's wrong as the new onlined CPU might have got a scheduler tick already and accounted the delta to the stale value of the time we offlined the CPU. Clear the value when we cleanup the dead CPU instead. Also move the update of the calc_load_update time for the newly online CPU to CPU_UP_PREPARE to avoid that the CPU plays catch up with the stale update time value. 
Signed-off-by: Thomas Gleixner --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 98972d366fd..1b59e265273 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7289,6 +7289,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) static void calc_global_load_remove(struct rq *rq) { atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; } #endif /* CONFIG_HOTPLUG_CPU */ @@ -7515,6 +7516,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) task_rq_unlock(rq, &flags); get_task_struct(p); cpu_rq(cpu)->migration_thread = p; + rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: @@ -7525,8 +7527,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); - rq->calc_load_update = calc_load_update; - rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -- cgit v1.2.3 From 6301cb95c119ebf324bb96ee226fa9ddffad80a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2009 14:15:47 +0200 Subject: sched: fix nr_uninterruptible accounting of frozen tasks really commit e3c8ca8336 (sched: do not count frozen tasks toward load) broke the nr_uninterruptible accounting on freeze/thaw. On freeze the task is excluded from accounting with a check for (task->flags & PF_FROZEN), but that flag is cleared before the task is thawed. So while we prevent that the task with state TASK_UNINTERRUPTIBLE is accounted to nr_uninterruptible on freeze we decrement nr_uninterruptible on thaw. Use a separate flag which is handled by the freezing task itself. Set it before calling the scheduler with TASK_UNINTERRUPTIBLE state and clear it after we return from frozen state. 
Cc: Signed-off-by: Thomas Gleixner --- kernel/freezer.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 2f4936cf708..bd1d42b17cb 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,12 +44,19 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(&current->sighand->siglock); + /* prevent accounting of that task to load */ + current->flags |= PF_FREEZING; + for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!frozen(current)) break; schedule(); } + + /* Remove the accounting blocker */ + current->flags &= ~PF_FREEZING; + pr_debug("%s left refrigerator\n", current->comm); __set_current_state(save); } -- cgit v1.2.3 From 4841158b26e28e1476eed84c7347c18f11317750 Mon Sep 17 00:00:00 2001 From: Pavel Roskin Date: Sat, 18 Jul 2009 16:46:02 -0400 Subject: timer: Avoid reading uninitialized data timer->expires may be uninitialized, so check timer_pending() before touching timer->expires to pacify kmemcheck. 
Signed-off-by: Pavel Roskin LKML-Reference: <20090718204602.5191.360.stgit@mj.roinet.com> Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0b36b9e5cc8..a7f07d5a624 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -714,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) * networking code - if the timer is re-modified * to be the same thing then just return: */ - if (timer->expires == expires && timer_pending(timer)) + if (timer_pending(timer) && timer->expires == expires) return 1; return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); -- cgit v1.2.3 From 79ef2bb01445400def20c7993b27fbcad27ca95f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jul 2009 17:09:12 +0200 Subject: clocksource: Prevent NULL pointer dereference Writing a zero length string to sys/.../current_clocksource will cause a NULL pointer dereference if the clock events system is in one shot (highres or nohz) mode. Pointed-out-by: Dan Carpenter LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 592bf584d1d..7466cb81125 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, * Check to make sure we don't switch to a non-highres capable * clocksource if the tick code is in oneshot mode (highres or nohz) */ - if (tick_oneshot_mode_active() && + if (tick_oneshot_mode_active() && ovr && !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { printk(KERN_WARNING "%s clocksource is not HRT compatible. 
" "Cannot switch while in HRT/NOHZ mode\n", ovr->name); -- cgit v1.2.3 From 591d2fb02ea80472d846c0b8507007806bdd69cc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 11:09:39 +0200 Subject: genirq: Delegate irq affinity setting to the irq thread irq_set_thread_affinity() calls set_cpus_allowed_ptr() which might sleep, but irq_set_thread_affinity() is called with desc->lock held and can be called from hard interrupt context as well. The code has another bug as it does not hold a ref on the task struct as required by set_cpus_allowed_ptr(). Just set the IRQTF_AFFINITY bit in action->thread_flags. The next time the thread runs it migrates itself. Solves all of the above problems nicely. Add kerneldoc to irq_set_thread_affinity() while at it. Signed-off-by: Thomas Gleixner LKML-Reference: --- kernel/irq/internals.h | 3 +-- kernel/irq/manage.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ kernel/irq/migration.c | 2 +- 3 files changed, 46 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 73468253143..e70ed5592eb 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); -extern void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); +extern void irq_set_thread_affinity(struct irq_desc *desc); /* * Debugging printout: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 50da6767290..f0de36f13a4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +/** + * irq_set_thread_affinity - Notify irq threads to adjust affinity + * @desc: irq descriptor which has affitnity changed + * + * We just set IRQTF_AFFINITY and delegate the affinity setting 
+ * to the interrupt thread itself. We can not call + * set_cpus_allowed_ptr() here as we hold desc->lock and this + * code can be called from hard interrupt context. + */ +void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action = desc->action; while (action) { if (action->thread) - set_cpus_allowed_ptr(action->thread, cpumask); + set_bit(IRQTF_AFFINITY, &action->thread_flags); action = action->next; } } @@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) if (desc->status & IRQ_MOVE_PCNTXT) { if (!desc->chip->set_affinity(irq, cpumask)) { cpumask_copy(desc->affinity, cpumask); - irq_set_thread_affinity(desc, cpumask); + irq_set_thread_affinity(desc); } } else { @@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) #else if (!desc->chip->set_affinity(irq, cpumask)) { cpumask_copy(desc->affinity, cpumask); - irq_set_thread_affinity(desc, cpumask); + irq_set_thread_affinity(desc); } #endif desc->status |= IRQ_AFFINITY_SET; @@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); if (!ret) - irq_set_thread_affinity(desc, desc->affinity); + irq_set_thread_affinity(desc); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -443,6 +451,34 @@ static int irq_wait_for_interrupt(struct irqaction *action) return -1; } +/* + * Check whether we need to change the affinity of the interrupt thread. 
+ */ +static void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) +{ + cpumask_var_t mask; + + if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) + return; + + /* + * In case we are out of memory we set IRQTF_AFFINITY again and + * try again next time + */ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + set_bit(IRQTF_AFFINITY, &action->thread_flags); + return; + } + + spin_lock_irq(&desc->lock); + cpumask_copy(mask, desc->affinity); + spin_unlock_irq(&desc->lock); + + set_cpus_allowed_ptr(current, mask); + free_cpumask_var(mask); +} + /* * Interrupt handler thread */ @@ -458,6 +494,8 @@ static int irq_thread(void *data) while (!irq_wait_for_interrupt(action)) { + irq_thread_check_affinity(desc, action); + atomic_inc(&desc->threads_active); spin_lock_irq(&desc->lock); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index cfe767ca154..fcb6c96f262 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -45,7 +45,7 @@ void move_masked_irq(int irq) < nr_cpu_ids)) if (!desc->chip->set_affinity(irq, desc->pending_mask)) { cpumask_copy(desc->affinity, desc->pending_mask); - irq_set_thread_affinity(desc, desc->pending_mask); + irq_set_thread_affinity(desc); } cpumask_clear(desc->pending_mask); -- cgit v1.2.3 From 9ba5f005c994ad28e266a0cd14ef29354be382c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 14:18:35 +0200 Subject: softirq: introduce tasklet_hrtimer infrastructure commit ca109491f (hrtimer: removing all ur callback modes) moved all hrtimer callbacks into hard interrupt context when high resolution timers are active. That breaks code which relied on the assumption that the callback happens in softirq context. Provide a generic infrastructure which combines tasklets and hrtimers together to provide an in-softirq hrtimer experience. 
Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Cc: kaber@trash.net Cc: David Miller LKML-Reference: <1248265724.27058.1366.camel@twins> Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 3a94905fa5d..eb5e131a048 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *)) softirq_vec[nr].action = action; } -/* Tasklets */ +/* + * Tasklets + */ struct tasklet_head { struct tasklet_struct *head; @@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +/* + * tasklet_hrtimer + */ + +/* + * The trampoline is called when the hrtimer expires. If this is + * called from the hrtimer interrupt then we schedule the tasklet as + * the timer callback function expects to run in softirq context. If + * it's called in softirq context anyway (i.e. high resolution timers + * disabled) then the hrtimer callback is called right away. 
+ */ +static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) +{ + struct tasklet_hrtimer *ttimer = + container_of(timer, struct tasklet_hrtimer, timer); + + if (hrtimer_is_hres_active(timer)) { + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; + } + return ttimer->function(timer); +} + +/* + * Helper function which calls the hrtimer callback from + * tasklet/softirq context + */ +static void __tasklet_hrtimer_trampoline(unsigned long data) +{ + struct tasklet_hrtimer *ttimer = (void *)data; + enum hrtimer_restart restart; + + restart = ttimer->function(&ttimer->timer); + if (restart != HRTIMER_NORESTART) + hrtimer_restart(&ttimer->timer); +} + +/** + * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks + * @ttimer: tasklet_hrtimer which is initialized + * @function: hrtimer callback function which gets called from softirq context + * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) + * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) + */ +void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t which_clock, enum hrtimer_mode mode) +{ + hrtimer_init(&ttimer->timer, which_clock, mode); + ttimer->timer.function = __hrtimer_tasklet_trampoline; + tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, + (unsigned long)ttimer); + ttimer->function = function; +} +EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); + +/* + * Remote softirq bits + */ + DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); EXPORT_PER_CPU_SYMBOL(softirq_work_list); -- cgit v1.2.3 From c9f73a3dd27e03411f18a58c0814d51392d2b17a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 21 Jul 2009 00:55:05 -0700 Subject: perf: Fix stack data leak the "reserved" field was not initialized to zero, resulting in 4 bytes of stack data leaking to userspace.... 
Signed-off-by: Arjan van de Ven Signed-off-by: Peter Zijlstra --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5c6fae4f43d..ff854fd89a8 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2666,6 +2666,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); + cpu_entry.reserved = 0; } if (sample_type & PERF_SAMPLE_PERIOD) -- cgit v1.2.3 From 573402db02746179b3f95f83a11a787501f52d0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 11:13:50 +0200 Subject: perf_counter: Plug more stack leaks Per example of Arjan's patch, I went through and found a few more. Signed-off-by: Peter Zijlstra --- kernel/perf_counter.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ff854fd89a8..e1d6a3aa133 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2897,8 +2897,11 @@ void perf_counter_fork(struct task_struct *task) .event = { .header = { .type = PERF_EVENT_FORK, + .misc = 0, .size = sizeof(fork_event.event), }, + /* .pid */ + /* .ppid */ }, }; @@ -3008,8 +3011,16 @@ void perf_counter_comm(struct task_struct *task) comm_event = (struct perf_comm_event){ .task = task, + /* .comm */ + /* .comm_size */ .event = { - .header = { .type = PERF_EVENT_COMM, }, + .header = { + .type = PERF_EVENT_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ }, }; @@ -3160,8 +3171,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma) mmap_event = (struct perf_mmap_event){ .vma = vma, + /* .file_name */ + /* .file_size */ .event = { - .header = { .type = PERF_EVENT_MMAP, }, + .header = { + .type = PERF_EVENT_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, 
.pgoff = vma->vm_pgoff, -- cgit v1.2.3 From 7f453c24b95a085fc7bd35d53b33abc4dc5a048b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:19:40 +0200 Subject: perf_counter: PERF_SAMPLE_ID and inherited counters Anton noted that for inherited counters the counter-id as provided by PERF_SAMPLE_ID isn't mappable to the id found through PERF_RECORD_ID because each inherited counter gets its own id. His suggestion was to always return the parent counter id, since that is the primary counter id as exposed. However, these inherited counters have a unique identifier so that events like PERF_EVENT_PERIOD and PERF_EVENT_THROTTLE can be specific about which counter gets modified, which is important when trying to normalize the sample streams. This patch removes PERF_EVENT_PERIOD in favour of PERF_SAMPLE_PERIOD, which is more useful anyway, since changing periods became a lot more common than initially thought -- rendering PERF_EVENT_PERIOD the less useful solution (also, PERF_SAMPLE_PERIOD reports the more accurate value, since it reports the value used to trigger the overflow, whereas PERF_EVENT_PERIOD simply reports the requested period changed, which might only take effect on the next cycle). This still leaves us PERF_EVENT_THROTTLE to consider, but since that _should_ be a rare occurrence, and linking it to a primary id is the most useful bit to diagnose the problem, we introduce a PERF_SAMPLE_STREAM_ID, for those few cases where the full reconstruction is important. 
[Does change the ABI a little, but I see no other way out] Suggested-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <1248095846.15751.8781.camel@twins> --- kernel/perf_counter.c | 92 +++++++++++++++++---------------------------------- 1 file changed, 31 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e1d6a3aa133..7530588fa5c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -154,6 +154,20 @@ static void unclone_ctx(struct perf_counter_context *ctx) } } +/* + * If we inherit counters we want to return the parent counter id + * to userspace. + */ +static u64 primary_counter_id(struct perf_counter *counter) +{ + u64 id = counter->id; + + if (counter->parent) + id = counter->parent->id; + + return id; +} + /* * Get the perf_counter_context for a task and lock it. * This has to cope with with the fact that until it is locked, @@ -1296,7 +1310,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_counter *counter, int enable); -static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_period(struct perf_counter *counter, u64 events) { @@ -1315,8 +1328,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events) if (!sample_period) sample_period = 1; - perf_log_period(counter, sample_period); - hwc->sample_period = sample_period; } @@ -1705,7 +1716,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = counter->id; + values[n++] = primary_counter_id(counter); mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) @@ -1812,8 +1823,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) 
counter->attr.sample_freq = value; } else { - perf_log_period(counter, value); - counter->attr.sample_period = value; counter->hw.sample_period = value; } @@ -2662,6 +2671,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_ID) header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_STREAM_ID) + header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_CPU) { header.size += sizeof(cpu_entry); @@ -2705,7 +2717,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, data->addr); - if (sample_type & PERF_SAMPLE_ID) + if (sample_type & PERF_SAMPLE_ID) { + u64 id = primary_counter_id(counter); + + perf_output_put(&handle, id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(&handle, counter->id); if (sample_type & PERF_SAMPLE_CPU) @@ -2728,7 +2746,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sub != counter) sub->pmu->read(sub); - group_entry.id = sub->id; + group_entry.id = primary_counter_id(sub); group_entry.counter = atomic64_read(&sub->count); perf_output_put(&handle, group_entry); @@ -2788,15 +2806,8 @@ perf_counter_read_event(struct perf_counter *counter, } if (counter->attr.read_format & PERF_FORMAT_ID) { - u64 id; - event.header.size += sizeof(u64); - if (counter->parent) - id = counter->parent->id; - else - id = counter->id; - - event.format[i++] = id; + event.format[i++] = primary_counter_id(counter); } ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); @@ -3190,49 +3201,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma) perf_counter_mmap_event(&mmap_event); } -/* - * Log sample_period changes so that analyzing tools can re-normalize the - * event flow. 
- */ - -struct freq_event { - struct perf_event_header header; - u64 time; - u64 id; - u64 period; -}; - -static void perf_log_period(struct perf_counter *counter, u64 period) -{ - struct perf_output_handle handle; - struct freq_event event; - int ret; - - if (counter->hw.sample_period == period) - return; - - if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) - return; - - event = (struct freq_event) { - .header = { - .type = PERF_EVENT_PERIOD, - .misc = 0, - .size = sizeof(event), - }, - .time = sched_clock(), - .id = counter->id, - .period = period, - }; - - ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, event); - perf_output_end(&handle); -} - /* * IRQ throttle logging */ @@ -3246,14 +3214,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) struct perf_event_header header; u64 time; u64 id; + u64 stream_id; } throttle_event = { .header = { .type = PERF_EVENT_THROTTLE + 1, .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), - .id = counter->id, + .time = sched_clock(), + .id = primary_counter_id(counter), + .stream_id = counter->id, }; ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); -- cgit v1.2.3 From 966ee4d6b887c14159043ac80b8c3661d2bbe5e2 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 22 Jul 2009 23:05:46 +1000 Subject: perf_counter: Fix throttle/unthrottle event logging Right now we only print PERF_EVENT_THROTTLE + 1 (ie PERF_EVENT_UNTHROTTLE). Fix this to print both a throttle and unthrottle event. 
Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090722130546.GE9029@kryten> --- kernel/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7530588fa5c..787d4daef18 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3217,7 +3217,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) u64 stream_id; } throttle_event = { .header = { - .type = PERF_EVENT_THROTTLE + 1, + .type = PERF_EVENT_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, @@ -3226,6 +3226,9 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) .stream_id = counter->id, }; + if (enable) + throttle_event.header.type = PERF_EVENT_UNTHROTTLE; + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); if (ret) return; -- cgit v1.2.3 From 0dc3d523e8bc4718e0be2e4a742367d6e4be77cd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 21 Jul 2009 00:55:05 -0700 Subject: perf: fix stack data leak the "reserved" field was not initialized to zero, resulting in 4 bytes of stack data leaking to userspace.... 
Signed-off-by: Arjan van de Ven Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a641eb753b8..7bc888dfd06 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2665,6 +2665,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); + cpu_entry.reserved = 0; } if (sample_type & PERF_SAMPLE_PERIOD) -- cgit v1.2.3 From 61f3826133dc07142935fb5712fc738e19eb5575 Mon Sep 17 00:00:00 2001 From: Bruno Premont Date: Wed, 22 Jul 2009 22:22:32 +0200 Subject: genirq: Fix UP compile failure caused by irq_thread_check_affinity Since genirq: Delegate irq affinity setting to the irq thread (591d2fb02ea80472d846c0b8507007806bdd69cc) compilation with CONFIG_SMP=n fails with following error: /usr/src/linux-2.6/kernel/irq/manage.c: In function 'irq_thread_check_affinity': /usr/src/linux-2.6/kernel/irq/manage.c:475: error: 'struct irq_desc' has no member named 'affinity' make[4]: *** [kernel/irq/manage.o] Error 1 That commit adds a new function irq_thread_check_affinity() which uses struct irq_desc.affinity which is only available for CONFIG_SMP=y. Move that function under #ifdef CONFIG_SMP. [ tglx@brownpaperbag: compile and boot tested on UP and SMP ] Signed-off-by: Bruno Premont LKML-Reference: <20090722222232.2eb3e1c4@neptune.home> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f0de36f13a4..61c679db468 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -451,6 +451,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) return -1; } +#ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. 
*/ @@ -478,6 +479,10 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } +#else +static inline void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } +#endif /* * Interrupt handler thread -- cgit v1.2.3 From d8cc1ab793993c886c62abf77c93287df33ffd8b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:28:40 +0800 Subject: trace_stack: Fix seqfile memory leak Every time we cat stack_trace, we leak memory allocated by seq_open(). Signed-off-by: Li Zefan LKML-Reference: <4A67D8E8.3020500@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e644af91012..6a2a9d484cd 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = { static int stack_trace_open(struct inode *inode, struct file *file) { - int ret; - - ret = seq_open(file, &stack_trace_seq_ops); - - return ret; + return seq_open(file, &stack_trace_seq_ops); } static const struct file_operations stack_trace_fops = { .open = stack_trace_open, .read = seq_read, .llseek = seq_lseek, + .release = seq_release, }; int -- cgit v1.2.3 From 87827111a5538633b18e5c641ced673c4c2bb6ce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:29:11 +0800 Subject: function-graph: Fix seqfile memory leak Every time we cat set_graph_function, we leak memory allocated by seq_open(). 
Signed-off-by: Li Zefan LKML-Reference: <4A67D907.2010500@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4521c77d1a1..1f3ec2afa51 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2595,6 +2595,14 @@ ftrace_graph_open(struct inode *inode, struct file *file) return ret; } +static int +ftrace_graph_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + static int ftrace_set_func(unsigned long *array, int *idx, char *buffer) { @@ -2724,9 +2732,10 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, } static const struct file_operations ftrace_graph_fops = { - .open = ftrace_graph_open, - .read = seq_read, - .write = ftrace_graph_write, + .open = ftrace_graph_open, + .read = seq_read, + .write = ftrace_graph_write, + .release = ftrace_graph_release, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3 From 636eacee3b0c76915151db37203cc624becb6d7b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 23 Jul 2009 11:29:47 +0800 Subject: tracing/stat: Fix seqfile memory leak Every time we cat a trace_stat file, we leak memory allocated by seq_open(). Also fix memory leak in a failure path in tracing_stat_open(). 
Signed-off-by: Li Zefan LKML-Reference: <4A67D92B.4060704@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stat.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index e66f5e49334..aea321c82fa 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -73,7 +73,7 @@ static struct rb_node *release_next(struct rb_node *node) } } -static void reset_stat_session(struct stat_session *session) +static void __reset_stat_session(struct stat_session *session) { struct rb_node *node = session->stat_root.rb_node; @@ -83,10 +83,17 @@ static void reset_stat_session(struct stat_session *session) session->stat_root = RB_ROOT; } +static void reset_stat_session(struct stat_session *session) +{ + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); +} + static void destroy_session(struct stat_session *session) { debugfs_remove(session->file); - reset_stat_session(session); + __reset_stat_session(session); mutex_destroy(&session->stat_mutex); kfree(session); } @@ -150,7 +157,7 @@ static int stat_seq_init(struct stat_session *session) int i; mutex_lock(&session->stat_mutex); - reset_stat_session(session); + __reset_stat_session(session); if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; @@ -183,7 +190,7 @@ exit: return ret; exit_free_rbtree: - reset_stat_session(session); + __reset_stat_session(session); mutex_unlock(&session->stat_mutex); return ret; } @@ -250,16 +257,21 @@ static const struct seq_operations trace_stat_seq_ops = { static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; - + struct seq_file *m; struct stat_session *session = inode->i_private; + ret = stat_seq_init(session); + if (ret) + return ret; + ret = seq_open(file, &trace_stat_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = session; - ret = 
stat_seq_init(session); + if (ret) { + reset_stat_session(session); + return ret; } + m = file->private_data; + m->private = session; return ret; } @@ -270,11 +282,9 @@ static int tracing_stat_release(struct inode *i, struct file *f) { struct stat_session *session = i->i_private; - mutex_lock(&session->stat_mutex); reset_stat_session(session); - mutex_unlock(&session->stat_mutex); - return 0; + return seq_release(i, f); } static const struct file_operations tracing_stat_fops = { -- cgit v1.2.3 From 4c739ff043e5787d97c9691d62cabf7a29e75a9d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Jul 2009 23:11:03 -0400 Subject: tracing: show proper address for trace-printk format Since the trace_printk may use pointers to the format fields in the buffer, they are exported via debugfs/tracing/printk_formats. This is used by utilities that read the ring buffer in binary format. It helps the utilities map the address of the format in the binary buffer to what the printf format looks like. Unfortunately, the way the output code works, it exports the address of the pointer to the format address, and not the format address itself. This makes the file totally useless in trying to figure out what format string a binary address belongs to. Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 7b627811082..687699d365a 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -176,7 +176,7 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; - seq_printf(m, "0x%lx : \"", (unsigned long)fmt); + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* * Tabs and new lines need to be converted. 
-- cgit v1.2.3 From 8650ae32ef7045e763825dee6256dde7f331bb85 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Jul 2009 23:29:30 -0400 Subject: tracing: only truncate ftrace files when O_TRUNC is set The current code will truncate the ftrace files contents if O_APPEND is not set and the file is opened in write mode. This is incorrect. It should only truncate the file if O_TRUNC is set. Otherwise if one of these files is opened by a C program with fopen "r+", it will incorrectly truncate the file. Reported-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_events.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1f3ec2afa51..1e1d23c2630 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1662,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) + (file->f_flags & O_TRUNC)) ftrace_filter_reset(enable); if (file->f_mode & FMODE_READ) { @@ -2577,7 +2577,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) { + (file->f_flags & O_TRUNC)) { ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bc8d8afea6..d8ef28574aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2031,7 +2031,7 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) { + (file->f_flags & O_TRUNC)) { long cpu = (long) inode->i_private; if (cpu == TRACE_PIPE_ALL_CPU) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 
53c8fd376a8..23d2972b22d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -376,7 +376,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) + (file->f_flags & O_TRUNC)) ftrace_clear_events(); seq_ops = inode->i_private; -- cgit v1.2.3 From bdff78707f3ce47e891f3201c9666122a70556ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2009 15:30:45 -0400 Subject: trace: stop tracer in oops_enter() If trace_printk_on_oops is set we lose interesting trace information when the tracer is enabled across oops handling and printing. We want the trace which might give us information _WHY_ we oopsed. Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 984b3ecbd72..512ab73b0ca 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -301,6 +301,7 @@ int oops_may_print(void) */ void oops_enter(void) { + tracing_off(); /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); -- cgit v1.2.3 From 6560dc160f3a96b8f1f43e2c6b51aa6eb9898b90 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 23 Jul 2009 23:42:08 +0930 Subject: module: use MODULE_SYMBOL_PREFIX with module_layout The check_modstruct_version() needs to look up the symbol "module_layout" in the kernel, but it does so literally and not by a C identifier. The trouble is that it does not include a symbol prefix for those ports that need it (like the Blackfin and H8300 port). So make sure we tack on the MODULE_SYMBOL_PREFIX define to the front of it. 
Signed-off-by: Mike Frysinger Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0a049837008..fd141140355 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1068,7 +1068,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; - if (!find_symbol("module_layout", NULL, &crc, true, false)) + if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + &crc, true, false)) BUG(); return check_version(sechdrs, versindex, "module_layout", mod, crc); } -- cgit v1.2.3 From 9ae260270c90643156cda73427aa1f04c923e627 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 19 Jun 2009 02:51:13 +0200 Subject: update the comment in kthread_stop() Commit 63706172f332fd3f6e7458ebfb35fa6de9c21dc5 ("kthreads: rework kthread_stop()") removed the limitation that the thread function must not call do_exit() itself, but forgot to update the comment. Since that commit it is OK to use kthread_stop() even if kthread can exit itself. Signed-off-by: Oleg Nesterov Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/kthread.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9b1a7de2697..eb8751aa041 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind); * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and - * waits for it to exit. Your threadfn() must not call do_exit() - * itself if you use this function! This can also be called after - * kthread_create() instead of calling wake_up_process(): the thread - * will exit without calling threadfn(). + * waits for it to exit.
This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will exit without + * calling threadfn(). + * + * If threadfn() may call do_exit() itself, the caller must ensure + * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. -- cgit v1.2.3 From 38ceb592fcac9110c6b3c87ea0a27bff68c43486 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:11:24 +0800 Subject: tracing: Fix invalid function_graph entry When print_graph_entry() computes a function call entry event, it needs to also check the next entry to guess if it matches the return event of the current function entry. In order to look at this next event, it needs to consume the current entry before going ahead in the ring buffer. However, if the current event that gets consumed is the last one in the ring buffer head page, the ring_buffer may reuse the page for writers. The consumed entry will then become invalid because of possible racy overwriting. We must then handle this entry by making a copy of it. The fix also applies on 2.6.30 Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: stable@kernel.org LKML-Reference: <4A6EEAEC.3050508@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions_graph.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249abafb5..420ec348757 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter) switch (entry->type) { case TRACE_GRAPH_ENT: { - struct ftrace_graph_ent_entry *field; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack.
+ */ struct ftrace_graph_ent_entry *field, saved; trace_assign_type(field, entry); - return print_graph_entry(field, s, iter); + saved = *field; + return print_graph_entry(&saved, s, iter); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; -- cgit v1.2.3 From 74e7ff8c50b6b022e6ffaa736b16a4dc161d3eaf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:17:22 +0800 Subject: tracing: Fix missing function_graph events when we splice_read from trace_pipe About half of the events are missing when we splice_read from trace_pipe. They are unexpectedly consumed because we ignore the TRACE_TYPE_NO_CONSUME return value used by the function graph tracer when it needs to consume the events by itself to walk on the ring buffer. The same problem appears with ftrace_dump() Example of an output before this patch: 1) | ktime_get_real() { 1) 2.846 us | read_hpet(); 1) 4.558 us | } 1) 6.195 us | } After this patch: 0) | ktime_get_real() { 0) | getnstimeofday() { 0) 1.960 us | read_hpet(); 0) 3.597 us | } 0) 5.196 us | } The fix also applies on 2.6.30 Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: stable@kernel.org LKML-Reference: <4A6EEC52.90704@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bc8d8afea6..da984ad065a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3085,7 +3085,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) break; } - trace_consume(iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); rem -= count; if (!find_next_entry_inc(iter)) { rem = 0; @@ -4233,8 +4234,11 @@ static void __ftrace_dump(bool disable_tracing) iter.pos = -1; if (find_next_entry_inc(&iter) != NULL) { - print_trace_line(&iter); - trace_consume(&iter); + int ret; + + ret = print_trace_line(&iter); + if (ret != TRACE_TYPE_NO_CONSUME) +
trace_consume(&iter); } trace_printk_seq(&iter.seq); -- cgit v1.2.3 From 933b787b57ca8bdc0fc8fb2cbf67b5e6d21beb84 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 29 Jul 2009 15:02:07 -0700 Subject: mm: copy over oom_adj value at fork time Fix a post-2.6.31 regression which was introduced by 2ff05b2b4eac2e63d345fc731ea151a060247f53 ("oom: move oom_adj value from task_struct to mm_struct"). After moving the oom_adj value from the task struct to the mm_struct, the oom_adj value was no longer properly inherited by child processes. Copying over the oom_adj value at fork time fixes that bug. [kosaki.motohiro@jp.fujitsu.com: test for current->mm before dereferencing it] Signed-off-by: Rik van Riel Reported-by: Paul Menage Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9b42695f0d1..29b532e718f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -426,6 +426,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; + mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); -- cgit v1.2.3 From 11c7da4b0ca76a57f51c996c883c480e203cf5a9 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 29 Jul 2009 15:02:08 -0700 Subject: kexec: fix omitting offset in extended crashkernel syntax Setting "crashkernel=512M-2G:64M,2G-:128M" does not work but it turns to work if it has a trailing-whitespace, like "crashkernel=512M-2G:64M,2G-:128M ". It was because of a bug in the parser, running over the cmdline. This patch adds a check of the termination. 
Reported-by: Jin Dongming Signed-off-by: Hidetoshi Seto Tested-by: Jin Dongming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ae1c35201cc..f336e2107f9 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline, } while (*cur++ == ','); if (*crash_size > 0) { - while (*cur != ' ' && *cur != '@') + while (*cur && *cur != ' ' && *cur != '@') cur++; if (*cur == '@') { cur++; -- cgit v1.2.3 From 096b7fe012d66ed55e98bc8022405ede0cc80e96 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Jul 2009 15:04:04 -0700 Subject: cgroups: fix pid namespace bug The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844 ("cgroups: convert tasks file to use a seq_file with shared pid array"). We cache a pid array for all threads that are opening the same "tasks" file, but the pids in the array are always from the namespace of the last process that opened the file, so all other threads will read pids from that namespace instead of their own namespaces. To fix it, we maintain a list of pid arrays, which is keyed by pid_ns. The list will be of length 1 most of the time.
Reported-by: Paul Menage Idea-by: Paul Menage Signed-off-by: Li Zefan Reviewed-by: Serge Hallyn Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 96 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3737a682cdf..250dac05680 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -47,6 +47,7 @@ #include #include #include +#include #include @@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); + INIT_LIST_HEAD(&cgrp->pids_list); init_rwsem(&cgrp->pids_mutex); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -2201,12 +2203,30 @@ err: return ret; } +/* + * Cache pids for all threads in the same pid namespace that are + * opening the same "tasks" file. + */ +struct cgroup_pids { + /* The node in cgrp->pids_list */ + struct list_head list; + /* The cgroup those pids belong to */ + struct cgroup *cgrp; + /* The namepsace those pids belong to */ + struct pid_namespace *ns; + /* Array of process ids in the cgroup */ + pid_t *tasks_pids; + /* How many files are using the this tasks_pids array */ + int use_count; + /* Length of the current tasks_pids array */ + int length; +}; + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } - /* * seq_file methods for the "tasks" file. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid @@ -2221,45 +2241,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). 
Use a binary-search to find the * next pid to display, if any */ - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; int index = 0, pid = *pos; int *iter; down_read(&cgrp->pids_mutex); if (pid) { - int end = cgrp->pids_length; + int end = cp->length; while (index < end) { int mid = (index + end) / 2; - if (cgrp->tasks_pids[mid] == pid) { + if (cp->tasks_pids[mid] == pid) { index = mid; break; - } else if (cgrp->tasks_pids[mid] <= pid) + } else if (cp->tasks_pids[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cgrp->pids_length) + if (index >= cp->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cgrp->tasks_pids + index; + iter = cp->tasks_pids + index; *pos = *iter; return iter; } static void cgroup_tasks_stop(struct seq_file *s, void *v) { - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; up_read(&cgrp->pids_mutex); } static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; int *p = v; - int *end = cgrp->tasks_pids + cgrp->pids_length; + int *end = cp->tasks_pids + cp->length; /* * Advance to the next pid in the array. 
If this goes off the @@ -2286,26 +2308,33 @@ static struct seq_operations cgroup_tasks_seq_operations = { .show = cgroup_tasks_show, }; -static void release_cgroup_pid_array(struct cgroup *cgrp) +static void release_cgroup_pid_array(struct cgroup_pids *cp) { + struct cgroup *cgrp = cp->cgrp; + down_write(&cgrp->pids_mutex); - BUG_ON(!cgrp->pids_use_count); - if (!--cgrp->pids_use_count) { - kfree(cgrp->tasks_pids); - cgrp->tasks_pids = NULL; - cgrp->pids_length = 0; + BUG_ON(!cp->use_count); + if (!--cp->use_count) { + list_del(&cp->list); + put_pid_ns(cp->ns); + kfree(cp->tasks_pids); + kfree(cp); } up_write(&cgrp->pids_mutex); } static int cgroup_tasks_release(struct inode *inode, struct file *file) { - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct seq_file *seq; + struct cgroup_pids *cp; if (!(file->f_mode & FMODE_READ)) return 0; - release_cgroup_pid_array(cgrp); + seq = file->private_data; + cp = seq->private; + + release_cgroup_pid_array(cp); return seq_release(inode, file); } @@ -2324,6 +2353,8 @@ static struct file_operations cgroup_tasks_operations = { static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct pid_namespace *ns = current->nsproxy->pid_ns; + struct cgroup_pids *cp; pid_t *pidarray; int npids; int retval; @@ -2350,20 +2381,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) * array if necessary */ down_write(&cgrp->pids_mutex); - kfree(cgrp->tasks_pids); - cgrp->tasks_pids = pidarray; - cgrp->pids_length = npids; - cgrp->pids_use_count++; + + list_for_each_entry(cp, &cgrp->pids_list, list) { + if (ns == cp->ns) + goto found; + } + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) { + up_write(&cgrp->pids_mutex); + kfree(pidarray); + return -ENOMEM; + } + cp->cgrp = cgrp; + cp->ns = ns; + get_pid_ns(ns); + list_add(&cp->list, &cgrp->pids_list); +found: + kfree(cp->tasks_pids); + cp->tasks_pids = pidarray; + cp->length 
= npids; + cp->use_count++; up_write(&cgrp->pids_mutex); file->f_op = &cgroup_tasks_operations; retval = seq_open(file, &cgroup_tasks_seq_operations); if (retval) { - release_cgroup_pid_array(cgrp); + release_cgroup_pid_array(cp); return retval; } - ((struct seq_file *)file->private_data)->private = cgrp; + ((struct seq_file *)file->private_data)->private = cp; return 0; } -- cgit v1.2.3 From 887032670d47366a8c8f25396ea7c14b7b2cc620 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 29 Jul 2009 15:04:06 -0700 Subject: cgroup avoid permanent sleep at rmdir After commit ec64f51545fffbc4cb968f0cea56341a4b07e85a ("cgroup: fix frequent -EBUSY at rmdir"), cgroup's rmdir (especially against memcg) doesn't return -EBUSY by temporary ref counts. That commit expects all refs after pre_destroy() is temporary but...it wasn't. Then, rmdir can wait permanently. This patch tries to fix that and change followings. - set CGRP_WAIT_ON_RMDIR flag before pre_destroy(). - clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case. if there are sleeping ones, wakes them up. - rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set. Tested-by: Daisuke Nishimura Reported-by: Daisuke Nishimura Reviewed-by: Paul Menage Acked-by: Balbir Sigh Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 55 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 250dac05680..b6eadfe30e7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -735,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * reference to css->refcnt. In general, this refcnt is expected to goes down * to zero, soon. 
* - * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; */ DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) { - if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) wake_up_all(&cgroup_rmdir_waitq); } +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) +{ + css_get(css); +} + +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) +{ + cgroup_wakeup_rmdir_waiter(css->cgroup); + css_put(css); +} + + static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -1359,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) * wake up rmdir() waiter. the rmdir should fail since the cgroup * is no longer empty. */ - cgroup_wakeup_rmdir_waiters(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); return 0; } @@ -2743,34 +2755,43 @@ again: } mutex_unlock(&cgroup_mutex); + /* + * In general, subsystem has no css->refcnt after pre_destroy(). But + * in racy cases, subsystem may have to get css->refcnt after + * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes + * make rmdir return -EBUSY too often. To avoid that, we use waitqueue + * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir + * and subsystem's reference count handling. Please see css_get/put + * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. + */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + /* * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. 
*/ ret = cgroup_call_pre_destroy(cgrp); - if (ret) + if (ret) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); return ret; + } mutex_lock(&cgroup_mutex); parent = cgrp->parent; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); mutex_unlock(&cgroup_mutex); return -EBUSY; } - /* - * css_put/get is provided for subsys to grab refcnt to css. In typical - * case, subsystem has no reference after pre_destroy(). But, under - * hierarchy management, some *temporal* refcnt can be hold. - * To avoid returning -EBUSY to a user, waitqueue is used. If subsys - * is really busy, it should return -EBUSY at pre_destroy(). wake_up - * is called when css_put() is called and refcnt goes down to 0. - */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); - schedule(); + /* + * Because someone may call cgroup_wakeup_rmdir_waiter() before + * prepare_to_wait(), we need to check this flag. + */ + if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) + schedule(); finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); if (signal_pending(current)) @@ -3342,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiters(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); } rcu_read_unlock(); } -- cgit v1.2.3 From b62f495dad04fa94b5083aec638ff3072bccaaca Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 29 Jul 2009 15:04:09 -0700 Subject: profile: suppress warning about large allocations when profile=1 is specified When profile= is used, a large buffer is allocated early at boot. This can be larger than what the page allocator can provide so it prints a warning. However, the caller is able to handle the situation so this patch suppresses the warning. Signed-off-by: Mel Gorman Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 69911b5745e..419250ebec4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -117,11 +117,12 @@ int __ref profile_init(void) cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; - prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + prof_buffer = alloc_pages_exact(buffer_bytes, + GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; -- cgit v1.2.3 From ec30c5f3a18722f8fcf8c83146a10b03ac4d9ff1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 28 Jul 2009 19:47:23 -0400 Subject: kprobes: Use kernel_text_address() for checking probe address Use kernel_text_address() for checking probe address instead of __kernel_text_address(), because __kernel_text_address() returns true for init functions even after releasing those functions. That will hit a BUG() in text_poke().
Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 16b5739c516..0540948e29a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -694,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p) p->addr = addr; preempt_disable(); - if (!__kernel_text_address((unsigned long) p->addr) || + if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) { preempt_enable(); return -EINVAL; -- cgit v1.2.3 From 0083fc2c50e6c5127c2802ad323adf8143ab7856 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 1 Aug 2009 10:34:56 -0700 Subject: do_sigaltstack: avoid copying 'stack_t' as a structure to user space Ulrich Drepper correctly points out that there is generally padding in the structure on 64-bit hosts, and that copying the structure from kernel to user space can leak information from the kernel stack in those padding bytes. Avoid the whole issue by just copying the three members one by one instead, which also means that the function also can avoid the need for a stack frame. This also happens to match how we copy the new structure from user space, so it all even makes sense. [ The obvious solution of adding a memset() generates horrid code, gcc does really stupid things. 
] Reported-by: Ulrich Drepper Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ccf1ceedaeb..f268372c0cc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s stack_t oss; int error; - if (uoss) { - oss.ss_sp = (void __user *) current->sas_ss_sp; - oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); - } + oss.ss_sp = (void __user *) current->sas_ss_sp; + oss.ss_size = current->sas_ss_size; + oss.ss_flags = sas_ss_flags(sp); if (uss) { void __user *ss_sp; @@ -2501,13 +2499,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s current->sas_ss_size = ss_size; } + error = 0; if (uoss) { error = -EFAULT; - if (copy_to_user(uoss, &oss, sizeof(oss))) + if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) goto out; + error = __put_user(oss.ss_sp, &uoss->ss_sp) | + __put_user(oss.ss_size, &uoss->ss_size) | + __put_user(oss.ss_flags, &uoss->ss_flags); } - error = 0; out: return error; } -- cgit v1.2.3 From 0dd8486b5cfe8048e0613334659d9252ecd1b08a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 1 Aug 2009 11:18:56 -0700 Subject: do_sigaltstack: small cleanups The previous commit ("do_sigaltstack: avoid copying 'stack_t' as a structure to user space") fixed a real bug. This one just cleans up the copy from user space so that gcc can generate better code for it (and so that it looks the same as the later copy back to user space).
Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f268372c0cc..64c5deeaca5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2464,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s int ss_flags; error = -EFAULT; - if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) - || __get_user(ss_sp, &uss->ss_sp) - || __get_user(ss_flags, &uss->ss_flags) - || __get_user(ss_size, &uss->ss_size)) + if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) + goto out; + error = __get_user(ss_sp, &uss->ss_sp) | + __get_user(ss_flags, &uss->ss_flags) | + __get_user(ss_size, &uss->ss_size); + if (error) goto out; error = -EPERM; -- cgit v1.2.3 From e53c0994709166b111fbe9162d1a16ece7dfc45b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jul 2009 14:42:10 +0200 Subject: perf_counter: Collapse inherit on read() Currently the counter value returned by read() is the value of the parent counter, to which child counters are only fed back on child exit. Thus read() can return rather erratic (and meaningless) numbers depending on the state of the child processes. Change this by always iterating the full child hierarchy on read() and sum all counters. 
Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 95093104195..48471d75ae0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1688,6 +1688,18 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +static u64 perf_counter_read_tree(struct perf_counter *counter) +{ + struct perf_counter *child; + u64 total = 0; + + total += perf_counter_read(counter); + list_for_each_entry(child, &counter->child_list, child_list) + total += perf_counter_read(child); + + return total; +} + /* * Read the performance counter - simple non blocking version for now */ @@ -1707,7 +1719,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read(counter); + values[0] = perf_counter_read_tree(counter); n = 1; if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = counter->total_time_enabled + -- cgit v1.2.3 From 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 14:46:33 +0200 Subject: perf_counter: Full task tracing In order to be able to distinguish between no samples due to inactivity and no samples due to task ended, Arjan asked for PERF_EVENT_EXIT events. This is useful to the boot delay instrumentation (bootchart) app. This patch changes the PERF_EVENT_FORK to be emitted on every clone, and adds PERF_EVENT_EXIT to be emitted on task exit, after the task's counters have been closed. This task tracing is controlled through: attr.comm || attr.mmap and through the new attr.task field. 
Suggested-by: Arjan van de Ven Cc: Paul Mackerras Cc: Anton Blanchard Signed-off-by: Peter Zijlstra [ cleaned up perf_counter.h a bit ] Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 +-- kernel/perf_counter.c | 87 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 29b532e718f..466531eb92c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1269,6 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + perf_counter_fork(p); return p; bad_fork_free_pid: @@ -1410,9 +1411,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if (!(clone_flags & CLONE_THREAD)) - perf_counter_fork(p); - audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 48471d75ae0..199ed477131 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; +static atomic_t nr_task_counters __read_mostly; /* * perf counter paranoia level: @@ -1654,6 +1655,8 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_mmap_counters); if (counter->attr.comm) atomic_dec(&nr_comm_counters); + if (counter->attr.task) + atomic_dec(&nr_task_counters); } if (counter->destroy) @@ -2831,10 +2834,12 @@ perf_counter_read_event(struct perf_counter *counter, } /* - * fork tracking + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task */ -struct perf_fork_event { +struct perf_task_event { struct task_struct *task; struct { @@ -2842,37 +2847,42 @@ struct perf_fork_event { u32 pid; u32 ppid; + u32 tid; + u32 ptid; } event; }; -static void 
perf_counter_fork_output(struct perf_counter *counter, - struct perf_fork_event *fork_event) +static void perf_counter_task_output(struct perf_counter *counter, + struct perf_task_event *task_event) { struct perf_output_handle handle; - int size = fork_event->event.header.size; - struct task_struct *task = fork_event->task; + int size = task_event->event.header.size; + struct task_struct *task = task_event->task; int ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; - fork_event->event.pid = perf_counter_pid(counter, task); - fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + task_event->event.pid = perf_counter_pid(counter, task); + task_event->event.ppid = perf_counter_pid(counter, task->real_parent); - perf_output_put(&handle, fork_event->event); + task_event->event.tid = perf_counter_tid(counter, task); + task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + + perf_output_put(&handle, task_event->event); perf_output_end(&handle); } -static int perf_counter_fork_match(struct perf_counter *counter) +static int perf_counter_task_match(struct perf_counter *counter) { - if (counter->attr.comm || counter->attr.mmap) + if (counter->attr.comm || counter->attr.mmap || counter->attr.task) return 1; return 0; } -static void perf_counter_fork_ctx(struct perf_counter_context *ctx, - struct perf_fork_event *fork_event) +static void perf_counter_task_ctx(struct perf_counter_context *ctx, + struct perf_task_event *task_event) { struct perf_counter *counter; @@ -2881,19 +2891,19 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_fork_match(counter)) - perf_counter_fork_output(counter, fork_event); + if (perf_counter_task_match(counter)) + perf_counter_task_output(counter, task_event); } rcu_read_unlock(); } -static void perf_counter_fork_event(struct perf_fork_event *fork_event) +static void 
perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); @@ -2903,32 +2913,40 @@ static void perf_counter_fork_event(struct perf_fork_event *fork_event) */ ctx = rcu_dereference(current->perf_counter_ctxp); if (ctx) - perf_counter_fork_ctx(ctx, fork_event); + perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -void perf_counter_fork(struct task_struct *task) +static void perf_counter_task(struct task_struct *task, int new) { - struct perf_fork_event fork_event; + struct perf_task_event task_event; if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters)) + !atomic_read(&nr_mmap_counters) && + !atomic_read(&nr_task_counters)) return; - fork_event = (struct perf_fork_event){ + task_event = (struct perf_task_event){ .task = task, .event = { .header = { - .type = PERF_EVENT_FORK, + .type = new ? 
PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, - .size = sizeof(fork_event.event), + .size = sizeof(task_event.event), }, /* .pid */ /* .ppid */ + /* .tid */ + /* .ptid */ }, }; - perf_counter_fork_event(&fork_event); + perf_counter_task_event(&task_event); +} + +void perf_counter_fork(struct task_struct *task) +{ + perf_counter_task(task, 1); } /* @@ -3887,6 +3905,8 @@ done: atomic_inc(&nr_mmap_counters); if (counter->attr.comm) atomic_inc(&nr_comm_counters); + if (counter->attr.task) + atomic_inc(&nr_task_counters); } return counter; @@ -4248,8 +4268,10 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - if (likely(!child->perf_counter_ctxp)) + if (likely(!child->perf_counter_ctxp)) { + perf_counter_task(child, 0); return; + } local_irq_save(flags); /* @@ -4267,15 +4289,22 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all * the counters from it. */ unclone_ctx(child_ctx); - spin_unlock(&child_ctx->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the counters so that we + * won't get any samples after PERF_EVENT_EXIT. We can however still + * get a few PERF_EVENT_READ events. + */ + perf_counter_task(child, 0); + + child->perf_counter_ctxp = NULL; /* * We can recurse on the same lock type through: -- cgit v1.2.3 From e414314cce7539788dd5d2c35decad11782dd858 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 20:13:26 +0200 Subject: sched: Fix latencytop and sleep profiling vs group scheduling The latencytop and sleep accounting code assumes that any scheduler entity represents a task, this is not so. 
Cc: Arjan van de Ven Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9ffb2b2ceba..652e8bdef9a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -611,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS + struct task_struct *tsk = NULL; + + if (entity_is_task(se)) + tsk = task_of(se); + if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; - struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -624,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; - account_scheduler_latency(tsk, delta >> 10, 1); + if (tsk) + account_scheduler_latency(tsk, delta >> 10, 1); } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; - struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -639,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->block_start = 0; se->sum_sleep_runtime += delta; - /* - * Blocking time is in units of nanosecs, so shift by 20 to - * get a milliseconds-range estimation of the amount of - * time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - - profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), - delta >> 20); + if (tsk) { + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); } - 
account_scheduler_latency(tsk, delta >> 10, 0); } #endif } -- cgit v1.2.3 From 07903af152b0597d94e9b0030746b63c4664e787 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Thu, 30 Jul 2009 10:57:28 -0400 Subject: sched: Fix race in cpupri introduced by cpumask_var changes Background: Several race conditions in the scheduler have cropped up recently, which Steven and I have tracked down using ftrace. The most recent one turns out to be a race in how the scheduler determines a suitable migration target for RT tasks, introduced recently with commit: commit 68e74568fbe5854952355e942acca51f138096d9 Date: Tue Nov 25 02:35:13 2008 +1030 sched: convert struct cpupri_vec cpumask_var_t. The original design of cpupri allowed lockless readers to quickly determine a best-estimate target. Races between the pri_active bitmap and the vec->mask were handled in the original code because we would detect and return "0" when this occurred. The design was predicated on the *effective* atomicity (*) of caching the result of cpus_and() between the cpus_allowed and the vec->mask. Commit 68e74568 changed the behavior such that vec->mask is accessed multiple times. This introduces a subtle race, the result of which means we can have a result that returns "1", but with an empty bitmap. *) yes, we know cpus_and() is not a locked operator across the entire composite array, but it is implicitly atomic on a per-word basis which is all the design required to work. Implementation: Rather than forgoing the lockless design, or reverting to a stack-based cpumask_t, we simply check for when the race has been encountered and continue processing in the event that the race is hit. This renders the removal race as if the priority bit had been atomically cleared as well, and allows the algorithm to execute correctly.
Signed-off-by: Gregory Haskins CC: Rusty Russell CC: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <20090730145728.25226.92769.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched_cpupri.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6c251790dd..d014efbf947 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - if (lowest_mask) + if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + + /* + * We have to ensure that we have at least one bit + * still set in the array, since the map could have + * been concurrently emptied between the first and + * second reads of vec->mask. If we hit this + * condition, simply act as though we never hit this + * priority level and continue on. + */ + if (cpumask_any(lowest_mask) >= nr_cpu_ids) + continue; + } + return 1; } -- cgit v1.2.3 From 70d715fd0597f18528f389b5ac59102263067744 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 3 Aug 2009 11:48:19 +0900 Subject: posix-timers: Fix oops in clock_nanosleep() with CLOCK_MONOTONIC_RAW Prevent calling do_nanosleep() with clockid CLOCK_MONOTONIC_RAW, it may cause oops, such as NULL pointer dereference. 
Signed-off-by: Hiroshi Shimamoto Cc: Andrew Morton Cc: Thomas Gleixner Cc: John Stultz Cc: LKML-Reference: <4A764FF3.50607@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/posix-timers.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 052ec4d195c..d089d052c4a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer) return -EOPNOTSUPP; } +static int no_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsave, struct timespec __user *rmtp) +{ + return -EOPNOTSUPP; +} + /* * Return nonzero if we know a priori this clockid_t value is bogus. */ @@ -254,6 +260,7 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_raw, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, + .nsleep = no_nsleep, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); -- cgit v1.2.3 From 0f2541d299d233eddddee4345795e0c46264fd56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 12:02:48 -0400 Subject: ring-buffer: fix check of try_to_discard result The function ring_buffer_discard_commit inverted the code path of the result of try_to_discard. It should skip incrementing the entry counter if try_to_discard succeeded. But instead, it increments the entry counter if it succeeded to discard, and does not increment it if it fails. The result of this bug is that filtering will make the stat counters incorrect.
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e..2fd1752f0c8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1785,7 +1785,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); - if (!rb_try_to_discard(cpu_buffer, event)) + if (rb_try_to_discard(cpu_buffer, event)) goto out; /* -- cgit v1.2.3 From 464e85eb0e63096bd52e4c3e2a6fb8357fb95828 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Aug 2009 15:26:37 -0400 Subject: ring-buffer: do not disable ring buffer on oops_in_progress The commit: commit e0fdace10e75dac67d906213b780ff1b1a4cc360 Author: David Miller Date: Fri Aug 1 01:11:22 2008 -0700 debug_locks: set oops_in_progress if we will log messages. Otherwise lock debugging messages on runqueue locks can deadlock the system due to the wakeups performed by printk(). Signed-off-by: David S. Miller Signed-off-by: Ingo Molnar Will permanently set oops_in_progress on any lockdep failure. When this triggers it will cause any read from the ring buffer to permanently disable the ring buffer (not to mention no locking of printk). This patch removes the check. It keeps the print in NMI which makes sense. This is probably OK, since the ring buffer should not cause something to set oops_in_progress anyway. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2fd1752f0c8..2606cee433d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2486,7 +2486,7 @@ static inline int rb_ok_to_lock(void) * buffer too. A one time deal is all you get from reading * the ring buffer from an NMI. 
*/ - if (likely(!in_nmi() && !oops_in_progress)) + if (likely(!in_nmi())) return 1; tracing_off_permanent(); -- cgit v1.2.3 From 1bbf20835c4e088667a090ce6523a0f70b62dc76 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 5 Aug 2009 12:05:21 -0700 Subject: rtmutex: Avoid deadlock in rt_mutex_start_proxy_lock() In the event of a lock steal or owner died, rt_mutex_start_proxy_lock() will give the rt_mutex to the waiting task, but it fails to release the wait_lock. This leads to subsequent deadlocks when other tasks try to acquire the rt_mutex. I also removed a few extra blank lines that really spaced this routine out. I must have been high on the \n when I wrote this originally... Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A79D7F1.4000405@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rtmutex.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index fcd107a78c5..29bd4baf9e7 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { /* We got the lock for task. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); - + spin_unlock(&lock->wait_lock); rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { /* * Reset the return value. We might have -- cgit v1.2.3 From af6af30c0fcd77e621638e53ef8b176bca8bd3b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Aug 2009 20:41:04 +0200 Subject: ftrace: Fix perf-tracepoint OOPS Not all tracepoints are created equal, in specific the ftrace tracepoints are created with TRACE_EVENT_FORMAT() which does not generate the needed bits to tie them into perf counters. 
For those events, don't create the 'id' file and fail ->profile_enable when their ID is specified through other means. Reported-by: Chris Mason Signed-off-by: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1249497664.5890.4.camel@laptop> [ v2: fix build error in the !CONFIG_EVENT_PROFILE case ] Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_profile.c | 2 +- kernel/trace/trace_events.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 5b5895afecf..11ba5bb4ed0 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { + if (event->id == event_id && event->profile_enable) { ret = event->profile_enable(event); break; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 23d2972b22d..e75276a49cf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -940,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, entry = trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id) + if (call->id && call->profile_enable) entry = trace_create_file("id", 0444, call->dir, call, id); -- cgit v1.2.3 From 469535a598f28c13a2a42037e1b778f671af1d16 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 30 Jul 2009 19:19:18 +0200 Subject: ring-buffer: Fix advance of reader in rb_buffer_peek() When calling rb_buffer_peek() from ring_buffer_consume() and a padding event is returned, the function rb_advance_reader() is called twice. This may lead to missing samples or under high workloads to the warning below. This patch fixes this. If a padding event is returned by rb_buffer_peek() it will be consumed by the calling function now. 
Also, I simplified some code in ring_buffer_consume(). ------------[ cut here ]------------ WARNING: at /dev/shm/.source/linux/kernel/trace/ring_buffer.c:2289 rb_advance_reader+0x2e/0xc5() Hardware name: Anaheim Modules linked in: Pid: 29, comm: events/2 Tainted: G W 2.6.31-rc3-oprofile-x86_64-standard-00059-g5050dc2 #1 Call Trace: [] ? rb_advance_reader+0x2e/0xc5 [] warn_slowpath_common+0x77/0x8f [] warn_slowpath_null+0xf/0x11 [] rb_advance_reader+0x2e/0xc5 [] ring_buffer_consume+0xa0/0xd2 [] op_cpu_buffer_read_entry+0x21/0x9e [] ? __find_get_block+0x4b/0x165 [] sync_buffer+0xa5/0x401 [] ? __find_get_block+0x4b/0x165 [] ? wq_sync_buffer+0x0/0x78 [] wq_sync_buffer+0x5b/0x78 [] worker_thread+0x113/0x1ac [] ? autoremove_wake_function+0x0/0x38 [] ? worker_thread+0x0/0x1ac [] kthread+0x88/0x92 [] child_rip+0xa/0x20 [] ? kthread+0x0/0x92 [] ? child_rip+0x0/0x20 ---[ end trace f561c0a58fcc89bd ]--- Cc: Steven Rostedt Cc: Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2606cee433d..d4d3580a894 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2383,7 +2383,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * the box. Return the padding, and we will release * the current locks, and try again. 
*/ - rb_advance_reader(cpu_buffer); return event; case RINGBUF_TYPE_TIME_EXTEND: @@ -2519,6 +2518,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); + if (event && event->type_len == RINGBUF_TYPE_PADDING) + rb_advance_reader(cpu_buffer); if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); @@ -2590,12 +2591,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - if (!event) - goto out_unlock; - - rb_advance_reader(cpu_buffer); + if (event) + rb_advance_reader(cpu_buffer); - out_unlock: if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); -- cgit v1.2.3 From 1054598cab8674438675085fae459e960eb10799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2009 18:06:26 +0200 Subject: perf_counter: Fix double list iteration in per task precise stats Brice Goglin reported this crash with per task precise stats: > I finally managed to test the threaded perfcounter statistics (thanks a > lot for implementing it). I am running 2.6.31-rc5 (with the AMD > magny-cours patches but I don't think they matter here). I am trying to > measure local/remote memory accesses per thread during the well-known > stream benchmark. It's compiled with OpenMP using 16 threads on a > quad-socket quad-core barcelona machine. > > Command line is: > /mnt/scratch/bgoglin/cpunode/linux-2.6.31/tools/perf/perf record -f -s > -e r1000001e0 -e r1000002e0 -e r1000004e0 -e r1000008e0 ./stream > > It seems to work fine with a single -e on the command line > while it crashes when there are at least 2 of them. > It seems to work fine without -s as well. A silly copy-paste resulted in a messed up iteration which would cause the OOPS. 
Reported-by: Brice Goglin Signed-off-by: Peter Zijlstra Tested-by: Brice Goglin LKML-Reference: <1249574786.32113.550.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 199ed477131..673c1aaf733 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1104,7 +1104,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx, __perf_counter_sync_stat(counter, next_counter); counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(counter, event_entry); + next_counter = list_next_entry(next_counter, event_entry); } } -- cgit v1.2.3 From 9795447f71324d8f14c19ed68b43c883135c3f59 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 16:37:10 +0800 Subject: lockdep: Fix file mode of lock_stat /proc/lock_stat is writable. Signed-off-by: Li Zefan Cc: Peter Zijlstra LKML-Reference: <4A7BE7B6.10904@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d7135aa2d2c..e94caa666db 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void) &proc_lockdep_stats_operations); #ifdef CONFIG_LOCK_STAT - proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); + proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, + &proc_lock_stat_operations); #endif return 0; -- cgit v1.2.3 From bd3f02212d6a457267e0c9c02c426151c436d9d4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Aug 2009 12:49:29 +0200 Subject: ring-buffer: Fix memleak in ring_buffer_free() I noticed oprofile memleaked in linux-2.6 current tree, and tracked this ring-buffer leak. 
Signed-off-by: Eric Dumazet LKML-Reference: <4A7C06B9.2090302@gmail.com> Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4d3580a894..a330513d96c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -735,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer) put_online_cpus(); + kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); -- cgit v1.2.3 From 69dd647f969c28d18de77e2153f30d05a1874571 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 6 Aug 2009 15:07:29 -0700 Subject: generic-ipi: fix hotplug_cfd() Use CONFIG_HOTPLUG_CPU, not CONFIG_CPU_HOTPLUG When hot-unplugging a cpu, it will leak memory allocated at cpu hotplug, but only if CPUMASK_OFFSTACK=y, which defaults to n. The bug was introduced by 8969a5ede0f9e17da4b943712429aef2c9bcd82b ("generic-ipi: remove kmalloc()"). Signed-off-by: Xiao Guangrong Cc: Ingo Molnar Cc: Jens Axboe Cc: Nick Piggin Cc: Peter Zijlstra Cc: Rusty Russell Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ad63d850120..94188b8ecc3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_BAD; break; -#ifdef CONFIG_CPU_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: -- cgit v1.2.3 From 9c8a8228d0827e0d91d28527209988f672f97d28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Aug 2009 15:09:28 -0700 Subject: execve: must clear current->clear_child_tid While looking at Jens Rosenboom bug report (http://lkml.org/lkml/2009/7/27/35) about strange sys_futex call done from a dying "ps" program, we found the following problem.
clone() syscall has special support for TID of created threads. This support includes two features. One (CLONE_CHILD_SETTID) is to set an integer into user memory with the TID value. One (CLONE_CHILD_CLEARTID) is to clear this same integer once the created thread dies. The integer location is a user provided pointer, provided at clone() time. The kernel keeps this pointer value in current->clear_child_tid. At execve() time, we should make sure the kernel doesn't keep this user provided pointer, as full user memory is replaced by a new one. As glibc fork() actually uses clone() syscall with CLONE_CHILD_SETTID and CLONE_CHILD_CLEARTID set, chances are high that we might corrupt user memory in forked processes. The following sequence could happen: 1) bash (or any program) starts a new process, by a fork() call that glibc maps to a clone( ... CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID ...) syscall 2) When new process starts, its current->clear_child_tid is set to a location that has a meaning only in bash (or initial program) context (&THREAD_SELF->tid) 3) This new process does the execve() syscall to start a new program. current->clear_child_tid is left unchanged (a non NULL value) 4) If this new program creates some threads, and initial thread exits, kernel will attempt to clear the integer pointed to by current->clear_child_tid from mm_release() : if (tsk->clear_child_tid && !(tsk->flags & PF_SIGNALED) && atomic_read(&mm->mm_users) > 1) { u32 __user * tidptr = tsk->clear_child_tid; tsk->clear_child_tid = NULL; /* * We don't check the error code - if userspace has * not set up a proper pointer then tough luck.
*/ << here >> put_user(0, tidptr); sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); } 5) OR : if new program is not multi-threaded, but spied by /proc/pid users (ps command for example), mm_users > 1, and the exiting program could corrupt 4 bytes in a persistent memory area (shm or memory mapped file) If current->clear_child_tid points to a writeable portion of memory of the new program, kernel happily and silently corrupts 4 bytes of memory, with unexpected effects. Fix is straightforward and should not break any sane program. Reported-by: Jens Rosenboom Acked-by: Linus Torvalds Signed-off-by: Eric Dumazet Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Sonny Rao Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ulrich Drepper Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 466531eb92c..021e1138556 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -568,18 +568,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * the value intact in a core dump, and to save the unnecessary * trouble otherwise. Userland only wants this done for a sys_exit. */ - if (tsk->clear_child_tid - && !(tsk->flags & PF_SIGNALED) - && atomic_read(&mm->mm_users) > 1) { - u32 __user * tidptr = tsk->clear_child_tid; + if (tsk->clear_child_tid) { + if (!(tsk->flags & PF_SIGNALED) && + atomic_read(&mm->mm_users) > 1) { + /* + * We don't check the error code - if userspace has + * not set up a proper pointer then tough luck. + */ + put_user(0, tsk->clear_child_tid); + sys_futex(tsk->clear_child_tid, FUTEX_WAKE, + 1, NULL, NULL, 0); + } tsk->clear_child_tid = NULL; - - /* - * We don't check the error code - if userspace has - * not set up a proper pointer then tough luck. 
- */ - put_user(0, tidptr); - sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); } } -- cgit v1.2.3 From ad7d6c7a0654a4bbda3e109f56af713267e96274 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 4 Aug 2009 09:01:33 -0700 Subject: x86/irq: Fix move_irq_desc() for nodes without ram Don't move it if target node is -1. Signed-off-by: Yinghai Lu LKML-Reference: <4A785B5D.4070702@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/numa_migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 2f69bee57bf..3fd30197da2 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -107,8 +107,8 @@ out_unlock: struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - /* those all static, do move them */ - if (desc->irq < NR_IRQS_LEGACY) + /* those static or target node is -1, do not move them */ + if (desc->irq < NR_IRQS_LEGACY || node == -1) return desc; if (desc->node != node) -- cgit v1.2.3 From 96b2de313b1e0e02aea80ee47df6a2b5cbdf8e13 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:09 -0500 Subject: tracing/filters: Don't use pred on alloc failure Dan Carpenter sent me a fix to prevent pred from being used if it couldn't be allocated. I noticed the same problem also existed for the create_pred() case and added a fix for that. 
Reported-by: Dan Carpenter Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746549.6453.29.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621bbf4..1557148be34 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1029,6 +1029,8 @@ static int replace_preds(struct event_subsystem *system, if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); + if (!pred) + return -ENOMEM; if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); @@ -1048,6 +1050,8 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); + if (!pred) + return -ENOMEM; if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); -- cgit v1.2.3 From 26528e773ecc74fb1b61b7275f86f761cbb340ec Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:53 -0500 Subject: tracing/filters: Always free pred on filter_add_subsystem_pred() failure If filter_add_subsystem_pred() fails due to ENOSPC or ENOMEM, the pred doesn't get freed, while as a side effect it does for other errors. Make it so the caller always frees the pred for any error. 
Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746593.6453.32.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1557148be34..f32dc9d1ea7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -624,9 +624,6 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, return -ENOSPC; } - filter->preds[filter->n_preds] = pred; - filter->n_preds++; - list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -643,6 +640,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, } replace_filter_string(call->filter, filter_string); } + + filter->preds[filter->n_preds] = pred; + filter->n_preds++; out: return err; } @@ -1034,9 +1034,12 @@ static int replace_preds(struct event_subsystem *system, if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); - } else + } else { err = filter_add_subsystem_pred(ps, system, pred, filter_string); + if (err) + filter_free_pred(pred); + } if (err) return err; @@ -1055,9 +1058,12 @@ static int replace_preds(struct event_subsystem *system, if (call) { err = filter_add_pred(ps, call, pred); filter_free_pred(pred); - } else + } else { err = filter_add_subsystem_pred(ps, system, pred, filter_string); + if (err) + filter_free_pred(pred); + } if (err) return err; -- cgit v1.2.3 From 17d42c1c497aa54952b9e58c1502a46f0df40315 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 6 Aug 2009 16:03:30 -0700 Subject: posix_cpu_timers_exit_group(): Do not use thread_group_cputimer() When the process exits we don't have to run new cputimer nor use running one (as it not accounts when tsk->exit_state != 0) to get process CPU times. 
As there is only one thread we can just use CPU times fields from task and signal structs. Signed-off-by: Stanislaw Gruszka Cc: Peter Zijlstra Cc: Roland McGrath Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bece7c0b67b..e33a21cb940 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk) } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - struct task_cputime cputime; + struct signal_struct *const sig = tsk->signal; - thread_group_cputimer(tsk, &cputime); cleanup_timers(tsk->signal->cpu_timers, - cputime.utime, cputime.stime, cputime.sum_exec_runtime); + cputime_add(tsk->utime, sig->utime), + cputime_add(tsk->stime, sig->stime), + tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) -- cgit v1.2.3 From 3a6593050fbd8bbcaed3a44d01c31d907315c86c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 17:34:57 +0200 Subject: perf_counter, ftrace: Fix perf_counter integration Adds possible second part to the assign argument of TP_EVENT(). TP_perf_assign( __perf_count(foo); __perf_addr(bar); ) Which, when specified make the swcounter increment with @foo instead of the usual 1, and report @bar for PERF_SAMPLE_ADDR (data address associated with the event) when this triggers a counter overflow. 
Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 673c1aaf733..52eb4b68d34 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3703,17 +3703,17 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id) +void perf_tpcounter_event(int event_id, u64 addr, u64 count) { struct perf_sample_data data = { .regs = get_irq_regs(), - .addr = 0, + .addr = addr, }; if (!data.regs) data.regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); -- cgit v1.2.3 From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 01:25:54 +0200 Subject: perf_counter: Fix/complete ftrace event records sampling This patch implements the kernel side support for ftrace event record sampling. A new counter sampling attribute is added: PERF_SAMPLE_TP_RECORD which requests ftrace events record sampling. In this case if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint fires, we emit the tracepoint binary record to the perfcounter event buffer, as a sample. Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf record: perf record -f -F 1 -a -e workqueue:workqueue_execution perf report -D 0x21e18 [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........ . 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!...... . 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve . 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1........... 
. 0040: e0 b1 31 81 ff ff ff ff ....... . 0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33 The raw ftrace binary record starts at offset 0020. Translation: struct trace_entry { type = 0x2b = 43; flags = 1; preempt_count = 2; pid = 0xa = 10; tgid = 0xa = 10; } thread_comm = "events/1" thread_pid = 0xa = 10; func = 0xffffffff8131b1e0 = flush_to_ldisc() What will come next? - Userspace support ('perf trace'), 'flight data recorder' mode for perf trace, etc. - The unconditional copy from the profiling callback brings some costs however if someone wants no such sampling to occur, and needs to be fixed in the future. For that we need to have an instant access to the perf counter attribute. This is a matter of a flag to add in the struct ftrace_event. - Take care of the events recursivity! Don't ever try to record a lock event for example, it seems some locking is used in the profiling fast path and lead to a tracing recursivity. That will be fixed using raw spinlock or recursivity protection. - [...] - Profit! 
:-) Signed-off-by: Frederic Weisbecker Cc: Li Zefan Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Steven Rostedt Cc: Paul Mackerras Cc: Pekka Enberg Cc: Gabriel Munteanu Cc: Lai Jiangshan Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 18 +++++++++++++++++- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 4 ---- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52eb4b68d34..868102172aa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; + struct perf_tracepoint_record *tp; int callchain_size = 0; u64 time; struct { @@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } + if (sample_type & PERF_SAMPLE_TP_RECORD) { + tp = data->private; + header.size += tp->size; + } + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } + if (sample_type & PERF_SAMPLE_TP_RECORD) + perf_output_copy(&handle, tp->record, tp->size); + perf_output_end(&handle); } @@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) { + struct perf_tracepoint_record tp = { + .size = entry_size, + .record = record, + }; + struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, + .private = &tp, }; if (!data.regs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8930e39b9d8..c22b40f8f57 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -848,6 +848,7 @@ 
tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, int type, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc78..8b9f4f6e955 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); - void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); -- cgit v1.2.3 From 7b4b6658e152ed4568cfff48175d93645df081d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 09:29:32 +0200 Subject: perf_counter: Fix software counters for fast moving event sources Reimplement the software counters to deal with fast moving event sources (such as tracepoints). This means being able to generate multiple overflows from a single 'event' as well as support throttling. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 164 +++++++++++++++++++++++++++++--------------------- 1 file changed, 94 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 868102172aa..615440ab929 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3344,87 +3344,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, * Generic software counter infrastructure */ -static void perf_swcounter_update(struct perf_counter *counter) +/* + * We directly increment counter->count and keep a second value in + * counter->hw.period_left to count intervals. 
This period counter + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; - u64 prev, now; - s64 delta; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; again: - prev = atomic64_read(&hwc->prev_count); - now = atomic64_read(&hwc->count); - if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) - goto again; + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; - delta = now - prev; + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); + return nr; } -static void perf_swcounter_set_period(struct perf_counter *counter) +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct perf_sample_data *data) { struct hw_perf_counter *hwc = &counter->hw; - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; + u64 overflow; - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - } + data->period = counter->hw.last_period; + overflow = perf_swcounter_set_period(counter); - if (unlikely(left <= 0)) { - left += period; - atomic64_add(period, &hwc->period_left); - hwc->last_period = period; - } + if (hwc->interrupts == MAX_INTERRUPTS) + return; - atomic64_set(&hwc->prev_count, -left); - atomic64_set(&hwc->count, -left); + for (; overflow; overflow--) { + if (perf_counter_overflow(counter, nmi, data)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. 
+ */ + break; + } + } } -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +static void perf_swcounter_unthrottle(struct perf_counter *counter) { - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - data.regs = get_irq_regs(); /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. + * Nothing to do, we already reset hwc->interrupts. */ - if ((counter->attr.exclude_kernel || !data.regs) && - !counter->attr.exclude_user) - data.regs = task_pt_regs(current); +} - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) - ret = HRTIMER_NORESTART; - } +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct perf_sample_data *data) +{ + struct hw_perf_counter *hwc = &counter->hw; - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + atomic64_add(nr, &counter->count); - return ret; -} + if (!hwc->sample_period) + return; -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data) -{ - data->period = counter->hw.last_period; + if (!data->regs) + return; - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, data)) - /* soft-disable the counter */ - ; + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swcounter_overflow(counter, nmi, data); } static int perf_swcounter_is_counting(struct perf_counter *counter) @@ -3488,15 +3482,6 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) -{ - int neg = atomic64_add_negative(nr, &counter->hw.count); - - if 
(counter->hw.sample_period && !neg && data->regs) - perf_swcounter_overflow(counter, nmi, data); -} - static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_type_id type, u32 event, u64 nr, int nmi, @@ -3575,26 +3560,65 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi, static void perf_swcounter_read(struct perf_counter *counter) { - perf_swcounter_update(counter); } static int perf_swcounter_enable(struct perf_counter *counter) { - perf_swcounter_set_period(counter); + struct hw_perf_counter *hwc = &counter->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swcounter_set_period(counter); + } return 0; } static void perf_swcounter_disable(struct perf_counter *counter) { - perf_swcounter_update(counter); } static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, + .unthrottle = perf_swcounter_unthrottle, }; +/* + * hrtimer based swcounter callback + */ + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct perf_counter *counter; + u64 period; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->pmu->read(counter); + + data.addr = 0; + data.regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. 
+ */ + if ((counter->attr.exclude_kernel || !data.regs) && + !counter->attr.exclude_user) + data.regs = task_pt_regs(current); + + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, counter->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + /* * Software counter: cpu wall time clock */ -- cgit v1.2.3 From 10b8e3066066708f304e0fc5cfe658e05abf943d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:35 +0200 Subject: perf_counter: Work around gcc warning by initializing tracepoint record unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite that the tracepoint record is always present when the PERF_SAMPLE_TP_RECORD flag is set, gcc raises a warning, thinking it might not be initialized: kernel/perf_counter.c: In function ‘perf_counter_output’: kernel/perf_counter.c:2650: warning: ‘tp’ may be used uninitialized in this function Then, initialize it to NULL and always check if it's not NULL before dereference it. 
Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 615440ab929..117622cb73a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp; + struct perf_tracepoint_record *tp = NULL; int callchain_size = 0; u64 time; struct { @@ -2717,7 +2717,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_TP_RECORD) { tp = data->private; - header.size += tp->size; + if (tp) + header.size += tp->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2783,7 +2784,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if (sample_type & PERF_SAMPLE_TP_RECORD) + if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) perf_output_copy(&handle, tp->record, tp->size); perf_output_end(&handle); -- cgit v1.2.3 From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:37 +0200 Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling Based on Peter's comments, make tracepoint sampling generic just like all the other sampling bits are. This is a rename with no code changes: - PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW - struct perf_tracepoint_record to perf_raw_record We want the system in place that transport tracepoints raw samples events into the perf ring buffer to be generalized and usable by any type of counter. 
Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 117622cb73a..00231054041 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp = NULL; + struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_TP_RECORD) { - tp = data->private; - if (tp) - header.size += tp->size; + if (sample_type & PERF_SAMPLE_RAW) { + raw = data->raw; + if (raw) + header.size += raw->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) - perf_output_copy(&handle, tp->record, tp->size); + if ((sample_type & PERF_SAMPLE_RAW) && raw) + perf_output_copy(&handle, raw->data, raw->size); perf_output_end(&handle); } @@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { - struct perf_tracepoint_record tp = { + struct perf_raw_record raw = { .size = entry_size, - .record = record, + .data = record, }; struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, - .private = &tp, + .raw = &raw, }; if (!data.regs) -- cgit v1.2.3 From
3a80b4a3539696f4b0574876326860323035a302 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Aug 2009 19:49:01 +0200 Subject: perf_counter: Fix a race on perf_counter_ctx While extending perfcounters with BTS hw-tracing, Markus Metzger managed to trigger this warning: [ 995.557128] WARNING: at kernel/perf_counter.c:1191 __perf_counter_task_sched_out+0x48/0x6b() triggers because commit 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 (perf_counter: Full task tracing) removed clearing of tsk->perf_counter_ctxp out from under ctx->lock which introduced a race (against perf_lock_task_context). Move it back and deal with the exit notification by explicitly passing along the former task context. Reported-by: Markus T Metzger Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1249667341.17467.5.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 00231054041..546e62d6294 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2850,7 +2850,8 @@ perf_counter_read_event(struct perf_counter *counter, */ struct perf_task_event { - struct task_struct *task; + struct task_struct *task; + struct perf_counter_context *task_ctx; struct { struct perf_event_header header; @@ -2910,24 +2911,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx, static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_counter_context *ctx = task_event->task_ctx; cpuctx = &get_cpu_var(perf_cpu_context); perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. 
- */ - ctx = rcu_dereference(current->perf_counter_ctxp); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_counter_ctxp); if (ctx) perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -static void perf_counter_task(struct task_struct *task, int new) +static void perf_counter_task(struct task_struct *task, + struct perf_counter_context *task_ctx, + int new) { struct perf_task_event task_event; @@ -2937,8 +2937,9 @@ static void perf_counter_task(struct task_struct *task, int new) return; task_event = (struct perf_task_event){ - .task = task, - .event = { + .task = task, + .task_ctx = task_ctx, + .event = { .header = { .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, @@ -2956,7 +2957,7 @@ static void perf_counter_task(struct task_struct *task, int new) void perf_counter_fork(struct task_struct *task) { - perf_counter_task(task, 1); + perf_counter_task(task, NULL, 1); } /* @@ -4310,7 +4311,7 @@ void perf_counter_exit_task(struct task_struct *child) unsigned long flags; if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, 0); + perf_counter_task(child, NULL, 0); return; } @@ -4330,6 +4331,7 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); + child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -4343,9 +4345,7 @@ void perf_counter_exit_task(struct task_struct *child) * won't get any samples after PERF_EVENT_EXIT. We can however still * get a few PERF_EVENT_READ events. 
*/ - perf_counter_task(child, 0); - - child->perf_counter_ctxp = NULL; + perf_counter_task(child, child_ctx, 0); /* * We can recurse on the same lock type through: -- cgit v1.2.3 From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:16:52 +0200 Subject: perf_counter: Correct PERF_SAMPLE_RAW output PERF_SAMPLE_* output switches should unconditionally output the correct format, as they are the only way to unambiguously parse the PERF_EVENT_SAMPLE data. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896447.17467.74.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 546e62d6294..5229d1666fa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } if (sample_type & PERF_SAMPLE_RAW) { - raw = data->raw; - if (raw) - header.size += raw->size; + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header.size += size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_RAW) && raw) - perf_output_copy(&handle, raw->data, raw->size); + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(&handle, data->raw->size); + 
perf_output_copy(&handle, data->raw->data, data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(&handle, raw); + } + } perf_output_end(&handle); } -- cgit v1.2.3 From a4e95fc2cbb31d70a65beffeaf8773f881328c34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:20:12 +0200 Subject: perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data Raw tracepoint data contains various kernel internals and data from other users, so restrict this to CAP_SYS_ADMIN. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896452.17467.75.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5229d1666fa..b0b20a07f39 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3787,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (ftrace_profile_enable(counter->attr.config)) return NULL; -- cgit v1.2.3