From f73be6dedf4fa058ce80846dae604b08fa805ca1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 Aug 2008 17:07:14 -0700 Subject: smp: have smp_call_function_single() detect invalid CPUs Have smp_call_function_single() return invalid CPU indicies and return -ENXIO. This function is already executed inside a get_cpu()..put_cpu() which locks out CPU removal, so rather than having the higher layers doing another layer of locking to guard against unplugged CPUs do the test here. Signed-off-by: H. Peter Anvin --- kernel/smp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 782e2b93e46..f362a855377 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -210,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, { struct call_single_data d; unsigned long flags; - /* prevent preemption and reschedule on another processor */ + /* prevent preemption and reschedule on another processor, + as well as CPU removal */ int me = get_cpu(); + int err = 0; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -220,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, local_irq_save(flags); func(info); local_irq_restore(flags); - } else { + } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { struct call_single_data *data = NULL; if (!wait) { @@ -236,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, data->func = func; data->info = info; generic_exec_single(cpu, data); + } else { + err = -ENXIO; /* CPU not online */ } put_cpu(); - return 0; + return err; } EXPORT_SYMBOL(smp_call_function_single); -- cgit v1.2.3 From 2189459d25a47401c69a17794c9d390c890351f9 Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Mon, 25 Aug 2008 17:15:33 -0400 Subject: lockstat: fix numerical output rounding error Fix rounding error in /proc/lock_stat numerical output. On occasion the two digit fractional part contains the three digit value '100'. This is due to a bug in the rounding algorithm which pushes values in the range '95..99' to '100' rather than to '00' + an increment to the integer part. For example, - 123456.100 old display + 123457.00 new display Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 4b194d34d77..20dbcbf9c7d 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -472,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) { unsigned long rem; + nr += 5; /* for display rounding */ rem = do_div(nr, 1000); /* XXX: do_div_signed */ - snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); + snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); } static void seq_time(struct seq_file *m, s64 time) -- cgit v1.2.3 From 04148b73b89d49fe0fe201bcee395e51f7d637ce Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Mon, 25 Aug 2008 17:16:23 -0400 Subject: lockstat: repair erronous contention statistics Fix bad contention counting in /proc/lock_stat. /proc/lockstat tries to gather per-ip contention statistics per-lock. This was failing due to a garbage per-ip index selector being used. Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3bfb1877a00..b5db51d2803 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3029,7 +3029,7 @@ found_it: stats = get_lock_stats(hlock_class(hlock)); if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[i]++; + stats->contention_point[point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); -- cgit v1.2.3 From 74870172824a78640ec4f03058d9bd35dfa08618 Mon Sep 17 00:00:00 2001 From: Zhu Yi Date: Wed, 27 Aug 2008 14:33:00 +0800 Subject: lockdep: fix invalid list_del_rcu in zap_class The problem is found during iwlagn driver testing on v2.6.27-rc4-176-gb8e6c91 kernel, but it turns out to be a lockdep bug. In our testing, we frequently load and unload the iwlagn driver (>50 times). Then the MAX_STACK_TRACE_ENTRIES is reached (expected behaviour?). The error message with the call trace is as below. BUG: MAX_STACK_TRACE_ENTRIES too low! turning off the locking correctness validator. Pid: 4895, comm: iwlagn Not tainted 2.6.27-rc4 #13 Call Trace: [] save_stack_trace+0x22/0x3e [] save_trace+0x8b/0x91 [] mark_lock+0x1b0/0x8fa [] __lock_acquire+0x5b9/0x716 [] ieee80211_sta_work+0x0/0x6ea [mac80211] [] lock_acquire+0x52/0x6b [] run_workqueue+0x97/0x1ed [] run_workqueue+0xe7/0x1ed [] run_workqueue+0x97/0x1ed [] worker_thread+0xd8/0xe3 [] autoremove_wake_function+0x0/0x2e [] worker_thread+0x0/0xe3 [] kthread+0x47/0x73 [] trace_hardirqs_on_thunk+0x3a/0x3f [] child_rip+0xa/0x11 [] restore_args+0x0/0x30 [] finish_task_switch+0x0/0xcc [] kthread+0x0/0x73 [] child_rip+0x0/0x11 Although the above is harmless, when the ilwagn module is removed later lockdep will trigger a kernel oops as below. BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] zap_class+0x24/0x82 PGD 73128067 PUD 7448c067 PMD 0 Oops: 0002 [1] SMP CPU 0 Modules linked in: rfcomm l2cap bluetooth autofs4 sunrpc nf_conntrack_ipv6 xt_state nf_conntrack xt_tcpudp ip6t_ipv6header ip6t_REJECT ip6table_filter ip6_tables x_tables ipv6 cpufreq_ondemand acpi_cpufreq dm_mirror dm_log dm_multipath dm_mod snd_hda_intel sr_mod snd_seq_dummy snd_seq_oss snd_seq_midi_event battery snd_seq snd_seq_device cdrom button snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd_page_alloc e1000e snd_hwdep sg iTCO_wdt iTCO_vendor_support ac pcspkr i2c_i801 i2c_core snd soundcore video output ata_piix ata_generic libata sd_mod scsi_mod ext3 jbd mbcache uhci_hcd ohci_hcd ehci_hcd [last unloaded: mac80211] Pid: 4941, comm: modprobe Not tainted 2.6.27-rc4 #10 RIP: 0010:[] [] zap_class+0x24/0x82 RSP: 0000:ffff88007bcb3eb0 EFLAGS: 00010046 RAX: 0000000000068ee8 RBX: ffffffff8192a0a0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000001dfb RDI: ffffffff816e70b0 RBP: ffffffffa00cd000 R08: ffffffff816818f8 R09: ffff88007c923558 R10: ffffe20002ad2408 R11: ffffffff811028ec R12: ffffffff8192a0a0 R13: 000000000002bd90 R14: 0000000000000000 R15: 0000000000000296 FS: 00007f9d1cee56f0(0000) GS:ffffffff814a58c0(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000008 CR3: 0000000073047000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process modprobe (pid: 4941, threadinfo ffff88007bcb2000, task ffff8800758d1fc0) Stack: ffffffff81057376 0000000000000000 ffffffffa00f7b00 0000000000000000 0000000000000080 0000000000618278 00007fff24f16720 0000000000000000 ffffffff8105d37a ffffffffa00f7b00 ffffffff8105d591 313132303863616d Call Trace: [] ? lockdep_free_key_range+0x61/0xf5 [] ? free_module+0xd4/0xe4 [] ? sys_delete_module+0x1de/0x1f9 [] ? audit_syscall_entry+0x12d/0x160 [] ? system_call_fastpath+0x16/0x1b Code: b2 00 01 00 00 00 c3 31 f6 49 c7 c0 10 8a 61 81 eb 32 49 39 38 75 26 48 98 48 6b c0 38 48 8b 90 08 8a 61 81 48 8b 88 00 8a 61 81 <48> 89 51 08 48 89 0a 48 c7 80 08 8a 61 81 00 02 20 00 48 ff c6 RIP [] zap_class+0x24/0x82 RSP CR2: 0000000000000008 ---[ end trace a1297e0c4abb0f2e ]--- The root cause for this oops is in add_lock_to_list() when save_trace() fails due to MAX_STACK_TRACE_ENTRIES is reached, entry->class is assigned but entry is never added into any lock list. This makes the list_del_rcu() in zap_class() oops later when the module is unloaded. This patch fixes the problem by assigning entry->class after save_trace() returns success. Signed-off-by: Zhu Yi Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b5db51d2803..dbda475b13b 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -875,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, if (!entry) return 0; - entry->class = this; - entry->distance = distance; if (!save_trace(&entry->trace)) return 0; + entry->class = this; + entry->distance = distance; /* * Since we never remove from the dependency list, the list can * be walked lockless by other CPUs, it's only allocation -- cgit v1.2.3 From 2633f0e57b1127f4060d70bf460140dc9bb19386 Mon Sep 17 00:00:00 2001 From: Steve VanDeBogart Date: Tue, 26 Aug 2008 15:14:36 -0700 Subject: exit signals: use of uninitialized field notify_count task->signal->notify_count is only initialized if task->signal->group_exit_task is not NULL. Reorder a conditional so that uninitialised memory is not used. Found by Valgrind. Signed-off-by: Steve VanDeBogart Signed-off-by: Ingo Molnar --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 38ec4063014..75c64738763 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -918,8 +918,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && - tsk->signal->notify_count < 0 && - tsk->signal->group_exit_task) + tsk->signal->group_exit_task && + tsk->signal->notify_count < 0) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); -- cgit v1.2.3 From f42ac38c59e0a03d6da0c24a63fb211393f484b0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Aug 2008 09:14:40 -0400 Subject: ftrace: disable tracing for suspend to ram I've been painstakingly debugging the issue with suspend to ram and ftraced. The 2.6.28 code does not have this issue, but since the mcount recording is not going to be in 27, this must be solved for the ftrace daemon version. The resume from suspend to ram would reboot because it was triple faulting. Debugging further, I found that calling the mcount function itself was not an issue, but it would fault when it incremented preempt_count. preempt_count is on the tasks info structure that is on the low memory address of the task's stack. For some reason, it could not write to it. Resuming out of suspend to ram does quite a lot of funny tricks to get to work, so it is not surprising at all that simply doing a preempt_disable() would cause a fault. Thanks to Rafael for suggesting to add a "while (1);" to find the place in resuming that is causing the fault. I would place the loop somewhere in the code, compile and reboot and see if it would either reboot (hit the fault) or simply hang (hit the loop). Doing this over and over again, I narrowed it down that it was happening in enable_nonboot_cpus. At this point, I found that it is easier to simply disable tracing around the suspend code, instead of searching for the particular function that can not handle doing a preempt_disable. This patch disables the tracer as it suspends and reenables it on resume. I tested this patch on my Laptop, and it can resume fine with the patch. Signed-off-by: Steven Rostedt Acked-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- kernel/power/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 0b7476f5d2a..540b16b6856 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "power.h" @@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state) */ int suspend_devices_and_enter(suspend_state_t state) { - int error; + int error, ftrace_save; if (!suspend_ops) return -ENOSYS; @@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_save = __ftrace_enabled_save(); suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { @@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); device_resume(PMSG_RESUME); suspend_test_finish("resume devices"); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: if (suspend_ops->end) -- cgit v1.2.3 From f3ade837808121ff8bab9c56725f4fe40ec85a56 Mon Sep 17 00:00:00 2001 From: John Blackwood Date: Tue, 26 Aug 2008 15:09:43 -0400 Subject: sched: fix sched_rt_rq_enqueue() resched idle When sysctl_sched_rt_runtime is set to something other than -1 and the CONFIG_RT_GROUP_SCHED kernel parameter is NOT enabled, we get into a state where we see one or more CPUs idling forvever even though there are real-time tasks in their rt runqueue that are able to run (no longer throttled). The sequence is: - A real-time task is running when the timer sets the rt runqueue to throttled, and the rt task is resched_task()ed and switched out, and idle is switched in since there are no non-rt tasks to run on that cpu. - Eventually the do_sched_rt_period_timer() runs and un-throttles the rt runqueue, but we just exit the timer interrupt and go back to executing the idle task in the idle loop forever. If we change the sched_rt_rq_enqueue() routine to use some of the code from the CONFIG_RT_GROUP_SCHED enabled version of this same routine and resched_task() the currently executing task (idle in our case) if it is a lower priority task than the higher rt task in the now un-throttled runqueue, the problem is no longer observed. Signed-off-by: John Blackwood Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 998ba54b454..07d9b330790 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + if (rt_rq->rt_nr_running) + resched_task(rq_of_rt_rq(rt_rq)->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -- cgit v1.2.3 From cc2991cf15ae92fa30b3ea9f56a8a5a337bd33c7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:03 +0200 Subject: sched: rt-bandwidth accounting fix It fixes an accounting bug where we would continue accumulating runtime even though the bandwidth control is disabled. This would lead to very long throttle periods once bandwidth control gets turned on again. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 07d9b330790..552310798da 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -440,9 +440,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@ -493,9 +490,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + } spin_unlock(&rt_rq->rt_runtime_lock); } } -- cgit v1.2.3 From 41108eb10142e0552f2de1e4c0675b108c5f018f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Aug 2008 14:39:12 +0200 Subject: ftrace: disable tracing for hibernation In accordance with commit f42ac38c59e0a03d6da0c24a63fb211393f484b0 ("ftrace: disable tracing for suspend to ram"), disable tracing around the suspend code in hibernation code paths. Signed-off-by: Rafael J. Wysocki Acked-by: Steven Rostedt Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f011e0870b5..bbd85c60f74 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "power.h" @@ -255,7 +256,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { - int error; + int error, ftrace_save; /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); @@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: platform_end(platform_mode); @@ -366,10 +369,11 @@ static int resume_target_kernel(void) int hibernation_restore(int platform_mode) { - int error; + int error, ftrace_save; pm_prepare_console(); suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode) platform_restore_cleanup(platform_mode); device_resume(PMSG_RECOVER); Finish: + __ftrace_enabled_restore(ftrace_save); resume_console(); pm_restore_console(); return error; @@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { - int error; + int error, ftrace_save; if (!hibernation_ops) return -ENOSYS; @@ -411,6 +416,7 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -445,6 +451,7 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: hibernation_ops->end(); -- cgit v1.2.3 From 316d9679f33caf7e683471647d1472bfe133d858 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Aug 2008 20:06:23 +0200 Subject: Don't trigger softlockup detector on network fs blocked tasks Pulling the ethernet cable on a 2.6.27-rc system with NFS mounts currently leads to an ongoing flood of soft lockup detector backtraces for all tasks blocked on the NFS mounts when the hickup takes longer than 120s. I don't think NFS problems should be all that noisy. Luckily there's a reasonably easy way to distingush this case. Don't report task softlockup warnings for tasks in TASK_KILLABLE state, which is used by the network file systems. I believe this patch is a 2.6.27 candidate. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/softlockup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b75b492fbfc..1a07f8ca4b9 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -180,6 +180,10 @@ static void check_hung_task(struct task_struct *t, unsigned long now) if (t->flags & PF_FROZEN) return; + /* Don't check for tasks waiting on network file systems like NFS */ + if (t->state & TASK_KILLABLE) + return; + if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { t->last_switch_count = switch_count; t->last_switch_timestamp = now; -- cgit v1.2.3 From bef69ea0dcce574a425feb0a5aa4c63dd108b9a6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 29 Aug 2008 20:18:31 -0700 Subject: Resource handling: add 'insert_resource_expand_to_fit()' function Not used anywhere yet, but this complements the existing plain 'insert_resource()' functionality with a version that can expand the resource we are adding in order to fix up any conflicts it has with existing resources. Signed-off-by: Linus Torvalds --- kernel/resource.c | 88 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index f5b518eabef..cf0a178c751 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new, EXPORT_SYMBOL(allocate_resource); -/** - * insert_resource - Inserts a resource in the resource tree - * @parent: parent of the new resource - * @new: new resource to insert - * - * Returns 0 on success, -EBUSY if the resource can't be inserted. - * - * This function is equivalent to request_resource when no conflict - * happens. If a conflict happens, and the conflicting resources - * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become children of - * the new resource. +/* + * Insert a resource into the resource tree. If successful, return NULL, + * otherwise return the conflicting resource (compare to __request_resource()) */ -int insert_resource(struct resource *parent, struct resource *new) +static struct resource * __insert_resource(struct resource *parent, struct resource *new) { - int result; struct resource *first, *next; - write_lock(&resource_lock); - for (;; parent = first) { - result = 0; first = __request_resource(parent, new); if (!first) - goto out; + return first; - result = -EBUSY; if (first == parent) - goto out; + return first; if ((first->start > new->start) || (first->end < new->end)) break; @@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new) for (next = first; ; next = next->sibling) { /* Partial overlap? Bad, and unfixable */ if (next->start < new->start || next->end > new->end) - goto out; + return next; if (!next->sibling) break; if (next->sibling->start > new->end) break; } - result = 0; - new->parent = parent; new->sibling = next->sibling; new->child = first; @@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new) next = next->sibling; next->sibling = new; } + return NULL; +} - out: +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is equivalent to request_resource when no conflict + * happens. If a conflict happens, and the conflicting resources + * entirely fit within the range of the new resource, then the new + * resource is inserted and the conflicting resources become children of + * the new resource. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __insert_resource(parent, new); + write_unlock(&resource_lock); + return conflict ? -EBUSY : 0; +} + +/** + * insert_resource_expand_to_fit - Insert a resource into the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Insert a resource into the resource tree, possibly expanding it in order + * to make it encompass any conflicting resources. + */ +void insert_resource_expand_to_fit(struct resource *root, struct resource *new) +{ + if (new->parent) + return; + + write_lock(&resource_lock); + for (;;) { + struct resource *conflict; + + conflict = __insert_resource(root, new); + if (!conflict) + break; + if (conflict == root) + break; + + /* Ok, expand resource to cover the conflict, then try again .. */ + if (conflict->start < new->start) + new->start = conflict->start; + if (conflict->end > new->end) + new->end = conflict->end; + + printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); + } write_unlock(&resource_lock); - return result; } /** -- cgit v1.2.3 From c4bacefb7aaf49da11a695f29d85d40909f17693 Mon Sep 17 00:00:00 2001 From: Cordelia Date: Mon, 18 Aug 2008 09:45:51 -0700 Subject: [PATCH] audit: Moved variable declaration to beginning of function got rid of compilation warning: ISO C90 forbids mixed declarations and code Signed-off-by: Cordelia Sam Signed-off-by: Al Viro --- kernel/auditsc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 972f8e61d36..59cedfb040e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -243,10 +243,11 @@ static inline int open_arg(int flags, int mask) static int audit_match_perm(struct audit_context *ctx, int mask) { + unsigned n; if (unlikely(!ctx)) return 0; - unsigned n = ctx->major; + n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ if ((mask & AUDIT_PERM_WRITE) && -- cgit v1.2.3 From 6781f4ae30bbb8ebf31187b3c9304be16966f5a0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Aug 2008 20:31:55 -0700 Subject: kernel/resource.c: fix new kernel-doc warning Fix kernel-doc warning for new function: Warning(linux-2.6.27-rc5-git2//kernel/resource.c:448): No description found for parameter 'root' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index cf0a178c751..03d796c1b2e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -438,7 +438,7 @@ int insert_resource(struct resource *parent, struct resource *new) /** * insert_resource_expand_to_fit - Insert a resource into the resource tree - * @parent: parent of the new resource + * @root: root resource descriptor * @new: new resource to insert * * Insert a resource into the resource tree, possibly expanding it in order -- cgit v1.2.3 From cbaed698f37494b30b2449b51c728ae48630cb2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 30 Aug 2008 21:08:40 +0400 Subject: softlockup: minor cleanup, don't check task->state twice The recent commit 16d9679f33caf7e683471647d1472bfe133d858 changed check_hung_task() to filter out the TASK_KILLABLE tasks. We can move this check to the caller which has to test t->state anyway. Signed-off-by: Oleg Nesterov Acked-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/softlockup.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 1a07f8ca4b9..cb838ee93a8 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -180,10 +180,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now) if (t->flags & PF_FROZEN) return; - /* Don't check for tasks waiting on network file systems like NFS */ - if (t->state & TASK_KILLABLE) - return; - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { t->last_switch_count = switch_count; t->last_switch_timestamp = now; @@ -237,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu) do_each_thread(g, t) { if (!--max_count) goto unlock; - if (t->state & TASK_UNINTERRUPTIBLE) + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, now); } while_each_thread(g, t); unlock: -- cgit v1.2.3 From add0d4dfd660e9e4fd0af3eac3cad23583c9558f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 2 Sep 2008 14:35:48 -0700 Subject: pid_ns: zap_pid_ns_processes: fix the ->child_reaper changing zap_pid_ns_processes() sets pid_ns->child_reaper = NULL, this is wrong. Yes, we have already killed all tasks in this namespace, and sys_wait4() doesn't see any child. But this doesn't mean ->children list is empty, we may have EXIT_DEAD tasks which are not visible to do_wait(). In that case the subsequent forget_original_parent() will crash the kernel because it will try to re-parent these tasks to the NULL reaper. Even if there are no childs, it is not good that forget_original_parent() uses reaper == NULL. Change the code to set ->child_reaper = init_pid_ns.child_reaper instead. We could use pid_ns->parent->child_reaper as well, I think this does not really matter. These EXIT_DEAD tasks are not visible to the new ->parent after re-parenting, they will silently do release_task() eventually. Note that we must change ->child_reaper, otherwise forget_original_parent() will use reaper == father, and in that case we will hit the (correct) BUG_ON(!list_empty(&father->children)). Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Acked-by: Sukadev Bhattiprolu Acked-by: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ea567b78d1a..598f1eec982 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -179,9 +179,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); - - /* Child reaper for the pid namespace is going away */ - pid_ns->child_reaper = NULL; + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; acct_exit_ns(pid_ns); return; } -- cgit v1.2.3 From 950bbabb5a804690a0201190de5c22837f72f83f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 2 Sep 2008 14:35:49 -0700 Subject: pid_ns: (BUG 11391) change ->child_reaper when init->group_leader exits We don't change pid_ns->child_reaper when the main thread of the subnamespace init exits. As Robert Rex pointed out this is wrong. Yes, the re-parenting itself works correctly, but if the reparented task exits it needs ->parent->nsproxy->pid_ns in do_notify_parent(), and if the main thread is zombie its ->nsproxy was already cleared by exit_task_namespaces(). Introduce the new function, find_new_reaper(), which finds the new ->parent for the re-parenting and changes ->child_reaper if needed. Kill the now unneeded exit_child_reaper(). Also move the changing of ->child_reaper from zap_pid_ns_processes() to find_new_reaper(), this consolidates the games with ->child_reaper and makes it stable under tasklist_lock. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=11391 Reported-by: Robert Rex Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Acked-by: Pavel Emelyanov Acked-by: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 78 ++++++++++++++++++++++---------------------------- kernel/pid_namespace.c | 6 ---- 2 files changed, 34 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 75c64738763..25ed2ad986d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -831,26 +831,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father) * the child reaper process (ie "init") in our pid * space. */ +static struct task_struct *find_new_reaper(struct task_struct *father) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *thread; + + thread = father; + while_each_thread(father, thread) { + if (thread->flags & PF_EXITING) + continue; + if (unlikely(pid_ns->child_reaper == father)) + pid_ns->child_reaper = thread; + return thread; + } + + if (unlikely(pid_ns->child_reaper == father)) { + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) + panic("Attempted to kill init!"); + + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; + } + + return pid_ns->child_reaper; +} + static void forget_original_parent(struct task_struct *father) { - struct task_struct *p, *n, *reaper = father; + struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); write_lock_irq(&tasklist_lock); - + reaper = find_new_reaper(father); /* * First clean up ptrace if we were using it. */ ptrace_exit(father, &ptrace_dead); - do { - reaper = next_thread(reaper); - if (reaper == father) { - reaper = task_child_reaper(father); - break; - } - } while (reaper->flags & PF_EXITING); - list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; if (p->parent == father) { @@ -959,39 +983,6 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -static inline void exit_child_reaper(struct task_struct *tsk) -{ - if (likely(tsk->group_leader != task_child_reaper(tsk))) - return; - - if (tsk->nsproxy->pid_ns == &init_pid_ns) - panic("Attempted to kill init!"); - - /* - * @tsk is the last thread in the 'cgroup-init' and is exiting. - * Terminate all remaining processes in the namespace and reap them - * before exiting @tsk. - * - * Note that @tsk (last thread of cgroup-init) may not necessarily - * be the child-reaper (i.e main thread of cgroup-init) of the - * namespace i.e the child_reaper may have already exited. - * - * Even after a child_reaper exits, we let it inherit orphaned children, - * because, pid_ns->child_reaper remains valid as long as there is - * at least one living sub-thread in the cgroup init. - - * This living sub-thread of the cgroup-init will be notified when - * a child inherited by the 'child-reaper' exits (do_notify_parent() - * uses __group_send_sig_info()). Further, when reaping child processes, - * do_wait() iterates over children of all living sub threads. - - * i.e even though 'child_reaper' thread is listed as the parent of the - * orphaned children, any living sub-thread in the cgroup-init can - * perform the role of the child_reaper. - */ - zap_pid_ns_processes(tsk->nsproxy->pid_ns); -} - NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@ -1051,7 +1042,6 @@ NORET_TYPE void do_exit(long code) } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - exit_child_reaper(tsk); hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 598f1eec982..fab8ea86fac 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -179,12 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. - */ - pid_ns->child_reaper = init_pid_ns.child_reaper; acct_exit_ns(pid_ns); return; } -- cgit v1.2.3 From 9d3593574702ae1899e23a1535da1ac71f928042 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Tue, 2 Sep 2008 14:36:13 -0700 Subject: pm_qos_requirement might sleep Make PM_QOS and CPU_IDLE play nicer when run with the RT-Preempt kernel. The purpose of the patch is to remove the spin_lock around the read in the function pm_qos_requirement - since spinlocks can sleep in -rt and this function is called from idle. CPU_IDLE polls the target_value's of some of the pm_qos parameters from the idle loop causing sleeping locking warnings. Changing the target_value to an atomic avoids this issue. Remove the spinlock in pm_qos_requirement by making target_value an atomic type. Signed-off-by: mark gross Signed-off-by: John Kacur Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pm_qos_params.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index da9c2dda6a4..dfdec524d1b 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -43,7 +43,7 @@ #include /* - * locking rule: all changes to target_value or requirements or notifiers lists + * locking rule: all changes to requirements or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ @@ -66,7 +66,7 @@ struct pm_qos_object { struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - s32 target_value; + atomic_t target_value; s32 (*comparitor)(s32, s32); }; @@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = 2000 * USEC_PER_SEC, + .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), .comparitor = min_compare }; @@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = { .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = 2000 * USEC_PER_SEC, + .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), .comparitor = min_compare }; @@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = { .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = 0, + .target_value = ATOMIC_INIT(0), .comparitor = max_compare }; @@ -150,11 +150,11 @@ static void update_target(int target) extreme_value = pm_qos_array[target]->comparitor( extreme_value, node->value); } - if (pm_qos_array[target]->target_value != extreme_value) { + if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { call_notifier = 1; - pm_qos_array[target]->target_value = extreme_value; + atomic_set(&pm_qos_array[target]->target_value, extreme_value); pr_debug(KERN_ERR "new target for qos %d is %d\n", target, - pm_qos_array[target]->target_value); + atomic_read(&pm_qos_array[target]->target_value)); } spin_unlock_irqrestore(&pm_qos_lock, flags); @@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_requirement(int pm_qos_class) { - int ret_val; - unsigned long flags; - - spin_lock_irqsave(&pm_qos_lock, flags); - ret_val = pm_qos_array[pm_qos_class]->target_value; - spin_unlock_irqrestore(&pm_qos_lock, flags); - - return ret_val; + return atomic_read(&pm_qos_array[pm_qos_class]->target_value); } EXPORT_SYMBOL_GPL(pm_qos_requirement); -- cgit v1.2.3