From 09f2724a786f76475ef2985cf84f5359c553aade Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Wed, 14 Aug 2030 15:56:40 +0800 Subject: sched: fix the race between walk_tg_tree and sched_create_group With 2.6.27-rc3, I hit a kernel panic when running volanoMark on my new x86_64 machine. I also hit it with other 2.6.27-rc kernels. See below log. Basically, function walk_tg_tree and sched_create_group have a race between accessing and initiating tg->children. Below patch fixes it by moving tg->children initiation to the front of linking tg->siblings to parent->children. {----------------panic log------------} BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 IP: [] walk_tg_tree+0x45/0x7f PGD 1be1c4067 PUD 1bdd8d067 PMD 0 Oops: 0000 [1] SMP CPU 11 Modules linked in: igb Pid: 22979, comm: java Not tainted 2.6.27-rc3 #1 RIP: 0010:[] [] walk_tg_tree+0x45/0x7f RSP: 0018:ffff8801bfbbbd18 EFLAGS: 00010083 RAX: 0000000000000000 RBX: ffff8800be0dce40 RCX: ffffffffffffffc0 RDX: ffff880102c43740 RSI: 0000000000000000 RDI: ffff8800be0dce40 RBP: ffff8801bfbbbd48 R08: ffff8800ba437bc8 R09: 0000000000001f40 R10: ffff8801be812100 R11: ffffffff805fdf44 R12: ffff880102c43740 R13: 0000000000000000 R14: ffffffff8022cf0f R15: ffffffff8022749f FS: 00000000568ac950(0063) GS:ffff8801bfa26d00(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 00000001bd848000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process java (pid: 22979, threadinfo ffff8801b145a000, task ffff8801bf18e450) Stack: 0000000000000001 ffff8800ba5c8d60 0000000000000001 0000000000000001 ffff8800bad1ccb8 0000000000000000 ffff8801bfbbbd98 ffffffff8022ed37 0000000000000001 0000000000000286 ffff8801bd5ee180 ffff8800ba437bc8 Call Trace: [] try_to_wake_up+0x71/0x24c [] autoremove_wake_function+0x9/0x2e [] ? __wake_up_common+0x46/0x76 [] __wake_up+0x38/0x4f [] tcp_v4_rcv+0x380/0x62e Signed-off-by: Zhang Yanmin Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d601fb0406c..8bf8a5528bc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8462,8 +8462,8 @@ struct task_group *sched_create_group(struct task_group *parent) WARN_ON(!parent); /* root should already exist */ tg->parent = parent; - list_add_rcu(&tg->siblings, &parent->children); INIT_LIST_HEAD(&tg->children); + list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); return tg; -- cgit v1.2.3 From be4de35263f59ca1f4740edfffbfb02cc3f2189e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 15 Aug 2008 00:40:44 -0700 Subject: completions: uninline try_wait_for_completion and completion_done m68k fails to build with these functions inlined in completion.h. Move them out of line into sched.c and export them to avoid this problem. Signed-off-by: Dave Chinner Cc: Geert Uytterhoeven Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d601fb0406c..95e6ad3c231 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4669,6 +4669,52 @@ int __sched wait_for_completion_killable(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_killable); +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Returns: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Returns: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(completion_done); + static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { -- cgit v1.2.3 From 55cd53404c5cc5fd94708232e3b4aa4a9388917b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Aug 2008 08:54:26 +0200 Subject: sched: scale sysctl_sched_shares_ratelimit with nr_cpus David reported that his Niagra spend a little too much time in tg_shares_up(), which considering he has a large cpu count makes sense. So scale the ratelimit value with the number of cpus like we do for other controls as well. Reported-by: David Miller Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8bf8a5528bc..040807196b3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -808,9 +808,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; /* * ratelimit for updating the group shares. - * default: 0.5ms + * default: 0.25ms */ -const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; +unsigned int sysctl_sched_shares_ratelimit = 250000; /* * period over which we measure -rt task cpu usage in us. @@ -5740,6 +5740,8 @@ static inline void sched_init_granularity(void) sysctl_sched_latency = limit; sysctl_sched_wakeup_granularity *= factor; + + sysctl_sched_shares_ratelimit *= factor; } #ifdef CONFIG_SMP -- cgit v1.2.3