From 14819ea1e0bcbdc9b084cd60a6a24d5d786324ef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 14 Jan 2009 12:34:21 +0100 Subject: irq: export __set_irq_handler() and handle_level_irq() Impact: build fix ARM updates broke x86 allmodconfig builds: ERROR: "__set_irq_handler" [drivers/mfd/pcf50633-core.ko] undefined! ERROR: "handle_level_irq" [drivers/mfd/pcf50633-core.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f63c706d25e..7de11bd64df 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -383,6 +383,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) out_unlock: spin_unlock(&desc->lock); } +EXPORT_SYMBOL_GPL(handle_level_irq); /** * handle_fasteoi_irq - irq handler for transparent controllers @@ -593,6 +594,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, } spin_unlock_irqrestore(&desc->lock, flags); } +EXPORT_SYMBOL_GPL(__set_irq_handler); void set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, -- cgit v1.2.3 From f90d4118bacef87894621a3e8aba853fa0c89abc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 16 Jan 2009 10:24:10 +0800 Subject: cpuset: fix possible deadlock in async_rebuild_sched_domains Lockdep reported some possible circular locking info when we tested cpuset on NUMA/fake NUMA box. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.29-rc1-00224-ga652504 #111 ------------------------------------------------------- bash/2968 is trying to acquire lock: (events){--..}, at: [] flush_work+0x24/0xd8 but task is already holding lock: (cgroup_mutex){--..}, at: [] cgroup_lock_live_group+0x12/0x29 which lock already depends on the new lock. ...... ------------------------------------------------------- Steps to reproduce: # mkdir /dev/cpuset # mount -t cpuset xxx /dev/cpuset # mkdir /dev/cpuset/0 # echo 0 > /dev/cpuset/0/cpus # echo 0 > /dev/cpuset/0/mems # echo 1 > /dev/cpuset/0/memory_migrate # cat /dev/zero > /dev/null & # echo $! > /dev/cpuset/0/tasks This is because async_rebuild_sched_domains has the following lock sequence: run_workqueue(async_rebuild_sched_domains) -> do_rebuild_sched_domains -> cgroup_lock But, attaching tasks when memory_migrate is set has following: cgroup_lock_live_group(cgroup_tasks_write) -> do_migrate_pages -> flush_work This patch fixes it by using a separate workqueue thread. Signed-off-by: Miao Xie Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a85678865c5..f76db9dcaa0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -60,6 +60,14 @@ #include #include +/* + * Workqueue for cpuset related tasks. + * + * Using kevent workqueue may cause deadlock when memory_migrate + * is set. So we create a separate workqueue thread for cpuset. + */ +static struct workqueue_struct *cpuset_wq; + /* * Tracks how many cpusets are currently defined in system. * When there is only one cpuset (the root cpuset) we can @@ -831,7 +839,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); */ static void async_rebuild_sched_domains(void) { - schedule_work(&rebuild_sched_domains_work); + queue_work(cpuset_wq, &rebuild_sched_domains_work); } /* @@ -2111,6 +2119,9 @@ void __init cpuset_init_smp(void) hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); + + cpuset_wq = create_singlethread_workqueue("cpuset"); + BUG_ON(!cpuset_wq); } /** -- cgit v1.2.3 From 082605de5f82eb692cc90f7fda071cc01bb5ac34 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Jan 2009 14:32:51 -0500 Subject: ring-buffer: fix alignment problem Impact: fix to allow some archs to use the ring buffer Commits in the ring buffer are checked by pointer arithmetic. If the calculation is incorrect, then the commits will never take place and the buffer will simply fill up and report an error. Each page in the ring buffer has a small header: struct buffer_data_page { u64 time_stamp; local_t commit; unsigned char data[]; }; Unfortuntely, some of the calculations used sizeof(struct buffer_data_page) to know the size of the header. But this is incorrect on some archs, where sizeof(struct buffer_data_page) does not equal offsetof(struct buffer_data_page, data), and on those archs, the commits are never processed. This patch replaces the sizeof with offsetof. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8b0daf0662e..1d6526361d0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -246,7 +246,7 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page)) +#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data)) /* * head_page == tail_page && head == tail then buffer is empty. -- cgit v1.2.3 From 00f57f545afa422db3003b0d0b30a30f8de7ecb2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Jan 2009 13:33:27 -0800 Subject: tracing/function-graph-tracer: fix a regression while suspend to disk Impact: fix a crash while kernel image restore When the function graph tracer is running and while suspend to disk, some racy and dangerous things happen against this tracer. The current task will save its registers including the stack pointer which contains the return address hooked by the tracer. But the current task will continue to enter other functions after that to save the memory, and then it will store other return addresses, and finally loose the old depth which matches the return address saved in the old stack (during the registers saving). So on image restore, the code will return to wrong addresses. And there are other things: on restore, the task will have it's "current" pointer overwritten during registers restoring....switching from one task to another... That would be insane to try to trace function graphs at these stages. This patch makes the function graph tracer listening on power events, making it's tracing disabled for the current task (the one that performs the hibernation work) while suspend/resume to disk, making the tracing safe during hibernation. Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2f32969c09d..7dcf6e9f2b0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1965,6 +1966,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static atomic_t ftrace_graph_active; +static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { @@ -2043,6 +2045,27 @@ static int start_graph_tracing(void) return ret; } +/* + * Hibernation protection. + * The state of the current task is too much unstable during + * suspend/restore to disk. We want to protect against that. + */ +static int +ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, + void *unused) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + pause_graph_tracing(); + break; + + case PM_POST_HIBERNATION: + unpause_graph_tracing(); + break; + } + return NOTIFY_DONE; +} + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -2050,6 +2073,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_sysctl_lock); + ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; + register_pm_notifier(&ftrace_suspend_notifier); + atomic_inc(&ftrace_graph_active); ret = start_graph_tracing(); if (ret) { @@ -2075,6 +2101,7 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); mutex_unlock(&ftrace_sysctl_lock); } -- cgit v1.2.3 From 551b4048b3d4acf15aff9fe4aed89b892c135b02 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 12 Jan 2009 11:06:18 +0800 Subject: ring_buffer: reset write when reserve buffer fail Impact: reset struct buffer_page.write when interrupt storm if struct buffer_page.write is not reset, any succedent committing will corrupted ring_buffer: static inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { ...... cpu_buffer->commit_page->commit = cpu_buffer->commit_page->write; ...... } when "if (RB_WARN_ON(cpu_buffer, next_page == reader_page))", ring_buffer is disabled, but some reserved buffers may haven't been committed. we need reset struct buffer_page.write. when "if (unlikely(next_page == cpu_buffer->commit_page))", ring_buffer is still available, we should not corrupt it. Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d6526361d0..9c1e73da4e3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1025,12 +1025,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, } if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) { - /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); + if (!(buffer->flags & RB_FL_OVERWRITE)) goto out_unlock; - } /* tail_page has not moved yet? */ if (tail_page == cpu_buffer->tail_page) { @@ -1105,6 +1101,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; out_unlock: + /* reset write */ + if (tail <= BUF_PAGE_SIZE) + local_set(&tail_page->write, tail); + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; -- cgit v1.2.3 From faf6861ebd776871e77b761c43ec045cd20b5716 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 Jan 2009 12:24:42 -0500 Subject: trace: print ftrace_dump at KERN_EMERG log level Impact: fix to print out ftrace_dump when expected I was debugging a hard race condition to only find out that after I hit the race, my log level was not at level to show KERN_INFO. The time it took to trigger the race was wasted because I did not capture the trace. Since ftrace_dump is only called from kernel oops (and only when it is set in the kernel command line to do so), or when a developer adds it to their own local tree, the log level of the print should be at KERN_EMERG to make sure the print appears. ftrace_dump is not called by a normal user setup, and will not add extra unwanted print out to the console. There is no reason it should be at KERN_INFO. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c580233add9..1a1c5a6ab24 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3736,7 +3736,7 @@ static struct notifier_block trace_die_notifier = { * it if we decide to change what log level the ftrace dump * should be at. */ -#define KERN_TRACE KERN_INFO +#define KERN_TRACE KERN_EMERG static void trace_printk_seq(struct trace_seq *s) -- cgit v1.2.3 From a442e5e0a2011af5b2d1f118fee0a8f9079f1d88 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 Jan 2009 14:50:19 -0500 Subject: trace: stop all recording to ring buffer on ftrace_dump Impact: limit ftrace dump output Currently ftrace_dump only calls ftrace_kill that is a fast way to prevent the function tracer functions from being called (just sets a flag and clears the function to call, nothing else). It is better to also turn off any recording to the ring buffers as well. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a1c5a6ab24..4d89e84f0f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3770,6 +3770,7 @@ void ftrace_dump(void) dump_ran = 1; /* No turning back! */ + tracing_off(); ftrace_kill(); for_each_tracing_cpu(cpu) { -- cgit v1.2.3 From 1092307d582a7566d23779c304cf86f3075ac5f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 23:40:11 -0500 Subject: trace: set max latency variable to zero on default Impact: trace max latencies on start of latency tracing This patch sets the max latency to zero whenever one of the irq variant tracers or the wakeup tracer is set to current tracer. Most developers expect to see output when starting up a latency tracer. But since the max_latency is already set to max, and it takes a latency greater than max_latency to be recorded, there is no trace. This is not the expected behavior and has even confused myself. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- kernel/trace/trace_irqsoff.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4d89e84f0f4..17bb88d86ac 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -40,7 +40,7 @@ #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) -unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; +unsigned long __read_mostly tracing_max_latency; unsigned long __read_mostly tracing_thresh; /* diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7c2e326bbc8..62a78d94353 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) static void __irqsoff_tracer_init(struct trace_array *tr) { + tracing_max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 43586b689e3..42ae1e77b6b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -333,6 +333,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int wakeup_tracer_init(struct trace_array *tr) { + tracing_max_latency = 0; wakeup_trace = tr; start_wakeup_tracer(tr); return 0; -- cgit v1.2.3 From 91a8d07d82cac3aae3ef2ea1aaba5c9c4a934e91 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 18:45:57 -0500 Subject: ring-buffer: reset timestamps when ring buffer is reset Impact: fix bad times of recent resets The ring buffer needs to reset its timestamps when reseting of the buffer, otherwise the timestamps are stale and might be used to calculate times in the buffer causing funny timestamps to appear. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c1e73da4e3..bd38c5cfd8a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2174,6 +2174,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->overrun = 0; cpu_buffer->entries = 0; + + cpu_buffer->write_stamp = 0; + cpu_buffer->read_stamp = 0; } /** -- cgit v1.2.3 From 3a9f84d354ce1e19956083c8e691727dea33bd5a Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Mon, 26 Jan 2009 15:33:31 -0800 Subject: signals, debug: fix BUG: using smp_processor_id() in preemptible code in print_fatal_signal() With print-fatal-signals=1 on a kernel with CONFIG_PREEMPT=y, sending an unexpected signal to a process causes a BUG: using smp_processor_id() in preemptible code. get_signal_to_deliver() releases the siglock before calling print_fatal_signal(), which calls show_regs(), which calls smp_processor_id(), which is not supposed to be called from a preemptible thread. Make sure show_regs() runs with preemption disabled. Signed-off-by: Ed Swierk Signed-off-by: Ingo Molnar --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e73759783dc..b6b36768b75 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -909,7 +909,9 @@ static void print_fatal_signal(struct pt_regs *regs, int signr) } #endif printk("\n"); + preempt_disable(); show_regs(regs); + preempt_enable(); } static int __init setup_print_fatal_signals(char *str) -- cgit v1.2.3 From baef99a08a2e23d9386b47e53fa5f0d44fc98f66 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 29 Jan 2009 14:25:10 -0800 Subject: cgroups: use hierarchy mutex in creation failure path Now, cgrp->sibling is handled under hierarchy mutex. error route should do so, too. Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Acked-by Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c29831076e7..2ae7cb47dbf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2434,7 +2434,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_remove: + cgroup_lock_hierarchy(root); list_del(&cgrp->sibling); + cgroup_unlock_hierarchy(root); root->number_of_cgroups--; err_destroy: -- cgit v1.2.3 From 1404f06565ee89e0ce04d4a5859c00b0e3a0dc8d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Jan 2009 14:25:21 -0800 Subject: cgroups: fix lock inconsistency in cgroup_clone() I fixed a bug in cgroup_clone() in Linus' tree in commit 7b574b7 ("cgroups: fix a race between cgroup_clone and umount") without noticing there was a cleanup patch in -mm tree that should be rebased (now commit 104cbd5, "cgroups: use task_lock() for access tsk->cgroups safe in cgroup_clone()"), thus resulted in lock inconsistency. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2ae7cb47dbf..0066092de19 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2993,20 +2993,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&cgroup_mutex); return 0; } - task_lock(tsk); - cg = tsk->cgroups; - parent = task_cgroup(tsk, subsys->subsys_id); /* Pin the hierarchy */ - if (!atomic_inc_not_zero(&parent->root->sb->s_active)) { + if (!atomic_inc_not_zero(&root->sb->s_active)) { /* We race with the final deactivate_super() */ mutex_unlock(&cgroup_mutex); return 0; } /* Keep the cgroup alive */ + task_lock(tsk); + parent = task_cgroup(tsk, subsys->subsys_id); + cg = tsk->cgroups; get_css_set(cg); task_unlock(tsk); + mutex_unlock(&cgroup_mutex); /* Now do the VFS work to create a cgroup */ @@ -3045,7 +3046,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&inode->i_mutex); put_css_set(cg); - deactivate_super(parent->root->sb); + deactivate_super(root->sb); /* The cgroup is still accessible in the VFS, but * we're not going to try to rmdir() it at this * point. */ @@ -3071,7 +3072,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_lock(&cgroup_mutex); put_css_set(cg); mutex_unlock(&cgroup_mutex); - deactivate_super(parent->root->sb); + deactivate_super(root->sb); return ret; } -- cgit v1.2.3 From 804b3c28a4e4fa1c224571bf76edb534b9c4b1ed Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Thu, 29 Jan 2009 14:25:21 -0800 Subject: cgroups: add cpu_relax() calls in css_tryget() and cgroup_clear_css_refs() css_tryget() and cgroup_clear_css_refs() contain polling loops; these loops should have cpu_relax calls in them to reduce cross-cache traffic. Signed-off-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0066092de19..492215d67fa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2509,7 +2509,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp) for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; int refcnt; - do { + while (1) { /* We can only remove a CSS with a refcnt==1 */ refcnt = atomic_read(&css->refcnt); if (refcnt > 1) { @@ -2523,7 +2523,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp) * css_tryget() to spin until we set the * CSS_REMOVED bits or abort */ - } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); + if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) + break; + cpu_relax(); + } } done: for_each_subsys(cgrp->root, ss) { -- cgit v1.2.3 From 839ec5452ebfd5905b9c69b20ceb640903a8ea1a Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Thu, 29 Jan 2009 14:25:22 -0800 Subject: cgroup: fix root_count when mount fails due to busy subsystem root_count was being incremented in cgroup_get_sb() after all error checking was complete, but decremented in cgroup_kill_sb(), which can be called on a superblock that we gave up on due to an error. This patch changes cgroup_kill_sb() to only decrement root_count if the root was previously linked into the list of roots. Signed-off-by: Paul Menage Tested-by: Serge Hallyn Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 492215d67fa..5a54ff42874 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1115,8 +1115,10 @@ static void cgroup_kill_sb(struct super_block *sb) { } write_unlock(&css_set_lock); - list_del(&root->root_list); - root_count--; + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + root_count--; + } mutex_unlock(&cgroup_mutex); -- cgit v1.2.3 From d7240b988017521ebf89edfadd42c0942f166850 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 Jan 2009 10:08:01 -0500 Subject: generic-ipi: use per cpu data for single cpu ipi calls The smp_call_function can be passed a wait parameter telling it to wait for all the functions running on other CPUs to complete before returning, or to return without waiting. Unfortunately, this is currently just a suggestion and not manditory. That is, the smp_call_function can decide not to return and wait instead. The reason for this is because it uses kmalloc to allocate storage to send to the called CPU and that CPU will free it when it is done. But if we fail to allocate the storage, the stack is used instead. This means we must wait for the called CPU to finish before continuing. Unfortunatly, some callers do no abide by this hint and act as if the non-wait option is mandatory. The MTRR code for instance will deadlock if the smp_call_function is set to wait. This is because the smp_call_function will wait for the other CPUs to finish their called functions, but those functions are waiting on the caller to continue. This patch changes the generic smp_call_function code to use per cpu variables if the allocation of the data fails for a single CPU call. The smp_call_function_many will fall back to the smp_call_function_single if it fails its alloc. The smp_call_function_single is modified to not force the wait state. Since we now are using a single data per cpu we must synchronize the callers to prevent a second caller modifying the data before the first called IPI functions complete. To do so, I added a flag to the call_single_data called CSD_FLAG_LOCK. When the single CPU is called (which can be called when a many call fails an alloc), we set the LOCK bit on this per cpu data. When the caller finishes it clears the LOCK bit. The caller must wait till the LOCK bit is cleared before setting it. When it is cleared, there is no IPI function using it. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Acked-by: Jens Axboe Acked-by: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/smp.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 5cfa0e5e3e8..bbedbb7efe3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -18,6 +18,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); enum { CSD_FLAG_WAIT = 0x01, CSD_FLAG_ALLOC = 0x02, + CSD_FLAG_LOCK = 0x04, }; struct call_function_data { @@ -186,6 +187,9 @@ void generic_smp_call_function_single_interrupt(void) if (data_flags & CSD_FLAG_WAIT) { smp_wmb(); data->flags &= ~CSD_FLAG_WAIT; + } else if (data_flags & CSD_FLAG_LOCK) { + smp_wmb(); + data->flags &= ~CSD_FLAG_LOCK; } else if (data_flags & CSD_FLAG_ALLOC) kfree(data); } @@ -196,6 +200,8 @@ void generic_smp_call_function_single_interrupt(void) } } +static DEFINE_PER_CPU(struct call_single_data, csd_data); + /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. @@ -224,14 +230,38 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, func(info); local_irq_restore(flags); } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data = NULL; + struct call_single_data *data; if (!wait) { + /* + * We are calling a function on a single CPU + * and we are not going to wait for it to finish. + * We first try to allocate the data, but if we + * fail, we fall back to use a per cpu data to pass + * the information to that CPU. Since all callers + * of this code will use the same data, we must + * synchronize the callers to prevent a new caller + * from corrupting the data before the callee + * can access it. + * + * The CSD_FLAG_LOCK is used to let us know when + * the IPI handler is done with the data. + * The first caller will set it, and the callee + * will clear it. The next caller must wait for + * it to clear before we set it again. This + * will make sure the callee is done with the + * data before a new caller will use it. + */ data = kmalloc(sizeof(*data), GFP_ATOMIC); if (data) data->flags = CSD_FLAG_ALLOC; - } - if (!data) { + else { + data = &per_cpu(csd_data, me); + while (data->flags & CSD_FLAG_LOCK) + cpu_relax(); + data->flags = CSD_FLAG_LOCK; + } + } else { data = &d; data->flags = CSD_FLAG_WAIT; } -- cgit v1.2.3 From 7f22391cbe82a80a9f891d8bd10fc28ff248d1e2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 22 Dec 2008 02:24:48 +0100 Subject: hrtimers: increase clock min delta threshold while interrupt hanging Impact: avoid timer IRQ hanging slow systems While using the function graph tracer on a virtualized system, the hrtimer_interrupt can hang the system on an infinite loop. This can be caused in several situations: - the hardware is very slow and HZ is set too high - something intrusive is slowing the system down (tracing under emulation) ... and the next clock events to program are always before the current time. This patch implements a reasonable compromise: if such a situation is detected, we share the CPUs time in 1/4 to process the hrtimer interrupts. This is enough to let the system running without serious starvation. It has been successfully tested under VirtualBox with 1000 HZ and 100 HZ with function graph tracer launched. On both cases, the clock events were increased until about 25 ms periodic ticks, which means 40 HZ. So we change a hard to debug hang into a warning message and a system that still manages to limp along. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f33afb0407b..8fea312ca36 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1158,6 +1158,29 @@ static void __run_hrtimer(struct hrtimer *timer) #ifdef CONFIG_HIGH_RES_TIMERS +static int force_clock_reprogram; + +/* + * After 5 iteration's attempts, we consider that hrtimer_interrupt() + * is hanging, which could happen with something that slows the interrupt + * such as the tracing. Then we force the clock reprogramming for each future + * hrtimer interrupts to avoid infinite loops and use the min_delta_ns + * threshold that we will overwrite. + * The next tick event will be scheduled to 3 times we currently spend on + * hrtimer_interrupt(). This gives a good compromise, the cpus will spend + * 1/4 of their time to process the hrtimer interrupts. This is enough to + * let it running without serious starvation. + */ + +static inline void +hrtimer_interrupt_hanging(struct clock_event_device *dev, + ktime_t try_time) +{ + force_clock_reprogram = 1; + dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; + printk(KERN_WARNING "hrtimer: interrupt too slow, " + "forcing clock min delta to %lu ns\n", dev->min_delta_ns); +} /* * High resolution timer interrupt * Called with interrupts disabled @@ -1167,6 +1190,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; ktime_t expires_next, now; + int nr_retries = 0; int i; BUG_ON(!cpu_base->hres_active); @@ -1174,6 +1198,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; retry: + /* 5 retries is enough to notice a hang */ + if (!(++nr_retries % 5)) + hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); + now = ktime_get(); expires_next.tv64 = KTIME_MAX; @@ -1226,7 +1254,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) /* Reprogramming necessary ? */ if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, 0)) + if (tick_program_event(expires_next, force_clock_reprogram)) goto retry; } } -- cgit v1.2.3 From 94df7de0289bc2df3d6e85cd2ece52bf42682f45 Mon Sep 17 00:00:00 2001 From: Sebastien Dugue Date: Mon, 1 Dec 2008 14:09:07 +0100 Subject: hrtimers: allow the hot-unplugging of all cpus Impact: fix CPU hotplug hang on Power6 testbox On architectures that support offlining all cpus (at least powerpc/pseries), hot-unpluging the tick_do_timer_cpu can result in a system hang. This comes from the fact that if the cpu going down happens to be the cpu doing the tick, then as the tick_do_timer_cpu handover happens after the cpu is dead (via the CPU_DEAD notification), we're left without ticks, jiffies are frozen and any task relying on timers (msleep, ...) is stuck. That's particularly the case for the cpu looping in __cpu_die() waiting for the dying cpu to be dead. This patch addresses this by having the tick_do_timer_cpu handover happen earlier during the CPU_DYING notification. For this, a new clockevent notification type is introduced (CLOCK_EVT_NOTIFY_CPU_DYING) which is triggered in hrtimer_cpu_notify(). Signed-off-by: Sebastien Dugue Cc: Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 4 ++++ kernel/time/tick-common.c | 26 +++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8fea312ca36..647a40e2fea 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1608,6 +1608,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, break; #ifdef CONFIG_HOTPLUG_CPU + case CPU_DYING: + case CPU_DYING_FROZEN: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 63e05d423a0..21a5ca84951 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -273,6 +273,21 @@ out_bc: return ret; } +/* + * Transfer the do_timer job away from a dying cpu. + * + * Called with interrupts disabled. + */ +static void tick_handover_do_timer(int *cpup) +{ + if (*cpup == tick_do_timer_cpu) { + int cpu = cpumask_first(cpu_online_mask); + + tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : + TICK_DO_TIMER_NONE; + } +} + /* * Shutdown an event device on a given cpu: * @@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } - /* Transfer the do_timer job away from this cpu */ - if (*cpup == tick_do_timer_cpu) { - int cpu = cpumask_first(cpu_online_mask); - - tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : - TICK_DO_TIMER_NONE; - } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, tick_broadcast_oneshot_control(reason); break; + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(dev); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: tick_shutdown_broadcast_oneshot(dev); tick_shutdown_broadcast(dev); -- cgit v1.2.3 From b0a9b5111abf60ef07eade834f480e89004c7920 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Jan 2009 11:31:36 +0100 Subject: hrtimer: prevent negative expiry value after clock_was_set() Impact: prevent false positive WARN_ON() in clockevents_program_event() clock_was_set() changes the base->offset of CLOCK_REALTIME and enforces the reprogramming of the clockevent device to expire timers which are based on CLOCK_REALTIME. If the clock change is large enough then the subtraction of the timer expiry value and base->offset can become negative which triggers the warning in clockevents_program_event(). Check the subtraction result and set a negative value to 0. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 647a40e2fea..f394d2a42ca 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) continue; timer = rb_entry(base->first, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + /* + * clock_was_set() has changed base->offset so the + * result might be negative. Fix it up to prevent a + * false positive in clockevents_program_event() + */ + if (expires.tv64 < 0) + expires.tv64 = 0; if (expires.tv64 < cpu_base->expires_next.tv64) cpu_base->expires_next = expires; } -- cgit v1.2.3 From d942fb6c7d391baba3dddb566eb735fbf3df8528 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 26 Jan 2009 17:56:17 +0100 Subject: sched: fix sync wakeups Pawel Dziekonski reported that the openssl benchmark and his quantum chemistry application both show slowdowns due to the scheduler under-parallelizing execution. The reason are pipe wakeups still doing 'sync' wakeups which overrides the normal buddy wakeup logic - even if waker and wakee are loosely coupled. Fix an inversion of logic in the buddy wakeup code. Reported-by: Pawel Dziekonski Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ kernel/sched_fair.c | 11 ++--------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 52bbf1c842a..770b1f9ebe1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,6 +2266,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; + if (!sync && (current->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost)) + sync = 1; + #ifdef CONFIG_SMP if (sched_feat(LB_WAKEUP_UPDATE)) { struct sched_domain *sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5cc1c162044..fdc41750468 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1179,20 +1179,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) { - struct task_struct *curr = this_rq->curr; - struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; + struct task_group *tg; unsigned long weight; int balanced; if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; - if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; - /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1419,9 +1414,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && (sync || - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { + if (sched_feat(WAKEUP_OVERLAP) && sync) { resched_task(curr); return; } -- cgit v1.2.3 From 1596e29773eadd96b0a5fc6e736afa52394cafda Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Jan 2009 14:51:38 +0100 Subject: sched: symmetric sync vs avg_overlap Reinstate the weakening of the sync hint if set. This yields a more symmetric usage of avg_overlap. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 770b1f9ebe1..242d0d47a70 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,9 +2266,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; - if (!sync && (current->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost)) - sync = 1; + if (!sync) { + if (current->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + sync = 1; + } else { + if (current->se.avg_overlap >= sysctl_sched_migration_cost || + p->se.avg_overlap >= sysctl_sched_migration_cost) + sync = 0; + } #ifdef CONFIG_SMP if (sched_feat(LB_WAKEUP_UPDATE)) { -- cgit v1.2.3 From a9f3e2b549f83a9cdab873abf4140be27c05a3f2 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 28 Jan 2009 14:51:39 +0100 Subject: sched: clear buddies more aggressively It was noticed that a task could get re-elected past its run quota due to buddy affinities. This could increase latency a little. Cure it by more aggresively clearing buddy state. We do so in two situations: - when we force preempt - when we select a buddy to run Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fdc41750468..75248b9ff4c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -768,8 +768,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) + if (delta_exec > ideal_runtime) { resched_task(rq_of(cfs_rq)->curr); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. + */ + clear_buddies(cfs_rq, curr); + } } static void @@ -1445,6 +1451,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) do { se = pick_next_entity(cfs_rq); + /* + * If se was a buddy, clear it so that it will have to earn + * the favour again. + */ + clear_buddies(cfs_rq, se); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.2.3 From a571bbeafbcc501d9989fbce1cddcd810bd51d71 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Jan 2009 14:51:40 +0100 Subject: sched: fix buddie group latency Similar to the previous patch, by not clearing buddies we can select entities past their run quota, which can increase latency. This means we have to clear group buddies as well. Do not use the group clear for pick_next_task(), otherwise that'll get O(n^2). Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 75248b9ff4c..a7e50ba185a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -719,7 +719,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) __enqueue_entity(cfs_rq, se); } -static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->last == se) cfs_rq->last = NULL; @@ -728,6 +728,12 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->next = NULL; } +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + for_each_sched_entity(se) + __clear_buddies(cfs_rq_of(se), se); +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -1455,7 +1461,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) * If se was a buddy, clear it so that it will have to earn * the favour again. */ - clear_buddies(cfs_rq, se); + __clear_buddies(cfs_rq, se); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.2.3 From 3d398703ef06fd97b4c28c86b580546d5b57e7b7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 31 Jan 2009 23:21:24 +1030 Subject: sched_rt: don't use first_cpu on cpumask created with cpumask_and cpumask_and() only initializes nr_cpu_ids bits, so the (deprecated) first_cpu() might find one of those uninitialized bits if nr_cpu_ids is less than NR_CPUS (as it can be for CONFIG_CPUMASK_OFFSTACK). Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 954e1a81b79..bac1061cea2 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -968,8 +968,8 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) return this_cpu; - first = first_cpu(*mask); - if (first != NR_CPUS) + first = cpumask_first(mask); + if (first < nr_cpu_ids) return first; return -1; -- cgit v1.2.3 From 10b888d6cec2688e65e9e128b14bf98ecd199da2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 31 Jan 2009 14:50:07 -0800 Subject: irq, x86: fix lock status with numa_migrate_irq_desc Eric Paris reported: > I have an hp dl785g5 which is unable to successfully run > 2.6.29-0.66.rc3.fc11.x86_64 or 2.6.29-rc2-next-20090126. During bootup > (early in userspace daemons starting) I get the below BUG, which quickly > renders the machine dead. I assume it is because sparse_irq_lock never > gets released when the BUG kills that task. Adjust lock sequence when migrating a descriptor with CONFIG_NUMA_MIGRATE_IRQ_DESC enabled. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/numa_migrate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index ecf765c6a77..acd88356ac7 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -71,7 +71,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = irq_desc_ptrs[irq]; if (desc && old_desc != desc) - goto out_unlock; + goto out_unlock; node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); @@ -84,10 +84,15 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, init_copy_one_irq_desc(irq, old_desc, desc, cpu); irq_desc_ptrs[irq] = desc; + spin_unlock_irqrestore(&sparse_irq_lock, flags); /* free the old one */ free_one_irq_desc(old_desc, desc); + spin_unlock(&old_desc->lock); kfree(old_desc); + spin_lock(&desc->lock); + + return desc; out_unlock: spin_unlock_irqrestore(&sparse_irq_lock, flags); -- cgit v1.2.3 From 720eba31f47aeade8ec130ca7f4353223c49170f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Feb 2009 13:31:36 +1030 Subject: modules: Use a better scheme for refcounting Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y) is using a lot of memory. Each 'struct module' contains an [NR_CPUS] array of full cache lines. This patch uses existing infrastructure (percpu_modalloc() & percpu_modfree()) to allocate percpu space for the refcount storage. Instead of wasting NR_CPUS*128 bytes (on i386), we now use nr_cpu_ids*sizeof(local_t) bytes. On a typical distro, where NR_CPUS=8, shiping 2000 modules, we reduce size of module files by about 2 Mbytes. (1Kb per module) Instead of having all refcounters in the same memory node - with TLB misses because of vmalloc() - this new implementation permits to have better NUMA properties, since each CPU will use storage on its preferred node, thanks to percpu storage. Signed-off-by: Eric Dumazet Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e8b51d41dd7..ba22484a987 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; /* Init the unload section of the module. */ static void module_unload_init(struct module *mod) { - unsigned int i; + int cpu; INIT_LIST_HEAD(&mod->modules_which_use_me); - for (i = 0; i < NR_CPUS; i++) - local_set(&mod->ref[i].count, 0); + for_each_possible_cpu(cpu) + local_set(__module_ref_addr(mod, cpu), 0); /* Hold reference count during initialization. */ - local_set(&mod->ref[raw_smp_processor_id()].count, 1); + local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } @@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced) unsigned int module_refcount(struct module *mod) { - unsigned int i, total = 0; + unsigned int total = 0; + int cpu; - for (i = 0; i < NR_CPUS; i++) - total += local_read(&mod->ref[i].count); + for_each_possible_cpu(cpu) + total += local_read(__module_ref_addr(mod, cpu)); return total; } EXPORT_SYMBOL(module_refcount); @@ -894,7 +895,7 @@ void module_put(struct module *module) { if (module) { unsigned int cpu = get_cpu(); - local_dec(&module->ref[cpu].count); + local_dec(__module_ref_addr(module, cpu)); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); @@ -1464,7 +1465,10 @@ static void free_module(struct module *mod) kfree(mod->args); if (mod->percpu) percpu_modfree(mod->percpu); - +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + if (mod->refptr) + percpu_modfree(mod->refptr); +#endif /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -2011,6 +2015,14 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto free_mod; +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), + mod->name); + if (!mod->refptr) { + err = -ENOMEM; + goto free_mod; + } +#endif if (pcpuindex) { /* We have a special allocation for this section. */ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, @@ -2018,7 +2030,7 @@ static noinline struct module *load_module(void __user *umod, mod->name); if (!percpu) { err = -ENOMEM; - goto free_mod; + goto free_percpu; } sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; mod->percpu = percpu; @@ -2282,6 +2294,9 @@ static noinline struct module *load_module(void __user *umod, free_percpu: if (percpu) percpu_modfree(percpu); +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + percpu_modfree(mod->refptr); +#endif free_mod: kfree(args); free_hdr: -- cgit v1.2.3