From ee7e5516be4f2107535ad5a3d47d9c79f93661a2 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 29 Jun 2008 14:18:46 +0400 Subject: generic: per-device coherent dma allocator Currently x86_32, sh and cris-v32 provide per-device coherent dma memory allocator. However their implementation is nearly identical. Refactor out common code to be reused by them. Signed-off-by: Dmitry Baryshkov Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/dma-coherent.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 kernel/dma-coherent.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9..9e287d8ab76 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c new file mode 100644 index 00000000000..89a554cfd93 --- /dev/null +++ b/kernel/dma-coherent.c @@ -0,0 +1,127 @@ +/* + * Coherent per-device memory handling. + * Borrowed from i386 + */ +#include +#include + +struct dma_coherent_mem { + void *virt_base; + u32 device_base; + int size; + int flags; + unsigned long *bitmap; +}; + +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void __iomem *mem_base = NULL; + int pages = size >> PAGE_SHIFT; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) + goto out; + if (!size) + goto out; + if (dev->dma_mem) + goto out; + + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ + + mem_base = ioremap(bus_addr, size); + if (!mem_base) + goto out; + + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dev->dma_mem) + goto out; + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dev->dma_mem->bitmap) + goto free1_out; + + dev->dma_mem->virt_base = mem_base; + dev->dma_mem->device_base = device_addr; + dev->dma_mem->size = pages; + dev->dma_mem->flags = flags; + + if (flags & DMA_MEMORY_MAP) + return DMA_MEMORY_MAP; + + return DMA_MEMORY_IO; + + free1_out: + kfree(dev->dma_mem); + out: + if (mem_base) + iounmap(mem_base); + return 0; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if (!mem) + return; + dev->dma_mem = NULL; + iounmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + int pos, err; + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); + + pages >>= PAGE_SHIFT; + + if (!mem) + return ERR_PTR(-EINVAL); + + pos = (device_addr - mem->device_base) >> PAGE_SHIFT; + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +int dma_alloc_from_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + int order = get_order(size); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + *ret = mem->virt_base + (page << PAGE_SHIFT); + memset(*ret, 0, size); + } + if (mem->flags & DMA_MEMORY_EXCLUSIVE) + *ret = NULL; + } + return (mem != NULL); +} + +int dma_release_from_coherent(struct device *dev, int order, void *vaddr) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + + if (mem && vaddr >= mem->virt_base && vaddr < + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + + bitmap_release_region(mem->bitmap, page, order); + return 1; + } + return 0; +} -- cgit v1.2.3 From e59494f441c834ca7aaa0e6fa6678ddbd3d72743 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Jul 2008 00:13:45 -0400 Subject: ftrace: fix 4d3702b6 (post-v2.6.26): WARNING: at kernel/lockdep.c:2731 check_flags (ftrace) On Wed, 16 Jul 2008, Vegard Nossum wrote: > When booting 4d3702b6, I got this huge thing: > > Testing tracer wakeup: <4>------------[ cut here ]------------ > WARNING: at kernel/lockdep.c:2731 check_flags+0x123/0x160() > Modules linked in: > Pid: 1, comm: swapper Not tainted 2.6.26-crashing-02127-g4d3702b6 #30 > [] warn_on_slowpath+0x59/0xb0 > [] ? ftrace_call+0x5/0x8 > [] ? native_read_tsc+0x0/0x20 > [] ? sub_preempt_count+0x12/0xf0 > [] ? trace_hardirqs_off+0xb/0x10 > [] ? __lock_acquire+0x2cc/0x1120 > [] ? trace_hardirqs_off+0xb/0x10 > [] ? mcount_call+0x5/0xa > [] check_flags+0x123/0x160 > [] lock_acquire+0x51/0xd0 > [] ? ftrace_call+0x5/0x8 > [] _spin_lock_irqsave+0x5f/0xa0 > [] ? ftrace_record_ip+0xf5/0x220 > [] ? debug_locks_off+0x3/0x50 > [] ftrace_record_ip+0xf5/0x220 > [] mcount_call+0x5/0xa > [] ? debug_locks_off+0x8/0x50 > [] check_flags+0xf7/0x160 > [] lock_acquire+0x51/0xd0 > [] ? ftrace_call+0x5/0x8 > [] _spin_lock_irqsave+0x5f/0xa0 > [] ? wakeup_tracer_call+0x6d/0xf0 > [] ? _local_bh_enable+0x62/0xb0 > [] ? sub_preempt_count+0xd/0xf0 > [] wakeup_tracer_call+0x6d/0xf0 > [] ? __do_softirq+0xf4/0x110 > [] ? wakeup_tracer_call+0x91/0xf0 > [] ftrace_call+0x5/0x8 > [] ? __do_softirq+0xf4/0x110 > [] ? sub_preempt_count+0x12/0xf0 > [] _local_bh_enable+0x62/0xb0 > [] __do_softirq+0xf4/0x110 > [] do_softirq+0xad/0xb0 > [] irq_exit+0xa5/0xb0 > [] smp_apic_timer_interrupt+0x66/0xa0 > [] ? trace_hardirqs_off_thunk+0xc/0x10 > [] apic_timer_interrupt+0x2d/0x34 > [] ? find_usage_backwards+0xb/0xf0 > [] ? _spin_unlock_irqrestore+0x69/0x80 > [] tg_shares_up+0x132/0x1d0 > [] walk_tg_tree+0x62/0xa0 > [] ? tg_shares_up+0x0/0x1d0 > [] ? tg_nop+0x0/0x10 > [] update_shares+0x5d/0x80 > [] try_to_wake_up+0x6f/0x280 > [] ? __ftrace_modify_code+0x0/0xc0 > [] ? __ftrace_modify_code+0x0/0xc0 > [] wake_up_process+0x14/0x20 > [] kthread_create+0x66/0xb0 > [] ? do_stop+0x0/0x200 > [] ? __stop_machine_run+0x30/0xb0 > [] __stop_machine_run+0x50/0xb0 > [] ? do_stop+0x0/0x200 > [] ? __ftrace_modify_code+0x0/0xc0 > [] ? mutex_unlock+0xd/0x10 > [] stop_machine_run+0x2c/0x60 > [] unregister_ftrace_function+0x103/0x180 > [] stop_wakeup_tracer+0x17/0x60 > [] wakeup_tracer_ctrl_update+0xf/0x30 > [] trace_selftest_startup_wakeup+0xb5/0x130 > [] ? trace_wakeup_test_thread+0x0/0x70 > [] register_tracer+0x135/0x1b0 > [] init_wakeup_tracer+0xd/0xf > [] kernel_init+0x1a9/0x2ce > [] ? _spin_unlock_irq+0x3b/0x60 > [] ? trace_hardirqs_on_thunk+0xc/0x10 > [] ? init_wakeup_tracer+0x0/0xf > [] ? trace_hardirqs_on_caller+0x126/0x180 > [] ? trace_hardirqs_on_thunk+0xc/0x10 > [] ? restore_nocheck_notrace+0x0/0xe > [] ? kernel_init+0x0/0x2ce > [] ? kernel_init+0x0/0x2ce > [] kernel_thread_helper+0x7/0x10 > ======================= > ---[ end trace a7919e7f17c0a725 ]--- > irq event stamp: 579530 > hardirqs last enabled at (579528): [] trace_hardirqs_on+0xb/0x10 > hardirqs last disabled at (579529): [] trace_hardirqs_off+0xb/0x10 > softirqs last enabled at (579530): [] __do_softirq+0xf4/0x110 > softirqs last disabled at (579517): [] do_softirq+0xad/0xb0 > irq event stamp: 579530 > hardirqs last enabled at (579528): [] trace_hardirqs_on+0xb/0x10 > hardirqs last disabled at (579529): [] trace_hardirqs_off+0xb/0x10 > softirqs last enabled at (579530): [] __do_softirq+0xf4/0x110 > softirqs last disabled at (579517): [] do_softirq+0xad/0xb0 > PASSED > > Incidentally, the kernel also hung while I was typing in this report. Things get weird between lockdep and ftrace because ftrace can be called within lockdep internal code (via the mcount pointer) and lockdep can be called with ftrace (via spin_locks). Signed-off-by: Steven Rostedt Tested-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3c8d61df447..e303ccb62cd 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -26,7 +26,8 @@ static struct task_struct *wakeup_task; static int wakeup_cpu; static unsigned wakeup_prio = -1; -static DEFINE_SPINLOCK(wakeup_lock); +static raw_spinlock_t wakeup_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static void __wakeup_reset(struct trace_array *tr); @@ -56,7 +57,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) if (unlikely(disabled != 1)) goto out; - spin_lock_irqsave(&wakeup_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); if (unlikely(!wakeup_task)) goto unlock; @@ -71,7 +73,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) trace_function(tr, data, ip, parent_ip, flags); unlock: - spin_unlock_irqrestore(&wakeup_lock, flags); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); out: atomic_dec(&data->disabled); @@ -145,7 +148,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, if (likely(disabled != 1)) goto out; - spin_lock_irqsave(&wakeup_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); /* We could race with grabbing wakeup_lock */ if (unlikely(!tracer_enabled || next != wakeup_task)) @@ -174,7 +178,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, out_unlock: __wakeup_reset(tr); - spin_unlock_irqrestore(&wakeup_lock, flags); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); out: atomic_dec(&tr->data[cpu]->disabled); } @@ -209,8 +214,6 @@ static void __wakeup_reset(struct trace_array *tr) struct trace_array_cpu *data; int cpu; - assert_spin_locked(&wakeup_lock); - for_each_possible_cpu(cpu) { data = tr->data[cpu]; tracing_reset(data); @@ -229,9 +232,11 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; - spin_lock_irqsave(&wakeup_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); __wakeup_reset(tr); - spin_unlock_irqrestore(&wakeup_lock, flags); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); } static void @@ -252,7 +257,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, goto out; /* interrupts should be off from try_to_wake_up */ - spin_lock(&wakeup_lock); + __raw_spin_lock(&wakeup_lock); /* check for races. */ if (!tracer_enabled || p->prio >= wakeup_prio) @@ -274,7 +279,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, CALLER_ADDR1, CALLER_ADDR2, flags); out_locked: - spin_unlock(&wakeup_lock); + __raw_spin_unlock(&wakeup_lock); out: atomic_dec(&tr->data[cpu]->disabled); } -- cgit v1.2.3 From 1e01cb0c6ff7e9ddb6547551794c6aa82785a7cb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 15 Jul 2008 09:53:37 -0400 Subject: ftrace: only trace preempt off with preempt tracer When PREEMPT_TRACER and IRQSOFF_TRACER are both configured and irqsoff tracer is running, the preempt_off sections might also be traced. Thanks to Andrew Morton for pointing out my mistake of spin_lock disabling interrupts while he was reviewing ftrace.txt. Seems that my example I used actually hit this bug. Signed-off-by: Steven Rostedt Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/trace/trace_irqsoff.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 421d6fe3650..b1e4a89b08e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -337,12 +337,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { - stop_critical_timing(a0, a1); + if (preempt_trace()) + stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - start_critical_timing(a0, a1); + if (preempt_trace()) + start_critical_timing(a0, a1); } #endif /* CONFIG_PREEMPT_TRACER */ -- cgit v1.2.3 From 538c29d43ebdac2edcef96ac07982d2296a63077 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 18 Jul 2008 13:29:57 +0400 Subject: Generic dma-coherent: fix DMA_MEMORY_EXCLUSIVE Don't rewrite successfull allocation return values in case the memory was marked with DMA_MEMORY_EXCLUSIVE. Signed-off-by: Dmitry Baryshkov Cc: Jesse Barnes Signed-off-by: Ingo Molnar --- kernel/dma-coherent.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 89a554cfd93..56dff5cb0f2 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -105,8 +105,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, *dma_handle = mem->device_base + (page << PAGE_SHIFT); *ret = mem->virt_base + (page << PAGE_SHIFT); memset(*ret, 0, size); - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) *ret = NULL; } return (mem != NULL); -- cgit v1.2.3 From b6d4f7e3ef25beb8c658c97867d98883e69dc544 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 20 Jul 2008 15:01:10 +0400 Subject: dma-coherent: add documentation to new interfaces Signed-off-by: Dmitry Baryshkov Cc: Jesse Barnes Signed-off-by: Ingo Molnar --- kernel/dma-coherent.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 56dff5cb0f2..7517115a8cc 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -92,6 +92,21 @@ void *dma_mark_declared_memory_occupied(struct device *dev, } EXPORT_SYMBOL(dma_mark_declared_memory_occupied); +/** + * Try to allocate memory from the per-device coherent area. + * + * @dev: device from which we allocate memory + * @size: size of requested memory area + * @dma_handle: This will be filled with the correct dma handle + * @ret: This pointer will be filled with the virtual address + * to allocated area. + * + * This function should be only called from per-arch %dma_alloc_coherent() + * to support allocation from per-device coherent memory pools. + * + * Returns 0 if dma_alloc_coherent should continue with allocating from + * generic memory areas, or !0 if dma_alloc_coherent should return %ret. + */ int dma_alloc_from_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret) { @@ -111,6 +126,19 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, return (mem != NULL); } +/** + * Try to free the memory allocated from per-device coherent memory pool. + * @dev: device from which the memory was allocated + * @order: the order of pages allocated + * @vaddr: virtual address of allocated pages + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, releases that memory. + * + * Returns 1 if we correctly released the memory, or 0 if + * %dma_release_coherent() should proceed with releasing memory from + * generic pools. + */ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) { struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; -- cgit v1.2.3 From 1986b0cb1671ea39178b4e2b00461109728fc935 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 24 Jul 2008 08:10:02 +0200 Subject: ftrace: remove latency-tracer leftover remove the :vim=ft=help tag from trace files. I used them years ago to syntax-highlight traces and forgot about this hack. Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 868e121c8e3..fc20e09a6cb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1203,9 +1203,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) iter->pos = *pos; - if (last_ent && !ent) - seq_puts(m, "\n\nvim:ft=help\n"); - return ent; } -- cgit v1.2.3 From 1fe371044b21b226b96a9dd959e971b50b28c78e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 15:09:47 +0200 Subject: ftrace: fix modular build fix: ERROR: "start_critical_timings" [drivers/acpi/processor.ko] undefined! ERROR: "stop_critical_timings" [drivers/acpi/processor.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/trace/trace_irqsoff.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b1e4a89b08e..ece6cfb649f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -253,12 +253,14 @@ void start_critical_timings(void) if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING -- cgit v1.2.3 From b8d317d10cca76cabe6b03ebfeb23cc99118b731 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:29 -0700 Subject: cpumask: make cpumask_of_cpu_map generic If an arch doesn't define cpumask_of_cpu_map, create a generic statically-initialized one for them. This allows removal of the buggy cpumask_of_cpu() macro (&cpumask_of_cpu() gives address of out-of-scope var). An arch with NR_CPUS of 4096 probably wants to allocate this itself based on the actual number of CPUs, since otherwise they're using 2MB of rodata (1024 cpus means 128k). That's what CONFIG_HAVE_CPUMASK_OF_CPU_MAP is for (only x86/64 does so at the moment). In future as we support more CPUs, we'll need to resort to a get_cpu_map()/put_cpu_map() allocation scheme. Signed-off-by: Mike Travis Signed-off-by: Rusty Russell Cc: Andrew Morton Cc: Jack Steiner Signed-off-by: Ingo Molnar --- kernel/cpu.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1004a..fe31ff3d380 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -461,3 +461,112 @@ out: #endif /* CONFIG_PM_SLEEP_SMP */ #endif /* CONFIG_SMP */ + +#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +/* 64 bits of zeros, for initializers. */ +#if BITS_PER_LONG == 32 +#define Z64 0, 0 +#else +#define Z64 0 +#endif + +/* Initializer macros. */ +#define CMI0(n) { .bits = { 1UL << (n) } } +#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } } + +#define CMI8(n, ...) \ + CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__), \ + CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__), \ + CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__), \ + CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__) + +#if BITS_PER_LONG == 32 +#define CMI64(n, ...) \ + CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \ + CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \ + CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__), \ + CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__) +#else +#define CMI64(n, ...) \ + CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \ + CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \ + CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__), \ + CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__) +#endif + +#define CMI256(n, ...) \ + CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__), \ + CMI64((n)+128, Z64, Z64, __VA_ARGS__), \ + CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__) +#define Z256 Z64, Z64, Z64, Z64 + +#define CMI1024(n, ...) \ + CMI256((n), __VA_ARGS__), \ + CMI256((n)+256, Z256, __VA_ARGS__), \ + CMI256((n)+512, Z256, Z256, __VA_ARGS__), \ + CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__) +#define Z1024 Z256, Z256, Z256, Z256 + +/* We want this statically initialized, just to be safe. We try not + * to waste too much space, either. */ +static const cpumask_t cpumask_map[] = { + CMI0(0), CMI0(1), CMI0(2), CMI0(3), +#if NR_CPUS > 4 + CMI0(4), CMI0(5), CMI0(6), CMI0(7), +#endif +#if NR_CPUS > 8 + CMI0(8), CMI0(9), CMI0(10), CMI0(11), + CMI0(12), CMI0(13), CMI0(14), CMI0(15), +#endif +#if NR_CPUS > 16 + CMI0(16), CMI0(17), CMI0(18), CMI0(19), + CMI0(20), CMI0(21), CMI0(22), CMI0(23), + CMI0(24), CMI0(25), CMI0(26), CMI0(27), + CMI0(28), CMI0(29), CMI0(30), CMI0(31), +#endif +#if NR_CPUS > 32 +#if BITS_PER_LONG == 32 + CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0), + CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0), + CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0), + CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0), + CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0), + CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0), + CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0), + CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0), +#else + CMI0(32), CMI0(33), CMI0(34), CMI0(35), + CMI0(36), CMI0(37), CMI0(38), CMI0(39), + CMI0(40), CMI0(41), CMI0(42), CMI0(43), + CMI0(44), CMI0(45), CMI0(46), CMI0(47), + CMI0(48), CMI0(49), CMI0(50), CMI0(51), + CMI0(52), CMI0(53), CMI0(54), CMI0(55), + CMI0(56), CMI0(57), CMI0(58), CMI0(59), + CMI0(60), CMI0(61), CMI0(62), CMI0(63), +#endif /* BITS_PER_LONG == 64 */ +#endif +#if NR_CPUS > 64 + CMI64(64, Z64), +#endif +#if NR_CPUS > 128 + CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64), +#endif +#if NR_CPUS > 256 + CMI256(256, Z256), +#endif +#if NR_CPUS > 512 + CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256), +#endif +#if NR_CPUS > 1024 + CMI1024(1024, Z1024), +#endif +#if NR_CPUS > 2048 + CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024), +#endif +#if NR_CPUS > 4096 +#error NR_CPUS too big. Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP +#endif +}; + +const cpumask_t *cpumask_of_cpu_map = cpumask_map; +#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */ -- cgit v1.2.3 From 6524d938b3360504b43a1278b5a8403e85383d1a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:30 -0700 Subject: cpumask: put cpumask_of_cpu_map in the initdata section * Create the cpumask_of_cpu_map statically in the init data section using NR_CPUS but replace it during boot up with one sized by nr_cpu_ids (num possible cpus). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/cpu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index fe31ff3d380..9d4e1c28c05 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,7 +462,6 @@ out: #endif /* CONFIG_SMP */ -#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP /* 64 bits of zeros, for initializers. */ #if BITS_PER_LONG == 32 #define Z64 0, 0 @@ -509,7 +508,11 @@ out: /* We want this statically initialized, just to be safe. We try not * to waste too much space, either. */ -static const cpumask_t cpumask_map[] = { +static const cpumask_t cpumask_map[] +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +__initdata +#endif += { CMI0(0), CMI0(1), CMI0(2), CMI0(3), #if NR_CPUS > 4 CMI0(4), CMI0(5), CMI0(6), CMI0(7), @@ -569,4 +572,3 @@ static const cpumask_t cpumask_map[] = { }; const cpumask_t *cpumask_of_cpu_map = cpumask_map; -#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */ -- cgit v1.2.3 From 0bc3cc03fa6e1c20aecb5a33356bcaae410640b9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:31 -0700 Subject: cpumask: change cpumask_of_cpu_ptr to use new cpumask_of_cpu * Replace previous instances of the cpumask_of_cpu_ptr* macros with a the new (lvalue capable) generic cpumask_of_cpu(). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 3 +-- kernel/time/tick-common.c | 8 +++----- kernel/trace/trace_sysprof.c | 4 +--- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 738b411ff2d..ba9b2054ecb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -33,9 +33,8 @@ static int stopmachine(void *cpu) { int irqs_disabled = 0; int prepared = 0; - cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - set_cpus_allowed_ptr(current, cpumask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bf43284d685..80c4336f418 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -196,12 +196,10 @@ static int tick_check_new_device(struct clock_event_device *newdev) struct tick_device *td; int cpu, ret = NOTIFY_OK; unsigned long flags; - cpumask_of_cpu_ptr_declare(cpumask); spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); - cpumask_of_cpu_ptr_next(cpumask, cpu); if (!cpu_isset(cpu, newdev->cpumask)) goto out_bc; @@ -209,7 +207,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? */ - if (!cpus_equal(newdev->cpumask, *cpumask)) { + if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +220,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, *cpumask)) + if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) goto out_bc; } @@ -254,7 +252,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = NULL; } clockevents_exchange_device(curdev, newdev); - tick_setup_device(td, newdev, cpu, cpumask); + tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index ce2d723c10e..bb948e52ce2 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -213,9 +213,7 @@ static void start_stack_timers(void) int cpu; for_each_online_cpu(cpu) { - cpumask_of_cpu_ptr(new_mask, cpu); - - set_cpus_allowed_ptr(current, new_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); start_stack_timer(cpu); } set_cpus_allowed_ptr(current, &saved_mask); -- cgit v1.2.3 From 5a7a201c51c324876d00a54e7208af6af12d1ca4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 16:50:47 +0200 Subject: cpumask: export cpumask_of_cpu_map fix: ERROR: "cpumask_of_cpu_map" [drivers/acpi/processor.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/microcode.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/speedstep-ich.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9d4e1c28c05..a35d8995dc8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -572,3 +572,5 @@ __initdata }; const cpumask_t *cpumask_of_cpu_map = cpumask_map; + +EXPORT_SYMBOL_GPL(cpumask_of_cpu_map); -- cgit v1.2.3 From a2e2e3577c3ef2b5dbb866e97e612aae4adfa32f Mon Sep 17 00:00:00 2001 From: David Brownell Date: Fri, 25 Jul 2008 19:44:38 -0700 Subject: pm selftest: rtc paranoia Cope with a quirk of some RTCs (notably ACPI ones) which aren't guaranteed to implement oneshot behavior when they woke the system from sleeep: forcibly disable the alarm, just in case. Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 95bff23ecda..0b7476f5d2a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -635,6 +635,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (status < 0) printk(err_suspend, status); + + /* Some platforms can't detect that the alarm triggered the + * wakeup, or (accordingly) disable it after it afterwards. + * It's supposed to give oneshot behavior; cope. + */ + alm.enabled = false; + rtc_set_alarm(rtc, &alm); } static int __init has_wakealarm(struct device *dev, void *name_ptr) -- cgit v1.2.3 From 7fccf0326536c1b245b98740d489abb9aab69a12 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 25 Jul 2008 19:45:02 -0700 Subject: kernel/kexec.c: make 'kimage_terminate' void Since kimage_terminate() always returns 0, make it void. Signed-off-by: WANG Cong Signed-off-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 1c5fcacbcf3..6db42ff8d52 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -589,14 +589,12 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->unuseable_pages); } -static int kimage_terminate(struct kimage *image) +static void kimage_terminate(struct kimage *image) { if (*image->entry != 0) image->entry++; *image->entry = IND_DONE; - - return 0; } #define for_each_kimage_entry(image, ptr, entry) \ @@ -997,9 +995,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, if (result) goto out; } - result = kimage_terminate(image); - if (result) - goto out; + kimage_terminate(image); } /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); -- cgit v1.2.3 From 3ab83521378268044a448113c6aa9a9e245f4d2f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:07 -0700 Subject: kexec jump This patch provides an enhancement to kexec/kdump. It implements the following features: - Backup/restore memory used by the original kernel before/after kexec. - Save/restore CPU state before/after kexec. The features of this patch can be used as a general method to call program in physical mode (paging turning off). This can be used to call BIOS code under Linux. kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 Usage example of calling some physical mode code and return: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_KEXEC=y CONFIG_PM=y CONFIG_KEXEC_JUMP=y 2. Build patched kexec-tool or download the pre-built one. 3. Build some physical mode executable named such as "phy_mode" 4. Boot kernel compiled in step 1. 5. Load physical mode executable with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context --args-none phy_mode 6. Call physical mode executable with following shell command line: /sbin/kexec -e Implementation point: To support jumping without reserving memory. One shadow backup page (source page) is allocated for each page used by kexeced code image (destination page). When do kexec_load, the image of kexeced code is loaded into source pages, and before executing, the destination pages and the source pages are swapped, so the contents of destination pages are backupped. Before jumping to the kexeced code image and after jumping back to the original kernel, the destination pages and the source pages are swapped too. C ABI (calling convention) is used as communication protocol between kernel and called code. A flag named KEXEC_PRESERVE_CONTEXT for sys_kexec_load is added to indicate that the loaded kernel image is used for jumping back. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 31 ++++++++----------------------- 2 files changed, 65 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 6db42ff8d52..a0d920915b3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -242,6 +244,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, goto out; } + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + printk(KERN_ERR "Could not allocate swap buffer\n"); + goto out; + } + result = 0; out: if (result == 0) @@ -986,6 +994,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, if (result) goto out; + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; result = machine_kexec_prepare(image); if (result) goto out; @@ -1411,3 +1421,50 @@ static int __init crash_save_vmcoreinfo_init(void) } module_init(crash_save_vmcoreinfo_init) + +/** + * kernel_kexec - reboot the system + * + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ +int kernel_kexec(void) +{ + int error = 0; + + if (xchg(&kexec_lock, 1)) + return -EBUSY; + if (!kexec_image) { + error = -EINVAL; + goto Unlock; + } + + if (kexec_image->preserve_context) { +#ifdef CONFIG_KEXEC_JUMP + local_irq_disable(); + save_processor_state(); +#endif + } else { + blocking_notifier_call_chain(&reboot_notifier_list, + SYS_RESTART, NULL); + system_state = SYSTEM_RESTART; + device_shutdown(); + sysdev_shutdown(); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + } + + machine_kexec(kexec_image); + + if (kexec_image->preserve_context) { +#ifdef CONFIG_KEXEC_JUMP + restore_processor_state(); + local_irq_enable(); +#endif + } + + Unlock: + xchg(&kexec_lock, 0); + + return error; +} diff --git a/kernel/sys.c b/kernel/sys.c index 0c9d3fa1f5f..c01858090a9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -301,26 +301,6 @@ void kernel_restart(char *cmd) } EXPORT_SYMBOL_GPL(kernel_restart); -/** - * kernel_kexec - reboot the system - * - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -static void kernel_kexec(void) -{ -#ifdef CONFIG_KEXEC - struct kimage *image; - image = xchg(&kexec_image, NULL); - if (!image) - return; - kernel_restart_prepare(NULL); - printk(KERN_EMERG "Starting new kernel\n"); - machine_shutdown(); - machine_kexec(image); -#endif -} - static void kernel_shutdown_prepare(enum system_states state) { blocking_notifier_call_chain(&reboot_notifier_list, @@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user kernel_restart(buffer); break; +#ifdef CONFIG_KEXEC case LINUX_REBOOT_CMD_KEXEC: - kernel_kexec(); - unlock_kernel(); - return -EINVAL; + { + int ret; + ret = kernel_kexec(); + unlock_kernel(); + return ret; + } +#endif #ifdef CONFIG_HIBERNATION case LINUX_REBOOT_CMD_SW_SUSPEND: -- cgit v1.2.3 From 89081d17f7bb81d89fa1aa9b70f821c5cf4d39e9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:10 -0700 Subject: kexec jump: save/restore device state This patch implements devices state save/restore before after kexec. This patch together with features in kexec_jump patch can be used for following: - A simple hibernation implementation without ACPI support. You can kexec a hibernating kernel, save the memory image of original system and shutdown the system. When resuming, you restore the memory image of original system via ordinary kexec load then jump back. - Kernel/system debug through making system snapshot. You can make system snapshot, jump back, do some thing and make another system snapshot. - Cooperative multi-kernel/system. With kexec jump, you can switch between several kernels/systems quickly without boot process except the first time. This appears like swap a whole kernel/system out/in. - A general method to call program in physical mode (paging turning off). This can be used to invoke BIOS code under Linux. The following user-space tools can be used with kexec jump: - kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 - makedumpfile with patches are used as memory image saving tool, it can exclude free pages from original kernel memory image file. The patches and the precompiled makedumpfile can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-src_cvs_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-patches_cvs_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile_cvs_kh10 - An initramfs image can be used as the root file system of kexeced kernel. An initramfs image built with "BuildRoot" can be downloaded from the following URL: initramfs image: http://khibernation.sourceforge.net/download/release_v10/initramfs/rootfs_cvs_kh10.gz All user space tools above are included in the initramfs image. Usage example of simple hibernation: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_RELOCATABLE=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_PM=y CONFIG_HIBERNATION=y CONFIG_KEXEC_JUMP=y 2. Build an initramfs image contains kexec-tool and makedumpfile, or download the pre-built initramfs image, called rootfs.gz in following text. 3. Prepare a partition to save memory image of original kernel, called hibernating partition in following text. 4. Boot kernel compiled in step 1 (kernel A). 5. In the kernel A, load kernel compiled in step 1 (kernel B) with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context /boot/bzImage --mem-min=0x100000 --mem-max=0xffffff --initrd=rootfs.gz 6. Boot the kernel B with following shell command line: /sbin/kexec -e 7. The kernel B will boot as normal kexec. In kernel B the memory image of kernel A can be saved into hibernating partition as follow: jump_back_entry=`cat /proc/cmdline | tr ' ' '\n' | grep kexec_jump_back_entry | cut -d '='` echo $jump_back_entry > kexec_jump_back_entry cp /proc/vmcore dump.elf Then you can shutdown the machine as normal. 8. Boot kernel compiled in step 1 (kernel C). Use the rootfs.gz as root file system. 9. In kernel C, load the memory image of kernel A as follow: /sbin/kexec -l --args-none --entry=`cat kexec_jump_back_entry` dump.elf 10. Jump back to the kernel A as follow: /sbin/kexec -e Then, kernel A is resumed. Implementation point: To support jumping between two kernels, before jumping to (executing) the new kernel and jumping back to the original kernel, the devices are put into quiescent state, and the state of devices and CPU is saved. After jumping back from kexeced kernel and jumping to the new kernel, the state of devices and CPU are restored accordingly. The devices/CPU state save/restore code of software suspend is called to implement corresponding function. Known issues: - Because the segment number supported by sys_kexec_load is limited, hibernation image with many segments may not be load. This is planned to be eliminated by adding a new flag to sys_kexec_load to make a image can be loaded with multiple sys_kexec_load invoking. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/power/power.h | 2 -- 2 files changed, 39 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index a0d920915b3..c8a4370e2a3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -26,6 +26,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -1441,7 +1445,31 @@ int kernel_kexec(void) if (kexec_image->preserve_context) { #ifdef CONFIG_KEXEC_JUMP + mutex_lock(&pm_mutex); + pm_prepare_console(); + error = freeze_processes(); + if (error) { + error = -EBUSY; + goto Restore_console; + } + suspend_console(); + error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_console; + error = disable_nonboot_cpus(); + if (error) + goto Resume_devices; local_irq_disable(); + /* At this point, device_suspend() has been called, + * but *not* device_power_down(). We *must* + * device_power_down() now. Otherwise, drivers for + * some devices (e.g. interrupt controllers) become + * desynchronized with the actual state of the + * hardware at resume time, and evil weirdness ensues. + */ + error = device_power_down(PMSG_FREEZE); + if (error) + goto Enable_irqs; save_processor_state(); #endif } else { @@ -1459,7 +1487,18 @@ int kernel_kexec(void) if (kexec_image->preserve_context) { #ifdef CONFIG_KEXEC_JUMP restore_processor_state(); + device_power_up(PMSG_RESTORE); + Enable_irqs: local_irq_enable(); + enable_nonboot_cpus(); + Resume_devices: + device_resume(PMSG_RESTORE); + Resume_console: + resume_console(); + thaw_processes(); + Restore_console: + pm_restore_console(); + mutex_unlock(&pm_mutex); #endif } diff --git a/kernel/power/power.h b/kernel/power/power.h index 700f44ec840..acc0c101dbd 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void); extern int pfn_is_nosave(unsigned long); -extern struct mutex pm_mutex; - #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ -- cgit v1.2.3 From 7babe8db99d305340cf4828ce1f5a1481d5622ef Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Fri, 25 Jul 2008 19:45:11 -0700 Subject: Full conversion to early_initcall() interface, remove old interface A previous patch added the early_initcall(), to allow a cleaner hooking of pre-SMP initcalls. Now we remove the older interface, converting all existing users to the new one. [akpm@linux-foundation.org: cleanups] [akpm@linux-foundation.org: build fix] [kosaki.motohiro@jp.fujitsu.com: warning fix] [kosaki.motohiro@jp.fujitsu.com: warning fix] Signed-off-by: Eduard - Gabriel Munteanu Cc: Tom Zanussi Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 5 ++++- kernel/smp.c | 4 +++- kernel/softirq.c | 3 ++- kernel/softlockup.c | 25 ++++++++++++++++++++++--- 4 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0047bd9b96a..fde1a102635 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6389,7 +6389,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = 10 }; -void __init migration_init(void) +static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -6399,7 +6399,10 @@ void __init migration_init(void) BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + + return err; } +early_initcall(migration_init); #endif #ifdef CONFIG_SMP diff --git a/kernel/smp.c b/kernel/smp.c index 462c785ca1e..96fc7c0edc5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -33,7 +33,7 @@ struct call_single_queue { spinlock_t lock; }; -void __cpuinit init_call_single_data(void) +static int __cpuinit init_call_single_data(void) { int i; @@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void) spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } + return 0; } +early_initcall(init_call_single_data); static void csd_flag_wait(struct call_single_data *data) { diff --git a/kernel/softirq.c b/kernel/softirq.c index f6b03d56c2b..c506f266a6b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -__init int spawn_ksoftirqd(void) +static __init int spawn_ksoftirqd(void) { void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); @@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void) register_cpu_notifier(&cpu_nfb); return 0; } +early_initcall(spawn_ksoftirqd); #ifdef CONFIG_SMP /* diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 7bd8d1aadd5..b75b492fbfc 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -338,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -__init void spawn_softlockup_task(void) +static int __initdata nosoftlockup; + +static int __init nosoftlockup_setup(char *str) +{ + nosoftlockup = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); + +static int __init spawn_softlockup_task(void) { void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + int err; - BUG_ON(err == NOTIFY_BAD); + if (nosoftlockup) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + if (err == NOTIFY_BAD) { + BUG(); + return 1; + } cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; } +early_initcall(spawn_softlockup_task); -- cgit v1.2.3 From 20d8b67c06fa5e74f44e80b0a0fd68c8327f7c6a Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Fri, 25 Jul 2008 19:45:12 -0700 Subject: relay: add buffer-only channels; useful for early logging Allows one to create and use a channel with no associated files. Files can be initialized later. This is useful in scenarios such as logging in early code, before VFS is up. Therefore, such channels can be created and used as soon as kmem_cache_init() completed. This is needed by kmemtrace to do tracing in early kernel code. [kosaki.motohiro@jp.fujitsu.com: build fix] Signed-off-by: Eduard - Gabriel Munteanu Cc: Tom Zanussi Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 141 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 7de644cdec4..04006ef970b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan) } EXPORT_SYMBOL_GPL(relay_reset); +static inline void relay_set_buf_dentry(struct rchan_buf *buf, + struct dentry *dentry) +{ + buf->dentry = dentry; + buf->dentry->d_inode->i_size = buf->early_bytes; +} + +static struct dentry *relay_create_buf_file(struct rchan *chan, + struct rchan_buf *buf, + unsigned int cpu) +{ + struct dentry *dentry; + char *tmpname; + + tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmpname) + return NULL; + snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); + + /* Create file in fs */ + dentry = chan->cb->create_buf_file(tmpname, chan->parent, + S_IRUSR, buf, + &chan->is_global); + + kfree(tmpname); + + return dentry; +} + /* * relay_open_buf - create a new relay channel buffer * @@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) { struct rchan_buf *buf = NULL; struct dentry *dentry; - char *tmpname; if (chan->is_global) return chan->buf[0]; - tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmpname) - goto end; - snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); - buf = relay_create_buf(chan); if (!buf) - goto free_name; + return NULL; + + if (chan->has_base_filename) { + dentry = relay_create_buf_file(chan, buf, cpu); + if (!dentry) + goto free_buf; + relay_set_buf_dentry(buf, dentry); + } buf->cpu = cpu; __relay_reset(buf, 1); - /* Create file in fs */ - dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, - buf, &chan->is_global); - if (!dentry) - goto free_buf; - - buf->dentry = dentry; - if(chan->is_global) { chan->buf[0] = buf; buf->cpu = 0; } - goto free_name; + return buf; free_buf: relay_destroy_buf(buf); - buf = NULL; -free_name: - kfree(tmpname); -end: - return buf; + return NULL; } /** @@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, /** * relay_open - create a new relay channel - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory + * @base_filename: base name of files to create, %NULL for buffering only + * @parent: dentry of parent directory, %NULL for root directory or buffer * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers * @cb: client callback functions @@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename, { unsigned int i; struct rchan *chan; - if (!base_filename) - return NULL; if (!(subbuf_size && n_subbufs)) return NULL; @@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename, chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); chan->parent = parent; chan->private_data = private_data; - strlcpy(chan->base_filename, base_filename, NAME_MAX); + if (base_filename) { + chan->has_base_filename = 1; + strlcpy(chan->base_filename, base_filename, NAME_MAX); + } setup_callbacks(chan, cb); kref_init(&chan->kref); @@ -604,6 +623,94 @@ free_bufs: } EXPORT_SYMBOL_GPL(relay_open); +struct rchan_percpu_buf_dispatcher { + struct rchan_buf *buf; + struct dentry *dentry; +}; + +/* Called in atomic context. */ +static void __relay_set_buf_dentry(void *info) +{ + struct rchan_percpu_buf_dispatcher *p = info; + + relay_set_buf_dentry(p->buf, p->dentry); +} + +/** + * relay_late_setup_files - triggers file creation + * @chan: channel to operate on + * @base_filename: base name of files to create + * @parent: dentry of parent directory, %NULL for root directory + * + * Returns 0 if successful, non-zero otherwise. + * + * Use to setup files for a previously buffer-only channel. + * Useful to do early tracing in kernel, before VFS is up, for example. + */ +int relay_late_setup_files(struct rchan *chan, + const char *base_filename, + struct dentry *parent) +{ + int err = 0; + unsigned int i, curr_cpu; + unsigned long flags; + struct dentry *dentry; + struct rchan_percpu_buf_dispatcher disp; + + if (!chan || !base_filename) + return -EINVAL; + + strlcpy(chan->base_filename, base_filename, NAME_MAX); + + mutex_lock(&relay_channels_mutex); + /* Is chan already set up? */ + if (unlikely(chan->has_base_filename)) + return -EEXIST; + chan->has_base_filename = 1; + chan->parent = parent; + curr_cpu = get_cpu(); + /* + * The CPU hotplug notifier ran before us and created buffers with + * no files associated. So it's safe to call relay_setup_buf_file() + * on all currently online CPUs. + */ + for_each_online_cpu(i) { + if (unlikely(!chan->buf[i])) { + printk(KERN_ERR "relay_late_setup_files: CPU %u " + "has no buffer, it must have!\n", i); + BUG(); + err = -EINVAL; + break; + } + + dentry = relay_create_buf_file(chan, chan->buf[i], i); + if (unlikely(!dentry)) { + err = -EINVAL; + break; + } + + if (curr_cpu == i) { + local_irq_save(flags); + relay_set_buf_dentry(chan->buf[i], dentry); + local_irq_restore(flags); + } else { + disp.buf = chan->buf[i]; + disp.dentry = dentry; + smp_mb(); + /* relay_channels_mutex must be held, so wait. */ + err = smp_call_function_single(i, + __relay_set_buf_dentry, + &disp, 1); + } + if (unlikely(err)) + break; + } + put_cpu(); + mutex_unlock(&relay_channels_mutex); + + return err; +} + /** * relay_switch_subbuf - switch to a new sub-buffer * @buf: channel buffer @@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; buf->padding[old_subbuf] = buf->prev_padding; buf->subbufs_produced++; - buf->dentry->d_inode->i_size += buf->chan->subbuf_size - - buf->padding[old_subbuf]; + if (buf->dentry) + buf->dentry->d_inode->i_size += + buf->chan->subbuf_size - + buf->padding[old_subbuf]; + else + buf->early_bytes += buf->chan->subbuf_size - + buf->padding[old_subbuf]; smp_mb(); if (waitqueue_active(&buf->read_wait)) /* @@ -1237,4 +1349,4 @@ static __init int relay_init(void) return 0; } -module_init(relay_init); +early_initcall(relay_init); -- cgit v1.2.3 From 51cc50685a4275c6a02653670af9f108a64e01cf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 25 Jul 2008 19:45:34 -0700 Subject: SL*B: drop kmem cache argument from constructor Kmem cache passed to constructor is only needed for constructors that are themselves multiplexeres. Nobody uses this "feature", nor does anybody uses passed kmem cache in non-trivial way, so pass only pointer to object. Non-trivial places are: arch/powerpc/mm/init_64.c arch/powerpc/mm/hugetlbpage.c This is flag day, yes. Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Acked-by: Christoph Lameter Cc: Jon Tollefson Cc: Nick Piggin Cc: Matt Mackall [akpm@linux-foundation.org: fix arch/powerpc/mm/hugetlbpage.c] [akpm@linux-foundation.org: fix mm/slab.c] [akpm@linux-foundation.org: fix ubifs] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b99d73e971a..80e83e459b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1442,7 +1442,7 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(struct kmem_cache *cachep, void *data) +static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; -- cgit v1.2.3 From b8c512f6190e313df69060bae4a161c5c044e272 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 19:45:36 -0700 Subject: Use WARN() in kernel/irq/manage.c Replace a printk+WARN_ON() by a WARN(); this increases the chance of the string making it into the bugreport (ie: it goes inside the ---[ cut here ]--- section) Signed-off-by: Arjan van de Ven Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f8914b92b66..152abfd3589 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -177,8 +177,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) { switch (desc->depth) { case 0: - printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); - WARN_ON(1); + WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { unsigned int status = desc->status & ~IRQ_DISABLED; -- cgit v1.2.3 From 261c40c1191ad8d7a2e49fa2bb5f6a84e3d44b10 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 19:45:37 -0700 Subject: use WARN() in kernel/irq/chip.c Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. Signed-off-by: Arjan van de Ven Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 964964baefa..3cd441ebf5d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -28,8 +28,7 @@ void dynamic_irq_init(unsigned int irq) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; } @@ -62,8 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; } @@ -71,9 +69,8 @@ void dynamic_irq_cleanup(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); - printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n", + WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", irq); - WARN_ON(1); return; } desc->msi_desc = NULL; @@ -96,8 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; } -- cgit v1.2.3 From ff1188646c6870f336e910fb894eeed74f50471f Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:45 -0700 Subject: tracehook: unexport ptrace_notify The ptrace_notify() function should not be called by any modules. It was only ever exported to be called by binfmt exec functions. But that is no longer necessary since fs/exec.c deals with that generically now. There should be no calls to ptrace_notify() from outside the core kernel. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 82c3545596c..8715c18b27b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1895,7 +1895,6 @@ EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); -- cgit v1.2.3 From 30199f5a46aee204bf437a4f5b0740f3efe448b7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:46 -0700 Subject: tracehook: exit This moves the PTRACE_EVENT_EXIT tracing into a tracehook.h inline, tracehook_report_exec(). The change has no effect, just clean-up. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ad933bb29ec..c3691cbc220 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -1029,10 +1030,7 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(current->ptrace & PT_TRACE_EXIT)) { - current->ptrace_message = code; - ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); - } + tracehook_report_exit(&code); /* * We're taking recursive faults here in do_exit. Safest is to just -- cgit v1.2.3 From 09a05394fe2448a4139b014936330af23fa7ec83 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:47 -0700 Subject: tracehook: clone This moves all the ptrace initialization and tracing logic for task creation into tracehook.h and ptrace.h inlines. It reorganizes the code slightly, but should not change any behavior. There are four tracehook entry points, at each important stage of task creation. This keeps the interface from the core fork.c code fairly clean, while supporting the complex setup required for ptrace or something like it. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 69 ++++++++++++++++++++++++----------------------------------- 1 file changed, 28 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 80e83e459b1..b42f8ed2361 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -865,8 +866,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags &= ~PF_SUPERPRIV; new_flags |= PF_FORKNOEXEC; - if (!(clone_flags & CLONE_PTRACE)) - p->ptrace = 0; + new_flags |= PF_STARTING; p->flags = new_flags; clear_freeze_flag(p); } @@ -907,7 +907,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, - struct pid *pid) + struct pid *pid, + int trace) { int retval; struct task_struct *p; @@ -1163,8 +1164,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_LIST_HEAD(&p->ptrace_entry); - INIT_LIST_HEAD(&p->ptraced); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible @@ -1195,7 +1194,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_parent = current->real_parent; else p->real_parent = current; - p->parent = p->real_parent; spin_lock(¤t->sighand->siglock); @@ -1237,8 +1235,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (likely(p->pid)) { list_add_tail(&p->sibling, &p->real_parent->children); - if (unlikely(p->ptrace & PT_PTRACED)) - __ptrace_link(p, current->parent); + tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { if (clone_flags & CLONE_NEWPID) @@ -1323,29 +1320,13 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, - &init_struct_pid); + &init_struct_pid, 0); if (!IS_ERR(task)) init_idle(task, cpu); return task; } -static int fork_traceflag(unsigned clone_flags) -{ - if (clone_flags & CLONE_UNTRACED) - return 0; - else if (clone_flags & CLONE_VFORK) { - if (current->ptrace & PT_TRACE_VFORK) - return PTRACE_EVENT_VFORK; - } else if ((clone_flags & CSIGNAL) != SIGCHLD) { - if (current->ptrace & PT_TRACE_CLONE) - return PTRACE_EVENT_CLONE; - } else if (current->ptrace & PT_TRACE_FORK) - return PTRACE_EVENT_FORK; - - return 0; -} - /* * Ok, this is the main fork-routine. * @@ -1380,14 +1361,14 @@ long do_fork(unsigned long clone_flags, } } - if (unlikely(current->ptrace)) { - trace = fork_traceflag (clone_flags); - if (trace) - clone_flags |= CLONE_PTRACE; - } + /* + * When called from kernel_thread, don't do user tracing stuff. + */ + if (likely(user_mode(regs))) + trace = tracehook_prepare_clone(clone_flags); p = copy_process(clone_flags, stack_start, regs, stack_size, - child_tidptr, NULL); + child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1405,24 +1386,30 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { + tracehook_report_clone(trace, regs, clone_flags, nr, p); + + /* + * We set PF_STARTING at creation in case tracing wants to + * use this to distinguish a fully live task from one that + * hasn't gotten to tracehook_report_clone() yet. Now we + * clear it and set the child going. + */ + p->flags &= ~PF_STARTING; + + if (unlikely(clone_flags & CLONE_STOPPED)) { /* * We'll start up with an immediate SIGSTOP. */ sigaddset(&p->pending.signal, SIGSTOP); set_tsk_thread_flag(p, TIF_SIGPENDING); - } - - if (!(clone_flags & CLONE_STOPPED)) - wake_up_new_task(p, clone_flags); - else __set_task_state(p, TASK_STOPPED); - - if (unlikely (trace)) { - current->ptrace_message = nr; - ptrace_notify ((trace << 8) | SIGTRAP); + } else { + wake_up_new_task(p, clone_flags); } + tracehook_report_clone_complete(trace, regs, + clone_flags, nr, p); + if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); -- cgit v1.2.3 From daded34be96b1975ff8539ff62ad8b158ce7d842 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:47 -0700 Subject: tracehook: vfork-done This moves the PTRACE_EVENT_VFORK_DONE tracing into a tracehook.h inline, tracehook_report_vfork_done(). The change has no effect, just clean-up. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b42f8ed2361..abb3ed6298f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1414,10 +1414,7 @@ long do_fork(unsigned long clone_flags, freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { - current->ptrace_message = nr; - ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); - } + tracehook_report_vfork_done(p, nr); } } else { nr = PTR_ERR(p); -- cgit v1.2.3 From dae33574dcf5211e1f43c7e45fa29f73ba3e00cb Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:48 -0700 Subject: tracehook: release_task This moves the ptrace-related logic from release_task into tracehook.h and ptrace.h inlines. It provides clean hooks both before and after locking tasklist_lock, for future tracing logic to do more cleanup without the lock. This also changes release_task() itself in the rare "zap_leader" case to set the leader to EXIT_DEAD before iterating. This maintains the invariant that release_task() only ever handles a task in EXIT_DEAD. This is a common-sense invariant that is already always true except in this one arcane case of zombie leader whose parent ignores SIGCHLD. This change is harmless and only costs one store in this one rare case. It keeps the expected state more consisently sane, which is nicer when debugging weirdness in release_task(). It also lets some future code in the tracehook entry points rely on this invariant for bookkeeping. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c3691cbc220..da28745f7c3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -163,27 +163,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(container_of(rhp, struct task_struct, rcu)); } -/* - * Do final ptrace-related cleanup of a zombie being reaped. - * - * Called with write_lock(&tasklist_lock) held. - */ -static void ptrace_release_task(struct task_struct *p) -{ - BUG_ON(!list_empty(&p->ptraced)); - ptrace_unlink(p); - BUG_ON(!list_empty(&p->ptrace_entry)); -} void release_task(struct task_struct * p) { struct task_struct *leader; int zap_leader; repeat: + tracehook_prepare_release_task(p); atomic_dec(&p->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); - ptrace_release_task(p); + tracehook_finish_release_task(p); __exit_signal(p); /* @@ -205,6 +195,13 @@ repeat: * that case. */ zap_leader = task_detached(leader); + + /* + * This maintains the invariant that release_task() + * only runs on a task in EXIT_DEAD, just for sanity. + */ + if (zap_leader) + leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); -- cgit v1.2.3 From 35de254dc60f91004b3b5ebb1fc7b2c3093d6032 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:51 -0700 Subject: tracehook: tracehook_consider_ignored_signal This defines tracehook_consider_ignored_signal() has a fine-grained hook for deciding to prevent the normal short-circuit of sending an ignored signal, as ptrace does. There is no change, only cleanup. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8715c18b27b..9efd1cee6d0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -39,24 +40,21 @@ static struct kmem_cache *sigqueue_cachep; -static int __sig_ignored(struct task_struct *t, int sig) +static void __user *sig_handler(struct task_struct *t, int sig) { - void __user *handler; + return t->sighand->action[sig - 1].sa.sa_handler; +} +static int sig_handler_ignored(void __user *handler, int sig) +{ /* Is it explicitly or implicitly ignored? */ - - handler = t->sighand->action[sig - 1].sa.sa_handler; return handler == SIG_IGN || (handler == SIG_DFL && sig_kernel_ignore(sig)); } static int sig_ignored(struct task_struct *t, int sig) { - /* - * Tracers always want to know about signals.. - */ - if (t->ptrace & PT_PTRACED) - return 0; + void __user *handler; /* * Blocked signals are never ignored, since the @@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - return __sig_ignored(t, sig); + handler = sig_handler(t, sig); + if (!sig_handler_ignored(handler, sig)) + return 0; + + /* + * Tracers may want to know about even ignored signals. + */ + return !tracehook_consider_ignored_signal(t, sig, handler); } /* @@ -2298,7 +2303,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ - if (__sig_ignored(t, sig)) { + if (sig_handler_ignored(sig_handler(t, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); rm_from_queue_full(&mask, &t->signal->shared_pending); -- cgit v1.2.3 From 445a91d2fe3667fb8fc251433645f686933cf56a Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:52 -0700 Subject: tracehook: tracehook_consider_fatal_signal This defines tracehook_consider_fatal_signal() has a fine-grained hook for deciding to skip the special cases for a fatal signal, as ptrace does. There is no change, only cleanup. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9efd1cee6d0..1a942ce32ba 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -300,12 +300,12 @@ flush_signal_handlers(struct task_struct *t, int force_default) int unhandled_signal(struct task_struct *tsk, int sig) { + void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) return 1; - if (tsk->ptrace & PT_PTRACED) + if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || - (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); + return !tracehook_consider_fatal_signal(tsk, sig, handler); } @@ -761,7 +761,8 @@ static void complete_signal(int sig, struct task_struct *p, int group) if (sig_fatal(p, sig) && !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { + (sig == SIGKILL || + !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { /* * This signal will be fatal to the whole group. */ -- cgit v1.2.3 From 7bcf6a2ca5f639b038c48711ebe6c4eca2036641 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:53 -0700 Subject: tracehook: get_signal_to_deliver This defines the tracehook_get_signal() hook to allow tracing code to slip in before normal signal dequeuing. This lays the groundwork for new tracing features that can inject synthetic signals outside the normal queue or control the disposition of delivered signals. The calling convention lets tracehook_get_signal() decide both exactly what will happen and what signal number to report in the handler/exit. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1a942ce32ba..10b31ecdd9c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1754,17 +1754,33 @@ relock: do_signal_stop(0)) goto relock; - signr = dequeue_signal(current, ¤t->blocked, info); - if (!signr) - break; /* will return 0 */ + /* + * Tracing can induce an artifical signal and choose sigaction. + * The return value in @signr determines the default action, + * but @info->si_signo is the signal number we will report. + */ + signr = tracehook_get_signal(current, regs, info, return_ka); + if (unlikely(signr < 0)) + goto relock; + if (unlikely(signr != 0)) + ka = return_ka; + else { + signr = dequeue_signal(current, ¤t->blocked, + info); - if (signr != SIGKILL) { - signr = ptrace_signal(signr, info, regs, cookie); if (!signr) - continue; + break; /* will return 0 */ + + if (signr != SIGKILL) { + signr = ptrace_signal(signr, info, + regs, cookie); + if (!signr) + continue; + } + + ka = &sighand->action[signr-1]; } - ka = &sighand->action[signr-1]; if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { @@ -1812,7 +1828,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(signr))) { + if (likely(do_signal_stop(info->si_signo))) { /* It released the siglock. */ goto relock; } @@ -1833,7 +1849,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(regs, signr); + print_fatal_signal(regs, info->si_signo); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with @@ -1842,13 +1858,13 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump((long)signr, signr, regs); + do_coredump(info->si_signo, info->si_signo, regs); } /* * Death signals, no core dump. */ - do_group_exit(signr); + do_group_exit(info->si_signo); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); -- cgit v1.2.3 From fa00b80b3c41a845b3d56f866fb40a2e98754c51 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:54 -0700 Subject: tracehook: job control This defines the tracehook_notify_jctl() hook to formalize the ptrace effects on the job control notifications. There is no change, only cleanup. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 10b31ecdd9c..e9e699f4b1b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -596,9 +596,6 @@ static int check_kill_permission(int sig, struct siginfo *info, return security_task_kill(t, info, sig, 0); } -/* forward decl */ -static void do_notify_parent_cldstop(struct task_struct *tsk, int why); - /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation @@ -1605,7 +1602,7 @@ finish_stop(int stop_count) * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. */ - if (stop_count == 0 || (current->ptrace & PT_PTRACED)) { + if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(current, CLD_STOPPED); read_unlock(&tasklist_lock); @@ -1741,6 +1738,9 @@ relock: signal->flags &= ~SIGNAL_CLD_MASK; spin_unlock_irq(&sighand->siglock); + if (unlikely(!tracehook_notify_jctl(1, why))) + goto relock; + read_lock(&tasklist_lock); do_notify_parent_cldstop(current->group_leader, why); read_unlock(&tasklist_lock); @@ -1906,7 +1906,7 @@ void exit_signals(struct task_struct *tsk) out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop)) { + if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(tsk, CLD_STOPPED); read_unlock(&tasklist_lock); -- cgit v1.2.3 From 2b2a1ff64afbadac842bbc58c5166962cf4f7664 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:54 -0700 Subject: tracehook: death This moves the ptrace logic in task death (exit_notify) into tracehook.h inlines. Some code is rearranged slightly to make things nicer. There is no change, only cleanup. There is one hook called with the tasklist_lock write-locked, as ptrace needs. There is also a new hook called after exit_state changes and without locks. This is a better place for tracing work to be in the future, since it doesn't delay the whole system with locking. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 26 +++++++++----------------- kernel/signal.c | 10 +++++++--- 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index da28745f7c3..6cdf60712bd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -885,7 +885,8 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int state; + int signal; + void *cookie; /* * This does two things: @@ -922,22 +923,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead) !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; - /* If something other than our normal parent is ptracing us, then - * send it a SIGCHLD instead of honoring exit_signal. exit_signal - * only has special meaning to our real parent. - */ - if (!task_detached(tsk) && thread_group_empty(tsk)) { - int signal = ptrace_reparented(tsk) ? - SIGCHLD : tsk->exit_signal; - do_notify_parent(tsk, signal); - } else if (tsk->ptrace) { - do_notify_parent(tsk, SIGCHLD); - } + signal = tracehook_notify_death(tsk, &cookie, group_dead); + if (signal > 0) + signal = do_notify_parent(tsk, signal); - state = EXIT_ZOMBIE; - if (task_detached(tsk) && likely(!tsk->ptrace)) - state = EXIT_DEAD; - tsk->exit_state = state; + tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -947,8 +937,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) write_unlock_irq(&tasklist_lock); + tracehook_report_death(tsk, signal, cookie, group_dead); + /* If the process is dead, release it - nobody will wait for it */ - if (state == EXIT_DEAD) + if (signal < 0) release_task(tsk); } diff --git a/kernel/signal.c b/kernel/signal.c index e9e699f4b1b..0e862d3130f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1326,9 +1326,11 @@ static inline void __wake_up_parent(struct task_struct *p, /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. + * + * Returns -1 if our parent ignored us and so we've switched to + * self-reaping, or else @sig. */ - -void do_notify_parent(struct task_struct *tsk, int sig) +int do_notify_parent(struct task_struct *tsk, int sig) { struct siginfo info; unsigned long flags; @@ -1399,12 +1401,14 @@ void do_notify_parent(struct task_struct *tsk, int sig) */ tsk->exit_signal = -1; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = 0; + sig = -1; } if (valid_signal(sig) && sig > 0) __group_send_sig_info(sig, &info, tsk->parent); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); + + return sig; } static void do_notify_parent_cldstop(struct task_struct *tsk, int why) -- cgit v1.2.3 From b787f7ba677840da16a2228c16571ce8a1fcb799 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:55 -0700 Subject: tracehook: force signal_pending() This defines a new hook tracehook_force_sigpending() that lets tracing code decide to force TIF_SIGPENDING on in recalc_sigpending(). This is not used yet, so it compiles away to nothing for now. It lays the groundwork for new tracing code that can interrupt a task synthetically without actually sending a signal. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0e862d3130f..954f77d7e3b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -134,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) + if (unlikely(tracehook_force_sigpending())) + set_thread_flag(TIF_SIGPENDING); + else if (!recalc_sigpending_tsk(current) && !freezing(current)) clear_thread_flag(TIF_SIGPENDING); } -- cgit v1.2.3 From 85ba2d862e521375a8ee01526c5c46b1f24bb4af Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:58 -0700 Subject: tracehook: wait_task_inactive This extends wait_task_inactive() with a new argument so it can be used in a "soft" mode where it will check for the task changing state unexpectedly and back off. There is no change to existing callers. This lays the groundwork to allow robust, noninvasive tracing that can try to sample a blocked thread but back off safely if it wakes up. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 2 +- kernel/ptrace.c | 2 +- kernel/sched.c | 29 +++++++++++++++++++++++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 6111c27491b..96cff2f8710 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) return; } /* Must have done schedule() in kthread() before we set_task_cpu */ - wait_task_inactive(k); + wait_task_inactive(k, 0); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8392a9da645..082b3fcb32a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) read_unlock(&tasklist_lock); if (!ret && !kill) - wait_task_inactive(child); + ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; /* All systems go.. */ return ret; diff --git a/kernel/sched.c b/kernel/sched.c index fde1a102635..0236958addc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1867,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * wait_task_inactive - wait for a thread to unschedule. * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't * be called with interrupts off, or it may introduce deadlock with * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(struct task_struct *p) +unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; int running, on_rq; + unsigned long ncsw; struct rq *rq; for (;;) { @@ -1899,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p) * return false if the runqueue has changed and p * is actually now running somewhere else! */ - while (task_running(rq, p)) + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; cpu_relax(); + } /* * Ok, time to look more closely! We need the rq @@ -1910,8 +1921,20 @@ void wait_task_inactive(struct task_struct *p) rq = task_rq_lock(p, &flags); running = task_running(rq, p); on_rq = p->se.on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) { + ncsw = p->nivcsw + p->nvcsw; + if (unlikely(!ncsw)) + ncsw = 1; + } task_rq_unlock(rq, &flags); + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + /* * Was it really running after all now that we * checked with the proper locks actually held? @@ -1944,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p) */ break; } + + return ncsw; } /*** -- cgit v1.2.3 From 96930a6365c99c160138a395566e360b27348b8f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 19:46:21 -0700 Subject: make cgroup_seqfile_release() static cgroup_seqfile_release() can become static. Signed-off-by: Adrian Bunk Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66ec9fd21e0..89bd6fb7894 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1529,7 +1529,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) return cft->read_seq_string(state->cgroup, cft, m); } -int cgroup_seqfile_release(struct inode *inode, struct file *file) +static int cgroup_seqfile_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); -- cgit v1.2.3 From 734550921e9b7ab924a43aa3d0bd4239dac4fbf1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Jul 2008 21:22:20 -0400 Subject: [PATCH] beginning of sysctl cleanup - ctl_table_set New object: set of sysctls [currently - root and per-net-ns]. Contains: pointer to parent set, list of tables and "should I see this set?" method (->is_seen(set)). Current lists of tables are subsumed by that; net-ns contains such a beast. ->lookup() for ctl_table_root returns pointer to ctl_table_set instead of that to ->list of that ctl_table_set. [folded compile fixes by rdd for configs without sysctl] Signed-off-by: Al Viro --- kernel/sysctl.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 35a50db9b6c..8ee4a0619fb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -160,12 +160,13 @@ static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, }; static struct ctl_table_root sysctl_table_root = { .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), + .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), }; static struct ctl_table kern_table[]; @@ -1403,14 +1404,20 @@ void sysctl_head_finish(struct ctl_table_header *head) spin_unlock(&sysctl_lock); } +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + static struct list_head * lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) { - struct list_head *header_list; - header_list = &root->header_list; - if (root->lookup) - header_list = root->lookup(root, namespaces); - return header_list; + struct ctl_table_set *set = lookup_header_set(root, namespaces); + return &set->list; } struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, @@ -1720,7 +1727,6 @@ struct ctl_table_header *__register_sysctl_paths( struct nsproxy *namespaces, const struct ctl_path *path, struct ctl_table *table) { - struct list_head *header_list; struct ctl_table_header *header; struct ctl_table *new, **prevp; unsigned int n, npath; @@ -1772,8 +1778,8 @@ struct ctl_table_header *__register_sysctl_paths( } #endif spin_lock(&sysctl_lock); - header_list = lookup_header_list(root, namespaces); - list_add_tail(&header->ctl_entry, header_list); + header->set = lookup_header_set(root, namespaces); + list_add_tail(&header->ctl_entry, &header->set->list); spin_unlock(&sysctl_lock); return header; @@ -1832,6 +1838,15 @@ void unregister_sysctl_table(struct ctl_table_header * header) kfree(header); } +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ + INIT_LIST_HEAD(&p->list); + p->parent = parent ? parent : &sysctl_table_root.default_set; + p->is_seen = is_seen; +} + #else /* !CONFIG_SYSCTL */ struct ctl_table_header *register_sysctl_table(struct ctl_table * table) { @@ -1848,6 +1863,12 @@ void unregister_sysctl_table(struct ctl_table_header * table) { } +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ +} + #endif /* CONFIG_SYSCTL */ /* -- cgit v1.2.3 From f7e6ced4061da509f737541ca4dbd44d83a6e82f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 01:44:23 -0400 Subject: [PATCH] allow delayed freeing of ctl_table_header Refcount the sucker; instead of freeing it by the end of unregistration just drop the refcount and free only when it hits zero. Make sure that we _always_ make ->unregistering non-NULL in start_unregistering(). That allows anybody to get a reference to such puppy, preventing its freeing and reuse. It does *not* block unregistration. Anybody who holds such a reference can * try to grab a "use" reference (ctl_head_grab()); that will succeeds if and only if it hadn't entered unregistration yet. If it succeeds, we can use it in all normal ways until we release the "use" reference (with ctl_head_finish()). Note that this relies on having ->unregistering become non-NULL in all cases when one starts to unregister the sucker. * keep pointers to ctl_table entries; they *can* be freed if the entire thing is unregistered. However, if ctl_head_grab() succeeds, we know that unregistration had not happened (and will not happen until ctl_head_finish()) and such pointers can be used safely. IOW, now we can have inodes under /proc/sys keep references to ctl_table entries, protecting them with references to ctl_table_header and grabbing the latter for the duration of operations that require access to ctl_table. That won't cause deadlocks, since unregistration will not be stopped by mere keeping a reference to ctl_table_header. Signed-off-by: Al Viro --- kernel/sysctl.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8ee4a0619fb..60d9357e717 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1387,6 +1387,9 @@ static void start_unregistering(struct ctl_table_header *p) spin_unlock(&sysctl_lock); wait_for_completion(&wait); spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); } /* * do not remove from the list until nobody holds it; walking the @@ -1395,6 +1398,32 @@ static void start_unregistering(struct ctl_table_header *p) list_del_init(&p->ctl_entry); } +void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + kfree(head); + spin_unlock(&sysctl_lock); +} + +struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + if (!head) + BUG(); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + void sysctl_head_finish(struct ctl_table_header *head) { if (!head) @@ -1771,6 +1800,7 @@ struct ctl_table_header *__register_sysctl_paths( header->unregistering = NULL; header->root = root; sysctl_set_parent(NULL, header->ctl_table); + header->count = 1; #ifdef CONFIG_SYSCTL_SYSCALL_CHECK if (sysctl_check_table(namespaces, header->ctl_table)) { kfree(header); @@ -1834,8 +1864,9 @@ void unregister_sysctl_table(struct ctl_table_header * header) spin_lock(&sysctl_lock); start_unregistering(header); + if (!--header->count) + kfree(header); spin_unlock(&sysctl_lock); - kfree(header); } void setup_sysctl_set(struct ctl_table_set *p, @@ -1869,6 +1900,10 @@ void setup_sysctl_set(struct ctl_table_set *p, { } +void sysctl_head_put(struct ctl_table_header *head) +{ +} + #endif /* CONFIG_SYSCTL */ /* -- cgit v1.2.3 From ae7edecc9b8810770a8e5cb9a466ea4bdcfa8401 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 06:33:31 -0400 Subject: [PATCH] sysctl: keep track of tree relationships In a sense, that's the heart of the series. It's based on the following property of the trees we are actually asked to add: they can be split into stem that is already covered by registered trees and crown that is entirely new. IOW, if a/b and a/c/d are introduced by our tree, then a/c is also introduced by it. That allows to associate tree and table entry with each node in the union; while directory nodes might be covered by many trees, only one will cover the node by its crown. And that will allow much saner logics for /proc/sys in the next patches. This patch introduces the data structures needed to keep track of that. When adding a sysctl table, we find a "parent" one. Which is to say, find the deepest node on its stem that already is present in one of the tables from our table set or its ancestor sets. That table will be our parent and that node in it - attachment point. Add our table to list anchored in parent, have it refer the parent and contents of attachment point. Also remember where its crown lives. Signed-off-by: Al Viro --- kernel/sysctl.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 60d9357e717..c9a0af88703 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1680,6 +1680,52 @@ static __init int sysctl_init(void) core_initcall(sysctl_init); +static int is_branch_in(struct ctl_table *branch, struct ctl_table *table) +{ + struct ctl_table *p; + const char *s = branch->procname; + + /* branch should have named subdirectory as its first element */ + if (!s || !branch->child) + return 0; + + /* ... and nothing else */ + if (branch[1].procname || branch[1].ctl_name) + return 0; + + /* table should contain subdirectory with the same name */ + for (p = table; p->procname || p->ctl_name; p++) { + if (!p->child) + continue; + if (p->procname && strcmp(p->procname, s) == 0) + return 1; + } + return 0; +} + +/* see if attaching q to p would be an improvement */ +static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) +{ + struct ctl_table *to = p->ctl_table, *by = q->ctl_table; + int is_better = 0; + int not_in_parent = !p->attached_by; + + while (is_branch_in(by, to)) { + if (by == q->attached_by) + is_better = 1; + if (to == p->attached_by) + not_in_parent = 1; + by = by->child; + to = to->child; + } + + if (is_better && not_in_parent) { + q->attached_by = by; + q->attached_to = to; + q->parent = p; + } +} + /** * __register_sysctl_paths - register a sysctl hierarchy * @root: List of sysctl headers to register on @@ -1759,6 +1805,7 @@ struct ctl_table_header *__register_sysctl_paths( struct ctl_table_header *header; struct ctl_table *new, **prevp; unsigned int n, npath; + struct ctl_table_set *set; /* Count the path components */ for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) @@ -1809,6 +1856,18 @@ struct ctl_table_header *__register_sysctl_paths( #endif spin_lock(&sysctl_lock); header->set = lookup_header_set(root, namespaces); + header->attached_by = header->ctl_table; + header->attached_to = root_table; + header->parent = &root_table_header; + for (set = header->set; set; set = set->parent) { + struct ctl_table_header *p; + list_for_each_entry(p, &set->list, ctl_entry) { + if (p->unregistering) + continue; + try_attach(p, header); + } + } + header->parent->count++; list_add_tail(&header->ctl_entry, &header->set->list); spin_unlock(&sysctl_lock); @@ -1864,6 +1923,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) spin_lock(&sysctl_lock); start_unregistering(header); + if (!--header->parent->count) { + WARN_ON(1); + kfree(header->parent); + } if (!--header->count) kfree(header); spin_unlock(&sysctl_lock); -- cgit v1.2.3 From 9043476f726802f4b00c96d0c4f418dde48d1304 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 08:54:06 -0400 Subject: [PATCH] sanitize proc_sysctl * keep references to ctl_table_head and ctl_table in /proc/sys inodes * grab the former during operations, use the latter for access to entry if that succeeds * have ->d_compare() check if table should be seen for one who does lookup; that allows us to avoid flipping inodes - if we have the same name resolve to different things, we'll just keep several dentries and ->d_compare() will reject the wrong ones. * have ->lookup() and ->readdir() scan the table of our inode first, then walk all ctl_table_header and scan ->attached_by for those that are attached to our directory. * implement ->getattr(). * get rid of insane amounts of tree-walking * get rid of the need to know dentry in ->permission() and of the contortions induced by that. Signed-off-by: Al Viro --- kernel/sysctl.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9a0af88703..ff5abcca5dd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1932,6 +1932,21 @@ void unregister_sysctl_table(struct ctl_table_header * header) spin_unlock(&sysctl_lock); } +int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + void setup_sysctl_set(struct ctl_table_set *p, struct ctl_table_set *parent, int (*is_seen)(struct ctl_table_set *)) -- cgit v1.2.3 From e6305c43eda10ebfd2ad9e35d6e172ccc7bb3695 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 21:03:57 -0400 Subject: [PATCH] sanitize ->permission() prototype * kill nameidata * argument; map the 3 bits in ->flags anybody cares about to new MAY_... ones and pass with the mask. * kill redundant gfs2_iop_permission() * sanitize ecryptfs_permission() * fix remaining places where ->permission() instances might barf on new MAY_... found in mask. The obvious next target in that direction is permission(9) folded fix for nfs_permission() breakage from Miklos Szeredi Signed-off-by: Al Viro --- kernel/sysctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff5abcca5dd..911d846f050 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1516,9 +1516,9 @@ static int do_sysctl_strategy(struct ctl_table_root *root, int op = 0, rc; if (oldval) - op |= 004; + op |= MAY_READ; if (newval) - op |= 002; + op |= MAY_WRITE; if (sysctl_perm(root, table, op)) return -EPERM; @@ -1560,7 +1560,7 @@ repeat: if (n == table->ctl_name) { int error; if (table->child) { - if (sysctl_perm(root, table, 001)) + if (sysctl_perm(root, table, MAY_EXEC)) return -EPERM; name++; nlen--; @@ -1635,7 +1635,7 @@ static int test_perm(int mode, int op) mode >>= 6; else if (in_egroup_p(0)) mode >>= 3; - if ((mode & op & 0007) == op) + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; return -EACCES; } @@ -1645,7 +1645,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) int error; int mode; - error = security_sysctl(table, op); + error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); if (error) return error; -- cgit v1.2.3 From 7f2da1e7d0330395e5e9e350b879b98a1ea495df Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 May 2008 20:44:54 -0400 Subject: [PATCH] kill altroot long overdue... Signed-off-by: Al Viro --- kernel/exec_domain.c | 1 - kernel/exit.c | 2 -- kernel/fork.c | 7 ------- 3 files changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c1ef192aa65..0d407e88673 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -168,7 +168,6 @@ __set_personality(u_long personality) current->personality = personality; oep = current_thread_info()->exec_domain; current_thread_info()->exec_domain = ep; - set_fs_altroot(); module_put(oep->module); return 0; diff --git a/kernel/exit.c b/kernel/exit.c index 6cdf60712bd..0caf590548a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -565,8 +565,6 @@ void put_fs_struct(struct fs_struct *fs) if (atomic_dec_and_test(&fs->count)) { path_put(&fs->root); path_put(&fs->pwd); - if (fs->altroot.dentry) - path_put(&fs->altroot); kmem_cache_free(fs_cachep, fs); } } diff --git a/kernel/fork.c b/kernel/fork.c index abb3ed6298f..5e050c1317c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -657,13 +657,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old) path_get(&old->root); fs->pwd = old->pwd; path_get(&old->pwd); - if (old->altroot.dentry) { - fs->altroot = old->altroot; - path_get(&old->altroot); - } else { - fs->altroot.mnt = NULL; - fs->altroot.dentry = NULL; - } read_unlock(&old->lock); } return fs; -- cgit v1.2.3 From 3f8206d496e9e9495afb1d4e70d29712b4d403c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Jul 2008 03:46:43 -0400 Subject: [PATCH] get rid of indirect users of namei.h fs.h needs path.h, not namei.h; nfs_fs.h doesn't need it at all. Several places in the tree needed direct include. Signed-off-by: Al Viro --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 89bd6fb7894..657f8f8d93a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -45,6 +45,7 @@ #include #include #include +#include #include -- cgit v1.2.3 From bfbcf034798b2ca45338cee5049b5694b7ddc865 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Jul 2008 06:31:22 +0100 Subject: lost sysctl fix try_attach() should walk into the matching subdirectory, not the first one... Signed-off-by: Al Viro Tested-by: Valdis.Kletnieks@vt.edu Tested-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 911d846f050..fe471334727 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1680,43 +1680,45 @@ static __init int sysctl_init(void) core_initcall(sysctl_init); -static int is_branch_in(struct ctl_table *branch, struct ctl_table *table) +static struct ctl_table *is_branch_in(struct ctl_table *branch, + struct ctl_table *table) { struct ctl_table *p; const char *s = branch->procname; /* branch should have named subdirectory as its first element */ if (!s || !branch->child) - return 0; + return NULL; /* ... and nothing else */ if (branch[1].procname || branch[1].ctl_name) - return 0; + return NULL; /* table should contain subdirectory with the same name */ for (p = table; p->procname || p->ctl_name; p++) { if (!p->child) continue; if (p->procname && strcmp(p->procname, s) == 0) - return 1; + return p; } - return 0; + return NULL; } /* see if attaching q to p would be an improvement */ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) { struct ctl_table *to = p->ctl_table, *by = q->ctl_table; + struct ctl_table *next; int is_better = 0; int not_in_parent = !p->attached_by; - while (is_branch_in(by, to)) { + while ((next = is_branch_in(by, to)) != NULL) { if (by == q->attached_by) is_better = 1; if (to == p->attached_by) not_in_parent = 1; by = by->child; - to = to->child; + to = next->child; } if (is_better && not_in_parent) { -- cgit v1.2.3 From 605ccb73f6a1c891a16268b3a2923208fc637958 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sun, 27 Jul 2008 13:39:03 +0200 Subject: tracing: remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the following warning with CONFIG_TRACING=y: kernel/trace/trace.c: In function ‘s_next’: kernel/trace/trace.c:1186: warning: unused variable ‘last_ent’ Signed-off-by: Andrea Righi Signed-off-by: Linus Torvalds --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fc20e09a6cb..8f3fb3db61c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1183,7 +1183,6 @@ static void *find_next_entry_inc(struct trace_iterator *iter) static void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; - void *last_ent = iter->ent; int i = (int)*pos; void *ent; -- cgit v1.2.3 From 5995477ab7f3522c497c9c4a1c55373e9d655574 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sun, 27 Jul 2008 17:29:15 +0200 Subject: task IO accounting: improve code readability Put all i/o statistics in struct proc_io_accounting and use inline functions to initialize and increment statistics, removing a lot of single variable assignments. This also reduces the kernel size as following (with CONFIG_TASK_XACCT=y and CONFIG_TASK_IO_ACCOUNTING=y). text data bss dec hex filename 11651 0 0 11651 2d83 kernel/exit.o.before 11619 0 0 11619 2d63 kernel/exit.o.after 10886 132 136 11154 2b92 kernel/fork.o.before 10758 132 136 11026 2b12 kernel/fork.o.after 3082029 807968 4818600 8708597 84e1f5 vmlinux.o.before 3081869 807968 4818600 8708437 84e155 vmlinux.o.after Signed-off-by: Andrea Righi Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 30 +++--------------------------- kernel/fork.c | 15 ++------------- kernel/tsacct.c | 14 +++++++------- 3 files changed, 12 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 0caf590548a..eb4d6470d1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -121,18 +121,7 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); -#ifdef CONFIG_TASK_XACCT - sig->rchar += tsk->rchar; - sig->wchar += tsk->wchar; - sig->syscr += tsk->syscr; - sig->syscw += tsk->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - sig->ioac.read_bytes += tsk->ioac.read_bytes; - sig->ioac.write_bytes += tsk->ioac.write_bytes; - sig->ioac.cancelled_write_bytes += - tsk->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1363,21 +1352,8 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; -#ifdef CONFIG_TASK_XACCT - psig->rchar += p->rchar + sig->rchar; - psig->wchar += p->wchar + sig->wchar; - psig->syscr += p->syscr + sig->syscr; - psig->syscw += p->syscw + sig->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - psig->ioac.read_bytes += - p->ioac.read_bytes + sig->ioac.read_bytes; - psig->ioac.write_bytes += - p->ioac.write_bytes + sig->ioac.write_bytes; - psig->ioac.cancelled_write_bytes += - p->ioac.cancelled_write_bytes + - sig->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&psig->ioac, &p->ioac); + task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 5e050c1317c..8214ba7c8bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -806,12 +806,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; -#ifdef CONFIG_TASK_XACCT - sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; -#endif -#ifdef CONFIG_TASK_IO_ACCOUNTING - memset(&sig->ioac, 0, sizeof(sig->ioac)); -#endif + task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); @@ -994,13 +989,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->last_switch_timestamp = 0; #endif -#ifdef CONFIG_TASK_XACCT - p->rchar = 0; /* I/O counter: bytes read */ - p->wchar = 0; /* I/O counter: bytes written */ - p->syscr = 0; /* I/O counter: read syscalls */ - p->syscw = 0; /* I/O counter: write syscalls */ -#endif - task_io_accounting_init(p); + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 3da47ccdc5e..f9cd2561689 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->rchar; - stats->write_char = p->wchar; - stats->read_syscalls = p->syscr; - stats->write_syscalls = p->syscw; + stats->read_char = p->ioac.chr.rchar; + stats->write_char = p->ioac.chr.wchar; + stats->read_syscalls = p->ioac.chr.syscr; + stats->write_syscalls = p->ioac.chr.syscw; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.read_bytes; - stats->write_bytes = p->ioac.write_bytes; - stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; + stats->read_bytes = p->ioac.blk.read_bytes; + stats->write_bytes = p->ioac.blk.write_bytes; + stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes; #else stats->read_bytes = 0; stats->write_bytes = 0; -- cgit v1.2.3 From 940389b8afad6495211614c13eb91ef7001773ec Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 28 Jul 2008 00:48:12 +0200 Subject: task IO accounting: move all IO statistics in struct task_io_accounting Simplify the code of include/linux/task_io_accounting.h. It is also more reasonable to have all the task i/o-related statistics in a single struct (task_io_accounting). Signed-off-by: Andrea Righi Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index f9cd2561689..8ebcd8532df 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->ioac.chr.rchar; - stats->write_char = p->ioac.chr.wchar; - stats->read_syscalls = p->ioac.chr.syscr; - stats->write_syscalls = p->ioac.chr.syscw; + stats->read_char = p->ioac.rchar; + stats->write_char = p->ioac.wchar; + stats->read_syscalls = p->ioac.syscr; + stats->write_syscalls = p->ioac.syscw; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.blk.read_bytes; - stats->write_bytes = p->ioac.blk.write_bytes; - stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes; + stats->read_bytes = p->ioac.read_bytes; + stats->write_bytes = p->ioac.write_bytes; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; #else stats->read_bytes = 0; stats->write_bytes = 0; -- cgit v1.2.3 From 15bba37d62351749c3915add81f673b256952ee1 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 24 Jul 2008 15:41:48 +0100 Subject: module: fix build warning with !CONFIG_KALLSYMS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixed the warning: CC kernel/module.o /home/wangcong/Projects/linux-2.6/kernel/module.c:332: warning: ‘lookup_symbol’ defined but not used Signed-off-by: WANG Cong Signed-off-by: Rusty Russell --- kernel/module.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d8b5605132a..d861bd5b8c1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -325,18 +325,6 @@ static unsigned long find_symbol(const char *name, return -ENOENT; } -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; -} - /* Search for module by name: must hold module_mutex. */ static struct module *find_module(const char *name) { @@ -1703,6 +1691,19 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } #ifdef CONFIG_KALLSYMS + +/* lookup symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + const struct kernel_symbol *ks = start; + for (; ks < stop; ks++) + if (strcmp(ks->name, name) == 0) + return ks; + return NULL; +} + static int is_exported(const char *name, const struct module *mod) { if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) -- cgit v1.2.3 From 5c2aed622571ac7c3c6ec182d6d3c318e4b45c8b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 28 Feb 2008 11:33:03 -0500 Subject: stop_machine: add ALL_CPUS option -allow stop_mahcine_run() to call a function on all cpus. Calling stop_machine_run() with a 'ALL_CPUS' invokes this new behavior. stop_machine_run() proceeds as normal until the calling cpu has invoked 'fn'. Then, we tell all the other cpus to call 'fn'. Signed-off-by: Jason Baron Signed-off-by: Mathieu Desnoyers Signed-off-by: Rusty Russell CC: Adrian Bunk CC: Andi Kleen CC: Alexey Dobriyan CC: Christoph Hellwig CC: mingo@elte.hu CC: akpm@osdl.org --- kernel/stop_machine.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 738b411ff2d..a473bd0cb71 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -22,9 +22,17 @@ enum stopmachine_state { STOPMACHINE_WAIT, STOPMACHINE_PREPARE, STOPMACHINE_DISABLE_IRQ, + STOPMACHINE_RUN, STOPMACHINE_EXIT, }; +struct stop_machine_data { + int (*fn)(void *); + void *data; + struct completion done; + int run_all; +} smdata; + static enum stopmachine_state stopmachine_state; static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; @@ -33,6 +41,7 @@ static int stopmachine(void *cpu) { int irqs_disabled = 0; int prepared = 0; + int ran = 0; cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); set_cpus_allowed_ptr(current, cpumask); @@ -58,6 +67,11 @@ static int stopmachine(void *cpu) prepared = 1; smp_mb(); /* Must read state first. */ atomic_inc(&stopmachine_thread_ack); + } else if (stopmachine_state == STOPMACHINE_RUN && !ran) { + smdata.fn(smdata.data); + ran = 1; + smp_mb(); /* Must read state first. */ + atomic_inc(&stopmachine_thread_ack); } /* Yield in first stage: migration threads need to * help our sisters onto their CPUs. */ @@ -136,11 +150,10 @@ static void restart_machine(void) preempt_enable_no_resched(); } -struct stop_machine_data { - int (*fn)(void *); - void *data; - struct completion done; -}; +static void run_other_cpus(void) +{ + stopmachine_set_state(STOPMACHINE_RUN); +} static int do_stop(void *_smdata) { @@ -150,6 +163,8 @@ static int do_stop(void *_smdata) ret = stop_machine(); if (ret == 0) { ret = smdata->fn(smdata->data); + if (smdata->run_all) + run_other_cpus(); restart_machine(); } @@ -173,14 +188,17 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, struct stop_machine_data smdata; struct task_struct *p; + mutex_lock(&stopmachine_mutex); + smdata.fn = fn; smdata.data = data; + smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0; init_completion(&smdata.done); - mutex_lock(&stopmachine_mutex); + smp_wmb(); /* make sure other cpus see smdata updates */ /* If they don't care which CPU fn runs on, bind to any online one. */ - if (cpu == NR_CPUS) + if (cpu == NR_CPUS || cpu == ALL_CPUS) cpu = raw_smp_processor_id(); p = kthread_create(do_stop, &smdata, "kstopmachine"); -- cgit v1.2.3 From ffdb5976c47609c862917d4c186ecbb5706d2dda Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:28 -0500 Subject: Simplify stop_machine stop_machine creates a kthread which creates kernel threads. We can create those threads directly and simplify things a little. Some care must be taken with CPU hotunplug, which has special needs, but that code seems more robust than it was in the past. Signed-off-by: Rusty Russell Acked-by: Christian Borntraeger --- kernel/cpu.c | 13 +-- kernel/stop_machine.c | 293 +++++++++++++++++++++----------------------------- 2 files changed, 128 insertions(+), 178 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1004a..cf79bb91137 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -216,7 +216,6 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - struct task_struct *p; cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; @@ -250,19 +249,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); - p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - if (IS_ERR(p) || cpu_online(cpu)) { + if (err || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) BUG(); - if (IS_ERR(p)) { - err = PTR_ERR(p); - goto out_allowed; - } - goto out_thread; + goto out_allowed; } /* Wait for it to sleep (leaving idle task). */ @@ -279,8 +274,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_thread: - err = kthread_stop(p); out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a473bd0cb71..35882dccc94 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,4 +1,4 @@ -/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. +/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. * GPL v2 and any later version. */ #include @@ -13,220 +13,177 @@ #include #include -/* Since we effect priority and affinity (both of which are visible - * to, and settable by outside processes) we do indirection via a - * kthread. */ - -/* Thread to stop each CPU in user context. */ +/* This controls the threads on each CPU. */ enum stopmachine_state { - STOPMACHINE_WAIT, + /* Dummy starting state for thread. */ + STOPMACHINE_NONE, + /* Awaiting everyone to be scheduled. */ STOPMACHINE_PREPARE, + /* Disable interrupts. */ STOPMACHINE_DISABLE_IRQ, + /* Run the function */ STOPMACHINE_RUN, + /* Exit */ STOPMACHINE_EXIT, }; +static enum stopmachine_state state; struct stop_machine_data { int (*fn)(void *); void *data; - struct completion done; - int run_all; -} smdata; + int fnret; +}; -static enum stopmachine_state stopmachine_state; -static unsigned int stopmachine_num_threads; -static atomic_t stopmachine_thread_ack; +/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ +static unsigned int num_threads; +static atomic_t thread_ack; +static struct completion finished; +static DEFINE_MUTEX(lock); -static int stopmachine(void *cpu) +static void set_state(enum stopmachine_state newstate) { - int irqs_disabled = 0; - int prepared = 0; - int ran = 0; - cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - - set_cpus_allowed_ptr(current, cpumask); - - /* Ack: we are alive */ - smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ - atomic_inc(&stopmachine_thread_ack); - - /* Simple state machine */ - while (stopmachine_state != STOPMACHINE_EXIT) { - if (stopmachine_state == STOPMACHINE_DISABLE_IRQ - && !irqs_disabled) { - local_irq_disable(); - hard_irq_disable(); - irqs_disabled = 1; - /* Ack: irqs disabled. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_PREPARE - && !prepared) { - /* Everyone is in place, hold CPU. */ - preempt_disable(); - prepared = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_RUN && !ran) { - smdata.fn(smdata.data); - ran = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } - /* Yield in first stage: migration threads need to - * help our sisters onto their CPUs. */ - if (!prepared && !irqs_disabled) - yield(); - cpu_relax(); - } - - /* Ack: we are exiting. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - - if (irqs_disabled) - local_irq_enable(); - if (prepared) - preempt_enable(); - - return 0; + /* Reset ack counter. */ + atomic_set(&thread_ack, num_threads); + smp_wmb(); + state = newstate; } -/* Change the thread state */ -static void stopmachine_set_state(enum stopmachine_state state) +/* Last one to ack a state moves to the next state. */ +static void ack_state(void) { - atomic_set(&stopmachine_thread_ack, 0); - smp_wmb(); - stopmachine_state = state; - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - cpu_relax(); + if (atomic_dec_and_test(&thread_ack)) { + /* If we're the last one to ack the EXIT, we're finished. */ + if (state == STOPMACHINE_EXIT) + complete(&finished); + else + set_state(state + 1); + } } -static int stop_machine(void) +/* This is the actual thread which stops the CPU. It exits by itself rather + * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ +static int stop_cpu(struct stop_machine_data *smdata) { - int i, ret = 0; - - atomic_set(&stopmachine_thread_ack, 0); - stopmachine_num_threads = 0; - stopmachine_state = STOPMACHINE_WAIT; + enum stopmachine_state curstate = STOPMACHINE_NONE; + int uninitialized_var(ret); - for_each_online_cpu(i) { - if (i == raw_smp_processor_id()) - continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) - break; - stopmachine_num_threads++; - } - - /* Wait for them all to come to life. */ - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) { - yield(); + /* Simple state machine */ + do { + /* Chill out and ensure we re-read stopmachine_state. */ cpu_relax(); - } - - /* If some failed, kill them all. */ - if (ret < 0) { - stopmachine_set_state(STOPMACHINE_EXIT); - return ret; - } - - /* Now they are all started, make them hold the CPUs, ready. */ - preempt_disable(); - stopmachine_set_state(STOPMACHINE_PREPARE); - - /* Make them disable irqs. */ - local_irq_disable(); - hard_irq_disable(); - stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); - - return 0; -} + if (state != curstate) { + curstate = state; + switch (curstate) { + case STOPMACHINE_DISABLE_IRQ: + local_irq_disable(); + hard_irq_disable(); + break; + case STOPMACHINE_RUN: + /* |= allows error detection if functions on + * multiple CPUs. */ + smdata->fnret |= smdata->fn(smdata->data); + break; + default: + break; + } + ack_state(); + } + } while (curstate != STOPMACHINE_EXIT); -static void restart_machine(void) -{ - stopmachine_set_state(STOPMACHINE_EXIT); local_irq_enable(); - preempt_enable_no_resched(); + do_exit(0); } -static void run_other_cpus(void) +/* Callback for CPUs which aren't supposed to do anything. */ +static int chill(void *unused) { - stopmachine_set_state(STOPMACHINE_RUN); + return 0; } -static int do_stop(void *_smdata) +int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - struct stop_machine_data *smdata = _smdata; - int ret; + int i, err; + struct stop_machine_data active, idle; + struct task_struct **threads; + + active.fn = fn; + active.data = data; + active.fnret = 0; + idle.fn = chill; + idle.data = NULL; + + /* If they don't care which cpu fn runs on, just pick one. */ + if (cpu == NR_CPUS) + cpu = any_online_cpu(cpu_online_map); + + /* This could be too big for stack on large machines. */ + threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); + if (!threads) + return -ENOMEM; + + /* Set up initial state. */ + mutex_lock(&lock); + init_completion(&finished); + num_threads = num_online_cpus(); + set_state(STOPMACHINE_PREPARE); - ret = stop_machine(); - if (ret == 0) { - ret = smdata->fn(smdata->data); - if (smdata->run_all) - run_other_cpus(); - restart_machine(); - } + for_each_online_cpu(i) { + struct stop_machine_data *smdata; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - /* We're done: you can kthread_stop us now */ - complete(&smdata->done); + if (cpu == ALL_CPUS || i == cpu) + smdata = &active; + else + smdata = &idle; + + threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", + i); + if (IS_ERR(threads[i])) { + err = PTR_ERR(threads[i]); + threads[i] = NULL; + goto kill_threads; + } - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return ret; -} + /* Place it onto correct cpu. */ + kthread_bind(threads[i], i); -struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu) -{ - static DEFINE_MUTEX(stopmachine_mutex); - struct stop_machine_data smdata; - struct task_struct *p; + /* Make it highest prio. */ + if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, ¶m)) + BUG(); + } - mutex_lock(&stopmachine_mutex); + /* We've created all the threads. Wake them all: hold this CPU so one + * doesn't hit this CPU until we're ready. */ + cpu = get_cpu(); + for_each_online_cpu(i) + wake_up_process(threads[i]); - smdata.fn = fn; - smdata.data = data; - smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0; - init_completion(&smdata.done); + /* This will release the thread on our CPU. */ + put_cpu(); + wait_for_completion(&finished); + mutex_unlock(&lock); - smp_wmb(); /* make sure other cpus see smdata updates */ + kfree(threads); - /* If they don't care which CPU fn runs on, bind to any online one. */ - if (cpu == NR_CPUS || cpu == ALL_CPUS) - cpu = raw_smp_processor_id(); + return active.fnret; - p = kthread_create(do_stop, &smdata, "kstopmachine"); - if (!IS_ERR(p)) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; +kill_threads: + for_each_online_cpu(i) + if (threads[i]) + kthread_stop(threads[i]); + mutex_unlock(&lock); - /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - wake_up_process(p); - wait_for_completion(&smdata.done); - } - mutex_unlock(&stopmachine_mutex); - return p; + kfree(threads); + return err; } int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - struct task_struct *p; int ret; /* No CPUs can come up or down during this. */ get_online_cpus(); - p = __stop_machine_run(fn, data, cpu); - if (!IS_ERR(p)) - ret = kthread_stop(p); - else - ret = PTR_ERR(p); + ret = __stop_machine_run(fn, data, cpu); put_online_cpus(); return ret; -- cgit v1.2.3 From 04321587584272f4e8b9818f319f40caf8eeee13 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:29 -0500 Subject: Hotplug CPU: don't check cpu_online after take_cpu_down Akinobu points out that if take_cpu_down() succeeds, the cpu must be offline. Remove the cpu_online() check, and put a BUG_ON(). Quoting Akinobu Mita: Actually the cpu_online() check was necessary before appling this stop_machine: simplify patch. With old __stop_machine_run(), __stop_machine_run() could succeed (return !IS_ERR(p) value) even if take_cpu_down() returned non-zero value. The return value of take_cpu_down() was obtained through kthread_stop().. Signed-off-by: Rusty Russell Cc: "Akinobu Mita" --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index cf79bb91137..53cf508f975 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -250,8 +250,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) set_cpus_allowed_ptr(current, &tmp); err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - - if (err || cpu_online(cpu)) { + if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) @@ -259,6 +258,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_allowed; } + BUG_ON(cpu_online(cpu)); /* Wait for it to sleep (leaving idle task). */ while (!idle_cpu(cpu)) -- cgit v1.2.3 From eeec4fad963490821348a331cca6102ae1c4a7a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:30 -0500 Subject: stop_machine(): stop_machine_run() changed to use cpu mask Instead of a "cpu" arg with magic values NR_CPUS (any cpu) and ~0 (all cpus), pass a cpumask_t. Allow NULL for the common case (where we don't care which CPU the function is run on): temporary cpumask_t's are usually considered bad for stack space. This deprecates stop_machine_run, to be removed soon when all the callers are dead. Signed-off-by: Rusty Russell --- kernel/cpu.c | 3 ++- kernel/stop_machine.c | 27 +++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 53cf508f975..29510d68338 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -248,8 +248,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpus_setall(tmp); cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); + tmp = cpumask_of_cpu(cpu); - err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine(take_cpu_down, &tcd_param, &tmp); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 35882dccc94..e446c7c7d6a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -100,7 +100,7 @@ static int chill(void *unused) return 0; } -int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { int i, err; struct stop_machine_data active, idle; @@ -112,10 +112,6 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) idle.fn = chill; idle.data = NULL; - /* If they don't care which cpu fn runs on, just pick one. */ - if (cpu == NR_CPUS) - cpu = any_online_cpu(cpu_online_map); - /* This could be too big for stack on large machines. */ threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); if (!threads) @@ -128,13 +124,16 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) set_state(STOPMACHINE_PREPARE); for_each_online_cpu(i) { - struct stop_machine_data *smdata; + struct stop_machine_data *smdata = &idle; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - if (cpu == ALL_CPUS || i == cpu) - smdata = &active; - else - smdata = &idle; + if (!cpus) { + if (i == first_cpu(cpu_online_map)) + smdata = &active; + } else { + if (cpu_isset(i, *cpus)) + smdata = &active; + } threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", i); @@ -154,7 +153,7 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) /* We've created all the threads. Wake them all: hold this CPU so one * doesn't hit this CPU until we're ready. */ - cpu = get_cpu(); + get_cpu(); for_each_online_cpu(i) wake_up_process(threads[i]); @@ -177,15 +176,15 @@ kill_threads: return err; } -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { int ret; /* No CPUs can come up or down during this. */ get_online_cpus(); - ret = __stop_machine_run(fn, data, cpu); + ret = __stop_machine(fn, data, cpus); put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(stop_machine_run); +EXPORT_SYMBOL_GPL(stop_machine); -- cgit v1.2.3 From 9b1a4d38373a5581a4e01032a3ccdd94cd93477b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:30 -0500 Subject: stop_machine: Wean existing callers off stop_machine_run() Signed-off-by: Rusty Russell --- kernel/module.c | 8 ++++---- kernel/rcuclassic.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d861bd5b8c1..61d212120df 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -678,7 +678,7 @@ static int try_stop_module(struct module *mod, int flags, int *forced) if (flags & O_NONBLOCK) { struct stopref sref = { mod, flags, forced }; - return stop_machine_run(__try_stop_module, &sref, NR_CPUS); + return stop_machine(__try_stop_module, &sref, NULL); } else { /* We don't need to stop the machine for this. */ mod->state = MODULE_STATE_GOING; @@ -1416,7 +1416,7 @@ static int __unlink_module(void *_mod) static void free_module(struct module *mod) { /* Delete from various lists */ - stop_machine_run(__unlink_module, mod, NR_CPUS); + stop_machine(__unlink_module, mod, NULL); remove_notes_attrs(mod); remove_sect_attrs(mod); mod_kobject_remove(mod); @@ -2197,7 +2197,7 @@ static struct module *load_module(void __user *umod, /* Now sew it into the lists so we can get lockdep and oops * info during argument parsing. Noone should access us, since * strong_try_module_get() will fail. */ - stop_machine_run(__link_module, mod, NR_CPUS); + stop_machine(__link_module, mod, NULL); /* Size of section 0 is 0, so this works well if no params */ err = parse_args(mod->name, mod->args, @@ -2231,7 +2231,7 @@ static struct module *load_module(void __user *umod, return mod; unlink: - stop_machine_run(__unlink_module, mod, NR_CPUS); + stop_machine(__unlink_module, mod, NULL); module_arch_cleanup(mod); cleanup: kobject_del(&mod->mkobj.kobj); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 6f8696c502f..aad93cdc9f6 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -91,8 +91,8 @@ static void force_quiescent_state(struct rcu_data *rdp, * rdp->cpu is the current cpu. * * cpu_online_map is updated by the _cpu_down() - * using stop_machine_run(). Since we're in irqs disabled - * section, stop_machine_run() is not exectuting, hence + * using __stop_machine(). Since we're in irqs disabled + * section, __stop_machine() is not exectuting, hence * the cpu_online_map is stable. * * However, a cpu might have been offlined _just_ before -- cgit v1.2.3 From 784e2d76007f90d69341b95967160c4fb7829299 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:31 -0500 Subject: stop_machine: fix up ftrace.c Simple conversion. Signed-off-by: Rusty Russell Cc: Abhishek Sagar Cc: Ingo Molnar Cc: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4231a3dc224..f6e3af31b40 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -587,7 +587,7 @@ static int __ftrace_modify_code(void *data) static void ftrace_run_update_code(int command) { - stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); + stop_machine(__ftrace_modify_code, &command, NULL); } void ftrace_disable_daemon(void) @@ -787,7 +787,7 @@ static int ftrace_update_code(void) !ftrace_enabled || !ftraced_trigger) return 0; - stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + stop_machine(__ftrace_update_code, NULL, NULL); return 1; } @@ -1564,7 +1564,7 @@ static int __init ftrace_dynamic_init(void) addr = (unsigned long)ftrace_record_ip; - stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + stop_machine(ftrace_dyn_arch_init, &addr, NULL); /* ftrace_dyn_arch_init places the return code in addr */ if (addr) { -- cgit v1.2.3 From e56b3bc7942982ac2589c942fb345e38bc7a341a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Jul 2008 11:32:33 -0700 Subject: cpu masks: optimize and clean up cpumask_of_cpu() Clean up and optimize cpumask_of_cpu(), by sharing all the zero words. Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns creating a huge array of constant bitmasks, realize that the zero words can be shared. In other words, on a 64-bit architecture, we only ever need 64 of these arrays - with a different bit set in one single world (with enough zero words around it so that we can create any bitmask by just offsetting in that big array). And then we just put enough zeroes around it that we can point every single cpumask to be one of those things. So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each, with one bit set in each array - 2MB memory total), we have exactly 64 arrays instead, each 8k bits in size (64kB total). And then we just point cpumask(n) to the right position (which we can calculate dynamically). Once we have the right arrays, getting "cpumask(n)" ends up being: static inline const cpumask_t *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return (const cpumask_t *)p; } This brings other advantages and simplifications as well: - we are not wasting memory that is just filled with a single bit in various different places - we don't need all those games to re-create the arrays in some dense format, because they're already going to be dense enough. if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory is a non-issue (especially since by doing this "overlapping" trick we probably get better cache behaviour anyway). [ mingo@elte.hu: Converted Linus's mails into a commit. See: http://lkml.org/lkml/2008/7/27/156 http://lkml.org/lkml/2008/7/28/320 Also applied a family filter - which also has the side-effect of leaving out the bits where Linus calls me an idio... Oh, never mind ;-) ] Signed-off-by: Ingo Molnar Cc: Rusty Russell Cc: Andrew Morton Cc: Al Viro Cc: Mike Travis Signed-off-by: Ingo Molnar --- kernel/cpu.c | 128 ++++++++++------------------------------------------------- 1 file changed, 20 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a35d8995dc8..06a8358bb41 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,115 +462,27 @@ out: #endif /* CONFIG_SMP */ -/* 64 bits of zeros, for initializers. */ -#if BITS_PER_LONG == 32 -#define Z64 0, 0 -#else -#define Z64 0 -#endif +/* + * cpu_bit_bitmap[] is a special, "compressed" data structure that + * represents all NR_CPUS bits binary values of 1< 4 - CMI0(4), CMI0(5), CMI0(6), CMI0(7), -#endif -#if NR_CPUS > 8 - CMI0(8), CMI0(9), CMI0(10), CMI0(11), - CMI0(12), CMI0(13), CMI0(14), CMI0(15), -#endif -#if NR_CPUS > 16 - CMI0(16), CMI0(17), CMI0(18), CMI0(19), - CMI0(20), CMI0(21), CMI0(22), CMI0(23), - CMI0(24), CMI0(25), CMI0(26), CMI0(27), - CMI0(28), CMI0(29), CMI0(30), CMI0(31), -#endif -#if NR_CPUS > 32 -#if BITS_PER_LONG == 32 - CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0), - CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0), - CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0), - CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0), - CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0), - CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0), - CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0), - CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0), -#else - CMI0(32), CMI0(33), CMI0(34), CMI0(35), - CMI0(36), CMI0(37), CMI0(38), CMI0(39), - CMI0(40), CMI0(41), CMI0(42), CMI0(43), - CMI0(44), CMI0(45), CMI0(46), CMI0(47), - CMI0(48), CMI0(49), CMI0(50), CMI0(51), - CMI0(52), CMI0(53), CMI0(54), CMI0(55), - CMI0(56), CMI0(57), CMI0(58), CMI0(59), - CMI0(60), CMI0(61), CMI0(62), CMI0(63), -#endif /* BITS_PER_LONG == 64 */ -#endif -#if NR_CPUS > 64 - CMI64(64, Z64), -#endif -#if NR_CPUS > 128 - CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64), -#endif -#if NR_CPUS > 256 - CMI256(256, Z256), -#endif -#if NR_CPUS > 512 - CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256), -#endif -#if NR_CPUS > 1024 - CMI1024(1024, Z1024), -#endif -#if NR_CPUS > 2048 - CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024), -#endif -#if NR_CPUS > 4096 -#error NR_CPUS too big. Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP +const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { + + MASK_DECLARE_8(0), MASK_DECLARE_8(8), + MASK_DECLARE_8(16), MASK_DECLARE_8(24), +#if BITS_PER_LONG > 32 + MASK_DECLARE_8(32), MASK_DECLARE_8(40), + MASK_DECLARE_8(48), MASK_DECLARE_8(56), #endif }; - -const cpumask_t *cpumask_of_cpu_map = cpumask_map; - -EXPORT_SYMBOL_GPL(cpumask_of_cpu_map); +EXPORT_SYMBOL_GPL(cpu_bit_bitmap); -- cgit v1.2.3 From cddb8a5c14aa89810b40495d94d3d2a0faee6619 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jul 2008 15:46:29 -0700 Subject: mmu-notifiers: core With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages. There are secondary MMUs (with secondary sptes and secondary tlbs) too. sptes in the kvm case are shadow pagetables, but when I say spte in mmu-notifier context, I mean "secondary pte". In GRU case there's no actual secondary pte and there's only a secondary tlb because the GRU secondary MMU has no knowledge about sptes and every secondary tlb miss event in the MMU always generates a page fault that has to be resolved by the CPU (this is not the case of KVM where the a secondary tlb miss will walk sptes in hardware and it will refill the secondary tlb transparently to software if the corresponding spte is present). The same way zap_page_range has to invalidate the pte before freeing the page, the spte (and secondary tlb) must also be invalidated before any page is freed and reused. Currently we take a page_count pin on every page mapped by sptes, but that means the pages can't be swapped whenever they're mapped by any spte because they're part of the guest working set. Furthermore a spte unmap event can immediately lead to a page to be freed when the pin is released (so requiring the same complex and relatively slow tlb_gather smp safe logic we have in zap_page_range and that can be avoided completely if the spte unmap event doesn't require an unpin of the page previously mapped in the secondary MMU). The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know when the VM is swapping or freeing or doing anything on the primary MMU so that the secondary MMU code can drop sptes before the pages are freed, avoiding all page pinning and allowing 100% reliable swapping of guest physical address space. Furthermore it avoids the code that teardown the mappings of the secondary MMU, to implement a logic like tlb_gather in zap_page_range that would require many IPI to flush other cpu tlbs, for each fixed number of spte unmapped. To make an example: if what happens on the primary MMU is a protection downgrade (from writeable to wrprotect) the secondary MMU mappings will be invalidated, and the next secondary-mmu-page-fault will call get_user_pages and trigger a do_wp_page through get_user_pages if it called get_user_pages with write=1, and it'll re-establishing an updated spte or secondary-tlb-mapping on the copied page. Or it will setup a readonly spte or readonly tlb mapping if it's a guest-read, if it calls get_user_pages with write=0. This is just an example. This allows to map any page pointed by any pte (and in turn visible in the primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an full MMU with both sptes and secondary-tlb like the shadow-pagetable layer with kvm), or a remote DMA in software like XPMEM (hence needing of schedule in XPMEM code to send the invalidate to the remote node, while no need to schedule in kvm/gru as it's an immediate event like invalidating primary-mmu pte). At least for KVM without this patch it's impossible to swap guests reliably. And having this feature and removing the page pin allows several other optimizations that simplify life considerably. Dependencies: 1) mm_take_all_locks() to register the mmu notifier when the whole VM isn't doing anything with "mm". This allows mmu notifier users to keep track if the VM is in the middle of the invalidate_range_begin/end critical section with an atomic counter incraese in range_begin and decreased in range_end. No secondary MMU page fault is allowed to map any spte or secondary tlb reference, while the VM is in the middle of range_begin/end as any page returned by get_user_pages in that critical section could later immediately be freed without any further ->invalidate_page notification (invalidate_range_begin/end works on ranges and ->invalidate_page isn't called immediately before freeing the page). To stop all page freeing and pagetable overwrites the mmap_sem must be taken in write mode and all other anon_vma/i_mmap locks must be taken too. 2) It'd be a waste to add branches in the VM if nobody could possibly run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of mmu notifiers, but this already allows to compile a KVM external module against a kernel with mmu notifiers enabled and from the next pull from kvm.git we'll start using them. And GRU/XPMEM will also be able to continue the development by enabling KVM=m in their config, until they submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n). This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM are all =n. The mmu_notifier_register call can fail because mm_take_all_locks may be interrupted by a signal and return -EINTR. Because mmu_notifier_reigster is used when a driver startup, a failure can be gracefully handled. Here an example of the change applied to kvm to register the mmu notifiers. Usually when a driver startups other allocations are required anyway and -ENOMEM failure paths exists already. struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + int err; if (!kvm) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops; + err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm); + if (err) { + kfree(kvm); + return ERR_PTR(err); + } + return kvm; } mmu_notifier_unregister returns void and it's reliable. The patch also adds a few needed but missing includes that would prevent kernel to compile after these changes on non-x86 archs (x86 didn't need them by luck). [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix mm/filemap_xip.c build] [akpm@linux-foundation.org: fix mm/mmu_notifier.c build] Signed-off-by: Andrea Arcangeli Signed-off-by: Nick Piggin Signed-off-by: Christoph Lameter Cc: Jack Steiner Cc: Robin Holt Cc: Nick Piggin Cc: Peter Zijlstra Cc: Kanoj Sarcar Cc: Roland Dreier Cc: Steve Wise Cc: Avi Kivity Cc: Hugh Dickins Cc: Rusty Russell Cc: Anthony Liguori Cc: Chris Wright Cc: Marcelo Tosatti Cc: Eric Dumazet Cc: "Paul E. McKenney" Cc: Izik Eidus Cc: Anthony Liguori Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8214ba7c8bb..7ce2ebe8479 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -414,6 +415,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; + mmu_notifier_mm_init(mm); return mm; } @@ -446,6 +448,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + mmu_notifier_mm_destroy(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); -- cgit v1.2.3