From a9322f6488b432ddc1e89be88242c827c633fb63 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Wed, 11 Jun 2008 16:35:14 +0200 Subject: x86, pci: introduce pci=noioapicquirk kernel cmdline option Introduce pci=noioapicquirk kernel cmdline option to disable all boot interrupt quirks Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Signed-off-by: Ingo Molnar --- arch/x86/pci/common.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 940185ecaed..bc6a101ed7e 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | static int pci_bf_sort; int pci_routeirq; +int noioapicquirk; int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; @@ -495,6 +496,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "skip_isa_align")) { pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; return NULL; + } else if (!strcmp(str, "noioapicquirk")) { + noioapicquirk = 1; + return NULL; } return str; } -- cgit v1.2.3 From 9197979b518573999d52d9e85bce1680682ed85c Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Wed, 11 Jun 2008 16:35:15 +0200 Subject: x86, pci: introduce pci=ioapicreroute kernel cmdline option Introduce pci=ioapicreroute kernel cmdline option to enable rerouting of boot interrupts to the primary io-apic. Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Signed-off-by: Ingo Molnar --- arch/x86/pci/common.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index bc6a101ed7e..0a9eaa736d9 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -23,6 +23,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | static int pci_bf_sort; int pci_routeirq; int noioapicquirk; +int noioapicreroute = 1; int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; @@ -499,6 +500,10 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "noioapicquirk")) { noioapicquirk = 1; return NULL; + } else if (!strcmp(str, "ioapicreroute")) { + if (noioapicreroute != -1) + noioapicreroute = 0; + return NULL; } return str; } -- cgit v1.2.3 From 41b9eb264c8407655db57b60b4457fe1b2ec9977 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Tue, 15 Jul 2008 13:48:55 +0200 Subject: x86, pci: introduce config option for pci reroute quirks (was: [PATCH 0/3] Boot IRQ quirks for Broadcom and AMD/ATI) This is against linux-2.6-tip, branch pci-ioapic-boot-irq-quirks. From: Stefan Assmann Subject: Introduce config option for pci reroute quirks The config option X86_REROUTE_FOR_BROKEN_BOOT_IRQS is introduced to enable (or disable) the redirection of the interrupt handler to the boot interrupt line by default. Depending on the existence of interrupt masking / threaded interrupt handling in the kernel (vanilla, rt, ...) and the maturity of the rerouting patch, users can enable or disable the redirection by default. This means that the reroute quirk can be applied to any kernel without changing it. Interrupt sharing could be increased if this option is enabled. However this option is vital for threaded interrupt handling, as done by the RT kernel. It should simplify the consolidation with the RT kernel. The option can be overridden by either pci=ioapicreroute or pci=noioapicreroute. Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Cc: Jesse Barnes Cc: Jon Masters Cc: Ihno Krumreich Cc: Sven Dietrich Cc: Daniel Gollub Cc: Felix Foerster Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 24 ++++++++++++++++++++++++ arch/x86/pci/common.c | 8 ++++++++ 2 files changed, 32 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 96e0c2ebc38..09521332636 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -665,6 +665,30 @@ config X86_VISWS_APIC def_bool y depends on X86_32 && X86_VISWS +config X86_REROUTE_FOR_BROKEN_BOOT_IRQS + bool "Reroute for broken boot IRQs" + default n + depends on X86_IO_APIC + help + This option enables a workaround that fixes a source of + spurious interrupts. This is recommended when threaded + interrupt handling is used on systems where the generation of + superfluous "boot interrupts" cannot be disabled. + + Some chipsets generate a legacy INTx "boot IRQ" when the IRQ + entry in the chipset's IO-APIC is masked (as, e.g. the RT + kernel does during interrupt handling). On chipsets where this + boot IRQ generation cannot be disabled, this workaround keeps + the original IRQ line masked so that only the equivalent "boot + IRQ" is delivered to the CPUs. The workaround also tells the + kernel to set up the IRQ handler on the boot IRQ line. In this + way only one interrupt is delivered to the kernel. Otherwise + the spurious second interrupt may cause the kernel to bring + down (vital) interrupt lines. + + Only affects "broken" chipsets. Interrupt sharing may be + increased on these systems. + config X86_MCE bool "Machine Check Exception" depends on !X86_VOYAGER diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 1485a26ddce..bb1a01f089e 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -24,7 +24,11 @@ unsigned int pci_early_dump_regs; static int pci_bf_sort; int pci_routeirq; int noioapicquirk; +#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS +int noioapicreroute = 0; +#else int noioapicreroute = 1; +#endif int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; @@ -528,6 +532,10 @@ char * __devinit pcibios_setup(char *str) if (noioapicreroute != -1) noioapicreroute = 0; return NULL; + } else if (!strcmp(str, "noioapicreroute")) { + if (noioapicreroute != -1) + noioapicreroute = 1; + return NULL; } return str; } -- cgit v1.2.3 From b0f209898f1a177bd503d49215b8c6628797a81c Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Tue, 21 Oct 2008 14:09:51 -0500 Subject: x86, uv: use consistent names for region size and conherence id on x86 and ia64 Use consistent names for region size and conherence id on x86 and ia64. The SGI xp drivers are used on both ia64 and x86. Using the same names (sn_coherency_id, sn_region_size) simplies the driver code. Signed-off-by: Russ Anderson Signed-off-by: Ingo Molnar --- arch/x86/kernel/bios_uv.c | 8 ++++---- arch/x86/kernel/genx2apic_uv_x.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index f0dfe6f17e7..7cefb7170e7 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -69,10 +69,10 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); -long uv_coherency_id; -EXPORT_SYMBOL_GPL(uv_coherency_id); -long uv_region_size; -EXPORT_SYMBOL_GPL(uv_region_size); +long sn_coherency_id; +EXPORT_SYMBOL_GPL(sn_coherency_id); +long sn_region_size; +EXPORT_SYMBOL_GPL(sn_region_size); int uv_type; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index bfd532843df..6cf35c8bd63 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -429,7 +429,7 @@ void __init uv_system_init(void) uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &uv_coherency_id, &uv_region_size); + &sn_coherency_id, &sn_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@ -451,7 +451,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; + uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); -- cgit v1.2.3 From 9e899816d126cc6f7d405c349f65363214fe7399 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Oct 2008 12:33:16 +0200 Subject: x86, mm: enable GBPAGES option by default DIRECT_GBPAGES was under DEBUG_KERNEL && EXPERIMENTAL and disabled by default. Turn it on by default and put it under EMBEDDED. Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 9 +++++++++ arch/x86/Kconfig.debug | 12 ------------ 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5b9b12321ad..c00aefcb47d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -946,6 +946,15 @@ config X86_PAE config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE +config DIRECT_GBPAGES + bool "Enable 1GB pages for kernel pagetables" if EMBEDDED + default y + depends on X86_64 + help + Allow the kernel linear mapping to use 1GB pages on CPUs that + support it. This can improve the kernel's performance a tiny bit by + reducing TLB pressure. If in doubt, say "Y". + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd5e67..567fe543e09 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -114,18 +114,6 @@ config DEBUG_RODATA data. This is recommended so that we can catch kernel bugs sooner. If in doubt, say "Y". -config DIRECT_GBPAGES - bool "Enable gbpages-mapped kernel pagetables" - depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64 - help - Enable gigabyte pages support (if the CPU supports it). This can - improve the kernel's performance a tiny bit by reducing TLB - pressure. - - This is experimental code. - - If in doubt, say "N". - config DEBUG_RODATA_TEST bool "Testcase for the DEBUG_RODATA feature" depends on DEBUG_RODATA -- cgit v1.2.3 From b4b8f87bf4958cbad620654efc0882ac46c19846 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:08 +0200 Subject: i386, dumpstack: move crash_kexec before bust_spinlocks(0) in oops_end crash_kexec should not be called with console_sem held. Move the call before bust_spinlocks(0) in oops_end to avoid the problem. Signed-off-by: Alexander van Heukelum Acked-by: "Neil Horman" Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b3614752197..5493d31be4e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -309,6 +309,9 @@ unsigned __kprobes long oops_begin(void) void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + bust_spinlocks(0); die_owner = -1; add_taint(TAINT_DIE); @@ -318,8 +321,6 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (!regs) return; - if (kexec_should_crash(current)) - crash_kexec(regs); if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) -- cgit v1.2.3 From 874d93d11823b2b861addac6a5dc31162e924ab2 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:09 +0200 Subject: x86, dumpstack: let signr=0 signal no do_exit Change oops_end such that signr=0 signals that do_exit is not to be called. Currently, each use of __die is soon followed by a call to oops_end and 'regs' is set to NULL if oops_end is expected not to call do_exit. Change all such pairs to set signr=0 instead. On x86_64 oops_end is used 'bare' in die_nmi; use signr=0 instead of regs=NULL there, too. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 7 ++++--- arch/x86/kernel/dumpstack_64.c | 9 +++++---- arch/x86/mm/fault.c | 11 +++++++---- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5493d31be4e..7c22f99f0ef 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -318,7 +318,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - if (!regs) + if (!signr) return; if (in_interrupt()) @@ -371,17 +371,18 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); + int sig = SIGSEGV; if (die_nest_count < 3) { report_bug(regs->ip, regs); if (__die(str, regs, err)) - regs = NULL; + sig = 0; } else { printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); } - oops_end(flags, regs, SIGSEGV); + oops_end(flags, regs, sig); } static DEFINE_SPINLOCK(nmi_print_lock); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 96a5db7da8a..ffefea611ba 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -465,7 +465,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) /* Nest count reaches zero, release the lock. */ __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - if (!regs) { + if (!signr) { oops_exit(); return; } @@ -509,13 +509,14 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); + int sig = SIGSEGV; if (!user_mode(regs)) report_bug(regs->ip, regs); if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); + sig = 0; + oops_end(flags, regs, sig); } notrace __kprobes void @@ -539,7 +540,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) crash_kexec(regs); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); + oops_end(flags, regs, 0); nmi_exit(); local_irq_enable(); do_exit(SIGBUS); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 31e8730fa24..20ef272c412 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { unsigned long flags = oops_begin(); + int sig = SIGKILL; struct task_struct *tsk; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", @@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) - regs = NULL; - oops_end(flags, regs, SIGKILL); + sig = 0; + oops_end(flags, regs, sig); } #endif @@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) int fault; #ifdef CONFIG_X86_64 unsigned long flags; + int sig; #endif tsk = current; @@ -849,11 +851,12 @@ no_context: bust_spinlocks(0); do_exit(SIGKILL); #else + sig = SIGKILL; if (__die("Oops", regs, error_code)) - regs = NULL; + sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, SIGKILL); + oops_end(flags, regs, sig); #endif /* -- cgit v1.2.3 From 0ed7a498f416dcfa1cca478a559238a2a3396240 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:10 +0200 Subject: x86_64, dumpstack: move kexec_crash from __die to oops_end oops_end is preceded by either a call to __die, or a conditional call to crash_kexec. Move the conditional call to crash_kexec from the end of __die to the start of oops_end and remove the superfluous call to crash_kexec in die_nmi. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index ffefea611ba..57ce11b895c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -458,6 +458,9 @@ unsigned __kprobes long oops_begin(void) void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + die_owner = -1; bust_spinlocks(0); die_nest_count--; @@ -501,8 +504,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk(KERN_ALERT "RIP "); printk_address(regs->ip, 1); printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); return 0; } @@ -536,11 +537,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); + oops_end(flags, regs, 0); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - oops_end(flags, regs, 0); nmi_exit(); local_irq_enable(); do_exit(SIGBUS); -- cgit v1.2.3 From 10b14cb7eb7dd5bff8023f76a55c8ac20e586128 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:11 +0200 Subject: x86, dumpstack: always call oops_exit from oops_end Always call oops_exit from oops_end, even if signr==0. Also, move add_taint(TAINT_DIE) from __die to oops_end on x86_64 and interchange two lines to make oops_end more similar to the i386-version. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 2 +- arch/x86/kernel/dumpstack_64.c | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 7c22f99f0ef..a29b88ffa34 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -318,6 +318,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); + oops_exit(); if (!signr) return; @@ -325,7 +326,6 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - oops_exit(); do_exit(signr); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 57ce11b895c..dc6162bf745 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -461,22 +461,22 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (regs && kexec_should_crash(current)) crash_kexec(regs); - die_owner = -1; bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - if (!signr) { - oops_exit(); + oops_exit(); + + if (!signr) return; - } if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - oops_exit(); do_exit(signr); } @@ -499,7 +499,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) return 1; show_registers(regs); - add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ printk(KERN_ALERT "RIP "); printk_address(regs->ip, 1); -- cgit v1.2.3 From e4955cfd2f5c81eb708f55769aa60173f207fd63 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:12 +0200 Subject: i386, dumpstack: use x86_64's method to account die_nest_count oops_begin/oops_end should always be used in pairs. On x86_64 oops_begin increments die_nest_count, and oops_end decrements die_nest_count. Doing this makes oops_begin and oops_end equal to the x86_64 versions. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index a29b88ffa34..7c7d691b32b 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -289,21 +289,24 @@ static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { + int cpu; unsigned long flags; oops_enter(); - if (die_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die_lock); - die_owner = smp_processor_id(); - die_nest_count = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); } die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); return flags; } @@ -315,13 +318,15 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) bust_spinlocks(0); die_owner = -1; add_taint(TAINT_DIE); - __raw_spin_unlock(&die_lock); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - oops_exit(); + if (!signr) return; - if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) -- cgit v1.2.3 From e06ca430c3d0fddbd1c901ab3fb3e1f0bc8a786b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:13 +0200 Subject: i386, dumpstack: use oops_begin/oops_end in die_nmi Use oops_begin and oops_end in die_nmi. Whitespace-only changes on x86_64, to make it equal to i386's version. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 33 +++++++++++---------------------- arch/x86/kernel/dumpstack_64.c | 4 ++-- 2 files changed, 13 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 7c7d691b32b..e91ae34f968 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -390,40 +390,29 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static DEFINE_SPINLOCK(nmi_print_lock); - void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { + unsigned long flags; + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; - spin_lock(&nmi_print_lock); /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); + * We are in trouble anyway, lets at least try + * to get a message out. + */ + flags = oops_begin(); printk(KERN_EMERG "%s", str); printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); - if (do_panic) + oops_end(flags, regs, 0); + if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - bust_spinlocks(0); - do_exit(SIGSEGV); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); } static int __init oops_setup(char *s) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index dc6162bf745..831e1e159cb 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -519,7 +519,7 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -notrace __kprobes void +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags; @@ -527,11 +527,11 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; - flags = oops_begin(); /* * We are in trouble anyway, lets at least try * to get a message out. */ + flags = oops_begin(); printk(KERN_EMERG "%s", str); printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); -- cgit v1.2.3 From 871d3779cba18b028e34d0d2f6cc6caae76a97b6 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:14 +0200 Subject: i386, dumpstack: unify die() Make i386's die() equal to x86_64's version. Whitespace-only changes on x86_64, to make it equal to i386's version. (user_mode and user_mode_vm are equal on x86_64.) Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 10 +++------- arch/x86/kernel/dumpstack_64.c | 6 +++++- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e91ae34f968..f2046c5752d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -378,15 +378,11 @@ void die(const char *str, struct pt_regs *regs, long err) unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (die_nest_count < 3) { + if (!user_mode_vm(regs)) report_bug(regs->ip, regs); - if (__die(str, regs, err)) - sig = 0; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - + if (__die(str, regs, err)) + sig = 0; oops_end(flags, regs, sig); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 831e1e159cb..28c67aae556 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -506,12 +506,16 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) return 0; } +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (!user_mode(regs)) + if (!user_mode_vm(regs)) report_bug(regs->ip, regs); if (__die(str, regs, err)) -- cgit v1.2.3 From 63fb70859f987f2b3b8028fa467fd63336315e9c Mon Sep 17 00:00:00 2001 From: Sitsofe Wheeler Date: Sat, 11 Oct 2008 20:27:53 +0100 Subject: x86: change OPTIMIZE_INLINING help to say enabling makes smaller kernels Impact: clarify Kconfig help text The OPTIMIZE_INLINING help currently says "The gcc 4.x series have a rewritten inlining algorithm and disabling this option will generate a smaller kernel there." This contradicts other parts of the help text and my own tests: 5463127 2008-10-11 19:51 vmlinux.no-opt 5456152 2008-10-11 19:56 vmlinux.opt Reword text to say that enabling OPTIMIZE_INLINING will lead to smaller kernels with gcc 4.x or later. Signed-off-by: Sitsofe Wheeler Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd5e67..2be1e6b8e18 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -307,10 +307,10 @@ config OPTIMIZE_INLINING developers have marked 'inline'. Doing so takes away freedom from gcc to do what it thinks is best, which is desirable for the gcc 3.x series of compilers. The gcc 4.x series have a rewritten inlining algorithm and - disabling this option will generate a smaller kernel there. Hopefully - this algorithm is so good that allowing gcc4 to make the decision can - become the default in the future, until then this option is there to - test gcc for this. + enabling this option will generate a smaller kernel there. Hopefully + this algorithm is so good that allowing gcc 4.x and above to make the + decision will become the default in the future. Until then this option + is there to test gcc for this. If unsure, say N. -- cgit v1.2.3 From 7f1baa063e2582dd52d83bb31508e9e84468c666 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 24 Oct 2008 15:24:29 -0700 Subject: x86/uv: provide a System Activity Indicator driver Impact: start per CPU heartbeat LED timers on SGI UV systems The SGI UV system has no LEDS but uses one of the system controller regs to indicate the online internal state of the cpu. There is a heartbeat bit indicating that the cpu is responding to interrupts, and an idle bit indicating whether the cpu is idle when the heartbeat interrupt occurs. The current period is one second. When a cpu panics, an error code is written by BIOS to this same reg. This patchset provides the following: * x86_64: Add base functionality for writing to the specific SCIR's for each cpu. * heartbeat: Invert "heartbeat" bit to indicate the cpu is "interruptible". If the current thread is the idle thread, then indicate system is "idle". * if hotplug enabled, all bits are set (0xff) when the cpu is disabled. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 63 +++++++++++++++++++++++- arch/x86/kernel/genx2apic_uv_x.c | 102 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index c6ad93e315c..400776dba9b 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -112,6 +112,16 @@ */ #define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) +struct uv_scir_s { + struct timer_list timer; + unsigned long offset; + unsigned long last; + unsigned long idle_on; + unsigned long idle_off; + unsigned char state; + unsigned char enabled; +}; + /* * The following defines attributes of the HUB chip. These attributes are * frequently referenced and are kept in the per-cpu data areas of each cpu. @@ -130,7 +140,9 @@ struct uv_hub_info_s { unsigned char blade_processor_id; unsigned char m_val; unsigned char n_val; + struct uv_scir_s scir; }; + DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) @@ -162,6 +174,30 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_APIC_PNODE_SHIFT 6 +/* Local Bus from cpu's perspective */ +#define LOCAL_BUS_BASE 0x1c00000 +#define LOCAL_BUS_SIZE (4 * 1024 * 1024) + +/* + * System Controller Interface Reg + * + * Note there are NO leds on a UV system. This register is only + * used by the system controller to monitor system-wide operation. + * There are 64 regs per node. With Nahelem cpus (2 cores per node, + * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on + * a node. + * + * The window is located at top of ACPI MMR space + */ +#define SCIR_WINDOW_COUNT 64 +#define SCIR_LOCAL_MMR_BASE (LOCAL_BUS_BASE + \ + LOCAL_BUS_SIZE - \ + SCIR_WINDOW_COUNT) + +#define SCIR_CPU_HEARTBEAT 0x01 /* timer interrupt */ +#define SCIR_CPU_ACTIVITY 0x02 /* not idle */ +#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ + /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. @@ -276,6 +312,16 @@ static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) *uv_local_mmr_address(offset) = val; } +static inline unsigned char uv_read_local_mmr8(unsigned long offset) +{ + return *((unsigned char *)uv_local_mmr_address(offset)); +} + +static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) +{ + *((unsigned char *)uv_local_mmr_address(offset)) = val; +} + /* * Structures and definitions for converting between cpu, node, pnode, and blade * numbers. @@ -350,5 +396,20 @@ static inline int uv_num_possible_blades(void) return uv_possible_blades; } -#endif /* _ASM_X86_UV_UV_HUB_H */ +/* Update SCIR state */ +static inline void uv_set_scir_bits(unsigned char value) +{ + if (uv_hub_info->scir.state != value) { + uv_hub_info->scir.state = value; + uv_write_local_mmr8(uv_hub_info->scir.offset, value); + } +} +static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) +{ + if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_cpu_hub_info(cpu)->scir.state = value; + uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); + } +} +#endif /* _ASM_X86_UV_UV_HUB_H */ diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index dc6b4696152..84367d84bb1 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -356,6 +359,103 @@ static __init void uv_rtc_init(void) sn_rtc_cycles_per_second = ticks_per_sec; } +/* + * percpu heartbeat timer + */ +static void uv_heartbeat(unsigned long ignored) +{ + struct timer_list *timer = &uv_hub_info->scir.timer; + unsigned char bits = uv_hub_info->scir.state; + + /* flip heartbeat bit */ + bits ^= SCIR_CPU_HEARTBEAT; + + /* are we the idle thread? */ + if (current->pid == 0) + bits &= ~SCIR_CPU_ACTIVITY; + else + bits |= SCIR_CPU_ACTIVITY; + + /* update system controller interface reg */ + uv_set_scir_bits(bits); + + /* enable next timer period */ + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); +} + +static void __cpuinit uv_heartbeat_enable(int cpu) +{ + if (!uv_cpu_hub_info(cpu)->scir.enabled) { + struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); + setup_timer(timer, uv_heartbeat, cpu); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_hub_info(cpu)->scir.enabled = 1; + } + + /* check boot cpu */ + if (!uv_cpu_hub_info(0)->scir.enabled) + uv_heartbeat_enable(0); +} + +static void __cpuinit uv_heartbeat_disable(int cpu) +{ + if (uv_cpu_hub_info(cpu)->scir.enabled) { + uv_cpu_hub_info(cpu)->scir.enabled = 0; + del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + } + uv_set_cpu_scir_bits(cpu, 0xff); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * cpu hotplug notifier + */ +static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + uv_heartbeat_enable(cpu); + break; + case CPU_DOWN_PREPARE: + uv_heartbeat_disable(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static __init void uv_scir_register_cpu_notifier(void) +{ + hotcpu_notifier(uv_scir_cpu_notify, 0); +} + +#else /* !CONFIG_HOTPLUG_CPU */ + +static __init void uv_scir_register_cpu_notifier(void) +{ +} + +static __init int uv_init_heartbeat(void) +{ + int cpu; + + if (is_uv_system()) + for_each_online_cpu(cpu) + uv_heartbeat_enable(cpu); + return 0; +} + +late_initcall(uv_init_heartbeat); + +#endif /* !CONFIG_HOTPLUG_CPU */ + /* * Called on each cpu to initialize the per_cpu UV data area. * ZZZ hotplug not supported yet @@ -452,6 +552,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; + uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); @@ -468,4 +569,5 @@ void __init uv_system_init(void) map_mmioh_high(max_pnode); uv_cpu_init(); + uv_scir_register_cpu_notifier(); } -- cgit v1.2.3 From fd3fdf11d3c649769e02459c5f1b8081a15e9007 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Fri, 24 Oct 2008 20:08:11 +0300 Subject: trace: add the MMIO-tracer to the tracer menu, cleanup Impact: cleanup We can remove MMIOTRACE_HOOKS and replace it with just MMIOTRACE. MMIOTRACE_HOOKS is a remnant from the time when I thought that something else could also use the kmmio facilities. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 4 ---- arch/x86/mm/Makefile | 3 +-- arch/x86/mm/fault.c | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd5e67..fa013f529b7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -186,14 +186,10 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config MMIOTRACE_HOOKS - bool - config MMIOTRACE bool "Memory mapped IO tracing" depends on DEBUG_KERNEL && PCI select TRACING - select MMIOTRACE_HOOKS help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 59f89b434b4..0a21b7aab9d 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-y := pf_in.o mmio-mod.o +mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa_$(BITS).o diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 31e8730fa24..4152d3c3b13 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -53,7 +53,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE_HOOKS +#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; -- cgit v1.2.3 From 04d2aac33eb54fd3084140f2db130530d71e97c6 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Oct 2008 11:08:10 -0700 Subject: x86: corruption-check: fix some style issues Impact: cleanup Before moving the code to it's own file, fix some style issues in the corruption check code. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd..4f38e0305b0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -662,7 +662,7 @@ static void __init setup_bios_corruption_check(void) corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { u64 size; addr = find_e820_area_size(addr, &size, PAGE_SIZE); @@ -701,11 +701,11 @@ void check_for_bios_corruption(void) if (!memory_corruption_check) return; - for(i = 0; i < num_scan_areas; i++) { + for (i = 0; i < num_scan_areas; i++) { unsigned long *addr = __va(scan_areas[i].addr); unsigned long size = scan_areas[i].size; - for(; size; addr++, size -= sizeof(unsigned long)) { + for (; size; addr++, size -= sizeof(unsigned long)) { if (!*addr) continue; printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", @@ -721,7 +721,8 @@ void check_for_bios_corruption(void) static void periodic_check_for_corruption(unsigned long data) { check_for_bios_corruption(); - mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); + mod_timer(&periodic_check_timer, + round_jiffies(jiffies + corruption_check_period*HZ)); } void start_periodic_check_for_corruption(void) -- cgit v1.2.3 From 6784f7d0a5016a397d38be1134e63fc784c1ca8e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Oct 2008 11:33:42 -0700 Subject: x86: corruption check: move the corruption checks into their own file Impact: cleanup The corruption check code is rather sizable and it's likely to grow over time when we add checks for more types of corruptions (there's a few candidates in kerneloops.org that I want to add checks for)... so lets move it to its own file Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/include/asm/setup.h | 4 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/check.c | 158 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 152 ----------------------------------------- 4 files changed, 163 insertions(+), 152 deletions(-) create mode 100644 arch/x86/kernel/check.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f12d3723746..1ed8b2e8072 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -8,6 +8,10 @@ /* Interrupt control for vSMPowered x86_64 systems */ void vsmp_init(void); + +void setup_bios_corruption_check(void); + + #ifdef CONFIG_X86_VISWS extern void visws_early_detect(void); extern int is_visws_box(void); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d7e5a58ee22..31fbcaf3df7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,6 +35,7 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o obj-y += tsc.o io_delay.o rtc.o +obj-y += check.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c new file mode 100644 index 00000000000..5056703e1b0 --- /dev/null +++ b/arch/x86/kernel/check.c @@ -0,0 +1,158 @@ +#include +#include + +#include +#include + +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + + +static int set_corruption_check(char *arg) +{ + char *end; + + memory_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check", set_corruption_check); + +static int set_corruption_check_period(char *arg) +{ + char *end; + + corruption_check_period = simple_strtoul(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +void __init setup_bios_corruption_check(void) +{ + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + + if (addr == 0) + break; + + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; + + if (size == 0) + break; + + e820_update_range(addr, size, E820_RAM, E820_RESERVED); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; + + /* Assume we've already mapped this early memory */ + memset(__va(addr), 0, size); + + addr += size; + } + + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", + num_scan_areas); + update_e820(); +} + +static struct timer_list periodic_check_timer; + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for (i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for (; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void periodic_check_for_corruption(unsigned long data) +{ + check_for_bios_corruption(); + mod_timer(&periodic_check_timer, + round_jiffies(jiffies + corruption_check_period*HZ)); +} + +void start_periodic_check_for_corruption(void) +{ + if (!memory_corruption_check || corruption_check_period == 0) + return; + + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + + init_timer(&periodic_check_timer); + periodic_check_timer.function = &periodic_check_for_corruption; + periodic_check_for_corruption(0); +} +#endif + diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4f38e0305b0..af690aa593a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -587,158 +587,6 @@ static struct x86_quirks default_x86_quirks __initdata; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; -/* - * Some BIOSes seem to corrupt the low 64k of memory during events - * like suspend/resume and unplugging an HDMI cable. Reserve all - * remaining free memory in that area and fill it with a distinct - * pattern. - */ -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -#define MAX_SCAN_AREAS 8 - -static int __read_mostly memory_corruption_check = -1; - -static unsigned __read_mostly corruption_check_size = 64*1024; -static unsigned __read_mostly corruption_check_period = 60; /* seconds */ - -static struct e820entry scan_areas[MAX_SCAN_AREAS]; -static int num_scan_areas; - - -static int set_corruption_check(char *arg) -{ - char *end; - - memory_corruption_check = simple_strtol(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check", set_corruption_check); - -static int set_corruption_check_period(char *arg) -{ - char *end; - - corruption_check_period = simple_strtoul(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_period", set_corruption_check_period); - -static int set_corruption_check_size(char *arg) -{ - char *end; - unsigned size; - - size = memparse(arg, &end); - - if (*end == '\0') - corruption_check_size = size; - - return (size == corruption_check_size) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_size", set_corruption_check_size); - - -static void __init setup_bios_corruption_check(void) -{ - u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ - - if (memory_corruption_check == -1) { - memory_corruption_check = -#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK - 1 -#else - 0 -#endif - ; - } - - if (corruption_check_size == 0) - memory_corruption_check = 0; - - if (!memory_corruption_check) - return; - - corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - - while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { - u64 size; - addr = find_e820_area_size(addr, &size, PAGE_SIZE); - - if (addr == 0) - break; - - if ((addr + size) > corruption_check_size) - size = corruption_check_size - addr; - - if (size == 0) - break; - - e820_update_range(addr, size, E820_RAM, E820_RESERVED); - scan_areas[num_scan_areas].addr = addr; - scan_areas[num_scan_areas].size = size; - num_scan_areas++; - - /* Assume we've already mapped this early memory */ - memset(__va(addr), 0, size); - - addr += size; - } - - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", - num_scan_areas); - update_e820(); -} - -static struct timer_list periodic_check_timer; - -void check_for_bios_corruption(void) -{ - int i; - int corruption = 0; - - if (!memory_corruption_check) - return; - - for (i = 0; i < num_scan_areas; i++) { - unsigned long *addr = __va(scan_areas[i].addr); - unsigned long size = scan_areas[i].size; - - for (; size; addr++, size -= sizeof(unsigned long)) { - if (!*addr) - continue; - printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", - addr, __pa(addr), *addr); - corruption = 1; - *addr = 0; - } - } - - WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); -} - -static void periodic_check_for_corruption(unsigned long data) -{ - check_for_bios_corruption(); - mod_timer(&periodic_check_timer, - round_jiffies(jiffies + corruption_check_period*HZ)); -} - -void start_periodic_check_for_corruption(void) -{ - if (!memory_corruption_check || corruption_check_period == 0) - return; - - printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", - corruption_check_period); - - init_timer(&periodic_check_timer); - periodic_check_timer.function = &periodic_check_for_corruption; - periodic_check_for_corruption(0); -} -#endif - static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE -- cgit v1.2.3 From 304e629bf4a3150a0bf6556fc45c52c5c082340f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Oct 2008 12:09:03 -0700 Subject: x86: corruption check: run the corruption checks from a work queue Impact: change the implementation of the debug feature the periodic corruption checks are better off run from a work queue; there's nothing time critical about them and this way the amount of interrupt-context work is reduced. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/check.c | 27 +++++++++++++++++---------- arch/x86/mm/init_32.c | 2 -- arch/x86/mm/init_64.c | 2 -- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 5056703e1b0..55eed1752b4 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -1,6 +1,7 @@ #include #include - +#include +#include #include #include @@ -108,13 +109,14 @@ void __init setup_bios_corruption_check(void) update_e820(); } -static struct timer_list periodic_check_timer; void check_for_bios_corruption(void) { int i; int corruption = 0; + printk("dot\n"); + if (!memory_corruption_check) return; @@ -135,24 +137,29 @@ void check_for_bios_corruption(void) WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); } -static void periodic_check_for_corruption(unsigned long data) +static void check_corruption(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(bios_check_work, check_corruption); + +static void check_corruption(struct work_struct *dummy) { check_for_bios_corruption(); - mod_timer(&periodic_check_timer, - round_jiffies(jiffies + corruption_check_period*HZ)); + schedule_delayed_work(&bios_check_work, + round_jiffies_relative(corruption_check_period*HZ)); } -void start_periodic_check_for_corruption(void) +static int start_periodic_check_for_corruption(void) { if (!memory_corruption_check || corruption_check_period == 0) - return; + return 0; printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", corruption_check_period); - init_timer(&periodic_check_timer); - periodic_check_timer.function = &periodic_check_for_corruption; - periodic_check_for_corruption(0); + /* First time we run the checks right away */ + schedule_delayed_work(&bios_check_work, 0); + return 0; } + +module_init(start_periodic_check_for_corruption); #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8396868e82c..5e6377560ff 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -970,8 +970,6 @@ void __init mem_init(void) int codesize, reservedpages, datasize, initsize; int tmp; - start_periodic_check_for_corruption(); - #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b8e461d4941..d6ef1589b95 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -879,8 +879,6 @@ void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; - start_periodic_check_for_corruption(); - pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ -- cgit v1.2.3 From b43d196c4d3fe46d6dda7c987c47792612b80b1b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Oct 2008 12:21:32 -0700 Subject: x86: corruption-check: some post-move cleanups Impact: cleanup now that the code is moved and converted to a work queue, there's some minor cleanups that can be done. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 ++- arch/x86/kernel/check.c | 12 ++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 31fbcaf3df7..f63a8034fb8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o obj-y += tsc.o io_delay.o rtc.o -obj-y += check.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o @@ -105,6 +104,8 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 55eed1752b4..2ac0ab71412 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -11,7 +11,6 @@ * remaining free memory in that area and fill it with a distinct * pattern. */ -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION #define MAX_SCAN_AREAS 8 static int __read_mostly memory_corruption_check = -1; @@ -23,7 +22,7 @@ static struct e820entry scan_areas[MAX_SCAN_AREAS]; static int num_scan_areas; -static int set_corruption_check(char *arg) +static __init int set_corruption_check(char *arg) { char *end; @@ -33,7 +32,7 @@ static int set_corruption_check(char *arg) } early_param("memory_corruption_check", set_corruption_check); -static int set_corruption_check_period(char *arg) +static __init int set_corruption_check_period(char *arg) { char *end; @@ -43,7 +42,7 @@ static int set_corruption_check_period(char *arg) } early_param("memory_corruption_check_period", set_corruption_check_period); -static int set_corruption_check_size(char *arg) +static __init int set_corruption_check_size(char *arg) { char *end; unsigned size; @@ -115,8 +114,6 @@ void check_for_bios_corruption(void) int i; int corruption = 0; - printk("dot\n"); - if (!memory_corruption_check) return; @@ -134,7 +131,7 @@ void check_for_bios_corruption(void) } } - WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); + WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n"); } static void check_corruption(struct work_struct *dummy); @@ -161,5 +158,4 @@ static int start_periodic_check_for_corruption(void) } module_init(start_periodic_check_for_corruption); -#endif -- cgit v1.2.3 From 6f290b4e016d6c61511542cf6d9ebdef1965978e Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Mon, 27 Oct 2008 12:42:34 -0400 Subject: x86, NMI watchdog: add support to enable and disable IOAPIC NMI Impact: change/improve the way /proc/sys/kernel/nmi_watchdog works This patch adds support to enable/disable IOAPIC NMI watchdog in runtime via procfs. Signed-off-by: Aristeu Rozanski Cc: "Maciej W. Rozycki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2c97f07f1c2..2c005fac617 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -340,6 +340,8 @@ void stop_apic_nmi_watchdog(void *unused) return; if (nmi_watchdog == NMI_LOCAL_APIC) lapic_watchdog_stop(); + else + __acpi_nmi_disable(NULL); __get_cpu_var(wd_enabled) = 0; atomic_dec(&nmi_active); } @@ -465,6 +467,24 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) #ifdef CONFIG_SYSCTL +static void enable_ioapic_nmi_watchdog_single(void *unused) +{ + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); + __acpi_nmi_enable(NULL); +} + +static void enable_ioapic_nmi_watchdog(void) +{ + on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1); + touch_nmi_watchdog(); +} + +static void disable_ioapic_nmi_watchdog(void) +{ + on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); +} + static int __init setup_unknown_nmi_panic(char *str) { unknown_nmi_panic = 1; @@ -507,6 +527,11 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, enable_lapic_nmi_watchdog(); else disable_lapic_nmi_watchdog(); + } else if (nmi_watchdog == NMI_IO_APIC) { + if (nmi_watchdog_enabled) + enable_ioapic_nmi_watchdog(); + else + disable_ioapic_nmi_watchdog(); } else { printk(KERN_WARNING "NMI watchdog doesn't know what hardware to touch\n"); -- cgit v1.2.3 From 7d5a78cd98c3a5eb83bd6a061c5ea6ef1e9b8fcb Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Mon, 27 Oct 2008 12:42:35 -0400 Subject: x86, NMI watchdog: disable NMIs on LVT0 in case NMI watchdog is not working Impact: change NMI watchdog detection and disabling sequence Currently, if the NMI watchdog fails using IOAPIC method, it'll only disable interrupts on 8259 if the timer is passing thru it. This patch disables NMI delivery on LINT0 if the NMI watchdog initial test fails, just for safety. Signed-off-by: Aristeu Rozanski Cc: "Maciej W. Rozycki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2c005fac617..13316cf57cd 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -131,6 +131,11 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count) atomic_dec(&nmi_active); } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -179,8 +184,12 @@ int __init check_nmi_watchdog(void) kfree(prev_nmi_count); return 0; error: - if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) - disable_8259A_irq(0); + if (nmi_watchdog == NMI_IO_APIC) { + if (!timer_through_8259) + disable_8259A_irq(0); + on_each_cpu(__acpi_nmi_disable, NULL, 1); + } + #ifdef CONFIG_X86_32 timer_ack = 0; #endif @@ -285,11 +294,6 @@ void acpi_nmi_enable(void) on_each_cpu(__acpi_nmi_enable, NULL, 1); } -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - /* * Disable timer based NMIs on all CPUs: */ -- cgit v1.2.3 From 878719e831d9e076961aa15d4049a57a6668c67a Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 23 Oct 2008 10:40:06 -0400 Subject: x86: unify appropriate bits from dumpstack_32 and dumpstack_64 Impact: cleanup As promised, now that dumpstack_32 and dumpstack_64 have so many bits in common, we should merge the in-sync bits into a common file, to prevent them from diverging again. This patch removes bits which are common between dumpstack_32.c and dumpstack_64.c and places them in a common dumpstack.c which is built for both 32 and 64 bit arches. Signed-off-by: Neil Horman Acked-by: Alexander van Heukelum Signed-off-by: Ingo Molnar Makefile | 2 arch/x86/kernel/Makefile | 2 arch/x86/kernel/Makefile | 2 arch/x86/kernel/Makefile | 2 arch/x86/kernel/Makefile | 2 arch/x86/kernel/Makefile | 2 arch/x86/kernel/dumpstack.c | 319 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/dumpstack.h | 39 +++++ arch/x86/kernel/dumpstack_32.c | 294 ------------------------------------- arch/x86/kernel/dumpstack_64.c | 285 ------------------------------------ 5 files changed, 363 insertions(+), 576 deletions(-) --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/dumpstack.c | 319 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/dumpstack.h | 39 +++++ arch/x86/kernel/dumpstack_32.c | 294 +------------------------------------ arch/x86/kernel/dumpstack_64.c | 285 +----------------------------------- 5 files changed, 363 insertions(+), 576 deletions(-) create mode 100644 arch/x86/kernel/dumpstack.c create mode 100644 arch/x86/kernel/dumpstack.h (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d7e5a58ee22..db3216a9d2b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -24,7 +24,7 @@ CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o +obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c new file mode 100644 index 00000000000..5962176dfab --- /dev/null +++ b/arch/x86/kernel/dumpstack.c @@ -0,0 +1,319 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dumpstack.h" + +int panic_on_unrecovered_nmi; +unsigned int code_bytes = 64; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); +} + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + sizeof(long)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; +} + + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk("%s <%s> ", (char *)data, name); + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("%sCall Trace:\n", log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + get_bp(bp); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} +EXPORT_SYMBOL(dump_stack); + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + oops_exit(); + + if (!signr) + return; + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ +#ifdef CONFIG_X86_32 + unsigned short ss; + unsigned long sp; +#endif + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + sysfs_printk_last_file(); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); +#ifdef CONFIG_X86_32 + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); +#else + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); +#endif + return 0; +} + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + if (!user_mode_vm(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + +void notrace __kprobes +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + flags = oops_begin(); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + oops_end(flags, regs, 0); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h new file mode 100644 index 00000000000..3119a801c32 --- /dev/null +++ b/arch/x86/kernel/dumpstack.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + +#ifndef DUMPSTACK_H +#define DUMPSTACK_H + +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +extern unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end); + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl); + +extern unsigned int code_bytes; +extern int kstack_depth_to_print; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; +#endif diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f2046c5752d..7b031b106ec 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -17,64 +17,7 @@ #include -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%p>] %s%pS\n", (void *) address, - reliable ? "" : "? ", (void *) address); -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} +#include "dumpstack.h" void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, @@ -119,57 +62,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk(data); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { @@ -196,33 +89,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { @@ -283,159 +149,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - if (regs && kexec_should_crash(current)) - crash_kexec(regs); - - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - oops_exit(); - - if (!signr) - return; - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - unsigned short ss; - unsigned long sp; - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - sysfs_printk_last_file(); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - int sig = SIGSEGV; - - if (!user_mode_vm(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - sig = 0; - oops_end(flags, regs, sig); -} - -void notrace __kprobes -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - flags = oops_begin(); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - oops_end(flags, regs, 0); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 28c67aae556..33ff10287a5 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -17,19 +17,7 @@ #include -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%p>] %s%pS\n", (void *) address, - reliable ? "" : "? ", (void *) address); -} +#include "dumpstack.h" static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) @@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) @@ -248,57 +191,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk(data); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { @@ -342,33 +235,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} -EXPORT_SYMBOL(dump_stack); - void show_registers(struct pt_regs *regs) { int i; @@ -429,150 +295,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - if (regs && kexec_should_crash(current)) - crash_kexec(regs); - - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - oops_exit(); - - if (!signr) - return; - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - sysfs_printk_last_file(); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip, 1); - printk(" RSP <%016lx>\n", regs->sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - int sig = SIGSEGV; - - if (!user_mode_vm(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - sig = 0; - oops_end(flags, regs, sig); -} - -void notrace __kprobes -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - flags = oops_begin(); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - oops_end(flags, regs, 0); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); -- cgit v1.2.3 From 69a72a0e9337aad8c730e8e9942d5aa022bc4c5c Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 27 Oct 2008 07:51:20 -0700 Subject: x86/uv: update SCIR driver to use the idle_cpu() function Impact: cleanup Change UV heartbeat function to use idle_cpu to determine cpu's "idleness". Realign uv_hub definitions. Signed-of-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 26 +++++++++++++------------- arch/x86/kernel/genx2apic_uv_x.c | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 400776dba9b..0ee12928e9e 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -128,19 +128,19 @@ struct uv_scir_s { * They are kept together in a struct to minimize cache misses. */ struct uv_hub_info_s { - unsigned long global_mmr_base; - unsigned long gpa_mask; - unsigned long gnode_upper; - unsigned long lowmem_remap_top; - unsigned long lowmem_remap_base; - unsigned short pnode; - unsigned short pnode_mask; - unsigned short coherency_domain_number; - unsigned short numa_blade_id; - unsigned char blade_processor_id; - unsigned char m_val; - unsigned char n_val; - struct uv_scir_s scir; + unsigned long global_mmr_base; + unsigned long gpa_mask; + unsigned long gnode_upper; + unsigned long lowmem_remap_top; + unsigned long lowmem_remap_base; + unsigned short pnode; + unsigned short pnode_mask; + unsigned short coherency_domain_number; + unsigned short numa_blade_id; + unsigned char blade_processor_id; + unsigned char m_val; + unsigned char n_val; + struct uv_scir_s scir; }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 84367d84bb1..85fb7dd48f6 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -370,8 +370,8 @@ static void uv_heartbeat(unsigned long ignored) /* flip heartbeat bit */ bits ^= SCIR_CPU_HEARTBEAT; - /* are we the idle thread? */ - if (current->pid == 0) + /* is this cpu idle? */ + if (idle_cpu(raw_smp_processor_id())) bits &= ~SCIR_CPU_ACTIVITY; else bits |= SCIR_CPU_ACTIVITY; -- cgit v1.2.3 From 30604bb410b53efa9c93ee8f03d7aa7494094faa Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 14 Oct 2008 18:59:18 -0700 Subject: x86: break up mtrr_cleanup() into several small functions. Ingo said mtrr_cleanup() is big and ugly. so break it up into more functions and make it more readable. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 346 ++++++++++++++++++++-------------------- 1 file changed, 171 insertions(+), 175 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index c78c04821ea..1159e269e59 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -803,6 +803,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, } static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; #ifdef CONFIG_MTRR_SANITIZER @@ -1206,39 +1207,43 @@ struct mtrr_cleanup_result { #define PSHIFT (PAGE_SHIFT - 10) static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; -static struct res_range __initdata range_new[RANGE_NUM]; static unsigned long __initdata min_loss_pfn[RANGE_NUM]; -static int __init mtrr_cleanup(unsigned address_bits) +static void __init print_out_mtrr_range_state(void) { - unsigned long extra_remove_base, extra_remove_size; - unsigned long base, size, def, dummy; - mtrr_type type; - int nr_range, nr_range_new; - u64 chunk_size, gran_size; - unsigned long range_sums, range_sums_new; - int index_good; - int num_reg_good; int i; + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + mtrr_type type; - /* extra one for all 0 */ - int num[MTRR_NUM_TYPES + 1]; + for (i = 0; i < num_var_ranges; i++) { - if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) - return 0; - rdmsr(MTRRdefType_MSR, def, dummy); - def &= 0xff; - if (def != MTRR_TYPE_UNCACHABLE) - return 0; + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; - /* get it and store it aside */ - memset(range_state, 0, sizeof(range_state)); - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - range_state[i].base_pfn = base; - range_state[i].size_pfn = size; - range_state[i].type = type; + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; + + printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) + ); } +} + +static int __init mtrr_need_cleanup(void) +{ + int i; + mtrr_type type; + unsigned long size; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; /* check entries number */ memset(num, 0, sizeof(num)); @@ -1263,29 +1268,133 @@ static int __init mtrr_cleanup(unsigned address_bits) num_var_ranges - num[MTRR_NUM_TYPES]) return 0; - /* print original var MTRRs at first, for debugging: */ - printk(KERN_DEBUG "original variable MTRRs\n"); - for (i = 0; i < num_var_ranges; i++) { - char start_factor = 'K', size_factor = 'K'; - unsigned long start_base, size_base; + return 1; +} - size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); - if (!size_base) - continue; +static unsigned long __initdata range_sums; +static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long extra_remove_base, + unsigned long extra_remove_size, + int i) +{ + int num_reg; + static struct res_range range_new[RANGE_NUM]; + static int nr_range_new; + unsigned long range_sums_new; + + /* convert ranges to var ranges state */ + num_reg = x86_setup_var_mtrrs(range, nr_range, + chunk_size, gran_size); + + /* we got new setting in range_state, check it */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + extra_remove_base, extra_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = + (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else + result[i].lose_cover_sizek = + (range_sums - range_sums_new) << PSHIFT; - size_base = to_size_factor(size_base, &size_factor), - start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), - type = range_state[i].type; + /* double check it */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || + memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } - printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", - i, start_base, start_factor, - size_base, size_factor, - (type == MTRR_TYPE_UNCACHABLE) ? "UC" : - ((type == MTRR_TYPE_WRPROT) ? "WP" : - ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) - ); + if (!result[i].bad && (range_sums - range_sums_new < + min_loss_pfn[num_reg])) { + min_loss_pfn[num_reg] = + range_sums - range_sums_new; } +} + +static void __init mtrr_print_out_one_result(int i) +{ + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); +} + +static int __init mtrr_search_optimal_index(void) +{ + int i; + int num_reg_good; + int index_good; + + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { + if (!min_loss_pfn[i]) + num_reg_good = i; + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + return index_good; +} + + +static int __init mtrr_cleanup(unsigned address_bits) +{ + unsigned long extra_remove_base, extra_remove_size; + unsigned long base, size, def, dummy; + mtrr_type type; + u64 chunk_size, gran_size; + int index_good; + int i; + + if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* check if we need handle it and can handle it */ + if (!mtrr_need_cleanup()) + return 0; + + /* print original var MTRRs at first, for debugging: */ + printk(KERN_DEBUG "original variable MTRRs\n"); + print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); extra_remove_size = 0; @@ -1309,176 +1418,64 @@ static int __init mtrr_cleanup(unsigned address_bits) range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { - int num_reg; - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - - debug_print++; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, - mtrr_gran_size); + i = 0; + mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, + extra_remove_base, extra_remove_size, i); - /* we got new setting in range_state, check it */ - memset(range_new, 0, sizeof(range_new)); - nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, - extra_remove_size); - range_sums_new = sum_ranges(range_new, nr_range_new); + mtrr_print_out_one_result(i); - i = 0; - result[i].chunk_sizek = mtrr_chunk_size >> 10; - result[i].gran_sizek = mtrr_gran_size >> 10; - result[i].num_reg = num_reg; - if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; - result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; - - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad?"*BAD*":" ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad?"-":"", - lose_base, lose_factor); if (!result[i].bad) { set_var_mtrr_all(address_bits); return 1; } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " "will find optimal one\n"); - debug_print--; - memset(result, 0, sizeof(result[0])); } i = 0; memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); memset(result, 0, sizeof(result)); for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { - char gran_factor; - unsigned long gran_base; - - if (debug_print) - gran_base = to_size_factor(gran_size >> 10, &gran_factor); for (chunk_size = gran_size; chunk_size < (1ULL<<32); chunk_size <<= 1) { - int num_reg; - if (debug_print) { - char chunk_factor; - unsigned long chunk_base; - - chunk_base = to_size_factor(chunk_size>>10, &chunk_factor), - printk(KERN_INFO "\n"); - printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n", - gran_base, gran_factor, chunk_base, chunk_factor); - } if (i >= NUM_RESULT) continue; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); - - /* we got new setting in range_state, check it */ - memset(range_new, 0, sizeof(range_new)); - nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); - range_sums_new = sum_ranges(range_new, nr_range_new); - - result[i].chunk_sizek = chunk_size >> 10; - result[i].gran_sizek = gran_size >> 10; - result[i].num_reg = num_reg; - if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; - result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; - - /* double check it */ - if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; + mtrr_calc_range_state(chunk_size, gran_size, + extra_remove_base, extra_remove_size, i); + if (debug_print) { + mtrr_print_out_one_result(i); + printk(KERN_INFO "\n"); } - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } i++; } } - /* print out all */ - for (i = 0; i < NUM_RESULT; i++) { - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad?"*BAD*":" ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad?"-":"", - lose_base, lose_factor); - } - /* try to find the optimal index */ - if (nr_mtrr_spare_reg >= num_var_ranges) - nr_mtrr_spare_reg = num_var_ranges - 1; - num_reg_good = -1; - for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { - if (!min_loss_pfn[i]) - num_reg_good = i; - } - - index_good = -1; - if (num_reg_good != -1) { - for (i = 0; i < NUM_RESULT; i++) { - if (!result[i].bad && - result[i].num_reg == num_reg_good && - !result[i].lose_cover_sizek) { - index_good = i; - break; - } - } - } + index_good = mtrr_search_optimal_index(); if (index_good != -1) { - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); i = index_good; - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n", - result[i].num_reg, lose_base, lose_factor); + mtrr_print_out_one_result(i); + /* convert ranges to var ranges state */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; gran_size <<= 10; - debug_print++; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - debug_print--; set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); return 1; + } else { + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) + mtrr_print_out_one_result(i); } printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); @@ -1562,7 +1559,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; - int nr_range; u64 total_trim_size; /* extra one for all 0 */ -- cgit v1.2.3 From d4f1b10365d4f03dd802433e0014cf503e6e930c Mon Sep 17 00:00:00 2001 From: Jike Song Date: Fri, 17 Oct 2008 13:25:07 +0800 Subject: x86: clean up comments wrt. rd{msr|tsc|pmc} The rdmsr instruction(et al) for i386 and x86-64 are semantically same. The only difference is how gcc interpret constraint "A" for these targets. Signed-off-by: Jike Song Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 46be2fa7ac2..478a9245aae 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -22,10 +22,10 @@ static inline unsigned long long native_read_tscp(unsigned int *aux) } /* - * i386 calling convention returns 64-bit value in edx:eax, while - * x86_64 returns at rax. Also, the "A" constraint does not really - * mean rdx:rax in x86_64, so we need specialized behaviour for each - * architecture + * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A" + * constraint has different meanings. For i386, "A" means exactly + * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead, + * it means rax *or* rdx. */ #ifdef CONFIG_X86_64 #define DECLARE_ARGS(val, low, high) unsigned low, high -- cgit v1.2.3 From b062f841b569791d3054e975cd85f48562161565 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 30 Oct 2008 19:16:46 +0300 Subject: x86: nmi - add sensible names to nmi_watchdog boot param Impact: introduce nmi_watchdog=lapic and nmi_watchdog=ioapic aliases Add sensible names as "lapic" and "ioapic" to nmi_watchdog boot parameter. Sometimes it is not that easy to recall what exactly nmi_watchdog=1 does mean so we allow the using of symbolic names here. Old numeric values remain valid. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2c97f07f1c2..c4869e4532a 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -199,12 +199,17 @@ static int __init setup_nmi_watchdog(char *str) ++str; } - get_option(&str, &nmi); - - if (nmi >= NMI_INVALID) - return 0; + if (!strncmp(str, "lapic", 5)) + nmi_watchdog = NMI_LOCAL_APIC; + else if (!strncmp(str, "ioapic", 6)) + nmi_watchdog = NMI_IO_APIC; + else { + get_option(&str, &nmi); + if (nmi >= NMI_INVALID) + return 0; + nmi_watchdog = nmi; + } - nmi_watchdog = nmi; return 1; } __setup("nmi_watchdog=", setup_nmi_watchdog); -- cgit v1.2.3 From 17666f02b118099028522dfc3df00a235700e216 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Oct 2008 16:08:32 -0400 Subject: ftrace: nmi safe code modification Impact: fix crashes that can occur in NMI handlers, if their code is modified Modifying code is something that needs special care. On SMP boxes, if code that is being modified is also being executed on another CPU, that CPU will have undefined results. The dynamic ftrace uses kstop_machine to make the system act like a uniprocessor system. But this does not address NMIs, that can still run on other CPUs. One approach to handle this is to make all code that are used by NMIs not be traced. But NMIs can call notifiers that spread throughout the kernel and this will be very hard to maintain, and the chance of missing a function is very high. The approach that this patch takes is to have the NMIs modify the code if the modification is taking place. The way this works is that just writing to code executing on another CPU is not harmful if what is written is the same as what exists. Two buffers are used: an IP buffer and a "code" buffer. The steps that the patcher takes are: 1) Put in the instruction pointer into the IP buffer and the new code into the "code" buffer. 2) Set a flag that says we are modifying code 3) Wait for any running NMIs to finish. 4) Write the code 5) clear the flag. 6) Wait for any running NMIs to finish. If an NMI is executed, it will also write the pending code. Multiple writes are OK, because what is being written is the same. Then the patcher must wait for all running NMIs to finish before going to the next line that must be patched. This is basically the RCU approach to code modification. Thanks to Ingo Molnar for suggesting the idea, and to Arjan van de Ven for his guidence on what is safe and what is not. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 15 ++++++ arch/x86/kernel/ftrace.c | 107 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 121 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9e8bc29b8b1..f2ed6b704a7 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,6 +17,21 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_nmi_enter(void); +extern void ftrace_nmi_exit(void); +#else +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif +#endif + +#else /* CONFIG_FUNCTION_TRACER */ + +#ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 50ea0ac8c9b..fe5f859130b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -56,6 +56,111 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } +/* + * Modifying code must take extra care. On an SMP machine, if + * the code being modified is also being executed on another CPU + * that CPU will have undefined results and possibly take a GPF. + * We use kstop_machine to stop other CPUS from exectuing code. + * But this does not stop NMIs from happening. We still need + * to protect against that. We separate out the modification of + * the code to take care of this. + * + * Two buffers are added: An IP buffer and a "code" buffer. + * + * 1) Put in the instruction pointer into the IP buffer + * and the new code into the "code" buffer. + * 2) Set a flag that says we are modifying code + * 3) Wait for any running NMIs to finish. + * 4) Write the code + * 5) clear the flag. + * 6) Wait for any running NMIs to finish. + * + * If an NMI is executed, the first thing it does is to call + * "ftrace_nmi_enter". This will check if the flag is set to write + * and if it is, it will write what is in the IP and "code" buffers. + * + * The trick is, it does not matter if everyone is writing the same + * content to the code location. Also, if a CPU is executing code + * it is OK to write to that code location if the contents being written + * are the same as what exists. + */ + +static atomic_t in_nmi; +static int mod_code_status; +static int mod_code_write; +static void *mod_code_ip; +static void *mod_code_newcode; + +static void ftrace_mod_code(void) +{ + /* + * Yes, more than one CPU process can be writing to mod_code_status. + * (and the code itself) + * But if one were to fail, then they all should, and if one were + * to succeed, then they all should. + */ + mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, + MCOUNT_INSN_SIZE); + +} + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); + /* Must have in_nmi seen before reading write flag */ + smp_mb(); + if (mod_code_write) + ftrace_mod_code(); +} + +void ftrace_nmi_exit(void) +{ + /* Finish all executions before clearing in_nmi */ + smp_wmb(); + atomic_dec(&in_nmi); +} + +static void wait_for_nmi(void) +{ + while (atomic_read(&in_nmi)) + cpu_relax(); +} + +static int +do_ftrace_mod_code(unsigned long ip, void *new_code) +{ + mod_code_ip = (void *)ip; + mod_code_newcode = new_code; + + /* The buffers need to be visible before we let NMIs write them */ + smp_wmb(); + + mod_code_write = 1; + + /* Make sure write bit is visible before we wait on NMIs */ + smp_mb(); + + wait_for_nmi(); + + /* Make sure all running NMIs have finished before we write the code */ + smp_mb(); + + ftrace_mod_code(); + + /* Make sure the write happens before clearing the bit */ + smp_wmb(); + + mod_code_write = 0; + + /* make sure NMIs see the cleared bit */ + smp_mb(); + + wait_for_nmi(); + + return mod_code_status; +} + + int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) @@ -81,7 +186,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return -EINVAL; /* replace the text with the new text */ - if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) + if (do_ftrace_mod_code(ip, new_code)) return -EPERM; sync_core(); -- cgit v1.2.3 From b807c3d0f8e39ed7cbbbe6da162650e305e8de15 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Oct 2008 16:08:33 -0400 Subject: ftrace: nmi update statistics Impact: add more debug info to /debugfs/tracing/dyn_ftrace_total_info This patch adds dynamic ftrace NMI update statistics to the /debugfs/tracing/dyn_ftrace_total_info stat file. Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fe5f859130b..6685b0fc1b4 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -91,6 +91,19 @@ static int mod_code_write; static void *mod_code_ip; static void *mod_code_newcode; +static int nmi_wait_count; +static atomic_t nmi_update_count; + +int ftrace_arch_read_dyn_info(char *buf, int size) +{ + int r; + + r = snprintf(buf, size, "%u %u", + nmi_wait_count, + atomic_read(&nmi_update_count)); + return r; +} + static void ftrace_mod_code(void) { /* @@ -109,8 +122,10 @@ void ftrace_nmi_enter(void) atomic_inc(&in_nmi); /* Must have in_nmi seen before reading write flag */ smp_mb(); - if (mod_code_write) + if (mod_code_write) { ftrace_mod_code(); + atomic_inc(&nmi_update_count); + } } void ftrace_nmi_exit(void) @@ -122,8 +137,15 @@ void ftrace_nmi_exit(void) static void wait_for_nmi(void) { - while (atomic_read(&in_nmi)) + int waited = 0; + + while (atomic_read(&in_nmi)) { + waited = 1; cpu_relax(); + } + + if (waited) + nmi_wait_count++; } static int -- cgit v1.2.3 From 31498a01496ffca3b542bae72b8ec499cd9302db Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Oct 2008 09:48:02 +0800 Subject: kexec/i386: remove PAGE_SIZE alignment from relocate_kernel Impact: save kernel .text by loosening kexec page alignment This patch removes PAGE_SIZE alignment from relocate_kernel(). Before kexec jump patches are merged, control page is mapped to relocate_kernel in kexec page tables, so relocate_kernel must be PAGE_SIZE aligned. Now, control page is mapped to identity mapped address, so relocate_kernel need not to be PAGE_SIZE aligned any more. This can reduce a few KB from kernel text segement. Signed-off-by: Huang Ying Signed-off-by: Ingo Molnar --- arch/x86/kernel/relocate_kernel_32.S | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 6f50664b2ba..377da3f78e8 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -39,7 +39,6 @@ #define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) .text - .align PAGE_SIZE .globl relocate_kernel relocate_kernel: /* Save the CPU context, used for jumping back */ -- cgit v1.2.3 From 92be3d6bdf2cb34972ab50e12ad4da1076e690da Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Oct 2008 09:48:08 +0800 Subject: kexec/i386: allocate page table pages dynamically Impact: save .text size when kexec is built in but not loaded This patch adds an architecture specific struct kimage_arch into struct kimage. The pointers to page table pages used by kexec are added to struct kimage_arch. The page tables pages are dynamically allocated in machine_kexec_prepare instead of statically from BSS segment. This will save up to 20k memory when kexec image is not loaded. Signed-off-by: Huang Ying Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kexec.h | 14 ++++++++ arch/x86/kernel/machine_kexec_32.c | 67 ++++++++++++++++++++++++++------------ 2 files changed, 60 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index a1f22771a15..df9c41a9c6a 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -170,6 +170,20 @@ relocate_kernel(unsigned long indirection_page, unsigned long start_address) ATTRIB_NORET; #endif +#ifdef CONFIG_X86_32 +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + pgd_t *pgd; +#ifdef CONFIG_X86_PAE + pmd_t *pmd0; + pmd_t *pmd1; +#endif + pte_t *pte0; + pte_t *pte1; +}; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 7a385746509..1100312847a 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -25,15 +26,6 @@ #include #include -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -static u32 kexec_pgd[1024] PAGE_ALIGNED; -#ifdef CONFIG_X86_PAE -static u32 kexec_pmd0[1024] PAGE_ALIGNED; -static u32 kexec_pmd1[1024] PAGE_ALIGNED; -#endif -static u32 kexec_pte0[1024] PAGE_ALIGNED; -static u32 kexec_pte1[1024] PAGE_ALIGNED; - static void set_idt(void *newidt, __u16 limit) { struct desc_ptr curidt; @@ -76,6 +68,37 @@ static void load_segments(void) #undef __STR } +static void machine_kexec_free_page_tables(struct kimage *image) +{ + free_page((unsigned long)image->arch.pgd); +#ifdef CONFIG_X86_PAE + free_page((unsigned long)image->arch.pmd0); + free_page((unsigned long)image->arch.pmd1); +#endif + free_page((unsigned long)image->arch.pte0); + free_page((unsigned long)image->arch.pte1); +} + +static int machine_kexec_alloc_page_tables(struct kimage *image) +{ + image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); +#ifdef CONFIG_X86_PAE + image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); +#endif + image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!image->arch.pgd || +#ifdef CONFIG_X86_PAE + !image->arch.pmd0 || !image->arch.pmd1 || +#endif + !image->arch.pte0 || !image->arch.pte1) { + machine_kexec_free_page_tables(image); + return -ENOMEM; + } + return 0; +} + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -87,13 +110,14 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Make control page executable. + * - Make control page executable. + * - Allocate page tables */ int machine_kexec_prepare(struct kimage *image) { if (nx_enabled) set_pages_x(image->control_code_page, 1); - return 0; + return machine_kexec_alloc_page_tables(image); } /* @@ -104,6 +128,7 @@ void machine_kexec_cleanup(struct kimage *image) { if (nx_enabled) set_pages_nx(image->control_code_page, 1); + machine_kexec_free_page_tables(image); } /* @@ -150,18 +175,18 @@ void machine_kexec(struct kimage *image) relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_PGD] = __pa(kexec_pgd); - page_list[VA_PGD] = (unsigned long)kexec_pgd; + page_list[PA_PGD] = __pa(image->arch.pgd); + page_list[VA_PGD] = (unsigned long)image->arch.pgd; #ifdef CONFIG_X86_PAE - page_list[PA_PMD_0] = __pa(kexec_pmd0); - page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PMD_1] = __pa(kexec_pmd1); - page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; + page_list[PA_PMD_0] = __pa(image->arch.pmd0); + page_list[VA_PMD_0] = (unsigned long)image->arch.pmd0; + page_list[PA_PMD_1] = __pa(image->arch.pmd1); + page_list[VA_PMD_1] = (unsigned long)image->arch.pmd1; #endif - page_list[PA_PTE_0] = __pa(kexec_pte0); - page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PTE_1] = __pa(kexec_pte1); - page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_PTE_0] = __pa(image->arch.pte0); + page_list[VA_PTE_0] = (unsigned long)image->arch.pte0; + page_list[PA_PTE_1] = __pa(image->arch.pte1); + page_list[VA_PTE_1] = (unsigned long)image->arch.pte1; if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) -- cgit v1.2.3 From 9868ee63b896ee4d2ceb8c292e88d7f4e66caaf9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Oct 2008 09:48:15 +0800 Subject: kexec/i386: setup kexec page table in C Impact: change the kexec bootstrap code implementation from assembly to C This patch transforms the kexec page tables setup code from assembler code to C code in machine_kexec_prepare. This improves readability and reduces code line number. Signed-off-by: Huang Ying Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kexec.h | 17 +----- arch/x86/kernel/machine_kexec_32.c | 59 ++++++++++++++---- arch/x86/kernel/relocate_kernel_32.S | 114 ----------------------------------- 3 files changed, 49 insertions(+), 141 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index df9c41a9c6a..c61d8b2ab8b 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -5,21 +5,8 @@ # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_PGD 2 -# define VA_PGD 3 -# define PA_PTE_0 4 -# define VA_PTE_0 5 -# define PA_PTE_1 6 -# define VA_PTE_1 7 -# define PA_SWAP_PAGE 8 -# ifdef CONFIG_X86_PAE -# define PA_PMD_0 9 -# define VA_PMD_0 10 -# define PA_PMD_1 11 -# define VA_PMD_1 12 -# define PAGES_NR 13 -# else -# define PAGES_NR 9 -# endif +# define PA_SWAP_PAGE 3 +# define PAGES_NR 4 #else # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 1100312847a..37f420018a4 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -99,6 +99,45 @@ static int machine_kexec_alloc_page_tables(struct kimage *image) return 0; } +static void machine_kexec_page_table_set_one( + pgd_t *pgd, pmd_t *pmd, pte_t *pte, + unsigned long vaddr, unsigned long paddr) +{ + pud_t *pud; + + pgd += pgd_index(vaddr); +#ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); +#endif + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); + pte = pte_offset_kernel(pmd, vaddr); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +} + +static void machine_kexec_prepare_page_tables(struct kimage *image) +{ + void *control_page; + pmd_t *pmd = 0; + + control_page = page_address(image->control_code_page); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd0; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte0, + (unsigned long)control_page, __pa(control_page)); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd1; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte1, + __pa(control_page), __pa(control_page)); +} + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -112,12 +151,19 @@ static int machine_kexec_alloc_page_tables(struct kimage *image) * * - Make control page executable. * - Allocate page tables + * - Setup page tables */ int machine_kexec_prepare(struct kimage *image) { + int error; + if (nx_enabled) set_pages_x(image->control_code_page, 1); - return machine_kexec_alloc_page_tables(image); + error = machine_kexec_alloc_page_tables(image); + if (error) + return error; + machine_kexec_prepare_page_tables(image); + return 0; } /* @@ -176,17 +222,6 @@ void machine_kexec(struct kimage *image) page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_PGD] = __pa(image->arch.pgd); - page_list[VA_PGD] = (unsigned long)image->arch.pgd; -#ifdef CONFIG_X86_PAE - page_list[PA_PMD_0] = __pa(image->arch.pmd0); - page_list[VA_PMD_0] = (unsigned long)image->arch.pmd0; - page_list[PA_PMD_1] = __pa(image->arch.pmd1); - page_list[VA_PMD_1] = (unsigned long)image->arch.pmd1; -#endif - page_list[PA_PTE_0] = __pa(image->arch.pte0); - page_list[VA_PTE_0] = (unsigned long)image->arch.pte0; - page_list[PA_PTE_1] = __pa(image->arch.pte1); - page_list[VA_PTE_1] = (unsigned long)image->arch.pte1; if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 377da3f78e8..a160f311972 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -10,15 +10,12 @@ #include #include #include -#include /* * Must be relocatable PIC code callable as a C function */ #define PTR(x) (x << 2) -#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define PAE_PGD_ATTR (_PAGE_PRESENT) /* control_page + KEXEC_CONTROL_CODE_MAX_SIZE * ~ control_page + PAGE_SIZE are used as data storage and stack for @@ -59,117 +56,6 @@ relocate_kernel: movl %cr4, %eax movl %eax, CR4(%edi) -#ifdef CONFIG_X86_PAE - /* map the control page at its virtual address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0xc0000000, %eax - shrl $27, %eax - addl %edi, %eax - - movl PTR(PA_PMD_0)(%ebp), %edx - orl $PAE_PGD_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PMD_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x3fe00000, %eax - shrl $18, %eax - addl %edi, %eax - - movl PTR(PA_PTE_0)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x001ff000, %eax - shrl $9, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - /* identity map the control page at its physical address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0xc0000000, %eax - shrl $27, %eax - addl %edi, %eax - - movl PTR(PA_PMD_1)(%ebp), %edx - orl $PAE_PGD_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PMD_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x3fe00000, %eax - shrl $18, %eax - addl %edi, %eax - - movl PTR(PA_PTE_1)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x001ff000, %eax - shrl $9, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) -#else - /* map the control page at its virtual address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0xffc00000, %eax - shrl $20, %eax - addl %edi, %eax - - movl PTR(PA_PTE_0)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x003ff000, %eax - shrl $10, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - /* identity map the control page at its physical address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0xffc00000, %eax - shrl $20, %eax - addl %edi, %eax - - movl PTR(PA_PTE_1)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x003ff000, %eax - shrl $10, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) -#endif - -relocate_new_kernel: /* read the arguments and say goodbye to the stack */ movl 20+4(%esp), %ebx /* page_list */ movl 20+8(%esp), %ebp /* list of pages */ -- cgit v1.2.3 From a26a2a27396c0a0877aa701f8f92d08ba550a6c9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 00:03:22 -0400 Subject: ftrace: nmi safe code clean ups Impact: cleanup This patch cleans up the NMI safe code for dynamic ftrace as suggested by Andrew Morton. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 10 +++++----- arch/x86/kernel/ftrace.c | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f2ed6b704a7..a23468194b8 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -22,16 +22,16 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) -#endif +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif +#endif /* __ASSEMBLY__ */ #else /* CONFIG_FUNCTION_TRACER */ #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 6685b0fc1b4..69149337f2f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -67,7 +67,7 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * * Two buffers are added: An IP buffer and a "code" buffer. * - * 1) Put in the instruction pointer into the IP buffer + * 1) Put the instruction pointer into the IP buffer * and the new code into the "code" buffer. * 2) Set a flag that says we are modifying code * 3) Wait for any running NMIs to finish. @@ -85,14 +85,14 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * are the same as what exists. */ -static atomic_t in_nmi; -static int mod_code_status; -static int mod_code_write; -static void *mod_code_ip; -static void *mod_code_newcode; +static atomic_t in_nmi = ATOMIC_INIT(0); +static int mod_code_status; /* holds return value of text write */ +static int mod_code_write; /* set when NMI should do the write */ +static void *mod_code_ip; /* holds the IP to write to */ +static void *mod_code_newcode; /* holds the text to write to the IP */ -static int nmi_wait_count; -static atomic_t nmi_update_count; +static unsigned nmi_wait_count; +static atomic_t nmi_update_count = ATOMIC_INIT(0); int ftrace_arch_read_dyn_info(char *buf, int size) { -- cgit v1.2.3 From b2bcc7b299f37037b4a78dc1538e5d6508ae8110 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 31 Oct 2008 11:59:53 -0700 Subject: x86: add a synthetic TSC_RELIABLE feature bit Impact: None, bit reservation only Add a synthetic TSC_RELIABLE feature bit which will be used to mark TSC as reliable so that we could skip all the runtime checks for TSC stablity, which have false positives in virtual environment. Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index cfdf8c2c5c3..e490a7932a0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -92,6 +92,7 @@ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ -- cgit v1.2.3 From 49ab56ac6e1b907b7dadb72a4012460359feaf0e Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Sat, 1 Nov 2008 18:34:37 -0700 Subject: x86: add X86_FEATURE_HYPERVISOR feature bit Impact: Number declaration only. Add X86_FEATURE_HYPERVISOR bit (CPUID level 1, ECX, bit 31). Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e490a7932a0..694d1f8f1be 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -118,6 +118,7 @@ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ @@ -238,6 +239,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 -- cgit v1.2.3 From 88b094fb8d4fe43b7025ea8d487059e8813e02cd Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 27 Oct 2008 10:41:46 -0700 Subject: x86: Hypervisor detection and get tsc_freq from hypervisor Impact: Changes timebase calibration on Vmware. v3->v2 : Abstract the hypervisor detection and feature (tsc_freq) request behind a hypervisor.c file v2->v1 : Add a x86_hyper_vendor field to the cpuinfo_x86 structure. This avoids multiple calls to the hypervisor detection function. This patch adds function to detect if we are running under VMware. The current way to check if we are on VMware is following, # check if "hypervisor present bit" is set, if so read the 0x40000000 cpuid leaf and check for "VMwareVMware" signature. # if the above fails, check the DMI vendors name for "VMware" string if we find one we query the VMware hypervisor port to check if we are under VMware. The DMI + "VMware hypervisor port check" is needed for older VMware products, which don't implement the hypervisor signature cpuid leaf. Also note that since we are checking for the DMI signature the hypervisor port should never be accessed on native hardware. This patch also adds a hypervisor_get_tsc_freq function, instead of calibrating the frequency which can be error prone in virtualized environment, we ask the hypervisor for it. We get the frequency from the hypervisor by accessing the hypervisor port if we are running on VMware. Other hypervisors too can add code to the generic routine to get frequency on their platform. Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/hypervisor.h | 26 ++++++++++++ arch/x86/include/asm/processor.h | 4 ++ arch/x86/include/asm/vmware.h | 26 ++++++++++++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/cpu/hypervisor.c | 48 +++++++++++++++++++++ arch/x86/kernel/cpu/vmware.c | 88 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 7 ++++ arch/x86/kernel/tsc.c | 9 +++- 9 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/hypervisor.h create mode 100644 arch/x86/include/asm/vmware.h create mode 100644 arch/x86/kernel/cpu/hypervisor.c create mode 100644 arch/x86/kernel/cpu/vmware.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h new file mode 100644 index 00000000000..369f5c5d09a --- /dev/null +++ b/arch/x86/include/asm/hypervisor.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2008, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#ifndef ASM_X86__HYPERVISOR_H +#define ASM_X86__HYPERVISOR_H + +extern unsigned long get_hypervisor_tsc_freq(void); +extern void init_hypervisor(struct cpuinfo_x86 *c); + +#endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ca01e38326..a570eafa475 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -110,6 +110,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; #endif + unsigned int x86_hyper_vendor; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -123,6 +124,9 @@ struct cpuinfo_x86 { #define X86_VENDOR_UNKNOWN 0xff +#define X86_HYPER_VENDOR_NONE 0 +#define X86_HYPER_VENDOR_VMWARE 1 + /* * capabilities of CPUs */ diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h new file mode 100644 index 00000000000..02dfea5aebc --- /dev/null +++ b/arch/x86/include/asm/vmware.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2008, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#ifndef ASM_X86__VMWARE_H +#define ASM_X86__VMWARE_H + +extern unsigned long vmware_get_tsc_khz(void); +extern int vmware_platform(void); + +#endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c05..a5c04e88777 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -4,6 +4,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o +obj-y += vmware.o hypervisor.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0217a..b88595c3625 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "cpu.h" @@ -703,6 +704,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) detect_ht(c); #endif + init_hypervisor(c); /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c new file mode 100644 index 00000000000..7bd55064ffe --- /dev/null +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -0,0 +1,48 @@ +/* + * Common hypervisor code + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include + +static inline void __cpuinit +detect_hypervisor_vendor(struct cpuinfo_x86 *c) +{ + if (vmware_platform()) { + c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; + } else { + c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; + } +} + +unsigned long get_hypervisor_tsc_freq(void) +{ + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + return vmware_get_tsc_khz(); + return 0; +} + +void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) +{ + detect_hypervisor_vendor(c); +} + diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c new file mode 100644 index 00000000000..d5d1b75a4b7 --- /dev/null +++ b/arch/x86/kernel/cpu/vmware.c @@ -0,0 +1,88 @@ +/* + * VMware Detection code. + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include + +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 +#define VMWARE_HYPERVISOR_PORT 0x5658 + +#define VMWARE_PORT_CMD_GETVERSION 10 +#define VMWARE_PORT_CMD_GETHZ 45 + +#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ + __asm__("inl (%%dx)" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "0"(VMWARE_HYPERVISOR_MAGIC), \ + "1"(VMWARE_PORT_CMD_##cmd), \ + "2"(VMWARE_HYPERVISOR_PORT), "3"(0) : \ + "memory"); + +static inline int __vmware_platform(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx); + return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; +} + +static unsigned long __vmware_get_tsc_khz(void) +{ + uint64_t tsc_hz; + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (eax == (uint32_t)-1) + return 0; + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + return tsc_hz; +} + +int vmware_platform(void) +{ + if (cpu_has_hypervisor) { + unsigned int eax, ebx, ecx, edx; + char hyper_vendor_id[13]; + + cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); + memcpy(hyper_vendor_id + 0, &ebx, 4); + memcpy(hyper_vendor_id + 4, &ecx, 4); + memcpy(hyper_vendor_id + 8, &edx, 4); + hyper_vendor_id[12] = '\0'; + if (!strcmp(hyper_vendor_id, "VMwareVMware")) + return 1; + } else if (dmi_available && dmi_name_in_vendors("VMware") && + __vmware_platform()) + return 1; + + return 0; +} + +unsigned long vmware_get_tsc_khz(void) +{ + BUG_ON(!vmware_platform()); + return __vmware_get_tsc_khz(); +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd..f44dadfb32c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -98,6 +98,7 @@ #include #include +#include #include #include @@ -909,6 +910,12 @@ void __init setup_arch(char **cmdline_p) dmi_check_system(bad_bios_dmi_table); + /* + * VMware detection requires dmi to be available, so this + * needs to be done after dmi_scan_machine, for the BP. + */ + init_hypervisor(&boot_cpu_data); + #ifdef CONFIG_X86_32 probe_roms(); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 62348e4fd8d..6dbf0bcb44a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -15,6 +15,7 @@ #include #include #include +#include unsigned int cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -352,9 +353,15 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms, fast_calibrate, tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; + tsc_khz = get_hypervisor_tsc_freq(); + if (tsc_khz) { + printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); + return tsc_khz; + } + local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); -- cgit v1.2.3 From eca0cd028bdf0f6aaceb0d023e9c7501079a7dda Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 31 Oct 2008 12:01:58 -0700 Subject: x86: Add a synthetic TSC_RELIABLE feature bit. Impact: Changes timebase calibration on Vmware. Use the synthetic TSC_RELIABLE bit to workaround virtualization anomalies. Virtual TSCs can be kept nearly in sync, but because the virtual TSC offset is set by software, it's not perfect. So, the TSC synchronization test can fail. Even then the TSC can be used as a clocksource since the VMware platform exports a reliable TSC to the guest for timekeeping purposes. Use this bit to check if we need to skip the TSC sync checks. Along with this also set the CONSTANT_TSC bit when on VMware, since we still want to use TSC as clocksource on VM running over hardware which has unsynchronized TSC's (opteron's), since the hypervisor will take care of providing consistent TSC to the guest. Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vmware.h | 1 + arch/x86/kernel/cpu/hypervisor.c | 11 ++++++++++- arch/x86/kernel/cpu/vmware.c | 18 ++++++++++++++++++ arch/x86/kernel/tsc_sync.c | 8 +++++++- 4 files changed, 36 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index 02dfea5aebc..c11b7e100d8 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -22,5 +22,6 @@ extern unsigned long vmware_get_tsc_khz(void); extern int vmware_platform(void); +extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 7bd55064ffe..35ae2b75226 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -41,8 +41,17 @@ unsigned long get_hypervisor_tsc_freq(void) return 0; } +static inline void __cpuinit +hypervisor_set_feature_bits(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { + vmware_set_feature_bits(c); + return; + } +} + void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) { detect_hypervisor_vendor(c); + hypervisor_set_feature_bits(c); } - diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index d5d1b75a4b7..2ac4394fcb9 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -86,3 +86,21 @@ unsigned long vmware_get_tsc_khz(void) BUG_ON(!vmware_platform()); return __vmware_get_tsc_khz(); } + +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. + */ +void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) +{ + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); +} diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9ffb01c31c4..5977c40a138 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -108,6 +108,12 @@ void __cpuinit check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + printk(KERN_INFO + "Skipping synchronization checks as TSC is reliable.\n"); + return; + } + printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", smp_processor_id(), cpu); @@ -161,7 +167,7 @@ void __cpuinit check_tsc_sync_target(void) { int cpus = 2; - if (unsynchronized_tsc()) + if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) return; /* -- cgit v1.2.3 From 395628ef4ea12ff0748099f145363b5e33c69acb Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 24 Oct 2008 17:22:01 -0700 Subject: x86: Skip verification by the watchdog for TSC clocksource. Impact: Changes timekeeping on Vmware (or with tsc=reliable). This is achieved by resetting the CLOCKSOURCE_MUST_VERIFY flag. We add a tsc=reliable commandline option to enable this. This enables legacy hardware without HPET, LAPIC, or ACPI timers to enter high-resolution timer mode. Along with that have extended this to be used in virtualization environement too. Now we also set this flag if the X86_FEATURE_TSC_RELIABLE bit is set. This is important since there is a wrap-around problem with the acpi_pm timer. The acpi_pm counter is just 24bits and this can overflow in ~4 seconds. With the NO_HZ kernels in virtualized environment, there can be situations when the guest is descheduled for longer duration, as a result we may miss the wrap of the acpi counter. When TSC is used as a clocksource and acpi_pm timer is being used as the watchdog clocksource this error in acpi_pm results in TSC being marked as unstable, and essentially results in time dropping in chunks of 4 seconds whenever this wrap is missed. Since the virtualized TSC is reliable on VMware, we should always use the TSCs clocksource on VMware, so we skip the verfication at runtime, by checking for the feature bit. Since we reset the flag for mgeode systems too, i have combined the mgeode case with the feature bit check. Signed-off-by: Jeff Hansen Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. Peter Anvin --- arch/x86/kernel/tsc.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6dbf0bcb44a..ee01cd96b5e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -32,6 +32,7 @@ static int tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int tsc_disabled = -1; +static int tsc_clocksource_reliable; /* * Scheduler clock - returns current time in nanosec units. */ @@ -99,6 +100,15 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); +static int __init tsc_setup(char *str) +{ + if (!strcmp(str, "reliable")) + tsc_clocksource_reliable = 1; + return 1; +} + +__setup("tsc=", tsc_setup); + #define MAX_RETRIES 5 #define SMI_TRESHOLD 50000 @@ -738,24 +748,21 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { {} }; -/* - * Geode_LX - the OLPC CPU has a possibly a very reliable TSC - */ +static void __init check_system_tsc_reliable(void) +{ #ifdef CONFIG_MGEODE_LX -/* RTSC counts during suspend */ + /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 - -static void __init check_geode_tsc_reliable(void) -{ unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ if (res_low & RTSC_SUSP) - clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; -} -#else -static inline void check_geode_tsc_reliable(void) { } + tsc_clocksource_reliable = 1; #endif + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) + tsc_clocksource_reliable = 1; +} /* * Make an educated guess if the TSC is trustworthy and synchronized @@ -790,6 +797,8 @@ static void __init init_tsc_clocksource(void) { clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, clocksource_tsc.shift); + if (tsc_clocksource_reliable) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ if (check_tsc_unstable()) { clocksource_tsc.rating = 0; @@ -850,7 +859,7 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); - check_geode_tsc_reliable(); + check_system_tsc_reliable(); init_tsc_clocksource(); } -- cgit v1.2.3 From 3555105333ae55414d0fe051557bd7dc590f5255 Mon Sep 17 00:00:00 2001 From: Gary Hade Date: Fri, 31 Oct 2008 10:52:03 -0700 Subject: x86: add memory hotremove config option Impact: enable CONFIG_MEMORY_HOTREMOVE feature on x86. (default-off) Memory hotremove functionality can currently be configured into the ia64, powerpc, and s390 kernels. This patch makes it possible to configure the memory hotremove functionality into the x86 kernel as well. Signed-off-by: Badari Pulavarty Signed-off-by: Gary Hade Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c00aefcb47d..25e71152611 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1492,6 +1492,10 @@ config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) +config ARCH_ENABLE_MEMORY_HOTREMOVE + def_bool y + depends on MEMORY_HOTPLUG + config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA -- cgit v1.2.3 From 7e5e26a3d8ac4bcadb380073dc9604c07a9a6198 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 09:36:38 -0400 Subject: ftrace: fix hardirq header for non ftrace archs Impact: build fix for non-ftrace architectures Not all archs implement ftrace, and therefore do not have an asm/ftrace.h. This patch corrects the problem. The ftrace_nmi_enter/exit now must be defined for all archs that implement dynamic ftrace. Currently, only x86 does. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index a23468194b8..f8173ed1c97 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,23 +17,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } - -#ifdef CONFIG_DYNAMIC_FTRACE -extern void ftrace_nmi_enter(void); -extern void ftrace_nmi_exit(void); -#else -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif #endif /* __ASSEMBLY__ */ - -#else /* CONFIG_FUNCTION_TRACER */ - -#ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif - #endif /* CONFIG_FUNCTION_TRACER */ #endif /* _ASM_X86_FTRACE_H */ -- cgit v1.2.3 From 6bdbfe99916398dbb28d83833cc04757110f2738 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 3 Nov 2008 11:31:28 -0800 Subject: x86: VMware: Fix vmware_get_tsc code Impact: Fix possible failure to calibrate the TSC on Vmware near 4 GHz The current version of the code to get the tsc frequency from the VMware hypervisor, will be broken on processor with frequency (4G-1) HZ, because on such processors eax will have UINT_MAX and that would be legitimate. We instead check that EBX did change to decide if we were able to read the frequency from the hypervisor. Signed-off-by: Alok N Kataria Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/vmware.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 2ac4394fcb9..a0905ecfe7d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -36,7 +36,7 @@ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ "0"(VMWARE_HYPERVISOR_MAGIC), \ "1"(VMWARE_PORT_CMD_##cmd), \ - "2"(VMWARE_HYPERVISOR_PORT), "3"(0) : \ + "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ "memory"); static inline int __vmware_platform(void) @@ -53,7 +53,7 @@ static unsigned long __vmware_get_tsc_khz(void) VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (eax == (uint32_t)-1) + if (ebx == UINT_MAX) return 0; tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); -- cgit v1.2.3 From fd8cd7e1919fc1c27fe2fdccd2a1cd32f791ef0f Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 3 Nov 2008 15:50:38 -0800 Subject: x86: vmware: look for DMI string in the product serial key Impact: Should permit VMware detection on older platforms where the vendor is changed. Could theoretically cause a regression if some weird serial number scheme contains the string "VMware" by pure chance. Seems unlikely, especially with the mixed case. In some user configured cases, VMware may choose not to put a VMware specific DMI string, but the product serial key is always there and is VMware specific. Add a interface to check the serial key, when checking for VMware in the DMI information. Signed-off-by: Alok N Kataria Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/vmware.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index a0905ecfe7d..c034bda842d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -61,6 +61,11 @@ static unsigned long __vmware_get_tsc_khz(void) return tsc_hz; } +/* + * While checking the dmi string infomation, just checking the product + * serial key should be enough, as this will always have a VMware + * specific string when running under VMware hypervisor. + */ int vmware_platform(void) { if (cpu_has_hypervisor) { @@ -74,7 +79,7 @@ int vmware_platform(void) hyper_vendor_id[12] = '\0'; if (!strcmp(hyper_vendor_id, "VMwareVMware")) return 1; - } else if (dmi_available && dmi_name_in_vendors("VMware") && + } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) return 1; -- cgit v1.2.3 From 64ccf2f9a70a06ba56cd8cedfa610b4e77181587 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 5 Nov 2008 22:11:56 -0600 Subject: x86: uv: Add UV watchlist bios call Add UV bios calls to allocate and free watchlists. Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/bios.h | 17 ++++++++++++++++- arch/x86/kernel/bios_uv.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 51cadc645e6..58105c5b0b4 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -32,7 +32,9 @@ enum uv_bios_cmd { UV_BIOS_COMMON, UV_BIOS_GET_SN_INFO, - UV_BIOS_FREQ_BASE + UV_BIOS_FREQ_BASE, + UV_BIOS_WATCHLIST_ALLOC, + UV_BIOS_WATCHLIST_FREE }; /* @@ -71,6 +73,15 @@ union partition_info_u { }; }; +union uv_watchlist_u { + u64 val; + struct { + u64 blade : 16, + size : 32, + filler : 16; + }; +}; + /* * bios calls have 6 parameters */ @@ -80,9 +91,13 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); +extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, + unsigned long *); +extern int uv_bios_mq_watchlist_free(int, int); extern void uv_bios_init(void); +extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; extern long sn_partition_id; extern long sn_coherency_id; diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 7cefb7170e7..4c02b279921 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -100,6 +100,39 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, return ret; } +int +uv_bios_mq_watchlist_alloc(int blade, void *mq, unsigned int mq_size, + unsigned long *intr_mmr_offset) +{ + union uv_watchlist_u size_blade; + unsigned long addr; + u64 watchlist; + s64 ret; + + addr = (unsigned long)mq; + size_blade.size = mq_size; + size_blade.blade = blade; + + /* + * bios returns watchlist number or negative error number. + */ + ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, + size_blade.val, (u64)intr_mmr_offset, + (u64)&watchlist, 0); + if (ret < BIOS_STATUS_SUCCESS) + return ret; + + return watchlist; +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc); + +int +uv_bios_mq_watchlist_free(int blade, int watchlist_num) +{ + return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE, + blade, watchlist_num, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { -- cgit v1.2.3 From e8929c8a6acbecbd629b8e3f2d1a2546ec4ebdfc Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 5 Nov 2008 22:13:44 -0600 Subject: x86: uv: Add UV memory protection bios call Add UV bios call to change memory protections. Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/bios.h | 10 +++++++++- arch/x86/kernel/bios_uv.c | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 58105c5b0b4..a301a56d415 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -34,7 +34,8 @@ enum uv_bios_cmd { UV_BIOS_GET_SN_INFO, UV_BIOS_FREQ_BASE, UV_BIOS_WATCHLIST_ALLOC, - UV_BIOS_WATCHLIST_FREE + UV_BIOS_WATCHLIST_FREE, + UV_BIOS_MEMPROTECT }; /* @@ -82,6 +83,12 @@ union uv_watchlist_u { }; }; +enum uv_memprotect { + UV_MEMPROT_RESTRICT_ACCESS, + UV_MEMPROT_ALLOW_AMO, + UV_MEMPROT_ALLOW_RW +}; + /* * bios calls have 6 parameters */ @@ -94,6 +101,7 @@ extern s64 uv_bios_freq_base(u64, u64 *); extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); +extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern void uv_bios_init(void); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 4c02b279921..7cf6fc3d1c1 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -134,6 +134,14 @@ uv_bios_mq_watchlist_free(int blade, int watchlist_num) } EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); +s64 +uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) +{ + return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len, + perms, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); + s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, -- cgit v1.2.3 From 23c357003b3671cdfb17bc4d5383589e74b71511 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 5 Nov 2008 22:15:13 -0600 Subject: x86: uv: Add UV reserved page bios call Add UV bios call to get the address of the reserved page. Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/bios.h | 5 ++++- arch/x86/kernel/bios_uv.c | 11 +++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index a301a56d415..da1c4e8e78f 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -35,13 +35,15 @@ enum uv_bios_cmd { UV_BIOS_FREQ_BASE, UV_BIOS_WATCHLIST_ALLOC, UV_BIOS_WATCHLIST_FREE, - UV_BIOS_MEMPROTECT + UV_BIOS_MEMPROTECT, + UV_BIOS_GET_PARTITION_ADDR }; /* * Status values returned from a BIOS call. */ enum { + BIOS_STATUS_MORE_PASSES = 1, BIOS_STATUS_SUCCESS = 0, BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, BIOS_STATUS_EINVAL = -EINVAL, @@ -102,6 +104,7 @@ extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); +extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern void uv_bios_init(void); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 7cf6fc3d1c1..d22d0f1bbea 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -142,6 +142,17 @@ uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) } EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); +s64 +uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) +{ + s64 ret; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, + (u64)addr, buf, (u64)len, 0); + return ret; +} +EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); + s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, -- cgit v1.2.3 From 60a7ecf42661f2b22168751298592da6ee210c9e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Nov 2008 16:05:44 -0500 Subject: ftrace: add quick function trace stop Impact: quick start and stop of function tracer This patch adds a way to disable the function tracer quickly without the need to run kstop_machine. It adds a new variable called function_trace_stop which will stop the calls to functions from mcount when set. This is just an on/off switch and does not handle recursion like preempt_disable(). It's main purpose is to help other tracers/debuggers start and stop tracing fuctions without the need to call kstop_machine. The config option HAVE_FUNCTION_TRACE_MCOUNT_TEST is added for archs that implement the testing of the function_trace_stop in the mcount arch dependent code. Otherwise, the test is done in the C code. x86 is the only arch at the moment that supports this. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/kernel/entry_32.S | 6 ++++++ arch/x86/kernel/entry_64.S | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6f20718d315..d09e812c622 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,6 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597ef9ca..9134de814c9 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1157,6 +1157,9 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1180,6 +1183,9 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpl $ftrace_stub, ftrace_trace_function jnz trace .globl ftrace_stub diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a..08aa6b10933 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -68,6 +68,8 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub /* taken from glibc */ subq $0x38, %rsp @@ -103,6 +105,9 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpq $ftrace_stub, ftrace_trace_function jnz trace .globl ftrace_stub -- cgit v1.2.3 From fd51b2d7d5df932767b89e00d0871a38a2c53e74 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 5 Nov 2008 02:27:19 +0900 Subject: x86: update CONFIG_NUMA description Impact: clarify/update CONFIG_NUMA text CONFIG_NUMA description talk about a bit old thing. So, following changes are better. o CONFIG_NUMA is no longer EXPERIMENTAL o Opteron is not the only processor of NUMA topology on x86_64 no longer, but also Intel Core7i has it. Signed-off-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 350bee1d54d..38ae04bf651 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -951,22 +951,26 @@ config ARCH_PHYS_ADDR_T_64BIT # Common NUMA Features config NUMA - bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" + bool "Numa Memory Allocation and Scheduler Support" depends on SMP depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help Enable NUMA (Non Uniform Memory Access) support. + The kernel will try to allocate memory used by a CPU on the local memory controller of the CPU and add some more NUMA awareness to the kernel. - For 32-bit this is currently highly experimental and should be only - used for kernel development. It might also cause boot failures. - For 64-bit this is recommended on all multiprocessor Opteron systems. - If the system is EM64T, you should say N unless your system is - EM64T NUMA. + For 64-bit this is recommended if the system is Intel Core 7i + (or later), AMD Opteron, or EM64T NUMA. + + For 32-bit this is only needed on (rare) 32-bit-only platforms + that support NUMA topologies, such as NUMAQ / Summit, or if you + boot a 32-bit kernel on a 64-bit NUMA platform. + + Otherwise, you should say N. comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) -- cgit v1.2.3 From a87d091434ed2a34d647979ab12084139ee1fe41 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Thu, 6 Nov 2008 11:10:49 -0800 Subject: x86, sched: enable wchan config menu item on 64-bit Enable the wchan config menu item for now on x86-64 arch? This will at least allow people to enable/disable frame pointers on scheduler functions. Signed-off-by: Ken Chen Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6f20718d315..488a4ecd0b5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -367,7 +367,7 @@ config X86_RDC321X config SCHED_NO_NO_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" - depends on X86_32 + depends on X86 help Calculate simpler /proc//wchan values. If this option is disabled then wchan values will recurse back to the -- cgit v1.2.3 From f4166c54bfe04f64603974058e44fbd7cfef0ccc Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Sun, 9 Nov 2008 14:29:21 +0100 Subject: x86, bts: DS and BTS initialization Impact: widen BTS/PEBS ptrace enablement to more CPU models Move BTS initialisation out of an #ifdef CONFIG_X86_64 guard. Assume core2 BTS and DS layout for future models of family 6 processors. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 3 +-- arch/x86/kernel/ds.c | 9 ++++----- arch/x86/kernel/ptrace.c | 9 ++++----- 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cce0b6118d5..816f27f289b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -307,12 +307,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P4); if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_P3); +#endif if (cpu_has_bts) ptrace_bts_init_intel(c); -#endif - detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 2b69994fd3a..c570252905a 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -821,17 +821,16 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { + case 0 ... 0xC: + /* sorry, don't know about them */ + break; case 0xD: case 0xE: /* Pentium M */ ds_configure(&ds_cfg_var); break; - case 0xF: /* Core2 */ - case 0x1C: /* Atom */ + default: /* Core2, Atom, ... */ ds_configure(&ds_cfg_64); break; - default: - /* sorry, don't know about them */ - break; } break; case 0xF: diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a6d8c12e10..06180dff5b2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -929,17 +929,16 @@ void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { + case 0 ... 0xC: + /* sorry, don't know about them */ + break; case 0xD: case 0xE: /* Pentium M */ bts_configure(&bts_cfg_pentium_m); break; - case 0xF: /* Core2 */ - case 0x1C: /* Atom */ + default: /* Core2, Atom, ... */ bts_configure(&bts_cfg_core2); break; - default: - /* sorry, don't know about them */ - break; } break; case 0xF: -- cgit v1.2.3 From 4e0304310f5180eee11b4edc72cf4cb78acdc634 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 10 Nov 2008 09:16:40 +0100 Subject: x86: apic - calibrate_APIC_clock remove redundant irq-enable-disable Impact: cleanup lapic_timer_setup is self-protected with local_irq_save/restore no need to use them in caller and levt is the per-cpu variable so no concurrent access from another cpu. Signed-off-by: Cyrill Gorcunov Acked-by: "Maciej W. Rozycki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 04a7f960bbc..ce90dc18413 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -672,13 +672,9 @@ static int __init calibrate_APIC_clock(void) while (lapic_cal_loops <= LAPIC_CAL_LOOPS) cpu_relax(); - local_irq_disable(); - /* Stop the lapic timer */ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); - local_irq_enable(); - /* Jiffies delta */ deltaj = lapic_cal_j2 - lapic_cal_j1; apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); -- cgit v1.2.3 From ba21ebb6abac5c46e1d818d2ceda82420bd099ba Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 10 Nov 2008 09:16:41 +0100 Subject: x86: apic - use pr_ macros for logging Impact: cleanup It saves us some source lines and shift the code a bit righter. And a multiline comment style is fixed too :-) Signed-off-by: Cyrill Gorcunov Acked-by: "Maciej W. Rozycki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 116 ++++++++++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index ce90dc18413..70879c9e393 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -559,13 +559,13 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta) } else { res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " + pr_warning("APIC calibration not consistent " "with PM Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + pr_info("APIC delta adjusted to PM-Timer: " "%lu (%ld)\n", (unsigned long)res, *delta); *delta = (long)res; } @@ -645,8 +645,7 @@ static int __init calibrate_APIC_clock(void) */ if (calibration_result < (1000000 / HZ)) { local_irq_enable(); - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); + pr_warning("APIC frequency too slow, disabling apic timer\n"); return -1; } @@ -688,8 +687,7 @@ static int __init calibrate_APIC_clock(void) local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - printk(KERN_WARNING - "APIC timer disabled due to verification failure.\n"); + pr_warning("APIC timer disabled due to verification failure.\n"); return -1; } @@ -710,7 +708,7 @@ void __init setup_boot_APIC_clock(void) * broadcast mechanism is used. On UP systems simply ignore it. */ if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); + pr_info("Disabling APIC timer\n"); /* No broadcast on UP ! */ if (num_possible_cpus() > 1) { lapic_clockevent.mult = 1; @@ -737,7 +735,7 @@ void __init setup_boot_APIC_clock(void) if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; else - printk(KERN_WARNING "APIC timer registered as dummy," + pr_warning("APIC timer registered as dummy," " due to nmi_watchdog=%d!\n", nmi_watchdog); /* Setup the lapic or request the broadcast */ @@ -769,8 +767,7 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); /* Switch it off */ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); return; @@ -1089,7 +1086,7 @@ static void __cpuinit lapic_setup_esr(void) unsigned int oldvalue, value, maxlvt; if (!lapic_is_integrated()) { - printk(KERN_INFO "No ESR for 82489DX.\n"); + pr_info("No ESR for 82489DX.\n"); return; } @@ -1100,7 +1097,7 @@ static void __cpuinit lapic_setup_esr(void) * ESR disabled - we can't do anything useful with the * errors anyway - mbligh */ - printk(KERN_INFO "Leaving ESR disabled.\n"); + pr_info("Leaving ESR disabled.\n"); return; } @@ -1294,7 +1291,7 @@ void check_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (msr & X2APIC_ENABLE) { - printk("x2apic enabled by BIOS, switching to x2apic ops\n"); + pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); x2apic_preenabled = x2apic = 1; apic_ops = &x2apic_ops; } @@ -1306,7 +1303,7 @@ void enable_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { - printk("Enabling x2apic\n"); + pr_info("Enabling x2apic\n"); wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } @@ -1321,9 +1318,8 @@ void enable_IR_x2apic(void) return; if (!x2apic_preenabled && disable_x2apic) { - printk(KERN_INFO - "Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); + pr_info("Skipped enabling x2apic and Interrupt-remapping " + "because of nox2apic\n"); return; } @@ -1331,22 +1327,19 @@ void enable_IR_x2apic(void) panic("Bios already enabled x2apic, can't enforce nox2apic"); if (!x2apic_preenabled && skip_ioapic_setup) { - printk(KERN_INFO - "Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); + pr_info("Skipped enabling x2apic and Interrupt-remapping " + "because of skipping io-apic setup\n"); return; } ret = dmar_table_init(); if (ret) { - printk(KERN_INFO - "dmar_table_init() failed with %d:\n", ret); + pr_info("dmar_table_init() failed with %d:\n", ret); if (x2apic_preenabled) panic("x2apic enabled by bios. But IR enabling failed"); else - printk(KERN_INFO - "Not enabling x2apic,Intr-remapping\n"); + pr_info("Not enabling x2apic,Intr-remapping\n"); return; } @@ -1355,7 +1348,7 @@ void enable_IR_x2apic(void) ret = save_mask_IO_APIC_setup(); if (ret) { - printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret); + pr_info("Saving IO-APIC state failed: %d\n", ret); goto end; } @@ -1390,14 +1383,11 @@ end: if (!ret) { if (!x2apic_preenabled) - printk(KERN_INFO - "Enabled x2apic and interrupt-remapping\n"); + pr_info("Enabled x2apic and interrupt-remapping\n"); else - printk(KERN_INFO - "Enabled Interrupt-remapping\n"); + pr_info("Enabled Interrupt-remapping\n"); } else - printk(KERN_ERR - "Failed to enable Interrupt-remapping and x2apic\n"); + pr_err("Failed to enable Interrupt-remapping and x2apic\n"); #else if (!cpu_has_x2apic) return; @@ -1406,8 +1396,8 @@ end: panic("x2apic enabled prior OS handover," " enable CONFIG_INTR_REMAP"); - printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); + pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " + " and x2apic\n"); #endif return; @@ -1424,7 +1414,7 @@ end: static int __init detect_init_APIC(void) { if (!cpu_has_apic) { - printk(KERN_INFO "No local APIC present\n"); + pr_info("No local APIC present\n"); return -1; } @@ -1465,8 +1455,8 @@ static int __init detect_init_APIC(void) * "lapic" specified. */ if (!force_enable_local_apic) { - printk(KERN_INFO "Local APIC disabled by BIOS -- " - "you can enable it with \"lapic\"\n"); + pr_info("Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); return -1; } /* @@ -1476,8 +1466,7 @@ static int __init detect_init_APIC(void) */ rdmsr(MSR_IA32_APICBASE, l, h); if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk(KERN_INFO - "Local APIC disabled by BIOS -- reenabling.\n"); + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; wrmsr(MSR_IA32_APICBASE, l, h); @@ -1490,7 +1479,7 @@ static int __init detect_init_APIC(void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - printk(KERN_WARNING "Could not enable APIC!\n"); + pr_warning("Could not enable APIC!\n"); return -1; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); @@ -1501,14 +1490,14 @@ static int __init detect_init_APIC(void) if (l & MSR_IA32_APICBASE_ENABLE) mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - printk(KERN_INFO "Found and enabled local APIC!\n"); + pr_info("Found and enabled local APIC!\n"); apic_pm_activate(); return 0; no_apic: - printk(KERN_INFO "No local APIC present or hardware disabled\n"); + pr_info("No local APIC present or hardware disabled\n"); return -1; } #endif @@ -1584,12 +1573,12 @@ int __init APIC_init_uniprocessor(void) { #ifdef CONFIG_X86_64 if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); + pr_info("Apic disabled\n"); return -1; } if (!cpu_has_apic) { disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); + pr_info("Apic disabled by BIOS\n"); return -1; } #else @@ -1601,8 +1590,8 @@ int __init APIC_init_uniprocessor(void) */ if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n", - boot_cpu_physical_apicid); + pr_err("BIOS bug, local APIC 0x%x not detected!...\n", + boot_cpu_physical_apicid); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } @@ -1695,8 +1684,8 @@ void smp_spurious_interrupt(struct pt_regs *regs) add_pda(irq_spurious_count, 1); #else /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " - "should never happen.\n", smp_processor_id()); + pr_info("spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); __get_cpu_var(irq_stat).irq_spurious_count++; #endif irq_exit(); @@ -1720,17 +1709,18 @@ void smp_error_interrupt(struct pt_regs *regs) ack_APIC_irq(); atomic_inc(&irq_err_count); - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + /* + * Here is what the APIC error bits mean: + * 0: Send CS error + * 1: Receive CS error + * 2: Send accept error + * 3: Receive accept error + * 4: Reserved + * 5: Send illegal vector + * 6: Received illegal vector + * 7: Illegal register address + */ + pr_debug("APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); irq_exit(); } @@ -1834,15 +1824,15 @@ void __cpuinit generic_processor_info(int apicid, int version) * Validate version */ if (version == 0x0) { - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); + pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + version); version = 0x10; } apic_version[apicid] = version; if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + pr_warning("WARNING: NR_CPUS limit of %i reached." " Processor ignored.\n", NR_CPUS); return; } @@ -2205,7 +2195,7 @@ static int __init apic_set_verbosity(char *arg) else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" + pr_warning("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } -- cgit v1.2.3 From ae1e9130bfb9ad55eb97ec3fb17a122b7a118f98 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 09:05:16 +0100 Subject: sched: rename SCHED_NO_NO_OMIT_FRAME_POINTER => SCHED_OMIT_FRAME_POINTER Impact: cleanup, change .config option name We had this ugly config name for a long time for hysteric raisons. Rename it to a saner name. We still cannot get rid of it completely, until /proc//stack usage replaces WCHAN usage for good. We'll be able to do that in the v2.6.29/v2.6.30 timeframe. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d5550d19b6..74db682ec1c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -364,7 +364,7 @@ config X86_RDC321X as R-8610-(G). If you don't have one of these chips, you should say N here. -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" depends on X86 -- cgit v1.2.3 From caf4b323b02a16c92fba449952ac6515ddc76d7a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 07:03:45 +0100 Subject: tracing, x86: add low level support for ftrace return tracing Impact: add infrastructure for function-return tracing Add low level support for ftrace return tracing. This plug-in stores return addresses on the thread_info structure of the current task. The index of the current return address is initialized when the task is the first one (init) and when a process forks (the child). It is not needed when a task does a sys_execve because after this syscall, it still needs to return on the kernel functions it called. Note that the code of return_to_handler has been suggested by Steven Rostedt as almost all of the ideas of improvements in this V3. For purpose of security, arch/x86/kernel/process_32.c is not traced because __switch_to() changes the current task during its execution. That could cause inconsistency in the stored return address of this function even if I didn't have any crash after testing with tracing on this function enabled. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 26 ++++++ arch/x86/include/asm/thread_info.h | 24 +++++ arch/x86/kernel/Makefile | 6 ++ arch/x86/kernel/entry_32.S | 33 +++++++ arch/x86/kernel/ftrace.c | 181 +++++++++++++++++++++++++++++++++++-- 6 files changed, 264 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 27b8a3a3991..ca91e50bdb1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -11,6 +11,7 @@ config 64BIT config X86_32 def_bool !64BIT + select HAVE_FUNCTION_RET_TRACER config X86_64 def_bool 64BIT diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f8173ed1c97..9b6a1fa19e7 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -20,4 +20,30 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_RET_TRACER +#define FTRACE_RET_STACK_SIZE 20 + +#ifndef __ASSEMBLY__ + +/* + * Stack of return addresses for functions + * of a thread. + * Used in struct thread_info + */ +struct ftrace_ret_stack { + unsigned long ret; + unsigned long func; + unsigned long long calltime; +}; + +/* + * Primary handler of a function return. + * It relays on ftrace_return_to_handler. + * Defined in entry32.S + */ +extern void return_to_handler(void); + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_FUNCTION_RET_TRACER */ + #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad..a71158369fd 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -20,6 +20,7 @@ struct task_struct; struct exec_domain; #include +#include struct thread_info { struct task_struct *task; /* main task structure */ @@ -38,8 +39,30 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif + +#ifdef CONFIG_FUNCTION_RET_TRACER + /* Index of current stored adress in ret_stack */ + int curr_ret_stack; + /* Stack of return addresses for return function tracing */ + struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; +#endif }; +#ifdef CONFIG_FUNCTION_RET_TRACER +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ + .curr_ret_stack = -1,\ +} +#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -52,6 +75,7 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ } +#endif #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e489ff9cb3e..1d8ed95da84 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif +ifdef CONFIG_FUNCTION_RET_TRACER +# Don't trace __switch_to() but let it for function tracer +CFLAGS_REMOVE_process_32.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: @@ -65,6 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9134de814c9..9a0ac85946d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1188,6 +1188,10 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace +#ifdef CONFIG_FUNCTION_RET_TRACER + cmpl $ftrace_stub, ftrace_function_return + jnz trace_return +#endif .globl ftrace_stub ftrace_stub: ret @@ -1206,8 +1210,37 @@ trace: popl %edx popl %ecx popl %eax + jmp ftrace_stub +#ifdef CONFIG_FUNCTION_RET_TRACER +trace_return: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + pushl %eax + lea 0x4(%ebp), %eax + pushl %eax + call prepare_ftrace_return + addl $8, %esp + popl %edx + popl %ecx + popl %eax jmp ftrace_stub + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif /* CONFIG_FUNCTION_RET_TRACER */ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 69149337f2f..d68033bba22 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -18,10 +18,173 @@ #include #include +#include #include +#include -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +#ifdef CONFIG_FUNCTION_RET_TRACER + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} + +/* + * Synchronize accesses to return adresses stack with + * interrupts. + */ +static raw_spinlock_t ret_stack_lock; + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func) +{ + int index; + struct thread_info *ti; + unsigned long flags; + int err = 0; + + raw_local_irq_save(flags); + __raw_spin_lock(&ret_stack_lock); + + ti = current_thread_info(); + /* The return trace stack is full */ + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { + err = -EBUSY; + goto out; + } + + index = ++ti->curr_ret_stack; + ti->ret_stack[index].ret = ret; + ti->ret_stack[index].func = func; + ti->ret_stack[index].calltime = time; + +out: + __raw_spin_unlock(&ret_stack_lock); + raw_local_irq_restore(flags); + return err; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(unsigned long *ret, unsigned long long *time, + unsigned long *func) +{ + struct thread_info *ti; + int index; + unsigned long flags; + + raw_local_irq_save(flags); + __raw_spin_lock(&ret_stack_lock); + + ti = current_thread_info(); + index = ti->curr_ret_stack; + *ret = ti->ret_stack[index].ret; + *func = ti->ret_stack[index].func; + *time = ti->ret_stack[index].calltime; + ti->curr_ret_stack--; + + __raw_spin_unlock(&ret_stack_lock); + raw_local_irq_restore(flags); +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_retfunc trace; + pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_function_return(&trace); + + return trace.ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +asmlinkage +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (atomic_read(&in_nmi)) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. + */ + asm volatile( + "1: movl (%[parent_old]), %[old]\n" + "2: movl %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + ".section __ex_table, \"a\"\n" + " .long 1b, 3b\n" + " .long 2b, 3b\n" + ".previous\n" + + : [parent_replaced] "=rm" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (WARN_ON(faulted)) { + unregister_ftrace_return(); + return; + } + + if (WARN_ON(!__kernel_text_address(old))) { + unregister_ftrace_return(); + *parent = old; + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, self_addr) == -EBUSY) + *parent = old; +} + +static int __init init_ftrace_function_return(void) +{ + ret_stack_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + return 0; +} +device_initcall(init_ftrace_function_return); + + +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; @@ -31,17 +194,11 @@ union ftrace_code_union { } __attribute__((packed)); }; - static int ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); } -unsigned char *ftrace_nop_replace(void) -{ - return ftrace_nop; -} - unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -183,6 +340,15 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) } + + +static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop; +} + int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) @@ -292,3 +458,4 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } +#endif -- cgit v1.2.3 From f1c4be5edad3756212cbbbeab39428fe90c27109 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 10:22:36 +0100 Subject: tracing, x86: clean up FUNCTION_RET_TRACER Kconfig Impact: cleanup move FUNCTION_RET_TRACER to the X86 select section, where we have all the other options. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ca91e50bdb1..0de793cf214 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -11,7 +11,6 @@ config 64BIT config X86_32 def_bool !64BIT - select HAVE_FUNCTION_RET_TRACER config X86_64 def_bool 64BIT @@ -30,6 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_RET_TRACER if X86_32 select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER -- cgit v1.2.3 From 867f7fb3ebb831970847b179e7df5a9ab10da16d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 11:18:14 +0100 Subject: tracing, x86: function return tracer, fix assembly constraints fix: arch/x86/kernel/ftrace.c: Assembler messages: arch/x86/kernel/ftrace.c:140: Error: missing ')' arch/x86/kernel/ftrace.c:140: Error: junk `(%ebp))' after expression arch/x86/kernel/ftrace.c:141: Error: missing ')' arch/x86/kernel/ftrace.c:141: Error: junk `(%ebp))' after expression the [parent_replaced] is used in an =rm fashion, so that constraint is correct in isolation - but [parent_old] aliases register %0 and uses it in an addressing mode that is only valid with registers - so change the constraint from =rm to =r. This fixes the build failure. Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d68033bba22..9b2325a4d53 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -151,7 +151,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) " .long 2b, 3b\n" ".previous\n" - : [parent_replaced] "=rm" (parent), [old] "=r" (old), + : [parent_replaced] "=r" (parent), [old] "=r" (old), [faulted] "=r" (faulted) : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) : "memory" -- cgit v1.2.3 From a3d732f93785da17e0137210deadb4616f5536fc Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 10 Nov 2008 16:16:31 -0600 Subject: x86, UV: fix redundant creation of sgi_uv Impact: fix double entry creation in /proc There is a collision between two UV functions: both uv_ptc_init() and gru_proc_init() try to make /proc/sgi_uv So move it's creation to a single place: uv_system_init() Signed-off-by: Cliff Wickman Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 2 ++ arch/x86/kernel/tlb_uv.c | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 85fb7dd48f6..d7213a1cb78 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -570,4 +571,5 @@ void __init uv_system_init(void) uv_cpu_init(); uv_scir_register_cpu_notifier(); + proc_mkdir("sgi_uv", NULL); } diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 04431f34fd1..6a00e5faaa7 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -566,14 +566,10 @@ static int __init uv_ptc_init(void) if (!is_uv_system()) return 0; - if (!proc_mkdir("sgi_uv", NULL)) - return -EINVAL; - proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); if (!proc_uv_ptc) { printk(KERN_ERR "unable to create %s proc entry\n", UV_PTC_BASENAME); - remove_proc_entry("sgi_uv", NULL); return -EINVAL; } proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; -- cgit v1.2.3 From 19b3e9671c5a219b8c34da2cc66e0ce7c3a501ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 11:57:02 +0100 Subject: tracing: function return tracer, build fix fix: arch/x86/kernel/ftrace.c: In function 'ftrace_return_to_handler': arch/x86/kernel/ftrace.c:112: error: implicit declaration of function 'cpu_clock' cpu_clock() is implicitly included via a number of ways, but its real location is sched.h. (Build failure is triggerable if enough other kernel components are turned off.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9b2325a4d53..16a571dea2e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From d3ec5cae0921611ceae06464ef6291012dd9849f Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 11 Nov 2008 14:33:44 +0100 Subject: x86: call machine_shutdown and stop all CPUs in native_machine_halt Impact: really halt all CPUs on halt Function machine_halt (resp. native_machine_halt) is empty for x86 architectures. When command 'halt -f' is invoked, the message "System halted." is displayed but this is not really true because all CPUs are still running. There are also similar inconsistencies for other arches (some uses power-off for halt or forever-loop with IRQs enabled/disabled). IMO there should be used the same approach for all architectures OR what does the message "System halted" really mean? This patch fixes it for x86. Signed-off-by: Ivan Vecera Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 1 + arch/x86/include/asm/system.h | 2 ++ arch/x86/kernel/process.c | 16 ++++++++++++++++ arch/x86/kernel/reboot.c | 5 +++++ arch/x86/kernel/smp.c | 13 ------------- 5 files changed, 24 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3b1510b4fc5..25caa0738af 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -193,6 +193,7 @@ extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } +static inline void disable_local_APIC(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2ed3f0f44ff..07c3e404899 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -314,6 +314,8 @@ extern void free_init_pages(char *what, unsigned long begin, unsigned long end); void default_idle(void); +void stop_this_cpu(void *dummy); + /* * Force strict CPU ordering. * And yes, this is required on UP too when we're talking diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d..a4da7c4b312 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,6 +8,7 @@ #include #include #include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -122,6 +123,21 @@ void default_idle(void) EXPORT_SYMBOL(default_idle); #endif +void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + disable_local_APIC(); + + for (;;) { + if (hlt_works(smp_processor_id())) + halt(); + } +} + static void do_nothing(void *unused) { } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 724adfc63cb..34f8d37ae3c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -461,6 +461,11 @@ static void native_machine_restart(char *__unused) static void native_machine_halt(void) { + /* stop other cpus and apics */ + machine_shutdown(); + + /* stop this cpu */ + stop_this_cpu(NULL); } static void native_machine_power_off(void) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18f9b19f5f8..3f92b134ab9 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -140,19 +140,6 @@ void native_send_call_func_ipi(cpumask_t mask) send_IPI_mask(mask, CALL_FUNCTION_VECTOR); } -static void stop_this_cpu(void *dummy) -{ - local_irq_disable(); - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - disable_local_APIC(); - if (hlt_works(smp_processor_id())) - for (;;) halt(); - for (;;); -} - /* * this function calls the 'stop' function on all other CPUs in the system. */ -- cgit v1.2.3 From a98f8fd24fb24fcb9a359553e64dd6aac5cf4279 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 6 Nov 2008 01:13:39 +0100 Subject: x86: apic reset counter on shutdown Impact: avoid spurious lapic timer events on shutdown The apic timer might be close to firing when it is shutdown. We can not really disable the timer - we just mask the interrupt. That way we can get an extra interrupt when it is reenabled. Set the counter to max on shutdown to avoid this. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 70879c9e393..1d410ee4b06 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -441,6 +441,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); + apic_write(APIC_TMICT, 0xffffffff); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ -- cgit v1.2.3 From c280ea5e4c6ba0b38ed6b005150fe16a660e903b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 8 Nov 2008 13:29:45 +0100 Subject: x86: fix documentation typo in arch/x86/Kconfig Impact: documentation update Chris Snook pointed out that it's Core i7, not Core 7i. Reported-by: Chris Snook Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 38ae04bf651..bacac556b18 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -963,7 +963,7 @@ config NUMA local memory controller of the CPU and add some more NUMA awareness to the kernel. - For 64-bit this is recommended if the system is Intel Core 7i + For 64-bit this is recommended if the system is Intel Core i7 (or later), AMD Opteron, or EM64T NUMA. For 32-bit this is only needed on (rare) 32-bit-only platforms -- cgit v1.2.3 From 14d7ca5c575853664d8fe4f225a77b8df1b7de7d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 11 Nov 2008 16:19:48 -0800 Subject: x86: attempt reboot via port CF9 if we have standard PCI ports Impact: Changes reboot behavior. If port CF9 seems to be safe to touch, attempt it before trying the keyboard controller. Port CF9 is not available on all chipsets (a significant but decreasing number of modern chipsets don't implement it), but port CF9 itself should in general be safe to poke (no ill effects if unimplemented) on any system which has PCI Configuration Method #1 or #2, as it falls inside the PCI configuration port range in both cases. No chipset without PCI is known to have port CF9, either, although an explicit "pci=bios" would mean we miss this and therefore don't use port CF9. An explicit "reboot=pci" can be used to force the use of port CF9. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/emergency-restart.h | 4 +++- arch/x86/kernel/reboot.c | 34 +++++++++++++++++++++++++------- arch/x86/pci/direct.c | 4 +++- arch/x86/pci/pci.h | 1 + 4 files changed, 34 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index 94826cf8745..cc70c1c78ca 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -8,7 +8,9 @@ enum reboot_type { BOOT_BIOS = 'b', #endif BOOT_ACPI = 'a', - BOOT_EFI = 'e' + BOOT_EFI = 'e', + BOOT_CF9 = 'p', + BOOT_CF9_COND = 'q', }; extern enum reboot_type reboot_type; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 34f8d37ae3c..ddc93891cdc 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -29,14 +29,17 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +enum reboot_type reboot_type = BOOT_CF9_COND; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) static int reboot_cpu = -1; #endif -/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] +/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ +bool port_cf9_safe = false; + +/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] warm Don't set the cold reboot flag cold Set the cold reboot flag bios Reboot by jumping through the BIOS (only for X86_32) @@ -45,6 +48,7 @@ static int reboot_cpu = -1; kbd Use the keyboard controller. cold reset (default) acpi Use the RESET_REG in the FADT efi Use efi reset_system runtime service + pci Use the so-called "PCI reset register", CF9 force Avoid anything that could hang. */ static int __init reboot_setup(char *str) @@ -79,6 +83,7 @@ static int __init reboot_setup(char *str) case 'k': case 't': case 'e': + case 'p': reboot_type = *str; break; @@ -379,28 +384,43 @@ static void native_machine_emergency_restart(void) load_idt(&no_idt); __asm__ __volatile__("int3"); - reboot_type = BOOT_KBD; + reboot_type = BOOT_CF9_COND; break; #ifdef CONFIG_X86_32 case BOOT_BIOS: machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); - reboot_type = BOOT_KBD; + reboot_type = BOOT_CF9_COND; break; #endif case BOOT_ACPI: acpi_reboot(); - reboot_type = BOOT_KBD; + reboot_type = BOOT_CF9_COND; break; - case BOOT_EFI: if (efi_enabled) - efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, + efi.reset_system(reboot_mode ? + EFI_RESET_WARM : + EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); + reboot_type = BOOT_CF9_COND; + break; + + case BOOT_CF9: + port_cf9_safe = true; + /* fall through */ + case BOOT_CF9_COND: + if (port_cf9_safe) { + u8 cf9 = inb(0xcf9) & ~6; + outb(cf9|2, 0xcf9); /* Request hard reset */ + udelay(50); + outb(cf9|6, 0xcf9); /* Actually do the reset */ + udelay(50); + } reboot_type = BOOT_KBD; break; } diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 9915293500f..9a5af6c8fbe 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, #undef PCI_CONF2_ADDRESS -static struct pci_raw_ops pci_direct_conf2 = { +struct pci_raw_ops pci_direct_conf2 = { .read = pci_conf2_read, .write = pci_conf2_write, }; @@ -289,6 +289,7 @@ int __init pci_direct_probe(void) if (pci_check_type1()) { raw_pci_ops = &pci_direct_conf1; + port_cf9_safe = true; return 1; } release_resource(region); @@ -305,6 +306,7 @@ int __init pci_direct_probe(void) if (pci_check_type2()) { raw_pci_ops = &pci_direct_conf2; + port_cf9_safe = true; return 2; } diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index 15b9cf6be72..1959018aac0 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -96,6 +96,7 @@ extern struct pci_raw_ops *raw_pci_ops; extern struct pci_raw_ops *raw_pci_ext_ops; extern struct pci_raw_ops pci_direct_conf1; +extern bool port_cf9_safe; /* arch_initcall level */ extern int pci_direct_probe(void); -- cgit v1.2.3 From 1f0d69a9fc815db82f15722bf05227190b1d714d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 00:14:39 -0500 Subject: tracing: profile likely and unlikely annotations Impact: new unlikely/likely profiler Andrew Morton recently suggested having an in-kernel way to profile likely and unlikely macros. This patch achieves that goal. When configured, every(*) likely and unlikely macro gets a counter attached to it. When the condition is hit, the hit and misses of that condition are recorded. These numbers can later be retrieved by: /debugfs/tracing/profile_likely - All likely markers /debugfs/tracing/profile_unlikely - All unlikely markers. # cat /debug/tracing/profile_unlikely | head correct incorrect % Function File Line ------- --------- - -------- ---- ---- 2167 0 0 do_arch_prctl process_64.c 832 0 0 0 do_arch_prctl process_64.c 804 2670 0 0 IS_ERR err.h 34 71230 5693 7 __switch_to process_64.c 673 76919 0 0 __switch_to process_64.c 639 43184 33743 43 __switch_to process_64.c 624 12740 64181 83 __switch_to process_64.c 594 12740 64174 83 __switch_to process_64.c 590 # cat /debug/tracing/profile_unlikely | \ awk '{ if ($3 > 25) print $0; }' |head -20 44963 35259 43 __switch_to process_64.c 624 12762 67454 84 __switch_to process_64.c 594 12762 67447 84 __switch_to process_64.c 590 1478 595 28 syscall_get_error syscall.h 51 0 2821 100 syscall_trace_leave ptrace.c 1567 0 1 100 native_smp_prepare_cpus smpboot.c 1237 86338 265881 75 calc_delta_fair sched_fair.c 408 210410 108540 34 calc_delta_mine sched.c 1267 0 54550 100 sched_info_queued sched_stats.h 222 51899 66435 56 pick_next_task_fair sched_fair.c 1422 6 10 62 yield_task_fair sched_fair.c 982 7325 2692 26 rt_policy sched.c 144 0 1270 100 pre_schedule_rt sched_rt.c 1261 1268 48073 97 pick_next_task_rt sched_rt.c 884 0 45181 100 sched_info_dequeued sched_stats.h 177 0 15 100 sched_move_task sched.c 8700 0 15 100 sched_move_task sched.c 8690 53167 33217 38 schedule sched.c 4457 0 80208 100 sched_info_switch sched_stats.h 270 30585 49631 61 context_switch sched.c 2619 # cat /debug/tracing/profile_likely | awk '{ if ($3 > 25) print $0; }' 39900 36577 47 pick_next_task sched.c 4397 20824 15233 42 switch_mm mmu_context_64.h 18 0 7 100 __cancel_work_timer workqueue.c 560 617 66484 99 clocksource_adjust timekeeping.c 456 0 346340 100 audit_syscall_exit auditsc.c 1570 38 347350 99 audit_get_context auditsc.c 732 0 345244 100 audit_syscall_entry auditsc.c 1541 38 1017 96 audit_free auditsc.c 1446 0 1090 100 audit_alloc auditsc.c 862 2618 1090 29 audit_alloc auditsc.c 858 0 6 100 move_masked_irq migration.c 9 1 198 99 probe_sched_wakeup trace_sched_switch.c 58 2 2 50 probe_wakeup trace_sched_wakeup.c 227 0 2 100 probe_wakeup_sched_switch trace_sched_wakeup.c 144 4514 2090 31 __grab_cache_page filemap.c 2149 12882 228786 94 mapping_unevictable pagemap.h 50 4 11 73 __flush_cpu_slab slub.c 1466 627757 330451 34 slab_free slub.c 1731 2959 61245 95 dentry_lru_del_init dcache.c 153 946 1217 56 load_elf_binary binfmt_elf.c 904 102 82 44 disk_put_part genhd.h 206 1 1 50 dst_gc_task dst.c 82 0 19 100 tcp_mss_split_point tcp_output.c 1126 As you can see by the above, there's a bit of work to do in rethinking the use of some unlikelys and likelys. Note: the unlikely case had 71 hits that were more than 25%. Note: After submitting my first version of this patch, Andrew Morton showed me a version written by Daniel Walker, where I picked up the following ideas from: 1) Using __builtin_constant_p to avoid profiling fixed values. 2) Using __FILE__ instead of instruction pointers. 3) Using the preprocessor to stop all profiling of likely annotations from vsyscall_64.c. Thanks to Andrew Morton, Arjan van de Ven, Theodore Tso and Ingo Molnar for their feed back on this patch. (*) Not ever unlikely is recorded, those that are used by vsyscalls (a few of them) had to have profiling disabled. Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: Frederic Weisbecker Cc: Theodore Tso Cc: Arjan van de Ven Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0b8b6690a86..2f90202e59b 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -17,6 +17,14 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ +/* Protect userspace from profiling */ +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE +# undef likely +# undef unlikely +# define likely(x) likely_notrace(x) +# define unlikely(x) unlikely_notrace(x) +#endif + #include #include #include -- cgit v1.2.3 From 2b7d0390a6d6d595f43ea3806639664afe5b9ebe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2008 13:17:38 +0100 Subject: tracing: branch tracer, fix vdso crash Impact: fix bootup crash the branch tracer missed arch/x86/vdso/vclock_gettime.c from disabling tracing, which caused such bootup crashes: [ 201.840097] init[1]: segfault at 7fffed3fe7c0 ip 00007fffed3fea2e sp 000077 also clean up the ugly ifdefs in arch/x86/kernel/vsyscall_64.c by creating DISABLE_UNLIKELY_PROFILE facility for code to turn off instrumentation on a per file basis. Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 9 ++------- arch/x86/vdso/vclock_gettime.c | 3 +++ 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 2f90202e59b..ece02932ea5 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -17,13 +17,8 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ -/* Protect userspace from profiling */ -#ifdef CONFIG_TRACE_UNLIKELY_PROFILE -# undef likely -# undef unlikely -# define likely(x) likely_notrace(x) -# define unlikely(x) unlikely_notrace(x) -#endif +/* Disable profiling for userspace code: */ +#define DISABLE_UNLIKELY_PROFILE #include #include diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 1ef0f90813d..6e667631e7d 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -9,6 +9,9 @@ * Also alternative() doesn't work. */ +/* Disable profiling for userspace code: */ +#define DISABLE_UNLIKELY_PROFILE + #include #include #include -- cgit v1.2.3 From a7d41820f683c35b53af719210a51f6aa0f86a6a Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 12 Nov 2008 11:34:37 -0200 Subject: x86 kdump: extract kdump-specific code from crash_nmi_callback() Impact: cleanup The NMI CPU-halting code will be used on non-kdump cases, also (e.g. emergency_reboot when virtualization is enabled). Signed-off-by: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 26855381790..60475422a51 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -35,19 +35,34 @@ static int crashing_cpu; #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct notifier_block *self, - unsigned long val, void *data) +static void kdump_nmi_callback(int cpu, struct die_args *args) { struct pt_regs *regs; #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; #endif + + regs = args->regs; + +#ifdef CONFIG_X86_32 + if (!user_mode_vm(regs)) { + crash_fixup_ss_esp(&fixed_regs, regs); + regs = &fixed_regs; + } +#endif + crash_save_cpu(regs, cpu); + + disable_local_APIC(); +} + +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) +{ int cpu; if (val != DIE_NMI_IPI) return NOTIFY_OK; - regs = ((struct die_args *)data)->regs; cpu = raw_smp_processor_id(); /* Don't do anything if this handler is invoked on crashing cpu. @@ -58,14 +73,8 @@ static int crash_nmi_callback(struct notifier_block *self, return NOTIFY_STOP; local_irq_disable(); -#ifdef CONFIG_X86_32 - if (!user_mode_vm(regs)) { - crash_fixup_ss_esp(&fixed_regs, regs); - regs = &fixed_regs; - } -#endif - crash_save_cpu(regs, cpu); - disable_local_APIC(); + kdump_nmi_callback(cpu, (struct die_args *)data); + atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ halt(); -- cgit v1.2.3 From b2bbe71b829564fb65a6bc7e1e25e02d70cffce8 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 12 Nov 2008 11:34:38 -0200 Subject: x86 kdump: move crashing_cpu assignment to nmi_shootdown_cpus() Impact: cleanup This variable will be moved to non-kdump-specific code. Signed-off-by: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 60475422a51..ed2f0f9dc89 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -29,10 +29,11 @@ #include +#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + /* This keeps a track of which one is crashing cpu. */ static int crashing_cpu; -#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static atomic_t waiting_for_crash_ipi; static void kdump_nmi_callback(int cpu, struct die_args *args) @@ -100,6 +101,9 @@ static void nmi_shootdown_cpus(void) { unsigned long msecs; + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = safe_smp_processor_id(); + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); /* Would it be better to replace the trap vector here? */ if (register_die_notifier(&crash_nmi_nb)) @@ -140,8 +144,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs) /* The kernel is broken so disable interrupts */ local_irq_disable(); - /* Make a note of crashing cpu. Will be used in NMI callback.*/ - crashing_cpu = safe_smp_processor_id(); nmi_shootdown_cpus(); lapic_shutdown(); #if defined(CONFIG_X86_IO_APIC) -- cgit v1.2.3 From d1e7b91cfaa8fc5ed736dcfb8beb5134a2385228 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 12 Nov 2008 11:34:39 -0200 Subject: x86 kdump: create kdump_nmi_shootdown_cpus() Impact: cleanup For the kdump-specific code that was living on nmi_shootdown_cpus(). Signed-off-by: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ed2f0f9dc89..75c468cc7e5 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -122,10 +122,17 @@ static void nmi_shootdown_cpus(void) } /* Leave the nmi callback set */ +} + +static void kdump_nmi_shootdown_cpus(void) +{ + nmi_shootdown_cpus(); + disable_local_APIC(); } + #else -static void nmi_shootdown_cpus(void) +static void kdump_nmi_shootdown_cpus(void) { /* There are no cpus to shootdown */ } @@ -144,7 +151,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) /* The kernel is broken so disable interrupts */ local_irq_disable(); - nmi_shootdown_cpus(); + kdump_nmi_shootdown_cpus(); lapic_shutdown(); #if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); -- cgit v1.2.3 From 8e294786316aca41c66b8b73ba1ee74a4ae7d452 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 12 Nov 2008 11:34:40 -0200 Subject: x86 kdump: make kdump_nmi_callback() a function ptr on crash_nmi_callback() Impact: extend nmi_shootdown_cpus() with a callback The reboot code will use a different function on crash_nmi_callback(). Adding a function pointer parameter to nmi_shootdown_cpus() for that. Signed-off-by: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 75c468cc7e5..f23c2beeb37 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -29,10 +29,13 @@ #include +typedef void (*nmi_shootdown_cb)(int, struct die_args*); + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) /* This keeps a track of which one is crashing cpu. */ static int crashing_cpu; +static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; @@ -74,7 +77,7 @@ static int crash_nmi_callback(struct notifier_block *self, return NOTIFY_STOP; local_irq_disable(); - kdump_nmi_callback(cpu, (struct die_args *)data); + shootdown_callback(cpu, (struct die_args *)data); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -97,13 +100,15 @@ static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, }; -static void nmi_shootdown_cpus(void) +static void nmi_shootdown_cpus(nmi_shootdown_cb callback) { unsigned long msecs; /* Make a note of crashing cpu. Will be used in NMI callback.*/ crashing_cpu = safe_smp_processor_id(); + shootdown_callback = callback; + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); /* Would it be better to replace the trap vector here? */ if (register_die_notifier(&crash_nmi_nb)) @@ -126,7 +131,7 @@ static void nmi_shootdown_cpus(void) static void kdump_nmi_shootdown_cpus(void) { - nmi_shootdown_cpus(); + nmi_shootdown_cpus(kdump_nmi_callback); disable_local_APIC(); } -- cgit v1.2.3 From c370e5e089adfd5b1b863f3464cccae9ebf33cca Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 12 Nov 2008 11:34:41 -0200 Subject: x86 kdump: make nmi_shootdown_cpus() non-static Impact: make API available to the rest of x86 platform code Add prototype to asm/reboot.h. Signed-off-by: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/include/asm/reboot.h | 5 +++++ arch/x86/kernel/crash.c | 3 +-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index df7710354f8..562d4fd31ba 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_REBOOT_H #define _ASM_X86_REBOOT_H +#include + struct pt_regs; struct machine_ops { @@ -18,4 +20,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); void machine_real_restart(const unsigned char *code, int length); +typedef void (*nmi_shootdown_cb)(int, struct die_args*); +void nmi_shootdown_cpus(nmi_shootdown_cb callback); + #endif /* _ASM_X86_REBOOT_H */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index f23c2beeb37..fb298d1daac 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -29,7 +29,6 @@ #include -typedef void (*nmi_shootdown_cb)(int, struct die_args*); #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -100,7 +99,7 @@ static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, }; -static void nmi_shootdown_cpus(nmi_shootdown_cb callback) +void nmi_shootdown_cpus(nmi_shootdown_cb callback) { unsigned long msecs; -- cgit v1.2.3 From 2ddded213895e41b9cfe1c084127e6c01632ac1a Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 12 Nov 2008 11:34:42 -0200 Subject: x86: move nmi_shootdown_cpus() to reboot.c Impact: make nmi_shootdown_cpus() available to the rest of the x86 platform Now nmi_shootdown_cpus() is ready to be used by non-kdump code also. Move it to reboot.c. Signed-off-by: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 76 -------------------------------------------- arch/x86/kernel/reboot.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index fb298d1daac..d84a852e4cd 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -32,12 +32,6 @@ #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; -static nmi_shootdown_cb shootdown_callback; - -static atomic_t waiting_for_crash_ipi; - static void kdump_nmi_callback(int cpu, struct die_args *args) { struct pt_regs *regs; @@ -58,76 +52,6 @@ static void kdump_nmi_callback(int cpu, struct die_args *args) disable_local_APIC(); } -static int crash_nmi_callback(struct notifier_block *self, - unsigned long val, void *data) -{ - int cpu; - - if (val != DIE_NMI_IPI) - return NOTIFY_OK; - - cpu = raw_smp_processor_id(); - - /* Don't do anything if this handler is invoked on crashing cpu. - * Otherwise, system will completely hang. Crashing cpu can get - * an NMI if system was initially booted with nmi_watchdog parameter. - */ - if (cpu == crashing_cpu) - return NOTIFY_STOP; - local_irq_disable(); - - shootdown_callback(cpu, (struct die_args *)data); - - atomic_dec(&waiting_for_crash_ipi); - /* Assume hlt works */ - halt(); - for (;;) - cpu_relax(); - - return 1; -} - -static void smp_send_nmi_allbutself(void) -{ - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); -} - -static struct notifier_block crash_nmi_nb = { - .notifier_call = crash_nmi_callback, -}; - -void nmi_shootdown_cpus(nmi_shootdown_cb callback) -{ - unsigned long msecs; - - /* Make a note of crashing cpu. Will be used in NMI callback.*/ - crashing_cpu = safe_smp_processor_id(); - - shootdown_callback = callback; - - atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - /* Would it be better to replace the trap vector here? */ - if (register_die_notifier(&crash_nmi_nb)) - return; /* return what? */ - /* Ensure the new callback function is set before sending - * out the NMI - */ - wmb(); - - smp_send_nmi_allbutself(); - - msecs = 1000; /* Wait at most a second for the other cpus to stop */ - while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { - mdelay(1); - msecs--; - } - - /* Leave the nmi callback set */ -} - static void kdump_nmi_shootdown_cpus(void) { nmi_shootdown_cpus(kdump_nmi_callback); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 724adfc63cb..364edeecc23 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -21,6 +21,9 @@ # include #endif +#include + + /* * Power off function, if any */ @@ -514,3 +517,83 @@ void machine_crash_shutdown(struct pt_regs *regs) machine_ops.crash_shutdown(regs); } #endif + + +#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + +/* This keeps a track of which one is crashing cpu. */ +static int crashing_cpu; +static nmi_shootdown_cb shootdown_callback; + +static atomic_t waiting_for_crash_ipi; + +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) +{ + int cpu; + + if (val != DIE_NMI_IPI) + return NOTIFY_OK; + + cpu = raw_smp_processor_id(); + + /* Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. + */ + if (cpu == crashing_cpu) + return NOTIFY_STOP; + local_irq_disable(); + + shootdown_callback(cpu, (struct die_args *)data); + + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + halt(); + for (;;) + cpu_relax(); + + return 1; +} + +static void smp_send_nmi_allbutself(void) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(safe_smp_processor_id(), mask); + if (!cpus_empty(mask)) + send_IPI_mask(mask, NMI_VECTOR); +} + +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + unsigned long msecs; + + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = safe_smp_processor_id(); + + shootdown_callback = callback; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? */ + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ + /* Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + smp_send_nmi_allbutself(); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* Leave the nmi callback set */ +} +#endif -- cgit v1.2.3 From bb8dd270e62217e2d2172094c6c352c4ddc0a127 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 12 Nov 2008 11:34:43 -0200 Subject: x86: make nmi_shootdown_cpus() available on !SMP and !X86_LOCAL_APIC Impact: widen nmi_shootdown_cpus() availability The X86_LOCAL_APIC #ifdef was for kdump. For !SMP, the function simply does nothing. Signed-off-by: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 364edeecc23..17a41e05556 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -519,7 +519,7 @@ void machine_crash_shutdown(struct pt_regs *regs) #endif -#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_SMP) /* This keeps a track of which one is crashing cpu. */ static int crashing_cpu; @@ -568,6 +568,12 @@ static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, }; +/* Halt all other CPUs, calling the specified function on each of them + * + * This function can be used to halt all other CPUs on crash + * or emergency reboot time. The function passed as parameter + * will be called inside a NMI handler on all CPUs. + */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { unsigned long msecs; @@ -596,4 +602,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Leave the nmi callback set */ } +#else /* !CONFIG_SMP */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + /* No other CPUs to shoot down */ +} #endif -- cgit v1.2.3 From c415b3dce30dfb41234e118662e8720f47343a4f Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 12 Nov 2008 11:34:44 -0200 Subject: x86: disable IRQs before doing anything on nmi_shootdown_cpus() Impact: make nmi_shootdown_cpus() callable from preemptible context We need to know on which CPU we are running on, and we don't want to be preempted while doing this. Signed-off-by: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 17a41e05556..c3cd512484e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -577,6 +577,7 @@ static struct notifier_block crash_nmi_nb = { void nmi_shootdown_cpus(nmi_shootdown_cb callback) { unsigned long msecs; + local_irq_disable(); /* Make a note of crashing cpu. Will be used in NMI callback.*/ crashing_cpu = safe_smp_processor_id(); -- cgit v1.2.3 From 2ed84eeb8808cf3c9f039213ca137ffd7d753f0e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 15:24:24 -0500 Subject: trace: rename unlikely profiler to branch profiler Impact: name change of unlikely tracer and profiler Ingo Molnar suggested changing the config from UNLIKELY_PROFILE to BRANCH_PROFILING. I never did like the "unlikely" name so I went one step farther, and renamed all the unlikely configurations to a "BRANCH" variant. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/vdso/vclock_gettime.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index ece02932ea5..6f3d3d4cd97 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,7 +18,7 @@ */ /* Disable profiling for userspace code: */ -#define DISABLE_UNLIKELY_PROFILE +#define DISABLE_BRANCH_PROFILING #include #include diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6e667631e7d..d9d35824c56 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -10,7 +10,7 @@ */ /* Disable profiling for userspace code: */ -#define DISABLE_UNLIKELY_PROFILE +#define DISABLE_BRANCH_PROFILING #include #include -- cgit v1.2.3 From 62d59d17a5f98edb48b171742dfa531488802f07 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 12 Nov 2008 22:47:54 +0100 Subject: tracing/function-return-tracer: make the function return tracer lockless Impact: remove spinlocks and irq disabling in function return tracer. I've tried to figure out all of the race condition that could happen when the tracer pushes or pops a return address trace to/from the current thread_info. Theory: _ One thread can only execute on one cpu at a time. So this code doesn't need to be SMP-safe. Just drop the spinlock. _ The only race could happen between the current thread and an interrupt. If an interrupt is raised, it will increase the index of the return stack storage and then execute until the end of the tracing to finally free the index it used. We don't need to disable irqs. This is theorical. In practice, I've tested it with a two-core SMP and had no problem at all. Perhaps -tip testing could confirm it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 43 +++++-------------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 16a571dea2e..1db0e121a3e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -44,62 +44,37 @@ void ftrace_nmi_exit(void) atomic_dec(&in_nmi); } -/* - * Synchronize accesses to return adresses stack with - * interrupts. - */ -static raw_spinlock_t ret_stack_lock; - /* Add a function return address to the trace stack on thread info.*/ static int push_return_trace(unsigned long ret, unsigned long long time, unsigned long func) { int index; - struct thread_info *ti; - unsigned long flags; - int err = 0; - - raw_local_irq_save(flags); - __raw_spin_lock(&ret_stack_lock); + struct thread_info *ti = current_thread_info(); - ti = current_thread_info(); /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { - err = -EBUSY; - goto out; - } + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) + return -EBUSY; index = ++ti->curr_ret_stack; ti->ret_stack[index].ret = ret; ti->ret_stack[index].func = func; ti->ret_stack[index].calltime = time; -out: - __raw_spin_unlock(&ret_stack_lock); - raw_local_irq_restore(flags); - return err; + return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ static void pop_return_trace(unsigned long *ret, unsigned long long *time, unsigned long *func) { - struct thread_info *ti; int index; - unsigned long flags; - - raw_local_irq_save(flags); - __raw_spin_lock(&ret_stack_lock); - ti = current_thread_info(); + struct thread_info *ti = current_thread_info(); index = ti->curr_ret_stack; *ret = ti->ret_stack[index].ret; *func = ti->ret_stack[index].func; *time = ti->ret_stack[index].calltime; ti->curr_ret_stack--; - - __raw_spin_unlock(&ret_stack_lock); - raw_local_irq_restore(flags); } /* @@ -175,14 +150,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) *parent = old; } -static int __init init_ftrace_function_return(void) -{ - ret_stack_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - return 0; -} -device_initcall(init_ftrace_function_return); - - #endif #ifdef CONFIG_DYNAMIC_FTRACE -- cgit v1.2.3 From 1dc1c6adf38bc5799d1594681645ced40ced4b6b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 12 Nov 2008 22:49:23 +0100 Subject: tracing/function-return-tracer: call prepare_ftrace_return by registers Impact: Optimize a bit the function return tracer This patch changes the calling convention of prepare_ftrace_return to pass its arguments by register. This will optimize it a bit and prepare it to support dynamic tracing. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 5 +---- arch/x86/kernel/ftrace.c | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9a0ac85946d..f9762114983 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1217,12 +1217,9 @@ trace_return: pushl %eax pushl %ecx pushl %edx - movl 0xc(%esp), %eax - pushl %eax + movl 0xc(%esp), %edx lea 0x4(%ebp), %eax - pushl %eax call prepare_ftrace_return - addl $8, %esp popl %edx popl %ecx popl %eax diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1db0e121a3e..fe832738e1e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -95,7 +95,6 @@ unsigned long ftrace_return_to_handler(void) * Hook the return address and push it in the stack of return addrs * in current thread info. */ -asmlinkage void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) { unsigned long old; -- cgit v1.2.3 From 350b4da71f8326b9319ada7b701f2bce2e1285b7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:38:40 +1100 Subject: CRED: Wrap task credential accesses in the x86 arch Wrap access to task credentials so that they can be separated more easily from the task_struct during the introduction of COW creds. Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id(). Change some task->e?[ug]id to task_e?[ug]id(). In some places it makes more sense to use RCU directly rather than a convenient wrapper; these will be addressed by later patches. Signed-off-by: David Howells Reviewed-by: James Morris Acked-by: Serge Hallyn Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Signed-off-by: James Morris --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 31e8730fa24..3a1b6ef4f05 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -393,7 +393,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(KERN_CRIT "kernel tried to execute " "NX-protected page - exploit attempt? " - "(uid: %d)\n", current->uid); + "(uid: %d)\n", current_uid()); } #endif -- cgit v1.2.3 From a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:24 +1100 Subject: CRED: Make execve() take advantage of copy-on-write credentials Make execve() take advantage of copy-on-write credentials, allowing it to set up the credentials in advance, and then commit the whole lot after the point of no return. This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). The credential bits from struct linux_binprm are, for the most part, replaced with a single credentials pointer (bprm->cred). This means that all the creds can be calculated in advance and then applied at the point of no return with no possibility of failure. I would like to replace bprm->cap_effective with: cap_isclear(bprm->cap_effective) but this seems impossible due to special behaviour for processes of pid 1 (they always retain their parent's capability masks where normally they'd be changed - see cap_bprm_set_creds()). The following sequence of events now happens: (a) At the start of do_execve, the current task's cred_exec_mutex is locked to prevent PTRACE_ATTACH from obsoleting the calculation of creds that we make. (a) prepare_exec_creds() is then called to make a copy of the current task's credentials and prepare it. This copy is then assigned to bprm->cred. This renders security_bprm_alloc() and security_bprm_free() unnecessary, and so they've been removed. (b) The determination of unsafe execution is now performed immediately after (a) rather than later on in the code. The result is stored in bprm->unsafe for future reference. (c) prepare_binprm() is called, possibly multiple times. (i) This applies the result of set[ug]id binaries to the new creds attached to bprm->cred. Personality bit clearance is recorded, but now deferred on the basis that the exec procedure may yet fail. (ii) This then calls the new security_bprm_set_creds(). This should calculate the new LSM and capability credentials into *bprm->cred. This folds together security_bprm_set() and parts of security_bprm_apply_creds() (these two have been removed). Anything that might fail must be done at this point. (iii) bprm->cred_prepared is set to 1. bprm->cred_prepared is 0 on the first pass of the security calculations, and 1 on all subsequent passes. This allows SELinux in (ii) to base its calculations only on the initial script and not on the interpreter. (d) flush_old_exec() is called to commit the task to execution. This performs the following steps with regard to credentials: (i) Clear pdeath_signal and set dumpable on certain circumstances that may not be covered by commit_creds(). (ii) Clear any bits in current->personality that were deferred from (c.i). (e) install_exec_creds() [compute_creds() as was] is called to install the new credentials. This performs the following steps with regard to credentials: (i) Calls security_bprm_committing_creds() to apply any security requirements, such as flushing unauthorised files in SELinux, that must be done before the credentials are changed. This is made up of bits of security_bprm_apply_creds() and security_bprm_post_apply_creds(), both of which have been removed. This function is not allowed to fail; anything that might fail must have been done in (c.ii). (ii) Calls commit_creds() to apply the new credentials in a single assignment (more or less). Possibly pdeath_signal and dumpable should be part of struct creds. (iii) Unlocks the task's cred_replace_mutex, thus allowing PTRACE_ATTACH to take place. (iv) Clears The bprm->cred pointer as the credentials it was holding are now immutable. (v) Calls security_bprm_committed_creds() to apply any security alterations that must be done after the creds have been changed. SELinux uses this to flush signals and signal handlers. (f) If an error occurs before (d.i), bprm_free() will call abort_creds() to destroy the proposed new credentials and will then unlock cred_replace_mutex. No changes to the credentials will have been made. (2) LSM interface. A number of functions have been changed, added or removed: (*) security_bprm_alloc(), ->bprm_alloc_security() (*) security_bprm_free(), ->bprm_free_security() Removed in favour of preparing new credentials and modifying those. (*) security_bprm_apply_creds(), ->bprm_apply_creds() (*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds() Removed; split between security_bprm_set_creds(), security_bprm_committing_creds() and security_bprm_committed_creds(). (*) security_bprm_set(), ->bprm_set_security() Removed; folded into security_bprm_set_creds(). (*) security_bprm_set_creds(), ->bprm_set_creds() New. The new credentials in bprm->creds should be checked and set up as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the second and subsequent calls. (*) security_bprm_committing_creds(), ->bprm_committing_creds() (*) security_bprm_committed_creds(), ->bprm_committed_creds() New. Apply the security effects of the new credentials. This includes closing unauthorised files in SELinux. This function may not fail. When the former is called, the creds haven't yet been applied to the process; when the latter is called, they have. The former may access bprm->cred, the latter may not. (3) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) The bprm_security_struct struct has been removed in favour of using the credentials-under-construction approach. (c) flush_unauthorized_files() now takes a cred pointer and passes it on to inode_has_perm(), file_has_perm() and dentry_open(). Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- arch/x86/ia32/ia32_aout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 127ec3f0721..2a4d073d2cf 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -327,7 +327,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->cached_hole_size = 0; current->mm->mmap = NULL; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { -- cgit v1.2.3 From 31e889098a80ceb3e9e3c555d522b2686a6663c6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: pass module struct to arch dynamic ftrace functions Impact: allow archs more flexibility on dynamic ftrace implementations Dynamic ftrace has largly been developed on x86. Since x86 does not have the same limitations as other architectures, the ftrace interaction between the generic code and the architecture specific code was not flexible enough to handle some of the issues that other architectures have. Most notably, module trampolines. Due to the limited branch distance that archs make in calling kernel core code from modules, the module load code must create a trampoline to jump to what will make the larger jump into core kernel code. The problem arises when this happens to a call to mcount. Ftrace checks all code before modifying it and makes sure the current code is what it expects. Right now, there is not enough information to handle modifying module trampolines. This patch changes the API between generic dynamic ftrace code and the arch dependent code. There is now two functions for modifying code: ftrace_make_nop(mod, rec, addr) - convert the code at rec->ip into a nop, where the original text is calling addr. (mod is the module struct if called by module init) ftrace_make_caller(rec, addr) - convert the code rec->ip that should be a nop into a caller to addr. The record "rec" now has a new field called "arch" where the architecture can add any special attributes to each call site record. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 8 ++++++++ arch/x86/kernel/ftrace.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9b6a1fa19e7..2bb43b433e0 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,6 +17,14 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } + +#ifdef CONFIG_DYNAMIC_FTRACE + +struct dyn_arch_ftrace { + /* No extra data needed for x86 */ +}; + +#endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fe832738e1e..762222ad138 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -166,7 +166,7 @@ static int ftrace_calc_offset(long ip, long addr) return (int)(addr - ip); } -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -311,12 +311,12 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; -unsigned char *ftrace_nop_replace(void) +static unsigned char *ftrace_nop_replace(void) { return ftrace_nop; } -int +static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -349,6 +349,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return 0; } +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + return ftrace_modify_code(rec->ip, old, new); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + return ftrace_modify_code(rec->ip, old, new); +} + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); -- cgit v1.2.3 From b01c746617da5e260803eb10ed64ca043e9a1241 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 15 Nov 2008 02:37:44 +0100 Subject: tracing/function-return-tracer: add a barrier to ensure return stack index is incremented in memory Impact: fix possible race condition in ftrace function return tracer This fixes a possible race condition if index incrementation is not immediately flushed in memory. Thanks for Andi Kleen and Steven Rostedt for pointing out this issue and give me this solution. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 762222ad138..d98b5a8ecf4 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -56,6 +56,7 @@ static int push_return_trace(unsigned long ret, unsigned long long time, return -EBUSY; index = ++ti->curr_ret_stack; + barrier(); ti->ret_stack[index].ret = ret; ti->ret_stack[index].func = func; ti->ret_stack[index].calltime = time; -- cgit v1.2.3 From e7d3737ea1b102030f44e96c97754101e41515f0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Nov 2008 06:02:06 +0100 Subject: tracing/function-return-tracer: support for dynamic ftrace on function return tracer This patch adds the support for dynamic tracing on the function return tracer. The whole difference with normal dynamic function tracing is that we don't need to hook on a particular callback. The only pro that we want is to nop or set dynamically the calls to ftrace_caller (which is ftrace_return_caller here). Some security checks ensure that we are not trying to launch dynamic tracing for return tracing while normal function tracing is already running. An example of trace with getnstimeofday set as a filter: ktime_get_ts+0x22/0x50 -> getnstimeofday (2283 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1396 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1825 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1426 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1464 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1524 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1434 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1464 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1502 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1404 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1397 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1051 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1314 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1344 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1163 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1390 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1374 ns) Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 18 ++-- arch/x86/kernel/ftrace.c | 258 +++++++++++++++++++++++---------------------- 2 files changed, 141 insertions(+), 135 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f9762114983..74defe21ba4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1190,7 +1190,7 @@ ENTRY(mcount) jnz trace #ifdef CONFIG_FUNCTION_RET_TRACER cmpl $ftrace_stub, ftrace_function_return - jnz trace_return + jnz ftrace_return_caller #endif .globl ftrace_stub ftrace_stub: @@ -1211,9 +1211,15 @@ trace: popl %ecx popl %eax jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_RET_TRACER -trace_return: +ENTRY(ftrace_return_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1223,7 +1229,8 @@ trace_return: popl %edx popl %ecx popl %eax - jmp ftrace_stub + ret +END(ftrace_return_caller) .globl return_to_handler return_to_handler: @@ -1237,10 +1244,7 @@ return_to_handler: popl %ecx popl %eax ret -#endif /* CONFIG_FUNCTION_RET_TRACER */ -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ +#endif .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d98b5a8ecf4..924153edd97 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,134 +24,6 @@ #include - -#ifdef CONFIG_FUNCTION_RET_TRACER - -/* - * These functions are picked from those used on - * this page for dynamic ftrace. They have been - * simplified to ignore all traces in NMI context. - */ -static atomic_t in_nmi; - -void ftrace_nmi_enter(void) -{ - atomic_inc(&in_nmi); -} - -void ftrace_nmi_exit(void) -{ - atomic_dec(&in_nmi); -} - -/* Add a function return address to the trace stack on thread info.*/ -static int push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func) -{ - int index; - struct thread_info *ti = current_thread_info(); - - /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) - return -EBUSY; - - index = ++ti->curr_ret_stack; - barrier(); - ti->ret_stack[index].ret = ret; - ti->ret_stack[index].func = func; - ti->ret_stack[index].calltime = time; - - return 0; -} - -/* Retrieve a function return address to the trace stack on thread info.*/ -static void pop_return_trace(unsigned long *ret, unsigned long long *time, - unsigned long *func) -{ - int index; - - struct thread_info *ti = current_thread_info(); - index = ti->curr_ret_stack; - *ret = ti->ret_stack[index].ret; - *func = ti->ret_stack[index].func; - *time = ti->ret_stack[index].calltime; - ti->curr_ret_stack--; -} - -/* - * Send the trace to the ring-buffer. - * @return the original return address. - */ -unsigned long ftrace_return_to_handler(void) -{ - struct ftrace_retfunc trace; - pop_return_trace(&trace.ret, &trace.calltime, &trace.func); - trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_function_return(&trace); - - return trace.ret; -} - -/* - * Hook the return address and push it in the stack of return addrs - * in current thread info. - */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) -{ - unsigned long old; - unsigned long long calltime; - int faulted; - unsigned long return_hooker = (unsigned long) - &return_to_handler; - - /* Nmi's are currently unsupported */ - if (atomic_read(&in_nmi)) - return; - - /* - * Protect against fault, even if it shouldn't - * happen. This tool is too much intrusive to - * ignore such a protection. - */ - asm volatile( - "1: movl (%[parent_old]), %[old]\n" - "2: movl %[return_hooker], (%[parent_replaced])\n" - " movl $0, %[faulted]\n" - - ".section .fixup, \"ax\"\n" - "3: movl $1, %[faulted]\n" - ".previous\n" - - ".section __ex_table, \"a\"\n" - " .long 1b, 3b\n" - " .long 2b, 3b\n" - ".previous\n" - - : [parent_replaced] "=r" (parent), [old] "=r" (old), - [faulted] "=r" (faulted) - : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) - : "memory" - ); - - if (WARN_ON(faulted)) { - unregister_ftrace_return(); - return; - } - - if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_return(); - *parent = old; - return; - } - - calltime = cpu_clock(raw_smp_processor_id()); - - if (push_return_trace(old, calltime, self_addr) == -EBUSY) - *parent = old; -} - -#endif - #ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { @@ -450,3 +322,133 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } #endif + +#ifdef CONFIG_FUNCTION_RET_TRACER + +#ifndef CONFIG_DYNAMIC_FTRACE + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} +#endif /* !CONFIG_DYNAMIC_FTRACE */ + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func) +{ + int index; + struct thread_info *ti = current_thread_info(); + + /* The return trace stack is full */ + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) + return -EBUSY; + + index = ++ti->curr_ret_stack; + barrier(); + ti->ret_stack[index].ret = ret; + ti->ret_stack[index].func = func; + ti->ret_stack[index].calltime = time; + + return 0; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(unsigned long *ret, unsigned long long *time, + unsigned long *func) +{ + int index; + + struct thread_info *ti = current_thread_info(); + index = ti->curr_ret_stack; + *ret = ti->ret_stack[index].ret; + *func = ti->ret_stack[index].func; + *time = ti->ret_stack[index].calltime; + ti->curr_ret_stack--; +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_retfunc trace; + pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_function_return(&trace); + + return trace.ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (atomic_read(&in_nmi)) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. + */ + asm volatile( + "1: movl (%[parent_old]), %[old]\n" + "2: movl %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + ".section __ex_table, \"a\"\n" + " .long 1b, 3b\n" + " .long 2b, 3b\n" + ".previous\n" + + : [parent_replaced] "=r" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (WARN_ON(faulted)) { + unregister_ftrace_return(); + return; + } + + if (WARN_ON(!__kernel_text_address(old))) { + unregister_ftrace_return(); + *parent = old; + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, self_addr) == -EBUSY) + *parent = old; +} + +#endif /* CONFIG_FUNCTION_RET_TRACER */ -- cgit v1.2.3 From 569712b2b0970fa5b19673544d62ae661d04a220 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 16 Nov 2008 03:12:49 -0800 Subject: x86: fix wakeup_cpu with numaq/es7000, v2 Impact: fix secondary-CPU wakeup/init path with numaq and es7000 While looking at wakeup_secondary_cpu for WAKE_SECONDARY_VIA_NMI: |#ifdef WAKE_SECONDARY_VIA_NMI |/* | * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal | * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this | * won't ... remember to clear down the APIC, etc later. | */ |static int __devinit |wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) |{ | unsigned long send_status, accept_status = 0; | int maxlvt; |... | if (APIC_INTEGRATED(apic_version[phys_apicid])) { | maxlvt = lapic_get_maxlvt(); I noticed that there is no warning about undefined phys_apicid... because WAKE_SECONDARY_VIA_NMI and WAKE_SECONDARY_VIA_INIT can not be defined at the same time. So NUMAQ is using wrong wakeup_secondary_cpu. WAKE_SECONDARY_VIA_NMI, WAKE_SECONDARY_VIA_INIT and WAKE_SECONDARY_VIA_MIP are variants of a weird and fragile preprocessor-driven "HAL" mechanisms to specify the kind of secondary-CPU wakeup strategy a given x86 kernel will use. The vast majority of systems want to use INIT for secondary wakeup - NUMAQ uses an NMI, (old-style-) ES7000 uses 'MIP' (a firmware driven in-memory flag to let secondaries continue). So convert these mechanisms to x86_quirks and add a ->wakeup_secondary_cpu() method to specify the rare exception to the sane default. Extend genapic accordingly as well, for 32-bit. While looking further, I noticed that functions in wakecup.h for numaq and es7000 are different to the default in mach_wakecpu.h - but smpboot.c will only use default mach_wakecpu.h with smphook.h. So we need to add mach_wakecpu.h for mach_generic, to properly support numaq and es7000, and vectorize the following SMP init methods: int trampoline_phys_low; int trampoline_phys_high; void (*wait_for_init_deassert)(atomic_t *deassert); void (*smp_callin_clear_local_apic)(void); void (*store_NMI_vector)(unsigned short *high, unsigned short *low); void (*restore_NMI_vector)(unsigned short *high, unsigned short *low); void (*inquire_remote_apic)(int apicid); Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bigsmp/apic.h | 2 -- arch/x86/include/asm/es7000/apic.h | 3 -- arch/x86/include/asm/es7000/wakecpu.h | 41 ++++++--------------- arch/x86/include/asm/genapic_32.h | 17 ++++++++- arch/x86/include/asm/mach-default/mach_wakecpu.h | 24 +++++-------- arch/x86/include/asm/mach-default/smpboot_hooks.h | 8 +++-- arch/x86/include/asm/mach-generic/mach_wakecpu.h | 12 +++++++ arch/x86/include/asm/numaq/wakecpu.h | 24 +++++++------ arch/x86/include/asm/setup.h | 2 ++ arch/x86/kernel/es7000_32.c | 44 ++++++++++++----------- arch/x86/kernel/numaq_32.c | 1 + arch/x86/kernel/smpboot.c | 24 ++++++++----- arch/x86/mach-generic/bigsmp.c | 1 + arch/x86/mach-generic/default.c | 1 + arch/x86/mach-generic/summit.c | 1 + 15 files changed, 110 insertions(+), 95 deletions(-) create mode 100644 arch/x86/include/asm/mach-generic/mach_wakecpu.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h index 1d9543b9d35..ce547f24a1c 100644 --- a/arch/x86/include/asm/bigsmp/apic.h +++ b/arch/x86/include/asm/bigsmp/apic.h @@ -24,8 +24,6 @@ static inline cpumask_t target_cpus(void) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target proc */ #define NO_BALANCE_IRQ (0) -#define WAKE_SECONDARY_VIA_INIT - static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h index 380f0b4f17e..9d8cf776c28 100644 --- a/arch/x86/include/asm/es7000/apic.h +++ b/arch/x86/include/asm/es7000/apic.h @@ -23,8 +23,6 @@ static inline cpumask_t target_cpus(void) #define INT_DELIVERY_MODE (dest_LowestPrio) #define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */ #define NO_BALANCE_IRQ (1) -#undef WAKE_SECONDARY_VIA_INIT -#define WAKE_SECONDARY_VIA_MIP #else #define APIC_DFR_VALUE (APIC_DFR_FLAT) #define INT_DELIVERY_MODE (dest_Fixed) @@ -32,7 +30,6 @@ static inline cpumask_t target_cpus(void) #define NO_BALANCE_IRQ (0) #undef APIC_DEST_LOGICAL #define APIC_DEST_LOGICAL 0x0 -#define WAKE_SECONDARY_VIA_INIT #endif static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h index 39849346191..78f0daaee43 100644 --- a/arch/x86/include/asm/es7000/wakecpu.h +++ b/arch/x86/include/asm/es7000/wakecpu.h @@ -1,36 +1,12 @@ #ifndef __ASM_ES7000_WAKECPU_H #define __ASM_ES7000_WAKECPU_H -/* - * This file copes with machines that wakeup secondary CPUs by the - * INIT, INIT, STARTUP sequence. - */ - -#ifdef CONFIG_ES7000_CLUSTERED_APIC -#define WAKE_SECONDARY_VIA_MIP -#else -#define WAKE_SECONDARY_VIA_INIT -#endif - -#ifdef WAKE_SECONDARY_VIA_MIP -extern int es7000_start_cpu(int cpu, unsigned long eip); -static inline int -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) -{ - int boot_error = 0; - boot_error = es7000_start_cpu(phys_apicid, start_eip); - return boot_error; -} -#endif - -#define TRAMPOLINE_LOW phys_to_virt(0x467) -#define TRAMPOLINE_HIGH phys_to_virt(0x469) - -#define boot_cpu_apicid boot_cpu_physical_apicid +#define TRAMPOLINE_PHYS_LOW 0x467 +#define TRAMPOLINE_PHYS_HIGH 0x469 static inline void wait_for_init_deassert(atomic_t *deassert) { -#ifdef WAKE_SECONDARY_VIA_INIT +#ifndef CONFIG_ES7000_CLUSTERED_APIC while (!atomic_read(deassert)) cpu_relax(); #endif @@ -50,9 +26,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { } -#define inquire_remote_apic(apicid) do { \ - if (apic_verbosity >= APIC_DEBUG) \ - __inquire_remote_apic(apicid); \ - } while (0) +extern void __inquire_remote_apic(int apicid); + +static inline void inquire_remote_apic(int apicid) +{ + if (apic_verbosity >= APIC_DEBUG) + __inquire_remote_apic(apicid); +} #endif /* __ASM_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 5cbd4fcc06f..39bd8c1db3f 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -2,6 +2,7 @@ #define _ASM_X86_GENAPIC_32_H #include +#include /* * Generic APIC driver interface. @@ -65,6 +66,13 @@ struct genapic { void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); #endif + int trampoline_phys_low; + int trampoline_phys_high; + void (*wait_for_init_deassert)(atomic_t *deassert); + void (*smp_callin_clear_local_apic)(void); + void (*store_NMI_vector)(unsigned short *high, unsigned short *low); + void (*restore_NMI_vector)(unsigned short *high, unsigned short *low); + void (*inquire_remote_apic)(int apicid); }; #define APICFUNC(x) .x = x, @@ -105,13 +113,20 @@ struct genapic { APICFUNC(get_apic_id) \ .apic_id_mask = APIC_ID_MASK, \ APICFUNC(cpu_mask_to_apicid) \ - APICFUNC(vector_allocation_domain) \ + APICFUNC(vector_allocation_domain) \ APICFUNC(acpi_madt_oem_check) \ IPIFUNC(send_IPI_mask) \ IPIFUNC(send_IPI_allbutself) \ IPIFUNC(send_IPI_all) \ APICFUNC(enable_apic_mode) \ APICFUNC(phys_pkg_id) \ + .trampoline_phys_low = TRAMPOLINE_PHYS_LOW, \ + .trampoline_phys_high = TRAMPOLINE_PHYS_HIGH, \ + APICFUNC(wait_for_init_deassert) \ + APICFUNC(smp_callin_clear_local_apic) \ + APICFUNC(store_NMI_vector) \ + APICFUNC(restore_NMI_vector) \ + APICFUNC(inquire_remote_apic) \ } extern struct genapic *genapic; diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h index 9d80db91e99..ceb01366014 100644 --- a/arch/x86/include/asm/mach-default/mach_wakecpu.h +++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h @@ -1,17 +1,8 @@ #ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H #define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H -/* - * This file copes with machines that wakeup secondary CPUs by the - * INIT, INIT, STARTUP sequence. - */ - -#define WAKE_SECONDARY_VIA_INIT - -#define TRAMPOLINE_LOW phys_to_virt(0x467) -#define TRAMPOLINE_HIGH phys_to_virt(0x469) - -#define boot_cpu_apicid boot_cpu_physical_apicid +#define TRAMPOLINE_PHYS_LOW (0x467) +#define TRAMPOLINE_PHYS_HIGH (0x469) static inline void wait_for_init_deassert(atomic_t *deassert) { @@ -33,9 +24,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { } -#define inquire_remote_apic(apicid) do { \ - if (apic_verbosity >= APIC_DEBUG) \ - __inquire_remote_apic(apicid); \ - } while (0) +extern void __inquire_remote_apic(int apicid); + +static inline void inquire_remote_apic(int apicid) +{ + if (apic_verbosity >= APIC_DEBUG) + __inquire_remote_apic(apicid); +} #endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h index dbab36d64d4..23bf52103b8 100644 --- a/arch/x86/include/asm/mach-default/smpboot_hooks.h +++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h @@ -13,9 +13,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) CMOS_WRITE(0xa, 0xf); local_flush_tlb(); pr_debug("1.\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + start_eip >> 4; pr_debug("2.\n"); - *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + start_eip & 0xf; pr_debug("3.\n"); } @@ -32,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void) */ CMOS_WRITE(0, 0xf); - *((volatile long *) phys_to_virt(0x467)) = 0; + *((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } static inline void __init smpboot_setup_io_apic(void) diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h new file mode 100644 index 00000000000..1ab16b168c8 --- /dev/null +++ b/arch/x86/include/asm/mach-generic/mach_wakecpu.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H +#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H + +#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low) +#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high) +#define wait_for_init_deassert (genapic->wait_for_init_deassert) +#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic) +#define store_NMI_vector (genapic->store_NMI_vector) +#define restore_NMI_vector (genapic->restore_NMI_vector) +#define inquire_remote_apic (genapic->inquire_remote_apic) + +#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */ diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h index c577bda5b1c..6f499df8edd 100644 --- a/arch/x86/include/asm/numaq/wakecpu.h +++ b/arch/x86/include/asm/numaq/wakecpu.h @@ -3,12 +3,8 @@ /* This file copes with machines that wakeup secondary CPUs by NMIs */ -#define WAKE_SECONDARY_VIA_NMI - -#define TRAMPOLINE_LOW phys_to_virt(0x8) -#define TRAMPOLINE_HIGH phys_to_virt(0xa) - -#define boot_cpu_apicid boot_cpu_logical_apicid +#define TRAMPOLINE_PHYS_LOW (0x8) +#define TRAMPOLINE_PHYS_HIGH (0xa) /* We don't do anything here because we use NMI's to boot instead */ static inline void wait_for_init_deassert(atomic_t *deassert) @@ -27,17 +23,23 @@ static inline void smp_callin_clear_local_apic(void) static inline void store_NMI_vector(unsigned short *high, unsigned short *low) { printk("Storing NMI vector\n"); - *high = *((volatile unsigned short *) TRAMPOLINE_HIGH); - *low = *((volatile unsigned short *) TRAMPOLINE_LOW); + *high = + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)); + *low = + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)); } static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { printk("Restoring NMI vector\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high; - *((volatile unsigned short *) TRAMPOLINE_LOW) = *low; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + *high; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + *low; } -#define inquire_remote_apic(apicid) {} +static inline void inquire_remote_apic(int apicid) +{ +} #endif /* __ASM_NUMAQ_WAKECPU_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f12d3723746..40b2d330491 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,6 +16,7 @@ static inline void visws_early_detect(void) { } static inline int is_visws_box(void) { return 0; } #endif +extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); /* * Any setup quirks to be performed? */ @@ -39,6 +40,7 @@ struct x86_quirks { void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, unsigned short oemsize); int (*setup_ioapic_ids)(void); + int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); }; extern struct x86_quirks *x86_quirks; diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index f454c78fcef..bed10dddf09 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * ES7000 chipsets @@ -161,6 +162,26 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } +#ifdef CONFIG_ES7000_CLUSTERED_APIC +static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +{ + unsigned long vect = 0, psaival = 0; + + if (psai == NULL) + return -1; + + vect = ((unsigned long)__pa(eip)/0x1000) << 16; + psaival = (0x1000000 | vect | cpu); + + while (*psai & 0x1000000) + ; + + *psai = psaival; + + return 0; +} +#endif + void __init setup_unisys(void) { @@ -176,6 +197,9 @@ setup_unisys(void) else es7000_plat = ES7000_CLASSIC; ioapic_renumber_irq = es7000_rename_gsi; +#ifdef CONFIG_ES7000_CLUSTERED_APIC + x86_quirks->wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip; +#endif } /* @@ -324,26 +348,6 @@ es7000_mip_write(struct mip_reg *mip_reg) return status; } -int -es7000_start_cpu(int cpu, unsigned long eip) -{ - unsigned long vect = 0, psaival = 0; - - if (psai == NULL) - return -1; - - vect = ((unsigned long)__pa(eip)/0x1000) << 16; - psaival = (0x1000000 | vect | cpu); - - while (*psai & 0x1000000) - ; - - *psai = psaival; - - return 0; - -} - void __init es7000_sw_apic(void) { diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 4caff39078e..745891b7d0f 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -250,6 +250,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, .setup_ioapic_ids = numaq_setup_ioapic_ids, + .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi, }; void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7b109339731..498c1ef37fe 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -536,7 +537,7 @@ static void impress_friends(void) pr_debug("Before bogocount - setting activated=1.\n"); } -static inline void __inquire_remote_apic(int apicid) +void __inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; @@ -575,14 +576,13 @@ static inline void __inquire_remote_apic(int apicid) } } -#ifdef WAKE_SECONDARY_VIA_NMI /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -static int __devinit -wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) +int __devinit +wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt; @@ -599,7 +599,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -614,11 +614,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -#endif /* WAKE_SECONDARY_VIA_NMI */ -#ifdef WAKE_SECONDARY_VIA_INIT static int __devinit -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) +wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt, num_starts, j; @@ -737,7 +735,15 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -#endif /* WAKE_SECONDARY_VIA_INIT */ + +static int __devinit +wakeup_secondary_cpu(int apicid, unsigned long start_eip) +{ + if (x86_quirks->wakeup_secondary_cpu) + return x86_quirks->wakeup_secondary_cpu(apicid, start_eip); + + return wakeup_secondary_cpu_via_init(apicid, start_eip); +} struct create_idle { struct work_struct work; diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 3c3b471ea49..3624a364b7f 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -17,6 +17,7 @@ #include #include #include +#include static int dmi_bigsmp; /* can be set by dmi scanners */ diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c index 9e835a11a13..e63a4a76d8c 100644 --- a/arch/x86/mach-generic/default.c +++ b/arch/x86/mach-generic/default.c @@ -16,6 +16,7 @@ #include #include #include +#include /* should be called last. */ static int probe_default(void) diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 6272b5e69da..2c6d234e000 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -16,6 +16,7 @@ #include #include #include +#include static int probe_summit(void) { -- cgit v1.2.3 From 54ac14a8e982ae6c7ac71ee2b0d0173b974509e2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 17 Nov 2008 15:19:53 -0800 Subject: x86: fix wakeup_cpu with numaq/es7000, v2, fix Impact: fix wakeup_secondary_cpu with hotplug We can not put that into x86_quirks, because that is __initdata. So try to move that to genapic, and add update_genapic in x86_quirks. later we even could use that stub to: 1. autodetect CONFIG_ES7000_CLUSTERED_APIC 2. more correct inquire_remote_apic with apic_verbosity setting. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/genapic_32.h | 1 + arch/x86/include/asm/genapic_64.h | 2 ++ arch/x86/include/asm/mach-default/mach_apic.h | 2 ++ arch/x86/include/asm/mach-generic/mach_apic.h | 1 + arch/x86/include/asm/setup.h | 3 ++- arch/x86/kernel/es7000_32.c | 11 ++++++++++- arch/x86/kernel/genapic_64.c | 4 ++++ arch/x86/kernel/numaq_32.c | 11 +++++++++-- arch/x86/kernel/setup.c | 13 ++++++++++++- arch/x86/kernel/smpboot.c | 11 +---------- arch/x86/mach-generic/probe.c | 4 ++++ 11 files changed, 48 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 39bd8c1db3f..455d6c27a98 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -66,6 +66,7 @@ struct genapic { void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); #endif + int (*wakeup_cpu)(int apicid, unsigned long start_eip); int trampoline_phys_low; int trampoline_phys_high; void (*wait_for_init_deassert)(atomic_t *deassert); diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h index 13c4e96199e..2cae011668b 100644 --- a/arch/x86/include/asm/genapic_64.h +++ b/arch/x86/include/asm/genapic_64.h @@ -32,6 +32,8 @@ struct genapic { unsigned int (*get_apic_id)(unsigned long x); unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; + /* wakeup_secondary_cpu */ + int (*wakeup_cpu)(int apicid, unsigned long start_eip); }; extern struct genapic *genapic; diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h index ff3a6c236c0..6cb3a467e06 100644 --- a/arch/x86/include/asm/mach-default/mach_apic.h +++ b/arch/x86/include/asm/mach-default/mach_apic.h @@ -32,11 +32,13 @@ static inline cpumask_t target_cpus(void) #define vector_allocation_domain (genapic->vector_allocation_domain) #define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) #define send_IPI_self (genapic->send_IPI_self) +#define wakeup_secondary_cpu (genapic->wakeup_cpu) extern void setup_apic_routing(void); #else #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ #define TARGET_CPUS (target_cpus()) +#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init /* * Set up the logical destination ID. * diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h index 5180bd7478f..e430f47df66 100644 --- a/arch/x86/include/asm/mach-generic/mach_apic.h +++ b/arch/x86/include/asm/mach-generic/mach_apic.h @@ -27,6 +27,7 @@ #define vector_allocation_domain (genapic->vector_allocation_domain) #define enable_apic_mode (genapic->enable_apic_mode) #define phys_pkg_id (genapic->phys_pkg_id) +#define wakeup_secondary_cpu (genapic->wakeup_cpu) extern void generic_bigsmp_probe(void); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 40b2d330491..294daeb3a00 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -17,6 +17,7 @@ static inline int is_visws_box(void) { return 0; } #endif extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); +extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip); /* * Any setup quirks to be performed? */ @@ -40,7 +41,7 @@ struct x86_quirks { void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, unsigned short oemsize); int (*setup_ioapic_ids)(void); - int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + int (*update_genapic)(void); }; extern struct x86_quirks *x86_quirks; diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index bed10dddf09..fb3bfe66fbe 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -40,6 +40,7 @@ #include #include #include +#include #include /* @@ -180,6 +181,13 @@ static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) return 0; } + +static int __init es7000_update_genapic(void) +{ + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + + return 0; +} #endif void __init @@ -197,8 +205,9 @@ setup_unisys(void) else es7000_plat = ES7000_CLASSIC; ioapic_renumber_irq = es7000_rename_gsi; + #ifdef CONFIG_ES7000_CLUSTERED_APIC - x86_quirks->wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip; + x86_quirks->update_genapic = es7000_update_genapic; #endif } diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 6c9bfc9e1e9..2bced78b0b8 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -21,6 +21,7 @@ #include #include #include +#include extern struct genapic apic_flat; extern struct genapic apic_physflat; @@ -53,6 +54,9 @@ void __init setup_apic_routing(void) genapic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); } + + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 745891b7d0f..0deea37a53c 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include @@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void) return 1; } +static int __init numaq_update_genapic(void) +{ + genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; + + return 0; +} + static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, @@ -250,7 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, .setup_ioapic_ids = numaq_setup_ioapic_ids, - .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi, + .update_genapic = numaq_update_genapic, }; void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd..c366e891e10 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -583,7 +583,18 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; +static int __init default_update_genapic(void) +{ +#if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) + genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; +#endif + + return 0; +} + +static struct x86_quirks default_x86_quirks __initdata = { + .update_genapic = default_update_genapic, +}; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 498c1ef37fe..0e9f446269f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -615,7 +615,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -static int __devinit +int __devinit wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -736,15 +736,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -static int __devinit -wakeup_secondary_cpu(int apicid, unsigned long start_eip) -{ - if (x86_quirks->wakeup_secondary_cpu) - return x86_quirks->wakeup_secondary_cpu(apicid, start_eip); - - return wakeup_secondary_cpu_via_init(apicid, start_eip); -} - struct create_idle { struct work_struct work; struct task_struct *idle; diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c index 5a7e4619e1c..90b134f3cd7 100644 --- a/arch/x86/mach-generic/probe.c +++ b/arch/x86/mach-generic/probe.c @@ -15,6 +15,7 @@ #include #include #include +#include extern struct genapic apic_numaq; extern struct genapic apic_summit; @@ -57,6 +58,9 @@ static int __init parse_apic(char *arg) } } + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); + /* Parsed again by __setup for debug/verbose */ return 0; } -- cgit v1.2.3 From 0231022cc32d5f2e7f3c06b75691dda0ad6aec33 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 17 Nov 2008 03:22:41 +0100 Subject: tracing/function-return-tracer: add the overrun field Impact: help to find the better depth of trace We decided to arbitrary define the depth of function return trace as "20". Perhaps this is not enough. To help finding an optimal depth, we measure now the overrun: the number of functions that have been missed for the current thread. By default this is not displayed, we have to do set a particular flag on the return tracer: echo overrun > /debug/tracing/trace_options And the overrun will be printed on the right. As the trace shows below, the current 20 depth is not enough. update_wall_time+0x37f/0x8c0 -> update_xtime_cache (345 ns) (Overruns: 2838) update_wall_time+0x384/0x8c0 -> clocksource_get_next (1141 ns) (Overruns: 2838) do_timer+0x23/0x100 -> update_wall_time (3882 ns) (Overruns: 2838) tick_do_update_jiffies64+0xbf/0x160 -> do_timer (5339 ns) (Overruns: 2838) tick_sched_timer+0x6a/0xf0 -> tick_do_update_jiffies64 (7209 ns) (Overruns: 2838) vgacon_set_cursor_size+0x98/0x120 -> native_io_delay (2613 ns) (Overruns: 274) vgacon_cursor+0x16e/0x1d0 -> vgacon_set_cursor_size (33151 ns) (Overruns: 274) set_cursor+0x5f/0x80 -> vgacon_cursor (36432 ns) (Overruns: 274) con_flush_chars+0x34/0x40 -> set_cursor (38790 ns) (Overruns: 274) release_console_sem+0x1ec/0x230 -> up (721 ns) (Overruns: 274) release_console_sem+0x225/0x230 -> wake_up_klogd (316 ns) (Overruns: 274) con_flush_chars+0x39/0x40 -> release_console_sem (2996 ns) (Overruns: 274) con_write+0x22/0x30 -> con_flush_chars (46067 ns) (Overruns: 274) n_tty_write+0x1cc/0x360 -> con_write (292670 ns) (Overruns: 274) smp_apic_timer_interrupt+0x2a/0x90 -> native_apic_mem_write (330 ns) (Overruns: 274) irq_enter+0x17/0x70 -> idle_cpu (413 ns) (Overruns: 274) smp_apic_timer_interrupt+0x2f/0x90 -> irq_enter (1525 ns) (Overruns: 274) ktime_get_ts+0x40/0x70 -> getnstimeofday (465 ns) (Overruns: 274) ktime_get_ts+0x60/0x70 -> set_normalized_timespec (436 ns) (Overruns: 274) ktime_get+0x16/0x30 -> ktime_get_ts (2501 ns) (Overruns: 274) hrtimer_interrupt+0x77/0x1a0 -> ktime_get (3439 ns) (Overruns: 274) Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 7 +++++++ arch/x86/kernel/ftrace.c | 10 +++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a71158369fd..e90e81ef6ab 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -21,6 +21,7 @@ struct task_struct; struct exec_domain; #include #include +#include struct thread_info { struct task_struct *task; /* main task structure */ @@ -45,6 +46,11 @@ struct thread_info { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; + /* + * Number of functions that haven't been traced + * because of depth overrun. + */ + atomic_t trace_overrun; #endif }; @@ -61,6 +67,7 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ .curr_ret_stack = -1,\ + .trace_overrun = ATOMIC_INIT(0) \ } #else #define INIT_THREAD_INFO(tsk) \ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 924153edd97..356bb1eb6e9 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -353,8 +353,10 @@ static int push_return_trace(unsigned long ret, unsigned long long time, struct thread_info *ti = current_thread_info(); /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { + atomic_inc(&ti->trace_overrun); return -EBUSY; + } index = ++ti->curr_ret_stack; barrier(); @@ -367,7 +369,7 @@ static int push_return_trace(unsigned long ret, unsigned long long time, /* Retrieve a function return address to the trace stack on thread info.*/ static void pop_return_trace(unsigned long *ret, unsigned long long *time, - unsigned long *func) + unsigned long *func, unsigned long *overrun) { int index; @@ -376,6 +378,7 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, *ret = ti->ret_stack[index].ret; *func = ti->ret_stack[index].func; *time = ti->ret_stack[index].calltime; + *overrun = atomic_read(&ti->trace_overrun); ti->curr_ret_stack--; } @@ -386,7 +389,8 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, unsigned long ftrace_return_to_handler(void) { struct ftrace_retfunc trace; - pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + pop_return_trace(&trace.ret, &trace.calltime, &trace.func, + &trace.overrun); trace.rettime = cpu_clock(raw_smp_processor_id()); ftrace_function_return(&trace); -- cgit v1.2.3 From a1afd01c175324656d0e8f1c82ea94b474953c04 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 18 Nov 2008 12:44:21 +0100 Subject: x86: default to SWIOTLB=y on x86_64 Impact: fixes korg bugzilla 11980 A kernel for a 64bit x86 system should always contain the swiotlb code in case it is booted on a machine without any hardware IOMMU supported by the kernel and more than 4GB of RAM. This patch changes Kconfig to always compile swiotlb into the kernel for x86_64. Signed-off-by: Joerg Roedel Cc: stable@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 93224b56918..669c6d588bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -566,7 +566,7 @@ config AMD_IOMMU # need this always selected by IOMMU for the VIA workaround config SWIOTLB - bool + def_bool y if X86_64 help Support for software bounce buffers used on x86-64 systems which don't have a hardware IOMMU (e.g. the current generation -- cgit v1.2.3 From f632ddcc0786149c0e4bef9b6b44c96a75c0d074 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 18 Nov 2008 17:32:26 +0100 Subject: x86: fix wakeup_cpu with numaq/es7000, v2, fix #2 Impact: fix boot crash fix default_update_genapic(). Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c366e891e10..31328909456 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -585,8 +585,10 @@ early_param("elfcorehdr", setup_elfcorehdr); static int __init default_update_genapic(void) { -#if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) - genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; +#ifdef CONFIG_X86_SMP +# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) + genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; +# endif #endif return 0; -- cgit v1.2.3 From b5fe363b7d89577fcfda9b6cf0efc32760bbccc6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 18 Nov 2008 08:14:14 -0800 Subject: x86: use update_genapic to get rid of ES7000_CLUSTERED_APIC v2 Impact: clean up We can autodetect those system that need cluster apic, and update genapic accordingly. We can also remove wakeup.h for e7000, because it's default one is now the same as overall default mach_wakecpu.h Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 -- arch/x86/include/asm/es7000/apic.h | 76 ++++++++++++++++++++++++++------------ arch/x86/include/asm/genapic_32.h | 1 + arch/x86/kernel/es7000_32.c | 17 +++++++-- arch/x86/mach-generic/es7000.c | 14 ++++++- 5 files changed, 80 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 93224b56918..7d0ab8942cf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -462,10 +462,6 @@ config X86_CYCLONE_TIMER def_bool y depends on X86_GENERICARCH -config ES7000_CLUSTERED_APIC - def_bool y - depends on SMP && X86_ES7000 && MPENTIUMIII - source "arch/x86/Kconfig.cpu" config HPET_TIMER diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h index 9d8cf776c28..e24ef876915 100644 --- a/arch/x86/include/asm/es7000/apic.h +++ b/arch/x86/include/asm/es7000/apic.h @@ -9,28 +9,27 @@ static inline int apic_id_registered(void) return (1); } -static inline cpumask_t target_cpus(void) +static inline cpumask_t target_cpus_cluster(void) { -#if defined CONFIG_ES7000_CLUSTERED_APIC return CPU_MASK_ALL; -#else +} + +static inline cpumask_t target_cpus(void) +{ return cpumask_of_cpu(smp_processor_id()); -#endif } -#if defined CONFIG_ES7000_CLUSTERED_APIC -#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -#define INT_DELIVERY_MODE (dest_LowestPrio) -#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */ -#define NO_BALANCE_IRQ (1) -#else +#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER) +#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio) +#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */ +#define NO_BALANCE_IRQ_CLUSTER (1) + #define APIC_DFR_VALUE (APIC_DFR_FLAT) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target procs */ #define NO_BALANCE_IRQ (0) #undef APIC_DEST_LOGICAL #define APIC_DEST_LOGICAL 0x0 -#endif static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { @@ -57,6 +56,16 @@ static inline unsigned long calculate_ldr(int cpu) * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ +static inline void init_apic_ldr_cluster(void) +{ + unsigned long val; + int cpu = smp_processor_id(); + + apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER); + val = calculate_ldr(cpu); + apic_write(APIC_LDR, val); +} + static inline void init_apic_ldr(void) { unsigned long val; @@ -67,10 +76,6 @@ static inline void init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifndef CONFIG_X86_GENERICARCH -extern void enable_apic_mode(void); -#endif - extern int apic_version [MAX_APICS]; static inline void setup_apic_routing(void) { @@ -141,7 +146,7 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid) return (1); } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask) { int num_bits_set; int cpus_found = 0; @@ -151,11 +156,7 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) num_bits_set = cpus_weight(cpumask); /* Return id to all */ if (num_bits_set == NR_CPUS) -#if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; -#else - return cpu_to_logical_apicid(0); -#endif /* * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. @@ -168,11 +169,40 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ printk ("%s: Not a valid mask!\n", __func__); -#if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; -#else + } + apicid = new_apicid; + cpus_found++; + } + cpu++; + } + return apicid; +} + +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + int num_bits_set; + int cpus_found = 0; + int cpu; + int apicid; + + num_bits_set = cpus_weight(cpumask); + /* Return id to all */ + if (num_bits_set == NR_CPUS) + return cpu_to_logical_apicid(0); + /* + * The cpus in the mask must all be on the apic cluster. If are not + * on the same apicid cluster return default value of TARGET_CPUS. + */ + cpu = first_cpu(cpumask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpu_isset(cpu, cpumask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)){ + printk ("%s: Not a valid mask!\n", __func__); return cpu_to_logical_apicid(0); -#endif } apicid = new_apicid; cpus_found++; diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 455d6c27a98..0ac17d33a8c 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -131,6 +131,7 @@ struct genapic { } extern struct genapic *genapic; +extern void es7000_update_genapic_to_cluster(void); enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; #define get_uv_system_type() UV_NONE diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index fb3bfe66fbe..71d7be624d4 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -163,7 +164,6 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } -#ifdef CONFIG_ES7000_CLUSTERED_APIC static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; @@ -182,13 +182,24 @@ static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) return 0; } +static void noop_wait_for_deassert(atomic_t *deassert_not_used) +{ +} + static int __init es7000_update_genapic(void) { genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + /* MPENTIUMIII */ + if (boot_cpu_data.x86 == 6 && + (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) { + es7000_update_genapic_to_cluster(); + genapic->wait_for_init_deassert = noop_wait_for_deassert; + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + } + return 0; } -#endif void __init setup_unisys(void) @@ -206,9 +217,7 @@ setup_unisys(void) es7000_plat = ES7000_CLASSIC; ioapic_renumber_irq = es7000_rename_gsi; -#ifdef CONFIG_ES7000_CLUSTERED_APIC x86_quirks->update_genapic = es7000_update_genapic; -#endif } /* diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 28459cab3dd..7b4e6d0d169 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -16,7 +16,19 @@ #include #include #include -#include +#include + +void __init es7000_update_genapic_to_cluster(void) +{ + genapic->target_cpus = target_cpus_cluster; + genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER; + genapic->int_dest_mode = INT_DEST_MODE_CLUSTER; + genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER; + + genapic->init_apic_ldr = init_apic_ldr_cluster; + + genapic->cpu_mask_to_apicid = cpu_mask_to_apicid_cluster; +} static int probe_es7000(void) { -- cgit v1.2.3 From 77be80e437fec44f8b7a620314b7d7b605b8d93b Mon Sep 17 00:00:00 2001 From: "Richard A. Holden III" Date: Wed, 19 Nov 2008 16:05:14 -0700 Subject: x86: fix arch/x86/kernel/genx2apic_uv_x.c build warning when !CONFIG_HOTPLUG_CPU Impact: cleanup, reduce size of the kernel image a bit Fix: arch/x86/kernel/genx2apic_uv_x.c:403: warning: 'uv_heartbeat_disable' defined but not used the function is only used when CONFIG_HOTPLUG_CPU is defined. Signed-off-by: Richard A. Holden III Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index f02bbe5d017..221299f4509 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -400,6 +400,7 @@ static void __cpuinit uv_heartbeat_enable(int cpu) uv_heartbeat_enable(0); } +#ifdef CONFIG_HOTPLUG_CPU static void __cpuinit uv_heartbeat_disable(int cpu) { if (uv_cpu_hub_info(cpu)->scir.enabled) { @@ -409,7 +410,6 @@ static void __cpuinit uv_heartbeat_disable(int cpu) uv_set_cpu_scir_bits(cpu, 0xff); } -#ifdef CONFIG_HOTPLUG_CPU /* * cpu hotplug notifier */ -- cgit v1.2.3 From bb5574608a8375026510b4f983ffbb06ece33fe2 Mon Sep 17 00:00:00 2001 From: "Richard A. Holden III" Date: Wed, 19 Nov 2008 16:05:15 -0700 Subject: x86: fix arch/x86/kernel/setup.c build warning when !CONFIG_X86_RESERVE_LOW_64K Impact: cleanup Fix: arch/x86/kernel/setup.c:592: warning: 'dmi_low_memory_corruption' defined but not used this is only used if CONFIG_X86_RESERVE_LOW_64K is defined. Signed-off-by: Richard A. Holden III Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e6c51433247..13a5f592ac2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -587,6 +587,7 @@ static struct x86_quirks default_x86_quirks __initdata; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; +#ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE @@ -598,6 +599,7 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) return 0; } +#endif /* List of systems that have known low memory corruption BIOS problems */ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { -- cgit v1.2.3 From 87f7606591aea6a8a38ea4c8911b5eeeee2740b8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 19 Nov 2008 20:50:53 -0800 Subject: x86: fix wakeup_cpu with numaq/es7000 v2 - call ->update_genapic() Impact: fix boot crash on 32-bit Hiroshi Shimamoto reported a boot failure on 32-bit x86. The setting of x86_quirks.wakeup_cpu is missing (when not passing in an explicit apic= boot parameter). Reported-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/mach-generic/probe.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c index 90b134f3cd7..c346d9d0226 100644 --- a/arch/x86/mach-generic/probe.c +++ b/arch/x86/mach-generic/probe.c @@ -76,12 +76,15 @@ void __init generic_bigsmp_probe(void) * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support */ - if (!cmdline_apic && genapic == &apic_default) + if (!cmdline_apic && genapic == &apic_default) { if (apic_bigsmp.probe()) { genapic = &apic_bigsmp; + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); printk(KERN_INFO "Overriding APIC driver with %s\n", genapic->name); } + } #endif } @@ -98,6 +101,9 @@ void __init generic_apic_probe(void) /* Not visible without early console */ if (!apic_probe[i]) panic("Didn't find an APIC driver"); + + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); } printk(KERN_INFO "Using APIC driver %s\n", genapic->name); } @@ -112,6 +118,8 @@ int __init mps_oem_check(struct mp_config_table *mpc, char *oem, if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) { if (!cmdline_apic) { genapic = apic_probe[i]; + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); printk(KERN_INFO "Switched to APIC driver `%s'.\n", genapic->name); } @@ -128,6 +136,8 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { if (!cmdline_apic) { genapic = apic_probe[i]; + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); printk(KERN_INFO "Switched to APIC driver `%s'.\n", genapic->name); } -- cgit v1.2.3 From 3889d0cea2b73049bdca062d9ff1e5d33468289c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 22 Nov 2008 23:39:23 -0800 Subject: x86: revert default reboot method to REBOOT_KBD Impact: Reverts default reboot method. Checkin 14d7ca5c575853664d8fe4f225a77b8df1b7de7d changed the default reboot method to "pci", a.k.a. port CF9. Unfortunately this has been shown to cause lockups on at least two systems for which REBOOT_KBD worked, both Thinkpads with Intel chipsets. This reverts the default to REBOOT_KBD, while leaving the option to have "reboot=pci" specified explicitly or via a DMI match. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index ddc93891cdc..790b09fbadc 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -29,7 +29,7 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_CF9_COND; +enum reboot_type reboot_type = BOOT_KBD; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) -- cgit v1.2.3 From f201ae2356c74bcae130b2177b3dca903ea98071 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 23 Nov 2008 06:22:56 +0100 Subject: tracing/function-return-tracer: store return stack into task_struct and allocate it dynamically Impact: use deeper function tracing depth safely Some tests showed that function return tracing needed a more deeper depth of function calls. But it could be unsafe to store these return addresses to the stack. So these arrays will now be allocated dynamically into task_struct of current only when the tracer is activated. Typical scheme when tracer is activated: - allocate a return stack for each task in global list. - fork: allocate the return stack for the newly created task - exit: free return stack of current - idle init: same as fork I chose a default depth of 50. I don't have overruns anymore. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 1 - arch/x86/include/asm/thread_info.h | 29 ----------------------------- arch/x86/kernel/ftrace.c | 29 +++++++++++++++-------------- 3 files changed, 15 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 2bb43b433e0..754a3e082f9 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -29,7 +29,6 @@ struct dyn_arch_ftrace { #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_RET_TRACER -#define FTRACE_RET_STACK_SIZE 20 #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e90e81ef6ab..0921b4018c1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,36 +40,8 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - -#ifdef CONFIG_FUNCTION_RET_TRACER - /* Index of current stored adress in ret_stack */ - int curr_ret_stack; - /* Stack of return addresses for return function tracing */ - struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; - /* - * Number of functions that haven't been traced - * because of depth overrun. - */ - atomic_t trace_overrun; -#endif }; -#ifdef CONFIG_FUNCTION_RET_TRACER -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ - .curr_ret_stack = -1,\ - .trace_overrun = ATOMIC_INIT(0) \ -} -#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -82,7 +54,6 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ } -#endif #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 356bb1eb6e9..bb137f7297e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -350,19 +350,21 @@ static int push_return_trace(unsigned long ret, unsigned long long time, unsigned long func) { int index; - struct thread_info *ti = current_thread_info(); + + if (!current->ret_stack) + return -EBUSY; /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { - atomic_inc(&ti->trace_overrun); + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); return -EBUSY; } - index = ++ti->curr_ret_stack; + index = ++current->curr_ret_stack; barrier(); - ti->ret_stack[index].ret = ret; - ti->ret_stack[index].func = func; - ti->ret_stack[index].calltime = time; + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = time; return 0; } @@ -373,13 +375,12 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, { int index; - struct thread_info *ti = current_thread_info(); - index = ti->curr_ret_stack; - *ret = ti->ret_stack[index].ret; - *func = ti->ret_stack[index].func; - *time = ti->ret_stack[index].calltime; - *overrun = atomic_read(&ti->trace_overrun); - ti->curr_ret_stack--; + index = current->curr_ret_stack; + *ret = current->ret_stack[index].ret; + *func = current->ret_stack[index].func; + *time = current->ret_stack[index].calltime; + *overrun = atomic_read(¤t->trace_overrun); + current->curr_ret_stack--; } /* -- cgit v1.2.3 From 02b67518e2b1c490787dac7f35e1204e74fe21ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sat, 22 Nov 2008 13:28:47 +0200 Subject: tracing: add support for userspace stacktraces in tracing/iter_ctrl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: add new (default-off) tracing visualization feature Usage example: mount -t debugfs nodev /sys/kernel/debug cd /sys/kernel/debug/tracing echo userstacktrace >iter_ctrl echo sched_switch >current_tracer echo 1 >tracing_enabled .... run application ... echo 0 >tracing_enabled Then read one of 'trace','latency_trace','trace_pipe'. To get the best output you can compile your userspace programs with frame pointers (at least glibc + the app you are tracing). Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 57 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index a03e7f6d90c..b1515306041 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -6,6 +6,7 @@ #include #include #include +#include #include static void save_stack_warning(void *data, char *msg) @@ -83,3 +84,59 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +void save_stack_trace_user(struct stack_trace *trace) +{ + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm) { + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame frame; + frame.next_fp = NULL; + frame.return_address = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.return_address) + trace->entries[trace->nr_entries++] = + frame.return_address; + if (fp == frame.next_fp) + break; + fp = frame.next_fp; + } + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + -- cgit v1.2.3 From c450d7805b2c5cac8846c5f490fddfd9030d2207 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Fri, 21 Nov 2008 23:17:09 +0100 Subject: x86: vmware - fix sparse warnings Impact: fix sparse build warning Fix the following sparse warnings: arch/x86/kernel/cpu/vmware.c:69:5: warning: symbol 'vmware_platform' was not declared. Should it be static? arch/x86/kernel/cpu/vmware.c:89:15: warning: symbol 'vmware_get_tsc_khz' was not declared. Should it be static? arch/x86/kernel/cpu/vmware.c:107:16: warning: symbol 'vmware_set_feature_bits' was not declared. Should it be static? Signed-off-by: Hannes Eder Cc: "Alok N Kataria" Cc: "Dan Hecht" Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/vmware.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index c034bda842d..284c399e323 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -23,6 +23,7 @@ #include #include +#include #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 -- cgit v1.2.3 From 4e42ebd57b2e727b28bf5f6068e95cd19b0e807b Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Fri, 21 Nov 2008 22:56:17 +0100 Subject: x86: hypervisor - fix sparse warnings Impact: fix sparse build warning Fix the following sparse warnings: arch/x86/kernel/cpu/hypervisor.c:37:15: warning: symbol 'get_hypervisor_tsc_freq' was not declared. Should it be static? arch/x86/kernel/cpu/hypervisor.c:53:16: warning: symbol 'init_hypervisor' was not declared. Should it be static? Signed-off-by: Hannes Eder Cc: "Alok N Kataria" Cc: "Dan Hecht" Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/hypervisor.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 35ae2b75226..fb5b86af0b0 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -23,6 +23,7 @@ #include #include +#include static inline void __cpuinit detect_hypervisor_vendor(struct cpuinfo_x86 *c) -- cgit v1.2.3 From 8d7c6a96164651dbbab449ef0b5c20ae1f76a3a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sun, 23 Nov 2008 12:39:06 +0200 Subject: tracing/stack-tracer: fix style issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 51 +++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index b1515306041..10786af9554 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); struct stack_frame { const void __user *next_fp; - unsigned long return_address; + unsigned long ret_addr; }; static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) @@ -108,33 +108,40 @@ static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) return ret; } +static inline void __save_stack_trace_user(struct stack_trace *trace) +{ + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.ret_addr) { + trace->entries[trace->nr_entries++] = + frame.ret_addr; + } + if (fp == frame.next_fp) + break; + fp = frame.next_fp; + } +} + void save_stack_trace_user(struct stack_trace *trace) { /* * Trace user stack if we are not a kernel thread */ if (current->mm) { - const struct pt_regs *regs = task_pt_regs(current); - const void __user *fp = (const void __user *)regs->bp; - - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = regs->ip; - - while (trace->nr_entries < trace->max_entries) { - struct stack_frame frame; - frame.next_fp = NULL; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; - if (frame.return_address) - trace->entries[trace->nr_entries++] = - frame.return_address; - if (fp == frame.next_fp) - break; - fp = frame.next_fp; - } + __save_stack_trace_user(trace); } if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; -- cgit v1.2.3 From 8d26487fd4ddda7a0237da418fb8669fb06ae557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sun, 23 Nov 2008 12:39:08 +0200 Subject: tracing/stack-tracer: introduce CONFIG_USER_STACKTRACE_SUPPORT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup User stack tracing is just implemented for x86, but it is not x86 specific. Introduce a generic config flag, that is currently enabled only for x86. When other arches implement it, they will have to SELECT USER_STACKTRACE_SUPPORT. Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7a146baaa99..e49a4fd718f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -36,6 +36,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS + select USER_STACKTRACE_SUPPORT config ARCH_DEFCONFIG string -- cgit v1.2.3 From 050dc6944b9ca2186f4729ab44e0da3743933941 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 23 Nov 2008 13:35:48 +0100 Subject: x86: remove duplicate #define from 'cpufeature.h' Impact: cleanup Remove duplicate #define from 'cpufeature.h'. This also fixes the following sparse warning: arch/x86/kernel/cpu/capflags.c:54:3: warning: Initializer entry defined twice arch/x86/kernel/cpu/capflags.c:58:3: also defined here Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 694d1f8f1be..5bce8ed02b4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -80,7 +80,6 @@ #define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */ #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ #define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */ -- cgit v1.2.3 From a1a00b58855ccdbedf556b4f5638d5208b454472 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 23 Nov 2008 19:37:09 +0100 Subject: x86: boot - fix sparse warnings Impact: make global variables static Fix these sparse warnings: arch/x86/boot/video.c:233:3: warning: symbol 'saved' was not declared. Should it be static? arch/x86/boot/video-vga.c:37:13: warning: symbol 'video_vga' was not declared. Should it be static? Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- arch/x86/boot/video-vga.c | 4 ++-- arch/x86/boot/video.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index b939cb476de..5d4742ed4aa 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -34,7 +34,7 @@ static struct mode_info cga_modes[] = { { VIDEO_80x25, 80, 25, 0 }, }; -__videocard video_vga; +static __videocard video_vga; /* Set basic 80x25 mode */ static u8 vga_set_basic_mode(void) @@ -259,7 +259,7 @@ static int vga_probe(void) return mode_count[adapter]; } -__videocard video_vga = { +static __videocard video_vga = { .card_name = "VGA", .probe = vga_probe, .set_mode = vga_set_mode, diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 83598b23093..3bef2c1febe 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -226,7 +226,7 @@ static unsigned int mode_menu(void) #ifdef CONFIG_VIDEO_RETAIN /* Save screen content to the heap */ -struct saved_screen { +static struct saved_screen { int x, y; int curx, cury; u16 *data; -- cgit v1.2.3 From 3b71e9e307b3406aa29960a7428247f8a48b810c Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 23 Nov 2008 20:19:33 +0100 Subject: x86: HPET: fix sparse warning Impact: make global variable static Fix this sparse warning: arch/x86/kernel/hpet.c:36:18: warning: symbol 'hpet_num_timers' was not declared. Should it be static? Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 067d8de913f..15fcaacc1f8 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,7 +33,7 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; -unsigned long hpet_num_timers; +static unsigned long hpet_num_timers; static void __iomem *hpet_virt_address; struct hpet_dev { -- cgit v1.2.3 From b47b92884212008b4bd044ba6b48b93c00b10ec6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 24 Nov 2008 00:50:09 -0800 Subject: x86: drop REBOOT_CF9_COND from reboot fallback chain Impact: Reverts sequence of reboot fallbacks Checkin 14d7ca5c575853664d8fe4f225a77b8df1b7de7d changed the default reboot method to "pci", a.k.a. port CF9. Unfortunately this has been shown to cause lockups on at least two systems for which REBOOT_KBD worked, both Thinkpads with Intel chipsets. Checkin 3889d0cea2b73049bdca062d9ff1e5d33468289c reverted the default, but did not revert the fallback chain. This checkin reverts the fallback chain; port CF9 is now only done by explicit "reboot=pci" or a future potential DMI key. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 790b09fbadc..bb387ab0eea 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -384,20 +384,20 @@ static void native_machine_emergency_restart(void) load_idt(&no_idt); __asm__ __volatile__("int3"); - reboot_type = BOOT_CF9_COND; + reboot_type = BOOT_KBD; break; #ifdef CONFIG_X86_32 case BOOT_BIOS: machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); - reboot_type = BOOT_CF9_COND; + reboot_type = BOOT_KBD; break; #endif case BOOT_ACPI: acpi_reboot(); - reboot_type = BOOT_CF9_COND; + reboot_type = BOOT_KBD; break; case BOOT_EFI: @@ -406,7 +406,7 @@ static void native_machine_emergency_restart(void) EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - reboot_type = BOOT_CF9_COND; + reboot_type = BOOT_KBD; break; case BOOT_CF9: -- cgit v1.2.3 From ad07e914e681f18ec0eaba60db17f497ee7e7e78 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 24 Nov 2008 11:33:12 +0100 Subject: x86 defconfig: increase CONFIG_LOG_BUF_SHIFT Impact: double the defconfig printk buffer Booting defconfigs produces more output than 128K so the output is truncated - double it to 256K. Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 2 +- arch/x86/configs/x86_64_defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 13b8c86ae98..71fc39c7078 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -77,7 +77,7 @@ CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y CONFIG_AUDIT_TREE=y # CONFIG_IKCONFIG is not set -CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y # CONFIG_CGROUP_DEBUG is not set CONFIG_CGROUP_NS=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index f0a03d7a7d6..b38bbabc170 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -77,7 +77,7 @@ CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y CONFIG_AUDIT_TREE=y # CONFIG_IKCONFIG is not set -CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y # CONFIG_CGROUP_DEBUG is not set CONFIG_CGROUP_NS=y -- cgit v1.2.3 From e951e4af2e399c46891004d4931333d2d8d520ab Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 25 Nov 2008 08:42:01 +0100 Subject: x86: fix unused variable warning in arch/x86/kernel/hpet.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix build warning this warning: arch/x86/kernel/hpet.c:36: warning: ‘hpet_num_timers’ defined but not used Triggers because hpet_num_timers is unused in the !CONFIG_PCI_MSI case. Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 15fcaacc1f8..3f0a3edf0a5 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,7 +33,9 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +#ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; +#endif static void __iomem *hpet_virt_address; struct hpet_dev { -- cgit v1.2.3 From ca0002a179bfa532d009a9272d619732872c49bd Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:01:25 +0100 Subject: x86, bts: base in-kernel ds interface on handles Impact: generalize the DS code to shared buffers Change the in-kernel ds.h interface to identify the tracer via a handle returned on ds_request_~(). Tracers used to be identified via their task_struct. The changes are required to allow DS to be shared between different tasks, which is needed for perfmon2 and for ftrace. For ptrace, the handle is stored in the traced task's task_struct. This should probably go into a (arch-specific) ptrace context some time. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 124 ++++----- arch/x86/kernel/ds.c | 679 +++++++++++++++++++++++----------------------- arch/x86/kernel/ptrace.c | 73 ++--- 3 files changed, 437 insertions(+), 439 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index a95008457ea..0af997de5f0 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -26,11 +26,18 @@ #include #include +#include #ifdef CONFIG_X86_DS struct task_struct; +struct ds_tracer; +struct bts_tracer; +struct pebs_tracer; + +typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); +typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); /* * Request BTS or PEBS @@ -38,21 +45,29 @@ struct task_struct; * Due to alignement constraints, the actual buffer may be slightly * smaller than the requested or provided buffer. * - * Returns 0 on success; -Eerrno otherwise + * Returns a pointer to a tracer structure on success, or + * ERR_PTR(errcode) on failure. + * + * The interrupt threshold is independent from the overflow callback + * to allow users to use their own overflow interrupt handling mechanism. * * task: the task to request recording for; * NULL for per-cpu recording on the current cpu * base: the base pointer for the (non-pageable) buffer; * NULL if buffer allocation requested - * size: the size of the requested or provided buffer + * size: the size of the requested or provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; * NULL if cyclic buffer requested + * th: the interrupt threshold in records from the end of the buffer; + * -1 if no interrupt threshold is requested. */ -typedef void (*ds_ovfl_callback_t)(struct task_struct *); -extern int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl); -extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl); +extern struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th); +extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th); /* * Release BTS or PEBS resources @@ -61,37 +76,34 @@ extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, * * Returns 0 on success; -Eerrno otherwise * - * task: the task to release resources for; - * NULL to release resources for the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_release_bts(struct task_struct *task); -extern int ds_release_pebs(struct task_struct *task); +extern int ds_release_bts(struct bts_tracer *tracer); +extern int ds_release_pebs(struct pebs_tracer *tracer); /* - * Return the (array) index of the write pointer. + * Get the (array) index of the write pointer. * (assuming an array of BTS/PEBS records) * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result */ -extern int ds_get_bts_index(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_index(struct task_struct *task, size_t *pos); +extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos); /* - * Return the (array) index one record beyond the end of the array. + * Get the (array) index one record beyond the end of the array. * (assuming an array of BTS/PEBS records) * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result */ -extern int ds_get_bts_end(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); +extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos); /* * Provide a pointer to the BTS/PEBS record at parameter index. @@ -102,14 +114,13 @@ extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); * * Returns the size of a single record on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() * index: the index of the requested record * record (out): pointer to the requested record */ -extern int ds_access_bts(struct task_struct *task, +extern int ds_access_bts(struct bts_tracer *tracer, size_t index, const void **record); -extern int ds_access_pebs(struct task_struct *task, +extern int ds_access_pebs(struct pebs_tracer *tracer, size_t index, const void **record); /* @@ -129,38 +140,24 @@ extern int ds_access_pebs(struct task_struct *task, * * Returns the number of bytes written or -Eerrno. * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() * buffer: the buffer to write * size: the size of the buffer */ -extern int ds_write_bts(struct task_struct *task, +extern int ds_write_bts(struct bts_tracer *tracer, const void *buffer, size_t size); -extern int ds_write_pebs(struct task_struct *task, +extern int ds_write_pebs(struct pebs_tracer *tracer, const void *buffer, size_t size); -/* - * Same as ds_write_bts/pebs, but omit ownership checks. - * - * This is needed to have some other task than the owner of the - * BTS/PEBS buffer or the parameter task itself write into the - * respective buffer. - */ -extern int ds_unchecked_write_bts(struct task_struct *task, - const void *buffer, size_t size); -extern int ds_unchecked_write_pebs(struct task_struct *task, - const void *buffer, size_t size); - /* * Reset the write pointer of the BTS/PEBS buffer. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_reset_bts(struct task_struct *task); -extern int ds_reset_pebs(struct task_struct *task); +extern int ds_reset_bts(struct bts_tracer *tracer); +extern int ds_reset_pebs(struct pebs_tracer *tracer); /* * Clear the BTS/PEBS buffer and reset the write pointer. @@ -168,33 +165,30 @@ extern int ds_reset_pebs(struct task_struct *task); * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_clear_bts(struct task_struct *task); -extern int ds_clear_pebs(struct task_struct *task); +extern int ds_clear_bts(struct bts_tracer *tracer); +extern int ds_clear_pebs(struct pebs_tracer *tracer); /* * Provide the PEBS counter reset value. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs() * value (out): the counter reset value */ -extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); +extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value); /* * Set the PEBS counter reset value. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs() * value: the new counter reset value */ -extern int ds_set_pebs_reset(struct task_struct *task, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); /* * Initialization @@ -207,17 +201,13 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); /* * The DS context - part of struct thread_struct. */ +#define MAX_SIZEOF_DS (12 * 8) + struct ds_context { /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char *ds; + unsigned char ds[MAX_SIZEOF_DS]; /* the owner of the BTS and PEBS configuration, respectively */ - struct task_struct *owner[2]; - /* buffer overflow notification function for BTS and PEBS */ - ds_ovfl_callback_t callback[2]; - /* the original buffer address */ - void *buffer[2]; - /* the number of allocated pages for on-request allocated buffers */ - unsigned int pages[2]; + struct ds_tracer *owner[2]; /* use count */ unsigned long count; /* a pointer to the context location inside the thread_struct diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index d6938d9351c..96768e9cce9 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -28,6 +28,7 @@ #include #include #include +#include /* @@ -44,6 +45,35 @@ struct ds_configuration { }; static struct ds_configuration ds_cfg; +/* + * A BTS or PEBS tracer. + * + * This holds the configuration of the tracer and serves as a handle + * to identify tracers. + */ +struct ds_tracer { + /* the DS context (partially) owned by this tracer */ + struct ds_context *context; + /* the buffer provided on ds_request() and its size in bytes */ + void *buffer; + size_t size; + /* the number of allocated pages for on-request allocated buffers */ + unsigned int pages; +}; + +struct bts_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* buffer overflow notification function */ + bts_ovfl_callback_t ovfl; +}; + +struct pebs_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* buffer overflow notification function */ + pebs_ovfl_callback_t ovfl; +}; /* * Debug Store (DS) save area configuration (see Intel64 and IA32 @@ -107,35 +137,15 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, (*(unsigned long *)base) = value; } +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ + /* * Locking is done only for allocating BTS or PEBS resources and for * guarding context and buffer memory allocation. - * - * Most functions require the current task to own the ds context part - * they are going to access. All the locking is done when validating - * access to the context. */ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); -/* - * Validate that the current task is allowed to access the BTS/PEBS - * buffer of the parameter task. - * - * Returns 0, if access is granted; -Eerrno, otherwise. - */ -static inline int ds_validate_access(struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return -EPERM; - - if (context->owner[qual] == current) - return 0; - - return -EPERM; -} - /* * We either support (system-wide) per-cpu or per-thread allocation. @@ -183,50 +193,12 @@ static inline int check_tracer(struct task_struct *task) * * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. - * - * We distinguish between an allocating and a non-allocating get of a - * context: - * - the allocating get is used for requesting BTS/PEBS resources. It - * requires the caller to hold the global ds_lock. - * - the non-allocating get is used for all other cases. A - * non-existing context indicates an error. It acquires and releases - * the ds_lock itself for obtaining the context. - * - * A context and its DS configuration are allocated and deallocated - * together. A context always has a DS configuration of the - * appropriate size. */ static DEFINE_PER_CPU(struct ds_context *, system_context); #define this_system_context per_cpu(system_context, smp_processor_id()) -/* - * Returns the pointer to the parameter task's context or to the - * system-wide context, if task is NULL. - * - * Increases the use count of the returned context, if not NULL. - */ static inline struct ds_context *ds_get_context(struct task_struct *task) -{ - struct ds_context *context; - unsigned long irq; - - spin_lock_irqsave(&ds_lock, irq); - - context = (task ? task->thread.ds_ctx : this_system_context); - if (context) - context->count++; - - spin_unlock_irqrestore(&ds_lock, irq); - - return context; -} - -/* - * Same as ds_get_context, but allocates the context and it's DS - * structure, if necessary; returns NULL; if out of memory. - */ -static inline struct ds_context *ds_alloc_context(struct task_struct *task) { struct ds_context **p_context = (task ? &task->thread.ds_ctx : &this_system_context); @@ -238,16 +210,9 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task) if (!context) return NULL; - context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); - if (!context->ds) { - kfree(context); - return NULL; - } - spin_lock_irqsave(&ds_lock, irq); if (*p_context) { - kfree(context->ds); kfree(context); context = *p_context; @@ -272,10 +237,6 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task) return context; } -/* - * Decreases the use count of the parameter context, if not NULL. - * Deallocates the context, if the use count reaches zero. - */ static inline void ds_put_context(struct ds_context *context) { unsigned long irq; @@ -296,13 +257,6 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - put_tracer(context->task); - - /* free any leftover buffers from tracers that did not - * deallocate them properly. */ - kfree(context->buffer[ds_bts]); - kfree(context->buffer[ds_pebs]); - kfree(context->ds); kfree(context); out: spin_unlock_irqrestore(&ds_lock, irq); @@ -312,21 +266,29 @@ static inline void ds_put_context(struct ds_context *context) /* * Handle a buffer overflow * - * task: the task whose buffers are overflowing; - * NULL for a buffer overflow on the current cpu * context: the ds context * qual: the buffer type */ -static void ds_overflow(struct task_struct *task, struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return; - - if (context->callback[qual]) - (*context->callback[qual])(task); - - /* todo: do some more overflow handling */ +static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) +{ + switch (qual) { + case ds_bts: { + struct bts_tracer *tracer = + container_of(context->owner[qual], + struct bts_tracer, ds); + if (tracer->ovfl) + tracer->ovfl(tracer); + } + break; + case ds_pebs: { + struct pebs_tracer *tracer = + container_of(context->owner[qual], + struct pebs_tracer, ds); + if (tracer->ovfl) + tracer->ovfl(tracer); + } + break; + } } @@ -343,23 +305,25 @@ static void ds_overflow(struct task_struct *task, struct ds_context *context, static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) { unsigned long rlim, vm, pgsz; - void *buffer; + void *buffer = NULL; pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + down_write(¤t->mm->mmap_sem); + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; vm = current->mm->total_vm + pgsz; if (rlim < vm) - return NULL; + goto out; rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; vm = current->mm->locked_vm + pgsz; if (rlim < vm) - return NULL; + goto out; buffer = kzalloc(size, GFP_KERNEL); if (!buffer) - return NULL; + goto out; current->mm->total_vm += pgsz; current->mm->locked_vm += pgsz; @@ -367,290 +331,337 @@ static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) if (pages) *pages = pgsz; + out: + up_write(¤t->mm->mmap_sem); return buffer; } -static int ds_request(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl, enum ds_qualifier qual) +static void ds_install_ds_config(struct ds_context *context, + enum ds_qualifier qual, + void *base, size_t size, size_t ith) { - struct ds_context *context; unsigned long buffer, adj; - const unsigned long alignment = (1 << 3); + + /* adjust the buffer address and size to meet alignment + * constraints: + * - buffer is double-word aligned + * - size is multiple of record size + * + * We checked the size at the very beginning; we have enough + * space to do the adjustment. + */ + buffer = (unsigned long)base; + + adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; + buffer += adj; + size -= adj; + + size /= ds_cfg.sizeof_rec[qual]; + size *= ds_cfg.sizeof_rec[qual]; + + ds_set(context->ds, qual, ds_buffer_base, buffer); + ds_set(context->ds, qual, ds_index, buffer); + ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + + /* The value for 'no threshold' is -1, which will set the + * threshold outside of the buffer, just like we want it. + */ + ds_set(context->ds, qual, + ds_interrupt_threshold, buffer + size - ith); +} + +static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, + struct task_struct *task, + void *base, size_t size, size_t th) +{ + struct ds_context *context; unsigned long irq; - int error = 0; + int error; + error = -EOPNOTSUPP; if (!ds_cfg.sizeof_ds) - return -EOPNOTSUPP; + goto out; /* we require some space to do alignment adjustments below */ - if (size < (alignment + ds_cfg.sizeof_rec[qual])) - return -EINVAL; + error = -EINVAL; + if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + goto out; - /* buffer overflow notification is not yet implemented */ - if (ovfl) - return -EOPNOTSUPP; + if (th != (size_t)-1) { + th *= ds_cfg.sizeof_rec[qual]; + + error = -EINVAL; + if (size <= th) + goto out; + } + + error = -ENOMEM; + if (!base) { + base = ds_allocate_buffer(size, &tracer->pages); + if (!base) + goto out; + } + tracer->buffer = base; + tracer->size = size; - context = ds_alloc_context(task); + error = -ENOMEM; + context = ds_get_context(task); if (!context) - return -ENOMEM; + goto out; + tracer->context = context; + spin_lock_irqsave(&ds_lock, irq); error = -EPERM; if (!check_tracer(task)) goto out_unlock; - get_tracer(task); - error = -EALREADY; - if (context->owner[qual] == current) - goto out_put_tracer; error = -EPERM; - if (context->owner[qual] != NULL) + if (context->owner[qual]) goto out_put_tracer; - context->owner[qual] = current; + context->owner[qual] = tracer; spin_unlock_irqrestore(&ds_lock, irq); - error = -ENOMEM; - if (!base) { - base = ds_allocate_buffer(size, &context->pages[qual]); - if (!base) - goto out_release; - - context->buffer[qual] = base; - } - error = 0; + ds_install_ds_config(context, qual, base, size, th); - context->callback[qual] = ovfl; - - /* adjust the buffer address and size to meet alignment - * constraints: - * - buffer is double-word aligned - * - size is multiple of record size - * - * We checked the size at the very beginning; we have enough - * space to do the adjustment. - */ - buffer = (unsigned long)base; - - adj = ALIGN(buffer, alignment) - buffer; - buffer += adj; - size -= adj; - - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; - - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); - - if (ovfl) { - /* todo: select a suitable interrupt threshold */ - } else - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size + 1); - - /* we keep the context until ds_release */ - return error; - - out_release: - context->owner[qual] = NULL; - ds_put_context(context); - put_tracer(task); - return error; + return 0; out_put_tracer: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); put_tracer(task); - return error; - out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(context); + tracer->context = NULL; + out: return error; } -int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) +struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th) { - return ds_request(task, base, size, ovfl, ds_bts); -} + struct bts_tracer *tracer; + int error; -int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) -{ - return ds_request(task, base, size, ovfl, ds_pebs); + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) + goto out; + + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; + + error = ds_request(&tracer->ds, ds_bts, task, base, size, th); + if (error < 0) + goto out_tracer; + + return tracer; + + out_tracer: + (void)ds_release_bts(tracer); + out: + return ERR_PTR(error); } -static int ds_release(struct task_struct *task, enum ds_qualifier qual) +struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th) { - struct ds_context *context; + struct pebs_tracer *tracer; int error; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) goto out; - kfree(context->buffer[qual]); - context->buffer[qual] = NULL; + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; - current->mm->total_vm -= context->pages[qual]; - current->mm->locked_vm -= context->pages[qual]; - context->pages[qual] = 0; - context->owner[qual] = NULL; + error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); + if (error < 0) + goto out_tracer; - /* - * we put the context twice: - * once for the ds_get_context - * once for the corresponding ds_request - */ - ds_put_context(context); + return tracer; + + out_tracer: + (void)ds_release_pebs(tracer); out: - ds_put_context(context); - return error; + return ERR_PTR(error); +} + +static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) +{ + if (tracer->context) { + BUG_ON(tracer->context->owner[qual] != tracer); + tracer->context->owner[qual] = NULL; + + put_tracer(tracer->context->task); + ds_put_context(tracer->context); + } + + if (tracer->pages) { + kfree(tracer->buffer); + + down_write(¤t->mm->mmap_sem); + + current->mm->total_vm -= tracer->pages; + current->mm->locked_vm -= tracer->pages; + + up_write(¤t->mm->mmap_sem); + } } -int ds_release_bts(struct task_struct *task) +int ds_release_bts(struct bts_tracer *tracer) { - return ds_release(task, ds_bts); + if (!tracer) + return -EINVAL; + + ds_release(&tracer->ds, ds_bts); + kfree(tracer); + + return 0; } -int ds_release_pebs(struct task_struct *task) +int ds_release_pebs(struct pebs_tracer *tracer) { - return ds_release(task, ds_pebs); + if (!tracer) + return -EINVAL; + + ds_release(&tracer->ds, ds_pebs); + kfree(tracer); + + return 0; } -static int ds_get_index(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) +static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) { - struct ds_context *context; unsigned long base, index; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; base = ds_get(context->ds, qual, ds_buffer_base); index = ds_get(context->ds, qual, ds_index); - error = ((index - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; + return (index - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_index(struct task_struct *task, size_t *pos) +int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos) { - return ds_get_index(task, pos, ds_bts); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_index(tracer->ds.context, ds_bts); + + return 0; } -int ds_get_pebs_index(struct task_struct *task, size_t *pos) +int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos) { - return ds_get_index(task, pos, ds_pebs); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_index(tracer->ds.context, ds_pebs); + + return 0; } -static int ds_get_end(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) +static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual) { - struct ds_context *context; - unsigned long base, end; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + unsigned long base, max; base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); + max = ds_get(context->ds, qual, ds_absolute_maximum); - error = ((end - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; + return (max - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_end(struct task_struct *task, size_t *pos) +int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos) { - return ds_get_end(task, pos, ds_bts); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_end(tracer->ds.context, ds_bts); + + return 0; } -int ds_get_pebs_end(struct task_struct *task, size_t *pos) +int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos) { - return ds_get_end(task, pos, ds_pebs); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_end(tracer->ds.context, ds_pebs); + + return 0; } -static int ds_access(struct task_struct *task, size_t index, - const void **record, enum ds_qualifier qual) +static int ds_access(struct ds_context *context, enum ds_qualifier qual, + size_t index, const void **record) { - struct ds_context *context; unsigned long base, idx; - int error; if (!record) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - base = ds_get(context->ds, qual, ds_buffer_base); idx = base + (index * ds_cfg.sizeof_rec[qual]); - error = -EINVAL; if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - goto out; + return -EINVAL; *record = (const void *)idx; - error = ds_cfg.sizeof_rec[qual]; - out: - ds_put_context(context); - return error; + + return ds_cfg.sizeof_rec[qual]; } -int ds_access_bts(struct task_struct *task, size_t index, const void **record) +int ds_access_bts(struct bts_tracer *tracer, size_t index, + const void **record) { - return ds_access(task, index, record, ds_bts); + if (!tracer) + return -EINVAL; + + return ds_access(tracer->ds.context, ds_bts, index, record); } -int ds_access_pebs(struct task_struct *task, size_t index, const void **record) +int ds_access_pebs(struct pebs_tracer *tracer, size_t index, + const void **record) { - return ds_access(task, index, record, ds_pebs); + if (!tracer) + return -EINVAL; + + return ds_access(tracer->ds.context, ds_pebs, index, record); } -static int ds_write(struct task_struct *task, const void *record, size_t size, - enum ds_qualifier qual, int force) +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) { - struct ds_context *context; - int error; + int bytes_written = 0; if (!record) return -EINVAL; - error = -EPERM; - context = ds_get_context(task); - if (!context) - goto out; - - if (!force) { - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - } - - error = 0; while (size) { unsigned long base, index, end, write_end, int_th; unsigned long write_size, adj_write_size; @@ -678,14 +689,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size, write_end = end; if (write_end <= index) - goto out; + break; write_size = min((unsigned long) size, write_end - index); memcpy((void *)index, record, write_size); record = (const char *)record + write_size; - size -= write_size; - error += write_size; + size -= write_size; + bytes_written += write_size; adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; adj_write_size *= ds_cfg.sizeof_rec[qual]; @@ -700,47 +711,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size, ds_set(context->ds, qual, ds_index, index); if (index >= int_th) - ds_overflow(task, context, qual); + ds_overflow(context, qual); } - out: - ds_put_context(context); - return error; + return bytes_written; } -int ds_write_bts(struct task_struct *task, const void *record, size_t size) +int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size) { - return ds_write(task, record, size, ds_bts, /* force = */ 0); -} + if (!tracer) + return -EINVAL; -int ds_write_pebs(struct task_struct *task, const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 0); + return ds_write(tracer->ds.context, ds_bts, record, size); } -int ds_unchecked_write_bts(struct task_struct *task, - const void *record, size_t size) +int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size) { - return ds_write(task, record, size, ds_bts, /* force = */ 1); -} + if (!tracer) + return -EINVAL; -int ds_unchecked_write_pebs(struct task_struct *task, - const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 1); + return ds_write(tracer->ds.context, ds_pebs, record, size); } -static int ds_reset_or_clear(struct task_struct *task, - enum ds_qualifier qual, int clear) +static void ds_reset_or_clear(struct ds_context *context, + enum ds_qualifier qual, int clear) { - struct ds_context *context; unsigned long base, end; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; base = ds_get(context->ds, qual, ds_buffer_base); end = ds_get(context->ds, qual, ds_absolute_maximum); @@ -749,70 +745,69 @@ static int ds_reset_or_clear(struct task_struct *task, memset((void *)base, 0, end - base); ds_set(context->ds, qual, ds_index, base); - - error = 0; - out: - ds_put_context(context); - return error; } -int ds_reset_bts(struct task_struct *task) +int ds_reset_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); + + return 0; } -int ds_reset_pebs(struct task_struct *task) +int ds_reset_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); + + return 0; } -int ds_clear_bts(struct task_struct *task) +int ds_clear_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); + + return 0; } -int ds_clear_pebs(struct task_struct *task) +int ds_clear_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); + + return 0; } -int ds_get_pebs_reset(struct task_struct *task, u64 *value) +int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value) { - struct ds_context *context; - int error; + if (!tracer) + return -EINVAL; if (!value) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; - - *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); + *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - error = 0; - out: - ds_put_context(context); - return error; + return 0; } -int ds_set_pebs_reset(struct task_struct *task, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) { - struct ds_context *context; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + if (!tracer) + return -EINVAL; - *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; - error = 0; - out: - ds_put_context(context); - return error; + return 0; } static const struct ds_configuration ds_cfg_var = { @@ -840,6 +835,10 @@ static inline void ds_configure(const struct ds_configuration *cfg) { ds_cfg = *cfg; + + printk(KERN_INFO "DS available\n"); + + BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -883,6 +882,8 @@ void ds_free(struct ds_context *context) * is dying. There should not be any user of that context left * to disturb us, anymore. */ unsigned long leftovers = context->count; - while (leftovers--) + while (leftovers--) { + put_tracer(context->task); ds_put_context(context); + } } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 06180dff5b2..76adf5b640f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, size_t bts_index, bts_end; int error; - error = ds_get_bts_end(child, &bts_end); + error = ds_get_bts_end(child->bts, &bts_end); if (error < 0) return error; if (bts_end <= index) return -EINVAL; - error = ds_get_bts_index(child, &bts_index); + error = ds_get_bts_index(child->bts, &bts_index); if (error < 0) return error; @@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (bts_end <= bts_index) bts_index -= bts_end; - error = ds_access_bts(child, bts_index, &bts_record); + error = ds_access_bts(child->bts, bts_index, &bts_record); if (error < 0) return error; @@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child, size_t end, i; int error; - error = ds_get_bts_index(child, &end); + error = ds_get_bts_index(child->bts, &end); if (error < 0) return error; if (size < (end * sizeof(struct bts_struct))) return -EIO; - error = ds_access_bts(child, 0, (const void **)&raw); + error = ds_access_bts(child->bts, 0, (const void **)&raw); if (error < 0) return error; @@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child, return -EFAULT; } - error = ds_clear_bts(child); + error = ds_clear_bts(child->bts); if (error < 0) return error; return end; } -static void ptrace_bts_ovfl(struct task_struct *child) -{ - send_sig(child->thread.bts_ovfl_signal, child, 0); -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) @@ -760,23 +755,29 @@ static int ptrace_bts_config(struct task_struct *child, goto errout; if (cfg.flags & PTRACE_BTS_O_ALLOC) { - ds_ovfl_callback_t ovfl = NULL; + bts_ovfl_callback_t ovfl = NULL; unsigned int sig = 0; - /* we ignore the error in case we were not tracing child */ - (void)ds_release_bts(child); - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) goto errout; + error = -EOPNOTSUPP; + goto errout; + sig = cfg.signal; - ovfl = ptrace_bts_ovfl; } - error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); - if (error < 0) + if (child->bts) + (void)ds_release_bts(child->bts); + + child->bts = ds_request_bts(child, /* base = */ NULL, cfg.size, + ovfl, /* th = */ (size_t)-1); + if (IS_ERR(child->bts)) { + error = PTR_ERR(child->bts); + child->bts = NULL; goto errout; + } child->thread.bts_ovfl_signal = sig; } @@ -823,15 +824,15 @@ static int ptrace_bts_status(struct task_struct *child, if (cfg_size < sizeof(cfg)) return -EIO; - error = ds_get_bts_end(child, &end); + error = ds_get_bts_end(child->bts, &end); if (error < 0) return error; - error = ds_access_bts(child, /* index = */ 0, &base); + error = ds_access_bts(child->bts, /* index = */ 0, &base); if (error < 0) return error; - error = ds_access_bts(child, /* index = */ end, &max); + error = ds_access_bts(child->bts, /* index = */ end, &max); if (error < 0) return error; @@ -884,10 +885,7 @@ static int ptrace_bts_write_record(struct task_struct *child, return -EINVAL; } - /* The writing task will be the switched-to task on a context - * switch. It needs to write into the switched-from task's BTS - * buffer. */ - return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); + return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts); } void ptrace_bts_take_timestamp(struct task_struct *tsk, @@ -972,13 +970,15 @@ void ptrace_disable(struct task_struct *child) clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif #ifdef CONFIG_X86_PTRACE_BTS - (void)ds_release_bts(child); + if (child->bts) { + (void)ds_release_bts(child->bts); - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + } #endif /* CONFIG_X86_PTRACE_BTS */ } @@ -1110,9 +1110,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) (child, data, (struct ptrace_bts_config __user *)addr); break; - case PTRACE_BTS_SIZE: - ret = ds_get_bts_index(child, /* pos = */ NULL); + case PTRACE_BTS_SIZE: { + size_t size; + + ret = ds_get_bts_index(child->bts, &size); + if (ret == 0) { + BUG_ON(size != (int) size); + ret = (int) size; + } break; + } case PTRACE_BTS_GET: ret = ptrace_bts_read_record @@ -1120,7 +1127,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ds_clear_bts(child); + ret = ds_clear_bts(child->bts); break; case PTRACE_BTS_DRAIN: -- cgit v1.2.3 From 6abb11aecd888d1da6276399380b7355f127c006 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:05:27 +0100 Subject: x86, bts, ptrace: move BTS buffer allocation from ds.c into ptrace.c Impact: restructure DS memory allocation to be done by the usage site of DS Require pre-allocated buffers in ds.h. Move the BTS buffer allocation for ptrace into ptrace.c. The pointer to the allocated buffer is stored in the traced task's task_struct together with the handle returned by ds_request_bts(). Removes memory accounting code. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 12 +++---- arch/x86/kernel/ds.c | 92 ++++++++--------------------------------------- arch/x86/kernel/ptrace.c | 22 ++++++++++-- 3 files changed, 38 insertions(+), 88 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 0af997de5f0..99b6c39774a 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -7,13 +7,12 @@ * * It manages: * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done) * - buffer access * * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks * * * Copyright (C) 2007-2008 Intel Corporation. @@ -54,8 +53,7 @@ typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); * task: the task to request recording for; * NULL for per-cpu recording on the current cpu * base: the base pointer for the (non-pageable) buffer; - * NULL if buffer allocation requested - * size: the size of the requested or provided buffer in bytes + * size: the size of the provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; * NULL if cyclic buffer requested * th: the interrupt threshold in records from the end of the buffer; @@ -72,8 +70,6 @@ extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, /* * Release BTS or PEBS resources * - * Frees buffers allocated on ds_request. - * * Returns 0 on success; -Eerrno otherwise * * tracer: the tracer handle returned from ds_request_~() diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 96768e9cce9..19a8c2c0389 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -7,13 +7,12 @@ * * It manages: * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done) * - buffer access * * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks * * * Copyright (C) 2007-2008 Intel Corporation. @@ -57,8 +56,6 @@ struct ds_tracer { /* the buffer provided on ds_request() and its size in bytes */ void *buffer; size_t size; - /* the number of allocated pages for on-request allocated buffers */ - unsigned int pages; }; struct bts_tracer { @@ -141,8 +138,7 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, /* - * Locking is done only for allocating BTS or PEBS resources and for - * guarding context and buffer memory allocation. + * Locking is done only for allocating BTS or PEBS resources. */ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); @@ -292,50 +288,6 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) } -/* - * Allocate a non-pageable buffer of the parameter size. - * Checks the memory and the locked memory rlimit. - * - * Returns the buffer, if successful; - * NULL, if out of memory or rlimit exceeded. - * - * size: the requested buffer size in bytes - * pages (out): if not NULL, contains the number of pages reserved - */ -static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) -{ - unsigned long rlim, vm, pgsz; - void *buffer = NULL; - - pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - - down_write(¤t->mm->mmap_sem); - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) - goto out; - - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) - goto out; - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto out; - - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; - - if (pages) - *pages = pgsz; - - out: - up_write(¤t->mm->mmap_sem); - return buffer; -} - static void ds_install_ds_config(struct ds_context *context, enum ds_qualifier qual, void *base, size_t size, size_t ith) @@ -382,6 +334,10 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, if (!ds_cfg.sizeof_ds) goto out; + error = -EINVAL; + if (!base) + goto out; + /* we require some space to do alignment adjustments below */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) @@ -395,13 +351,6 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, goto out; } - error = -ENOMEM; - if (!base) { - base = ds_allocate_buffer(size, &tracer->pages); - if (!base) - goto out; - } - tracer->buffer = base; tracer->size = size; @@ -466,7 +415,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return tracer; out_tracer: - (void)ds_release_bts(tracer); + kfree(tracer); out: return ERR_PTR(error); } @@ -496,31 +445,18 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return tracer; out_tracer: - (void)ds_release_pebs(tracer); + kfree(tracer); out: return ERR_PTR(error); } static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) { - if (tracer->context) { - BUG_ON(tracer->context->owner[qual] != tracer); - tracer->context->owner[qual] = NULL; - - put_tracer(tracer->context->task); - ds_put_context(tracer->context); - } + BUG_ON(tracer->context->owner[qual] != tracer); + tracer->context->owner[qual] = NULL; - if (tracer->pages) { - kfree(tracer->buffer); - - down_write(¤t->mm->mmap_sem); - - current->mm->total_vm -= tracer->pages; - current->mm->locked_vm -= tracer->pages; - - up_write(¤t->mm->mmap_sem); - } + put_tracer(tracer->context->task); + ds_put_context(tracer->context); } int ds_release_bts(struct bts_tracer *tracer) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 76adf5b640f..2c8ec1ba75e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -758,6 +758,10 @@ static int ptrace_bts_config(struct task_struct *child, bts_ovfl_callback_t ovfl = NULL; unsigned int sig = 0; + error = -EINVAL; + if (cfg.size < (10 * bts_cfg.sizeof_bts)) + goto errout; + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) goto errout; @@ -768,14 +772,26 @@ static int ptrace_bts_config(struct task_struct *child, sig = cfg.signal; } - if (child->bts) + if (child->bts) { (void)ds_release_bts(child->bts); + kfree(child->bts_buffer); + + child->bts = NULL; + child->bts_buffer = NULL; + } + + error = -ENOMEM; + child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); + if (!child->bts_buffer) + goto errout; - child->bts = ds_request_bts(child, /* base = */ NULL, cfg.size, + child->bts = ds_request_bts(child, child->bts_buffer, cfg.size, ovfl, /* th = */ (size_t)-1); if (IS_ERR(child->bts)) { error = PTR_ERR(child->bts); + kfree(child->bts_buffer); child->bts = NULL; + child->bts_buffer = NULL; goto errout; } @@ -972,6 +988,8 @@ void ptrace_disable(struct task_struct *child) #ifdef CONFIG_X86_PTRACE_BTS if (child->bts) { (void)ds_release_bts(child->bts); + kfree(child->bts_buffer); + child->bts_buffer = NULL; child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; if (!child->thread.debugctlmsr) -- cgit v1.2.3 From 1e9b51c28312f7334394aa30be56ff52c2b65b7e Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:24:15 +0100 Subject: x86, bts, ftrace: a BTS ftrace plug-in prototype Impact: add new ftrace plugin A prototype for a BTS ftrace plug-in. The tracer collects branch trace in a cyclic buffer for each cpu. The tracer is not configurable and the trace for each snapshot is appended when doing cat /debug/tracing/trace. This is a proof of concept that will be extended with future patches to become a (hopefully) useful tool. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b815664fe37..85a78575956 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -515,6 +515,7 @@ config CPU_SUP_UMC_32 config X86_DS def_bool X86_PTRACE_BTS depends on X86_DEBUGCTLMSR + select HAVE_HW_BRANCH_TRACER config X86_PTRACE_BTS bool "Branch Trace Store" -- cgit v1.2.3 From fb52607afcd0629776f1dc9e657647ceae81dd50 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 25 Nov 2008 21:07:04 +0100 Subject: tracing/function-return-tracer: change the name into function-graph-tracer Impact: cleanup This patch changes the name of the "return function tracer" into function-graph-tracer which is a more suitable name for a tracing which makes one able to retrieve the ordered call stack during the code flow. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/ftrace.h | 4 ++-- arch/x86/kernel/Makefile | 4 ++-- arch/x86/kernel/entry_32.S | 12 ++++++------ arch/x86/kernel/ftrace.c | 12 ++++++------ 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e49a4fd718f..0842b112768 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,7 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_RET_TRACER if X86_32 + select HAVE_FUNCTION_GRAPH_TRACER if X86_32 select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 754a3e082f9..7e61b4ceb9a 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,7 +28,7 @@ struct dyn_arch_ftrace { #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifndef __ASSEMBLY__ @@ -51,6 +51,6 @@ struct ftrace_ret_stack { extern void return_to_handler(void); #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index af2bc36ca1c..64939a0c398 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,7 +14,7 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif -ifdef CONFIG_FUNCTION_RET_TRACER +ifdef CONFIG_FUNCTION_GRAPH_TRACER # Don't trace __switch_to() but let it for function tracer CFLAGS_REMOVE_process_32.o = -pg endif @@ -70,7 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 74defe21ba4..2b1f0f081a6 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1188,9 +1188,9 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace -#ifdef CONFIG_FUNCTION_RET_TRACER - cmpl $ftrace_stub, ftrace_function_return - jnz ftrace_return_caller +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_function + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: @@ -1215,8 +1215,8 @@ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_RET_TRACER -ENTRY(ftrace_return_caller) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) cmpl $0, function_trace_stop jne ftrace_stub @@ -1230,7 +1230,7 @@ ENTRY(ftrace_return_caller) popl %ecx popl %eax ret -END(ftrace_return_caller) +END(ftrace_graph_caller) .globl return_to_handler return_to_handler: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bb137f7297e..3595a4c14ab 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -323,7 +323,7 @@ int __init ftrace_dyn_arch_init(void *data) } #endif -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifndef CONFIG_DYNAMIC_FTRACE @@ -389,11 +389,11 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, */ unsigned long ftrace_return_to_handler(void) { - struct ftrace_retfunc trace; + struct ftrace_graph_ret trace; pop_return_trace(&trace.ret, &trace.calltime, &trace.func, &trace.overrun); trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_function_return(&trace); + ftrace_graph_function(&trace); return trace.ret; } @@ -440,12 +440,12 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ); if (WARN_ON(faulted)) { - unregister_ftrace_return(); + unregister_ftrace_graph(); return; } if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_return(); + unregister_ftrace_graph(); *parent = old; return; } @@ -456,4 +456,4 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) *parent = old; } -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3 From 287b6e68ca7209caec40b2f44f837c580a413bae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 26 Nov 2008 00:57:25 +0100 Subject: tracing/function-return-tracer: set a more human readable output Impact: feature This patch sets a C-like output for the function graph tracing. For this aim, we now call two handler for each function: one on the entry and one other on return. This way we can draw a well-ordered call stack. The pid of the previous trace is loosely stored to be compared against the one of the current trace to see if there were a context switch. Without this little feature, the call tree would seem broken at some locations. We could use the sched_tracer to capture these sched_events but this way of processing is much more simpler. 2 spaces have been chosen for indentation to fit the screen while deep calls. The time of execution in nanosecs is printed just after closed braces, it seems more easy this way to find the corresponding function. If the time was printed as a first column, it would be not so easy to find the corresponding function if it is called on a deep depth. I plan to output the return value but on 32 bits CPU, the return value can be 32 or 64, and its difficult to guess on which case we are. I don't know what would be the better solution on X86-32: only print eax (low-part) or even edx (high-part). Actually it's thee same problem when a function return a 8 bits value, the high part of eax could contain junk values... Here is an example of trace: sys_read() { fget_light() { } 526 vfs_read() { rw_verify_area() { security_file_permission() { cap_file_permission() { } 519 } 1564 } 2640 do_sync_read() { pipe_read() { __might_sleep() { } 511 pipe_wait() { prepare_to_wait() { } 760 deactivate_task() { dequeue_task() { dequeue_task_fair() { dequeue_entity() { update_curr() { update_min_vruntime() { } 504 } 1587 clear_buddies() { } 512 add_cfs_task_weight() { } 519 update_min_vruntime() { } 511 } 5602 dequeue_entity() { update_curr() { update_min_vruntime() { } 496 } 1631 clear_buddies() { } 496 update_min_vruntime() { } 527 } 4580 hrtick_update() { hrtick_start_fair() { } 488 } 1489 } 13700 } 14949 } 16016 msecs_to_jiffies() { } 496 put_prev_task_fair() { } 504 pick_next_task_fair() { } 489 pick_next_task_rt() { } 496 pick_next_task_fair() { } 489 pick_next_task_idle() { } 489 ------------8<---------- thread 4 ------------8<---------- finish_task_switch() { } 1203 do_softirq() { __do_softirq() { __local_bh_disable() { } 669 rcu_process_callbacks() { __rcu_process_callbacks() { cpu_quiet() { rcu_start_batch() { } 503 } 1647 } 3128 __rcu_process_callbacks() { } 542 } 5362 _local_bh_enable() { } 587 } 8880 } 9986 kthread_should_stop() { } 669 deactivate_task() { dequeue_task() { dequeue_task_fair() { dequeue_entity() { update_curr() { calc_delta_mine() { } 511 update_min_vruntime() { } 511 } 2813 Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3595a4c14ab..26b2d92d48b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -347,7 +347,7 @@ void ftrace_nmi_exit(void) /* Add a function return address to the trace stack on thread info.*/ static int push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func) + unsigned long func, int *depth) { int index; @@ -365,21 +365,22 @@ static int push_return_trace(unsigned long ret, unsigned long long time, current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = time; + *depth = index; return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ -static void pop_return_trace(unsigned long *ret, unsigned long long *time, - unsigned long *func, unsigned long *overrun) +static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) { int index; index = current->curr_ret_stack; *ret = current->ret_stack[index].ret; - *func = current->ret_stack[index].func; - *time = current->ret_stack[index].calltime; - *overrun = atomic_read(¤t->trace_overrun); + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = index; current->curr_ret_stack--; } @@ -390,12 +391,13 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, unsigned long ftrace_return_to_handler(void) { struct ftrace_graph_ret trace; - pop_return_trace(&trace.ret, &trace.calltime, &trace.func, - &trace.overrun); + unsigned long ret; + + pop_return_trace(&trace, &ret); trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_graph_function(&trace); + ftrace_graph_return(&trace); - return trace.ret; + return ret; } /* @@ -407,6 +409,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) unsigned long old; unsigned long long calltime; int faulted; + struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) &return_to_handler; @@ -452,8 +455,15 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) calltime = cpu_clock(raw_smp_processor_id()); - if (push_return_trace(old, calltime, self_addr) == -EBUSY) + if (push_return_trace(old, calltime, + self_addr, &trace.depth) == -EBUSY) { *parent = old; + return; + } + + trace.func = self_addr; + ftrace_graph_entry(&trace); + } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3 From c2324b694fa8ffee382a124198c68754088e483c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 26 Nov 2008 03:10:01 +0100 Subject: tracing: function graph tracer, fix fix return-tracer => graph-tracer namespace rename fallout. Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2b1f0f081a6..7def9fd5c1e 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1189,7 +1189,7 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace #ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_function + cmpl $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller #endif .globl ftrace_stub -- cgit v1.2.3 From 5a45cfe1c64862e8cd3b0d79d7c4ba71c3118915 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:24 -0500 Subject: ftrace: use code patching for ftrace graph tracer Impact: more efficient code for ftrace graph tracer This patch uses the dynamic patching, when available, to patch the function graph code into the kernel. This patch will ease the way for letting both function tracing and function graph tracing run together. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 5 +++++ arch/x86/kernel/ftrace.c | 48 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 7def9fd5c1e..958af86186c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1174,6 +1174,11 @@ ftrace_call: popl %edx popl %ecx popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 26b2d92d48b..7ef914e6a2f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -111,7 +111,6 @@ static void ftrace_mod_code(void) */ mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, MCOUNT_INSN_SIZE); - } void ftrace_nmi_enter(void) @@ -325,7 +324,51 @@ int __init ftrace_dyn_arch_init(void *data) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -#ifndef CONFIG_DYNAMIC_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); + +static int ftrace_mod_jmp(unsigned long ip, + int old_offset, int new_offset) +{ + unsigned char code[MCOUNT_INSN_SIZE]; + + if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (code[0] != 0xe9 || old_offset != *(int *)(&code[1])) + return -EINVAL; + + *(int *)(&code[1]) = new_offset; + + if (do_ftrace_mod_code(ip, &code)) + return -EPERM; + + return 0; +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +#else /* CONFIG_DYNAMIC_FTRACE */ /* * These functions are picked from those used on @@ -343,6 +386,7 @@ void ftrace_nmi_exit(void) { atomic_dec(&in_nmi); } + #endif /* !CONFIG_DYNAMIC_FTRACE */ /* Add a function return address to the trace stack on thread info.*/ -- cgit v1.2.3 From f3f47a6768a29448866da4422b6f6bee485c947f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 23 Nov 2008 16:49:58 -0800 Subject: tracing: add "power-tracer": C/P state tracer to help power optimization Impact: new "power-tracer" ftrace plugin This patch adds a C/P-state ftrace plugin that will generate detailed statistics about the C/P-states that are being used, so that we can look at detailed decisions that the C/P-state code is making, rather than the too high level "average" that we have today. An example way of using this is: mount -t debugfs none /sys/kernel/debug echo cstate > /sys/kernel/debug/tracing/current_tracer echo 1 > /sys/kernel/debug/tracing/tracing_enabled sleep 1 echo 0 > /sys/kernel/debug/tracing/tracing_enabled cat /sys/kernel/debug/tracing/trace | perl scripts/trace/cstate.pl > out.svg Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 ++++ arch/x86/kernel/process.c | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8e48c5d4467..88ea02dcb62 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; + struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } + trace_power_mark(&it, POWER_PSTATE, next_perf_state); + switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d..c27af49a4ed 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,6 +7,7 @@ #include #include #include +#include #include unsigned long idle_halt; @@ -100,6 +101,9 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -112,6 +116,7 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; + trace_power_end(&it); } else { local_irq_enable(); /* loop is done by the caller */ @@ -154,24 +159,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __mwait(ax, cx); } + trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { + struct power_trace it; if (!need_resched()) { + trace_power_start(&it, POWER_CSTATE, 1); __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __sti_mwait(0, 0); else local_irq_enable(); + trace_power_end(&it); } else local_irq_enable(); } @@ -183,9 +195,13 @@ static void mwait_idle(void) */ static void poll_idle(void) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); + trace_power_end(&it); } /* -- cgit v1.2.3 From 4db646b1af8fdcf01d690d29eeae44cd937edb0d Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 23 Nov 2008 20:49:52 +0100 Subject: x86: microcode: fix sparse warnings Impact: make global variables and a function static Fix following sparse warnings: arch/x86/kernel/microcode_core.c:102:22: warning: symbol 'microcode_ops' was not declared. Should it be static? arch/x86/kernel/microcode_core.c:206:24: warning: symbol 'microcode_pdev' was not declared. Should it be static? arch/x86/kernel/microcode_core.c:322:6: warning: symbol 'microcode_update_cpu' was not declared. Should it be static? arch/x86/kernel/microcode_intel.c:468:22: warning: symbol 'microcode_intel_ops' was not declared. Should it be static? Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 6 +++--- arch/x86/kernel/microcode_intel.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 82fb2809ce3..5b711a53449 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -99,7 +99,7 @@ MODULE_LICENSE("GPL"); #define MICROCODE_VERSION "2.00" -struct microcode_ops *microcode_ops; +static struct microcode_ops *microcode_ops; /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); @@ -203,7 +203,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #endif /* fake device for request_firmware */ -struct platform_device *microcode_pdev; +static struct platform_device *microcode_pdev; static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, @@ -319,7 +319,7 @@ static int microcode_resume_cpu(int cpu) return 0; } -void microcode_update_cpu(int cpu) +static void microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 622dc4a2178..c34c820ee48 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -465,7 +465,7 @@ static void microcode_fini_cpu(int cpu) uci->mc = NULL; } -struct microcode_ops microcode_intel_ops = { +static struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, -- cgit v1.2.3 From 1d9b16d1690fe5edb1c907fe4746681cf026cdf3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 27 Nov 2008 18:39:15 +0100 Subject: x86: move GART specific stuff from iommu.h to gart.h Impact: cleanup Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/include/asm/gart.h | 33 +++++++++++++++++++++++++++++++++ arch/x86/include/asm/iommu.h | 33 --------------------------------- arch/x86/kernel/amd_iommu.c | 1 + arch/x86/kernel/amd_iommu_init.c | 1 + arch/x86/kernel/early-quirks.c | 1 + arch/x86/kernel/pci-dma.c | 1 + arch/x86/kernel/setup.c | 1 + 7 files changed, 38 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 74252264433..6cfdafa409d 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -29,6 +29,39 @@ extern int fix_aperture; #define AMD64_GARTCACHECTL 0x9c #define AMD64_GARTEN (1<<0) +#ifdef CONFIG_GART_IOMMU +extern int gart_iommu_aperture; +extern int gart_iommu_aperture_allowed; +extern int gart_iommu_aperture_disabled; + +extern void early_gart_iommu_check(void); +extern void gart_iommu_init(void); +extern void gart_iommu_shutdown(void); +extern void __init gart_parse_options(char *); +extern void gart_iommu_hole_init(void); + +#else +#define gart_iommu_aperture 0 +#define gart_iommu_aperture_allowed 0 +#define gart_iommu_aperture_disabled 1 + +static inline void early_gart_iommu_check(void) +{ +} +static inline void gart_iommu_init(void) +{ +} +static inline void gart_iommu_shutdown(void) +{ +} +static inline void gart_parse_options(char *options) +{ +} +static inline void gart_iommu_hole_init(void) +{ +} +#endif + extern int agp_amd64_init(void); static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 0b500c5b644..295b13193f4 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -12,37 +12,4 @@ extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len); /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) -#ifdef CONFIG_GART_IOMMU -extern int gart_iommu_aperture; -extern int gart_iommu_aperture_allowed; -extern int gart_iommu_aperture_disabled; - -extern void early_gart_iommu_check(void); -extern void gart_iommu_init(void); -extern void gart_iommu_shutdown(void); -extern void __init gart_parse_options(char *); -extern void gart_iommu_hole_init(void); - -#else -#define gart_iommu_aperture 0 -#define gart_iommu_aperture_allowed 0 -#define gart_iommu_aperture_disabled 1 - -static inline void early_gart_iommu_check(void) -{ -} -static inline void gart_iommu_init(void) -{ -} -static inline void gart_iommu_shutdown(void) -{ -} -static inline void gart_parse_options(char *options) -{ -} -static inline void gart_iommu_hole_init(void) -{ -} -#endif - #endif /* _ASM_X86_IOMMU_H */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 331b318304e..172e0dc4641 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0cdcda35a05..7685f0774a8 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * definitions for the ACPI scanning code diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 1b894b72c0f..744aa7fc49d 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,6 +17,7 @@ #include #include #include +#include static void __init fix_hypertransport_config(int num, int slot, int func) { diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 19262482021..12eeb4bfcde 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd..67d5979e654 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 8caac56305cef98f9357b060a77939d17699937d Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 26 Nov 2008 17:15:27 +0100 Subject: aperture_64.c: clarify that too small aperture is valid reason for this code Impact: update comment Clarify that too small aperture is valid reason for this code. Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9a32b37ee2e..676debfc170 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -1,8 +1,9 @@ /* * Firmware replacement code. * - * Work around broken BIOSes that don't set an aperture or only set the - * aperture in the AGP bridge. + * Work around broken BIOSes that don't set an aperture, only set the + * aperture in the AGP bridge, or set too small aperture. + * * If all fails map the aperture over some low memory. This is cheaper than * doing bounce buffering. The memory is lost. This is done at early boot * because only the bootmem allocator can allocate 32+MB. -- cgit v1.2.3 From 4385cecf1f5866fb33fc95e2ee26a44e9b6f6be2 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Sat, 29 Nov 2008 22:33:16 +0100 Subject: x86: intel_cacheinfo, minor show_type cleanup Impact: cleanup Signed-off-by: Jiri Slaby Cc: Jiri Slaby Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 3f46afbb1cf..68b5d8681cb 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -644,20 +644,17 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { - switch(this_leaf->eax.split.type) { - case CACHE_TYPE_DATA: +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +{ + switch (this_leaf->eax.split.type) { + case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); - break; - case CACHE_TYPE_INST: + case CACHE_TYPE_INST: return sprintf(buf, "Instruction\n"); - break; - case CACHE_TYPE_UNIFIED: + case CACHE_TYPE_UNIFIED: return sprintf(buf, "Unified\n"); - break; - default: + default: return sprintf(buf, "Unknown\n"); - break; } } -- cgit v1.2.3 From 2c5643b1c5c7fbb13f340d4c58944d9642f41796 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Sun, 30 Nov 2008 17:16:04 +0900 Subject: x86: provide readq()/writeq() on 32-bit too Impact: add new API for drivers Add implementation of readq/writeq to x86_32, and add config value to the x86 architecture to determine existence of readq/writeq. Signed-off-by: Hitoshi Mitake Acked-by: Sam Ravnborg Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 ++ arch/x86/include/asm/io.h | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f..a7d50f5d118 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,8 @@ config X86_64 config X86 def_bool y select HAVE_AOUT if X86_32 + select HAVE_READQ + select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index ac2abc88cd9..25946449df4 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -4,6 +4,7 @@ #define ARCH_HAS_IOREMAP_WC #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -57,6 +58,29 @@ build_mmio_write(__writeq, "q", unsigned long, "r", ) /* Let people know we have them */ #define readq readq #define writeq writeq + +#else /* CONFIG_X86_32 from here */ + +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 l, h; + + l = readl(p); + h = readl(p + 1); + + return l + ((u64)h << 32); +} + +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr+4); +} + +#define readq readq +#define writeq writeq + #endif extern int iommu_bio_merge; -- cgit v1.2.3 From a0b1131e479e5af32eefac8bc54c9742e23d638e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 30 Nov 2008 09:33:55 +0100 Subject: x86: provide readq()/writeq() on 32-bit too, cleanup Impact: cleanup Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 25946449df4..3ccfaf610c8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -55,21 +55,17 @@ build_mmio_write(__writeq, "q", unsigned long, "r", ) #define __raw_readq __readq #define __raw_writeq writeq -/* Let people know we have them */ -#define readq readq -#define writeq writeq - #else /* CONFIG_X86_32 from here */ static inline __u64 readq(const volatile void __iomem *addr) { const volatile u32 __iomem *p = addr; - u32 l, h; + u32 low, high; - l = readl(p); - h = readl(p + 1); + low = readl(p); + high = readl(p + 1); - return l + ((u64)h << 32); + return low + ((u64)high << 32); } static inline void writeq(__u64 val, volatile void __iomem *addr) @@ -78,11 +74,12 @@ static inline void writeq(__u64 val, volatile void __iomem *addr) writel(val >> 32, addr+4); } +#endif + +/* Let people know that we have them */ #define readq readq #define writeq writeq -#endif - extern int iommu_bio_merge; #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 93093d099e5dd0c258fd530c12668e828c20df41 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 30 Nov 2008 10:20:20 +0100 Subject: x86: provide readq()/writeq() on 32-bit too, complete if HAVE_READQ/HAVE_WRITEQ are defined, the full range of readq/writeq APIs has to be provided to drivers: drivers/infiniband/hw/amso1100/c2.c: In function 'c2_tx_ring_alloc': drivers/infiniband/hw/amso1100/c2.c:133: error: implicit declaration of function '__raw_writeq' So provide them on 32-bit as well. Also, map all the APIs to the strongest ordering variant. It's way too easy to mess such details up in drivers and the difference between "memory" and "" constrained asm() constructs is in the noise range. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 3ccfaf610c8..33513b9a67f 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -46,16 +46,11 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define mmiowb() barrier() #ifdef CONFIG_X86_64 + build_mmio_read(readq, "q", unsigned long, "=r", :"memory") -build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") -build_mmio_write(__writeq, "q", unsigned long, "r", ) - -#define readq_relaxed(a) __readq(a) -#define __raw_readq __readq -#define __raw_writeq writeq -#else /* CONFIG_X86_32 from here */ +#else static inline __u64 readq(const volatile void __iomem *addr) { @@ -76,9 +71,14 @@ static inline void writeq(__u64 val, volatile void __iomem *addr) #endif +#define readq_relaxed(a) readq(a) + +#define __raw_readq(a) readq(a) +#define __raw_writeq(val, addr) writeq(val, addr) + /* Let people know that we have them */ -#define readq readq -#define writeq writeq +#define readq readq +#define writeq writeq extern int iommu_bio_merge; -- cgit v1.2.3 From 50cec5c51c18301ff60262fdbe920f4a907c9d81 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 2 Dec 2008 02:17:15 +0900 Subject: x86: fix dma_mapping_error for 32bit x86, cleanup This removes ifdef CONFIG_X86_64 in dma_mapping_error(): 1) Xen people plan to use swiotlb on X86_32 for Dom0 support. swiotlb uses ops->mapping_error so X86_32 also needs to check ops->mapping_error. 2) Removing #ifdef hack is almost always a good thing. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dma-mapping.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 097794ff6b7..dc22c073328 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -71,12 +71,10 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) /* Make sure we keep the same behaviour */ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { -#ifdef CONFIG_X86_64 struct dma_mapping_ops *ops = get_dma_ops(dev); if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); -#endif return (dma_addr == bad_dma_address); } -- cgit v1.2.3 From 48d68b20d00865035b8b65e69af343d0f53fac9d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Dec 2008 00:20:39 +0100 Subject: tracing/function-graph-tracer: support for x86-64 Impact: extend and enable the function graph tracer to 64-bit x86 This patch implements the support for function graph tracer under x86-64. Both static and dynamic tracing are supported. This causes some small CPP conditional asm on arch/x86/kernel/ftrace.c I wanted to use probe_kernel_read/write to make the return address saving/patching code more generic but it causes tracing recursion. That would be perhaps useful to implement a notrace version of these function for other archs ports. Note that arch/x86/process_64.c is not traced, as in X86-32. I first thought __switch_to() was responsible of crashes during tracing because I believed current task were changed inside but that's actually not the case (actually yes, but not the "current" pointer). So I will have to investigate to find the functions that harm here, to enable tracing of the other functions inside (but there is no issue at this time, while process_64.c stays out of -pg flags). A little possible race condition is fixed inside this patch too. When the tracer allocate a return stack dynamically, the current depth is not initialized before but after. An interrupt could occur at this time and, after seeing that the return stack is allocated, the tracer could try to trace it with a random uninitialized depth. It's a prevention, even if I hadn't problems with it. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tim Bird Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/entry_64.S | 74 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ftrace.c | 11 ++++++- 4 files changed, 86 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0842b112768..45c86fb9413 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,7 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_GRAPH_TRACER if X86_32 + select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 64939a0c398..d274425fb07 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,6 +17,7 @@ endif ifdef CONFIG_FUNCTION_GRAPH_TRACER # Don't trace __switch_to() but let it for function tracer CFLAGS_REMOVE_process_32.o = -pg +CFLAGS_REMOVE_process_64.o = -pg endif # diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 08aa6b10933..2aa0526ac30 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -98,6 +98,12 @@ ftrace_call: movq (%rsp), %rax addq $0x38, %rsp +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + .globl ftrace_stub ftrace_stub: retq @@ -110,6 +116,12 @@ ENTRY(mcount) cmpq $ftrace_stub, ftrace_trace_function jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller +#endif + .globl ftrace_stub ftrace_stub: retq @@ -145,6 +157,68 @@ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + + call prepare_ftrace_return + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + retq +END(ftrace_graph_caller) + + +.globl return_to_handler +return_to_handler: + subq $80, %rsp + + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + movq %r11, 64(%rsp) + + call ftrace_return_to_handler + + movq %rax, 72(%rsp) + movq 64(%rsp), %r11 + movq 56(%rsp), %r10 + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $72, %rsp + retq +#endif + + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 7ef914e6a2f..58832478b94 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -467,8 +467,13 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) * ignore such a protection. */ asm volatile( +#ifdef CONFIG_X86_64 + "1: movq (%[parent_old]), %[old]\n" + "2: movq %[return_hooker], (%[parent_replaced])\n" +#else "1: movl (%[parent_old]), %[old]\n" "2: movl %[return_hooker], (%[parent_replaced])\n" +#endif " movl $0, %[faulted]\n" ".section .fixup, \"ax\"\n" @@ -476,8 +481,13 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ".previous\n" ".section __ex_table, \"a\"\n" +#ifdef CONFIG_X86_64 + " .quad 1b, 3b\n" + " .quad 2b, 3b\n" +#else " .long 1b, 3b\n" " .long 2b, 3b\n" +#endif ".previous\n" : [parent_replaced] "=r" (parent), [old] "=r" (old), @@ -509,5 +519,4 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ftrace_graph_entry(&trace); } - #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3 From 8daa19051e1c7369c89ace7b18e74fe1f55dfa29 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Mon, 1 Dec 2008 14:13:53 -0800 Subject: x86, apm: remove CONFIG_APM_REAL_MODE_POWER_OFF in favor of a kernel parameter Remove CONFIG_APM_REAL_MODE_POWER_OFF like CONFIG_APM_POWER_OFF which has been done for linux-2.2.14pre8 (http://lkml.org/lkml/1999/11/23/3). Re-introducing CONFIG_APM_POWER_OFF got nack-ed. Stephen didn't bother to remove CONFIG_APM_REAL_MODE_POWER_OFF, let's get rid of it now. Reference: http://lkml.org/lkml/2008/5/7/97 Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 7 ------- arch/x86/kernel/apm_32.c | 4 ---- 2 files changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4cf0ab13d18..ebcad15ccf3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1629,13 +1629,6 @@ config APM_ALLOW_INTS many of the newer IBM Thinkpads. If you experience hangs when you suspend, try setting this to Y. Otherwise, say N. -config APM_REAL_MODE_POWER_OFF - bool "Use real mode APM BIOS call to power off" - help - Use real mode APM BIOS calls to switch off the computer. This is - a work-around for a number of buggy BIOSes. Switch this option on if - your computer crashes instead of powering off properly. - endif # APM source "arch/x86/kernel/cpu/cpufreq/Kconfig" diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5145a6e72bb..3a26525a3f3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -391,11 +391,7 @@ static int power_off; #else static int power_off = 1; #endif -#ifdef CONFIG_APM_REAL_MODE_POWER_OFF -static int realmode_power_off = 1; -#else static int realmode_power_off; -#endif #ifdef CONFIG_APM_ALLOW_INTS static int allow_ints = 1; #else -- cgit v1.2.3 From dcb7731a185efbf3d800618d874af99895df5afb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 2 Dec 2008 20:16:03 +0100 Subject: x86: fix broken flushing in GART nofullflush path Impact: remove stale IOTLB entries In the non-default nofullflush case the GART is only flushed when next_bit wraps around. But it can happen that an unmap operation unmaps memory which is behind the current next_bit location. If these addresses are reused it may result in stale GART IO/TLB entries. Fix this by setting the GART next_bit always behind an unmapped location. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a42b02b4df6..ba7ad83e20a 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -123,6 +123,8 @@ static void free_iommu(unsigned long offset, int size) spin_lock_irqsave(&iommu_bitmap_lock, flags); iommu_area_free(iommu_gart_bitmap, offset, size); + if (offset >= next_bit) + next_bit = offset + size; spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } -- cgit v1.2.3 From 181de82ee3ffda1175f89d50c991dae31b79280c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 3 Dec 2008 14:53:04 +0900 Subject: x86: remove dead BIO_VMERGE_BOUNDARY definition Impact: cleanup, remove dead code The block layer dropped the virtual merge feature (b8b3e16cfe6435d961f6aaebcfd52a1ff2a988c5). BIO_VMERGE_BOUNDARY definition is meaningless now. Signed-off-by: FUJITA Tomonori Acked-by: Jens Axboe Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 2 -- arch/x86/include/asm/io_64.h | 2 -- arch/x86/kernel/pci-dma.c | 6 ------ 3 files changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 33513b9a67f..05cfed4485f 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -80,8 +80,6 @@ static inline void writeq(__u64 val, volatile void __iomem *addr) #define readq readq #define writeq writeq -extern int iommu_bio_merge; - #ifdef CONFIG_X86_32 # include "io_32.h" #else diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h index fea325a1122..563c16270ba 100644 --- a/arch/x86/include/asm/io_64.h +++ b/arch/x86/include/asm/io_64.h @@ -232,8 +232,6 @@ void memset_io(volatile void __iomem *a, int b, size_t c); #define flush_write_buffers() -#define BIO_VMERGE_BOUNDARY iommu_bio_merge - /* * Convert a virtual cached pointer to an uncached pointer */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 12eeb4bfcde..da93c65f8f0 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -31,11 +31,6 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -/* This tells the BIO block layer to assume merging. Default to off - because we cannot guarantee merging later. */ -int iommu_bio_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_bio_merge); - dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -189,7 +184,6 @@ static __init int iommu_setup(char *p) } if (!strncmp(p, "biomerge", 8)) { - iommu_bio_merge = 4096; iommu_merge = 1; force_iommu = 1; } -- cgit v1.2.3 From 347fdd9dd4e5d3f3a4e415925c35bdff1d59c3a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:08 -0500 Subject: ftrace: clean up function graph asm Impact: clean up There exists macros for x86 asm to handle x86_64 and i386. This patch updates function graph asm to use them. Signed-off-by: Steven Rostedt Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 58832478b94..1a5b8f8cb3c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -467,28 +467,16 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) * ignore such a protection. */ asm volatile( -#ifdef CONFIG_X86_64 - "1: movq (%[parent_old]), %[old]\n" - "2: movq %[return_hooker], (%[parent_replaced])\n" -#else - "1: movl (%[parent_old]), %[old]\n" - "2: movl %[return_hooker], (%[parent_replaced])\n" -#endif + "1: " _ASM_MOV " (%[parent_old]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n" " movl $0, %[faulted]\n" ".section .fixup, \"ax\"\n" "3: movl $1, %[faulted]\n" ".previous\n" - ".section __ex_table, \"a\"\n" -#ifdef CONFIG_X86_64 - " .quad 1b, 3b\n" - " .quad 2b, 3b\n" -#else - " .long 1b, 3b\n" - " .long 2b, 3b\n" -#endif - ".previous\n" + _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE(2b, 3b) : [parent_replaced] "=r" (parent), [old] "=r" (old), [faulted] "=r" (faulted) -- cgit v1.2.3 From bb4304c71c97bf727ec43cd2f195c2c237c27fd3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:09 -0500 Subject: ftrace: have function graph use mcount caller address Impact: consistency change for function graph This patch makes function graph record the mcount caller address the same way the function tracer does. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + arch/x86/kernel/entry_64.S | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 958af86186c..826682abed1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1230,6 +1230,7 @@ ENTRY(ftrace_graph_caller) pushl %edx movl 0xc(%esp), %edx lea 0x4(%ebp), %eax + subl $MCOUNT_INSN_SIZE, %edx call prepare_ftrace_return popl %edx popl %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2aa0526ac30..9060ba6497e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -173,6 +173,7 @@ ENTRY(ftrace_graph_caller) leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi + subq $MCOUNT_INSN_SIZE, %rsi call prepare_ftrace_return -- cgit v1.2.3 From 14a866c567e040ccf6240d68b083dd1dbbde63e6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:02 -0500 Subject: ftrace: add ftrace_graph_stop() Impact: new ftrace_graph_stop function While developing more features of function graph, I hit a bug that caused the WARN_ON to trigger in the prepare_ftrace_return function. Well, it was hard for me to find out that was happening because the bug would not print, it would just cause a hard lockup or reboot. The reason is that it is not safe to call printk from this function. Looking further, I also found that it calls unregister_ftrace_graph, which grabs a mutex and calls kstop machine. This would definitely lock the box up if it were to trigger. This patch adds a fast and safe ftrace_graph_stop() which will stop the function tracer. Then it is safe to call the WARN ON. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1a5b8f8cb3c..adba8e9a427 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -484,14 +484,16 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) : "memory" ); - if (WARN_ON(faulted)) { - unregister_ftrace_graph(); + if (unlikely(faulted)) { + ftrace_graph_stop(); + WARN_ON(1); return; } - if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_graph(); + if (unlikely(!__kernel_text_address(old))) { + ftrace_graph_stop(); *parent = old; + WARN_ON(1); return; } -- cgit v1.2.3 From 7ee991fbc6f947e9b04f29c9c6c1d057d0671a16 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:04 -0500 Subject: ftrace: print real return in dumpstack for function graph Impact: better dumpstack output I noticed in my crash dumps and even in the stack tracer that a lot of functions listed in the stack trace are simply return_to_handler which is ftrace graphs way to insert its own call into the return of a function. But we lose out where the actually function was called from. This patch adds in hooks to the dumpstack mechanism that detects this and finds the real function to print. Both are printed to let the user know that a hook is still in place. This does give a funny side effect in the stack tracer output: Depth Size Location (80 entries) ----- ---- -------- 0) 4144 48 save_stack_trace+0x2f/0x4d 1) 4096 128 ftrace_call+0x5/0x2b 2) 3968 16 mempool_alloc_slab+0x16/0x18 3) 3952 384 return_to_handler+0x0/0x73 4) 3568 -240 stack_trace_call+0x11d/0x209 5) 3808 144 return_to_handler+0x0/0x73 6) 3664 -128 mempool_alloc+0x4d/0xfe 7) 3792 128 return_to_handler+0x0/0x73 8) 3664 -32 scsi_sg_alloc+0x48/0x4a [scsi_mod] As you can see, the real functions are now negative. This is due to them not being found inside the stack. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 34 +++++++++++++++++++++++++++++++++- arch/x86/kernel/dumpstack.h | 2 +- arch/x86/kernel/dumpstack_32.c | 5 ++++- arch/x86/kernel/dumpstack_64.c | 7 ++++--- 4 files changed, 42 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 5962176dfab..6b1f6f6f866 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -30,6 +30,37 @@ void printk_address(unsigned long address, int reliable) reliable ? "" : "? ", (void *) address); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ + struct task_struct *task = tinfo->task; + unsigned long ret_addr; + int index = task->curr_ret_stack; + + if (addr != (unsigned long)return_to_handler) + return; + + if (!task->ret_stack || index < *graph) + return; + + index -= *graph; + ret_addr = task->ret_stack[index].ret; + + ops->address(data, ret_addr, 1); + + (*graph)++; +} +#else +static inline void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ } +#endif + /* * x86-64 can have up to three kernel stacks: * process stack @@ -54,7 +85,7 @@ unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end) + unsigned long *end, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -70,6 +101,7 @@ print_context_stack(struct thread_info *tinfo, } else { ops->address(data, addr, bp == 0); } + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); } stack++; } diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 3119a801c32..da87590b869 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -18,7 +18,7 @@ extern unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end); + unsigned long *end, int *graph); extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 7b031b106ec..d593cd1f58d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -23,6 +23,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { + int graph = 0; + if (!task) task = current; @@ -50,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data, NULL); + bp = print_context_stack(context, stack, bp, ops, + data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 33ff10287a5..c302d070704 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -109,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; + int graph = 0; if (!task) task = current; @@ -149,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, break; bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); + data, estack_end, &graph); ops->stack(data, ""); /* * We link to the next stack via the @@ -168,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); + ops, data, irqstack_end, &graph); /* * We link to the next stack (which would be * the process stack normally) the last @@ -186,7 +187,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); -- cgit v1.2.3 From e49dc19c6a19ea112fcb94b7c62ec62cdd5c08aa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:05 -0500 Subject: ftrace: function graph return for function entry Impact: feature, let entry function decide to trace or not This patch lets the graph tracer entry function decide if the tracing should be done at the end as well. This requires all function graph entry functions return 1 if it should trace, or 0 if the return should not be traced. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 3 +++ arch/x86/kernel/entry_64.S | 3 +++ arch/x86/kernel/ftrace.c | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 826682abed1..43ceb3f454b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1196,6 +1196,9 @@ ENTRY(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpl $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 9060ba6497e..54e0bbdccb9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -120,6 +120,9 @@ ENTRY(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpq $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller #endif .globl ftrace_stub diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index adba8e9a427..d278ad2ebda 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -425,6 +425,7 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = index; + barrier(); current->curr_ret_stack--; } @@ -506,7 +507,11 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) } trace.func = self_addr; - ftrace_graph_entry(&trace); + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + current->curr_ret_stack--; + *parent = old; + } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3 From 62679efe0a5f02987a621942afc5979a80a6ca5a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:06 -0500 Subject: ftrace: add checks on ret stack in function graph Import: robustness checks Add more checks in the function graph code to detect errors and perhaps print out better information if a bug happens. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d278ad2ebda..f98c4076a17 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -420,6 +420,15 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) int index; index = current->curr_ret_stack; + + if (unlikely(index < 0)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + *ret = current->ret_stack[index].ret; trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; @@ -427,6 +436,7 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->depth = index; barrier(); current->curr_ret_stack--; + } /* @@ -442,6 +452,13 @@ unsigned long ftrace_return_to_handler(void) trace.rettime = cpu_clock(raw_smp_processor_id()); ftrace_graph_return(&trace); + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + return ret; } -- cgit v1.2.3 From affa219b60a11b3295637a97f5b1b8ef231490fc Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Wed, 3 Dec 2008 18:58:19 -0500 Subject: x86: change thread_info's flag field back to 32 bits Impact: pack struct thread_info more tightly Change x86_64's thread_info 'flags' field back to __u32. This was changed to 'unsigned long' when the thread_info*.h for i386 and x86_64 were merged. Change it back. We can do this as only 27 bits of 'flags' are actually used. This change actually packs down thread_info by 64 bits: 32 bits are saved by the smaller flags, and 32 bits are saved by the following 'mm_segment_t field' becoming naturally 64-bit aligned. Signed-off-by: Joe Korty Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad..8dbc57390d2 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -24,7 +24,7 @@ struct exec_domain; struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - unsigned long flags; /* low level flags */ + __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, -- cgit v1.2.3 From 55c395b47042e12d5c25aa07f271f56ffe44f793 Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Fri, 5 Dec 2008 14:42:20 +0300 Subject: x86: fix missing space in printk Just come across this when booting on an old hw.. Looks somewhat ugly, that single missing space ;) Signed-off-by: Michael Tokarev Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7b109339731..1a3c3253f0e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1086,8 +1086,10 @@ static int __init smp_sanity_check(unsigned max_cpus) #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING "weird, boot CPU (#%d) not listed" - "by the BIOS.\n", hard_smp_processor_id()); + printk(KERN_WARNING + "weird, boot CPU (#%d) not listed by the BIOS.\n", + hard_smp_processor_id()); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); } -- cgit v1.2.3 From a0286c94f07636380082608196d41dd725a83229 Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Fri, 5 Dec 2008 15:47:29 +0300 Subject: x86: fix missing space in printk, #2 Impact: clean up printk Signed-off-by: Michael Tokarev Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 19262482021..dc572994703 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -300,8 +300,8 @@ fs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected." - "Disabling DAC.\n"); + printk(KERN_INFO + "PCI: VIA PCI bridge detected. Disabling DAC.\n"); forbid_dac = 1; } } -- cgit v1.2.3 From 3e1e9002aa8b32bd4c95ac6c8fad376b7a8127fb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 8 Dec 2008 00:50:22 +0100 Subject: x86: change static allocation of trampoline area Impact: fix trampoline sizing bug, save space While debugging a suspend-to-RAM related issue it occured to me that if the trampoline code had grown past 4 KB, we would have been allocating too little memory for it, since the 4 KB size of the trampoline is hardcoded into arch/x86/kernel/e820.c . Change that by making the kernel compute the trampoline size and allocate as much memory as necessary. Signed-off-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trampoline.h | 7 +++++++ arch/x86/kernel/e820.c | 16 ---------------- arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/head64.c | 3 +++ arch/x86/kernel/trampoline.c | 19 +++++++++++++++++-- 5 files changed, 30 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index fa0d79facdb..780ba0ab94f 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -3,6 +3,7 @@ #ifndef __ASSEMBLY__ +#ifdef CONFIG_X86_TRAMPOLINE /* * Trampoline 80x86 program as an array. */ @@ -13,8 +14,14 @@ extern unsigned char *trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; +#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) #define TRAMPOLINE_BASE 0x6000 + extern unsigned long setup_trampoline(void); +extern void __init reserve_trampoline_memory(void); +#else +static inline void reserve_trampoline_memory(void) {}; +#endif /* CONFIG_X86_TRAMPOLINE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7aafeb5263e..65a13943e09 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -677,22 +677,6 @@ struct early_res { }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE) - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, -#endif -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" }, - /* - * Has to be in very low memory so we can execute - * real-mode AP code. - */ - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" }, -#endif {} }; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index fa1d25dd83e..ac108d1fe18 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -12,9 +12,12 @@ #include #include #include +#include void __init i386_start_kernel(void) { + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index d16084f9064..388e05a5fc1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,6 +24,7 @@ #include #include #include +#include /* boot cpu pda */ static struct x8664_pda _boot_cpu_pda __read_mostly; @@ -120,6 +121,8 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 1106fac6024..808031a5ba1 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -1,10 +1,26 @@ #include #include +#include /* ready for x86_64 and x86 */ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); +void __init reserve_trampoline_memory(void) +{ +#ifdef CONFIG_X86_32 + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); +#endif + /* Has to be in very low memory so we can execute real-mode AP code. */ + reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, + "TRAMPOLINE"); +} + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller @@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); */ unsigned long setup_trampoline(void) { - memcpy(trampoline_base, trampoline_data, - trampoline_end - trampoline_data); + memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } -- cgit v1.2.3 From 69b88afa8d114a43a3c0431722b79e31d9920692 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 5 Dec 2008 22:45:50 -0800 Subject: x86: clean up get_smp_config() Impact: cleanup reorder exit path in __get_smp_config(). also move two print outs to acpi_process_madt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 11 +++++++++++ arch/x86/kernel/mpparse.c | 25 +++++++++++-------------- 2 files changed, 22 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4c51a2f8fd3..65d0b72777e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1360,6 +1360,17 @@ static void __init acpi_process_madt(void) disable_acpi(); } } + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " + "information\n"); + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) " + "configuration information\n"); #endif return; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0f4c1fd5a1f..45e3b69808b 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -586,26 +586,23 @@ static void __init __get_smp_config(unsigned int early) { struct intel_mp_floating *mpf = mpf_found; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } + if (!mpf) + return; + if (acpi_lapic && early) return; + /* - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " - "information\n"); + if (acpi_lapic && acpi_ioapic) return; - } else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) " - "configuration information\n"); - if (!mpf) - return; + if (x86_quirks->mach_get_smp_config) { + if (x86_quirks->mach_get_smp_config(early)) + return; + } printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); -- cgit v1.2.3 From 8b96f0119818964e4944fd1c423bf6770027d3ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Dec 2008 03:40:00 +0100 Subject: tracing/function-graph-tracer: introduce __notrace_funcgraph to filter special functions Impact: trace more functions When the function graph tracer is configured, three more files are not traced to prevent only four functions to be traced. And this impacts the normal function tracer too. arch/x86/kernel/process_64/32.c: I had crashes when I let this file traced. After some debugging, I saw that the "current" task point was changed inside__swtich_to(), ie: "write_pda(pcurrent, next_p);" inside process_64.c Since the tracer store the original return address of the function inside current, we had crashes. Only __switch_to() has to be excluded from tracing. kernel/module.c and kernel/extable.c: Because of a function used internally by the function graph tracer: __kernel_text_address() To let the other functions inside these files to be traced, this patch introduces the __notrace_funcgraph function prefix which is __notrace if function graph tracer is configured and nothing if not. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 6 ------ arch/x86/kernel/process_32.c | 4 +++- arch/x86/kernel/process_64.c | 4 +++- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a3049da6198..1cad9318d21 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,12 +14,6 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif -ifdef CONFIG_FUNCTION_GRAPH_TRACER -# Don't trace __switch_to() but let it for function tracer -CFLAGS_REMOVE_process_32.o = -pg -CFLAGS_REMOVE_process_64.o = -pg -endif - # # vsyscalls (which work on the user stack) should have # no stack-protector checks: diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0a1302fe6d4..24c2276aa45 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c958120fb1b..fbb321d53d3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. */ -struct task_struct * +__notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; -- cgit v1.2.3 From 380c4b1411ccd6885f92b2c8ceb08433a720f44e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Dec 2008 03:43:41 +0100 Subject: tracing/function-graph-tracer: append the tracing_graph_flag Impact: Provide a way to pause the function graph tracer As suggested by Steven Rostedt, the previous patch that prevented from spinlock function tracing shouldn't use the raw_spinlock to fix it. It's much better to follow lockdep with normal spinlock, so this patch adds a new flag for each task to make the function graph tracer able to be paused. We also can send an ftrace_printk whithout worrying of the irrelevant traced spinlock during insertion. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index f98c4076a17..1b43086b097 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -476,7 +476,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) &return_to_handler; /* Nmi's are currently unsupported */ - if (atomic_read(&in_nmi)) + if (unlikely(atomic_read(&in_nmi))) + return; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; /* -- cgit v1.2.3 From b0884e25fe361f2ca228808fb5fd1b74cb04e711 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:45:23 +0100 Subject: x86, bts: turn BUG_ON into WARN_ON_ONCE Impact: make the ds code more debuggable Turn BUG_ON's into WARN_ON_ONCE. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 4 ++-- arch/x86/kernel/ptrace.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 19a8c2c0389..09530698866 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -452,7 +452,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) { - BUG_ON(tracer->context->owner[qual] != tracer); + WARN_ON_ONCE(tracer->context->owner[qual] != tracer); tracer->context->owner[qual] = NULL; put_tracer(tracer->context->task); @@ -774,7 +774,7 @@ ds_configure(const struct ds_configuration *cfg) printk(KERN_INFO "DS available\n"); - BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); + WARN_ON_ONCE(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2c8ec1ba75e..b2998fe1166 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -878,7 +878,8 @@ static int ptrace_bts_write_record(struct task_struct *child, { unsigned char bts_record[BTS_MAX_RECORD_SIZE]; - BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); + if (BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts) + return -EOVERFLOW; memset(bts_record, 0, bts_cfg.sizeof_bts); switch (in->qualifier) { @@ -1133,7 +1134,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) ret = ds_get_bts_index(child->bts, &size); if (ret == 0) { - BUG_ON(size != (int) size); + WARN_ON_ONCE(size != (int) size); ret = (int) size; } break; -- cgit v1.2.3 From c2724775ce57c98b8af9694857b941dc61056516 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:49:59 +0100 Subject: x86, bts: provide in-kernel branch-trace interface Impact: cleanup Move the BTS bits from ptrace.c into ds.c. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 241 ++++++----- arch/x86/include/asm/processor.h | 13 + arch/x86/include/asm/ptrace.h | 36 -- arch/x86/include/asm/thread_info.h | 5 +- arch/x86/kernel/cpu/intel.c | 4 - arch/x86/kernel/ds.c | 857 +++++++++++++++++++++++-------------- arch/x86/kernel/process_32.c | 59 +-- arch/x86/kernel/process_64.c | 50 +-- arch/x86/kernel/ptrace.c | 416 +++++------------- 9 files changed, 810 insertions(+), 871 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 99b6c39774a..ee0ea3a96c1 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -6,13 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS + * - DS and BTS hardware configuration * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all traced tasks - * - current is allowed to trace tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. @@ -31,6 +31,7 @@ #ifdef CONFIG_X86_DS struct task_struct; +struct ds_context; struct ds_tracer; struct bts_tracer; struct pebs_tracer; @@ -38,6 +39,38 @@ struct pebs_tracer; typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); + +/* + * A list of features plus corresponding macros to talk about them in + * the ds_request function's flags parameter. + * + * We use the enum to index an array of corresponding control bits; + * we use the macro to index a flags bit-vector. + */ +enum ds_feature { + dsf_bts = 0, + dsf_bts_kernel, +#define BTS_KERNEL (1 << dsf_bts_kernel) + /* trace kernel-mode branches */ + + dsf_bts_user, +#define BTS_USER (1 << dsf_bts_user) + /* trace user-mode branches */ + + dsf_bts_overflow, + dsf_bts_max, + dsf_pebs = dsf_bts_max, + + dsf_pebs_max, + dsf_ctl_max = dsf_pebs_max, + dsf_bts_timestamps = dsf_ctl_max, +#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps) + /* add timestamps into BTS trace */ + +#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS) +}; + + /* * Request BTS or PEBS * @@ -58,92 +91,135 @@ typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); * NULL if cyclic buffer requested * th: the interrupt threshold in records from the end of the buffer; * -1 if no interrupt threshold is requested. + * flags: a bit-mask of the above flags */ extern struct bts_tracer *ds_request_bts(struct task_struct *task, void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th); + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, void *base, size_t size, pebs_ovfl_callback_t ovfl, - size_t th); + size_t th, unsigned int flags); /* * Release BTS or PEBS resources - * - * Returns 0 on success; -Eerrno otherwise + * Suspend and resume BTS or PEBS tracing * * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_release_bts(struct bts_tracer *tracer); -extern int ds_release_pebs(struct pebs_tracer *tracer); +extern void ds_release_bts(struct bts_tracer *tracer); +extern void ds_suspend_bts(struct bts_tracer *tracer); +extern void ds_resume_bts(struct bts_tracer *tracer); +extern void ds_release_pebs(struct pebs_tracer *tracer); +extern void ds_suspend_pebs(struct pebs_tracer *tracer); +extern void ds_resume_pebs(struct pebs_tracer *tracer); + /* - * Get the (array) index of the write pointer. - * (assuming an array of BTS/PEBS records) - * - * Returns 0 on success; -Eerrno on error + * The raw DS buffer state as it is used for BTS and PEBS recording. * - * tracer: the tracer handle returned from ds_request_~() - * pos (out): will hold the result + * This is the low-level, arch-dependent interface for working + * directly on the raw trace data. */ -extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); -extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos); +struct ds_trace { + /* the number of bts/pebs records */ + size_t n; + /* the size of a bts/pebs record in bytes */ + size_t size; + /* pointers into the raw buffer: + - to the first entry */ + void *begin; + /* - one beyond the last entry */ + void *end; + /* - one beyond the newest entry */ + void *top; + /* - the interrupt threshold */ + void *ith; + /* flags given on ds_request() */ + unsigned int flags; +}; /* - * Get the (array) index one record beyond the end of the array. - * (assuming an array of BTS/PEBS records) - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_~() - * pos (out): will hold the result + * An arch-independent view on branch trace data. */ -extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); -extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos); +enum bts_qualifier { + bts_invalid, +#define BTS_INVALID bts_invalid + + bts_branch, +#define BTS_BRANCH bts_branch + + bts_task_arrives, +#define BTS_TASK_ARRIVES bts_task_arrives + + bts_task_departs, +#define BTS_TASK_DEPARTS bts_task_departs + + bts_qual_bit_size = 4, + bts_qual_max = (1 << bts_qual_bit_size), +}; + +struct bts_struct { + __u64 qualifier; + union { + /* BTS_BRANCH */ + struct { + __u64 from; + __u64 to; + } lbr; + /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ + struct { + __u64 jiffies; + pid_t pid; + } timestamp; + } variant; +}; + /* - * Provide a pointer to the BTS/PEBS record at parameter index. - * (assuming an array of BTS/PEBS records) - * - * The pointer points directly into the buffer. The user is - * responsible for copying the record. - * - * Returns the size of a single record on success; -Eerrno on error + * The BTS state. * - * tracer: the tracer handle returned from ds_request_~() - * index: the index of the requested record - * record (out): pointer to the requested record + * This gives access to the raw DS state and adds functions to provide + * an arch-independent view of the BTS data. */ -extern int ds_access_bts(struct bts_tracer *tracer, - size_t index, const void **record); -extern int ds_access_pebs(struct pebs_tracer *tracer, - size_t index, const void **record); +struct bts_trace { + struct ds_trace ds; + + int (*read)(struct bts_tracer *tracer, const void *at, + struct bts_struct *out); + int (*write)(struct bts_tracer *tracer, const struct bts_struct *in); +}; + /* - * Write one or more BTS/PEBS records at the write pointer index and - * advance the write pointer. + * The PEBS state. * - * If size is not a multiple of the record size, trailing bytes are - * zeroed out. - * - * May result in one or more overflow notifications. - * - * If called during overflow handling, that is, with index >= - * interrupt threshold, the write will wrap around. + * This gives access to the raw DS state and the PEBS-specific counter + * reset value. + */ +struct pebs_trace { + struct ds_trace ds; + + /* the PEBS reset value */ + unsigned long long reset_value; +}; + + +/* + * Read the BTS or PEBS trace. * - * An overflow notification is given if and when the interrupt - * threshold is reached during or after the write. + * Returns a view on the trace collected for the parameter tracer. * - * Returns the number of bytes written or -Eerrno. + * The view remains valid as long as the traced task is not running or + * the tracer is suspended. + * Writes into the trace buffer are not reflected. * * tracer: the tracer handle returned from ds_request_~() - * buffer: the buffer to write - * size: the size of the buffer */ -extern int ds_write_bts(struct bts_tracer *tracer, - const void *buffer, size_t size); -extern int ds_write_pebs(struct pebs_tracer *tracer, - const void *buffer, size_t size); +extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer); +extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer); + /* * Reset the write pointer of the BTS/PEBS buffer. @@ -155,27 +231,6 @@ extern int ds_write_pebs(struct pebs_tracer *tracer, extern int ds_reset_bts(struct bts_tracer *tracer); extern int ds_reset_pebs(struct pebs_tracer *tracer); -/* - * Clear the BTS/PEBS buffer and reset the write pointer. - * The entire buffer will be zeroed out. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_~() - */ -extern int ds_clear_bts(struct bts_tracer *tracer); -extern int ds_clear_pebs(struct pebs_tracer *tracer); - -/* - * Provide the PEBS counter reset value. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_pebs() - * value (out): the counter reset value - */ -extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value); - /* * Set the PEBS counter reset value. * @@ -192,35 +247,17 @@ extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); struct cpuinfo_x86; extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); - - /* - * The DS context - part of struct thread_struct. + * Context switch work */ -#define MAX_SIZEOF_DS (12 * 8) - -struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char ds[MAX_SIZEOF_DS]; - /* the owner of the BTS and PEBS configuration, respectively */ - struct ds_tracer *owner[2]; - /* use count */ - unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ - struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ - struct task_struct *task; -}; - -/* called by exit_thread() to free leftover contexts */ -extern void ds_free(struct ds_context *context); +extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} +static inline void ds_switch_to(struct task_struct *prev, + struct task_struct *next) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ca01e38326..aa5914f8e50 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -752,6 +752,19 @@ extern void switch_to_new_gdt(void); extern void cpu_init(void); extern void init_gdt(int cpu); +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index eefb0594b05..fbf74421591 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -6,7 +6,6 @@ #include #ifdef __KERNEL__ -#include /* the DS BTS struct is used for ptrace too */ #include #endif @@ -128,34 +127,6 @@ struct pt_regs { #endif /* !__i386__ */ -#ifdef CONFIG_X86_PTRACE_BTS -/* a branch trace record entry - * - * In order to unify the interface between various processor versions, - * we use the below data structure for all processors. - */ -enum bts_qualifier { - BTS_INVALID = 0, - BTS_BRANCH, - BTS_TASK_ARRIVES, - BTS_TASK_DEPARTS -}; - -struct bts_struct { - __u64 qualifier; - union { - /* BTS_BRANCH */ - struct { - __u64 from_ip; - __u64 to_ip; - } lbr; - /* BTS_TASK_ARRIVES or - BTS_TASK_DEPARTS */ - __u64 jiffies; - } variant; -}; -#endif /* CONFIG_X86_PTRACE_BTS */ - #ifdef __KERNEL__ #include @@ -163,13 +134,6 @@ struct bts_struct { struct cpuinfo_x86; struct task_struct; -#ifdef CONFIG_X86_PTRACE_BTS -extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *); -extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier); -#else -#define ptrace_bts_init_intel(config) do {} while (0) -#endif /* CONFIG_X86_PTRACE_BTS */ - extern unsigned long profile_pc(struct pt_regs *regs); extern unsigned long diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0921b4018c1..bf8113d16a3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -93,7 +93,6 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ -#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -115,7 +114,6 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) -#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -141,8 +139,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ - _TIF_NOTSC) + (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 816f27f289b..cd413d9a021 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -309,9 +308,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P3); #endif - if (cpu_has_bts) - ptrace_bts_init_intel(c); - detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 09530698866..f0583005b75 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -6,13 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS + * - DS and BTS hardware configuration * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all traced tasks - * - current is allowed to trace tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. @@ -34,15 +34,30 @@ * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the size of the DS structure in bytes */ - unsigned char sizeof_ds; - /* the size of one pointer-typed field in the DS structure in bytes; - this covers the first 8 fields related to buffer management. */ + /* the name of the configuration */ + const char *name; + /* the size of one pointer-typed field in the DS structure and + in the BTS and PEBS buffers in bytes; + this covers the first 8 DS fields related to buffer management. */ unsigned char sizeof_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; + /* a series of bit-masks to control various features indexed + * by enum ds_feature */ + unsigned long ctl[dsf_ctl_max]; }; -static struct ds_configuration ds_cfg; +static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); + +#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) + +#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ +#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ + +#define BTS_CONTROL \ + (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ + ds_cfg.ctl[dsf_bts_overflow]) + /* * A BTS or PEBS tracer. @@ -61,6 +76,8 @@ struct ds_tracer { struct bts_tracer { /* the common DS part */ struct ds_tracer ds; + /* the trace including the DS configuration */ + struct bts_trace trace; /* buffer overflow notification function */ bts_ovfl_callback_t ovfl; }; @@ -68,6 +85,8 @@ struct bts_tracer { struct pebs_tracer { /* the common DS part */ struct ds_tracer ds; + /* the trace including the DS configuration */ + struct pebs_trace trace; /* buffer overflow notification function */ pebs_ovfl_callback_t ovfl; }; @@ -134,13 +153,11 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, (*(unsigned long *)base) = value; } -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ - /* * Locking is done only for allocating BTS or PEBS resources. */ -static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); +static DEFINE_SPINLOCK(ds_lock); /* @@ -156,27 +173,32 @@ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); * >0 number of per-thread tracers * <0 number of per-cpu tracers * - * The below functions to get and put tracers and to check the - * allocation type require the ds_lock to be held by the caller. - * * Tracers essentially gives the number of ds contexts for a certain * type of allocation. */ -static long tracers; +static atomic_t tracers = ATOMIC_INIT(0); static inline void get_tracer(struct task_struct *task) { - tracers += (task ? 1 : -1); + if (task) + atomic_inc(&tracers); + else + atomic_dec(&tracers); } static inline void put_tracer(struct task_struct *task) { - tracers -= (task ? 1 : -1); + if (task) + atomic_dec(&tracers); + else + atomic_inc(&tracers); } static inline int check_tracer(struct task_struct *task) { - return (task ? (tracers >= 0) : (tracers <= 0)); + return task ? + (atomic_read(&tracers) >= 0) : + (atomic_read(&tracers) <= 0); } @@ -190,14 +212,30 @@ static inline int check_tracer(struct task_struct *task) * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. */ -static DEFINE_PER_CPU(struct ds_context *, system_context); +struct ds_context { + /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + unsigned char ds[MAX_SIZEOF_DS]; + /* the owner of the BTS and PEBS configuration, respectively */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + /* use count */ + unsigned long count; + /* a pointer to the context location inside the thread_struct + * or the per_cpu context array */ + struct ds_context **this; + /* a pointer to the task owning this context, or NULL, if the + * context is owned by a cpu */ + struct task_struct *task; +}; + +static DEFINE_PER_CPU(struct ds_context *, system_context_array); -#define this_system_context per_cpu(system_context, smp_processor_id()) +#define system_context per_cpu(system_context_array, smp_processor_id()) static inline struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &this_system_context); + (task ? &task->thread.ds_ctx : &system_context); struct ds_context *context = *p_context; unsigned long irq; @@ -225,10 +263,22 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); } + + context->count++; + + spin_unlock_irqrestore(&ds_lock, irq); + } else { + spin_lock_irqsave(&ds_lock, irq); + + context = *p_context; + if (context) + context->count++; + spin_unlock_irqrestore(&ds_lock, irq); - } - context->count++; + if (!context) + context = ds_get_context(task); + } return context; } @@ -242,8 +292,10 @@ static inline void ds_put_context(struct ds_context *context) spin_lock_irqsave(&ds_lock, irq); - if (--context->count) - goto out; + if (--context->count) { + spin_unlock_irqrestore(&ds_lock, irq); + return; + } *(context->this) = NULL; @@ -253,14 +305,14 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - kfree(context); - out: spin_unlock_irqrestore(&ds_lock, irq); + + kfree(context); } /* - * Handle a buffer overflow + * Call the tracer's callback on a buffer overflow. * * context: the ds context * qual: the buffer type @@ -268,30 +320,244 @@ static inline void ds_put_context(struct ds_context *context) static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) { switch (qual) { - case ds_bts: { - struct bts_tracer *tracer = - container_of(context->owner[qual], - struct bts_tracer, ds); - if (tracer->ovfl) - tracer->ovfl(tracer); - } + case ds_bts: + if (context->bts_master && + context->bts_master->ovfl) + context->bts_master->ovfl(context->bts_master); + break; + case ds_pebs: + if (context->pebs_master && + context->pebs_master->ovfl) + context->pebs_master->ovfl(context->pebs_master); break; - case ds_pebs: { - struct pebs_tracer *tracer = - container_of(context->owner[qual], - struct pebs_tracer, ds); - if (tracer->ovfl) - tracer->ovfl(tracer); } +} + + +/* + * Write raw data into the BTS or PEBS buffer. + * + * The remainder of any partially written record is zeroed out. + * + * context: the DS context + * qual: the buffer type + * record: the data to write + * size: the size of the data + */ +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) +{ + int bytes_written = 0; + + if (!record) + return -EINVAL; + + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; + + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. + */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + + write_end = min(end, int_th); + + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; + + if (write_end <= index) + break; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); + + record = (const char *)record + write_size; + size -= write_size; + bytes_written += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(context, qual); + } + + return bytes_written; +} + + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. + * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. + * + * We use two levels of abstraction: + * - the raw data level defined here + * - an arch-independent level defined in ds.h + */ + +enum bts_field { + bts_from, + bts_to, + bts_flags, + + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, + + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) +}; + +static inline unsigned long bts_get(const char *base, enum bts_field field) +{ + base += (ds_cfg.sizeof_field * field); + return *(unsigned long *)base; +} + +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (ds_cfg.sizeof_field * field);; + (*(unsigned long *)base) = val; +} + + +/* + * The raw BTS data is architecture dependent. + * + * For higher-level users, we give an arch-independent view. + * - ds.h defines struct bts_struct + * - bts_read translates one raw bts record into a bts_struct + * - bts_write translates one bts_struct into the raw format and + * writes it into the top of the parameter tracer's buffer. + * + * return: bytes read/written on success; -Eerrno, otherwise + */ +static int bts_read(struct bts_tracer *tracer, const void *at, + struct bts_struct *out) +{ + if (!tracer) + return -EINVAL; + + if (at < tracer->trace.ds.begin) + return -EINVAL; + + if (tracer->trace.ds.end < (at + tracer->trace.ds.size)) + return -EINVAL; + + memset(out, 0, sizeof(*out)); + if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { + out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); + out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); + out->variant.timestamp.pid = bts_get(at, bts_pid); + } else { + out->qualifier = bts_branch; + out->variant.lbr.from = bts_get(at, bts_from); + out->variant.lbr.to = bts_get(at, bts_to); + } + + return ds_cfg.sizeof_rec[ds_bts]; +} + +static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) +{ + unsigned char raw[MAX_SIZEOF_BTS]; + + if (!tracer) + return -EINVAL; + + if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts]) + return -EOVERFLOW; + + switch (in->qualifier) { + case bts_invalid: + bts_set(raw, bts_from, 0); + bts_set(raw, bts_to, 0); + bts_set(raw, bts_flags, 0); + break; + case bts_branch: + bts_set(raw, bts_from, in->variant.lbr.from); + bts_set(raw, bts_to, in->variant.lbr.to); + bts_set(raw, bts_flags, 0); + break; + case bts_task_arrives: + case bts_task_departs: + bts_set(raw, bts_qual, (bts_escape | in->qualifier)); + bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); + bts_set(raw, bts_pid, in->variant.timestamp.pid); break; + default: + return -EINVAL; } + + return ds_write(tracer->ds.context, ds_bts, raw, + ds_cfg.sizeof_rec[ds_bts]); } -static void ds_install_ds_config(struct ds_context *context, - enum ds_qualifier qual, - void *base, size_t size, size_t ith) +static void ds_write_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) +{ + unsigned char *ds = context->ds; + + ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin); + ds_set(ds, qual, ds_index, (unsigned long)cfg->top); + ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end); + ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith); +} + +static void ds_read_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) { + unsigned char *ds = context->ds; + + cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base); + cfg->top = (void *)ds_get(ds, qual, ds_index); + cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum); + cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold); +} + +static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, + void *base, size_t size, size_t ith, + unsigned int flags) { unsigned long buffer, adj; /* adjust the buffer address and size to meet alignment @@ -308,32 +574,30 @@ static void ds_install_ds_config(struct ds_context *context, buffer += adj; size -= adj; - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; + trace->n = size / ds_cfg.sizeof_rec[qual]; + trace->size = ds_cfg.sizeof_rec[qual]; - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + size = (trace->n * trace->size); + trace->begin = (void *)buffer; + trace->top = trace->begin; + trace->end = (void *)(buffer + size); /* The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size - ith); + trace->ith = (void *)(buffer + size - ith); + + trace->flags = flags; } -static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, - struct task_struct *task, - void *base, size_t size, size_t th) + +static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, + enum ds_qualifier qual, struct task_struct *task, + void *base, size_t size, size_t th, unsigned int flags) { struct ds_context *context; - unsigned long irq; int error; - error = -EOPNOTSUPP; - if (!ds_cfg.sizeof_ds) - goto out; - error = -EINVAL; if (!base) goto out; @@ -360,43 +624,26 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, goto out; tracer->context = context; + ds_init_ds_trace(trace, qual, base, size, th, flags); - spin_lock_irqsave(&ds_lock, irq); - - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - - error = -EPERM; - if (context->owner[qual]) - goto out_put_tracer; - context->owner[qual] = tracer; - - spin_unlock_irqrestore(&ds_lock, irq); - - - ds_install_ds_config(context, qual, base, size, th); - - return 0; - - out_put_tracer: - put_tracer(task); - out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); - tracer->context = NULL; + error = 0; out: return error; } struct bts_tracer *ds_request_bts(struct task_struct *task, void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th) + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct bts_tracer *tracer; + unsigned long irq; int error; + error = -EOPNOTSUPP; + if (!ds_cfg.ctl[dsf_bts]) + goto out; + /* buffer overflow notification is not yet implemented */ error = -EOPNOTSUPP; if (ovfl) @@ -408,12 +655,40 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, goto out; tracer->ovfl = ovfl; - error = ds_request(&tracer->ds, ds_bts, task, base, size, th); + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_bts, task, base, size, th, flags); if (error < 0) goto out_tracer; + + spin_lock_irqsave(&ds_lock, irq); + + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); + + error = -EPERM; + if (tracer->ds.context->bts_master) + goto out_put_tracer; + tracer->ds.context->bts_master = tracer; + + spin_unlock_irqrestore(&ds_lock, irq); + + + tracer->trace.read = bts_read; + tracer->trace.write = bts_write; + + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_bts(tracer); + return tracer; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); out: @@ -422,9 +697,11 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, struct pebs_tracer *ds_request_pebs(struct task_struct *task, void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th) + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct pebs_tracer *tracer; + unsigned long irq; int error; /* buffer overflow notification is not yet implemented */ @@ -438,300 +715,171 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, goto out; tracer->ovfl = ovfl; - error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_pebs, task, base, size, th, flags); if (error < 0) goto out_tracer; + spin_lock_irqsave(&ds_lock, irq); + + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); + + error = -EPERM; + if (tracer->ds.context->pebs_master) + goto out_put_tracer; + tracer->ds.context->pebs_master = tracer; + + spin_unlock_irqrestore(&ds_lock, irq); + + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_pebs(tracer); + return tracer; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); out: return ERR_PTR(error); } -static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) -{ - WARN_ON_ONCE(tracer->context->owner[qual] != tracer); - tracer->context->owner[qual] = NULL; - - put_tracer(tracer->context->task); - ds_put_context(tracer->context); -} - -int ds_release_bts(struct bts_tracer *tracer) +void ds_release_bts(struct bts_tracer *tracer) { if (!tracer) - return -EINVAL; + return; - ds_release(&tracer->ds, ds_bts); - kfree(tracer); + ds_suspend_bts(tracer); - return 0; -} + WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); + tracer->ds.context->bts_master = NULL; -int ds_release_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return -EINVAL; + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); - ds_release(&tracer->ds, ds_pebs); kfree(tracer); - - return 0; -} - -static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) -{ - unsigned long base, index; - - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - - return (index - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos) +void ds_suspend_bts(struct bts_tracer *tracer) { - if (!tracer) - return -EINVAL; + struct task_struct *task; - if (!pos) - return -EINVAL; - - *pos = ds_get_index(tracer->ds.context, ds_bts); - - return 0; -} - -int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos) -{ if (!tracer) - return -EINVAL; + return; - if (!pos) - return -EINVAL; + task = tracer->ds.context->task; - *pos = ds_get_index(tracer->ds.context, ds_pebs); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); - return 0; -} + if (task) { + task->thread.debugctlmsr &= ~BTS_CONTROL; -static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual) -{ - unsigned long base, max; - - base = ds_get(context->ds, qual, ds_buffer_base); - max = ds_get(context->ds, qual, ds_absolute_maximum); - - return (max - base) / ds_cfg.sizeof_rec[qual]; + if (!task->thread.debugctlmsr) + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } } -int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos) +void ds_resume_bts(struct bts_tracer *tracer) { - if (!tracer) - return -EINVAL; - - if (!pos) - return -EINVAL; - - *pos = ds_get_end(tracer->ds.context, ds_bts); - - return 0; -} + struct task_struct *task; + unsigned long control; -int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos) -{ if (!tracer) - return -EINVAL; - - if (!pos) - return -EINVAL; - - *pos = ds_get_end(tracer->ds.context, ds_pebs); - - return 0; -} - -static int ds_access(struct ds_context *context, enum ds_qualifier qual, - size_t index, const void **record) -{ - unsigned long base, idx; - - if (!record) - return -EINVAL; - - base = ds_get(context->ds, qual, ds_buffer_base); - idx = base + (index * ds_cfg.sizeof_rec[qual]); - - if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - return -EINVAL; + return; - *record = (const void *)idx; + task = tracer->ds.context->task; - return ds_cfg.sizeof_rec[qual]; -} + control = ds_cfg.ctl[dsf_bts]; + if (!(tracer->trace.ds.flags & BTS_KERNEL)) + control |= ds_cfg.ctl[dsf_bts_kernel]; + if (!(tracer->trace.ds.flags & BTS_USER)) + control |= ds_cfg.ctl[dsf_bts_user]; -int ds_access_bts(struct bts_tracer *tracer, size_t index, - const void **record) -{ - if (!tracer) - return -EINVAL; + if (task) { + task->thread.debugctlmsr |= control; + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } - return ds_access(tracer->ds.context, ds_bts, index, record); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() | control); } -int ds_access_pebs(struct pebs_tracer *tracer, size_t index, - const void **record) +void ds_release_pebs(struct pebs_tracer *tracer) { if (!tracer) - return -EINVAL; - - return ds_access(tracer->ds.context, ds_pebs, index, record); -} - -static int ds_write(struct ds_context *context, enum ds_qualifier qual, - const void *record, size_t size) -{ - int bytes_written = 0; - - if (!record) - return -EINVAL; - - while (size) { - unsigned long base, index, end, write_end, int_th; - unsigned long write_size, adj_write_size; - - /* - * write as much as possible without producing an - * overflow interrupt. - * - * interrupt_threshold must either be - * - bigger than absolute_maximum or - * - point to a record between buffer_base and absolute_maximum - * - * index points to a valid record. - */ - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - end = ds_get(context->ds, qual, ds_absolute_maximum); - int_th = ds_get(context->ds, qual, ds_interrupt_threshold); - - write_end = min(end, int_th); - - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ - if (write_end <= index) - write_end = end; - - if (write_end <= index) - break; - - write_size = min((unsigned long) size, write_end - index); - memcpy((void *)index, record, write_size); - - record = (const char *)record + write_size; - size -= write_size; - bytes_written += write_size; - - adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; - adj_write_size *= ds_cfg.sizeof_rec[qual]; - - /* zero out trailing bytes */ - memset((char *)index + write_size, 0, - adj_write_size - write_size); - index += adj_write_size; + return; - if (index >= end) - index = base; - ds_set(context->ds, qual, ds_index, index); + ds_suspend_pebs(tracer); - if (index >= int_th) - ds_overflow(context, qual); - } + WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); + tracer->ds.context->pebs_master = NULL; - return bytes_written; -} + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); -int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size) -{ - if (!tracer) - return -EINVAL; - - return ds_write(tracer->ds.context, ds_bts, record, size); + kfree(tracer); } -int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size) +void ds_suspend_pebs(struct pebs_tracer *tracer) { - if (!tracer) - return -EINVAL; - return ds_write(tracer->ds.context, ds_pebs, record, size); } -static void ds_reset_or_clear(struct ds_context *context, - enum ds_qualifier qual, int clear) +void ds_resume_pebs(struct pebs_tracer *tracer) { - unsigned long base, end; - - base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); - - if (clear) - memset((void *)base, 0, end - base); - ds_set(context->ds, qual, ds_index, base); } -int ds_reset_bts(struct bts_tracer *tracer) +const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { if (!tracer) - return -EINVAL; - - ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); + return NULL; - return 0; + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + return &tracer->trace; } -int ds_reset_pebs(struct pebs_tracer *tracer) +const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) { if (!tracer) - return -EINVAL; + return NULL; - ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + tracer->trace.reset_value = + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - return 0; + return &tracer->trace; } -int ds_clear_bts(struct bts_tracer *tracer) +int ds_reset_bts(struct bts_tracer *tracer) { if (!tracer) return -EINVAL; - ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); - - return 0; -} - -int ds_clear_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return -EINVAL; + tracer->trace.ds.top = tracer->trace.ds.begin; - ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); return 0; } -int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value) +int ds_reset_pebs(struct pebs_tracer *tracer) { if (!tracer) return -EINVAL; - if (!value) - return -EINVAL; + tracer->trace.ds.top = tracer->trace.ds.begin; - *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); return 0; } @@ -746,35 +894,59 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) return 0; } -static const struct ds_configuration ds_cfg_var = { - .sizeof_ds = sizeof(long) * 12, - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, +static const struct ds_configuration ds_cfg_netburst = { + .name = "netburst", + .ctl[dsf_bts] = (1 << 2) | (1 << 3), + .ctl[dsf_bts_kernel] = (1 << 5), + .ctl[dsf_bts_user] = (1 << 6), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = sizeof(long) * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; -static const struct ds_configuration ds_cfg_64 = { - .sizeof_ds = 8 * 12, - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, +static const struct ds_configuration ds_cfg_pentium_m = { + .name = "pentium m", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = 8 * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = 8 * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; +static const struct ds_configuration ds_cfg_core2 = { + .name = "core 2", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 18, +}; -static inline void +static void ds_configure(const struct ds_configuration *cfg) { + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; - printk(KERN_INFO "DS available\n"); + printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + + if (!cpu_has_bts) { + ds_cfg.ctl[dsf_bts] = 0; + printk(KERN_INFO "[ds] bts not available\n"); + } + if (!cpu_has_pebs) + printk(KERN_INFO "[ds] pebs not available\n"); - WARN_ON_ONCE(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -787,10 +959,10 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_pentium_m); break; default: /* Core2, Atom, ... */ - ds_configure(&ds_cfg_64); + ds_configure(&ds_cfg_core2); break; } break; @@ -799,7 +971,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_netburst); break; default: /* sorry, don't know about them */ @@ -812,14 +984,41 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } -void ds_free(struct ds_context *context) +/* + * Change the DS configuration from tracing prev to tracing next. + */ +void ds_switch_to(struct task_struct *prev, struct task_struct *next) { - /* This is called when the task owning the parameter context - * is dying. There should not be any user of that context left - * to disturb us, anymore. */ - unsigned long leftovers = context->count; - while (leftovers--) { - put_tracer(context->task); - ds_put_context(context); + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + + if (prev_ctx) { + update_debugctlmsr(0); + + if (prev_ctx->bts_master && + (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_departs, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = prev->pid + }; + bts_write(prev_ctx->bts_master, &ts); + } + } + + if (next_ctx) { + if (next_ctx->bts_master && + (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_arrives, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = next->pid + }; + bts_write(next_ctx->bts_master, &ts); + } + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); } + + update_debugctlmsr(next->thread.debugctlmsr); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 24c2276aa45..605eff9a8ac 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -252,11 +252,14 @@ void exit_thread(void) put_cpu(); } #ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(current->thread.ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(current->thread.ds_ctx); + /* Free any BTS tracers that have not been properly released. */ + if (unlikely(current->bts)) { + ds_release_bts(current->bts); + current->bts = NULL; + + kfree(current->bts_buffer); + current->bts_buffer = NULL; + current->bts_size = 0; } #endif /* CONFIG_X86_DS */ } @@ -420,48 +423,19 @@ int set_tsc_mode(unsigned int val) return 0; } -#ifdef CONFIG_X86_DS -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - unsigned long ds_prev = 0; - unsigned long ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, ds_next, 0); - } - return debugctl; -} -#else -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - return debugctl; -} -#endif /* CONFIG_X86_DS */ - static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread; next = &next_p->thread; - debugctl = update_debugctl(prev, next, prev->debugctlmsr); - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -483,15 +457,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ - - if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index fbb321d53d3..1cfd2a4bf85 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -237,11 +237,14 @@ void exit_thread(void) put_cpu(); } #ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(t->ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(t->ds_ctx); + /* Free any BTS tracers that have not been properly released. */ + if (unlikely(current->bts)) { + ds_release_bts(current->bts); + current->bts = NULL; + + kfree(current->bts_buffer); + current->bts_buffer = NULL; + current->bts_size = 0; } #endif /* CONFIG_X86_DS */ } @@ -471,35 +474,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; - debugctl = prev->debugctlmsr; - -#ifdef CONFIG_X86_DS - { - unsigned long ds_prev = 0, ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* - * We clear debugctl to make sure DS - * is not in use when we change it: - */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, ds_next); - } - } -#endif /* CONFIG_X86_DS */ - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -534,14 +516,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } - -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ } /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b2998fe1166..45e9855da2d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -581,153 +581,73 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS -/* - * The configuration for a particular BTS hardware implementation. - */ -struct bts_configuration { - /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ - unsigned char sizeof_bts; - /* the size of a field in the BTS record in bytes */ - unsigned char sizeof_field; - /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ - unsigned long debugctl_mask; -}; -static struct bts_configuration bts_cfg; - -#define BTS_MAX_RECORD_SIZE (8 * 3) - - -/* - * Branch Trace Store (BTS) uses the following format. Different - * architectures vary in the size of those fields. - * - source linear address - * - destination linear address - * - flags - * - * Later architectures use 64bit pointers throughout, whereas earlier - * architectures use 32bit pointers in 32bit mode. - * - * We compute the base address for the first 8 fields based on: - * - the field size stored in the DS configuration - * - the relative field position - * - * In order to store additional information in the BTS buffer, we use - * a special source address to indicate that the record requires - * special interpretation. - * - * Netburst indicated via a bit in the flags field whether the branch - * was predicted; this is ignored. - */ - -enum bts_field { - bts_from = 0, - bts_to, - bts_flags, - - bts_escape = (unsigned long)-1, - bts_qual = bts_to, - bts_jiffies = bts_flags -}; - -static inline unsigned long bts_get(const char *base, enum bts_field field) -{ - base += (bts_cfg.sizeof_field * field); - return *(unsigned long *)base; -} - -static inline void bts_set(char *base, enum bts_field field, unsigned long val) -{ - base += (bts_cfg.sizeof_field * field);; - (*(unsigned long *)base) = val; -} - -/* - * Translate a BTS record from the raw format into the bts_struct format - * - * out (out): bts_struct interpretation - * raw: raw BTS record - */ -static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) -{ - memset(out, 0, sizeof(*out)); - if (bts_get(raw, bts_from) == bts_escape) { - out->qualifier = bts_get(raw, bts_qual); - out->variant.jiffies = bts_get(raw, bts_jiffies); - } else { - out->qualifier = BTS_BRANCH; - out->variant.lbr.from_ip = bts_get(raw, bts_from); - out->variant.lbr.to_ip = bts_get(raw, bts_to); - } -} - static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { - struct bts_struct ret; - const void *bts_record; - size_t bts_index, bts_end; + const struct bts_trace *trace; + struct bts_struct bts; + const unsigned char *at; int error; - error = ds_get_bts_end(child->bts, &bts_end); - if (error < 0) - return error; - - if (bts_end <= index) - return -EINVAL; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - error = ds_get_bts_index(child->bts, &bts_index); - if (error < 0) - return error; + at = trace->ds.top - ((index + 1) * trace->ds.size); + if ((void *)at < trace->ds.begin) + at += (trace->ds.n * trace->ds.size); - /* translate the ptrace bts index into the ds bts index */ - bts_index += bts_end - (index + 1); - if (bts_end <= bts_index) - bts_index -= bts_end; + if (!trace->read) + return -EOPNOTSUPP; - error = ds_access_bts(child->bts, bts_index, &bts_record); + error = trace->read(child->bts, at, &bts); if (error < 0) return error; - ptrace_bts_translate_record(&ret, bts_record); - - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; - return sizeof(ret); + return sizeof(bts); } static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { - struct bts_struct ret; - const unsigned char *raw; - size_t end, i; - int error; + const struct bts_trace *trace; + const unsigned char *at; + int error, drained = 0; - error = ds_get_bts_index(child->bts, &end); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - if (size < (end * sizeof(struct bts_struct))) + if (!trace->read) + return -EOPNOTSUPP; + + if (size < (trace->ds.top - trace->ds.begin)) return -EIO; - error = ds_access_bts(child->bts, 0, (const void **)&raw); - if (error < 0) - return error; + for (at = trace->ds.begin; (void *)at < trace->ds.top; + out++, drained++, at += trace->ds.size) { + struct bts_struct bts; + int error; - for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { - ptrace_bts_translate_record(&ret, raw); + error = trace->read(child->bts, at, &bts); + if (error < 0) + return error; - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; } - error = ds_clear_bts(child->bts); + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); + + error = ds_reset_bts(child->bts); if (error < 0) return error; - return end; + return drained; } static int ptrace_bts_config(struct task_struct *child, @@ -735,136 +655,89 @@ static int ptrace_bts_config(struct task_struct *child, const struct ptrace_bts_config __user *ucfg) { struct ptrace_bts_config cfg; - int error = 0; - - error = -EOPNOTSUPP; - if (!bts_cfg.sizeof_bts) - goto errout; + unsigned int flags = 0; - error = -EIO; if (cfg_size < sizeof(cfg)) - goto errout; + return -EIO; - error = -EFAULT; if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - goto errout; - - error = -EINVAL; - if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && - !(cfg.flags & PTRACE_BTS_O_ALLOC)) - goto errout; - - if (cfg.flags & PTRACE_BTS_O_ALLOC) { - bts_ovfl_callback_t ovfl = NULL; - unsigned int sig = 0; - - error = -EINVAL; - if (cfg.size < (10 * bts_cfg.sizeof_bts)) - goto errout; + return -EFAULT; - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { - if (!cfg.signal) - goto errout; + if (child->bts) { + ds_release_bts(child->bts); + child->bts = NULL; + } - error = -EOPNOTSUPP; - goto errout; + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { + if (!cfg.signal) + return -EINVAL; - sig = cfg.signal; - } + return -EOPNOTSUPP; - if (child->bts) { - (void)ds_release_bts(child->bts); - kfree(child->bts_buffer); + child->thread.bts_ovfl_signal = cfg.signal; + } - child->bts = NULL; - child->bts_buffer = NULL; - } + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && + (cfg.size != child->bts_size)) { + kfree(child->bts_buffer); - error = -ENOMEM; + child->bts_size = cfg.size; child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); - if (!child->bts_buffer) - goto errout; - - child->bts = ds_request_bts(child, child->bts_buffer, cfg.size, - ovfl, /* th = */ (size_t)-1); - if (IS_ERR(child->bts)) { - error = PTR_ERR(child->bts); - kfree(child->bts_buffer); - child->bts = NULL; - child->bts_buffer = NULL; - goto errout; + if (!child->bts_buffer) { + child->bts_size = 0; + return -ENOMEM; } - - child->thread.bts_ovfl_signal = sig; } - error = -EINVAL; - if (!child->thread.ds_ctx && cfg.flags) - goto errout; - if (cfg.flags & PTRACE_BTS_O_TRACE) - child->thread.debugctlmsr |= bts_cfg.debugctl_mask; - else - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + flags |= BTS_USER; if (cfg.flags & PTRACE_BTS_O_SCHED) - set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - else - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + flags |= BTS_TIMESTAMPS; - error = sizeof(cfg); + child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, + /* ovfl = */ NULL, /* th = */ (size_t)-1, + flags); + if (IS_ERR(child->bts)) { + int error = PTR_ERR(child->bts); -out: - if (child->thread.debugctlmsr) - set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - else - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + kfree(child->bts_buffer); + child->bts = NULL; + child->bts_buffer = NULL; + child->bts_size = 0; - return error; + return error; + } -errout: - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - goto out; + return sizeof(cfg); } static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + const struct bts_trace *trace; struct ptrace_bts_config cfg; - size_t end; - const void *base, *max; - int error; if (cfg_size < sizeof(cfg)) return -EIO; - error = ds_get_bts_end(child->bts, &end); - if (error < 0) - return error; - - error = ds_access_bts(child->bts, /* index = */ 0, &base); - if (error < 0) - return error; - - error = ds_access_bts(child->bts, /* index = */ end, &max); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; memset(&cfg, 0, sizeof(cfg)); - cfg.size = (max - base); + cfg.size = trace->ds.end - trace->ds.begin; cfg.signal = child->thread.bts_ovfl_signal; cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; - if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && - child->thread.debugctlmsr & bts_cfg.debugctl_mask) + if (trace->ds.flags & BTS_USER) cfg.flags |= PTRACE_BTS_O_TRACE; - if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + if (trace->ds.flags & BTS_TIMESTAMPS) cfg.flags |= PTRACE_BTS_O_SCHED; if (copy_to_user(ucfg, &cfg, sizeof(cfg))) @@ -873,105 +746,28 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } -static int ptrace_bts_write_record(struct task_struct *child, - const struct bts_struct *in) +static int ptrace_bts_clear(struct task_struct *child) { - unsigned char bts_record[BTS_MAX_RECORD_SIZE]; + const struct bts_trace *trace; - if (BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts) - return -EOVERFLOW; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - memset(bts_record, 0, bts_cfg.sizeof_bts); - switch (in->qualifier) { - case BTS_INVALID: - break; + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - case BTS_BRANCH: - bts_set(bts_record, bts_from, in->variant.lbr.from_ip); - bts_set(bts_record, bts_to, in->variant.lbr.to_ip); - break; - - case BTS_TASK_ARRIVES: - case BTS_TASK_DEPARTS: - bts_set(bts_record, bts_from, bts_escape); - bts_set(bts_record, bts_qual, in->qualifier); - bts_set(bts_record, bts_jiffies, in->variant.jiffies); - break; - - default: - return -EINVAL; - } - - return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts); + return ds_reset_bts(child->bts); } -void ptrace_bts_take_timestamp(struct task_struct *tsk, - enum bts_qualifier qualifier) +static int ptrace_bts_size(struct task_struct *child) { - struct bts_struct rec = { - .qualifier = qualifier, - .variant.jiffies = jiffies_64 - }; - - ptrace_bts_write_record(tsk, &rec); -} - -static const struct bts_configuration bts_cfg_netburst = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<2)|(1<<3)|(1<<5) -}; + const struct bts_trace *trace; -static const struct bts_configuration bts_cfg_pentium_m = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<6)|(1<<7) -}; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; -static const struct bts_configuration bts_cfg_core2 = { - .sizeof_bts = 8 * 3, - .sizeof_field = 8, - .debugctl_mask = (1<<6)|(1<<7)|(1<<9) -}; - -static inline void bts_configure(const struct bts_configuration *cfg) -{ - bts_cfg = *cfg; -} - -void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) -{ - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0 ... 0xC: - /* sorry, don't know about them */ - break; - case 0xD: - case 0xE: /* Pentium M */ - bts_configure(&bts_cfg_pentium_m); - break; - default: /* Core2, Atom, ... */ - bts_configure(&bts_cfg_core2); - break; - } - break; - case 0xF: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - bts_configure(&bts_cfg_netburst); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - default: - /* sorry, don't know about them */ - break; - } + return (trace->ds.top - trace->ds.begin) / trace->ds.size; } #endif /* CONFIG_X86_PTRACE_BTS */ @@ -988,15 +784,12 @@ void ptrace_disable(struct task_struct *child) #endif #ifdef CONFIG_X86_PTRACE_BTS if (child->bts) { - (void)ds_release_bts(child->bts); + ds_release_bts(child->bts); + child->bts = NULL; + kfree(child->bts_buffer); child->bts_buffer = NULL; - - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + child->bts_size = 0; } #endif /* CONFIG_X86_PTRACE_BTS */ } @@ -1129,16 +922,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) (child, data, (struct ptrace_bts_config __user *)addr); break; - case PTRACE_BTS_SIZE: { - size_t size; - - ret = ds_get_bts_index(child->bts, &size); - if (ret == 0) { - WARN_ON_ONCE(size != (int) size); - ret = (int) size; - } + case PTRACE_BTS_SIZE: + ret = ptrace_bts_size(child); break; - } case PTRACE_BTS_GET: ret = ptrace_bts_read_record @@ -1146,7 +932,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ds_clear_bts(child->bts); + ret = ptrace_bts_clear(child); break; case PTRACE_BTS_DRAIN: @@ -1409,6 +1195,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: +#ifdef CONFIG_X86_PTRACE_BTS + case PTRACE_BTS_CONFIG: + case PTRACE_BTS_STATUS: + case PTRACE_BTS_SIZE: + case PTRACE_BTS_GET: + case PTRACE_BTS_CLEAR: + case PTRACE_BTS_DRAIN: +#endif /* CONFIG_X86_PTRACE_BTS */ return arch_ptrace(child, request, addr, data); default: -- cgit v1.2.3 From ffc2238af8431d930d2c15f16feecf1fd6d75642 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 08:21:19 +0100 Subject: x86, bts: fix build error Impact: build fix arch/x86/kernel/ds.c: In function 'ds_request': arch/x86/kernel/ds.c:236: sorry, unimplemented: inlining failed in call to 'ds_get_context': recursive inlining but the recursion here is scary ... Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index f0583005b75..dc1e7123ea4 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -232,7 +232,7 @@ static DEFINE_PER_CPU(struct ds_context *, system_context_array); #define system_context per_cpu(system_context_array, smp_processor_id()) -static inline struct ds_context *ds_get_context(struct task_struct *task) +static struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = (task ? &task->thread.ds_ctx : &system_context); -- cgit v1.2.3 From 8808500f26a61757cb414da76b271bbd09d5958c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 09:20:12 +0100 Subject: x86: soften multi-BAR mapping sanity check warning message Impact: make debug warning less scary The ioremap() time multi-BAR map warning has been causing false positives: http://lkml.org/lkml/2008/12/10/432 http://lkml.org/lkml/2008/12/11/136 So make it less scary by making it once-per-boot, by making it KERN_INFO and by adding this text: "Info: mapping multiple BARs. Your kernel is fine." Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d4c4307ff3e..bd85d42819e 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -223,7 +223,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Check if the request spans more than any BAR in the iomem resource * tree. */ - WARN_ON(iomem_map_sanity_check(phys_addr, size)); + WARN_ONCE(iomem_map_sanity_check(phys_addr, size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); /* * Don't allow anybody to remap normal RAM that we're using.. -- cgit v1.2.3 From 85072bd55219231b8ca5d9d3fa3492eb4fa6635f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 11:08:42 +0100 Subject: x86, debug: remove EBDA debug printk Remove leftover EBDA debug message. Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/head.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 1dcb0f13897..3e66bd364a9 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -35,7 +35,6 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); - printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ -- cgit v1.2.3 From a0343e823184070f55364d8359f832dcb33c57c7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Dec 2008 23:53:16 +0100 Subject: tracing/function-graph-tracer: add a new .irqentry.text section Impact: let the function-graph-tracer be aware of the irq entrypoints Add a new .irqentry.text section to store the irq entrypoints functions inside the same section. This way, the tracer will be able to signal an interrupts triggering on output by recognizing these entrypoints. Also, make this section recordable for dynamic tracing. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_64.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 46e05447405..1a614c0e6be 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -35,6 +35,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ -- cgit v1.2.3 From bcbc4f20b52c2c40c43a4d2337707dcdfe81bc3a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Dec 2008 23:54:20 +0100 Subject: tracing/function-graph-tracer: annotate do_IRQ and smp_apic_timer_interrupt Impact: move most important x86 irq entry-points to a separate subsection Annotate do_IRQ and smp_apic_timer_interrupt to put them into the .irqentry.text subsection. These function will so be recognized as hardirq entrypoints for the function-graph-tracer. We could also annotate other irq entries but the others are far less important but they can be added on request. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 3 ++- arch/x86/kernel/irq_64.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b52..b946ac19753 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -800,7 +801,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs *regs) +void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 60eb84eb77a..11c65e811ff 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -47,7 +48,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; -- cgit v1.2.3 From 16855f878d7127a8bb3925753463485f3071ad76 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 8 Dec 2008 19:18:38 -0800 Subject: x86: uaccess: return value of __{get|put}_user() can be int Impact: cleanup The type of return value of __{get|put}_user() can be int. There is no user to refer the return value of __{get|put}_user() as long. This reduces code size a bit on 64-bit. $ size vmlinux.* text data bss dec hex filename 4509265 479988 673588 5662841 566879 vmlinux.new 4511462 479988 673588 5665038 56710e vmlinux.old Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c54921b2e..580c3ee6c58 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -350,14 +350,14 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ - long __pu_err; \ + int __pu_err; \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ __pu_err; \ }) #define __get_user_nocheck(x, ptr, size) \ ({ \ - long __gu_err; \ + int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ -- cgit v1.2.3 From 8f2466f45f75e3cbe3aa2b69d33fd9d6e343b9cc Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 8 Dec 2008 19:19:07 -0800 Subject: x86: kill #ifdef for exit_idle() Impact: cleanup Introduce helper inline function in arch/x86/include/asm/idle.h to remove #ifdefs around exit_idle(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/idle.h | 5 +++++ arch/x86/kernel/apic.c | 6 ------ arch/x86/kernel/io_apic.c | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index 44c89c3a23e..38d87379e27 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ -8,8 +8,13 @@ struct notifier_block; void idle_notifier_register(struct notifier_block *n); void idle_notifier_unregister(struct notifier_block *n); +#ifdef CONFIG_X86_64 void enter_idle(void); void exit_idle(void); +#else /* !CONFIG_X86_64 */ +static inline void enter_idle(void) { } +static inline void exit_idle(void) { } +#endif /* CONFIG_X86_64 */ void c1e_remove_cpu(int cpu); diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b52..0fd083713f6 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -814,9 +814,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@ -1682,9 +1680,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) { u32 v; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1713,9 +1709,7 @@ void smp_error_interrupt(struct pt_regs *regs) { u32 v, v1; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9043251210f..679e7bbbbcd 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2216,10 +2216,9 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; + ack_APIC_irq(); -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); me = smp_processor_id(); -- cgit v1.2.3 From 2bed8446819a7c5033aa1da138d9f230ae212edc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 12:13:36 +0100 Subject: tracing/function-graph-tracer: add a new .irqentry.text section, fix Impact: build fix 32-bit x86 needs this section too. Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_32.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index a9b8560adbc..82c67559dde 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -44,6 +44,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ -- cgit v1.2.3 From 9470565579f29486f4ed0ffa50774268b64994b0 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 1 Dec 2008 14:13:50 -0800 Subject: x86: remove init_mm export as planned for 2.6.26 Impact: remove deprecated export Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- arch/x86/kernel/init_task.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index a4f93b4120c..d39918076bb 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -14,7 +14,6 @@ static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ /* * Initial thread structure. -- cgit v1.2.3 From fd28a5b58dddf5cb5df162ae5c8797a63171c31d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 21 Oct 2008 14:05:00 +0200 Subject: x86: remove simnow earlyprintk support Impact: remove obsolete code The later versions of SimNow! actually all have serial console emulation, so the direct interface isn't needed anymore. So remove the undocumented simnow earlyprintk console. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner --- arch/x86/kernel/early_printk.c | 47 ------------------------------------------ 1 file changed, 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 34ad997d383..23b138e31e9 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -875,49 +875,6 @@ static struct console early_dbgp_console = { }; #endif -/* Console interface to a host file on AMD's SimNow! */ - -static int simnow_fd; - -enum { - MAGIC1 = 0xBACCD00A, - MAGIC2 = 0xCA110000, - XOPEN = 5, - XWRITE = 4, -}; - -static noinline long simnow(long cmd, long a, long b, long c) -{ - long ret; - - asm volatile("cpuid" : - "=a" (ret) : - "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); - return ret; -} - -static void __init simnow_init(char *str) -{ - char *fn = "klog"; - - if (*str == '=') - fn = ++str; - /* error ignored */ - simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); -} - -static void simnow_write(struct console *con, const char *s, unsigned n) -{ - simnow(XWRITE, simnow_fd, (unsigned long)s, n); -} - -static struct console simnow_console = { - .name = "simnow", - .write = simnow_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -960,10 +917,6 @@ static int __init setup_early_printk(char *buf) max_ypos = boot_params.screen_info.orig_video_lines; current_ypos = boot_params.screen_info.orig_y; early_console = &early_vga_console; - } else if (!strncmp(buf, "simnow", 6)) { - simnow_init(buf + 6); - early_console = &simnow_console; - keep_early = 1; #ifdef CONFIG_EARLY_PRINTK_DBGP } else if (!strncmp(buf, "dbgp", 4)) { if (early_dbgp_init(buf+4) < 0) -- cgit v1.2.3 From ae8d04e2ecbb233926860e9ce145eac19c7835dc Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 13 Dec 2008 12:36:58 -0800 Subject: x86 Fix VMI crash on boot in 2.6.28-rc8 VMI initialiation can relocate the fixmap, causing early_ioremap to malfunction if it is initialized before the relocation. To fix this, VMI activation is split into two phases; the detection, which must happen before setting up ioremap, and the activation, which must happen after parsing early boot parameters. This fixes a crash on boot when VMI is enabled under VMware. Signed-off-by: Zachary Amsden Signed-off-by: Linus Torvalds --- arch/x86/include/asm/vmi.h | 8 +++++++- arch/x86/kernel/setup.c | 12 +++++------- arch/x86/kernel/smpboot.c | 2 -- arch/x86/kernel/vmi_32.c | 16 +++++++++++----- 4 files changed, 23 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h index b7c0dea119f..61e08c0a290 100644 --- a/arch/x86/include/asm/vmi.h +++ b/arch/x86/include/asm/vmi.h @@ -223,9 +223,15 @@ struct pci_header { } __attribute__((packed)); /* Function prototypes for bootstrapping */ +#ifdef CONFIG_VMI extern void vmi_init(void); +extern void vmi_activate(void); extern void vmi_bringup(void); -extern void vmi_apply_boot_page_allocations(void); +#else +static inline void vmi_init(void) {} +static inline void vmi_activate(void) {} +static inline void vmi_bringup(void) {} +#endif /* State needed to start an application processor in an SMP system. */ struct vmi_ap_state { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d5674f7b6c..bdec76e5559 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -794,6 +794,9 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + /* VMI may relocate the fixmap; do this before touching ioremap area */ + vmi_init(); + early_cpu_init(); early_ioremap_init(); @@ -880,13 +883,8 @@ void __init setup_arch(char **cmdline_p) check_efer(); #endif -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be before kernel pagetables are setup - * or fixmap area is touched. - */ - vmi_init(); -#endif + /* Must be before kernel pagetables are setup */ + vmi_activate(); /* after early param, so could get panic from serial */ reserve_early_setup_data(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7b109339731..f71f96fc9e6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -294,9 +294,7 @@ static void __cpuinit start_secondary(void *unused) * fragile that we want to limit the things done here to the * most necessary things. */ -#ifdef CONFIG_VMI vmi_bringup(); -#endif cpu_init(); preempt_disable(); smp_callin(); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 8b6c393ab9f..22fd6577156 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -960,8 +960,6 @@ static inline int __init activate_vmi(void) void __init vmi_init(void) { - unsigned long flags; - if (!vmi_rom) probe_vmi_rom(); else @@ -973,13 +971,21 @@ void __init vmi_init(void) reserve_top_address(-vmi_rom->virtual_top); - local_irq_save(flags); - activate_vmi(); - #ifdef CONFIG_X86_IO_APIC /* This is virtual hardware; timer routing is wired correctly */ no_timer_check = 1; #endif +} + +void vmi_activate(void) +{ + unsigned long flags; + + if (!vmi_rom) + return; + + local_irq_save(flags); + activate_vmi(); local_irq_restore(flags & X86_EFLAGS_IF); } -- cgit v1.2.3 From 205516c12dbba003c26b42cfb41e598631300106 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 16 Dec 2008 00:32:21 -0800 Subject: x86: convert rdtscll() to use __native_read_tsc Impact: micro-optimization Is there any reason why x86 rdtscll have to use the out of line function instead of inline __native_read_tsc()? native_read_tsc and __native_read_tsc is essentially the same functions. Patch to let x86 rdtscll() to use the inline version of read_tsc. Signed-off-by: Ken Chen Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c2a812ebde8..42f639b991b 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -181,10 +181,10 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) } #define rdtscl(low) \ - ((low) = (u32)native_read_tsc()) + ((low) = (u32)__native_read_tsc()) #define rdtscll(val) \ - ((val) = native_read_tsc()) + ((val) = __native_read_tsc()) #define rdpmc(counter, low, high) \ do { \ -- cgit v1.2.3 From cc1dc6d039ced64c2f8b8457bf1cccf4ecfc5942 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 16 Dec 2008 15:51:03 +0100 Subject: x86, bts: remove recursion from get_context Impact: cleanup Optimistically allocate a DS context. It is extremely unlikely that one already existed. This simplifies the code a lot. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 58 ++++++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index dc1e7123ea4..0dc795951d7 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -232,53 +232,45 @@ static DEFINE_PER_CPU(struct ds_context *, system_context_array); #define system_context per_cpu(system_context_array, smp_processor_id()) -static struct ds_context *ds_get_context(struct task_struct *task) + +static inline struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = (task ? &task->thread.ds_ctx : &system_context); - struct ds_context *context = *p_context; + struct ds_context *context = NULL; + struct ds_context *new_context = NULL; unsigned long irq; - if (!context) { - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return NULL; - - spin_lock_irqsave(&ds_lock, irq); - - if (*p_context) { - kfree(context); + /* Chances are small that we already have a context. */ + new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); + if (!new_context) + return NULL; - context = *p_context; - } else { - *p_context = context; + spin_lock_irqsave(&ds_lock, irq); - context->this = p_context; - context->task = task; + context = *p_context; + if (!context) { + context = new_context; - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); + context->this = p_context; + context->task = task; + context->count = 0; - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, - (unsigned long)context->ds); - } + if (task) + set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - context->count++; + if (!task || (task == current)) + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - spin_unlock_irqrestore(&ds_lock, irq); - } else { - spin_lock_irqsave(&ds_lock, irq); + *p_context = context; + } - context = *p_context; - if (context) - context->count++; + context->count++; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irqrestore(&ds_lock, irq); - if (!context) - context = ds_get_context(task); - } + if (context != new_context) + kfree(new_context); return context; } -- cgit v1.2.3 From d072c25f531c6513994960401d2c7f059434c0d2 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 16 Dec 2008 15:53:11 +0100 Subject: x86, bts: correctly report invalid bts records Impact: change the reporting of empty BTS records Correctly report a cleared BTS record as invalid. Used to be reported as branch from 0 to 0. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 0dc795951d7..98d271e60e0 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -484,6 +484,9 @@ static int bts_read(struct bts_tracer *tracer, const void *at, out->qualifier = bts_branch; out->variant.lbr.from = bts_get(at, bts_from); out->variant.lbr.to = bts_get(at, bts_to); + + if (!out->variant.lbr.from && !out->variant.lbr.to) + out->qualifier = bts_invalid; } return ds_cfg.sizeof_rec[ds_bts]; -- cgit v1.2.3 From 1796316a8b028a148be48ba5d4e7be493a39d173 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:35:24 +0000 Subject: x86: consolidate __swp_XXX() macros Impact: cleanup, code robustization The __swp_...() macros silently relied upon which bits are used for _PAGE_FILE and _PAGE_PROTNONE. After having changed _PAGE_PROTNONE in our Xen kernel to no longer overlap _PAGE_PAT, live locks and crashes were reported that could have been avoided if these macros properly used the symbolic constants. Since, as pointed out earlier, for Xen Dom0 support mainline likewise will need to eliminate the conflict between _PAGE_PAT and _PAGE_PROTNONE, this patch does all the necessary adjustments, plus it introduces a mechanism to check consistency between MAX_SWAPFILES_SHIFT and the actual encoding macros. This also fixes a latent bug in that x86-64 used a 6-bit mask in __swp_type(), and if MAX_SWAPFILES_SHIFT was increased beyond 5 in (the seemingly unrelated) linux/swap.h, this would have resulted in a collision with _PAGE_FILE. Non-PAE 32-bit code gets similarly adjusted for its pte_to_pgoff() and pgoff_to_pte() calculations. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable-2level.h | 50 ++++++++++++++++++++++++++++------- arch/x86/include/asm/pgtable-3level.h | 1 + arch/x86/include/asm/pgtable.h | 14 +++++----- arch/x86/include/asm/pgtable_64.h | 20 +++++++++++--- 4 files changed, 66 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index b17edfd2362..e0d199fe1d8 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -56,23 +56,55 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #define pte_none(x) (!(x).pte_low) /* - * Bits 0, 6 and 7 are taken, split up the 29 bits of offset - * into this range: + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, + * split up the 29 bits of offset into this range: */ #define PTE_FILE_MAX_BITS 29 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#else +#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1) +#endif +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) #define pte_to_pgoff(pte) \ - ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5)) + ((((pte).pte_low >> PTE_FILE_SHIFT1) \ + & ((1U << PTE_FILE_BITS1) - 1)) \ + + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ + & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ + + (((pte).pte_low >> PTE_FILE_SHIFT3) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) #define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \ - (((off) >> 5) << 8) + _PAGE_FILE }) + ((pte_t) { .pte_low = \ + (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \ + << PTE_FILE_SHIFT2) \ + + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + << PTE_FILE_SHIFT3) \ + + _PAGE_FILE }) /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x1f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 52597aeadff..447da43cddb 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -166,6 +166,7 @@ static inline int pte_none(pte_t pte) #define PTE_FILE_MAX_BITS 32 /* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c012f3b1167..b7c2ecdb765 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -10,7 +10,6 @@ #define _PAGE_BIT_PCD 4 /* page cache disabled */ #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ -#define _PAGE_BIT_FILE 6 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ @@ -22,6 +21,12 @@ #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +/* If _PAGE_BIT_PRESENT is clear, we use these: */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY + #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) @@ -46,11 +51,8 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -/* If _PAGE_PRESENT is clear, we use these: */ -#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, - * saved PTE; unset:swap */ -#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE; - pte_present gives true */ +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 545a0e042bb..65b6be6677c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -250,10 +250,22 @@ static inline int pud_large(pud_t pte) extern int direct_gbpages; /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x3f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \ - ((offset) << 8) }) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) -- cgit v1.2.3 From b93a531e315e97ef00367099e6b5f19651936e20 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:40:27 +0000 Subject: allow bug table entries to use relative pointers (and use it on x86-64) Impact: reduce bug table size This allows reducing the bug table size by half. Perhaps there are other 64-bit architectures that could also make use of this. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/bug.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f..ab98cca84e1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -87,6 +87,10 @@ config GENERIC_IOMAP config GENERIC_BUG def_bool y depends on BUG + select GENERIC_BUG_RELATIVE_POINTERS if X86_64 + +config GENERIC_BUG_RELATIVE_POINTERS + bool config GENERIC_HWEIGHT def_bool y diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 3def2065fce..d9cf1cd156d 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -9,7 +9,7 @@ #ifdef CONFIG_X86_32 # define __BUG_C0 "2:\t.long 1b, %c0\n" #else -# define __BUG_C0 "2:\t.quad 1b, %c0\n" +# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n" #endif #define BUG() \ -- cgit v1.2.3 From d6be89ad660c5d03edef91715093d447025df59b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:42:45 +0000 Subject: x86, 32-bit: simplify alloc_low_page() Impact: cleanup Neither of the callers really needs the physical address this function returns, so eliminate the pointless argument. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3ffed259883..333c9e79d46 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,7 +67,7 @@ static unsigned long __meminitdata table_top; static int __initdata after_init_bootmem; -static __init void *alloc_low_page(unsigned long *phys) +static __init void *alloc_low_page(void) { unsigned long pfn = table_end++; void *adr; @@ -77,7 +77,6 @@ static __init void *alloc_low_page(unsigned long *phys) adr = __va(pfn * PAGE_SIZE); memset(adr, 0, PAGE_SIZE); - *phys = pfn * PAGE_SIZE; return adr; } @@ -92,12 +91,11 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - unsigned long phys; if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_init_bootmem) pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); else - pmd_table = (pmd_t *)alloc_low_page(&phys); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -128,10 +126,8 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!page_table) page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); - } else { - unsigned long phys; - page_table = (pte_t *)alloc_low_page(&phys); - } + } else + page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); -- cgit v1.2.3 From beeb4195cbc80b7489631361b7ed38b7518af433 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:45:56 +0000 Subject: x86, 32-bit: add some compile time checks to mem_init() Some of the inconsistencies checked for at run time can be detected at build time already, so duplicate the checks done at run time to also be done at build time. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c483f424207..d3a45d54547 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1040,11 +1040,25 @@ void __init mem_init(void) (unsigned long)&_text, (unsigned long)&_etext, ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + /* + * Check boundaries twice: Some fundamental inconsistencies can + * be detected at build time already. + */ +#define __FIXADDR_TOP (-PAGE_SIZE) +#ifdef CONFIG_HIGHMEM + BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif +#define high_memory (-128UL << 20) + BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); +#undef high_memory +#undef __FIXADDR_TOP + #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); BUG_ON(VMALLOC_END > PKMAP_BASE); #endif - BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON(VMALLOC_START >= VMALLOC_END); BUG_ON((unsigned long)high_memory > VMALLOC_START); if (boot_cpu_data.wp_works_ok < 0) -- cgit v1.2.3 From cfc319833b5b359bf3bce99564dbac00af7925ac Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:46:58 +0000 Subject: x86, 32-bit: improve lazy TLB handling code Impact: micro-optimize the 32-bit TLB flush code Use the faster x86_{read,write}_percpu() accessors here. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context_32.h | 13 ++++++------- arch/x86/kernel/tlb_32.c | 11 +++++------ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h index 8e10015781f..7e98ce1d2c0 100644 --- a/arch/x86/include/asm/mmu_context_32.h +++ b/arch/x86/include/asm/mmu_context_32.h @@ -4,9 +4,8 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP - unsigned cpu = smp_processor_id(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY); #endif } @@ -20,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev, /* stop flush ipis for the previous mm */ cpu_clear(cpu, prev->cpu_vm_mask); #ifdef CONFIG_SMP - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - per_cpu(cpu_tlbstate, cpu).active_mm = next; + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + x86_write_percpu(cpu_tlbstate.active_mm, next); #endif cpu_set(cpu, next->cpu_vm_mask); @@ -36,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev, } #ifdef CONFIG_SMP else { - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next); if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { /* We were in lazy tlb mode and leave_mm disabled diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index f4049f3513b..4290d918b58 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -34,9 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock); */ void leave_mm(int cpu) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); + cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -104,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) * BUG(); */ - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@ -238,7 +237,7 @@ static void do_flush_tlb_all(void *info) unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(cpu); } -- cgit v1.2.3 From 83fd5cc6481c6b7fa8b45f8a7e0aa7120213430b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 16 Dec 2008 19:17:11 +0100 Subject: AMD IOMMU: allocate rlookup_table with __GFP_ZERO Impact: fix bug which can lead to panic in prealloc_protection_domains() Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 30ae2701b3d..c90a15eba5c 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1074,7 +1074,8 @@ int __init amd_iommu_init(void) goto free; /* IOMMU rlookup table - find the IOMMU for a specific device */ - amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_rlookup_table = (void *)__get_free_pages( + GFP_KERNEL | __GFP_ZERO, get_order(rlookup_table_size)); if (amd_iommu_rlookup_table == NULL) goto free; -- cgit v1.2.3 From b6fd6f26733e864fba2ea3eb1d716e23d2e66f3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Dec 2008 19:23:36 +0100 Subject: x86, mm: limit MAXMEM on 64-bit on 64-bit x86 the physical memory limit is controlled by the sparsemem bits - which are 44 bits right now. But MAXMEM (the max pfn number e820 parsing will allow to enter our sizing routines) is set to 0x00003fffffffffff, i.e. 46 bits - that's too large because it overlaps into the vmalloc range. So couple MAXMEM to MAX_PHYSMEM_BITS, and add a comment that the maximum of MAX_PHYSMEM_BITS is 45 bits. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64.h | 2 +- arch/x86/include/asm/sparsemem.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 65b6be6677c..c54ba69608b 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -146,7 +146,7 @@ static inline void native_pgd_clear(pgd_t *pgd) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) -#define MAXMEM _AC(0x00003fffffffffff, UL) +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define VMALLOC_START _AC(0xffffc20000000000, UL) #define VMALLOC_END _AC(0xffffe1ffffffffff, UL) #define VMEMMAP_START _AC(0xffffe20000000000, UL) diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index be44f7dab39..e3cc3c063ec 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,7 +27,7 @@ #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ # define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 44 +# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ #endif #endif /* CONFIG_SPARSEMEM */ -- cgit v1.2.3 From 3c763fd77e66e55d029052da31df0abd9920cb1e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:07:47 +0100 Subject: x86: microcode_amd: fix wrong handling of equivalent CPU id Impact: fix bug resulting in non-loaded AMD microcode mc_header->processor_rev_id is a 2 byte value. Similar is true for equiv_cpu in an equiv_cpu_entry -- only 2 bytes are of interest. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 5f8e5d75a25..b5bc81470bc 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -62,7 +62,7 @@ struct microcode_header_amd { unsigned int mc_patch_data_checksum; unsigned int nb_dev_id; unsigned int sb_dev_id; - unsigned char processor_rev_id[2]; + u16 processor_rev_id; unsigned char nb_rev_id; unsigned char sb_rev_id; unsigned char bios_api_rev; @@ -125,7 +125,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) while (equiv_cpu_table[i].installed_cpu != 0) { if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { - equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; + equiv_cpu_id = equiv_cpu_table[i].equiv_cpu & 0xffff; break; } i++; @@ -137,21 +137,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 0; } - if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { - printk(KERN_ERR - "microcode: CPU%d patch does not match " - "(patch is %x, cpu extended is %x) \n", - cpu, mc_header->processor_rev_id[0], - (equiv_cpu_id & 0xff)); - return 0; - } - - if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { - printk(KERN_ERR "microcode: CPU%d patch does not match " - "(patch is %x, cpu base id is %x) \n", - cpu, mc_header->processor_rev_id[1], - ((equiv_cpu_id >> 16) & 0xff)); - + if (mc_header->processor_rev_id != equiv_cpu_id) { + printk(KERN_ERR "microcode: CPU%d patch does not match " + "(processor_rev_id: %x, eqiv_cpu_id: %x)\n", + cpu, mc_header->processor_rev_id, equiv_cpu_id); return 0; } -- cgit v1.2.3 From 2a3282a77b02fb47576ffbdb4867c8c6eeb83ed5 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:08:53 +0100 Subject: x86: microcode_amd: fix typos and trailing whitespaces in log messages Impact: fix printk typos Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index b5bc81470bc..83a9fa321d9 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -10,7 +10,7 @@ * This driver allows to upgrade microcode on AMD * family 0x10 and 0x11 processors. * - * Licensed unter the terms of the GNU General Public + * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ @@ -133,7 +133,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) if (!equiv_cpu_id) { printk(KERN_ERR "microcode: CPU%d cpu_id " - "not found in equivalent cpu table \n", cpu); + "not found in equivalent cpu table\n", cpu); return 0; } @@ -151,7 +151,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) NULL); if ((!nb_pci_dev) || (mc_header->nb_rev_id != nb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu); + printk(KERN_ERR "microcode: CPU%d NB mismatch\n", cpu); pci_dev_put(nb_pci_dev); return 0; } @@ -165,7 +165,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) NULL); if ((!sb_pci_dev) || (mc_header->sb_rev_id != sb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu); + printk(KERN_ERR "microcode: CPU%d SB mismatch\n", cpu); pci_dev_put(sb_pci_dev); return 0; } @@ -219,7 +219,7 @@ static void apply_microcode_amd(int cpu) } printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x \n", + "0x%x to 0x%x\n", cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); uci->cpu_sig.rev = rev; @@ -282,7 +282,7 @@ static int install_equiv_cpu_table(u8 *buf, if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { printk(KERN_ERR "microcode: error! " - "Wrong microcode equivalnet cpu table\n"); + "Wrong microcode equivalent cpu table\n"); return 0; } -- cgit v1.2.3 From be957763b01905d33b53cdd25c8df110f94f499a Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:11:23 +0100 Subject: x86: microcode_amd: fix checkpatch warnings/errors Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 83a9fa321d9..a8a0ec60055 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -32,9 +32,9 @@ #include #include #include +#include #include -#include #include #include @@ -225,7 +225,7 @@ static void apply_microcode_amd(int cpu) uci->cpu_sig.rev = rev; } -static void * get_next_ucode(u8 *buf, unsigned int size, +static void *get_next_ucode(u8 *buf, unsigned int size, int (*get_ucode_data)(void *, const void *, size_t), unsigned int *mc_size) { @@ -256,7 +256,8 @@ static void * get_next_ucode(u8 *buf, unsigned int size, mc = vmalloc(UCODE_MAX_SIZE); if (mc) { memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { + if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, + total_size)) { vfree(mc); mc = NULL; } else @@ -332,7 +333,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, unsigned int uninitialized_var(mc_size); struct microcode_header_amd *mc_header; - mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); + mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, + &mc_size); if (!mc) break; @@ -342,7 +344,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size, vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; - } else + } else vfree(mc); ucode_ptr += mc_size; @@ -354,9 +356,9 @@ static int generic_load_microcode(int cpu, void *data, size_t size, if (uci->mc) vfree(uci->mc); uci->mc = new_mc; - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); + pr_debug("microcode: CPU%d found a matching microcode " + "update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); } else vfree(new_mc); } @@ -383,7 +385,8 @@ static int request_microcode_fw(int cpu, struct device *device) ret = request_firmware(&firmware, fw_name, device); if (ret) { - printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); + printk(KERN_ERR "microcode: ucode data file %s load failed\n", + fw_name); return ret; } @@ -397,8 +400,8 @@ static int request_microcode_fw(int cpu, struct device *device) static int request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" - "is not supported\n"); + printk(KERN_WARNING "microcode: AMD microcode update via " + "/dev/cpu/microcode is not supported\n"); return -1; } -- cgit v1.2.3 From 8c135206c826095c852c16d94a0a74eeaf05c90d Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:13:00 +0100 Subject: x86: microcode_amd: fix compile warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix build warning CC arch/x86/kernel/microcode_amd.o arch/x86/kernel/microcode_amd.c: In function ‘request_microcode_fw’: arch/x86/kernel/microcode_amd.c:393: warning: passing argument 2 of ‘generic_load_microcode’ discards qualifiers from pointer target type (Respect "const" qualifier of firmware->data.) Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index a8a0ec60055..89b386c901f 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -225,8 +225,8 @@ static void apply_microcode_amd(int cpu) uci->cpu_sig.rev = rev; } -static void *get_next_ucode(u8 *buf, unsigned int size, - int (*get_ucode_data)(void *, const void *, size_t), +static void *get_next_ucode(const u8 *buf, unsigned int size, + int (*get_ucode_data)(void *, const u8 *, size_t), unsigned int *mc_size) { unsigned int total_size; @@ -268,8 +268,8 @@ static void *get_next_ucode(u8 *buf, unsigned int size, } -static int install_equiv_cpu_table(u8 *buf, - int (*get_ucode_data)(void *, const void *, size_t)) +static int install_equiv_cpu_table(const u8 *buf, + int (*get_ucode_data)(void *, const u8 *, size_t)) { #define UCODE_CONTAINER_HEADER_SIZE 12 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; @@ -311,11 +311,13 @@ static void free_equiv_cpu_table(void) } } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static int generic_load_microcode(int cpu, const u8 *data, size_t size, + int (*get_ucode_data)(void *, const u8 *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc; + const u8 *ucode_ptr = data; + void *new_mc = NULL; + void *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; @@ -368,7 +370,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size, return (int)leftover; } -static int get_ucode_fw(void *to, const void *from, size_t n) +static int get_ucode_fw(void *to, const u8 *from, size_t n) { memcpy(to, from, n); return 0; @@ -390,7 +392,7 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } - ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, + ret = generic_load_microcode(cpu, firmware->data, firmware->size, &get_ucode_fw); release_firmware(firmware); -- cgit v1.2.3 From 0657d9ebff186dcdb17e582dcb909028775a7707 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:14:05 +0100 Subject: x86: microcode_amd: don't pass superfluous function pointer for get_ucode_data Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 89b386c901f..c7f225c7e48 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -225,9 +225,14 @@ static void apply_microcode_amd(int cpu) uci->cpu_sig.rev = rev; } +static int get_ucode_data(void *to, const u8 *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + static void *get_next_ucode(const u8 *buf, unsigned int size, - int (*get_ucode_data)(void *, const u8 *, size_t), - unsigned int *mc_size) + unsigned int *mc_size) { unsigned int total_size; #define UCODE_CONTAINER_SECTION_HDR 8 @@ -268,8 +273,7 @@ static void *get_next_ucode(const u8 *buf, unsigned int size, } -static int install_equiv_cpu_table(const u8 *buf, - int (*get_ucode_data)(void *, const u8 *, size_t)) +static int install_equiv_cpu_table(const u8 *buf) { #define UCODE_CONTAINER_HEADER_SIZE 12 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; @@ -311,8 +315,7 @@ static void free_equiv_cpu_table(void) } } -static int generic_load_microcode(int cpu, const u8 *data, size_t size, - int (*get_ucode_data)(void *, const u8 *, size_t)) +static int generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; const u8 *ucode_ptr = data; @@ -322,7 +325,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size, unsigned int leftover; unsigned long offset; - offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); + offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); return -EINVAL; @@ -335,8 +338,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size, unsigned int uninitialized_var(mc_size); struct microcode_header_amd *mc_header; - mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, - &mc_size); + mc = get_next_ucode(ucode_ptr, leftover, &mc_size); if (!mc) break; @@ -370,12 +372,6 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size, return (int)leftover; } -static int get_ucode_fw(void *to, const u8 *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static int request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; @@ -392,8 +388,7 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } - ret = generic_load_microcode(cpu, firmware->data, firmware->size, - &get_ucode_fw); + ret = generic_load_microcode(cpu, firmware->data, firmware->size); release_firmware(firmware); -- cgit v1.2.3 From 29d0887ffd084cde9d6a1286cb82b71701a974dd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:16:34 +0100 Subject: x86: microcode_amd: replace inline asm by common rdmsr/wrmsr functions Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/microcode_amd.c | 23 +++++------------------ 2 files changed, 7 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e38859d577a..cb58643947b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -85,7 +85,9 @@ /* AMD64 MSRs. Not complete. See the architecture manual for a more complete list. */ +#define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c7f225c7e48..2856955ddab 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -93,6 +93,7 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); + u32 dummy; memset(csig, 0, sizeof(*csig)); @@ -102,9 +103,7 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return -1; } - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (csig->rev) - : "i" (0x0000008B) : "ecx"); + rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", csig->rev); @@ -181,12 +180,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev) static void apply_microcode_amd(int cpu) { unsigned long flags; - unsigned int eax, edx; - unsigned int rev; + u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; struct microcode_amd *mc_amd = uci->mc; - unsigned long addr; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -195,19 +192,9 @@ static void apply_microcode_amd(int cpu) return; spin_lock_irqsave(µcode_update_lock, flags); - - addr = (unsigned long)&mc_amd->hdr.data_code; - edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); - eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); - - asm volatile("movl %0, %%ecx; wrmsr" : - : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); - + wrmsrl(MSR_AMD64_PATCH_LOADER, &mc_amd->hdr.data_code); /* get patch id after patching */ - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (rev) - : "i" (0x0000008B) : "ecx"); - + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ -- cgit v1.2.3 From 6cc9b6d94b6fee23b0671970f67d297fa76b68b3 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:17:45 +0100 Subject: x86: microcode_amd: consolidate macro definitions Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 2856955ddab..e68e723490a 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -75,15 +75,9 @@ struct microcode_amd { unsigned int mpb[0]; }; -#define UCODE_MAX_SIZE (2048) -#define DEFAULT_UCODE_DATASIZE (896) -#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) -#define DWSIZE (sizeof(u32)) -/* For now we support a fixed ucode total size only */ -#define get_totalsize(mc) \ - ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ - + MC_HEADER_SIZE) +#define UCODE_MAX_SIZE 2048 +#define UCODE_CONTAINER_SECTION_HDR 8 +#define UCODE_CONTAINER_HEADER_SIZE 12 /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); @@ -222,7 +216,6 @@ static void *get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) { unsigned int total_size; -#define UCODE_CONTAINER_SECTION_HDR 8 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; void *mc; @@ -255,14 +248,12 @@ static void *get_next_ucode(const u8 *buf, unsigned int size, } else *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; } -#undef UCODE_CONTAINER_SECTION_HDR return mc; } static int install_equiv_cpu_table(const u8 *buf) { -#define UCODE_CONTAINER_HEADER_SIZE 12 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; unsigned int *buf_pos = (unsigned int *)container_hdr; unsigned long size; @@ -291,7 +282,6 @@ static int install_equiv_cpu_table(const u8 *buf) } return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ -#undef UCODE_CONTAINER_HEADER_SIZE } static void free_equiv_cpu_table(void) -- cgit v1.2.3 From 98415301ea2dd389539ab429bcfa9da07219eabc Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:20:21 +0100 Subject: x86: microcode_amd: remove (wrong) chipset deivce ID checks Impact: remove dead/incorrect code Currently there is no chipset specific ucode. The checks are incorrect anyway (e.g. pci device IDs are 16 bit and not 8 bit). Thus I remove the stuff for the time being and will reintroduce it if it's foreseeable that it is really needed. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e68e723490a..2e8af6ef3da 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -108,7 +108,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) static int get_matching_microcode(int cpu, void *mc, int rev) { struct microcode_header_amd *mc_header = mc; - struct pci_dev *nb_pci_dev, *sb_pci_dev; unsigned int current_cpu_id; unsigned int equiv_cpu_id = 0x00; unsigned int i = 0; @@ -137,32 +136,11 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 0; } - /* ucode may be northbridge specific */ - if (mc_header->nb_dev_id) { - nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, - (mc_header->nb_dev_id & 0xff), - NULL); - if ((!nb_pci_dev) || - (mc_header->nb_rev_id != nb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d NB mismatch\n", cpu); - pci_dev_put(nb_pci_dev); - return 0; - } - pci_dev_put(nb_pci_dev); - } - - /* ucode may be southbridge specific */ - if (mc_header->sb_dev_id) { - sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, - (mc_header->sb_dev_id & 0xff), - NULL); - if ((!sb_pci_dev) || - (mc_header->sb_rev_id != sb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d SB mismatch\n", cpu); - pci_dev_put(sb_pci_dev); - return 0; - } - pci_dev_put(sb_pci_dev); + /* ucode might be chipset specific -- currently we don't support this */ + if (mc_header->nb_dev_id || mc_header->sb_dev_id) { + printk(KERN_WARNING "microcode: CPU%d loading of chipset " + "specific code not yet supported\n", cpu); + return 0; } if (mc_header->patch_id <= rev) -- cgit v1.2.3 From 5549b94bc74c3e7edd44e0aeb7d9f773e82d2d20 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:21:30 +0100 Subject: x86: microcode_amd: use 'packed' attribute for structs Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 45 +++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 2e8af6ef3da..e1ce650f276 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -47,28 +47,29 @@ MODULE_LICENSE("GPL v2"); #define UCODE_UCODE_TYPE 0x00000001 struct equiv_cpu_entry { - unsigned int installed_cpu; - unsigned int fixed_errata_mask; - unsigned int fixed_errata_compare; - unsigned int equiv_cpu; -}; + u32 installed_cpu; + u32 fixed_errata_mask; + u32 fixed_errata_compare; + u16 equiv_cpu; + u16 res; +} __attribute__((packed)); struct microcode_header_amd { - unsigned int data_code; - unsigned int patch_id; - unsigned char mc_patch_data_id[2]; - unsigned char mc_patch_data_len; - unsigned char init_flag; - unsigned int mc_patch_data_checksum; - unsigned int nb_dev_id; - unsigned int sb_dev_id; - u16 processor_rev_id; - unsigned char nb_rev_id; - unsigned char sb_rev_id; - unsigned char bios_api_rev; - unsigned char reserved1[3]; - unsigned int match_reg[8]; -}; + u32 data_code; + u32 patch_id; + u16 mc_patch_data_id; + u8 mc_patch_data_len; + u8 init_flag; + u32 mc_patch_data_checksum; + u32 nb_dev_id; + u32 sb_dev_id; + u16 processor_rev_id; + u8 nb_rev_id; + u8 sb_rev_id; + u8 bios_api_rev; + u8 reserved1[3]; + u32 match_reg[8]; +} __attribute__((packed)); struct microcode_amd { struct microcode_header_amd hdr; @@ -109,7 +110,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) { struct microcode_header_amd *mc_header = mc; unsigned int current_cpu_id; - unsigned int equiv_cpu_id = 0x00; + u16 equiv_cpu_id = 0; unsigned int i = 0; BUG_ON(equiv_cpu_table == NULL); @@ -117,7 +118,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) while (equiv_cpu_table[i].installed_cpu != 0) { if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { - equiv_cpu_id = equiv_cpu_table[i].equiv_cpu & 0xffff; + equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; break; } i++; -- cgit v1.2.3 From df23cab563912ba43f7e9bc8ac517e5a2ddc9cd2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:22:36 +0100 Subject: x86: microcode_amd: modify log messages Impact: change microcode printk content Change log level and provide (at least I tried to;-) consistent, short, meaningful content. Signed-off-by: Andreas Herrmann Cc: Dmitry Adamushko Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 58 ++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e1ce650f276..24c256f4e50 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -91,18 +91,13 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) u32 dummy; memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", - cpu); + printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); return -1; } - rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - - printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", - csig->rev); - + printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } @@ -125,21 +120,21 @@ static int get_matching_microcode(int cpu, void *mc, int rev) } if (!equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d cpu_id " - "not found in equivalent cpu table\n", cpu); + printk(KERN_WARNING "microcode: CPU%d: cpu revision " + "not listed in equivalent cpu table\n", cpu); return 0; } if (mc_header->processor_rev_id != equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d patch does not match " - "(processor_rev_id: %x, eqiv_cpu_id: %x)\n", + printk(KERN_ERR "microcode: CPU%d: patch mismatch " + "(processor_rev_id: %x, equiv_cpu_id: %x)\n", cpu, mc_header->processor_rev_id, equiv_cpu_id); return 0; } /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - printk(KERN_WARNING "microcode: CPU%d loading of chipset " + printk(KERN_ERR "microcode: CPU%d: loading of chipset " "specific code not yet supported\n", cpu); return 0; } @@ -172,15 +167,13 @@ static void apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", cpu_num, - mc_amd->hdr.patch_id, rev); + printk(KERN_ERR "microcode: CPU%d: update failed " + "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); return; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x\n", - cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); + printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", + cpu, rev); uci->cpu_sig.rev = rev; } @@ -202,18 +195,18 @@ static void *get_next_ucode(const u8 *buf, unsigned int size, return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - printk(KERN_ERR "microcode: error! " - "Wrong microcode payload type field\n"); + printk(KERN_ERR "microcode: error: invalid type field in " + "container file section header\n"); return NULL; } total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); - printk(KERN_INFO "microcode: size %u, total_size %u\n", - size, total_size); + printk(KERN_DEBUG "microcode: size %u, total_size %u\n", + size, total_size); if (total_size > size || total_size > UCODE_MAX_SIZE) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + printk(KERN_ERR "microcode: error: size mismatch\n"); return NULL; } @@ -243,14 +236,15 @@ static int install_equiv_cpu_table(const u8 *buf) size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - printk(KERN_ERR "microcode: error! " - "Wrong microcode equivalent cpu table\n"); + printk(KERN_ERR "microcode: error: invalid type field in " + "container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); + printk(KERN_ERR "microcode: failed to allocate " + "equivalent CPU table\n"); return 0; } @@ -283,7 +277,8 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); + printk(KERN_ERR "microcode: failed to create " + "equivalent cpu table\n"); return -EINVAL; } @@ -339,8 +334,7 @@ static int request_microcode_fw(int cpu, struct device *device) ret = request_firmware(&firmware, fw_name, device); if (ret) { - printk(KERN_ERR "microcode: ucode data file %s load failed\n", - fw_name); + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); return ret; } @@ -353,8 +347,8 @@ static int request_microcode_fw(int cpu, struct device *device) static int request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_WARNING "microcode: AMD microcode update via " - "/dev/cpu/microcode is not supported\n"); + printk(KERN_INFO "microcode: AMD microcode update via " + "/dev/cpu/microcode not supported\n"); return -1; } -- cgit v1.2.3 From d4377974062122d6d9be0bbd8a910a0954714194 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Dec 2008 20:59:24 +0100 Subject: x86: support always running TSC on Intel CPUs, add cpufeature definition Impact: add new synthetic-cpuid bit definition add X86_FEATURE_NONSTOP_TSC to the cpufeature bits - this is in preparation of Venki's always-running-TSC patch. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 5bce8ed02b4..ea408dcba51 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -92,6 +92,7 @@ #define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ -- cgit v1.2.3 From 40fb17152c50a69dc304dd632131c2f41281ce44 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Mon, 17 Nov 2008 16:11:37 -0800 Subject: x86: support always running TSC on Intel CPUs Impact: reward non-stop TSCs with good TSC-based clocksources, etc. Add support for CPUID_0x80000007_Bit8 on Intel CPUs as well. This bit means that the TSC is invariant with C/P/T states and always runs at constant frequency. With Intel CPUs, we have 3 classes * CPUs where TSC runs at constant rate and does not stop n C-states * CPUs where TSC runs at constant rate, but will stop in deep C-states * CPUs where TSC rate will vary based on P/T-states and TSC will stop in deep C-states. To cover these 3, one feature bit (CONSTANT_TSC) is not enough. So, add a second bit (NONSTOP_TSC). CONSTANT_TSC indicates that the TSC runs at constant frequency irrespective of P/T-states, and NONSTOP_TSC indicates that TSC does not stop in deep C-states. CPUID_0x8000000_Bit8 indicates both these feature bit can be set. We still have CONSTANT_TSC _set_ and NONSTOP_TSC _not_set_ on some older Intel CPUs, based on model checks. We can use TSC on such CPUs for time, as long as those CPUs do not support/enter deep C-states. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 9 +++++++-- arch/x86/kernel/cpu/intel.c | 10 ++++++++++ arch/x86/kernel/process.c | 2 +- 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8f1e31db2ad..7c878f6aa91 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -283,9 +283,14 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cce0b6118d5..caec59437a2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -41,6 +41,16 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86 == 15 && c->x86_cache_alignment == 64) c->x86_cache_alignment = 128; #endif + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d..18c70fedba3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -270,7 +270,7 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); -- cgit v1.2.3 From f63c2f248959366cd11bfa476f866737047cf663 Mon Sep 17 00:00:00 2001 From: Tej Date: Tue, 16 Dec 2008 11:56:06 -0800 Subject: xen: whitespace/checkpatch cleanup Impact: cleanup Signed-off-by: Tej Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 16 +++++++++------- arch/x86/xen/mmu.c | 17 ++++++++++------- arch/x86/xen/multicalls.c | 2 +- arch/x86/xen/setup.c | 9 +++++---- 4 files changed, 25 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5e4686d70f6..86cd2f82968 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -793,7 +793,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) ret = 0; - switch(msr) { + switch (msr) { #ifdef CONFIG_X86_64 unsigned which; u64 base; @@ -1453,7 +1453,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) ident_pte = 0; pfn = 0; - for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { pte_t *pte_page; /* Reuse or allocate a page of ptes */ @@ -1471,7 +1471,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) } /* Install mappings */ - for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; if (pfn > max_pfn_mapped) @@ -1485,7 +1485,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) } } - for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); set_page_prot(pmd, PAGE_KERNEL_RO); @@ -1499,7 +1499,7 @@ static void convert_pfn_mfn(void *v) /* All levels are converted the same way, so just treat them as ptes. */ - for(i = 0; i < PTRS_PER_PTE; i++) + for (i = 0; i < PTRS_PER_PTE; i++) pte[i] = xen_make_pte(pte[i].pte); } @@ -1514,7 +1514,8 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) { pud_t *l3; pmd_t *l2; @@ -1577,7 +1578,8 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf #else /* !CONFIG_X86_64 */ static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) { pmd_t *kernel_pmd; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 636ef4caa52..773d68d3e91 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -154,13 +154,13 @@ void xen_setup_mfn_list_list(void) { unsigned pfn, idx; - for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -179,7 +179,7 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top[topidx] = &mfn_list[pfn]; @@ -207,7 +207,7 @@ static void alloc_p2m(unsigned long **pp, unsigned long *mfnp) p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); BUG_ON(p == NULL); - for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p[i] = INVALID_P2M_ENTRY; if (cmpxchg(pp, p2m_missing, p) != p2m_missing) @@ -407,7 +407,8 @@ out: preempt_enable(); } -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. We preserve the bits on commit */ return *ptep; @@ -878,7 +879,8 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) if (user_pgd) { xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); - xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ @@ -993,7 +995,8 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) pgd_t *user_pgd = xen_get_user_pgd(pgd); if (user_pgd) { - xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(user_pgd))); xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 8ea8a0d0b0d..c738644b543 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -154,7 +154,7 @@ void xen_mc_flush(void) ret, smp_processor_id()); dump_stack(); for (i = 0; i < b->mcidx; i++) { - printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", + printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n", i+1, b->mcidx, b->debug[i].op, b->debug[i].args[0], diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index d6790108388..15c6c68db6a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -28,6 +28,9 @@ /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +extern void xen_sysenter_target(void); +extern void xen_syscall_target(void); +extern void xen_syscall32_target(void); /** @@ -110,7 +113,6 @@ static __cpuinit int register_callback(unsigned type, const void *func) void __cpuinit xen_enable_sysenter(void) { - extern void xen_sysenter_target(void); int ret; unsigned sysenter_feature; @@ -132,8 +134,6 @@ void __cpuinit xen_enable_syscall(void) { #ifdef CONFIG_X86_64 int ret; - extern void xen_syscall_target(void); - extern void xen_syscall32_target(void); ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); if (ret != 0) { @@ -160,7 +160,8 @@ void __init xen_arch_setup(void) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); if (!xen_feature(XENFEAT_auto_translated_physmap)) - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_pae_extended_cr3); if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) -- cgit v1.2.3 From aab02f0ae20b8fe0fe891e9f107c6e392256ca01 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 22:23:54 +0530 Subject: x86: process_64.c declare __switch_to() and sys_arch_prctl before they get used Impact: cleanup In asm/system.h moved out __switch_to from CONFIG_X86_32 as it is common for both 32 and 64 bit. In asm/pctl.h defined sys_arch_prctl Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/prctl.h | 3 +++ arch/x86/include/asm/system.h | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h index fe681147a4f..a8894647dd9 100644 --- a/arch/x86/include/asm/prctl.h +++ b/arch/x86/include/asm/prctl.h @@ -6,5 +6,8 @@ #define ARCH_GET_FS 0x1003 #define ARCH_GET_GS 0x1004 +#ifdef CONFIG_X86_64 +extern long sys_arch_prctl(int, unsigned long); +#endif /* CONFIG_X86_64 */ #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2ed3f0f44ff..59555f48bf4 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -17,12 +17,12 @@ # define AT_VECTOR_SIZE_ARCH 1 #endif -#ifdef CONFIG_X86_32 - struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); +#ifdef CONFIG_X86_32 + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. -- cgit v1.2.3 From 7b5b50f1be9e07714cfaa620d102c8daf3cdd814 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 22:24:48 +0530 Subject: x86: signal.c declare do_notify_resume before they get used Impact: cleanup In asm/signal.h moved out do_notify_resume from __i386__ as it is common for both 32 and 64 bit. Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar arch/x86/include/asm/signal.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) --- arch/x86/include/asm/signal.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 96ac44f275d..7761a5d554b 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -121,6 +121,10 @@ typedef unsigned long sigset_t; #ifndef __ASSEMBLY__ +# ifdef __KERNEL__ +extern void do_notify_resume(struct pt_regs *, void *, __u32); +# endif /* __KERNEL__ */ + #ifdef __i386__ # ifdef __KERNEL__ struct old_sigaction { @@ -141,8 +145,6 @@ struct k_sigaction { struct sigaction sa; }; -extern void do_notify_resume(struct pt_regs *, void *, __u32); - # else /* __KERNEL__ */ /* Here we must cater to libcs that poke about in kernel headers. */ -- cgit v1.2.3 From c0195b6da08c4ddd8c8ea830f6c3c40bc7f82071 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 22:26:30 +0530 Subject: x86: ldt.c declare sys_modify_ldt before they get used Impact: cleanup In asm/syscalls.h moved out sys_modify_ldt from CONFIG_X86_32 as it is common for both 32 and 64 bit. Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/syscalls.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 87803da4401..75d4a6afc36 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -19,6 +19,9 @@ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +/* kernel/ldt.c */ +asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); + /* X86_32 only */ #ifdef CONFIG_X86_32 /* kernel/process_32.c */ @@ -38,9 +41,6 @@ asmlinkage int sys_rt_sigreturn(unsigned long); /* kernel/ioport.c */ asmlinkage long sys_iopl(unsigned long); -/* kernel/ldt.c */ -asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); - /* kernel/sys_i386_32.c */ asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -- cgit v1.2.3 From a9b43c7d9890066709609df849959009645c1a19 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 23:11:10 +0530 Subject: x86: setup.c find_and_reserve_crashkernel should be static Impact: cleanup, reduce kernel size a bit Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d5674f7b6c..81f5d22747a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -448,6 +448,7 @@ static void __init reserve_early_setup_data(void) * @size: Size of the crashkernel memory to reserve. * Returns the base address on success, and -1ULL on failure. */ +static unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) { const unsigned long long alignment = 16<<20; /* 16M */ -- cgit v1.2.3 From ecbf29cdb3990c83d90d0c4187c89fb2ce423367 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 16 Dec 2008 12:37:07 -0800 Subject: xen: clean up asm/xen/hypervisor.h Impact: cleanup hypervisor.h had accumulated a lot of crud, including lots of spurious #includes. Clean it all up, and go around fixing up everything else accordingly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xen/hypercall.h | 6 ++++++ arch/x86/include/asm/xen/hypervisor.h | 39 +++++++---------------------------- arch/x86/include/asm/xen/page.h | 5 +++++ arch/x86/xen/enlighten.c | 1 + 4 files changed, 20 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 3f6000d95fe..5e79ca69432 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -33,8 +33,14 @@ #ifndef _ASM_X86_XEN_HYPERCALL_H #define _ASM_X86_XEN_HYPERCALL_H +#include +#include #include #include +#include + +#include +#include #include #include diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index a38d25ac87d..81fbd735aec 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -33,39 +33,10 @@ #ifndef _ASM_X86_XEN_HYPERVISOR_H #define _ASM_X86_XEN_HYPERVISOR_H -#include -#include - -#include -#include - -#include -#include -#include -#if defined(__i386__) -# ifdef CONFIG_X86_PAE -# include -# else -# include -# endif -#endif -#include - /* arch/i386/kernel/setup.c */ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; -/* arch/i386/mach-xen/evtchn.c */ -/* Force a proper event-channel callback from Xen. */ -extern void force_evtchn_callback(void); - -/* Turn jiffies into Xen system time. */ -u64 jiffies_to_st(unsigned long jiffies); - - -#define MULTI_UVMFLAGS_INDEX 3 -#define MULTI_UVMDOMID_INDEX 4 - enum xen_domain_type { XEN_NATIVE, XEN_PV_DOMAIN, @@ -74,9 +45,15 @@ enum xen_domain_type { extern enum xen_domain_type xen_domain_type; +#ifdef CONFIG_XEN #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) +#else +#define xen_domain() (0) +#endif + +#define xen_pv_domain() (xen_domain() && xen_domain_type == XEN_PV_DOMAIN) +#define xen_hvm_domain() (xen_domain() && xen_domain_type == XEN_HVM_DOMAIN) + #define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN) -#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bc628998a1b..7ef617ef1df 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -1,11 +1,16 @@ #ifndef _ASM_X86_XEN_PAGE_H #define _ASM_X86_XEN_PAGE_H +#include +#include +#include #include #include +#include #include +#include #include /* Xen machine address */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 86cd2f82968..bea215230b2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 8ae936690972dfcad73d0dde1095b9f32af5ee95 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 12 Dec 2008 15:52:26 -0800 Subject: x86: hardirq: use inc_irq_stat() in non-unified functions Impact: cleanup Replace incrementing irq stat with inc_irq_stat() in non-unified functions. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- arch/x86/kernel/time_32.c | 2 +- arch/x86/kernel/time_64.c | 2 +- arch/x86/kernel/tlb_32.c | 2 +- arch/x86/kernel/tlb_64.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 5eb390a4b2e..748c8f9e7a0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -237,7 +237,7 @@ asmlinkage void mce_threshold_interrupt(void) } } out: - add_pda(irq_threshold_count, 1); + inc_irq_stat(irq_threshold_count); irq_exit(); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index c17eaf5dd6d..4b48f251fd3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(void) if (therm_throt_process(msr_val & 1)) mce_log_therm_throt_event(smp_processor_id(), msr_val); - add_pda(irq_thermal_count, 1); + inc_irq_stat(irq_thermal_count); irq_exit(); } diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 77b400f06ea..65309e4cb1c 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc); irqreturn_t timer_interrupt(int irq, void *dev_id) { /* Keep nmi watchdog up to date */ - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++; + inc_irq_stat(irq0_irqs); #ifdef CONFIG_X86_IO_APIC if (timer_ack) { diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 418a095c579..1749cacde8b 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(profile_pc); irqreturn_t timer_interrupt(int irq, void *dev_id) { - add_pda(irq0_irqs, 1); + inc_irq_stat(irq0_irqs); global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index f4049f3513b..f374f83fca4 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -119,7 +119,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs) smp_mb__after_clear_bit(); out: put_cpu_no_resched(); - __get_cpu_var(irq_stat).irq_tlb_count++; + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 8f919ca6949..29887d7081a 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) out: ack_APIC_irq(); cpu_clear(cpu, f->flush_cpumask); - add_pda(irq_tlb_count, 1); + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, -- cgit v1.2.3 From fde9071167c4624281553b23232aa8b81e71c790 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 12 Dec 2008 11:26:35 -0800 Subject: x86: clean up dead code in vmi_32.c Impact: cleanup, remove dead debug code I ran across some old debugging code in vmi paravirt-ops code that was already dead, but still potentially useful. After reviewing recent changes to the way kernel page tables are allocated and initialized, and the lack of bugs caught by this debugging code, I've concluded it is now totally useless to have around, and it's already been #if 0'd for quite some time. There's no rush to get this in mainline, but it's also totally harmless, so I'll let the x86 maintainers decide where it should be tucked. I've been out of the mainstream dev loop for a couple months, so apologies if I haven't got any protocol changes in order. Remove mummified remains found in vmi_32.c Signed-off-by: Zachary Amsden Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmi_32.c | 119 ----------------------------------------------- 1 file changed, 119 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 8b6c393ab9f..8087e0cd877 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -266,109 +266,6 @@ static void vmi_nop(void) { } -#ifdef CONFIG_DEBUG_PAGE_TYPE - -#ifdef CONFIG_X86_PAE -#define MAX_BOOT_PTS (2048+4+1) -#else -#define MAX_BOOT_PTS (1024+1) -#endif - -/* - * During boot, mem_map is not yet available in paging_init, so stash - * all the boot page allocations here. - */ -static struct { - u32 pfn; - int type; -} boot_page_allocations[MAX_BOOT_PTS]; -static int num_boot_page_allocations; -static int boot_allocations_applied; - -void vmi_apply_boot_page_allocations(void) -{ - int i; - BUG_ON(!mem_map); - for (i = 0; i < num_boot_page_allocations; i++) { - struct page *page = pfn_to_page(boot_page_allocations[i].pfn); - page->type = boot_page_allocations[i].type; - page->type = boot_page_allocations[i].type & - ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - } - boot_allocations_applied = 1; -} - -static void record_page_type(u32 pfn, int type) -{ - BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS); - boot_page_allocations[num_boot_page_allocations].pfn = pfn; - boot_page_allocations[num_boot_page_allocations].type = type; - num_boot_page_allocations++; -} - -static void check_zeroed_page(u32 pfn, int type, struct page *page) -{ - u32 *ptr; - int i; - int limit = PAGE_SIZE / sizeof(int); - - if (page_address(page)) - ptr = (u32 *)page_address(page); - else - ptr = (u32 *)__va(pfn << PAGE_SHIFT); - /* - * When cloning the root in non-PAE mode, only the userspace - * pdes need to be zeroed. - */ - if (type & VMI_PAGE_CLONE) - limit = KERNEL_PGD_BOUNDARY; - for (i = 0; i < limit; i++) - BUG_ON(ptr[i]); -} - -/* - * We stash the page type into struct page so we can verify the page - * types are used properly. - */ -static void vmi_set_page_type(u32 pfn, int type) -{ - /* PAE can have multiple roots per page - don't track */ - if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) - return; - - if (boot_allocations_applied) { - struct page *page = pfn_to_page(pfn); - if (type != VMI_PAGE_NORMAL) - BUG_ON(page->type); - else - BUG_ON(page->type == VMI_PAGE_NORMAL); - page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - if (type & VMI_PAGE_ZEROED) - check_zeroed_page(pfn, type, page); - } else { - record_page_type(pfn, type); - } -} - -static void vmi_check_page_type(u32 pfn, int type) -{ - /* PAE can have multiple roots per page - skip checks */ - if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) - return; - - type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - if (boot_allocations_applied) { - struct page *page = pfn_to_page(pfn); - BUG_ON((page->type ^ type) & VMI_PAGE_PAE); - BUG_ON(type == VMI_PAGE_NORMAL && page->type); - BUG_ON((type & page->type) == 0); - } -} -#else -#define vmi_set_page_type(p,t) do { } while (0) -#define vmi_check_page_type(p,t) do { } while (0) -#endif - #ifdef CONFIG_HIGHPTE static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) { @@ -395,7 +292,6 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { - vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } @@ -406,27 +302,22 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) * It is called only for swapper_pg_dir, which already has * data on it. */ - vmi_set_page_type(pfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) { - vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); - vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } static void vmi_release_pte(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); - vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } static void vmi_release_pmd(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); - vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } /* @@ -450,26 +341,22 @@ static void vmi_release_pmd(unsigned long pfn) static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_set_pte(pte_t *ptep, pte_t pte) { /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD); vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); } static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } @@ -477,10 +364,8 @@ static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) { #ifdef CONFIG_X86_PAE const pte_t pte = { .pte = pmdval.pmd }; - vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); #else const pte_t pte = { pmdval.pud.pgd.pgd }; - vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD); #endif vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); } @@ -502,7 +387,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); } @@ -510,21 +394,18 @@ static void vmi_set_pud(pud_t *pudp, pud_t pudval) { /* Um, eww */ const pte_t pte = { .pte = pudval.pgd.pgd }; - vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); } static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { const pte_t pte = { .pte = 0 }; - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_pmd_clear(pmd_t *pmd) { const pte_t pte = { .pte = 0 }; - vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); } #endif -- cgit v1.2.3 From 189f67c4408806563a1f061f5c8bf184a6658477 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 12 Dec 2008 14:50:40 -0600 Subject: x86: UV fix for global physical addresses Impact: fix UV boot crash This fixes a UV bug related to generating global memory addresses on partitioned systems. Partition systems do not have physical memory at address 0. Instead, a chunk of high memory is remapped by the chipset so that it appears to be at address 0. This remapping is INVISIBLE to most of the OS. The only OS functions that need to be aware of the remaping are functions that directly interface to the chipset. The GRU is one example. Also, delete a couple of unused macros related to global memory addresses. Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 16 ++-------------- arch/x86/kernel/genx2apic_uv_x.c | 3 +-- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 52aa943c634..777327ef05c 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -210,7 +210,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) { if (paddr < uv_hub_info->lowmem_remap_top) - paddr += uv_hub_info->lowmem_remap_base; + paddr |= uv_hub_info->lowmem_remap_base; return paddr | uv_hub_info->gnode_upper; } @@ -218,19 +218,7 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) /* socket virtual --> UV global physical address */ static inline unsigned long uv_gpa(void *v) { - return __pa(v) | uv_hub_info->gnode_upper; -} - -/* socket virtual --> UV global physical address */ -static inline void *uv_vgpa(void *v) -{ - return (void *)uv_gpa(v); -} - -/* UV global physical address --> socket virtual */ -static inline void *uv_va(unsigned long gpa) -{ - return __va(gpa & uv_hub_info->gpa_mask); + return uv_soc_phys_ram_to_gpa(__pa(v)); } /* pnode, offset --> socket virtual */ diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 221299f4509..dece1728973 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -540,8 +540,7 @@ void __init uv_system_init(void) uv_blade_info[blade].nr_possible_cpus++; uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = - lowmem_redir_base + lowmem_redir_size; + uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; uv_cpu_hub_info(cpu)->n_val = m_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; -- cgit v1.2.3 From cf9b303e55da810255638c0b616b1a3f7eda9320 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Dec 2008 23:33:10 +0100 Subject: x86: re-enable MCE on secondary CPUS after suspend/resume Impact: fix disabled MCE after resume Don't prevent multiple initialization of MCEs. Back from early prehistory mcheck_init() has a reentry check. Presumably that was needed in very old kernels to prevent it entering twice. But as Andreas points out this prevents CPU hotplug (and therefore resume) to correctly reinitialize MCEs when a AP boots again after being offlined. Just drop the check. Reported-by: Andreas Herrmann Signed-off-by: Andi Kleen Tested-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_64.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4b031a4ac85..1c838032fd3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -510,12 +510,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - static cpumask_t mce_cpus = CPU_MASK_NONE; - mce_cpu_quirks(c); if (mce_dont_init || - cpu_test_and_set(smp_processor_id(), mce_cpus) || !mce_available(c)) return; -- cgit v1.2.3 From c8182f0016fb65a721c4fbe487909a2d56178135 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 12 Dec 2008 11:07:00 -0600 Subject: sgi-xp: xpc needs to pass the physical address, not virtual Impact: fix crash xpc needs to pass the physical address, not virtual. Testing uncovered this problem. The virtual address happens to work most of the time due to the way bios was masking off the node bits. Passing the physical address makes it work all of the time. Signed-off-by: Russ Anderson Acked-by: Dean Nelson Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/bios.h | 2 +- arch/x86/kernel/bios_uv.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index da1c4e8e78f..7ed17ff502b 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -100,7 +100,7 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); -extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, +extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index d22d0f1bbea..2a0a2a3cac2 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -101,15 +101,13 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, } int -uv_bios_mq_watchlist_alloc(int blade, void *mq, unsigned int mq_size, +uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, unsigned long *intr_mmr_offset) { union uv_watchlist_u size_blade; - unsigned long addr; u64 watchlist; s64 ret; - addr = (unsigned long)mq; size_blade.size = mq_size; size_blade.blade = blade; -- cgit v1.2.3 From ae417bb487e3bb88dc862b83b4bf00d87ba67ec8 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 16 Dec 2008 14:02:16 -0800 Subject: x86: signal: use signal_fault() in sys_sigreturn() Impact: cleanup Call signal_fault() in error route of sys_sigreturn(). Change log level to KERN_EMERG if current is init. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b1cc6da6420..2725a294d73 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -594,17 +594,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused) return ax; badframe: - if (show_unhandled_signals && printk_ratelimit()) { - printk("%s%s[%d] bad frame in sigreturn frame:" - "%p ip:%lx sp:%lx oeax:%lx", - task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->ip, - regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); - } - - force_sig(SIGSEGV, current); + signal_fault(regs, frame, "sigreturn"); return 0; } @@ -901,8 +891,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) struct task_struct *me = current; if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO + printk("%s" "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); -- cgit v1.2.3 From d0b48ca189523b638d8674fa41e94d1950a17038 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 16 Dec 2008 14:03:36 -0800 Subject: x86: ia32_signal: use __put_user() instead of __copy_to_user() Impact: cleanup __put_user() can be used for constant size 8, like arch/x86/kernel/signal.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 1267977e770..e4f2a504574 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -467,7 +467,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, * These are actually not used anymore, but left because some * gdb versions depend on them as a marker. */ - err |= __copy_to_user(frame->retcode, &code, 8); + err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode); if (err) return -EFAULT; @@ -554,7 +554,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * Not actually used anymore, but left because some gdb * versions need it. */ - err |= __copy_to_user(frame->retcode, &code, 8); + err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode); if (err) return -EFAULT; -- cgit v1.2.3 From 8bee3f0a662ad9c3d6bb705b0530a3b90f089c55 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 16 Dec 2008 14:04:43 -0800 Subject: x86: ia32_signal: use proper macro __USER32_DS Impact: cleanup Use __USER32_DS instead of __USER_DS in ia32_signal.c. No impact, because __USER32_DS is defined __USER_DS. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index e4f2a504574..9c99c429a20 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -396,7 +396,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, } /* This is the legacy signal stack switching. */ - else if ((regs->ss & 0xffff) != __USER_DS && + else if ((regs->ss & 0xffff) != __USER32_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) sp = (unsigned long) ka->sa.sa_restorer; -- cgit v1.2.3 From d680fe44775ed17a80035462d9898f5e77bfd7dd Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 13 Dec 2008 00:09:08 +0300 Subject: x86: entry_64 - introduce FTRACE_ frame macro v2 Impact: clean up Itroduce MCOUNT_SAVE/RESTORE_FRAME which allow us to save a number of lines on source level. Also fix a comment in ftrace.h. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 29 +++++++++++++++++++++- arch/x86/kernel/entry_64.S | 57 ++++++------------------------------------- 2 files changed, 35 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 7e61b4ceb9a..b55b4a7fbef 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -1,6 +1,33 @@ #ifndef _ASM_X86_FTRACE_H #define _ASM_X86_FTRACE_H +#ifdef __ASSEMBLY__ + + .macro MCOUNT_SAVE_FRAME + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + .endm + + .macro MCOUNT_RESTORE_FRAME + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + .endm + +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ @@ -46,7 +73,7 @@ struct ftrace_ret_stack { /* * Primary handler of a function return. * It relays on ftrace_return_to_handler. - * Defined in entry32.S + * Defined in entry_32/64.S */ extern void return_to_handler(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 54e0bbdccb9..303dd84d2a9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -71,15 +71,7 @@ ENTRY(ftrace_caller) cmpl $0, function_trace_stop jne ftrace_stub - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -89,14 +81,7 @@ ENTRY(ftrace_caller) ftrace_call: call ftrace_stub - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call @@ -130,15 +115,7 @@ ftrace_stub: retq trace: - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -146,14 +123,7 @@ trace: call *ftrace_trace_function - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME jmp ftrace_stub END(mcount) @@ -165,14 +135,7 @@ ENTRY(ftrace_graph_caller) cmpl $0, function_trace_stop jne ftrace_stub - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi @@ -180,14 +143,8 @@ ENTRY(ftrace_graph_caller) call prepare_ftrace_return - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME + retq END(ftrace_graph_caller) -- cgit v1.2.3 From cf558d25e5c9f70fa0279c9b7b8b4aed7cae9bd4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Dec 2008 15:06:01 +0100 Subject: AMD IOMMU: set cmd buffer pointers to zero manually Impact: set cmd buffer head and tail pointers to zero in case nobody else did Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c90a15eba5c..c6cc22815d3 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -427,6 +427,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, &entry, sizeof(entry)); + /* set head and tail to zero manually */ + writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); return cmd_buf; -- cgit v1.2.3 From 84df81759590ad16b0024cf46b3423cca76b2e07 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Dec 2008 16:36:44 +0100 Subject: AMD IOMMU: panic if completion wait loop fails Impact: prevents data corruption after a failed completion wait loop Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a7b6dec6fc3..0a60d60ed03 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -235,8 +235,9 @@ static int iommu_completion_wait(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) - printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); + if (unlikely(i == EXIT_LOOP_COUNT)) + panic("AMD IOMMU: Completion wait loop failed\n"); + out: spin_unlock_irqrestore(&iommu->lock, flags); -- cgit v1.2.3 From f5223763a664da16771211f9d293e18cb242b246 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:47:17 -0800 Subject: x86: signal: move ia32 func declarations into arch/x86/kernel/signal.c Impact: cleanup Move declarations of ia32_setup_rt_frame() and ia32_setup_frame() into arch/x86/kernel/signal.c. This is for future use of sigframe.h. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/sigframe.h | 5 ----- arch/x86/kernel/signal.c | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h index cc673aa55ce..6dd7e2b70a4 100644 --- a/arch/x86/kernel/sigframe.h +++ b/arch/x86/kernel/sigframe.h @@ -34,9 +34,4 @@ struct rt_sigframe { struct siginfo info; /* fp state follows here */ }; - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs *regs); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2725a294d73..848c2d64a28 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -671,6 +671,11 @@ static int signr_convert(int sig) #define is_ia32 0 #endif /* CONFIG_IA32_EMULATION */ +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs *regs); + #endif /* CONFIG_X86_32 */ static int -- cgit v1.2.3 From a5c56eb36f999ae0ecac278e51fd1cf8feb16c2f Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:49:55 -0800 Subject: x86: signal: rename sigframe and rt_sigframe on 32-bit Impact: cleanup, prepare to move sigframe.h On 32-bit, rename struct sigrame to struct sigframe_ia32, struct rt_sigframe to struct rt_sigframe_ia32 and several structures. And add helper macros to access the above data in arch/x86/kernel/signal.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/sigframe.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h index 6dd7e2b70a4..6718ed04b05 100644 --- a/arch/x86/kernel/sigframe.h +++ b/arch/x86/kernel/sigframe.h @@ -1,8 +1,14 @@ #ifdef CONFIG_X86_32 -struct sigframe { - char __user *pretcode; +#define sigframe_ia32 sigframe +#define rt_sigframe_ia32 rt_sigframe +#define sigcontext_ia32 sigcontext +#define _fpstate_ia32 _fpstate +#define ucontext_ia32 ucontext + +struct sigframe_ia32 { + u32 pretcode; int sig; - struct sigcontext sc; + struct sigcontext_ia32 sc; /* * fpstate is unused. fpstate is moved/allocated after * retcode[] below. This movement allows to have the FP state and the @@ -11,27 +17,27 @@ struct sigframe { * the offset of extramask[] in the sigframe and thus prevent any * legacy application accessing/modifying it. */ - struct _fpstate fpstate_unused; + struct _fpstate_ia32 fpstate_unused; unsigned long extramask[_NSIG_WORDS-1]; char retcode[8]; /* fp state follows here */ }; -struct rt_sigframe { - char __user *pretcode; +struct rt_sigframe_ia32 { + u32 pretcode; int sig; - struct siginfo __user *pinfo; - void __user *puc; + u32 pinfo; + u32 puc; struct siginfo info; - struct ucontext uc; + struct ucontext_ia32 uc; char retcode[8]; /* fp state follows here */ }; -#else +#else /* !CONFIG_X86_32 */ struct rt_sigframe { char __user *pretcode; struct ucontext uc; struct siginfo info; /* fp state follows here */ }; -#endif +#endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From 41af86fad3c40646b9748279e3862781e937a5d2 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:50:32 -0800 Subject: x86: signal: move sigframe.h to arch/x86/include/asm Impact: cleanup, move header file Move arch/x86/kernel/sigframe.h to arch/x86/include/asm/sigframe.h. It will be used in arch/x86/ia32/ia32_signal.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 43 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/sigframe.h | 43 ---------------------------------------- arch/x86/kernel/signal.c | 2 +- 4 files changed, 45 insertions(+), 45 deletions(-) create mode 100644 arch/x86/include/asm/sigframe.h delete mode 100644 arch/x86/kernel/sigframe.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h new file mode 100644 index 00000000000..6718ed04b05 --- /dev/null +++ b/arch/x86/include/asm/sigframe.h @@ -0,0 +1,43 @@ +#ifdef CONFIG_X86_32 +#define sigframe_ia32 sigframe +#define rt_sigframe_ia32 rt_sigframe +#define sigcontext_ia32 sigcontext +#define _fpstate_ia32 _fpstate +#define ucontext_ia32 ucontext + +struct sigframe_ia32 { + u32 pretcode; + int sig; + struct sigcontext_ia32 sc; + /* + * fpstate is unused. fpstate is moved/allocated after + * retcode[] below. This movement allows to have the FP state and the + * future state extensions (xsave) stay together. + * And at the same time retaining the unused fpstate, prevents changing + * the offset of extramask[] in the sigframe and thus prevent any + * legacy application accessing/modifying it. + */ + struct _fpstate_ia32 fpstate_unused; + unsigned long extramask[_NSIG_WORDS-1]; + char retcode[8]; + /* fp state follows here */ +}; + +struct rt_sigframe_ia32 { + u32 pretcode; + int sig; + u32 pinfo; + u32 puc; + struct siginfo info; + struct ucontext_ia32 uc; + char retcode[8]; + /* fp state follows here */ +}; +#else /* !CONFIG_X86_32 */ +struct rt_sigframe { + char __user *pretcode; + struct ucontext uc; + struct siginfo info; + /* fp state follows here */ +}; +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6649d09ad88..ee4df08feee 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -11,7 +11,7 @@ #include #include #include -#include "sigframe.h" +#include #include #include #include diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h deleted file mode 100644 index 6718ed04b05..00000000000 --- a/arch/x86/kernel/sigframe.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifdef CONFIG_X86_32 -#define sigframe_ia32 sigframe -#define rt_sigframe_ia32 rt_sigframe -#define sigcontext_ia32 sigcontext -#define _fpstate_ia32 _fpstate -#define ucontext_ia32 ucontext - -struct sigframe_ia32 { - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - /* - * fpstate is unused. fpstate is moved/allocated after - * retcode[] below. This movement allows to have the FP state and the - * future state extensions (xsave) stay together. - * And at the same time retaining the unused fpstate, prevents changing - * the offset of extramask[] in the sigframe and thus prevent any - * legacy application accessing/modifying it. - */ - struct _fpstate_ia32 fpstate_unused; - unsigned long extramask[_NSIG_WORDS-1]; - char retcode[8]; - /* fp state follows here */ -}; - -struct rt_sigframe_ia32 { - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - struct siginfo info; - struct ucontext_ia32 uc; - char retcode[8]; - /* fp state follows here */ -}; -#else /* !CONFIG_X86_32 */ -struct rt_sigframe { - char __user *pretcode; - struct ucontext uc; - struct siginfo info; - /* fp state follows here */ -}; -#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 848c2d64a28..89bb7668041 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -35,7 +35,7 @@ #include #include -#include "sigframe.h" +#include #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -- cgit v1.2.3 From c85c2ff877c9305f801f7d5b9e6382cb05a03d45 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:51:08 -0800 Subject: x86: signal: prepare to include from ia32_signal.c Impact: cleanup, prepare to use from ia32_signal.c Make struct sigframe_ia32 and rt_sigframe_ia32 visible to ia32_signal.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 6718ed04b05..491a0878c3a 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -4,7 +4,15 @@ #define sigcontext_ia32 sigcontext #define _fpstate_ia32 _fpstate #define ucontext_ia32 ucontext +#else /* !CONFIG_X86_32 */ + +#ifdef CONFIG_IA32_EMULATION +#include +#endif /* CONFIG_IA32_EMULATION */ + +#endif /* CONFIG_X86_32 */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) struct sigframe_ia32 { u32 pretcode; int sig; @@ -18,7 +26,11 @@ struct sigframe_ia32 { * legacy application accessing/modifying it. */ struct _fpstate_ia32 fpstate_unused; +#ifdef CONFIG_IA32_EMULATION + unsigned int extramask[_COMPAT_NSIG_WORDS-1]; +#else /* !CONFIG_IA32_EMULATION */ unsigned long extramask[_NSIG_WORDS-1]; +#endif /* CONFIG_IA32_EMULATION */ char retcode[8]; /* fp state follows here */ }; @@ -28,16 +40,22 @@ struct rt_sigframe_ia32 { int sig; u32 pinfo; u32 puc; +#ifdef CONFIG_IA32_EMULATION + compat_siginfo_t info; +#else /* !CONFIG_IA32_EMULATION */ struct siginfo info; +#endif /* CONFIG_IA32_EMULATION */ struct ucontext_ia32 uc; char retcode[8]; /* fp state follows here */ }; -#else /* !CONFIG_X86_32 */ +#endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */ + +#ifdef CONFIG_X86_64 struct rt_sigframe { char __user *pretcode; struct ucontext uc; struct siginfo info; /* fp state follows here */ }; -#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_X86_64 */ -- cgit v1.2.3 From 3b0d29ee1c73b6b90bfddd10f7b8e86632b6b694 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:51:46 -0800 Subject: x86: ia32_signal: rename struct sigframe and rt_sigframe Impact: cleanup, prepare to include sigframe.h Rename struct sigframe to struct sigframe_ia32 and struct rt_sigframe to struct rt_sigframe_ia32. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 9c99c429a20..334a4aa2e75 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -174,7 +174,7 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, * Do a signal return; undo the signal stack. */ -struct sigframe +struct sigframe_ia32 { u32 pretcode; int sig; @@ -185,7 +185,7 @@ struct sigframe /* fp state follows here */ }; -struct rt_sigframe +struct rt_sigframe_ia32 { u32 pretcode; int sig; @@ -271,7 +271,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, asmlinkage long sys32_sigreturn(struct pt_regs *regs) { - struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8); + struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; unsigned int ax; @@ -301,12 +301,12 @@ badframe: asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) { - struct rt_sigframe __user *frame; + struct rt_sigframe_ia32 __user *frame; sigset_t set; unsigned int ax; struct pt_regs tregs; - frame = (struct rt_sigframe __user *)(regs->sp - 4); + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -418,7 +418,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, int ia32_setup_frame(int sig, struct k_sigaction *ka, compat_sigset_t *set, struct pt_regs *regs) { - struct sigframe __user *frame; + struct sigframe_ia32 __user *frame; void __user *restorer; int err = 0; void __user *fpstate = NULL; @@ -497,7 +497,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, compat_sigset_t *set, struct pt_regs *regs) { - struct rt_sigframe __user *frame; + struct rt_sigframe_ia32 __user *frame; void __user *restorer; int err = 0; void __user *fpstate = NULL; -- cgit v1.2.3 From d98f9d84422c393103dc7569dc8444bac628f7ac Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:52:45 -0800 Subject: x86: ia32_signal: use sigframe.h Impact: cleanup Use arch/x86/include/asm/sigframe.h instead of defining redundant structures. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 334a4aa2e75..3b3878a63bc 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -32,6 +32,8 @@ #include #include +#include + #define DEBUG_SIG 0 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) @@ -173,30 +175,6 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, /* * Do a signal return; undo the signal stack. */ - -struct sigframe_ia32 -{ - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */ - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; - char retcode[8]; - /* fp state follows here */ -}; - -struct rt_sigframe_ia32 -{ - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - compat_siginfo_t info; - struct ucontext_ia32 uc; - char retcode[8]; - /* fp state follows here */ -}; - #define COPY(x) { \ err |= __get_user(regs->x, &sc->x); \ } -- cgit v1.2.3 From 55aab5f49e384a361668d112eefdb33e90779af9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 17 Dec 2008 12:52:34 -0700 Subject: x86 gart: don't complain if no AMD GART found Impact: remove annoying bootup printk It's perfectly normal for no AMD GART to be present, e.g., if you have Intel CPUs. None of the other iommu_init() functions makes noise when it finds nothing. Signed-off-by: Bjorn Helgaas Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ba7ad83e20a..a35eaa379ff 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -745,10 +745,8 @@ void __init gart_iommu_init(void) unsigned long scratch; long i; - if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { - printk(KERN_INFO "PCI-GART: No AMD GART found.\n"); + if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) return; - } #ifndef CONFIG_AGP_AMD64 no_agp = 1; -- cgit v1.2.3 From 57a37505d19f4dfeee26f0fd7ea38ed6f1d10cbe Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 17 Dec 2008 23:17:21 +0530 Subject: x86: time_64.c timer_interrupt() should be static Impact: cleanup, reduce kernel size a bit Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index cb19d650c21..083a4a5bb00 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -49,7 +49,7 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); -irqreturn_t timer_interrupt(int irq, void *dev_id) +static irqreturn_t timer_interrupt(int irq, void *dev_id) { add_pda(irq0_irqs, 1); -- cgit v1.2.3 From 7c9c160c54fc545efc23881344593868e5f717bd Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 17 Dec 2008 23:18:52 +0530 Subject: x86: tls.c declare sys_set_thread_area and sys_get_thread_area before they get used Impact: cleanup In asm/syscalls.h move out sys_set_thread_area() and sys_get_thread_area() as they are common for both 32 and 64 bit. Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/syscalls.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 75d4a6afc36..c0b0bda754e 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -22,6 +22,10 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +/* kernel/tls.c */ +asmlinkage int sys_set_thread_area(struct user_desc __user *); +asmlinkage int sys_get_thread_area(struct user_desc __user *); + /* X86_32 only */ #ifdef CONFIG_X86_32 /* kernel/process_32.c */ @@ -54,10 +58,6 @@ asmlinkage int sys_uname(struct old_utsname __user *); struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); -/* kernel/tls.c */ -asmlinkage int sys_set_thread_area(struct user_desc __user *); -asmlinkage int sys_get_thread_area(struct user_desc __user *); - /* kernel/vm86_32.c */ asmlinkage int sys_vm86old(struct pt_regs); asmlinkage int sys_vm86(struct pt_regs); -- cgit v1.2.3 From f269b07e862c395d6981ab2c05d6bc34b0249e90 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 18 Dec 2008 18:35:06 +0100 Subject: x86: revert CONFIG_RELOCATABLE=y defconfig change This commit: commit 5cb04df8d3f03e37a19f2502591a84156be71772 Author: Ingo Molnar Date: Sun May 4 19:49:04 2008 +0200 x86: defconfig updates changed CONFIG_RELOCATABLE from n to y, which may lead to a mismatch between the vmlinux debug information and the runtime location of the kernel, even when the bootloader does not relocate the kernel. Revert the specific change. Works for me with GRUB and qemu. Reference: http://lkml.org/lkml/2008/11/25/243 Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 2 +- arch/x86/configs/x86_64_defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 71fc39c7078..b30a08ed8eb 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -298,7 +298,7 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y +# CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_ALIGN=0x200000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index b38bbabc170..0e7dbc0a3e4 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -298,7 +298,7 @@ CONFIG_SCHED_HRTICK=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y +# CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_ALIGN=0x200000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set -- cgit v1.2.3 From 5c2628e8b4f670d0954053444289e2b018be957a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 09:18:35 -0800 Subject: x86: sigframe.h: add guard macro Impact: cleanup Add missing guard macro _ASM_X86_SIGFRAME_H. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 491a0878c3a..3bd0f427600 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_SIGFRAME_H +#define _ASM_X86_SIGFRAME_H + #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe @@ -59,3 +62,5 @@ struct rt_sigframe { /* fp state follows here */ }; #endif /* CONFIG_X86_64 */ + +#endif /* _ASM_X86_SIGFRAME_H */ -- cgit v1.2.3 From f0bc2202e0373eb8e9b1ddbec930e2e681357db8 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 17 Dec 2008 23:20:05 +0530 Subject: x86: process.c declare c1e_remove_cpu before they get used Impact: cleanup, avoid sparse warning Included asm/idle.h for c1e_remove_cpu() declaration. Fixes this sparse warning: CHECK arch/x86/kernel/process.c arch/x86/kernel/process.c:284:6: warning: symbol 'c1e_remove_cpu' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d..b06100f1d61 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 5899329b19100c0b82dc78e9b21ed8b920c9ffb3 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:30 -0800 Subject: x86: PAT: implement track/untrack of pfnmap regions for x86 - v3 Impact: New mm functionality. Hookup remap_pfn_range and vm_insert_pfn and corresponding copy and free routines with reserve and free tracking. reserve and free here only takes care of non RAM region mapping. For RAM region, driver should use set_memory_[uc|wc|wb] to set the cache type and then setup the mapping for user pte. We can bypass below reserve/free in that case. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 10 ++ arch/x86/mm/pat.c | 236 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 246 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c012f3b1167..7dcd94c2904 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -219,6 +219,11 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } +static inline u64 pte_pa(pte_t pte) +{ + return pte_val(pte) & PTE_PFN_MASK; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -328,6 +333,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define track_pfn_vma_new track_pfn_vma_new +#define track_pfn_vma_copy track_pfn_vma_copy +#define untrack_pfn_vma untrack_pfn_vma + #ifndef __ASSEMBLY__ #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index eb1bf000d12..1069ffecf77 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -596,6 +596,242 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. + */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot) +{ + int is_ram = 0; + int id_sz, ret; + unsigned long flags; + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + + is_ram = pagerange_is_ram(paddr, paddr + size); + + if (is_ram != 0) { + /* + * For mapping RAM pages, drivers need to call + * set_memory_[uc|wc|wb] directly, for reserve and free, before + * setting up the PTE. + */ + WARN_ON_ONCE(1); + return 0; + } + + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + if (ret) + return ret; + + if (flags != want_flags) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR + "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + return -EINVAL; + } + + /* Need to keep identity mapping in sync */ + if (paddr >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < paddr + size) ? + __pa(high_memory) - paddr : + size; + + if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR + "%s:%d reserve_pfn_range ioremap_change_attr failed %s " + "for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size)); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + free_memtype(paddr, paddr + size); +} + +/* + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with single reserve_pfn_range call. + * Otherwise, we reserve the entire vma range, my ging through the PTEs page + * by page to get physical address and protection. + */ +int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + int retval = 0; + unsigned long i, j; + u64 paddr; + pgprot_t prot; + pte_t pte; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* + * reserve the whole chunk starting from vm_pgoff, + * But, we have to get the protection from pte. + */ + if (follow_pfnmap_pte(vma, vma_start, &pte)) { + WARN_ON_ONCE(1); + return -1; + } + prot = pte_pgprot(pte); + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot); + } + + /* reserve entire vma page by page, using pfn and prot from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + continue; + + paddr = pte_pa(pte); + prot = pte_pgprot(pte); + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + if (follow_pfnmap_pte(vma, vma_start + j, &pte)) + continue; + + paddr = pte_pa(pte); + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + * + * prot is passed in as a parameter for the new mapping. If the vma has a + * linear pfn mapping for the entire range reserve the entire vma range with + * single reserve_pfn_range call. + * Otherwise, we look t the pfn and size and reserve only the specified range + * page by page. + * + * Note that this function can be called with caller trying to map only a + * subrange/page inside the vma. + */ +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size) +{ + int retval = 0; + unsigned long i, j; + u64 base_paddr; + u64 paddr; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* reserve the whole chunk starting from vm_pgoff */ + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot); + } + + /* reserve page by page using pfn and size */ + base_paddr = (u64)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = base_paddr + i; + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + paddr = base_paddr + j; + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). + */ +void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) +{ + unsigned long i; + u64 paddr; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return; + + if (is_linear_pfn_mapping(vma)) { + /* free the whole chunk starting from vm_pgoff */ + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + free_pfn_range(paddr, vma_size); + return; + } + + if (size != 0 && size != vma_size) { + /* free page by page, using pfn and size */ + paddr = (u64)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = paddr + i; + free_pfn_range(paddr, PAGE_SIZE); + } + } else { + /* free entire vma, page by page, using the pfn from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + pte_t pte; + + if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + continue; + + paddr = pte_pa(pte); + free_pfn_range(paddr, PAGE_SIZE); + } + } +} + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ -- cgit v1.2.3 From 8a7b12f70fb135a1b1d865687de3edcdc780f6d1 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:31 -0800 Subject: x86: PAT: change pgprot_noncached to uc_minus instead of strong uc - v3 Impact: mm behavior change. Make pgprot_noncached uc_minus instead of strong UC. This will make pgprot_noncached to be in line with ioremap_nocache() and all the other APIs that map page uc_minus on uc request. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 8 ++++++++ arch/x86/include/asm/pgtable_32.h | 9 --------- arch/x86/include/asm/pgtable_64.h | 6 ------ 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7dcd94c2904..6968d4f6be3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -158,6 +158,14 @@ #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif +/* + * Macro to mark a page protection value as UC- + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ + : (prot)) + #ifndef __ASSEMBLY__ /* diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index f9d5889b336..72b020deb46 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -100,15 +100,6 @@ extern unsigned long pg0[]; # include #endif -/* - * Macro to mark a page protection value as "uncacheable". - * On processors which do not support it, this is a no-op. - */ -#define pgprot_noncached(prot) \ - ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \ - : (prot)) - /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 545a0e042bb..4798a4033e3 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -176,12 +176,6 @@ static inline int pmd_bad(pmd_t pmd) #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ -/* - * Macro to mark a page protection value as "uncacheable". - */ -#define pgprot_noncached(prot) \ - (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT)) - /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. -- cgit v1.2.3 From 2520bd3123c00272f818a176c92d03c7d0a113d6 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:32 -0800 Subject: x86: PAT: add pgprot_writecombine() interface for drivers - v3 Impact: New mm functionality. Add pgprot_writecombine. pgprot_writecombine will be aliased to pgprot_noncached when not supported by the architecture. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 3 +++ arch/x86/mm/pat.c | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6968d4f6be3..579f8ceee94 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -168,6 +168,9 @@ #ifndef __ASSEMBLY__ +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1069ffecf77..d5254bae84f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -832,6 +832,14 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, } } +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + if (pat_enabled) + return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + else + return pgprot_noncached(prot); +} + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ -- cgit v1.2.3 From d1769d5475176124af04fa69848b022c98c4bc37 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 19 Dec 2008 00:03:56 +0530 Subject: x86: traps.c declare functions before they get used Impact: cleanup In asm/traps.h :- do_double_fault : added under X86_64 sync_regs : added under X86_64 math_error : moved out from X86_32 as it is common for both 32 and 64 bit math_emulate : moved from X86_32 as it is common for both 32 and 64 bit smp_thermal_interrupt : added under X86_64 mce_threshold_interrupt : added under X86_64 Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/traps.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 45dee286e45..2ee0a3bceed 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -46,6 +46,10 @@ dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long); dotraplinkage void do_invalid_TSS(struct pt_regs *, long); dotraplinkage void do_segment_not_present(struct pt_regs *, long); dotraplinkage void do_stack_segment(struct pt_regs *, long); +#ifdef CONFIG_X86_64 +dotraplinkage void do_double_fault(struct pt_regs *, long); +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *); +#endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); @@ -72,10 +76,13 @@ static inline int get_si_code(unsigned long condition) extern int panic_on_unrecovered_nmi; extern int kstack_depth_to_print; -#ifdef CONFIG_X86_32 void math_error(void __user *); -unsigned long patch_espfix_desc(unsigned long, unsigned long); asmlinkage void math_emulate(long); +#ifdef CONFIG_X86_32 +unsigned long patch_espfix_desc(unsigned long, unsigned long); +#else +asmlinkage void smp_thermal_interrupt(void); +asmlinkage void mce_threshold_interrupt(void); #endif #endif /* _ASM_X86_TRAPS_H */ -- cgit v1.2.3 From b2fa739c06931d167b6d2aa7b514ab7f30d04dc0 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 14:43:34 -0800 Subject: x86: sigframe.h: include headers for dependency Impact: cleanup Include following headers for dependency. asm/sigcontext.h asm/siginfo.h asm/ucontext.h Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/sigframe.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 3bd0f427600..4e0fe26d27d 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -1,6 +1,10 @@ #ifndef _ASM_X86_SIGFRAME_H #define _ASM_X86_SIGFRAME_H +#include +#include +#include + #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe -- cgit v1.2.3 From 8869a2e5d3a66d5b63b948052d60cd13ede8b735 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 14:46:52 -0800 Subject: x86: asm-offset_64: use rt_sigframe_ia32 Impact: cleanup Use rt_sigframe_ia32 instead of rt_sigframe32. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/asm-offsets_64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 7fcf63d22f8..1d41d3f1edb 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -20,6 +20,8 @@ #include +#include + #define __NO_STUBS 1 #undef __SYSCALL #undef _ASM_X86_UNISTD_64_H @@ -87,7 +89,7 @@ int main(void) BLANK(); #undef ENTRY DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe32, uc.uc_mcontext)); + offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); BLANK(); #endif DEFINE(pbe_address, offsetof(struct pbe, address)); -- cgit v1.2.3 From 9f221495997d180df51ce4d8296669445dd3e7b3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 14:47:37 -0800 Subject: x86: ia32.h: remove unused struct sigfram32 and rt_sigframe32 Impact: cleanup Remove struct sigfram32 and rt_sigframe32 because there is no user. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/ia32.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 97989c0e534..50ca486fd88 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -129,24 +129,6 @@ typedef struct compat_siginfo { } _sifields; } compat_siginfo_t; -struct sigframe32 { - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - struct _fpstate_ia32 fpstate; - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; -}; - -struct rt_sigframe32 { - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - compat_siginfo_t info; - struct ucontext_ia32 uc; - struct _fpstate_ia32 fpstate; -}; - struct ustat32 { __u32 f_tfree; compat_ino_t f_tinode; -- cgit v1.2.3 From f34a10bd9f8cc95ebdc69a079db195636b2e22e0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 01:36:14 +0100 Subject: x86: fix warning in arch/x86/kernel/microcode_amd.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: arch/x86/kernel/microcode_amd.c: In function ‘apply_microcode_amd’: arch/x86/kernel/microcode_amd.c:163: warning: cast from pointer to integer of different size arch/x86/kernel/microcode_amd.c:163: warning: cast from pointer to integer of different size triggers because we want to pass the address to the microcode MSR, which is 64-bit even on 32-bit. Cast it explicitly to express this. Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 24c256f4e50..c25fdb38229 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -160,7 +160,7 @@ static void apply_microcode_amd(int cpu) return; spin_lock_irqsave(µcode_update_lock, flags); - wrmsrl(MSR_AMD64_PATCH_LOADER, &mc_amd->hdr.data_code); + wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); spin_unlock_irqrestore(µcode_update_lock, flags); @@ -372,3 +372,4 @@ struct microcode_ops * __init init_amd_microcode(void) { return µcode_amd_ops; } + -- cgit v1.2.3 From 345077cd98ff5532b2d1158013c3fec7b1ae85ec Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 18 Dec 2008 18:09:21 -0800 Subject: x86: fix intel x86_64 llc_shared_map/cpu_llc_id anomolies Impact: fix wrong cache sharing detection on platforms supporting > 8 bit apicid's In the presence of extended topology eumeration leaf 0xb provided by cpuid, 32bit extended initial_apicid in cpuinfo_x86 struct will be updated by detect_extended_topology(). At this instance, we should also reinit the apicid (which could also potentially be extended to 32bit). With out this there will potentially be duplicate apicid's populated in the per cpu's cpuinfo_x86 struct, resulting in wrong cache sharing topology etc detected by init_intel_cacheinfo(). Reported-by: Dimitri Sivanich Signed-off-by: Suresh Siddha Acked-by: Dimitri Sivanich Signed-off-by: Ingo Molnar Cc: --- arch/x86/kernel/cpu/addon_cpuid_features.c | 8 ++++++++ arch/x86/kernel/cpu/intel.c | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index ef8f831af82..2cf23634b6d 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -120,9 +120,17 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) & core_select_mask; c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = phys_pkg_id(c->initial_apicid, 0); #else c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; c->phys_proc_id = phys_pkg_id(core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = phys_pkg_id(0); #endif c->x86_max_cores = (core_level_siblings / smp_num_siblings); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index caec59437a2..b21c37c060a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -252,6 +252,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) intel_workarounds(c); + /* + * Detect the extended topology information if available. This + * will reinitialise the initial_apicid which will be used + * in init_intel_cacheinfo() + */ + detect_extended_topology(c); + l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -323,7 +330,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) #endif - detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* * let's use the legacy cpuid vector 0x1 and 0x4 for topology -- cgit v1.2.3 From 9bb482476c6c9d1ae033306440c51ceac93ea80c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:30:08 +0000 Subject: allow stripping of generated symbols under CONFIG_KALLSYMS_ALL Building upon parts of the module stripping patch, this patch introduces similar stripping for vmlinux when CONFIG_KALLSYMS_ALL=y. Using CONFIG_KALLSYMS_STRIP_GENERATED reduces the overhead of CONFIG_KALLSYMS_ALL from 245k/310k to 65k/80k for the (i386/x86-64) kernels I tested with. The patch also does away with the need to special case the kallsyms- internal symbols by making them available even in the first linking stage. While it is a generated file, the patch includes the changes to scripts/genksyms/keywords.c_shipped, as I'm unsure what the procedure here is. Signed-off-by: Jan Beulich Signed-off-by: Sam Ravnborg --- arch/x86/scripts/strip-symbols | 1 + 1 file changed, 1 insertion(+) create mode 100644 arch/x86/scripts/strip-symbols (limited to 'arch/x86') diff --git a/arch/x86/scripts/strip-symbols b/arch/x86/scripts/strip-symbols new file mode 100644 index 00000000000..a2f1ccb827c --- /dev/null +++ b/arch/x86/scripts/strip-symbols @@ -0,0 +1 @@ +__cpu_vendor_dev_X86_VENDOR_* -- cgit v1.2.3 From 34945ede31071ac7d72270cc6c1893323f392b3f Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 19 Dec 2008 22:33:52 +0530 Subject: x86: common.c boot_cpu_stack and boot_exception_stacks should be static Impact: cleanup, avoid sparse warnings, reduce kernel size a bit Fixes these sparse warnings: arch/x86/kernel/cpu/common.c:869:6: warning: symbol 'boot_cpu_stack' was not declared. Should it be static? arch/x86/kernel/cpu/common.c:910:6: warning: symbol 'boot_exception_stacks' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0217a..aba49c782fd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -862,7 +862,7 @@ EXPORT_SYMBOL(_cpu_pda); struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; +static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; void __cpuinit pda_init(int cpu) { @@ -903,8 +903,8 @@ void __cpuinit pda_init(int cpu) } } -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; +static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] __page_aligned_bss; extern asmlinkage void ignore_sysret(void); -- cgit v1.2.3 From 8403295e0fa460f6240e2d781e25dc29189f33c7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 19 Dec 2008 14:25:50 -0800 Subject: x86: ia32_signal: remove unnecessary declaration Impact: cleanup No need to declare do_signal(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 3b3878a63bc..09513f8a289 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -43,7 +43,6 @@ X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ X86_EFLAGS_CF) -asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) -- cgit v1.2.3 From 982d789ab76c8a11426852fec2fdf2f412e21c0c Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:28 -0800 Subject: x86: PAT: remove follow_pfnmap_pte in favor of follow_phys Impact: Cleanup - removes a new function in favor of a recently modified older one. Replace follow_pfnmap_pte in pat code with follow_phys. follow_phys lso returns protection eliminating the need of pte_pgprot call. Using follow_phys also eliminates the need for pte_pa. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 5 ----- arch/x86/mm/pat.c | 30 +++++++++++------------------- 2 files changed, 11 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 579f8ceee94..2aa792bbd7e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -230,11 +230,6 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } -static inline u64 pte_pa(pte_t pte) -{ - return pte_val(pte) & PTE_PFN_MASK; -} - #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d5254bae84f..541bcc944a5 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -685,8 +685,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int retval = 0; unsigned long i, j; u64 paddr; - pgprot_t prot; - pte_t pte; + unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; @@ -696,26 +695,22 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) if (is_linear_pfn_mapping(vma)) { /* - * reserve the whole chunk starting from vm_pgoff, - * But, we have to get the protection from pte. + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. */ - if (follow_pfnmap_pte(vma, vma_start, &pte)) { + if (follow_phys(vma, vma_start, 0, &prot, &paddr)) { WARN_ON_ONCE(1); - return -1; + return -EINVAL; } - prot = pte_pgprot(pte); - paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot); + return reserve_pfn_range(paddr, vma_size, __pgprot(prot)); } /* reserve entire vma page by page, using pfn and prot from pte */ for (i = 0; i < vma_size; i += PAGE_SIZE) { - if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); - prot = pte_pgprot(pte); - retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + retval = reserve_pfn_range(paddr, PAGE_SIZE, __pgprot(prot)); if (retval) goto cleanup_ret; } @@ -724,10 +719,9 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) cleanup_ret: /* Reserve error: Cleanup partial reservation and return error */ for (j = 0; j < i; j += PAGE_SIZE) { - if (follow_pfnmap_pte(vma, vma_start + j, &pte)) + if (follow_phys(vma, vma_start + j, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); free_pfn_range(paddr, PAGE_SIZE); } @@ -797,6 +791,7 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, { unsigned long i; u64 paddr; + unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; @@ -821,12 +816,9 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, } else { /* free entire vma, page by page, using the pfn from pte */ for (i = 0; i < vma_size; i += PAGE_SIZE) { - pte_t pte; - - if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); free_pfn_range(paddr, PAGE_SIZE); } } -- cgit v1.2.3 From 34801ba9bf0381fcf0e2b08179d2c07f2c6ede74 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:29 -0800 Subject: x86: PAT: move track untrack pfnmap stubs to asm-generic Impact: Cleanup and branch hints only. Move the track and untrack pfn stub routines from memory.c to asm-generic. Also add unlikely to pfnmap related calls in fork and exit path. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2aa792bbd7e..875192bf72c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -339,12 +339,10 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) +#ifndef __ASSEMBLY__ /* Indicate that x86 has its own track and untrack pfn vma functions */ -#define track_pfn_vma_new track_pfn_vma_new -#define track_pfn_vma_copy track_pfn_vma_copy -#define untrack_pfn_vma untrack_pfn_vma +#define __HAVE_PFNMAP_TRACKING -#ifndef __ASSEMBLY__ #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, -- cgit v1.2.3 From bf53de907dfdaac178c92d774aae7370d7b97d20 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:10:24 +0100 Subject: x86, bts: add fork and exit handling Impact: introduce new ptrace facility Add arch_ptrace_untrace() function that is called when the tracer detaches (either voluntarily or when the tracing task dies); ptrace_disable() is only called on a voluntary detach. Add ptrace_fork() and arch_ptrace_fork(). They are called when a traced task is forked. Clear DS and BTS related fields on fork. Release DS resources and reclaim memory in ptrace_untrace(). This releases resources already when the tracing task dies. We used to do that when the traced task dies. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 9 ++++++++ arch/x86/include/asm/ptrace.h | 7 ++++++ arch/x86/kernel/ds.c | 11 ++++++++++ arch/x86/kernel/process_32.c | 20 ++++++++--------- arch/x86/kernel/process_64.c | 20 ++++++++--------- arch/x86/kernel/ptrace.c | 50 ++++++++++++++++++++++++++++++++++--------- 6 files changed, 85 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index ee0ea3a96c1..a8f672ba100 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -252,12 +252,21 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); */ extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); +/* + * Task clone/init and cleanup work + */ +extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); +extern void ds_exit_thread(struct task_struct *tsk); + #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} static inline void ds_switch_to(struct task_struct *prev, struct task_struct *next) {} +static inline void ds_copy_thread(struct task_struct *tsk, + struct task_struct *father) {} +static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index fbf74421591..6d34d954c22 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -235,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); +extern void x86_ptrace_untrace(struct task_struct *); +extern void x86_ptrace_fork(struct task_struct *child, + unsigned long clone_flags); + +#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) +#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) + #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 98d271e60e0..da91701a234 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1017,3 +1017,14 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next) update_debugctlmsr(next->thread.debugctlmsr); } + +void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +{ + clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); + tsk->thread.ds_ctx = NULL; +} + +void ds_exit_thread(struct task_struct *tsk) +{ + WARN_ON(tsk->thread.ds_ctx); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 605eff9a8ac..3ba155d2488 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -60,6 +60,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -251,17 +252,8 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. */ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -343,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + + ds_copy_thread(p, current); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + return err; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1cfd2a4bf85..416fb9282f4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,6 +53,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -236,17 +237,8 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. */ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -376,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (err) goto out; } + + ds_copy_thread(p, me); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45e9855da2d..6ad2bb60765 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -769,8 +769,47 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } + +static void ptrace_bts_fork(struct task_struct *tsk) +{ + tsk->bts = NULL; + tsk->bts_buffer = NULL; + tsk->bts_size = 0; + tsk->thread.bts_ovfl_signal = 0; +} + +static void ptrace_bts_untrace(struct task_struct *child) +{ + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + kfree(child->bts_buffer); + child->bts_buffer = NULL; + child->bts_size = 0; + } +} + +static void ptrace_bts_detach(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} +#else +static inline void ptrace_bts_fork(struct task_struct *tsk) {} +static inline void ptrace_bts_detach(struct task_struct *child) {} +static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ +void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + ptrace_bts_fork(child); +} + +void x86_ptrace_untrace(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} + /* * Called by kernel/ptrace.c when detaching.. * @@ -782,16 +821,7 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif -#ifdef CONFIG_X86_PTRACE_BTS - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; - } -#endif /* CONFIG_X86_PTRACE_BTS */ + ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -- cgit v1.2.3 From c5dee6177f4bd2095aab7d9be9f6ebdddd6deee9 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:17:02 +0100 Subject: x86, bts: memory accounting Impact: move the BTS buffer accounting to the mlock bucket Add alloc_locked_buffer() and free_locked_buffer() functions to mm/mlock.c to kalloc a buffer and account the locked memory to current. Account the memory for the BTS buffer to the tracer. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 6ad2bb60765..0a5df5f82fb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -650,6 +650,24 @@ static int ptrace_bts_drain(struct task_struct *child, return drained; } +static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) +{ + child->bts_buffer = alloc_locked_buffer(size); + if (!child->bts_buffer) + return -ENOMEM; + + child->bts_size = size; + + return 0; +} + +static void ptrace_bts_free_buffer(struct task_struct *child) +{ + free_locked_buffer(child->bts_buffer, child->bts_size); + child->bts_buffer = NULL; + child->bts_size = 0; +} + static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) @@ -679,14 +697,13 @@ static int ptrace_bts_config(struct task_struct *child, if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != child->bts_size)) { - kfree(child->bts_buffer); + int error; - child->bts_size = cfg.size; - child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); - if (!child->bts_buffer) { - child->bts_size = 0; - return -ENOMEM; - } + ptrace_bts_free_buffer(child); + + error = ptrace_bts_allocate_buffer(child, cfg.size); + if (error < 0) + return error; } if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -701,10 +718,8 @@ static int ptrace_bts_config(struct task_struct *child, if (IS_ERR(child->bts)) { int error = PTR_ERR(child->bts); - kfree(child->bts_buffer); + ptrace_bts_free_buffer(child); child->bts = NULL; - child->bts_buffer = NULL; - child->bts_size = 0; return error; } @@ -784,6 +799,9 @@ static void ptrace_bts_untrace(struct task_struct *child) ds_release_bts(child->bts); child->bts = NULL; + /* We cannot update total_vm and locked_vm since + child's mm is already gone. But we can reclaim the + memory. */ kfree(child->bts_buffer); child->bts_buffer = NULL; child->bts_size = 0; @@ -792,7 +810,12 @@ static void ptrace_bts_untrace(struct task_struct *child) static void ptrace_bts_detach(struct task_struct *child) { - ptrace_bts_untrace(child); + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + ptrace_bts_free_buffer(child); + } } #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} -- cgit v1.2.3 From 280a9ca5d0663b185ddc4443052076c29652a328 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Sat, 20 Dec 2008 00:15:24 +0100 Subject: x86: fix resume (S2R) broken by Intel microcode module, on A110L Impact: fix deadlock This is in response to the following bug report: Bug-Entry : http://bugzilla.kernel.org/show_bug.cgi?id=12100 Subject : resume (S2R) broken by Intel microcode module, on A110L Submitter : Andreas Mohr Date : 2008-11-25 08:48 (19 days old) Handled-By : Dmitry Adamushko [ The deadlock scenario has been discovered by Andreas Mohr ] I think I might have a logical explanation why the system: (http://bugzilla.kernel.org/show_bug.cgi?id=12100) might hang upon resuming, OTOH it should have likely hanged each and every time. (1) possible deadlock in microcode_resume_cpu() if either 'if' section is taken; (2) now, I don't see it in spec. and can't experimentally verify it (newer ucodes don't seem to be available for my Core2duo)... but logically-wise, I'd think that when read upon resuming, the 'microcode revision' (MSR 0x8B) should be back to its original one (we need to reload ucode anyway so it doesn't seem logical if a cpu doesn't drop the version)... if so, the comparison with memcmp() for the full 'struct cpu_signature' is wrong... and that's how one of the aforementioned 'if' sections might have been triggered - leading to a deadlock. Obviously, in my tests I simulated loading/resuming with the ucode of the same version (just to see that the file is loaded/re-loaded upon resuming) so this issue has never popped up. I'd appreciate if someone with an appropriate system might give a try to the 2nd patch (titled "fix a comparison && deadlock..."). In any case, the deadlock situation is a must-have fix. Reported-by: Andreas Mohr Signed-off-by: Dmitry Adamushko Tested-by: Andreas Mohr Signed-off-by: Ingo Molnar Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 19 ++++++++++++++----- arch/x86/kernel/microcode_intel.c | 6 ++++++ 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 82fb2809ce3..c4b5b24e021 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -272,13 +272,18 @@ static struct attribute_group mc_attr_group = { .name = "microcode", }; -static void microcode_fini_cpu(int cpu) +static void __microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - mutex_lock(µcode_mutex); microcode_ops->microcode_fini_cpu(cpu); uci->valid = 0; +} + +static void microcode_fini_cpu(int cpu) +{ + mutex_lock(µcode_mutex); + __microcode_fini_cpu(cpu); mutex_unlock(µcode_mutex); } @@ -306,12 +311,16 @@ static int microcode_resume_cpu(int cpu) * to this cpu (a bit of paranoia): */ if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - microcode_fini_cpu(cpu); + __microcode_fini_cpu(cpu); + printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", + cpu); return -1; } - if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { - microcode_fini_cpu(cpu); + if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { + __microcode_fini_cpu(cpu); + printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", + cpu); /* Should we look for a new ucode here? */ return 1; } diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 622dc4a2178..a8e62792d17 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -155,6 +155,7 @@ static DEFINE_SPINLOCK(microcode_update_lock); static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -174,11 +175,16 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } + /* serialize access to the physical write to MSR 0x79 */ + spin_lock_irqsave(µcode_update_lock, flags); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); + spin_unlock_irqrestore(µcode_update_lock, flags); + pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", csig->sig, csig->pf, csig->rev); -- cgit v1.2.3 From adf77bac052bb5bf0722b2ce2af9fefc5b2d2a71 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 22 Dec 2008 17:56:05 -0800 Subject: x86: prioritize the FPU traps for the error code In the case of multiple FPU errors, prioritize the error codes, instead of returning __SI_FAULT, which ends up pushing a 0 as the error code to userspace, a POSIX violation. For i386, we will simply return if there are no errors at all; for x86-64 this is probably a "can't happen" (and the code should be unified), but for this patch, return __SI_FAULT|SI_KERNEL if this ever happens. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/traps.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 04d242ab016..c320c29255c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -664,7 +664,7 @@ void math_error(void __user *ip) { struct task_struct *task; siginfo_t info; - unsigned short cwd, swd; + unsigned short cwd, swd, err; /* * Save the info for the exception handler and clear the error. @@ -675,7 +675,6 @@ void math_error(void __user *ip) task->thread.error_code = 0; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_code = __SI_FAULT; info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@ -689,34 +688,31 @@ void math_error(void __user *ip) */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ -#ifdef CONFIG_X86_32 + + err = swd & ~cwd & 0x3f; + +#if CONFIG_X86_32 + if (!err) return; #endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ + + if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ + } else if (err & 0x004) { /* Divide by Zero */ info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ + } else if (err & 0x008) { /* Overflow */ info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ + } else if (err & 0x012) { /* Denormal, Underflow */ + info.si_code = FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; - break; + } else { + info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */ } force_sig_info(SIGFPE, &info, task); } -- cgit v1.2.3 From c1c15b65ec30275575dac9322aae607075769fbc Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 23 Dec 2008 10:10:40 -0800 Subject: x86: PAT: fix address types in track_pfn_vma_new() Impact: cleanup, fix warning This warning: arch/x86/mm/pat.c: In function track_pfn_vma_copy: arch/x86/mm/pat.c:701: warning: passing argument 5 of follow_phys from incompatible pointer type Triggers because physical addresses are resource_size_t, not u64. This really matters when calling an interface like follow_phys() which takes a pointer to a physical address -- although on x86, being littleendian, it would generally work anyway as long as the memory region wasn't completely uninitialized. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 541bcc944a5..85cbd3cd372 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -684,7 +684,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) { int retval = 0; unsigned long i, j; - u64 paddr; + resource_size_t paddr; unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; @@ -746,8 +746,8 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, { int retval = 0; unsigned long i, j; - u64 base_paddr; - u64 paddr; + resource_size_t base_paddr; + resource_size_t paddr; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; @@ -757,12 +757,12 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ - paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return reserve_pfn_range(paddr, vma_size, prot); } /* reserve page by page using pfn and size */ - base_paddr = (u64)pfn << PAGE_SHIFT; + base_paddr = (resource_size_t)pfn << PAGE_SHIFT; for (i = 0; i < size; i += PAGE_SIZE) { paddr = base_paddr + i; retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); @@ -790,7 +790,7 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { unsigned long i; - u64 paddr; + resource_size_t paddr; unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; @@ -801,14 +801,14 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, if (is_linear_pfn_mapping(vma)) { /* free the whole chunk starting from vm_pgoff */ - paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; free_pfn_range(paddr, vma_size); return; } if (size != 0 && size != vma_size) { /* free page by page, using pfn and size */ - paddr = (u64)pfn << PAGE_SHIFT; + paddr = (resource_size_t)pfn << PAGE_SHIFT; for (i = 0; i < size; i += PAGE_SIZE) { paddr = paddr + i; free_pfn_range(paddr, PAGE_SIZE); -- cgit v1.2.3 From 40f15ad8aadff5ebb621b17a6f303ad2cd3f847d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 24 Dec 2008 10:49:51 +0100 Subject: x86: disable X86_PTRACE_BTS there's a new ptrace arch level feature in .28: config X86_PTRACE_BTS bool "Branch Trace Store" it has broken fork() handling: the old DS area gets copied over into a new task without clearing it. Fixes exist but they came too late: c5dee61: x86, bts: memory accounting bf53de9: x86, bts: add fork and exit handling and are queued up for v2.6.29. This shows that the facility is still not tested well enough to release into a stable kernel - disable it for now and reactivate in .29. In .29 the hardware-branch-tracer will use the DS/BTS facilities too - hopefully resulting in better code. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b815664fe37..8e99073b9e0 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -520,6 +520,7 @@ config X86_PTRACE_BTS bool "Branch Trace Store" default y depends on X86_DEBUGCTLMSR + depends on BROKEN help This adds a ptrace interface to the hardware's branch trace store. -- cgit v1.2.3 From 67be403d897f818b1a5ecc201967b0ee6a0332f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 24 Dec 2008 21:08:37 +0100 Subject: Revert "x86: disable X86_PTRACE_BTS" This reverts commit 40f15ad8aadff5ebb621b17a6f303ad2cd3f847d. The CONFIG_X86_PTRACE_BTS bugs have been fixed via: c5dee61: x86, bts: memory accounting bf53de9: x86, bts: add fork and exit handling Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b54903efb39..85a78575956 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -521,7 +521,6 @@ config X86_PTRACE_BTS bool "Branch Trace Store" default y depends on X86_DEBUGCTLMSR - depends on BROKEN help This adds a ptrace interface to the hardware's branch trace store. -- cgit v1.2.3 From 1c06da81a5d042d5fba67c4c533b16ae62a174ab Mon Sep 17 00:00:00 2001 From: Kent Liu Date: Fri, 31 Oct 2008 16:52:58 +0800 Subject: crypto: crc32c-intel - Update copyright head The original copyright head for crc32c-intel.c is incorrect. Please merge the patch to update it. Signed-Off-By: Kent Liu Signed-off-by: Herbert Xu --- arch/x86/crypto/crc32c-intel.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c index 070afc5b6c9..a2c539cc52b 100644 --- a/arch/x86/crypto/crc32c-intel.c +++ b/arch/x86/crypto/crc32c-intel.c @@ -6,13 +6,22 @@ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual * Volume 2A: Instruction Set Reference, A-M * - * Copyright (c) 2008 Austin Zhang - * Copyright (c) 2008 Kent Liu + * Copyright (C) 2008 Intel Corporation + * Authors: Austin Zhang + * Kent Liu * * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. * */ #include @@ -194,4 +203,3 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS("crc32c"); MODULE_ALIAS("crc32c-intel"); - -- cgit v1.2.3 From b7e8bdadce6317eb13c13b9451d7114614aa1450 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 6 Nov 2008 16:56:41 +0800 Subject: crypto: crc32c-intel - Switch to shash This patch changes crc32c-intel to the new shash interface. Signed-off-by: Herbert Xu --- arch/x86/crypto/crc32c-intel.c | 101 +++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c index a2c539cc52b..b9d00261703 100644 --- a/arch/x86/crypto/crc32c-intel.c +++ b/arch/x86/crypto/crc32c-intel.c @@ -84,99 +84,92 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len * If your algorithm starts with ~0, then XOR with ~0 before you set * the seed. */ -static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key, +static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key, unsigned int keylen) { - u32 *mctx = crypto_ahash_ctx(hash); + u32 *mctx = crypto_shash_ctx(hash); if (keylen != sizeof(u32)) { - crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } *mctx = le32_to_cpup((__le32 *)key); return 0; } -static int crc32c_intel_init(struct ahash_request *req) +static int crc32c_intel_init(struct shash_desc *desc) { - u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); - u32 *crcp = ahash_request_ctx(req); + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); *crcp = *mctx; return 0; } -static int crc32c_intel_update(struct ahash_request *req) +static int crc32c_intel_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - struct crypto_hash_walk walk; - u32 *crcp = ahash_request_ctx(req); - u32 crc = *crcp; - int nbytes; + u32 *crcp = shash_desc_ctx(desc); - for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; - nbytes = crypto_hash_walk_done(&walk, 0)) - crc = crc32c_intel_le_hw(crc, walk.data, nbytes); - - *crcp = crc; + *crcp = crc32c_intel_le_hw(*crcp, data, len); return 0; } -static int crc32c_intel_final(struct ahash_request *req) +static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) { - u32 *crcp = ahash_request_ctx(req); - - *(__le32 *)req->result = ~cpu_to_le32p(crcp); + *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); return 0; } -static int crc32c_intel_digest(struct ahash_request *req) +static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct crypto_hash_walk walk; - u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); - u32 crc = *mctx; - int nbytes; + return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out); +} - for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; - nbytes = crypto_hash_walk_done(&walk, 0)) - crc = crc32c_intel_le_hw(crc, walk.data, nbytes); +static int crc32c_intel_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); - *(__le32 *)req->result = ~cpu_to_le32(crc); + *(__le32 *)out = ~cpu_to_le32p(crcp); return 0; } +static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + static int crc32c_intel_cra_init(struct crypto_tfm *tfm) { u32 *key = crypto_tfm_ctx(tfm); *key = ~0; - tfm->crt_ahash.reqsize = sizeof(u32); - return 0; } -static struct crypto_alg alg = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-intel", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_AHASH, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_alignmask = 3, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg.cra_list), - .cra_init = crc32c_intel_cra_init, - .cra_type = &crypto_ahash_type, - .cra_u = { - .ahash = { - .digestsize = CHKSUM_DIGEST_SIZE, - .setkey = crc32c_intel_setkey, - .init = crc32c_intel_init, - .update = crc32c_intel_update, - .final = crc32c_intel_final, - .digest = crc32c_intel_digest, - } +static struct shash_alg alg = { + .setkey = crc32c_intel_setkey, + .init = crc32c_intel_init, + .update = crc32c_intel_update, + .final = crc32c_intel_final, + .finup = crc32c_intel_finup, + .digest = crc32c_intel_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-intel", + .cra_priority = 200, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32c_intel_cra_init, } }; @@ -184,14 +177,14 @@ static struct crypto_alg alg = { static int __init crc32c_intel_mod_init(void) { if (cpu_has_xmm4_2) - return crypto_register_alg(&alg); + return crypto_register_shash(&alg); else return -ENODEV; } static void __exit crc32c_intel_mod_fini(void) { - crypto_unregister_alg(&alg); + crypto_unregister_shash(&alg); } module_init(crc32c_intel_mod_init); -- cgit v1.2.3 From 0ca59dd948a51c95d5a366d35f897bc5ef9df55d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Dec 2008 23:30:02 +0100 Subject: tracing/ftrace: don't trace on early stage of a secondary cpu boot, v3 Impact: fix a crash/hard-reboot on certain configs while enabling cpu runtime On some archs, the boot of a secondary cpu can have an early fragile state. On x86-64, the pda is not initialized on the first stage of a cpu boot but it is needed to get the cpu number and the current task pointer. This data is needed during tracing. As they were dereferenced at this stage, we got a crash while tracing a cpu being enabled at runtime. Some other archs like ia64 can have such kind of issue too. Changes on v2: We dropped the previous solution of a per-arch called function to guess the current state of a cpu. That could slow down the tracing. This patch removes the -pg flag on arch/x86/kernel/cpu/common.c where the low level cpu boot functions exist, on start_secondary() and a helper function used at this stage. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 3 ++- arch/x86/kernel/cpu/Makefile | 5 +++++ arch/x86/kernel/smpboot.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c2a812ebde8..b8a1799ea87 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -85,7 +85,8 @@ static inline void native_write_msr(unsigned int msr, asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); } -static inline int native_write_msr_safe(unsigned int msr, +/* Can be uninlined because referenced by paravirt */ +notrace static inline int native_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int err; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c05..4ae495a313f 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -2,6 +2,11 @@ # Makefile for x86-compatible CPU details and quirks # +# Don't trace early stages of a secondary CPU boot +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_common.o = -pg +endif + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f71f96fc9e6..f6174d22902 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -287,7 +287,7 @@ static int __cpuinitdata unsafe_smp; /* * Activate a secondary processor. */ -static void __cpuinit start_secondary(void *unused) +notrace static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too -- cgit v1.2.3 From 1fcccb008be12ea823aaa392758e1e41fb82de9a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Tue, 23 Dec 2008 21:50:11 +0530 Subject: x86: traps.c replace #if CONFIG_X86_32 with #ifdef CONFIG_X86_32 Impact: cleanup, avoid warning on X86_64 Fixes this warning on X86_64: CC arch/x86/kernel/traps.o arch/x86/kernel/traps.c:695:5: warning: "CONFIG_X86_32" is not defined Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c320c29255c..f37cee75ab5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -691,7 +691,7 @@ void math_error(void __user *ip) err = swd & ~cwd & 0x3f; -#if CONFIG_X86_32 +#ifdef CONFIG_X86_32 if (!err) return; #endif -- cgit v1.2.3 From fc5243d98ac2575ad14a974b3c097e9ba874c03d Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 25 Dec 2008 13:38:35 +0100 Subject: [S390] arch_setup_additional_pages arguments arch_setup_additional_pages currently gets two arguments, the binary format descripton and an indication if the process uses an executable stack or not. The second argument is not used by anybody, it could be removed without replacement. What actually does make sense is to pass an indication if the process uses the elf interpreter or not. The glibc code will not use anything from the vdso if the process does not use the dynamic linker, so for statically linked binaries the architecture backend can choose not to map the vdso. Acked-by: Ingo Molnar Signed-off-by: Martin Schwidefsky --- arch/x86/include/asm/elf.h | 2 +- arch/x86/vdso/vdso32-setup.c | 2 +- arch/x86/vdso/vma.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 40ca1bea791..f51a3ddde01 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -325,7 +325,7 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int uses_interp); extern int syscall32_setup_pages(struct linux_binprm *, int exstack); #define compat_arch_setup_additional_pages syscall32_setup_pages diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 513f330c583..1241f118ab5 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -310,7 +310,7 @@ int __init sysenter_setup(void) } /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 257ba4a10ab..9c98cc6ba97 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -98,7 +98,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) /* Setup a VMA at program startup for the vsyscall page. Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; -- cgit v1.2.3