diff options
Diffstat (limited to 'arch')
30 files changed, 1188 insertions, 1087 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f..d4d4cb7629e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -242,21 +242,13 @@ config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER -if ACPI config X86_MPPARSE - def_bool y - bool "Enable MPS table" + bool "Enable MPS table" if ACPI + default y depends on X86_LOCAL_APIC help For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it -endif - -if !ACPI -config X86_MPPARSE - def_bool y - depends on X86_LOCAL_APIC -endif choice prompt "Subarchitecture Type" diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 4bc02b23674..e82ebd65226 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -572,11 +572,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->dx = (unsigned long) &frame->info; regs->cx = (unsigned long) &frame->uc; - /* Make -mregparm=3 work */ - regs->ax = sig; - regs->dx = (unsigned long) &frame->info; - regs->cx = (unsigned long) &frame->uc; - loadsegment(ds, __USER32_DS); loadsegment(es, __USER32_DS); diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 36001032271..9fa9dcdf344 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -168,7 +168,15 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) */ static inline void change_bit(int nr, volatile unsigned long *addr) { - asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr)); + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "xorb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "iq" ((u8)CONST_MASK(nr))); + } else { + asm volatile(LOCK_PREFIX "btc %1,%0" + : BITOP_ADDR(addr) + : "Ir" (nr)); + } } /** diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h index e02ae2d89ac..f110ad417df 100644 --- a/arch/x86/include/asm/byteorder.h +++ b/arch/x86/include/asm/byteorder.h @@ -4,26 +4,33 @@ #include <asm/types.h> #include <linux/compiler.h> -#ifdef __GNUC__ +#define __LITTLE_ENDIAN -#ifdef __i386__ - -static inline __attribute_const__ __u32 ___arch__swab32(__u32 x) +static inline __attribute_const__ __u32 __arch_swab32(__u32 val) { -#ifdef CONFIG_X86_BSWAP - asm("bswap %0" : "=r" (x) : "0" (x)); -#else +#ifdef __i386__ +# ifdef CONFIG_X86_BSWAP + asm("bswap %0" : "=r" (val) : "0" (val)); +# else asm("xchgb %b0,%h0\n\t" /* swap lower bytes */ "rorl $16,%0\n\t" /* swap words */ "xchgb %b0,%h0" /* swap higher bytes */ - : "=q" (x) - : "0" (x)); + : "=q" (val) + : "0" (val)); +# endif + +#else /* __i386__ */ + asm("bswapl %0" + : "=r" (val) + : "0" (val)); #endif - return x; + return val; } +#define __arch_swab32 __arch_swab32 -static inline __attribute_const__ __u64 ___arch__swab64(__u64 val) +static inline __attribute_const__ __u64 __arch_swab64(__u64 val) { +#ifdef __i386__ union { struct { __u32 a; @@ -32,50 +39,27 @@ static inline __attribute_const__ __u64 ___arch__swab64(__u64 val) __u64 u; } v; v.u = val; -#ifdef CONFIG_X86_BSWAP +# ifdef CONFIG_X86_BSWAP asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b)); -#else - v.s.a = ___arch__swab32(v.s.a); - v.s.b = ___arch__swab32(v.s.b); +# else + v.s.a = __arch_swab32(v.s.a); + v.s.b = __arch_swab32(v.s.b); asm("xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b)); -#endif +# endif return v.u; -} - #else /* __i386__ */ - -static inline __attribute_const__ __u64 ___arch__swab64(__u64 x) -{ asm("bswapq %0" - : "=r" (x) - : "0" (x)); - return x; -} - -static inline __attribute_const__ __u32 ___arch__swab32(__u32 x) -{ - asm("bswapl %0" - : "=r" (x) - : "0" (x)); - return x; -} - + : "=r" (val) + : "0" (val)); + return val; #endif +} +#define __arch_swab64 __arch_swab64 -/* Do not define swab16. Gcc is smart enough to recognize "C" version and - convert it into rotation or exhange. */ - -#define __arch__swab64(x) ___arch__swab64(x) -#define __arch__swab32(x) ___arch__swab32(x) - -#define __BYTEORDER_HAS_U64__ - -#endif /* __GNUC__ */ - -#include <linux/byteorder/little_endian.h> +#include <linux/byteorder.h> #endif /* _ASM_X86_BYTEORDER_H */ diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 804b6e6be92..3afc5e87cfd 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -6,56 +6,91 @@ #endif /* - Macros for dwarf2 CFI unwind table entries. - See "as.info" for details on these pseudo ops. Unfortunately - they are only supported in very new binutils, so define them - away for older version. + * Macros for dwarf2 CFI unwind table entries. + * See "as.info" for details on these pseudo ops. Unfortunately + * they are only supported in very new binutils, so define them + * away for older version. */ #ifdef CONFIG_AS_CFI -#define CFI_STARTPROC .cfi_startproc -#define CFI_ENDPROC .cfi_endproc -#define CFI_DEF_CFA .cfi_def_cfa -#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register -#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset -#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset -#define CFI_OFFSET .cfi_offset -#define CFI_REL_OFFSET .cfi_rel_offset -#define CFI_REGISTER .cfi_register -#define CFI_RESTORE .cfi_restore -#define CFI_REMEMBER_STATE .cfi_remember_state -#define CFI_RESTORE_STATE .cfi_restore_state -#define CFI_UNDEFINED .cfi_undefined +#define CFI_STARTPROC .cfi_startproc +#define CFI_ENDPROC .cfi_endproc +#define CFI_DEF_CFA .cfi_def_cfa +#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register +#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset +#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset +#define CFI_OFFSET .cfi_offset +#define CFI_REL_OFFSET .cfi_rel_offset +#define CFI_REGISTER .cfi_register +#define CFI_RESTORE .cfi_restore +#define CFI_REMEMBER_STATE .cfi_remember_state +#define CFI_RESTORE_STATE .cfi_restore_state +#define CFI_UNDEFINED .cfi_undefined #ifdef CONFIG_AS_CFI_SIGNAL_FRAME -#define CFI_SIGNAL_FRAME .cfi_signal_frame +#define CFI_SIGNAL_FRAME .cfi_signal_frame #else #define CFI_SIGNAL_FRAME #endif #else -/* Due to the structure of pre-exisiting code, don't use assembler line - comment character # to ignore the arguments. Instead, use a dummy macro. */ +/* + * Due to the structure of pre-exisiting code, don't use assembler line + * comment character # to ignore the arguments. Instead, use a dummy macro. + */ .macro cfi_ignore a=0, b=0, c=0, d=0 .endm -#define CFI_STARTPROC cfi_ignore -#define CFI_ENDPROC cfi_ignore -#define CFI_DEF_CFA cfi_ignore +#define CFI_STARTPROC cfi_ignore +#define CFI_ENDPROC cfi_ignore +#define CFI_DEF_CFA cfi_ignore #define CFI_DEF_CFA_REGISTER cfi_ignore #define CFI_DEF_CFA_OFFSET cfi_ignore #define CFI_ADJUST_CFA_OFFSET cfi_ignore -#define CFI_OFFSET cfi_ignore -#define CFI_REL_OFFSET cfi_ignore -#define CFI_REGISTER cfi_ignore -#define CFI_RESTORE cfi_ignore -#define CFI_REMEMBER_STATE cfi_ignore -#define CFI_RESTORE_STATE cfi_ignore -#define CFI_UNDEFINED cfi_ignore -#define CFI_SIGNAL_FRAME cfi_ignore +#define CFI_OFFSET cfi_ignore +#define CFI_REL_OFFSET cfi_ignore +#define CFI_REGISTER cfi_ignore +#define CFI_RESTORE cfi_ignore +#define CFI_REMEMBER_STATE cfi_ignore +#define CFI_RESTORE_STATE cfi_ignore +#define CFI_UNDEFINED cfi_ignore +#define CFI_SIGNAL_FRAME cfi_ignore #endif +/* + * An attempt to make CFI annotations more or less + * correct and shorter. It is implied that you know + * what you're doing if you use them. + */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_X86_64 + .macro pushq_cfi reg + pushq \reg + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro popq_cfi reg + popq \reg + CFI_ADJUST_CFA_OFFSET -8 + .endm + + .macro movq_cfi reg offset=0 + movq %\reg, \offset(%rsp) + CFI_REL_OFFSET \reg, \offset + .endm + + .macro movq_cfi_restore offset reg + movq \offset(%rsp), %\reg + CFI_RESTORE \reg + .endm +#else /*!CONFIG_X86_64*/ + + /* 32bit defenitions are missed yet */ + +#endif /*!CONFIG_X86_64*/ +#endif /*__ASSEMBLY__*/ + #endif /* _ASM_X86_DWARF2_H */ diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h index 5ca135e72f2..cf7954d1405 100644 --- a/arch/x86/include/asm/hardirq_32.h +++ b/arch/x86/include/asm/hardirq_32.h @@ -22,6 +22,8 @@ DECLARE_PER_CPU(irq_cpustat_t, irq_stat); #define __ARCH_IRQ_STAT #define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member) +#define inc_irq_stat(member) (__get_cpu_var(irq_stat).member++) + void ack_bad_irq(unsigned int irq); #include <linux/irq_cpustat.h> diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h index 1ba381fc51d..b5a6b5d5670 100644 --- a/arch/x86/include/asm/hardirq_64.h +++ b/arch/x86/include/asm/hardirq_64.h @@ -11,6 +11,8 @@ #define __ARCH_IRQ_STAT 1 +#define inc_irq_stat(member) add_pda(member, 1) + #define local_softirq_pending() read_pda(__softirq_pending) #define __ARCH_SET_SOFTIRQ_PENDING 1 diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b97aecb0b61..8de644b6b95 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -109,9 +109,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif #endif -#ifdef CONFIG_X86_32 -extern void (*const interrupt[NR_VECTORS])(void); -#endif +extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index bae0eda9548..28e409fc73f 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -31,10 +31,6 @@ static inline int irq_canonicalize(int irq) # endif #endif -#ifdef CONFIG_IRQBALANCE -extern int irqbalance_disable(char *str); -#endif - #ifdef CONFIG_HOTPLUG_CPU #include <linux/cpumask.h> extern void fixup_irqs(cpumask_t map); diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h index af2f02d27fc..86afd747345 100644 --- a/arch/x86/include/asm/irq_regs_32.h +++ b/arch/x86/include/asm/irq_regs_32.h @@ -9,6 +9,8 @@ #include <asm/percpu.h> +#define ARCH_HAS_OWN_IRQ_REGS + DECLARE_PER_CPU(struct pt_regs *, irq_regs); static inline struct pt_regs *get_irq_regs(void) diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index f61ee8f937e..5d98d0b68ff 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -57,5 +57,65 @@ #define __ALIGN_STR ".align 16,0x90" #endif +/* + * to check ENTRY_X86/END_X86 and + * KPROBE_ENTRY_X86/KPROBE_END_X86 + * unbalanced-missed-mixed appearance + */ +#define __set_entry_x86 .set ENTRY_X86_IN, 0 +#define __unset_entry_x86 .set ENTRY_X86_IN, 1 +#define __set_kprobe_x86 .set KPROBE_X86_IN, 0 +#define __unset_kprobe_x86 .set KPROBE_X86_IN, 1 + +#define __macro_err_x86 .error "ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed" + +#define __check_entry_x86 \ + .ifdef ENTRY_X86_IN; \ + .ifeq ENTRY_X86_IN; \ + __macro_err_x86; \ + .abort; \ + .endif; \ + .endif + +#define __check_kprobe_x86 \ + .ifdef KPROBE_X86_IN; \ + .ifeq KPROBE_X86_IN; \ + __macro_err_x86; \ + .abort; \ + .endif; \ + .endif + +#define __check_entry_kprobe_x86 \ + __check_entry_x86; \ + __check_kprobe_x86 + +#define ENTRY_KPROBE_FINAL_X86 __check_entry_kprobe_x86 + +#define ENTRY_X86(name) \ + __check_entry_kprobe_x86; \ + __set_entry_x86; \ + .globl name; \ + __ALIGN; \ + name: + +#define END_X86(name) \ + __unset_entry_x86; \ + __check_entry_kprobe_x86; \ + .size name, .-name + +#define KPROBE_ENTRY_X86(name) \ + __check_entry_kprobe_x86; \ + __set_kprobe_x86; \ + .pushsection .kprobes.text, "ax"; \ + .globl name; \ + __ALIGN; \ + name: + +#define KPROBE_END_X86(name) \ + __unset_kprobe_x86; \ + __check_entry_kprobe_x86; \ + .size name, .-name; \ + .popsection + #endif /* _ASM_X86_LINKAGE_H */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 9cd83a8e40d..38ae163cc91 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -34,8 +34,6 @@ static inline cycles_t get_cycles(void) static __always_inline cycles_t vget_cycles(void) { - cycles_t cycles; - /* * We only do VDSOs on TSC capable CPUs, so this shouldnt * access boot_cpu_data (which is not VDSO-safe): @@ -44,11 +42,7 @@ static __always_inline cycles_t vget_cycles(void) if (!cpu_has_tsc) return 0; #endif - rdtsc_barrier(); - cycles = (cycles_t)__native_read_tsc(); - rdtsc_barrier(); - - return cycles; + return (cycles_t)__native_read_tsc(); } extern void tsc_init(void); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b62a7667828..3de1f235045 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -12,6 +12,7 @@ CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_early_printk.o = -pg endif # diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index fc26ff390f1..47cbaa237b7 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -777,11 +777,7 @@ static void local_apic_timer_interrupt(void) /* * the NMI deadlock-detector uses this. */ -#ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); -#else - per_cpu(irq_stat, cpu).apic_timer_irqs++; -#endif + inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); } @@ -1681,14 +1677,11 @@ void smp_spurious_interrupt(struct pt_regs *regs) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); -#ifdef CONFIG_X86_64 - add_pda(irq_spurious_count, 1); -#else + inc_irq_stat(irq_spurious_count); + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); - __get_cpu_var(irq_stat).irq_spurious_count++; -#endif irq_exit(); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 5eb390a4b2e..748c8f9e7a0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -237,7 +237,7 @@ asmlinkage void mce_threshold_interrupt(void) } } out: - add_pda(irq_threshold_count, 1); + inc_irq_stat(irq_threshold_count); irq_exit(); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index c17eaf5dd6d..4b48f251fd3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(void) if (therm_throt_process(msr_val & 1)) mce_log_therm_throt_event(smp_processor_id(), msr_val); - add_pda(irq_thermal_count, 1); + inc_irq_stat(irq_thermal_count); irq_exit(); } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597ef9ca..fe7014176eb 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -619,28 +619,37 @@ END(syscall_badsys) 27:; /* - * Build the entry stubs and pointer table with - * some assembler magic. + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. */ -.section .rodata,"a" +.section .init.rodata,"a" ENTRY(interrupt) .text - + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT ENTRY(irq_entries_start) RING0_INT_FRAME -vector=0 -.rept NR_VECTORS - ALIGN - .if vector +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 - .endif -1: pushl $~(vector) + .endif +1: pushl $(~vector+0x80) /* Note: always in signed byte range */ CFI_ADJUST_CFA_OFFSET 4 - jmp common_interrupt - .previous + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous .long 1b - .text + .text vector=vector+1 + .endif + .endr +2: jmp common_interrupt .endr END(irq_entries_start) @@ -652,8 +661,9 @@ END(interrupt) * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: */ - ALIGN + .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ SAVE_ALL TRACE_IRQS_OFF movl %esp,%eax @@ -678,65 +688,6 @@ ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 - ALIGN -error_code: - /* the function address is in %fs's slot on the stack */ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebx, 0 - cld - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - /*CFI_REGISTER es, ecx*/ - movl PT_FS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_FS(%esp) - /*CFI_REL_OFFSET fs, ES*/ - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(page_fault) - ENTRY(coprocessor_error) RING0_INT_FRAME pushl $0 @@ -767,140 +718,6 @@ ENTRY(device_not_available) CFI_ENDPROC END(device_not_available) -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -#define FIX_STACK(offset, ok, label) \ - cmpw $__KERNEL_CS,4(%esp); \ - jne ok; \ -label: \ - movl TSS_sysenter_sp0+offset(%esp),%esp; \ - CFI_DEF_CFA esp, 0; \ - CFI_UNDEFINED eip; \ - pushfl; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $__KERNEL_CS; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $sysenter_past_esp; \ - CFI_ADJUST_CFA_OFFSET 4; \ - CFI_REL_OFFSET eip, 0 - -KPROBE_ENTRY(debug) - RING0_INT_FRAME - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) -debug_stack_correct: - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(debug) - -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -KPROBE_ENTRY(nmi) - RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - je nmi_espfix_stack - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_nocheck_notrace - CFI_ENDPROC - -nmi_stack_fixup: - RING0_INT_FRAME - FIX_STACK(12,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK(24,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. - * - * create the pointer to lss back - */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 - addw $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 - .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 - jmp irq_return - CFI_ENDPROC -KPROBE_END(nmi) - #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iret @@ -916,19 +733,6 @@ ENTRY(native_irq_enable_sysexit) END(native_irq_enable_sysexit) #endif -KPROBE_ENTRY(int3) - RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(int3) - ENTRY(overflow) RING0_INT_FRAME pushl $0 @@ -993,14 +797,6 @@ ENTRY(stack_segment) CFI_ENDPROC END(stack_segment) -KPROBE_ENTRY(general_protection) - RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 - jmp error_code - CFI_ENDPROC -KPROBE_END(general_protection) - ENTRY(alignment_check) RING0_EC_FRAME pushl $do_alignment_check @@ -1051,6 +847,7 @@ ENTRY(kernel_thread_helper) push %eax CFI_ADJUST_CFA_OFFSET 4 call do_exit + ud2 # padding for call trace CFI_ENDPROC ENDPROC(kernel_thread_helper) @@ -1210,3 +1007,227 @@ END(mcount) #include "syscall_table_32.S" syscall_table_size=(.-sys_call_table) + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + /* the function address is in %fs's slot on the stack */ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ + movl PT_FS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %ecx, PT_FS(%esp) + /*CFI_REL_OFFSET fs, ES*/ + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_sysenter_sp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ + pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $__KERNEL_CS; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 + +ENTRY(debug) + RING0_INT_FRAME + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_espfix_stack + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_nocheck_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 + jmp irq_return + CFI_ENDPROC +END(nmi) + +ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC +END(int3) + +ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(general_protection) + +/* + * End of kprobes section + */ + .popsection diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a..3194636a429 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -11,15 +11,15 @@ * * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. - * - * Normal syscalls and interrupts don't save a full stack frame, this is + * + * Normal syscalls and interrupts don't save a full stack frame, this is * only done for syscall tracing, signals or fork/exec et.al. - * - * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. * - partial stack frame: partially saved registers upto R11. - * - full stack frame: Like partial stack frame, but all register saved. + * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: * - CFI macros are used to generate dwarf2 unwind information for better @@ -60,7 +60,6 @@ #define __AUDIT_ARCH_LE 0x40000000 .code64 - #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) @@ -142,7 +141,7 @@ END(mcount) #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args -#endif +#endif #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) @@ -161,29 +160,29 @@ ENTRY(native_usergs_sysret64) .endm /* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based * fast path FIXUP_TOP_OF_STACK is needed. * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs * manipulation. - */ - - /* %rsp:at FRAMEEND */ - .macro FIXUP_TOP_OF_STACK tmp - movq %gs:pda_oldrsp,\tmp - movq \tmp,RSP(%rsp) - movq $__USER_DS,SS(%rsp) - movq $__USER_CS,CS(%rsp) - movq $-1,RCX(%rsp) - movq R11(%rsp),\tmp /* get eflags */ - movq \tmp,EFLAGS(%rsp) + */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp offset=0 + movq %gs:pda_oldrsp,\tmp + movq \tmp,RSP+\offset(%rsp) + movq $__USER_DS,SS+\offset(%rsp) + movq $__USER_CS,CS+\offset(%rsp) + movq $-1,RCX+\offset(%rsp) + movq R11+\offset(%rsp),\tmp /* get eflags */ + movq \tmp,EFLAGS+\offset(%rsp) .endm - .macro RESTORE_TOP_OF_STACK tmp,offset=0 - movq RSP-\offset(%rsp),\tmp - movq \tmp,%gs:pda_oldrsp - movq EFLAGS-\offset(%rsp),\tmp - movq \tmp,R11-\offset(%rsp) + .macro RESTORE_TOP_OF_STACK tmp offset=0 + movq RSP+\offset(%rsp),\tmp + movq \tmp,%gs:pda_oldrsp + movq EFLAGS+\offset(%rsp),\tmp + movq \tmp,R11+\offset(%rsp) .endm .macro FAKE_STACK_FRAME child_rip @@ -195,7 +194,7 @@ ENTRY(native_usergs_sysret64) pushq %rax /* rsp */ CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rsp,0 - pushq $(1<<9) /* eflags - interrupts on */ + pushq $X86_EFLAGS_IF /* eflags - interrupts on */ CFI_ADJUST_CFA_OFFSET 8 /*CFI_REL_OFFSET rflags,0*/ pushq $__KERNEL_CS /* cs */ @@ -213,62 +212,184 @@ ENTRY(native_usergs_sysret64) CFI_ADJUST_CFA_OFFSET -(6*8) .endm - .macro CFI_DEFAULT_STACK start=1 +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro EMPTY_FRAME start=1 offset=0 .if \start - CFI_STARTPROC simple + CFI_STARTPROC simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8 + CFI_DEF_CFA rsp,8+\offset .else - CFI_DEF_CFA_OFFSET SS+8 + CFI_DEF_CFA_OFFSET 8+\offset .endif - CFI_REL_OFFSET r15,R15 - CFI_REL_OFFSET r14,R14 - CFI_REL_OFFSET r13,R13 - CFI_REL_OFFSET r12,R12 - CFI_REL_OFFSET rbp,RBP - CFI_REL_OFFSET rbx,RBX - CFI_REL_OFFSET r11,R11 - CFI_REL_OFFSET r10,R10 - CFI_REL_OFFSET r9,R9 - CFI_REL_OFFSET r8,R8 - CFI_REL_OFFSET rax,RAX - CFI_REL_OFFSET rcx,RCX - CFI_REL_OFFSET rdx,RDX - CFI_REL_OFFSET rsi,RSI - CFI_REL_OFFSET rdi,RDI - CFI_REL_OFFSET rip,RIP - /*CFI_REL_OFFSET cs,CS*/ - /*CFI_REL_OFFSET rflags,EFLAGS*/ - CFI_REL_OFFSET rsp,RSP - /*CFI_REL_OFFSET ss,SS*/ .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro INTR_FRAME start=1 offset=0 + EMPTY_FRAME \start, SS+8+\offset-RIP + /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ + CFI_REL_OFFSET rsp, RSP+\offset-RIP + /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ + /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ + CFI_REL_OFFSET rip, RIP+\offset-RIP + .endm + +/* + * initial frame state for exceptions with error code (and interrupts + * with vector already pushed) + */ + .macro XCPT_FRAME start=1 offset=0 + INTR_FRAME \start, RIP+\offset-ORIG_RAX + /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/ + .endm + +/* + * frame that enables calling into C. + */ + .macro PARTIAL_FRAME start=1 offset=0 + XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET + CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET + CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET + CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET + CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET + CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET + CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET + CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET + CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET + CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + .endm + +/* + * frame that enables passing a complete pt_regs to a C function. + */ + .macro DEFAULT_FRAME start=1 offset=0 + PARTIAL_FRAME \start, R11+\offset-R15 + CFI_REL_OFFSET rbx, RBX+\offset + CFI_REL_OFFSET rbp, RBP+\offset + CFI_REL_OFFSET r12, R12+\offset + CFI_REL_OFFSET r13, R13+\offset + CFI_REL_OFFSET r14, R14+\offset + CFI_REL_OFFSET r15, R15+\offset + .endm + +/* save partial stack frame */ +ENTRY(save_args) + XCPT_FRAME + cld + movq_cfi rdi, RDI+16-ARGOFFSET + movq_cfi rsi, RSI+16-ARGOFFSET + movq_cfi rdx, RDX+16-ARGOFFSET + movq_cfi rcx, RCX+16-ARGOFFSET + movq_cfi rax, RAX+16-ARGOFFSET + movq_cfi r8, R8+16-ARGOFFSET + movq_cfi r9, R9+16-ARGOFFSET + movq_cfi r10, R10+16-ARGOFFSET + movq_cfi r11, R11+16-ARGOFFSET + + leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ + movq_cfi rbp, 8 /* push %rbp */ + leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ + testl $3, CS(%rdi) + je 1f + SWAPGS + /* + * irqcount is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ +1: incl %gs:pda_irqcount + jne 2f + popq_cfi %rax /* move return address... */ + mov %gs:pda_irqstackptr,%rsp + EMPTY_FRAME 0 + pushq_cfi %rax /* ... to the new stack */ + /* + * We entered an interrupt context - irqs are off: + */ +2: TRACE_IRQS_OFF + ret + CFI_ENDPROC +END(save_args) + +ENTRY(save_rest) + PARTIAL_FRAME 1 REST_SKIP+8 + movq 5*8+16(%rsp), %r11 /* save return address */ + movq_cfi rbx, RBX+16 + movq_cfi rbp, RBP+16 + movq_cfi r12, R12+16 + movq_cfi r13, R13+16 + movq_cfi r14, R14+16 + movq_cfi r15, R15+16 + movq %r11, 8(%rsp) /* return address */ + FIXUP_TOP_OF_STACK %r11, 16 + ret + CFI_ENDPROC +END(save_rest) + +/* save complete stack frame */ +ENTRY(save_paranoid) + XCPT_FRAME 1 RDI+8 + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(save_paranoid) + /* - * A newly forked process directly context switches into this. - */ -/* rdi: prev */ + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ ENTRY(ret_from_fork) - CFI_DEFAULT_STACK + DEFAULT_FRAME + push kernel_eflags(%rip) CFI_ADJUST_CFA_OFFSET 8 - popf # reset kernel eflags + popf # reset kernel eflags CFI_ADJUST_CFA_OFFSET -8 - call schedule_tail + + call schedule_tail # rdi: 'prev' task parameter + GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) - jnz rff_trace -rff_action: + + CFI_REMEMBER_STATE RESTORE_REST - testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? + + testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? je int_ret_from_sys_call - testl $_TIF_IA32,TI_flags(%rcx) + + testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET jnz int_ret_from_sys_call - RESTORE_TOP_OF_STACK %rdi,ARGOFFSET - jmp ret_from_sys_call -rff_trace: - movq %rsp,%rdi - call syscall_trace_leave - GET_THREAD_INFO(%rcx) - jmp rff_action + + RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET + jmp ret_from_sys_call # go to the SYSRET fastpath + + CFI_RESTORE_STATE CFI_ENDPROC END(ret_from_fork) @@ -278,20 +399,20 @@ END(ret_from_fork) * SYSCALL does not save anything on the stack and does not change the * stack pointer. */ - + /* - * Register setup: + * Register setup: * rax system call number * rdi arg0 - * rcx return address for syscall/sysret, C arg3 + * rcx return address for syscall/sysret, C arg3 * rsi arg1 - * rdx arg2 + * rdx arg2 * r10 arg3 (--> moved to rcx for C) * r8 arg4 * r9 arg5 * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. - * + * r12-r15,rbp,rbx saved by C code, not touched. + * * Interrupts are off on entry. * Only called from user space. * @@ -301,7 +422,7 @@ END(ret_from_fork) * When user can change the frames always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. - */ + */ ENTRY(system_call) CFI_STARTPROC simple @@ -317,7 +438,7 @@ ENTRY(system_call) */ ENTRY(system_call_after_swapgs) - movq %rsp,%gs:pda_oldrsp + movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp /* * No need to follow this irqs off/on section - it's straight @@ -325,7 +446,7 @@ ENTRY(system_call_after_swapgs) */ ENABLE_INTERRUPTS(CLBR_NONE) SAVE_ARGS 8,1 - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) @@ -339,19 +460,19 @@ system_call_fastpath: movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. - */ + * Has incomplete stack frame and undefined top of stack. + */ ret_from_sys_call: movl $_TIF_ALLWORK_MASK,%edi /* edi: flagmask */ -sysret_check: +sysret_check: LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl TI_flags(%rcx),%edx andl %edi,%edx - jnz sysret_careful + jnz sysret_careful CFI_REMEMBER_STATE /* * sysretq will re-enable interrupts: @@ -366,7 +487,7 @@ sysret_check: CFI_RESTORE_STATE /* Handle reschedules */ - /* edx: work, edi: workmask */ + /* edx: work, edi: workmask */ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal @@ -379,7 +500,7 @@ sysret_careful: CFI_ADJUST_CFA_OFFSET -8 jmp sysret_check - /* Handle a signal */ + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -388,17 +509,20 @@ sysret_signal: jc sysret_audit #endif /* edx: work flags (arg3) */ - leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 - call ptregscall_common + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call do_notify_resume + RESTORE_TOP_OF_STACK %r11 + RESTORE_REST movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check - + badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call @@ -437,7 +561,7 @@ sysret_audit: #endif /* CONFIG_AUDITSYSCALL */ /* Do syscall tracing */ -tracesys: +tracesys: #ifdef CONFIG_AUDITSYSCALL testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) jz auditsys @@ -460,8 +584,8 @@ tracesys: call *sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ - -/* + +/* * Syscall return path ending with IRET. * Has correct top of stack, but partial stack frame. */ @@ -505,18 +629,18 @@ int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST - /* Check for syscall exit trace */ + /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal pushq %rdi CFI_ADJUST_CFA_OFFSET 8 - leaq 8(%rsp),%rdi # &ptregs -> arg1 + leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave popq %rdi CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest - + int_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f @@ -531,22 +655,24 @@ int_restore_rest: jmp int_with_check CFI_ENDPROC END(system_call) - -/* + +/* * Certain special system calls that need to save a complete full stack frame. - */ - + */ .macro PTREGSCALL label,func,arg - .globl \label -\label: - leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ - jmp ptregscall_common +ENTRY(\label) + PARTIAL_FRAME 1 8 /* offset 8: return address */ + subq $REST_SKIP, %rsp + CFI_ADJUST_CFA_OFFSET REST_SKIP + call save_rest + DEFAULT_FRAME 0 8 /* offset 8: return address */ + leaq 8(%rsp), \arg /* pt_regs pointer */ + call \func + jmp ptregscall_common + CFI_ENDPROC END(\label) .endm - CFI_STARTPROC - PTREGSCALL stub_clone, sys_clone, %r8 PTREGSCALL stub_fork, sys_fork, %rdi PTREGSCALL stub_vfork, sys_vfork, %rdi @@ -554,25 +680,18 @@ END(\label) PTREGSCALL stub_iopl, sys_iopl, %rsi ENTRY(ptregscall_common) - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 - SAVE_REST - movq %r11, %r15 - CFI_REGISTER rip, r15 - FIXUP_TOP_OF_STACK %r11 - call *%rax - RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - CFI_REGISTER rip, r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip, 0 - ret + DEFAULT_FRAME 1 8 /* offset 8: return address */ + RESTORE_TOP_OF_STACK %r11, 8 + movq_cfi_restore R15+8, r15 + movq_cfi_restore R14+8, r14 + movq_cfi_restore R13+8, r13 + movq_cfi_restore R12+8, r12 + movq_cfi_restore RBP+8, rbp + movq_cfi_restore RBX+8, rbx + ret $REST_SKIP /* pop extended registers */ CFI_ENDPROC END(ptregscall_common) - + ENTRY(stub_execve) CFI_STARTPROC popq %r11 @@ -588,11 +707,11 @@ ENTRY(stub_execve) jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) - + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. - */ + */ ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp @@ -608,70 +727,70 @@ ENTRY(stub_rt_sigreturn) END(stub_rt_sigreturn) /* - * initial frame state for interrupts and exceptions + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. */ - .macro _frame ref - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-\ref - /*CFI_REL_OFFSET ss,SS-\ref*/ - CFI_REL_OFFSET rsp,RSP-\ref - /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ - /*CFI_REL_OFFSET cs,CS-\ref*/ - CFI_REL_OFFSET rip,RIP-\ref - .endm + .section .init.rodata,"a" +ENTRY(interrupt) + .text + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT +ENTRY(irq_entries_start) + INTR_FRAME +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR + CFI_ADJUST_CFA_OFFSET -8 + .endif +1: pushq $(~vector+0x80) /* Note: always in signed byte range */ + CFI_ADJUST_CFA_OFFSET 8 + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous + .quad 1b + .text +vector=vector+1 + .endif + .endr +2: jmp common_interrupt +.endr + CFI_ENDPROC +END(irq_entries_start) -/* initial frame state for interrupts (and exceptions without error code) */ -#define INTR_FRAME _frame RIP -/* initial frame state for exceptions with error code (and interrupts with - vector already pushed) */ -#define XCPT_FRAME _frame ORIG_RAX +.previous +END(interrupt) +.previous -/* +/* * Interrupt entry/exit. * * Interrupt entry points save only callee clobbered registers in fast path. - * - * Entry runs with interrupts off. - */ + * + * Entry runs with interrupts off. + */ -/* 0(%rsp): interrupt number */ +/* 0(%rsp): ~(interrupt number) */ .macro interrupt func - cld - SAVE_ARGS - leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler - pushq %rbp - /* - * Save rbp twice: One is for marking the stack frame, as usual, and the - * other, to fill pt_regs properly. This is because bx comes right - * before the last saved register in that structure, and not bp. If the - * base pointer were in the place bx is today, this would not be needed. - */ - movq %rbp, -8(%rsp) - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbp, 0 - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - testl $3,CS(%rdi) - je 1f - SWAPGS - /* irqcount is used to check if a CPU is already on an interrupt - stack or not. While this is essentially redundant with preempt_count - it is a little cheaper to use a separate counter in the PDA - (short of moving irq_enter into assembly, which would be too - much work) */ -1: incl %gs:pda_irqcount - cmoveq %gs:pda_irqstackptr,%rsp - push %rbp # backlink for old unwinder - /* - * We entered an interrupt context - irqs are off: - */ - TRACE_IRQS_OFF + subq $10*8, %rsp + CFI_ADJUST_CFA_OFFSET 10*8 + call save_args + PARTIAL_FRAME 0 call \func .endm -ENTRY(common_interrupt) + /* + * The interrupt stubs push (~vector+0x80) onto the stack and + * then jump to common_interrupt. + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: XCPT_FRAME + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: @@ -685,12 +804,12 @@ exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) je retint_kernel - + /* Interrupt came from user space */ /* * Has a correct top of stack, but a partial stack frame * %rcx: thread info. Interrupts off. - */ + */ retint_with_reschedule: movl $_TIF_WORK_MASK,%edi retint_check: @@ -763,20 +882,20 @@ retint_careful: pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule - popq %rdi + popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check - + retint_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST - movq $-1,ORIG_RAX(%rsp) + movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume @@ -798,324 +917,211 @@ ENTRY(retint_kernel) jnc retint_restore_args call preempt_schedule_irq jmp exit_intr -#endif +#endif CFI_ENDPROC END(common_interrupt) - + /* * APIC interrupts. - */ - .macro apicinterrupt num,func + */ +.macro apicinterrupt num sym do_sym +ENTRY(\sym) INTR_FRAME pushq $~(\num) CFI_ADJUST_CFA_OFFSET 8 - interrupt \func + interrupt \do_sym jmp ret_from_intr CFI_ENDPROC - .endm - -ENTRY(thermal_interrupt) - apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt -END(thermal_interrupt) - -ENTRY(threshold_interrupt) - apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt -END(threshold_interrupt) - -#ifdef CONFIG_SMP -ENTRY(reschedule_interrupt) - apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt -END(reschedule_interrupt) - - .macro INVALIDATE_ENTRY num -ENTRY(invalidate_interrupt\num) - apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt -END(invalidate_interrupt\num) - .endm +END(\sym) +.endm - INVALIDATE_ENTRY 0 - INVALIDATE_ENTRY 1 - INVALIDATE_ENTRY 2 - INVALIDATE_ENTRY 3 - INVALIDATE_ENTRY 4 - INVALIDATE_ENTRY 5 - INVALIDATE_ENTRY 6 - INVALIDATE_ENTRY 7 - -ENTRY(call_function_interrupt) - apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt -END(call_function_interrupt) -ENTRY(call_function_single_interrupt) - apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt -END(call_function_single_interrupt) -ENTRY(irq_move_cleanup_interrupt) - apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt -END(irq_move_cleanup_interrupt) +#ifdef CONFIG_SMP +apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt #endif -ENTRY(apic_timer_interrupt) - apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt -END(apic_timer_interrupt) +apicinterrupt UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt + +#ifdef CONFIG_SMP +apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ + invalidate_interrupt0 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ + invalidate_interrupt1 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ + invalidate_interrupt2 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ + invalidate_interrupt3 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ + invalidate_interrupt4 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ + invalidate_interrupt5 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ + invalidate_interrupt6 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ + invalidate_interrupt7 smp_invalidate_interrupt +#endif -ENTRY(uv_bau_message_intr1) - apicinterrupt 220,uv_bau_message_interrupt -END(uv_bau_message_intr1) +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt mce_threshold_interrupt +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt +#endif -ENTRY(error_interrupt) - apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt -END(error_interrupt) +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt -ENTRY(spurious_interrupt) - apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt -END(spurious_interrupt) - /* * Exception entry points. - */ - .macro zeroentry sym + */ +.macro zeroentry sym do_sym +ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 /* push error code/oldrax */ - CFI_ADJUST_CFA_OFFSET 8 - pushq %rax /* push real oldrax to the rdi slot */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $15*8,%rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call error_entry + DEFAULT_FRAME 0 + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm - .macro errorentry sym - XCPT_FRAME +.macro paranoidzeroentry sym do_sym +ENTRY(\sym) + INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq %rax + pushq $-1 /* ORIG_RAX: no syscall to restart */ CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry + subq $15*8, %rsp + call save_paranoid + TRACE_IRQS_OFF + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \do_sym + jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm - /* error code is on the stack already */ - /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0, irqtrace=1 - SAVE_ALL - cld - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f - SWAPGS - xorl %ebx,%ebx -1: - .if \ist - movq %gs:pda_data_offset, %rbp - .endif - .if \irqtrace - TRACE_IRQS_OFF - .endif - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi - movq $-1,ORIG_RAX(%rsp) - .if \ist - subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - call \sym - .if \ist - addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - DISABLE_INTERRUPTS(CLBR_NONE) - .if \irqtrace +.macro paranoidzeroentry_ist sym do_sym ist +ENTRY(\sym) + INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + pushq $-1 /* ORIG_RAX: no syscall to restart */ + CFI_ADJUST_CFA_OFFSET 8 + subq $15*8, %rsp + call save_paranoid TRACE_IRQS_OFF - .endif - .endm + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + movq %gs:pda_data_offset, %rbp + subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + call \do_sym + addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + jmp paranoid_exit /* %ebx: no swapgs flag */ + CFI_ENDPROC +END(\sym) +.endm - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - * - * "trace" is 0 for the NMI handler only, because irq-tracing - * is fundamentally NMI-unsafe. (we cannot change the soft and - * hard flags at once, atomically) - */ - .macro paranoidexit trace=1 - /* ebx: no swapgs flag */ -paranoid_exit\trace: - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore\trace - testl $3,CS(%rsp) - jnz paranoid_userspace\trace -paranoid_swapgs\trace: - .if \trace - TRACE_IRQS_IRETQ 0 - .endif - SWAPGS_UNSAFE_STACK -paranoid_restore\trace: - RESTORE_ALL 8 - jmp irq_return -paranoid_userspace\trace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs\trace - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule\trace - movl %ebx,%edx /* arg3: thread flags */ - .if \trace - TRACE_IRQS_ON - .endif - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - .if \trace - TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace -paranoid_schedule\trace: - .if \trace - TRACE_IRQS_ON - .endif - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - .if \trace - TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace +.macro errorentry sym do_sym +ENTRY(\sym) + XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + subq $15*8,%rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call error_entry + DEFAULT_FRAME 0 + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm -/* - * Exception entry point. This expects an error code/orig_rax on the stack - * and the exception handler in %rax. - */ -KPROBE_ENTRY(error_entry) - _frame RDI - CFI_REL_OFFSET rax,0 - /* rdi slot contains rax, oldrax contains error code */ - cld - subq $14*8,%rsp - CFI_ADJUST_CFA_OFFSET (14*8) - movq %rsi,13*8(%rsp) - CFI_REL_OFFSET rsi,RSI - movq 14*8(%rsp),%rsi /* load rax from rdi slot */ - CFI_REGISTER rax,rsi - movq %rdx,12*8(%rsp) - CFI_REL_OFFSET rdx,RDX - movq %rcx,11*8(%rsp) - CFI_REL_OFFSET rcx,RCX - movq %rsi,10*8(%rsp) /* store rax */ - CFI_REL_OFFSET rax,RAX - movq %r8, 9*8(%rsp) - CFI_REL_OFFSET r8,R8 - movq %r9, 8*8(%rsp) - CFI_REL_OFFSET r9,R9 - movq %r10,7*8(%rsp) - CFI_REL_OFFSET r10,R10 - movq %r11,6*8(%rsp) - CFI_REL_OFFSET r11,R11 - movq %rbx,5*8(%rsp) - CFI_REL_OFFSET rbx,RBX - movq %rbp,4*8(%rsp) - CFI_REL_OFFSET rbp,RBP - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12,R12 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13,R13 - movq %r14,1*8(%rsp) - CFI_REL_OFFSET r14,R14 - movq %r15,(%rsp) - CFI_REL_OFFSET r15,R15 - xorl %ebx,%ebx - testl $3,CS(%rsp) - je error_kernelspace -error_swapgs: - SWAPGS -error_sti: - TRACE_IRQS_OFF - movq %rdi,RDI(%rsp) - CFI_REL_OFFSET rdi,RDI - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) - call *%rax - /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ -error_exit: - movl %ebx,%eax - RESTORE_REST - DISABLE_INTERRUPTS(CLBR_NONE) + /* error code is on the stack already */ +.macro paranoiderrorentry sym do_sym +ENTRY(\sym) + XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + subq $15*8,%rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call save_paranoid + DEFAULT_FRAME 0 TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - testl %eax,%eax - jne retint_kernel - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_swapgs + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + call \do_sym + jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC +END(\sym) +.endm -error_kernelspace: - incl %ebx - /* There are two places in the kernel that can potentially fault with - usergs. Handle them here. The exception handlers after - iret run with kernel gs again, so don't set the user space flag. - B stepping K8s sometimes report an truncated RIP for IRET - exceptions returning to compat mode. Check for these here too. */ - leaq irq_return(%rip),%rcx - cmpq %rcx,RIP(%rsp) - je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP(%rsp) - je error_swapgs - cmpq $gs_change,RIP(%rsp) - je error_swapgs - jmp error_sti -KPROBE_END(error_entry) - - /* Reload gs selector with exception handling */ - /* edi: new selector */ +zeroentry divide_error do_divide_error +zeroentry overflow do_overflow +zeroentry bounds do_bounds +zeroentry invalid_op do_invalid_op +zeroentry device_not_available do_device_not_available +paranoiderrorentry double_fault do_double_fault +zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun +errorentry invalid_TSS do_invalid_TSS +errorentry segment_not_present do_segment_not_present +zeroentry spurious_interrupt_bug do_spurious_interrupt_bug +zeroentry coprocessor_error do_coprocessor_error +errorentry alignment_check do_alignment_check +zeroentry simd_coprocessor_error do_simd_coprocessor_error + + /* Reload gs selector with exception handling */ + /* edi: new selector */ ENTRY(native_load_gs_index) CFI_STARTPROC pushf CFI_ADJUST_CFA_OFFSET 8 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) - SWAPGS -gs_change: - movl %edi,%gs + SWAPGS +gs_change: + movl %edi,%gs 2: mfence /* workaround */ SWAPGS - popf + popf CFI_ADJUST_CFA_OFFSET -8 - ret + ret CFI_ENDPROC -ENDPROC(native_load_gs_index) - - .section __ex_table,"a" - .align 8 - .quad gs_change,bad_gs - .previous - .section .fixup,"ax" +END(native_load_gs_index) + + .section __ex_table,"a" + .align 8 + .quad gs_change,bad_gs + .previous + .section .fixup,"ax" /* running with kernelgs */ -bad_gs: +bad_gs: SWAPGS /* switch back to user gs */ xorl %eax,%eax - movl %eax,%gs - jmp 2b - .previous - + movl %eax,%gs + jmp 2b + .previous + /* * Create a kernel thread. * @@ -1138,7 +1144,7 @@ ENTRY(kernel_thread) xorl %r8d,%r8d xorl %r9d,%r9d - + # clone now call do_fork movq %rax,RAX(%rsp) @@ -1149,15 +1155,15 @@ ENTRY(kernel_thread) * so internally to the x86_64 port you can rely on kernel_thread() * not to reschedule the child before returning, this avoids the need * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] + * [Hopefully no generic code relies on the reschedule -AK] */ RESTORE_ALL UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(kernel_thread) - -child_rip: +END(kernel_thread) + +ENTRY(child_rip) pushq $0 # fake return address CFI_STARTPROC /* @@ -1170,8 +1176,9 @@ child_rip: # exit mov %eax, %edi call do_exit + ud2 # padding for call trace CFI_ENDPROC -ENDPROC(child_rip) +END(child_rip) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. @@ -1191,10 +1198,10 @@ ENDPROC(child_rip) ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 - SAVE_ALL + SAVE_ALL movq %rsp,%rcx call sys_execve - movq %rax, RAX(%rsp) + movq %rax, RAX(%rsp) RESTORE_REST testq %rax,%rax je int_ret_from_sys_call @@ -1202,129 +1209,7 @@ ENTRY(kernel_execve) UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(kernel_execve) - -KPROBE_ENTRY(page_fault) - errorentry do_page_fault -KPROBE_END(page_fault) - -ENTRY(coprocessor_error) - zeroentry do_coprocessor_error -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - zeroentry do_simd_coprocessor_error -END(simd_coprocessor_error) - -ENTRY(device_not_available) - zeroentry do_device_not_available -END(device_not_available) - - /* runs on exception stack */ -KPROBE_ENTRY(debug) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_debug, DEBUG_STACK - paranoidexit -KPROBE_END(debug) - - /* runs on exception stack */ -KPROBE_ENTRY(nmi) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi, 0, 0 -#ifdef CONFIG_TRACE_IRQFLAGS - paranoidexit 0 -#else - jmp paranoid_exit1 - CFI_ENDPROC -#endif -KPROBE_END(nmi) - -KPROBE_ENTRY(int3) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_int3, DEBUG_STACK - jmp paranoid_exit1 - CFI_ENDPROC -KPROBE_END(int3) - -ENTRY(overflow) - zeroentry do_overflow -END(overflow) - -ENTRY(bounds) - zeroentry do_bounds -END(bounds) - -ENTRY(invalid_op) - zeroentry do_invalid_op -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - zeroentry do_coprocessor_segment_overrun -END(coprocessor_segment_overrun) - - /* runs on exception stack */ -ENTRY(double_fault) - XCPT_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - paranoidentry do_double_fault - jmp paranoid_exit1 - CFI_ENDPROC -END(double_fault) - -ENTRY(invalid_TSS) - errorentry do_invalid_TSS -END(invalid_TSS) - -ENTRY(segment_not_present) - errorentry do_segment_not_present -END(segment_not_present) - - /* runs on exception stack */ -ENTRY(stack_segment) - XCPT_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - paranoidentry do_stack_segment - jmp paranoid_exit1 - CFI_ENDPROC -END(stack_segment) - -KPROBE_ENTRY(general_protection) - errorentry do_general_protection -KPROBE_END(general_protection) - -ENTRY(alignment_check) - errorentry do_alignment_check -END(alignment_check) - -ENTRY(divide_error) - zeroentry do_divide_error -END(divide_error) - -ENTRY(spurious_interrupt_bug) - zeroentry do_spurious_interrupt_bug -END(spurious_interrupt_bug) - -#ifdef CONFIG_X86_MCE - /* runs on exception stack */ -ENTRY(machine_check) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_machine_check - jmp paranoid_exit1 - CFI_ENDPROC -END(machine_check) -#endif +END(kernel_execve) /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) @@ -1344,40 +1229,33 @@ ENTRY(call_softirq) decl %gs:pda_irqcount ret CFI_ENDPROC -ENDPROC(call_softirq) - -KPROBE_ENTRY(ignore_sysret) - CFI_STARTPROC - mov $-ENOSYS,%eax - sysret - CFI_ENDPROC -ENDPROC(ignore_sysret) +END(call_softirq) #ifdef CONFIG_XEN -ENTRY(xen_hypervisor_callback) - zeroentry xen_do_hypervisor_callback -END(xen_hypervisor_callback) +zeroentry xen_hypervisor_callback xen_do_hypervisor_callback /* -# A note on the "critical region" in our callback handler. -# We want to avoid stacking callback handlers due to events occurring -# during handling of the last event. To do this, we keep events disabled -# until we've done all processing. HOWEVER, we must enable events before -# popping the stack frame (can't be done atomically) and so it would still -# be possible to get enough handler activations to overflow the stack. -# Although unlikely, bugs of that kind are hard to track down, so we'd -# like to avoid the possibility. -# So, on entry to the handler we detect whether we interrupted an -# existing activation in its critical region -- if so, we pop the current -# activation and restart the handler using the previous one. -*/ + * A note on the "critical region" in our callback handler. + * We want to avoid stacking callback handlers due to events occurring + * during handling of the last event. To do this, we keep events disabled + * until we've done all processing. HOWEVER, we must enable events before + * popping the stack frame (can't be done atomically) and so it would still + * be possible to get enough handler activations to overflow the stack. + * Although unlikely, bugs of that kind are hard to track down, so we'd + * like to avoid the possibility. + * So, on entry to the handler we detect whether we interrupted an + * existing activation in its critical region -- if so, we pop the current + * activation and restart the handler using the previous one. + */ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) CFI_STARTPROC -/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will - see the correct pointer to the pt_regs */ +/* + * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + * see the correct pointer to the pt_regs + */ movq %rdi, %rsp # we don't return, adjust the stack frame CFI_ENDPROC - CFI_DEFAULT_STACK + DEFAULT_FRAME 11: incl %gs:pda_irqcount movq %rsp,%rbp CFI_DEF_CFA_REGISTER rbp @@ -1392,23 +1270,26 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) END(do_hypervisor_callback) /* -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we do not need to fix up as Xen has already reloaded all segment -# registers that could be reloaded and zeroed the others. -# Category 2 we fix up by killing the current process. We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by comparing each saved segment register -# with its current contents: any discrepancy means we in category 1. -*/ + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we do not need to fix up as Xen has already reloaded all segment + * registers that could be reloaded and zeroed the others. + * Category 2 we fix up by killing the current process. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by comparing each saved segment register + * with its current contents: any discrepancy means we in category 1. + */ ENTRY(xen_failsafe_callback) - framesz = (RIP-0x30) /* workaround buggy gas */ - _frame framesz - CFI_REL_OFFSET rcx, 0 - CFI_REL_OFFSET r11, 8 + INTR_FRAME 1 (6*8) + /*CFI_REL_OFFSET gs,GS*/ + /*CFI_REL_OFFSET fs,FS*/ + /*CFI_REL_OFFSET es,ES*/ + /*CFI_REL_OFFSET ds,DS*/ + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 movw %ds,%cx cmpw %cx,0x10(%rsp) CFI_REMEMBER_STATE @@ -1429,12 +1310,9 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - pushq %rcx - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $0 /* RIP */ + pushq_cfi %r11 + pushq_cfi %rcx jmp general_protection CFI_RESTORE_STATE 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ @@ -1444,11 +1322,223 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $0 SAVE_ALL jmp error_exit CFI_ENDPROC END(xen_failsafe_callback) #endif /* CONFIG_XEN */ + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +paranoidzeroentry_ist debug do_debug DEBUG_STACK +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +paranoiderrorentry stack_segment do_stack_segment +errorentry general_protection do_general_protection +errorentry page_fault do_page_fault +#ifdef CONFIG_X86_MCE +paranoidzeroentry machine_check do_machine_check +#endif + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + + /* ebx: no swapgs flag */ +ENTRY(paranoid_exit) + INTR_FRAME + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore + testl $3,CS(%rsp) + jnz paranoid_userspace +paranoid_swapgs: + TRACE_IRQS_IRETQ 0 + SWAPGS_UNSAFE_STACK +paranoid_restore: + RESTORE_ALL 8 + jmp irq_return +paranoid_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule + movl %ebx,%edx /* arg3: thread flags */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp paranoid_userspace +paranoid_schedule: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + jmp paranoid_userspace + CFI_ENDPROC +END(paranoid_exit) + +/* + * Exception entry point. This expects an error code/orig_rax on the stack. + * returns in "no swapgs flag" in %ebx. + */ +ENTRY(error_entry) + XCPT_FRAME + CFI_ADJUST_CFA_OFFSET 15*8 + /* oldrax contains error code */ + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 + xorl %ebx,%ebx + testl $3,CS+8(%rsp) + je error_kernelspace +error_swapgs: + SWAPGS +error_sti: + TRACE_IRQS_OFF + ret + CFI_ENDPROC + +/* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. The exception handlers after iret run with + * kernel gs again, so don't set the user space flag. B stepping K8s + * sometimes report an truncated RIP for IRET exceptions returning to + * compat mode. Check for these here too. + */ +error_kernelspace: + incl %ebx + leaq irq_return(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + movl %ecx,%ecx /* zero extend */ + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti +END(error_entry) + + +/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +ENTRY(error_exit) + DEFAULT_FRAME + movl %ebx,%eax + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testl %eax,%eax + jne retint_kernel + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + jmp retint_swapgs + CFI_ENDPROC +END(error_exit) + + + /* runs on exception stack */ +ENTRY(nmi) + INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + pushq_cfi $-1 + subq $15*8, %rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call save_paranoid + DEFAULT_FRAME 0 + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp,%rdi + movq $-1,%rsi + call do_nmi +#ifdef CONFIG_TRACE_IRQFLAGS + /* paranoidexit; without TRACE_IRQS_OFF */ + /* ebx: no swapgs flag */ + DISABLE_INTERRUPTS(CLBR_NONE) + testl %ebx,%ebx /* swapgs needed? */ + jnz nmi_restore + testl $3,CS(%rsp) + jnz nmi_userspace +nmi_swapgs: + SWAPGS_UNSAFE_STACK +nmi_restore: + RESTORE_ALL 8 + jmp irq_return +nmi_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz nmi_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz nmi_schedule + movl %ebx,%edx /* arg3: thread flags */ + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + jmp nmi_userspace +nmi_schedule: + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + jmp nmi_userspace + CFI_ENDPROC +#else + jmp paranoid_exit + CFI_ENDPROC +#endif +END(nmi) + +ENTRY(ignore_sysret) + CFI_STARTPROC + mov $-ENOSYS,%eax + sysret + CFI_ENDPROC +END(ignore_sysret) + +/* + * End of kprobes section + */ + .popsection diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 60eb84eb77a..1d3d0e71b04 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,7 +18,6 @@ #include <asm/idle.h> #include <asm/smp.h> -#ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: * @@ -28,19 +27,18 @@ */ static inline void stack_overflow_check(struct pt_regs *regs) { +#ifdef CONFIG_DEBUG_STACKOVERFLOW u64 curbase = (u64)task_stack_page(current); - static unsigned long warned = -60*HZ; - - if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + 128 && - time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); - show_stack(NULL,NULL); - warned = jiffies; - } -} + + WARN_ONCE(regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128, + + "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); #endif +} /* * do_IRQ handles all normal device IRQ's (the special @@ -60,9 +58,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; -#ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); -#endif desc = irq_to_desc(irq); if (likely(desc)) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 845aa9803e8..607db63044a 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -129,7 +129,7 @@ void __init native_init_IRQ(void) for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i]); + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ff023539128..8670b3ce626 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -24,41 +24,6 @@ #include <asm/i8259.h> /* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.text\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt\n" \ - ".previous"); - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -/* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: * (these are usually mapped to vectors 0x30-0x3f) */ @@ -73,37 +38,6 @@ * * (these are usually mapped into the 0x30-0xff vector range) */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - /* * IRQ2 is cascade interrupt to second interrupt controller diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18f9b19f5f8..d18537ce2c7 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -178,11 +178,7 @@ static void native_smp_send_stop(void) void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_resched_count++; -#else - add_pda(irq_resched_count, 1); -#endif + inc_irq_stat(irq_resched_count); } void smp_call_function_interrupt(struct pt_regs *regs) @@ -190,11 +186,7 @@ void smp_call_function_interrupt(struct pt_regs *regs) ack_APIC_irq(); irq_enter(); generic_smp_call_function_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); } @@ -203,11 +195,7 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) ack_APIC_irq(); irq_enter(); generic_smp_call_function_single_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); } diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 77b400f06ea..65309e4cb1c 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc); irqreturn_t timer_interrupt(int irq, void *dev_id) { /* Keep nmi watchdog up to date */ - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++; + inc_irq_stat(irq0_irqs); #ifdef CONFIG_X86_IO_APIC if (timer_ack) { diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index cb19d650c21..1749cacde8b 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(profile_pc); irqreturn_t timer_interrupt(int irq, void *dev_id) { - add_pda(irq0_irqs, 1); + inc_irq_stat(irq0_irqs); global_clock_event->event_handler(global_clock_event); @@ -80,6 +80,8 @@ unsigned long __init calibrate_cpu(void) break; no_ctr_free = (i == 4); if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " + "cpu_khz value may be incorrect.\n"); i = 3; rdmsrl(MSR_K7_EVNTSEL3, evntsel3); wrmsrl(MSR_K7_EVNTSEL3, 0); diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index f4049f3513b..f374f83fca4 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -119,7 +119,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs) smp_mb__after_clear_bit(); out: put_cpu_no_resched(); - __get_cpu_var(irq_stat).irq_tlb_count++; + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 8f919ca6949..29887d7081a 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) out: ack_APIC_irq(); cpu_clear(cpu, f->flush_cpumask); - add_pda(irq_tlb_count, 1); + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 04d242ab016..d815293e6d9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -481,11 +481,7 @@ do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif + inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0b8b6690a86..ebf2f12900f 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -128,7 +128,16 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) gettimeofday(tv,NULL); return; } + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); now = vread(); + rdtsc_barrier(); + base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a5d8e1ace1c..50a779264bb 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -590,7 +590,8 @@ static void __init lguest_init_IRQ(void) * a straightforward 1 to 1 mapping, so force that here. */ __get_cpu_var(vector_irq)[vector] = i; if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, interrupt[vector]); + set_intr_gate(vector, + interrupt[vector-FIRST_EXTERNAL_VECTOR]); set_irq_chip_and_handler_name(i, &lguest_irq_controller, handle_level_irq, "level"); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c483f424207..3ffed259883 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -102,6 +102,8 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); + + return pmd_table; } #endif pud = pud_offset(pgd, 0); |