diff options
Diffstat (limited to 'arch/x86_64')
-rw-r--r-- | arch/x86_64/Kconfig | 14 | ||||
-rw-r--r-- | arch/x86_64/defconfig | 58 | ||||
-rw-r--r-- | arch/x86_64/kernel/Makefile | 1 | ||||
-rw-r--r-- | arch/x86_64/kernel/apic.c | 5 | ||||
-rw-r--r-- | arch/x86_64/kernel/entry.S | 11 | ||||
-rw-r--r-- | arch/x86_64/kernel/io_apic.c | 70 | ||||
-rw-r--r-- | arch/x86_64/kernel/mpparse.c | 22 | ||||
-rw-r--r-- | arch/x86_64/kernel/nmi.c | 248 | ||||
-rw-r--r-- | arch/x86_64/kernel/pmtimer.c | 101 | ||||
-rw-r--r-- | arch/x86_64/kernel/ptrace.c | 13 | ||||
-rw-r--r-- | arch/x86_64/kernel/setup.c | 17 | ||||
-rw-r--r-- | arch/x86_64/kernel/smpboot.c | 262 | ||||
-rw-r--r-- | arch/x86_64/kernel/time.c | 62 | ||||
-rw-r--r-- | arch/x86_64/kernel/vsyscall.c | 5 | ||||
-rw-r--r-- | arch/x86_64/mm/fault.c | 11 | ||||
-rw-r--r-- | arch/x86_64/mm/ioremap.c | 2 |
16 files changed, 612 insertions, 290 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 44ee7f6acf7..82cb2a3f127 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -303,6 +303,20 @@ config HPET_TIMER as it is off-chip. You can find the HPET spec at <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>. +config X86_PM_TIMER + bool "PM timer" + default y + help + Support the ACPI PM timer for time keeping. This is slow, + but is useful on some chipsets without HPET on systems with more + than one CPU. On a single processor or single socket multi core + system it is normally not required. + When the PM timer is active 64bit vsyscalls are disabled + and should not be enabled (/proc/sys/kernel/vsyscall64 should + not be changed). + The kernel selects the PM timer only as a last resort, so it is + useful to enable just in case. + config HPET_EMULATE_RTC bool "Provide RTC interrupt" depends on HPET_TIMER && RTC=y diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 9ce51dee30b..569595b74c7 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.11-bk7 -# Sat Mar 12 23:43:44 2005 +# Linux kernel version: 2.6.12-rc4 +# Fri May 13 06:39:11 2005 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -11,8 +11,6 @@ CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_X86_CMPXCHG=y CONFIG_EARLY_PRINTK=y -CONFIG_HPET_TIMER=y -CONFIG_HPET_EMULATE_RTC=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y @@ -22,6 +20,7 @@ CONFIG_GENERIC_IOMAP=y CONFIG_EXPERIMENTAL=y CONFIG_CLEAN_COMPILE=y CONFIG_LOCK_KERNEL=y +CONFIG_INIT_ENV_ARG_LIMIT=32 # # General setup @@ -33,7 +32,6 @@ CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set CONFIG_SYSCTL=y # CONFIG_AUDIT is not set -CONFIG_LOG_BUF_SHIFT=18 # CONFIG_HOTPLUG is not set CONFIG_KOBJECT_UEVENT=y CONFIG_IKCONFIG=y @@ -43,10 +41,11 @@ CONFIG_IKCONFIG_PROC=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +CONFIG_PRINTK=y +CONFIG_BUG=y CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SHMEM=y CONFIG_CC_ALIGN_FUNCTIONS=0 CONFIG_CC_ALIGN_LABELS=0 @@ -93,6 +92,9 @@ CONFIG_DISCONTIGMEM=y CONFIG_NUMA=y CONFIG_HAVE_DEC_LOCK=y CONFIG_NR_CPUS=8 +CONFIG_HPET_TIMER=y +CONFIG_X86_PM_TIMER=y +CONFIG_HPET_EMULATE_RTC=y CONFIG_GART_IOMMU=y CONFIG_SWIOTLB=y CONFIG_X86_MCE=y @@ -100,6 +102,7 @@ CONFIG_X86_MCE_INTEL=y CONFIG_SECCOMP=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_ISA_DMA_API=y # # Power management options @@ -129,7 +132,7 @@ CONFIG_ACPI_NUMA=y # CONFIG_ACPI_IBM is not set CONFIG_ACPI_TOSHIBA=y CONFIG_ACPI_BLACKLIST_YEAR=2001 -CONFIG_ACPI_DEBUG=y +# CONFIG_ACPI_DEBUG is not set CONFIG_ACPI_BUS=y CONFIG_ACPI_EC=y CONFIG_ACPI_POWER=y @@ -141,6 +144,7 @@ CONFIG_ACPI_SYSTEM=y # CPU Frequency scaling # CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_TABLE=y # CONFIG_CPU_FREQ_DEBUG is not set CONFIG_CPU_FREQ_STAT=y # CONFIG_CPU_FREQ_STAT_DETAILS is not set @@ -150,7 +154,6 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -CONFIG_CPU_FREQ_TABLE=y # # CPUFreq processor drivers @@ -164,6 +167,7 @@ CONFIG_X86_ACPI_CPUFREQ=y # shared options # CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y +# CONFIG_X86_SPEEDSTEP_LIB is not set # # Bus options (PCI etc.) @@ -172,9 +176,11 @@ CONFIG_PCI=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y CONFIG_UNORDERED_IO=y +# CONFIG_PCIEPORTBUS is not set CONFIG_PCI_MSI=y # CONFIG_PCI_LEGACY_PROC is not set # CONFIG_PCI_NAMES is not set +# CONFIG_PCI_DEBUG is not set # # PCCARD (PCMCIA/CardBus) support @@ -182,10 +188,6 @@ CONFIG_PCI_MSI=y # CONFIG_PCCARD is not set # -# PC-card bridges -# - -# # PCI Hotplug Support # # CONFIG_HOTPLUG_PCI is not set @@ -254,7 +256,7 @@ CONFIG_LBD=y # IO Schedulers # CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y +# CONFIG_IOSCHED_AS is not set CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y # CONFIG_ATA_OVER_ETH is not set @@ -308,7 +310,8 @@ CONFIG_BLK_DEV_AMD74XX=y CONFIG_BLK_DEV_PIIX=y # CONFIG_BLK_DEV_NS87415 is not set # CONFIG_BLK_DEV_PDC202XX_OLD is not set -# CONFIG_BLK_DEV_PDC202XX_NEW is not set +CONFIG_BLK_DEV_PDC202XX_NEW=y +# CONFIG_PDC202XX_FORCE is not set # CONFIG_BLK_DEV_SVWKS is not set # CONFIG_BLK_DEV_SIIMAGE is not set # CONFIG_BLK_DEV_SIS5513 is not set @@ -353,7 +356,7 @@ CONFIG_BLK_DEV_SD=y # # SCSI low-level drivers # -CONFIG_BLK_DEV_3W_XXXX_RAID=y +# CONFIG_BLK_DEV_3W_XXXX_RAID is not set # CONFIG_SCSI_3W_9XXX is not set # CONFIG_SCSI_ACARD is not set # CONFIG_SCSI_AACRAID is not set @@ -384,7 +387,6 @@ CONFIG_SCSI_SATA_VIA=y # CONFIG_SCSI_BUSLOGIC is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_EATA is not set -# CONFIG_SCSI_EATA_PIO is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set # CONFIG_SCSI_GDTH is not set # CONFIG_SCSI_IPS is not set @@ -392,7 +394,6 @@ CONFIG_SCSI_SATA_VIA=y # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_SYM53C8XX_2 is not set # CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_QLOGIC_ISP is not set # CONFIG_SCSI_QLOGIC_FC is not set # CONFIG_SCSI_QLOGIC_1280 is not set CONFIG_SCSI_QLA2XXX=y @@ -401,6 +402,7 @@ CONFIG_SCSI_QLA2XXX=y # CONFIG_SCSI_QLA2300 is not set # CONFIG_SCSI_QLA2322 is not set # CONFIG_SCSI_QLA6312 is not set +# CONFIG_SCSI_LPFC is not set # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set @@ -437,7 +439,6 @@ CONFIG_NET=y # CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set -# CONFIG_NETLINK_DEV is not set CONFIG_UNIX=y # CONFIG_NET_KEY is not set CONFIG_INET=y @@ -502,7 +503,7 @@ CONFIG_NETDEVICES=y # CONFIG_DUMMY is not set # CONFIG_BONDING is not set # CONFIG_EQUALIZER is not set -# CONFIG_TUN is not set +CONFIG_TUN=y # # ARCnet devices @@ -525,8 +526,7 @@ CONFIG_MII=y # CONFIG_HP100 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set -CONFIG_AMD8111_ETH=y -# CONFIG_AMD8111E_NAPI is not set +# CONFIG_AMD8111_ETH is not set # CONFIG_ADAPTEC_STARFIRE is not set # CONFIG_B44 is not set CONFIG_FORCEDETH=y @@ -536,7 +536,7 @@ CONFIG_FORCEDETH=y # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set # CONFIG_NE2K_PCI is not set -CONFIG_8139CP=m +CONFIG_8139CP=y CONFIG_8139TOO=y # CONFIG_8139TOO_PIO is not set # CONFIG_8139TOO_TUNE_TWISTER is not set @@ -671,6 +671,7 @@ CONFIG_SERIAL_8250_NR_UARTS=4 # CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y +# CONFIG_SERIAL_JSM is not set CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 @@ -696,6 +697,7 @@ CONFIG_RTC=y # CONFIG_AGP=y CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=y # CONFIG_DRM is not set # CONFIG_MWAVE is not set CONFIG_RAW_DRIVER=y @@ -703,7 +705,7 @@ CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set CONFIG_HPET_MMAP=y CONFIG_MAX_RAW_DEVS=256 -CONFIG_HANGCHECK_TIMER=y +# CONFIG_HANGCHECK_TIMER is not set # # TPM devices @@ -786,6 +788,8 @@ CONFIG_SOUND_ICH=y # # USB support # +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB=y # CONFIG_USB_DEBUG is not set @@ -797,8 +801,6 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set # CONFIG_USB_OTG is not set -CONFIG_USB_ARCH_HAS_HCD=y -CONFIG_USB_ARCH_HAS_OHCI=y # # USB Host Controller Drivers @@ -826,7 +828,6 @@ CONFIG_USB_PRINTER=y # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set -# CONFIG_USB_STORAGE_RW_DETECT is not set # CONFIG_USB_STORAGE_DATAFAB is not set # CONFIG_USB_STORAGE_FREECOM is not set # CONFIG_USB_STORAGE_ISD200 is not set @@ -965,7 +966,7 @@ CONFIG_AUTOFS_FS=y # CD-ROM/DVD Filesystems # CONFIG_ISO9660_FS=y -# CONFIG_JOLIET is not set +CONFIG_JOLIET=y # CONFIG_ZISOFS is not set # CONFIG_UDF_FS is not set @@ -1092,9 +1093,10 @@ CONFIG_OPROFILE=y # # Kernel hacking # +# CONFIG_PRINTK_TIME is not set CONFIG_DEBUG_KERNEL=y CONFIG_MAGIC_SYSRQ=y -# CONFIG_PRINTK_TIME is not set +CONFIG_LOG_BUF_SHIFT=18 # CONFIG_SCHEDSTATS is not set # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_SPINLOCK is not set diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 0a3318e08ab..5ca4a4598fd 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o obj-$(CONFIG_MODULES) += module.o diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 7e13545748e..f8e6cc4fecd 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -33,6 +33,7 @@ #include <asm/mpspec.h> #include <asm/pgalloc.h> #include <asm/mach_apic.h> +#include <asm/nmi.h> int apic_verbosity; @@ -925,7 +926,7 @@ __init int oem_force_hpet_timer(void) unsigned id; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - bitmap_empty(clustermap, NUM_APIC_CLUSTERS); + bitmap_zero(clustermap, NUM_APIC_CLUSTERS); for (i = 0; i < NR_CPUS; i++) { id = bios_cpu_apicid[i]; @@ -1056,7 +1057,7 @@ int __init APIC_init_uniprocessor (void) nr_ioapics = 0; #endif setup_boot_APIC_clock(); - + check_nmi_watchdog(); return 0; } diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 1086b5fcac2..28817490fdc 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -220,13 +220,18 @@ sysret_careful: jmp sysret_check /* Handle a signal */ - /* edx: work flags (arg3) */ sysret_signal: sti + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz 1f + + /* Really a signal */ + /* edx: work flags (arg3) */ leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common +1: movl $_TIF_NEED_RESCHED,%edi jmp sysret_check /* Do syscall tracing */ @@ -484,6 +489,8 @@ retint_careful: jmp retint_check retint_signal: + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz retint_swapgs sti SAVE_REST movq $-1,ORIG_RAX(%rsp) @@ -492,8 +499,8 @@ retint_signal: call do_notify_resume RESTORE_REST cli + movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) - movl $_TIF_WORK_MASK,%edi jmp retint_check #ifdef CONFIG_PREEMPT diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 60be58617eb..ac768432495 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1804,76 +1804,6 @@ device_initcall(ioapic_init_sysfs); #define IO_APIC_MAX_ID 0xFE -int __init io_apic_get_unique_id (int ioapic, int apic_id) -{ - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = phys_cpu_present_map; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= IO_APIC_MAX_ID) { - apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (physid_isset(apic_id, apic_id_map)) { - - for (i = 0; i < IO_APIC_MAX_ID; i++) { - if (!physid_isset(i, apic_id_map)) - break; - } - - if (i == IO_APIC_MAX_ID) - panic("Max apic_id exceeded!\n"); - - apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - physid_set(apic_id, apic_id_map); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) - panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); - } - - apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); - - return apic_id; -} - - int __init io_apic_get_version (int ioapic) { union IO_APIC_reg_01 reg_01; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 7ec031c6ca1..f86d9db94bf 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -107,6 +107,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info (struct mpc_config_processor *m) { int ver; + static int found_bsp=0; if (!(m->mpc_cpuflag & CPU_ENABLED)) return; @@ -126,11 +127,6 @@ static void __init MP_processor_info (struct mpc_config_processor *m) " Processor ignored.\n", NR_CPUS); return; } - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } num_processors++; @@ -150,7 +146,19 @@ static void __init MP_processor_info (struct mpc_config_processor *m) ver = 0x10; } apic_version[m->mpc_apicid] = ver; - bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + /* + * bios_cpu_apicid is required to have processors listed + * in same order as logical cpu numbers. Hence the first + * entry is BSP, and so on. + */ + bios_cpu_apicid[0] = m->mpc_apicid; + x86_cpu_to_apicid[0] = m->mpc_apicid; + found_bsp = 1; + } else { + bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid; + x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid; + } } static void __init MP_bus_info (struct mpc_config_bus *m) @@ -759,7 +767,7 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + mp_ioapics[idx].mpc_apicid = id; mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); /* diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 61de0b34a01..31c0f2e6ac9 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -33,6 +33,7 @@ #include <asm/msr.h> #include <asm/proto.h> #include <asm/kdebug.h> +#include <asm/local.h> /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: @@ -59,7 +60,8 @@ int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ +static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ +static unsigned int nmi_p4_cccr_val; /* Note that these events don't tick when the CPU idles. This means the frequency varies with CPU load. */ @@ -71,61 +73,87 @@ unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED +#define MSR_P4_MISC_ENABLE 0x1A0 +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) +#define MSR_P4_PERFCTR0 0x300 +#define MSR_P4_CCCR0 0x360 +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. [IA32-Vol3, Section 14.9.9] */ +#define MSR_P4_IQ_COUNTER0 0x30C +#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) +#define P4_NMI_IQ_CCCR0 \ + (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ + P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) + +static __init inline int nmi_known_cpu(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return boot_cpu_data.x86 == 15; + case X86_VENDOR_INTEL: + return boot_cpu_data.x86 == 15; + } + return 0; +} /* Run after command line and cpu_init init, but before all other checks */ void __init nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; - - /* For some reason the IO APIC watchdog doesn't work on the AMD - 8111 chipset. For now switch to local APIC mode using - perfctr0 there. On Intel CPUs we don't have code to handle - the perfctr and the IO-APIC seems to work, so use that. */ - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - nmi_watchdog = NMI_LOCAL_APIC; - printk(KERN_INFO - "Using local APIC NMI watchdog using perfctr0\n"); - } else { - printk(KERN_INFO "Using IO APIC NMI watchdog\n"); + if (nmi_known_cpu()) + nmi_watchdog = NMI_LOCAL_APIC; + else nmi_watchdog = NMI_IO_APIC; - } } -/* Why is there no CPUID flag for this? */ -static __init int cpu_has_lapic(void) +#ifdef CONFIG_SMP +/* The performance counters used by NMI_LOCAL_APIC don't trigger when + * the CPU is idle. To make sure the NMI watchdog really ticks on all + * CPUs during the test make them busy. + */ +static __init void nmi_cpu_busy(void *data) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - case X86_VENDOR_AMD: - return boot_cpu_data.x86 >= 6; - /* .... add more cpus here or find a different way to figure this out. */ - default: - return 0; - } + volatile int *endflag = data; + local_irq_enable(); + /* Intentionally don't use cpu_relax here. This is + to make sure that the performance counter really ticks, + even if there is a simulator or similar that catches the + pause instruction. On a real HT machine this is fine because + all other CPUs are busy with "useless" delay loops and don't + care if they get somewhat less cycles. */ + while (*endflag == 0) + barrier(); } +#endif -static int __init check_nmi_watchdog (void) +int __init check_nmi_watchdog (void) { - int counts[NR_CPUS]; + volatile int endflag = 0; + int *counts; int cpu; - if (nmi_watchdog == NMI_NONE) - return 0; + counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + if (!counts) + return -1; - if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { - nmi_watchdog = NMI_NONE; - return -1; - } + printk(KERN_INFO "testing NMI watchdog ... "); - printk(KERN_INFO "Testing NMI watchdog ... "); + if (nmi_watchdog == NMI_LOCAL_APIC) + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); for (cpu = 0; cpu < NR_CPUS; cpu++) counts[cpu] = cpu_pda[cpu].__nmi_count; @@ -133,15 +161,22 @@ static int __init check_nmi_watchdog (void) mdelay((10*1000)/nmi_hz); // wait 10 ticks for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (!cpu_online(cpu)) + continue; if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { - printk("CPU#%d: NMI appears to be stuck (%d)!\n", + endflag = 1; + printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, + counts[cpu], cpu_pda[cpu].__nmi_count); nmi_active = 0; lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; + nmi_perfctr_msr = 0; + kfree(counts); return -1; } } + endflag = 1; printk("OK.\n"); /* now that we know it works we can reduce NMI frequency to @@ -149,10 +184,9 @@ static int __init check_nmi_watchdog (void) if (nmi_watchdog == NMI_LOCAL_APIC) nmi_hz = 1; + kfree(counts); return 0; } -/* Have this called later during boot so counters are updating */ -late_initcall(check_nmi_watchdog); int __init setup_nmi_watchdog(char *str) { @@ -170,7 +204,7 @@ int __init setup_nmi_watchdog(char *str) if (nmi >= NMI_INVALID) return 0; - nmi_watchdog = nmi; + nmi_watchdog = nmi; return 1; } @@ -185,7 +219,10 @@ static void disable_lapic_nmi_watchdog(void) wrmsr(MSR_K7_EVNTSEL0, 0, 0); break; case X86_VENDOR_INTEL: - wrmsr(MSR_IA32_EVNTSEL0, 0, 0); + if (boot_cpu_data.x86 == 15) { + wrmsr(MSR_P4_IQ_CCCR0, 0, 0); + wrmsr(MSR_P4_CRU_ESCR0, 0, 0); + } break; } nmi_active = -1; @@ -253,7 +290,7 @@ void enable_timer_nmi_watchdog(void) static int nmi_pm_active; /* nmi_active before suspend */ -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) +static int lapic_nmi_suspend(struct sys_device *dev, u32 state) { nmi_pm_active = nmi_active; disable_lapic_nmi_watchdog(); @@ -300,22 +337,27 @@ late_initcall(init_lapic_nmi_sysfs); * Original code written by Keith Owens. */ +static void clear_msr_range(unsigned int base, unsigned int n) +{ + unsigned int i; + + for(i = 0; i < n; ++i) + wrmsr(base+i, 0, 0); +} + static void setup_k7_watchdog(void) { int i; unsigned int evntsel; - /* No check, so can start with slow frequency */ - nmi_hz = 1; - - /* XXX should check these in EFER */ - nmi_perfctr_msr = MSR_K7_PERFCTR0; for(i = 0; i < 4; ++i) { /* Simulator may not support it */ - if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) + if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { + nmi_perfctr_msr = 0; return; + } wrmsrl(MSR_K7_PERFCTR0+i, 0UL); } @@ -325,12 +367,54 @@ static void setup_k7_watchdog(void) | K7_NMI_EVENT; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); + wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); } + +static int setup_p4_watchdog(void) +{ + unsigned int misc_enable, dummy; + + rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); + if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) + return 0; + + nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; + nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; +#ifdef CONFIG_SMP + if (smp_num_siblings == 2) + nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; +#endif + + if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) + clear_msr_range(0x3F1, 2); + /* MSR 0x3F0 seems to have a default value of 0xFC00, but current + docs doesn't fully define it, so leave it alone for now. */ + if (boot_cpu_data.x86_model >= 0x3) { + /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ + clear_msr_range(0x3A0, 26); + clear_msr_range(0x3BC, 3); + } else { + clear_msr_range(0x3A0, 31); + } + clear_msr_range(0x3C0, 6); + clear_msr_range(0x3C8, 6); + clear_msr_range(0x3E0, 2); + clear_msr_range(MSR_P4_CCCR0, 18); + clear_msr_range(MSR_P4_PERFCTR0, 18); + + wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); + wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); + Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000)); + wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1); + apic_write(APIC_LVTPC, APIC_DM_NMI); + wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + return 1; +} + void setup_apic_nmi_watchdog(void) { switch (boot_cpu_data.x86_vendor) { @@ -341,6 +425,13 @@ void setup_apic_nmi_watchdog(void) return; setup_k7_watchdog(); break; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 != 15) + return; + if (!setup_p4_watchdog()) + return; + break; + default: return; } @@ -355,56 +446,67 @@ void setup_apic_nmi_watchdog(void) * * as these watchdog NMI IRQs are generated on every CPU, we only * have to check the current processor. - * - * since NMIs don't listen to _any_ locks, we have to be extremely - * careful not to rely on unsafe variables. The printk might lock - * up though, so we have to break up any console locks first ... - * [when there will be more tty-related locks, break them up - * here too!] */ -static unsigned int - last_irq_sums [NR_CPUS], - alert_counter [NR_CPUS]; +static DEFINE_PER_CPU(unsigned, last_irq_sum); +static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(int, nmi_touch); void touch_nmi_watchdog (void) { int i; /* - * Just reset the alert counters, (other CPUs might be - * spinning on locks we hold): + * Tell other CPUs to reset their alert counters. We cannot + * do it ourselves because the alert count increase is not + * atomic. */ for (i = 0; i < NR_CPUS; i++) - alert_counter[i] = 0; + per_cpu(nmi_touch, i) = 1; } void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) { - int sum, cpu; + int sum; + int touched = 0; - cpu = safe_smp_processor_id(); sum = read_pda(apic_timer_irqs); - if (last_irq_sums[cpu] == sum) { + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } + if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) { + local_inc(&__get_cpu_var(alert_counter)); + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { - alert_counter[cpu] = 0; + local_set(&__get_cpu_var(alert_counter), 0); return; } die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); } } else { - last_irq_sums[cpu] = sum; - alert_counter[cpu] = 0; + __get_cpu_var(last_irq_sum) = sum; + local_set(&__get_cpu_var(alert_counter), 0); } - if (nmi_perfctr_msr) + if (nmi_perfctr_msr) { + if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + apic_write(APIC_LVTPC, APIC_DM_NMI); + } wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); + } } static int dummy_nmi_callback(struct pt_regs * regs, int cpu) diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c new file mode 100644 index 00000000000..feb5f108dd2 --- /dev/null +++ b/arch/x86_64/kernel/pmtimer.c @@ -0,0 +1,101 @@ +/* Ported over from i386 by AK, original copyright was: + * + * (C) Dominik Brodowski <linux@brodo.de> 2003 + * + * Driver to use the Power Management Timer (PMTMR) available in some + * southbridges as primary timing source for the Linux kernel. + * + * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, + * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. + * + * This file is licensed under the GPL v2. + * + * Dropped all the hardware bug workarounds for now. Hopefully they + * are not needed on 64bit chipsets. + */ + +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/cpumask.h> +#include <asm/io.h> +#include <asm/proto.h> +#include <asm/msr.h> +#include <asm/vsyscall.h> + +/* The I/O port the PMTMR resides at. + * The location is detected during setup_arch(), + * in arch/i386/kernel/acpi/boot.c */ +u32 pmtmr_ioport; + +/* value of the Power timer at last timer interrupt */ +static u32 offset_delay; +static u32 last_pmtmr_tick; + +#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ + +static inline u32 cyc2us(u32 cycles) +{ + /* The Power Management Timer ticks at 3.579545 ticks per microsecond. + * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] + * + * Even with HZ = 100, delta is at maximum 35796 ticks, so it can + * easily be multiplied with 286 (=0x11E) without having to fear + * u32 overflows. + */ + cycles *= 286; + return (cycles >> 10); +} + +int pmtimer_mark_offset(void) +{ + static int first_run = 1; + unsigned long tsc; + u32 lost; + + u32 tick = inl(pmtmr_ioport); + u32 delta; + + delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); + + last_pmtmr_tick = tick; + monotonic_base += delta * NSEC_PER_USEC; + + delta += offset_delay; + + lost = delta / (USEC_PER_SEC / HZ); + offset_delay = delta % (USEC_PER_SEC / HZ); + + rdtscll(tsc); + vxtime.last_tsc = tsc - offset_delay * cpu_khz; + + /* don't calculate delay for first run, + or if we've got less then a tick */ + if (first_run || (lost < 1)) { + first_run = 0; + offset_delay = 0; + } + + return lost - 1; +} + +unsigned int do_gettimeoffset_pm(void) +{ + u32 now, offset, delta = 0; + + offset = last_pmtmr_tick; + now = inl(pmtmr_ioport); + delta = (now - offset) & ACPI_PM_MASK; + + return offset_delay + cyc2us(delta); +} + + +static int __init nopmtimer_setup(char *s) +{ + pmtmr_ioport = 0; + return 0; +} + +__setup("nopmtimer", nopmtimer_setup); diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index e26e86bb56f..60dc9b98951 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -257,13 +257,13 @@ static int putreg(struct task_struct *child, value &= 0xffff; return 0; case offsetof(struct user_regs_struct,fs_base): - if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) - return -EIO; + if (value >= TASK_SIZE) + return -EIO; child->thread.fs = value; return 0; case offsetof(struct user_regs_struct,gs_base): - if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) - return -EIO; + if (value >= TASK_SIZE) + return -EIO; child->thread.gs = value; return 0; case offsetof(struct user_regs_struct, eflags): @@ -277,6 +277,11 @@ static int putreg(struct task_struct *child, return -EIO; value &= 0xffff; break; + case offsetof(struct user_regs_struct, rip): + /* Check if the new RIP address is canonical */ + if (value >= TASK_SIZE) + return -EIO; + break; } put_stack_long(child, regno - sizeof(struct pt_regs), value); return 0; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 2129cf9ba6b..b9fd0252c27 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -727,11 +727,12 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c) static void __init amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - int cpu = c->x86_apicid; + int cpu = smp_processor_id(); int node = 0; if (c->x86_num_cores == 1) return; - cpu_core_id[cpu] = cpu >> hweight32(c->x86_num_cores - 1); + /* Fix up the APIC ID following the AMD specification. */ + cpu_core_id[cpu] >>= hweight32(c->x86_num_cores - 1); #ifdef CONFIG_NUMA /* When an ACPI SRAT table is available use the mappings from SRAT @@ -745,6 +746,9 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) node = cpu_to_node[cpu]; } #endif + /* For now: - better than BAD_APIC_ID at least*/ + phys_proc_id[cpu] = cpu_core_id[cpu]; + printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", cpu, c->x86_num_cores, node, cpu_core_id[cpu]); #endif @@ -925,7 +929,6 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) c->x86_clflush_size = 64; c->x86_cache_alignment = c->x86_clflush_size; c->x86_num_cores = 1; - c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data; c->extended_cpuid_level = 0; memset(&c->x86_capability, 0, sizeof c->x86_capability); @@ -954,11 +957,15 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) } if (c->x86_capability[0] & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - c->x86_apicid = misc >> 24; } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; } + +#ifdef CONFIG_SMP + phys_proc_id[smp_processor_id()] = + cpu_core_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; +#endif } /* @@ -1088,7 +1095,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Other (Linux-defined) */ - "cxmmx", NULL, "cyrix_arr", "centaur_mcr", "k8c+", + "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, "constant_tsc", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 73f7e8b9543..5abdee1e16a 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -56,6 +56,7 @@ #include <asm/kdebug.h> #include <asm/tlbflush.h> #include <asm/proto.h> +#include <asm/nmi.h> /* Change for real CPU hotplug. Note other files need to be fixed first too. */ @@ -125,96 +126,210 @@ static void __cpuinit smp_store_cpu_info(int id) *c = boot_cpu_data; identify_cpu(c); + print_cpu_info(c); } /* - * Synchronize TSCs of CPUs + * New Funky TSC sync algorithm borrowed from IA64. + * Main advantage is that it doesn't reset the TSCs fully and + * in general looks more robust and it works better than my earlier + * attempts. I believe it was written by David Mosberger. Some minor + * adjustments for x86-64 by me -AK * - * This new algorithm is less accurate than the old "zero TSCs" - * one, but we cannot zero TSCs anymore in the new hotplug CPU - * model. + * Original comment reproduced below. + * + * Synchronize TSC of the current (slave) CPU with the TSC of the + * MASTER CPU (normally the time-keeper CPU). We use a closed loop to + * eliminate the possibility of unaccounted-for errors (such as + * getting a machine check in the middle of a calibration step). The + * basic idea is for the slave to ask the master what itc value it has + * and to read its own itc before and after the master responds. Each + * iteration gives us three timestamps: + * + * slave master + * + * t0 ---\ + * ---\ + * ---> + * tm + * /--- + * /--- + * t1 <--- + * + * + * The goal is to adjust the slave's TSC such that tm falls exactly + * half-way between t0 and t1. If we achieve this, the clocks are + * synchronized provided the interconnect between the slave and the + * master is symmetric. Even if the interconnect were asymmetric, we + * would still know that the synchronization error is smaller than the + * roundtrip latency (t0 - t1). + * + * When the interconnect is quiet and symmetric, this lets us + * synchronize the TSC to within one or two cycles. However, we can + * only *guarantee* that the synchronization is accurate to within a + * round-trip time, which is typically in the range of several hundred + * cycles (e.g., ~500 cycles). In practice, this means that the TSCs + * are usually almost perfectly synchronized, but we shouldn't assume + * that the accuracy is much better than half a micro second or so. + * + * [there are other errors like the latency of RDTSC and of the + * WRMSR. These can also account to hundreds of cycles. So it's + * probably worse. It claims 153 cycles error on a dual Opteron, + * but I suspect the numbers are actually somewhat worse -AK] */ -static atomic_t __cpuinitdata tsc_flag; +#define MASTER 0 +#define SLAVE (SMP_CACHE_BYTES/8) + +/* Intentionally don't use cpu_relax() while TSC synchronization + because we don't want to go into funky power save modi or cause + hypervisors to schedule us away. Going to sleep would likely affect + latency and low latency is the primary objective here. -AK */ +#define no_cpu_relax() barrier() + static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); -static unsigned long long __cpuinitdata bp_tsc, ap_tsc; +static volatile __cpuinitdata unsigned long go[SLAVE + 1]; +static int notscsync __cpuinitdata; + +#undef DEBUG_TSC_SYNC -#define NR_LOOPS 5 +#define NUM_ROUNDS 64 /* magic value */ +#define NUM_ITERS 5 /* likewise */ -static void __cpuinit sync_tsc_bp_init(int init) +/* Callback on boot CPU */ +static __cpuinit void sync_master(void *arg) { - if (init) - _raw_spin_lock(&tsc_sync_lock); - else - _raw_spin_unlock(&tsc_sync_lock); - atomic_set(&tsc_flag, 0); + unsigned long flags, i; + + if (smp_processor_id() != boot_cpu_id) + return; + + go[MASTER] = 0; + + local_irq_save(flags); + { + for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { + while (!go[MASTER]) + no_cpu_relax(); + go[MASTER] = 0; + rdtscll(go[SLAVE]); + } + } + local_irq_restore(flags); } /* - * Synchronize TSC on AP with BP. + * Return the number of cycles by which our tsc differs from the tsc + * on the master (time-keeper) CPU. A positive number indicates our + * tsc is ahead of the master, negative that it is behind. */ -static void __cpuinit __sync_tsc_ap(void) +static inline long +get_delta(long *rt, long *master) { - if (!cpu_has_tsc) - return; - Dprintk("AP %d syncing TSC\n", smp_processor_id()); + unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; + unsigned long tcenter, t0, t1, tm; + int i; - while (atomic_read(&tsc_flag) != 0) - cpu_relax(); - atomic_inc(&tsc_flag); - mb(); - _raw_spin_lock(&tsc_sync_lock); - wrmsrl(MSR_IA32_TSC, bp_tsc); - _raw_spin_unlock(&tsc_sync_lock); - rdtscll(ap_tsc); - mb(); - atomic_inc(&tsc_flag); - mb(); + for (i = 0; i < NUM_ITERS; ++i) { + rdtscll(t0); + go[MASTER] = 1; + while (!(tm = go[SLAVE])) + no_cpu_relax(); + go[SLAVE] = 0; + rdtscll(t1); + + if (t1 - t0 < best_t1 - best_t0) + best_t0 = t0, best_t1 = t1, best_tm = tm; + } + + *rt = best_t1 - best_t0; + *master = best_tm - best_t0; + + /* average best_t0 and best_t1 without overflow: */ + tcenter = (best_t0/2 + best_t1/2); + if (best_t0 % 2 + best_t1 % 2 == 2) + ++tcenter; + return tcenter - best_tm; } -static void __cpuinit sync_tsc_ap(void) +static __cpuinit void sync_tsc(void) { - int i; - for (i = 0; i < NR_LOOPS; i++) - __sync_tsc_ap(); + int i, done = 0; + long delta, adj, adjust_latency = 0; + unsigned long flags, rt, master_time_stamp, bound; +#if DEBUG_TSC_SYNC + static struct syncdebug { + long rt; /* roundtrip time */ + long master; /* master's timestamp */ + long diff; /* difference between midpoint and master's timestamp */ + long lat; /* estimate of tsc adjustment latency */ + } t[NUM_ROUNDS] __cpuinitdata; +#endif + + go[MASTER] = 1; + + smp_call_function(sync_master, NULL, 1, 0); + + while (go[MASTER]) /* wait for master to be ready */ + no_cpu_relax(); + + spin_lock_irqsave(&tsc_sync_lock, flags); + { + for (i = 0; i < NUM_ROUNDS; ++i) { + delta = get_delta(&rt, &master_time_stamp); + if (delta == 0) { + done = 1; /* let's lock on to this... */ + bound = rt; + } + + if (!done) { + unsigned long t; + if (i > 0) { + adjust_latency += -delta; + adj = -delta + adjust_latency/4; + } else + adj = -delta; + + rdtscll(t); + wrmsrl(MSR_IA32_TSC, t + adj); + } +#if DEBUG_TSC_SYNC + t[i].rt = rt; + t[i].master = master_time_stamp; + t[i].diff = delta; + t[i].lat = adjust_latency/4; +#endif + } + } + spin_unlock_irqrestore(&tsc_sync_lock, flags); + +#if DEBUG_TSC_SYNC + for (i = 0; i < NUM_ROUNDS; ++i) + printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", + t[i].rt, t[i].master, t[i].diff, t[i].lat); +#endif + + printk(KERN_INFO + "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " + "maxerr %lu cycles)\n", + smp_processor_id(), boot_cpu_id, delta, rt); } -/* - * Synchronize TSC from BP to AP. - */ -static void __cpuinit __sync_tsc_bp(int cpu) +static void __cpuinit tsc_sync_wait(void) { - if (!cpu_has_tsc) + if (notscsync || !cpu_has_tsc) return; - - /* Wait for AP */ - while (atomic_read(&tsc_flag) == 0) - cpu_relax(); - /* Save BPs TSC */ - sync_core(); - rdtscll(bp_tsc); - /* Don't do the sync core here to avoid too much latency. */ - mb(); - /* Start the AP */ - _raw_spin_unlock(&tsc_sync_lock); - /* Wait for AP again */ - while (atomic_read(&tsc_flag) < 2) - cpu_relax(); - rdtscl(bp_tsc); - barrier(); + printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), + boot_cpu_id); + sync_tsc(); } -static void __cpuinit sync_tsc_bp(int cpu) +static __init int notscsync_setup(char *s) { - int i; - for (i = 0; i < NR_LOOPS - 1; i++) { - __sync_tsc_bp(cpu); - sync_tsc_bp_init(1); - } - __sync_tsc_bp(cpu); - printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n", - cpu, ap_tsc - bp_tsc); + notscsync = 1; + return 0; } +__setup("notscsync", notscsync_setup); static atomic_t init_deasserted __cpuinitdata; @@ -315,11 +430,6 @@ void __cpuinit start_secondary(void) cpu_init(); smp_callin(); - /* - * Synchronize the TSC with the BP - */ - sync_tsc_ap(); - /* otherwise gcc will move up the smp_processor_id before the cpu_init */ barrier(); @@ -334,7 +444,6 @@ void __cpuinit start_secondary(void) enable_8259A_irq(0); } - enable_APIC_timer(); /* @@ -343,6 +452,11 @@ void __cpuinit start_secondary(void) cpu_set(smp_processor_id(), cpu_online_map); mb(); + /* Wait for TSC sync to not schedule things before. + We still process interrupts, which could see an inconsistent + time in that window unfortunately. */ + tsc_sync_wait(); + cpu_idle(); } @@ -531,7 +645,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) printk("failed fork for CPU %d\n", cpu); return PTR_ERR(idle); } - x86_cpu_to_apicid[cpu] = apicid; cpu_pda[cpu].pcurrent = idle; @@ -600,8 +713,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) if (cpu_isset(cpu, cpu_callin_map)) { /* number CPUs logically, starting from 1 (BSP is 0) */ - Dprintk("OK.\n"); - print_cpu_info(&cpu_data[cpu]); Dprintk("CPU has booted.\n"); } else { boot_error = 1; @@ -842,7 +953,6 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus) GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); /* Or can we switch back to PIC here? */ } - x86_cpu_to_apicid[0] = boot_cpu_id; /* * Now start the IO-APICs @@ -889,18 +999,14 @@ int __cpuinit __cpu_up(unsigned int cpu) printk("__cpu_up: bad cpu %d\n", cpu); return -EINVAL; } - sync_tsc_bp_init(1); /* Boot it! */ err = do_boot_cpu(cpu, apicid); if (err < 0) { - sync_tsc_bp_init(0); Dprintk("do_boot_cpu failed %d\n", err); return err; } - sync_tsc_bp(cpu); - /* Unleash the CPU! */ Dprintk("waiting for cpu %d\n", cpu); @@ -923,4 +1029,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus) detect_siblings(); time_init_gtod(); + + check_nmi_watchdog(); } diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 0e3b60e1ac5..735b6767c8e 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -26,6 +26,8 @@ #include <linux/sysdev.h> #include <linux/bcd.h> #include <linux/kallsyms.h> +#include <linux/acpi.h> +#include <acpi/achware.h> /* for PM timer frequency */ #include <asm/8253pit.h> #include <asm/pgtable.h> #include <asm/vsyscall.h> @@ -396,6 +398,10 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; vxtime.last = offset; +#ifdef CONFIG_X86_PM_TIMER + } else if (vxtime.mode == VXTIME_PMTMR) { + lost = pmtimer_mark_offset(); +#endif } else { offset = (((tsc - vxtime.last_tsc) * vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); @@ -898,6 +904,13 @@ void __init time_init(void) hpet_period; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; +#ifdef CONFIG_X86_PM_TIMER + } else if (pmtmr_ioport) { + vxtime_hz = PM_TIMER_FREQUENCY; + timename = "PM"; + pit_init(); + cpu_khz = pit_calibrate_tsc(); +#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); @@ -923,35 +936,50 @@ void __init time_init(void) } /* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +static __init int unsynchronized_tsc(void) +{ +#ifdef CONFIG_SMP + if (oem_force_hpet_timer()) + return 1; + /* Intel systems are normally all synchronized. Exceptions + are handled in the OEM check above. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return 0; + /* All in a single socket - should be synchronized */ + if (cpus_weight(cpu_core_map[0]) == num_online_cpus()) + return 0; +#endif + /* Assume multi socket systems are not synchronized */ + return num_online_cpus() > 1; +} + +/* * Decide after all CPUs are booted what mode gettimeofday should use. */ void __init time_init_gtod(void) { char *timetype; - /* - * AMD systems with more than one CPU don't have fully synchronized - * TSCs. Always use HPET gettimeofday for these, although it is slower. - * Intel SMP systems usually have synchronized TSCs, so use always - * the TSC. - * - * Exceptions: - * IBM Summit2 checked by oem_force_hpet_timer(). - * AMD dual core may also not need HPET. Check me. - * - * Can be turned off with "notsc". - */ - if (num_online_cpus() > 1 && - boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - notsc = 1; - /* Some systems will want to disable TSC and use HPET. */ - if (oem_force_hpet_timer()) + if (unsynchronized_tsc()) notsc = 1; if (vxtime.hpet_address && notsc) { timetype = "HPET"; vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; vxtime.mode = VXTIME_HPET; do_gettimeoffset = do_gettimeoffset_hpet; +#ifdef CONFIG_X86_PM_TIMER + /* Using PM for gettimeofday is quite slow, but we have no other + choice because the TSC is too unreliable on some systems. */ + } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { + timetype = "PM"; + do_gettimeoffset = do_gettimeoffset_pm; + vxtime.mode = VXTIME_PMTMR; + sysctl_vsyscall = 0; + printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); +#endif } else { timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index b4b8dc59663..2e573442594 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -65,7 +65,7 @@ static force_inline void do_vgettimeofday(struct timeval * tv) usec = (__xtime.tv_nsec / 1000) + (__jiffies - __wall_jiffies) * (1000000 / HZ); - if (__vxtime.mode == VXTIME_TSC) { + if (__vxtime.mode != VXTIME_HPET) { sync_core(); rdtscll(t); if (t < __vxtime.last_tsc) @@ -217,8 +217,9 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); map_vsyscall(); - sysctl_vsyscall = 1; +#ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); +#endif return 0; } diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index e0330921676..5d6b2114cc9 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -234,6 +234,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, /* * Handle a fault on the vmalloc or module mapping area + * + * This assumes no large pages in there. */ static int vmalloc_fault(unsigned long address) { @@ -272,7 +274,10 @@ static int vmalloc_fault(unsigned long address) if (!pte_present(*pte_ref)) return -1; pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref)) + /* Don't use pte_page here, because the mappings can point + outside mem_map, and the NUMA hash lookup cannot handle + that. */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) BUG(); __flush_tlb_all(); return 0; @@ -346,7 +351,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) * protection error (error_code & 1) == 0. */ if (unlikely(address >= TASK_SIZE)) { - if (!(error_code & 5)) { + if (!(error_code & 5) && + ((address >= VMALLOC_START && address < VMALLOC_END) || + (address >= MODULES_VADDR && address < MODULES_END))) { if (vmalloc_fault(address) < 0) goto bad_area_nosemaphore; return; diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index 74ec8554b19..c6fb0cb6999 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -272,7 +272,7 @@ void iounmap(volatile void __iomem *addr) if ((p->flags >> 20) && p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) { /* p->size includes the guard page, but cpa doesn't like that */ - change_page_attr(virt_to_page(__va(p->phys_addr)), + change_page_attr_addr((unsigned long)__va(p->phys_addr), p->size >> PAGE_SHIFT, PAGE_KERNEL); global_flush_tlb(); |