aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig23
-rw-r--r--arch/x86/Makefile29
-rw-r--r--arch/x86/boot/Makefile53
-rw-r--r--arch/x86/boot/header.S29
-rw-r--r--arch/x86/boot/pm.c44
-rw-r--r--arch/x86/boot/tools/build.c9
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/boot.h4
-rw-r--r--arch/x86/include/asm/cacheflush.h53
-rwxr-xr-xarch/x86/include/asm/cpu_debug.h199
-rw-r--r--arch/x86/include/asm/desc.h3
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/fixmap.h10
-rw-r--r--arch/x86/include/asm/hardirq.h1
-rw-r--r--arch/x86/include/asm/highmem.h1
-rw-r--r--arch/x86/include/asm/hw_irq.h1
-rw-r--r--arch/x86/include/asm/i387.h8
-rw-r--r--arch/x86/include/asm/init.h18
-rw-r--r--arch/x86/include/asm/io.h3
-rw-r--r--arch/x86/include/asm/irq.h1
-rw-r--r--arch/x86/include/asm/irq_vectors.h5
-rw-r--r--arch/x86/include/asm/kexec.h13
-rw-r--r--arch/x86/include/asm/linkage.h19
-rw-r--r--arch/x86/include/asm/mce.h35
-rw-r--r--arch/x86/include/asm/mmzone_32.h43
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/pat.h5
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h4
-rw-r--r--arch/x86/include/asm/xen/hypercall.h2
-rw-r--r--arch/x86/include/asm/xen/page.h1
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/alternative.c17
-rw-r--r--arch/x86/kernel/apic/apic.c15
-rw-r--r--arch/x86/kernel/check.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c54
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c2
-rw-r--r--arch/x86/kernel/cpu/common.c38
-rw-r--r--arch/x86/kernel/cpu/cpu.h11
-rwxr-xr-xarch/x86/kernel/cpu/cpu_debug.c839
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c1
-rw-r--r--arch/x86/kernel/cpu/cyrix.c16
-rw-r--r--arch/x86/kernel/cpu/intel.c32
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c530
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c207
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c29
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpu/umc.c2
-rw-r--r--arch/x86/kernel/ds.c3
-rw-r--r--arch/x86/kernel/e820.c53
-rw-r--r--arch/x86/kernel/early_printk.c20
-rw-r--r--arch/x86/kernel/efi.c7
-rw-r--r--arch/x86/kernel/efi_64.c21
-rw-r--r--arch/x86/kernel/entry_32.S18
-rw-r--r--arch/x86/kernel/entry_64.S6
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/irq.c67
-rw-r--r--arch/x86/kernel/irq_32.c29
-rw-r--r--arch/x86/kernel/irqinit_32.c3
-rw-r--r--arch/x86/kernel/irqinit_64.c3
-rw-r--r--arch/x86/kernel/machine_kexec_32.c17
-rw-r--r--arch/x86/kernel/machine_kexec_64.c99
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/mpparse.c31
-rw-r--r--arch/x86/kernel/ptrace.c3
-rw-r--r--arch/x86/kernel/quirks.c3
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S24
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S189
-rw-r--r--arch/x86/kernel/setup.c9
-rw-r--r--arch/x86/kernel/setup_percpu.c355
-rw-r--r--arch/x86/kernel/smpboot.c78
-rw-r--r--arch/x86/kernel/tlb_uv.c2
-rw-r--r--arch/x86/kernel/uv_time.c393
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S7
-rw-r--r--arch/x86/lguest/boot.c21
-rw-r--r--arch/x86/lib/memcpy_64.S143
-rw-r--r--arch/x86/math-emu/fpu_aux.c31
-rw-r--r--arch/x86/mm/highmem_32.c24
-rw-r--r--arch/x86/mm/init.c344
-rw-r--r--arch/x86/mm/init_32.c273
-rw-r--r--arch/x86/mm/init_64.c352
-rw-r--r--arch/x86/mm/iomap_32.c27
-rw-r--r--arch/x86/mm/ioremap.c71
-rw-r--r--arch/x86/mm/kmmio.c162
-rw-r--r--arch/x86/mm/memtest.c3
-rw-r--r--arch/x86/mm/numa_32.c5
-rw-r--r--arch/x86/mm/pageattr.c11
-rw-r--r--arch/x86/mm/pat.c5
-rw-r--r--arch/x86/mm/testmmiotrace.c70
-rw-r--r--arch/x86/pci/common.c4
-rw-r--r--arch/x86/pci/fixup.c4
-rw-r--r--arch/x86/xen/enlighten.c10
-rw-r--r--arch/x86/xen/mmu.c7
-rw-r--r--arch/x86/xen/smp.c6
109 files changed, 4102 insertions, 1419 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 469f3450bf8..34bc3a89228 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -138,6 +138,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
config HAVE_SETUP_PER_CPU_AREA
def_bool y
+config HAVE_DYNAMIC_PER_CPU_AREA
+ def_bool y
+
config HAVE_CPUMASK_OF_CPU_MAP
def_bool X86_64_SMP
@@ -166,6 +169,9 @@ config GENERIC_HARDIRQS
bool
default y
+config GENERIC_HARDIRQS_NO__DO_IRQ
+ def_bool y
+
config GENERIC_IRQ_PROBE
bool
default y
@@ -780,6 +786,11 @@ config X86_MCE_AMD
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
+config X86_MCE_THRESHOLD
+ depends on X86_MCE_AMD || X86_MCE_INTEL
+ bool
+ default y
+
config X86_MCE_NONFATAL
tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_32 && X86_MCE
@@ -923,6 +934,12 @@ config X86_CPUID
with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
/dev/cpu/31/cpuid.
+config X86_CPU_DEBUG
+ tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
+ ---help---
+ If you select this option, this will provide various x86 CPUs
+ information through debugfs.
+
choice
prompt "High Memory Support"
default HIGHMEM4G if !X86_NUMAQ
@@ -1115,7 +1132,7 @@ config NUMA_EMU
config NODES_SHIFT
int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
- range 1 9 if X86_64
+ range 1 9
default "9" if MAXSMP
default "6" if X86_64
default "4" if X86_NUMAQ
@@ -1125,7 +1142,7 @@ config NODES_SHIFT
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accomodate various tables.
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
def_bool y
depends on X86_32 && NUMA
@@ -1423,7 +1440,7 @@ config CRASH_DUMP
config KEXEC_JUMP
bool "kexec jump (EXPERIMENTAL)"
depends on EXPERIMENTAL
- depends on KEXEC && HIBERNATION && X86_32
+ depends on KEXEC && HIBERNATION
---help---
Jump between original kernel and kexeced kernel and invoke
code in physical address mode via KEXEC
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1836191839e..f05d8c91d9e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -153,34 +153,23 @@ endif
boot := arch/x86/boot
-PHONY += zImage bzImage compressed zlilo bzlilo \
- zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
+BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage install
+
+PHONY += bzImage $(BOOT_TARGETS)
# Default kernel to build
all: bzImage
# KBUILD_IMAGE specify target image being built
- KBUILD_IMAGE := $(boot)/bzImage
-zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage
+KBUILD_IMAGE := $(boot)/bzImage
-zImage bzImage: vmlinux
+bzImage: vmlinux
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
-compressed: zImage
-
-zlilo bzlilo: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
-
-zdisk bzdisk: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
-
-fdimage fdimage144 fdimage288 isoimage: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
-
-install:
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
+$(BOOT_TARGETS): vmlinux
+ $(Q)$(MAKE) $(build)=$(boot) $@
PHONY += vdso_install
vdso_install:
@@ -205,7 +194,3 @@ define archhelp
echo ' FDARGS="..." arguments for the booted kernel'
echo ' FDINITRD=file initrd for the booted kernel'
endef
-
-CLEAN_FILES += arch/x86/boot/fdimage \
- arch/x86/boot/image.iso \
- arch/x86/boot/mtools.conf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index c70eff69a1f..fb737ce5888 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -6,26 +6,24 @@
# for more details.
#
# Copyright (C) 1994 by Linus Torvalds
+# Changed by many, many contributors over the years.
#
# ROOT_DEV specifies the default root-device when making the image.
# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
# the default of FLOPPY is used by 'build'.
-ROOT_DEV := CURRENT
+ROOT_DEV := CURRENT
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
# The number is the same as you would ordinarily press at bootup.
-SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
+SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
-# If you want the RAM disk device, define this to be the size in blocks.
-
-#RAMDISK := -DRAMDISK=512
-
-targets := vmlinux.bin setup.bin setup.elf zImage bzImage
+targets := vmlinux.bin setup.bin setup.elf bzImage
+targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
subdir- := compressed
setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
@@ -71,17 +69,13 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
KBUILD_CFLAGS += $(call cc-option,-m32)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK)
-$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
-$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
-$(obj)/bzImage: BUILDFLAGS := -b
+$(obj)/bzImage: asflags-y := $(SVGA_MODE)
quiet_cmd_image = BUILD $@
-cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \
- $(obj)/vmlinux.bin $(ROOT_DEV) > $@
+cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
+ $(ROOT_DEV) > $@
-$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \
- $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
$(call if_changed,image)
@echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
@@ -116,9 +110,11 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
$(obj)/compressed/vmlinux: FORCE
$(Q)$(MAKE) $(build)=$(obj)/compressed $@
-# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
+# Set this if you want to pass append arguments to the
+# bzdisk/fdimage/isoimage kernel
FDARGS =
-# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel
+# Set this if you want an initrd included with the
+# bzdisk/fdimage/isoimage kernel
FDINITRD =
image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
@@ -127,7 +123,7 @@ $(obj)/mtools.conf: $(src)/mtools.conf.in
sed -e 's|@OBJ@|$(obj)|g' < $< > $@
# This requires write access to /dev/fd0
-zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
+bzdisk: $(obj)/bzImage $(obj)/mtools.conf
MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
syslinux /dev/fd0 ; sync
echo '$(image_cmdline)' | \
@@ -135,10 +131,10 @@ zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
if [ -f '$(FDINITRD)' ] ; then \
MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync
+ MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync
# These require being root or having syslinux 2.02 or higher installed
-fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
+fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
syslinux $(obj)/fdimage ; sync
@@ -147,9 +143,9 @@ fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
if [ -f '$(FDINITRD)' ] ; then \
MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync
+ MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync
-fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
+fdimage288: $(obj)/bzImage $(obj)/mtools.conf
dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
syslinux $(obj)/fdimage ; sync
@@ -158,9 +154,9 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
if [ -f '$(FDINITRD)' ] ; then \
MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync
+ MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync
-isoimage: $(BOOTIMAGE)
+isoimage: $(obj)/bzImage
-rm -rf $(obj)/isoimage
mkdir $(obj)/isoimage
for i in lib lib64 share end ; do \
@@ -170,7 +166,7 @@ isoimage: $(BOOTIMAGE)
fi ; \
if [ $$i = end ] ; then exit 1 ; fi ; \
done
- cp $(BOOTIMAGE) $(obj)/isoimage/linux
+ cp $(obj)/bzImage $(obj)/isoimage/linux
echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
if [ -f '$(FDINITRD)' ] ; then \
cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
@@ -181,12 +177,13 @@ isoimage: $(BOOTIMAGE)
isohybrid $(obj)/image.iso 2>/dev/null || true
rm -rf $(obj)/isoimage
-zlilo: $(BOOTIMAGE)
+bzlilo: $(obj)/bzImage
if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
- cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz
+ cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
cp System.map $(INSTALL_PATH)/
if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
install:
- sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
+ sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
+ System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 7ccff4884a2..5d84d1c74e4 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -24,12 +24,8 @@
#include "boot.h"
#include "offsets.h"
-SETUPSECTS = 4 /* default nr of setup-sectors */
BOOTSEG = 0x07C0 /* original address of boot-sector */
-SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
-SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
- /* to be loaded */
-ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
+SYSSEG = 0x1000 /* historical load address >> 4 */
#ifndef SVGA_MODE
#define SVGA_MODE ASK_VGA
@@ -97,12 +93,12 @@ bugger_off_msg:
.section ".header", "a"
.globl hdr
hdr:
-setup_sects: .byte SETUPSECTS
+setup_sects: .byte 0 /* Filled in by build.c */
root_flags: .word ROOT_RDONLY
-syssize: .long SYSSIZE
-ram_size: .word RAMDISK
+syssize: .long 0 /* Filled in by build.c */
+ram_size: .word 0 /* Obsolete */
vid_mode: .word SVGA_MODE
-root_dev: .word ROOT_DEV
+root_dev: .word 0 /* Filled in by build.c */
boot_flag: .word 0xAA55
# offset 512, entry point
@@ -123,14 +119,15 @@ _start:
# or else old loadlin-1.5 will fail)
.globl realmode_swtch
realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
-start_sys_seg: .word SYSSEG
+start_sys_seg: .word SYSSEG # obsolete and meaningless, but just
+ # in case something decided to "use" it
.word kernel_version-512 # pointing to kernel version string
# above section of header is compatible
# with loadlin-1.5 (header v1.5). Don't
# change it.
-type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin,
- # Bootlin, SYSLX, bootsect...)
+type_of_loader: .byte 0 # 0 means ancient bootloader, newer
+ # bootloaders know to change this.
# See Documentation/i386/boot.txt for
# assigned ids
@@ -142,11 +139,7 @@ CAN_USE_HEAP = 0x80 # If set, the loader also has set
# space behind setup.S can be used for
# heap purposes.
# Only the loader knows what is free
-#ifndef __BIG_KERNEL__
- .byte 0
-#else
.byte LOADED_HIGH
-#endif
setup_move_size: .word 0x8000 # size to move, when setup is not
# loaded at 0x90000. We will move setup
@@ -157,11 +150,7 @@ setup_move_size: .word 0x8000 # size to move, when setup is not
code32_start: # here loaders can put a different
# start address for 32-bit code.
-#ifndef __BIG_KERNEL__
- .long 0x1000 # 0x1000 = default for zImage
-#else
.long 0x100000 # 0x100000 = default for big kernel
-#endif
ramdisk_image: .long 0 # address of loaded ramdisk image
# Here the loader puts the 32-bit
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 85a1cd8a8ff..8062f891525 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -33,47 +33,6 @@ static void realmode_switch_hook(void)
}
/*
- * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
- * A bzImage kernel is loaded and runs at 0x100000.
- */
-static void move_kernel_around(void)
-{
- /* Note: rely on the compile-time option here rather than
- the LOADED_HIGH flag. The Qemu kernel loader unconditionally
- sets the loadflags to zero. */
-#ifndef __BIG_KERNEL__
- u16 dst_seg, src_seg;
- u32 syssize;
-
- dst_seg = 0x1000 >> 4;
- src_seg = 0x10000 >> 4;
- syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
-
- while (syssize) {
- int paras = (syssize >= 0x1000) ? 0x1000 : syssize;
- int dwords = paras << 2;
-
- asm volatile("pushw %%es ; "
- "pushw %%ds ; "
- "movw %1,%%es ; "
- "movw %2,%%ds ; "
- "xorw %%di,%%di ; "
- "xorw %%si,%%si ; "
- "rep;movsl ; "
- "popw %%ds ; "
- "popw %%es"
- : "+c" (dwords)
- : "r" (dst_seg), "r" (src_seg)
- : "esi", "edi");
-
- syssize -= paras;
- dst_seg += paras;
- src_seg += paras;
- }
-#endif
-}
-
-/*
* Disable all interrupts at the legacy PIC.
*/
static void mask_all_interrupts(void)
@@ -147,9 +106,6 @@ void go_to_protected_mode(void)
/* Hook before leaving real mode, also disables interrupts */
realmode_switch_hook();
- /* Move the kernel/setup to their final resting places */
- move_kernel_around();
-
/* Enable the A20 gate */
if (enable_a20()) {
puts("A20 gate not responding, unable to boot...\n");
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 44dc1923c0e..ee3a4ea923a 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
static void usage(void)
{
- die("Usage: build [-b] setup system [rootdev] [> image]");
+ die("Usage: build setup system [rootdev] [> image]");
}
int main(int argc, char ** argv)
@@ -145,11 +145,6 @@ int main(int argc, char ** argv)
void *kernel;
u32 crc = 0xffffffffUL;
- if (argc > 2 && !strcmp(argv[1], "-b"))
- {
- is_big_kernel = 1;
- argc--, argv++;
- }
if ((argc < 3) || (argc > 4))
usage();
if (argc > 3) {
@@ -216,8 +211,6 @@ int main(int argc, char ** argv)
die("Unable to mmap '%s': %m", argv[2]);
/* Number of 16-byte paragraphs, including space for a 4-byte CRC */
sys_size = (sz + 15 + 4) / 16;
- if (!is_big_kernel && sys_size > DEF_SYSSIZE)
- die("System is too big. Try using bzImage or modules.");
/* Patch the setup code with the appropriate size parameters */
buf[0x1f1] = setup_sectors-1;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4ef949c1972..394d177d721 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -379,6 +379,7 @@ static inline u32 safe_apic_wait_icr_idle(void)
static inline void ack_APIC_irq(void)
{
+#ifdef CONFIG_X86_LOCAL_APIC
/*
* ack_APIC_irq() actually gets compiled as a single instruction
* ... yummie.
@@ -386,6 +387,7 @@ static inline void ack_APIC_irq(void)
/* Docs say use 0 for future compatibility */
apic_write(APIC_EOI, 0);
+#endif
}
static inline unsigned default_get_apic_id(unsigned long x)
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b..bc9514fb3b1 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
#define APIC_ESR_SENDILL 0x00020
#define APIC_ESR_RECVILL 0x00040
#define APIC_ESR_ILLREGA 0x00080
+#define APIC_LVTCMCI 0x2f0
#define APIC_ICR 0x300
#define APIC_DEST_SELF 0x40000
#define APIC_DEST_ALLINC 0x80000
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6526cf08b0e..6ba23dd9fc9 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -1,10 +1,6 @@
#ifndef _ASM_X86_BOOT_H
#define _ASM_X86_BOOT_H
-/* Don't touch these, unless you really know what you're doing. */
-#define DEF_SYSSEG 0x1000
-#define DEF_SYSSIZE 0x7F00
-
/* Internal svga startup constants */
#define NORMAL_VGA 0xffff /* 80x25 mode */
#define EXTENDED_VGA 0xfffe /* 80x50 mode */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 2f8466540fb..5b301b7ff5f 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -5,24 +5,43 @@
#include <linux/mm.h>
/* Caches aren't brain-dead on the intel. */
-#define flush_cache_all() do { } while (0)
-#define flush_cache_mm(mm) do { } while (0)
-#define flush_cache_dup_mm(mm) do { } while (0)
-#define flush_cache_range(vma, start, end) do { } while (0)
-#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
-#define flush_dcache_page(page) do { } while (0)
-#define flush_dcache_mmap_lock(mapping) do { } while (0)
-#define flush_dcache_mmap_unlock(mapping) do { } while (0)
-#define flush_icache_range(start, end) do { } while (0)
-#define flush_icache_page(vma, pg) do { } while (0)
-#define flush_icache_user_range(vma, pg, adr, len) do { } while (0)
-#define flush_cache_vmap(start, end) do { } while (0)
-#define flush_cache_vunmap(start, end) do { } while (0)
+static inline void flush_cache_all(void) { }
+static inline void flush_cache_mm(struct mm_struct *mm) { }
+static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
+static inline void flush_cache_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end) { }
+static inline void flush_cache_page(struct vm_area_struct *vma,
+ unsigned long vmaddr, unsigned long pfn) { }
+static inline void flush_dcache_page(struct page *page) { }
+static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
+static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
+static inline void flush_icache_range(unsigned long start,
+ unsigned long end) { }
+static inline void flush_icache_page(struct vm_area_struct *vma,
+ struct page *page) { }
+static inline void flush_icache_user_range(struct vm_area_struct *vma,
+ struct page *page,
+ unsigned long addr,
+ unsigned long len) { }
+static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
+static inline void flush_cache_vunmap(unsigned long start,
+ unsigned long end) { }
-#define copy_to_user_page(vma, page, vaddr, dst, src, len) \
- memcpy((dst), (src), (len))
-#define copy_from_user_page(vma, page, vaddr, dst, src, len) \
- memcpy((dst), (src), (len))
+static inline void copy_to_user_page(struct vm_area_struct *vma,
+ struct page *page, unsigned long vaddr,
+ void *dst, const void *src,
+ unsigned long len)
+{
+ memcpy(dst, src, len);
+}
+
+static inline void copy_from_user_page(struct vm_area_struct *vma,
+ struct page *page, unsigned long vaddr,
+ void *dst, const void *src,
+ unsigned long len)
+{
+ memcpy(dst, src, len);
+}
#define PG_non_WB PG_arch_1
PAGEFLAG(NonWB, non_WB)
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
new file mode 100755
index 00000000000..56f1635e461
--- /dev/null
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -0,0 +1,199 @@
+#ifndef _ASM_X86_CPU_DEBUG_H
+#define _ASM_X86_CPU_DEBUG_H
+
+/*
+ * CPU x86 architecture debug
+ *
+ * Copyright(C) 2009 Jaswinder Singh Rajput
+ */
+
+/* Register flags */
+enum cpu_debug_bit {
+/* Model Specific Registers (MSRs) */
+ CPU_MC_BIT, /* Machine Check */
+ CPU_MONITOR_BIT, /* Monitor */
+ CPU_TIME_BIT, /* Time */
+ CPU_PMC_BIT, /* Performance Monitor */
+ CPU_PLATFORM_BIT, /* Platform */
+ CPU_APIC_BIT, /* APIC */
+ CPU_POWERON_BIT, /* Power-on */
+ CPU_CONTROL_BIT, /* Control */
+ CPU_FEATURES_BIT, /* Features control */
+ CPU_LBRANCH_BIT, /* Last Branch */
+ CPU_BIOS_BIT, /* BIOS */
+ CPU_FREQ_BIT, /* Frequency */
+ CPU_MTTR_BIT, /* MTRR */
+ CPU_PERF_BIT, /* Performance */
+ CPU_CACHE_BIT, /* Cache */
+ CPU_SYSENTER_BIT, /* Sysenter */
+ CPU_THERM_BIT, /* Thermal */
+ CPU_MISC_BIT, /* Miscellaneous */
+ CPU_DEBUG_BIT, /* Debug */
+ CPU_PAT_BIT, /* PAT */
+ CPU_VMX_BIT, /* VMX */
+ CPU_CALL_BIT, /* System Call */
+ CPU_BASE_BIT, /* BASE Address */
+ CPU_SMM_BIT, /* System mgmt mode */
+ CPU_SVM_BIT, /*Secure Virtual Machine*/
+ CPU_OSVM_BIT, /* OS-Visible Workaround*/
+/* Standard Registers */
+ CPU_TSS_BIT, /* Task Stack Segment */
+ CPU_CR_BIT, /* Control Registers */
+ CPU_DT_BIT, /* Descriptor Table */
+/* End of Registers flags */
+ CPU_REG_ALL_BIT, /* Select all Registers */
+};
+
+#define CPU_REG_ALL (~0) /* Select all Registers */
+
+#define CPU_MC (1 << CPU_MC_BIT)
+#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
+#define CPU_TIME (1 << CPU_TIME_BIT)
+#define CPU_PMC (1 << CPU_PMC_BIT)
+#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
+#define CPU_APIC (1 << CPU_APIC_BIT)
+#define CPU_POWERON (1 << CPU_POWERON_BIT)
+#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
+#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
+#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
+#define CPU_BIOS (1 << CPU_BIOS_BIT)
+#define CPU_FREQ (1 << CPU_FREQ_BIT)
+#define CPU_MTRR (1 << CPU_MTTR_BIT)
+#define CPU_PERF (1 << CPU_PERF_BIT)
+#define CPU_CACHE (1 << CPU_CACHE_BIT)
+#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
+#define CPU_THERM (1 << CPU_THERM_BIT)
+#define CPU_MISC (1 << CPU_MISC_BIT)
+#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
+#define CPU_PAT (1 << CPU_PAT_BIT)
+#define CPU_VMX (1 << CPU_VMX_BIT)
+#define CPU_CALL (1 << CPU_CALL_BIT)
+#define CPU_BASE (1 << CPU_BASE_BIT)
+#define CPU_SMM (1 << CPU_SMM_BIT)
+#define CPU_SVM (1 << CPU_SVM_BIT)
+#define CPU_OSVM (1 << CPU_OSVM_BIT)
+#define CPU_TSS (1 << CPU_TSS_BIT)
+#define CPU_CR (1 << CPU_CR_BIT)
+#define CPU_DT (1 << CPU_DT_BIT)
+
+/* Register file flags */
+enum cpu_file_bit {
+ CPU_INDEX_BIT, /* index */
+ CPU_VALUE_BIT, /* value */
+};
+
+#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
+
+/*
+ * DisplayFamily_DisplayModel Processor Families/Processor Number Series
+ * -------------------------- ------------------------------------------
+ * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
+ *
+ * 06_01 Pentium Pro
+ * 06_03, 06_05 Pentium II Xeon, Pentium II
+ * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III
+ *
+ * 06_09, 060D Pentium M
+ *
+ * 06_0E Core Duo, Core Solo
+ *
+ * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
+ * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
+ * Pentium dual-core
+ * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
+ *
+ * 06_1C Atom
+ *
+ * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
+ * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
+ *
+ * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
+ * Pentium 4, Pentium D
+ */
+
+/* Register processors bits */
+enum cpu_processor_bit {
+ CPU_NONE,
+/* Intel */
+ CPU_INTEL_PENTIUM_BIT,
+ CPU_INTEL_P6_BIT,
+ CPU_INTEL_PENTIUM_M_BIT,
+ CPU_INTEL_CORE_BIT,
+ CPU_INTEL_CORE2_BIT,
+ CPU_INTEL_ATOM_BIT,
+ CPU_INTEL_XEON_P4_BIT,
+ CPU_INTEL_XEON_MP_BIT,
+};
+
+#define CPU_ALL (~0) /* Select all CPUs */
+
+#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
+#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
+#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
+#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
+#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
+#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
+#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
+#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
+
+#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
+#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
+#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
+#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
+#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
+#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
+#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
+#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
+#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
+#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
+#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
+#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
+#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
+#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
+#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
+#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
+#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
+#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
+#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
+
+/* Select all Intel CPUs*/
+#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
+
+#define MAX_CPU_FILES 512
+
+struct cpu_private {
+ unsigned cpu;
+ unsigned type;
+ unsigned reg;
+ unsigned file;
+};
+
+struct cpu_debug_base {
+ char *name; /* Register name */
+ unsigned flag; /* Register flag */
+ unsigned write; /* Register write flag */
+};
+
+/*
+ * Currently it looks similar to cpu_debug_base but once we add more files
+ * cpu_file_base will go in different direction
+ */
+struct cpu_file_base {
+ char *name; /* Register file name */
+ unsigned flag; /* Register file flag */
+ unsigned write; /* Register write flag */
+};
+
+struct cpu_cpuX_base {
+ struct dentry *dentry; /* Register dentry */
+ int init; /* Register index file */
+};
+
+struct cpu_debug_range {
+ unsigned min; /* Register range min */
+ unsigned max; /* Register range max */
+ unsigned flag; /* Supported flags */
+ unsigned model; /* Supported models */
+};
+
+#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index dc27705f544..5623c50d67b 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -91,7 +91,6 @@ static inline int desc_empty(const void *ptr)
#define store_gdt(dtr) native_store_gdt(dtr)
#define store_idt(dtr) native_store_idt(dtr)
#define store_tr(tr) (tr = native_store_tr())
-#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
#define load_TLS(t, cpu) native_load_tls(t, cpu)
#define set_ldt native_set_ldt
@@ -112,6 +111,8 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
}
#endif /* CONFIG_PARAVIRT */
+#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
+
static inline void native_write_idt_entry(gate_desc *idt, int entry,
const gate_desc *gate)
{
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ca5ffb2856b..edc90f23e70 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
#else /* !CONFIG_X86_32 */
-#define MAX_EFI_IO_PAGES 100
-
extern u64 efi_call0(void *fp);
extern u64 efi_call1(void *fp, u64 arg1);
extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 854d538ae85..c2e6bedaf25 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
smp_invalidate_interrupt)
#endif
+BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
+
/*
* every pentium local APIC has two 'local interrupts', with a
* soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index dca8f03da5b..63a79c77d22 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -24,9 +24,6 @@
#include <asm/kmap_types.h>
#else
#include <asm/vsyscall.h>
-#ifdef CONFIG_EFI
-#include <asm/efi.h>
-#endif
#endif
/*
@@ -92,13 +89,6 @@ enum fixed_addresses {
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_EFI
- FIX_EFI_IO_MAP_LAST_PAGE,
- FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
- + MAX_EFI_IO_PAGES - 1,
-#endif
-#endif
#ifdef CONFIG_X86_VISWS_APIC
FIX_CO_CPU, /* Cobalt timer */
FIX_CO_APIC, /* Cobalt APIC Redirection Table */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 176f058e715..039db6aa8e0 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,6 +12,7 @@ typedef struct {
unsigned int apic_timer_irqs; /* arch dependent */
unsigned int irq_spurious_count;
#endif
+ unsigned int generic_irqs; /* arch dependent */
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index bf9276bea66..014c2b85ae4 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -63,6 +63,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
void *kmap_atomic(struct page *page, enum km_type type);
void kunmap_atomic(void *kvaddr, enum km_type type);
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
struct page *kmap_atomic_to_page(void *ptr);
#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 370e1c83bb4..b762ea49bd7 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,6 +27,7 @@
/* Interrupt handlers registered during init_IRQ */
extern void apic_timer_interrupt(void);
+extern void generic_interrupt(void);
extern void error_interrupt(void);
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c..71c9e518398 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
#else /* CONFIG_X86_32 */
-extern void finit(void);
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_task(struct task_struct *tsk);
+#else
+static inline void finit_task(struct task_struct *tsk)
+{
+}
+#endif
static inline void tolerant_fwait(void)
{
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 00000000000..36fb1a6a510
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_INIT_32_H
+#define _ASM_X86_INIT_32_H
+
+#ifdef CONFIG_X86_32
+extern void __init early_ioremap_page_table_range_init(void);
+#endif
+
+extern unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
+
+
+extern unsigned long __initdata e820_table_start;
+extern unsigned long __meminitdata e820_table_end;
+extern unsigned long __meminitdata e820_table_top;
+
+#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 683d0b4c00f..e5383e3d2f8 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
extern void iounmap(volatile void __iomem *addr);
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
-
#ifdef CONFIG_X86_32
# include "io_32.h"
@@ -198,7 +196,6 @@ extern void early_ioremap_reset(void);
extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
extern void early_iounmap(void __iomem *addr, unsigned long size);
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
#define IO_SPACE_LIMIT 0xffff
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 107eb219669..f38481bcd45 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq)
extern void fixup_irqs(void);
#endif
+extern void (*generic_interrupt_extension)(void);
extern void init_IRQ(void);
extern void native_init_IRQ(void);
extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8a285f356f8..3cbd79bbb47 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -112,6 +112,11 @@
#define LOCAL_PERF_VECTOR 0xee
/*
+ * Generic system vector for platform specific use
+ */
+#define GENERIC_INTERRUPT_VECTOR 0xed
+
+/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
* start at 0x31(0x41) to spread out vectors evenly between priority
* levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0ceb6d19ed3..317ff1703d0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -9,13 +9,13 @@
# define PAGES_NR 4
#else
# define PA_CONTROL_PAGE 0
-# define PA_TABLE_PAGE 1
-# define PAGES_NR 2
+# define VA_CONTROL_PAGE 1
+# define PA_TABLE_PAGE 2
+# define PA_SWAP_PAGE 3
+# define PAGES_NR 4
#endif
-#ifdef CONFIG_X86_32
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
-#endif
#ifndef __ASSEMBLY__
@@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page,
unsigned int has_pae,
unsigned int preserve_context);
#else
-NORET_TYPE void
+unsigned long
relocate_kernel(unsigned long indirection_page,
unsigned long page_list,
- unsigned long start_address) ATTRIB_NORET;
+ unsigned long start_address,
+ unsigned int preserve_context);
#endif
#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 9320e2a8a26..12d55e773eb 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -1,14 +1,11 @@
#ifndef _ASM_X86_LINKAGE_H
#define _ASM_X86_LINKAGE_H
+#include <linux/stringify.h>
+
#undef notrace
#define notrace __attribute__((no_instrument_function))
-#ifdef CONFIG_X86_64
-#define __ALIGN .p2align 4,,15
-#define __ALIGN_STR ".p2align 4,,15"
-#endif
-
#ifdef CONFIG_X86_32
#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
/*
@@ -50,16 +47,20 @@
__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
"g" (arg4), "g" (arg5), "g" (arg6))
-#endif
+#endif /* CONFIG_X86_32 */
+
+#ifdef __ASSEMBLY__
#define GLOBAL(name) \
.globl name; \
name:
-#ifdef CONFIG_X86_ALIGNMENT_16
-#define __ALIGN .align 16,0x90
-#define __ALIGN_STR ".align 16,0x90"
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
+#define __ALIGN .p2align 4, 0x90
+#define __ALIGN_STR __stringify(__ALIGN)
#endif
+#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_X86_LINKAGE_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960..563933e06a3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
*/
#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
+#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
+#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
#include <asm/atomic.h>
+void mce_setup(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct sys_device, device_mce);
extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+/*
+ * To support more than 128 would need to escape the predefined
+ * Linux defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
+
#ifdef CONFIG_X86_MCE_INTEL
void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(int dying);
+void cmci_recheck(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(int dying) {}
+static inline void cmci_recheck(void) {}
#endif
#ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#endif
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+extern int mce_available(struct cpuinfo_x86 *c);
+
+void mce_log_therm_throt_event(__u64 status);
extern atomic_t mce_entry;
extern void do_machine_check(struct pt_regs *, long);
+
+typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
+
+enum mcp_flags {
+ MCP_TIMESTAMP = (1 << 0), /* log time stamp */
+ MCP_UC = (1 << 1), /* log uncorrected errors */
+};
+extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+
extern int mce_notify_user(void);
#endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
#else
#define mcheck_init(c) do { } while (0)
#endif
-extern void stop_mce(void);
-extern void restart_mce(void);
+
+extern void (*mce_threshold_vector)(void);
#endif /* __KERNEL__ */
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a063..ede6998bd92 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn)
#endif /* CONFIG_DISCONTIGMEM */
#ifdef CONFIG_NEED_MULTIPLE_NODES
-
-/*
- * Following are macros that are specific to this numa platform.
- */
-#define reserve_bootmem(addr, size, flags) \
- reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
-#define alloc_bootmem(x) \
- __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_nopanic(x) \
- __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
- __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
- __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
-#define alloc_bootmem_pages(x) \
- __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_pages_nopanic(x) \
- __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
- __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
- __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-#define alloc_bootmem_node(pgdat, x) \
-({ \
- struct pglist_data __maybe_unused \
- *__alloc_bootmem_node__pgdat = (pgdat); \
- __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
- __pa(MAX_DMA_ADDRESS)); \
-})
-#define alloc_bootmem_pages_node(pgdat, x) \
-({ \
- struct pglist_data __maybe_unused \
- *__alloc_bootmem_node__pgdat = (pgdat); \
- __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
- __pa(MAX_DMA_ADDRESS)); \
-})
-#define alloc_bootmem_low_pages_node(pgdat, x) \
-({ \
- struct pglist_data __maybe_unused \
- *__alloc_bootmem_node__pgdat = (pgdat); \
- __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
-})
+/* always use node 0 for bootmem on this numa platform */
+#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \
+ (NODE_DATA(0)->bdata)
#endif /* CONFIG_NEED_MULTIPLE_NODES */
#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae0..2dbd2314139 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,11 @@
#define MSR_IA32_MC0_ADDR 0x00000402
#define MSR_IA32_MC0_MISC 0x00000403
+/* These are consecutive and not in the normal 4er MCE bank block */
+#define MSR_IA32_MC0_CTL2 0x00000280
+#define CMCI_EN (1ULL << 30)
+#define CMCI_THRESHOLD_MASK 0xffffULL
+
#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
#define MSR_P6_EVNTSEL0 0x00000186
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da6603..826ad37006a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,14 +40,8 @@
#ifndef __ASSEMBLY__
-struct pgprot;
-
extern int page_is_ram(unsigned long pagenr);
extern int devmem_is_allowed(unsigned long pagenr);
-extern void map_devmem(unsigned long pfn, unsigned long size,
- struct pgprot vma_prot);
-extern void unmap_devmem(unsigned long pfn, unsigned long size,
- struct pgprot vma_prot);
extern unsigned long max_low_pfn_mapped;
extern unsigned long max_pfn_mapped;
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e70056838..2cd07b9422f 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,6 +2,7 @@
#define _ASM_X86_PAT_H
#include <linux/types.h>
+#include <asm/pgtable_types.h>
#ifdef CONFIG_X86_PAT
extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
extern int kernel_map_sync_memtype(u64 base, unsigned long size,
unsigned long flag);
+extern void map_devmem(unsigned long pfn, unsigned long size,
+ struct pgprot vma_prot);
+extern void unmap_devmem(unsigned long pfn, unsigned long size,
+ struct pgprot vma_prot);
#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c097a3a666..d0812e155f1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
return 1;
}
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
#endif /* __ASSEMBLY__ */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b2fe0..2733fad45f9 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -25,6 +25,11 @@
* area for the same reason. ;)
*/
#define VMALLOC_OFFSET (8 * 1024 * 1024)
+
+#ifndef __ASSEMBLER__
+extern bool __vmalloc_start_set; /* set once high_memory is set */
+#endif
+
#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
#ifdef CONFIG_X86_PAE
#define LAST_PKMAP 512
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0..b8238dc8786 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
extern int nx_enabled;
+extern void set_nx(void);
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index dccef5be0fc..ae85a8d66a3 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -75,9 +75,9 @@ struct cpuinfo_x86 {
#else
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
int x86_tlbsize;
+#endif
__u8 x86_virt_bits;
__u8 x86_phys_bits;
-#endif
/* CPUID returned core id bits: */
__u8 x86_coreid_bits;
/* Max extended CPUID function supported: */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 777327ef05c..9f4dfba33b2 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
+/* Loop through all installed blades */
+#define for_each_possible_blade(bid) \
+ for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
+
/*
* Macros for converting between kernel virtual addresses, socket local physical
* addresses, and UV global physical addresses.
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 5e79ca69432..9c371e4a9fa 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -296,6 +296,8 @@ HYPERVISOR_get_debugreg(int reg)
static inline int
HYPERVISOR_update_descriptor(u64 ma, u64 desc)
{
+ if (sizeof(u64) == sizeof(long))
+ return _hypercall2(int, update_descriptor, ma, desc);
return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
}
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990ee43d..1a918dde46b 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x)
xmaddr_t arbitrary_virt_to_machine(void *address);
+unsigned long arbitrary_virt_to_mfn(void *vaddr);
void make_lowmem_page_readonly(void *vaddr);
void make_lowmem_page_readwrite(void *vaddr);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 95f216bbfaf..339ce35648e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -111,7 +111,7 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o
+ obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d5..4c80f155743 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
that might execute the to be patched code.
Other CPUs are not running. */
stop_nmi();
-#ifdef CONFIG_X86_MCE
- stop_mce();
-#endif
+
+ /*
+ * Don't stop machine check exceptions while patching.
+ * MCEs only happen when something got corrupted and in this
+ * case we must do something about the corruption.
+ * Ignoring it is worse than a unlikely patching race.
+ * Also machine checks tend to be broadcast and if one CPU
+ * goes into machine check the others follow quickly, so we don't
+ * expect a machine check to cause undue problems during to code
+ * patching.
+ */
apply_alternatives(__alt_instructions, __alt_instructions_end);
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
(unsigned long)__smp_locks_end);
restart_nmi();
-#ifdef CONFIG_X86_MCE
- restart_mce();
-#endif
}
/**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f9cecdfd05c..30909a258d0 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -46,6 +46,7 @@
#include <asm/idle.h>
#include <asm/mtrr.h>
#include <asm/smp.h>
+#include <asm/mce.h>
unsigned int num_processors;
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
}
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6) {
+ v = apic_read(APIC_LVTCMCI);
+ if (!(v & APIC_LVT_MASKED))
+ apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+ }
+#endif
+
/*
* Clean APIC state for other OSs:
*/
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_LVT1, value);
preempt_enable();
+
+#ifdef CONFIG_X86_MCE_INTEL
+ /* Recheck CMCI information after local APIC is up on CPU #0 */
+ if (smp_processor_id() == 0)
+ cmci_recheck();
+#endif
}
void __cpuinit end_local_APIC_setup(void)
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 2ac0ab71412..b617b1164f1 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -83,7 +83,7 @@ void __init setup_bios_corruption_check(void)
u64 size;
addr = find_e820_area_size(addr, &size, PAGE_SIZE);
- if (addr == 0)
+ if (!(addr + 1))
break;
if ((addr + size) > corruption_check_size)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2d..d4356f8b752 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,6 +14,8 @@ obj-y += vmware.o hypervisor.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
+obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
+
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 6882a735d9c..8220ae69849 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
u32 regs[4];
const struct cpuid_bit *cb;
- static const struct cpuid_bit cpuid_bits[] = {
+ static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
{ 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5b80e..7e4a459daa6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/apic.h>
+#include <asm/cpu.h>
#ifdef CONFIG_X86_64
# include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
}
}
+static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ /* calling is from identify_secondary_cpu() ? */
+ if (c->cpu_index == boot_cpu_id)
+ return;
+
+ /*
+ * Certain Athlons might work (for various values of 'work') in SMP
+ * but they are not certified as MP capable.
+ */
+ /* Athlon 660/661 is valid. */
+ if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+ (c->x86_mask == 1)))
+ goto valid_k7;
+
+ /* Duron 670 is valid */
+ if ((c->x86_model == 7) && (c->x86_mask == 0))
+ goto valid_k7;
+
+ /*
+ * Athlon 662, Duron 671, and Athlon >model 7 have capability
+ * bit. It's worth noting that the A5 stepping (662) of some
+ * Athlon XP's have the MP bit set.
+ * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+ * more.
+ */
+ if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+ (c->x86_model > 7))
+ if (cpu_has_mp)
+ goto valid_k7;
+
+ /* If we get here, not a certified SMP capable AMD system. */
+
+ /*
+ * Don't taint if we are running SMP kernel on a single non-MP
+ * approved Athlon
+ */
+ WARN_ONCE(1, "WARNING: This combination of AMD"
+ "processors is not suitable for SMP.\n");
+ if (!test_taint(TAINT_UNSAFE_SMP))
+ add_taint(TAINT_UNSAFE_SMP);
+
+valid_k7:
+ ;
+#endif
+}
+
static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
{
u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
}
set_cpu_cap(c, X86_FEATURE_K7);
+
+ amd_k7_smp_check(c);
}
#endif
@@ -450,7 +502,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
}
#endif
-static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 89bfdd9cacc..983e0830f0d 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -468,7 +468,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
return size;
}
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index a1625f5a1e7..51b09c48c9c 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -25,7 +25,7 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
}
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+static const struct cpu_dev centaur_cpu_dev __cpuinitconst = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a9e3791ca09..e2962cc1e27 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -70,7 +70,7 @@ cpumask_t cpu_callin_map;
#endif /* CONFIG_X86_32 */
-static struct cpu_dev *this_cpu __cpuinitdata;
+static const struct cpu_dev *this_cpu __cpuinitdata;
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
@@ -284,9 +284,9 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
*/
/* Look up CPU names by table lookup. */
-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
{
- struct cpu_model_info *info;
+ const struct cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
@@ -333,7 +333,7 @@ void switch_to_new_gdt(int cpu)
load_percpu_segment(cpu);
}
-static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
static void __cpuinit default_init(struct cpuinfo_x86 *c)
{
@@ -352,7 +352,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
#endif
}
-static struct cpu_dev __cpuinitdata default_cpu = {
+static const struct cpu_dev __cpuinitconst default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
.c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -574,13 +574,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
}
}
-#ifdef CONFIG_X86_64
if (c->extended_cpuid_level >= 0x80000008) {
u32 eax = cpuid_eax(0x80000008);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
}
+#ifdef CONFIG_X86_32
+ else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
#endif
if (c->extended_cpuid_level >= 0x80000007)
@@ -627,8 +629,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
@@ -659,12 +665,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
void __init early_cpu_init(void)
{
- struct cpu_dev **cdev;
+ const struct cpu_dev *const *cdev;
int count = 0;
printk(KERN_INFO "KERNEL supported cpus:\n");
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
- struct cpu_dev *cpudev = *cdev;
+ const struct cpu_dev *cpudev = *cdev;
unsigned int j;
if (count >= X86_VENDOR_NUM)
@@ -751,9 +757,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_coreid_bits = 0;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->cpuid_level = -1; /* CPUID not detected */
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -793,7 +803,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
- char *p;
+ const char *p;
p = table_lookup_model(c);
if (p)
strcpy(c->x86_model_id, p);
@@ -872,7 +882,7 @@ struct msr_range {
unsigned max;
};
-static struct msr_range msr_range_array[] __cpuinitdata = {
+static const struct msr_range msr_range_array[] __cpuinitconst = {
{ 0x00000000, 0x00000418},
{ 0xc0000000, 0xc000040b},
{ 0xc0010000, 0xc0010142},
@@ -921,7 +931,7 @@ __setup("noclflush", setup_noclflush);
void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
- char *vendor = NULL;
+ const char *vendor = NULL;
if (c->x86_vendor < X86_VENDOR_NUM) {
vendor = this_cpu->c_vendor;
@@ -1139,8 +1149,7 @@ void __cpuinit cpu_init(void)
atomic_inc(&init_mm.mm_count);
me->active_mm = &init_mm;
- if (me->mm)
- BUG();
+ BUG_ON(me->mm);
enter_lazy_tlb(&init_mm, me);
load_sp0(t, &current->thread);
@@ -1197,8 +1206,7 @@ void __cpuinit cpu_init(void)
*/
atomic_inc(&init_mm.mm_count);
curr->active_mm = &init_mm;
- if (curr->mm)
- BUG();
+ BUG_ON(curr->mm);
enter_lazy_tlb(&init_mm, curr);
load_sp0(t, thread);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index de4094a3921..9469ecb5aeb 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -5,15 +5,15 @@
struct cpu_model_info {
int vendor;
int family;
- char *model_names[16];
+ const char *model_names[16];
};
/* attempt to consolidate cpu attributes */
struct cpu_dev {
- char * c_vendor;
+ const char * c_vendor;
/* some have two possibilities for cpuid string */
- char * c_ident[2];
+ const char * c_ident[2];
struct cpu_model_info c_models[4];
@@ -25,11 +25,12 @@ struct cpu_dev {
};
#define cpu_dev_register(cpu_devX) \
- static struct cpu_dev *__cpu_dev_##cpu_devX __used \
+ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
__attribute__((__section__(".x86_cpu_dev.init"))) = \
&cpu_devX;
-extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
+extern const struct cpu_dev *const __x86_cpu_dev_start[],
+ *const __x86_cpu_dev_end[];
extern void display_cacheinfo(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755
index 00000000000..21c0cf8ced1
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -0,0 +1,839 @@
+/*
+ * CPU x86 architecture debug code
+ *
+ * Copyright(C) 2009 Jaswinder Singh Rajput
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+
+#include <asm/cpu_debug.h>
+#include <asm/paravirt.h>
+#include <asm/system.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
+static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
+static DEFINE_PER_CPU(unsigned, cpu_modelflag);
+static DEFINE_PER_CPU(int, cpu_priv_count);
+static DEFINE_PER_CPU(unsigned, cpu_model);
+
+static DEFINE_MUTEX(cpu_debug_lock);
+
+static struct dentry *cpu_debugfs_dir;
+
+static struct cpu_debug_base cpu_base[] = {
+ { "mc", CPU_MC, 0 },
+ { "monitor", CPU_MONITOR, 0 },
+ { "time", CPU_TIME, 0 },
+ { "pmc", CPU_PMC, 1 },
+ { "platform", CPU_PLATFORM, 0 },
+ { "apic", CPU_APIC, 0 },
+ { "poweron", CPU_POWERON, 0 },
+ { "control", CPU_CONTROL, 0 },
+ { "features", CPU_FEATURES, 0 },
+ { "lastbranch", CPU_LBRANCH, 0 },
+ { "bios", CPU_BIOS, 0 },
+ { "freq", CPU_FREQ, 0 },
+ { "mtrr", CPU_MTRR, 0 },
+ { "perf", CPU_PERF, 0 },
+ { "cache", CPU_CACHE, 0 },
+ { "sysenter", CPU_SYSENTER, 0 },
+ { "therm", CPU_THERM, 0 },
+ { "misc", CPU_MISC, 0 },
+ { "debug", CPU_DEBUG, 0 },
+ { "pat", CPU_PAT, 0 },
+ { "vmx", CPU_VMX, 0 },
+ { "call", CPU_CALL, 0 },
+ { "base", CPU_BASE, 0 },
+ { "smm", CPU_SMM, 0 },
+ { "svm", CPU_SVM, 0 },
+ { "osvm", CPU_OSVM, 0 },
+ { "tss", CPU_TSS, 0 },
+ { "cr", CPU_CR, 0 },
+ { "dt", CPU_DT, 0 },
+ { "registers", CPU_REG_ALL, 0 },
+};
+
+static struct cpu_file_base cpu_file[] = {
+ { "index", CPU_REG_ALL, 0 },
+ { "value", CPU_REG_ALL, 1 },
+};
+
+/* Intel Registers Range */
+static struct cpu_debug_range cpu_intel_range[] = {
+ { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
+ { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
+ { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
+ { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
+ { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
+ { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
+
+ { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
+ { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
+ { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
+ { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
+
+ { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
+ { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
+ { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
+ { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
+
+ { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
+ { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
+ { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
+
+ { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
+ { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
+ { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
+ { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
+
+ { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
+ { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
+
+ { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
+ { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
+ { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
+ { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
+ { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
+ { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
+ { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
+ { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
+
+ { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
+ { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
+ { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
+ { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
+ { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
+ { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
+ { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
+ { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
+
+ { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
+ { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
+
+ { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
+ { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
+ { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
+ { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
+ { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
+ { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
+ { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
+ { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
+
+ { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
+ { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
+ { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
+ { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
+ { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
+ { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
+ { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
+
+ { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
+ { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
+ { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
+
+ { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
+
+ { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
+ { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
+ { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
+ { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
+};
+
+/* AMD Registers Range */
+static struct cpu_debug_range cpu_amd_range[] = {
+ { 0x00000010, 0x00000010, CPU_TIME, CPU_ALL, },
+ { 0x0000001B, 0x0000001B, CPU_APIC, CPU_ALL, },
+ { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_ALL, },
+
+ { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_ALL, },
+ { 0x00000179, 0x0000017A, CPU_MC, CPU_ALL, },
+ { 0x0000017B, 0x0000017B, CPU_MC, CPU_ALL, },
+ { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_ALL, },
+ { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_ALL, },
+
+ { 0x00000200, 0x0000020F, CPU_MTRR, CPU_ALL, },
+ { 0x00000250, 0x00000250, CPU_MTRR, CPU_ALL, },
+ { 0x00000258, 0x00000259, CPU_MTRR, CPU_ALL, },
+ { 0x00000268, 0x0000026F, CPU_MTRR, CPU_ALL, },
+ { 0x00000277, 0x00000277, CPU_PAT, CPU_ALL, },
+ { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_ALL, },
+
+ { 0x00000400, 0x00000417, CPU_MC, CPU_ALL, },
+
+ { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_ALL, },
+ { 0xC0000081, 0xC0000084, CPU_CALL, CPU_ALL, },
+ { 0xC0000100, 0xC0000102, CPU_BASE, CPU_ALL, },
+ { 0xC0000103, 0xC0000103, CPU_TIME, CPU_ALL, },
+
+ { 0xC0000408, 0xC000040A, CPU_MC, CPU_ALL, },
+
+ { 0xc0010000, 0xc0010007, CPU_PMC, CPU_ALL, },
+ { 0xc0010010, 0xc0010010, CPU_MTRR, CPU_ALL, },
+ { 0xc0010016, 0xc001001A, CPU_MTRR, CPU_ALL, },
+ { 0xc001001D, 0xc001001D, CPU_MTRR, CPU_ALL, },
+ { 0xc0010030, 0xc0010035, CPU_BIOS, CPU_ALL, },
+ { 0xc0010056, 0xc0010056, CPU_SMM, CPU_ALL, },
+ { 0xc0010061, 0xc0010063, CPU_SMM, CPU_ALL, },
+ { 0xc0010074, 0xc0010074, CPU_MC, CPU_ALL, },
+ { 0xc0010111, 0xc0010113, CPU_SMM, CPU_ALL, },
+ { 0xc0010114, 0xc0010118, CPU_SVM, CPU_ALL, },
+ { 0xc0010119, 0xc001011A, CPU_SMM, CPU_ALL, },
+ { 0xc0010140, 0xc0010141, CPU_OSVM, CPU_ALL, },
+ { 0xc0010156, 0xc0010156, CPU_SMM, CPU_ALL, },
+};
+
+
+static int get_cpu_modelflag(unsigned cpu)
+{
+ int flag;
+
+ switch (per_cpu(cpu_model, cpu)) {
+ /* Intel */
+ case 0x0501:
+ case 0x0502:
+ case 0x0504:
+ flag = CPU_INTEL_PENTIUM;
+ break;
+ case 0x0601:
+ case 0x0603:
+ case 0x0605:
+ case 0x0607:
+ case 0x0608:
+ case 0x060A:
+ case 0x060B:
+ flag = CPU_INTEL_P6;
+ break;
+ case 0x0609:
+ case 0x060D:
+ flag = CPU_INTEL_PENTIUM_M;
+ break;
+ case 0x060E:
+ flag = CPU_INTEL_CORE;
+ break;
+ case 0x060F:
+ case 0x0617:
+ flag = CPU_INTEL_CORE2;
+ break;
+ case 0x061C:
+ flag = CPU_INTEL_ATOM;
+ break;
+ case 0x0F00:
+ case 0x0F01:
+ case 0x0F02:
+ case 0x0F03:
+ case 0x0F04:
+ flag = CPU_INTEL_XEON_P4;
+ break;
+ case 0x0F06:
+ flag = CPU_INTEL_XEON_MP;
+ break;
+ default:
+ flag = CPU_NONE;
+ break;
+ }
+
+ return flag;
+}
+
+static int get_cpu_range_count(unsigned cpu)
+{
+ int index;
+
+ switch (per_cpu(cpu_model, cpu) >> 16) {
+ case X86_VENDOR_INTEL:
+ index = ARRAY_SIZE(cpu_intel_range);
+ break;
+ case X86_VENDOR_AMD:
+ index = ARRAY_SIZE(cpu_amd_range);
+ break;
+ default:
+ index = 0;
+ break;
+ }
+
+ return index;
+}
+
+static int is_typeflag_valid(unsigned cpu, unsigned flag)
+{
+ unsigned vendor, modelflag;
+ int i, index;
+
+ /* Standard Registers should be always valid */
+ if (flag >= CPU_TSS)
+ return 1;
+
+ modelflag = per_cpu(cpu_modelflag, cpu);
+ vendor = per_cpu(cpu_model, cpu) >> 16;
+ index = get_cpu_range_count(cpu);
+
+ for (i = 0; i < index; i++) {
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if ((cpu_intel_range[i].model & modelflag) &&
+ (cpu_intel_range[i].flag & flag))
+ return 1;
+ break;
+ case X86_VENDOR_AMD:
+ if (cpu_amd_range[i].flag & flag)
+ return 1;
+ break;
+ }
+ }
+
+ /* Invalid */
+ return 0;
+}
+
+static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
+ int index, unsigned flag)
+{
+ unsigned modelflag;
+
+ modelflag = per_cpu(cpu_modelflag, cpu);
+ *max = 0;
+ switch (per_cpu(cpu_model, cpu) >> 16) {
+ case X86_VENDOR_INTEL:
+ if ((cpu_intel_range[index].model & modelflag) &&
+ (cpu_intel_range[index].flag & flag)) {
+ *min = cpu_intel_range[index].min;
+ *max = cpu_intel_range[index].max;
+ }
+ break;
+ case X86_VENDOR_AMD:
+ if (cpu_amd_range[index].flag & flag) {
+ *min = cpu_amd_range[index].min;
+ *max = cpu_amd_range[index].max;
+ }
+ break;
+ }
+
+ return *max;
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_cpu_data(struct seq_file *seq, unsigned type,
+ u32 low, u32 high)
+{
+ struct cpu_private *priv;
+ u64 val = high;
+
+ if (seq) {
+ priv = seq->private;
+ if (priv->file) {
+ val = (val << 32) | low;
+ seq_printf(seq, "0x%llx\n", val);
+ } else
+ seq_printf(seq, " %08x: %08x_%08x\n",
+ type, high, low);
+ } else
+ printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
+{
+ unsigned msr, msr_min, msr_max;
+ struct cpu_private *priv;
+ u32 low, high;
+ int i, range;
+
+ if (seq) {
+ priv = seq->private;
+ if (priv->file) {
+ if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
+ &low, &high))
+ print_cpu_data(seq, priv->reg, low, high);
+ return;
+ }
+ }
+
+ range = get_cpu_range_count(cpu);
+
+ for (i = 0; i < range; i++) {
+ if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
+ continue;
+
+ for (msr = msr_min; msr <= msr_max; msr++) {
+ if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
+ continue;
+ print_cpu_data(seq, msr, low, high);
+ }
+ }
+}
+
+static void print_tss(void *arg)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct seq_file *seq = arg;
+ unsigned int seg;
+
+ seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
+ seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
+ seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
+ seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
+
+ seq_printf(seq, " RSI\t: %016lx\n", regs->si);
+ seq_printf(seq, " RDI\t: %016lx\n", regs->di);
+ seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
+ seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
+
+#ifdef CONFIG_X86_64
+ seq_printf(seq, " R08\t: %016lx\n", regs->r8);
+ seq_printf(seq, " R09\t: %016lx\n", regs->r9);
+ seq_printf(seq, " R10\t: %016lx\n", regs->r10);
+ seq_printf(seq, " R11\t: %016lx\n", regs->r11);
+ seq_printf(seq, " R12\t: %016lx\n", regs->r12);
+ seq_printf(seq, " R13\t: %016lx\n", regs->r13);
+ seq_printf(seq, " R14\t: %016lx\n", regs->r14);
+ seq_printf(seq, " R15\t: %016lx\n", regs->r15);
+#endif
+
+ asm("movl %%cs,%0" : "=r" (seg));
+ seq_printf(seq, " CS\t: %04x\n", seg);
+ asm("movl %%ds,%0" : "=r" (seg));
+ seq_printf(seq, " DS\t: %04x\n", seg);
+ seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
+ asm("movl %%es,%0" : "=r" (seg));
+ seq_printf(seq, " ES\t: %04x\n", seg);
+ asm("movl %%fs,%0" : "=r" (seg));
+ seq_printf(seq, " FS\t: %04x\n", seg);
+ asm("movl %%gs,%0" : "=r" (seg));
+ seq_printf(seq, " GS\t: %04x\n", seg);
+
+ seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
+
+ seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
+}
+
+static void print_cr(void *arg)
+{
+ struct seq_file *seq = arg;
+
+ seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
+ seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
+ seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
+ seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
+#ifdef CONFIG_X86_64
+ seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
+#endif
+}
+
+static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
+{
+ seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
+}
+
+static void print_dt(void *seq)
+{
+ struct desc_ptr dt;
+ unsigned long ldt;
+
+ /* IDT */
+ store_idt((struct desc_ptr *)&dt);
+ print_desc_ptr("IDT", seq, dt);
+
+ /* GDT */
+ store_gdt((struct desc_ptr *)&dt);
+ print_desc_ptr("GDT", seq, dt);
+
+ /* LDT */
+ store_ldt(ldt);
+ seq_printf(seq, " LDT\t: %016lx\n", ldt);
+
+ /* TR */
+ store_tr(ldt);
+ seq_printf(seq, " TR\t: %016lx\n", ldt);
+}
+
+static void print_dr(void *arg)
+{
+ struct seq_file *seq = arg;
+ unsigned long dr;
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ /* Ignore db4, db5 */
+ if ((i == 4) || (i == 5))
+ continue;
+ get_debugreg(dr, i);
+ seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
+ }
+
+ seq_printf(seq, "\n MSR\t:\n");
+}
+
+static void print_apic(void *arg)
+{
+ struct seq_file *seq = arg;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(seq, " LAPIC\t:\n");
+ seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
+ seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
+ seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
+ seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
+ seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
+ seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
+ seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
+ seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
+ seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
+ seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
+ seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
+ seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
+ seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
+ seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
+ seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
+ seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
+ seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
+ seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
+ seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
+ seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
+ seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+ seq_printf(seq, "\n MSR\t:\n");
+}
+
+static int cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct cpu_private *priv = seq->private;
+
+ if (priv == NULL)
+ return -EINVAL;
+
+ switch (cpu_base[priv->type].flag) {
+ case CPU_TSS:
+ smp_call_function_single(priv->cpu, print_tss, seq, 1);
+ break;
+ case CPU_CR:
+ smp_call_function_single(priv->cpu, print_cr, seq, 1);
+ break;
+ case CPU_DT:
+ smp_call_function_single(priv->cpu, print_dt, seq, 1);
+ break;
+ case CPU_DEBUG:
+ if (priv->file == CPU_INDEX_BIT)
+ smp_call_function_single(priv->cpu, print_dr, seq, 1);
+ print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+ break;
+ case CPU_APIC:
+ if (priv->file == CPU_INDEX_BIT)
+ smp_call_function_single(priv->cpu, print_apic, seq, 1);
+ print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+ break;
+
+ default:
+ print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+ break;
+ }
+ seq_printf(seq, "\n");
+
+ return 0;
+}
+
+static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos == 0) /* One time is enough ;-) */
+ return seq;
+
+ return NULL;
+}
+
+static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ return cpu_seq_start(seq, pos);
+}
+
+static void cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations cpu_seq_ops = {
+ .start = cpu_seq_start,
+ .next = cpu_seq_next,
+ .stop = cpu_seq_stop,
+ .show = cpu_seq_show,
+};
+
+static int cpu_seq_open(struct inode *inode, struct file *file)
+{
+ struct cpu_private *priv = inode->i_private;
+ struct seq_file *seq;
+ int err;
+
+ err = seq_open(file, &cpu_seq_ops);
+ if (!err) {
+ seq = file->private_data;
+ seq->private = priv;
+ }
+
+ return err;
+}
+
+static int write_msr(struct cpu_private *priv, u64 val)
+{
+ u32 low, high;
+
+ high = (val >> 32) & 0xffffffff;
+ low = val & 0xffffffff;
+
+ if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
+ return 0;
+
+ return -EPERM;
+}
+
+static int write_cpu_register(struct cpu_private *priv, const char *buf)
+{
+ int ret = -EPERM;
+ u64 val;
+
+ ret = strict_strtoull(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ /* Supporting only MSRs */
+ if (priv->type < CPU_TSS_BIT)
+ return write_msr(priv, val);
+
+ return ret;
+}
+
+static ssize_t cpu_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ struct cpu_private *priv = seq->private;
+ char buf[19];
+
+ if ((priv == NULL) || (count >= sizeof(buf)))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+
+ if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
+ if (!write_cpu_register(priv, buf))
+ return count;
+
+ return -EACCES;
+}
+
+static const struct file_operations cpu_fops = {
+ .owner = THIS_MODULE,
+ .open = cpu_seq_open,
+ .read = seq_read,
+ .write = cpu_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
+ unsigned file, struct dentry *dentry)
+{
+ struct cpu_private *priv = NULL;
+
+ /* Already intialized */
+ if (file == CPU_INDEX_BIT)
+ if (per_cpu(cpu_arr[type].init, cpu))
+ return 0;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv == NULL)
+ return -ENOMEM;
+
+ priv->cpu = cpu;
+ priv->type = type;
+ priv->reg = reg;
+ priv->file = file;
+ mutex_lock(&cpu_debug_lock);
+ per_cpu(priv_arr[type], cpu) = priv;
+ per_cpu(cpu_priv_count, cpu)++;
+ mutex_unlock(&cpu_debug_lock);
+
+ if (file)
+ debugfs_create_file(cpu_file[file].name, S_IRUGO,
+ dentry, (void *)priv, &cpu_fops);
+ else {
+ debugfs_create_file(cpu_base[type].name, S_IRUGO,
+ per_cpu(cpu_arr[type].dentry, cpu),
+ (void *)priv, &cpu_fops);
+ mutex_lock(&cpu_debug_lock);
+ per_cpu(cpu_arr[type].init, cpu) = 1;
+ mutex_unlock(&cpu_debug_lock);
+ }
+
+ return 0;
+}
+
+static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
+ struct dentry *dentry)
+{
+ unsigned file;
+ int err = 0;
+
+ for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
+ err = cpu_create_file(cpu, type, reg, file, dentry);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
+{
+ struct dentry *cpu_dentry = NULL;
+ unsigned reg, reg_min, reg_max;
+ int i, range, err = 0;
+ char reg_dir[12];
+ u32 low, high;
+
+ range = get_cpu_range_count(cpu);
+
+ for (i = 0; i < range; i++) {
+ if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
+ cpu_base[type].flag))
+ continue;
+
+ for (reg = reg_min; reg <= reg_max; reg++) {
+ if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
+ continue;
+
+ sprintf(reg_dir, "0x%x", reg);
+ cpu_dentry = debugfs_create_dir(reg_dir, dentry);
+ err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
+ if (err)
+ return err;
+ }
+ }
+
+ return err;
+}
+
+static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
+{
+ struct dentry *cpu_dentry = NULL;
+ unsigned type;
+ int err = 0;
+
+ for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
+ if (!is_typeflag_valid(cpu, cpu_base[type].flag))
+ continue;
+ cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
+ per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
+
+ if (type < CPU_TSS_BIT)
+ err = cpu_init_msr(cpu, type, cpu_dentry);
+ else
+ err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
+ cpu_dentry);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+static int cpu_init_cpu(void)
+{
+ struct dentry *cpu_dentry = NULL;
+ struct cpuinfo_x86 *cpui;
+ char cpu_dir[12];
+ unsigned cpu;
+ int err = 0;
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ cpui = &cpu_data(cpu);
+ if (!cpu_has(cpui, X86_FEATURE_MSR))
+ continue;
+ per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
+ (cpui->x86 << 8) |
+ (cpui->x86_model));
+ per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
+
+ sprintf(cpu_dir, "cpu%d", cpu);
+ cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
+ err = cpu_init_allreg(cpu, cpu_dentry);
+
+ pr_info("cpu%d(%d) debug files %d\n",
+ cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
+ if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
+ pr_err("Register files count %d exceeds limit %d\n",
+ per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
+ per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
+ err = -ENFILE;
+ }
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+static int __init cpu_debug_init(void)
+{
+ cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
+
+ return cpu_init_cpu();
+}
+
+static void __exit cpu_debug_exit(void)
+{
+ int i, cpu;
+
+ if (cpu_debugfs_dir)
+ debugfs_remove_recursive(cpu_debugfs_dir);
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+ for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
+ kfree(per_cpu(priv_arr[i], cpu));
+}
+
+module_init(cpu_debug_init);
+module_exit(cpu_debug_exit);
+
+MODULE_AUTHOR("Jaswinder Singh Rajput");
+MODULE_DESCRIPTION("CPU Debug module");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c..22590cf688a 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
if (!data)
return -ENOMEM;
- data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+ data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
per_cpu(drv_data, cpu) = data;
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index b585e04cbc9..3178c3acd97 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = {
.name = "p4-clockmod",
.owner = THIS_MODULE,
.attr = p4clockmod_attr,
- .hide_interface = 1,
};
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index ffd0f5ed071..593171e967e 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
*/
static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
-static char Cx86_model[][9] __cpuinitdata = {
+static const char __cpuinitconst Cx86_model[][9] = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
-static char Cx486_name[][5] __cpuinitdata = {
+static const char __cpuinitconst Cx486_name[][5] = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
-static char Cx486S_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486S_name[][4] = {
"S", "S2", "Se", "S2e"
};
-static char Cx486D_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486D_name[][4] = {
"DX", "DX2", "?", "?", "?", "DX4"
};
static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
-static char cyrix_model_mult1[] __cpuinitdata = "12??43";
-static char cyrix_model_mult2[] __cpuinitdata = "12233445";
+static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
+static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
}
}
-static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_early_init = early_init_cyrix,
@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
cpu_dev_register(cyrix_cpu_dev);
-static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 25c559ba8d5..b09d4eb52bb 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -13,6 +13,7 @@
#include <asm/uaccess.h>
#include <asm/ds.h>
#include <asm/bugs.h>
+#include <asm/cpu.h>
#ifdef CONFIG_X86_64
#include <asm/topology.h>
@@ -53,6 +54,11 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
c->x86_cache_alignment = 128;
#endif
+ /* CPUID workaround for 0F33/0F34 CPU */
+ if (c->x86 == 0xF && c->x86_model == 0x3
+ && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+ c->x86_phys_bits = 36;
+
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
* with P/T states and does not stop in deep C-states
@@ -110,6 +116,28 @@ static void __cpuinit trap_init_f00f_bug(void)
}
#endif
+static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ /* calling is from identify_secondary_cpu() ? */
+ if (c->cpu_index == boot_cpu_id)
+ return;
+
+ /*
+ * Mask B, Pentium, but not Pentium MMX
+ */
+ if (c->x86 == 5 &&
+ c->x86_mask >= 1 && c->x86_mask <= 4 &&
+ c->x86_model <= 3) {
+ /*
+ * Remember we have B step Pentia with bugs
+ */
+ WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
+ "with B stepping processors.\n");
+ }
+#endif
+}
+
static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
{
unsigned long lo, hi;
@@ -186,6 +214,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_NUMAQ
numaq_tsc_disable();
#endif
+
+ intel_smp_check(c);
}
#else
static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
@@ -385,7 +415,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
}
#endif
-static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7293508d8f5..c471eb1a389 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -32,7 +32,7 @@ struct _cache_table
};
/* all the cache descriptor types we care about (no TLB or trace cache entries) */
-static struct _cache_table cache_table[] __cpuinitdata =
+static const struct _cache_table __cpuinitconst cache_table[] =
{
{ 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
{ 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
@@ -206,15 +206,15 @@ union l3_cache {
unsigned val;
};
-static unsigned short assocs[] __cpuinitdata = {
+static const unsigned short __cpuinitconst assocs[] = {
[1] = 1, [2] = 2, [4] = 4, [6] = 8,
[8] = 16, [0xa] = 32, [0xb] = 48,
[0xc] = 64,
[0xf] = 0xffff // ??
};
-static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
-static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
+static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
+static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
static void __cpuinit
amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb6..b2f89829bbe 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633..3552119b091 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
}
}
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
- old_cr4 = read_cr4();
- clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
- if (old_cr4 & X86_CR4_MCE)
- set_in_cr4(X86_CR4_MCE);
-}
-
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f..ca14604611e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
* Rest from unknown author(s).
* 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
*/
#include <linux/init.h>
@@ -24,6 +26,9 @@
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -32,7 +37,6 @@
#include <asm/idle.h>
#define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
atomic_t mce_entry;
@@ -47,7 +51,7 @@ static int mce_dont_init;
*/
static int tolerant = 1;
static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+ m->cpu = smp_processor_id();
+ rdtscll(m->tsc);
+}
+
/*
* Lockless MCE logging infrastructure.
* This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
print_symbol("{%s}", m->ip);
printk("\n");
}
- printk(KERN_EMERG "TSC %Lx ", m->tsc);
+ printk(KERN_EMERG "TSC %llx ", m->tsc);
if (m->addr)
- printk("ADDR %Lx ", m->addr);
+ printk("ADDR %llx ", m->addr);
if (m->misc)
- printk("MISC %Lx ", m->misc);
+ printk("MISC %llx ", m->misc);
printk("\n");
printk(KERN_EMERG "This is not a software problem!\n");
printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
panic(msg);
}
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
{
+ if (mce_dont_init)
+ return 0;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
}
/*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ struct mce m;
+ int i;
+
+ mce_setup(&m);
+
+ rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+ for (i = 0; i < banks; i++) {
+ if (!bank[i] || !test_bit(i, *b))
+ continue;
+
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+ m.tsc = 0;
+
+ barrier();
+ rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+ if (!(m.status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Uncorrected events are handled by the exception handler
+ * when it is enabled. But when the exception is disabled log
+ * everything.
+ *
+ * TBD do the same check for MCI_STATUS_EN here?
+ */
+ if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+ continue;
+
+ if (m.status & MCI_STATUS_MISCV)
+ rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+ if (m.status & MCI_STATUS_ADDRV)
+ rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+ if (!(flags & MCP_TIMESTAMP))
+ m.tsc = 0;
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+
+ mce_log(&m);
+ add_taint(TAINT_MACHINE_CHECK);
+
+ /*
+ * Clear state for this bank.
+ */
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
*/
void do_machine_check(struct pt_regs * regs, long error_code)
{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
* error.
*/
int kill_it = 0;
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
atomic_inc(&mce_entry);
- if ((regs
- && notify_die(DIE_NMI, "machine check", regs, error_code,
+ if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
- || !banks)
+ goto out2;
+ if (!banks)
goto out2;
- memset(&m, 0, sizeof(struct mce));
- m.cpu = smp_processor_id();
+ mce_setup(&m);
+
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
/* if the restart IP is not valid, we're done for */
if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
barrier();
for (i = 0; i < banks; i++) {
- if (i < NR_SYSFS_BANKS && !bank[i])
+ __clear_bit(i, toclear);
+ if (!bank[i])
continue;
m.misc = 0;
m.addr = 0;
m.bank = i;
- m.tsc = 0;
rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
if ((m.status & MCI_STATUS_VAL) == 0)
continue;
+ /*
+ * Non uncorrected errors are handled by machine_check_poll
+ * Leave them alone.
+ */
+ if ((m.status & MCI_STATUS_UC) == 0)
+ continue;
+
+ /*
+ * Set taint even when machine check was not enabled.
+ */
+ add_taint(TAINT_MACHINE_CHECK);
+
+ __set_bit(i, toclear);
+
if (m.status & MCI_STATUS_EN) {
/* if PCC was set, there's no way out */
no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
no_way_out = 1;
kill_it = 1;
}
+ } else {
+ /*
+ * Machine check event was not enabled. Clear, but
+ * ignore.
+ */
+ continue;
}
if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
mce_get_rip(&m, regs);
- if (error_code >= 0)
- rdtscll(m.tsc);
- if (error_code != -2)
- mce_log(&m);
+ mce_log(&m);
/* Did this bank cause the exception? */
/* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
panicm = m;
panicm_found = 1;
}
-
- add_taint(TAINT_MACHINE_CHECK);
}
- /* Never do anything final in the polling timer */
- if (!regs)
- goto out;
-
/* If we didn't find an uncorrectable error, pick
the last one (shouldn't happen, just being safe). */
if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
/* notify userspace ASAP */
set_thread_flag(TIF_MCE_NOTIFY);
- out:
/* the last thing we do is clear state */
- for (i = 0; i < banks; i++)
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ for (i = 0; i < banks; i++) {
+ if (test_bit(i, toclear))
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
wrmsrl(MSR_IA32_MCG_STATUS, 0);
out2:
atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
* and historically has been the register value of the
* MSR_IA32_THERMAL_STATUS (Intel) msr.
*/
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
{
struct mce m;
- memset(&m, 0, sizeof(m));
- m.cpu = cpu;
+ mce_setup(&m);
m.bank = MCE_THERMAL_BANK;
m.status = status;
- rdtscll(m.tsc);
mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
{
- if (mce_available(&current_cpu_data))
- do_machine_check(NULL, 0);
-}
+ struct timer_list *t = &per_cpu(mce_timer, data);
-static void mcheck_timer(struct work_struct *work)
-{
- on_each_cpu(mcheck_check_cpu, NULL, 1);
+ WARN_ON(smp_processor_id() != data);
+
+ if (mce_available(&current_cpu_data))
+ machine_check_poll(MCP_TIMESTAMP,
+ &__get_cpu_var(mce_poll_banks));
/*
* Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
(int)round_jiffies_relative(check_interval*HZ));
}
- schedule_delayed_work(&mcheck_work, next_interval);
+ t->expires = jiffies + next_interval;
+ add_timer(t);
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
/*
- * This is only called from process context. This is where we do
- * anything we need to alert userspace about new MCEs. This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
*/
int mce_notify_user(void)
{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
clear_thread_flag(TIF_MCE_NOTIFY);
if (test_and_clear_bit(0, &notify_user)) {
- static unsigned long last_print;
- unsigned long now = jiffies;
-
wake_up_interruptible(&mce_wait);
- if (trigger[0])
- call_usermodehelper(trigger, trigger_argv, NULL,
- UMH_NO_WAIT);
- if (time_after_eq(now, last_print + (check_interval*HZ))) {
- last_print = now;
+ /*
+ * There is no risk of missing notifications because
+ * work_pending is always cleared before the function is
+ * executed.
+ */
+ if (trigger[0] && !work_pending(&mce_trigger_work))
+ schedule_work(&mce_trigger_work);
+
+ if (__ratelimit(&ratelimit))
printk(KERN_INFO "Machine check events logged\n");
- }
return 1;
}
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
static __init int periodic_mcheck_init(void)
{
- next_interval = check_interval * HZ;
- if (next_interval)
- schedule_delayed_work(&mcheck_work,
- round_jiffies_relative(next_interval));
- idle_notifier_register(&mce_idle_notifier);
- return 0;
+ idle_notifier_register(&mce_idle_notifier);
+ return 0;
}
__initcall(periodic_mcheck_init);
-
/*
* Initialize Machine Checks for a CPU.
*/
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
{
u64 cap;
- int i;
+ unsigned b;
rdmsrl(MSR_IA32_MCG_CAP, cap);
- banks = cap & 0xff;
- if (banks > MCE_EXTENDED_BANK) {
- banks = MCE_EXTENDED_BANK;
- printk(KERN_INFO "MCE: warning: using only %d banks\n",
- MCE_EXTENDED_BANK);
+ b = cap & 0xff;
+ if (b > MAX_NR_BANKS) {
+ printk(KERN_WARNING
+ "MCE: Using only %u machine check banks out of %u\n",
+ MAX_NR_BANKS, b);
+ b = MAX_NR_BANKS;
}
+
+ /* Don't support asymmetric configurations today */
+ WARN_ON(banks != 0 && b != banks);
+ banks = b;
+ if (!bank) {
+ bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+ if (!bank)
+ return -ENOMEM;
+ memset(bank, 0xff, banks * sizeof(u64));
+ }
+
/* Use accurate RIP reporting if available. */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
rip_msr = MSR_IA32_MCG_EIP;
- /* Log the machine checks left over from the previous reset.
- This also clears all registers */
- do_machine_check(NULL, mce_bootlog ? -1 : -2);
+ return 0;
+}
+
+static void mce_init(void *dummy)
+{
+ u64 cap;
+ int i;
+ mce_banks_t all_banks;
+
+ /*
+ * Log the machine checks left over from the previous reset.
+ */
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC, &all_banks);
set_in_cr4(X86_CR4_MCE);
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
for (i = 0; i < banks; i++) {
- if (i < NR_SYSFS_BANKS)
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
- else
- wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
}
/* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
- if(c->x86 == 15)
+ if (c->x86 == 15 && banks > 4)
/* disable GART TBL walk error reporting, which trips off
incorrectly with the IOMMU & 3ware & Cerberus. */
- clear_bit(10, &bank[4]);
+ clear_bit(10, (unsigned long *)&bank[4]);
if(c->x86 <= 17 && mce_bootlog < 0)
/* Lots of broken BIOS around that don't clear them
by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
}
}
+static void mce_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+
+ /* data race harmless because everyone sets to the same value */
+ if (!next_interval)
+ next_interval = check_interval * HZ;
+ if (!next_interval)
+ return;
+ setup_timer(t, mcheck_timer, smp_processor_id());
+ t->expires = round_jiffies(jiffies + next_interval);
+ add_timer(t);
+}
+
/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off.
*/
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
- mce_cpu_quirks(c);
+ if (!mce_available(c))
+ return;
- if (mce_dont_init ||
- !mce_available(c))
+ if (mce_cap_init() < 0) {
+ mce_dont_init = 1;
return;
+ }
+ mce_cpu_quirks(c);
mce_init(NULL);
mce_cpu_features(c);
+ mce_init_timer();
}
/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
{
unsigned long *cpu_tsc;
static DEFINE_MUTEX(mce_read_mutex);
- unsigned next;
+ unsigned prev, next;
char __user *buf = ubuf;
int i, err;
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
}
err = 0;
- for (i = 0; i < next; i++) {
- unsigned long start = jiffies;
-
- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i,0, sizeof(struct mce));
- goto timeout;
+ prev = 0;
+ do {
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+
+ while (!mcelog.entry[i].finished) {
+ if (time_after_eq(jiffies, start + 2)) {
+ memset(mcelog.entry + i, 0,
+ sizeof(struct mce));
+ goto timeout;
+ }
+ cpu_relax();
}
- cpu_relax();
+ smp_rmb();
+ err |= copy_to_user(buf, mcelog.entry + i,
+ sizeof(struct mce));
+ buf += sizeof(struct mce);
+timeout:
+ ;
}
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
- buf += sizeof(struct mce);
- timeout:
- ;
- }
- memset(mcelog.entry, 0, next * sizeof(struct mce));
- mcelog.next = 0;
+ memset(mcelog.entry + prev, 0,
+ (next - prev) * sizeof(struct mce));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
synchronize_sched();
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
&mce_chrdev_ops,
};
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
- old_cr4 = read_cr4();
- clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
- if (old_cr4 & X86_CR4_MCE)
- set_in_cr4(X86_CR4_MCE);
-}
-
/*
* Old style boot options parsing. Only for compatibility.
*/
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
return 1;
}
-/* mce=off disables machine check. Note you can re-enable it later
- using sysfs.
+/* mce=off disables machine check.
mce=TOLERANCELEVEL (number, see above)
mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
* Sysfs support
*/
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+ int i;
+
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+ return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+ return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+ return mce_disable();
+}
+
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
Only one CPU is active at this time, the others get readded later using
CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
return 0;
}
+static void mce_cpu_restart(void *data)
+{
+ del_timer_sync(&__get_cpu_var(mce_timer));
+ if (mce_available(&current_cpu_data))
+ mce_init(NULL);
+ mce_init_timer();
+}
+
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
- if (next_interval)
- cancel_delayed_work(&mcheck_work);
- /* Timer race is harmless here */
- on_each_cpu(mce_init, NULL, 1);
next_interval = check_interval * HZ;
- if (next_interval)
- schedule_delayed_work(&mcheck_work,
- round_jiffies_relative(next_interval));
+ on_each_cpu(mce_cpu_restart, NULL, 1);
}
static struct sysdev_class mce_sysclass = {
+ .suspend = mce_suspend,
+ .shutdown = mce_shutdown,
.resume = mce_resume,
.name = "machinecheck",
};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ char *buf)
+{
+ u64 b = bank[attr - bank_attrs];
+ return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *end;
+ u64 new = simple_strtoull(buf, &end, 0);
+ if (end == buf)
+ return -EINVAL;
+ bank[attr - bank_attrs] = new;
+ mce_restart();
+ return end-buf;
+}
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval,check_interval,mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
- &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
- &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
NULL
};
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
if (err)
goto error;
}
+ for (i = 0; i < banks; i++) {
+ err = sysdev_create_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
+ if (err)
+ goto error2;
+ }
cpu_set(cpu, mce_device_initialized);
return 0;
+error2:
+ while (--i >= 0) {
+ sysdev_remove_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
+ }
error:
- while (i--) {
+ while (--i >= 0) {
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
}
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
for (i = 0; mce_attributes[i]; i++)
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
+ for (i = 0; i < banks; i++)
+ sysdev_remove_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
sysdev_unregister(&per_cpu(device_mce,cpu));
cpu_clear(cpu, mce_device_initialized);
}
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+ int i;
+ unsigned long action = *(unsigned long *)h;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_clear();
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void mce_reenable_cpu(void *h)
+{
+ int i;
+ unsigned long action = *(unsigned long *)h;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_reenable();
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
+ struct timer_list *t = &per_cpu(mce_timer, cpu);
switch (action) {
case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
threshold_cpu_callback(action, cpu);
mce_remove_device(cpu);
break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ del_timer_sync(t);
+ smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ t->expires = round_jiffies(jiffies + next_interval);
+ add_timer_on(t, cpu);
+ smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ break;
+ case CPU_POST_DEAD:
+ /* intentionally ignoring frozen here */
+ cmci_rediscover(cpu);
+ break;
}
return NOTIFY_OK;
}
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
.notifier_call = mce_cpu_callback,
};
+static __init int mce_init_banks(void)
+{
+ int i;
+
+ bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+ GFP_KERNEL);
+ if (!bank_attrs)
+ return -ENOMEM;
+
+ for (i = 0; i < banks; i++) {
+ struct sysdev_attribute *a = &bank_attrs[i];
+ a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+ if (!a->attr.name)
+ goto nomem;
+ a->attr.mode = 0644;
+ a->show = show_bank;
+ a->store = set_bank;
+ }
+ return 0;
+
+nomem:
+ while (--i >= 0)
+ kfree(bank_attrs[i].attr.name);
+ kfree(bank_attrs);
+ bank_attrs = NULL;
+ return -ENOMEM;
+}
+
static __init int mce_init_device(void)
{
int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
+
+ err = mce_init_banks();
+ if (err)
+ return err;
+
err = sysdev_class_register(&mce_sysclass);
if (err)
return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd46..c5a32f92d07 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
+static void amd_threshold_interrupt(void);
+
/*
* CPU Initialization
*/
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
tr.reset = 0;
tr.old_limit = 0;
threshold_restart_bank(&tr);
+
+ mce_threshold_vector = amd_threshold_interrupt;
}
}
}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
{
unsigned int bank, block;
struct mce m;
u32 low = 0, high = 0, address = 0;
- ack_APIC_irq();
- exit_idle();
- irq_enter();
-
- memset(&m, 0, sizeof(m));
- rdtscll(m.tsc);
- m.cpu = smp_processor_id();
+ mce_setup(&m);
/* assume first bank caused it */
for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
/* Log the machine check that caused the threshold
event. */
- do_machine_check(NULL, 0);
+ machine_check_poll(MCP_TIMESTAMP,
+ &__get_cpu_var(mce_poll_banks));
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
+ bank * NR_BLOCKS
+ block;
mce_log(&m);
- goto out;
+ return;
}
}
}
-out:
- inc_irq_stat(irq_threshold_count);
- irq_exit();
}
/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e..aaa7d973093 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
/*
* Intel specific MCE features.
* Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
*/
#include <linux/init.h>
@@ -13,6 +15,7 @@
#include <asm/hw_irq.h>
#include <asm/idle.h>
#include <asm/therm_throt.h>
+#include <asm/apic.h>
asmlinkage void smp_thermal_interrupt(void)
{
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & 1))
- mce_log_therm_throt_event(smp_processor_id(), msr_val);
+ mce_log_therm_throt_event(msr_val);
inc_irq_stat(irq_thermal_count);
irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+static int cmci_supported(int *banks)
+{
+ u64 cap;
+
+ /*
+ * Vendor check is not strictly needed, but the initial
+ * initialization is vendor keyed and this
+ * makes sure none of the backdoors are entered otherwise.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+ if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+ return 0;
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+ return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ mce_notify_user();
+}
+
+static void print_update(char *type, int *hdr, int num)
+{
+ if (*hdr == 0)
+ printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+ *hdr = 1;
+ printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static void cmci_discover(int banks, int boot)
+{
+ unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+ int hdr = 0;
+ int i;
+
+ spin_lock(&cmci_discover_lock);
+ for (i = 0; i < banks; i++) {
+ u64 val;
+
+ if (test_bit(i, owned))
+ continue;
+
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+ /* Already owned by someone else? */
+ if (val & CMCI_EN) {
+ if (test_and_clear_bit(i, owned) || boot)
+ print_update("SHD", &hdr, i);
+ __clear_bit(i, __get_cpu_var(mce_poll_banks));
+ continue;
+ }
+
+ val |= CMCI_EN | CMCI_THRESHOLD;
+ wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+ /* Did the enable bit stick? -- the bank supports CMCI */
+ if (val & CMCI_EN) {
+ if (!test_and_set_bit(i, owned) || boot)
+ print_update("CMCI", &hdr, i);
+ __clear_bit(i, __get_cpu_var(mce_poll_banks));
+ } else {
+ WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+ }
+ }
+ spin_unlock(&cmci_discover_lock);
+ if (hdr)
+ printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+ unsigned long flags;
+ int banks;
+
+ if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+ return;
+ local_irq_save(flags);
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+ int i;
+ int banks;
+ u64 val;
+
+ if (!cmci_supported(&banks))
+ return;
+ spin_lock(&cmci_discover_lock);
+ for (i = 0; i < banks; i++) {
+ if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+ continue;
+ /* Disable CMCI */
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+ wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ __clear_bit(i, __get_cpu_var(mce_banks_owned));
+ }
+ spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down cycle through all the others and rediscover
+ * Must run in process context.
+ */
+void cmci_rediscover(int dying)
+{
+ int banks;
+ int cpu;
+ cpumask_var_t old;
+
+ if (!cmci_supported(&banks))
+ return;
+ if (!alloc_cpumask_var(&old, GFP_KERNEL))
+ return;
+ cpumask_copy(old, &current->cpus_allowed);
+
+ for_each_online_cpu (cpu) {
+ if (cpu == dying)
+ continue;
+ if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+ continue;
+ /* Recheck banks in case CPUs don't all have the same */
+ if (cmci_supported(&banks))
+ cmci_discover(banks, 0);
+ }
+
+ set_cpus_allowed_ptr(current, old);
+ free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+ int banks;
+ if (cmci_supported(&banks))
+ cmci_discover(banks, 0);
+}
+
+static __cpuinit void intel_init_cmci(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ mce_threshold_vector = intel_threshold_interrupt;
+ cmci_discover(banks, 1);
+ /*
+ * For CPU #0 this runs with still disabled APIC, but that's
+ * ok because only the vector is set up. We still do another
+ * check for the banks later for CPU #0 just to make sure
+ * to not miss any events.
+ */
+ apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+ cmci_recheck();
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
+ intel_init_cmci();
}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 00000000000..23ee9e730f7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+
+static void default_threshold_interrupt(void)
+{
+ printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+ THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage void mce_threshold_interrupt(void)
+{
+ exit_idle();
+ irq_enter();
+ inc_irq_stat(irq_threshold_count);
+ mce_threshold_vector();
+ irq_exit();
+ /* Ack only at the end to avoid potential reentry */
+ ack_APIC_irq();
+}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 52b3fefbd5a..bb62b3e5caa 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
#endif
}
-static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
.c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index e777f79e096..fd2c37bf7ac 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -8,7 +8,7 @@
* so no special init takes place.
*/
-static struct cpu_dev umc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
.c_models = {
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 169a120587b..87b67e3a765 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
spin_unlock_irqrestore(&ds_lock, irq);
- ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
ds_resume_pebs(tracer);
return tracer;
@@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
void ds_exit_thread(struct task_struct *tsk)
{
- WARN_ON(tsk->thread.ds_ctx);
}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 508bec1cee2..95b81c18b6b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -110,19 +110,25 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
/*
* Add a memory region to the kernel e820 map.
*/
-void __init e820_add_region(u64 start, u64 size, int type)
+static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
+ int type)
{
- int x = e820.nr_map;
+ int x = e820x->nr_map;
- if (x == ARRAY_SIZE(e820.map)) {
+ if (x == ARRAY_SIZE(e820x->map)) {
printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
return;
}
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
+ e820x->map[x].addr = start;
+ e820x->map[x].size = size;
+ e820x->map[x].type = type;
+ e820x->nr_map++;
+}
+
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+ __e820_add_region(&e820, start, size, type);
}
void __init e820_print_map(char *who)
@@ -417,11 +423,11 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
return __append_e820_map(biosmap, nr_map);
}
-static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
u64 size, unsigned old_type,
unsigned new_type)
{
- int i;
+ unsigned int i;
u64 real_updated_size = 0;
BUG_ON(old_type == new_type);
@@ -429,7 +435,7 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
if (size > (ULLONG_MAX - start))
size = ULLONG_MAX - start;
- for (i = 0; i < e820.nr_map; i++) {
+ for (i = 0; i < e820x->nr_map; i++) {
struct e820entry *ei = &e820x->map[i];
u64 final_start, final_end;
if (ei->type != old_type)
@@ -446,10 +452,16 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
final_end = min(start + size, ei->addr + ei->size);
if (final_start >= final_end)
continue;
- e820_add_region(final_start, final_end - final_start,
- new_type);
+
+ __e820_add_region(e820x, final_start, final_end - final_start,
+ new_type);
+
real_updated_size += final_end - final_start;
+ /*
+ * left range could be head or tail, so need to update
+ * size at first.
+ */
ei->size -= final_end - final_start;
if (ei->addr < final_start)
continue;
@@ -461,13 +473,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
unsigned new_type)
{
- return e820_update_range_map(&e820, start, size, old_type, new_type);
+ return __e820_update_range(&e820, start, size, old_type, new_type);
}
static u64 __init e820_update_range_saved(u64 start, u64 size,
unsigned old_type, unsigned new_type)
{
- return e820_update_range_map(&e820_saved, start, size, old_type,
+ return __e820_update_range(&e820_saved, start, size, old_type,
new_type);
}
@@ -1020,8 +1032,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
continue;
return addr;
}
- return -1UL;
+ return -1ULL;
}
/*
@@ -1034,13 +1046,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
u64 start;
start = startt;
- while (size < sizet)
+ while (size < sizet && (start + 1))
start = find_e820_area_size(start, &size, align);
if (size < sizet)
return 0;
+#ifdef CONFIG_X86_32
+ if (start >= MAXMEM)
+ return 0;
+ if (start + size > MAXMEM)
+ size = MAXMEM - start;
+#endif
+
addr = round_down(start + size - sizet, align);
+ if (addr < start)
+ return 0;
e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
printk(KERN_INFO "update e820 for early_reserve_e820\n");
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 639ad98238a..335f049d110 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void)
return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
}
-static void dbgp_mdelay(int ms)
+static void __init dbgp_mdelay(int ms)
{
int i;
@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size)
writel(hi, &ehci_debug->data47);
}
-static void dbgp_get_data(void *buf, int size)
+static void __init dbgp_get_data(void *buf, int size)
{
unsigned char *bytes = buf;
u32 lo, hi;
@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
return ret;
}
-static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
int size)
{
u32 pids, addr, ctrl;
@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
return ret;
}
-static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
- int value, int index, void *data, int size)
+static int __init dbgp_control_msg(unsigned devnum, int requesttype,
+ int request, int value, int index, void *data, int size)
{
u32 pids, addr, ctrl;
struct usb_ctrlrequest req;
@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
return 0;
}
-static int ehci_reset_port(int port)
+static int __init ehci_reset_port(int port)
{
u32 portsc;
u32 delay_time, delay;
@@ -532,7 +532,7 @@ static int ehci_reset_port(int port)
return -EBUSY;
}
-static int ehci_wait_for_port(int port)
+static int __init ehci_wait_for_port(int port)
{
u32 status;
int ret, reps;
@@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { }
typedef void (*set_debug_port_t)(int port);
-static void default_set_debug_port(int port)
+static void __init default_set_debug_port(int port)
{
}
-static set_debug_port_t set_debug_port = default_set_debug_port;
+static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
-static void nvidia_set_debug_port(int port)
+static void __init nvidia_set_debug_port(int port)
{
u32 dword;
dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index b205272ad39..1736acc4d7a 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void)
efi_memory_desc_t *md;
efi_status_t status;
unsigned long size;
- u64 end, systab, addr, npages;
+ u64 end, systab, addr, npages, end_pfn;
void *p, *va;
efi.systab = NULL;
@@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void)
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
- if (PFN_UP(end) <= max_low_pfn_mapped)
+ end_pfn = PFN_UP(end);
+ if (end_pfn <= max_low_pfn_mapped
+ || (end_pfn > (1UL << (32 - PAGE_SHIFT))
+ && end_pfn <= max_pfn_mapped))
va = __va(md->phys_addr);
else
va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index a4ee29127fd..22c3b7828c5 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void)
void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
{
- static unsigned pages_mapped __initdata;
- unsigned i, pages;
- unsigned long offset;
+ unsigned long last_map_pfn;
- pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr);
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
-
- if (pages_mapped + pages > MAX_EFI_IO_PAGES)
+ last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
+ if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
return NULL;
- for (i = 0; i < pages; i++) {
- __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
- phys_addr, PAGE_KERNEL);
- phys_addr += PAGE_SIZE;
- pages_mapped++;
- }
-
- return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
- (pages_mapped - pages)) + offset;
+ return (void __iomem *)__va(phys_addr);
}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 899e8938e79..c929add475c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -442,8 +442,7 @@ sysenter_past_esp:
GET_THREAD_INFO(%ebp)
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz sysenter_audit
sysenter_do_call:
cmpl $(nr_syscalls), %eax
@@ -454,7 +453,7 @@ sysenter_do_call:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $_TIF_ALLWORK_MASK, %cx
+ testl $_TIF_ALLWORK_MASK, %ecx
jne sysexit_audit
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
@@ -468,7 +467,7 @@ sysenter_exit:
#ifdef CONFIG_AUDITSYSCALL
sysenter_audit:
- testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
addl $4,%esp
CFI_ADJUST_CFA_OFFSET -4
@@ -485,7 +484,7 @@ sysenter_audit:
jmp sysenter_do_call
sysexit_audit:
- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
@@ -498,7 +497,7 @@ sysexit_audit:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
movl PT_EAX(%esp),%eax /* reload syscall return value */
jmp sysenter_exit
@@ -523,8 +522,7 @@ ENTRY(system_call)
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation / emulation
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -538,7 +536,7 @@ syscall_exit:
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $_TIF_ALLWORK_MASK, %cx # current->work
+ testl $_TIF_ALLWORK_MASK, %ecx # current->work
jne syscall_exit_work
restore_all:
@@ -673,7 +671,7 @@ END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
- testb $_TIF_WORK_SYSCALL_EXIT, %cl
+ testl $_TIF_WORK_SYSCALL_EXIT, %ecx
jz work_pending
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 83d1836b946..a331ec38af9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -368,6 +368,7 @@ ENTRY(save_rest)
END(save_rest)
/* save complete stack frame */
+ .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -396,6 +397,7 @@ ENTRY(save_paranoid)
1: ret
CFI_ENDPROC
END(save_paranoid)
+ .popsection
/*
* A newly forked process directly context switches into this address.
@@ -416,7 +418,6 @@ ENTRY(ret_from_fork)
GET_THREAD_INFO(%rcx)
- CFI_REMEMBER_STATE
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -428,7 +429,6 @@ ENTRY(ret_from_fork)
RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
- CFI_RESTORE_STATE
CFI_ENDPROC
END(ret_from_fork)
@@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt GENERIC_INTERRUPT_VECTOR \
+ generic_interrupt smp_generic_interrupt
#ifdef CONFIG_SMP
apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0dcd0..f2f8540a7f3 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
#ifdef CONFIG_X86_32
if (!HAVE_HWFP) {
memset(tsk->thread.xstate, 0, xstate_size);
- finit();
+ finit_task(tsk);
set_stopped_child_used_math(tsk);
return 0;
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f13ca1650aa..b8ac3b6cf77 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -15,6 +15,9 @@
atomic_t irq_err_count;
+/* Function pointer for generic interrupt vector handling */
+void (*generic_interrupt_extension)(void) = NULL;
+
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
@@ -42,55 +45,61 @@ void ack_bad_irq(unsigned int irq)
/*
* /proc/interrupts printing:
*/
-static int show_other_interrupts(struct seq_file *p)
+static int show_other_interrupts(struct seq_file *p, int prec)
{
int j;
- seq_printf(p, "NMI: ");
+ seq_printf(p, "%*s: ", prec, "NMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
seq_printf(p, " Non-maskable interrupts\n");
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
+ seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
seq_printf(p, " Local timer interrupts\n");
#endif
+ if (generic_interrupt_extension) {
+ seq_printf(p, "PLT: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+ seq_printf(p, " Platform interrupts\n");
+ }
#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
+ seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
+ seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
seq_printf(p, " Function call interrupts\n");
- seq_printf(p, "TLB: ");
+ seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
+ seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
# ifdef CONFIG_X86_64
- seq_printf(p, "THR: ");
+ seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
# endif
#endif
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "SPU: ");
+ seq_printf(p, "%*s: ", prec, "SPU");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+ seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+ seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
return 0;
}
@@ -98,19 +107,22 @@ static int show_other_interrupts(struct seq_file *p)
int show_interrupts(struct seq_file *p, void *v)
{
unsigned long flags, any_count = 0;
- int i = *(loff_t *) v, j;
+ int i = *(loff_t *) v, j, prec;
struct irqaction *action;
struct irq_desc *desc;
if (i > nr_irqs)
return 0;
+ for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+ j *= 10;
+
if (i == nr_irqs)
- return show_other_interrupts(p);
+ return show_other_interrupts(p, prec);
/* print header */
if (i == 0) {
- seq_printf(p, " ");
+ seq_printf(p, "%*s", prec + 8, "");
for_each_online_cpu(j)
seq_printf(p, "CPU%-8d", j);
seq_putc(p, '\n');
@@ -131,7 +143,7 @@ int show_interrupts(struct seq_file *p, void *v)
if (!action && !any_count)
goto out;
- seq_printf(p, "%3d: ", i);
+ seq_printf(p, "%*d: ", prec, i);
#ifndef CONFIG_SMP
seq_printf(p, "%10u ", kstat_irqs(i));
#else
@@ -163,6 +175,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
#endif
+ if (generic_interrupt_extension)
+ sum += irq_stats(cpu)->generic_irqs;
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
@@ -226,4 +240,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
return 1;
}
+/*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+void smp_generic_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ ack_APIC_irq();
+
+ exit_idle();
+
+ irq_enter();
+
+ inc_irq_stat(generic_irqs);
+
+ if (generic_interrupt_extension)
+ generic_interrupt_extension();
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+}
+
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9dc6b2b2427..3b09634a515 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
+#include <linux/percpu.h>
#include <asm/apic.h>
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
union irq_ctx {
struct thread_info tinfo;
u32 stack[THREAD_SIZE/sizeof(u32)];
-};
+} __attribute__((aligned(PAGE_SIZE)));
-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
+static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
-static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
-static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
static void call_on_stack(void *func, void *stack)
{
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
u32 *isp, arg1, arg2;
curctx = (union irq_ctx *) current_thread_info();
- irqctx = hardirq_ctx[smp_processor_id()];
+ irqctx = __get_cpu_var(hardirq_ctx);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
{
union irq_ctx *irqctx;
- if (hardirq_ctx[cpu])
+ if (per_cpu(hardirq_ctx, cpu))
return;
- irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+ irqctx = &per_cpu(hardirq_stack, cpu);
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
- hardirq_ctx[cpu] = irqctx;
+ per_cpu(hardirq_ctx, cpu) = irqctx;
- irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
+ irqctx = &per_cpu(softirq_stack, cpu);
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = 0;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
- softirq_ctx[cpu] = irqctx;
+ per_cpu(softirq_ctx, cpu) = irqctx;
printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
- cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
+ cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
}
void irq_ctx_exit(int cpu)
{
- hardirq_ctx[cpu] = NULL;
+ per_cpu(hardirq_ctx, cpu) = NULL;
}
asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
if (local_softirq_pending()) {
curctx = current_thread_info();
- irqctx = softirq_ctx[smp_processor_id()];
+ irqctx = __get_cpu_var(softirq_ctx);
irqctx->tinfo.task = curctx->task;
irqctx->tinfo.previous_esp = current_stack_pointer;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 50b8c3a3006..bc132610544 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -175,6 +175,9 @@ void __init native_init_IRQ(void)
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+ /* generic IPI for platform specific use */
+ alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f3..c7a49e0ffbf 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -147,6 +147,9 @@ static void __init apic_intr_init(void)
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+ /* generic IPI for platform specific use */
+ alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index f5fc8c781a6..e7368c1da01 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
#include <linux/ftrace.h>
#include <linux/suspend.h>
#include <linux/gfp.h>
+#include <linux/io.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
-#include <asm/io.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
"\tmovl %%eax,%%fs\n"
"\tmovl %%eax,%%gs\n"
"\tmovl %%eax,%%ss\n"
- ::: "eax", "memory");
+ : : : "eax", "memory");
#undef STR
#undef __STR
}
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
- /* We need to put APICs in legacy mode so that we can
+ /*
+ * We need to put APICs in legacy mode so that we can
* get timer interrupts in second kernel. kexec/kdump
* paths already have calls to disable_IO_APIC() in
* one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
<< PAGE_SHIFT);
- /* The segment registers are funny things, they have both a
+ /*
+ * The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
* set to a specific selector, the invisible part is loaded
* with from a table in memory. At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
* segments, before I zap the gdt with an invalid value.
*/
load_segments();
- /* The gdt & idt are now invalid.
+ /*
+ * The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0),0);
- set_idt(phys_to_virt(0),0);
+ set_gdt(phys_to_virt(0), 0);
+ set_idt(phys_to_virt(0), 0);
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6993d51b7fd..89cea4d4467 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,11 +12,47 @@
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
+#include <linux/io.h>
+#include <linux/suspend.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
-#include <asm/io.h>
+
+static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
+ unsigned long addr)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ struct page *page;
+ int result = -ENOMEM;
+
+ addr &= PMD_MASK;
+ pgd += pgd_index(addr);
+ if (!pgd_present(*pgd)) {
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page)
+ goto out;
+ pud = (pud_t *)page_address(page);
+ memset(pud, 0, PAGE_SIZE);
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud)) {
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page)
+ goto out;
+ pmd = (pmd_t *)page_address(page);
+ memset(pmd, 0, PAGE_SIZE);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+ result = 0;
+out:
+ return result;
+}
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
@@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
}
level3p = (pud_t *)page_address(page);
result = init_level3_page(image, level3p, addr, last_addr);
- if (result) {
+ if (result)
goto out;
- }
set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
addr += PGDIR_SIZE;
}
@@ -156,6 +191,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
if (result)
return result;
+ /*
+ * image->start may be outside 0 ~ max_pfn, for example when
+ * jump back to original kernel from kexeced kernel
+ */
+ result = init_one_level2_page(image, level4p, image->start);
+ if (result)
+ return result;
return init_transition_pgtable(image, level4p);
}
@@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
void *control_page;
+ int save_ftrace_enabled;
- tracer_disable();
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context)
+ save_processor_state();
+#endif
+
+ save_ftrace_enabled = __ftrace_enabled_save();
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * We need to put APICs in legacy mode so that we can
+ * get timer interrupts in second kernel. kexec/kdump
+ * paths already have calls to disable_IO_APIC() in
+ * one form or other. kexec jump path also need
+ * one.
+ */
+ disable_IO_APIC();
+#endif
+ }
+
control_page = page_address(image->control_code_page) + PAGE_SIZE;
- memcpy(control_page, relocate_kernel, PAGE_SIZE);
+ memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
page_list[PA_TABLE_PAGE] =
(unsigned long)__pa(page_address(image->control_code_page));
- /* The segment registers are funny things, they have both a
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+ << PAGE_SHIFT);
+
+ /*
+ * The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
* set to a specific selector, the invisible part is loaded
* with from a table in memory. At no other time is the
@@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image)
* segments, before I zap the gdt with an invalid value.
*/
load_segments();
- /* The gdt & idt are now invalid.
+ /*
+ * The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0),0);
- set_idt(phys_to_virt(0),0);
+ set_gdt(phys_to_virt(0), 0);
+ set_idt(phys_to_virt(0), 0);
/* now call it */
- relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
- image->start);
+ image->start = relocate_kernel((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start,
+ image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context)
+ restore_processor_state();
+#endif
+
+ __ftrace_enabled_restore(save_ftrace_enabled);
}
void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 666e43df51f..712d15fdc41 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
return 0;
}
-static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
+static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
{
.callback = set_check_enable_amd_mmconf,
.ident = "Sun Microsystems Machine",
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bda1ba..47673e02ae5 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
static struct mpf_intel *mpf_found;
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ mpc = early_ioremap(physptr, PAGE_SIZE);
+ size = mpc->length;
+ early_iounmap(mpc, PAGE_SIZE);
+ apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+
+ return size;
+}
+
/*
* Scan the memory blocks for an SMP configuration block.
*/
@@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early)
construct_default_ISA_mptable(mpf->feature1);
} else if (mpf->physptr) {
+ struct mpc_table *mpc;
+ unsigned long size;
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_ioremap(mpf->physptr, size);
/*
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
+ if (!smp_read_mpc(mpc, early)) {
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
#endif
@@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early)
"BIOS bug, MP table errors detected!...\n");
printk(KERN_ERR "... disabling SMP support. "
"(tell your hw vendor)\n");
+ early_iounmap(mpc, size);
return;
}
+ early_iounmap(mpc, size);
if (early)
return;
@@ -697,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
if (!reserve)
return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+ reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
BOOTMEM_DEFAULT);
if (mpf->physptr) {
- unsigned long size = PAGE_SIZE;
+ unsigned long size = get_mpc_size(mpf->physptr);
#ifdef CONFIG_X86_32
/*
* We cannot access to MPC table to compute
@@ -871,12 +890,12 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
#ifdef CONFIG_X86_IO_APIC
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
- printk(KERN_INFO "OLD ");
+ apic_printk(APIC_VERBOSE, "OLD ");
print_MP_intsrc_info(m);
i = get_MP_intsrc_index(m);
if (i > 0) {
assign_to_mpc_intsrc(&mp_irqs[i], m);
- printk(KERN_INFO "NEW ");
+ apic_printk(APIC_VERBOSE, "NEW ");
print_mp_irq_info(&mp_irqs[i]);
} else if (!i) {
/* legacy, do nothing */
@@ -924,7 +943,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
continue;
if (nr_m_spare > 0) {
- printk(KERN_INFO "*NEW* found ");
+ apic_printk(APIC_VERBOSE, "*NEW* found\n");
nr_m_spare--;
assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
m_spare[nr_m_spare] = NULL;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3d9672e59c1..19378715f41 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -685,9 +685,8 @@ static int ptrace_bts_config(struct task_struct *child,
if (!cfg.signal)
return -EINVAL;
- return -EOPNOTSUPP;
-
child->thread.bts_ovfl_signal = cfg.signal;
+ return -EOPNOTSUPP;
}
if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 309949e9e1c..6a5a2970f4c 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
if (!force_hpet_address)
return;
- if (rcba_base == NULL)
- BUG();
+ BUG_ON(rcba_base == NULL);
/* read the Function Disable register, dword mode only */
val = readl(rcba_base + 0x3404);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1cc18d439bb..2aef36d8aca 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
},
},
+ { /* Handle problems with rebooting on Dell XPS710 */
+ .callback = set_bios_reboot,
+ .ident = "Dell XPS710",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+ },
+ },
{ }
};
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 2064d0aa8d2..41235531b11 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -17,7 +17,8 @@
#define PTR(x) (x << 2)
-/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
* ~ control_page + PAGE_SIZE are used as data storage and stack for
* jumping back
*/
@@ -76,8 +77,10 @@ relocate_kernel:
movl %eax, CP_PA_SWAP_PAGE(%edi)
movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
- /* get physical address of control page now */
- /* this is impossible after page table switch */
+ /*
+ * get physical address of control page now
+ * this is impossible after page table switch
+ */
movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
/* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
/* store the start address on the stack */
pushl %edx
- /* Set cr0 to a known state:
+ /*
+ * Set cr0 to a known state:
* - Paging disabled
* - Alignment check disabled
* - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
/* clear cr4 if applicable */
testl %ecx, %ecx
jz 1f
- /* Set cr4 to a known state:
+ /*
+ * Set cr4 to a known state:
* Setting everything to zero seems safe.
*/
xorl %eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
call swap_pages
addl $8, %esp
- /* To be certain of avoiding problems with self-modifying code
+ /*
+ * To be certain of avoiding problems with self-modifying code
* I need to execute a serializing instruction here.
* So I flush the TLB, it's handy, and not processor dependent.
*/
xorl %eax, %eax
movl %eax, %cr3
- /* set all of the registers to known values */
- /* leave %esp alone */
+ /*
+ * set all of the registers to known values
+ * leave %esp alone
+ */
testl %esi, %esi
jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index d32cfb27a47..4de8f5b3d47 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -19,29 +19,77 @@
#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define RSP DATA(0x0)
+#define CR0 DATA(0x8)
+#define CR3 DATA(0x10)
+#define CR4 DATA(0x18)
+
+/* other data */
+#define CP_PA_TABLE_PAGE DATA(0x20)
+#define CP_PA_SWAP_PAGE DATA(0x28)
+#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
+
.text
.align PAGE_SIZE
.code64
.globl relocate_kernel
relocate_kernel:
- /* %rdi indirection_page
+ /*
+ * %rdi indirection_page
* %rsi page_list
* %rdx start address
+ * %rcx preserve_context
*/
+ /* Save the CPU context, used for jumping back */
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushf
+
+ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
+ movq %rsp, RSP(%r11)
+ movq %cr0, %rax
+ movq %rax, CR0(%r11)
+ movq %cr3, %rax
+ movq %rax, CR3(%r11)
+ movq %cr4, %rax
+ movq %rax, CR4(%r11)
+
/* zero out flags, and disable interrupts */
pushq $0
popfq
- /* get physical address of control page now */
- /* this is impossible after page table switch */
+ /*
+ * get physical address of control page now
+ * this is impossible after page table switch
+ */
movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
/* get physical address of page table now too */
- movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
+ movq PTR(PA_TABLE_PAGE)(%rsi), %r9
+
+ /* get physical address of swap page now */
+ movq PTR(PA_SWAP_PAGE)(%rsi), %r10
+
+ /* save some information for jumping back */
+ movq %r9, CP_PA_TABLE_PAGE(%r11)
+ movq %r10, CP_PA_SWAP_PAGE(%r11)
+ movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
/* Switch to the identity mapped page tables */
- movq %rcx, %cr3
+ movq %r9, %cr3
/* setup a new stack at the end of the physical control page */
lea PAGE_SIZE(%r8), %rsp
@@ -55,7 +103,8 @@ identity_mapped:
/* store the start address on the stack */
pushq %rdx
- /* Set cr0 to a known state:
+ /*
+ * Set cr0 to a known state:
* - Paging enabled
* - Alignment check disabled
* - Write protect disabled
@@ -68,7 +117,8 @@ identity_mapped:
orl $(X86_CR0_PG | X86_CR0_PE), %eax
movq %rax, %cr0
- /* Set cr4 to a known state:
+ /*
+ * Set cr4 to a known state:
* - physical address extension enabled
*/
movq $X86_CR4_PAE, %rax
@@ -78,9 +128,87 @@ identity_mapped:
1:
/* Flush the TLB (needed?) */
- movq %rcx, %cr3
+ movq %r9, %cr3
+
+ movq %rcx, %r11
+ call swap_pages
+
+ /*
+ * To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB by reloading %cr3 here, it's handy,
+ * and not processor dependent.
+ */
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ /*
+ * set all of the registers to known values
+ * leave %rsp alone
+ */
+
+ testq %r11, %r11
+ jnz 1f
+ xorq %rax, %rax
+ xorq %rbx, %rbx
+ xorq %rcx, %rcx
+ xorq %rdx, %rdx
+ xorq %rsi, %rsi
+ xorq %rdi, %rdi
+ xorq %rbp, %rbp
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r9
+ xorq %r11, %r11
+ xorq %r12, %r12
+ xorq %r13, %r13
+ xorq %r14, %r14
+ xorq %r15, %r15
+
+ ret
+
+1:
+ popq %rdx
+ leaq PAGE_SIZE(%r10), %rsp
+ call *%rdx
+
+ /* get the re-entry point of the peer system */
+ movq 0(%rsp), %rbp
+ call 1f
+1:
+ popq %r8
+ subq $(1b - relocate_kernel), %r8
+ movq CP_PA_SWAP_PAGE(%r8), %r10
+ movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
+ movq CP_PA_TABLE_PAGE(%r8), %rax
+ movq %rax, %cr3
+ lea PAGE_SIZE(%r8), %rsp
+ call swap_pages
+ movq $virtual_mapped, %rax
+ pushq %rax
+ ret
+
+virtual_mapped:
+ movq RSP(%r8), %rsp
+ movq CR4(%r8), %rax
+ movq %rax, %cr4
+ movq CR3(%r8), %rax
+ movq CR0(%r8), %r8
+ movq %rax, %cr3
+ movq %r8, %cr0
+ movq %rbp, %rax
+
+ popf
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ ret
/* Do the copies */
+swap_pages:
movq %rdi, %rcx /* Put the page_list in %rcx */
xorq %rdi, %rdi
xorq %rsi, %rsi
@@ -112,36 +240,27 @@ identity_mapped:
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
+ movq %rdi, %rdx
+ movq %rsi, %rax
+
+ movq %r10, %rdi
movq $512, %rcx
rep ; movsq
- jmp 0b
-3:
-
- /* To be certain of avoiding problems with self-modifying code
- * I need to execute a serializing instruction here.
- * So I flush the TLB by reloading %cr3 here, it's handy,
- * and not processor dependent.
- */
- movq %cr3, %rax
- movq %rax, %cr3
- /* set all of the registers to known values */
- /* leave %rsp alone */
+ movq %rax, %rdi
+ movq %rdx, %rsi
+ movq $512, %rcx
+ rep ; movsq
- xorq %rax, %rax
- xorq %rbx, %rbx
- xorq %rcx, %rcx
- xorq %rdx, %rdx
- xorq %rsi, %rsi
- xorq %rdi, %rdi
- xorq %rbp, %rbp
- xorq %r8, %r8
- xorq %r9, %r9
- xorq %r10, %r9
- xorq %r11, %r11
- xorq %r12, %r12
- xorq %r13, %r13
- xorq %r14, %r14
- xorq %r15, %r15
+ movq %rdx, %rdi
+ movq %r10, %rsi
+ movq $512, %rcx
+ rep ; movsq
+ lea PAGE_SIZE(%rax), %rsi
+ jmp 0b
+3:
ret
+
+ .globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4c54bc0d8ff..f28c56e6bf9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -202,7 +202,9 @@ struct ist_info ist_info;
#endif
#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+ .x86_phys_bits = MAX_PHYSMEM_BITS,
+};
EXPORT_SYMBOL(boot_cpu_data);
#endif
@@ -770,6 +772,9 @@ void __init setup_arch(char **cmdline_p)
finish_e820_parsing();
+ if (efi_enabled)
+ efi_init();
+
dmi_scan_machine();
dmi_check_system(bad_bios_dmi_table);
@@ -789,8 +794,6 @@ void __init setup_arch(char **cmdline_p)
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
- if (efi_enabled)
- efi_init();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d992e6cff73..400331b50a5 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,6 +7,7 @@
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
+#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
@@ -41,6 +42,309 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
};
EXPORT_SYMBOL(__per_cpu_offset);
+/*
+ * On x86_64 symbols referenced from code should be reachable using
+ * 32bit relocations. Reserve space for static percpu variables in
+ * modules so that they are always served from the first chunk which
+ * is located at the percpu segment base. On x86_32, anything can
+ * address anywhere. No need to reserve space in the first chunk.
+ */
+#ifdef CONFIG_X86_64
+#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
+#else
+#define PERCPU_FIRST_CHUNK_RESERVE 0
+#endif
+
+/**
+ * pcpu_need_numa - determine percpu allocation needs to consider NUMA
+ *
+ * If NUMA is not configured or there is only one NUMA node available,
+ * there is no reason to consider NUMA. This function determines
+ * whether percpu allocation should consider NUMA or not.
+ *
+ * RETURNS:
+ * true if NUMA should be considered; otherwise, false.
+ */
+static bool __init pcpu_need_numa(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ pg_data_t *last = NULL;
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ int node = early_cpu_to_node(cpu);
+
+ if (node_online(node) && NODE_DATA(node) &&
+ last && last != NODE_DATA(node))
+ return true;
+
+ last = NODE_DATA(node);
+ }
+#endif
+ return false;
+}
+
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+ unsigned long align)
+{
+ const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ int node = early_cpu_to_node(cpu);
+ void *ptr;
+
+ if (!node_online(node) || !NODE_DATA(node)) {
+ ptr = __alloc_bootmem_nopanic(size, align, goal);
+ pr_info("cpu %d has no node %d or node-local memory\n",
+ cpu, node);
+ pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+ cpu, size, __pa(ptr));
+ } else {
+ ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
+ size, align, goal);
+ pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
+ "%016lx\n", cpu, size, node, __pa(ptr));
+ }
+ return ptr;
+#else
+ return __alloc_bootmem_nopanic(size, align, goal);
+#endif
+}
+
+/*
+ * Remap allocator
+ *
+ * This allocator uses PMD page as unit. A PMD page is allocated for
+ * each cpu and each is remapped into vmalloc area using PMD mapping.
+ * As PMD page is quite large, only part of it is used for the first
+ * chunk. Unused part is returned to the bootmem allocator.
+ *
+ * So, the PMD pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk. The double
+ * mapping does add one more PMD TLB entry pressure but still is much
+ * better than only using 4k mappings while still being NUMA friendly.
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static size_t pcpur_size __initdata;
+static void **pcpur_ptrs __initdata;
+
+static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
+{
+ size_t off = (size_t)pageno << PAGE_SHIFT;
+
+ if (off >= pcpur_size)
+ return NULL;
+
+ return virt_to_page(pcpur_ptrs[cpu] + off);
+}
+
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+ static struct vm_struct vm;
+ pg_data_t *last;
+ size_t ptrs_size, dyn_size;
+ unsigned int cpu;
+ ssize_t ret;
+
+ /*
+ * If large page isn't supported, there's no benefit in doing
+ * this. Also, on non-NUMA, embedding is better.
+ */
+ if (!cpu_has_pse || pcpu_need_numa())
+ return -EINVAL;
+
+ last = NULL;
+ for_each_possible_cpu(cpu) {
+ int node = early_cpu_to_node(cpu);
+
+ if (node_online(node) && NODE_DATA(node) &&
+ last && last != NODE_DATA(node))
+ goto proceed;
+
+ last = NODE_DATA(node);
+ }
+ return -EINVAL;
+
+proceed:
+ /*
+ * Currently supports only single page. Supporting multiple
+ * pages won't be too difficult if it ever becomes necessary.
+ */
+ pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE);
+ if (pcpur_size > PMD_SIZE) {
+ pr_warning("PERCPU: static data is larger than large page, "
+ "can't use large page\n");
+ return -EINVAL;
+ }
+ dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
+
+ /* allocate pointer array and alloc large pages */
+ ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
+ pcpur_ptrs = alloc_bootmem(ptrs_size);
+
+ for_each_possible_cpu(cpu) {
+ pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
+ if (!pcpur_ptrs[cpu])
+ goto enomem;
+
+ /*
+ * Only use pcpur_size bytes and give back the rest.
+ *
+ * Ingo: The 2MB up-rounding bootmem is needed to make
+ * sure the partial 2MB page is still fully RAM - it's
+ * not well-specified to have a PAT-incompatible area
+ * (unmapped RAM, device memory, etc.) in that hole.
+ */
+ free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+ PMD_SIZE - pcpur_size);
+
+ memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+ }
+
+ /* allocate address and map */
+ vm.flags = VM_ALLOC;
+ vm.size = num_possible_cpus() * PMD_SIZE;
+ vm_area_register_early(&vm, PMD_SIZE);
+
+ for_each_possible_cpu(cpu) {
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd((unsigned long)vm.addr
+ + cpu * PMD_SIZE);
+ set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
+ PAGE_KERNEL_LARGE));
+ }
+
+ /* we're ready, commit */
+ pr_info("PERCPU: Remapped at %p with large pages, static data "
+ "%zu bytes\n", vm.addr, static_size);
+
+ ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+ PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
+ PMD_SIZE, vm.addr, NULL);
+ goto out_free_ar;
+
+enomem:
+ for_each_possible_cpu(cpu)
+ if (pcpur_ptrs[cpu])
+ free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
+ ret = -ENOMEM;
+out_free_ar:
+ free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+ return ret;
+}
+#else
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+ return -EINVAL;
+}
+#endif
+
+/*
+ * Embedding allocator
+ *
+ * The first chunk is sized to just contain the static area plus
+ * module and dynamic reserves and embedded into linear physical
+ * mapping so that it can use PMD mapping without additional TLB
+ * pressure.
+ */
+static ssize_t __init setup_pcpu_embed(size_t static_size)
+{
+ size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
+
+ /*
+ * If large page isn't supported, there's no benefit in doing
+ * this. Also, embedding allocation doesn't play well with
+ * NUMA.
+ */
+ if (!cpu_has_pse || pcpu_need_numa())
+ return -EINVAL;
+
+ return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+ reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
+}
+
+/*
+ * 4k page allocator
+ *
+ * This is the basic allocator. Static percpu area is allocated
+ * page-by-page and most of initialization is done by the generic
+ * setup function.
+ */
+static struct page **pcpu4k_pages __initdata;
+static int pcpu4k_nr_static_pages __initdata;
+
+static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
+{
+ if (pageno < pcpu4k_nr_static_pages)
+ return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
+ return NULL;
+}
+
+static void __init pcpu4k_populate_pte(unsigned long addr)
+{
+ populate_extra_pte(addr);
+}
+
+static ssize_t __init setup_pcpu_4k(size_t static_size)
+{
+ size_t pages_size;
+ unsigned int cpu;
+ int i, j;
+ ssize_t ret;
+
+ pcpu4k_nr_static_pages = PFN_UP(static_size);
+
+ /* unaligned allocations can't be freed, round up to page size */
+ pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+ * sizeof(pcpu4k_pages[0]));
+ pcpu4k_pages = alloc_bootmem(pages_size);
+
+ /* allocate and copy */
+ j = 0;
+ for_each_possible_cpu(cpu)
+ for (i = 0; i < pcpu4k_nr_static_pages; i++) {
+ void *ptr;
+
+ ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
+ if (!ptr)
+ goto enomem;
+
+ memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
+ pcpu4k_pages[j++] = virt_to_page(ptr);
+ }
+
+ /* we're ready, commit */
+ pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
+ pcpu4k_nr_static_pages, static_size);
+
+ ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
+ PERCPU_FIRST_CHUNK_RESERVE, -1,
+ -1, NULL, pcpu4k_populate_pte);
+ goto out_free_ar;
+
+enomem:
+ while (--j >= 0)
+ free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
+ ret = -ENOMEM;
+out_free_ar:
+ free_bootmem(__pa(pcpu4k_pages), pages_size);
+ return ret;
+}
+
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
@@ -61,38 +365,35 @@ static inline void setup_percpu_segment(int cpu)
*/
void __init setup_per_cpu_areas(void)
{
- ssize_t size;
- char *ptr;
- int cpu;
-
- /* Copy section for each CPU (we discard the original) */
- size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+ size_t static_size = __per_cpu_end - __per_cpu_start;
+ unsigned int cpu;
+ unsigned long delta;
+ size_t pcpu_unit_size;
+ ssize_t ret;
pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
- pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
+ /*
+ * Allocate percpu area. If PSE is supported, try to make use
+ * of large page mappings. Please read comments on top of
+ * each allocator for details.
+ */
+ ret = setup_pcpu_remap(static_size);
+ if (ret < 0)
+ ret = setup_pcpu_embed(static_size);
+ if (ret < 0)
+ ret = setup_pcpu_4k(static_size);
+ if (ret < 0)
+ panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
+ static_size, ret);
- for_each_possible_cpu(cpu) {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = alloc_bootmem_pages(size);
-#else
- int node = early_cpu_to_node(cpu);
- if (!node_online(node) || !NODE_DATA(node)) {
- ptr = alloc_bootmem_pages(size);
- pr_info("cpu %d has no node %d or node-local memory\n",
- cpu, node);
- pr_debug("per cpu data for cpu%d at %016lx\n",
- cpu, __pa(ptr));
- } else {
- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
- pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
- cpu, node, __pa(ptr));
- }
-#endif
+ pcpu_unit_size = ret;
- memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
- per_cpu_offset(cpu) = ptr - __per_cpu_start;
+ /* alrighty, percpu areas up and running */
+ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+ for_each_possible_cpu(cpu) {
+ per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
@@ -125,8 +426,6 @@ void __init setup_per_cpu_areas(void)
*/
if (cpu == boot_cpu_id)
switch_to_new_gdt(cpu);
-
- DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
}
/* indicate the early static arrays will soon be gone */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 249334f5080..ef7d10170c3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
atomic_t init_deasserted;
-
-/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
-
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
/* which logical CPUs are on which nodes */
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
cpumask_set_cpu(cpuid, cpu_callin_mask);
}
-static int __cpuinitdata unsafe_smp;
-
/*
* Activate a secondary processor.
*/
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
cpu_idle();
}
-static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
-{
- /*
- * Mask B, Pentium, but not Pentium MMX
- */
- if (c->x86_vendor == X86_VENDOR_INTEL &&
- c->x86 == 5 &&
- c->x86_mask >= 1 && c->x86_mask <= 4 &&
- c->x86_model <= 3)
- /*
- * Remember we have B step Pentia with bugs
- */
- smp_b_stepping = 1;
-
- /*
- * Certain Athlons might work (for various values of 'work') in SMP
- * but they are not certified as MP capable.
- */
- if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
- if (num_possible_cpus() == 1)
- goto valid_k7;
-
- /* Athlon 660/661 is valid. */
- if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
- (c->x86_mask == 1)))
- goto valid_k7;
-
- /* Duron 670 is valid */
- if ((c->x86_model == 7) && (c->x86_mask == 0))
- goto valid_k7;
-
- /*
- * Athlon 662, Duron 671, and Athlon >model 7 have capability
- * bit. It's worth noting that the A5 stepping (662) of some
- * Athlon XP's have the MP bit set.
- * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
- * more.
- */
- if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
- ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
- (c->x86_model > 7))
- if (cpu_has_mp)
- goto valid_k7;
-
- /* If we get here, not a certified SMP capable AMD system. */
- unsafe_smp = 1;
- }
-
-valid_k7:
- ;
-}
-
-static void __cpuinit smp_checks(void)
-{
- if (smp_b_stepping)
- printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
- "with B stepping processors.\n");
-
- /*
- * Don't taint if we are running SMP kernel on a single non-MP
- * approved Athlon
- */
- if (unsafe_smp && num_online_cpus() > 1) {
- printk(KERN_INFO "WARNING: This combination of AMD"
- "processors is not suitable for SMP.\n");
- add_taint(TAINT_UNSAFE_SMP);
- }
-}
-
/*
* The bootstrap kernel entry code has set these up. Save them for
* a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
c->cpu_index = id;
if (id != 0)
identify_secondary_cpu(c);
- smp_apply_quirks(c);
}
@@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
pr_debug("Boot done.\n");
impress_friends();
- smp_checks();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f04549afcfe..d038b9c45cf 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -314,8 +314,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
int locals = 0;
struct bau_desc *bau_desc;
- WARN_ON(!in_atomic());
-
cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
uv_cpu = uv_blade_processor_id();
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 00000000000..2ffb6c53326
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,393 @@
+/*
+ * SGI RTC clock/timer routines.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Dimitri Sivanich
+ */
+#include <linux/clockchips.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+
+#define RTC_NAME "sgi_rtc"
+
+static cycle_t uv_read_rtc(void);
+static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
+static void uv_rtc_timer_setup(enum clock_event_mode,
+ struct clock_event_device *);
+
+static struct clocksource clocksource_uv = {
+ .name = RTC_NAME,
+ .rating = 400,
+ .read = uv_read_rtc,
+ .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
+ .shift = 10,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static struct clock_event_device clock_event_device_uv = {
+ .name = RTC_NAME,
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+ .shift = 20,
+ .rating = 400,
+ .irq = -1,
+ .set_next_event = uv_rtc_next_event,
+ .set_mode = uv_rtc_timer_setup,
+ .event_handler = NULL,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
+
+/* There is one of these allocated per node */
+struct uv_rtc_timer_head {
+ spinlock_t lock;
+ /* next cpu waiting for timer, local node relative: */
+ int next_cpu;
+ /* number of cpus on this node: */
+ int ncpus;
+ struct {
+ int lcpu; /* systemwide logical cpu number */
+ u64 expires; /* next timer expiration for this cpu */
+ } cpu[1];
+};
+
+/*
+ * Access to uv_rtc_timer_head via blade id.
+ */
+static struct uv_rtc_timer_head **blade_info __read_mostly;
+
+static int uv_rtc_enable;
+
+/*
+ * Hardware interface routines
+ */
+
+/* Send IPIs to another node */
+static void uv_rtc_send_IPI(int cpu)
+{
+ unsigned long apicid, val;
+ int pnode;
+
+ apicid = cpu_physical_id(cpu);
+ pnode = uv_apicid_to_pnode(apicid);
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+/* Check for an RTC interrupt pending */
+static int uv_intr_pending(int pnode)
+{
+ return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+ UVH_EVENT_OCCURRED0_RTC1_MASK;
+}
+
+/* Setup interrupt and return non-zero if early expiration occurred. */
+static int uv_setup_intr(int cpu, u64 expires)
+{
+ u64 val;
+ int pnode = uv_cpu_to_pnode(cpu);
+
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+ UVH_RTC1_INT_CONFIG_M_MASK);
+ uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
+
+ uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+ UVH_EVENT_OCCURRED0_RTC1_MASK);
+
+ val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+ ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+
+ /* Set configuration */
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
+ /* Initialize comparator value */
+ uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
+
+ return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+}
+
+/*
+ * Per-cpu timer tracking routines
+ */
+
+static __init void uv_rtc_deallocate_timers(void)
+{
+ int bid;
+
+ for_each_possible_blade(bid) {
+ kfree(blade_info[bid]);
+ }
+ kfree(blade_info);
+}
+
+/* Allocate per-node list of cpu timer expiration times. */
+static __init int uv_rtc_allocate_timers(void)
+{
+ int cpu;
+
+ blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
+ if (!blade_info)
+ return -ENOMEM;
+ memset(blade_info, 0, uv_possible_blades * sizeof(void *));
+
+ for_each_present_cpu(cpu) {
+ int nid = cpu_to_node(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ struct uv_rtc_timer_head *head = blade_info[bid];
+
+ if (!head) {
+ head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
+ (uv_blade_nr_possible_cpus(bid) *
+ 2 * sizeof(u64)),
+ GFP_KERNEL, nid);
+ if (!head) {
+ uv_rtc_deallocate_timers();
+ return -ENOMEM;
+ }
+ spin_lock_init(&head->lock);
+ head->ncpus = uv_blade_nr_possible_cpus(bid);
+ head->next_cpu = -1;
+ blade_info[bid] = head;
+ }
+
+ head->cpu[bcpu].lcpu = cpu;
+ head->cpu[bcpu].expires = ULLONG_MAX;
+ }
+
+ return 0;
+}
+
+/* Find and set the next expiring timer. */
+static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
+{
+ u64 lowest = ULLONG_MAX;
+ int c, bcpu = -1;
+
+ head->next_cpu = -1;
+ for (c = 0; c < head->ncpus; c++) {
+ u64 exp = head->cpu[c].expires;
+ if (exp < lowest) {
+ bcpu = c;
+ lowest = exp;
+ }
+ }
+ if (bcpu >= 0) {
+ head->next_cpu = bcpu;
+ c = head->cpu[bcpu].lcpu;
+ if (uv_setup_intr(c, lowest))
+ /* If we didn't set it up in time, trigger */
+ uv_rtc_send_IPI(c);
+ } else {
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+ UVH_RTC1_INT_CONFIG_M_MASK);
+ }
+}
+
+/*
+ * Set expiration time for current cpu.
+ *
+ * Returns 1 if we missed the expiration time.
+ */
+static int uv_rtc_set_timer(int cpu, u64 expires)
+{
+ int pnode = uv_cpu_to_pnode(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ struct uv_rtc_timer_head *head = blade_info[bid];
+ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ u64 *t = &head->cpu[bcpu].expires;
+ unsigned long flags;
+ int next_cpu;
+
+ spin_lock_irqsave(&head->lock, flags);
+
+ next_cpu = head->next_cpu;
+ *t = expires;
+ /* Will this one be next to go off? */
+ if (next_cpu < 0 || bcpu == next_cpu ||
+ expires < head->cpu[next_cpu].expires) {
+ head->next_cpu = bcpu;
+ if (uv_setup_intr(cpu, expires)) {
+ *t = ULLONG_MAX;
+ uv_rtc_find_next_timer(head, pnode);
+ spin_unlock_irqrestore(&head->lock, flags);
+ return 1;
+ }
+ }
+
+ spin_unlock_irqrestore(&head->lock, flags);
+ return 0;
+}
+
+/*
+ * Unset expiration time for current cpu.
+ *
+ * Returns 1 if this timer was pending.
+ */
+static int uv_rtc_unset_timer(int cpu)
+{
+ int pnode = uv_cpu_to_pnode(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ struct uv_rtc_timer_head *head = blade_info[bid];
+ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ u64 *t = &head->cpu[bcpu].expires;
+ unsigned long flags;
+ int rc = 0;
+
+ spin_lock_irqsave(&head->lock, flags);
+
+ if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+ rc = 1;
+
+ *t = ULLONG_MAX;
+
+ /* Was the hardware setup for this timer? */
+ if (head->next_cpu == bcpu)
+ uv_rtc_find_next_timer(head, pnode);
+
+ spin_unlock_irqrestore(&head->lock, flags);
+
+ return rc;
+}
+
+
+/*
+ * Kernel interface routines.
+ */
+
+/*
+ * Read the RTC.
+ */
+static cycle_t uv_read_rtc(void)
+{
+ return (cycle_t)uv_read_local_mmr(UVH_RTC);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int uv_rtc_next_event(unsigned long delta,
+ struct clock_event_device *ced)
+{
+ int ced_cpu = cpumask_first(ced->cpumask);
+
+ return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+}
+
+/*
+ * Setup the RTC timer in oneshot mode
+ */
+static void uv_rtc_timer_setup(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ int ced_cpu = cpumask_first(evt->cpumask);
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ case CLOCK_EVT_MODE_ONESHOT:
+ case CLOCK_EVT_MODE_RESUME:
+ /* Nothing to do here yet */
+ break;
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ uv_rtc_unset_timer(ced_cpu);
+ break;
+ }
+}
+
+static void uv_rtc_interrupt(void)
+{
+ struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+ int cpu = smp_processor_id();
+
+ if (!ced || !ced->event_handler)
+ return;
+
+ if (uv_rtc_unset_timer(cpu) != 1)
+ return;
+
+ ced->event_handler(ced);
+}
+
+static int __init uv_enable_rtc(char *str)
+{
+ uv_rtc_enable = 1;
+
+ return 1;
+}
+__setup("uvrtc", uv_enable_rtc);
+
+static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
+{
+ struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+
+ *ced = clock_event_device_uv;
+ ced->cpumask = cpumask_of(smp_processor_id());
+ clockevents_register_device(ced);
+}
+
+static __init int uv_rtc_setup_clock(void)
+{
+ int rc;
+
+ if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
+ return -ENODEV;
+
+ generic_interrupt_extension = uv_rtc_interrupt;
+
+ clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
+ clocksource_uv.shift);
+
+ rc = clocksource_register(&clocksource_uv);
+ if (rc) {
+ generic_interrupt_extension = NULL;
+ return rc;
+ }
+
+ /* Setup and register clockevents */
+ rc = uv_rtc_allocate_timers();
+ if (rc) {
+ clocksource_unregister(&clocksource_uv);
+ generic_interrupt_extension = NULL;
+ return rc;
+ }
+
+ clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
+ NSEC_PER_SEC, clock_event_device_uv.shift);
+
+ clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
+ sn_rtc_cycles_per_second;
+
+ clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
+ (NSEC_PER_SEC / sn_rtc_cycles_per_second);
+
+ rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
+ if (rc) {
+ clocksource_unregister(&clocksource_uv);
+ generic_interrupt_extension = NULL;
+ uv_rtc_deallocate_timers();
+ }
+
+ return rc;
+}
+arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fbfced6f680..5bf54e40c6e 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
ASSERT((per_cpu__irq_stack_union == 0),
"irq_stack_union is not at start of per-cpu area");
#endif
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "kexec control code size is too big")
+#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index f3a5305b8ad..9fe4ddaa8f6 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -348,6 +348,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
* flush_tlb_user() for both user and kernel mappings unless
* the Page Global Enable (PGE) feature bit is set. */
*dx |= 0x00002000;
+ /* We also lie, and say we're family id 5. 6 or greater
+ * leads to a rdmsr in early_init_intel which we can't handle.
+ * Family ID is returned as bits 8-12 in ax. */
+ *ax &= 0xFFFFF0FF;
+ *ax |= 0x00000500;
break;
case 0x80000000:
/* Futureproof this a little: if they ask how much extended
@@ -594,19 +599,21 @@ static void __init lguest_init_IRQ(void)
/* Some systems map "vectors" to interrupts weirdly. Lguest has
* a straightforward 1 to 1 mapping, so force that here. */
__get_cpu_var(vector_irq)[vector] = i;
- if (vector != SYSCALL_VECTOR) {
- set_intr_gate(vector,
- interrupt[vector-FIRST_EXTERNAL_VECTOR]);
- set_irq_chip_and_handler_name(i, &lguest_irq_controller,
- handle_level_irq,
- "level");
- }
+ if (vector != SYSCALL_VECTOR)
+ set_intr_gate(vector, interrupt[i]);
}
/* This call is required to set up for 4k stacks, where we have
* separate stacks for hard and soft interrupts. */
irq_ctx_init(smp_processor_id());
}
+void lguest_setup_irq(unsigned int irq)
+{
+ irq_to_desc_alloc_cpu(irq, 0);
+ set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
+ handle_level_irq, "level");
+}
+
/*
* Time.
*
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c22981fa2f3..ad5441ed1b5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
#include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
/*
* memcpy - Copy a memory block.
*
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
* Output:
* rax original destination
- */
+ */
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
ALIGN
memcpy_c:
CFI_STARTPROC
- movq %rdi,%rax
- movl %edx,%ecx
- shrl $3,%ecx
- andl $7,%edx
+ movq %rdi, %rax
+
+ movl %edx, %ecx
+ shrl $3, %ecx
+ andl $7, %edx
rep movsq
- movl %edx,%ecx
+ movl %edx, %ecx
rep movsb
ret
CFI_ENDPROC
@@ -33,99 +41,110 @@ ENDPROC(memcpy_c)
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
- pushq %rbx
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbx, 0
- movq %rdi,%rax
- movl %edx,%ecx
- shrl $6,%ecx
+ /*
+ * Put the number of full 64-byte blocks into %ecx.
+ * Tail portion is handled at the end:
+ */
+ movq %rdi, %rax
+ movl %edx, %ecx
+ shrl $6, %ecx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
+ /*
+ * We decrement the loop index here - and the zero-flag is
+ * checked at the end of the loop (instructions inbetween do
+ * not change the zero flag):
+ */
decl %ecx
- movq (%rsi),%r11
- movq 8(%rsi),%r8
+ /*
+ * Move in blocks of 4x16 bytes:
+ */
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r8
+ movq %r11, 0*8(%rdi)
+ movq %r8, 1*8(%rdi)
- movq %r11,(%rdi)
- movq %r8,1*8(%rdi)
+ movq 2*8(%rsi), %r9
+ movq 3*8(%rsi), %r10
+ movq %r9, 2*8(%rdi)
+ movq %r10, 3*8(%rdi)
- movq 2*8(%rsi),%r9
- movq 3*8(%rsi),%r10
+ movq 4*8(%rsi), %r11
+ movq 5*8(%rsi), %r8
+ movq %r11, 4*8(%rdi)
+ movq %r8, 5*8(%rdi)
- movq %r9,2*8(%rdi)
- movq %r10,3*8(%rdi)
+ movq 6*8(%rsi), %r9
+ movq 7*8(%rsi), %r10
+ movq %r9, 6*8(%rdi)
+ movq %r10, 7*8(%rdi)
- movq 4*8(%rsi),%r11
- movq 5*8(%rsi),%r8
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
- movq %r11,4*8(%rdi)
- movq %r8,5*8(%rdi)
-
- movq 6*8(%rsi),%r9
- movq 7*8(%rsi),%r10
-
- movq %r9,6*8(%rdi)
- movq %r10,7*8(%rdi)
-
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
jnz .Lloop_64
.Lhandle_tail:
- movl %edx,%ecx
- andl $63,%ecx
- shrl $3,%ecx
+ movl %edx, %ecx
+ andl $63, %ecx
+ shrl $3, %ecx
jz .Lhandle_7
+
.p2align 4
.Lloop_8:
decl %ecx
- movq (%rsi),%r8
- movq %r8,(%rdi)
- leaq 8(%rdi),%rdi
- leaq 8(%rsi),%rsi
+ movq (%rsi), %r8
+ movq %r8, (%rdi)
+ leaq 8(%rdi), %rdi
+ leaq 8(%rsi), %rsi
jnz .Lloop_8
.Lhandle_7:
- movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
+ movl %edx, %ecx
+ andl $7, %ecx
+ jz .Lend
+
.p2align 4
.Lloop_1:
- movb (%rsi),%r8b
- movb %r8b,(%rdi)
+ movb (%rsi), %r8b
+ movb %r8b, (%rdi)
incq %rdi
incq %rsi
decl %ecx
jnz .Lloop_1
-.Lende:
- popq %rbx
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE rbx
+.Lend:
ret
-.Lfinal:
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
- /* Some CPUs run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
+ /*
+ * Some CPUs run faster using the string copy instructions.
+ * It is also a lot simpler. Use this when possible:
+ */
- .section .altinstr_replacement,"ax"
+ .section .altinstr_replacement, "ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
2:
.previous
- .section .altinstructions,"a"
+
+ .section .altinstructions, "a"
.align 8
.quad memcpy
.quad 1b
.byte X86_FEATURE_REP_GOOD
- /* Replace only beginning, memcpy is used to apply alternatives, so it
- * is silly to overwrite itself with nops - reboot is only outcome... */
+
+ /*
+ * Replace only beginning, memcpy is used to apply alternatives,
+ * so it is silly to overwrite itself with nops - reboot is the
+ * only outcome...
+ */
.byte 2b - 1b
.byte 2b - 1b
.previous
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 491e737ce54..aa098708877 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,20 +30,29 @@ static void fclex(void)
}
/* Needs to be externally visible */
-void finit(void)
+void finit_task(struct task_struct *tsk)
{
- control_word = 0x037f;
- partial_status = 0;
- top = 0; /* We don't keep top in the status word internally. */
- fpu_tag_word = 0xffff;
+ struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
+ struct address *oaddr, *iaddr;
+ soft->cwd = 0x037f;
+ soft->swd = 0;
+ soft->ftop = 0; /* We don't keep top in the status word internally. */
+ soft->twd = 0xffff;
/* The behaviour is different from that detailed in
Section 15.1.6 of the Intel manual */
- operand_address.offset = 0;
- operand_address.selector = 0;
- instruction_address.offset = 0;
- instruction_address.selector = 0;
- instruction_address.opcode = 0;
- no_ip_update = 1;
+ oaddr = (struct address *)&soft->foo;
+ oaddr->offset = 0;
+ oaddr->selector = 0;
+ iaddr = (struct address *)&soft->fip;
+ iaddr->offset = 0;
+ iaddr->selector = 0;
+ iaddr->opcode = 0;
+ soft->no_update = 1;
+}
+
+void finit(void)
+{
+ finit_task(current);
}
/*
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 00f127c80b0..522db5e3d0b 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -121,22 +121,13 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
pagefault_enable();
}
-/* This is the same as kmap_atomic() but can map memory that doesn't
+/*
+ * This is the same as kmap_atomic() but can map memory that doesn't
* have a struct page associated with it.
*/
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
{
- enum fixed_addresses idx;
- unsigned long vaddr;
-
- pagefault_disable();
-
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
- arch_flush_lazy_mmu_mode();
-
- return (void*) vaddr;
+ return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
}
EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
@@ -158,7 +149,6 @@ EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
-#ifdef CONFIG_NUMA
void __init set_highmem_pages_init(void)
{
struct zone *zone;
@@ -182,11 +172,3 @@ void __init set_highmem_pages_init(void)
}
totalram_pages += totalhigh_pages;
}
-#else
-void __init set_highmem_pages_init(void)
-{
- add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
- totalram_pages += totalhigh_pages;
-}
-#endif /* CONFIG_NUMA */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ce6a722587d..fd3da1dda1c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,8 +1,345 @@
+#include <linux/ioport.h>
#include <linux/swap.h>
+
#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/system.h>
+#include <asm/tlbflush.h>
+
+unsigned long __initdata e820_table_start;
+unsigned long __meminitdata e820_table_end;
+unsigned long __meminitdata e820_table_top;
+
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+
+static void __init find_early_table_space(unsigned long end, int use_pse,
+ int use_gbpages)
+{
+ unsigned long puds, pmds, ptes, tables, start;
+
+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+ if (use_gbpages) {
+ unsigned long extra;
+
+ extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+ pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+ } else
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+ if (use_pse) {
+ unsigned long extra;
+
+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+ extra += PMD_SIZE;
+#endif
+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+ /* for fixmap */
+ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
+ /*
+ * RED-PEN putting page tables only on node 0 could
+ * cause a hotspot and fill up ZONE_DMA. The page tables
+ * need roughly 0.5KB per GB.
+ */
+#ifdef CONFIG_X86_32
+ start = 0x7000;
+ e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+ tables, PAGE_SIZE);
+#else /* CONFIG_X86_64 */
+ start = 0x8000;
+ e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+#endif
+ if (e820_table_start == -1UL)
+ panic("Cannot find space for the kernel page tables");
+
+ e820_table_start >>= PAGE_SHIFT;
+ e820_table_end = e820_table_start;
+ e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+}
+
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int __meminit save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+ panic("run out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
+ }
+
+ return nr_range;
+}
+
+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ unsigned long page_size_mask = 0;
+ unsigned long start_pfn, end_pfn;
+ unsigned long ret = 0;
+ unsigned long pos;
+
+ struct map_range mr[NR_RANGE_MR];
+ int nr_range, i;
+ int use_pse, use_gbpages;
+
+ printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+ if (!after_bootmem)
+ init_gbpages();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ use_pse = use_gbpages = 0;
+#else
+ use_pse = cpu_has_pse;
+ use_gbpages = direct_gbpages;
+#endif
+
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_PAE
+ set_nx();
+ if (nx_enabled)
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+#endif
+
+ if (use_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (use_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = 0;
+
+ /* head if not big page alignment ? */
+ start_pfn = start >> PAGE_SHIFT;
+ pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ if (pos == 0)
+ end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ else
+ end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#endif
+ if (end_pfn > (end >> PAGE_SHIFT))
+ end_pfn = end >> PAGE_SHIFT;
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* big page (2M) range */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+ end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+#ifdef CONFIG_X86_64
+ /* big page (1G) range */
+ start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* tail is not big page (1G) alignment */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+#endif
+
+ /* tail is not big page (2M) alignment */
+ start_pfn = pos>>PAGE_SHIFT;
+ end_pfn = end>>PAGE_SHIFT;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* try to merge same page size and continuous */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof(struct map_range));
+ mr[i--].start = old_start;
+ nr_range--;
+ }
+
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+ mr[i].start, mr[i].end,
+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ *
+ * Later we should allocate these tables in the local node of the
+ * memory mapped. Unfortunately this is done currently before the
+ * nodes are discovered.
+ */
+ if (!after_bootmem)
+ find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_32
+ for (i = 0; i < nr_range; i++)
+ kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+ ret = end;
+#else /* CONFIG_X86_64 */
+ for (i = 0; i < nr_range; i++)
+ ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+#endif
+
+#ifdef CONFIG_X86_32
+ early_ioremap_page_table_range_init();
+
+ load_cr3(swapper_pg_dir);
+#endif
+
+#ifdef CONFIG_X86_64
+ if (!after_bootmem)
+ mmu_cr4_features = read_cr4();
+#endif
+ __flush_tlb_all();
+
+ if (!after_bootmem && e820_table_end > e820_table_start)
+ reserve_early(e820_table_start << PAGE_SHIFT,
+ e820_table_end << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_bootmem)
+ early_memtest(start, end);
+
+ return ret >> PAGE_SHIFT;
+}
+
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ return 0;
+ if (!page_is_ram(pagenr))
+ return 1;
+ return 0;
+}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
@@ -47,3 +384,10 @@ void free_initmem(void)
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+ free_init_pages("initrd memory", start, end);
+}
+#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0b087dcd2c1..db81e9a8556 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,6 +49,7 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
+#include <asm/init.h>
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
-
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
-static int __initdata after_init_bootmem;
+bool __read_mostly __vmalloc_start_set = false;
static __init void *alloc_low_page(void)
{
- unsigned long pfn = table_end++;
+ unsigned long pfn = e820_table_end++;
void *adr;
- if (pfn >= table_top)
+ if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = __va(pfn * PAGE_SIZE);
@@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_init_bootmem)
+ if (after_bootmem)
pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
else
pmd_table = (pmd_t *)alloc_low_page();
@@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = NULL;
- if (after_init_bootmem) {
+ if (after_bootmem) {
#ifdef CONFIG_DEBUG_PAGEALLOC
page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
@@ -135,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
return pte_offset_kernel(pmd, 0);
}
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+ int pgd_idx = pgd_index(vaddr);
+ int pmd_idx = pmd_index(vaddr);
+
+ return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+ int pte_idx = pte_index(vaddr);
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd(vaddr);
+ return one_page_table_init(pmd) + pte_idx;
+}
+
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
unsigned long vaddr, pte_t *lastpte)
{
@@ -151,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
&& (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- && ((__pa(pte) >> PAGE_SHIFT) < table_start
- || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+ && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+ || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
pte_t *newpte;
int i;
- BUG_ON(after_init_bootmem);
+ BUG_ON(after_bootmem);
newpte = alloc_low_page();
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
@@ -225,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
- unsigned long start_pfn,
- unsigned long end_pfn,
- int use_pse)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
+ int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+ unsigned long start_pfn, end_pfn;
+ pgd_t *pgd_base = swapper_pg_dir;
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
@@ -238,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
unsigned pages_2m, pages_4k;
int mapping_iter;
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = end >> PAGE_SHIFT;
+
/*
* First iteration will setup identity mapping using large/small pages
* based on use_pse, with other attributes same as set by
@@ -352,26 +371,6 @@ repeat:
mapping_iter = 2;
goto repeat;
}
-}
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
- return 0;
- if (!page_is_ram(pagenr))
- return 1;
return 0;
}
@@ -528,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
-static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
+void __init early_ioremap_page_table_range_init(void)
{
+ pgd_t *pgd_base = swapper_pg_dir;
unsigned long vaddr, end;
/*
@@ -624,7 +624,7 @@ static int __init noexec_setup(char *str)
}
early_param("noexec", noexec_setup);
-static void __init set_nx(void)
+void __init set_nx(void)
{
unsigned int v[4], l, h;
@@ -776,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
+ __vmalloc_start_set = true;
+
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
@@ -797,176 +799,66 @@ static void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
+static unsigned long __init setup_node_bootmem(int nodeid,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long bootmap)
+{
+ unsigned long bootmap_size;
+
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+ bootmap >> PAGE_SHIFT,
+ start_pfn, end_pfn);
+ printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
+ nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
+ nodeid, bootmap, bootmap + bootmap_size);
+ free_bootmem_with_active_regions(nodeid, end_pfn);
+ early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+ return bootmap + bootmap_size;
+}
+
void __init setup_bootmem_allocator(void)
{
- int i;
+ int nodeid;
unsigned long bootmap_size, bootmap;
/*
* Initialize the boot-time allocator (with low memory only):
*/
bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
- max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+ bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
- /* don't touch min_low_pfn */
- bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
- min_low_pfn, max_low_pfn);
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
- printk(KERN_INFO " low ram: %08lx - %08lx\n",
- min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
- printk(KERN_INFO " bootmap %08lx - %08lx\n",
- bootmap, bootmap + bootmap_size);
- for_each_online_node(i)
- free_bootmem_with_active_regions(i, max_low_pfn);
- early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-
- after_init_bootmem = 1;
-}
-
-static void __init find_early_table_space(unsigned long end, int use_pse)
-{
- unsigned long puds, pmds, ptes, tables, start;
-
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
- if (use_pse) {
- unsigned long extra;
+ printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
- extra += PMD_SIZE;
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for_each_online_node(nodeid) {
+ unsigned long start_pfn, end_pfn;
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
- /* for fixmap */
- tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-
- /*
- * RED-PEN putting page tables only on node 0 could
- * cause a hotspot and fill up ZONE_DMA. The page tables
- * need roughly 0.5KB per GB.
- */
- start = 0x7000;
- table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
- tables, PAGE_SIZE);
- if (table_start == -1UL)
- panic("Cannot find space for the kernel page tables");
-
- table_start >>= PAGE_SHIFT;
- table_end = table_start;
- table_top = table_start + (tables>>PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT,
- (table_start << PAGE_SHIFT) + tables);
-}
-
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
-{
- pgd_t *pgd_base = swapper_pg_dir;
- unsigned long start_pfn, end_pfn;
- unsigned long big_page_start;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- int use_pse = 0;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ start_pfn = node_start_pfn[nodeid];
+ end_pfn = node_end_pfn[nodeid];
+ if (start_pfn > max_low_pfn)
+ continue;
+ if (end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
#else
- int use_pse = cpu_has_pse;
-#endif
-
- /*
- * Find space for the kernel direct mapping tables.
- */
- if (!after_init_bootmem)
- find_early_table_space(end, use_pse);
-
-#ifdef CONFIG_X86_PAE
- set_nx();
- if (nx_enabled)
- printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+ start_pfn = 0;
+ end_pfn = max_low_pfn;
#endif
-
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
- }
-
- /*
- * Don't use a large page for the first 2/4MB of memory
- * because there are often fixed size MTRRs in there
- * and overlapping MTRRs into large pages can cause
- * slowdowns.
- */
- big_page_start = PMD_SIZE;
-
- if (start < big_page_start) {
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
- } else {
- /* head is not big page alignment ? */
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- }
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
-
- /* big page range */
- start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < (big_page_start >> PAGE_SHIFT))
- start_pfn = big_page_start >> PAGE_SHIFT;
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
- use_pse);
-
- /* tail is not big page alignment ? */
- start_pfn = end_pfn;
- if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
- end_pfn = end >> PAGE_SHIFT;
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn,
- end_pfn, 0);
+ bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
+ bootmap);
}
- early_ioremap_page_table_range_init(pgd_base);
-
- load_cr3(swapper_pg_dir);
-
- __flush_tlb_all();
-
- if (!after_init_bootmem)
- reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
-
- if (!after_init_bootmem)
- early_memtest(start, end);
-
- return end >> PAGE_SHIFT;
+ after_bootmem = 1;
}
-
/*
* paging_init() sets up the page tables - note that the first 8MB are
* already mapped by head.S.
@@ -1200,13 +1092,6 @@ void mark_rodata_ro(void)
}
#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 724e537432e..54efa57d1c0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
+#include <asm/init.h>
/*
* end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
-
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on);
* around without checking the pgd every time.
*/
-int after_bootmem;
-
pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
-static int do_not_nx __cpuinitdata;
+static int disable_nx __cpuinitdata;
/*
* noexec=on|off
@@ -107,9 +100,9 @@ static int __init nonx_setup(char *str)
return -EINVAL;
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
- do_not_nx = 0;
+ disable_nx = 0;
} else if (!strncmp(str, "off", 3)) {
- do_not_nx = 1;
+ disable_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
return 0;
@@ -121,7 +114,7 @@ void __cpuinit check_efer(void)
unsigned long efer;
rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || do_not_nx)
+ if (!(efer & EFER_NX) || disable_nx)
__supported_pte_mask &= ~_PAGE_NX;
}
@@ -168,34 +161,51 @@ static __ref void *spp_getpage(void)
return ptr;
}
-void
-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ if (pgd_none(*pgd)) {
+ pud_t *pud = (pud_t *)spp_getpage();
+ pgd_populate(&init_mm, pgd, pud);
+ if (pud != pud_offset(pgd, 0))
+ printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+ pud, pud_offset(pgd, 0));
+ }
+ return pud_offset(pgd, vaddr);
+}
- pud = pud_page + pud_index(vaddr);
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
if (pud_none(*pud)) {
- pmd = (pmd_t *) spp_getpage();
+ pmd_t *pmd = (pmd_t *) spp_getpage();
pud_populate(&init_mm, pud, pmd);
- if (pmd != pmd_offset(pud, 0)) {
+ if (pmd != pmd_offset(pud, 0))
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
- pmd, pmd_offset(pud, 0));
- return;
- }
+ pmd, pmd_offset(pud, 0));
}
- pmd = pmd_offset(pud, vaddr);
+ return pmd_offset(pud, vaddr);
+}
+
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
if (pmd_none(*pmd)) {
- pte = (pte_t *) spp_getpage();
+ pte_t *pte = (pte_t *) spp_getpage();
pmd_populate_kernel(&init_mm, pmd, pte);
- if (pte != pte_offset_kernel(pmd, 0)) {
+ if (pte != pte_offset_kernel(pmd, 0))
printk(KERN_ERR "PAGETABLE BUG #02!\n");
- return;
- }
}
+ return pte_offset_kernel(pmd, vaddr);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pud = pud_page + pud_index(vaddr);
+ pmd = fill_pmd(pud, vaddr);
+ pte = fill_pte(pmd, vaddr);
- pte = pte_offset_kernel(pmd, vaddr);
set_pte(pte, new_pte);
/*
@@ -205,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
__flush_tlb_one(vaddr);
}
-void
-set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
pud_t *pud_page;
@@ -223,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
set_pte_vaddr_pud(pud_page, vaddr, pteval);
}
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(vaddr);
+ pud = fill_pud(pgd, vaddr);
+ return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd(vaddr);
+ return fill_pte(pmd, vaddr);
+}
+
/*
* Create large page table mappings for a range of physical addresses.
*/
@@ -291,13 +318,9 @@ void __init cleanup_highmap(void)
}
}
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
static __ref void *alloc_low_page(unsigned long *phys)
{
- unsigned long pfn = table_end++;
+ unsigned long pfn = e820_table_end++;
void *adr;
if (after_bootmem) {
@@ -307,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
return adr;
}
- if (pfn >= table_top)
+ if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -547,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
return phys_pud_init(pud, addr, end, page_size_mask);
}
-static void __init find_early_table_space(unsigned long end, int use_pse,
- int use_gbpages)
-{
- unsigned long puds, pmds, ptes, tables, start;
-
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
- if (use_gbpages) {
- unsigned long extra;
- extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
- pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
- } else
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
- if (use_pse) {
- unsigned long extra;
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
- /*
- * RED-PEN putting page tables only on node 0 could
- * cause a hotspot and fill up ZONE_DMA. The page tables
- * need roughly 0.5KB per GB.
- */
- start = 0x8000;
- table_start = find_e820_area(start, end, tables, PAGE_SIZE);
- if (table_start == -1UL)
- panic("Cannot find space for the kernel page tables");
-
- table_start >>= PAGE_SHIFT;
- table_end = table_start;
- table_top = table_start + (tables >> PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-static void __init init_gbpages(void)
-{
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-}
-
-static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
- unsigned long end,
- unsigned long page_size_mask)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
unsigned long next, last_map_addr = end;
@@ -635,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
return last_map_addr;
}
-struct map_range {
- unsigned long start;
- unsigned long end;
- unsigned page_size_mask;
-};
-
-#define NR_RANGE_MR 5
-
-static int save_mr(struct map_range *mr, int nr_range,
- unsigned long start_pfn, unsigned long end_pfn,
- unsigned long page_size_mask)
-{
-
- if (start_pfn < end_pfn) {
- if (nr_range >= NR_RANGE_MR)
- panic("run out of range for init_memory_mapping\n");
- mr[nr_range].start = start_pfn<<PAGE_SHIFT;
- mr[nr_range].end = end_pfn<<PAGE_SHIFT;
- mr[nr_range].page_size_mask = page_size_mask;
- nr_range++;
- }
-
- return nr_range;
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
-{
- unsigned long last_map_addr = 0;
- unsigned long page_size_mask = 0;
- unsigned long start_pfn, end_pfn;
- unsigned long pos;
-
- struct map_range mr[NR_RANGE_MR];
- int nr_range, i;
- int use_pse, use_gbpages;
-
- printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
- /*
- * Find space for the kernel direct mapping tables.
- *
- * Later we should allocate these tables in the local node of the
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
- if (!after_bootmem)
- init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- use_pse = use_gbpages = 0;
-#else
- use_pse = cpu_has_pse;
- use_gbpages = direct_gbpages;
-#endif
-
- if (use_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
- if (use_pse)
- page_size_mask |= 1 << PG_LEVEL_2M;
-
- memset(mr, 0, sizeof(mr));
- nr_range = 0;
-
- /* head if not big page alignment ?*/
- start_pfn = start >> PAGE_SHIFT;
- pos = start_pfn << PAGE_SHIFT;
- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- if (end_pfn > (end >> PAGE_SHIFT))
- end_pfn = end >> PAGE_SHIFT;
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* big page (2M) range*/
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* big page (1G) range */
- start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask &
- ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* tail is not big page (1G) alignment */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* tail is not big page (2M) alignment */
- start_pfn = pos>>PAGE_SHIFT;
- end_pfn = end>>PAGE_SHIFT;
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
- /* try to merge same page size and continuous */
- for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
- unsigned long old_start;
- if (mr[i].end != mr[i+1].start ||
- mr[i].page_size_mask != mr[i+1].page_size_mask)
- continue;
- /* move it */
- old_start = mr[i].start;
- memmove(&mr[i], &mr[i+1],
- (nr_range - 1 - i) * sizeof (struct map_range));
- mr[i--].start = old_start;
- nr_range--;
- }
-
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG " %010lx - %010lx page %s\n",
- mr[i].start, mr[i].end,
- (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
- (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
- if (!after_bootmem)
- find_early_table_space(end, use_pse, use_gbpages);
-
- for (i = 0; i < nr_range; i++)
- last_map_addr = kernel_physical_mapping_init(
- mr[i].start, mr[i].end,
- mr[i].page_size_mask);
-
- if (!after_bootmem)
- mmu_cr4_features = read_cr4();
- __flush_tlb_all();
-
- if (!after_bootmem && table_end > table_start)
- reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
-
- printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
- last_map_addr, end);
-
- if (!after_bootmem)
- early_memtest(start, end);
-
- return last_map_addr >> PAGE_SHIFT;
-}
-
#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -876,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif /* CONFIG_MEMORY_HOTPLUG */
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
- return 0;
- if (!page_is_ram(pagenr))
- return 1;
- return 0;
-}
-
-
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
kcore_modules, kcore_vsyscall;
@@ -985,13 +768,6 @@ void mark_rodata_ro(void)
#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 04102d42ff4..6e60ba698ce 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -18,6 +18,7 @@
#include <asm/iomap.h>
#include <asm/pat.h>
+#include <asm/highmem.h>
#include <linux/module.h>
int is_io_mapping_possible(resource_size_t base, unsigned long size)
@@ -31,16 +32,27 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
}
EXPORT_SYMBOL_GPL(is_io_mapping_possible);
-/* Map 'pfn' using fixed map 'type' and protections 'prot'
- */
-void *
-iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
pagefault_disable();
+ idx = type + KM_TYPE_NR * smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
+ arch_flush_lazy_mmu_mode();
+
+ return (void *)vaddr;
+}
+
+/*
+ * Map 'pfn' using fixed map 'type' and protections 'prot'
+ */
+void *
+iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+{
/*
* For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
* PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
@@ -50,12 +62,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
prot = PAGE_KERNEL_UC_MINUS;
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
- arch_flush_lazy_mmu_mode();
-
- return (void*) vaddr;
+ return kmap_atomic_prot_pfn(pfn, type, prot);
}
EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 433f7bd4648..55e127f71ed 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -22,13 +22,17 @@
#include <asm/pgalloc.h>
#include <asm/pat.h>
-#ifdef CONFIG_X86_64
-
-static inline int phys_addr_valid(unsigned long addr)
+static inline int phys_addr_valid(resource_size_t addr)
{
- return addr < (1UL << boot_cpu_data.x86_phys_bits);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+ return 1;
+#endif
}
+#ifdef CONFIG_X86_64
+
unsigned long __phys_addr(unsigned long x)
{
if (x >= __START_KERNEL_map) {
@@ -38,8 +42,7 @@ unsigned long __phys_addr(unsigned long x)
} else {
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
- !phys_addr_valid(x));
+ VIRTUAL_BUG_ON(!phys_addr_valid(x));
}
return x;
}
@@ -56,10 +59,8 @@ bool __virt_addr_valid(unsigned long x)
if (x < PAGE_OFFSET)
return false;
x -= PAGE_OFFSET;
- if (system_state == SYSTEM_BOOTING ?
- x > MAXMEM : !phys_addr_valid(x)) {
+ if (!phys_addr_valid(x))
return false;
- }
}
return pfn_valid(x >> PAGE_SHIFT);
@@ -68,18 +69,12 @@ EXPORT_SYMBOL(__virt_addr_valid);
#else
-static inline int phys_addr_valid(unsigned long addr)
-{
- return 1;
-}
-
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- /* VMALLOC_* aren't constants; not available at the boot time */
+ /* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
- is_vmalloc_addr((void *) x));
+ VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
return x - PAGE_OFFSET;
}
EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +84,9 @@ bool __virt_addr_valid(unsigned long x)
{
if (x < PAGE_OFFSET)
return false;
- if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+ if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+ return false;
+ if (x >= FIXADDR_START)
return false;
return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
@@ -490,7 +487,12 @@ static int __init early_ioremap_debug_setup(char *str)
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
-static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
+#define __FIXADDR_TOP (-PAGE_SIZE)
+static pte_t bm_pte[(__fix_to_virt(FIX_DBGP_BASE)
+ ^ __fix_to_virt(FIX_BTMAP_BEGIN)) >> PMD_SHIFT
+ ? PAGE_SIZE / sizeof(pte_t) : 0] __page_aligned_bss;
+#undef __FIXADDR_TOP
+static __initdata pte_t *bm_ptep;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
@@ -505,19 +507,33 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
static inline pte_t * __init early_ioremap_pte(unsigned long addr)
{
+ if (!sizeof(bm_pte))
+ return &bm_ptep[pte_index(addr)];
return &bm_pte[pte_index(addr)];
}
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
void __init early_ioremap_init(void)
{
pmd_t *pmd;
+ int i;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_init()\n");
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
- memset(bm_pte, 0, sizeof(bm_pte));
- pmd_populate_kernel(&init_mm, pmd, bm_pte);
+ if (sizeof(bm_pte)) {
+ memset(bm_pte, 0, sizeof(bm_pte));
+ pmd_populate_kernel(&init_mm, pmd, bm_pte);
+ } else {
+ bm_ptep = pte_offset_kernel(pmd, 0);
+ if (early_ioremap_debug)
+ printk(KERN_INFO "bm_ptep=%p\n", bm_ptep);
+ }
/*
* The boot-ioremap range spans multiple pmds, for which
@@ -581,6 +597,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+
static int __init check_early_ioremap_leak(void)
{
int count = 0;
@@ -602,7 +619,8 @@ static int __init check_early_ioremap_leak(void)
}
late_initcall(check_early_ioremap_leak);
-static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
+static void __init __iomem *
+__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
unsigned long offset, last_addr;
unsigned int nrpages;
@@ -668,9 +686,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
--nrpages;
}
if (early_ioremap_debug)
- printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+ printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
- prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0));
+ prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
}
@@ -738,8 +756,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
}
prev_map[slot] = NULL;
}
-
-void __this_fixmap_does_not_exist(void)
-{
- WARN_ON(1);
-}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 93d82038af4..4f115e00486 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,11 +32,14 @@ struct kmmio_fault_page {
struct list_head list;
struct kmmio_fault_page *release_next;
unsigned long page; /* location of the fault page */
+ bool old_presence; /* page presence prior to arming */
+ bool armed;
/*
* Number of times this page has been registered as a part
* of a probe. If zero, page is disarmed and this may be freed.
- * Used only by writers (RCU).
+ * Used only by writers (RCU) and post_kmmio_handler().
+ * Protected by kmmio_lock, when linked into kmmio_page_table.
*/
int count;
};
@@ -105,57 +108,85 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
return NULL;
}
-static void set_page_present(unsigned long addr, bool present,
- unsigned int *pglevel)
+static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+{
+ pmdval_t v = pmd_val(*pmd);
+ *old = !!(v & _PAGE_PRESENT);
+ v &= ~_PAGE_PRESENT;
+ if (present)
+ v |= _PAGE_PRESENT;
+ set_pmd(pmd, __pmd(v));
+}
+
+static void set_pte_presence(pte_t *pte, bool present, bool *old)
+{
+ pteval_t v = pte_val(*pte);
+ *old = !!(v & _PAGE_PRESENT);
+ v &= ~_PAGE_PRESENT;
+ if (present)
+ v |= _PAGE_PRESENT;
+ set_pte_atomic(pte, __pte(v));
+}
+
+static int set_page_presence(unsigned long addr, bool present, bool *old)
{
- pteval_t pteval;
- pmdval_t pmdval;
unsigned int level;
- pmd_t *pmd;
pte_t *pte = lookup_address(addr, &level);
if (!pte) {
pr_err("kmmio: no pte for page 0x%08lx\n", addr);
- return;
+ return -1;
}
- if (pglevel)
- *pglevel = level;
-
switch (level) {
case PG_LEVEL_2M:
- pmd = (pmd_t *)pte;
- pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
- if (present)
- pmdval |= _PAGE_PRESENT;
- set_pmd(pmd, __pmd(pmdval));
+ set_pmd_presence((pmd_t *)pte, present, old);
break;
-
case PG_LEVEL_4K:
- pteval = pte_val(*pte) & ~_PAGE_PRESENT;
- if (present)
- pteval |= _PAGE_PRESENT;
- set_pte_atomic(pte, __pte(pteval));
+ set_pte_presence(pte, present, old);
break;
-
default:
pr_err("kmmio: unexpected page level 0x%x.\n", level);
- return;
+ return -1;
}
__flush_tlb_one(addr);
+ return 0;
}
-/** Mark the given page as not present. Access to it will trigger a fault. */
-static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+/*
+ * Mark the given page as not present. Access to it will trigger a fault.
+ *
+ * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
+ * protection is ignored here. RCU read lock is assumed held, so the struct
+ * will not disappear unexpectedly. Furthermore, the caller must guarantee,
+ * that double arming the same virtual address (page) cannot occur.
+ *
+ * Double disarming on the other hand is allowed, and may occur when a fault
+ * and mmiotrace shutdown happen simultaneously.
+ */
+static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
{
- set_page_present(page & PAGE_MASK, false, pglevel);
+ int ret;
+ WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
+ if (f->armed) {
+ pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
+ f->page, f->count, f->old_presence);
+ }
+ ret = set_page_presence(f->page, false, &f->old_presence);
+ WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
+ f->armed = true;
+ return ret;
}
-/** Mark the given page as present. */
-static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+/** Restore the given page to saved presence state. */
+static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
- set_page_present(page & PAGE_MASK, true, pglevel);
+ bool tmp;
+ int ret = set_page_presence(f->page, f->old_presence, &tmp);
+ WARN_ONCE(ret < 0,
+ KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
+ f->armed = false;
}
/*
@@ -202,28 +233,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
ctx = &get_cpu_var(kmmio_ctx);
if (ctx->active) {
- disarm_kmmio_fault_page(faultpage->page, NULL);
if (addr == ctx->addr) {
/*
- * On SMP we sometimes get recursive probe hits on the
- * same address. Context is already saved, fall out.
+ * A second fault on the same page means some other
+ * condition needs handling by do_page_fault(), the
+ * page really not being present is the most common.
*/
- pr_debug("kmmio: duplicate probe hit on CPU %d, for "
- "address 0x%08lx.\n",
- smp_processor_id(), addr);
- ret = 1;
- goto no_kmmio_ctx;
- }
- /*
- * Prevent overwriting already in-flight context.
- * This should not happen, let's hope disarming at least
- * prevents a panic.
- */
- pr_emerg("kmmio: recursive probe hit on CPU %d, "
+ pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n",
+ addr, smp_processor_id());
+
+ if (!faultpage->old_presence)
+ pr_info("kmmio: unexpected secondary hit for "
+ "address 0x%08lx on CPU %d.\n", addr,
+ smp_processor_id());
+ } else {
+ /*
+ * Prevent overwriting already in-flight context.
+ * This should not happen, let's hope disarming at
+ * least prevents a panic.
+ */
+ pr_emerg("kmmio: recursive probe hit on CPU %d, "
"for address 0x%08lx. Ignoring.\n",
smp_processor_id(), addr);
- pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
- ctx->addr);
+ pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
+ ctx->addr);
+ disarm_kmmio_fault_page(faultpage);
+ }
goto no_kmmio_ctx;
}
ctx->active++;
@@ -244,7 +279,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
regs->flags &= ~X86_EFLAGS_IF;
/* Now we set present bit in PTE and single step. */
- disarm_kmmio_fault_page(ctx->fpage->page, NULL);
+ disarm_kmmio_fault_page(ctx->fpage);
/*
* If another cpu accesses the same page while we are stepping,
@@ -283,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
if (ctx->probe && ctx->probe->post_handler)
ctx->probe->post_handler(ctx->probe, condition, regs);
- arm_kmmio_fault_page(ctx->fpage->page, NULL);
+ /* Prevent racing against release_kmmio_fault_page(). */
+ spin_lock(&kmmio_lock);
+ if (ctx->fpage->count)
+ arm_kmmio_fault_page(ctx->fpage);
+ spin_unlock(&kmmio_lock);
regs->flags &= ~X86_EFLAGS_TF;
regs->flags |= ctx->saved_flags;
@@ -315,20 +354,24 @@ static int add_kmmio_fault_page(unsigned long page)
f = get_kmmio_fault_page(page);
if (f) {
if (!f->count)
- arm_kmmio_fault_page(f->page, NULL);
+ arm_kmmio_fault_page(f);
f->count++;
return 0;
}
- f = kmalloc(sizeof(*f), GFP_ATOMIC);
+ f = kzalloc(sizeof(*f), GFP_ATOMIC);
if (!f)
return -1;
f->count = 1;
f->page = page;
- list_add_rcu(&f->list, kmmio_page_list(f->page));
- arm_kmmio_fault_page(f->page, NULL);
+ if (arm_kmmio_fault_page(f)) {
+ kfree(f);
+ return -1;
+ }
+
+ list_add_rcu(&f->list, kmmio_page_list(f->page));
return 0;
}
@@ -347,7 +390,7 @@ static void release_kmmio_fault_page(unsigned long page,
f->count--;
BUG_ON(f->count < 0);
if (!f->count) {
- disarm_kmmio_fault_page(f->page, NULL);
+ disarm_kmmio_fault_page(f);
f->release_next = *release_list;
*release_list = f;
}
@@ -408,23 +451,24 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
static void remove_kmmio_fault_pages(struct rcu_head *head)
{
- struct kmmio_delayed_release *dr = container_of(
- head,
- struct kmmio_delayed_release,
- rcu);
+ struct kmmio_delayed_release *dr =
+ container_of(head, struct kmmio_delayed_release, rcu);
struct kmmio_fault_page *p = dr->release_list;
struct kmmio_fault_page **prevp = &dr->release_list;
unsigned long flags;
+
spin_lock_irqsave(&kmmio_lock, flags);
while (p) {
- if (!p->count)
+ if (!p->count) {
list_del_rcu(&p->list);
- else
+ prevp = &p->release_next;
+ } else {
*prevp = p->release_next;
- prevp = &p->release_next;
+ }
p = p->release_next;
}
spin_unlock_irqrestore(&kmmio_lock, flags);
+
/* This is the real RCU destroy call. */
call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 0bcd7883d03..605c8be0621 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg)
{
if (arg)
memtest_pattern = simple_strtoul(arg, NULL, 0);
+ else
+ memtest_pattern = ARRAY_SIZE(patterns);
+
return 0;
}
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 451fe95a035..3daefa04ace 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn,
for_each_online_node(nid)
propagate_e820_map_node(nid);
- for_each_online_node(nid)
+ for_each_online_node(nid) {
memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+ NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+ }
- NODE_DATA(0)->bdata = &bootmem_node_data[0];
setup_bootmem_allocator();
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8253bc97587..9c4294986af 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -522,6 +522,17 @@ static int split_large_page(pte_t *kpte, unsigned long address)
* primary protection behavior:
*/
__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+
+ /*
+ * Intel Atom errata AAH41 workaround.
+ *
+ * The real fix should be in hw or in a microcode update, but
+ * we also probabilistically try to reduce the window of having
+ * a large TLB mixed with 4K TLBs while instruction fetches are
+ * going on.
+ */
+ __flush_tlb_all();
+
base = NULL;
out_unlock:
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2ed37158012..640339ee4fb 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -677,10 +677,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
/*
- * reserve_pfn_range() doesn't support RAM pages.
+ * reserve_pfn_range() doesn't support RAM pages. Maintain the current
+ * behavior with RAM pages by returning success.
*/
if (is_ram != 0)
- return -EINVAL;
+ return 0;
ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
if (ret)
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index ab50a8d7402..427fd1b56df 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,5 +1,5 @@
/*
- * Written by Pekka Paalanen, 2008 <pq@iki.fi>
+ * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
*/
#include <linux/module.h>
#include <linux/io.h>
@@ -9,35 +9,74 @@
static unsigned long mmio_address;
module_param(mmio_address, ulong, 0);
-MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
+MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
+ "(or 8 MB if read_far is non-zero).");
+
+static unsigned long read_far = 0x400100;
+module_param(read_far, ulong, 0);
+MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
+ "(default: 0x400100).");
+
+static unsigned v16(unsigned i)
+{
+ return i * 12 + 7;
+}
+
+static unsigned v32(unsigned i)
+{
+ return i * 212371 + 13;
+}
static void do_write_test(void __iomem *p)
{
unsigned int i;
+ pr_info(MODULE_NAME ": write test.\n");
mmiotrace_printk("Write test.\n");
+
for (i = 0; i < 256; i++)
iowrite8(i, p + i);
+
for (i = 1024; i < (5 * 1024); i += 2)
- iowrite16(i * 12 + 7, p + i);
+ iowrite16(v16(i), p + i);
+
for (i = (5 * 1024); i < (16 * 1024); i += 4)
- iowrite32(i * 212371 + 13, p + i);
+ iowrite32(v32(i), p + i);
}
static void do_read_test(void __iomem *p)
{
unsigned int i;
+ unsigned errs[3] = { 0 };
+ pr_info(MODULE_NAME ": read test.\n");
mmiotrace_printk("Read test.\n");
+
for (i = 0; i < 256; i++)
- ioread8(p + i);
+ if (ioread8(p + i) != i)
+ ++errs[0];
+
for (i = 1024; i < (5 * 1024); i += 2)
- ioread16(p + i);
+ if (ioread16(p + i) != v16(i))
+ ++errs[1];
+
for (i = (5 * 1024); i < (16 * 1024); i += 4)
- ioread32(p + i);
+ if (ioread32(p + i) != v32(i))
+ ++errs[2];
+
+ mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
+ errs[0], errs[1], errs[2]);
}
-static void do_test(void)
+static void do_read_far_test(void __iomem *p)
{
- void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
+ pr_info(MODULE_NAME ": read far test.\n");
+ mmiotrace_printk("Read far test.\n");
+
+ ioread32(p + read_far);
+}
+
+static void do_test(unsigned long size)
+{
+ void __iomem *p = ioremap_nocache(mmio_address, size);
if (!p) {
pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
return;
@@ -45,11 +84,15 @@ static void do_test(void)
mmiotrace_printk("ioremap returned %p.\n", p);
do_write_test(p);
do_read_test(p);
+ if (read_far && read_far < size - 4)
+ do_read_far_test(p);
iounmap(p);
}
static int __init init(void)
{
+ unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
+
if (mmio_address == 0) {
pr_err(MODULE_NAME ": you have to use the module argument "
"mmio_address.\n");
@@ -58,10 +101,11 @@ static int __init init(void)
return -ENXIO;
}
- pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
- "in PCI address space, and writing "
- "rubbish in there.\n", mmio_address);
- do_test();
+ pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
+ "address space, and writing 16 kB of rubbish in there.\n",
+ size >> 10, mmio_address);
+ do_test(size);
+ pr_info(MODULE_NAME ": All done.\n");
return 0;
}
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 82d22fc601a..8c362b96b64 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -90,7 +90,7 @@ static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
return 0;
}
-static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = {
+static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitconst = {
/*
* Systems where PCI IO resource ISA alignment can be skipped
* when the ISA enable bit in the bridge control is not set
@@ -183,7 +183,7 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d)
}
#endif
-static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
+static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
#ifdef __i386__
/*
* Laptops which need pci=assign-busses to see Cardbus cards
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 7d388d5cf54..9c49919e4d1 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -356,7 +356,7 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
-static struct dmi_system_id __devinitdata msi_k8t_dmi_table[] = {
+static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = {
{
.ident = "MSI-K8T-Neo2Fir",
.matches = {
@@ -413,7 +413,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
*/
static u16 toshiba_line_size;
-static struct dmi_system_id __devinitdata toshiba_ohci1394_dmi_table[] = {
+static const struct dmi_system_id __devinitconst toshiba_ohci1394_dmi_table[] = {
{
.ident = "Toshiba PS5 based laptop",
.matches = {
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c52f4034c7f..82cd39a6cbd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
vcpup = &per_cpu(xen_vcpu_info, cpu);
- info.mfn = virt_to_mfn(vcpup);
+ info.mfn = arbitrary_virt_to_mfn(vcpup);
info.offset = offset_in_page(vcpup);
printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
frames = mcs.args;
for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
- frames[f] = virt_to_mfn(va);
+ frames[f] = arbitrary_virt_to_mfn((void *)va);
+
make_lowmem_page_readonly((void *)va);
+ make_lowmem_page_readonly(mfn_to_virt(frames[f]));
}
MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+ xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
struct multicall_space mc = __xen_mc_entry(0);
MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
break;
default: {
- xmaddr_t maddr = virt_to_machine(&dt[entry]);
+ xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
xen_mc_flush();
if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 319bd40a57c..cb6afa4ec95 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
p2m_top[topidx][idx] = mfn;
}
+unsigned long arbitrary_virt_to_mfn(void *vaddr)
+{
+ xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
+
+ return PFN_DOWN(maddr.maddr);
+}
+
xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
unsigned long address = (unsigned long)vaddr;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 035582ae815..8d470562ffc 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
struct desc_struct *gdt;
+ unsigned long gdt_mfn;
if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
return 0;
@@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->ldt_ents = 0;
BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+
+ gdt_mfn = arbitrary_virt_to_mfn(gdt);
make_lowmem_page_readonly(gdt);
+ make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
- ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+ ctxt->gdt_frames[0] = gdt_mfn;
ctxt->gdt_ents = GDT_ENTRIES;
ctxt->user_regs.cs = __KERNEL_CS;