284 files changed, 13105 insertions, 7546 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e18dfd3c3b7..bcf4cae711b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -34,12 +34,18 @@ config X86
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
+	select HAVE_FTRACE_SYSCALLS
 	select HAVE_KVM
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select USER_STACKTRACE_SUPPORT
+	select HAVE_DMA_API_DEBUG
+	select HAVE_KERNEL_GZIP
+	select HAVE_KERNEL_BZIP2
+	select HAVE_KERNEL_LZMA
 
 config ARCH_DEFCONFIG
 	string
@@ -135,6 +141,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
+config HAVE_DYNAMIC_PER_CPU_AREA
+	def_bool y
+
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
 
@@ -158,11 +167,17 @@ config AUDIT_ARCH
 config ARCH_SUPPORTS_OPTIMIZED_INLINING
 	def_bool y
 
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+	def_bool y
+
 # Use the generic interrupt handling code in kernel/irq/:
 config GENERIC_HARDIRQS
 	bool
 	default y
 
+config GENERIC_HARDIRQS_NO__DO_IRQ
+       def_bool y
+
 config GENERIC_IRQ_PROBE
 	bool
 	default y
@@ -778,6 +793,11 @@ config X86_MCE_AMD
 	   Additional support for AMD specific MCE features such as
 	   the DRAM Error Threshold.
 
+config X86_MCE_THRESHOLD
+	depends on X86_MCE_AMD || X86_MCE_INTEL
+	bool
+	default y
+
 config X86_MCE_NONFATAL
 	tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
 	depends on X86_32 && X86_MCE
@@ -921,6 +941,12 @@ config X86_CPUID
 	  with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
 	  /dev/cpu/31/cpuid.
 
+config X86_CPU_DEBUG
+	tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
+	---help---
+	  If you select this option, this will provide various x86 CPUs
+	  information through debugfs.
+
 choice
 	prompt "High Memory Support"
 	default HIGHMEM4G if !X86_NUMAQ
@@ -1113,7 +1139,7 @@ config NUMA_EMU
 
 config NODES_SHIFT
 	int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
-	range 1 9   if X86_64
+	range 1 9
 	default "9" if MAXSMP
 	default "6" if X86_64
 	default "4" if X86_NUMAQ
@@ -1121,9 +1147,9 @@ config NODES_SHIFT
 	depends on NEED_MULTIPLE_NODES
 	---help---
 	  Specify the maximum number of NUMA Nodes available on the target
-	  system.  Increases memory reserved to accomodate various tables.
+	  system.  Increases memory reserved to accommodate various tables.
 
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
 	def_bool y
 	depends on X86_32 && NUMA
 
@@ -1299,7 +1325,7 @@ config MTRR_SANITIZER
 	  add writeback entries.
 
 	  Can be disabled with disable_mtrr_cleanup on the kernel command line.
-	  The largest mtrr entry size for a continous block can be set with
+	  The largest mtrr entry size for a continuous block can be set with
 	  mtrr_chunk_size.
 
 	  If unsure, say Y.
@@ -1421,7 +1447,7 @@ config CRASH_DUMP
 config KEXEC_JUMP
 	bool "kexec jump (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
-	depends on KEXEC && HIBERNATION && X86_32
+	depends on KEXEC && HIBERNATION
 	---help---
 	  Jump between original kernel and kexeced kernel and invoke
 	  code in physical address mode via KEXEC
@@ -1814,8 +1840,8 @@ config PCI_MMCONFIG
 
 config DMAR
 	bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
-	depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
-	---help---
+	depends on PCI_MSI && ACPI && EXPERIMENTAL
+	help
 	  DMA remapping (DMAR) devices support enables independent address
 	  translations for Direct Memory Access (DMA) from devices.
 	  These DMA remapping devices are reported via ACPI tables
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a95eaf0e582..924e156a85a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -456,24 +456,9 @@ config CPU_SUP_AMD
 
 	  If unsure, say N.
 
-config CPU_SUP_CENTAUR_32
+config CPU_SUP_CENTAUR
 	default y
 	bool "Support Centaur processors" if PROCESSOR_SELECT
-	depends on !64BIT
-	---help---
-	  This enables detection, tunings and quirks for Centaur processors
-
-	  You need this enabled if you want your kernel to run on a
-	  Centaur CPU. Disabling this option on other types of CPUs
-	  makes the kernel a tiny bit smaller. Disabling it on a Centaur
-	  CPU might render the kernel unbootable.
-
-	  If unsure, say N.
-
-config CPU_SUP_CENTAUR_64
-	default y
-	bool "Support Centaur processors" if PROCESSOR_SELECT
-	depends on 64BIT
 	---help---
 	  This enables detection, tunings and quirks for Centaur processors
 
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index fdb45df608b..d8359e73317 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -72,14 +72,6 @@ config DEBUG_STACK_USAGE
 
 	  This option will slow down process creation somewhat.
 
-config DEBUG_PAGEALLOC
-	bool "Debug page memory allocations"
-	depends on DEBUG_KERNEL
-	---help---
-	  Unmap pages from the kernel linear mapping after free_pages().
-	  This results in a large slowdown, but helps to find certain types
-	  of memory corruptions.
-
 config DEBUG_PER_CPU_MAPS
 	bool "Debug access to per_cpu maps"
 	depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1836191839e..f05d8c91d9e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -153,34 +153,23 @@ endif
 
 boot := arch/x86/boot
 
-PHONY += zImage bzImage compressed zlilo bzlilo \
-         zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
+BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage install
+
+PHONY += bzImage $(BOOT_TARGETS)
 
 # Default kernel to build
 all: bzImage
 
 # KBUILD_IMAGE specify target image being built
-                    KBUILD_IMAGE := $(boot)/bzImage
-zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage
+KBUILD_IMAGE := $(boot)/bzImage
 
-zImage bzImage: vmlinux
+bzImage: vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
 	$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
 	$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
 
-compressed: zImage
-
-zlilo bzlilo: vmlinux
-	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
-
-zdisk bzdisk: vmlinux
-	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
-
-fdimage fdimage144 fdimage288 isoimage: vmlinux
-	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
-
-install:
-	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
+$(BOOT_TARGETS): vmlinux
+	$(Q)$(MAKE) $(build)=$(boot) $@
 
 PHONY += vdso_install
 vdso_install:
@@ -205,7 +194,3 @@ define archhelp
   echo  '                  FDARGS="..."  arguments for the booted kernel'
   echo  '                  FDINITRD=file initrd for the booted kernel'
 endef
-
-CLEAN_FILES += arch/x86/boot/fdimage \
-	       arch/x86/boot/image.iso \
-	       arch/x86/boot/mtools.conf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index c70eff69a1f..6633b6e7505 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -6,26 +6,24 @@
 # for more details.
 #
 # Copyright (C) 1994 by Linus Torvalds
+# Changed by many, many contributors over the years.
 #
 
 # ROOT_DEV specifies the default root-device when making the image.
 # This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
 # the default of FLOPPY is used by 'build'.
 
-ROOT_DEV := CURRENT
+ROOT_DEV	:= CURRENT
 
 # If you want to preset the SVGA mode, uncomment the next line and
 # set SVGA_MODE to whatever number you want.
 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
 # The number is the same as you would ordinarily press at bootup.
 
-SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
+SVGA_MODE	:= -DSVGA_MODE=NORMAL_VGA
 
-# If you want the RAM disk device, define this to be the size in blocks.
-
-#RAMDISK := -DRAMDISK=512
-
-targets		:= vmlinux.bin setup.bin setup.elf zImage bzImage
+targets		:= vmlinux.bin setup.bin setup.elf bzImage
+targets		+= fdimage fdimage144 fdimage288 image.iso mtools.conf
 subdir-		:= compressed
 
 setup-y		+= a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
@@ -59,6 +57,7 @@ $(obj)/cpustr.h: $(obj)/mkcpustr FORCE
 # How to compile the 16-bit code.  Note we always compile for -march=i386,
 # that way we can complain to the user if the CPU is insufficient.
 KBUILD_CFLAGS	:= $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
+		   -DDISABLE_BRANCH_PROFILING \
 		   -Wall -Wstrict-prototypes \
 		   -march=i386 -mregparm=3 \
 		   -include $(srctree)/$(src)/code16gcc.h \
@@ -68,20 +67,16 @@ KBUILD_CFLAGS	:= $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
 			$(call cc-option, -fno-unit-at-a-time)) \
 		   $(call cc-option, -fno-stack-protector) \
 		   $(call cc-option, -mpreferred-stack-boundary=2)
-KBUILD_CFLAGS +=   $(call cc-option,-m32)
+KBUILD_CFLAGS	+= $(call cc-option, -m32)
 KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
 
-$(obj)/zImage:  asflags-y := $(SVGA_MODE) $(RAMDISK)
-$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
-$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
-$(obj)/bzImage: BUILDFLAGS   := -b
+$(obj)/bzImage: asflags-y  := $(SVGA_MODE)
 
 quiet_cmd_image = BUILD   $@
-cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \
-	    $(obj)/vmlinux.bin $(ROOT_DEV) > $@
+cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
+	$(ROOT_DEV) > $@
 
-$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \
-			      $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
 	$(call if_changed,image)
 	@echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
 
@@ -116,9 +111,11 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
 $(obj)/compressed/vmlinux: FORCE
 	$(Q)$(MAKE) $(build)=$(obj)/compressed $@
 
-# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
+# Set this if you want to pass append arguments to the
+# bzdisk/fdimage/isoimage kernel
 FDARGS =
-# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel
+# Set this if you want an initrd included with the
+# bzdisk/fdimage/isoimage kernel
 FDINITRD =
 
 image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
@@ -127,7 +124,7 @@ $(obj)/mtools.conf: $(src)/mtools.conf.in
 	sed -e 's|@OBJ@|$(obj)|g' < $< > $@
 
 # This requires write access to /dev/fd0
-zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
+bzdisk: $(obj)/bzImage $(obj)/mtools.conf
 	MTOOLSRC=$(obj)/mtools.conf mformat a:			; sync
 	syslinux /dev/fd0					; sync
 	echo '$(image_cmdline)' | \
@@ -135,10 +132,10 @@ zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
 	if [ -f '$(FDINITRD)' ] ; then \
 		MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
 	fi
-	MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux	; sync
+	MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux	; sync
 
 # These require being root or having syslinux 2.02 or higher installed
-fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
+fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
 	dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
 	MTOOLSRC=$(obj)/mtools.conf mformat v:			; sync
 	syslinux $(obj)/fdimage					; sync
@@ -147,9 +144,9 @@ fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
 	if [ -f '$(FDINITRD)' ] ; then \
 		MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
 	fi
-	MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux	; sync
+	MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux	; sync
 
-fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
+fdimage288: $(obj)/bzImage $(obj)/mtools.conf
 	dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
 	MTOOLSRC=$(obj)/mtools.conf mformat w:			; sync
 	syslinux $(obj)/fdimage					; sync
@@ -158,9 +155,9 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
 	if [ -f '$(FDINITRD)' ] ; then \
 		MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
 	fi
-	MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux	; sync
+	MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux	; sync
 
-isoimage: $(BOOTIMAGE)
+isoimage: $(obj)/bzImage
 	-rm -rf $(obj)/isoimage
 	mkdir $(obj)/isoimage
 	for i in lib lib64 share end ; do \
@@ -170,7 +167,7 @@ isoimage: $(BOOTIMAGE)
 		fi ; \
 		if [ $$i = end ] ; then exit 1 ; fi ; \
 	done
-	cp $(BOOTIMAGE) $(obj)/isoimage/linux
+	cp $(obj)/bzImage $(obj)/isoimage/linux
 	echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
 	if [ -f '$(FDINITRD)' ] ; then \
 		cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
@@ -181,12 +178,13 @@ isoimage: $(BOOTIMAGE)
 	isohybrid $(obj)/image.iso 2>/dev/null || true
 	rm -rf $(obj)/isoimage
 
-zlilo: $(BOOTIMAGE)
+bzlilo: $(obj)/bzImage
 	if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
 	if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
-	cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz
+	cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
 	cp System.map $(INSTALL_PATH)/
 	if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
 
 install:
-	sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
+	sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
+		System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 1771c804e02..65551c9f857 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,10 +4,11 @@
 # create a compressed vmlinux image from the original vmlinux
 #
 
-targets := vmlinux vmlinux.bin vmlinux.bin.gz head_$(BITS).o misc.o piggy.o
+targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o
 
 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
+KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 cflags-$(CONFIG_X86_64) := -mcmodel=small
 KBUILD_CFLAGS += $(cflags-y)
 KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
@@ -47,18 +48,35 @@ ifeq ($(CONFIG_X86_32),y)
 ifdef CONFIG_RELOCATABLE
 $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
 	$(call if_changed,gzip)
+$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
+	$(call if_changed,bzip2)
+$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
+	$(call if_changed,lzma)
 else
 $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
 	$(call if_changed,gzip)
+$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,bzip2)
+$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,lzma)
 endif
 LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
 
 else
+
 $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
 	$(call if_changed,gzip)
+$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,bzip2)
+$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,lzma)
 
 LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
 endif
 
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
+suffix_$(CONFIG_KERNEL_GZIP)  = gz
+suffix_$(CONFIG_KERNEL_BZIP2) = bz2
+suffix_$(CONFIG_KERNEL_LZMA)  = lzma
+
+$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
 	$(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index da062216948..e45be73684f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -116,71 +116,13 @@
 /*
  * gzip declarations
  */
-
-#define OF(args)	args
 #define STATIC		static
 
 #undef memset
 #undef memcpy
 #define memzero(s, n)	memset((s), 0, (n))
 
-typedef unsigned char	uch;
-typedef unsigned short	ush;
-typedef unsigned long	ulg;
-
-/*
- * Window size must be at least 32k, and a power of two.
- * We don't actually have a window just a huge output buffer,
- * so we report a 2G window size, as that should always be
- * larger than our output buffer:
- */
-#define WSIZE		0x80000000
-
-/* Input buffer: */
-static unsigned char	*inbuf;
-
-/* Sliding window buffer (and final output buffer): */
-static unsigned char	*window;
-
-/* Valid bytes in inbuf: */
-static unsigned		insize;
-
-/* Index of next byte to be processed in inbuf: */
-static unsigned		inptr;
-
-/* Bytes in output buffer: */
-static unsigned		outcnt;
-
-/* gzip flag byte */
-#define ASCII_FLAG	0x01 /* bit 0 set: file probably ASCII text */
-#define CONTINUATION	0x02 /* bit 1 set: continuation of multi-part gz file */
-#define EXTRA_FIELD	0x04 /* bit 2 set: extra field present */
-#define ORIG_NAM	0x08 /* bit 3 set: original file name present */
-#define COMMENT		0x10 /* bit 4 set: file comment present */
-#define ENCRYPTED	0x20 /* bit 5 set: file is encrypted */
-#define RESERVED	0xC0 /* bit 6, 7:  reserved */
-
-#define get_byte()	(inptr < insize ? inbuf[inptr++] : fill_inbuf())
-
-/* Diagnostic functions */
-#ifdef DEBUG
-#  define Assert(cond, msg) do { if (!(cond)) error(msg); } while (0)
-#  define Trace(x)	do { fprintf x; } while (0)
-#  define Tracev(x)	do { if (verbose) fprintf x ; } while (0)
-#  define Tracevv(x)	do { if (verbose > 1) fprintf x ; } while (0)
-#  define Tracec(c, x)	do { if (verbose && (c)) fprintf x ; } while (0)
-#  define Tracecv(c, x)	do { if (verbose > 1 && (c)) fprintf x ; } while (0)
-#else
-#  define Assert(cond, msg)
-#  define Trace(x)
-#  define Tracev(x)
-#  define Tracevv(x)
-#  define Tracec(c, x)
-#  define Tracecv(c, x)
-#endif
 
-static int  fill_inbuf(void);
-static void flush_window(void);
 static void error(char *m);
 
 /*
@@ -189,13 +131,8 @@ static void error(char *m);
 static struct boot_params *real_mode;		/* Pointer to real-mode data */
 static int quiet;
 
-extern unsigned char input_data[];
-extern int input_len;
-
-static long bytes_out;
-
 static void *memset(void *s, int c, unsigned n);
-static void *memcpy(void *dest, const void *src, unsigned n);
+void *memcpy(void *dest, const void *src, unsigned n);
 
 static void __putstr(int, const char *);
 #define putstr(__x)  __putstr(0, __x)
@@ -213,7 +150,17 @@ static char *vidmem;
 static int vidport;
 static int lines, cols;
 
-#include "../../../../lib/inflate.c"
+#ifdef CONFIG_KERNEL_GZIP
+#include "../../../../lib/decompress_inflate.c"
+#endif
+
+#ifdef CONFIG_KERNEL_BZIP2
+#include "../../../../lib/decompress_bunzip2.c"
+#endif
+
+#ifdef CONFIG_KERNEL_LZMA
+#include "../../../../lib/decompress_unlzma.c"
+#endif
 
 static void scroll(void)
 {
@@ -282,7 +229,7 @@ static void *memset(void *s, int c, unsigned n)
 	return s;
 }
 
-static void *memcpy(void *dest, const void *src, unsigned n)
+void *memcpy(void *dest, const void *src, unsigned n)
 {
 	int i;
 	const char *s = src;
@@ -293,38 +240,6 @@ static void *memcpy(void *dest, const void *src, unsigned n)
 	return dest;
 }
 
-/* ===========================================================================
- * Fill the input buffer. This is called only when the buffer is empty
- * and at least one byte is really needed.
- */
-static int fill_inbuf(void)
-{
-	error("ran out of input data");
-	return 0;
-}
-
-/* ===========================================================================
- * Write the output window window[0..outcnt-1] and update crc and bytes_out.
- * (Used for the decompressed data only.)
- */
-static void flush_window(void)
-{
-	/* With my window equal to my output buffer
-	 * I only need to compute the crc here.
-	 */
-	unsigned long c = crc;         /* temporary variable */
-	unsigned n;
-	unsigned char *in, ch;
-
-	in = window;
-	for (n = 0; n < outcnt; n++) {
-		ch = *in++;
-		c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
-	}
-	crc = c;
-	bytes_out += (unsigned long)outcnt;
-	outcnt = 0;
-}
 
 static void error(char *x)
 {
@@ -407,12 +322,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 	lines = real_mode->screen_info.orig_video_lines;
 	cols = real_mode->screen_info.orig_video_cols;
 
-	window = output;		/* Output buffer (Normally at 1M) */
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
-	inbuf  = input_data;		/* Input buffer */
-	insize = input_len;
-	inptr  = 0;
 
 #ifdef CONFIG_X86_64
 	if ((unsigned long)output & (__KERNEL_ALIGN - 1))
@@ -430,10 +341,9 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 #endif
 #endif
 
-	makecrc();
 	if (!quiet)
 		putstr("\nDecompressing Linux... ");
-	gunzip();
+	decompress(input_data, input_len, NULL, NULL, output, NULL, error);
 	parse_elf(output);
 	if (!quiet)
 		putstr("done.\nBooting the kernel.\n");
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 7ccff4884a2..5d84d1c74e4 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -24,12 +24,8 @@
 #include "boot.h"
 #include "offsets.h"
 
-SETUPSECTS	= 4			/* default nr of setup-sectors */
 BOOTSEG		= 0x07C0		/* original address of boot-sector */
-SYSSEG		= DEF_SYSSEG		/* system loaded at 0x10000 (65536) */
-SYSSIZE		= DEF_SYSSIZE		/* system size: # of 16-byte clicks */
-					/* to be loaded */
-ROOT_DEV	= 0			/* ROOT_DEV is now written by "build" */
+SYSSEG		= 0x1000		/* historical load address >> 4 */
 
 #ifndef SVGA_MODE
 #define SVGA_MODE ASK_VGA
@@ -97,12 +93,12 @@ bugger_off_msg:
 	.section ".header", "a"
 	.globl	hdr
 hdr:
-setup_sects:	.byte SETUPSECTS
+setup_sects:	.byte 0			/* Filled in by build.c */
 root_flags:	.word ROOT_RDONLY
-syssize:	.long SYSSIZE
-ram_size:	.word RAMDISK
+syssize:	.long 0			/* Filled in by build.c */
+ram_size:	.word 0			/* Obsolete */
 vid_mode:	.word SVGA_MODE
-root_dev:	.word ROOT_DEV
+root_dev:	.word 0			/* Filled in by build.c */
 boot_flag:	.word 0xAA55
 
 	# offset 512, entry point
@@ -123,14 +119,15 @@ _start:
 					# or else old loadlin-1.5 will fail)
 		.globl realmode_swtch
 realmode_swtch:	.word	0, 0		# default_switch, SETUPSEG
-start_sys_seg:	.word	SYSSEG
+start_sys_seg:	.word	SYSSEG		# obsolete and meaningless, but just
+					# in case something decided to "use" it
 		.word	kernel_version-512 # pointing to kernel version string
 					# above section of header is compatible
 					# with loadlin-1.5 (header v1.5). Don't
 					# change it.
 
-type_of_loader:	.byte	0		# = 0, old one (LILO, Loadlin,
-					#      Bootlin, SYSLX, bootsect...)
+type_of_loader:	.byte	0		# 0 means ancient bootloader, newer
+					# bootloaders know to change this.
 					# See Documentation/i386/boot.txt for
 					# assigned ids
 
@@ -142,11 +139,7 @@ CAN_USE_HEAP	= 0x80			# If set, the loader also has set
 					# space behind setup.S can be used for
 					# heap purposes.
 					# Only the loader knows what is free
-#ifndef __BIG_KERNEL__
-		.byte	0
-#else
 		.byte	LOADED_HIGH
-#endif
 
 setup_move_size: .word  0x8000		# size to move, when setup is not
 					# loaded at 0x90000. We will move setup
@@ -157,11 +150,7 @@ setup_move_size: .word  0x8000		# size to move, when setup is not
 
 code32_start:				# here loaders can put a different
 					# start address for 32-bit code.
-#ifndef __BIG_KERNEL__
-		.long	0x1000		#   0x1000 = default for zImage
-#else
 		.long	0x100000	# 0x100000 = default for big kernel
-#endif
 
 ramdisk_image:	.long	0		# address of loaded ramdisk image
 					# Here the loader puts the 32-bit
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 8c3c25f3557..5054c2ddd1a 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -16,24 +17,38 @@
 
 #define SMAP	0x534d4150	/* ASCII "SMAP" */
 
+struct e820_ext_entry {
+	struct e820entry std;
+	u32 ext_flags;
+} __attribute__((packed));
+
 static int detect_memory_e820(void)
 {
 	int count = 0;
 	u32 next = 0;
-	u32 size, id;
+	u32 size, id, edi;
 	u8 err;
 	struct e820entry *desc = boot_params.e820_map;
+	static struct e820_ext_entry buf; /* static so it is zeroed */
+
+	/*
+	 * Set this here so that if the BIOS doesn't change this field
+	 * but still doesn't change %ecx, we're still okay...
+	 */
+	buf.ext_flags = 1;
 
 	do {
-		size = sizeof(struct e820entry);
+		size = sizeof buf;
 
-		/* Important: %edx is clobbered by some BIOSes,
-		   so it must be either used for the error output
-		   or explicitly marked clobbered. */
-		asm("int $0x15; setc %0"
+		/* Important: %edx and %esi are clobbered by some BIOSes,
+		   so they must be either used for the error output
+		   or explicitly marked clobbered.  Given that, assume there
+		   is something out there clobbering %ebp and %edi, too. */
+		asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
 		    : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
-		      "=m" (*desc)
-		    : "D" (desc), "d" (SMAP), "a" (0xe820));
+		      "=D" (edi), "+m" (buf)
+		    : "D" (&buf), "d" (SMAP), "a" (0xe820)
+		    : "esi");
 
 		/* BIOSes which terminate the chain with CF = 1 as opposed
 		   to %ebx = 0 don't always report the SMAP signature on
@@ -51,8 +66,14 @@ static int detect_memory_e820(void)
 			break;
 		}
 
+		/* ACPI 3.0 added the extended flags support.  If bit 0
+		   in the extended flags is zero, we're supposed to simply
+		   ignore the entry -- a backwards incompatible change! */
+		if (size > 20 && !(buf.ext_flags & 1))
+			continue;
+
+		*desc++ = buf.std;
 		count++;
-		desc++;
 	} while (next && count < ARRAY_SIZE(boot_params.e820_map));
 
 	return boot_params.e820_entries = count;
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 85a1cd8a8ff..8062f891525 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -33,47 +33,6 @@ static void realmode_switch_hook(void)
 }
 
 /*
- * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
- * A bzImage kernel is loaded and runs at 0x100000.
- */
-static void move_kernel_around(void)
-{
-	/* Note: rely on the compile-time option here rather than
-	   the LOADED_HIGH flag.  The Qemu kernel loader unconditionally
-	   sets the loadflags to zero. */
-#ifndef __BIG_KERNEL__
-	u16 dst_seg, src_seg;
-	u32 syssize;
-
-	dst_seg =  0x1000 >> 4;
-	src_seg = 0x10000 >> 4;
-	syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
-
-	while (syssize) {
-		int paras  = (syssize >= 0x1000) ? 0x1000 : syssize;
-		int dwords = paras << 2;
-
-		asm volatile("pushw %%es ; "
-			     "pushw %%ds ; "
-			     "movw %1,%%es ; "
-			     "movw %2,%%ds ; "
-			     "xorw %%di,%%di ; "
-			     "xorw %%si,%%si ; "
-			     "rep;movsl ; "
-			     "popw %%ds ; "
-			     "popw %%es"
-			     : "+c" (dwords)
-			     : "r" (dst_seg), "r" (src_seg)
-			     : "esi", "edi");
-
-		syssize -= paras;
-		dst_seg += paras;
-		src_seg += paras;
-	}
-#endif
-}
-
-/*
  * Disable all interrupts at the legacy PIC.
  */
 static void mask_all_interrupts(void)
@@ -147,9 +106,6 @@ void go_to_protected_mode(void)
 	/* Hook before leaving real mode, also disables interrupts */
 	realmode_switch_hook();
 
-	/* Move the kernel/setup to their final resting places */
-	move_kernel_around();
-
 	/* Enable the A20 gate */
 	if (enable_a20()) {
 		puts("A20 gate not responding, unable to boot...\n");
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index 019c17a7585..3e0edc6d2a2 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -47,6 +47,7 @@ GLOBAL(protected_mode_jump)
 ENDPROC(protected_mode_jump)
 
 	.code32
+	.section ".text32","ax"
 GLOBAL(in_pm32)
 	# Set up data segments for flat 32-bit mode
 	movl	%ecx, %ds
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index df9234b3a5e..bb8dc2de796 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -17,7 +17,8 @@ SECTIONS
 	.header		: { *(.header) }
 	.inittext	: { *(.inittext) }
 	.initdata	: { *(.initdata) }
-	.text		: { *(.text*) }
+	.text		: { *(.text) }
+	.text32		: { *(.text32) }
 
 	. = ALIGN(16);
 	.rodata		: { *(.rodata*) }
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 44dc1923c0e..ee3a4ea923a 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
 
 static void usage(void)
 {
-	die("Usage: build [-b] setup system [rootdev] [> image]");
+	die("Usage: build setup system [rootdev] [> image]");
 }
 
 int main(int argc, char ** argv)
@@ -145,11 +145,6 @@ int main(int argc, char ** argv)
 	void *kernel;
 	u32 crc = 0xffffffffUL;
 
-	if (argc > 2 && !strcmp(argv[1], "-b"))
-	  {
-	    is_big_kernel = 1;
-	    argc--, argv++;
-	  }
 	if ((argc < 3) || (argc > 4))
 		usage();
 	if (argc > 3) {
@@ -216,8 +211,6 @@ int main(int argc, char ** argv)
 		die("Unable to mmap '%s': %m", argv[2]);
 	/* Number of 16-byte paragraphs, including space for a 4-byte CRC */
 	sys_size = (sz + 15 + 4) / 16;
-	if (!is_big_kernel && sys_size > DEF_SYSSIZE)
-		die("System is too big. Try using bzImage or modules.");
 
 	/* Patch the setup code with the appropriate size parameters */
 	buf[0x1f1] = setup_sectors-1;
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 5d4742ed4aa..95d86ce0421 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -129,41 +129,45 @@ u16 vga_crtc(void)
 	return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4;
 }
 
-static void vga_set_480_scanlines(int end)
+static void vga_set_480_scanlines(int lines)
 {
-	u16 crtc;
-	u8  csel;
+	u16 crtc;		/* CRTC base address */
+	u8  csel;		/* CRTC miscellaneous output register */
+	u8  ovfw;		/* CRTC overflow register */
+	int end = lines-1;
 
 	crtc = vga_crtc();
 
+	ovfw = 0x3c | ((end >> (8-1)) & 0x02) | ((end >> (9-6)) & 0x40);
+
 	out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */
 	out_idx(0x0b, crtc, 0x06); /* Vertical total */
-	out_idx(0x3e, crtc, 0x07); /* Vertical overflow */
+	out_idx(ovfw, crtc, 0x07); /* Vertical overflow */
 	out_idx(0xea, crtc, 0x10); /* Vertical sync start */
-	out_idx(end, crtc, 0x12); /* Vertical display end */
+	out_idx(end,  crtc, 0x12); /* Vertical display end */
 	out_idx(0xe7, crtc, 0x15); /* Vertical blank start */
 	out_idx(0x04, crtc, 0x16); /* Vertical blank end */
 	csel = inb(0x3cc);
 	csel &= 0x0d;
 	csel |= 0xe2;
-	outb(csel, 0x3cc);
+	outb(csel, 0x3c2);
 }
 
 static void vga_set_80x30(void)
 {
-	vga_set_480_scanlines(0xdf);
+	vga_set_480_scanlines(30*16);
 }
 
 static void vga_set_80x34(void)
 {
 	vga_set_14font();
-	vga_set_480_scanlines(0xdb);
+	vga_set_480_scanlines(34*14);
 }
 
 static void vga_set_80x60(void)
 {
 	vga_set_8font();
-	vga_set_480_scanlines(0xdf);
+	vga_set_480_scanlines(60*8);
 }
 
 static int vga_set_mode(struct mode_info *mode)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 903de4aa509..ebe7deedd5b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 
@@ -19,3 +20,5 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
+
+aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
index e41b147f450..b949ec2f9af 100644
--- a/arch/x86/crypto/aes-i586-asm_32.S
+++ b/arch/x86/crypto/aes-i586-asm_32.S
@@ -41,14 +41,14 @@
 #define tlen 1024   // length of each of 4 'xor' arrays (256 32-bit words)
 
 /* offsets to parameters with one register pushed onto stack */
-#define tfm 8
+#define ctx 8
 #define out_blk 12
 #define in_blk 16
 
-/* offsets in crypto_tfm structure */
-#define klen (crypto_tfm_ctx_offset + 0)
-#define ekey (crypto_tfm_ctx_offset + 4)
-#define dkey (crypto_tfm_ctx_offset + 244)
+/* offsets in crypto_aes_ctx structure */
+#define klen (480)
+#define ekey (0)
+#define dkey (240)
 
 // register mapping for encrypt and decrypt subroutines
 
@@ -217,7 +217,7 @@
 	do_col (table, r5,r0,r1,r4, r2,r3);		/* idx=r5 */
 
 // AES (Rijndael) Encryption Subroutine
-/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
+/* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
 
 .global  aes_enc_blk
 
@@ -228,7 +228,7 @@
 
 aes_enc_blk:
 	push    %ebp
-	mov     tfm(%esp),%ebp
+	mov     ctx(%esp),%ebp
 
 // CAUTION: the order and the values used in these assigns 
 // rely on the register mappings
@@ -292,7 +292,7 @@ aes_enc_blk:
 	ret
 
 // AES (Rijndael) Decryption Subroutine
-/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
+/* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
 
 .global  aes_dec_blk
 
@@ -303,7 +303,7 @@ aes_enc_blk:
 
 aes_dec_blk:
 	push    %ebp
-	mov     tfm(%esp),%ebp
+	mov     ctx(%esp),%ebp
 
 // CAUTION: the order and the values used in these assigns 
 // rely on the register mappings
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
index a120f526c3d..5b577d5a059 100644
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -17,8 +17,6 @@
 
 #include <asm/asm-offsets.h>
 
-#define BASE crypto_tfm_ctx_offset
-
 #define R1	%rax
 #define R1E	%eax
 #define R1X	%ax
@@ -56,13 +54,13 @@
 	.align	8;			\
 FUNC:	movq	r1,r2;			\
 	movq	r3,r4;			\
-	leaq	BASE+KEY+48+4(r8),r9;	\
+	leaq	KEY+48(r8),r9;		\
 	movq	r10,r11;		\
 	movl	(r7),r5 ## E;		\
 	movl	4(r7),r1 ## E;		\
 	movl	8(r7),r6 ## E;		\
 	movl	12(r7),r7 ## E;		\
-	movl	BASE+0(r8),r10 ## E;	\
+	movl	480(r8),r10 ## E;	\
 	xorl	-48(r9),r5 ## E;	\
 	xorl	-44(r9),r1 ## E;	\
 	xorl	-40(r9),r6 ## E;	\
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
index 71f45782711..49ae9fe32b2 100644
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -5,17 +5,29 @@
 
 #include <crypto/aes.h>
 
-asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
-asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
+asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
+asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
+
+void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+{
+	aes_enc_blk(ctx, dst, src);
+}
+EXPORT_SYMBOL_GPL(crypto_aes_encrypt_x86);
+
+void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+{
+	aes_dec_blk(ctx, dst, src);
+}
+EXPORT_SYMBOL_GPL(crypto_aes_decrypt_x86);
 
 static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	aes_enc_blk(tfm, dst, src);
+	aes_enc_blk(crypto_tfm_ctx(tfm), dst, src);
 }
 
 static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
-	aes_dec_blk(tfm, dst, src);
+	aes_dec_blk(crypto_tfm_ctx(tfm), dst, src);
 }
 
 static struct crypto_alg aes_alg = {
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
new file mode 100644
index 00000000000..caba9960170
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -0,0 +1,896 @@
+/*
+ * Implement AES algorithm in Intel AES-NI instructions.
+ *
+ * The white paper of AES-NI instructions can be downloaded from:
+ *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
+ *
+ * Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *            Vinodh Gopal <vinodh.gopal@intel.com>
+ *            Kahraman Akdemir
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.text
+
+#define STATE1	%xmm0
+#define STATE2	%xmm4
+#define STATE3	%xmm5
+#define STATE4	%xmm6
+#define STATE	STATE1
+#define IN1	%xmm1
+#define IN2	%xmm7
+#define IN3	%xmm8
+#define IN4	%xmm9
+#define IN	IN1
+#define KEY	%xmm2
+#define IV	%xmm3
+
+#define KEYP	%rdi
+#define OUTP	%rsi
+#define INP	%rdx
+#define LEN	%rcx
+#define IVP	%r8
+#define KLEN	%r9d
+#define T1	%r10
+#define TKEYP	T1
+#define T2	%r11
+
+_key_expansion_128:
+_key_expansion_256a:
+	pshufd $0b11111111, %xmm1, %xmm1
+	shufps $0b00010000, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	shufps $0b10001100, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	pxor %xmm1, %xmm0
+	movaps %xmm0, (%rcx)
+	add $0x10, %rcx
+	ret
+
+_key_expansion_192a:
+	pshufd $0b01010101, %xmm1, %xmm1
+	shufps $0b00010000, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	shufps $0b10001100, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	pxor %xmm1, %xmm0
+
+	movaps %xmm2, %xmm5
+	movaps %xmm2, %xmm6
+	pslldq $4, %xmm5
+	pshufd $0b11111111, %xmm0, %xmm3
+	pxor %xmm3, %xmm2
+	pxor %xmm5, %xmm2
+
+	movaps %xmm0, %xmm1
+	shufps $0b01000100, %xmm0, %xmm6
+	movaps %xmm6, (%rcx)
+	shufps $0b01001110, %xmm2, %xmm1
+	movaps %xmm1, 16(%rcx)
+	add $0x20, %rcx
+	ret
+
+_key_expansion_192b:
+	pshufd $0b01010101, %xmm1, %xmm1
+	shufps $0b00010000, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	shufps $0b10001100, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	pxor %xmm1, %xmm0
+
+	movaps %xmm2, %xmm5
+	pslldq $4, %xmm5
+	pshufd $0b11111111, %xmm0, %xmm3
+	pxor %xmm3, %xmm2
+	pxor %xmm5, %xmm2
+
+	movaps %xmm0, (%rcx)
+	add $0x10, %rcx
+	ret
+
+_key_expansion_256b:
+	pshufd $0b10101010, %xmm1, %xmm1
+	shufps $0b00010000, %xmm2, %xmm4
+	pxor %xmm4, %xmm2
+	shufps $0b10001100, %xmm2, %xmm4
+	pxor %xmm4, %xmm2
+	pxor %xmm1, %xmm2
+	movaps %xmm2, (%rcx)
+	add $0x10, %rcx
+	ret
+
+/*
+ * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ *                   unsigned int key_len)
+ */
+ENTRY(aesni_set_key)
+	movups (%rsi), %xmm0		# user key (first 16 bytes)
+	movaps %xmm0, (%rdi)
+	lea 0x10(%rdi), %rcx		# key addr
+	movl %edx, 480(%rdi)
+	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
+	cmp $24, %dl
+	jb .Lenc_key128
+	je .Lenc_key192
+	movups 0x10(%rsi), %xmm2	# other user key
+	movaps %xmm2, (%rcx)
+	add $0x10, %rcx
+	# aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+	call _key_expansion_256a
+	# aeskeygenassist $0x1, %xmm0, %xmm1
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+	call _key_expansion_256b
+	# aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+	call _key_expansion_256a
+	# aeskeygenassist $0x2, %xmm0, %xmm1
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+	call _key_expansion_256b
+	# aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+	call _key_expansion_256a
+	# aeskeygenassist $0x4, %xmm0, %xmm1
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+	call _key_expansion_256b
+	# aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+	call _key_expansion_256a
+	# aeskeygenassist $0x8, %xmm0, %xmm1
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+	call _key_expansion_256b
+	# aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+	call _key_expansion_256a
+	# aeskeygenassist $0x10, %xmm0, %xmm1
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+	call _key_expansion_256b
+	# aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+	call _key_expansion_256a
+	# aeskeygenassist $0x20, %xmm0, %xmm1
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+	call _key_expansion_256b
+	# aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+	call _key_expansion_256a
+	jmp .Ldec_key
+.Lenc_key192:
+	movq 0x10(%rsi), %xmm2		# other user key
+	# aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+	call _key_expansion_192a
+	# aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+	call _key_expansion_192b
+	# aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+	call _key_expansion_192a
+	# aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+	call _key_expansion_192b
+	# aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+	call _key_expansion_192a
+	# aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+	call _key_expansion_192b
+	# aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+	call _key_expansion_192a
+	# aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
+	call _key_expansion_192b
+	jmp .Ldec_key
+.Lenc_key128:
+	# aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+	call _key_expansion_128
+	# aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+	call _key_expansion_128
+	# aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+	call _key_expansion_128
+	# aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+	call _key_expansion_128
+	# aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+	call _key_expansion_128
+	# aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+	call _key_expansion_128
+	# aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
+	call _key_expansion_128
+	# aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
+	call _key_expansion_128
+	# aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
+	call _key_expansion_128
+	# aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
+	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
+	call _key_expansion_128
+.Ldec_key:
+	sub $0x10, %rcx
+	movaps (%rdi), %xmm0
+	movaps (%rcx), %xmm1
+	movaps %xmm0, 240(%rcx)
+	movaps %xmm1, 240(%rdi)
+	add $0x10, %rdi
+	lea 240-16(%rcx), %rsi
+.align 4
+.Ldec_key_loop:
+	movaps (%rdi), %xmm0
+	# aesimc %xmm0, %xmm1
+	.byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
+	movaps %xmm1, (%rsi)
+	add $0x10, %rdi
+	sub $0x10, %rsi
+	cmp %rcx, %rdi
+	jb .Ldec_key_loop
+	xor %rax, %rax
+	ret
+
+/*
+ * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_enc)
+	movl 480(KEYP), KLEN		# key length
+	movups (INP), STATE		# input
+	call _aesni_enc1
+	movups STATE, (OUTP)		# output
+	ret
+
+/*
+ * _aesni_enc1:		internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		round count
+ *	STATE:		initial state (input)
+ * output:
+ *	STATE:		finial state (output)
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_enc1:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE		# round 0
+	add $0x30, TKEYP
+	cmp $24, KLEN
+	jb .Lenc128
+	lea 0x20(TKEYP), TKEYP
+	je .Lenc192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps -0x50(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+.align 4
+.Lenc192:
+	movaps -0x40(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps -0x30(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+.align 4
+.Lenc128:
+	movaps -0x20(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps -0x10(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps (TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps 0x10(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps 0x20(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps 0x30(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps 0x40(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps 0x50(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps 0x60(TKEYP), KEY
+	# aesenc KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	movaps 0x70(TKEYP), KEY
+	# aesenclast KEY, STATE	# last round
+	.byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
+	ret
+
+/*
+ * _aesni_enc4:	internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		round count
+ *	STATE1:		initial state (input)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * output:
+ *	STATE1:		finial state (output)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_enc4:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE1		# round 0
+	pxor KEY, STATE2
+	pxor KEY, STATE3
+	pxor KEY, STATE4
+	add $0x30, TKEYP
+	cmp $24, KLEN
+	jb .L4enc128
+	lea 0x20(TKEYP), TKEYP
+	je .L4enc192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps -0x50(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+#.align 4
+.L4enc192:
+	movaps -0x40(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps -0x30(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+#.align 4
+.L4enc128:
+	movaps -0x20(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps -0x10(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps (TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps 0x10(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps 0x20(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps 0x30(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps 0x40(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps 0x50(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps 0x60(TKEYP), KEY
+	# aesenc KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+	# aesenc KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
+	# aesenc KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea
+	# aesenc KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+	movaps 0x70(TKEYP), KEY
+	# aesenclast KEY, STATE1	# last round
+	.byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
+	# aesenclast KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdd, 0xe2
+	# aesenclast KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdd, 0xea
+	# aesenclast KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdd, 0xf2
+	ret
+
+/*
+ * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_dec)
+	mov 480(KEYP), KLEN		# key length
+	add $240, KEYP
+	movups (INP), STATE		# input
+	call _aesni_dec1
+	movups STATE, (OUTP)		#output
+	ret
+
+/*
+ * _aesni_dec1:		internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		key length
+ *	STATE:		initial state (input)
+ * output:
+ *	STATE:		finial state (output)
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_dec1:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE		# round 0
+	add $0x30, TKEYP
+	cmp $24, KLEN
+	jb .Ldec128
+	lea 0x20(TKEYP), TKEYP
+	je .Ldec192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps -0x50(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+.align 4
+.Ldec192:
+	movaps -0x40(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps -0x30(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+.align 4
+.Ldec128:
+	movaps -0x20(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps -0x10(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps (TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps 0x10(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps 0x20(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps 0x30(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps 0x40(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps 0x50(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps 0x60(TKEYP), KEY
+	# aesdec KEY, STATE
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	movaps 0x70(TKEYP), KEY
+	# aesdeclast KEY, STATE		# last round
+	.byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
+	ret
+
+/*
+ * _aesni_dec4:	internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		key length
+ *	STATE1:		initial state (input)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * output:
+ *	STATE1:		finial state (output)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_dec4:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE1		# round 0
+	pxor KEY, STATE2
+	pxor KEY, STATE3
+	pxor KEY, STATE4
+	add $0x30, TKEYP
+	cmp $24, KLEN
+	jb .L4dec128
+	lea 0x20(TKEYP), TKEYP
+	je .L4dec192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps -0x50(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+.align 4
+.L4dec192:
+	movaps -0x40(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps -0x30(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+.align 4
+.L4dec128:
+	movaps -0x20(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps -0x10(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps (TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps 0x10(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps 0x20(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps 0x30(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps 0x40(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps 0x50(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps 0x60(TKEYP), KEY
+	# aesdec KEY, STATE1
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+	# aesdec KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2
+	# aesdec KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xea
+	# aesdec KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+	movaps 0x70(TKEYP), KEY
+	# aesdeclast KEY, STATE1	# last round
+	.byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
+	# aesdeclast KEY, STATE2
+	.byte 0x66, 0x0f, 0x38, 0xdf, 0xe2
+	# aesdeclast KEY, STATE3
+	.byte 0x66, 0x0f, 0x38, 0xdf, 0xea
+	# aesdeclast KEY, STATE4
+	.byte 0x66, 0x0f, 0x38, 0xdf, 0xf2
+	ret
+
+/*
+ * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *		      size_t len)
+ */
+ENTRY(aesni_ecb_enc)
+	test LEN, LEN		# check length
+	jz .Lecb_enc_ret
+	mov 480(KEYP), KLEN
+	cmp $16, LEN
+	jb .Lecb_enc_ret
+	cmp $64, LEN
+	jb .Lecb_enc_loop1
+.align 4
+.Lecb_enc_loop4:
+	movups (INP), STATE1
+	movups 0x10(INP), STATE2
+	movups 0x20(INP), STATE3
+	movups 0x30(INP), STATE4
+	call _aesni_enc4
+	movups STATE1, (OUTP)
+	movups STATE2, 0x10(OUTP)
+	movups STATE3, 0x20(OUTP)
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lecb_enc_loop4
+	cmp $16, LEN
+	jb .Lecb_enc_ret
+.align 4
+.Lecb_enc_loop1:
+	movups (INP), STATE1
+	call _aesni_enc1
+	movups STATE1, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lecb_enc_loop1
+.Lecb_enc_ret:
+	ret
+
+/*
+ * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *		      size_t len);
+ */
+ENTRY(aesni_ecb_dec)
+	test LEN, LEN
+	jz .Lecb_dec_ret
+	mov 480(KEYP), KLEN
+	add $240, KEYP
+	cmp $16, LEN
+	jb .Lecb_dec_ret
+	cmp $64, LEN
+	jb .Lecb_dec_loop1
+.align 4
+.Lecb_dec_loop4:
+	movups (INP), STATE1
+	movups 0x10(INP), STATE2
+	movups 0x20(INP), STATE3
+	movups 0x30(INP), STATE4
+	call _aesni_dec4
+	movups STATE1, (OUTP)
+	movups STATE2, 0x10(OUTP)
+	movups STATE3, 0x20(OUTP)
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lecb_dec_loop4
+	cmp $16, LEN
+	jb .Lecb_dec_ret
+.align 4
+.Lecb_dec_loop1:
+	movups (INP), STATE1
+	call _aesni_dec1
+	movups STATE1, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lecb_dec_loop1
+.Lecb_dec_ret:
+	ret
+
+/*
+ * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *		      size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_enc)
+	cmp $16, LEN
+	jb .Lcbc_enc_ret
+	mov 480(KEYP), KLEN
+	movups (IVP), STATE	# load iv as initial state
+.align 4
+.Lcbc_enc_loop:
+	movups (INP), IN	# load input
+	pxor IN, STATE
+	call _aesni_enc1
+	movups STATE, (OUTP)	# store output
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lcbc_enc_loop
+	movups STATE, (IVP)
+.Lcbc_enc_ret:
+	ret
+
+/*
+ * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *		      size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_dec)
+	cmp $16, LEN
+	jb .Lcbc_dec_ret
+	mov 480(KEYP), KLEN
+	add $240, KEYP
+	movups (IVP), IV
+	cmp $64, LEN
+	jb .Lcbc_dec_loop1
+.align 4
+.Lcbc_dec_loop4:
+	movups (INP), IN1
+	movaps IN1, STATE1
+	movups 0x10(INP), IN2
+	movaps IN2, STATE2
+	movups 0x20(INP), IN3
+	movaps IN3, STATE3
+	movups 0x30(INP), IN4
+	movaps IN4, STATE4
+	call _aesni_dec4
+	pxor IV, STATE1
+	pxor IN1, STATE2
+	pxor IN2, STATE3
+	pxor IN3, STATE4
+	movaps IN4, IV
+	movups STATE1, (OUTP)
+	movups STATE2, 0x10(OUTP)
+	movups STATE3, 0x20(OUTP)
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lcbc_dec_loop4
+	cmp $16, LEN
+	jb .Lcbc_dec_ret
+.align 4
+.Lcbc_dec_loop1:
+	movups (INP), IN
+	movaps IN, STATE
+	call _aesni_dec1
+	pxor IV, STATE
+	movups STATE, (OUTP)
+	movaps IN, IV
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lcbc_dec_loop1
+	movups IV, (IVP)
+.Lcbc_dec_ret:
+	ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
new file mode 100644
index 00000000000..02af0af6549
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -0,0 +1,461 @@
+/*
+ * Support for Intel AES-NI instructions. This file contains glue
+ * code, the real AES implementation is in intel-aes_asm.S.
+ *
+ * Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/aes.h>
+#include <crypto/cryptd.h>
+#include <asm/i387.h>
+#include <asm/aes.h>
+
+struct async_aes_ctx {
+	struct cryptd_ablkcipher *cryptd_tfm;
+};
+
+#define AESNI_ALIGN	16
+#define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE-1))
+
+asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+			     unsigned int key_len);
+asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			  const u8 *in);
+asmlinkage void aesni_dec(struct crypto_aes_ctx *ctx, u8 *out,
+			  const u8 *in);
+asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len);
+asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len);
+asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv);
+
+static inline int kernel_fpu_using(void)
+{
+	if (in_interrupt() && !(read_cr0() & X86_CR0_TS))
+		return 1;
+	return 0;
+}
+
+static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
+{
+	unsigned long addr = (unsigned long)raw_ctx;
+	unsigned long align = AESNI_ALIGN;
+
+	if (align <= crypto_tfm_ctx_alignment())
+		align = 1;
+	return (struct crypto_aes_ctx *)ALIGN(addr, align);
+}
+
+static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
+			      const u8 *in_key, unsigned int key_len)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx);
+	u32 *flags = &tfm->crt_flags;
+	int err;
+
+	if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
+	    key_len != AES_KEYSIZE_256) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	if (kernel_fpu_using())
+		err = crypto_aes_expand_key(ctx, in_key, key_len);
+	else {
+		kernel_fpu_begin();
+		err = aesni_set_key(ctx, in_key, key_len);
+		kernel_fpu_end();
+	}
+
+	return err;
+}
+
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len);
+}
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+	if (kernel_fpu_using())
+		crypto_aes_encrypt_x86(ctx, dst, src);
+	else {
+		kernel_fpu_begin();
+		aesni_enc(ctx, dst, src);
+		kernel_fpu_end();
+	}
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+	if (kernel_fpu_using())
+		crypto_aes_decrypt_x86(ctx, dst, src);
+	else {
+		kernel_fpu_begin();
+		aesni_dec(ctx, dst, src);
+		kernel_fpu_end();
+	}
+}
+
+static struct crypto_alg aesni_alg = {
+	.cra_name		= "aes",
+	.cra_driver_name	= "aes-aesni",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
+	.cra_alignmask		= 0,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(aesni_alg.cra_list),
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= AES_MIN_KEY_SIZE,
+			.cia_max_keysize	= AES_MAX_KEY_SIZE,
+			.cia_setkey		= aes_set_key,
+			.cia_encrypt		= aes_encrypt,
+			.cia_decrypt		= aes_decrypt
+		}
+	}
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_fpu_begin();
+	while ((nbytes = walk.nbytes)) {
+		aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes & AES_BLOCK_MASK);
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	kernel_fpu_end();
+
+	return err;
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_fpu_begin();
+	while ((nbytes = walk.nbytes)) {
+		aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes & AES_BLOCK_MASK);
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	kernel_fpu_end();
+
+	return err;
+}
+
+static struct crypto_alg blk_ecb_alg = {
+	.cra_name		= "__ecb-aes-aesni",
+	.cra_driver_name	= "__driver-ecb-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_ecb_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+};
+
+static int cbc_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_fpu_begin();
+	while ((nbytes = walk.nbytes)) {
+		aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes & AES_BLOCK_MASK, walk.iv);
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	kernel_fpu_end();
+
+	return err;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_fpu_begin();
+	while ((nbytes = walk.nbytes)) {
+		aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes & AES_BLOCK_MASK, walk.iv);
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	kernel_fpu_end();
+
+	return err;
+}
+
+static struct crypto_alg blk_cbc_alg = {
+	.cra_name		= "__cbc-aes-aesni",
+	.cra_driver_name	= "__driver-cbc-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_cbc_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+};
+
+static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+			unsigned int key_len)
+{
+	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len);
+}
+
+static int ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (kernel_fpu_using()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_ablkcipher_encrypt(cryptd_req);
+	} else {
+		struct blkcipher_desc desc;
+		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+		desc.info = req->info;
+		desc.flags = 0;
+		return crypto_blkcipher_crt(desc.tfm)->encrypt(
+			&desc, req->dst, req->src, req->nbytes);
+	}
+}
+
+static int ablk_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (kernel_fpu_using()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_ablkcipher_decrypt(cryptd_req);
+	} else {
+		struct blkcipher_desc desc;
+		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+		desc.info = req->info;
+		desc.flags = 0;
+		return crypto_blkcipher_crt(desc.tfm)->decrypt(
+			&desc, req->dst, req->src, req->nbytes);
+	}
+}
+
+static void ablk_exit(struct crypto_tfm *tfm)
+{
+	struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cryptd_free_ablkcipher(ctx->cryptd_tfm);
+}
+
+static void ablk_init_common(struct crypto_tfm *tfm,
+			     struct cryptd_ablkcipher *cryptd_tfm)
+{
+	struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
+		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+}
+
+static int ablk_ecb_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-aes-aesni", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_ecb_alg = {
+	.cra_name		= "ecb(aes)",
+	.cra_driver_name	= "ecb-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
+	.cra_init		= ablk_ecb_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+static int ablk_cbc_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-aes-aesni", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_cbc_alg = {
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "cbc-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
+	.cra_init		= ablk_cbc_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+static int __init aesni_init(void)
+{
+	int err;
+
+	if (!cpu_has_aes) {
+		printk(KERN_ERR "Intel AES-NI instructions are not detected.\n");
+		return -ENODEV;
+	}
+	if ((err = crypto_register_alg(&aesni_alg)))
+		goto aes_err;
+	if ((err = crypto_register_alg(&blk_ecb_alg)))
+		goto blk_ecb_err;
+	if ((err = crypto_register_alg(&blk_cbc_alg)))
+		goto blk_cbc_err;
+	if ((err = crypto_register_alg(&ablk_ecb_alg)))
+		goto ablk_ecb_err;
+	if ((err = crypto_register_alg(&ablk_cbc_alg)))
+		goto ablk_cbc_err;
+
+	return err;
+
+ablk_cbc_err:
+	crypto_unregister_alg(&ablk_ecb_alg);
+ablk_ecb_err:
+	crypto_unregister_alg(&blk_cbc_alg);
+blk_cbc_err:
+	crypto_unregister_alg(&blk_ecb_alg);
+blk_ecb_err:
+	crypto_unregister_alg(&aesni_alg);
+aes_err:
+	return err;
+}
+
+static void __exit aesni_exit(void)
+{
+	crypto_unregister_alg(&ablk_cbc_alg);
+	crypto_unregister_alg(&ablk_ecb_alg);
+	crypto_unregister_alg(&blk_cbc_alg);
+	crypto_unregister_alg(&blk_ecb_alg);
+	crypto_unregister_alg(&aesni_alg);
+}
+
+module_init(aesni_init);
+module_exit(aesni_exit);
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("aes");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e4baa06bbce..19c61ef6ab5 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -557,7 +557,7 @@ ia32_sys_call_table:
 	.quad sys32_olduname
 	.quad sys_umask		/* 60 */
 	.quad sys_chroot
-	.quad sys32_ustat
+	.quad compat_sys_ustat
 	.quad sys_dup2
 	.quad sys_getppid
 	.quad sys_getpgrp		/* 65 */
@@ -828,5 +828,7 @@ ia32_sys_call_table:
 	.quad sys_dup3				/* 330 */
 	.quad sys_pipe2
 	.quad sys_inotify_init1
+	.quad compat_sys_preadv
+	.quad compat_sys_pwritev
 	.quad sys_perf_counter_open
 ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 6c0d7f6231a..efac92fd1ef 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -638,28 +638,6 @@ long sys32_uname(struct old_utsname __user *name)
 	return err ? -EFAULT : 0;
 }
 
-long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
-{
-	struct ustat u;
-	mm_segment_t seg;
-	int ret;
-
-	seg = get_fs();
-	set_fs(KERNEL_DS);
-	ret = sys_ustat(dev, (struct ustat __user *)&u);
-	set_fs(seg);
-	if (ret < 0)
-		return ret;
-
-	if (!access_ok(VERIFY_WRITE, u32p, sizeof(struct ustat32)) ||
-	    __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
-	    __put_user((__u32) u.f_tinode, &u32p->f_tfree) ||
-	    __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
-	    __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
-		ret = -EFAULT;
-	return ret;
-}
-
 asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
 			     compat_uptr_t __user *envp, struct pt_regs *regs)
 {
diff --git a/arch/x86/include/asm/aes.h b/arch/x86/include/asm/aes.h
new file mode 100644
index 00000000000..80545a1cbe3
--- /dev/null
+++ b/arch/x86/include/asm/aes.h
@@ -0,0 +1,11 @@
+#ifndef ASM_X86_AES_H
+#define ASM_X86_AES_H
+
+#include <linux/crypto.h>
+#include <crypto/aes.h>
+
+void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst,
+			    const u8 *src);
+void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst,
+			    const u8 *src);
+#endif
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index a6208dc7463..df8a300dfe6 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -75,7 +75,14 @@ static inline void default_inquire_remote_apic(int apicid)
 #define setup_secondary_clock setup_secondary_APIC_clock
 #endif
 
+#ifdef CONFIG_X86_64
 extern int is_vsmp_box(void);
+#else
+static inline int is_vsmp_box(void)
+{
+	return 0;
+}
+#endif
 extern void xapic_wait_icr_idle(void);
 extern u32 safe_xapic_wait_icr_idle(void);
 extern void xapic_icr_write(u32, u32);
@@ -101,6 +108,16 @@ extern void native_apic_icr_write(u32 low, u32 id);
 extern u64 native_apic_icr_read(void);
 
 #ifdef CONFIG_X86_X2APIC
+/*
+ * Make previous memory operations globally visible before
+ * sending the IPI through x2apic wrmsr. We need a serializing instruction or
+ * mfence for this.
+ */
+static inline void x2apic_wrmsr_fence(void)
+{
+	asm volatile("mfence" : : : "memory");
+}
+
 static inline void native_apic_msr_write(u32 reg, u32 v)
 {
 	if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
@@ -177,6 +194,9 @@ static inline int x2apic_enabled(void)
 {
 	return 0;
 }
+
+#define	x2apic	0
+
 #endif
 
 extern int get_physical_broadcast(void);
@@ -306,7 +326,7 @@ struct apic {
 	void (*send_IPI_self)(int vector);
 
 	/* wakeup_secondary_cpu */
-	int (*wakeup_cpu)(int apicid, unsigned long start_eip);
+	int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
 
 	int trampoline_phys_low;
 	int trampoline_phys_high;
@@ -324,8 +344,21 @@ struct apic {
 	u32 (*safe_wait_icr_idle)(void);
 };
 
+/*
+ * Pointer to the local APIC driver in use on this system (there's
+ * always just one such driver in use - the kernel decides via an
+ * early probing process which one it picks - and then sticks to it):
+ */
 extern struct apic *apic;
 
+/*
+ * APIC functionality to boot other CPUs - only used on SMP:
+ */
+#ifdef CONFIG_SMP
+extern atomic_t init_deasserted;
+extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
+#endif
+
 static inline u32 apic_read(u32 reg)
 {
 	return apic->read(reg);
@@ -359,6 +392,7 @@ static inline u32 safe_apic_wait_icr_idle(void)
 
 static inline void ack_APIC_irq(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * ack_APIC_irq() actually gets compiled as a single instruction
 	 * ... yummie.
@@ -366,6 +400,7 @@ static inline void ack_APIC_irq(void)
 
 	/* Docs say use 0 for future compatibility */
 	apic_write(APIC_EOI, 0);
+#endif
 }
 
 static inline unsigned default_get_apic_id(unsigned long x)
@@ -384,9 +419,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
 #define DEFAULT_TRAMPOLINE_PHYS_LOW		0x467
 #define DEFAULT_TRAMPOLINE_PHYS_HIGH		0x469
 
-#ifdef CONFIG_X86_32
-extern void es7000_update_apic_to_cluster(void);
-#else
+#ifdef CONFIG_X86_64
 extern struct apic apic_flat;
 extern struct apic apic_physflat;
 extern struct apic apic_x2apic_cluster;
@@ -456,10 +489,19 @@ static inline int default_apic_id_registered(void)
 	return physid_isset(read_apic_id(), phys_cpu_present_map);
 }
 
+static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+	return cpuid_apic >> index_msb;
+}
+
+extern int default_apicid_to_node(int logical_apicid);
+
+#endif
+
 static inline unsigned int
 default_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-	return cpumask_bits(cpumask)[0];
+	return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
 }
 
 static inline unsigned int
@@ -473,15 +515,6 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 	return (unsigned int)(mask1 & mask2 & mask3);
 }
 
-static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
-{
-	return cpuid_apic >> index_msb;
-}
-
-extern int default_apicid_to_node(int logical_apicid);
-
-#endif
-
 static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid)
 {
 	return physid_isset(apicid, bitmap);
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b..bc9514fb3b1 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
 #define		APIC_ESR_SENDILL	0x00020
 #define		APIC_ESR_RECVILL	0x00040
 #define		APIC_ESR_ILLREGA	0x00080
+#define 	APIC_LVTCMCI	0x2f0
 #define	APIC_ICR	0x300
 #define		APIC_DEST_SELF		0x40000
 #define		APIC_DEST_ALLINC	0x80000
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index dd61616cb73..6ba23dd9fc9 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -1,26 +1,36 @@
 #ifndef _ASM_X86_BOOT_H
 #define _ASM_X86_BOOT_H
 
-/* Don't touch these, unless you really know what you're doing. */
-#define DEF_SYSSEG	0x1000
-#define DEF_SYSSIZE	0x7F00
-
 /* Internal svga startup constants */
 #define NORMAL_VGA	0xffff		/* 80x25 mode */
 #define EXTENDED_VGA	0xfffe		/* 80x50 mode */
 #define ASK_VGA		0xfffd		/* ask for it at bootup */
 
+#ifdef __KERNEL__
+
 /* Physical address where kernel should be loaded. */
 #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
 				+ (CONFIG_PHYSICAL_ALIGN - 1)) \
 				& ~(CONFIG_PHYSICAL_ALIGN - 1))
 
+#ifdef CONFIG_KERNEL_BZIP2
+#define BOOT_HEAP_SIZE             0x400000
+#else /* !CONFIG_KERNEL_BZIP2 */
+
 #ifdef CONFIG_X86_64
 #define BOOT_HEAP_SIZE	0x7000
-#define BOOT_STACK_SIZE	0x4000
 #else
 #define BOOT_HEAP_SIZE	0x4000
+#endif
+
+#endif /* !CONFIG_KERNEL_BZIP2 */
+
+#ifdef CONFIG_X86_64
+#define BOOT_STACK_SIZE	0x4000
+#else
 #define BOOT_STACK_SIZE	0x1000
 #endif
 
+#endif /* __KERNEL__ */
+
 #endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 2f8466540fb..e55dfc1ad45 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -5,24 +5,43 @@
 #include <linux/mm.h>
 
 /* Caches aren't brain-dead on the intel. */
-#define flush_cache_all()			do { } while (0)
-#define flush_cache_mm(mm)			do { } while (0)
-#define flush_cache_dup_mm(mm)			do { } while (0)
-#define flush_cache_range(vma, start, end)	do { } while (0)
-#define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
-#define flush_dcache_page(page)			do { } while (0)
-#define flush_dcache_mmap_lock(mapping)		do { } while (0)
-#define flush_dcache_mmap_unlock(mapping)	do { } while (0)
-#define flush_icache_range(start, end)		do { } while (0)
-#define flush_icache_page(vma, pg)		do { } while (0)
-#define flush_icache_user_range(vma, pg, adr, len)	do { } while (0)
-#define flush_cache_vmap(start, end)		do { } while (0)
-#define flush_cache_vunmap(start, end)		do { } while (0)
-
-#define copy_to_user_page(vma, page, vaddr, dst, src, len)	\
-	memcpy((dst), (src), (len))
-#define copy_from_user_page(vma, page, vaddr, dst, src, len)	\
-	memcpy((dst), (src), (len))
+static inline void flush_cache_all(void) { }
+static inline void flush_cache_mm(struct mm_struct *mm) { }
+static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
+static inline void flush_cache_range(struct vm_area_struct *vma,
+				     unsigned long start, unsigned long end) { }
+static inline void flush_cache_page(struct vm_area_struct *vma,
+				    unsigned long vmaddr, unsigned long pfn) { }
+static inline void flush_dcache_page(struct page *page) { }
+static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
+static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
+static inline void flush_icache_range(unsigned long start,
+				      unsigned long end) { }
+static inline void flush_icache_page(struct vm_area_struct *vma,
+				     struct page *page) { }
+static inline void flush_icache_user_range(struct vm_area_struct *vma,
+					   struct page *page,
+					   unsigned long addr,
+					   unsigned long len) { }
+static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
+static inline void flush_cache_vunmap(unsigned long start,
+				      unsigned long end) { }
+
+static inline void copy_to_user_page(struct vm_area_struct *vma,
+				     struct page *page, unsigned long vaddr,
+				     void *dst, const void *src,
+				     unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+
+static inline void copy_from_user_page(struct vm_area_struct *vma,
+				       struct page *page, unsigned long vaddr,
+				       void *dst, const void *src,
+				       unsigned long len)
+{
+	memcpy(dst, src, len);
+}
 
 #define PG_non_WB				PG_arch_1
 PAGEFLAG(NonWB, non_WB)
@@ -71,6 +90,9 @@ int set_memory_4k(unsigned long addr, int numpages);
 int set_memory_array_uc(unsigned long *addr, int addrinarray);
 int set_memory_array_wb(unsigned long *addr, int addrinarray);
 
+int set_pages_array_uc(struct page **pages, int addrinarray);
+int set_pages_array_wb(struct page **pages, int addrinarray);
+
 /*
  * For legacy compatibility with the old APIs, a few functions
  * are provided that work on a "struct page".
@@ -104,6 +126,11 @@ void clflush_cache_range(void *addr, unsigned int size);
 #ifdef CONFIG_DEBUG_RODATA
 void mark_rodata_ro(void);
 extern const int rodata_test_data;
+void set_kernel_text_rw(void);
+void set_kernel_text_ro(void);
+#else
+static inline void set_kernel_text_rw(void) { }
+static inline void set_kernel_text_ro(void) { }
 #endif
 
 #ifdef CONFIG_DEBUG_RODATA_TEST
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
new file mode 100755
index 00000000000..222802029fa
--- /dev/null
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -0,0 +1,226 @@
+#ifndef _ASM_X86_CPU_DEBUG_H
+#define _ASM_X86_CPU_DEBUG_H
+
+/*
+ * CPU x86 architecture debug
+ *
+ * Copyright(C) 2009 Jaswinder Singh Rajput
+ */
+
+/* Register flags */
+enum cpu_debug_bit {
+/* Model Specific Registers (MSRs)					*/
+	CPU_MC_BIT,				/* Machine Check	*/
+	CPU_MONITOR_BIT,			/* Monitor		*/
+	CPU_TIME_BIT,				/* Time			*/
+	CPU_PMC_BIT,				/* Performance Monitor	*/
+	CPU_PLATFORM_BIT,			/* Platform		*/
+	CPU_APIC_BIT,				/* APIC			*/
+	CPU_POWERON_BIT,			/* Power-on		*/
+	CPU_CONTROL_BIT,			/* Control		*/
+	CPU_FEATURES_BIT,			/* Features control	*/
+	CPU_LBRANCH_BIT,			/* Last Branch		*/
+	CPU_BIOS_BIT,				/* BIOS			*/
+	CPU_FREQ_BIT,				/* Frequency		*/
+	CPU_MTTR_BIT,				/* MTRR			*/
+	CPU_PERF_BIT,				/* Performance		*/
+	CPU_CACHE_BIT,				/* Cache		*/
+	CPU_SYSENTER_BIT,			/* Sysenter		*/
+	CPU_THERM_BIT,				/* Thermal		*/
+	CPU_MISC_BIT,				/* Miscellaneous	*/
+	CPU_DEBUG_BIT,				/* Debug		*/
+	CPU_PAT_BIT,				/* PAT			*/
+	CPU_VMX_BIT,				/* VMX			*/
+	CPU_CALL_BIT,				/* System Call		*/
+	CPU_BASE_BIT,				/* BASE Address		*/
+	CPU_VER_BIT,				/* Version ID		*/
+	CPU_CONF_BIT,				/* Configuration	*/
+	CPU_SMM_BIT,				/* System mgmt mode	*/
+	CPU_SVM_BIT,				/*Secure Virtual Machine*/
+	CPU_OSVM_BIT,				/* OS-Visible Workaround*/
+/* Standard Registers							*/
+	CPU_TSS_BIT,				/* Task Stack Segment	*/
+	CPU_CR_BIT,				/* Control Registers	*/
+	CPU_DT_BIT,				/* Descriptor Table	*/
+/* End of Registers flags						*/
+	CPU_REG_ALL_BIT,			/* Select all Registers	*/
+};
+
+#define	CPU_REG_ALL		(~0)		/* Select all Registers	*/
+
+#define	CPU_MC			(1 << CPU_MC_BIT)
+#define	CPU_MONITOR		(1 << CPU_MONITOR_BIT)
+#define	CPU_TIME		(1 << CPU_TIME_BIT)
+#define	CPU_PMC			(1 << CPU_PMC_BIT)
+#define	CPU_PLATFORM		(1 << CPU_PLATFORM_BIT)
+#define	CPU_APIC		(1 << CPU_APIC_BIT)
+#define	CPU_POWERON		(1 << CPU_POWERON_BIT)
+#define	CPU_CONTROL		(1 << CPU_CONTROL_BIT)
+#define	CPU_FEATURES		(1 << CPU_FEATURES_BIT)
+#define	CPU_LBRANCH		(1 << CPU_LBRANCH_BIT)
+#define	CPU_BIOS		(1 << CPU_BIOS_BIT)
+#define	CPU_FREQ		(1 << CPU_FREQ_BIT)
+#define	CPU_MTRR		(1 << CPU_MTTR_BIT)
+#define	CPU_PERF		(1 << CPU_PERF_BIT)
+#define	CPU_CACHE		(1 << CPU_CACHE_BIT)
+#define	CPU_SYSENTER		(1 << CPU_SYSENTER_BIT)
+#define	CPU_THERM		(1 << CPU_THERM_BIT)
+#define	CPU_MISC		(1 << CPU_MISC_BIT)
+#define	CPU_DEBUG		(1 << CPU_DEBUG_BIT)
+#define	CPU_PAT			(1 << CPU_PAT_BIT)
+#define	CPU_VMX			(1 << CPU_VMX_BIT)
+#define	CPU_CALL		(1 << CPU_CALL_BIT)
+#define	CPU_BASE		(1 << CPU_BASE_BIT)
+#define	CPU_VER			(1 << CPU_VER_BIT)
+#define	CPU_CONF		(1 << CPU_CONF_BIT)
+#define	CPU_SMM			(1 << CPU_SMM_BIT)
+#define	CPU_SVM			(1 << CPU_SVM_BIT)
+#define	CPU_OSVM		(1 << CPU_OSVM_BIT)
+#define	CPU_TSS			(1 << CPU_TSS_BIT)
+#define	CPU_CR			(1 << CPU_CR_BIT)
+#define	CPU_DT			(1 << CPU_DT_BIT)
+
+/* Register file flags */
+enum cpu_file_bit {
+	CPU_INDEX_BIT,				/* index		*/
+	CPU_VALUE_BIT,				/* value		*/
+};
+
+#define	CPU_FILE_VALUE			(1 << CPU_VALUE_BIT)
+
+/*
+ * DisplayFamily_DisplayModel	Processor Families/Processor Number Series
+ * --------------------------	------------------------------------------
+ * 05_01, 05_02, 05_04		Pentium, Pentium with MMX
+ *
+ * 06_01			Pentium Pro
+ * 06_03, 06_05			Pentium II Xeon, Pentium II
+ * 06_07, 06_08, 06_0A, 06_0B	Pentium III Xeon, Pentum III
+ *
+ * 06_09, 060D			Pentium M
+ *
+ * 06_0E			Core Duo, Core Solo
+ *
+ * 06_0F			Xeon 3000, 3200, 5100, 5300, 7300 series,
+ *				Core 2 Quad, Core 2 Extreme, Core 2 Duo,
+ *				Pentium dual-core
+ * 06_17			Xeon 5200, 5400 series, Core 2 Quad Q9650
+ *
+ * 06_1C			Atom
+ *
+ * 0F_00, 0F_01, 0F_02		Xeon, Xeon MP, Pentium 4
+ * 0F_03, 0F_04			Xeon, Xeon MP, Pentium 4, Pentium D
+ *
+ * 0F_06			Xeon 7100, 5000 Series, Xeon MP,
+ *				Pentium 4, Pentium D
+ */
+
+/* Register processors bits */
+enum cpu_processor_bit {
+	CPU_NONE,
+/* Intel */
+	CPU_INTEL_PENTIUM_BIT,
+	CPU_INTEL_P6_BIT,
+	CPU_INTEL_PENTIUM_M_BIT,
+	CPU_INTEL_CORE_BIT,
+	CPU_INTEL_CORE2_BIT,
+	CPU_INTEL_ATOM_BIT,
+	CPU_INTEL_XEON_P4_BIT,
+	CPU_INTEL_XEON_MP_BIT,
+/* AMD */
+	CPU_AMD_K6_BIT,
+	CPU_AMD_K7_BIT,
+	CPU_AMD_K8_BIT,
+	CPU_AMD_0F_BIT,
+	CPU_AMD_10_BIT,
+	CPU_AMD_11_BIT,
+};
+
+#define	CPU_INTEL_PENTIUM	(1 << CPU_INTEL_PENTIUM_BIT)
+#define	CPU_INTEL_P6		(1 << CPU_INTEL_P6_BIT)
+#define	CPU_INTEL_PENTIUM_M	(1 << CPU_INTEL_PENTIUM_M_BIT)
+#define	CPU_INTEL_CORE		(1 << CPU_INTEL_CORE_BIT)
+#define	CPU_INTEL_CORE2		(1 << CPU_INTEL_CORE2_BIT)
+#define	CPU_INTEL_ATOM		(1 << CPU_INTEL_ATOM_BIT)
+#define	CPU_INTEL_XEON_P4	(1 << CPU_INTEL_XEON_P4_BIT)
+#define	CPU_INTEL_XEON_MP	(1 << CPU_INTEL_XEON_MP_BIT)
+
+#define	CPU_INTEL_PX		(CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
+#define	CPU_INTEL_COREX		(CPU_INTEL_CORE | CPU_INTEL_CORE2)
+#define	CPU_INTEL_XEON		(CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
+#define	CPU_CO_AT		(CPU_INTEL_CORE | CPU_INTEL_ATOM)
+#define	CPU_C2_AT		(CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
+#define	CPU_CX_AT		(CPU_INTEL_COREX | CPU_INTEL_ATOM)
+#define	CPU_CX_XE		(CPU_INTEL_COREX | CPU_INTEL_XEON)
+#define	CPU_P6_XE		(CPU_INTEL_P6 | CPU_INTEL_XEON)
+#define	CPU_PM_CO_AT		(CPU_INTEL_PENTIUM_M | CPU_CO_AT)
+#define	CPU_C2_AT_XE		(CPU_C2_AT | CPU_INTEL_XEON)
+#define	CPU_CX_AT_XE		(CPU_CX_AT | CPU_INTEL_XEON)
+#define	CPU_P6_CX_AT		(CPU_INTEL_P6 | CPU_CX_AT)
+#define	CPU_P6_CX_XE		(CPU_P6_XE | CPU_INTEL_COREX)
+#define	CPU_P6_CX_AT_XE		(CPU_INTEL_P6 | CPU_CX_AT_XE)
+#define	CPU_PM_CX_AT_XE		(CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
+#define	CPU_PM_CX_AT		(CPU_INTEL_PENTIUM_M | CPU_CX_AT)
+#define	CPU_PM_CX_XE		(CPU_INTEL_PENTIUM_M | CPU_CX_XE)
+#define	CPU_PX_CX_AT		(CPU_INTEL_PX | CPU_CX_AT)
+#define	CPU_PX_CX_AT_XE		(CPU_INTEL_PX | CPU_CX_AT_XE)
+
+/* Select all supported Intel CPUs */
+#define	CPU_INTEL_ALL		(CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
+
+#define	CPU_AMD_K6		(1 << CPU_AMD_K6_BIT)
+#define	CPU_AMD_K7		(1 << CPU_AMD_K7_BIT)
+#define	CPU_AMD_K8		(1 << CPU_AMD_K8_BIT)
+#define	CPU_AMD_0F		(1 << CPU_AMD_0F_BIT)
+#define	CPU_AMD_10		(1 << CPU_AMD_10_BIT)
+#define	CPU_AMD_11		(1 << CPU_AMD_11_BIT)
+
+#define	CPU_K10_PLUS		(CPU_AMD_10 | CPU_AMD_11)
+#define	CPU_K0F_PLUS		(CPU_AMD_0F | CPU_K10_PLUS)
+#define	CPU_K8_PLUS		(CPU_AMD_K8 | CPU_K0F_PLUS)
+#define	CPU_K7_PLUS		(CPU_AMD_K7 | CPU_K8_PLUS)
+
+/* Select all supported AMD CPUs */
+#define	CPU_AMD_ALL		(CPU_AMD_K6 | CPU_K7_PLUS)
+
+/* Select all supported CPUs */
+#define	CPU_ALL			(CPU_INTEL_ALL | CPU_AMD_ALL)
+
+#define MAX_CPU_FILES		512
+
+struct cpu_private {
+	unsigned		cpu;
+	unsigned		type;
+	unsigned		reg;
+	unsigned		file;
+};
+
+struct cpu_debug_base {
+	char			*name;		/* Register name	*/
+	unsigned		flag;		/* Register flag	*/
+	unsigned		write;		/* Register write flag	*/
+};
+
+/*
+ * Currently it looks similar to cpu_debug_base but once we add more files
+ * cpu_file_base will go in different direction
+ */
+struct cpu_file_base {
+	char			*name;		/* Register file name	*/
+	unsigned		flag;		/* Register file flag	*/
+	unsigned		write;		/* Register write flag	*/
+};
+
+struct cpu_cpuX_base {
+	struct dentry		*dentry;	/* Register dentry	*/
+	int			init;		/* Register index file	*/
+};
+
+struct cpu_debug_range {
+	unsigned		min;		/* Register range min	*/
+	unsigned		max;		/* Register range max	*/
+	unsigned		flag;		/* Supported flags	*/
+	unsigned		model;		/* Supported models	*/
+};
+
+#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 7301e60dc4a..0beba0d1468 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -213,6 +213,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_xmm		boot_cpu_has(X86_FEATURE_XMM)
 #define cpu_has_xmm2		boot_cpu_has(X86_FEATURE_XMM2)
 #define cpu_has_xmm3		boot_cpu_has(X86_FEATURE_XMM3)
+#define cpu_has_aes		boot_cpu_has(X86_FEATURE_AES)
 #define cpu_has_ht		boot_cpu_has(X86_FEATURE_HT)
 #define cpu_has_mp		boot_cpu_has(X86_FEATURE_MP)
 #define cpu_has_nx		boot_cpu_has(X86_FEATURE_NX)
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
index a7f3c75f8ad..61c852fa346 100644
--- a/arch/x86/include/asm/cpumask.h
+++ b/arch/x86/include/asm/cpumask.h
@@ -3,8 +3,6 @@
 #ifndef __ASSEMBLY__
 #include <linux/cpumask.h>
 
-#ifdef CONFIG_X86_64
-
 extern cpumask_var_t cpu_callin_mask;
 extern cpumask_var_t cpu_callout_mask;
 extern cpumask_var_t cpu_initialized_mask;
@@ -12,21 +10,5 @@ extern cpumask_var_t cpu_sibling_setup_mask;
 
 extern void setup_cpu_local_masks(void);
 
-#else /* CONFIG_X86_32 */
-
-extern cpumask_t cpu_callin_map;
-extern cpumask_t cpu_callout_map;
-extern cpumask_t cpu_initialized;
-extern cpumask_t cpu_sibling_setup_map;
-
-#define cpu_callin_mask		((struct cpumask *)&cpu_callin_map)
-#define cpu_callout_mask	((struct cpumask *)&cpu_callout_map)
-#define cpu_initialized_mask	((struct cpumask *)&cpu_initialized)
-#define cpu_sibling_setup_mask	((struct cpumask *)&cpu_sibling_setup_map)
-
-static inline void setup_cpu_local_masks(void) { }
-
-#endif /* CONFIG_X86_32 */
-
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index dc27705f544..5623c50d67b 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -91,7 +91,6 @@ static inline int desc_empty(const void *ptr)
 #define store_gdt(dtr) native_store_gdt(dtr)
 #define store_idt(dtr) native_store_idt(dtr)
 #define store_tr(tr) (tr = native_store_tr())
-#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
 
 #define load_TLS(t, cpu) native_load_tls(t, cpu)
 #define set_ldt native_set_ldt
@@ -112,6 +111,8 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
 }
 #endif	/* CONFIG_PARAVIRT */
 
+#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
+
 static inline void native_write_idt_entry(gate_desc *idt, int entry,
 					  const gate_desc *gate)
 {
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 3c034f48fdb..4994a20acbc 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -6,7 +6,7 @@ struct dev_archdata {
 	void	*acpi_handle;
 #endif
 #ifdef CONFIG_X86_64
-struct dma_mapping_ops *dma_ops;
+struct dma_map_ops *dma_ops;
 #endif
 #ifdef CONFIG_DMAR
 	void *iommu; /* hook for IOMMU specific extension */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 132a134d12f..cea7b74963e 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -7,6 +7,8 @@
  */
 
 #include <linux/scatterlist.h>
+#include <linux/dma-debug.h>
+#include <linux/dma-attrs.h>
 #include <asm/io.h>
 #include <asm/swiotlb.h>
 #include <asm-generic/dma-coherent.h>
@@ -16,47 +18,9 @@ extern int iommu_merge;
 extern struct device x86_dma_fallback_dev;
 extern int panic_on_overflow;
 
-struct dma_mapping_ops {
-	int             (*mapping_error)(struct device *dev,
-					 dma_addr_t dma_addr);
-	void*           (*alloc_coherent)(struct device *dev, size_t size,
-				dma_addr_t *dma_handle, gfp_t gfp);
-	void            (*free_coherent)(struct device *dev, size_t size,
-				void *vaddr, dma_addr_t dma_handle);
-	dma_addr_t      (*map_single)(struct device *hwdev, phys_addr_t ptr,
-				size_t size, int direction);
-	void            (*unmap_single)(struct device *dev, dma_addr_t addr,
-				size_t size, int direction);
-	void            (*sync_single_for_cpu)(struct device *hwdev,
-				dma_addr_t dma_handle, size_t size,
-				int direction);
-	void            (*sync_single_for_device)(struct device *hwdev,
-				dma_addr_t dma_handle, size_t size,
-				int direction);
-	void            (*sync_single_range_for_cpu)(struct device *hwdev,
-				dma_addr_t dma_handle, unsigned long offset,
-				size_t size, int direction);
-	void            (*sync_single_range_for_device)(struct device *hwdev,
-				dma_addr_t dma_handle, unsigned long offset,
-				size_t size, int direction);
-	void            (*sync_sg_for_cpu)(struct device *hwdev,
-				struct scatterlist *sg, int nelems,
-				int direction);
-	void            (*sync_sg_for_device)(struct device *hwdev,
-				struct scatterlist *sg, int nelems,
-				int direction);
-	int             (*map_sg)(struct device *hwdev, struct scatterlist *sg,
-				int nents, int direction);
-	void            (*unmap_sg)(struct device *hwdev,
-				struct scatterlist *sg, int nents,
-				int direction);
-	int             (*dma_supported)(struct device *hwdev, u64 mask);
-	int		is_phys;
-};
-
-extern struct dma_mapping_ops *dma_ops;
-
-static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
+extern struct dma_map_ops *dma_ops;
+
+static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 #ifdef CONFIG_X86_32
 	return dma_ops;
@@ -71,7 +35,7 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
 /* Make sure we keep the same behaviour */
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(dev);
+	struct dma_map_ops *ops = get_dma_ops(dev);
 	if (ops->mapping_error)
 		return ops->mapping_error(dev, dma_addr);
 
@@ -90,137 +54,167 @@ extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 
 static inline dma_addr_t
 dma_map_single(struct device *hwdev, void *ptr, size_t size,
-	       int direction)
+	       enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
-
-	BUG_ON(!valid_dma_direction(direction));
-	return ops->map_single(hwdev, virt_to_phys(ptr), size, direction);
+	struct dma_map_ops *ops = get_dma_ops(hwdev);
+	dma_addr_t addr;
+
+	BUG_ON(!valid_dma_direction(dir));
+	addr = ops->map_page(hwdev, virt_to_page(ptr),
+			     (unsigned long)ptr & ~PAGE_MASK, size,
+			     dir, NULL);
+	debug_dma_map_page(hwdev, virt_to_page(ptr),
+			   (unsigned long)ptr & ~PAGE_MASK, size,
+			   dir, addr, true);
+	return addr;
 }
 
 static inline void
 dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
-		 int direction)
+		 enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(dev);
+	struct dma_map_ops *ops = get_dma_ops(dev);
 
-	BUG_ON(!valid_dma_direction(direction));
-	if (ops->unmap_single)
-		ops->unmap_single(dev, addr, size, direction);
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->unmap_page)
+		ops->unmap_page(dev, addr, size, dir, NULL);
+	debug_dma_unmap_page(dev, addr, size, dir, true);
 }
 
 static inline int
 dma_map_sg(struct device *hwdev, struct scatterlist *sg,
-	   int nents, int direction)
+	   int nents, enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+	struct dma_map_ops *ops = get_dma_ops(hwdev);
+	int ents;
+
+	BUG_ON(!valid_dma_direction(dir));
+	ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
+	debug_dma_map_sg(hwdev, sg, nents, ents, dir);
 
-	BUG_ON(!valid_dma_direction(direction));
-	return ops->map_sg(hwdev, sg, nents, direction);
+	return ents;
 }
 
 static inline void
 dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
-	     int direction)
+	     enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+	struct dma_map_ops *ops = get_dma_ops(hwdev);
 
-	BUG_ON(!valid_dma_direction(direction));
+	BUG_ON(!valid_dma_direction(dir));
+	debug_dma_unmap_sg(hwdev, sg, nents, dir);
 	if (ops->unmap_sg)
-		ops->unmap_sg(hwdev, sg, nents, direction);
+		ops->unmap_sg(hwdev, sg, nents, dir, NULL);
 }
 
 static inline void
 dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
-			size_t size, int direction)
+			size_t size, enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+	struct dma_map_ops *ops = get_dma_ops(hwdev);
 
-	BUG_ON(!valid_dma_direction(direction));
+	BUG_ON(!valid_dma_direction(dir));
 	if (ops->sync_single_for_cpu)
-		ops->sync_single_for_cpu(hwdev, dma_handle, size, direction);
+		ops->sync_single_for_cpu(hwdev, dma_handle, size, dir);
+	debug_dma_sync_single_for_cpu(hwdev, dma_handle, size, dir);
 	flush_write_buffers();
 }
 
 static inline void
 dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
-			   size_t size, int direction)
+			   size_t size, enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+	struct dma_map_ops *ops = get_dma_ops(hwdev);
 
-	BUG_ON(!valid_dma_direction(direction));
+	BUG_ON(!valid_dma_direction(dir));
 	if (ops->sync_single_for_device)
-		ops->sync_single_for_device(hwdev, dma_handle, size, direction);
+		ops->sync_single_for_device(hwdev, dma_handle, size, dir);
+	debug_dma_sync_single_for_device(hwdev, dma_handle, size, dir);
 	flush_write_buffers();
 }
 
 static inline void
 dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
-			      unsigned long offset, size_t size, int direction)
+			      unsigned long offset, size_t size,
+			      enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+	struct dma_map_ops *ops = get_dma_ops(hwdev);
 
-	BUG_ON(!valid_dma_direction(direction));
+	BUG_ON(!valid_dma_direction(dir));
 	if (ops->sync_single_range_for_cpu)
 		ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
-					       size, direction);
+					       size, dir);
+	debug_dma_sync_single_range_for_cpu(hwdev, dma_handle,
+					    offset, size, dir);
 	flush_write_buffers();
 }
 
 static inline void
 dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
 				 unsigned long offset, size_t size,
-				 int direction)
+				 enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+	struct dma_map_ops *ops = get_dma_ops(hwdev);
 
-	BUG_ON(!valid_dma_direction(direction));
+	BUG_ON(!valid_dma_direction(dir));
 	if (ops->sync_single_range_for_device)
 		ops->sync_single_range_for_device(hwdev, dma_handle,
-						  offset, size, direction);
+						  offset, size, dir);
+	debug_dma_sync_single_range_for_device(hwdev, dma_handle,
+					       offset, size, dir);
 	flush_write_buffers();
 }
 
 static inline void
 dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
-		    int nelems, int direction)
+		    int nelems, enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+	struct dma_map_ops *ops = get_dma_ops(hwdev);
 
-	BUG_ON(!valid_dma_direction(direction));
+	BUG_ON(!valid_dma_direction(dir));
 	if (ops->sync_sg_for_cpu)
-		ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
+		ops->sync_sg_for_cpu(hwdev, sg, nelems, dir);
+	debug_dma_sync_sg_for_cpu(hwdev, sg, nelems, dir);
 	flush_write_buffers();
 }
 
 static inline void
 dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
-		       int nelems, int direction)
+		       int nelems, enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+	struct dma_map_ops *ops = get_dma_ops(hwdev);
 
-	BUG_ON(!valid_dma_direction(direction));
+	BUG_ON(!valid_dma_direction(dir));
 	if (ops->sync_sg_for_device)
-		ops->sync_sg_for_device(hwdev, sg, nelems, direction);
+		ops->sync_sg_for_device(hwdev, sg, nelems, dir);
+	debug_dma_sync_sg_for_device(hwdev, sg, nelems, dir);
 
 	flush_write_buffers();
 }
 
 static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 				      size_t offset, size_t size,
-				      int direction)
+				      enum dma_data_direction dir)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(dev);
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	dma_addr_t addr;
 
-	BUG_ON(!valid_dma_direction(direction));
-	return ops->map_single(dev, page_to_phys(page) + offset,
-			       size, direction);
+	BUG_ON(!valid_dma_direction(dir));
+	addr = ops->map_page(dev, page, offset, size, dir, NULL);
+	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
+
+	return addr;
 }
 
 static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
-				  size_t size, int direction)
+				  size_t size, enum dma_data_direction dir)
 {
-	dma_unmap_single(dev, addr, size, direction);
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops->unmap_page)
+		ops->unmap_page(dev, addr, size, dir, NULL);
+	debug_dma_unmap_page(dev, addr, size, dir, false);
 }
 
 static inline void
@@ -266,7 +260,7 @@ static inline void *
 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t gfp)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(dev);
+	struct dma_map_ops *ops = get_dma_ops(dev);
 	void *memory;
 
 	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
@@ -285,20 +279,24 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	if (!ops->alloc_coherent)
 		return NULL;
 
-	return ops->alloc_coherent(dev, size, dma_handle,
-				   dma_alloc_coherent_gfp_flags(dev, gfp));
+	memory = ops->alloc_coherent(dev, size, dma_handle,
+				     dma_alloc_coherent_gfp_flags(dev, gfp));
+	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
+
+	return memory;
 }
 
 static inline void dma_free_coherent(struct device *dev, size_t size,
 				     void *vaddr, dma_addr_t bus)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(dev);
+	struct dma_map_ops *ops = get_dma_ops(dev);
 
 	WARN_ON(irqs_disabled());       /* for portability */
 
 	if (dma_release_from_coherent(dev, get_order(size), vaddr))
 		return;
 
+	debug_dma_free_coherent(dev, size, vaddr, bus);
 	if (ops->free_coherent)
 		ops->free_coherent(dev, size, vaddr, bus);
 }
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index bc68212c6bc..fd8f9e2ca35 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -1,22 +1,15 @@
 #ifndef _ASM_X86_DMI_H
 #define _ASM_X86_DMI_H
 
-#include <asm/io.h>
-
-#define DMI_MAX_DATA 2048
+#include <linux/compiler.h>
+#include <linux/init.h>
 
-extern int dmi_alloc_index;
-extern char dmi_alloc_data[DMI_MAX_DATA];
+#include <asm/io.h>
+#include <asm/setup.h>
 
-/* This is so early that there is no good way to allocate dynamic memory.
-   Allocate data in an BSS array. */
-static inline void *dmi_alloc(unsigned len)
+static __always_inline __init void *dmi_alloc(unsigned len)
 {
-	int idx = dmi_alloc_index;
-	if ((dmi_alloc_index + len) > DMI_MAX_DATA)
-		return NULL;
-	dmi_alloc_index += len;
-	return dmi_alloc_data + idx;
+	return extend_brk(len, sizeof(int));
 }
 
 /* Use early IO mappings for DMI because it's initialized early */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 00d41ce4c84..7ecba4d8508 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -72,7 +72,7 @@ extern int e820_all_mapped(u64 start, u64 end, unsigned type);
 extern void e820_add_region(u64 start, u64 size, int type);
 extern void e820_print_map(char *who);
 extern int
-sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
+sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map);
 extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
 			       unsigned new_type);
 extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ca5ffb2856b..edc90f23e70 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
 
 #else /* !CONFIG_X86_32 */
 
-#define MAX_EFI_IO_PAGES	100
-
 extern u64 efi_call0(void *fp);
 extern u64 efi_call1(void *fp, u64 arg1);
 extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 854d538ae85..c2e6bedaf25 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
 		 smp_invalidate_interrupt)
 #endif
 
+BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
+
 /*
  * every pentium local APIC has two 'local interrupts', with a
  * soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 23696d44a0a..81937a5dc77 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -1,11 +1,147 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
+ */
+
 #ifndef _ASM_X86_FIXMAP_H
 #define _ASM_X86_FIXMAP_H
 
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#ifdef CONFIG_X86_32
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#else
+#include <asm/vsyscall.h>
+#endif
+
+/*
+ * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
+ * uses fixmaps that relies on FIXADDR_TOP for proper address calculation.
+ * Because of this, FIXADDR_TOP x86 integration was left as later work.
+ */
+#ifdef CONFIG_X86_32
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+#define FIXADDR_TOP	((unsigned long)__FIXADDR_TOP)
+
+#define FIXADDR_USER_START     __fix_to_virt(FIX_VDSO)
+#define FIXADDR_USER_END       __fix_to_virt(FIX_VDSO - 1)
+#else
+#define FIXADDR_TOP	(VSYSCALL_END-PAGE_SIZE)
+
+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
+#define FIXADDR_USER_START	((unsigned long)VSYSCALL32_VSYSCALL)
+#define FIXADDR_USER_END	(FIXADDR_USER_START + PAGE_SIZE)
+#endif
+
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * for x86_32: We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
 #ifdef CONFIG_X86_32
-# include "fixmap_32.h"
+	FIX_HOLE,
+	FIX_VDSO,
 #else
-# include "fixmap_64.h"
+	VSYSCALL_LAST_PAGE,
+	VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+			    + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+	VSYSCALL_HPET,
 #endif
+	FIX_DBGP_BASE,
+	FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_X86_LOCAL_APIC
+	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+	FIX_IO_APIC_BASE_0,
+	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+	FIX_CO_CPU,	/* Cobalt timer */
+	FIX_CO_APIC,	/* Cobalt APIC Redirection Table */
+	FIX_LI_PCIA,	/* Lithium PCI Bridge A */
+	FIX_LI_PCIB,	/* Lithium PCI Bridge B */
+#endif
+#ifdef CONFIG_X86_F00F_BUG
+	FIX_F00F_IDT,	/* Virtual mapping for IDT */
+#endif
+#ifdef CONFIG_X86_CYCLONE_TIMER
+	FIX_CYCLONE_TIMER, /*cyclone timer register*/
+#endif
+#ifdef CONFIG_X86_32
+	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
+	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+	FIX_PCIE_MCFG,
+#endif
+#endif
+#ifdef CONFIG_PARAVIRT
+	FIX_PARAVIRT_BOOTMAP,
+#endif
+	FIX_TEXT_POKE0,	/* reserve 2 pages for text_poke() */
+	FIX_TEXT_POKE1,
+	__end_of_permanent_fixed_addresses,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+	FIX_OHCI1394_BASE,
+#endif
+	/*
+	 * 256 temporary boot-time mappings, used by early_ioremap(),
+	 * before ioremap() is functional.
+	 *
+	 * We round it up to the next 256 pages boundary so that we
+	 * can have a single pgd entry and a single pte table:
+	 */
+#define NR_FIX_BTMAPS		64
+#define FIX_BTMAPS_SLOTS	4
+	FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+			(__end_of_permanent_fixed_addresses & 255),
+	FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+#ifdef CONFIG_X86_32
+	FIX_WP_TEST,
+#endif
+	__end_of_fixed_addresses
+};
+
+
+extern void reserve_top_address(unsigned long reserve);
+
+#define FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_BOOT_SIZE	(__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START		(FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_BOOT_START	(FIXADDR_TOP - FIXADDR_BOOT_SIZE)
 
 extern int fixmaps_set;
 
@@ -69,4 +205,5 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
 	BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
 	return __virt_to_fix(vaddr);
 }
+#endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/fixmap_32.h b/arch/x86/include/asm/fixmap_32.h
deleted file mode 100644
index 047d9bab2b3..00000000000
--- a/arch/x86/include/asm/fixmap_32.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * fixmap.h: compile-time virtual memory allocation
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1998 Ingo Molnar
- *
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- */
-
-#ifndef _ASM_X86_FIXMAP_32_H
-#define _ASM_X86_FIXMAP_32_H
-
-
-/* used by vmalloc.c, vsyscall.lds.S.
- *
- * Leave one empty page between vmalloc'ed areas and
- * the start of the fixmap.
- */
-extern unsigned long __FIXADDR_TOP;
-#define FIXADDR_USER_START     __fix_to_virt(FIX_VDSO)
-#define FIXADDR_USER_END       __fix_to_virt(FIX_VDSO - 1)
-
-#ifndef __ASSEMBLY__
-#include <linux/kernel.h>
-#include <asm/acpi.h>
-#include <asm/apicdef.h>
-#include <asm/page.h>
-#include <linux/threads.h>
-#include <asm/kmap_types.h>
-
-/*
- * Here we define all the compile-time 'special' virtual
- * addresses. The point is to have a constant address at
- * compile time, but to set the physical address only
- * in the boot process. We allocate these special addresses
- * from the end of virtual memory (0xfffff000) backwards.
- * Also this lets us do fail-safe vmalloc(), we
- * can guarantee that these special addresses and
- * vmalloc()-ed addresses never overlap.
- *
- * these 'compile-time allocated' memory buffers are
- * fixed-size 4k pages. (or larger if used with an increment
- * highger than 1) use fixmap_set(idx,phys) to associate
- * physical memory with fixmap indices.
- *
- * TLB entries of such buffers will not be flushed across
- * task switches.
- */
-enum fixed_addresses {
-	FIX_HOLE,
-	FIX_VDSO,
-	FIX_DBGP_BASE,
-	FIX_EARLYCON_MEM_BASE,
-#ifdef CONFIG_X86_LOCAL_APIC
-	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
-#endif
-#ifdef CONFIG_X86_IO_APIC
-	FIX_IO_APIC_BASE_0,
-	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
-#endif
-#ifdef CONFIG_X86_VISWS_APIC
-	FIX_CO_CPU,	/* Cobalt timer */
-	FIX_CO_APIC,	/* Cobalt APIC Redirection Table */
-	FIX_LI_PCIA,	/* Lithium PCI Bridge A */
-	FIX_LI_PCIB,	/* Lithium PCI Bridge B */
-#endif
-#ifdef CONFIG_X86_F00F_BUG
-	FIX_F00F_IDT,	/* Virtual mapping for IDT */
-#endif
-#ifdef CONFIG_X86_CYCLONE_TIMER
-	FIX_CYCLONE_TIMER, /*cyclone timer register*/
-#endif
-	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
-	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
-#ifdef CONFIG_PCI_MMCONFIG
-	FIX_PCIE_MCFG,
-#endif
-#ifdef CONFIG_PARAVIRT
-	FIX_PARAVIRT_BOOTMAP,
-#endif
-	__end_of_permanent_fixed_addresses,
-	/*
-	 * 256 temporary boot-time mappings, used by early_ioremap(),
-	 * before ioremap() is functional.
-	 *
-	 * We round it up to the next 256 pages boundary so that we
-	 * can have a single pgd entry and a single pte table:
-	 */
-#define NR_FIX_BTMAPS		64
-#define FIX_BTMAPS_SLOTS	4
-	FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
-			(__end_of_permanent_fixed_addresses & 255),
-	FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
-	FIX_WP_TEST,
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
-	FIX_OHCI1394_BASE,
-#endif
-	__end_of_fixed_addresses
-};
-
-extern void reserve_top_address(unsigned long reserve);
-
-
-#define FIXADDR_TOP	((unsigned long)__FIXADDR_TOP)
-
-#define __FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-#define __FIXADDR_BOOT_SIZE	(__end_of_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START		(FIXADDR_TOP - __FIXADDR_SIZE)
-#define FIXADDR_BOOT_START	(FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_FIXMAP_32_H */
diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h
deleted file mode 100644
index 298d9ba3fae..00000000000
--- a/arch/x86/include/asm/fixmap_64.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * fixmap.h: compile-time virtual memory allocation
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1998 Ingo Molnar
- */
-
-#ifndef _ASM_X86_FIXMAP_64_H
-#define _ASM_X86_FIXMAP_64_H
-
-#include <linux/kernel.h>
-#include <asm/acpi.h>
-#include <asm/apicdef.h>
-#include <asm/page.h>
-#include <asm/vsyscall.h>
-#include <asm/efi.h>
-
-/*
- * Here we define all the compile-time 'special' virtual
- * addresses. The point is to have a constant address at
- * compile time, but to set the physical address only
- * in the boot process.
- *
- * These 'compile-time allocated' memory buffers are
- * fixed-size 4k pages (or larger if used with an increment
- * higher than 1). Use set_fixmap(idx,phys) to associate
- * physical memory with fixmap indices.
- *
- * TLB entries of such buffers will not be flushed across
- * task switches.
- */
-
-enum fixed_addresses {
-	VSYSCALL_LAST_PAGE,
-	VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
-			    + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
-	VSYSCALL_HPET,
-	FIX_DBGP_BASE,
-	FIX_EARLYCON_MEM_BASE,
-	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
-	FIX_IO_APIC_BASE_0,
-	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
-	FIX_EFI_IO_MAP_LAST_PAGE,
-	FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
-				  + MAX_EFI_IO_PAGES - 1,
-#ifdef CONFIG_PARAVIRT
-	FIX_PARAVIRT_BOOTMAP,
-#endif
-	__end_of_permanent_fixed_addresses,
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
-	FIX_OHCI1394_BASE,
-#endif
-	/*
-	 * 256 temporary boot-time mappings, used by early_ioremap(),
-	 * before ioremap() is functional.
-	 *
-	 * We round it up to the next 256 pages boundary so that we
-	 * can have a single pgd entry and a single pte table:
-	 */
-#define NR_FIX_BTMAPS		64
-#define FIX_BTMAPS_SLOTS	4
-	FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
-			(__end_of_permanent_fixed_addresses & 255),
-	FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
-	__end_of_fixed_addresses
-};
-
-#define FIXADDR_TOP	(VSYSCALL_END-PAGE_SIZE)
-#define FIXADDR_SIZE	(__end_of_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START	(FIXADDR_TOP - FIXADDR_SIZE)
-
-/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
-#define FIXADDR_USER_START	((unsigned long)VSYSCALL32_VSYSCALL)
-#define FIXADDR_USER_END	(FIXADDR_USER_START + PAGE_SIZE)
-
-#endif /* _ASM_X86_FIXMAP_64_H */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index b55b4a7fbef..bd2c6511c88 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -28,6 +28,13 @@
 
 #endif
 
+/* FIXME: I don't want to stay hardcoded */
+#ifdef CONFIG_X86_64
+# define FTRACE_SYSCALL_MAX     296
+#else
+# define FTRACE_SYSCALL_MAX     333
+#endif
+
 #ifdef CONFIG_FUNCTION_TRACER
 #define MCOUNT_ADDR		((long)(mcount))
 #define MCOUNT_INSN_SIZE	5 /* sizeof mcount call */
@@ -55,29 +62,4 @@ struct dyn_arch_ftrace {
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_FUNCTION_TRACER */
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-
-#ifndef __ASSEMBLY__
-
-/*
- * Stack of return addresses for functions
- * of a thread.
- * Used in struct thread_info
- */
-struct ftrace_ret_stack {
-	unsigned long ret;
-	unsigned long func;
-	unsigned long long calltime;
-};
-
-/*
- * Primary handler of a function return.
- * It relays on ftrace_return_to_handler.
- * Defined in entry_32/64.S
- */
-extern void return_to_handler(void);
-
-#endif /* __ASSEMBLY__ */
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
 #endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 46ebed797e4..25454427cee 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,6 +12,7 @@ typedef struct {
 	unsigned int apic_timer_irqs;	/* arch dependent */
 	unsigned int irq_spurious_count;
 #endif
+	unsigned int generic_irqs;	/* arch dependent */
 	unsigned int apic_perf_irqs;
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index bf9276bea66..014c2b85ae4 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -63,6 +63,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
 void *kmap_atomic(struct page *page, enum km_type type);
 void kunmap_atomic(void *kvaddr, enum km_type type);
 void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 struct page *kmap_atomic_to_page(void *ptr);
 
 #ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index f39881b6b68..ae80f64973e 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,6 +27,7 @@
 
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
+extern void generic_interrupt(void);
 extern void error_interrupt(void);
 extern void perf_counter_interrupt(void);
 
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c..71c9e518398 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
 
 #else  /* CONFIG_X86_32 */
 
-extern void finit(void);
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_task(struct task_struct *tsk);
+#else
+static inline void finit_task(struct task_struct *tsk)
+{
+}
+#endif
 
 static inline void tolerant_fwait(void)
 {
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 50ca486fd88..1f7e6251728 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -129,13 +129,6 @@ typedef struct compat_siginfo {
 	} _sifields;
 } compat_siginfo_t;
 
-struct ustat32 {
-	__u32			f_tfree;
-	compat_ino_t		f_tinode;
-	char			f_fname[6];
-	char			f_fpack[6];
-};
-
 #define IA32_STACK_TOP IA32_PAGE_OFFSET
 
 #ifdef __KERNEL__
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 00000000000..36fb1a6a510
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_INIT_32_H
+#define _ASM_X86_INIT_32_H
+
+#ifdef CONFIG_X86_32
+extern void __init early_ioremap_page_table_range_init(void);
+#endif
+
+extern unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask);
+
+
+extern unsigned long __initdata e820_table_start;
+extern unsigned long __meminitdata e820_table_end;
+extern unsigned long __meminitdata e820_table_top;
+
+#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 683d0b4c00f..e5383e3d2f8 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
 
 extern void iounmap(volatile void __iomem *addr);
 
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
-
 
 #ifdef CONFIG_X86_32
 # include "io_32.h"
@@ -198,7 +196,6 @@ extern void early_ioremap_reset(void);
 extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
 extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
 extern void early_iounmap(void __iomem *addr, unsigned long size);
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
 
 #define IO_SPACE_LIMIT 0xffff
 
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 59cb4a1317b..373cc2bbcad 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -162,7 +162,8 @@ extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
 #ifdef CONFIG_X86_64
-extern int save_mask_IO_APIC_setup(void);
+extern int save_IO_APIC_setup(void);
+extern void mask_IO_APIC_setup(void);
 extern void restore_IO_APIC_setup(void);
 extern void reinit_intr_remapped_IO_APIC(int);
 #endif
@@ -172,7 +173,7 @@ extern void probe_nr_irqs_gsi(void);
 extern int setup_ioapic_entry(int apic, int irq,
 			      struct IO_APIC_route_entry *entry,
 			      unsigned int destination, int trigger,
-			      int polarity, int vector);
+			      int polarity, int vector, int pin);
 extern void ioapic_write_entry(int apic, int pin,
 			       struct IO_APIC_route_entry e);
 #else  /* !CONFIG_X86_IO_APIC */
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index bd46495ff7d..86af26091d6 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -24,10 +24,7 @@
 #include <asm/tlbflush.h>
 
 int
-reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot);
-
-void
-free_io_memtype(u64 base, unsigned long size);
+is_io_mapping_possible(resource_size_t base, unsigned long size);
 
 void *
 iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index a6ee9e6f530..af326a2975b 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -3,7 +3,7 @@
 
 extern void pci_iommu_shutdown(void);
 extern void no_iommu_init(void);
-extern struct dma_mapping_ops nommu_dma_ops;
+extern struct dma_map_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
 
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 107eb219669..f38481bcd45 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq)
 extern void fixup_irqs(void);
 #endif
 
+extern void (*generic_interrupt_extension)(void);
 extern void init_IRQ(void);
 extern void native_init_IRQ(void);
 extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 20e1fd588db..0396760fccb 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,8 +1,6 @@
 #ifndef _ASM_X86_IRQ_REMAPPING_H
 #define _ASM_X86_IRQ_REMAPPING_H
 
-extern int x2apic;
-
 #define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
 
 #endif	/* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8a285f356f8..3cbd79bbb47 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -112,6 +112,11 @@
 #define LOCAL_PERF_VECTOR		0xee
 
 /*
+ * Generic system vector for platform specific use
+ */
+#define GENERIC_INTERRUPT_VECTOR	0xed
+
+/*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
  * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0ceb6d19ed3..317ff1703d0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -9,13 +9,13 @@
 # define PAGES_NR		4
 #else
 # define PA_CONTROL_PAGE	0
-# define PA_TABLE_PAGE		1
-# define PAGES_NR		2
+# define VA_CONTROL_PAGE	1
+# define PA_TABLE_PAGE		2
+# define PA_SWAP_PAGE		3
+# define PAGES_NR		4
 #endif
 
-#ifdef CONFIG_X86_32
 # define KEXEC_CONTROL_CODE_MAX_SIZE	2048
-#endif
 
 #ifndef __ASSEMBLY__
 
@@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page,
 		unsigned int has_pae,
 		unsigned int preserve_context);
 #else
-NORET_TYPE void
+unsigned long
 relocate_kernel(unsigned long indirection_page,
 		unsigned long page_list,
-		unsigned long start_address) ATTRIB_NORET;
+		unsigned long start_address,
+		unsigned int preserve_context);
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 886c9402ec4..dc3f6cf1170 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -15,6 +15,7 @@
 #define __KVM_HAVE_DEVICE_ASSIGNMENT
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
+#define __KVM_HAVE_GUEST_DEBUG
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -212,7 +213,30 @@ struct kvm_pit_channel_state {
 	__s64 count_load_time;
 };
 
+struct kvm_debug_exit_arch {
+	__u32 exception;
+	__u32 pad;
+	__u64 pc;
+	__u64 dr6;
+	__u64 dr7;
+};
+
+#define KVM_GUESTDBG_USE_SW_BP		0x00010000
+#define KVM_GUESTDBG_USE_HW_BP		0x00020000
+#define KVM_GUESTDBG_INJECT_DB		0x00040000
+#define KVM_GUESTDBG_INJECT_BP		0x00080000
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+	__u64 debugreg[8];
+};
+
 struct kvm_pit_state {
 	struct kvm_pit_channel_state channels[3];
 };
+
+struct kvm_reinject_control {
+	__u8 pit_reinject;
+	__u8 reserved[31];
+};
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 730843d1d2f..f0faf58044f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,7 @@
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
 #include <asm/mtrr.h>
+#include <asm/msr-index.h>
 
 #define KVM_MAX_VCPUS 16
 #define KVM_MEMORY_SLOTS 32
@@ -134,11 +135,18 @@ enum {
 
 #define KVM_NR_MEM_OBJS 40
 
-struct kvm_guest_debug {
-	int enabled;
-	unsigned long bp[4];
-	int singlestep;
-};
+#define KVM_NR_DB_REGS	4
+
+#define DR6_BD		(1 << 13)
+#define DR6_BS		(1 << 14)
+#define DR6_FIXED_1	0xffff0ff0
+#define DR6_VOLATILE	0x0000e00f
+
+#define DR7_BP_EN_MASK	0x000000ff
+#define DR7_GE		(1 << 9)
+#define DR7_GD		(1 << 13)
+#define DR7_FIXED_1	0x00000400
+#define DR7_VOLATILE	0xffff23ff
 
 /*
  * We don't want allocation failures within the mmu code, so we preallocate
@@ -162,7 +170,8 @@ struct kvm_pte_chain {
  *   bits 0:3 - total guest paging levels (2-4, or zero for real mode)
  *   bits 4:7 - page table level for this shadow (1-4)
  *   bits 8:9 - page table quadrant for 2-level guests
- *   bit   16 - "metaphysical" - gfn is not a real page (huge page/real mode)
+ *   bit   16 - direct mapping of virtual to physical mapping at gfn
+ *              used for real mode and two-dimensional paging
  *   bits 17:19 - common access permissions for all ptes in this shadow page
  */
 union kvm_mmu_page_role {
@@ -172,9 +181,10 @@ union kvm_mmu_page_role {
 		unsigned level:4;
 		unsigned quadrant:2;
 		unsigned pad_for_nice_hex_output:6;
-		unsigned metaphysical:1;
+		unsigned direct:1;
 		unsigned access:3;
 		unsigned invalid:1;
+		unsigned cr4_pge:1;
 	};
 };
 
@@ -218,6 +228,18 @@ struct kvm_pv_mmu_op_buffer {
 	char buf[512] __aligned(sizeof(long));
 };
 
+struct kvm_pio_request {
+	unsigned long count;
+	int cur_count;
+	gva_t guest_gva;
+	int in;
+	int port;
+	int size;
+	int string;
+	int down;
+	int rep;
+};
+
 /*
  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
  * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
@@ -236,6 +258,7 @@ struct kvm_mmu {
 	hpa_t root_hpa;
 	int root_level;
 	int shadow_root_level;
+	union kvm_mmu_page_role base_role;
 
 	u64 *pae_root;
 };
@@ -258,6 +281,7 @@ struct kvm_vcpu_arch {
 	unsigned long cr3;
 	unsigned long cr4;
 	unsigned long cr8;
+	u32 hflags;
 	u64 pdptrs[4]; /* pae */
 	u64 shadow_efer;
 	u64 apic_base;
@@ -338,6 +362,15 @@ struct kvm_vcpu_arch {
 
 	struct mtrr_state_type mtrr_state;
 	u32 pat;
+
+	int switch_db_regs;
+	unsigned long host_db[KVM_NR_DB_REGS];
+	unsigned long host_dr6;
+	unsigned long host_dr7;
+	unsigned long db[KVM_NR_DB_REGS];
+	unsigned long dr6;
+	unsigned long dr7;
+	unsigned long eff_db[KVM_NR_DB_REGS];
 };
 
 struct kvm_mem_alias {
@@ -378,6 +411,7 @@ struct kvm_arch{
 
 	unsigned long irq_sources_bitmap;
 	unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
+	u64 vm_init_tsc;
 };
 
 struct kvm_vm_stat {
@@ -446,8 +480,7 @@ struct kvm_x86_ops {
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
 	int (*set_guest_debug)(struct kvm_vcpu *vcpu,
-			       struct kvm_debug_guest *dbg);
-	void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
+			       struct kvm_guest_debug *dbg);
 	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
 	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -583,16 +616,12 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
 			   u32 error_code);
 
-void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_set_irq(void *opaque, int irq, int level);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
 void fx_init(struct kvm_vcpu *vcpu);
 
-int emulator_read_std(unsigned long addr,
-		      void *val,
-		      unsigned int bytes,
-		      struct kvm_vcpu *vcpu);
 int emulator_write_emulated(unsigned long addr,
 			    const void *val,
 			    unsigned int bytes,
@@ -737,6 +766,10 @@ enum {
 	TASK_SWITCH_GATE = 3,
 };
 
+#define HF_GIF_MASK		(1 << 0)
+#define HF_HIF_MASK		(1 << 1)
+#define HF_VINTR_MASK		(1 << 2)
+
 /*
  * Hardware virtualization extension instructions may fault if a
  * reboot turns off virtualization while processes are running.
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 43894428c3c..0f4ee7148af 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -26,36 +26,20 @@
 
 #ifndef __ASSEMBLY__
 #include <asm/hw_irq.h>
+#include <asm/kvm_para.h>
 
 /*G:031 But first, how does our Guest contact the Host to ask for privileged
  * operations?  There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
  *
- * Our hypercall mechanism uses the highest unused trap code (traps 32 and
- * above are used by real hardware interrupts).  Fifteen hypercalls are
+ * We use the KVM hypercall mechanism. Eighteen hypercalls are
  * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %edx, %ebx and %ecx.  If a return
+ * arguments (when required) are placed in %ebx, %ecx and %edx.  If a return
  * value makes sense, it's returned in %eax.
  *
  * Grossly invalid calls result in Sudden Death at the hands of the vengeful
  * Host, rather than returning failure.  This reflects Winston Churchill's
  * definition of a gentleman: "someone who is only rude intentionally". */
-static inline unsigned long
-hcall(unsigned long call,
-      unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
-	/* "int" is the Intel instruction to trigger a trap. */
-	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
-		     /* The call in %eax (aka "a") might be overwritten */
-		     : "=a"(call)
-		       /* The arguments are in %eax, %edx, %ebx & %ecx */
-		     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
-		       /* "memory" means this might write somewhere in memory.
-			* This isn't true for all calls, but it's safe to tell
-			* gcc that it might happen so it doesn't get clever. */
-		     : "memory");
-	return call;
-}
 /*:*/
 
 /* Can't use our min() macro here: needs to be a constant */
@@ -64,7 +48,7 @@ hcall(unsigned long call,
 #define LHCALL_RING_SIZE 64
 struct hcall_args {
 	/* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
-	unsigned long arg0, arg2, arg3, arg1;
+	unsigned long arg0, arg1, arg2, arg3;
 };
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 9320e2a8a26..12d55e773eb 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -1,14 +1,11 @@
 #ifndef _ASM_X86_LINKAGE_H
 #define _ASM_X86_LINKAGE_H
 
+#include <linux/stringify.h>
+
 #undef notrace
 #define notrace __attribute__((no_instrument_function))
 
-#ifdef CONFIG_X86_64
-#define __ALIGN .p2align 4,,15
-#define __ALIGN_STR ".p2align 4,,15"
-#endif
-
 #ifdef CONFIG_X86_32
 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
 /*
@@ -50,16 +47,20 @@
 	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
 			      "g" (arg4), "g" (arg5), "g" (arg6))
 
-#endif
+#endif /* CONFIG_X86_32 */
+
+#ifdef __ASSEMBLY__
 
 #define GLOBAL(name)	\
 	.globl name;	\
 	name:
 
-#ifdef CONFIG_X86_ALIGNMENT_16
-#define __ALIGN .align 16,0x90
-#define __ALIGN_STR ".align 16,0x90"
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
+#define __ALIGN		.p2align 4, 0x90
+#define __ALIGN_STR	__stringify(__ALIGN)
 #endif
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _ASM_X86_LINKAGE_H */
 
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960..563933e06a3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
  */
 
 #define MCG_CTL_P	 (1UL<<8)   /* MCG_CAP register available */
+#define MCG_EXT_P	 (1ULL<<9)   /* Extended registers available */
+#define MCG_CMCI_P	 (1ULL<<10)  /* CMCI supported */
 
 #define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
 
 #include <asm/atomic.h>
 
+void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
 DECLARE_PER_CPU(struct sys_device, device_mce);
 extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
+/*
+ * To support more than 128 would need to escape the predefined
+ * Linux defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
+
 #ifdef CONFIG_X86_MCE_INTEL
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(int dying);
+void cmci_recheck(void);
 #else
 static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(int dying) {}
+static inline void cmci_recheck(void) {}
 #endif
 
 #ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 #endif
 
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+extern int mce_available(struct cpuinfo_x86 *c);
+
+void mce_log_therm_throt_event(__u64 status);
 
 extern atomic_t mce_entry;
 
 extern void do_machine_check(struct pt_regs *, long);
+
+typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
+
+enum mcp_flags {
+	MCP_TIMESTAMP = (1 << 0),	/* log time stamp */
+	MCP_UC = (1 << 1),		/* log uncorrected errors */
+};
+extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+
 extern int mce_notify_user(void);
 
 #endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
 #else
 #define mcheck_init(c) do { } while (0)
 #endif
-extern void stop_mce(void);
-extern void restart_mce(void);
+
+extern void (*mce_threshold_vector)(void);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a063..ede6998bd92 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn)
 #endif /* CONFIG_DISCONTIGMEM */
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-
-/*
- * Following are macros that are specific to this numa platform.
- */
-#define reserve_bootmem(addr, size, flags) \
-	reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
-#define alloc_bootmem(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_nopanic(x) \
-	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
-				__pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
-#define alloc_bootmem_pages(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_pages_nopanic(x) \
-	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
-				__pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-#define alloc_bootmem_node(pgdat, x)					\
-({									\
-	struct pglist_data  __maybe_unused			\
-				*__alloc_bootmem_node__pgdat = (pgdat);	\
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES,	\
-						__pa(MAX_DMA_ADDRESS));	\
-})
-#define alloc_bootmem_pages_node(pgdat, x)				\
-({									\
-	struct pglist_data  __maybe_unused			\
-				*__alloc_bootmem_node__pgdat = (pgdat);	\
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE,		\
-						__pa(MAX_DMA_ADDRESS));	\
-})
-#define alloc_bootmem_low_pages_node(pgdat, x)				\
-({									\
-	struct pglist_data  __maybe_unused			\
-				*__alloc_bootmem_node__pgdat = (pgdat);	\
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0);		\
-})
+/* always use node 0 for bootmem on this numa platform */
+#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit)	\
+	(NODE_DATA(0)->bdata)
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 
 #endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
index 6706b3006f1..4cc48af23fe 100644
--- a/arch/x86/include/asm/msidef.h
+++ b/arch/x86/include/asm/msidef.h
@@ -47,6 +47,7 @@
 #define	 MSI_ADDR_DEST_ID_MASK		0x00ffff0
 #define  MSI_ADDR_DEST_ID(dest)		(((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
 					 MSI_ADDR_DEST_ID_MASK)
+#define MSI_ADDR_EXT_DEST_ID(dest)	((dest) & 0xffffff00)
 
 #define MSI_ADDR_IR_EXT_INT		(1 << 4)
 #define MSI_ADDR_IR_SHV			(1 << 3)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae0..ec41fc16c16 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -18,11 +18,15 @@
 #define _EFER_LME		8  /* Long mode enable */
 #define _EFER_LMA		10 /* Long mode active (read-only) */
 #define _EFER_NX		11 /* No execute enable */
+#define _EFER_SVME		12 /* Enable virtualization */
+#define _EFER_FFXSR		14 /* Enable Fast FXSAVE/FXRSTOR */
 
 #define EFER_SCE		(1<<_EFER_SCE)
 #define EFER_LME		(1<<_EFER_LME)
 #define EFER_LMA		(1<<_EFER_LMA)
 #define EFER_NX			(1<<_EFER_NX)
+#define EFER_SVME		(1<<_EFER_SVME)
+#define EFER_FFXSR		(1<<_EFER_FFXSR)
 
 /* Intel MSRs. Some also available on other CPUs */
 #define MSR_IA32_PERFCTR0		0x000000c1
@@ -77,6 +81,11 @@
 #define MSR_IA32_MC0_ADDR		0x00000402
 #define MSR_IA32_MC0_MISC		0x00000403
 
+/* These are consecutive and not in the normal 4er MCE bank block */
+#define MSR_IA32_MC0_CTL2		0x00000280
+#define CMCI_EN			(1ULL << 30)
+#define CMCI_THRESHOLD_MASK		0xffffULL
+
 #define MSR_P6_PERFCTR0			0x000000c1
 #define MSR_P6_PERFCTR1			0x000000c2
 #define MSR_P6_EVNTSEL0			0x00000186
@@ -360,4 +369,9 @@
 #define MSR_IA32_VMX_PROCBASED_CTLS2    0x0000048b
 #define MSR_IA32_VMX_EPT_VPID_CAP       0x0000048c
 
+/* AMD-V MSRs */
+
+#define MSR_VM_CR                       0xc0010114
+#define MSR_VM_HSAVE_PA                 0xc0010117
+
 #endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index e9f5db79624..a37229011b5 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -4,8 +4,12 @@
 extern int pxm_to_nid(int pxm);
 extern void numa_remove_cpu(int cpu);
 
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_HIGHMEM
 extern void set_highmem_pages_init(void);
+#else
+static inline void set_highmem_pages_init(void)
+{
+}
 #endif
 
 #endif /* _ASM_X86_NUMA_32_H */
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index f1e4a79a6e4..0f915ae649a 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -39,6 +39,11 @@
 #define __VIRTUAL_MASK_SHIFT	32
 #endif	/* CONFIG_X86_PAE */
 
+/*
+ * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
+ */
+#define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
+
 #ifndef __ASSEMBLY__
 
 /*
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da6603..826ad37006a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,14 +40,8 @@
 
 #ifndef __ASSEMBLY__
 
-struct pgprot;
-
 extern int page_is_ram(unsigned long pagenr);
 extern int devmem_is_allowed(unsigned long pagenr);
-extern void map_devmem(unsigned long pfn, unsigned long size,
-		       struct pgprot vma_prot);
-extern void unmap_devmem(unsigned long pfn, unsigned long size,
-			 struct pgprot vma_prot);
 
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0617d5cc971..7727aa8b7dd 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -317,8 +317,6 @@ struct pv_mmu_ops {
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
 	void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
-	void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
-				pte_t *ptep, pte_t pte);
 	void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
 			  pte_t *ptep);
 	void (*pmd_clear)(pmd_t *pmdp);
@@ -389,7 +387,7 @@ extern struct pv_lock_ops pv_lock_ops;
 
 #define paravirt_type(op)				\
 	[paravirt_typenum] "i" (PARAVIRT_PATCH(op)),	\
-	[paravirt_opptr] "m" (op)
+	[paravirt_opptr] "i" (&(op))
 #define paravirt_clobber(clobber)		\
 	[paravirt_clobber] "i" (clobber)
 
@@ -443,7 +441,7 @@ int paravirt_disable_iospace(void);
  * offset into the paravirt_patch_template structure, and can therefore be
  * freely converted back into a structure offset.
  */
-#define PARAVIRT_CALL	"call *%[paravirt_opptr];"
+#define PARAVIRT_CALL	"call *%c[paravirt_opptr];"
 
 /*
  * These macros are intended to wrap calls through one of the paravirt
@@ -1365,13 +1363,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
 		    pte.pte, pte.pte >> 32);
 }
 
-static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
-				   pte_t *ptep, pte_t pte)
-{
-	/* 5 arg words */
-	pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
-}
-
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 			     pte_t *ptep)
 {
@@ -1388,12 +1379,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
 	set_pte(ptep, pte);
 }
 
-static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
-				   pte_t *ptep, pte_t pte)
-{
-	set_pte(ptep, pte);
-}
-
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 			     pte_t *ptep)
 {
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e70056838..2cd07b9422f 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_PAT_H
 
 #include <linux/types.h>
+#include <asm/pgtable_types.h>
 
 #ifdef CONFIG_X86_PAT
 extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
 
 extern int kernel_map_sync_memtype(u64 base, unsigned long size,
 		unsigned long flag);
+extern void map_devmem(unsigned long pfn, unsigned long size,
+		       struct pgprot vma_prot);
+extern void unmap_devmem(unsigned long pfn, unsigned long size,
+			 struct pgprot vma_prot);
 
 #endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index a977de23cb4..b51a1e8b0ba 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -86,12 +86,43 @@ static inline void early_quirks(void) { }
 
 extern void pci_iommu_alloc(void);
 
-#endif  /* __KERNEL__ */
+/* MSI arch hook */
+#define arch_setup_msi_irqs arch_setup_msi_irqs
+
+#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG)
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)       \
+	        dma_addr_t ADDR_NAME;
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)         \
+	        __u32 LEN_NAME;
+#define pci_unmap_addr(PTR, ADDR_NAME)                  \
+	        ((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)         \
+	        (((PTR)->ADDR_NAME) = (VAL))
+#define pci_unmap_len(PTR, LEN_NAME)                    \
+	        ((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL)           \
+	        (((PTR)->LEN_NAME) = (VAL))
 
-#ifdef CONFIG_X86_32
-# include "pci_32.h"
 #else
-# include "pci_64.h"
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)       dma_addr_t ADDR_NAME[0];
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
+#define pci_unmap_addr(PTR, ADDR_NAME)  sizeof((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+	        do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
+#define pci_unmap_len(PTR, LEN_NAME)            sizeof((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+	        do { break; } while (pci_unmap_len(PTR, LEN_NAME))
+
+#endif
+
+#endif  /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+#include "pci_64.h"
 #endif
 
 /* implement the pci_ DMA API in terms of the generic device dma_ one */
@@ -109,11 +140,6 @@ static inline int __pcibus_to_node(const struct pci_bus *bus)
 	return sd->node;
 }
 
-static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
-{
-	return node_to_cpumask(__pcibus_to_node(bus));
-}
-
 static inline const struct cpumask *
 cpumask_of_pcibus(const struct pci_bus *bus)
 {
diff --git a/arch/x86/include/asm/pci_32.h b/arch/x86/include/asm/pci_32.h
deleted file mode 100644
index 6f1213a6ef4..00000000000
--- a/arch/x86/include/asm/pci_32.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef _ASM_X86_PCI_32_H
-#define _ASM_X86_PCI_32_H
-
-
-#ifdef __KERNEL__
-
-
-/* Dynamic DMA mapping stuff.
- * i386 has everything mapped statically.
- */
-
-struct pci_dev;
-
-/* The PCI address space does equal the physical memory
- * address space.  The networking and block device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS	(1)
-
-/* pci_unmap_{page,single} is a nop so... */
-#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)	dma_addr_t ADDR_NAME[0];
-#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)	unsigned LEN_NAME[0];
-#define pci_unmap_addr(PTR, ADDR_NAME)	sizeof((PTR)->ADDR_NAME)
-#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
-	do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
-#define pci_unmap_len(PTR, LEN_NAME)		sizeof((PTR)->LEN_NAME)
-#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
-	do { break; } while (pci_unmap_len(PTR, LEN_NAME))
-
-
-#endif /* __KERNEL__ */
-
-
-#endif /* _ASM_X86_PCI_32_H */
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index 4da20798277..ae5e40f67da 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -24,28 +24,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
 
 extern void dma32_reserve_bootmem(void);
 
-/* The PCI address space does equal the physical memory
- * address space.  The networking and block device layers use
- * this boolean for bounce buffer decisions
- *
- * On AMD64 it mostly equals, but we set it to zero if a hardware
- * IOMMU (gart) of sotware IOMMU (swiotlb) is available.
- */
-#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
-
-#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)	\
-	dma_addr_t ADDR_NAME;
-#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)		\
-	__u32 LEN_NAME;
-#define pci_unmap_addr(PTR, ADDR_NAME)			\
-	((PTR)->ADDR_NAME)
-#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)		\
-	(((PTR)->ADDR_NAME) = (VAL))
-#define pci_unmap_len(PTR, LEN_NAME)			\
-	((PTR)->LEN_NAME)
-#define pci_unmap_len_set(PTR, LEN_NAME, VAL)		\
-	(((PTR)->LEN_NAME) = (VAL))
-
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index c1774ac9da7..2334982b339 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -26,13 +26,6 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
 	native_set_pte(ptep, pte);
 }
 
-static inline void native_set_pte_present(struct mm_struct *mm,
-					  unsigned long addr,
-					  pte_t *ptep, pte_t pte)
-{
-	native_set_pte(ptep, pte);
-}
-
 static inline void native_pmd_clear(pmd_t *pmdp)
 {
 	native_set_pmd(pmdp, __pmd(0));
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 3f13cdf6115..177b0165ea0 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -31,23 +31,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
 	ptep->pte_low = pte.pte_low;
 }
 
-/*
- * Since this is only called on user PTEs, and the page fault handler
- * must handle the already racy situation of simultaneous page faults,
- * we are justified in merely clearing the PTE present bit, followed
- * by a set.  The ordering here is important.
- */
-static inline void native_set_pte_present(struct mm_struct *mm,
-					  unsigned long addr,
-					  pte_t *ptep, pte_t pte)
-{
-	ptep->pte_low = 0;
-	smp_wmb();
-	ptep->pte_high = pte.pte_high;
-	smp_wmb();
-	ptep->pte_low = pte.pte_low;
-}
-
 static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c097a3a666..29d96d168bc 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -31,8 +31,6 @@ extern struct list_head pgd_list;
 #define set_pte(ptep, pte)		native_set_pte(ptep, pte)
 #define set_pte_at(mm, addr, ptep, pte)	native_set_pte_at(mm, addr, ptep, pte)
 
-#define set_pte_present(mm, addr, ptep, pte)				\
-	native_set_pte_present(mm, addr, ptep, pte)
 #define set_pte_atomic(ptep, pte)					\
 	native_set_pte_atomic(ptep, pte)
 
@@ -288,6 +286,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
 	return 1;
 }
 
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
 #endif	/* __ASSEMBLY__ */
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 97612fc7632..31bd120cf2a 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -42,9 +42,6 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
  */
 #undef TEST_ACCESS_OK
 
-/* The boot page tables (all created as a single array) */
-extern unsigned long pg0[];
-
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
 #else
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b2fe0..2733fad45f9 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -25,6 +25,11 @@
  * area for the same reason. ;)
  */
 #define VMALLOC_OFFSET	(8 * 1024 * 1024)
+
+#ifndef __ASSEMBLER__
+extern bool __vmalloc_start_set; /* set once high_memory is set */
+#endif
+
 #define VMALLOC_START	((unsigned long)high_memory + VMALLOC_OFFSET)
 #ifdef CONFIG_X86_PAE
 #define LAST_PKMAP 512
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0..b8238dc8786 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
 
 extern pteval_t __supported_pte_mask;
 extern int nx_enabled;
+extern void set_nx(void);
 
 #define pgprot_writecombine	pgprot_writecombine
 extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c7a98f73821..34c52370f2f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -75,9 +75,9 @@ struct cpuinfo_x86 {
 #else
 	/* Number of 4K pages in DTLB/ITLB combined(in pages): */
 	int			x86_tlbsize;
+#endif
 	__u8			x86_virt_bits;
 	__u8			x86_phys_bits;
-#endif
 	/* CPUID returned core id bits: */
 	__u8			x86_coreid_bits;
 	/* Max extended CPUID function supported: */
@@ -94,7 +94,7 @@ struct cpuinfo_x86 {
 	unsigned long		loops_per_jiffy;
 #ifdef CONFIG_SMP
 	/* cpus sharing the last level cache: */
-	cpumask_t		llc_shared_map;
+	cpumask_var_t		llc_shared_map;
 #endif
 	/* cpuid returned max cores value: */
 	u16			 x86_max_cores;
@@ -248,7 +248,6 @@ struct x86_hw_tss {
 #define IO_BITMAP_LONGS			(IO_BITMAP_BYTES/sizeof(long))
 #define IO_BITMAP_OFFSET		offsetof(struct tss_struct, io_bitmap)
 #define INVALID_IO_BITMAP_OFFSET	0x8000
-#define INVALID_IO_BITMAP_OFFSET_LAZY	0x9000
 
 struct tss_struct {
 	/*
@@ -263,11 +262,6 @@ struct tss_struct {
 	 * be within the limit.
 	 */
 	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1];
-	/*
-	 * Cache the current maximum and the last task that used the bitmap:
-	 */
-	unsigned long		io_bitmap_max;
-	struct thread_struct	*io_bitmap_owner;
 
 	/*
 	 * .. and then another 0x100 bytes for the emergency kernel stack:
@@ -397,6 +391,9 @@ DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
 DECLARE_INIT_PER_CPU(irq_stack_union);
 
 DECLARE_PER_CPU(char *, irq_stack_ptr);
+DECLARE_PER_CPU(unsigned int, irq_count);
+extern unsigned long kernel_eflags;
+extern asmlinkage void ignore_sysret(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 DECLARE_PER_CPU(unsigned long, stack_canary);
@@ -739,6 +736,7 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
 
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
+extern void init_c1e_mask(void);
 
 extern unsigned long		boot_option_idle_override;
 extern unsigned long		idle_halt;
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 8e0f8d199e0..86723035a51 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -80,8 +80,6 @@
 
 #define PTRACE_SINGLEBLOCK	33	/* resume execution until next branch */
 
-#ifdef CONFIG_X86_PTRACE_BTS
-
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 
@@ -140,6 +138,5 @@ struct ptrace_bts_config {
    BTS records are read from oldest to newest.
    Returns number of BTS records drained.
 */
-#endif /* CONFIG_X86_PTRACE_BTS */
 
 #endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 2b8c5160388..1b7ee5d673c 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -1 +1,8 @@
+#ifndef _ASM_X86_SECTIONS_H
+#define _ASM_X86_SECTIONS_H
+
 #include <asm-generic/sections.h>
+
+extern char __brk_base[], __brk_limit[];
+
+#endif	/* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 66801cb72f6..bdc2ada05ae 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -31,7 +31,6 @@ struct x86_quirks {
 	void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable,
 				unsigned short oemsize);
 	int (*setup_ioapic_ids)(void);
-	int (*update_apic)(void);
 };
 
 extern void x86_quirk_pre_intr_init(void);
@@ -65,7 +64,11 @@ extern void x86_quirk_time_init(void);
 #include <asm/bootparam.h>
 
 /* Interrupt control for vSMPowered x86_64 systems */
+#ifdef CONFIG_X86_64
 void vsmp_init(void);
+#else
+static inline void vsmp_init(void) { }
+#endif
 
 void setup_bios_corruption_check(void);
 
@@ -77,8 +80,6 @@ static inline void visws_early_detect(void) { }
 static inline int is_visws_box(void) { return 0; }
 #endif
 
-extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
-extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip);
 extern struct x86_quirks *x86_quirks;
 extern unsigned long saved_video_mode;
 
@@ -99,20 +100,51 @@ extern struct boot_params boot_params;
  */
 #define LOWMEMSIZE()	(0x9f000)
 
+/* exceedingly early brk-like allocator */
+extern unsigned long _brk_end;
+void *extend_brk(size_t size, size_t align);
+
+/*
+ * Reserve space in the brk section.  The name must be unique within
+ * the file, and somewhat descriptive.  The size is in bytes.  Must be
+ * used at file scope.
+ *
+ * (This uses a temp function to wrap the asm so we can pass it the
+ * size parameter; otherwise we wouldn't be able to.  We can't use a
+ * "section" attribute on a normal variable because it always ends up
+ * being @progbits, which ends up allocating space in the vmlinux
+ * executable.)
+ */
+#define RESERVE_BRK(name,sz)						\
+	static void __section(.discard) __used				\
+	__brk_reservation_fn_##name##__(void) {				\
+		asm volatile (						\
+			".pushsection .brk_reservation,\"aw\",@nobits;" \
+			".brk." #name ":"				\
+			" 1:.skip %c0;"					\
+			" .size .brk." #name ", . - 1b;"		\
+			" .popsection"					\
+			: : "i" (sz));					\
+	}
+
 #ifdef __i386__
 
 void __init i386_start_kernel(void);
 extern void probe_roms(void);
 
-extern unsigned long init_pg_tables_start;
-extern unsigned long init_pg_tables_end;
-
 #else
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
 
 #endif /* __i386__ */
 #endif /* _SETUP */
+#else
+#define RESERVE_BRK(name,sz)				\
+	.pushsection .brk_reservation,"aw",@nobits;	\
+.brk.name:						\
+1:	.skip sz;					\
+	.size .brk.name,.-1b;				\
+	.popsection
 #endif /* __ASSEMBLY__ */
 #endif  /*  __KERNEL__  */
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 47d0e21f2b9..19e0d88b966 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -21,19 +21,19 @@
 extern int smp_num_siblings;
 extern unsigned int num_processors;
 
-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
 DECLARE_PER_CPU(u16, cpu_llc_id);
 DECLARE_PER_CPU(int, cpu_number);
 
 static inline struct cpumask *cpu_sibling_mask(int cpu)
 {
-	return &per_cpu(cpu_sibling_map, cpu);
+	return per_cpu(cpu_sibling_map, cpu);
 }
 
 static inline struct cpumask *cpu_core_mask(int cpu)
 {
-	return &per_cpu(cpu_core_map, cpu);
+	return per_cpu(cpu_core_map, cpu);
 }
 
 DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
@@ -121,9 +121,10 @@ static inline void arch_send_call_function_single_ipi(int cpu)
 	smp_ops.send_call_func_single_ipi(cpu);
 }
 
-static inline void arch_send_call_function_ipi(cpumask_t mask)
+#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
+static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
-	smp_ops.send_call_func_ipi(&mask);
+	smp_ops.send_call_func_ipi(mask);
 }
 
 void cpu_disable_common(void);
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h
index 8ab9cc8b2ec..ca8bf2cd0ba 100644
--- a/arch/x86/include/asm/socket.h
+++ b/arch/x86/include/asm/socket.h
@@ -54,4 +54,7 @@
 
 #define SO_MARK			36
 
+#define SO_TIMESTAMPING		37
+#define SCM_TIMESTAMPING	SO_TIMESTAMPING
+
 #endif /* _ASM_X86_SOCKET_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 3a569665668..e5e6caffec8 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -295,6 +295,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
 		     : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
 }
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index a5074bd0f8b..48dcfa62ea0 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -24,28 +24,4 @@ struct saved_context {
 	unsigned long return_address;
 } __attribute__((packed));
 
-#ifdef CONFIG_ACPI
-extern unsigned long saved_eip;
-extern unsigned long saved_esp;
-extern unsigned long saved_ebp;
-extern unsigned long saved_ebx;
-extern unsigned long saved_esi;
-extern unsigned long saved_edi;
-
-static inline void acpi_save_register_state(unsigned long return_point)
-{
-	saved_eip = return_point;
-	asm volatile("movl %%esp,%0" : "=m" (saved_esp));
-	asm volatile("movl %%ebp,%0" : "=m" (saved_ebp));
-	asm volatile("movl %%ebx,%0" : "=m" (saved_ebx));
-	asm volatile("movl %%edi,%0" : "=m" (saved_edi));
-	asm volatile("movl %%esi,%0" : "=m" (saved_esi));
-}
-
-#define acpi_restore_register_state()  do {} while (0)
-
-/* routines for saving/restoring kernel state */
-extern int acpi_save_state_mem(void);
-#endif
-
 #endif /* _ASM_X86_SUSPEND_32_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1b8afa78e86..82ada75f3eb 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -174,10 +174,6 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_CPUID_FEATURE_SHIFT 2
 #define SVM_CPUID_FUNC 0x8000000a
 
-#define MSR_EFER_SVME_MASK (1ULL << 12)
-#define MSR_VM_CR       0xc0010114
-#define MSR_VM_HSAVE_PA 0xc0010117ULL
-
 #define SVM_VM_CR_SVM_DISABLE 4
 
 #define SVM_SELECTOR_S_SHIFT 4
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index ffb08be2a53..72a6dcd1299 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -70,8 +70,6 @@ struct old_utsname;
 asmlinkage long sys32_olduname(struct oldold_utsname __user *);
 long sys32_uname(struct old_utsname __user *);
 
-long sys32_ustat(unsigned, struct ustat32 __user *);
-
 asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
 			     compat_uptr_t __user *, struct pt_regs *);
 asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index c00bfdbdd45..643c59b4bc6 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -20,6 +20,9 @@
 struct task_struct; /* one of the stranger aspects of C forward declarations */
 struct task_struct *__switch_to(struct task_struct *prev,
 				struct task_struct *next);
+struct tss_struct;
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+		      struct tss_struct *tss);
 
 #ifdef CONFIG_X86_32
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ca7310e0244..3ffd5d2a367 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,7 @@ struct thread_info {
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
 #define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
+#define TIF_SYSCALL_FTRACE	27	/* for ftrace syscall instrumentation */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -117,15 +118,17 @@ struct thread_info {
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
 #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
+#define _TIF_SYSCALL_FTRACE	(1 << TIF_SYSCALL_FTRACE)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
-	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE |	\
 	 _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
 
 /* work to do in syscall_trace_leave() */
 #define _TIF_WORK_SYSCALL_EXIT	\
-	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP |	\
+	 _TIF_SYSCALL_FTRACE)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK							\
@@ -134,7 +137,7 @@ struct thread_info {
 	   _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
 
 /* work to do on any return to user space */
-#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
+#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index a81195eaa2b..bd37ed444a2 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -12,9 +12,9 @@ unsigned long native_calibrate_tsc(void);
 
 #ifdef CONFIG_X86_32
 extern int timer_ack;
-extern int recalibrate_cpu_khz(void);
 extern irqreturn_t timer_interrupt(int irq, void *dev_id);
 #endif /* CONFIG_X86_32 */
+extern int recalibrate_cpu_khz(void);
 
 extern int no_timer_check;
 
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 77cfb2cfb38..892b119dba6 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -44,9 +44,6 @@
 
 #ifdef CONFIG_X86_32
 
-/* Mappings between node number and cpus on that node. */
-extern cpumask_t node_to_cpumask_map[];
-
 /* Mappings between logical cpu number and node number */
 extern int cpu_to_node_map[];
 
@@ -57,30 +54,8 @@ static inline int cpu_to_node(int cpu)
 }
 #define early_cpu_to_node(cpu)	cpu_to_node(cpu)
 
-/* Returns a bitmask of CPUs on Node 'node'.
- *
- * Side note: this function creates the returned cpumask on the stack
- * so with a high NR_CPUS count, excessive stack space is used.  The
- * cpumask_of_node function should be used whenever possible.
- */
-static inline cpumask_t node_to_cpumask(int node)
-{
-	return node_to_cpumask_map[node];
-}
-
-/* Returns a bitmask of CPUs on Node 'node'. */
-static inline const struct cpumask *cpumask_of_node(int node)
-{
-	return &node_to_cpumask_map[node];
-}
-
-static inline void setup_node_to_cpumask_map(void) { }
-
 #else /* CONFIG_X86_64 */
 
-/* Mappings between node number and cpus on that node. */
-extern cpumask_t *node_to_cpumask_map;
-
 /* Mappings between logical cpu number and node number */
 DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 
@@ -91,8 +66,6 @@ DECLARE_PER_CPU(int, node_number);
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 extern int cpu_to_node(int cpu);
 extern int early_cpu_to_node(int cpu);
-extern const cpumask_t *cpumask_of_node(int node);
-extern cpumask_t node_to_cpumask(int node);
 
 #else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
 
@@ -108,42 +81,32 @@ static inline int early_cpu_to_node(int cpu)
 	return early_per_cpu(x86_cpu_to_node_map, cpu);
 }
 
-/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
-static inline const cpumask_t *cpumask_of_node(int node)
-{
-	return &node_to_cpumask_map[node];
-}
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+#endif /* CONFIG_X86_64 */
 
-/* Returns a bitmask of CPUs on Node 'node'. */
-static inline cpumask_t node_to_cpumask(int node)
+/* Mappings between node number and cpus on that node. */
+extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+extern const struct cpumask *cpumask_of_node(int node);
+#else
+/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
+static inline const struct cpumask *cpumask_of_node(int node)
 {
 	return node_to_cpumask_map[node];
 }
-
-#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+#endif
 
 extern void setup_node_to_cpumask_map(void);
 
 /*
- * Replace default node_to_cpumask_ptr with optimized version
- * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
- */
-#define node_to_cpumask_ptr(v, node)		\
-		const cpumask_t *v = cpumask_of_node(node)
-
-#define node_to_cpumask_ptr_next(v, node)	\
-			   v = cpumask_of_node(node)
-
-#endif /* CONFIG_X86_64 */
-
-/*
  * Returns the number of the node containing Node 'node'. This
  * architecture is flat, so it is a pretty simple function!
  */
 #define parent_node(node) (node)
 
 #define pcibus_to_node(bus) __pcibus_to_node(bus)
-#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
 
 #ifdef CONFIG_X86_32
 extern unsigned long node_start_pfn[];
@@ -209,52 +172,24 @@ static inline int early_cpu_to_node(int cpu)
 	return 0;
 }
 
-static inline const cpumask_t *cpumask_of_node(int node)
-{
-	return &cpu_online_map;
-}
-static inline cpumask_t node_to_cpumask(int node)
-{
-	return cpu_online_map;
-}
-static inline int node_to_first_cpu(int node)
+static inline const struct cpumask *cpumask_of_node(int node)
 {
-	return first_cpu(cpu_online_map);
+	return cpu_online_mask;
 }
 
 static inline void setup_node_to_cpumask_map(void) { }
 
-/*
- * Replace default node_to_cpumask_ptr with optimized version
- * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
- */
-#define node_to_cpumask_ptr(v, node)		\
-		const cpumask_t *v = cpumask_of_node(node)
-
-#define node_to_cpumask_ptr_next(v, node)	\
-			   v = cpumask_of_node(node)
 #endif
 
 #include <asm-generic/topology.h>
 
-#ifdef CONFIG_NUMA
-/* Returns the number of the first CPU on Node 'node'. */
-static inline int node_to_first_cpu(int node)
-{
-	return cpumask_first(cpumask_of_node(node));
-}
-#endif
-
-extern cpumask_t cpu_coregroup_map(int cpu);
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #ifdef ENABLE_TOPO_DEFINES
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).phys_proc_id)
 #define topology_core_id(cpu)			(cpu_data(cpu).cpu_core_id)
-#define topology_core_siblings(cpu)		(per_cpu(cpu_core_map, cpu))
-#define topology_thread_siblings(cpu)		(per_cpu(cpu_sibling_map, cpu))
-#define topology_core_cpumask(cpu)		(&per_cpu(cpu_core_map, cpu))
-#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
+#define topology_core_cpumask(cpu)		(per_cpu(cpu_core_map, cpu))
+#define topology_thread_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
 
 /* indicates that pointers to the topology cpumask_t maps are valid */
 #define arch_provides_topology_pointers		yes
@@ -268,7 +203,7 @@ struct pci_bus;
 void set_pci_bus_resources_arch_default(struct pci_bus *b);
 
 #ifdef CONFIG_SMP
-#define mc_capable()	(cpus_weight(per_cpu(cpu_core_map, 0)) != nr_cpu_ids)
+#define mc_capable()	(cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
 #define smt_capable()			(smp_num_siblings > 1)
 #endif
 
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index a0ba6138697..5e06259e90e 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -157,7 +157,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
 }
 
 static __always_inline unsigned long __copy_from_user_nocache(void *to,
-		const void __user *from, unsigned long n, unsigned long total)
+				const void __user *from, unsigned long n)
 {
 	might_fault();
 	if (__builtin_constant_p(n)) {
@@ -180,7 +180,7 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to,
 
 static __always_inline unsigned long
 __copy_from_user_inatomic_nocache(void *to, const void __user *from,
-				  unsigned long n, unsigned long total)
+				  unsigned long n)
 {
        return __copy_from_user_ll_nocache_nozero(to, from, n);
 }
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index dcaa0404cf7..8cc687326eb 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -188,29 +188,18 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
 extern long __copy_user_nocache(void *dst, const void __user *src,
 				unsigned size, int zerorest);
 
-static inline int __copy_from_user_nocache(void *dst, const void __user *src,
-				   unsigned size, unsigned long total)
+static inline int
+__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
 {
 	might_sleep();
-	/*
-	 * In practice this limit means that large file write()s
-	 * which get chunked to 4K copies get handled via
-	 * non-temporal stores here. Smaller writes get handled
-	 * via regular __copy_from_user():
-	 */
-	if (likely(total >= PAGE_SIZE))
-		return __copy_user_nocache(dst, src, size, 1);
-	else
-		return __copy_from_user(dst, src, size);
+	return __copy_user_nocache(dst, src, size, 1);
 }
 
-static inline int __copy_from_user_inatomic_nocache(void *dst,
-	    const void __user *src, unsigned size, unsigned total)
+static inline int
+__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
+				  unsigned size)
 {
-	if (likely(total >= PAGE_SIZE))
-		return __copy_user_nocache(dst, src, size, 0);
-	else
-		return __copy_from_user_inatomic(dst, src, size);
+	return __copy_user_nocache(dst, src, size, 0);
 }
 
 unsigned long
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 7e47658b0a6..0b4d8c2b157 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,8 @@
 #define __NR_dup3		330
 #define __NR_pipe2		331
 #define __NR_inotify_init1	332
+#define __NR_preadv		333
+#define __NR_pwritev		334
 #define __NR_perf_counter_open	333
 
 #ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 53025feaf88..d9aad876ad7 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,6 +653,10 @@ __SYSCALL(__NR_dup3, sys_dup3)
 __SYSCALL(__NR_pipe2, sys_pipe2)
 #define __NR_inotify_init1			294
 __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
+#define __NR_preadv				295
+__SYSCALL(__NR_preadv, sys_preadv)
+#define __NR_pwritev				296
+__SYSCALL(__NR_pwritev, sys_pwritev)
 #define __NR_perf_counter_open		295
 __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
 
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 8242bf96581..c0a01b5d985 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -12,7 +12,6 @@ extern enum uv_system_type get_uv_system_type(void);
 extern int is_uv_system(void);
 extern void uv_cpu_init(void);
 extern void uv_system_init(void);
-extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
 extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 						 struct mm_struct *mm,
 						 unsigned long va,
@@ -24,8 +23,6 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
 static inline int is_uv_system(void)	{ return 0; }
 static inline void uv_cpu_init(void)	{ }
 static inline void uv_system_init(void)	{ }
-static inline int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
-{ return 1; }
 static inline const struct cpumask *
 uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
 		    unsigned long va, unsigned int cpu)
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 777327ef05c..d3a98ea1062 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -11,11 +11,13 @@
 #ifndef _ASM_X86_UV_UV_HUB_H
 #define _ASM_X86_UV_UV_HUB_H
 
+#ifdef CONFIG_X86_64
 #include <linux/numa.h>
 #include <linux/percpu.h>
 #include <linux/timer.h>
 #include <asm/types.h>
 #include <asm/percpu.h>
+#include <asm/uv/uv_mmrs.h>
 
 
 /*
@@ -199,6 +201,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define SCIR_CPU_ACTIVITY	0x02	/* not idle */
 #define SCIR_CPU_HB_INTERVAL	(HZ)	/* once per second */
 
+/* Loop through all installed blades */
+#define for_each_possible_blade(bid)		\
+	for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
+
 /*
  * Macros for converting between kernel virtual addresses, socket local physical
  * addresses, and UV global physical addresses.
@@ -393,6 +399,7 @@ static inline void uv_set_scir_bits(unsigned char value)
 		uv_write_local_mmr8(uv_hub_info->scir.offset, value);
 	}
 }
+
 static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
 {
 	if (uv_cpu_hub_info(cpu)->scir.state != value) {
@@ -401,4 +408,15 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
 	}
 }
 
+static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
+{
+	unsigned long val;
+
+	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+			((apicid & 0x3f) << UVH_IPI_INT_APIC_ID_SHFT) |
+			(vector << UVH_IPI_INT_VECTOR_SHFT);
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+#endif /* CONFIG_X86_64 */
 #endif /* _ASM_X86_UV_UV_HUB_H */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index dd627793a23..db68ac8a5ac 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -1,3 +1,4 @@
+
 /*
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -243,6 +244,158 @@ union uvh_event_occurred0_u {
 #define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
 
 /* ========================================================================= */
+/*                         UVH_GR0_TLB_INT0_CONFIG                           */
+/* ========================================================================= */
+#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
+
+#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
+#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
+#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
+#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
+#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_gr0_tlb_int0_config_u {
+    unsigned long	v;
+    struct uvh_gr0_tlb_int0_config_s {
+	unsigned long	vector_  :  8;  /* RW */
+	unsigned long	dm       :  3;  /* RW */
+	unsigned long	destmode :  1;  /* RW */
+	unsigned long	status   :  1;  /* RO */
+	unsigned long	p        :  1;  /* RO */
+	unsigned long	rsvd_14  :  1;  /*    */
+	unsigned long	t        :  1;  /* RO */
+	unsigned long	m        :  1;  /* RW */
+	unsigned long	rsvd_17_31: 15;  /*    */
+	unsigned long	apic_id  : 32;  /* RW */
+    } s;
+};
+
+/* ========================================================================= */
+/*                         UVH_GR0_TLB_INT1_CONFIG                           */
+/* ========================================================================= */
+#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
+
+#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
+#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
+#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
+#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
+#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
+#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
+#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_gr0_tlb_int1_config_u {
+    unsigned long	v;
+    struct uvh_gr0_tlb_int1_config_s {
+	unsigned long	vector_  :  8;  /* RW */
+	unsigned long	dm       :  3;  /* RW */
+	unsigned long	destmode :  1;  /* RW */
+	unsigned long	status   :  1;  /* RO */
+	unsigned long	p        :  1;  /* RO */
+	unsigned long	rsvd_14  :  1;  /*    */
+	unsigned long	t        :  1;  /* RO */
+	unsigned long	m        :  1;  /* RW */
+	unsigned long	rsvd_17_31: 15;  /*    */
+	unsigned long	apic_id  : 32;  /* RW */
+    } s;
+};
+
+/* ========================================================================= */
+/*                         UVH_GR1_TLB_INT0_CONFIG                           */
+/* ========================================================================= */
+#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
+
+#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
+#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
+#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
+#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
+#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_gr1_tlb_int0_config_u {
+    unsigned long	v;
+    struct uvh_gr1_tlb_int0_config_s {
+	unsigned long	vector_  :  8;  /* RW */
+	unsigned long	dm       :  3;  /* RW */
+	unsigned long	destmode :  1;  /* RW */
+	unsigned long	status   :  1;  /* RO */
+	unsigned long	p        :  1;  /* RO */
+	unsigned long	rsvd_14  :  1;  /*    */
+	unsigned long	t        :  1;  /* RO */
+	unsigned long	m        :  1;  /* RW */
+	unsigned long	rsvd_17_31: 15;  /*    */
+	unsigned long	apic_id  : 32;  /* RW */
+    } s;
+};
+
+/* ========================================================================= */
+/*                         UVH_GR1_TLB_INT1_CONFIG                           */
+/* ========================================================================= */
+#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
+
+#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
+#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
+#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
+#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
+#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
+#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
+#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_gr1_tlb_int1_config_u {
+    unsigned long	v;
+    struct uvh_gr1_tlb_int1_config_s {
+	unsigned long	vector_  :  8;  /* RW */
+	unsigned long	dm       :  3;  /* RW */
+	unsigned long	destmode :  1;  /* RW */
+	unsigned long	status   :  1;  /* RO */
+	unsigned long	p        :  1;  /* RO */
+	unsigned long	rsvd_14  :  1;  /*    */
+	unsigned long	t        :  1;  /* RO */
+	unsigned long	m        :  1;  /* RW */
+	unsigned long	rsvd_17_31: 15;  /*    */
+	unsigned long	apic_id  : 32;  /* RW */
+    } s;
+};
+
+/* ========================================================================= */
 /*                               UVH_INT_CMPB                                */
 /* ========================================================================= */
 #define UVH_INT_CMPB 0x22080UL
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 59363627523..e0f9aa16358 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -118,7 +118,7 @@ static inline void cpu_svm_disable(void)
 
 	wrmsrl(MSR_VM_HSAVE_PA, 0);
 	rdmsrl(MSR_EFER, efer);
-	wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+	wrmsrl(MSR_EFER, efer & ~EFER_SVME);
 }
 
 /** Makes sure SVM is disabled, if it is supported on the CPU
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index d0238e6151d..498f944010b 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -270,8 +270,9 @@ enum vmcs_field {
 
 #define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
 #define INTR_TYPE_NMI_INTR		(2 << 8) /* NMI */
-#define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
+#define INTR_TYPE_HARD_EXCEPTION	(3 << 8) /* processor exception */
 #define INTR_TYPE_SOFT_INTR             (4 << 8) /* software interrupt */
+#define INTR_TYPE_SOFT_EXCEPTION	(6 << 8) /* software exception */
 
 /* GUEST_INTERRUPTIBILITY_INFO flags. */
 #define GUEST_INTR_STATE_STI		0x00000001
@@ -311,7 +312,7 @@ enum vmcs_field {
 #define DEBUG_REG_ACCESS_TYPE           0x10    /* 4, direction of access */
 #define TYPE_MOV_TO_DR                  (0 << 4)
 #define TYPE_MOV_FROM_DR                (1 << 4)
-#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose reg. */
+#define DEBUG_REG_ACCESS_REG(eq)        (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
 
 
 /* segment AR */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 5e79ca69432..9c371e4a9fa 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -296,6 +296,8 @@ HYPERVISOR_get_debugreg(int reg)
 static inline int
 HYPERVISOR_update_descriptor(u64 ma, u64 desc)
 {
+	if (sizeof(u64) == sizeof(long))
+		return _hypercall2(int, update_descriptor, ma, desc);
 	return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
 }
 
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990ee43d..1a918dde46b 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x)
 
 
 xmaddr_t arbitrary_virt_to_machine(void *address);
+unsigned long arbitrary_virt_to_mfn(void *vaddr);
 void make_lowmem_page_readonly(void *vaddr);
 void make_lowmem_page_readwrite(void *vaddr);
 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index de5657c039e..145cce75cda 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -66,11 +66,11 @@ obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-y				+= apic/
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
-obj-y				+= vsmp_64.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_MODULES)		+= module_$(BITS).o
 obj-$(CONFIG_EFI) 		+= efi.o efi_$(BITS).o efi_stub_$(BITS).o
@@ -106,12 +106,12 @@ obj-$(CONFIG_MICROCODE)			+= microcode.o
 
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
-obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb_64.o # NB rename without _64
+obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
 
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o
+	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
 	obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
 	obj-$(CONFIG_AUDIT)		+= audit_64.o
 
@@ -120,4 +120,5 @@ ifeq ($(CONFIG_X86_64),y)
 	obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o amd_iommu.o
 
 	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
+	obj-y				+= vsmp_64.o
 endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a18eb7ce223..723989d7f80 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -230,6 +230,35 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
 }
 
 static int __init
+acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
+{
+	struct acpi_madt_local_x2apic *processor = NULL;
+
+	processor = (struct acpi_madt_local_x2apic *)header;
+
+	if (BAD_MADT_ENTRY(processor, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+#ifdef CONFIG_X86_X2APIC
+	/*
+	 * We need to register disabled CPU as well to permit
+	 * counting disabled CPUs. This allows us to size
+	 * cpus_possible_map more accurately, to permit
+	 * to not preallocating memory for all NR_CPUS
+	 * when we use CPU hotplug.
+	 */
+	acpi_register_lapic(processor->local_apic_id,	/* APIC ID */
+			    processor->lapic_flags & ACPI_MADT_ENABLED);
+#else
+	printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+#endif
+
+	return 0;
+}
+
+static int __init
 acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
 {
 	struct acpi_madt_local_apic *processor = NULL;
@@ -289,6 +318,25 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
 }
 
 static int __init
+acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
+		      const unsigned long end)
+{
+	struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL;
+
+	x2apic_nmi = (struct acpi_madt_local_x2apic_nmi *)header;
+
+	if (BAD_MADT_ENTRY(x2apic_nmi, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	if (x2apic_nmi->lint != 1)
+		printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+
+	return 0;
+}
+
+static int __init
 acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
 {
 	struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
@@ -793,6 +841,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
 static int __init acpi_parse_madt_lapic_entries(void)
 {
 	int count;
+	int x2count = 0;
 
 	if (!cpu_has_apic)
 		return -ENODEV;
@@ -816,22 +865,28 @@ static int __init acpi_parse_madt_lapic_entries(void)
 	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
 				      acpi_parse_sapic, MAX_APICS);
 
-	if (!count)
+	if (!count) {
+		x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
+						acpi_parse_x2apic, MAX_APICS);
 		count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
 					      acpi_parse_lapic, MAX_APICS);
-	if (!count) {
+	}
+	if (!count && !x2count) {
 		printk(KERN_ERR PREFIX "No LAPIC entries present\n");
 		/* TBD: Cleanup to allow fallback to MPS */
 		return -ENODEV;
-	} else if (count < 0) {
+	} else if (count < 0 || x2count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
 		/* TBD: Cleanup to allow fallback to MPS */
 		return count;
 	}
 
+	x2count =
+	    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+				  acpi_parse_x2apic_nmi, 0);
 	count =
 	    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
-	if (count < 0) {
+	if (count < 0 || x2count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
 		/* TBD: Cleanup to allow fallback to MPS */
 		return count;
@@ -1470,7 +1525,7 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 
 /*
  * If your system is blacklisted here, but you find that acpi=force
- * works for you, please contact acpi-devel@sourceforge.net
+ * works for you, please contact linux-acpi@vger.kernel.org
  */
 static struct dmi_system_id __initdata acpi_dmi_table[] = {
 	/*
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d5..f5765870257 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -5,6 +5,7 @@
 #include <linux/kprobes.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
+#include <linux/memory.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
@@ -12,7 +13,9 @@
 #include <asm/nmi.h>
 #include <asm/vsyscall.h>
 #include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
 #include <asm/io.h>
+#include <asm/fixmap.h>
 
 #define MAX_PATCH_LEN (255-1)
 
@@ -226,6 +229,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
 	u8 **ptr;
 
+	mutex_lock(&text_mutex);
 	for (ptr = start; ptr < end; ptr++) {
 		if (*ptr < text)
 			continue;
@@ -234,6 +238,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 		/* turn DS segment override prefix into lock prefix */
 		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
 	};
+	mutex_unlock(&text_mutex);
 }
 
 static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
@@ -243,6 +248,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
 	if (noreplace_smp)
 		return;
 
+	mutex_lock(&text_mutex);
 	for (ptr = start; ptr < end; ptr++) {
 		if (*ptr < text)
 			continue;
@@ -251,6 +257,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
 		/* turn lock prefix into DS segment override prefix */
 		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
 	};
+	mutex_unlock(&text_mutex);
 }
 
 struct smp_alt_module {
@@ -414,9 +421,17 @@ void __init alternative_instructions(void)
 	   that might execute the to be patched code.
 	   Other CPUs are not running. */
 	stop_nmi();
-#ifdef CONFIG_X86_MCE
-	stop_mce();
-#endif
+
+	/*
+	 * Don't stop machine check exceptions while patching.
+	 * MCEs only happen when something got corrupted and in this
+	 * case we must do something about the corruption.
+	 * Ignoring it is worse than a unlikely patching race.
+	 * Also machine checks tend to be broadcast and if one CPU
+	 * goes into machine check the others follow quickly, so we don't
+	 * expect a machine check to cause undue problems during to code
+	 * patching.
+	 */
 
 	apply_alternatives(__alt_instructions, __alt_instructions_end);
 
@@ -456,9 +471,6 @@ void __init alternative_instructions(void)
 				(unsigned long)__smp_locks_end);
 
 	restart_nmi();
-#ifdef CONFIG_X86_MCE
-	restart_mce();
-#endif
 }
 
 /**
@@ -495,15 +507,16 @@ void *text_poke_early(void *addr, const void *opcode, size_t len)
  * It means the size must be writable atomically and the address must be aligned
  * in a way that permits an atomic write. It also makes sure we fit on a single
  * page.
+ *
+ * Note: Must be called under text_mutex.
  */
 void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
 {
+	unsigned long flags;
 	char *vaddr;
-	int nr_pages = 2;
 	struct page *pages[2];
 	int i;
 
-	might_sleep();
 	if (!core_kernel_text((unsigned long)addr)) {
 		pages[0] = vmalloc_to_page(addr);
 		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
@@ -513,18 +526,21 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
 		pages[1] = virt_to_page(addr + PAGE_SIZE);
 	}
 	BUG_ON(!pages[0]);
-	if (!pages[1])
-		nr_pages = 1;
-	vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-	BUG_ON(!vaddr);
-	local_irq_disable();
+	local_irq_save(flags);
+	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
+	if (pages[1])
+		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
+	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
 	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
-	local_irq_enable();
-	vunmap(vaddr);
+	clear_fixmap(FIX_TEXT_POKE0);
+	if (pages[1])
+		clear_fixmap(FIX_TEXT_POKE1);
+	local_flush_tlb();
 	sync_core();
 	/* Could also do a CLFLUSH here to speed up CPU recovery; but
 	   that causes hangs on some VIA CPUs. */
 	for (i = 0; i < len; i++)
 		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
+	local_irq_restore(flags);
 	return addr;
 }
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 5113c080f0c..a97db99dad5 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -22,10 +22,9 @@
 #include <linux/bitops.h>
 #include <linux/debugfs.h>
 #include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
 #include <linux/iommu-helper.h>
-#ifdef CONFIG_IOMMU_API
 #include <linux/iommu.h>
-#endif
 #include <asm/proto.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
@@ -1297,8 +1296,10 @@ static void __unmap_single(struct amd_iommu *iommu,
 /*
  * The exported map_single function for dma_ops.
  */
-static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
-			     size_t size, int dir)
+static dma_addr_t map_page(struct device *dev, struct page *page,
+			   unsigned long offset, size_t size,
+			   enum dma_data_direction dir,
+			   struct dma_attrs *attrs)
 {
 	unsigned long flags;
 	struct amd_iommu *iommu;
@@ -1306,6 +1307,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 	u16 devid;
 	dma_addr_t addr;
 	u64 dma_mask;
+	phys_addr_t paddr = page_to_phys(page) + offset;
 
 	INC_STATS_COUNTER(cnt_map_single);
 
@@ -1340,8 +1342,8 @@ out:
 /*
  * The exported unmap_single function for dma_ops.
  */
-static void unmap_single(struct device *dev, dma_addr_t dma_addr,
-			 size_t size, int dir)
+static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
+		       enum dma_data_direction dir, struct dma_attrs *attrs)
 {
 	unsigned long flags;
 	struct amd_iommu *iommu;
@@ -1390,7 +1392,8 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
  * lists).
  */
 static int map_sg(struct device *dev, struct scatterlist *sglist,
-		  int nelems, int dir)
+		  int nelems, enum dma_data_direction dir,
+		  struct dma_attrs *attrs)
 {
 	unsigned long flags;
 	struct amd_iommu *iommu;
@@ -1457,7 +1460,8 @@ unmap:
  * lists).
  */
 static void unmap_sg(struct device *dev, struct scatterlist *sglist,
-		     int nelems, int dir)
+		     int nelems, enum dma_data_direction dir,
+		     struct dma_attrs *attrs)
 {
 	unsigned long flags;
 	struct amd_iommu *iommu;
@@ -1644,11 +1648,11 @@ static void prealloc_protection_domains(void)
 	}
 }
 
-static struct dma_mapping_ops amd_iommu_dma_ops = {
+static struct dma_map_ops amd_iommu_dma_ops = {
 	.alloc_coherent = alloc_coherent,
 	.free_coherent = free_coherent,
-	.map_single = map_single,
-	.unmap_single = unmap_single,
+	.map_page = map_page,
+	.unmap_page = unmap_page,
 	.map_sg = map_sg,
 	.unmap_sg = unmap_sg,
 	.dma_supported = amd_iommu_dma_supported,
@@ -1924,6 +1928,12 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 	return paddr;
 }
 
+static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
+				    unsigned long cap)
+{
+	return 0;
+}
+
 static struct iommu_ops amd_iommu_ops = {
 	.domain_init = amd_iommu_domain_init,
 	.domain_destroy = amd_iommu_domain_destroy,
@@ -1932,5 +1942,6 @@ static struct iommu_ops amd_iommu_ops = {
 	.map = amd_iommu_map_range,
 	.unmap = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
+	.domain_has_cap = amd_iommu_domain_has_cap,
 };
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4732768c534..b0e5e712a7a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -47,6 +47,7 @@
 #include <asm/idle.h>
 #include <asm/mtrr.h>
 #include <asm/smp.h>
+#include <asm/mce.h>
 
 unsigned int num_processors;
 
@@ -811,7 +812,7 @@ void clear_local_APIC(void)
 	u32 v;
 
 	/* APIC hasn't been mapped yet */
-	if (!apic_phys)
+	if (!x2apic && !apic_phys)
 		return;
 
 	maxlvt = lapic_get_maxlvt();
@@ -845,6 +846,14 @@ void clear_local_APIC(void)
 		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
 	}
 #endif
+#ifdef CONFIG_X86_MCE_INTEL
+	if (maxlvt >= 6) {
+		v = apic_read(APIC_LVTCMCI);
+		if (!(v & APIC_LVT_MASKED))
+			apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+	}
+#endif
+
 	/*
 	 * Clean APIC state for other OSs:
 	 */
@@ -1245,6 +1254,12 @@ void __cpuinit setup_local_APIC(void)
 	apic_write(APIC_LVT1, value);
 
 	preempt_enable();
+
+#ifdef CONFIG_X86_MCE_INTEL
+	/* Recheck CMCI information after local APIC is up on CPU #0 */
+	if (smp_processor_id() == 0)
+		cmci_recheck();
+#endif
 }
 
 void __cpuinit end_local_APIC_setup(void)
@@ -1323,15 +1338,16 @@ void __init enable_IR_x2apic(void)
 		return;
 	}
 
-	local_irq_save(flags);
-	mask_8259A();
-
-	ret = save_mask_IO_APIC_setup();
+	ret = save_IO_APIC_setup();
 	if (ret) {
 		pr_info("Saving IO-APIC state failed: %d\n", ret);
 		goto end;
 	}
 
+	local_irq_save(flags);
+	mask_IO_APIC_setup();
+	mask_8259A();
+
 	ret = enable_intr_remapping(1);
 
 	if (ret && x2apic_preenabled) {
@@ -1356,10 +1372,10 @@ end_restore:
 	else
 		reinit_intr_remapped_IO_APIC(x2apic_preenabled);
 
-end:
 	unmask_8259A();
 	local_irq_restore(flags);
 
+end:
 	if (!ret) {
 		if (!x2apic_preenabled)
 			pr_info("Enabled x2apic and interrupt-remapping\n");
@@ -1512,12 +1528,10 @@ void __init early_init_lapic_mapping(void)
  */
 void __init init_apic_mappings(void)
 {
-#ifdef CONFIG_X86_X2APIC
 	if (x2apic) {
 		boot_cpu_physical_apicid = read_apic_id();
 		return;
 	}
-#endif
 
 	/*
 	 * If no local APIC can be found then set up a fake all
@@ -1961,12 +1975,9 @@ static int lapic_resume(struct sys_device *dev)
 
 	local_irq_save(flags);
 
-#ifdef CONFIG_X86_X2APIC
 	if (x2apic)
 		enable_x2apic();
-	else
-#endif
-	{
+	else {
 		/*
 		 * Make sure the APICBASE points to the right address
 		 *
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 3b002995e14..0014714ea97 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -159,20 +159,6 @@ static int flat_apic_id_registered(void)
 	return physid_isset(read_xapic_id(), phys_cpu_present_map);
 }
 
-static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
-	return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
-}
-
-static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-						const struct cpumask *andmask)
-{
-	unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
-	unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
-
-	return mask1 & mask2;
-}
-
 static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
 {
 	return hard_smp_processor_id() >> index_msb;
@@ -213,8 +199,8 @@ struct apic apic_flat =  {
 	.set_apic_id			= set_apic_id,
 	.apic_id_mask			= 0xFFu << 24,
 
-	.cpu_mask_to_apicid		= flat_cpu_mask_to_apicid,
-	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and,
+	.cpu_mask_to_apicid		= default_cpu_mask_to_apicid,
+	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= flat_send_IPI_mask,
 	.send_IPI_mask_allbutself	= flat_send_IPI_mask_allbutself,
@@ -222,7 +208,6 @@ struct apic apic_flat =  {
 	.send_IPI_all			= flat_send_IPI_all,
 	.send_IPI_self			= apic_send_IPI_self,
 
-	.wakeup_cpu			= NULL,
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= NULL,
@@ -373,7 +358,6 @@ struct apic apic_physflat =  {
 	.send_IPI_all			= physflat_send_IPI_all,
 	.send_IPI_self			= apic_send_IPI_self,
 
-	.wakeup_cpu			= NULL,
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= NULL,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 0b1093394fd..676cdac385c 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -16,32 +16,31 @@
 #include <asm/apic.h>
 #include <asm/ipi.h>
 
-static inline unsigned bigsmp_get_apic_id(unsigned long x)
+static unsigned bigsmp_get_apic_id(unsigned long x)
 {
 	return (x >> 24) & 0xFF;
 }
 
-static inline int bigsmp_apic_id_registered(void)
+static int bigsmp_apic_id_registered(void)
 {
 	return 1;
 }
 
-static inline const cpumask_t *bigsmp_target_cpus(void)
+static const struct cpumask *bigsmp_target_cpus(void)
 {
 #ifdef CONFIG_SMP
-	return &cpu_online_map;
+	return cpu_online_mask;
 #else
-	return &cpumask_of_cpu(0);
+	return cpumask_of(0);
 #endif
 }
 
-static inline unsigned long
-bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
 {
 	return 0;
 }
 
-static inline unsigned long bigsmp_check_apicid_present(int bit)
+static unsigned long bigsmp_check_apicid_present(int bit)
 {
 	return 1;
 }
@@ -64,7 +63,7 @@ static inline unsigned long calculate_ldr(int cpu)
  * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
  * document number 292116).  So here it goes...
  */
-static inline void bigsmp_init_apic_ldr(void)
+static void bigsmp_init_apic_ldr(void)
 {
 	unsigned long val;
 	int cpu = smp_processor_id();
@@ -74,19 +73,19 @@ static inline void bigsmp_init_apic_ldr(void)
 	apic_write(APIC_LDR, val);
 }
 
-static inline void bigsmp_setup_apic_routing(void)
+static void bigsmp_setup_apic_routing(void)
 {
 	printk(KERN_INFO
 		"Enabling APIC mode:  Physflat.  Using %d I/O APICs\n",
 		nr_ioapics);
 }
 
-static inline int bigsmp_apicid_to_node(int logical_apicid)
+static int bigsmp_apicid_to_node(int logical_apicid)
 {
 	return apicid_2_node[hard_smp_processor_id()];
 }
 
-static inline int bigsmp_cpu_present_to_apicid(int mps_cpu)
+static int bigsmp_cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < nr_cpu_ids)
 		return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
@@ -94,7 +93,7 @@ static inline int bigsmp_cpu_present_to_apicid(int mps_cpu)
 	return BAD_APICID;
 }
 
-static inline physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
+static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
 {
 	return physid_mask_of_physid(phys_apicid);
 }
@@ -107,29 +106,24 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
 	return cpu_physical_id(cpu);
 }
 
-static inline physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
+static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
 {
 	/* For clustered we don't have a good way to do this yet - hack */
 	return physids_promote(0xFFL);
 }
 
-static inline void bigsmp_setup_portio_remap(void)
-{
-}
-
-static inline int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
+static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
 {
 	return 1;
 }
 
 /* As we are using single CPU as destination, pick only one CPU here */
-static inline unsigned int bigsmp_cpu_mask_to_apicid(const cpumask_t *cpumask)
+static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-	return bigsmp_cpu_to_logical_apicid(first_cpu(*cpumask));
+	return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask));
 }
 
-static inline unsigned int
-bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 			      const struct cpumask *andmask)
 {
 	int cpu;
@@ -148,7 +142,7 @@ bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 	return BAD_APICID;
 }
 
-static inline int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
+static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
 {
 	return cpuid_apic >> index_msb;
 }
@@ -158,12 +152,12 @@ static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
 	default_send_IPI_mask_sequence_phys(mask, vector);
 }
 
-static inline void bigsmp_send_IPI_allbutself(int vector)
+static void bigsmp_send_IPI_allbutself(int vector)
 {
 	default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
 }
 
-static inline void bigsmp_send_IPI_all(int vector)
+static void bigsmp_send_IPI_all(int vector)
 {
 	bigsmp_send_IPI_mask(cpu_online_mask, vector);
 }
@@ -194,10 +188,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
 	{ } /* NULL entry stops DMI scanning */
 };
 
-static void bigsmp_vector_allocation_domain(int cpu, cpumask_t *retmask)
+static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-	cpus_clear(*retmask);
-	cpu_set(cpu, *retmask);
+	cpumask_clear(retmask);
+	cpumask_set_cpu(cpu, retmask);
 }
 
 static int probe_bigsmp(void)
@@ -256,7 +250,6 @@ struct apic apic_bigsmp = {
 	.send_IPI_all			= bigsmp_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.wakeup_cpu			= NULL,
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 320f2d2e4e5..1c11b819f24 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -163,22 +163,17 @@ static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 	return 0;
 }
 
-static int __init es7000_update_apic(void)
+static int es7000_apic_is_cluster(void)
 {
-	apic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
-
 	/* MPENTIUMIII */
 	if (boot_cpu_data.x86 == 6 &&
-	    (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) {
-		es7000_update_apic_to_cluster();
-		apic->wait_for_init_deassert = NULL;
-		apic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
-	}
+	    (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11))
+		return 1;
 
 	return 0;
 }
 
-static void __init setup_unisys(void)
+static void setup_unisys(void)
 {
 	/*
 	 * Determine the generation of the ES7000 currently running.
@@ -192,14 +187,12 @@ static void __init setup_unisys(void)
 	else
 		es7000_plat = ES7000_CLASSIC;
 	ioapic_renumber_irq = es7000_rename_gsi;
-
-	x86_quirks->update_apic = es7000_update_apic;
 }
 
 /*
  * Parse the OEM Table:
  */
-static int __init parse_unisys_oem(char *oemptr)
+static int parse_unisys_oem(char *oemptr)
 {
 	int			i;
 	int 			success = 0;
@@ -261,7 +254,7 @@ static int __init parse_unisys_oem(char *oemptr)
 }
 
 #ifdef CONFIG_ACPI
-static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
+static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
 {
 	struct acpi_table_header *header = NULL;
 	struct es7000_oem_table *table;
@@ -292,7 +285,7 @@ static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
 	return 0;
 }
 
-static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
+static void unmap_unisys_acpi_oem_table(unsigned long oem_addr)
 {
 	if (!oem_addr)
 		return;
@@ -310,8 +303,10 @@ static int es7000_check_dsdt(void)
 	return 0;
 }
 
+static int es7000_acpi_ret;
+
 /* Hook from generic ACPI tables.c */
-static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	unsigned long oem_addr = 0;
 	int check_dsdt;
@@ -332,10 +327,26 @@ static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		 */
 		unmap_unisys_acpi_oem_table(oem_addr);
 	}
-	return ret;
+
+	es7000_acpi_ret = ret;
+
+	return ret && !es7000_apic_is_cluster();
 }
+
+static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
+{
+	int ret = es7000_acpi_ret;
+
+	return ret && es7000_apic_is_cluster();
+}
+
 #else /* !CONFIG_ACPI: */
-static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	return 0;
+}
+
+static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
 {
 	return 0;
 }
@@ -349,8 +360,7 @@ static void es7000_spin(int n)
 		rep_nop();
 }
 
-static int __init
-es7000_mip_write(struct mip_reg *mip_reg)
+static int es7000_mip_write(struct mip_reg *mip_reg)
 {
 	int status = 0;
 	int spin;
@@ -383,7 +393,7 @@ es7000_mip_write(struct mip_reg *mip_reg)
 	return status;
 }
 
-static void __init es7000_enable_apic_mode(void)
+static void es7000_enable_apic_mode(void)
 {
 	struct mip_reg es7000_mip_reg;
 	int mip_status;
@@ -400,7 +410,7 @@ static void __init es7000_enable_apic_mode(void)
 		WARN(1, "Command failed, status = %x\n", mip_status);
 }
 
-static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask)
+static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
 	/* Careful. Some cpus do not strictly honor the set of cpus
 	 * specified in the interrupt destination when using lowest
@@ -410,17 +420,15 @@ static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask)
 	 * deliver interrupts to the wrong hyperthread when only one
 	 * hyperthread was specified in the interrupt desitination.
 	 */
-	*retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
+	cpumask_clear(retmask);
+	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
 }
 
 
 static void es7000_wait_for_init_deassert(atomic_t *deassert)
 {
-#ifndef CONFIG_ES7000_CLUSTERED_APIC
 	while (!atomic_read(deassert))
 		cpu_relax();
-#endif
-	return;
 }
 
 static unsigned int es7000_get_apic_id(unsigned long x)
@@ -448,14 +456,14 @@ static int es7000_apic_id_registered(void)
 	return 1;
 }
 
-static const cpumask_t *target_cpus_cluster(void)
+static const struct cpumask *target_cpus_cluster(void)
 {
-	return &CPU_MASK_ALL;
+	return cpu_all_mask;
 }
 
-static const cpumask_t *es7000_target_cpus(void)
+static const struct cpumask *es7000_target_cpus(void)
 {
-	return &cpumask_of_cpu(smp_processor_id());
+	return cpumask_of(smp_processor_id());
 }
 
 static unsigned long
@@ -510,7 +518,7 @@ static void es7000_setup_apic_routing(void)
 	  "Enabling APIC mode:  %s. Using %d I/O APICs, target cpus %lx\n",
 		(apic_version[apic] == 0x14) ?
 			"Physical Cluster" : "Logical Cluster",
-		nr_ioapics, cpus_addr(*es7000_target_cpus())[0]);
+		nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
 }
 
 static int es7000_apicid_to_node(int logical_apicid)
@@ -565,72 +573,24 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
 	return 1;
 }
 
-static unsigned int
-es7000_cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
-{
-	int cpus_found = 0;
-	int num_bits_set;
-	int apicid;
-	int cpu;
-
-	num_bits_set = cpumask_weight(cpumask);
-	/* Return id to all */
-	if (num_bits_set == nr_cpu_ids)
-		return 0xFF;
-	/*
-	 * The cpus in the mask must all be on the apic cluster.  If are not
-	 * on the same apicid cluster return default value of target_cpus():
-	 */
-	cpu = cpumask_first(cpumask);
-	apicid = es7000_cpu_to_logical_apicid(cpu);
-
-	while (cpus_found < num_bits_set) {
-		if (cpumask_test_cpu(cpu, cpumask)) {
-			int new_apicid = es7000_cpu_to_logical_apicid(cpu);
-
-			if (APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
-				WARN(1, "Not a valid mask!");
-
-				return 0xFF;
-			}
-			apicid = new_apicid;
-			cpus_found++;
-		}
-		cpu++;
-	}
-	return apicid;
-}
-
-static unsigned int es7000_cpu_mask_to_apicid(const cpumask_t *cpumask)
+static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-	int cpus_found = 0;
-	int num_bits_set;
-	int apicid;
-	int cpu;
+	unsigned int round = 0;
+	int cpu, uninitialized_var(apicid);
 
-	num_bits_set = cpus_weight(*cpumask);
-	/* Return id to all */
-	if (num_bits_set == nr_cpu_ids)
-		return es7000_cpu_to_logical_apicid(0);
 	/*
-	 * The cpus in the mask must all be on the apic cluster.  If are not
-	 * on the same apicid cluster return default value of target_cpus():
+	 * The cpus in the mask must all be on the apic cluster.
 	 */
-	cpu = first_cpu(*cpumask);
-	apicid = es7000_cpu_to_logical_apicid(cpu);
-	while (cpus_found < num_bits_set) {
-		if (cpu_isset(cpu, *cpumask)) {
-			int new_apicid = es7000_cpu_to_logical_apicid(cpu);
+	for_each_cpu(cpu, cpumask) {
+		int new_apicid = es7000_cpu_to_logical_apicid(cpu);
 
-			if (APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
-				printk("%s: Not a valid mask!\n", __func__);
+		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
+			WARN(1, "Not a valid mask!");
 
-				return es7000_cpu_to_logical_apicid(0);
-			}
-			apicid = new_apicid;
-			cpus_found++;
+			return BAD_APICID;
 		}
-		cpu++;
+		apicid = new_apicid;
+		round++;
 	}
 	return apicid;
 }
@@ -659,37 +619,103 @@ static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
 	return cpuid_apic >> index_msb;
 }
 
-void __init es7000_update_apic_to_cluster(void)
-{
-	apic->target_cpus = target_cpus_cluster;
-	apic->irq_delivery_mode = dest_LowestPrio;
-	/* logical delivery broadcast to all procs: */
-	apic->irq_dest_mode = 1;
-
-	apic->init_apic_ldr = es7000_init_apic_ldr_cluster;
-
-	apic->cpu_mask_to_apicid = es7000_cpu_mask_to_apicid_cluster;
-}
-
 static int probe_es7000(void)
 {
 	/* probed later in mptable/ACPI hooks */
 	return 0;
 }
 
-static __init int
-es7000_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
+static int es7000_mps_ret;
+static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
+		char *productid)
 {
+	int ret = 0;
+
 	if (mpc->oemptr) {
 		struct mpc_oemtable *oem_table =
 			(struct mpc_oemtable *)mpc->oemptr;
 
 		if (!strncmp(oem, "UNISYS", 6))
-			return parse_unisys_oem((char *)oem_table);
+			ret = parse_unisys_oem((char *)oem_table);
 	}
-	return 0;
+
+	es7000_mps_ret = ret;
+
+	return ret && !es7000_apic_is_cluster();
 }
 
+static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
+		char *productid)
+{
+	int ret = es7000_mps_ret;
+
+	return ret && es7000_apic_is_cluster();
+}
+
+struct apic apic_es7000_cluster = {
+
+	.name				= "es7000",
+	.probe				= probe_es7000,
+	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check_cluster,
+	.apic_id_registered		= es7000_apic_id_registered,
+
+	.irq_delivery_mode		= dest_LowestPrio,
+	/* logical delivery broadcast to all procs: */
+	.irq_dest_mode			= 1,
+
+	.target_cpus			= target_cpus_cluster,
+	.disable_esr			= 1,
+	.dest_logical			= 0,
+	.check_apicid_used		= es7000_check_apicid_used,
+	.check_apicid_present		= es7000_check_apicid_present,
+
+	.vector_allocation_domain	= es7000_vector_allocation_domain,
+	.init_apic_ldr			= es7000_init_apic_ldr_cluster,
+
+	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map,
+	.setup_apic_routing		= es7000_setup_apic_routing,
+	.multi_timer_check		= NULL,
+	.apicid_to_node			= es7000_apicid_to_node,
+	.cpu_to_logical_apicid		= es7000_cpu_to_logical_apicid,
+	.cpu_present_to_apicid		= es7000_cpu_present_to_apicid,
+	.apicid_to_cpu_present		= es7000_apicid_to_cpu_present,
+	.setup_portio_remap		= NULL,
+	.check_phys_apicid_present	= es7000_check_phys_apicid_present,
+	.enable_apic_mode		= es7000_enable_apic_mode,
+	.phys_pkg_id			= es7000_phys_pkg_id,
+	.mps_oem_check			= es7000_mps_oem_check_cluster,
+
+	.get_apic_id			= es7000_get_apic_id,
+	.set_apic_id			= NULL,
+	.apic_id_mask			= 0xFF << 24,
+
+	.cpu_mask_to_apicid		= es7000_cpu_mask_to_apicid,
+	.cpu_mask_to_apicid_and		= es7000_cpu_mask_to_apicid_and,
+
+	.send_IPI_mask			= es7000_send_IPI_mask,
+	.send_IPI_mask_allbutself	= NULL,
+	.send_IPI_allbutself		= es7000_send_IPI_allbutself,
+	.send_IPI_all			= es7000_send_IPI_all,
+	.send_IPI_self			= default_send_IPI_self,
+
+	.wakeup_secondary_cpu		= wakeup_secondary_cpu_via_mip,
+
+	.trampoline_phys_low		= 0x467,
+	.trampoline_phys_high		= 0x469,
+
+	.wait_for_init_deassert		= NULL,
+
+	/* Nothing to do for most platforms, since cleared by the INIT cycle: */
+	.smp_callin_clear_local_apic	= NULL,
+	.inquire_remote_apic		= default_inquire_remote_apic,
+
+	.read				= native_apic_mem_read,
+	.write				= native_apic_mem_write,
+	.icr_read			= native_apic_icr_read,
+	.icr_write			= native_apic_icr_write,
+	.wait_icr_idle			= native_apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
+};
 
 struct apic apic_es7000 = {
 
@@ -737,8 +763,6 @@ struct apic apic_es7000 = {
 	.send_IPI_all			= es7000_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.wakeup_cpu			= NULL,
-
 	.trampoline_phys_low		= 0x467,
 	.trampoline_phys_high		= 0x469,
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00e6071cefc..1bb5c6cee3e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -389,6 +389,8 @@ struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
 	unsigned int data;
+	unsigned int unused2[11];
+	unsigned int eoi;
 };
 
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
@@ -397,6 +399,12 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 		+ (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
 }
 
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	writel(vector, &io_apic->eoi);
+}
+
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -546,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
 
 		apic = entry->apic;
 		pin = entry->pin;
-#ifdef CONFIG_INTR_REMAP
 		/*
 		 * With interrupt-remapping, destination information comes
 		 * from interrupt-remapping table entry.
 		 */
 		if (!irq_remapped(irq))
 			io_apic_write(apic, 0x11 + pin*2, dest);
-#else
-		io_apic_write(apic, 0x11 + pin*2, dest);
-#endif
 		reg = io_apic_read(apic, 0x10 + pin*2);
 		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
 		reg |= vector;
@@ -588,10 +592,12 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 	if (assign_irq_vector(irq, cfg, mask))
 		return BAD_APICID;
 
-	cpumask_and(desc->affinity, cfg->domain, mask);
+	/* check that before desc->addinity get updated */
 	set_extra_move_desc(desc, mask);
 
-	return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
+	cpumask_copy(desc->affinity, mask);
+
+	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
 }
 
 static void
@@ -849,9 +855,9 @@ __setup("pirq=", ioapic_pirq_setup);
 static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
 
 /*
- * Saves and masks all the unmasked IO-APIC RTE's
+ * Saves all the IO-APIC RTE's
  */
-int save_mask_IO_APIC_setup(void)
+int save_IO_APIC_setup(void)
 {
 	union IO_APIC_reg_01 reg_01;
 	unsigned long flags;
@@ -876,16 +882,9 @@ int save_mask_IO_APIC_setup(void)
 	}
 
 	for (apic = 0; apic < nr_ioapics; apic++)
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-			struct IO_APIC_route_entry entry;
-
-			entry = early_ioapic_entries[apic][pin] =
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+			early_ioapic_entries[apic][pin] =
 				ioapic_read_entry(apic, pin);
-			if (!entry.mask) {
-				entry.mask = 1;
-				ioapic_write_entry(apic, pin, entry);
-			}
-		}
 
 	return 0;
 
@@ -898,6 +897,25 @@ nomem:
 	return -ENOMEM;
 }
 
+void mask_IO_APIC_setup(void)
+{
+	int apic, pin;
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		if (!early_ioapic_entries[apic])
+			break;
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+			struct IO_APIC_route_entry entry;
+
+			entry = early_ioapic_entries[apic][pin];
+			if (!entry.mask) {
+				entry.mask = 1;
+				ioapic_write_entry(apic, pin, entry);
+			}
+		}
+	}
+}
+
 void restore_IO_APIC_setup(void)
 {
 	int apic, pin;
@@ -1411,9 +1429,7 @@ void __setup_vector_irq(int cpu)
 }
 
 static struct irq_chip ioapic_chip;
-#ifdef CONFIG_INTR_REMAP
 static struct irq_chip ir_ioapic_chip;
-#endif
 
 #define IOAPIC_AUTO     -1
 #define IOAPIC_EDGE     0
@@ -1452,7 +1468,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
 	else
 		desc->status &= ~IRQ_LEVEL;
 
-#ifdef CONFIG_INTR_REMAP
 	if (irq_remapped(irq)) {
 		desc->status |= IRQ_MOVE_PCNTXT;
 		if (trigger)
@@ -1464,7 +1479,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
 						      handle_edge_irq, "edge");
 		return;
 	}
-#endif
+
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1478,14 +1493,13 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
 int setup_ioapic_entry(int apic_id, int irq,
 		       struct IO_APIC_route_entry *entry,
 		       unsigned int destination, int trigger,
-		       int polarity, int vector)
+		       int polarity, int vector, int pin)
 {
 	/*
 	 * add it to the IO-APIC irq-routing table:
 	 */
 	memset(entry,0,sizeof(*entry));
 
-#ifdef CONFIG_INTR_REMAP
 	if (intr_remapping_enabled) {
 		struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
 		struct irte irte;
@@ -1504,7 +1518,14 @@ int setup_ioapic_entry(int apic_id, int irq,
 
 		irte.present = 1;
 		irte.dst_mode = apic->irq_dest_mode;
-		irte.trigger_mode = trigger;
+		/*
+		 * Trigger mode in the IRTE will always be edge, and the
+		 * actual level or edge trigger will be setup in the IO-APIC
+		 * RTE. This will help simplify level triggered irq migration.
+		 * For more details, see the comments above explainig IO-APIC
+		 * irq migration in the presence of interrupt-remapping.
+		 */
+		irte.trigger_mode = 0;
 		irte.dlvry_mode = apic->irq_delivery_mode;
 		irte.vector = vector;
 		irte.dest_id = IRTE_DEST(destination);
@@ -1515,18 +1536,21 @@ int setup_ioapic_entry(int apic_id, int irq,
 		ir_entry->zero = 0;
 		ir_entry->format = 1;
 		ir_entry->index = (index & 0x7fff);
-	} else
-#endif
-	{
+		/*
+		 * IO-APIC RTE will be configured with virtual vector.
+		 * irq handler will do the explicit EOI to the io-apic.
+		 */
+		ir_entry->vector = pin;
+	} else {
 		entry->delivery_mode = apic->irq_delivery_mode;
 		entry->dest_mode = apic->irq_dest_mode;
 		entry->dest = destination;
+		entry->vector = vector;
 	}
 
 	entry->mask = 0;				/* enable IRQ */
 	entry->trigger = trigger;
 	entry->polarity = polarity;
-	entry->vector = vector;
 
 	/* Mask level triggered irqs.
 	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1561,7 +1585,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
 
 
 	if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
-			       dest, trigger, polarity, cfg->vector)) {
+			       dest, trigger, polarity, cfg->vector, pin)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic_id].apicid, pin);
 		__clear_irq_vector(irq, cfg);
@@ -1642,10 +1666,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
 {
 	struct IO_APIC_route_entry entry;
 
-#ifdef CONFIG_INTR_REMAP
 	if (intr_remapping_enabled)
 		return;
-#endif
 
 	memset(&entry, 0, sizeof(entry));
 
@@ -2040,8 +2062,13 @@ void disable_IO_APIC(void)
 	 * If the i8259 is routed through an IOAPIC
 	 * Put that IOAPIC in virtual wire mode
 	 * so legacy interrupts can be delivered.
+	 *
+	 * With interrupt-remapping, for now we will use virtual wire A mode,
+	 * as virtual wire B is little complex (need to configure both
+	 * IOAPIC RTE aswell as interrupt-remapping table entry).
+	 * As this gets called during crash dump, keep this simple for now.
 	 */
-	if (ioapic_i8259.pin != -1) {
+	if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
 		struct IO_APIC_route_entry entry;
 
 		memset(&entry, 0, sizeof(entry));
@@ -2061,7 +2088,10 @@ void disable_IO_APIC(void)
 		ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
 	}
 
-	disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+	/*
+	 * Use virtual wire A mode when interrupt remapping is enabled.
+	 */
+	disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
 }
 
 #ifdef CONFIG_X86_32
@@ -2303,37 +2333,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
 #ifdef CONFIG_SMP
 
 #ifdef CONFIG_INTR_REMAP
-static void ir_irq_migration(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
 
 /*
  * Migrate the IO-APIC irq in the presence of intr-remapping.
  *
- * For edge triggered, irq migration is a simple atomic update(of vector
- * and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we need to modify the io-apic RTE aswell with the update
- * vector information, along with modifying IRTE with vector and destination.
- * So irq migration for level triggered is little  bit more complex compared to
- * edge triggered migration. But the good news is, we use the same algorithm
- * for level triggered migration as we have today, only difference being,
- * we now initiate the irq migration from process context instead of the
- * interrupt context.
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update(of vector and cpu destination) of IRTE and flush the hardware cache.
  *
- * In future, when we do a directed EOI (combined with cpu EOI broadcast
- * suppression) to the IO-APIC, level triggered irq migration will also be
- * as simple as edge triggered migration and we can do the irq migration
- * with a simple atomic update to IO-APIC RTE.
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * Real vector that is used for interrupting cpu will be coming from
+ * the interrupt-remapping table entry.
  */
 static void
 migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	struct irte irte;
-	int modify_ioapic_rte;
 	unsigned int dest;
-	unsigned long flags;
 	unsigned int irq;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2351,13 +2368,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
 
-	modify_ioapic_rte = desc->status & IRQ_LEVEL;
-	if (modify_ioapic_rte) {
-		spin_lock_irqsave(&ioapic_lock, flags);
-		__target_IO_APIC_irq(irq, dest, cfg);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-	}
-
 	irte.vector = cfg->vector;
 	irte.dest_id = IRTE_DEST(dest);
 
@@ -2372,73 +2382,12 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 	cpumask_copy(desc->affinity, mask);
 }
 
-static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
-{
-	int ret = -1;
-	struct irq_cfg *cfg = desc->chip_data;
-
-	mask_IO_APIC_irq_desc(desc);
-
-	if (io_apic_level_ack_pending(cfg)) {
-		/*
-		 * Interrupt in progress. Migrating irq now will change the
-		 * vector information in the IO-APIC RTE and that will confuse
-		 * the EOI broadcast performed by cpu.
-		 * So, delay the irq migration to the next instance.
-		 */
-		schedule_delayed_work(&ir_migration_work, 1);
-		goto unmask;
-	}
-
-	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq_desc(desc, desc->pending_mask);
-
-	ret = 0;
-	desc->status &= ~IRQ_MOVE_PENDING;
-	cpumask_clear(desc->pending_mask);
-
-unmask:
-	unmask_IO_APIC_irq_desc(desc);
-
-	return ret;
-}
-
-static void ir_irq_migration(struct work_struct *work)
-{
-	unsigned int irq;
-	struct irq_desc *desc;
-
-	for_each_irq_desc(irq, desc) {
-		if (desc->status & IRQ_MOVE_PENDING) {
-			unsigned long flags;
-
-			spin_lock_irqsave(&desc->lock, flags);
-			if (!desc->chip->set_affinity ||
-			    !(desc->status & IRQ_MOVE_PENDING)) {
-				desc->status &= ~IRQ_MOVE_PENDING;
-				spin_unlock_irqrestore(&desc->lock, flags);
-				continue;
-			}
-
-			desc->chip->set_affinity(irq, desc->pending_mask);
-			spin_unlock_irqrestore(&desc->lock, flags);
-		}
-	}
-}
-
 /*
  * Migrates the IRQ destination in the process context.
  */
 static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 					    const struct cpumask *mask)
 {
-	if (desc->status & IRQ_LEVEL) {
-		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(desc->pending_mask, mask);
-		migrate_irq_remapped_level_desc(desc);
-		return;
-	}
-
 	migrate_ioapic_irq_desc(desc, mask);
 }
 static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2448,6 +2397,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
 
 	set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
+#else
+static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+						   const struct cpumask *mask)
+{
+}
 #endif
 
 asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2461,6 +2415,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		unsigned int irq;
+		unsigned int irr;
 		struct irq_desc *desc;
 		struct irq_cfg *cfg;
 		irq = __get_cpu_var(vector_irq)[vector];
@@ -2480,6 +2435,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 		if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
 			goto unlock;
 
+		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+		/*
+		 * Check if the vector that needs to be cleanedup is
+		 * registered at the cpu's IRR. If so, then this is not
+		 * the best time to clean it up. Lets clean it up in the
+		 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+		 * to myself.
+		 */
+		if (irr  & (1 << (vector % 32))) {
+			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+			goto unlock;
+		}
 		__get_cpu_var(vector_irq)[vector] = -1;
 		cfg->move_cleanup_count--;
 unlock:
@@ -2529,9 +2496,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
 
 #ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+	int apic, pin;
+	struct irq_pin_list *entry;
+
+	entry = cfg->irq_2_pin;
+	for (;;) {
+
+		if (!entry)
+			break;
+
+		apic = entry->apic;
+		pin = entry->pin;
+		io_apic_eoi(apic, pin);
+		entry = entry->next;
+	}
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	unsigned int irq;
+
+	irq = desc->irq;
+	cfg = desc->chip_data;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__eoi_ioapic_irq(irq, cfg);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
 static void ack_x2apic_level(unsigned int irq)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	ack_x2APIC_irq();
+	eoi_ioapic_irq(desc);
 }
 
 static void ack_x2apic_edge(unsigned int irq)
@@ -2662,20 +2664,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
 	.retrigger	= ioapic_retrigger_irq,
 };
 
-#ifdef CONFIG_INTR_REMAP
 static struct irq_chip ir_ioapic_chip __read_mostly = {
 	.name		= "IR-IO-APIC",
 	.startup	= startup_ioapic_irq,
 	.mask		= mask_IO_APIC_irq,
 	.unmask		= unmask_IO_APIC_irq,
+#ifdef CONFIG_INTR_REMAP
 	.ack		= ack_x2apic_edge,
 	.eoi		= ack_x2apic_level,
 #ifdef CONFIG_SMP
 	.set_affinity	= set_ir_ioapic_affinity_irq,
 #endif
+#endif
 	.retrigger	= ioapic_retrigger_irq,
 };
-#endif
 
 static inline void init_IO_APIC_traps(void)
 {
@@ -2901,10 +2903,8 @@ static inline void __init check_timer(void)
 	 * 8259A.
 	 */
 	if (pin1 == -1) {
-#ifdef CONFIG_INTR_REMAP
 		if (intr_remapping_enabled)
 			panic("BIOS bug: timer not connected to IO-APIC");
-#endif
 		pin1 = pin2;
 		apic1 = apic2;
 		no_pin1 = 1;
@@ -2940,10 +2940,8 @@ static inline void __init check_timer(void)
 				clear_IO_APIC_pin(0, pin1);
 			goto out;
 		}
-#ifdef CONFIG_INTR_REMAP
 		if (intr_remapping_enabled)
 			panic("timer doesn't work through Interrupt-remapped IO-APIC");
-#endif
 		local_irq_disable();
 		clear_IO_APIC_pin(apic1, pin1);
 		if (!no_pin1)
@@ -3237,9 +3235,7 @@ void destroy_irq(unsigned int irq)
 	if (desc)
 		desc->chip_data = cfg;
 
-#ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
-#endif
 	spin_lock_irqsave(&vector_lock, flags);
 	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
@@ -3265,7 +3261,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
 
-#ifdef CONFIG_INTR_REMAP
 	if (irq_remapped(irq)) {
 		struct irte irte;
 		int ir_index;
@@ -3291,10 +3286,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 				  MSI_ADDR_IR_SHV |
 				  MSI_ADDR_IR_INDEX1(ir_index) |
 				  MSI_ADDR_IR_INDEX2(ir_index);
-	} else
-#endif
-	{
-		msg->address_hi = MSI_ADDR_BASE_HI;
+	} else {
+		if (x2apic_enabled())
+			msg->address_hi = MSI_ADDR_BASE_HI |
+					  MSI_ADDR_EXT_DEST_ID(dest);
+		else
+			msg->address_hi = MSI_ADDR_BASE_HI;
+
 		msg->address_lo =
 			MSI_ADDR_BASE_LO |
 			((apic->irq_dest_mode == 0) ?
@@ -3394,15 +3392,16 @@ static struct irq_chip msi_chip = {
 	.retrigger	= ioapic_retrigger_irq,
 };
 
-#ifdef CONFIG_INTR_REMAP
 static struct irq_chip msi_ir_chip = {
 	.name		= "IR-PCI-MSI",
 	.unmask		= unmask_msi_irq,
 	.mask		= mask_msi_irq,
+#ifdef CONFIG_INTR_REMAP
 	.ack		= ack_x2apic_edge,
 #ifdef CONFIG_SMP
 	.set_affinity	= ir_set_msi_irq_affinity,
 #endif
+#endif
 	.retrigger	= ioapic_retrigger_irq,
 };
 
@@ -3432,7 +3431,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 	}
 	return index;
 }
-#endif
 
 static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
@@ -3446,7 +3444,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 	set_irq_msi(irq, msidesc);
 	write_msi_msg(irq, &msg);
 
-#ifdef CONFIG_INTR_REMAP
 	if (irq_remapped(irq)) {
 		struct irq_desc *desc = irq_to_desc(irq);
 		/*
@@ -3455,7 +3452,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 		desc->status |= IRQ_MOVE_PCNTXT;
 		set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
 	} else
-#endif
 		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
 	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3469,11 +3465,12 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	int ret, sub_handle;
 	struct msi_desc *msidesc;
 	unsigned int irq_want;
-
-#ifdef CONFIG_INTR_REMAP
-	struct intel_iommu *iommu = 0;
+	struct intel_iommu *iommu = NULL;
 	int index = 0;
-#endif
+
+	/* x86 doesn't support multiple MSI yet */
+	if (type == PCI_CAP_ID_MSI && nvec > 1)
+		return 1;
 
 	irq_want = nr_irqs_gsi;
 	sub_handle = 0;
@@ -3482,7 +3479,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		if (irq == 0)
 			return -1;
 		irq_want = irq + 1;
-#ifdef CONFIG_INTR_REMAP
 		if (!intr_remapping_enabled)
 			goto no_ir;
 
@@ -3510,7 +3506,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			set_irte_irq(irq, iommu, index, sub_handle);
 		}
 no_ir:
-#endif
 		ret = setup_msi_irq(dev, msidesc, irq);
 		if (ret < 0)
 			goto error;
@@ -3528,7 +3523,7 @@ void arch_teardown_msi_irq(unsigned int irq)
 	destroy_irq(irq);
 }
 
-#ifdef CONFIG_DMAR
+#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
 #ifdef CONFIG_SMP
 static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
@@ -3609,7 +3604,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 #endif /* CONFIG_SMP */
 
-struct irq_chip hpet_msi_type = {
+static struct irq_chip hpet_msi_type = {
 	.name = "HPET_MSI",
 	.unmask = hpet_msi_unmask,
 	.mask = hpet_msi_mask,
@@ -4045,11 +4040,9 @@ void __init setup_ioapic_dest(void)
 			else
 				mask = apic->target_cpus();
 
-#ifdef CONFIG_INTR_REMAP
 			if (intr_remapping_enabled)
 				set_ir_ioapic_affinity_irq_desc(desc, mask);
 			else
-#endif
 				set_ioapic_affinity_irq_desc(desc, mask);
 		}
 
@@ -4142,9 +4135,12 @@ static int __init ioapic_insert_resources(void)
 	struct resource *r = ioapic_resources;
 
 	if (!r) {
-		printk(KERN_ERR
-		       "IO APIC resources could be not be allocated.\n");
-		return -1;
+		if (nr_ioapics > 0) {
+			printk(KERN_ERR
+				"IO APIC resources couldn't be allocated.\n");
+			return -1;
+		}
+		return 0;
 	}
 
 	for (i = 0; i < nr_ioapics; i++) {
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index bdfad80c3cf..d6bd6240715 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,7 @@
 int unknown_nmi_panic;
 int nmi_watchdog_enabled;
 
-static cpumask_t backtrace_mask = CPU_MASK_NONE;
+static cpumask_var_t backtrace_mask;
 
 /* nmi_active:
  * >0: the lapic NMI watchdog is active, but can be disabled
@@ -138,6 +138,7 @@ int __init check_nmi_watchdog(void)
 	if (!prev_nmi_count)
 		goto error;
 
+	alloc_cpumask_var(&backtrace_mask, GFP_KERNEL);
 	printk(KERN_INFO "Testing NMI watchdog ... ");
 
 #ifdef CONFIG_SMP
@@ -413,14 +414,14 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		touched = 1;
 	}
 
-	if (cpu_isset(cpu, backtrace_mask)) {
+	if (cpumask_test_cpu(cpu, backtrace_mask)) {
 		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
 
 		spin_lock(&lock);
 		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
 		dump_stack();
 		spin_unlock(&lock);
-		cpu_clear(cpu, backtrace_mask);
+		cpumask_clear_cpu(cpu, backtrace_mask);
 	}
 
 	/* Could check oops_in_progress here too, but it's safer not to */
@@ -554,10 +555,10 @@ void __trigger_all_cpu_backtrace(void)
 {
 	int i;
 
-	backtrace_mask = cpu_online_map;
+	cpumask_copy(backtrace_mask, cpu_online_mask);
 	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
 	for (i = 0; i < 10 * 1000; i++) {
-		if (cpus_empty(backtrace_mask))
+		if (cpumask_empty(backtrace_mask))
 			break;
 		mdelay(1);
 	}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index d9d6d61eed8..533e59c6fc8 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -69,7 +69,7 @@ struct mpc_trans {
 /* x86_quirks member */
 static int				mpc_record;
 
-static __cpuinitdata struct mpc_trans	*translation_table[MAX_MPC_ENTRY];
+static struct mpc_trans			*translation_table[MAX_MPC_ENTRY];
 
 int					mp_bus_id_to_node[MAX_MP_BUSSES];
 int					mp_bus_id_to_local[MAX_MP_BUSSES];
@@ -256,13 +256,6 @@ static int __init numaq_setup_ioapic_ids(void)
 	return 1;
 }
 
-static int __init numaq_update_apic(void)
-{
-	apic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
-
-	return 0;
-}
-
 static struct x86_quirks numaq_x86_quirks __initdata = {
 	.arch_pre_time_init		= numaq_pre_time_init,
 	.arch_time_init			= NULL,
@@ -278,7 +271,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
 	.mpc_oem_pci_bus		= mpc_oem_pci_bus,
 	.smp_read_mpc_oem		= smp_read_mpc_oem,
 	.setup_ioapic_ids		= numaq_setup_ioapic_ids,
-	.update_apic			= numaq_update_apic,
 };
 
 static __init void early_check_numaq(void)
@@ -342,9 +334,9 @@ static inline void numaq_smp_callin_clear_local_apic(void)
 	clear_local_APIC();
 }
 
-static inline const cpumask_t *numaq_target_cpus(void)
+static inline const struct cpumask *numaq_target_cpus(void)
 {
-	return &CPU_MASK_ALL;
+	return cpu_all_mask;
 }
 
 static inline unsigned long
@@ -435,7 +427,7 @@ static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
  * We use physical apicids here, not logical, so just return the default
  * physical broadcast to stop people from breaking us
  */
-static inline unsigned int numaq_cpu_mask_to_apicid(const cpumask_t *cpumask)
+static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
 	return 0x0F;
 }
@@ -470,7 +462,7 @@ static int probe_numaq(void)
 	return found_numaq;
 }
 
-static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask)
+static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
 	/* Careful. Some cpus do not strictly honor the set of cpus
 	 * specified in the interrupt destination when using lowest
@@ -480,7 +472,8 @@ static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask)
 	 * deliver interrupts to the wrong hyperthread when only one
 	 * hyperthread was specified in the interrupt desitination.
 	 */
-	*retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
+	cpumask_clear(retmask);
+	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
 }
 
 static void numaq_setup_portio_remap(void)
@@ -546,7 +539,7 @@ struct apic apic_numaq = {
 	.send_IPI_all			= numaq_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.wakeup_cpu			= NULL,
+	.wakeup_secondary_cpu		= wakeup_secondary_cpu_via_nmi,
 	.trampoline_phys_low		= NUMAQ_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= NUMAQ_TRAMPOLINE_PHYS_HIGH,
 
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 3a730fa574b..01eda2ac65e 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -83,7 +83,8 @@ static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
 	 * deliver interrupts to the wrong hyperthread when only one
 	 * hyperthread was specified in the interrupt desitination.
 	 */
-	*retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
+	cpumask_clear(retmask);
+	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
 }
 
 /* should be called last. */
@@ -138,7 +139,6 @@ struct apic apic_default = {
 	.send_IPI_all			= default_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.wakeup_cpu			= NULL,
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 
@@ -159,6 +159,7 @@ extern struct apic apic_numaq;
 extern struct apic apic_summit;
 extern struct apic apic_bigsmp;
 extern struct apic apic_es7000;
+extern struct apic apic_es7000_cluster;
 extern struct apic apic_default;
 
 struct apic *apic = &apic_default;
@@ -176,6 +177,7 @@ static struct apic *apic_probe[] __initdata = {
 #endif
 #ifdef CONFIG_X86_ES7000
 	&apic_es7000,
+	&apic_es7000_cluster,
 #endif
 	&apic_default,	/* must be last */
 	NULL,
@@ -197,9 +199,6 @@ static int __init parse_apic(char *arg)
 		}
 	}
 
-	if (x86_quirks->update_apic)
-		x86_quirks->update_apic();
-
 	/* Parsed again by __setup for debug/verbose */
 	return 0;
 }
@@ -218,8 +217,6 @@ void __init generic_bigsmp_probe(void)
 	if (!cmdline_apic && apic == &apic_default) {
 		if (apic_bigsmp.probe()) {
 			apic = &apic_bigsmp;
-			if (x86_quirks->update_apic)
-				x86_quirks->update_apic();
 			printk(KERN_INFO "Overriding APIC driver with %s\n",
 			       apic->name);
 		}
@@ -240,9 +237,6 @@ void __init generic_apic_probe(void)
 		/* Not visible without early console */
 		if (!apic_probe[i])
 			panic("Didn't find an APIC driver");
-
-		if (x86_quirks->update_apic)
-			x86_quirks->update_apic();
 	}
 	printk(KERN_INFO "Using APIC driver %s\n", apic->name);
 }
@@ -262,8 +256,6 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
 
 		if (!cmdline_apic) {
 			apic = apic_probe[i];
-			if (x86_quirks->update_apic)
-				x86_quirks->update_apic();
 			printk(KERN_INFO "Switched to APIC driver `%s'.\n",
 			       apic->name);
 		}
@@ -284,8 +276,6 @@ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 
 		if (!cmdline_apic) {
 			apic = apic_probe[i];
-			if (x86_quirks->update_apic)
-				x86_quirks->update_apic();
 			printk(KERN_INFO "Switched to APIC driver `%s'.\n",
 			       apic->name);
 		}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index e7c163661c7..1783652bb0e 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -69,8 +69,12 @@ void __init default_setup_apic_routing(void)
 		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
 	}
 
-	if (x86_quirks->update_apic)
-		x86_quirks->update_apic();
+	/*
+	 * Now that apic routing model is selected, configure the
+	 * fault handling for intr remapping.
+	 */
+	if (intr_remapping_enabled)
+		enable_drhd_fault_handling();
 }
 
 /* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 32838b57a94..9cfe1f415d8 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -53,23 +53,19 @@ static unsigned summit_get_apic_id(unsigned long x)
 	return (x >> 24) & 0xFF;
 }
 
-static inline void summit_send_IPI_mask(const cpumask_t *mask, int vector)
+static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector)
 {
 	default_send_IPI_mask_sequence_logical(mask, vector);
 }
 
 static void summit_send_IPI_allbutself(int vector)
 {
-	cpumask_t mask = cpu_online_map;
-	cpu_clear(smp_processor_id(), mask);
-
-	if (!cpus_empty(mask))
-		summit_send_IPI_mask(&mask, vector);
+	default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
 }
 
 static void summit_send_IPI_all(int vector)
 {
-	summit_send_IPI_mask(&cpu_online_map, vector);
+	summit_send_IPI_mask(cpu_online_mask, vector);
 }
 
 #include <asm/tsc.h>
@@ -77,9 +73,9 @@ static void summit_send_IPI_all(int vector)
 extern int use_cyclone;
 
 #ifdef CONFIG_X86_SUMMIT_NUMA
-extern void setup_summit(void);
+static void setup_summit(void);
 #else
-#define setup_summit()	{}
+static inline void setup_summit(void) {}
 #endif
 
 static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
@@ -186,13 +182,13 @@ static inline int is_WPEG(struct rio_detail *rio){
 
 #define SUMMIT_APIC_DFR_VALUE	(APIC_DFR_CLUSTER)
 
-static const cpumask_t *summit_target_cpus(void)
+static const struct cpumask *summit_target_cpus(void)
 {
 	/* CPU_MASK_ALL (0xff) has undefined behaviour with
 	 * dest_LowestPrio mode logical clustered apic interrupt routing
 	 * Just start on cpu 0.  IRQ balancing will spread load
 	 */
-	return &cpumask_of_cpu(0);
+	return cpumask_of(0);
 }
 
 static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid)
@@ -289,35 +285,23 @@ static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid)
 	return 1;
 }
 
-static unsigned int summit_cpu_mask_to_apicid(const cpumask_t *cpumask)
+static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-	int cpus_found = 0;
-	int num_bits_set;
-	int apicid;
-	int cpu;
+	unsigned int round = 0;
+	int cpu, apicid = 0;
 
-	num_bits_set = cpus_weight(*cpumask);
-	if (num_bits_set >= nr_cpu_ids)
-		return BAD_APICID;
 	/*
 	 * The cpus in the mask must all be on the apic cluster.
 	 */
-	cpu = first_cpu(*cpumask);
-	apicid = summit_cpu_to_logical_apicid(cpu);
-
-	while (cpus_found < num_bits_set) {
-		if (cpu_isset(cpu, *cpumask)) {
-			int new_apicid = summit_cpu_to_logical_apicid(cpu);
+	for_each_cpu(cpu, cpumask) {
+		int new_apicid = summit_cpu_to_logical_apicid(cpu);
 
-			if (APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
-				printk("%s: Not a valid mask!\n", __func__);
-
-				return BAD_APICID;
-			}
-			apicid = apicid | new_apicid;
-			cpus_found++;
+		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
+			printk("%s: Not a valid mask!\n", __func__);
+			return BAD_APICID;
 		}
-		cpu++;
+		apicid |= new_apicid;
+		round++;
 	}
 	return apicid;
 }
@@ -358,7 +342,7 @@ static int probe_summit(void)
 	return 0;
 }
 
-static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask)
+static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
 	/* Careful. Some cpus do not strictly honor the set of cpus
 	 * specified in the interrupt destination when using lowest
@@ -368,19 +352,20 @@ static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask)
 	 * deliver interrupts to the wrong hyperthread when only one
 	 * hyperthread was specified in the interrupt desitination.
 	 */
-	*retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
+	cpumask_clear(retmask);
+	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
 }
 
 #ifdef CONFIG_X86_SUMMIT_NUMA
-static struct rio_table_hdr *rio_table_hdr __initdata;
-static struct scal_detail   *scal_devs[MAX_NUMNODES] __initdata;
-static struct rio_detail    *rio_devs[MAX_NUMNODES*4] __initdata;
+static struct rio_table_hdr *rio_table_hdr;
+static struct scal_detail   *scal_devs[MAX_NUMNODES];
+static struct rio_detail    *rio_devs[MAX_NUMNODES*4];
 
 #ifndef CONFIG_X86_NUMAQ
-static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
+static int mp_bus_id_to_node[MAX_MP_BUSSES];
 #endif
 
-static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
+static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
 {
 	int twister = 0, node = 0;
 	int i, bus, num_buses;
@@ -442,7 +427,7 @@ static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
 	return bus;
 }
 
-static int __init build_detail_arrays(void)
+static int build_detail_arrays(void)
 {
 	unsigned long ptr;
 	int i, scal_detail_size, rio_detail_size;
@@ -476,7 +461,7 @@ static int __init build_detail_arrays(void)
 	return 1;
 }
 
-void __init setup_summit(void)
+void setup_summit(void)
 {
 	unsigned long		ptr;
 	unsigned short		offset;
@@ -574,7 +559,6 @@ struct apic apic_summit = {
 	.send_IPI_all			= summit_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.wakeup_cpu			= NULL,
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 354b9c45601..4a903e2f0d1 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -57,6 +57,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_cpu(query_cpu, mask) {
 		__x2apic_send_IPI_dest(
@@ -73,6 +75,8 @@ static void
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_cpu(query_cpu, mask) {
 		if (query_cpu == this_cpu)
@@ -90,6 +94,8 @@ static void x2apic_send_IPI_allbutself(int vector)
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_online_cpu(query_cpu) {
 		if (query_cpu == this_cpu)
@@ -224,7 +230,6 @@ struct apic apic_x2apic_cluster = {
 	.send_IPI_all			= x2apic_send_IPI_all,
 	.send_IPI_self			= x2apic_send_IPI_self,
 
-	.wakeup_cpu			= NULL,
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 5bcb174409b..a284359627e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -58,6 +58,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_cpu(query_cpu, mask) {
 		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
@@ -73,6 +75,8 @@ static void
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_cpu(query_cpu, mask) {
 		if (query_cpu != this_cpu)
@@ -89,6 +93,8 @@ static void x2apic_send_IPI_allbutself(int vector)
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_online_cpu(query_cpu) {
 		if (query_cpu == this_cpu)
@@ -213,7 +219,6 @@ struct apic apic_x2apic_phys = {
 	.send_IPI_all			= x2apic_send_IPI_all,
 	.send_IPI_self			= x2apic_send_IPI_self,
 
-	.wakeup_cpu			= NULL,
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= NULL,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 20b4ad07c3a..1248318436e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -7,28 +7,28 @@
  *
  * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
  */
-
-#include <linux/kernel.h>
-#include <linux/threads.h>
-#include <linux/cpu.h>
 #include <linux/cpumask.h>
+#include <linux/hardirq.h>
+#include <linux/proc_fs.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/string.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/hardirq.h>
 #include <linux/timer.h>
-#include <linux/proc_fs.h>
-#include <asm/current.h>
-#include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
-#include <asm/pgtable.h>
-#include <asm/uv/uv.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
+#include <asm/current.h>
+#include <asm/pgtable.h>
 #include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/smp.h>
 
 DEFINE_PER_CPU(int, x2apic_extra_bits);
 
@@ -91,40 +91,39 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
 	cpumask_set_cpu(cpu, retmask);
 }
 
-int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
+static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 {
+#ifdef CONFIG_SMP
 	unsigned long val;
 	int pnode;
 
 	pnode = uv_apicid_to_pnode(phys_apicid);
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
-	    (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+	    ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
 	    APIC_DM_INIT;
 	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 	mdelay(10);
 
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
-	    (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+	    ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
 	    APIC_DM_STARTUP;
 	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+
+	atomic_set(&init_deasserted, 1);
+#endif
 	return 0;
 }
 
 static void uv_send_IPI_one(int cpu, int vector)
 {
-	unsigned long val, apicid;
+	unsigned long apicid;
 	int pnode;
 
 	apicid = per_cpu(x86_cpu_to_apicid, cpu);
 	pnode = uv_apicid_to_pnode(apicid);
-
-	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
-	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
-	      (vector << UVH_IPI_INT_VECTOR_SHFT);
-
-	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+	uv_hub_send_ipi(pnode, apicid, vector);
 }
 
 static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
@@ -285,7 +284,7 @@ struct apic apic_x2apic_uv_x = {
 	.send_IPI_all			= uv_send_IPI_all,
 	.send_IPI_self			= uv_send_IPI_self,
 
-	.wakeup_cpu			= NULL,
+	.wakeup_secondary_cpu		= uv_wakeup_secondary,
 	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
 	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= NULL,
@@ -365,7 +364,7 @@ static __init void map_high(char *id, unsigned long base, int shift,
 	paddr = base << shift;
 	bytes = (1UL << shift) * (max_pnode + 1);
 	printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
-	       					paddr + bytes);
+						paddr + bytes);
 	if (map_type == map_uc)
 		init_extra_mapping_uc(paddr, bytes);
 	else
@@ -528,7 +527,7 @@ late_initcall(uv_init_heartbeat);
 
 /*
  * Called on each cpu to initialize the per_cpu UV data area.
- * 	ZZZ hotplug not supported yet
+ * FIXME: hotplug not supported yet
  */
 void __cpuinit uv_cpu_init(void)
 {
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 10033fe718e..49e0939bac4 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -466,7 +466,7 @@ static const lookup_t error_table[] = {
  *	@err: APM BIOS return code
  *
  *	Write a meaningful log entry to the kernel log in the event of
- *	an APM error.
+ *	an APM error.  Note that this also handles (negative) kernel errors.
  */
 
 static void apm_error(char *str, int err)
@@ -478,43 +478,14 @@ static void apm_error(char *str, int err)
 			break;
 	if (i < ERROR_COUNT)
 		printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
+	else if (err < 0)
+		printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err);
 	else
 		printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
 		       str, err);
 }
 
 /*
- * Lock APM functionality to physical CPU 0
- */
-
-#ifdef CONFIG_SMP
-
-static cpumask_t apm_save_cpus(void)
-{
-	cpumask_t x = current->cpus_allowed;
-	/* Some bioses don't like being called from CPU != 0 */
-	set_cpus_allowed(current, cpumask_of_cpu(0));
-	BUG_ON(smp_processor_id() != 0);
-	return x;
-}
-
-static inline void apm_restore_cpus(cpumask_t mask)
-{
-	set_cpus_allowed(current, mask);
-}
-
-#else
-
-/*
- *	No CPU lockdown needed on a uniprocessor
- */
-
-#define apm_save_cpus()		(current->cpus_allowed)
-#define apm_restore_cpus(x)	(void)(x)
-
-#endif
-
-/*
  * These are the actual BIOS calls.  Depending on APM_ZERO_SEGS and
  * apm_info.allow_ints, we are being really paranoid here!  Not only
  * are interrupts disabled, but all the segment registers (except SS)
@@ -568,16 +539,23 @@ static inline void apm_irq_restore(unsigned long flags)
 #	define APM_DO_RESTORE_SEGS
 #endif
 
+struct apm_bios_call {
+	u32 func;
+	/* In and out */
+	u32 ebx;
+	u32 ecx;
+	/* Out only */
+	u32 eax;
+	u32 edx;
+	u32 esi;
+
+	/* Error: -ENOMEM, or bits 8-15 of eax */
+	int err;
+};
+
 /**
- *	apm_bios_call	-	Make an APM BIOS 32bit call
- *	@func: APM function to execute
- *	@ebx_in: EBX register for call entry
- *	@ecx_in: ECX register for call entry
- *	@eax: EAX register return
- *	@ebx: EBX register return
- *	@ecx: ECX register return
- *	@edx: EDX register return
- *	@esi: ESI register return
+ *	__apm_bios_call - Make an APM BIOS 32bit call
+ *	@_call: pointer to struct apm_bios_call.
  *
  *	Make an APM call using the 32bit protected mode interface. The
  *	caller is responsible for knowing if APM BIOS is configured and
@@ -586,80 +564,142 @@ static inline void apm_irq_restore(unsigned long flags)
  *	flag is loaded into AL.  If there is an error, then the error
  *	code is returned in AH (bits 8-15 of eax) and this function
  *	returns non-zero.
+ *
+ *	Note: this makes the call on the current CPU.
  */
-
-static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
-	u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
+static long __apm_bios_call(void *_call)
 {
 	APM_DECL_SEGS
 	unsigned long		flags;
-	cpumask_t		cpus;
 	int			cpu;
 	struct desc_struct	save_desc_40;
 	struct desc_struct	*gdt;
-
-	cpus = apm_save_cpus();
+	struct apm_bios_call	*call = _call;
 
 	cpu = get_cpu();
+	BUG_ON(cpu != 0);
 	gdt = get_cpu_gdt_table(cpu);
 	save_desc_40 = gdt[0x40 / 8];
 	gdt[0x40 / 8] = bad_bios_desc;
 
 	apm_irq_save(flags);
 	APM_DO_SAVE_SEGS;
-	apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
+	apm_bios_call_asm(call->func, call->ebx, call->ecx,
+			  &call->eax, &call->ebx, &call->ecx, &call->edx,
+			  &call->esi);
 	APM_DO_RESTORE_SEGS;
 	apm_irq_restore(flags);
 	gdt[0x40 / 8] = save_desc_40;
 	put_cpu();
-	apm_restore_cpus(cpus);
 
-	return *eax & 0xff;
+	return call->eax & 0xff;
+}
+
+/* Run __apm_bios_call or __apm_bios_call_simple on CPU 0 */
+static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call)
+{
+	int ret;
+
+	/* Don't bother with work_on_cpu in the common case, so we don't
+	 * have to worry about OOM or overhead. */
+	if (get_cpu() == 0) {
+		ret = fn(call);
+		put_cpu();
+	} else {
+		put_cpu();
+		ret = work_on_cpu(0, fn, call);
+	}
+
+	/* work_on_cpu can fail with -ENOMEM */
+	if (ret < 0)
+		call->err = ret;
+	else
+		call->err = (call->eax >> 8) & 0xff;
+
+	return ret;
 }
 
 /**
- *	apm_bios_call_simple	-	make a simple APM BIOS 32bit call
- *	@func: APM function to invoke
- *	@ebx_in: EBX register value for BIOS call
- *	@ecx_in: ECX register value for BIOS call
- *	@eax: EAX register on return from the BIOS call
+ *	apm_bios_call	-	Make an APM BIOS 32bit call (on CPU 0)
+ *	@call: the apm_bios_call registers.
+ *
+ *	If there is an error, it is returned in @call.err.
+ */
+static int apm_bios_call(struct apm_bios_call *call)
+{
+	return on_cpu0(__apm_bios_call, call);
+}
+
+/**
+ *	__apm_bios_call_simple - Make an APM BIOS 32bit call (on CPU 0)
+ *	@_call: pointer to struct apm_bios_call.
  *
  *	Make a BIOS call that returns one value only, or just status.
  *	If there is an error, then the error code is returned in AH
- *	(bits 8-15 of eax) and this function returns non-zero. This is
- *	used for simpler BIOS operations. This call may hold interrupts
- *	off for a long time on some laptops.
+ *	(bits 8-15 of eax) and this function returns non-zero (it can
+ *	also return -ENOMEM). This is used for simpler BIOS operations.
+ *	This call may hold interrupts off for a long time on some laptops.
+ *
+ *	Note: this makes the call on the current CPU.
  */
-
-static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
+static long __apm_bios_call_simple(void *_call)
 {
 	u8			error;
 	APM_DECL_SEGS
 	unsigned long		flags;
-	cpumask_t		cpus;
 	int			cpu;
 	struct desc_struct	save_desc_40;
 	struct desc_struct	*gdt;
-
-	cpus = apm_save_cpus();
+	struct apm_bios_call	*call = _call;
 
 	cpu = get_cpu();
+	BUG_ON(cpu != 0);
 	gdt = get_cpu_gdt_table(cpu);
 	save_desc_40 = gdt[0x40 / 8];
 	gdt[0x40 / 8] = bad_bios_desc;
 
 	apm_irq_save(flags);
 	APM_DO_SAVE_SEGS;
-	error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
+	error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
+					 &call->eax);
 	APM_DO_RESTORE_SEGS;
 	apm_irq_restore(flags);
 	gdt[0x40 / 8] = save_desc_40;
 	put_cpu();
-	apm_restore_cpus(cpus);
 	return error;
 }
 
 /**
+ *	apm_bios_call_simple	-	make a simple APM BIOS 32bit call
+ *	@func: APM function to invoke
+ *	@ebx_in: EBX register value for BIOS call
+ *	@ecx_in: ECX register value for BIOS call
+ *	@eax: EAX register on return from the BIOS call
+ *	@err: bits
+ *
+ *	Make a BIOS call that returns one value only, or just status.
+ *	If there is an error, then the error code is returned in @err
+ *	and this function returns non-zero. This is used for simpler
+ *	BIOS operations.  This call may hold interrupts off for a long
+ *	time on some laptops.
+ */
+static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax,
+				int *err)
+{
+	struct apm_bios_call call;
+	int ret;
+
+	call.func = func;
+	call.ebx = ebx_in;
+	call.ecx = ecx_in;
+
+	ret = on_cpu0(__apm_bios_call_simple, &call);
+	*eax = call.eax;
+	*err = call.err;
+	return ret;
+}
+
+/**
  *	apm_driver_version	-	APM driver version
  *	@val:	loaded with the APM version on return
  *
@@ -678,9 +718,10 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
 static int apm_driver_version(u_short *val)
 {
 	u32 eax;
+	int err;
 
-	if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
-		return (eax >> 8) & 0xff;
+	if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax, &err))
+		return err;
 	*val = eax;
 	return APM_SUCCESS;
 }
@@ -701,22 +742,21 @@ static int apm_driver_version(u_short *val)
  *	that APM 1.2 is in use. If no messges are pending the value 0x80
  *	is returned (No power management events pending).
  */
-
 static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
 {
-	u32 eax;
-	u32 ebx;
-	u32 ecx;
-	u32 dummy;
+	struct apm_bios_call call;
 
-	if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
-			  &dummy, &dummy))
-		return (eax >> 8) & 0xff;
-	*event = ebx;
+	call.func = APM_FUNC_GET_EVENT;
+	call.ebx = call.ecx = 0;
+
+	if (apm_bios_call(&call))
+		return call.err;
+
+	*event = call.ebx;
 	if (apm_info.connection_version < 0x0102)
 		*info = ~0; /* indicate info not valid */
 	else
-		*info = ecx;
+		*info = call.ecx;
 	return APM_SUCCESS;
 }
 
@@ -737,9 +777,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
 static int set_power_state(u_short what, u_short state)
 {
 	u32 eax;
+	int err;
 
-	if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
-		return (eax >> 8) & 0xff;
+	if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax, &err))
+		return err;
 	return APM_SUCCESS;
 }
 
@@ -770,6 +811,7 @@ static int apm_do_idle(void)
 	u8 ret = 0;
 	int idled = 0;
 	int polling;
+	int err;
 
 	polling = !!(current_thread_info()->status & TS_POLLING);
 	if (polling) {
@@ -782,7 +824,7 @@ static int apm_do_idle(void)
 	}
 	if (!need_resched()) {
 		idled = 1;
-		ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
+		ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
 	}
 	if (polling)
 		current_thread_info()->status |= TS_POLLING;
@@ -797,8 +839,7 @@ static int apm_do_idle(void)
 		 * Only report the failure the first 5 times.
 		 */
 		if (++t < 5) {
-			printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
-			       (eax >> 8) & 0xff);
+			printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err);
 			t = jiffies;
 		}
 		return -1;
@@ -816,9 +857,10 @@ static int apm_do_idle(void)
 static void apm_do_busy(void)
 {
 	u32 dummy;
+	int err;
 
 	if (clock_slowed || ALWAYS_CALL_BUSY) {
-		(void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
+		(void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy, &err);
 		clock_slowed = 0;
 	}
 }
@@ -937,7 +979,7 @@ static void apm_power_off(void)
 
 	/* Some bioses don't like being called from CPU != 0 */
 	if (apm_info.realmode_power_off) {
-		(void)apm_save_cpus();
+		set_cpus_allowed_ptr(current, cpumask_of(0));
 		machine_real_restart(po_bios_call, sizeof(po_bios_call));
 	} else {
 		(void)set_system_power_state(APM_STATE_OFF);
@@ -956,12 +998,13 @@ static void apm_power_off(void)
 static int apm_enable_power_management(int enable)
 {
 	u32 eax;
+	int err;
 
 	if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
 		return APM_NOT_ENGAGED;
 	if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
-				 enable, &eax))
-		return (eax >> 8) & 0xff;
+				 enable, &eax, &err))
+		return err;
 	if (enable)
 		apm_info.bios.flags &= ~APM_BIOS_DISABLED;
 	else
@@ -986,24 +1029,23 @@ static int apm_enable_power_management(int enable)
 
 static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
 {
-	u32 eax;
-	u32 ebx;
-	u32 ecx;
-	u32 edx;
-	u32 dummy;
+	struct apm_bios_call call;
+
+	call.func = APM_FUNC_GET_STATUS;
+	call.ebx = APM_DEVICE_ALL;
+	call.ecx = 0;
 
 	if (apm_info.get_power_status_broken)
 		return APM_32_UNSUPPORTED;
-	if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
-			  &eax, &ebx, &ecx, &edx, &dummy))
-		return (eax >> 8) & 0xff;
-	*status = ebx;
-	*bat = ecx;
+	if (apm_bios_call(&call))
+		return call.err;
+	*status = call.ebx;
+	*bat = call.ecx;
 	if (apm_info.get_power_status_swabinminutes) {
-		*life = swab16((u16)edx);
+		*life = swab16((u16)call.edx);
 		*life |= 0x8000;
 	} else
-		*life = edx;
+		*life = call.edx;
 	return APM_SUCCESS;
 }
 
@@ -1048,12 +1090,14 @@ static int apm_get_battery_status(u_short which, u_short *status,
 static int apm_engage_power_management(u_short device, int enable)
 {
 	u32 eax;
+	int err;
 
 	if ((enable == 0) && (device == APM_DEVICE_ALL)
 	    && (apm_info.bios.flags & APM_BIOS_DISABLED))
 		return APM_DISABLED;
-	if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax))
-		return (eax >> 8) & 0xff;
+	if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable,
+				 &eax, &err))
+		return err;
 	if (device == APM_DEVICE_ALL) {
 		if (enable)
 			apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
@@ -1190,8 +1234,10 @@ static int suspend(int vetoable)
 	struct apm_user	*as;
 
 	device_suspend(PMSG_SUSPEND);
-	local_irq_disable();
+
 	device_power_down(PMSG_SUSPEND);
+
+	local_irq_disable();
 	sysdev_suspend(PMSG_SUSPEND);
 
 	local_irq_enable();
@@ -1209,9 +1255,12 @@ static int suspend(int vetoable)
 	if (err != APM_SUCCESS)
 		apm_error("suspend", err);
 	err = (err == APM_SUCCESS) ? 0 : -EIO;
+
 	sysdev_resume();
-	device_power_up(PMSG_RESUME);
 	local_irq_enable();
+
+	device_power_up(PMSG_RESUME);
+
 	device_resume(PMSG_RESUME);
 	queue_event(APM_NORMAL_RESUME, NULL);
 	spin_lock(&user_list_lock);
@@ -1228,8 +1277,9 @@ static void standby(void)
 {
 	int err;
 
-	local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
+
+	local_irq_disable();
 	sysdev_suspend(PMSG_SUSPEND);
 	local_irq_enable();
 
@@ -1239,8 +1289,9 @@ static void standby(void)
 
 	local_irq_disable();
 	sysdev_resume();
-	device_power_up(PMSG_RESUME);
 	local_irq_enable();
+
+	device_power_up(PMSG_RESUME);
 }
 
 static apm_event_t get_event(void)
@@ -1682,16 +1733,14 @@ static int apm(void *unused)
 	char 		*power_stat;
 	char 		*bat_stat;
 
-#ifdef CONFIG_SMP
 	/* 2002/08/01 - WT
 	 * This is to avoid random crashes at boot time during initialization
 	 * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
 	 * Some bioses don't like being called from CPU != 0.
 	 * Method suggested by Ingo Molnar.
 	 */
-	set_cpus_allowed(current, cpumask_of_cpu(0));
+	set_cpus_allowed_ptr(current, cpumask_of(0));
 	BUG_ON(smp_processor_id() != 0);
-#endif
 
 	if (apm_info.connection_version == 0) {
 		apm_info.connection_version = apm_info.bios.version;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index fbf2f33e308..5a6aa1c1162 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -18,6 +18,7 @@
 #include <asm/thread_info.h>
 #include <asm/bootparam.h>
 #include <asm/elf.h>
+#include <asm/suspend.h>
 
 #include <xen/interface/xen.h>
 
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 8793ab33e2c..e72f062fb4b 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -16,6 +16,7 @@
 #include <asm/thread_info.h>
 #include <asm/ia32.h>
 #include <asm/bootparam.h>
+#include <asm/suspend.h>
 
 #include <xen/interface/xen.h>
 
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 2ac0ab71412..fc999e6fc46 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -83,15 +83,15 @@ void __init setup_bios_corruption_check(void)
 		u64 size;
 		addr = find_e820_area_size(addr, &size, PAGE_SIZE);
 
-		if (addr == 0)
+		if (!(addr + 1))
+			break;
+
+		if (addr >= corruption_check_size)
 			break;
 
 		if ((addr + size) > corruption_check_size)
 			size = corruption_check_size - addr;
 
-		if (size == 0)
-			break;
-
 		e820_update_range(addr, size, E820_RAM, E820_RESERVED);
 		scan_areas[num_scan_areas].addr = addr;
 		scan_areas[num_scan_areas].size = size;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c3813306e0b..3efcb2b96a1 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,11 +14,12 @@ obj-y			+= vmware.o hypervisor.o
 obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
 
+obj-$(CONFIG_X86_CPU_DEBUG)		+= cpu_debug.o
+
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
 obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
 obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
-obj-$(CONFIG_CPU_SUP_CENTAUR_32)	+= centaur.o
-obj-$(CONFIG_CPU_SUP_CENTAUR_64)	+= centaur_64.o
+obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 6882a735d9c..8220ae69849 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	u32 regs[4];
 	const struct cpuid_bit *cb;
 
-	static const struct cpuid_bit cpuid_bits[] = {
+	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
 		{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
 		{ 0, 0, 0, 0 }
 	};
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index edcde52bd17..fd69c514ca2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
 	}
 }
 
+static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* calling is from identify_secondary_cpu() ? */
+	if (c->cpu_index == boot_cpu_id)
+		return;
+
+	/*
+	 * Certain Athlons might work (for various values of 'work') in SMP
+	 * but they are not certified as MP capable.
+	 */
+	/* Athlon 660/661 is valid. */
+	if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+	    (c->x86_mask == 1)))
+		goto valid_k7;
+
+	/* Duron 670 is valid */
+	if ((c->x86_model == 7) && (c->x86_mask == 0))
+		goto valid_k7;
+
+	/*
+	 * Athlon 662, Duron 671, and Athlon >model 7 have capability
+	 * bit. It's worth noting that the A5 stepping (662) of some
+	 * Athlon XP's have the MP bit set.
+	 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+	 * more.
+	 */
+	if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+	    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+	     (c->x86_model > 7))
+		if (cpu_has_mp)
+			goto valid_k7;
+
+	/* If we get here, not a certified SMP capable AMD system. */
+
+	/*
+	 * Don't taint if we are running SMP kernel on a single non-MP
+	 * approved Athlon
+	 */
+	WARN_ONCE(1, "WARNING: This combination of AMD"
+		"processors is not suitable for SMP.\n");
+	if (!test_taint(TAINT_UNSAFE_SMP))
+		add_taint(TAINT_UNSAFE_SMP);
+
+valid_k7:
+	;
+#endif
+}
+
 static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 	}
 
 	set_cpu_cap(c, X86_FEATURE_K7);
+
+	amd_k7_smp_check(c);
 }
 #endif
 
@@ -454,7 +506,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
 }
 #endif
 
-static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
 	.c_vendor	= "AMD",
 	.c_ident	= { "AuthenticAMD" },
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 89bfdd9cacc..c95e831bb09 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,11 +1,11 @@
+#include <linux/bitops.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <linux/bitops.h>
 
 #include <asm/processor.h>
-#include <asm/msr.h>
 #include <asm/e820.h>
 #include <asm/mtrr.h>
+#include <asm/msr.h>
 
 #include "cpu.h"
 
@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
 		 */
 		c->x86_capability[5] = cpuid_edx(0xC0000001);
 	}
-
+#ifdef CONFIG_X86_32
 	/* Cyrix III family needs CX8 & PGE explicitly enabled. */
 	if (c->x86_model >= 6 && c->x86_model <= 9) {
 		rdmsr(MSR_VIA_FCR, lo, hi);
@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
 	/* Before Nehemiah, the C3's had 3dNOW! */
 	if (c->x86_model >= 6 && c->x86_model < 9)
 		set_cpu_cap(c, X86_FEATURE_3DNOW);
+#endif
+	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+		c->x86_cache_alignment = c->x86_clflush_size * 2;
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	}
 
 	display_cacheinfo(c);
 }
@@ -316,16 +321,25 @@ enum {
 static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
 {
 	switch (c->x86) {
+#ifdef CONFIG_X86_32
 	case 5:
 		/* Emulate MTRRs using Centaur's MCR. */
 		set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
 		break;
+#endif
+	case 6:
+		if (c->x86_model >= 0xf)
+			set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+		break;
 	}
+#ifdef CONFIG_X86_64
+	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
 }
 
 static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 {
-
+#ifdef CONFIG_X86_32
 	char *name;
 	u32  fcr_set = 0;
 	u32  fcr_clr = 0;
@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 	 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
 	 */
 	clear_cpu_cap(c, 0*32+31);
-
+#endif
+	early_init_centaur(c);
 	switch (c->x86) {
+#ifdef CONFIG_X86_32
 	case 5:
 		switch (c->x86_model) {
 		case 4:
@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 		}
 		sprintf(c->x86_model_id, "WinChip %s", name);
 		break;
-
+#endif
 	case 6:
 		init_c3(c);
 		break;
 	}
+#ifdef CONFIG_X86_64
+	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
 }
 
 static unsigned int __cpuinit
 centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 {
+#ifdef CONFIG_X86_32
 	/* VIA C3 CPUs (670-68F) need further shifting. */
 	if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
 		size >>= 8;
@@ -464,11 +484,11 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 	if ((c->x86 == 6) && (c->x86_model == 9) &&
 				(c->x86_mask == 1) && (size == 65))
 		size -= 1;
-
+#endif
 	return size;
 }
 
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
 	.c_vendor	= "Centaur",
 	.c_ident	= { "CentaurHauls" },
 	.c_early_init	= early_init_centaur,
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
deleted file mode 100644
index a1625f5a1e7..00000000000
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#include "cpu.h"
-
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
-	if (c->x86 == 0x6 && c->x86_model >= 0xf)
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
-	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
-}
-
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
-{
-	early_init_centaur(c);
-
-	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
-		c->x86_cache_alignment = c->x86_clflush_size * 2;
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	}
-	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-}
-
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
-	.c_vendor	= "Centaur",
-	.c_ident	= { "CentaurHauls" },
-	.c_early_init	= early_init_centaur,
-	.c_init		= init_centaur,
-	.c_x86_vendor	= X86_VENDOR_CENTAUR,
-};
-
-cpu_dev_register(centaur_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b66af09a6c7..a86769efe0d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,53 +1,51 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
 #include <linux/bootmem.h>
+#include <linux/linkage.h>
 #include <linux/bitops.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
 #include <linux/smp.h>
-#include <linux/percpu.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/linkage.h>
+#include <linux/io.h>
+
+#include <asm/stackprotector.h>
+#include <asm/perf_counter.h>
 #include <asm/mmu_context.h>
+#include <asm/hypervisor.h>
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/topology.h>
+#include <asm/cpumask.h>
+#include <asm/pgtable.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
 #include <asm/mtrr.h>
+#include <asm/numa.h>
+#include <asm/asm.h>
+#include <asm/cpu.h>
 #include <asm/mce.h>
-#include <asm/perf_counter.h>
+#include <asm/msr.h>
 #include <asm/pat.h>
-#include <asm/asm.h>
-#include <asm/numa.h>
 #include <asm/smp.h>
-#include <asm/cpu.h>
-#include <asm/cpumask.h>
-#include <asm/apic.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/uv/uv.h>
 #endif
 
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/hypervisor.h>
-#include <asm/stackprotector.h>
-
 #include "cpu.h"
 
-#ifdef CONFIG_X86_64
-
 /* all of these masks are initialized in setup_cpu_local_masks() */
-cpumask_var_t cpu_callin_mask;
-cpumask_var_t cpu_callout_mask;
 cpumask_var_t cpu_initialized_mask;
+cpumask_var_t cpu_callout_mask;
+cpumask_var_t cpu_callin_mask;
 
 /* representing cpus for which sibling maps can be computed */
 cpumask_var_t cpu_sibling_setup_mask;
@@ -61,17 +59,7 @@ void __init setup_cpu_local_masks(void)
 	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
 }
 
-#else /* CONFIG_X86_32 */
-
-cpumask_t cpu_callin_map;
-cpumask_t cpu_callout_map;
-cpumask_t cpu_initialized;
-cpumask_t cpu_sibling_setup_map;
-
-#endif /* CONFIG_X86_32 */
-
-
-static struct cpu_dev *this_cpu __cpuinitdata;
+static const struct cpu_dev *this_cpu __cpuinitdata;
 
 DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
@@ -80,48 +68,48 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	 * IRET will check the segment types  kkeil 2000/10/28
 	 * Also sysret mandates a special GDT layout
 	 *
-	 * The TLS descriptors are currently at a different place compared to i386.
+	 * TLS descriptors are currently at a different place compared to i386.
 	 * Hopefully nobody expects them at a fixed place (Wine?)
 	 */
-	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
-	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
-	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
-	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
-	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
-	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+	[GDT_ENTRY_KERNEL32_CS]		= { { { 0x0000ffff, 0x00cf9b00 } } },
+	[GDT_ENTRY_KERNEL_CS]		= { { { 0x0000ffff, 0x00af9b00 } } },
+	[GDT_ENTRY_KERNEL_DS]		= { { { 0x0000ffff, 0x00cf9300 } } },
+	[GDT_ENTRY_DEFAULT_USER32_CS]	= { { { 0x0000ffff, 0x00cffb00 } } },
+	[GDT_ENTRY_DEFAULT_USER_DS]	= { { { 0x0000ffff, 0x00cff300 } } },
+	[GDT_ENTRY_DEFAULT_USER_CS]	= { { { 0x0000ffff, 0x00affb00 } } },
 #else
-	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
-	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
-	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
-	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+	[GDT_ENTRY_KERNEL_CS]		= { { { 0x0000ffff, 0x00cf9a00 } } },
+	[GDT_ENTRY_KERNEL_DS]		= { { { 0x0000ffff, 0x00cf9200 } } },
+	[GDT_ENTRY_DEFAULT_USER_CS]	= { { { 0x0000ffff, 0x00cffa00 } } },
+	[GDT_ENTRY_DEFAULT_USER_DS]	= { { { 0x0000ffff, 0x00cff200 } } },
 	/*
 	 * Segments used for calling PnP BIOS have byte granularity.
 	 * They code segments and data segments have fixed 64k limits,
 	 * the transfer segment sizes are set at run time.
 	 */
 	/* 32-bit code */
-	[GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+	[GDT_ENTRY_PNPBIOS_CS32]	= { { { 0x0000ffff, 0x00409a00 } } },
 	/* 16-bit code */
-	[GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+	[GDT_ENTRY_PNPBIOS_CS16]	= { { { 0x0000ffff, 0x00009a00 } } },
 	/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+	[GDT_ENTRY_PNPBIOS_DS]		= { { { 0x0000ffff, 0x00009200 } } },
 	/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+	[GDT_ENTRY_PNPBIOS_TS1]		= { { { 0x00000000, 0x00009200 } } },
 	/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+	[GDT_ENTRY_PNPBIOS_TS2]		= { { { 0x00000000, 0x00009200 } } },
 	/*
 	 * The APM segments have byte granularity and their bases
 	 * are set at run time.  All have 64k limits.
 	 */
 	/* 32-bit code */
-	[GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+	[GDT_ENTRY_APMBIOS_BASE]	= { { { 0x0000ffff, 0x00409a00 } } },
 	/* 16-bit code */
-	[GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+	[GDT_ENTRY_APMBIOS_BASE+1]	= { { { 0x0000ffff, 0x00009a00 } } },
 	/* data */
-	[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+	[GDT_ENTRY_APMBIOS_BASE+2]	= { { { 0x0000ffff, 0x00409200 } } },
 
-	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
-	[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+	[GDT_ENTRY_ESPFIX_SS]		= { { { 0x00000000, 0x00c09200 } } },
+	[GDT_ENTRY_PERCPU]		= { { { 0x0000ffff, 0x00cf9200 } } },
 	GDT_STACK_CANARY_INIT
 #endif
 } };
@@ -165,16 +153,17 @@ static inline int flag_is_changeable_p(u32 flag)
 	 * the CPUID. Add "volatile" to not allow gcc to
 	 * optimize the subsequent calls to this function.
 	 */
-	asm volatile ("pushfl\n\t"
-		      "pushfl\n\t"
-		      "popl %0\n\t"
-		      "movl %0,%1\n\t"
-		      "xorl %2,%0\n\t"
-		      "pushl %0\n\t"
-		      "popfl\n\t"
-		      "pushfl\n\t"
-		      "popl %0\n\t"
-		      "popfl\n\t"
+	asm volatile ("pushfl		\n\t"
+		      "pushfl		\n\t"
+		      "popl %0		\n\t"
+		      "movl %0, %1	\n\t"
+		      "xorl %2, %0	\n\t"
+		      "pushl %0		\n\t"
+		      "popfl		\n\t"
+		      "pushfl		\n\t"
+		      "popl %0		\n\t"
+		      "popfl		\n\t"
+
 		      : "=&r" (f1), "=&r" (f2)
 		      : "ir" (flag));
 
@@ -189,18 +178,22 @@ static int __cpuinit have_cpuid_p(void)
 
 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
 {
-	if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
-		/* Disable processor serial number */
-		unsigned long lo, hi;
-		rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-		lo |= 0x200000;
-		wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-		printk(KERN_NOTICE "CPU serial number disabled.\n");
-		clear_cpu_cap(c, X86_FEATURE_PN);
-
-		/* Disabling the serial number may affect the cpuid level */
-		c->cpuid_level = cpuid_eax(0);
-	}
+	unsigned long lo, hi;
+
+	if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+		return;
+
+	/* Disable processor serial number: */
+
+	rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+	lo |= 0x200000;
+	wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+
+	printk(KERN_NOTICE "CPU serial number disabled.\n");
+	clear_cpu_cap(c, X86_FEATURE_PN);
+
+	/* Disabling the serial number may affect the cpuid level */
+	c->cpuid_level = cpuid_eax(0);
 }
 
 static int __init x86_serial_nr_setup(char *s)
@@ -233,6 +226,7 @@ struct cpuid_dependent_feature {
 	u32 feature;
 	u32 level;
 };
+
 static const struct cpuid_dependent_feature __cpuinitconst
 cpuid_dependent_features[] = {
 	{ X86_FEATURE_MWAIT,		0x00000005 },
@@ -244,7 +238,11 @@ cpuid_dependent_features[] = {
 static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
 {
 	const struct cpuid_dependent_feature *df;
+
 	for (df = cpuid_dependent_features; df->feature; df++) {
+
+		if (!cpu_has(c, df->feature))
+			continue;
 		/*
 		 * Note: cpuid_level is set to -1 if unavailable, but
 		 * extended_extended_level is set to 0 if unavailable
@@ -252,32 +250,32 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
 		 * when signed; hence the weird messing around with
 		 * signs here...
 		 */
-		if (cpu_has(c, df->feature) &&
-		    ((s32)df->level < 0 ?
+		if (!((s32)df->level < 0 ?
 		     (u32)df->level > (u32)c->extended_cpuid_level :
-		     (s32)df->level > (s32)c->cpuid_level)) {
-			clear_cpu_cap(c, df->feature);
-			if (warn)
-				printk(KERN_WARNING
-				       "CPU: CPU feature %s disabled "
-				       "due to lack of CPUID level 0x%x\n",
-				       x86_cap_flags[df->feature],
-				       df->level);
-		}
+		     (s32)df->level > (s32)c->cpuid_level))
+			continue;
+
+		clear_cpu_cap(c, df->feature);
+		if (!warn)
+			continue;
+
+		printk(KERN_WARNING
+		       "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+				x86_cap_flags[df->feature], df->level);
 	}
 }
 
 /*
  * Naming convention should be: <Name> [(<Codename>)]
  * This table only is used unless init_<vendor>() below doesn't set it;
- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
- *
+ * in particular, if CPUID levels 0x80000002..4 are supported, this
+ * isn't used
  */
 
 /* Look up CPU names by table lookup. */
-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
 {
-	struct cpu_model_info *info;
+	const struct cpu_model_info *info;
 
 	if (c->x86_model >= 16)
 		return NULL;	/* Range check */
@@ -308,8 +306,10 @@ void load_percpu_segment(int cpu)
 	load_stack_canary_segment();
 }
 
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
+/*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ */
 void switch_to_new_gdt(int cpu)
 {
 	struct desc_ptr gdt_descr;
@@ -322,7 +322,7 @@ void switch_to_new_gdt(int cpu)
 	load_percpu_segment(cpu);
 }
 
-static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
 
 static void __cpuinit default_init(struct cpuinfo_x86 *c)
 {
@@ -341,7 +341,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
 #endif
 }
 
-static struct cpu_dev __cpuinitdata default_cpu = {
+static const struct cpu_dev __cpuinitconst default_cpu = {
 	.c_init	= default_init,
 	.c_vendor = "Unknown",
 	.c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -355,22 +355,24 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
 	if (c->extended_cpuid_level < 0x80000004)
 		return;
 
-	v = (unsigned int *) c->x86_model_id;
+	v = (unsigned int *)c->x86_model_id;
 	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
 	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
 	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
 	c->x86_model_id[48] = 0;
 
-	/* Intel chips right-justify this string for some dumb reason;
-	   undo that brain damage */
+	/*
+	 * Intel chips right-justify this string for some dumb reason;
+	 * undo that brain damage:
+	 */
 	p = q = &c->x86_model_id[0];
 	while (*p == ' ')
-	     p++;
+		p++;
 	if (p != q) {
-	     while (*p)
-		  *q++ = *p++;
-	     while (q <= &c->x86_model_id[48])
-		  *q++ = '\0';	/* Zero-pad the rest */
+		while (*p)
+			*q++ = *p++;
+		while (q <= &c->x86_model_id[48])
+			*q++ = '\0';	/* Zero-pad the rest */
 	}
 }
 
@@ -439,27 +441,30 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
 	if (smp_num_siblings == 1) {
 		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-	} else if (smp_num_siblings > 1) {
+		goto out;
+	}
 
-		if (smp_num_siblings > nr_cpu_ids) {
-			printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
-					smp_num_siblings);
-			smp_num_siblings = 1;
-			return;
-		}
+	if (smp_num_siblings <= 1)
+		goto out;
 
-		index_msb = get_count_order(smp_num_siblings);
-		c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+	if (smp_num_siblings > nr_cpu_ids) {
+		pr_warning("CPU: Unsupported number of siblings %d",
+			   smp_num_siblings);
+		smp_num_siblings = 1;
+		return;
+	}
 
-		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+	index_msb = get_count_order(smp_num_siblings);
+	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
 
-		index_msb = get_count_order(smp_num_siblings);
+	smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
-		core_bits = get_count_order(c->x86_max_cores);
+	index_msb = get_count_order(smp_num_siblings);
 
-		c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
-					       ((1 << core_bits) - 1);
-	}
+	core_bits = get_count_order(c->x86_max_cores);
+
+	c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+				       ((1 << core_bits) - 1);
 
 out:
 	if ((c->x86_max_cores * smp_num_siblings) > 1) {
@@ -474,8 +479,8 @@ out:
 static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
 	char *v = c->x86_vendor_id;
-	int i;
 	static int printed;
+	int i;
 
 	for (i = 0; i < X86_VENDOR_NUM; i++) {
 		if (!cpu_devs[i])
@@ -484,6 +489,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 		if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
 		    (cpu_devs[i]->c_ident[1] &&
 		     !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+
 			this_cpu = cpu_devs[i];
 			c->x86_vendor = this_cpu->c_x86_vendor;
 			return;
@@ -492,7 +498,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 
 	if (!printed) {
 		printed++;
-		printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+		printk(KERN_ERR
+		    "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+
 		printk(KERN_ERR "CPU: Your system may be unstable.\n");
 	}
 
@@ -512,14 +520,17 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
 	/* Intel-defined flags: level 0x00000001 */
 	if (c->cpuid_level >= 0x00000001) {
 		u32 junk, tfms, cap0, misc;
+
 		cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
 		c->x86 = (tfms >> 8) & 0xf;
 		c->x86_model = (tfms >> 4) & 0xf;
 		c->x86_mask = tfms & 0xf;
+
 		if (c->x86 == 0xf)
 			c->x86 += (tfms >> 20) & 0xff;
 		if (c->x86 >= 0x6)
 			c->x86_model += ((tfms >> 16) & 0xf) << 4;
+
 		if (cap0 & (1<<19)) {
 			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
 			c->x86_cache_alignment = c->x86_clflush_size;
@@ -535,6 +546,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 	/* Intel-defined flags: level 0x00000001 */
 	if (c->cpuid_level >= 0x00000001) {
 		u32 capability, excap;
+
 		cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
 		c->x86_capability[0] = capability;
 		c->x86_capability[4] = excap;
@@ -543,6 +555,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 	/* AMD-defined flags: level 0x80000001 */
 	xlvl = cpuid_eax(0x80000000);
 	c->extended_cpuid_level = xlvl;
+
 	if ((xlvl & 0xffff0000) == 0x80000000) {
 		if (xlvl >= 0x80000001) {
 			c->x86_capability[1] = cpuid_edx(0x80000001);
@@ -550,13 +563,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 		}
 	}
 
-#ifdef CONFIG_X86_64
 	if (c->extended_cpuid_level >= 0x80000008) {
 		u32 eax = cpuid_eax(0x80000008);
 
 		c->x86_virt_bits = (eax >> 8) & 0xff;
 		c->x86_phys_bits = eax & 0xff;
 	}
+#ifdef CONFIG_X86_32
+	else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+		c->x86_phys_bits = 36;
 #endif
 
 	if (c->extended_cpuid_level >= 0x80000007)
@@ -603,8 +618,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_64
 	c->x86_clflush_size = 64;
+	c->x86_phys_bits = 36;
+	c->x86_virt_bits = 48;
 #else
 	c->x86_clflush_size = 32;
+	c->x86_phys_bits = 32;
+	c->x86_virt_bits = 32;
 #endif
 	c->x86_cache_alignment = c->x86_clflush_size;
 
@@ -635,12 +654,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 void __init early_cpu_init(void)
 {
-	struct cpu_dev **cdev;
+	const struct cpu_dev *const *cdev;
 	int count = 0;
 
-	printk("KERNEL supported cpus:\n");
+	printk(KERN_INFO "KERNEL supported cpus:\n");
 	for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
-		struct cpu_dev *cpudev = *cdev;
+		const struct cpu_dev *cpudev = *cdev;
 		unsigned int j;
 
 		if (count >= X86_VENDOR_NUM)
@@ -651,7 +670,7 @@ void __init early_cpu_init(void)
 		for (j = 0; j < 2; j++) {
 			if (!cpudev->c_ident[j])
 				continue;
-			printk("  %s %s\n", cpudev->c_vendor,
+			printk(KERN_INFO "  %s %s\n", cpudev->c_vendor,
 				cpudev->c_ident[j]);
 		}
 	}
@@ -727,9 +746,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	c->x86_coreid_bits = 0;
 #ifdef CONFIG_X86_64
 	c->x86_clflush_size = 64;
+	c->x86_phys_bits = 36;
+	c->x86_virt_bits = 48;
 #else
 	c->cpuid_level = -1;	/* CPUID not detected */
 	c->x86_clflush_size = 32;
+	c->x86_phys_bits = 32;
+	c->x86_virt_bits = 32;
 #endif
 	c->x86_cache_alignment = c->x86_clflush_size;
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -760,8 +783,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	squash_the_stupid_serial_number(c);
 
 	/*
-	 * The vendor-specific functions might have changed features.  Now
-	 * we do "generic changes."
+	 * The vendor-specific functions might have changed features.
+	 * Now we do "generic changes."
 	 */
 
 	/* Filter out anything that depends on CPUID levels we don't have */
@@ -769,7 +792,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 
 	/* If the model name is still unset, do table lookup. */
 	if (!c->x86_model_id[0]) {
-		char *p;
+		const char *p;
 		p = table_lookup_model(c);
 		if (p)
 			strcpy(c->x86_model_id, p);
@@ -825,6 +848,7 @@ static void vgetcpu_set_mode(void)
 void __init identify_boot_cpu(void)
 {
 	identify_cpu(&boot_cpu_data);
+	init_c1e_mask();
 #ifdef CONFIG_X86_32
 	sysenter_setup();
 	enable_sep_cpu();
@@ -845,11 +869,11 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
 }
 
 struct msr_range {
-	unsigned min;
-	unsigned max;
+	unsigned	min;
+	unsigned	max;
 };
 
-static struct msr_range msr_range_array[] __cpuinitdata = {
+static const struct msr_range msr_range_array[] __cpuinitconst = {
 	{ 0x00000000, 0x00000418},
 	{ 0xc0000000, 0xc000040b},
 	{ 0xc0010000, 0xc0010142},
@@ -858,14 +882,15 @@ static struct msr_range msr_range_array[] __cpuinitdata = {
 
 static void __cpuinit print_cpu_msr(void)
 {
+	unsigned index_min, index_max;
 	unsigned index;
 	u64 val;
 	int i;
-	unsigned index_min, index_max;
 
 	for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
 		index_min = msr_range_array[i].min;
 		index_max = msr_range_array[i].max;
+
 		for (index = index_min; index < index_max; index++) {
 			if (rdmsrl_amd_safe(index, &val))
 				continue;
@@ -875,6 +900,7 @@ static void __cpuinit print_cpu_msr(void)
 }
 
 static int show_msr __cpuinitdata;
+
 static __init int setup_show_msr(char *arg)
 {
 	int num;
@@ -896,12 +922,14 @@ __setup("noclflush", setup_noclflush);
 
 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 {
-	char *vendor = NULL;
+	const char *vendor = NULL;
 
-	if (c->x86_vendor < X86_VENDOR_NUM)
+	if (c->x86_vendor < X86_VENDOR_NUM) {
 		vendor = this_cpu->c_vendor;
-	else if (c->cpuid_level >= 0)
-		vendor = c->x86_vendor_id;
+	} else {
+		if (c->cpuid_level >= 0)
+			vendor = c->x86_vendor_id;
+	}
 
 	if (vendor && !strstr(c->x86_model_id, vendor))
 		printk(KERN_CONT "%s ", vendor);
@@ -928,10 +956,12 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 static __init int setup_disablecpuid(char *arg)
 {
 	int bit;
+
 	if (get_option(&arg, &bit) && bit < NCAPINTS*32)
 		setup_clear_cpu_cap(bit);
 	else
 		return 0;
+
 	return 1;
 }
 __setup("clearcpuid=", setup_disablecpuid);
@@ -941,6 +971,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
 		     irq_stack_union) __aligned(PAGE_SIZE);
+
 DEFINE_PER_CPU(char *, irq_stack_ptr) =
 	init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
 
@@ -950,12 +981,21 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
+	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
+};
+
 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
 	__aligned(PAGE_SIZE);
 
-extern asmlinkage void ignore_sysret(void);
-
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
@@ -985,7 +1025,7 @@ unsigned long kernel_eflags;
  */
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
-#else	/* x86_64 */
+#else	/* CONFIG_X86_64 */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU(unsigned long, stack_canary);
@@ -997,9 +1037,26 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
 	memset(regs, 0, sizeof(struct pt_regs));
 	regs->fs = __KERNEL_PERCPU;
 	regs->gs = __KERNEL_STACK_CANARY;
+
 	return regs;
 }
-#endif	/* x86_64 */
+#endif	/* CONFIG_X86_64 */
+
+/*
+ * Clear all 6 debug registers:
+ */
+static void clear_all_debug_regs(void)
+{
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		/* Ignore db4, db5 */
+		if ((i == 4) || (i == 5))
+			continue;
+
+		set_debugreg(0, i);
+	}
+}
 
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
@@ -1009,15 +1066,20 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
  * A lot of state is already set up in PDA init for 64 bit
  */
 #ifdef CONFIG_X86_64
+
 void __cpuinit cpu_init(void)
 {
-	int cpu = stack_smp_processor_id();
-	struct tss_struct *t = &per_cpu(init_tss, cpu);
-	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
-	unsigned long v;
+	struct orig_ist *orig_ist;
 	struct task_struct *me;
+	struct tss_struct *t;
+	unsigned long v;
+	int cpu;
 	int i;
 
+	cpu = stack_smp_processor_id();
+	t = &per_cpu(init_tss, cpu);
+	orig_ist = &per_cpu(orig_ist, cpu);
+
 #ifdef CONFIG_NUMA
 	if (cpu != 0 && percpu_read(node_number) == 0 &&
 	    cpu_to_node(cpu) != NUMA_NO_NODE)
@@ -1058,19 +1120,17 @@ void __cpuinit cpu_init(void)
 	 * set up and load the per-CPU TSS
 	 */
 	if (!orig_ist->ist[0]) {
-		static const unsigned int sizes[N_EXCEPTION_STACKS] = {
-		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
-		  [DEBUG_STACK - 1] = DEBUG_STKSZ
-		};
 		char *estacks = per_cpu(exception_stacks, cpu);
+
 		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-			estacks += sizes[v];
+			estacks += exception_stack_sizes[v];
 			orig_ist->ist[v] = t->x86_tss.ist[v] =
 					(unsigned long)estacks;
 		}
 	}
 
 	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
 	/*
 	 * <= is required because the CPU will access up to
 	 * 8 bits beyond the end of the IO permission bitmap.
@@ -1080,8 +1140,7 @@ void __cpuinit cpu_init(void)
 
 	atomic_inc(&init_mm.mm_count);
 	me->active_mm = &init_mm;
-	if (me->mm)
-		BUG();
+	BUG_ON(me->mm);
 	enter_lazy_tlb(&init_mm, me);
 
 	load_sp0(t, &current->thread);
@@ -1100,17 +1159,7 @@ void __cpuinit cpu_init(void)
 		arch_kgdb_ops.correct_hw_break();
 	else
 #endif
-	{
-		/*
-		 * Clear all 6 debug registers:
-		 */
-		set_debugreg(0UL, 0);
-		set_debugreg(0UL, 1);
-		set_debugreg(0UL, 2);
-		set_debugreg(0UL, 3);
-		set_debugreg(0UL, 6);
-		set_debugreg(0UL, 7);
-	}
+		clear_all_debug_regs();
 
 	fpu_init();
 
@@ -1131,7 +1180,8 @@ void __cpuinit cpu_init(void)
 
 	if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
 		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
-		for (;;) local_irq_enable();
+		for (;;)
+			local_irq_enable();
 	}
 
 	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1147,8 +1197,7 @@ void __cpuinit cpu_init(void)
 	 */
 	atomic_inc(&init_mm.mm_count);
 	curr->active_mm = &init_mm;
-	if (curr->mm)
-		BUG();
+	BUG_ON(curr->mm);
 	enter_lazy_tlb(&init_mm, curr);
 
 	load_sp0(t, thread);
@@ -1161,13 +1210,7 @@ void __cpuinit cpu_init(void)
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
 #endif
 
-	/* Clear all 6 debug registers: */
-	set_debugreg(0, 0);
-	set_debugreg(0, 1);
-	set_debugreg(0, 2);
-	set_debugreg(0, 3);
-	set_debugreg(0, 6);
-	set_debugreg(0, 7);
+	clear_all_debug_regs();
 
 	/*
 	 * Force FPU initialization:
@@ -1187,6 +1230,4 @@ void __cpuinit cpu_init(void)
 
 	xsave_init();
 }
-
-
 #endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index de4094a3921..6de9a908e40 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -3,33 +3,34 @@
 #define ARCH_X86_CPU_H
 
 struct cpu_model_info {
-	int vendor;
-	int family;
-	char *model_names[16];
+	int		vendor;
+	int		family;
+	const char	*model_names[16];
 };
 
 /* attempt to consolidate cpu attributes */
 struct cpu_dev {
-	char	* c_vendor;
+	const char	*c_vendor;
 
 	/* some have two possibilities for cpuid string */
-	char	* c_ident[2];	
+	const char	*c_ident[2];
 
 	struct		cpu_model_info c_models[4];
 
-	void            (*c_early_init)(struct cpuinfo_x86 *c);
-	void		(*c_init)(struct cpuinfo_x86 * c);
-	void		(*c_identify)(struct cpuinfo_x86 * c);
-	unsigned int	(*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
-	int	c_x86_vendor;
+	void            (*c_early_init)(struct cpuinfo_x86 *);
+	void		(*c_init)(struct cpuinfo_x86 *);
+	void		(*c_identify)(struct cpuinfo_x86 *);
+	unsigned int	(*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
+	int		c_x86_vendor;
 };
 
 #define cpu_dev_register(cpu_devX) \
-	static struct cpu_dev *__cpu_dev_##cpu_devX __used \
+	static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
 	__attribute__((__section__(".x86_cpu_dev.init"))) = \
 	&cpu_devX;
 
-extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
+extern const struct cpu_dev *const __x86_cpu_dev_start[],
+			    *const __x86_cpu_dev_end[];
 
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
 
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755
index 00000000000..46e29ab96c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -0,0 +1,901 @@
+/*
+ * CPU x86 architecture debug code
+ *
+ * Copyright(C) 2009 Jaswinder Singh Rajput
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+
+#include <asm/cpu_debug.h>
+#include <asm/paravirt.h>
+#include <asm/system.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
+static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
+static DEFINE_PER_CPU(unsigned, cpu_modelflag);
+static DEFINE_PER_CPU(int, cpu_priv_count);
+static DEFINE_PER_CPU(unsigned, cpu_model);
+
+static DEFINE_MUTEX(cpu_debug_lock);
+
+static struct dentry *cpu_debugfs_dir;
+
+static struct cpu_debug_base cpu_base[] = {
+	{ "mc",		CPU_MC,		0	},
+	{ "monitor",	CPU_MONITOR,	0	},
+	{ "time",	CPU_TIME,	0	},
+	{ "pmc",	CPU_PMC,	1	},
+	{ "platform",	CPU_PLATFORM,	0	},
+	{ "apic",	CPU_APIC,	0	},
+	{ "poweron",	CPU_POWERON,	0	},
+	{ "control",	CPU_CONTROL,	0	},
+	{ "features",	CPU_FEATURES,	0	},
+	{ "lastbranch",	CPU_LBRANCH,	0	},
+	{ "bios",	CPU_BIOS,	0	},
+	{ "freq",	CPU_FREQ,	0	},
+	{ "mtrr",	CPU_MTRR,	0	},
+	{ "perf",	CPU_PERF,	0	},
+	{ "cache",	CPU_CACHE,	0	},
+	{ "sysenter",	CPU_SYSENTER,	0	},
+	{ "therm",	CPU_THERM,	0	},
+	{ "misc",	CPU_MISC,	0	},
+	{ "debug",	CPU_DEBUG,	0	},
+	{ "pat",	CPU_PAT,	0	},
+	{ "vmx",	CPU_VMX,	0	},
+	{ "call",	CPU_CALL,	0	},
+	{ "base",	CPU_BASE,	0	},
+	{ "ver",	CPU_VER,	0	},
+	{ "conf",	CPU_CONF,	0	},
+	{ "smm",	CPU_SMM,	0	},
+	{ "svm",	CPU_SVM,	0	},
+	{ "osvm",	CPU_OSVM,	0	},
+	{ "tss",	CPU_TSS,	0	},
+	{ "cr",		CPU_CR,		0	},
+	{ "dt",		CPU_DT,		0	},
+	{ "registers",	CPU_REG_ALL,	0	},
+};
+
+static struct cpu_file_base cpu_file[] = {
+	{ "index",	CPU_REG_ALL,	0	},
+	{ "value",	CPU_REG_ALL,	1	},
+};
+
+/* Intel Registers Range */
+static struct cpu_debug_range cpu_intel_range[] = {
+	{ 0x00000000, 0x00000001, CPU_MC,	CPU_INTEL_ALL		},
+	{ 0x00000006, 0x00000007, CPU_MONITOR,	CPU_CX_AT_XE		},
+	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_INTEL_ALL		},
+	{ 0x00000011, 0x00000013, CPU_PMC,	CPU_INTEL_PENTIUM	},
+	{ 0x00000017, 0x00000017, CPU_PLATFORM,	CPU_PX_CX_AT_XE		},
+	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_P6_CX_AT_XE		},
+
+	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_PX_CX_AT_XE		},
+	{ 0x0000002B, 0x0000002B, CPU_POWERON,	CPU_INTEL_XEON		},
+	{ 0x0000002C, 0x0000002C, CPU_FREQ,	CPU_INTEL_XEON		},
+	{ 0x0000003A, 0x0000003A, CPU_CONTROL,	CPU_CX_AT_XE		},
+
+	{ 0x00000040, 0x00000043, CPU_LBRANCH,	CPU_PM_CX_AT_XE		},
+	{ 0x00000044, 0x00000047, CPU_LBRANCH,	CPU_PM_CO_AT		},
+	{ 0x00000060, 0x00000063, CPU_LBRANCH,	CPU_C2_AT		},
+	{ 0x00000064, 0x00000067, CPU_LBRANCH,	CPU_INTEL_ATOM		},
+
+	{ 0x00000079, 0x00000079, CPU_BIOS,	CPU_P6_CX_AT_XE		},
+	{ 0x00000088, 0x0000008A, CPU_CACHE,	CPU_INTEL_P6		},
+	{ 0x0000008B, 0x0000008B, CPU_BIOS,	CPU_P6_CX_AT_XE		},
+	{ 0x0000009B, 0x0000009B, CPU_MONITOR,	CPU_INTEL_XEON		},
+
+	{ 0x000000C1, 0x000000C2, CPU_PMC,	CPU_P6_CX_AT		},
+	{ 0x000000CD, 0x000000CD, CPU_FREQ,	CPU_CX_AT		},
+	{ 0x000000E7, 0x000000E8, CPU_PERF,	CPU_CX_AT		},
+	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_P6_CX_XE		},
+
+	{ 0x00000116, 0x00000116, CPU_CACHE,	CPU_INTEL_P6		},
+	{ 0x00000118, 0x00000118, CPU_CACHE,	CPU_INTEL_P6		},
+	{ 0x00000119, 0x00000119, CPU_CACHE,	CPU_INTEL_PX		},
+	{ 0x0000011A, 0x0000011B, CPU_CACHE,	CPU_INTEL_P6		},
+	{ 0x0000011E, 0x0000011E, CPU_CACHE,	CPU_PX_CX_AT		},
+
+	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_P6_CX_AT_XE		},
+	{ 0x00000179, 0x0000017A, CPU_MC,	CPU_PX_CX_AT_XE		},
+	{ 0x0000017B, 0x0000017B, CPU_MC,	CPU_P6_XE		},
+	{ 0x00000186, 0x00000187, CPU_PMC,	CPU_P6_CX_AT		},
+	{ 0x00000198, 0x00000199, CPU_PERF,	CPU_PM_CX_AT_XE		},
+	{ 0x0000019A, 0x0000019A, CPU_TIME,	CPU_PM_CX_AT_XE		},
+	{ 0x0000019B, 0x0000019D, CPU_THERM,	CPU_PM_CX_AT_XE		},
+	{ 0x000001A0, 0x000001A0, CPU_MISC,	CPU_PM_CX_AT_XE		},
+
+	{ 0x000001C9, 0x000001C9, CPU_LBRANCH,	CPU_PM_CX_AT		},
+	{ 0x000001D7, 0x000001D8, CPU_LBRANCH,	CPU_INTEL_XEON		},
+	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_CX_AT_XE		},
+	{ 0x000001DA, 0x000001DA, CPU_LBRANCH,	CPU_INTEL_XEON		},
+	{ 0x000001DB, 0x000001DB, CPU_LBRANCH,	CPU_P6_XE		},
+	{ 0x000001DC, 0x000001DC, CPU_LBRANCH,	CPU_INTEL_P6		},
+	{ 0x000001DD, 0x000001DE, CPU_LBRANCH,	CPU_PX_CX_AT_XE		},
+	{ 0x000001E0, 0x000001E0, CPU_LBRANCH,	CPU_INTEL_P6		},
+
+	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_P6_CX_XE		},
+	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_P6_CX_XE		},
+	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_P6_CX_XE		},
+	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_P6_CX_XE		},
+	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_C2_AT_XE		},
+	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_P6_CX_XE		},
+
+	{ 0x00000300, 0x00000308, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x00000309, 0x0000030B, CPU_PMC,	CPU_C2_AT_XE		},
+	{ 0x0000030C, 0x00000311, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x00000345, 0x00000345, CPU_PMC,	CPU_C2_AT		},
+	{ 0x00000360, 0x00000371, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x0000038D, 0x00000390, CPU_PMC,	CPU_C2_AT		},
+	{ 0x000003A0, 0x000003BE, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x000003C0, 0x000003CD, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x000003E0, 0x000003E1, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x000003F0, 0x000003F0, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x000003F1, 0x000003F1, CPU_PMC,	CPU_C2_AT_XE		},
+	{ 0x000003F2, 0x000003F2, CPU_PMC,	CPU_INTEL_XEON		},
+
+	{ 0x00000400, 0x00000402, CPU_MC,	CPU_PM_CX_AT_XE		},
+	{ 0x00000403, 0x00000403, CPU_MC,	CPU_INTEL_XEON		},
+	{ 0x00000404, 0x00000406, CPU_MC,	CPU_PM_CX_AT_XE		},
+	{ 0x00000407, 0x00000407, CPU_MC,	CPU_INTEL_XEON		},
+	{ 0x00000408, 0x0000040A, CPU_MC,	CPU_PM_CX_AT_XE		},
+	{ 0x0000040B, 0x0000040B, CPU_MC,	CPU_INTEL_XEON		},
+	{ 0x0000040C, 0x0000040E, CPU_MC,	CPU_PM_CX_XE		},
+	{ 0x0000040F, 0x0000040F, CPU_MC,	CPU_INTEL_XEON		},
+	{ 0x00000410, 0x00000412, CPU_MC,	CPU_PM_CX_AT_XE		},
+	{ 0x00000413, 0x00000417, CPU_MC,	CPU_CX_AT_XE		},
+	{ 0x00000480, 0x0000048B, CPU_VMX,	CPU_CX_AT_XE		},
+
+	{ 0x00000600, 0x00000600, CPU_DEBUG,	CPU_PM_CX_AT_XE		},
+	{ 0x00000680, 0x0000068F, CPU_LBRANCH,	CPU_INTEL_XEON		},
+	{ 0x000006C0, 0x000006CF, CPU_LBRANCH,	CPU_INTEL_XEON		},
+
+	{ 0x000107CC, 0x000107D3, CPU_PMC,	CPU_INTEL_XEON_MP	},
+
+	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_INTEL_XEON		},
+	{ 0xC0000081, 0xC0000082, CPU_CALL,	CPU_INTEL_XEON		},
+	{ 0xC0000084, 0xC0000084, CPU_CALL,	CPU_INTEL_XEON		},
+	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_INTEL_XEON		},
+};
+
+/* AMD Registers Range */
+static struct cpu_debug_range cpu_amd_range[] = {
+	{ 0x00000000, 0x00000001, CPU_MC,	CPU_K10_PLUS,		},
+	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_K8_PLUS,		},
+	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_K8_PLUS,		},
+	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_K7_PLUS		},
+	{ 0x0000008B, 0x0000008B, CPU_VER,	CPU_K8_PLUS		},
+	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_K8_PLUS,		},
+
+	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_K8_PLUS,		},
+	{ 0x00000179, 0x0000017B, CPU_MC,	CPU_K8_PLUS,		},
+	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_K8_PLUS,		},
+	{ 0x000001DB, 0x000001DE, CPU_LBRANCH,	CPU_K8_PLUS,		},
+
+	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_K8_PLUS,		},
+	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_K8_PLUS,		},
+
+	{ 0x00000400, 0x00000413, CPU_MC,	CPU_K8_PLUS,		},
+
+	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_AMD_ALL,		},
+	{ 0xC0000081, 0xC0000084, CPU_CALL,	CPU_K8_PLUS,		},
+	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_K8_PLUS,		},
+	{ 0xC0000103, 0xC0000103, CPU_TIME,	CPU_K10_PLUS,		},
+
+	{ 0xC0010000, 0xC0010007, CPU_PMC,	CPU_K8_PLUS,		},
+	{ 0xC0010010, 0xC0010010, CPU_CONF,	CPU_K7_PLUS,		},
+	{ 0xC0010015, 0xC0010015, CPU_CONF,	CPU_K7_PLUS,		},
+	{ 0xC0010016, 0xC001001A, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0xC001001D, 0xC001001D, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0xC001001F, 0xC001001F, CPU_CONF,	CPU_K8_PLUS,		},
+	{ 0xC0010030, 0xC0010035, CPU_BIOS,	CPU_K8_PLUS,		},
+	{ 0xC0010044, 0xC0010048, CPU_MC,	CPU_K8_PLUS,		},
+	{ 0xC0010050, 0xC0010056, CPU_SMM,	CPU_K0F_PLUS,		},
+	{ 0xC0010058, 0xC0010058, CPU_CONF,	CPU_K10_PLUS,		},
+	{ 0xC0010060, 0xC0010060, CPU_CACHE,	CPU_AMD_11,		},
+	{ 0xC0010061, 0xC0010068, CPU_SMM,	CPU_K10_PLUS,		},
+	{ 0xC0010069, 0xC001006B, CPU_SMM,	CPU_AMD_11,		},
+	{ 0xC0010070, 0xC0010071, CPU_SMM,	CPU_K10_PLUS,		},
+	{ 0xC0010111, 0xC0010113, CPU_SMM,	CPU_K8_PLUS,		},
+	{ 0xC0010114, 0xC0010118, CPU_SVM,	CPU_K10_PLUS,		},
+	{ 0xC0010140, 0xC0010141, CPU_OSVM,	CPU_K10_PLUS,		},
+	{ 0xC0011022, 0xC0011023, CPU_CONF,	CPU_K10_PLUS,		},
+};
+
+
+/* Intel */
+static int get_intel_modelflag(unsigned model)
+{
+	int flag;
+
+	switch (model) {
+	case 0x0501:
+	case 0x0502:
+	case 0x0504:
+		flag = CPU_INTEL_PENTIUM;
+		break;
+	case 0x0601:
+	case 0x0603:
+	case 0x0605:
+	case 0x0607:
+	case 0x0608:
+	case 0x060A:
+	case 0x060B:
+		flag = CPU_INTEL_P6;
+		break;
+	case 0x0609:
+	case 0x060D:
+		flag = CPU_INTEL_PENTIUM_M;
+		break;
+	case 0x060E:
+		flag = CPU_INTEL_CORE;
+		break;
+	case 0x060F:
+	case 0x0617:
+		flag = CPU_INTEL_CORE2;
+		break;
+	case 0x061C:
+		flag = CPU_INTEL_ATOM;
+		break;
+	case 0x0F00:
+	case 0x0F01:
+	case 0x0F02:
+	case 0x0F03:
+	case 0x0F04:
+		flag = CPU_INTEL_XEON_P4;
+		break;
+	case 0x0F06:
+		flag = CPU_INTEL_XEON_MP;
+		break;
+	default:
+		flag = CPU_NONE;
+		break;
+	}
+
+	return flag;
+}
+
+/* AMD */
+static int get_amd_modelflag(unsigned model)
+{
+	int flag;
+
+	switch (model >> 8) {
+	case 0x6:
+		flag = CPU_AMD_K6;
+		break;
+	case 0x7:
+		flag = CPU_AMD_K7;
+		break;
+	case 0x8:
+		flag = CPU_AMD_K8;
+		break;
+	case 0xf:
+		flag = CPU_AMD_0F;
+		break;
+	case 0x10:
+		flag = CPU_AMD_10;
+		break;
+	case 0x11:
+		flag = CPU_AMD_11;
+		break;
+	default:
+		flag = CPU_NONE;
+		break;
+	}
+
+	return flag;
+}
+
+static int get_cpu_modelflag(unsigned cpu)
+{
+	int flag;
+
+	flag = per_cpu(cpu_model, cpu);
+
+	switch (flag >> 16) {
+	case X86_VENDOR_INTEL:
+		flag = get_intel_modelflag(flag);
+		break;
+	case X86_VENDOR_AMD:
+		flag = get_amd_modelflag(flag & 0xffff);
+		break;
+	default:
+		flag = CPU_NONE;
+		break;
+	}
+
+	return flag;
+}
+
+static int get_cpu_range_count(unsigned cpu)
+{
+	int index;
+
+	switch (per_cpu(cpu_model, cpu) >> 16) {
+	case X86_VENDOR_INTEL:
+		index = ARRAY_SIZE(cpu_intel_range);
+		break;
+	case X86_VENDOR_AMD:
+		index = ARRAY_SIZE(cpu_amd_range);
+		break;
+	default:
+		index = 0;
+		break;
+	}
+
+	return index;
+}
+
+static int is_typeflag_valid(unsigned cpu, unsigned flag)
+{
+	unsigned vendor, modelflag;
+	int i, index;
+
+	/* Standard Registers should be always valid */
+	if (flag >= CPU_TSS)
+		return 1;
+
+	modelflag = per_cpu(cpu_modelflag, cpu);
+	vendor = per_cpu(cpu_model, cpu) >> 16;
+	index = get_cpu_range_count(cpu);
+
+	for (i = 0; i < index; i++) {
+		switch (vendor) {
+		case X86_VENDOR_INTEL:
+			if ((cpu_intel_range[i].model & modelflag) &&
+			    (cpu_intel_range[i].flag & flag))
+				return 1;
+			break;
+		case X86_VENDOR_AMD:
+			if ((cpu_amd_range[i].model & modelflag) &&
+			    (cpu_amd_range[i].flag & flag))
+				return 1;
+			break;
+		}
+	}
+
+	/* Invalid */
+	return 0;
+}
+
+static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
+			      int index, unsigned flag)
+{
+	unsigned modelflag;
+
+	modelflag = per_cpu(cpu_modelflag, cpu);
+	*max = 0;
+	switch (per_cpu(cpu_model, cpu) >> 16) {
+	case X86_VENDOR_INTEL:
+		if ((cpu_intel_range[index].model & modelflag) &&
+		    (cpu_intel_range[index].flag & flag)) {
+			*min = cpu_intel_range[index].min;
+			*max = cpu_intel_range[index].max;
+		}
+		break;
+	case X86_VENDOR_AMD:
+		if ((cpu_amd_range[index].model & modelflag) &&
+		    (cpu_amd_range[index].flag & flag)) {
+			*min = cpu_amd_range[index].min;
+			*max = cpu_amd_range[index].max;
+		}
+		break;
+	}
+
+	return *max;
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_cpu_data(struct seq_file *seq, unsigned type,
+			   u32 low, u32 high)
+{
+	struct cpu_private *priv;
+	u64 val = high;
+
+	if (seq) {
+		priv = seq->private;
+		if (priv->file) {
+			val = (val << 32) | low;
+			seq_printf(seq, "0x%llx\n", val);
+		} else
+			seq_printf(seq, " %08x: %08x_%08x\n",
+				   type, high, low);
+	} else
+		printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
+{
+	unsigned msr, msr_min, msr_max;
+	struct cpu_private *priv;
+	u32 low, high;
+	int i, range;
+
+	if (seq) {
+		priv = seq->private;
+		if (priv->file) {
+			if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
+					       &low, &high))
+				print_cpu_data(seq, priv->reg, low, high);
+			return;
+		}
+	}
+
+	range = get_cpu_range_count(cpu);
+
+	for (i = 0; i < range; i++) {
+		if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
+			continue;
+
+		for (msr = msr_min; msr <= msr_max; msr++) {
+			if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
+				continue;
+			print_cpu_data(seq, msr, low, high);
+		}
+	}
+}
+
+static void print_tss(void *arg)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	struct seq_file *seq = arg;
+	unsigned int seg;
+
+	seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
+	seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
+	seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
+	seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
+
+	seq_printf(seq, " RSI\t: %016lx\n", regs->si);
+	seq_printf(seq, " RDI\t: %016lx\n", regs->di);
+	seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
+	seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
+
+#ifdef CONFIG_X86_64
+	seq_printf(seq, " R08\t: %016lx\n", regs->r8);
+	seq_printf(seq, " R09\t: %016lx\n", regs->r9);
+	seq_printf(seq, " R10\t: %016lx\n", regs->r10);
+	seq_printf(seq, " R11\t: %016lx\n", regs->r11);
+	seq_printf(seq, " R12\t: %016lx\n", regs->r12);
+	seq_printf(seq, " R13\t: %016lx\n", regs->r13);
+	seq_printf(seq, " R14\t: %016lx\n", regs->r14);
+	seq_printf(seq, " R15\t: %016lx\n", regs->r15);
+#endif
+
+	asm("movl %%cs,%0" : "=r" (seg));
+	seq_printf(seq, " CS\t:             %04x\n", seg);
+	asm("movl %%ds,%0" : "=r" (seg));
+	seq_printf(seq, " DS\t:             %04x\n", seg);
+	seq_printf(seq, " SS\t:             %04lx\n", regs->ss & 0xffff);
+	asm("movl %%es,%0" : "=r" (seg));
+	seq_printf(seq, " ES\t:             %04x\n", seg);
+	asm("movl %%fs,%0" : "=r" (seg));
+	seq_printf(seq, " FS\t:             %04x\n", seg);
+	asm("movl %%gs,%0" : "=r" (seg));
+	seq_printf(seq, " GS\t:             %04x\n", seg);
+
+	seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
+
+	seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
+}
+
+static void print_cr(void *arg)
+{
+	struct seq_file *seq = arg;
+
+	seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
+	seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
+	seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
+	seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
+#ifdef CONFIG_X86_64
+	seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
+#endif
+}
+
+static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
+{
+	seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
+}
+
+static void print_dt(void *seq)
+{
+	struct desc_ptr dt;
+	unsigned long ldt;
+
+	/* IDT */
+	store_idt((struct desc_ptr *)&dt);
+	print_desc_ptr("IDT", seq, dt);
+
+	/* GDT */
+	store_gdt((struct desc_ptr *)&dt);
+	print_desc_ptr("GDT", seq, dt);
+
+	/* LDT */
+	store_ldt(ldt);
+	seq_printf(seq, " LDT\t: %016lx\n", ldt);
+
+	/* TR */
+	store_tr(ldt);
+	seq_printf(seq, " TR\t: %016lx\n", ldt);
+}
+
+static void print_dr(void *arg)
+{
+	struct seq_file *seq = arg;
+	unsigned long dr;
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		/* Ignore db4, db5 */
+		if ((i == 4) || (i == 5))
+			continue;
+		get_debugreg(dr, i);
+		seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
+	}
+
+	seq_printf(seq, "\n MSR\t:\n");
+}
+
+static void print_apic(void *arg)
+{
+	struct seq_file *seq = arg;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	seq_printf(seq, " LAPIC\t:\n");
+	seq_printf(seq, " ID\t\t: %08x\n",  apic_read(APIC_ID) >> 24);
+	seq_printf(seq, " LVR\t\t: %08x\n",  apic_read(APIC_LVR));
+	seq_printf(seq, " TASKPRI\t: %08x\n",  apic_read(APIC_TASKPRI));
+	seq_printf(seq, " ARBPRI\t\t: %08x\n",  apic_read(APIC_ARBPRI));
+	seq_printf(seq, " PROCPRI\t: %08x\n",  apic_read(APIC_PROCPRI));
+	seq_printf(seq, " LDR\t\t: %08x\n",  apic_read(APIC_LDR));
+	seq_printf(seq, " DFR\t\t: %08x\n",  apic_read(APIC_DFR));
+	seq_printf(seq, " SPIV\t\t: %08x\n",  apic_read(APIC_SPIV));
+	seq_printf(seq, " ISR\t\t: %08x\n",  apic_read(APIC_ISR));
+	seq_printf(seq, " ESR\t\t: %08x\n",  apic_read(APIC_ESR));
+	seq_printf(seq, " ICR\t\t: %08x\n",  apic_read(APIC_ICR));
+	seq_printf(seq, " ICR2\t\t: %08x\n",  apic_read(APIC_ICR2));
+	seq_printf(seq, " LVTT\t\t: %08x\n",  apic_read(APIC_LVTT));
+	seq_printf(seq, " LVTTHMR\t: %08x\n",  apic_read(APIC_LVTTHMR));
+	seq_printf(seq, " LVTPC\t\t: %08x\n",  apic_read(APIC_LVTPC));
+	seq_printf(seq, " LVT0\t\t: %08x\n",  apic_read(APIC_LVT0));
+	seq_printf(seq, " LVT1\t\t: %08x\n",  apic_read(APIC_LVT1));
+	seq_printf(seq, " LVTERR\t\t: %08x\n",  apic_read(APIC_LVTERR));
+	seq_printf(seq, " TMICT\t\t: %08x\n",  apic_read(APIC_TMICT));
+	seq_printf(seq, " TMCCT\t\t: %08x\n",  apic_read(APIC_TMCCT));
+	seq_printf(seq, " TDCR\t\t: %08x\n",  apic_read(APIC_TDCR));
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+	seq_printf(seq, "\n MSR\t:\n");
+}
+
+static int cpu_seq_show(struct seq_file *seq, void *v)
+{
+	struct cpu_private *priv = seq->private;
+
+	if (priv == NULL)
+		return -EINVAL;
+
+	switch (cpu_base[priv->type].flag) {
+	case CPU_TSS:
+		smp_call_function_single(priv->cpu, print_tss, seq, 1);
+		break;
+	case CPU_CR:
+		smp_call_function_single(priv->cpu, print_cr, seq, 1);
+		break;
+	case CPU_DT:
+		smp_call_function_single(priv->cpu, print_dt, seq, 1);
+		break;
+	case CPU_DEBUG:
+		if (priv->file == CPU_INDEX_BIT)
+			smp_call_function_single(priv->cpu, print_dr, seq, 1);
+		print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+		break;
+	case CPU_APIC:
+		if (priv->file == CPU_INDEX_BIT)
+			smp_call_function_single(priv->cpu, print_apic, seq, 1);
+		print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+		break;
+
+	default:
+		print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+		break;
+	}
+	seq_printf(seq, "\n");
+
+	return 0;
+}
+
+static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos == 0) /* One time is enough ;-) */
+		return seq;
+
+	return NULL;
+}
+
+static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	return cpu_seq_start(seq, pos);
+}
+
+static void cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations cpu_seq_ops = {
+	.start		= cpu_seq_start,
+	.next		= cpu_seq_next,
+	.stop		= cpu_seq_stop,
+	.show		= cpu_seq_show,
+};
+
+static int cpu_seq_open(struct inode *inode, struct file *file)
+{
+	struct cpu_private *priv = inode->i_private;
+	struct seq_file *seq;
+	int err;
+
+	err = seq_open(file, &cpu_seq_ops);
+	if (!err) {
+		seq = file->private_data;
+		seq->private = priv;
+	}
+
+	return err;
+}
+
+static int write_msr(struct cpu_private *priv, u64 val)
+{
+	u32 low, high;
+
+	high = (val >> 32) & 0xffffffff;
+	low = val & 0xffffffff;
+
+	if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
+		return 0;
+
+	return -EPERM;
+}
+
+static int write_cpu_register(struct cpu_private *priv, const char *buf)
+{
+	int ret = -EPERM;
+	u64 val;
+
+	ret = strict_strtoull(buf, 0, &val);
+	if (ret < 0)
+		return ret;
+
+	/* Supporting only MSRs */
+	if (priv->type < CPU_TSS_BIT)
+		return write_msr(priv, val);
+
+	return ret;
+}
+
+static ssize_t cpu_write(struct file *file, const char __user *ubuf,
+			     size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct cpu_private *priv = seq->private;
+	char buf[19];
+
+	if ((priv == NULL) || (count >= sizeof(buf)))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, count))
+		return -EFAULT;
+
+	buf[count] = 0;
+
+	if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
+		if (!write_cpu_register(priv, buf))
+			return count;
+
+	return -EACCES;
+}
+
+static const struct file_operations cpu_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cpu_seq_open,
+	.read		= seq_read,
+	.write		= cpu_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
+			   unsigned file, struct dentry *dentry)
+{
+	struct cpu_private *priv = NULL;
+
+	/* Already intialized */
+	if (file == CPU_INDEX_BIT)
+		if (per_cpu(cpu_arr[type].init, cpu))
+			return 0;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv == NULL)
+		return -ENOMEM;
+
+	priv->cpu = cpu;
+	priv->type = type;
+	priv->reg = reg;
+	priv->file = file;
+	mutex_lock(&cpu_debug_lock);
+	per_cpu(priv_arr[type], cpu) = priv;
+	per_cpu(cpu_priv_count, cpu)++;
+	mutex_unlock(&cpu_debug_lock);
+
+	if (file)
+		debugfs_create_file(cpu_file[file].name, S_IRUGO,
+				    dentry, (void *)priv, &cpu_fops);
+	else {
+		debugfs_create_file(cpu_base[type].name, S_IRUGO,
+				    per_cpu(cpu_arr[type].dentry, cpu),
+				    (void *)priv, &cpu_fops);
+		mutex_lock(&cpu_debug_lock);
+		per_cpu(cpu_arr[type].init, cpu) = 1;
+		mutex_unlock(&cpu_debug_lock);
+	}
+
+	return 0;
+}
+
+static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
+			     struct dentry *dentry)
+{
+	unsigned file;
+	int err = 0;
+
+	for (file = 0; file <  ARRAY_SIZE(cpu_file); file++) {
+		err = cpu_create_file(cpu, type, reg, file, dentry);
+		if (err)
+			return err;
+	}
+
+	return err;
+}
+
+static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
+{
+	struct dentry *cpu_dentry = NULL;
+	unsigned reg, reg_min, reg_max;
+	int i, range, err = 0;
+	char reg_dir[12];
+	u32 low, high;
+
+	range = get_cpu_range_count(cpu);
+
+	for (i = 0; i < range; i++) {
+		if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
+				   cpu_base[type].flag))
+			continue;
+
+		for (reg = reg_min; reg <= reg_max; reg++) {
+			if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
+				continue;
+
+			sprintf(reg_dir, "0x%x", reg);
+			cpu_dentry = debugfs_create_dir(reg_dir, dentry);
+			err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
+			if (err)
+				return err;
+		}
+	}
+
+	return err;
+}
+
+static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
+{
+	struct dentry *cpu_dentry = NULL;
+	unsigned type;
+	int err = 0;
+
+	for (type = 0; type <  ARRAY_SIZE(cpu_base) - 1; type++) {
+		if (!is_typeflag_valid(cpu, cpu_base[type].flag))
+			continue;
+		cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
+		per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
+
+		if (type < CPU_TSS_BIT)
+			err = cpu_init_msr(cpu, type, cpu_dentry);
+		else
+			err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
+					      cpu_dentry);
+		if (err)
+			return err;
+	}
+
+	return err;
+}
+
+static int cpu_init_cpu(void)
+{
+	struct dentry *cpu_dentry = NULL;
+	struct cpuinfo_x86 *cpui;
+	char cpu_dir[12];
+	unsigned cpu;
+	int err = 0;
+
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+		cpui = &cpu_data(cpu);
+		if (!cpu_has(cpui, X86_FEATURE_MSR))
+			continue;
+		per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
+					   (cpui->x86 << 8) |
+					   (cpui->x86_model));
+		per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
+
+		sprintf(cpu_dir, "cpu%d", cpu);
+		cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
+		err = cpu_init_allreg(cpu, cpu_dentry);
+
+		pr_info("cpu%d(%d) debug files %d\n",
+			cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
+		if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
+			pr_err("Register files count %d exceeds limit %d\n",
+				per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
+			per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
+			err = -ENFILE;
+		}
+		if (err)
+			return err;
+	}
+
+	return err;
+}
+
+static int __init cpu_debug_init(void)
+{
+	cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
+
+	return cpu_init_cpu();
+}
+
+static void __exit cpu_debug_exit(void)
+{
+	int i, cpu;
+
+	if (cpu_debugfs_dir)
+		debugfs_remove_recursive(cpu_debugfs_dir);
+
+	for (cpu = 0; cpu <  nr_cpu_ids; cpu++)
+		for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
+			kfree(per_cpu(priv_arr[i], cpu));
+}
+
+module_init(cpu_debug_init);
+module_exit(cpu_debug_exit);
+
+MODULE_AUTHOR("Jaswinder Singh Rajput");
+MODULE_DESCRIPTION("CPU Debug module");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 65792c2cc46..52c83987547 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -87,30 +87,15 @@ config X86_POWERNOW_K7_ACPI
 config X86_POWERNOW_K8
 	tristate "AMD Opteron/Athlon64 PowerNow!"
 	select CPU_FREQ_TABLE
+	depends on ACPI && ACPI_PROCESSOR
 	help
-	  This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors.
+	  This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called powernow-k8.
 
 	  For details, take a look at <file:Documentation/cpu-freq/>.
 
-	  If in doubt, say N.
-
-config X86_POWERNOW_K8_ACPI
-	bool
-	prompt "ACPI Support" if X86_32
-	depends on ACPI && X86_POWERNOW_K8 && ACPI_PROCESSOR
-	depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
-	default y
-	help
-	  This provides access to the K8s Processor Performance States via ACPI.
-	  This driver is probably required for CPUFreq to work with multi-socket and
-	  SMP systems.  It is not required on at least some single-socket yet
-	  multi-core systems, even if SMP is enabled.
-
-	  It is safe to say Y here.
-
 config X86_GX_SUSPMOD
 	tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
 	depends on X86_32 && PCI
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 560f7760dae..509296df294 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -1,6 +1,11 @@
+# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
+# K8 systems. ACPI is preferred to all other hardware-specific drivers.
+# speedstep-* is preferred over p4-clockmod.
+
+obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o
 obj-$(CONFIG_X86_POWERNOW_K6)		+= powernow-k6.o
 obj-$(CONFIG_X86_POWERNOW_K7)		+= powernow-k7.o
-obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o
 obj-$(CONFIG_X86_LONGHAUL)		+= longhaul.o
 obj-$(CONFIG_X86_E_POWERSAVER)		+= e_powersaver.o
 obj-$(CONFIG_ELAN_CPUFREQ)		+= elanfreq.o
@@ -10,7 +15,6 @@ obj-$(CONFIG_X86_GX_SUSPMOD)		+= gx-suspmod.o
 obj-$(CONFIG_X86_SPEEDSTEP_ICH)		+= speedstep-ich.o
 obj-$(CONFIG_X86_SPEEDSTEP_LIB)		+= speedstep-lib.o
 obj-$(CONFIG_X86_SPEEDSTEP_SMI)		+= speedstep-smi.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o
 obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO)	+= speedstep-centrino.o
 obj-$(CONFIG_X86_P4_CLOCKMOD)		+= p4-clockmod.o
 obj-$(CONFIG_X86_CPUFREQ_NFORCE2)	+= cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c..19f6b9d27e8 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -1,5 +1,5 @@
 /*
- * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.4 $)
+ * acpi-cpufreq.c - ACPI Processor P-States Driver
  *
  *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
  *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
@@ -33,19 +33,21 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
-#include <linux/ftrace.h>
+#include <trace/power.h>
 
 #include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+
 #include <acpi/processor.h>
 
-#include <asm/io.h>
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
-#include <asm/delay.h>
-#include <asm/uaccess.h>
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"acpi-cpufreq", msg)
 
 MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
 MODULE_DESCRIPTION("ACPI Processor P-States Driver");
@@ -70,6 +72,8 @@ struct acpi_cpufreq_data {
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
 
+DEFINE_TRACE(power_mark);
+
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
 
@@ -95,7 +99,7 @@ static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
 
 	perf = data->acpi_data;
 
-	for (i=0; i<perf->state_count; i++) {
+	for (i = 0; i < perf->state_count; i++) {
 		if (value == perf->states[i].status)
 			return data->freq_table[i].frequency;
 	}
@@ -110,7 +114,7 @@ static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
 	msr &= INTEL_MSR_RANGE;
 	perf = data->acpi_data;
 
-	for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+	for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
 		if (msr == perf->states[data->freq_table[i].index].status)
 			return data->freq_table[i].frequency;
 	}
@@ -138,15 +142,13 @@ struct io_addr {
 	u8 bit_width;
 };
 
-typedef union {
-	struct msr_addr msr;
-	struct io_addr io;
-} drv_addr_union;
-
 struct drv_cmd {
 	unsigned int type;
 	const struct cpumask *mask;
-	drv_addr_union addr;
+	union {
+		struct msr_addr msr;
+		struct io_addr io;
+	} addr;
 	u32 val;
 };
 
@@ -369,7 +371,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
 	unsigned int cur_freq;
 	unsigned int i;
 
-	for (i=0; i<100; i++) {
+	for (i = 0; i < 100; i++) {
 		cur_freq = extract_freq(get_cur_val(mask), data);
 		if (cur_freq == freq)
 			return 1;
@@ -494,7 +496,7 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
 		unsigned long freq;
 		unsigned long freqn = perf->states[0].core_frequency * 1000;
 
-		for (i=0; i<(perf->state_count-1); i++) {
+		for (i = 0; i < (perf->state_count-1); i++) {
 			freq = freqn;
 			freqn = perf->states[i+1].core_frequency * 1000;
 			if ((2 * cpu_khz) > (freqn + freq)) {
@@ -601,7 +603,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (!data)
 		return -ENOMEM;
 
-	data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+	data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
 	per_cpu(drv_data, cpu) = data;
 
 	if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
@@ -673,17 +675,29 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	/* detect transition latency */
 	policy->cpuinfo.transition_latency = 0;
-	for (i=0; i<perf->state_count; i++) {
+	for (i = 0; i < perf->state_count; i++) {
 		if ((perf->states[i].transition_latency * 1000) >
 		    policy->cpuinfo.transition_latency)
 			policy->cpuinfo.transition_latency =
 			    perf->states[i].transition_latency * 1000;
 	}
 
+	/* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
+	if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
+	    policy->cpuinfo.transition_latency > 20 * 1000) {
+		static int print_once;
+		policy->cpuinfo.transition_latency = 20 * 1000;
+		if (!print_once) {
+			print_once = 1;
+			printk(KERN_INFO "Capping off P-state tranision latency"
+				" at 20 uS\n");
+		}
+	}
+
 	data->max_freq = perf->states[0].core_frequency * 1000;
 	/* table init */
-	for (i=0; i<perf->state_count; i++) {
-		if (i>0 && perf->states[i].core_frequency >=
+	for (i = 0; i < perf->state_count; i++) {
+		if (i > 0 && perf->states[i].core_frequency >=
 		    data->freq_table[valid_states-1].frequency / 1000)
 			continue;
 
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index 965ea52767a..733093d6043 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -32,7 +32,7 @@
  * nforce2_chipset:
  * FSB is changed using the chipset
  */
-static struct pci_dev *nforce2_chipset_dev;
+static struct pci_dev *nforce2_dev;
 
 /* fid:
  * multiplier * 10
@@ -56,7 +56,9 @@ MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
 MODULE_PARM_DESC(min_fsb,
 		"Minimum FSB to use, if not defined: current FSB - 50");
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
+#define PFX "cpufreq-nforce2: "
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"cpufreq-nforce2", msg)
 
 /**
  * nforce2_calc_fsb - calculate FSB
@@ -118,11 +120,11 @@ static void nforce2_write_pll(int pll)
 	int temp;
 
 	/* Set the pll addr. to 0x00 */
-	pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLADR, 0);
+	pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
 
 	/* Now write the value in all 64 registers */
 	for (temp = 0; temp <= 0x3f; temp++)
-		pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, pll);
+		pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
 
 	return;
 }
@@ -139,8 +141,8 @@ static unsigned int nforce2_fsb_read(int bootfsb)
 	u32 fsb, temp = 0;
 
 	/* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
-	nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
-						0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL);
+	nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
+				PCI_ANY_ID, PCI_ANY_ID, NULL);
 	if (!nforce2_sub5)
 		return 0;
 
@@ -148,13 +150,13 @@ static unsigned int nforce2_fsb_read(int bootfsb)
 	fsb /= 1000000;
 
 	/* Check if PLL register is already set */
-	pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
+	pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
 
 	if (bootfsb || !temp)
 		return fsb;
 
 	/* Use PLL register FSB value */
-	pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp);
+	pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
 	fsb = nforce2_calc_fsb(temp);
 
 	return fsb;
@@ -174,18 +176,18 @@ static int nforce2_set_fsb(unsigned int fsb)
 	int pll = 0;
 
 	if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
-		printk(KERN_ERR "cpufreq: FSB %d is out of range!\n", fsb);
+		printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
 		return -EINVAL;
 	}
 
 	tfsb = nforce2_fsb_read(0);
 	if (!tfsb) {
-		printk(KERN_ERR "cpufreq: Error while reading the FSB\n");
+		printk(KERN_ERR PFX "Error while reading the FSB\n");
 		return -EINVAL;
 	}
 
 	/* First write? Then set actual value */
-	pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
+	pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
 	if (!temp) {
 		pll = nforce2_calc_pll(tfsb);
 
@@ -197,7 +199,7 @@ static int nforce2_set_fsb(unsigned int fsb)
 
 	/* Enable write access */
 	temp = 0x01;
-	pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8)temp);
+	pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
 
 	diff = tfsb - fsb;
 
@@ -222,7 +224,7 @@ static int nforce2_set_fsb(unsigned int fsb)
 	}
 
 	temp = 0x40;
-	pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLADR, (u8)temp);
+	pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
 
 	return 0;
 }
@@ -244,7 +246,8 @@ static unsigned int nforce2_get(unsigned int cpu)
  * nforce2_target - set a new CPUFreq policy
  * @policy: new policy
  * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
+ * @relation: how that frequency relates to achieved frequency
+ *  (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
  *
  * Sets a new CPUFreq policy.
  */
@@ -276,7 +279,7 @@ static int nforce2_target(struct cpufreq_policy *policy,
 	/* local_irq_save(flags); */
 
 	if (nforce2_set_fsb(target_fsb) < 0)
-		printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
+		printk(KERN_ERR PFX "Changing FSB to %d failed\n",
 			target_fsb);
 	else
 		dprintk("Changed FSB successfully to %d\n",
@@ -327,8 +330,8 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
 	/* FIX: Get FID from CPU */
 	if (!fid) {
 		if (!cpu_khz) {
-			printk(KERN_WARNING
-			       "cpufreq: cpu_khz not set, can't calculate multiplier!\n");
+			printk(KERN_WARNING PFX
+			"cpu_khz not set, can't calculate multiplier!\n");
 			return -ENODEV;
 		}
 
@@ -343,7 +346,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
 		}
 	}
 
-	printk(KERN_INFO "cpufreq: FSB currently at %i MHz, FID %d.%d\n", fsb,
+	printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
 	       fid / 10, fid % 10);
 
 	/* Set maximum FSB to FSB at boot time */
@@ -392,17 +395,18 @@ static struct cpufreq_driver nforce2_driver = {
  */
 static unsigned int nforce2_detect_chipset(void)
 {
-	nforce2_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
+	nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
 					PCI_DEVICE_ID_NVIDIA_NFORCE2,
 					PCI_ANY_ID, PCI_ANY_ID, NULL);
 
-	if (nforce2_chipset_dev == NULL)
+	if (nforce2_dev == NULL)
 		return -ENODEV;
 
-	printk(KERN_INFO "cpufreq: Detected nForce2 chipset revision %X\n",
-	       nforce2_chipset_dev->revision);
-	printk(KERN_INFO
-	       "cpufreq: FSB changing is maybe unstable and can lead to crashes and data loss.\n");
+	printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
+	       nforce2_dev->revision);
+	printk(KERN_INFO PFX
+	       "FSB changing is maybe unstable and can lead to "
+	       "crashes and data loss.\n");
 
 	return 0;
 }
@@ -420,7 +424,7 @@ static int __init nforce2_init(void)
 
 	/* detect chipset */
 	if (nforce2_detect_chipset()) {
-		printk(KERN_ERR "cpufreq: No nForce2 chipset.\n");
+		printk(KERN_INFO PFX "No nForce2 chipset.\n");
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
index 41ab3f064cb..35a257dd4bb 100644
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@ -12,12 +12,12 @@
 #include <linux/cpufreq.h>
 #include <linux/ioport.h>
 #include <linux/slab.h>
+#include <linux/timex.h>
+#include <linux/io.h>
+#include <linux/delay.h>
 
 #include <asm/msr.h>
 #include <asm/tsc.h>
-#include <asm/timex.h>
-#include <asm/io.h>
-#include <asm/delay.h>
 
 #define EPS_BRAND_C7M	0
 #define EPS_BRAND_C7	1
@@ -184,7 +184,7 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
 		break;
 	}
 
-	switch(brand) {
+	switch (brand) {
 	case EPS_BRAND_C7M:
 		printk(KERN_CONT "C7-M\n");
 		break;
@@ -218,17 +218,20 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
 	/* Print voltage and multiplier */
 	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
 	current_voltage = lo & 0xff;
-	printk(KERN_INFO "eps: Current voltage = %dmV\n", current_voltage * 16 + 700);
+	printk(KERN_INFO "eps: Current voltage = %dmV\n",
+			current_voltage * 16 + 700);
 	current_multiplier = (lo >> 8) & 0xff;
 	printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
 
 	/* Print limits */
 	max_voltage = hi & 0xff;
-	printk(KERN_INFO "eps: Highest voltage = %dmV\n", max_voltage * 16 + 700);
+	printk(KERN_INFO "eps: Highest voltage = %dmV\n",
+			max_voltage * 16 + 700);
 	max_multiplier = (hi >> 8) & 0xff;
 	printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
 	min_voltage = (hi >> 16) & 0xff;
-	printk(KERN_INFO "eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700);
+	printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
+			min_voltage * 16 + 700);
 	min_multiplier = (hi >> 24) & 0xff;
 	printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
 
@@ -318,7 +321,7 @@ static int eps_cpu_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static struct freq_attr* eps_attr[] = {
+static struct freq_attr *eps_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -356,7 +359,7 @@ static void __exit eps_exit(void)
 	cpufreq_unregister_driver(&eps_driver);
 }
 
-MODULE_AUTHOR("Rafa� Bilski <rafalbilski@interia.pl>");
+MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
 MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
 MODULE_LICENSE("GPL");
 
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index fe613c93b36..006b278b0d5 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -184,7 +184,8 @@ static int elanfreq_target(struct cpufreq_policy *policy,
 {
 	unsigned int newstate = 0;
 
-	if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], target_freq, relation, &newstate))
+	if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
+				target_freq, relation, &newstate))
 		return -EINVAL;
 
 	elanfreq_set_cpu_state(newstate);
@@ -301,7 +302,8 @@ static void __exit elanfreq_exit(void)
 module_param(max_freq, int, 0444);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
+MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
+		"Sven Geggus <sven@geggus.net>");
 MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
 
 module_init(elanfreq_init);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 9d9eae82e60..ac27ec2264d 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -79,8 +79,9 @@
 #include <linux/smp.h>
 #include <linux/cpufreq.h>
 #include <linux/pci.h>
+#include <linux/errno.h>
+
 #include <asm/processor-cyrix.h>
-#include <asm/errno.h>
 
 /* PCI config registers, all at F0 */
 #define PCI_PMER1	0x80	/* power management enable register 1 */
@@ -122,8 +123,8 @@ static struct gxfreq_params *gx_params;
 static int stock_freq;
 
 /* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
-static int pci_busclk = 0;
-module_param (pci_busclk, int, 0444);
+static int pci_busclk;
+module_param(pci_busclk, int, 0444);
 
 /* maximum duration for which the cpu may be suspended
  * (32us * MAX_DURATION). If no parameter is given, this defaults
@@ -132,7 +133,7 @@ module_param (pci_busclk, int, 0444);
  * is suspended -- processing power is just 0.39% of what it used to be,
  * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
 static int max_duration = 255;
-module_param (max_duration, int, 0444);
+module_param(max_duration, int, 0444);
 
 /* For the default policy, we want at least some processing power
  * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
@@ -140,7 +141,8 @@ module_param (max_duration, int, 0444);
 #define POLICY_MIN_DIV 20
 
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "gx-suspmod", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"gx-suspmod", msg)
 
 /**
  * we can detect a core multipiler from dir0_lsb
@@ -166,12 +168,20 @@ static int gx_freq_mult[16] = {
  *	Low Level chipset interface				*
  ****************************************************************/
 static struct pci_device_id gx_chipset_tbl[] __initdata = {
-	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, PCI_ANY_ID, PCI_ANY_ID },
-	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, PCI_ANY_ID, PCI_ANY_ID },
-	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, PCI_ANY_ID, PCI_ANY_ID },
+	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
+		PCI_ANY_ID, PCI_ANY_ID },
+	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520,
+		PCI_ANY_ID, PCI_ANY_ID },
+	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
+		PCI_ANY_ID, PCI_ANY_ID },
 	{ 0, },
 };
 
+static void gx_write_byte(int reg, int value)
+{
+	pci_write_config_byte(gx_params->cs55x0, reg, value);
+}
+
 /**
  * gx_detect_chipset:
  *
@@ -200,7 +210,8 @@ static __init struct pci_dev *gx_detect_chipset(void)
 /**
  * gx_get_cpuspeed:
  *
- * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi Geode CPU runs.
+ * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi
+ * Geode CPU runs.
  */
 static unsigned int gx_get_cpuspeed(unsigned int cpu)
 {
@@ -217,17 +228,18 @@ static unsigned int gx_get_cpuspeed(unsigned int cpu)
  *
  **/
 
-static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, u8 *off_duration)
+static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
+		u8 *off_duration)
 {
 	unsigned int i;
 	u8 tmp_on, tmp_off;
 	int old_tmp_freq = stock_freq;
 	int tmp_freq;
 
-	*off_duration=1;
-	*on_duration=0;
+	*off_duration = 1;
+	*on_duration = 0;
 
-	for (i=max_duration; i>0; i--) {
+	for (i = max_duration; i > 0; i--) {
 		tmp_off = ((khz * i) / stock_freq) & 0xff;
 		tmp_on = i - tmp_off;
 		tmp_freq = (stock_freq * tmp_off) / i;
@@ -259,26 +271,34 @@ static void gx_set_cpuspeed(unsigned int khz)
 	freqs.cpu = 0;
 	freqs.old = gx_get_cpuspeed(0);
 
-	new_khz = gx_validate_speed(khz, &gx_params->on_duration, &gx_params->off_duration);
+	new_khz = gx_validate_speed(khz, &gx_params->on_duration,
+			&gx_params->off_duration);
 
 	freqs.new = new_khz;
 
 	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	local_irq_save(flags);
 
-	if (new_khz != stock_freq) {  /* if new khz == 100% of CPU speed, it is special case */
+
+
+	if (new_khz != stock_freq) {
+		/* if new khz == 100% of CPU speed, it is special case */
 		switch (gx_params->cs55x0->device) {
 		case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
 			pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
 			/* FIXME: need to test other values -- Zwane,Miura */
-			pci_write_config_byte(gx_params->cs55x0, PCI_IRQTC, 4); /* typical 2 to 4ms */
-			pci_write_config_byte(gx_params->cs55x0, PCI_VIDTC, 100);/* typical 50 to 100ms */
-			pci_write_config_byte(gx_params->cs55x0, PCI_PMER1, pmer1);
-
-			if (gx_params->cs55x0->revision < 0x10) {   /* CS5530(rev 1.2, 1.3) */
-				suscfg = gx_params->pci_suscfg | SUSMOD;
-			} else {                           /* CS5530A,B.. */
-				suscfg = gx_params->pci_suscfg | SUSMOD | PWRSVE;
+			/* typical 2 to 4ms */
+			gx_write_byte(PCI_IRQTC, 4);
+			/* typical 50 to 100ms */
+			gx_write_byte(PCI_VIDTC, 100);
+			gx_write_byte(PCI_PMER1, pmer1);
+
+			if (gx_params->cs55x0->revision < 0x10) {
+				/* CS5530(rev 1.2, 1.3) */
+				suscfg = gx_params->pci_suscfg|SUSMOD;
+			} else {
+				/* CS5530A,B.. */
+				suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
 			}
 			break;
 		case PCI_DEVICE_ID_CYRIX_5520:
@@ -294,13 +314,13 @@ static void gx_set_cpuspeed(unsigned int khz)
 		suscfg = gx_params->pci_suscfg & ~(SUSMOD);
 		gx_params->off_duration = 0;
 		gx_params->on_duration = 0;
-		dprintk("suspend modulation disabled: cpu runs 100 percent speed.\n");
+		dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
 	}
 
-	pci_write_config_byte(gx_params->cs55x0, PCI_MODOFF, gx_params->off_duration);
-	pci_write_config_byte(gx_params->cs55x0, PCI_MODON, gx_params->on_duration);
+	gx_write_byte(PCI_MODOFF, gx_params->off_duration);
+	gx_write_byte(PCI_MODON, gx_params->on_duration);
 
-	pci_write_config_byte(gx_params->cs55x0, PCI_SUSCFG, suscfg);
+	gx_write_byte(PCI_SUSCFG, suscfg);
 	pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
 
 	local_irq_restore(flags);
@@ -334,7 +354,8 @@ static int cpufreq_gx_verify(struct cpufreq_policy *policy)
 		return -EINVAL;
 
 	policy->cpu = 0;
-	cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq);
+	cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
+			stock_freq);
 
 	/* it needs to be assured that at least one supported frequency is
 	 * within policy->min and policy->max. If it is not, policy->max
@@ -354,7 +375,8 @@ static int cpufreq_gx_verify(struct cpufreq_policy *policy)
 	policy->max = tmp_freq;
 	if (policy->max < policy->min)
 		policy->max = policy->min;
-	cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq);
+	cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
+			stock_freq);
 
 	return 0;
 }
@@ -398,18 +420,18 @@ static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
 		return -ENODEV;
 
 	/* determine maximum frequency */
-	if (pci_busclk) {
+	if (pci_busclk)
 		maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
-	} else if (cpu_khz) {
+	else if (cpu_khz)
 		maxfreq = cpu_khz;
-	} else {
+	else
 		maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
-	}
+
 	stock_freq = maxfreq;
 	curfreq = gx_get_cpuspeed(0);
 
 	dprintk("cpu max frequency is %d.\n", maxfreq);
-	dprintk("cpu current frequency is %dkHz.\n",curfreq);
+	dprintk("cpu current frequency is %dkHz.\n", curfreq);
 
 	/* setup basic struct for cpufreq API */
 	policy->cpu = 0;
@@ -447,7 +469,8 @@ static int __init cpufreq_gx_init(void)
 	struct pci_dev *gx_pci;
 
 	/* Test if we have the right hardware */
-	if ((gx_pci = gx_detect_chipset()) == NULL)
+	gx_pci = gx_detect_chipset();
+	if (gx_pci == NULL)
 		return -ENODEV;
 
 	/* check whether module parameters are sane */
@@ -468,9 +491,11 @@ static int __init cpufreq_gx_init(void)
 	pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
 	pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
 	pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
-	pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration));
+	pci_read_config_byte(params->cs55x0, PCI_MODOFF,
+			&(params->off_duration));
 
-	if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) {
+	ret = cpufreq_register_driver(&gx_suspmod_driver);
+	if (ret) {
 		kfree(params);
 		return ret;                   /* register error! */
 	}
@@ -485,9 +510,9 @@ static void __exit cpufreq_gx_exit(void)
 	kfree(gx_params);
 }
 
-MODULE_AUTHOR ("Hiroshi Miura <miura@da-cha.org>");
-MODULE_DESCRIPTION ("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
+MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
+MODULE_LICENSE("GPL");
 
 module_init(cpufreq_gx_init);
 module_exit(cpufreq_gx_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index a4cff5d6e38..0bd48e65a0c 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -30,12 +30,12 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/delay.h>
+#include <linux/timex.h>
+#include <linux/io.h>
+#include <linux/acpi.h>
+#include <linux/kernel.h>
 
 #include <asm/msr.h>
-#include <asm/timex.h>
-#include <asm/io.h>
-#include <asm/acpi.h>
-#include <linux/acpi.h>
 #include <acpi/processor.h>
 
 #include "longhaul.h"
@@ -58,7 +58,7 @@
 #define USE_NORTHBRIDGE		(1 << 2)
 
 static int cpu_model;
-static unsigned int numscales=16;
+static unsigned int numscales = 16;
 static unsigned int fsb;
 
 static const struct mV_pos *vrm_mV_table;
@@ -67,8 +67,8 @@ static const unsigned char *mV_vrm_table;
 static unsigned int highest_speed, lowest_speed; /* kHz */
 static unsigned int minmult, maxmult;
 static int can_scale_voltage;
-static struct acpi_processor *pr = NULL;
-static struct acpi_processor_cx *cx = NULL;
+static struct acpi_processor *pr;
+static struct acpi_processor_cx *cx;
 static u32 acpi_regs_addr;
 static u8 longhaul_flags;
 static unsigned int longhaul_index;
@@ -78,12 +78,13 @@ static int scale_voltage;
 static int disable_acpi_c3;
 static int revid_errata;
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"longhaul", msg)
 
 
 /* Clock ratios multiplied by 10 */
-static int clock_ratio[32];
-static int eblcr_table[32];
+static int mults[32];
+static int eblcr[32];
 static int longhaul_version;
 static struct cpufreq_frequency_table *longhaul_table;
 
@@ -93,7 +94,7 @@ static char speedbuffer[8];
 static char *print_speed(int speed)
 {
 	if (speed < 1000) {
-		snprintf(speedbuffer, sizeof(speedbuffer),"%dMHz", speed);
+		snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
 		return speedbuffer;
 	}
 
@@ -122,27 +123,28 @@ static unsigned int calc_speed(int mult)
 
 static int longhaul_get_cpu_mult(void)
 {
-	unsigned long invalue=0,lo, hi;
+	unsigned long invalue = 0, lo, hi;
 
-	rdmsr (MSR_IA32_EBL_CR_POWERON, lo, hi);
-	invalue = (lo & (1<<22|1<<23|1<<24|1<<25)) >>22;
-	if (longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) {
+	rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
+	invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
+	if (longhaul_version == TYPE_LONGHAUL_V2 ||
+	    longhaul_version == TYPE_POWERSAVER) {
 		if (lo & (1<<27))
-			invalue+=16;
+			invalue += 16;
 	}
-	return eblcr_table[invalue];
+	return eblcr[invalue];
 }
 
 /* For processor with BCR2 MSR */
 
-static void do_longhaul1(unsigned int clock_ratio_index)
+static void do_longhaul1(unsigned int mults_index)
 {
 	union msr_bcr2 bcr2;
 
 	rdmsrl(MSR_VIA_BCR2, bcr2.val);
 	/* Enable software clock multiplier */
 	bcr2.bits.ESOFTBF = 1;
-	bcr2.bits.CLOCKMUL = clock_ratio_index & 0xff;
+	bcr2.bits.CLOCKMUL = mults_index & 0xff;
 
 	/* Sync to timer tick */
 	safe_halt();
@@ -161,7 +163,7 @@ static void do_longhaul1(unsigned int clock_ratio_index)
 
 /* For processor with Longhaul MSR */
 
-static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
+static void do_powersaver(int cx_address, unsigned int mults_index,
 			  unsigned int dir)
 {
 	union msr_longhaul longhaul;
@@ -173,11 +175,11 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
 		longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
 	else
 		longhaul.bits.RevisionKey = 0;
-	longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf;
-	longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
+	longhaul.bits.SoftBusRatio = mults_index & 0xf;
+	longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
 	/* Setup new voltage */
 	if (can_scale_voltage)
-		longhaul.bits.SoftVID = (clock_ratio_index >> 8) & 0x1f;
+		longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
 	/* Sync to timer tick */
 	safe_halt();
 	/* Raise voltage if necessary */
@@ -240,14 +242,14 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
 
 /**
  * longhaul_set_cpu_frequency()
- * @clock_ratio_index : bitpattern of the new multiplier.
+ * @mults_index : bitpattern of the new multiplier.
  *
  * Sets a new clock ratio.
  */
 
 static void longhaul_setstate(unsigned int table_index)
 {
-	unsigned int clock_ratio_index;
+	unsigned int mults_index;
 	int speed, mult;
 	struct cpufreq_freqs freqs;
 	unsigned long flags;
@@ -256,9 +258,9 @@ static void longhaul_setstate(unsigned int table_index)
 	u32 bm_timeout = 1000;
 	unsigned int dir = 0;
 
-	clock_ratio_index = longhaul_table[table_index].index;
+	mults_index = longhaul_table[table_index].index;
 	/* Safety precautions */
-	mult = clock_ratio[clock_ratio_index & 0x1f];
+	mult = mults[mults_index & 0x1f];
 	if (mult == -1)
 		return;
 	speed = calc_speed(mult);
@@ -274,7 +276,7 @@ static void longhaul_setstate(unsigned int table_index)
 
 	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 
-	dprintk ("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
+	dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
 			fsb, mult/10, mult%10, print_speed(speed/1000));
 retry_loop:
 	preempt_disable();
@@ -282,8 +284,8 @@ retry_loop:
 
 	pic2_mask = inb(0xA1);
 	pic1_mask = inb(0x21);	/* works on C3. save mask. */
-	outb(0xFF,0xA1);	/* Overkill */
-	outb(0xFE,0x21);	/* TMR0 only */
+	outb(0xFF, 0xA1);	/* Overkill */
+	outb(0xFE, 0x21);	/* TMR0 only */
 
 	/* Wait while PCI bus is busy. */
 	if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
@@ -303,7 +305,7 @@ retry_loop:
 		outb(3, 0x22);
 	} else if ((pr != NULL) && pr->flags.bm_control) {
 		/* Disable bus master arbitration */
-		acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
+		acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
 	}
 	switch (longhaul_version) {
 
@@ -312,7 +314,7 @@ retry_loop:
 	 * Software controlled multipliers only.
 	 */
 	case TYPE_LONGHAUL_V1:
-		do_longhaul1(clock_ratio_index);
+		do_longhaul1(mults_index);
 		break;
 
 	/*
@@ -326,10 +328,10 @@ retry_loop:
 	case TYPE_POWERSAVER:
 		if (longhaul_flags & USE_ACPI_C3) {
 			/* Don't allow wakeup */
-			acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
-			do_powersaver(cx->address, clock_ratio_index, dir);
+			acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
+			do_powersaver(cx->address, mults_index, dir);
 		} else {
-			do_powersaver(0, clock_ratio_index, dir);
+			do_powersaver(0, mults_index, dir);
 		}
 		break;
 	}
@@ -339,10 +341,10 @@ retry_loop:
 		outb(0, 0x22);
 	} else if ((pr != NULL) && pr->flags.bm_control) {
 		/* Enable bus master arbitration */
-		acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+		acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
 	}
-	outb(pic2_mask,0xA1);	/* restore mask */
-	outb(pic1_mask,0x21);
+	outb(pic2_mask, 0xA1);	/* restore mask */
+	outb(pic1_mask, 0x21);
 
 	local_irq_restore(flags);
 	preempt_enable();
@@ -392,7 +394,8 @@ retry_loop:
 	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 
 	if (!bm_timeout)
-		printk(KERN_INFO PFX "Warning: Timeout while waiting for idle PCI bus.\n");
+		printk(KERN_INFO PFX "Warning: Timeout while waiting for "
+				"idle PCI bus.\n");
 }
 
 /*
@@ -458,31 +461,32 @@ static int __init longhaul_get_ranges(void)
 		break;
 	}
 
-	dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n",
+	dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
 		 minmult/10, minmult%10, maxmult/10, maxmult%10);
 
 	highest_speed = calc_speed(maxmult);
 	lowest_speed = calc_speed(minmult);
-	dprintk ("FSB:%dMHz  Lowest speed: %s   Highest speed:%s\n", fsb,
+	dprintk("FSB:%dMHz  Lowest speed: %s   Highest speed:%s\n", fsb,
 		 print_speed(lowest_speed/1000),
 		 print_speed(highest_speed/1000));
 
 	if (lowest_speed == highest_speed) {
-		printk (KERN_INFO PFX "highestspeed == lowest, aborting.\n");
+		printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
 		return -EINVAL;
 	}
 	if (lowest_speed > highest_speed) {
-		printk (KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
+		printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
 			lowest_speed, highest_speed);
 		return -EINVAL;
 	}
 
-	longhaul_table = kmalloc((numscales + 1) * sizeof(struct cpufreq_frequency_table), GFP_KERNEL);
-	if(!longhaul_table)
+	longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
+			GFP_KERNEL);
+	if (!longhaul_table)
 		return -ENOMEM;
 
 	for (j = 0; j < numscales; j++) {
-		ratio = clock_ratio[j];
+		ratio = mults[j];
 		if (ratio == -1)
 			continue;
 		if (ratio > maxmult || ratio < minmult)
@@ -507,13 +511,10 @@ static int __init longhaul_get_ranges(void)
 			}
 		}
 		if (min_i != j) {
-			unsigned int temp;
-			temp = longhaul_table[j].frequency;
-			longhaul_table[j].frequency = longhaul_table[min_i].frequency;
-			longhaul_table[min_i].frequency = temp;
-			temp = longhaul_table[j].index;
-			longhaul_table[j].index = longhaul_table[min_i].index;
-			longhaul_table[min_i].index = temp;
+			swap(longhaul_table[j].frequency,
+			     longhaul_table[min_i].frequency);
+			swap(longhaul_table[j].index,
+			     longhaul_table[min_i].index);
 		}
 	}
 
@@ -521,7 +522,7 @@ static int __init longhaul_get_ranges(void)
 
 	/* Find index we are running on */
 	for (j = 0; j < k; j++) {
-		if (clock_ratio[longhaul_table[j].index & 0x1f] == mult) {
+		if (mults[longhaul_table[j].index & 0x1f] == mult) {
 			longhaul_index = j;
 			break;
 		}
@@ -559,20 +560,22 @@ static void __init longhaul_setup_voltagescaling(void)
 	maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
 
 	if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
-		printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
+		printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
 					"Voltage scaling disabled.\n",
-					minvid.mV/1000, minvid.mV%1000, maxvid.mV/1000, maxvid.mV%1000);
+					minvid.mV/1000, minvid.mV%1000,
+					maxvid.mV/1000, maxvid.mV%1000);
 		return;
 	}
 
 	if (minvid.mV == maxvid.mV) {
-		printk (KERN_INFO PFX "Claims to support voltage scaling but min & max are "
-				"both %d.%03d. Voltage scaling disabled\n",
+		printk(KERN_INFO PFX "Claims to support voltage scaling but "
+				"min & max are both %d.%03d. "
+				"Voltage scaling disabled\n",
 				maxvid.mV/1000, maxvid.mV%1000);
 		return;
 	}
 
-	/* How many voltage steps */
+	/* How many voltage steps*/
 	numvscales = maxvid.pos - minvid.pos + 1;
 	printk(KERN_INFO PFX
 		"Max VID=%d.%03d  "
@@ -586,7 +589,7 @@ static void __init longhaul_setup_voltagescaling(void)
 	j = longhaul.bits.MinMHzBR;
 	if (longhaul.bits.MinMHzBR4)
 		j += 16;
-	min_vid_speed = eblcr_table[j];
+	min_vid_speed = eblcr[j];
 	if (min_vid_speed == -1)
 		return;
 	switch (longhaul.bits.MinMHzFSB) {
@@ -617,7 +620,8 @@ static void __init longhaul_setup_voltagescaling(void)
 			pos = minvid.pos;
 		longhaul_table[j].index |= mV_vrm_table[pos] << 8;
 		vid = vrm_mV_table[mV_vrm_table[pos]];
-		printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", speed, j, vid.mV);
+		printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
+				speed, j, vid.mV);
 		j++;
 	}
 
@@ -640,7 +644,8 @@ static int longhaul_target(struct cpufreq_policy *policy,
 	unsigned int dir = 0;
 	u8 vid, current_vid;
 
-	if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, relation, &table_index))
+	if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
+				relation, &table_index))
 		return -EINVAL;
 
 	/* Don't set same frequency again */
@@ -656,7 +661,8 @@ static int longhaul_target(struct cpufreq_policy *policy,
 		 * this in hardware, C3 is old and we need to do this
 		 * in software. */
 		i = longhaul_index;
-		current_vid = (longhaul_table[longhaul_index].index >> 8) & 0x1f;
+		current_vid = (longhaul_table[longhaul_index].index >> 8);
+		current_vid &= 0x1f;
 		if (table_index > longhaul_index)
 			dir = 1;
 		while (i != table_index) {
@@ -691,9 +697,9 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
 {
 	struct acpi_device *d;
 
-	if ( acpi_bus_get_device(obj_handle, &d) ) {
+	if (acpi_bus_get_device(obj_handle, &d))
 		return 0;
-	}
+
 	*return_value = acpi_driver_data(d);
 	return 1;
 }
@@ -750,7 +756,7 @@ static int longhaul_setup_southbridge(void)
 	/* Find VT8235 southbridge */
 	dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
 	if (dev == NULL)
-	/* Find VT8237 southbridge */
+		/* Find VT8237 southbridge */
 		dev = pci_get_device(PCI_VENDOR_ID_VIA,
 				     PCI_DEVICE_ID_VIA_8237, NULL);
 	if (dev != NULL) {
@@ -769,7 +775,8 @@ static int longhaul_setup_southbridge(void)
 		if (pci_cmd & 1 << 7) {
 			pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
 			acpi_regs_addr &= 0xff00;
-			printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", acpi_regs_addr);
+			printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
+					acpi_regs_addr);
 		}
 
 		pci_dev_put(dev);
@@ -781,7 +788,7 @@ static int longhaul_setup_southbridge(void)
 static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 {
 	struct cpuinfo_x86 *c = &cpu_data(0);
-	char *cpuname=NULL;
+	char *cpuname = NULL;
 	int ret;
 	u32 lo, hi;
 
@@ -791,8 +798,8 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 		cpu_model = CPU_SAMUEL;
 		cpuname = "C3 'Samuel' [C5A]";
 		longhaul_version = TYPE_LONGHAUL_V1;
-		memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio));
-		memcpy (eblcr_table, samuel1_eblcr, sizeof(samuel1_eblcr));
+		memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
+		memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
 		break;
 
 	case 7:
@@ -803,10 +810,8 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 			cpuname = "C3 'Samuel 2' [C5B]";
 			/* Note, this is not a typo, early Samuel2's had
 			 * Samuel1 ratios. */
-			memcpy(clock_ratio, samuel1_clock_ratio,
-				sizeof(samuel1_clock_ratio));
-			memcpy(eblcr_table, samuel2_eblcr,
-				sizeof(samuel2_eblcr));
+			memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
+			memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
 			break;
 		case 1 ... 15:
 			longhaul_version = TYPE_LONGHAUL_V1;
@@ -817,10 +822,8 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 				cpu_model = CPU_EZRA;
 				cpuname = "C3 'Ezra' [C5C]";
 			}
-			memcpy(clock_ratio, ezra_clock_ratio,
-				sizeof(ezra_clock_ratio));
-			memcpy(eblcr_table, ezra_eblcr,
-				sizeof(ezra_eblcr));
+			memcpy(mults, ezra_mults, sizeof(ezra_mults));
+			memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
 			break;
 		}
 		break;
@@ -829,18 +832,16 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 		cpu_model = CPU_EZRA_T;
 		cpuname = "C3 'Ezra-T' [C5M]";
 		longhaul_version = TYPE_POWERSAVER;
-		numscales=32;
-		memcpy (clock_ratio, ezrat_clock_ratio, sizeof(ezrat_clock_ratio));
-		memcpy (eblcr_table, ezrat_eblcr, sizeof(ezrat_eblcr));
+		numscales = 32;
+		memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
+		memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
 		break;
 
 	case 9:
 		longhaul_version = TYPE_POWERSAVER;
 		numscales = 32;
-		memcpy(clock_ratio,
-		       nehemiah_clock_ratio,
-		       sizeof(nehemiah_clock_ratio));
-		memcpy(eblcr_table, nehemiah_eblcr, sizeof(nehemiah_eblcr));
+		memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
+		memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
 		switch (c->x86_mask) {
 		case 0 ... 1:
 			cpu_model = CPU_NEHEMIAH;
@@ -869,14 +870,14 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
 			longhaul_version = TYPE_LONGHAUL_V1;
 	}
 
-	printk (KERN_INFO PFX "VIA %s CPU detected.  ", cpuname);
+	printk(KERN_INFO PFX "VIA %s CPU detected.  ", cpuname);
 	switch (longhaul_version) {
 	case TYPE_LONGHAUL_V1:
 	case TYPE_LONGHAUL_V2:
-		printk ("Longhaul v%d supported.\n", longhaul_version);
+		printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
 		break;
 	case TYPE_POWERSAVER:
-		printk ("Powersaver supported.\n");
+		printk(KERN_CONT "Powersaver supported.\n");
 		break;
 	};
 
@@ -940,7 +941,7 @@ static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static struct freq_attr* longhaul_attr[] = {
+static struct freq_attr *longhaul_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -966,13 +967,15 @@ static int __init longhaul_init(void)
 
 #ifdef CONFIG_SMP
 	if (num_online_cpus() > 1) {
-		printk(KERN_ERR PFX "More than 1 CPU detected, longhaul disabled.\n");
+		printk(KERN_ERR PFX "More than 1 CPU detected, "
+				"longhaul disabled.\n");
 		return -ENODEV;
 	}
 #endif
 #ifdef CONFIG_X86_IO_APIC
 	if (cpu_has_apic) {
-		printk(KERN_ERR PFX "APIC detected. Longhaul is currently broken in this configuration.\n");
+		printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
+				"broken in this configuration.\n");
 		return -ENODEV;
 	}
 #endif
@@ -993,8 +996,8 @@ static void __exit longhaul_exit(void)
 {
 	int i;
 
-	for (i=0; i < numscales; i++) {
-		if (clock_ratio[i] == maxmult) {
+	for (i = 0; i < numscales; i++) {
+		if (mults[i] == maxmult) {
 			longhaul_setstate(i);
 			break;
 		}
@@ -1007,11 +1010,11 @@ static void __exit longhaul_exit(void)
 /* Even if BIOS is exporting ACPI C3 state, and it is used
  * with success when CPU is idle, this state doesn't
  * trigger frequency transition in some cases. */
-module_param (disable_acpi_c3, int, 0644);
+module_param(disable_acpi_c3, int, 0644);
 MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
 /* Change CPU voltage with frequency. Very usefull to save
  * power, but most VIA C3 processors aren't supporting it. */
-module_param (scale_voltage, int, 0644);
+module_param(scale_voltage, int, 0644);
 MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
 /* Force revision key to 0 for processors which doesn't
  * support voltage scaling, but are introducing itself as
@@ -1019,9 +1022,9 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
 module_param(revid_errata, int, 0644);
 MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
 
-MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
+MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
+MODULE_LICENSE("GPL");
 
 late_initcall(longhaul_init);
 module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index 4fcc320997d..e2360a469f7 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -49,14 +49,14 @@ union msr_longhaul {
 
 /*
  * Clock ratio tables. Div/Mod by 10 to get ratio.
- * The eblcr ones specify the ratio read from the CPU.
- * The clock_ratio ones specify what to write to the CPU.
+ * The eblcr values specify the ratio read from the CPU.
+ * The mults values specify what to write to the CPU.
  */
 
 /*
  * VIA C3 Samuel 1  & Samuel 2 (stepping 0)
  */
-static const int __initdata samuel1_clock_ratio[16] = {
+static const int __initdata samuel1_mults[16] = {
 	-1, /* 0000 -> RESERVED */
 	30, /* 0001 ->  3.0x */
 	40, /* 0010 ->  4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
 /*
  * VIA C3 Ezra
  */
-static const int __initdata ezra_clock_ratio[16] = {
+static const int __initdata ezra_mults[16] = {
 	100, /* 0000 -> 10.0x */
 	30,  /* 0001 ->  3.0x */
 	40,  /* 0010 ->  4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
 /*
  * VIA C3 (Ezra-T) [C5M].
  */
-static const int __initdata ezrat_clock_ratio[32] = {
+static const int __initdata ezrat_mults[32] = {
 	100, /* 0000 -> 10.0x */
 	30,  /* 0001 ->  3.0x */
 	40,  /* 0010 ->  4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
 /*
  * VIA C3 Nehemiah */
 
-static const int __initdata  nehemiah_clock_ratio[32] = {
+static const int __initdata  nehemiah_mults[32] = {
 	100, /* 0000 -> 10.0x */
 	-1, /* 0001 -> 16.0x */
 	40,  /* 0010 ->  4.0x */
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index 777a7ff075d..da5f70fcb76 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -11,12 +11,13 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/cpufreq.h>
+#include <linux/timex.h>
 
 #include <asm/msr.h>
 #include <asm/processor.h>
-#include <asm/timex.h>
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longrun", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"longrun", msg)
 
 static struct cpufreq_driver	longrun_driver;
 
@@ -51,7 +52,7 @@ static void __init longrun_get_policy(struct cpufreq_policy *policy)
 	msr_lo &= 0x0000007F;
 	msr_hi &= 0x0000007F;
 
-	if ( longrun_high_freq <= longrun_low_freq ) {
+	if (longrun_high_freq <= longrun_low_freq) {
 		/* Assume degenerate Longrun table */
 		policy->min = policy->max = longrun_high_freq;
 	} else {
@@ -79,7 +80,7 @@ static int longrun_set_policy(struct cpufreq_policy *policy)
 	if (!policy)
 		return -EINVAL;
 
-	if ( longrun_high_freq <= longrun_low_freq ) {
+	if (longrun_high_freq <= longrun_low_freq) {
 		/* Assume degenerate Longrun table */
 		pctg_lo = pctg_hi = 100;
 	} else {
@@ -152,7 +153,7 @@ static unsigned int longrun_get(unsigned int cpu)
 	cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
 	dprintk("cpuid eax is %u\n", eax);
 
-	return (eax * 1000);
+	return eax * 1000;
 }
 
 /**
@@ -196,7 +197,8 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
 		rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
 		*high_freq = msr_lo * 1000; /* to kHz */
 
-		dprintk("longrun table interface told %u - %u kHz\n", *low_freq, *high_freq);
+		dprintk("longrun table interface told %u - %u kHz\n",
+				*low_freq, *high_freq);
 
 		if (*low_freq > *high_freq)
 			*low_freq = *high_freq;
@@ -219,7 +221,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
 	cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
 	/* try decreasing in 10% steps, some processors react only
 	 * on some barrier values */
-	for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -=10) {
+	for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
 		/* set to 0 to try_hi perf_pctg */
 		msr_lo &= 0xFFFFFF80;
 		msr_hi &= 0xFFFFFF80;
@@ -236,7 +238,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
 
 	/* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
 	 * eqals
-	 * low_freq * ( 1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
+	 * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
 	 *
 	 * high_freq * perf_pctg is stored tempoarily into "ebx".
 	 */
@@ -317,9 +319,10 @@ static void __exit longrun_exit(void)
 }
 
 
-MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION ("LongRun driver for Transmeta Crusoe and Efficeon processors.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
+MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
+		"Efficeon processors.");
+MODULE_LICENSE("GPL");
 
 module_init(longrun_init);
 module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index b585e04cbc9..6ac55bd341a 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -27,15 +27,17 @@
 #include <linux/cpufreq.h>
 #include <linux/slab.h>
 #include <linux/cpumask.h>
+#include <linux/timex.h>
 
 #include <asm/processor.h>
 #include <asm/msr.h>
-#include <asm/timex.h>
+#include <asm/timer.h>
 
 #include "speedstep-lib.h"
 
 #define PFX	"p4-clockmod: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "p4-clockmod", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"p4-clockmod", msg)
 
 /*
  * Duty Cycle (3bits), note DC_DISABLE is not specified in
@@ -58,7 +60,8 @@ static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
 {
 	u32 l, h;
 
-	if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV))
+	if (!cpu_online(cpu) ||
+	    (newstate > DC_DISABLE) || (newstate == DC_RESV))
 		return -EINVAL;
 
 	rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
@@ -66,7 +69,8 @@ static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
 	if (l & 0x01)
 		dprintk("CPU#%d currently thermal throttled\n", cpu);
 
-	if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT))
+	if (has_N44_O17_errata[cpu] &&
+	    (newstate == DC_25PT || newstate == DC_DFLT))
 		newstate = DC_38PT;
 
 	rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
@@ -112,7 +116,8 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
 	struct cpufreq_freqs freqs;
 	int i;
 
-	if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], target_freq, relation, &newstate))
+	if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
+				target_freq, relation, &newstate))
 		return -EINVAL;
 
 	freqs.old = cpufreq_p4_get(policy->cpu);
@@ -127,7 +132,8 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
 		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	}
 
-	/* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
+	/* run on each logical CPU,
+	 * see section 13.15.3 of IA32 Intel Architecture Software
 	 * Developer's Manual, Volume 3
 	 */
 	for_each_cpu(i, policy->cpus)
@@ -153,28 +159,30 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
 {
 	if (c->x86 == 0x06) {
 		if (cpu_has(c, X86_FEATURE_EST))
-			printk(KERN_WARNING PFX "Warning: EST-capable CPU detected. "
-			       "The acpi-cpufreq module offers voltage scaling"
-			       " in addition of frequency scaling. You should use "
-			       "that instead of p4-clockmod, if possible.\n");
+			printk(KERN_WARNING PFX "Warning: EST-capable CPU "
+			       "detected. The acpi-cpufreq module offers "
+			       "voltage scaling in addition of frequency "
+			       "scaling. You should use that instead of "
+			       "p4-clockmod, if possible.\n");
 		switch (c->x86_model) {
 		case 0x0E: /* Core */
 		case 0x0F: /* Core Duo */
 		case 0x16: /* Celeron Core */
 			p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
-			return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE);
+			return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
 		case 0x0D: /* Pentium M (Dothan) */
 			p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
 			/* fall through */
 		case 0x09: /* Pentium M (Banias) */
-			return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM);
+			return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
 		}
 	}
 
 	if (c->x86 != 0xF) {
 		if (!cpu_has(c, X86_FEATURE_EST))
-			printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. "
-				"Please send an e-mail to <cpufreq@vger.kernel.org>\n");
+			printk(KERN_WARNING PFX "Unknown CPU. "
+				"Please send an e-mail to "
+				"<cpufreq@vger.kernel.org>\n");
 		return 0;
 	}
 
@@ -182,16 +190,16 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
 	 * throttling is active or not. */
 	p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
 
-	if (speedstep_detect_processor() == SPEEDSTEP_PROCESSOR_P4M) {
+	if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
 		printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
 		       "The speedstep-ich or acpi cpufreq modules offer "
 		       "voltage scaling in addition of frequency scaling. "
 		       "You should use either one instead of p4-clockmod, "
 		       "if possible.\n");
-		return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4M);
+		return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
 	}
 
-	return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4D);
+	return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
 }
 
 
@@ -203,7 +211,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
 	unsigned int i;
 
 #ifdef CONFIG_SMP
-	cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu));
+	cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
 #endif
 
 	/* Errata workaround */
@@ -217,14 +225,20 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
 		dprintk("has errata -- disabling low frequencies\n");
 	}
 
+	if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
+	    c->x86_model < 2) {
+		/* switch to maximum frequency and measure result */
+		cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
+		recalibrate_cpu_khz();
+	}
 	/* get max frequency */
 	stock_freq = cpufreq_p4_get_frequency(c);
 	if (!stock_freq)
 		return -EINVAL;
 
 	/* table init */
-	for (i=1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
-		if ((i<2) && (has_N44_O17_errata[policy->cpu]))
+	for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
+		if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
 			p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
 		else
 			p4clockmod_table[i].frequency = (stock_freq * i)/8;
@@ -232,7 +246,10 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
 	cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
 
 	/* cpuinfo and default policy values */
-	policy->cpuinfo.transition_latency = 1000000; /* assumed */
+
+	/* the transition latency is set to be 1 higher than the maximum
+	 * transition latency of the ondemand governor */
+	policy->cpuinfo.transition_latency = 10000001;
 	policy->cur = stock_freq;
 
 	return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
@@ -258,12 +275,12 @@ static unsigned int cpufreq_p4_get(unsigned int cpu)
 		l = DC_DISABLE;
 
 	if (l != DC_DISABLE)
-		return (stock_freq * l / 8);
+		return stock_freq * l / 8;
 
 	return stock_freq;
 }
 
-static struct freq_attr* p4clockmod_attr[] = {
+static struct freq_attr *p4clockmod_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -277,7 +294,6 @@ static struct cpufreq_driver p4clockmod_driver = {
 	.name		= "p4-clockmod",
 	.owner		= THIS_MODULE,
 	.attr		= p4clockmod_attr,
-	.hide_interface	= 1,
 };
 
 
@@ -299,9 +315,10 @@ static int __init cpufreq_p4_init(void)
 
 	ret = cpufreq_register_driver(&p4clockmod_driver);
 	if (!ret)
-		printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock Modulation available\n");
+		printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
+				"Modulation available\n");
 
-	return (ret);
+	return ret;
 }
 
 
@@ -311,9 +328,9 @@ static void __exit cpufreq_p4_exit(void)
 }
 
 
-MODULE_AUTHOR ("Zwane Mwaikambo <zwane@commfireservices.com>");
-MODULE_DESCRIPTION ("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
+MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
+MODULE_LICENSE("GPL");
 
 late_initcall(cpufreq_p4_init);
 module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index c1ac5790c63..f10dea409f4 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -1,6 +1,7 @@
 /*
  *  This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
- *  (C) 2000-2003  Dave Jones, Arjan van de Ven, Janne Pänkälä, Dominik Brodowski.
+ *  (C) 2000-2003  Dave Jones, Arjan van de Ven, Janne Pänkälä,
+ *                 Dominik Brodowski.
  *
  *  Licensed under the terms of the GNU GPL License version 2.
  *
@@ -13,14 +14,15 @@
 #include <linux/cpufreq.h>
 #include <linux/ioport.h>
 #include <linux/slab.h>
-
-#include <asm/msr.h>
 #include <linux/timex.h>
 #include <linux/io.h>
 
+#include <asm/msr.h>
+
 #define POWERNOW_IOPORT 0xfff0          /* it doesn't matter where, as long
 					   as it is unused */
 
+#define PFX "powernow-k6: "
 static unsigned int                     busfreq;   /* FSB, in 10 kHz */
 static unsigned int                     max_multiplier;
 
@@ -47,8 +49,8 @@ static struct cpufreq_frequency_table clock_ratio[] = {
  */
 static int powernow_k6_get_cpu_multiplier(void)
 {
-	u64             invalue = 0;
-	u32             msrval;
+	u64 invalue = 0;
+	u32 msrval;
 
 	msrval = POWERNOW_IOPORT + 0x1;
 	wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
@@ -68,12 +70,12 @@ static int powernow_k6_get_cpu_multiplier(void)
  */
 static void powernow_k6_set_state(unsigned int best_i)
 {
-	unsigned long           outvalue = 0, invalue = 0;
-	unsigned long           msrval;
-	struct cpufreq_freqs    freqs;
+	unsigned long outvalue = 0, invalue = 0;
+	unsigned long msrval;
+	struct cpufreq_freqs freqs;
 
 	if (clock_ratio[best_i].index > max_multiplier) {
-		printk(KERN_ERR "cpufreq: invalid target frequency\n");
+		printk(KERN_ERR PFX "invalid target frequency\n");
 		return;
 	}
 
@@ -119,7 +121,8 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
  * powernow_k6_setpolicy - sets a new CPUFreq policy
  * @policy: new policy
  * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
+ * @relation: how that frequency relates to achieved frequency
+ *  (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
  *
  * sets a new CPUFreq policy
  */
@@ -127,9 +130,10 @@ static int powernow_k6_target(struct cpufreq_policy *policy,
 			       unsigned int target_freq,
 			       unsigned int relation)
 {
-	unsigned int    newstate = 0;
+	unsigned int newstate = 0;
 
-	if (cpufreq_frequency_table_target(policy, &clock_ratio[0], target_freq, relation, &newstate))
+	if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
+				target_freq, relation, &newstate))
 		return -EINVAL;
 
 	powernow_k6_set_state(newstate);
@@ -140,7 +144,7 @@ static int powernow_k6_target(struct cpufreq_policy *policy,
 
 static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
 {
-	unsigned int i;
+	unsigned int i, f;
 	int result;
 
 	if (policy->cpu != 0)
@@ -152,10 +156,11 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
 
 	/* table init */
 	for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
-		if (clock_ratio[i].index > max_multiplier)
+		f = clock_ratio[i].index;
+		if (f > max_multiplier)
 			clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
 		else
-			clock_ratio[i].frequency = busfreq * clock_ratio[i].index;
+			clock_ratio[i].frequency = busfreq * f;
 	}
 
 	/* cpuinfo and default policy values */
@@ -185,7 +190,9 @@ static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
 
 static unsigned int powernow_k6_get(unsigned int cpu)
 {
-	return busfreq * powernow_k6_get_cpu_multiplier();
+	unsigned int ret;
+	ret = (busfreq * powernow_k6_get_cpu_multiplier());
+	return ret;
 }
 
 static struct freq_attr *powernow_k6_attr[] = {
@@ -221,7 +228,7 @@ static int __init powernow_k6_init(void)
 		return -ENODEV;
 
 	if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
-		printk("cpufreq: PowerNow IOPORT region already used.\n");
+		printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
 		return -EIO;
 	}
 
@@ -246,7 +253,8 @@ static void __exit powernow_k6_exit(void)
 }
 
 
-MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
+MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
+		"Dominik Brodowski <linux@brodo.de>");
 MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
 MODULE_LICENSE("GPL");
 
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 1b446d79a8f..3c28ccd4974 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -6,10 +6,12 @@
  *  Licensed under the terms of the GNU GPL License version 2.
  *  Based upon datasheets & sample CPUs kindly provided by AMD.
  *
- * Errata 5: Processor may fail to execute a FID/VID change in presence of interrupt.
- * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
- * Errata 15: Processors with half frequency multipliers may hang upon wakeup from disconnect.
- * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
+ * Errata 5:
+ *  CPU may fail to execute a FID/VID change in presence of interrupt.
+ *  - We cli/sti on stepping A0 CPUs around the FID/VID transition.
+ * Errata 15:
+ *  CPU with half frequency multipliers may hang upon wakeup from disconnect.
+ *  - We disable half multipliers if ACPI is used on A0 stepping CPUs.
  */
 
 #include <linux/kernel.h>
@@ -20,11 +22,11 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/dmi.h>
+#include <linux/timex.h>
+#include <linux/io.h>
 
+#include <asm/timer.h>		/* Needed for recalibrate_cpu_khz() */
 #include <asm/msr.h>
-#include <asm/timer.h>
-#include <asm/timex.h>
-#include <asm/io.h>
 #include <asm/system.h>
 
 #ifdef CONFIG_X86_POWERNOW_K7_ACPI
@@ -58,9 +60,9 @@ struct pst_s {
 union powernow_acpi_control_t {
 	struct {
 		unsigned long fid:5,
-		vid:5,
-		sgtc:20,
-		res1:2;
+			vid:5,
+			sgtc:20,
+			res1:2;
 	} bits;
 	unsigned long val;
 };
@@ -94,14 +96,15 @@ static struct cpufreq_frequency_table *powernow_table;
 
 static unsigned int can_scale_bus;
 static unsigned int can_scale_vid;
-static unsigned int minimum_speed=-1;
+static unsigned int minimum_speed = -1;
 static unsigned int maximum_speed;
 static unsigned int number_scales;
 static unsigned int fsb;
 static unsigned int latency;
 static char have_a0;
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k7", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"powernow-k7", msg)
 
 static int check_fsb(unsigned int fsbspeed)
 {
@@ -109,7 +112,7 @@ static int check_fsb(unsigned int fsbspeed)
 	unsigned int f = fsb / 1000;
 
 	delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
-	return (delta < 5);
+	return delta < 5;
 }
 
 static int check_powernow(void)
@@ -117,24 +120,26 @@ static int check_powernow(void)
 	struct cpuinfo_x86 *c = &cpu_data(0);
 	unsigned int maxei, eax, ebx, ecx, edx;
 
-	if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 !=6)) {
+	if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
 #ifdef MODULE
-		printk (KERN_INFO PFX "This module only works with AMD K7 CPUs\n");
+		printk(KERN_INFO PFX "This module only works with "
+				"AMD K7 CPUs\n");
 #endif
 		return 0;
 	}
 
 	/* Get maximum capabilities */
-	maxei = cpuid_eax (0x80000000);
+	maxei = cpuid_eax(0x80000000);
 	if (maxei < 0x80000007) {	/* Any powernow info ? */
 #ifdef MODULE
-		printk (KERN_INFO PFX "No powernow capabilities detected\n");
+		printk(KERN_INFO PFX "No powernow capabilities detected\n");
 #endif
 		return 0;
 	}
 
 	if ((c->x86_model == 6) && (c->x86_mask == 0)) {
-		printk (KERN_INFO PFX "K7 660[A0] core detected, enabling errata workarounds\n");
+		printk(KERN_INFO PFX "K7 660[A0] core detected, "
+				"enabling errata workarounds\n");
 		have_a0 = 1;
 	}
 
@@ -144,37 +149,42 @@ static int check_powernow(void)
 	if (!(edx & (1 << 1 | 1 << 2)))
 		return 0;
 
-	printk (KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
+	printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
 
 	if (edx & 1 << 1) {
-		printk ("frequency");
-		can_scale_bus=1;
+		printk("frequency");
+		can_scale_bus = 1;
 	}
 
 	if ((edx & (1 << 1 | 1 << 2)) == 0x6)
-		printk (" and ");
+		printk(" and ");
 
 	if (edx & 1 << 2) {
-		printk ("voltage");
-		can_scale_vid=1;
+		printk("voltage");
+		can_scale_vid = 1;
 	}
 
-	printk (".\n");
+	printk(".\n");
 	return 1;
 }
 
+static void invalidate_entry(unsigned int entry)
+{
+	powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
+}
 
-static int get_ranges (unsigned char *pst)
+static int get_ranges(unsigned char *pst)
 {
 	unsigned int j;
 	unsigned int speed;
 	u8 fid, vid;
 
-	powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * (number_scales + 1)), GFP_KERNEL);
+	powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
+				(number_scales + 1)), GFP_KERNEL);
 	if (!powernow_table)
 		return -ENOMEM;
 
-	for (j=0 ; j < number_scales; j++) {
+	for (j = 0 ; j < number_scales; j++) {
 		fid = *pst++;
 
 		powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
@@ -182,10 +192,10 @@ static int get_ranges (unsigned char *pst)
 
 		speed = powernow_table[j].frequency;
 
-		if ((fid_codes[fid] % 10)==5) {
+		if ((fid_codes[fid] % 10) == 5) {
 #ifdef CONFIG_X86_POWERNOW_K7_ACPI
 			if (have_a0 == 1)
-				powernow_table[j].frequency = CPUFREQ_ENTRY_INVALID;
+				invalidate_entry(j);
 #endif
 		}
 
@@ -197,7 +207,7 @@ static int get_ranges (unsigned char *pst)
 		vid = *pst++;
 		powernow_table[j].index |= (vid << 8); /* upper 8 bits */
 
-		dprintk ("   FID: 0x%x (%d.%dx [%dMHz])  "
+		dprintk("   FID: 0x%x (%d.%dx [%dMHz])  "
 			 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
 			 fid_codes[fid] % 10, speed/1000, vid,
 			 mobile_vid_table[vid]/1000,
@@ -214,13 +224,13 @@ static void change_FID(int fid)
 {
 	union msr_fidvidctl fidvidctl;
 
-	rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
+	rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
 	if (fidvidctl.bits.FID != fid) {
 		fidvidctl.bits.SGTC = latency;
 		fidvidctl.bits.FID = fid;
 		fidvidctl.bits.VIDC = 0;
 		fidvidctl.bits.FIDC = 1;
-		wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
+		wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
 	}
 }
 
@@ -229,18 +239,18 @@ static void change_VID(int vid)
 {
 	union msr_fidvidctl fidvidctl;
 
-	rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
+	rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
 	if (fidvidctl.bits.VID != vid) {
 		fidvidctl.bits.SGTC = latency;
 		fidvidctl.bits.VID = vid;
 		fidvidctl.bits.FIDC = 0;
 		fidvidctl.bits.VIDC = 1;
-		wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
+		wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
 	}
 }
 
 
-static void change_speed (unsigned int index)
+static void change_speed(unsigned int index)
 {
 	u8 fid, vid;
 	struct cpufreq_freqs freqs;
@@ -257,7 +267,7 @@ static void change_speed (unsigned int index)
 
 	freqs.cpu = 0;
 
-	rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
+	rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
 	cfid = fidvidstatus.bits.CFID;
 	freqs.old = fsb * fid_codes[cfid] / 10;
 
@@ -321,12 +331,14 @@ static int powernow_acpi_init(void)
 		goto err1;
 	}
 
-	if (acpi_processor_perf->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) {
+	if (acpi_processor_perf->control_register.space_id !=
+			ACPI_ADR_SPACE_FIXED_HARDWARE) {
 		retval = -ENODEV;
 		goto err2;
 	}
 
-	if (acpi_processor_perf->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) {
+	if (acpi_processor_perf->status_register.space_id !=
+			ACPI_ADR_SPACE_FIXED_HARDWARE) {
 		retval = -ENODEV;
 		goto err2;
 	}
@@ -338,7 +350,8 @@ static int powernow_acpi_init(void)
 		goto err2;
 	}
 
-	powernow_table = kzalloc((number_scales + 1) * (sizeof(struct cpufreq_frequency_table)), GFP_KERNEL);
+	powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
+				(number_scales + 1)), GFP_KERNEL);
 	if (!powernow_table) {
 		retval = -ENOMEM;
 		goto err2;
@@ -352,7 +365,7 @@ static int powernow_acpi_init(void)
 		unsigned int speed, speed_mhz;
 
 		pc.val = (unsigned long) state->control;
-		dprintk ("acpi:  P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
+		dprintk("acpi:  P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
 			 i,
 			 (u32) state->core_frequency,
 			 (u32) state->power,
@@ -381,12 +394,12 @@ static int powernow_acpi_init(void)
 		if (speed % 1000 > 0)
 			speed_mhz++;
 
-		if ((fid_codes[fid] % 10)==5) {
+		if ((fid_codes[fid] % 10) == 5) {
 			if (have_a0 == 1)
-				powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+				invalidate_entry(i);
 		}
 
-		dprintk ("   FID: 0x%x (%d.%dx [%dMHz])  "
+		dprintk("   FID: 0x%x (%d.%dx [%dMHz])  "
 			 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
 			 fid_codes[fid] % 10, speed_mhz, vid,
 			 mobile_vid_table[vid]/1000,
@@ -422,7 +435,8 @@ err1:
 err05:
 	kfree(acpi_processor_perf);
 err0:
-	printk(KERN_WARNING PFX "ACPI perflib can not be used in this platform\n");
+	printk(KERN_WARNING PFX "ACPI perflib can not be used on "
+			"this platform\n");
 	acpi_processor_perf = NULL;
 	return retval;
 }
@@ -435,7 +449,14 @@ static int powernow_acpi_init(void)
 }
 #endif
 
-static int powernow_decode_bios (int maxfid, int startvid)
+static void print_pst_entry(struct pst_s *pst, unsigned int j)
+{
+	dprintk("PST:%d (@%p)\n", j, pst);
+	dprintk(" cpuid: 0x%x  fsb: %d  maxFID: 0x%x  startvid: 0x%x\n",
+		pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
+}
+
+static int powernow_decode_bios(int maxfid, int startvid)
 {
 	struct psb_s *psb;
 	struct pst_s *pst;
@@ -446,61 +467,67 @@ static int powernow_decode_bios (int maxfid, int startvid)
 
 	etuple = cpuid_eax(0x80000001);
 
-	for (i=0xC0000; i < 0xffff0 ; i+=16) {
+	for (i = 0xC0000; i < 0xffff0 ; i += 16) {
 
 		p = phys_to_virt(i);
 
-		if (memcmp(p, "AMDK7PNOW!",  10) == 0){
-			dprintk ("Found PSB header at %p\n", p);
+		if (memcmp(p, "AMDK7PNOW!",  10) == 0) {
+			dprintk("Found PSB header at %p\n", p);
 			psb = (struct psb_s *) p;
-			dprintk ("Table version: 0x%x\n", psb->tableversion);
+			dprintk("Table version: 0x%x\n", psb->tableversion);
 			if (psb->tableversion != 0x12) {
-				printk (KERN_INFO PFX "Sorry, only v1.2 tables supported right now\n");
+				printk(KERN_INFO PFX "Sorry, only v1.2 tables"
+						" supported right now\n");
 				return -ENODEV;
 			}
 
-			dprintk ("Flags: 0x%x\n", psb->flags);
-			if ((psb->flags & 1)==0) {
-				dprintk ("Mobile voltage regulator\n");
-			} else {
-				dprintk ("Desktop voltage regulator\n");
-			}
+			dprintk("Flags: 0x%x\n", psb->flags);
+			if ((psb->flags & 1) == 0)
+				dprintk("Mobile voltage regulator\n");
+			else
+				dprintk("Desktop voltage regulator\n");
 
 			latency = psb->settlingtime;
 			if (latency < 100) {
-				printk(KERN_INFO PFX "BIOS set settling time to %d microseconds. "
-						"Should be at least 100. Correcting.\n", latency);
+				printk(KERN_INFO PFX "BIOS set settling time "
+						"to %d microseconds. "
+						"Should be at least 100. "
+						"Correcting.\n", latency);
 				latency = 100;
 			}
-			dprintk ("Settling Time: %d microseconds.\n", psb->settlingtime);
-			dprintk ("Has %d PST tables. (Only dumping ones relevant to this CPU).\n", psb->numpst);
+			dprintk("Settling Time: %d microseconds.\n",
+					psb->settlingtime);
+			dprintk("Has %d PST tables. (Only dumping ones "
+					"relevant to this CPU).\n",
+					psb->numpst);
 
-			p += sizeof (struct psb_s);
+			p += sizeof(struct psb_s);
 
 			pst = (struct pst_s *) p;
 
-			for (j=0; j<psb->numpst; j++) {
+			for (j = 0; j < psb->numpst; j++) {
 				pst = (struct pst_s *) p;
 				number_scales = pst->numpstates;
 
-				if ((etuple == pst->cpuid) && check_fsb(pst->fsbspeed) &&
-				    (maxfid==pst->maxfid) && (startvid==pst->startvid))
-				{
-					dprintk ("PST:%d (@%p)\n", j, pst);
-					dprintk (" cpuid: 0x%x  fsb: %d  maxFID: 0x%x  startvid: 0x%x\n",
-						 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
-
-					ret = get_ranges ((char *) pst + sizeof (struct pst_s));
+				if ((etuple == pst->cpuid) &&
+				    check_fsb(pst->fsbspeed) &&
+				    (maxfid == pst->maxfid) &&
+				    (startvid == pst->startvid)) {
+					print_pst_entry(pst, j);
+					p = (char *)pst + sizeof(struct pst_s);
+					ret = get_ranges(p);
 					return ret;
 				} else {
 					unsigned int k;
-					p = (char *) pst + sizeof (struct pst_s);
-					for (k=0; k<number_scales; k++)
-						p+=2;
+					p = (char *)pst + sizeof(struct pst_s);
+					for (k = 0; k < number_scales; k++)
+						p += 2;
 				}
 			}
-			printk (KERN_INFO PFX "No PST tables match this cpuid (0x%x)\n", etuple);
-			printk (KERN_INFO PFX "This is indicative of a broken BIOS.\n");
+			printk(KERN_INFO PFX "No PST tables match this cpuid "
+					"(0x%x)\n", etuple);
+			printk(KERN_INFO PFX "This is indicative of a broken "
+					"BIOS.\n");
 
 			return -EINVAL;
 		}
@@ -511,13 +538,14 @@ static int powernow_decode_bios (int maxfid, int startvid)
 }
 
 
-static int powernow_target (struct cpufreq_policy *policy,
+static int powernow_target(struct cpufreq_policy *policy,
 			    unsigned int target_freq,
 			    unsigned int relation)
 {
 	unsigned int newstate;
 
-	if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, relation, &newstate))
+	if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
+				relation, &newstate))
 		return -EINVAL;
 
 	change_speed(newstate);
@@ -526,7 +554,7 @@ static int powernow_target (struct cpufreq_policy *policy,
 }
 
 
-static int powernow_verify (struct cpufreq_policy *policy)
+static int powernow_verify(struct cpufreq_policy *policy)
 {
 	return cpufreq_frequency_table_verify(policy, powernow_table);
 }
@@ -566,18 +594,23 @@ static unsigned int powernow_get(unsigned int cpu)
 
 	if (cpu)
 		return 0;
-	rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
+	rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
 	cfid = fidvidstatus.bits.CFID;
 
-	return (fsb * fid_codes[cfid] / 10);
+	return fsb * fid_codes[cfid] / 10;
 }
 
 
 static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
 {
-	printk(KERN_WARNING "%s laptop with broken PST tables in BIOS detected.\n", d->ident);
-	printk(KERN_WARNING "You need to downgrade to 3A21 (09/09/2002), or try a newer BIOS than 3A71 (01/20/2003)\n");
-	printk(KERN_WARNING "cpufreq scaling has been disabled as a result of this.\n");
+	printk(KERN_WARNING PFX
+		"%s laptop with broken PST tables in BIOS detected.\n",
+		d->ident);
+	printk(KERN_WARNING PFX
+		"You need to downgrade to 3A21 (09/09/2002), or try a newer "
+		"BIOS than 3A71 (01/20/2003)\n");
+	printk(KERN_WARNING PFX
+		"cpufreq scaling has been disabled as a result of this.\n");
 	return 0;
 }
 
@@ -598,7 +631,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
 	{ }
 };
 
-static int __init powernow_cpu_init (struct cpufreq_policy *policy)
+static int __init powernow_cpu_init(struct cpufreq_policy *policy)
 {
 	union msr_fidvidstatus fidvidstatus;
 	int result;
@@ -606,7 +639,7 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
 	if (policy->cpu != 0)
 		return -ENODEV;
 
-	rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
+	rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
 
 	recalibrate_cpu_khz();
 
@@ -618,19 +651,21 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
 	dprintk("FSB: %3dMHz\n", fsb/1000);
 
 	if (dmi_check_system(powernow_dmi_table) || acpi_force) {
-		printk (KERN_INFO PFX "PSB/PST known to be broken.  Trying ACPI instead\n");
+		printk(KERN_INFO PFX "PSB/PST known to be broken.  "
+				"Trying ACPI instead\n");
 		result = powernow_acpi_init();
 	} else {
-		result = powernow_decode_bios(fidvidstatus.bits.MFID, fidvidstatus.bits.SVID);
+		result = powernow_decode_bios(fidvidstatus.bits.MFID,
+				fidvidstatus.bits.SVID);
 		if (result) {
-			printk (KERN_INFO PFX "Trying ACPI perflib\n");
+			printk(KERN_INFO PFX "Trying ACPI perflib\n");
 			maximum_speed = 0;
 			minimum_speed = -1;
 			latency = 0;
 			result = powernow_acpi_init();
 			if (result) {
-				printk (KERN_INFO PFX "ACPI and legacy methods failed\n");
-				printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.html\n");
+				printk(KERN_INFO PFX
+					"ACPI and legacy methods failed\n");
 			}
 		} else {
 			/* SGTC use the bus clock as timer */
@@ -642,10 +677,11 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
 	if (result)
 		return result;
 
-	printk (KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
+	printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
 				minimum_speed/1000, maximum_speed/1000);
 
-	policy->cpuinfo.transition_latency = cpufreq_scale(2000000UL, fsb, latency);
+	policy->cpuinfo.transition_latency =
+		cpufreq_scale(2000000UL, fsb, latency);
 
 	policy->cur = powernow_get(0);
 
@@ -654,7 +690,8 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
 	return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
 }
 
-static int powernow_cpu_exit (struct cpufreq_policy *policy) {
+static int powernow_cpu_exit(struct cpufreq_policy *policy)
+{
 	cpufreq_frequency_table_put_attr(policy->cpu);
 
 #ifdef CONFIG_X86_POWERNOW_K7_ACPI
@@ -669,7 +706,7 @@ static int powernow_cpu_exit (struct cpufreq_policy *policy) {
 	return 0;
 }
 
-static struct freq_attr* powernow_table_attr[] = {
+static struct freq_attr *powernow_table_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -685,15 +722,15 @@ static struct cpufreq_driver powernow_driver = {
 	.attr	= powernow_table_attr,
 };
 
-static int __init powernow_init (void)
+static int __init powernow_init(void)
 {
-	if (check_powernow()==0)
+	if (check_powernow() == 0)
 		return -ENODEV;
 	return cpufreq_register_driver(&powernow_driver);
 }
 
 
-static void __exit powernow_exit (void)
+static void __exit powernow_exit(void)
 {
 	cpufreq_unregister_driver(&powernow_driver);
 }
@@ -701,9 +738,9 @@ static void __exit powernow_exit (void)
 module_param(acpi_force,  int, 0444);
 MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
 
-MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
+MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
+MODULE_LICENSE("GPL");
 
 late_initcall(powernow_init);
 module_exit(powernow_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6428aa17b40..4709ead2db5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -33,16 +33,14 @@
 #include <linux/string.h>
 #include <linux/cpumask.h>
 #include <linux/sched.h>	/* for current / set_cpus_allowed() */
+#include <linux/io.h>
+#include <linux/delay.h>
 
 #include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/delay.h>
 
-#ifdef CONFIG_X86_POWERNOW_K8_ACPI
 #include <linux/acpi.h>
 #include <linux/mutex.h>
 #include <acpi/processor.h>
-#endif
 
 #define PFX "powernow-k8: "
 #define VERSION "version 2.20.00"
@@ -56,7 +54,10 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
 static int cpu_family = CPU_OPTERON;
 
 #ifndef CONFIG_SMP
-DEFINE_PER_CPU(cpumask_t, cpu_core_map);
+static inline const struct cpumask *cpu_core_mask(int cpu)
+{
+	return cpumask_of(0);
+}
 #endif
 
 /* Return a frequency in MHz, given an input fid */
@@ -71,7 +72,8 @@ static u32 find_khz_freq_from_fid(u32 fid)
 	return 1000 * find_freq_from_fid(fid);
 }
 
-static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 pstate)
+static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
+		u32 pstate)
 {
 	return data[pstate].frequency;
 }
@@ -186,7 +188,9 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid)
 		return 1;
 	}
 
-	lo = fid | (data->currvid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID;
+	lo = fid;
+	lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
+	lo |= MSR_C_LO_INIT_FID_VID;
 
 	dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
 		fid, lo, data->plllock * PLL_LOCK_CONVERSION);
@@ -194,7 +198,9 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid)
 	do {
 		wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
 		if (i++ > 100) {
-			printk(KERN_ERR PFX "Hardware error - pending bit very stuck - no further pstate changes possible\n");
+			printk(KERN_ERR PFX
+				"Hardware error - pending bit very stuck - "
+				"no further pstate changes possible\n");
 			return 1;
 		}
 	} while (query_current_values_with_pending_wait(data));
@@ -202,14 +208,16 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid)
 	count_off_irt(data);
 
 	if (savevid != data->currvid) {
-		printk(KERN_ERR PFX "vid change on fid trans, old 0x%x, new 0x%x\n",
-		       savevid, data->currvid);
+		printk(KERN_ERR PFX
+			"vid change on fid trans, old 0x%x, new 0x%x\n",
+			savevid, data->currvid);
 		return 1;
 	}
 
 	if (fid != data->currfid) {
-		printk(KERN_ERR PFX "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
-		        data->currfid);
+		printk(KERN_ERR PFX
+			"fid trans failed, fid 0x%x, curr 0x%x\n", fid,
+			data->currfid);
 		return 1;
 	}
 
@@ -228,7 +236,9 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid)
 		return 1;
 	}
 
-	lo = data->currfid | (vid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID;
+	lo = data->currfid;
+	lo |= (vid << MSR_C_LO_VID_SHIFT);
+	lo |= MSR_C_LO_INIT_FID_VID;
 
 	dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
 		vid, lo, STOP_GRANT_5NS);
@@ -236,20 +246,24 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid)
 	do {
 		wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
 		if (i++ > 100) {
-			printk(KERN_ERR PFX "internal error - pending bit very stuck - no further pstate changes possible\n");
+			printk(KERN_ERR PFX "internal error - pending bit "
+					"very stuck - no further pstate "
+					"changes possible\n");
 			return 1;
 		}
 	} while (query_current_values_with_pending_wait(data));
 
 	if (savefid != data->currfid) {
-		printk(KERN_ERR PFX "fid changed on vid trans, old 0x%x new 0x%x\n",
+		printk(KERN_ERR PFX "fid changed on vid trans, old "
+			"0x%x new 0x%x\n",
 		       savefid, data->currfid);
 		return 1;
 	}
 
 	if (vid != data->currvid) {
-		printk(KERN_ERR PFX "vid trans failed, vid 0x%x, curr 0x%x\n", vid,
-				data->currvid);
+		printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
+				"curr 0x%x\n",
+				vid, data->currvid);
 		return 1;
 	}
 
@@ -261,7 +275,8 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid)
  * Decreasing vid codes represent increasing voltages:
  * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
  */
-static int decrease_vid_code_by_step(struct powernow_k8_data *data, u32 reqvid, u32 step)
+static int decrease_vid_code_by_step(struct powernow_k8_data *data,
+		u32 reqvid, u32 step)
 {
 	if ((data->currvid - reqvid) > step)
 		reqvid = data->currvid - step;
@@ -283,7 +298,8 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
 }
 
 /* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
-static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid)
+static int transition_fid_vid(struct powernow_k8_data *data,
+		u32 reqfid, u32 reqvid)
 {
 	if (core_voltage_pre_transition(data, reqvid))
 		return 1;
@@ -298,7 +314,8 @@ static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 req
 		return 1;
 
 	if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
-		printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, curr 0x%x 0x%x\n",
+		printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
+				"curr 0x%x 0x%x\n",
 				smp_processor_id(),
 				reqfid, reqvid, data->currfid, data->currvid);
 		return 1;
@@ -311,13 +328,15 @@ static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 req
 }
 
 /* Phase 1 - core voltage transition ... setup voltage */
-static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid)
+static int core_voltage_pre_transition(struct powernow_k8_data *data,
+		u32 reqvid)
 {
 	u32 rvosteps = data->rvo;
 	u32 savefid = data->currfid;
 	u32 maxvid, lo;
 
-	dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, reqvid 0x%x, rvo 0x%x\n",
+	dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
+		"reqvid 0x%x, rvo 0x%x\n",
 		smp_processor_id(),
 		data->currfid, data->currvid, reqvid, data->rvo);
 
@@ -340,7 +359,7 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid
 		} else {
 			dprintk("ph1: changing vid for rvo, req 0x%x\n",
 				data->currvid - 1);
-			if (decrease_vid_code_by_step(data, data->currvid - 1, 1))
+			if (decrease_vid_code_by_step(data, data->currvid-1, 1))
 				return 1;
 			rvosteps--;
 		}
@@ -350,7 +369,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid
 		return 1;
 
 	if (savefid != data->currfid) {
-		printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", data->currfid);
+		printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
+				data->currfid);
 		return 1;
 	}
 
@@ -363,20 +383,24 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid
 /* Phase 2 - core frequency transition */
 static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
 {
-	u32 vcoreqfid, vcocurrfid, vcofiddiff, fid_interval, savevid = data->currvid;
+	u32 vcoreqfid, vcocurrfid, vcofiddiff;
+	u32 fid_interval, savevid = data->currvid;
 
-	if ((reqfid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) {
-		printk(KERN_ERR PFX "ph2: illegal lo-lo transition 0x%x 0x%x\n",
-			reqfid, data->currfid);
+	if ((reqfid < HI_FID_TABLE_BOTTOM) &&
+	    (data->currfid < HI_FID_TABLE_BOTTOM)) {
+		printk(KERN_ERR PFX "ph2: illegal lo-lo transition "
+				"0x%x 0x%x\n", reqfid, data->currfid);
 		return 1;
 	}
 
 	if (data->currfid == reqfid) {
-		printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid);
+		printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
+				data->currfid);
 		return 0;
 	}
 
-	dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, reqfid 0x%x\n",
+	dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
+		"reqfid 0x%x\n",
 		smp_processor_id(),
 		data->currfid, data->currvid, reqfid);
 
@@ -390,14 +414,14 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
 
 		if (reqfid > data->currfid) {
 			if (data->currfid > LO_FID_TABLE_TOP) {
-				if (write_new_fid(data, data->currfid + fid_interval)) {
+				if (write_new_fid(data,
+						data->currfid + fid_interval))
 					return 1;
-				}
 			} else {
 				if (write_new_fid
-				    (data, 2 + convert_fid_to_vco_fid(data->currfid))) {
+				    (data,
+				     2 + convert_fid_to_vco_fid(data->currfid)))
 					return 1;
-				}
 			}
 		} else {
 			if (write_new_fid(data, data->currfid - fid_interval))
@@ -417,7 +441,8 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
 
 	if (data->currfid != reqfid) {
 		printk(KERN_ERR PFX
-			"ph2: mismatch, failed fid transition, curr 0x%x, req 0x%x\n",
+			"ph2: mismatch, failed fid transition, "
+			"curr 0x%x, req 0x%x\n",
 			data->currfid, reqfid);
 		return 1;
 	}
@@ -435,7 +460,8 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
 }
 
 /* Phase 3 - core voltage transition flow ... jump to the final vid. */
-static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid)
+static int core_voltage_post_transition(struct powernow_k8_data *data,
+		u32 reqvid)
 {
 	u32 savefid = data->currfid;
 	u32 savereqvid = reqvid;
@@ -457,7 +483,8 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
 
 		if (data->currvid != reqvid) {
 			printk(KERN_ERR PFX
-			       "ph3: failed vid transition\n, req 0x%x, curr 0x%x",
+			       "ph3: failed vid transition\n, "
+			       "req 0x%x, curr 0x%x",
 			       reqvid, data->currvid);
 			return 1;
 		}
@@ -508,7 +535,8 @@ static int check_supported_cpu(unsigned int cpu)
 	if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
 		if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
 		    ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
-			printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax);
+			printk(KERN_INFO PFX
+				"Processor cpuid %x not supported\n", eax);
 			goto out;
 		}
 
@@ -520,8 +548,10 @@ static int check_supported_cpu(unsigned int cpu)
 		}
 
 		cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
-		if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) {
-			printk(KERN_INFO PFX "Power state transitions not supported\n");
+		if ((edx & P_STATE_TRANSITION_CAPABLE)
+			!= P_STATE_TRANSITION_CAPABLE) {
+			printk(KERN_INFO PFX
+				"Power state transitions not supported\n");
 			goto out;
 		}
 	} else { /* must be a HW Pstate capable processor */
@@ -539,7 +569,8 @@ out:
 	return rc;
 }
 
-static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid)
+static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
+		u8 maxvid)
 {
 	unsigned int j;
 	u8 lastfid = 0xff;
@@ -550,12 +581,14 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
 			       j, pst[j].vid);
 			return -EINVAL;
 		}
-		if (pst[j].vid < data->rvo) {	/* vid + rvo >= 0 */
+		if (pst[j].vid < data->rvo) {
+			/* vid + rvo >= 0 */
 			printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
 			       " %d\n", j);
 			return -ENODEV;
 		}
-		if (pst[j].vid < maxvid + data->rvo) {	/* vid + rvo >= maxvid */
+		if (pst[j].vid < maxvid + data->rvo) {
+			/* vid + rvo >= maxvid */
 			printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
 			       " %d\n", j);
 			return -ENODEV;
@@ -579,23 +612,31 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
 		return -EINVAL;
 	}
 	if (lastfid > LO_FID_TABLE_TOP)
-		printk(KERN_INFO FW_BUG PFX  "first fid not from lo freq table\n");
+		printk(KERN_INFO FW_BUG PFX
+			"first fid not from lo freq table\n");
 
 	return 0;
 }
 
+static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry)
+{
+	data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
+}
+
 static void print_basics(struct powernow_k8_data *data)
 {
 	int j;
 	for (j = 0; j < data->numps; j++) {
-		if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) {
+		if (data->powernow_table[j].frequency !=
+				CPUFREQ_ENTRY_INVALID) {
 			if (cpu_family == CPU_HW_PSTATE) {
-				printk(KERN_INFO PFX "   %d : pstate %d (%d MHz)\n",
-					j,
+				printk(KERN_INFO PFX
+					"   %d : pstate %d (%d MHz)\n", j,
 					data->powernow_table[j].index,
 					data->powernow_table[j].frequency/1000);
 			} else {
-				printk(KERN_INFO PFX "   %d : fid 0x%x (%d MHz), vid 0x%x\n",
+				printk(KERN_INFO PFX
+					"   %d : fid 0x%x (%d MHz), vid 0x%x\n",
 					j,
 					data->powernow_table[j].index & 0xff,
 					data->powernow_table[j].frequency/1000,
@@ -604,20 +645,25 @@ static void print_basics(struct powernow_k8_data *data)
 		}
 	}
 	if (data->batps)
-		printk(KERN_INFO PFX "Only %d pstates on battery\n", data->batps);
+		printk(KERN_INFO PFX "Only %d pstates on battery\n",
+				data->batps);
 }
 
-static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid)
+static int fill_powernow_table(struct powernow_k8_data *data,
+		struct pst_s *pst, u8 maxvid)
 {
 	struct cpufreq_frequency_table *powernow_table;
 	unsigned int j;
 
-	if (data->batps) {    /* use ACPI support to get full speed on mains power */
-		printk(KERN_WARNING PFX "Only %d pstates usable (use ACPI driver for full range\n", data->batps);
+	if (data->batps) {
+		/* use ACPI support to get full speed on mains power */
+		printk(KERN_WARNING PFX
+			"Only %d pstates usable (use ACPI driver for full "
+			"range\n", data->batps);
 		data->numps = data->batps;
 	}
 
-	for ( j=1; j<data->numps; j++ ) {
+	for (j = 1; j < data->numps; j++) {
 		if (pst[j-1].fid >= pst[j].fid) {
 			printk(KERN_ERR PFX "PST out of sequence\n");
 			return -EINVAL;
@@ -640,9 +686,11 @@ static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst,
 	}
 
 	for (j = 0; j < data->numps; j++) {
+		int freq;
 		powernow_table[j].index = pst[j].fid; /* lower 8 bits */
 		powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
-		powernow_table[j].frequency = find_khz_freq_from_fid(pst[j].fid);
+		freq = find_khz_freq_from_fid(pst[j].fid);
+		powernow_table[j].frequency = freq;
 	}
 	powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
 	powernow_table[data->numps].index = 0;
@@ -654,11 +702,12 @@ static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst,
 
 	dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
 	data->powernow_table = powernow_table;
-	if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu)
+	if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
 		print_basics(data);
 
 	for (j = 0; j < data->numps; j++)
-		if ((pst[j].fid==data->currfid) && (pst[j].vid==data->currvid))
+		if ((pst[j].fid == data->currfid) &&
+		    (pst[j].vid == data->currvid))
 			return 0;
 
 	dprintk("currfid/vid do not match PST, ignoring\n");
@@ -698,7 +747,8 @@ static int find_psb_table(struct powernow_k8_data *data)
 		}
 
 		data->vstable = psb->vstable;
-		dprintk("voltage stabilization time: %d(*20us)\n", data->vstable);
+		dprintk("voltage stabilization time: %d(*20us)\n",
+				data->vstable);
 
 		dprintk("flags2: 0x%x\n", psb->flags2);
 		data->rvo = psb->flags2 & 3;
@@ -713,11 +763,12 @@ static int find_psb_table(struct powernow_k8_data *data)
 
 		dprintk("numpst: 0x%x\n", psb->num_tables);
 		cpst = psb->num_tables;
-		if ((psb->cpuid == 0x00000fc0) || (psb->cpuid == 0x00000fe0) ){
+		if ((psb->cpuid == 0x00000fc0) ||
+		    (psb->cpuid == 0x00000fe0)) {
 			thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-			if ((thiscpuid == 0x00000fc0) || (thiscpuid == 0x00000fe0) ) {
+			if ((thiscpuid == 0x00000fc0) ||
+			    (thiscpuid == 0x00000fe0))
 				cpst = 1;
-			}
 		}
 		if (cpst != 1) {
 			printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
@@ -732,7 +783,8 @@ static int find_psb_table(struct powernow_k8_data *data)
 
 		data->numps = psb->numps;
 		dprintk("numpstates: 0x%x\n", data->numps);
-		return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid);
+		return fill_powernow_table(data,
+				(struct pst_s *)(psb+1), maxvid);
 	}
 	/*
 	 * If you see this message, complain to BIOS manufacturer. If
@@ -745,28 +797,31 @@ static int find_psb_table(struct powernow_k8_data *data)
 	 * BIOS and Kernel Developer's Guide, which is available on
 	 * www.amd.com
 	 */
-	printk(KERN_ERR PFX "BIOS error - no PSB or ACPI _PSS objects\n");
+	printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
 	return -ENODEV;
 }
 
-#ifdef CONFIG_X86_POWERNOW_K8_ACPI
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index)
+static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
+		unsigned int index)
 {
+	acpi_integer control;
+
 	if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
 		return;
 
-	data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK;
-	data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK;
-	data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
-	data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK;
-	data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK);
-	data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK;
-}
+	control = data->acpi_data.states[index].control; data->irt = (control
+			>> IRT_SHIFT) & IRT_MASK; data->rvo = (control >>
+				RVO_SHIFT) & RVO_MASK; data->exttype = (control
+					>> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
+	data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1
+		<< ((control >> MVS_SHIFT) & MVS_MASK); data->vstable =
+		(control >> VST_SHIFT) & VST_MASK; }
 
 static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 {
 	struct cpufreq_frequency_table *powernow_table;
 	int ret_val = -ENODEV;
+	acpi_integer space_id;
 
 	if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
 		dprintk("register performance failed: bad ACPI data\n");
@@ -779,11 +834,12 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 		goto err_out;
 	}
 
-	if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
-		(data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
+	space_id = data->acpi_data.control_register.space_id;
+	if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
+		(space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
 		dprintk("Invalid control/status registers (%x - %x)\n",
 			data->acpi_data.control_register.space_id,
-			data->acpi_data.status_register.space_id);
+			space_id);
 		goto err_out;
 	}
 
@@ -802,13 +858,14 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 	if (ret_val)
 		goto err_out_mem;
 
-	powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END;
+	powernow_table[data->acpi_data.state_count].frequency =
+		CPUFREQ_TABLE_END;
 	powernow_table[data->acpi_data.state_count].index = 0;
 	data->powernow_table = powernow_table;
 
 	/* fill in data */
 	data->numps = data->acpi_data.state_count;
-	if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu)
+	if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
 		print_basics(data);
 	powernow_k8_acpi_pst_values(data, 0);
 
@@ -830,13 +887,15 @@ err_out_mem:
 err_out:
 	acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
 
-	/* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
+	/* data->acpi_data.state_count informs us at ->exit()
+	 * whether ACPI was used */
 	data->acpi_data.state_count = 0;
 
 	return ret_val;
 }
 
-static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
+static int fill_powernow_table_pstate(struct powernow_k8_data *data,
+		struct cpufreq_frequency_table *powernow_table)
 {
 	int i;
 	u32 hi = 0, lo = 0;
@@ -848,84 +907,101 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
 
 		index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
 		if (index > data->max_hw_pstate) {
-			printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index);
-			printk(KERN_ERR PFX "Please report to BIOS manufacturer\n");
-			powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+			printk(KERN_ERR PFX "invalid pstate %d - "
+					"bad value %d.\n", i, index);
+			printk(KERN_ERR PFX "Please report to BIOS "
+					"manufacturer\n");
+			invalidate_entry(data, i);
 			continue;
 		}
 		rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
 		if (!(hi & HW_PSTATE_VALID_MASK)) {
 			dprintk("invalid pstate %d, ignoring\n", index);
-			powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+			invalidate_entry(data, i);
 			continue;
 		}
 
 		powernow_table[i].index = index;
 
-		powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000;
+		powernow_table[i].frequency =
+			data->acpi_data.states[i].core_frequency * 1000;
 	}
 	return 0;
 }
 
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
+static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
+		struct cpufreq_frequency_table *powernow_table)
 {
 	int i;
 	int cntlofreq = 0;
+
 	for (i = 0; i < data->acpi_data.state_count; i++) {
 		u32 fid;
 		u32 vid;
+		u32 freq, index;
+		acpi_integer status, control;
 
 		if (data->exttype) {
-			fid = data->acpi_data.states[i].status & EXT_FID_MASK;
-			vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK;
+			status =  data->acpi_data.states[i].status;
+			fid = status & EXT_FID_MASK;
+			vid = (status >> VID_SHIFT) & EXT_VID_MASK;
 		} else {
-			fid = data->acpi_data.states[i].control & FID_MASK;
-			vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK;
+			control =  data->acpi_data.states[i].control;
+			fid = control & FID_MASK;
+			vid = (control >> VID_SHIFT) & VID_MASK;
 		}
 
 		dprintk("   %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
 
-		powernow_table[i].index = fid; /* lower 8 bits */
-		powernow_table[i].index |= (vid << 8); /* upper 8 bits */
-		powernow_table[i].frequency = find_khz_freq_from_fid(fid);
+		index = fid | (vid<<8);
+		powernow_table[i].index = index;
+
+		freq = find_khz_freq_from_fid(fid);
+		powernow_table[i].frequency = freq;
 
 		/* verify frequency is OK */
-		if ((powernow_table[i].frequency > (MAX_FREQ * 1000)) ||
-			(powernow_table[i].frequency < (MIN_FREQ * 1000))) {
-			dprintk("invalid freq %u kHz, ignoring\n", powernow_table[i].frequency);
-			powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+		if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
+			dprintk("invalid freq %u kHz, ignoring\n", freq);
+			invalidate_entry(data, i);
 			continue;
 		}
 
-		/* verify voltage is OK - BIOSs are using "off" to indicate invalid */
+		/* verify voltage is OK -
+		 * BIOSs are using "off" to indicate invalid */
 		if (vid == VID_OFF) {
 			dprintk("invalid vid %u, ignoring\n", vid);
-			powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+			invalidate_entry(data, i);
 			continue;
 		}
 
 		/* verify only 1 entry from the lo frequency table */
 		if (fid < HI_FID_TABLE_BOTTOM) {
 			if (cntlofreq) {
-				/* if both entries are the same, ignore this one ... */
-				if ((powernow_table[i].frequency != powernow_table[cntlofreq].frequency) ||
-				    (powernow_table[i].index != powernow_table[cntlofreq].index)) {
-					printk(KERN_ERR PFX "Too many lo freq table entries\n");
+				/* if both entries are the same,
+				 * ignore this one ... */
+				if ((freq != powernow_table[cntlofreq].frequency) ||
+				    (index != powernow_table[cntlofreq].index)) {
+					printk(KERN_ERR PFX
+						"Too many lo freq table "
+						"entries\n");
 					return 1;
 				}
 
-				dprintk("double low frequency table entry, ignoring it.\n");
-				powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+				dprintk("double low frequency table entry, "
+						"ignoring it.\n");
+				invalidate_entry(data, i);
 				continue;
 			} else
 				cntlofreq = i;
 		}
 
-		if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) {
-			printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n",
-				powernow_table[i].frequency,
-				(unsigned int) (data->acpi_data.states[i].core_frequency * 1000));
-			powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+		if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
+			printk(KERN_INFO PFX "invalid freq entries "
+				"%u kHz vs. %u kHz\n", freq,
+				(unsigned int)
+				(data->acpi_data.states[i].core_frequency
+				 * 1000));
+			invalidate_entry(data, i);
 			continue;
 		}
 	}
@@ -935,7 +1011,8 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
 static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
 {
 	if (data->acpi_data.state_count)
-		acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
+		acpi_processor_unregister_performance(&data->acpi_data,
+				data->cpu);
 	free_cpumask_var(data->acpi_data.shared_cpu_map);
 }
 
@@ -953,15 +1030,9 @@ static int get_transition_latency(struct powernow_k8_data *data)
 	return 1000 * max_latency;
 }
 
-#else
-static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
-static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
-static int get_transition_latency(struct powernow_k8_data *data) { return 0; }
-#endif /* CONFIG_X86_POWERNOW_K8_ACPI */
-
 /* Take a frequency, and issue the fid/vid transition command */
-static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned int index)
+static int transition_frequency_fidvid(struct powernow_k8_data *data,
+		unsigned int index)
 {
 	u32 fid = 0;
 	u32 vid = 0;
@@ -989,7 +1060,8 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
 		return 0;
 	}
 
-	if ((fid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) {
+	if ((fid < HI_FID_TABLE_BOTTOM) &&
+	    (data->currfid < HI_FID_TABLE_BOTTOM)) {
 		printk(KERN_ERR PFX
 		       "ignoring illegal change in lo freq table-%x to 0x%x\n",
 		       data->currfid, fid);
@@ -1017,7 +1089,8 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
 }
 
 /* Take a frequency, and issue the hardware pstate transition command */
-static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned int index)
+static int transition_frequency_pstate(struct powernow_k8_data *data,
+		unsigned int index)
 {
 	u32 pstate = 0;
 	int res, i;
@@ -1029,7 +1102,8 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
 	pstate = index & HW_PSTATE_MASK;
 	if (pstate > data->max_hw_pstate)
 		return 0;
-	freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
+	freqs.old = find_khz_freq_from_pstate(data->powernow_table,
+			data->currpstate);
 	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
 
 	for_each_cpu_mask_nr(i, *(data->available_cores)) {
@@ -1048,7 +1122,8 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
 }
 
 /* Driver entry point to switch to the target frequency */
-static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
+static int powernowk8_target(struct cpufreq_policy *pol,
+		unsigned targfreq, unsigned relation)
 {
 	cpumask_t oldmask;
 	struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
@@ -1087,14 +1162,18 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
 		dprintk("targ: curr fid 0x%x, vid 0x%x\n",
 		data->currfid, data->currvid);
 
-		if ((checkvid != data->currvid) || (checkfid != data->currfid)) {
+		if ((checkvid != data->currvid) ||
+		    (checkfid != data->currfid)) {
 			printk(KERN_INFO PFX
-				"error - out of sync, fix 0x%x 0x%x, vid 0x%x 0x%x\n",
-				checkfid, data->currfid, checkvid, data->currvid);
+				"error - out of sync, fix 0x%x 0x%x, "
+				"vid 0x%x 0x%x\n",
+				checkfid, data->currfid,
+				checkvid, data->currvid);
 		}
 	}
 
-	if (cpufreq_frequency_table_target(pol, data->powernow_table, targfreq, relation, &newstate))
+	if (cpufreq_frequency_table_target(pol, data->powernow_table,
+				targfreq, relation, &newstate))
 		goto err_out;
 
 	mutex_lock(&fidvid_mutex);
@@ -1114,7 +1193,8 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
 	mutex_unlock(&fidvid_mutex);
 
 	if (cpu_family == CPU_HW_PSTATE)
-		pol->cur = find_khz_freq_from_pstate(data->powernow_table, newstate);
+		pol->cur = find_khz_freq_from_pstate(data->powernow_table,
+				newstate);
 	else
 		pol->cur = find_khz_freq_from_fid(data->currfid);
 	ret = 0;
@@ -1141,6 +1221,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 	struct powernow_k8_data *data;
 	cpumask_t oldmask;
 	int rc;
+	static int print_once;
 
 	if (!cpu_online(pol->cpu))
 		return -ENODEV;
@@ -1163,33 +1244,31 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 		 * an UP version, and is deprecated by AMD.
 		 */
 		if (num_online_cpus() != 1) {
-#ifndef CONFIG_ACPI_PROCESSOR
-			printk(KERN_ERR PFX "ACPI Processor support is required "
-			       "for SMP systems but is absent. Please load the "
-			       "ACPI Processor module before starting this "
-			       "driver.\n");
-#else
-			printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide"
-			       " ACPI _PSS objects in a way that Linux "
-			       "understands. Please report this to the Linux "
-			       "ACPI maintainers and complain to your BIOS "
-			       "vendor.\n");
-#endif
-			kfree(data);
-			return -ENODEV;
+			/*
+			 * Replace this one with print_once as soon as such a
+			 * thing gets introduced
+			 */
+			if (!print_once) {
+				WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS "
+					"does not provide ACPI _PSS objects "
+					"in a way that Linux understands. "
+					"Please report this to the Linux ACPI"
+					" maintainers and complain to your "
+					"BIOS vendor.\n");
+				print_once++;
+			}
+			goto err_out;
 		}
 		if (pol->cpu != 0) {
 			printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
 			       "CPU other than CPU0. Complain to your BIOS "
 			       "vendor.\n");
-			kfree(data);
-			return -ENODEV;
+			goto err_out;
 		}
 		rc = find_psb_table(data);
-		if (rc) {
-			kfree(data);
-			return -ENODEV;
-		}
+		if (rc)
+			goto err_out;
+
 		/* Take a crude guess here.
 		 * That guess was in microseconds, so multiply with 1000 */
 		pol->cpuinfo.transition_latency = (
@@ -1204,16 +1283,16 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 
 	if (smp_processor_id() != pol->cpu) {
 		printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
-		goto err_out;
+		goto err_out_unmask;
 	}
 
 	if (pending_bit_stuck()) {
 		printk(KERN_ERR PFX "failing init, change pending bit set\n");
-		goto err_out;
+		goto err_out_unmask;
 	}
 
 	if (query_current_values_with_pending_wait(data))
-		goto err_out;
+		goto err_out_unmask;
 
 	if (cpu_family == CPU_OPTERON)
 		fidvid_msr_init();
@@ -1224,11 +1303,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 	if (cpu_family == CPU_HW_PSTATE)
 		cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
 	else
-		cpumask_copy(pol->cpus, &per_cpu(cpu_core_map, pol->cpu));
+		cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
 	data->available_cores = pol->cpus;
 
 	if (cpu_family == CPU_HW_PSTATE)
-		pol->cur = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
+		pol->cur = find_khz_freq_from_pstate(data->powernow_table,
+				data->currpstate);
 	else
 		pol->cur = find_khz_freq_from_fid(data->currfid);
 	dprintk("policy current frequency %d kHz\n", pol->cur);
@@ -1245,7 +1325,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 	cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
 
 	if (cpu_family == CPU_HW_PSTATE)
-		dprintk("cpu_init done, current pstate 0x%x\n", data->currpstate);
+		dprintk("cpu_init done, current pstate 0x%x\n",
+				data->currpstate);
 	else
 		dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
 			data->currfid, data->currvid);
@@ -1254,15 +1335,16 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 
 	return 0;
 
-err_out:
+err_out_unmask:
 	set_cpus_allowed_ptr(current, &oldmask);
 	powernow_k8_cpu_exit_acpi(data);
 
+err_out:
 	kfree(data);
 	return -ENODEV;
 }
 
-static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
+static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
 {
 	struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
 
@@ -1279,14 +1361,14 @@ static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
 	return 0;
 }
 
-static unsigned int powernowk8_get (unsigned int cpu)
+static unsigned int powernowk8_get(unsigned int cpu)
 {
 	struct powernow_k8_data *data;
 	cpumask_t oldmask = current->cpus_allowed;
 	unsigned int khz = 0;
 	unsigned int first;
 
-	first = first_cpu(per_cpu(cpu_core_map, cpu));
+	first = cpumask_first(cpu_core_mask(cpu));
 	data = per_cpu(powernow_data, first);
 
 	if (!data)
@@ -1315,7 +1397,7 @@ out:
 	return khz;
 }
 
-static struct freq_attr* powernow_k8_attr[] = {
+static struct freq_attr *powernow_k8_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -1360,7 +1442,8 @@ static void __exit powernowk8_exit(void)
 	cpufreq_unregister_driver(&cpufreq_amd64_driver);
 }
 
-MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>");
+MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
+		"Mark Langsdorf <mark.langsdorf@amd.com>");
 MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
 MODULE_LICENSE("GPL");
 
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 8ecc75b6c7c..6c6698feade 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -45,11 +45,10 @@ struct powernow_k8_data {
 	 * frequency is in kHz */
 	struct cpufreq_frequency_table  *powernow_table;
 
-#ifdef CONFIG_X86_POWERNOW_K8_ACPI
 	/* the acpi table needs to be kept. it's only available if ACPI was
 	 * used to determine valid frequency/vid/fid states */
 	struct acpi_processor_performance acpi_data;
-#endif
+
 	/* we need to keep track of associated cores, but let cpufreq
 	 * handle hotplug events - so just point at cpufreq pol->cpus
 	 * structure */
@@ -222,10 +221,8 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
 
 static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
 
-#ifdef CONFIG_X86_POWERNOW_K8_ACPI
 static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
 static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
-#endif
 
 #ifdef CONFIG_SMP
 static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
index 42da9bd677d..435a996a613 100644
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
@@ -19,17 +19,19 @@
 
 #include <linux/delay.h>
 #include <linux/cpufreq.h>
+#include <linux/timex.h>
+#include <linux/io.h>
 
 #include <asm/msr.h>
-#include <asm/timex.h>
-#include <asm/io.h>
 
 #define MMCR_BASE	0xfffef000	/* The default base address */
 #define OFFS_CPUCTL	0x2   /* CPU Control Register */
 
 static __u8 __iomem *cpuctl;
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "sc520_freq", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"sc520_freq", msg)
+#define PFX "sc520_freq: "
 
 static struct cpufreq_frequency_table sc520_freq_table[] = {
 	{0x01,	100000},
@@ -43,7 +45,8 @@ static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
 
 	switch (clockspeed_reg & 0x03) {
 	default:
-		printk(KERN_ERR "sc520_freq: error: cpuctl register has unexpected value %02x\n", clockspeed_reg);
+		printk(KERN_ERR PFX "error: cpuctl register has unexpected "
+				"value %02x\n", clockspeed_reg);
 	case 0x01:
 		return 100000;
 	case 0x02:
@@ -51,7 +54,7 @@ static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
 	}
 }
 
-static void sc520_freq_set_cpu_state (unsigned int state)
+static void sc520_freq_set_cpu_state(unsigned int state)
 {
 
 	struct cpufreq_freqs	freqs;
@@ -76,18 +79,19 @@ static void sc520_freq_set_cpu_state (unsigned int state)
 	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 };
 
-static int sc520_freq_verify (struct cpufreq_policy *policy)
+static int sc520_freq_verify(struct cpufreq_policy *policy)
 {
 	return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
 }
 
-static int sc520_freq_target (struct cpufreq_policy *policy,
+static int sc520_freq_target(struct cpufreq_policy *policy,
 			    unsigned int target_freq,
 			    unsigned int relation)
 {
 	unsigned int newstate = 0;
 
-	if (cpufreq_frequency_table_target(policy, sc520_freq_table, target_freq, relation, &newstate))
+	if (cpufreq_frequency_table_target(policy, sc520_freq_table,
+				target_freq, relation, &newstate))
 		return -EINVAL;
 
 	sc520_freq_set_cpu_state(newstate);
@@ -116,7 +120,7 @@ static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
 
 	result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
 	if (result)
-		return (result);
+		return result;
 
 	cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
 
@@ -131,7 +135,7 @@ static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
 }
 
 
-static struct freq_attr* sc520_freq_attr[] = {
+static struct freq_attr *sc520_freq_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -155,13 +159,13 @@ static int __init sc520_freq_init(void)
 	int err;
 
 	/* Test if we have the right hardware */
-	if(c->x86_vendor != X86_VENDOR_AMD ||
-				c->x86 != 4 || c->x86_model != 9) {
+	if (c->x86_vendor != X86_VENDOR_AMD ||
+	    c->x86 != 4 || c->x86_model != 9) {
 		dprintk("no Elan SC520 processor found!\n");
 		return -ENODEV;
 	}
 	cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
-	if(!cpuctl) {
+	if (!cpuctl) {
 		printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
 		return -ENOMEM;
 	}
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index dedc1e98f16..016c1a4fa3f 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev;
 
 /* speedstep_processor
  */
-static unsigned int speedstep_processor = 0;
+static unsigned int speedstep_processor;
 
 static u32 pmbase;
 
@@ -54,7 +54,8 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
 };
 
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-ich", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"speedstep-ich", msg)
 
 
 /**
@@ -62,7 +63,7 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
  *
  * Returns: -ENODEV if no register could be found
  */
-static int speedstep_find_register (void)
+static int speedstep_find_register(void)
 {
 	if (!speedstep_chipset_dev)
 		return -ENODEV;
@@ -90,7 +91,7 @@ static int speedstep_find_register (void)
  *
  *   Tries to change the SpeedStep state.
  */
-static void speedstep_set_state (unsigned int state)
+static void speedstep_set_state(unsigned int state)
 {
 	u8 pm2_blk;
 	u8 value;
@@ -133,11 +134,11 @@ static void speedstep_set_state (unsigned int state)
 
 	dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
 
-	if (state == (value & 0x1)) {
-		dprintk("change to %u MHz succeeded\n", (speedstep_get_processor_frequency(speedstep_processor) / 1000));
-	} else {
-		printk (KERN_ERR "cpufreq: change failed - I/O error\n");
-	}
+	if (state == (value & 0x1))
+		dprintk("change to %u MHz succeeded\n",
+			speedstep_get_frequency(speedstep_processor) / 1000);
+	else
+		printk(KERN_ERR "cpufreq: change failed - I/O error\n");
 
 	return;
 }
@@ -149,7 +150,7 @@ static void speedstep_set_state (unsigned int state)
  *   Tries to activate the SpeedStep status and control registers.
  * Returns -EINVAL on an unsupported chipset, and zero on success.
  */
-static int speedstep_activate (void)
+static int speedstep_activate(void)
 {
 	u16 value = 0;
 
@@ -175,20 +176,18 @@ static int speedstep_activate (void)
  * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
  * chipset, or zero on failure.
  */
-static unsigned int speedstep_detect_chipset (void)
+static unsigned int speedstep_detect_chipset(void)
 {
 	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
 			      PCI_DEVICE_ID_INTEL_82801DB_12,
-			      PCI_ANY_ID,
-			      PCI_ANY_ID,
+			      PCI_ANY_ID, PCI_ANY_ID,
 			      NULL);
 	if (speedstep_chipset_dev)
 		return 4; /* 4-M */
 
 	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
 			      PCI_DEVICE_ID_INTEL_82801CA_12,
-			      PCI_ANY_ID,
-			      PCI_ANY_ID,
+			      PCI_ANY_ID, PCI_ANY_ID,
 			      NULL);
 	if (speedstep_chipset_dev)
 		return 3; /* 3-M */
@@ -196,8 +195,7 @@ static unsigned int speedstep_detect_chipset (void)
 
 	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
 			      PCI_DEVICE_ID_INTEL_82801BA_10,
-			      PCI_ANY_ID,
-			      PCI_ANY_ID,
+			      PCI_ANY_ID, PCI_ANY_ID,
 			      NULL);
 	if (speedstep_chipset_dev) {
 		/* speedstep.c causes lockups on Dell Inspirons 8000 and
@@ -208,8 +206,7 @@ static unsigned int speedstep_detect_chipset (void)
 
 		hostbridge  = pci_get_subsys(PCI_VENDOR_ID_INTEL,
 			      PCI_DEVICE_ID_INTEL_82815_MC,
-			      PCI_ANY_ID,
-			      PCI_ANY_ID,
+			      PCI_ANY_ID, PCI_ANY_ID,
 			      NULL);
 
 		if (!hostbridge)
@@ -236,7 +233,7 @@ static unsigned int _speedstep_get(const struct cpumask *cpus)
 
 	cpus_allowed = current->cpus_allowed;
 	set_cpus_allowed_ptr(current, cpus);
-	speed = speedstep_get_processor_frequency(speedstep_processor);
+	speed = speedstep_get_frequency(speedstep_processor);
 	set_cpus_allowed_ptr(current, &cpus_allowed);
 	dprintk("detected %u kHz as current frequency\n", speed);
 	return speed;
@@ -251,11 +248,12 @@ static unsigned int speedstep_get(unsigned int cpu)
  * speedstep_target - set a new CPUFreq policy
  * @policy: new policy
  * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
+ * @relation: how that frequency relates to achieved frequency
+ *	(CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
  *
  * Sets a new CPUFreq policy.
  */
-static int speedstep_target (struct cpufreq_policy *policy,
+static int speedstep_target(struct cpufreq_policy *policy,
 			     unsigned int target_freq,
 			     unsigned int relation)
 {
@@ -264,7 +262,8 @@ static int speedstep_target (struct cpufreq_policy *policy,
 	cpumask_t cpus_allowed;
 	int i;
 
-	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
+	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
+				target_freq, relation, &newstate))
 		return -EINVAL;
 
 	freqs.old = _speedstep_get(policy->cpus);
@@ -308,7 +307,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
  * Limit must be within speedstep_low_freq and speedstep_high_freq, with
  * at least one border included.
  */
-static int speedstep_verify (struct cpufreq_policy *policy)
+static int speedstep_verify(struct cpufreq_policy *policy)
 {
 	return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
 }
@@ -322,7 +321,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
 
 	/* only run on CPU to be set, or on its sibling */
 #ifdef CONFIG_SMP
-	cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu));
+	cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
 #endif
 
 	cpus_allowed = current->cpus_allowed;
@@ -344,7 +343,8 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
 		return -EIO;
 
 	dprintk("currently at %s speed setting - %i MHz\n",
-		(speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high",
+		(speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
+		? "low" : "high",
 		(speed / 1000));
 
 	/* cpuinfo and default policy values */
@@ -352,9 +352,9 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
 
 	result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
 	if (result)
-		return (result);
+		return result;
 
-        cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
+	cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
 
 	return 0;
 }
@@ -366,7 +366,7 @@ static int speedstep_cpu_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static struct freq_attr* speedstep_attr[] = {
+static struct freq_attr *speedstep_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -396,13 +396,15 @@ static int __init speedstep_init(void)
 	/* detect processor */
 	speedstep_processor = speedstep_detect_processor();
 	if (!speedstep_processor) {
-		dprintk("Intel(R) SpeedStep(TM) capable processor not found\n");
+		dprintk("Intel(R) SpeedStep(TM) capable processor "
+				"not found\n");
 		return -ENODEV;
 	}
 
 	/* detect chipset */
 	if (!speedstep_detect_chipset()) {
-		dprintk("Intel(R) SpeedStep(TM) for this chipset not (yet) available.\n");
+		dprintk("Intel(R) SpeedStep(TM) for this chipset not "
+				"(yet) available.\n");
 		return -ENODEV;
 	}
 
@@ -431,9 +433,11 @@ static void __exit speedstep_exit(void)
 }
 
 
-MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
+		"Dominik Brodowski <linux@brodo.de>");
+MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
+		"with ICH-M southbridges.");
+MODULE_LICENSE("GPL");
 
 module_init(speedstep_init);
 module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index cdac7d62369..2e3c6862657 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -16,12 +16,16 @@
 #include <linux/slab.h>
 
 #include <asm/msr.h>
+#include <asm/tsc.h>
 #include "speedstep-lib.h"
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-lib", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"speedstep-lib", msg)
+
+#define PFX "speedstep-lib: "
 
 #ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-static int relaxed_check = 0;
+static int relaxed_check;
 #else
 #define relaxed_check 0
 #endif
@@ -30,14 +34,14 @@ static int relaxed_check = 0;
  *                   GET PROCESSOR CORE SPEED IN KHZ                 *
  *********************************************************************/
 
-static unsigned int pentium3_get_frequency (unsigned int processor)
+static unsigned int pentium3_get_frequency(unsigned int processor)
 {
-        /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
+	/* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
 	struct {
 		unsigned int ratio;	/* Frequency Multiplier (x10) */
 		u8 bitmap;		/* power on configuration bits
 					[27, 25:22] (in MSR 0x2a) */
-	} msr_decode_mult [] = {
+	} msr_decode_mult[] = {
 		{ 30, 0x01 },
 		{ 35, 0x05 },
 		{ 40, 0x02 },
@@ -52,7 +56,7 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
 		{ 85, 0x26 },
 		{ 90, 0x20 },
 		{ 100, 0x2b },
-		{ 0, 0xff }     /* error or unknown value */
+		{ 0, 0xff }	/* error or unknown value */
 	};
 
 	/* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
@@ -60,7 +64,7 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
 		unsigned int value;	/* Front Side Bus speed in MHz */
 		u8 bitmap;		/* power on configuration bits [18: 19]
 					(in MSR 0x2a) */
-	} msr_decode_fsb [] = {
+	} msr_decode_fsb[] = {
 		{  66, 0x0 },
 		{ 100, 0x2 },
 		{ 133, 0x1 },
@@ -85,7 +89,7 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
 	}
 
 	/* decode the multiplier */
-	if (processor == SPEEDSTEP_PROCESSOR_PIII_C_EARLY) {
+	if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
 		dprintk("workaround for early PIIIs\n");
 		msr_lo &= 0x03c00000;
 	} else
@@ -97,9 +101,10 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
 		j++;
 	}
 
-	dprintk("speed is %u\n", (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
+	dprintk("speed is %u\n",
+		(msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
 
-	return (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100);
+	return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
 }
 
 
@@ -112,20 +117,23 @@ static unsigned int pentiumM_get_frequency(void)
 
 	/* see table B-2 of 24547212.pdf */
 	if (msr_lo & 0x00040000) {
-		printk(KERN_DEBUG "speedstep-lib: PM - invalid FSB: 0x%x 0x%x\n", msr_lo, msr_tmp);
+		printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
+				msr_lo, msr_tmp);
 		return 0;
 	}
 
 	msr_tmp = (msr_lo >> 22) & 0x1f;
-	dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * 100 * 1000));
+	dprintk("bits 22-26 are 0x%x, speed is %u\n",
+			msr_tmp, (msr_tmp * 100 * 1000));
 
-	return (msr_tmp * 100 * 1000);
+	return msr_tmp * 100 * 1000;
 }
 
 static unsigned int pentium_core_get_frequency(void)
 {
 	u32 fsb = 0;
 	u32 msr_lo, msr_tmp;
+	int ret;
 
 	rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
 	/* see table B-2 of 25366920.pdf */
@@ -153,12 +161,15 @@ static unsigned int pentium_core_get_frequency(void)
 	}
 
 	rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
-	dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
+	dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
+			msr_lo, msr_tmp);
 
 	msr_tmp = (msr_lo >> 22) & 0x1f;
-	dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * fsb));
+	dprintk("bits 22-26 are 0x%x, speed is %u\n",
+			msr_tmp, (msr_tmp * fsb));
 
-	return (msr_tmp * fsb);
+	ret = (msr_tmp * fsb);
+	return ret;
 }
 
 
@@ -167,6 +178,16 @@ static unsigned int pentium4_get_frequency(void)
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 	u32 msr_lo, msr_hi, mult;
 	unsigned int fsb = 0;
+	unsigned int ret;
+	u8 fsb_code;
+
+	/* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
+	 * to System Bus Frequency Ratio Field in the Processor Frequency
+	 * Configuration Register of the MSR. Therefore the current
+	 * frequency cannot be calculated and has to be measured.
+	 */
+	if (c->x86_model < 2)
+		return cpu_khz;
 
 	rdmsr(0x2c, msr_lo, msr_hi);
 
@@ -177,62 +198,61 @@ static unsigned int pentium4_get_frequency(void)
 	 * revision #12 in Table B-1: MSRs in the Pentium 4 and
 	 * Intel Xeon Processors, on page B-4 and B-5.
 	 */
-	if (c->x86_model < 2)
+	fsb_code = (msr_lo >> 16) & 0x7;
+	switch (fsb_code) {
+	case 0:
 		fsb = 100 * 1000;
-	else {
-		u8 fsb_code = (msr_lo >> 16) & 0x7;
-		switch (fsb_code) {
-		case 0:
-			fsb = 100 * 1000;
-			break;
-		case 1:
-			fsb = 13333 * 10;
-			break;
-		case 2:
-			fsb = 200 * 1000;
-			break;
-		}
+		break;
+	case 1:
+		fsb = 13333 * 10;
+		break;
+	case 2:
+		fsb = 200 * 1000;
+		break;
 	}
 
 	if (!fsb)
-		printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n");
+		printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
+				"Please send an e-mail to <linux@brodo.de>\n");
 
 	/* Multiplier. */
 	mult = msr_lo >> 24;
 
-	dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult));
+	dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
+			fsb, mult, (fsb * mult));
 
-	return (fsb * mult);
+	ret = (fsb * mult);
+	return ret;
 }
 
 
-unsigned int speedstep_get_processor_frequency(unsigned int processor)
+unsigned int speedstep_get_frequency(unsigned int processor)
 {
 	switch (processor) {
-	case SPEEDSTEP_PROCESSOR_PCORE:
+	case SPEEDSTEP_CPU_PCORE:
 		return pentium_core_get_frequency();
-	case SPEEDSTEP_PROCESSOR_PM:
+	case SPEEDSTEP_CPU_PM:
 		return pentiumM_get_frequency();
-	case SPEEDSTEP_PROCESSOR_P4D:
-	case SPEEDSTEP_PROCESSOR_P4M:
+	case SPEEDSTEP_CPU_P4D:
+	case SPEEDSTEP_CPU_P4M:
 		return pentium4_get_frequency();
-	case SPEEDSTEP_PROCESSOR_PIII_T:
-	case SPEEDSTEP_PROCESSOR_PIII_C:
-	case SPEEDSTEP_PROCESSOR_PIII_C_EARLY:
+	case SPEEDSTEP_CPU_PIII_T:
+	case SPEEDSTEP_CPU_PIII_C:
+	case SPEEDSTEP_CPU_PIII_C_EARLY:
 		return pentium3_get_frequency(processor);
 	default:
 		return 0;
 	};
 	return 0;
 }
-EXPORT_SYMBOL_GPL(speedstep_get_processor_frequency);
+EXPORT_SYMBOL_GPL(speedstep_get_frequency);
 
 
 /*********************************************************************
  *                 DETECT SPEEDSTEP-CAPABLE PROCESSOR                *
  *********************************************************************/
 
-unsigned int speedstep_detect_processor (void)
+unsigned int speedstep_detect_processor(void)
 {
 	struct cpuinfo_x86 *c = &cpu_data(0);
 	u32 ebx, msr_lo, msr_hi;
@@ -261,7 +281,7 @@ unsigned int speedstep_detect_processor (void)
 			 * sample has ebx = 0x0f, production has 0x0e.
 			 */
 			if ((ebx == 0x0e) || (ebx == 0x0f))
-				return SPEEDSTEP_PROCESSOR_P4M;
+				return SPEEDSTEP_CPU_P4M;
 			break;
 		case 7:
 			/*
@@ -272,7 +292,7 @@ unsigned int speedstep_detect_processor (void)
 			 * samples are only of B-stepping...
 			 */
 			if (ebx == 0x0e)
-				return SPEEDSTEP_PROCESSOR_P4M;
+				return SPEEDSTEP_CPU_P4M;
 			break;
 		case 9:
 			/*
@@ -288,10 +308,13 @@ unsigned int speedstep_detect_processor (void)
 			 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
 			 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
 			 * also, M-P4M HTs have ebx=0x8, too
-			 * For now, they are distinguished by the model_id string
+			 * For now, they are distinguished by the model_id
+			 * string
 			 */
-			if ((ebx == 0x0e) || (strstr(c->x86_model_id,"Mobile Intel(R) Pentium(R) 4") != NULL))
-				return SPEEDSTEP_PROCESSOR_P4M;
+			if ((ebx == 0x0e) ||
+				(strstr(c->x86_model_id,
+				    "Mobile Intel(R) Pentium(R) 4") != NULL))
+				return SPEEDSTEP_CPU_P4M;
 			break;
 		default:
 			break;
@@ -301,7 +324,8 @@ unsigned int speedstep_detect_processor (void)
 
 	switch (c->x86_model) {
 	case 0x0B: /* Intel PIII [Tualatin] */
-		/* cpuid_ebx(1) is 0x04 for desktop PIII, 0x06 for mobile PIII-M */
+		/* cpuid_ebx(1) is 0x04 for desktop PIII,
+		 * 0x06 for mobile PIII-M */
 		ebx = cpuid_ebx(0x00000001);
 		dprintk("ebx is %x\n", ebx);
 
@@ -313,14 +337,15 @@ unsigned int speedstep_detect_processor (void)
 		/* So far all PIII-M processors support SpeedStep. See
 		 * Intel's 24540640.pdf of June 2003
 		 */
-		return SPEEDSTEP_PROCESSOR_PIII_T;
+		return SPEEDSTEP_CPU_PIII_T;
 
 	case 0x08: /* Intel PIII [Coppermine] */
 
 		/* all mobile PIII Coppermines have FSB 100 MHz
 		 * ==> sort out a few desktop PIIIs. */
 		rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
-		dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", msr_lo, msr_hi);
+		dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
+				msr_lo, msr_hi);
 		msr_lo &= 0x00c0000;
 		if (msr_lo != 0x0080000)
 			return 0;
@@ -332,13 +357,15 @@ unsigned int speedstep_detect_processor (void)
 		 * bit 56 or 57 is set
 		 */
 		rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
-		dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", msr_lo, msr_hi);
-		if ((msr_hi & (1<<18)) && (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
+		dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
+				msr_lo, msr_hi);
+		if ((msr_hi & (1<<18)) &&
+		    (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
 			if (c->x86_mask == 0x01) {
 				dprintk("early PIII version\n");
-				return SPEEDSTEP_PROCESSOR_PIII_C_EARLY;
+				return SPEEDSTEP_CPU_PIII_C_EARLY;
 			} else
-				return SPEEDSTEP_PROCESSOR_PIII_C;
+				return SPEEDSTEP_CPU_PIII_C;
 		}
 
 	default:
@@ -369,7 +396,7 @@ unsigned int speedstep_get_freqs(unsigned int processor,
 	dprintk("trying to determine both speeds\n");
 
 	/* get current speed */
-	prev_speed = speedstep_get_processor_frequency(processor);
+	prev_speed = speedstep_get_frequency(processor);
 	if (!prev_speed)
 		return -EIO;
 
@@ -379,7 +406,7 @@ unsigned int speedstep_get_freqs(unsigned int processor,
 
 	/* switch to low state */
 	set_state(SPEEDSTEP_LOW);
-	*low_speed = speedstep_get_processor_frequency(processor);
+	*low_speed = speedstep_get_frequency(processor);
 	if (!*low_speed) {
 		ret = -EIO;
 		goto out;
@@ -398,7 +425,7 @@ unsigned int speedstep_get_freqs(unsigned int processor,
 	if (transition_latency)
 		do_gettimeofday(&tv2);
 
-	*high_speed = speedstep_get_processor_frequency(processor);
+	*high_speed = speedstep_get_frequency(processor);
 	if (!*high_speed) {
 		ret = -EIO;
 		goto out;
@@ -426,9 +453,12 @@ unsigned int speedstep_get_freqs(unsigned int processor,
 		/* check if the latency measurement is too high or too low
 		 * and set it to a safe value (500uSec) in that case
 		 */
-		if (*transition_latency > 10000000 || *transition_latency < 50000) {
-			printk (KERN_WARNING "speedstep: frequency transition measured seems out of "
-					"range (%u nSec), falling back to a safe one of %u nSec.\n",
+		if (*transition_latency > 10000000 ||
+		    *transition_latency < 50000) {
+			printk(KERN_WARNING PFX "frequency transition "
+					"measured seems out of range (%u "
+					"nSec), falling back to a safe one of"
+					"%u nSec.\n",
 					*transition_latency, 500000);
 			*transition_latency = 500000;
 		}
@@ -436,15 +466,16 @@ unsigned int speedstep_get_freqs(unsigned int processor,
 
 out:
 	local_irq_restore(flags);
-	return (ret);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(speedstep_get_freqs);
 
 #ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
 module_param(relaxed_check, int, 0444);
-MODULE_PARM_DESC(relaxed_check, "Don't do all checks for speedstep capability.");
+MODULE_PARM_DESC(relaxed_check,
+		"Don't do all checks for speedstep capability.");
 #endif
 
-MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION ("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
+MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index b11bcc608ca..2b6c04e5a30 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -12,17 +12,17 @@
 
 /* processors */
 
-#define SPEEDSTEP_PROCESSOR_PIII_C_EARLY	0x00000001  /* Coppermine core */
-#define SPEEDSTEP_PROCESSOR_PIII_C		0x00000002  /* Coppermine core */
-#define SPEEDSTEP_PROCESSOR_PIII_T		0x00000003  /* Tualatin core */
-#define SPEEDSTEP_PROCESSOR_P4M			0x00000004  /* P4-M  */
+#define SPEEDSTEP_CPU_PIII_C_EARLY	0x00000001  /* Coppermine core */
+#define SPEEDSTEP_CPU_PIII_C		0x00000002  /* Coppermine core */
+#define SPEEDSTEP_CPU_PIII_T		0x00000003  /* Tualatin core */
+#define SPEEDSTEP_CPU_P4M		0x00000004  /* P4-M  */
 
 /* the following processors are not speedstep-capable and are not auto-detected
  * in speedstep_detect_processor(). However, their speed can be detected using
- * the speedstep_get_processor_frequency() call. */
-#define SPEEDSTEP_PROCESSOR_PM			0xFFFFFF03  /* Pentium M  */
-#define SPEEDSTEP_PROCESSOR_P4D			0xFFFFFF04  /* desktop P4  */
-#define SPEEDSTEP_PROCESSOR_PCORE		0xFFFFFF05  /* Core */
+ * the speedstep_get_frequency() call. */
+#define SPEEDSTEP_CPU_PM		0xFFFFFF03  /* Pentium M  */
+#define SPEEDSTEP_CPU_P4D		0xFFFFFF04  /* desktop P4  */
+#define SPEEDSTEP_CPU_PCORE		0xFFFFFF05  /* Core */
 
 /* speedstep states -- only two of them */
 
@@ -34,7 +34,7 @@
 extern unsigned int speedstep_detect_processor (void);
 
 /* detect the current speed (in khz) of the processor */
-extern unsigned int speedstep_get_processor_frequency(unsigned int processor);
+extern unsigned int speedstep_get_frequency(unsigned int processor);
 
 
 /* detect the low and high speeds of the processor. The callback
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 8a85c93bd62..befea088e4f 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -19,8 +19,8 @@
 #include <linux/cpufreq.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
+#include <linux/io.h>
 #include <asm/ist.h>
-#include <asm/io.h>
 
 #include "speedstep-lib.h"
 
@@ -30,12 +30,12 @@
  * If user gives it, these are used.
  *
  */
-static int smi_port = 0;
-static int smi_cmd = 0;
-static unsigned int smi_sig = 0;
+static int smi_port;
+static int smi_cmd;
+static unsigned int smi_sig;
 
 /* info about the processor */
-static unsigned int speedstep_processor = 0;
+static unsigned int speedstep_processor;
 
 /*
  * There are only two frequency states for each processor. Values
@@ -56,12 +56,13 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
  * of DMA activity going on? */
 #define SMI_TRIES 5
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-smi", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+		"speedstep-smi", msg)
 
 /**
  * speedstep_smi_ownership
  */
-static int speedstep_smi_ownership (void)
+static int speedstep_smi_ownership(void)
 {
 	u32 command, result, magic, dummy;
 	u32 function = GET_SPEEDSTEP_OWNER;
@@ -70,16 +71,18 @@ static int speedstep_smi_ownership (void)
 	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
 	magic = virt_to_phys(magic_data);
 
-	dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port);
+	dprintk("trying to obtain ownership with command %x at port %x\n",
+			command, smi_port);
 
 	__asm__ __volatile__(
 		"push %%ebp\n"
 		"out %%al, (%%dx)\n"
 		"pop %%ebp\n"
-		: "=D" (result), "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
-			"=S" (dummy)
+		: "=D" (result),
+		  "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
+		  "=S" (dummy)
 		: "a" (command), "b" (function), "c" (0), "d" (smi_port),
-			"D" (0), "S" (magic)
+		  "D" (0), "S" (magic)
 		: "memory"
 	);
 
@@ -97,10 +100,10 @@ static int speedstep_smi_ownership (void)
  * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
  * shows that the latter occurs if !(ist_info.event & 0xFFFF).
  */
-static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
+static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
 {
 	u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
-	u32 state=0;
+	u32 state = 0;
 	u32 function = GET_SPEEDSTEP_FREQS;
 
 	if (!(ist_info.event & 0xFFFF)) {
@@ -110,17 +113,25 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
 
 	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
 
-	dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port);
+	dprintk("trying to determine frequencies with command %x at port %x\n",
+			command, smi_port);
 
 	__asm__ __volatile__(
 		"push %%ebp\n"
 		"out %%al, (%%dx)\n"
 		"pop %%ebp"
-		: "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi), "=S" (dummy)
-		: "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0)
+		: "=a" (result),
+		  "=b" (high_mhz),
+		  "=c" (low_mhz),
+		  "=d" (state), "=D" (edi), "=S" (dummy)
+		: "a" (command),
+		  "b" (function),
+		  "c" (state),
+		  "d" (smi_port), "S" (0), "D" (0)
 	);
 
-	dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz);
+	dprintk("result %x, low_freq %u, high_freq %u\n",
+			result, low_mhz, high_mhz);
 
 	/* abort if results are obviously incorrect... */
 	if ((high_mhz + low_mhz) < 600)
@@ -137,26 +148,30 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
  * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
  *
  */
-static int speedstep_get_state (void)
+static int speedstep_get_state(void)
 {
-	u32 function=GET_SPEEDSTEP_STATE;
+	u32 function = GET_SPEEDSTEP_STATE;
 	u32 result, state, edi, command, dummy;
 
 	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
 
-	dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port);
+	dprintk("trying to determine current setting with command %x "
+		"at port %x\n", command, smi_port);
 
 	__asm__ __volatile__(
 		"push %%ebp\n"
 		"out %%al, (%%dx)\n"
 		"pop %%ebp\n"
-		: "=a" (result), "=b" (state), "=D" (edi), "=c" (dummy), "=d" (dummy), "=S" (dummy)
-		: "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0), "D" (0)
+		: "=a" (result),
+		  "=b" (state), "=D" (edi),
+		  "=c" (dummy), "=d" (dummy), "=S" (dummy)
+		: "a" (command), "b" (function), "c" (0),
+		  "d" (smi_port), "S" (0), "D" (0)
 	);
 
 	dprintk("state is %x, result is %x\n", state, result);
 
-	return (state & 1);
+	return state & 1;
 }
 
 
@@ -165,11 +180,11 @@ static int speedstep_get_state (void)
  * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
  *
  */
-static void speedstep_set_state (unsigned int state)
+static void speedstep_set_state(unsigned int state)
 {
 	unsigned int result = 0, command, new_state, dummy;
 	unsigned long flags;
-	unsigned int function=SET_SPEEDSTEP_STATE;
+	unsigned int function = SET_SPEEDSTEP_STATE;
 	unsigned int retry = 0;
 
 	if (state > 0x1)
@@ -180,11 +195,14 @@ static void speedstep_set_state (unsigned int state)
 
 	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
 
-	dprintk("trying to set frequency to state %u with command %x at port %x\n", state, command, smi_port);
+	dprintk("trying to set frequency to state %u "
+		"with command %x at port %x\n",
+		state, command, smi_port);
 
 	do {
 		if (retry) {
-			dprintk("retry %u, previous result %u, waiting...\n", retry, result);
+			dprintk("retry %u, previous result %u, waiting...\n",
+					retry, result);
 			mdelay(retry * 50);
 		}
 		retry++;
@@ -192,20 +210,26 @@ static void speedstep_set_state (unsigned int state)
 			"push %%ebp\n"
 			"out %%al, (%%dx)\n"
 			"pop %%ebp"
-			: "=b" (new_state), "=D" (result), "=c" (dummy), "=a" (dummy),
-				"=d" (dummy), "=S" (dummy)
-			: "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0)
+			: "=b" (new_state), "=D" (result),
+			  "=c" (dummy), "=a" (dummy),
+			  "=d" (dummy), "=S" (dummy)
+			: "a" (command), "b" (function), "c" (state),
+			  "d" (smi_port), "S" (0), "D" (0)
 			);
 	} while ((new_state != state) && (retry <= SMI_TRIES));
 
 	/* enable IRQs */
 	local_irq_restore(flags);
 
-	if (new_state == state) {
-		dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result);
-	} else {
-		printk(KERN_ERR "cpufreq: change to state %u failed with new_state %u and result %u\n", state, new_state, result);
-	}
+	if (new_state == state)
+		dprintk("change to %u MHz succeeded after %u tries "
+			"with result %u\n",
+			(speedstep_freqs[new_state].frequency / 1000),
+			retry, result);
+	else
+		printk(KERN_ERR "cpufreq: change to state %u "
+			"failed with new_state %u and result %u\n",
+			state, new_state, result);
 
 	return;
 }
@@ -219,13 +243,14 @@ static void speedstep_set_state (unsigned int state)
  *
  * Sets a new CPUFreq policy/freq.
  */
-static int speedstep_target (struct cpufreq_policy *policy,
+static int speedstep_target(struct cpufreq_policy *policy,
 			unsigned int target_freq, unsigned int relation)
 {
 	unsigned int newstate = 0;
 	struct cpufreq_freqs freqs;
 
-	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
+	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
+				target_freq, relation, &newstate))
 		return -EINVAL;
 
 	freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
@@ -250,7 +275,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
  * Limit must be within speedstep_low_freq and speedstep_high_freq, with
  * at least one border included.
  */
-static int speedstep_verify (struct cpufreq_policy *policy)
+static int speedstep_verify(struct cpufreq_policy *policy)
 {
 	return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
 }
@@ -259,7 +284,8 @@ static int speedstep_verify (struct cpufreq_policy *policy)
 static int speedstep_cpu_init(struct cpufreq_policy *policy)
 {
 	int result;
-	unsigned int speed,state;
+	unsigned int speed, state;
+	unsigned int *low, *high;
 
 	/* capability check */
 	if (policy->cpu != 0)
@@ -272,19 +298,23 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
 	}
 
 	/* detect low and high frequency */
-	result = speedstep_smi_get_freqs(&speedstep_freqs[SPEEDSTEP_LOW].frequency,
-				&speedstep_freqs[SPEEDSTEP_HIGH].frequency);
+	low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
+	high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
+
+	result = speedstep_smi_get_freqs(low, high);
 	if (result) {
-		/* fall back to speedstep_lib.c dection mechanism: try both states out */
-		dprintk("could not detect low and high frequencies by SMI call.\n");
+		/* fall back to speedstep_lib.c dection mechanism:
+		 * try both states out */
+		dprintk("could not detect low and high frequencies "
+				"by SMI call.\n");
 		result = speedstep_get_freqs(speedstep_processor,
-				&speedstep_freqs[SPEEDSTEP_LOW].frequency,
-				&speedstep_freqs[SPEEDSTEP_HIGH].frequency,
+				low, high,
 				NULL,
 				&speedstep_set_state);
 
 		if (result) {
-			dprintk("could not detect two different speeds -- aborting.\n");
+			dprintk("could not detect two different speeds"
+					" -- aborting.\n");
 			return result;
 		} else
 			dprintk("workaround worked.\n");
@@ -295,7 +325,8 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
 	speed = speedstep_freqs[state].frequency;
 
 	dprintk("currently at %s speed setting - %i MHz\n",
-		(speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high",
+		(speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
+		? "low" : "high",
 		(speed / 1000));
 
 	/* cpuinfo and default policy values */
@@ -304,7 +335,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
 
 	result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
 	if (result)
-		return (result);
+		return result;
 
 	cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
 
@@ -321,7 +352,7 @@ static unsigned int speedstep_get(unsigned int cpu)
 {
 	if (cpu)
 		return -ENODEV;
-	return speedstep_get_processor_frequency(speedstep_processor);
+	return speedstep_get_frequency(speedstep_processor);
 }
 
 
@@ -335,7 +366,7 @@ static int speedstep_resume(struct cpufreq_policy *policy)
 	return result;
 }
 
-static struct freq_attr* speedstep_attr[] = {
+static struct freq_attr *speedstep_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -364,21 +395,23 @@ static int __init speedstep_init(void)
 	speedstep_processor = speedstep_detect_processor();
 
 	switch (speedstep_processor) {
-	case SPEEDSTEP_PROCESSOR_PIII_T:
-	case SPEEDSTEP_PROCESSOR_PIII_C:
-	case SPEEDSTEP_PROCESSOR_PIII_C_EARLY:
+	case SPEEDSTEP_CPU_PIII_T:
+	case SPEEDSTEP_CPU_PIII_C:
+	case SPEEDSTEP_CPU_PIII_C_EARLY:
 		break;
 	default:
 		speedstep_processor = 0;
 	}
 
 	if (!speedstep_processor) {
-		dprintk ("No supported Intel CPU detected.\n");
+		dprintk("No supported Intel CPU detected.\n");
 		return -ENODEV;
 	}
 
-	dprintk("signature:0x%.8lx, command:0x%.8lx, event:0x%.8lx, perf_level:0x%.8lx.\n",
-		ist_info.signature, ist_info.command, ist_info.event, ist_info.perf_level);
+	dprintk("signature:0x%.8lx, command:0x%.8lx, "
+		"event:0x%.8lx, perf_level:0x%.8lx.\n",
+		ist_info.signature, ist_info.command,
+		ist_info.event, ist_info.perf_level);
 
 	/* Error if no IST-SMI BIOS or no PARM
 		 sig= 'ISGE' aka 'Intel Speedstep Gate E' */
@@ -416,17 +449,20 @@ static void __exit speedstep_exit(void)
 	cpufreq_unregister_driver(&speedstep_driver);
 }
 
-module_param(smi_port,  int, 0444);
-module_param(smi_cmd,   int, 0444);
-module_param(smi_sig,  uint, 0444);
+module_param(smi_port, int, 0444);
+module_param(smi_cmd,  int, 0444);
+module_param(smi_sig, uint, 0444);
 
-MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value -- Intel's default setting is 0xb2");
-MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value -- Intel's default setting is 0x82");
-MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the SMI interface.");
+MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
+		"-- Intel's default setting is 0xb2");
+MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
+		"-- Intel's default setting is 0x82");
+MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
+		"SMI interface.");
 
-MODULE_AUTHOR ("Hiroshi Miura");
-MODULE_DESCRIPTION ("Speedstep driver for IST applet SMI interface.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Hiroshi Miura");
+MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
+MODULE_LICENSE("GPL");
 
 module_init(speedstep_init);
 module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index ffd0f5ed071..593171e967e 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
  */
 static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
 
-static char Cx86_model[][9] __cpuinitdata = {
+static const char __cpuinitconst Cx86_model[][9] = {
 	"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
 	"M II ", "Unknown"
 };
-static char Cx486_name[][5] __cpuinitdata = {
+static const char __cpuinitconst Cx486_name[][5] = {
 	"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
 	"SRx2", "DRx2"
 };
-static char Cx486S_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486S_name[][4] = {
 	"S", "S2", "Se", "S2e"
 };
-static char Cx486D_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486D_name[][4] = {
 	"DX", "DX2", "?", "?", "?", "DX4"
 };
 static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
-static char cyrix_model_mult1[] __cpuinitdata = "12??43";
-static char cyrix_model_mult2[] __cpuinitdata = "12233445";
+static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
+static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
 
 /*
  * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
 	}
 }
 
-static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
 	.c_vendor	= "Cyrix",
 	.c_ident	= { "CyrixInstead" },
 	.c_early_init	= early_init_cyrix,
@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
 
 cpu_dev_register(cyrix_cpu_dev);
 
-static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
 	.c_vendor	= "NSC",
 	.c_ident	= { "Geode by NSC" },
 	.c_init		= init_nsc,
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 25c559ba8d5..7437fa133c0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -4,6 +4,7 @@
 #include <linux/string.h>
 #include <linux/bitops.h>
 #include <linux/smp.h>
+#include <linux/sched.h>
 #include <linux/thread_info.h>
 #include <linux/module.h>
 
@@ -13,6 +14,7 @@
 #include <asm/uaccess.h>
 #include <asm/ds.h>
 #include <asm/bugs.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/topology.h>
@@ -53,13 +55,23 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 		c->x86_cache_alignment = 128;
 #endif
 
+	/* CPUID workaround for 0F33/0F34 CPU */
+	if (c->x86 == 0xF && c->x86_model == 0x3
+	    && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+		c->x86_phys_bits = 36;
+
 	/*
 	 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
-	 * with P/T states and does not stop in deep C-states
+	 * with P/T states and does not stop in deep C-states.
+	 *
+	 * It is also reliable across cores and sockets. (but not across
+	 * cabinets - we turn it off in that case explicitly.)
 	 */
 	if (c->x86_power & (1 << 8)) {
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+		set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
+		sched_clock_stable = 1;
 	}
 
 	/*
@@ -110,6 +122,28 @@ static void __cpuinit trap_init_f00f_bug(void)
 }
 #endif
 
+static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* calling is from identify_secondary_cpu() ? */
+	if (c->cpu_index == boot_cpu_id)
+		return;
+
+	/*
+	 * Mask B, Pentium, but not Pentium MMX
+	 */
+	if (c->x86 == 5 &&
+	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
+	    c->x86_model <= 3) {
+		/*
+		 * Remember we have B step Pentia with bugs
+		 */
+		WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
+				    "with B stepping processors.\n");
+	}
+#endif
+}
+
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 {
 	unsigned long lo, hi;
@@ -186,6 +220,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_NUMAQ
 	numaq_tsc_disable();
 #endif
+
+	intel_smp_check(c);
 }
 #else
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
@@ -385,7 +421,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
 }
 #endif
 
-static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
 	.c_vendor	= "Intel",
 	.c_ident	= { "GenuineIntel" },
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7293508d8f5..483eda96e10 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -32,7 +32,7 @@ struct _cache_table
 };
 
 /* all the cache descriptor types we care about (no TLB or trace cache entries) */
-static struct _cache_table cache_table[] __cpuinitdata =
+static const struct _cache_table __cpuinitconst cache_table[] =
 {
 	{ 0x06, LVL_1_INST, 8 },	/* 4-way set assoc, 32 byte line size */
 	{ 0x08, LVL_1_INST, 16 },	/* 4-way set assoc, 32 byte line size */
@@ -159,7 +159,7 @@ struct _cpuid4_info_regs {
 	unsigned long can_disable;
 };
 
-#ifdef CONFIG_PCI
+#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
 static struct pci_device_id k8_nb_id[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
@@ -206,15 +206,15 @@ union l3_cache {
 	unsigned val;
 };
 
-static unsigned short assocs[] __cpuinitdata = {
+static const unsigned short __cpuinitconst assocs[] = {
 	[1] = 1, [2] = 2, [4] = 4, [6] = 8,
 	[8] = 16, [0xa] = 32, [0xb] = 48,
 	[0xc] = 64,
 	[0xf] = 0xffff // ??
 };
 
-static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
-static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
+static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
+static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
 
 static void __cpuinit
 amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
@@ -324,15 +324,6 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 	return 0;
 }
 
-static int
-__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
-{
-	struct _cpuid4_info_regs *leaf_regs =
-		(struct _cpuid4_info_regs *)this_leaf;
-
-	return cpuid4_cache_lookup_regs(index, leaf_regs);
-}
-
 static int __cpuinit find_num_cache_leaves(void)
 {
 	unsigned int		eax, ebx, ecx, edx;
@@ -508,6 +499,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	return l2;
 }
 
+#ifdef CONFIG_SYSFS
+
 /* pointer to _cpuid4_info array (for each cache leaf) */
 static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
 #define CPUID4_INFO_IDX(x, y)	(&((per_cpu(cpuid4_info, x))[y]))
@@ -571,6 +564,15 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
 	per_cpu(cpuid4_info, cpu) = NULL;
 }
 
+static int
+__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+{
+	struct _cpuid4_info_regs *leaf_regs =
+		(struct _cpuid4_info_regs *)this_leaf;
+
+	return cpuid4_cache_lookup_regs(index, leaf_regs);
+}
+
 static void __cpuinit get_cpu_leaves(void *_retval)
 {
 	int j, *retval = _retval, cpu = smp_processor_id();
@@ -612,8 +614,6 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
 	return retval;
 }
 
-#ifdef CONFIG_SYSFS
-
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
 
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb6..b2f89829bbe 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o
 obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633..3552119b091 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
 	}
 }
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 static int __init mcheck_disable(char *str)
 {
 	mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f..863f89568b1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  * Rest from unknown author(s).
  * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -24,6 +26,9 @@
 #include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/ratelimit.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -32,7 +37,6 @@
 #include <asm/idle.h>
 
 #define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
 
 atomic_t mce_entry;
 
@@ -47,7 +51,7 @@ static int mce_dont_init;
  */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+	memset(m, 0, sizeof(struct mce));
+	m->cpu = smp_processor_id();
+	rdtscll(m->tsc);
+}
+
 /*
  * Lockless MCE logging infrastructure.
  * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
 			print_symbol("{%s}", m->ip);
 		printk("\n");
 	}
-	printk(KERN_EMERG "TSC %Lx ", m->tsc);
+	printk(KERN_EMERG "TSC %llx ", m->tsc);
 	if (m->addr)
-		printk("ADDR %Lx ", m->addr);
+		printk("ADDR %llx ", m->addr);
 	if (m->misc)
-		printk("MISC %Lx ", m->misc);
+		printk("MISC %llx ", m->misc);
 	printk("\n");
 	printk(KERN_EMERG "This is not a software problem!\n");
 	printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 	panic(msg);
 }
 
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
 {
+	if (mce_dont_init)
+		return 0;
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 }
 
 /*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+	struct mce m;
+	int i;
+
+	mce_setup(&m);
+
+	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	for (i = 0; i < banks; i++) {
+		if (!bank[i] || !test_bit(i, *b))
+			continue;
+
+		m.misc = 0;
+		m.addr = 0;
+		m.bank = i;
+		m.tsc = 0;
+
+		barrier();
+		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+		if (!(m.status & MCI_STATUS_VAL))
+			continue;
+
+		/*
+		 * Uncorrected events are handled by the exception handler
+		 * when it is enabled. But when the exception is disabled log
+		 * everything.
+		 *
+		 * TBD do the same check for MCI_STATUS_EN here?
+		 */
+		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+			continue;
+
+		if (m.status & MCI_STATUS_MISCV)
+			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+		if (m.status & MCI_STATUS_ADDRV)
+			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+		if (!(flags & MCP_TIMESTAMP))
+			m.tsc = 0;
+		/*
+		 * Don't get the IP here because it's unlikely to
+		 * have anything to do with the actual error location.
+		 */
+
+		mce_log(&m);
+		add_taint(TAINT_MACHINE_CHECK);
+
+		/*
+		 * Clear state for this bank.
+		 */
+		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
+
+	/*
+	 * Don't clear MCG_STATUS here because it's only defined for
+	 * exceptions.
+	 */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
  */
 void do_machine_check(struct pt_regs * regs, long error_code)
 {
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	 * error.
 	 */
 	int kill_it = 0;
+	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 
 	atomic_inc(&mce_entry);
 
-	if ((regs
-	     && notify_die(DIE_NMI, "machine check", regs, error_code,
+	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-	    || !banks)
+		goto out2;
+	if (!banks)
 		goto out2;
 
-	memset(&m, 0, sizeof(struct mce));
-	m.cpu = smp_processor_id();
+	mce_setup(&m);
+
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 	/* if the restart IP is not valid, we're done for */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	barrier();
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS && !bank[i])
+		__clear_bit(i, toclear);
+		if (!bank[i])
 			continue;
 
 		m.misc = 0;
 		m.addr = 0;
 		m.bank = i;
-		m.tsc = 0;
 
 		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 		if ((m.status & MCI_STATUS_VAL) == 0)
 			continue;
 
+		/*
+		 * Non uncorrected errors are handled by machine_check_poll
+		 * Leave them alone.
+		 */
+		if ((m.status & MCI_STATUS_UC) == 0)
+			continue;
+
+		/*
+		 * Set taint even when machine check was not enabled.
+		 */
+		add_taint(TAINT_MACHINE_CHECK);
+
+		__set_bit(i, toclear);
+
 		if (m.status & MCI_STATUS_EN) {
 			/* if PCC was set, there's no way out */
 			no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 					no_way_out = 1;
 				kill_it = 1;
 			}
+		} else {
+			/*
+			 * Machine check event was not enabled. Clear, but
+			 * ignore.
+			 */
+			continue;
 		}
 
 		if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 
 		mce_get_rip(&m, regs);
-		if (error_code >= 0)
-			rdtscll(m.tsc);
-		if (error_code != -2)
-			mce_log(&m);
+		mce_log(&m);
 
 		/* Did this bank cause the exception? */
 		/* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			panicm = m;
 			panicm_found = 1;
 		}
-
-		add_taint(TAINT_MACHINE_CHECK);
 	}
 
-	/* Never do anything final in the polling timer */
-	if (!regs)
-		goto out;
-
 	/* If we didn't find an uncorrectable error, pick
 	   the last one (shouldn't happen, just being safe). */
 	if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
- out:
 	/* the last thing we do is clear state */
-	for (i = 0; i < banks; i++)
-		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	for (i = 0; i < banks; i++) {
+		if (test_bit(i, toclear))
+			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
 	wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out2:
 	atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
  * and historically has been the register value of the
  * MSR_IA32_THERMAL_STATUS (Intel) msr.
  */
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
 {
 	struct mce m;
 
-	memset(&m, 0, sizeof(m));
-	m.cpu = cpu;
+	mce_setup(&m);
 	m.bank = MCE_THERMAL_BANK;
 	m.status = status;
-	rdtscll(m.tsc);
 	mce_log(&m);
 }
 #endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 
 static int check_interval = 5 * 60; /* 5 minutes */
 static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
 {
-	if (mce_available(&current_cpu_data))
-		do_machine_check(NULL, 0);
-}
+	struct timer_list *t = &per_cpu(mce_timer, data);
 
-static void mcheck_timer(struct work_struct *work)
-{
-	on_each_cpu(mcheck_check_cpu, NULL, 1);
+	WARN_ON(smp_processor_id() != data);
+
+	if (mce_available(&current_cpu_data))
+		machine_check_poll(MCP_TIMESTAMP,
+				&__get_cpu_var(mce_poll_banks));
 
 	/*
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
 				(int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	schedule_delayed_work(&mcheck_work, next_interval);
+	t->expires = jiffies + next_interval;
+	add_timer(t);
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 }
 
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
 /*
- * This is only called from process context.  This is where we do
- * anything we need to alert userspace about new MCEs.  This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
  */
 int mce_notify_user(void)
 {
+	/* Not more than two messages every minute */
+	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
 	clear_thread_flag(TIF_MCE_NOTIFY);
 	if (test_and_clear_bit(0, &notify_user)) {
-		static unsigned long last_print;
-		unsigned long now = jiffies;
-
 		wake_up_interruptible(&mce_wait);
-		if (trigger[0])
-			call_usermodehelper(trigger, trigger_argv, NULL,
-						UMH_NO_WAIT);
 
-		if (time_after_eq(now, last_print + (check_interval*HZ))) {
-			last_print = now;
+		/*
+		 * There is no risk of missing notifications because
+		 * work_pending is always cleared before the function is
+		 * executed.
+		 */
+		if (trigger[0] && !work_pending(&mce_trigger_work))
+			schedule_work(&mce_trigger_work);
+
+		if (__ratelimit(&ratelimit))
 			printk(KERN_INFO "Machine check events logged\n");
-		}
 
 		return 1;
 	}
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
 
 static __init int periodic_mcheck_init(void)
 {
-	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
-	idle_notifier_register(&mce_idle_notifier);
-	return 0;
+       idle_notifier_register(&mce_idle_notifier);
+       return 0;
 }
 __initcall(periodic_mcheck_init);
 
-
 /*
  * Initialize Machine Checks for a CPU.
  */
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
 {
 	u64 cap;
-	int i;
+	unsigned b;
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
-	banks = cap & 0xff;
-	if (banks > MCE_EXTENDED_BANK) {
-		banks = MCE_EXTENDED_BANK;
-		printk(KERN_INFO "MCE: warning: using only %d banks\n",
-		       MCE_EXTENDED_BANK);
+	b = cap & 0xff;
+	if (b > MAX_NR_BANKS) {
+		printk(KERN_WARNING
+		       "MCE: Using only %u machine check banks out of %u\n",
+			MAX_NR_BANKS, b);
+		b = MAX_NR_BANKS;
 	}
+
+	/* Don't support asymmetric configurations today */
+	WARN_ON(banks != 0 && b != banks);
+	banks = b;
+	if (!bank) {
+		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+		if (!bank)
+			return -ENOMEM;
+		memset(bank, 0xff, banks * sizeof(u64));
+	}
+
 	/* Use accurate RIP reporting if available. */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 		rip_msr = MSR_IA32_MCG_EIP;
 
-	/* Log the machine checks left over from the previous reset.
-	   This also clears all registers */
-	do_machine_check(NULL, mce_bootlog ? -1 : -2);
+	return 0;
+}
+
+static void mce_init(void *dummy)
+{
+	u64 cap;
+	int i;
+	mce_banks_t all_banks;
+
+	/*
+	 * Log the machine checks left over from the previous reset.
+	 */
+	bitmap_fill(all_banks, MAX_NR_BANKS);
+	machine_check_poll(MCP_UC, &all_banks);
 
 	set_in_cr4(X86_CR4_MCE);
 
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	if (cap & MCG_CTL_P)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS)
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-		else
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 }
 
 /* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if(c->x86 == 15)
+		if (c->x86 == 15 && banks > 4)
 			/* disable GART TBL walk error reporting, which trips off
 			   incorrectly with the IOMMU & 3ware & Cerberus. */
-			clear_bit(10, &bank[4]);
+			clear_bit(10, (unsigned long *)&bank[4]);
 		if(c->x86 <= 17 && mce_bootlog < 0)
 			/* Lots of broken BIOS around that don't clear them
 			   by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
 	}
 }
 
+static void mce_init_timer(void)
+{
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+
+	/* data race harmless because everyone sets to the same value */
+	if (!next_interval)
+		next_interval = check_interval * HZ;
+	if (!next_interval)
+		return;
+	setup_timer(t, mcheck_timer, smp_processor_id());
+	t->expires = round_jiffies(jiffies + next_interval);
+	add_timer(t);
+}
+
 /*
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off.
  */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
-	mce_cpu_quirks(c);
+	if (!mce_available(c))
+		return;
 
-	if (mce_dont_init ||
-	    !mce_available(c))
+	if (mce_cap_init() < 0) {
+		mce_dont_init = 1;
 		return;
+	}
+	mce_cpu_quirks(c);
 
 	mce_init(NULL);
 	mce_cpu_features(c);
+	mce_init_timer();
 }
 
 /*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 {
 	unsigned long *cpu_tsc;
 	static DEFINE_MUTEX(mce_read_mutex);
-	unsigned next;
+	unsigned prev, next;
 	char __user *buf = ubuf;
 	int i, err;
 
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	}
 
 	err = 0;
-	for (i = 0; i < next; i++) {
-		unsigned long start = jiffies;
-
-		while (!mcelog.entry[i].finished) {
-			if (time_after_eq(jiffies, start + 2)) {
-				memset(mcelog.entry + i,0, sizeof(struct mce));
-				goto timeout;
+	prev = 0;
+	do {
+		for (i = prev; i < next; i++) {
+			unsigned long start = jiffies;
+
+			while (!mcelog.entry[i].finished) {
+				if (time_after_eq(jiffies, start + 2)) {
+					memset(mcelog.entry + i, 0,
+					       sizeof(struct mce));
+					goto timeout;
+				}
+				cpu_relax();
 			}
-			cpu_relax();
+			smp_rmb();
+			err |= copy_to_user(buf, mcelog.entry + i,
+					    sizeof(struct mce));
+			buf += sizeof(struct mce);
+timeout:
+			;
 		}
-		smp_rmb();
-		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
-		buf += sizeof(struct mce);
- timeout:
-		;
-	}
 
-	memset(mcelog.entry, 0, next * sizeof(struct mce));
-	mcelog.next = 0;
+		memset(mcelog.entry + prev, 0,
+		       (next - prev) * sizeof(struct mce));
+		prev = next;
+		next = cmpxchg(&mcelog.next, prev, 0);
+	} while (next != prev);
 
 	synchronize_sched();
 
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
 	&mce_chrdev_ops,
 };
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 /*
  * Old style boot options parsing. Only for compatibility.
  */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
 	return 1;
 }
 
-/* mce=off disables machine check. Note you can re-enable it later
-   using sysfs.
+/* mce=off disables machine check.
    mce=TOLERANCELEVEL (number, see above)
    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
    mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
  * Sysfs support
  */
 
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+	int i;
+
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+	return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+	return mce_disable();
+}
+
 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
    Only one CPU is active at this time, the others get readded later using
    CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
 	return 0;
 }
 
+static void mce_cpu_restart(void *data)
+{
+	del_timer_sync(&__get_cpu_var(mce_timer));
+	if (mce_available(&current_cpu_data))
+		mce_init(NULL);
+	mce_init_timer();
+}
+
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	if (next_interval)
-		cancel_delayed_work(&mcheck_work);
-	/* Timer race is harmless here */
-	on_each_cpu(mce_init, NULL, 1);
 	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
+	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
 static struct sysdev_class mce_sysclass = {
+	.suspend = mce_suspend,
+	.shutdown = mce_shutdown,
 	.resume = mce_resume,
 	.name = "machinecheck",
 };
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
 	}								\
 	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			 char *buf)
+{
+	u64 b = bank[attr - bank_attrs];
+	return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			const char *buf, size_t siz)
+{
+	char *end;
+	u64 new = simple_strtoull(buf, &end, 0);
+	if (end == buf)
+		return -EINVAL;
+	bank[attr - bank_attrs] = new;
+	mce_restart();
+	return end-buf;
+}
 
 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 				char *buf)
@@ -814,13 +986,11 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 ACCESSOR(check_interval,check_interval,mce_restart())
 static struct sysdev_attribute *mce_attributes[] = {
-	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
 	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 	NULL
 };
 
-static cpumask_t mce_device_initialized = CPU_MASK_NONE;
+static cpumask_var_t mce_device_initialized;
 
 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 static __cpuinit int mce_create_device(unsigned int cpu)
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 		if (err)
 			goto error;
 	}
-	cpu_set(cpu, mce_device_initialized);
+	for (i = 0; i < banks; i++) {
+		err = sysdev_create_file(&per_cpu(device_mce, cpu),
+					&bank_attrs[i]);
+		if (err)
+			goto error2;
+	}
+	cpumask_set_cpu(cpu, mce_device_initialized);
 
 	return 0;
+error2:
+	while (--i >= 0) {
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+					&bank_attrs[i]);
+	}
 error:
-	while (i--) {
+	while (--i >= 0) {
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 				   mce_attributes[i]);
 	}
@@ -862,14 +1043,44 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 {
 	int i;
 
-	if (!cpu_isset(cpu, mce_device_initialized))
+	if (!cpumask_test_cpu(cpu, mce_device_initialized))
 		return;
 
 	for (i = 0; mce_attributes[i]; i++)
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 			mce_attributes[i]);
+	for (i = 0; i < banks; i++)
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+			&bank_attrs[i]);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
-	cpu_clear(cpu, mce_device_initialized);
+	cpumask_clear_cpu(cpu, mce_device_initialized);
+}
+
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+	int i;
+	unsigned long action = *(unsigned long *)h;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_clear();
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void mce_reenable_cpu(void *h)
+{
+	int i;
+	unsigned long action = *(unsigned long *)h;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_reenable();
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
 }
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
@@ -877,6 +1088,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
+	struct timer_list *t = &per_cpu(mce_timer, cpu);
 
 	switch (action) {
 	case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 			threshold_cpu_callback(action, cpu);
 		mce_remove_device(cpu);
 		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		del_timer_sync(t);
+		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+		break;
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		t->expires = round_jiffies(jiffies + next_interval);
+		add_timer_on(t, cpu);
+		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+		break;
+	case CPU_POST_DEAD:
+		/* intentionally ignoring frozen here */
+		cmci_rediscover(cpu);
+		break;
 	}
 	return NOTIFY_OK;
 }
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
 	.notifier_call = mce_cpu_callback,
 };
 
+static __init int mce_init_banks(void)
+{
+	int i;
+
+	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+				GFP_KERNEL);
+	if (!bank_attrs)
+		return -ENOMEM;
+
+	for (i = 0; i < banks; i++) {
+		struct sysdev_attribute *a = &bank_attrs[i];
+		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+		if (!a->attr.name)
+			goto nomem;
+		a->attr.mode = 0644;
+		a->show = show_bank;
+		a->store = set_bank;
+	}
+	return 0;
+
+nomem:
+	while (--i >= 0)
+		kfree(bank_attrs[i].attr.name);
+	kfree(bank_attrs);
+	bank_attrs = NULL;
+	return -ENOMEM;
+}
+
 static __init int mce_init_device(void)
 {
 	int err;
@@ -906,6 +1161,13 @@ static __init int mce_init_device(void)
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
+
+	alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
+
+	err = mce_init_banks();
+	if (err)
+		return err;
+
 	err = sysdev_class_register(&mce_sysclass);
 	if (err)
 		return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd46..56dde9c4bc9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
 
 static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
 
+static void amd_threshold_interrupt(void);
+
 /*
  * CPU Initialization
  */
@@ -90,7 +92,8 @@ struct thresh_restart {
 };
 
 /* must be called with correct cpu affinity */
-static long threshold_restart_bank(void *_tr)
+/* Called via smp_call_function_single() */
+static void threshold_restart_bank(void *_tr)
 {
 	struct thresh_restart *tr = _tr;
 	u32 mci_misc_hi, mci_misc_lo;
@@ -117,7 +120,6 @@ static long threshold_restart_bank(void *_tr)
 
 	mci_misc_hi |= MASK_COUNT_EN_HI;
 	wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
-	return 0;
 }
 
 /* cpu init entry point, called from mce.c with preempt off */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 			tr.reset = 0;
 			tr.old_limit = 0;
 			threshold_restart_bank(&tr);
+
+			mce_threshold_vector = amd_threshold_interrupt;
 		}
 	}
 }
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
  * the interrupt goes off when error_count reaches threshold_limit.
  * the handler will simply log mcelog w/ software defined bank number.
  */
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
 {
 	unsigned int bank, block;
 	struct mce m;
 	u32 low = 0, high = 0, address = 0;
 
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
-
-	memset(&m, 0, sizeof(m));
-	rdtscll(m.tsc);
-	m.cpu = smp_processor_id();
+	mce_setup(&m);
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
 
 			/* Log the machine check that caused the threshold
 			   event. */
-			do_machine_check(NULL, 0);
+			machine_check_poll(MCP_TIMESTAMP,
+					&__get_cpu_var(mce_poll_banks));
 
 			if (high & MASK_OVERFLOW_HI) {
 				rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
 				       + bank * NR_BLOCKS
 				       + block;
 				mce_log(&m);
-				goto out;
+				return;
 			}
 		}
 	}
-out:
-	inc_irq_stat(irq_threshold_count);
-	irq_exit();
 }
 
 /*
@@ -283,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
 	tr.b = b;
 	tr.reset = 0;
 	tr.old_limit = 0;
-	work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
 	return end - buf;
 }
@@ -305,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
 	tr.b = b;
 	tr.reset = 0;
 
-	work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
 	return end - buf;
 }
 
-static long local_error_count(void *_b)
+struct threshold_block_cross_cpu {
+	struct threshold_block *tb;
+	long retval;
+};
+
+static void local_error_count_handler(void *_tbcc)
 {
-	struct threshold_block *b = _b;
+	struct threshold_block_cross_cpu *tbcc = _tbcc;
+	struct threshold_block *b = tbcc->tb;
 	u32 low, high;
 
 	rdmsr(b->address, low, high);
-	return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+	tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
 }
 
 static ssize_t show_error_count(struct threshold_block *b, char *buf)
 {
-	return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
+	struct threshold_block_cross_cpu tbcc = { .tb = b, };
+
+	smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
+	return sprintf(buf, "%lx\n", tbcc.retval);
 }
 
 static ssize_t store_error_count(struct threshold_block *b,
@@ -329,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b,
 {
 	struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
 
-	work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 	return 1;
 }
 
@@ -398,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 	if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
 		return 0;
 
-	if (rdmsr_safe(address, &low, &high))
+	if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
 		return 0;
 
 	if (!(high & MASK_VALID_HI)) {
@@ -462,12 +467,11 @@ out_free:
 	return err;
 }
 
-static __cpuinit long local_allocate_threshold_blocks(void *_bank)
+static __cpuinit long
+local_allocate_threshold_blocks(int cpu, unsigned int bank)
 {
-	unsigned int *bank = _bank;
-
-	return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
-					 MSR_IA32_MC0_MISC + *bank * 4);
+	return allocate_threshold_blocks(cpu, bank, 0,
+					 MSR_IA32_MC0_MISC + bank * 4);
 }
 
 /* symlinks sibling shared banks to first core.  first core owns dir/files. */
@@ -481,7 +485,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 #ifdef CONFIG_SMP
 	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */
-		i = cpumask_first(&per_cpu(cpu_core_map, cpu));
+		i = cpumask_first(cpu_core_mask(cpu));
 
 		/* first core not up yet */
 		if (cpu_data(i).cpu_core_id)
@@ -501,7 +505,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (err)
 			goto out;
 
-		cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
+		cpumask_copy(b->cpus, cpu_core_mask(cpu));
 		per_cpu(threshold_banks, cpu)[bank] = b;
 		goto out;
 	}
@@ -525,12 +529,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 #ifndef CONFIG_SMP
 	cpumask_setall(b->cpus);
 #else
-	cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
+	cpumask_copy(b->cpus, cpu_core_mask(cpu));
 #endif
 
 	per_cpu(threshold_banks, cpu)[bank] = b;
 
-	err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
+	err = local_allocate_threshold_blocks(cpu, bank);
 	if (err)
 		goto out_free;
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e..d6b72df89d6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
 /*
  * Intel specific MCE features.
  * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -13,6 +15,7 @@
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
+#include <asm/apic.h>
 
 asmlinkage void smp_thermal_interrupt(void)
 {
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 	if (therm_throt_process(msr_val & 1))
-		mce_log_therm_throt_event(smp_processor_id(), msr_val);
+		mce_log_therm_throt_event(msr_val);
 
 	inc_irq_stat(irq_thermal_count);
 	irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	return;
 }
 
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+static int cmci_supported(int *banks)
+{
+	u64 cap;
+
+	/*
+	 * Vendor check is not strictly needed, but the initial
+	 * initialization is vendor keyed and this
+	 * makes sure none of the backdoors are entered otherwise.
+	 */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+	if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+		return 0;
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
+	*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+	return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+	mce_notify_user();
+}
+
+static void print_update(char *type, int *hdr, int num)
+{
+	if (*hdr == 0)
+		printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+	*hdr = 1;
+	printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static void cmci_discover(int banks, int boot)
+{
+	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+	int hdr = 0;
+	int i;
+
+	spin_lock(&cmci_discover_lock);
+	for (i = 0; i < banks; i++) {
+		u64 val;
+
+		if (test_bit(i, owned))
+			continue;
+
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+		/* Already owned by someone else? */
+		if (val & CMCI_EN) {
+			if (test_and_clear_bit(i, owned) || boot)
+				print_update("SHD", &hdr, i);
+			__clear_bit(i, __get_cpu_var(mce_poll_banks));
+			continue;
+		}
+
+		val |= CMCI_EN | CMCI_THRESHOLD;
+		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+		/* Did the enable bit stick? -- the bank supports CMCI */
+		if (val & CMCI_EN) {
+			if (!test_and_set_bit(i, owned) || boot)
+				print_update("CMCI", &hdr, i);
+			__clear_bit(i, __get_cpu_var(mce_poll_banks));
+		} else {
+			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+		}
+	}
+	spin_unlock(&cmci_discover_lock);
+	if (hdr)
+		printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+	unsigned long flags;
+	int banks;
+
+	if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+		return;
+	local_irq_save(flags);
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+	local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+	int i;
+	int banks;
+	u64 val;
+
+	if (!cmci_supported(&banks))
+		return;
+	spin_lock(&cmci_discover_lock);
+	for (i = 0; i < banks; i++) {
+		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+			continue;
+		/* Disable CMCI */
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		__clear_bit(i, __get_cpu_var(mce_banks_owned));
+	}
+	spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down cycle through all the others and rediscover
+ * Must run in process context.
+ */
+void cmci_rediscover(int dying)
+{
+	int banks;
+	int cpu;
+	cpumask_var_t old;
+
+	if (!cmci_supported(&banks))
+		return;
+	if (!alloc_cpumask_var(&old, GFP_KERNEL))
+		return;
+	cpumask_copy(old, &current->cpus_allowed);
+
+	for_each_online_cpu (cpu) {
+		if (cpu == dying)
+			continue;
+		if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
+			continue;
+		/* Recheck banks in case CPUs don't all have the same */
+		if (cmci_supported(&banks))
+			cmci_discover(banks, 0);
+	}
+
+	set_cpus_allowed_ptr(current, old);
+	free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+	int banks;
+	if (cmci_supported(&banks))
+		cmci_discover(banks, 0);
+}
+
+static void intel_init_cmci(void)
+{
+	int banks;
+
+	if (!cmci_supported(&banks))
+		return;
+
+	mce_threshold_vector = intel_threshold_interrupt;
+	cmci_discover(banks, 1);
+	/*
+	 * For CPU #0 this runs with still disabled APIC, but that's
+	 * ok because only the vector is set up. We still do another
+	 * check for the banks later for CPU #0 just to make sure
+	 * to not miss any events.
+	 */
+	apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+	cmci_recheck();
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
+	intel_init_cmci();
 }
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 00000000000..23ee9e730f7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+
+static void default_threshold_interrupt(void)
+{
+	printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+			 THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage void mce_threshold_interrupt(void)
+{
+	exit_idle();
+	irq_enter();
+	inc_irq_stat(irq_threshold_count);
+	mce_threshold_vector();
+	irq_exit();
+	/* Ack only at the end to avoid potential reentry */
+	ack_APIC_irq();
+}
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index 191fc053364..f4361b56f8e 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
-obj-y		:= main.o if.o generic.o state.o
+obj-y		:= main.o if.o generic.o state.o cleanup.o
 obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
 
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 00000000000..ce0fe4b5c04
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,1101 @@
+/*  MTRR (Memory Type Range Register) cleanup
+
+    Copyright (C) 2009 Yinghai Lu
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Library General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public
+    License along with this library; if not, write to the Free
+    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/sort.h>
+
+#include <asm/e820.h>
+#include <asm/mtrr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/kvm_para.h>
+#include "mtrr.h"
+
+/* should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
+
+struct res_range {
+	unsigned long start;
+	unsigned long end;
+};
+
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end)
+{
+	/* out of slots */
+	if (nr_range >= RANGE_NUM)
+		return nr_range;
+
+	range[nr_range].start = start;
+	range[nr_range].end = end;
+
+	nr_range++;
+
+	return nr_range;
+}
+
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end)
+{
+	int i;
+
+	/* try to merge it with old one */
+	for (i = 0; i < nr_range; i++) {
+		unsigned long final_start, final_end;
+		unsigned long common_start, common_end;
+
+		if (!range[i].end)
+			continue;
+
+		common_start = max(range[i].start, start);
+		common_end = min(range[i].end, end);
+		if (common_start > common_end + 1)
+			continue;
+
+		final_start = min(range[i].start, start);
+		final_end = max(range[i].end, end);
+
+		range[i].start = final_start;
+		range[i].end =  final_end;
+		return nr_range;
+	}
+
+	/* need to add that */
+	return add_range(range, nr_range, start, end);
+}
+
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
+{
+	int i, j;
+
+	for (j = 0; j < RANGE_NUM; j++) {
+		if (!range[j].end)
+			continue;
+
+		if (start <= range[j].start && end >= range[j].end) {
+			range[j].start = 0;
+			range[j].end = 0;
+			continue;
+		}
+
+		if (start <= range[j].start && end < range[j].end &&
+		    range[j].start < end + 1) {
+			range[j].start = end + 1;
+			continue;
+		}
+
+
+		if (start > range[j].start && end >= range[j].end &&
+		    range[j].end > start - 1) {
+			range[j].end = start - 1;
+			continue;
+		}
+
+		if (start > range[j].start && end < range[j].end) {
+			/* find the new spare */
+			for (i = 0; i < RANGE_NUM; i++) {
+				if (range[i].end == 0)
+					break;
+			}
+			if (i < RANGE_NUM) {
+				range[i].end = range[j].end;
+				range[i].start = end + 1;
+			} else {
+				printk(KERN_ERR "run of slot in ranges\n");
+			}
+			range[j].end = start - 1;
+			continue;
+		}
+	}
+}
+
+static int __init cmp_range(const void *x1, const void *x2)
+{
+	const struct res_range *r1 = x1;
+	const struct res_range *r2 = x2;
+	long start1, start2;
+
+	start1 = r1->start;
+	start2 = r2->start;
+
+	return start1 - start2;
+}
+
+struct var_mtrr_range_state {
+	unsigned long base_pfn;
+	unsigned long size_pfn;
+	mtrr_type type;
+};
+
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static int __initdata debug_print;
+
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+		       unsigned long extra_remove_base,
+		       unsigned long extra_remove_size)
+{
+	unsigned long base, size;
+	mtrr_type type;
+	int i;
+
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_WRBACK)
+			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
+		nr_range = add_range_with_merge(range, nr_range, base,
+						base + size - 1);
+	}
+	if (debug_print) {
+		printk(KERN_DEBUG "After WB checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* take out UC ranges */
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_UNCACHABLE &&
+		    type != MTRR_TYPE_WRPROT)
+			continue;
+		size = range_state[i].size_pfn;
+		if (!size)
+			continue;
+		base = range_state[i].base_pfn;
+		if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
+		    (mtrr_state.enabled & 1)) {
+			/* Var MTRR contains UC entry below 1M? Skip it: */
+			printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
+				"contains strange UC entry under 1M, check "
+				"with your system vendor!\n", i);
+			if (base + size <= (1<<(20-PAGE_SHIFT)))
+				continue;
+			size -= (1<<(20-PAGE_SHIFT)) - base;
+			base = 1<<(20-PAGE_SHIFT);
+		}
+		subtract_range(range, base, base + size - 1);
+	}
+	if (extra_remove_size)
+		subtract_range(range, extra_remove_base,
+				 extra_remove_base + extra_remove_size  - 1);
+
+	/* get new range num */
+	nr_range = 0;
+	for (i = 0; i < RANGE_NUM; i++) {
+		if (!range[i].end)
+			continue;
+		nr_range++;
+	}
+	if  (debug_print) {
+		printk(KERN_DEBUG "After UC checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* sort the ranges */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+	if  (debug_print) {
+		printk(KERN_DEBUG "After sorting\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* clear those is not used */
+	for (i = nr_range; i < RANGE_NUM; i++)
+		memset(&range[i], 0, sizeof(range[i]));
+
+	return nr_range;
+}
+
+static struct res_range __initdata range[RANGE_NUM];
+static int __initdata nr_range;
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+	unsigned long sum;
+	int i;
+
+	sum = 0;
+	for (i = 0; i < nr_range; i++)
+		sum += range[i].end + 1 - range[i].start;
+
+	return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+	CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+	enable_mtrr_cleanup = 0;
+	return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+	enable_mtrr_cleanup = 1;
+	return 0;
+}
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+	debug_print = 1;
+	return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
+
+struct var_mtrr_state {
+	unsigned long	range_startk;
+	unsigned long	range_sizek;
+	unsigned long	chunk_sizek;
+	unsigned long	gran_sizek;
+	unsigned int	reg;
+};
+
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+		unsigned char type, unsigned int address_bits)
+{
+	u32 base_lo, base_hi, mask_lo, mask_hi;
+	u64 base, mask;
+
+	if (!sizek) {
+		fill_mtrr_var_range(reg, 0, 0, 0, 0);
+		return;
+	}
+
+	mask = (1ULL << address_bits) - 1;
+	mask &= ~((((u64)sizek) << 10) - 1);
+
+	base  = ((u64)basek) << 10;
+
+	base |= type;
+	mask |= 0x800;
+
+	base_lo = base & ((1ULL<<32) - 1);
+	base_hi = base >> 32;
+
+	mask_lo = mask & ((1ULL<<32) - 1);
+	mask_hi = mask >> 32;
+
+	fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+		unsigned char type)
+{
+	range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+	range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+	range_state[reg].type = type;
+}
+
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+	unsigned long basek, sizek;
+	unsigned char type;
+	unsigned int reg;
+
+	for (reg = 0; reg < num_var_ranges; reg++) {
+		basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+		sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+		type = range_state[reg].type;
+
+		set_var_mtrr(reg, basek, sizek, type, address_bits);
+	}
+}
+
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+	char factor;
+	unsigned long base = sizek;
+
+	if (base & ((1<<10) - 1)) {
+		/* not MB alignment */
+		factor = 'K';
+	} else if (base & ((1<<20) - 1)) {
+		factor = 'M';
+		base >>= 10;
+	} else {
+		factor = 'G';
+		base >>= 20;
+	}
+
+	*factorp = factor;
+
+	return base;
+}
+
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+	      unsigned long range_sizek, unsigned char type)
+{
+	if (!range_sizek || (reg >= num_var_ranges))
+		return reg;
+
+	while (range_sizek) {
+		unsigned long max_align, align;
+		unsigned long sizek;
+
+		/* Compute the maximum size I can make a range */
+		if (range_startk)
+			max_align = ffs(range_startk) - 1;
+		else
+			max_align = 32;
+		align = fls(range_sizek) - 1;
+		if (align > max_align)
+			align = max_align;
+
+		sizek = 1 << align;
+		if (debug_print) {
+			char start_factor = 'K', size_factor = 'K';
+			unsigned long start_base, size_base;
+
+			start_base = to_size_factor(range_startk,
+							 &start_factor),
+			size_base = to_size_factor(sizek, &size_factor),
+
+			printk(KERN_DEBUG "Setting variable MTRR %d, "
+				"base: %ld%cB, range: %ld%cB, type %s\n",
+				reg, start_base, start_factor,
+				size_base, size_factor,
+				(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+				   ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
+				);
+		}
+		save_var_mtrr(reg++, range_startk, sizek, type);
+		range_startk += sizek;
+		range_sizek -= sizek;
+		if (reg >= num_var_ranges)
+			break;
+	}
+	return reg;
+}
+
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+			unsigned long sizek)
+{
+	unsigned long hole_basek, hole_sizek;
+	unsigned long second_basek, second_sizek;
+	unsigned long range0_basek, range0_sizek;
+	unsigned long range_basek, range_sizek;
+	unsigned long chunk_sizek;
+	unsigned long gran_sizek;
+
+	hole_basek = 0;
+	hole_sizek = 0;
+	second_basek = 0;
+	second_sizek = 0;
+	chunk_sizek = state->chunk_sizek;
+	gran_sizek = state->gran_sizek;
+
+	/* align with gran size, prevent small block used up MTRRs */
+	range_basek = ALIGN(state->range_startk, gran_sizek);
+	if ((range_basek > basek) && basek)
+		return second_sizek;
+	state->range_sizek -= (range_basek - state->range_startk);
+	range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+	while (range_sizek > state->range_sizek) {
+		range_sizek -= gran_sizek;
+		if (!range_sizek)
+			return 0;
+	}
+	state->range_sizek = range_sizek;
+
+	/* try to append some small hole */
+	range0_basek = state->range_startk;
+	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+	/* no increase */
+	if (range0_sizek == state->range_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+				range0_basek<<10,
+				(range0_basek + state->range_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				state->range_sizek, MTRR_TYPE_WRBACK);
+		return 0;
+	}
+
+	/* only cut back, when it is not the last */
+	if (sizek) {
+		while (range0_basek + range0_sizek > (basek + sizek)) {
+			if (range0_sizek >= chunk_sizek)
+				range0_sizek -= chunk_sizek;
+			else
+				range0_sizek = 0;
+
+			if (!range0_sizek)
+				break;
+		}
+	}
+
+second_try:
+	range_basek = range0_basek + range0_sizek;
+
+	/* one hole in the middle */
+	if (range_basek > basek && range_basek <= (basek + sizek))
+		second_sizek = range_basek - basek;
+
+	if (range0_sizek > state->range_sizek) {
+
+		/* one hole in middle or at end */
+		hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+		/* hole size should be less than half of range0 size */
+		if (hole_sizek >= (range0_sizek >> 1) &&
+		    range0_sizek >= chunk_sizek) {
+			range0_sizek -= chunk_sizek;
+			second_sizek = 0;
+			hole_sizek = 0;
+
+			goto second_try;
+		}
+	}
+
+	if (range0_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+				range0_basek<<10,
+				(range0_basek + range0_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				range0_sizek, MTRR_TYPE_WRBACK);
+	}
+
+	if (range0_sizek < state->range_sizek) {
+		/* need to handle left over */
+		range_sizek = state->range_sizek - range0_sizek;
+
+		if (debug_print)
+			printk(KERN_DEBUG "range: %016lx - %016lx\n",
+				 range_basek<<10,
+				 (range_basek + range_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range_basek,
+				 range_sizek, MTRR_TYPE_WRBACK);
+	}
+
+	if (hole_sizek) {
+		hole_basek = range_basek - hole_sizek - second_sizek;
+		if (debug_print)
+			printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+				 hole_basek<<10,
+				 (hole_basek + hole_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, hole_basek,
+				 hole_sizek, MTRR_TYPE_UNCACHABLE);
+	}
+
+	return second_sizek;
+}
+
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+		   unsigned long size_pfn)
+{
+	unsigned long basek, sizek;
+	unsigned long second_sizek = 0;
+
+	if (state->reg >= num_var_ranges)
+		return;
+
+	basek = base_pfn << (PAGE_SHIFT - 10);
+	sizek = size_pfn << (PAGE_SHIFT - 10);
+
+	/* See if I can merge with the last range */
+	if ((basek <= 1024) ||
+	    (state->range_startk + state->range_sizek == basek)) {
+		unsigned long endk = basek + sizek;
+		state->range_sizek = endk - state->range_startk;
+		return;
+	}
+	/* Write the range mtrrs */
+	if (state->range_sizek != 0)
+		second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+	/* Allocate an msr */
+	state->range_startk = basek + second_sizek;
+	state->range_sizek  = sizek - second_sizek;
+}
+
+/* mininum size of mtrr block that can take hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	mtrr_chunk_size = memparse(p, &p);
+	return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granity of mtrr of block */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	mtrr_gran_size = memparse(p, &p);
+	return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static int nr_mtrr_spare_reg __initdata =
+				 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+	if (arg)
+		nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+	return 0;
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+static int __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+		    u64 chunk_size, u64 gran_size)
+{
+	struct var_mtrr_state var_state;
+	int i;
+	int num_reg;
+
+	var_state.range_startk	= 0;
+	var_state.range_sizek	= 0;
+	var_state.reg		= 0;
+	var_state.chunk_sizek	= chunk_size >> 10;
+	var_state.gran_sizek	= gran_size >> 10;
+
+	memset(range_state, 0, sizeof(range_state));
+
+	/* Write the range etc */
+	for (i = 0; i < nr_range; i++)
+		set_var_mtrr_range(&var_state, range[i].start,
+				   range[i].end - range[i].start + 1);
+
+	/* Write the last range */
+	if (var_state.range_sizek != 0)
+		range_to_mtrr_with_hole(&var_state, 0, 0);
+
+	num_reg = var_state.reg;
+	/* Clear out the extra MTRR's */
+	while (var_state.reg < num_var_ranges) {
+		save_var_mtrr(var_state.reg, 0, 0, 0);
+		var_state.reg++;
+	}
+
+	return num_reg;
+}
+
+struct mtrr_cleanup_result {
+	unsigned long gran_sizek;
+	unsigned long chunk_sizek;
+	unsigned long lose_cover_sizek;
+	unsigned int num_reg;
+	int bad;
+};
+
+/*
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
+ */
+#define NUM_RESULT	136
+#define PSHIFT		(PAGE_SHIFT - 10)
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+static void __init print_out_mtrr_range_state(void)
+{
+	int i;
+	char start_factor = 'K', size_factor = 'K';
+	unsigned long start_base, size_base;
+	mtrr_type type;
+
+	for (i = 0; i < num_var_ranges; i++) {
+
+		size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+		if (!size_base)
+			continue;
+
+		size_base = to_size_factor(size_base, &size_factor),
+		start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+		start_base = to_size_factor(start_base, &start_factor),
+		type = range_state[i].type;
+
+		printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+			i, start_base, start_factor,
+			size_base, size_factor,
+			(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+			    ((type == MTRR_TYPE_WRPROT) ? "WP" :
+			     ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+			);
+	}
+}
+
+static int __init mtrr_need_cleanup(void)
+{
+	int i;
+	mtrr_type type;
+	unsigned long size;
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
+
+	/* check entries number */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		size = range_state[i].size_pfn;
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		if (type == MTRR_TYPE_WRPROT)
+			type = MTRR_TYPE_UNCACHABLE;
+		num[type]++;
+	}
+
+	/* check if we got UC entries */
+	if (!num[MTRR_TYPE_UNCACHABLE])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+		num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	return 1;
+}
+
+static unsigned long __initdata range_sums;
+static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
+					 unsigned long extra_remove_base,
+					 unsigned long extra_remove_size,
+					 int i)
+{
+	int num_reg;
+	static struct res_range range_new[RANGE_NUM];
+	static int nr_range_new;
+	unsigned long range_sums_new;
+
+	/* convert ranges to var ranges state */
+	num_reg = x86_setup_var_mtrrs(range, nr_range,
+						chunk_size, gran_size);
+
+	/* we got new setting in range_state, check it */
+	memset(range_new, 0, sizeof(range_new));
+	nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+				extra_remove_base, extra_remove_size);
+	range_sums_new = sum_ranges(range_new, nr_range_new);
+
+	result[i].chunk_sizek = chunk_size >> 10;
+	result[i].gran_sizek = gran_size >> 10;
+	result[i].num_reg = num_reg;
+	if (range_sums < range_sums_new) {
+		result[i].lose_cover_sizek =
+			(range_sums_new - range_sums) << PSHIFT;
+		result[i].bad = 1;
+	} else
+		result[i].lose_cover_sizek =
+			(range_sums - range_sums_new) << PSHIFT;
+
+	/* double check it */
+	if (!result[i].bad && !result[i].lose_cover_sizek) {
+		if (nr_range_new != nr_range ||
+			memcmp(range, range_new, sizeof(range)))
+				result[i].bad = 1;
+	}
+
+	if (!result[i].bad && (range_sums - range_sums_new <
+				min_loss_pfn[num_reg])) {
+		min_loss_pfn[num_reg] =
+			range_sums - range_sums_new;
+	}
+}
+
+static void __init mtrr_print_out_one_result(int i)
+{
+	char gran_factor, chunk_factor, lose_factor;
+	unsigned long gran_base, chunk_base, lose_base;
+
+	gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+	chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+	lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+	printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+			result[i].bad ? "*BAD*" : " ",
+			gran_base, gran_factor, chunk_base, chunk_factor);
+	printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ld%c\n",
+			result[i].num_reg, result[i].bad ? "-" : "",
+			lose_base, lose_factor);
+}
+
+static int __init mtrr_search_optimal_index(void)
+{
+	int i;
+	int num_reg_good;
+	int index_good;
+
+	if (nr_mtrr_spare_reg >= num_var_ranges)
+		nr_mtrr_spare_reg = num_var_ranges - 1;
+	num_reg_good = -1;
+	for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+		if (!min_loss_pfn[i])
+			num_reg_good = i;
+	}
+
+	index_good = -1;
+	if (num_reg_good != -1) {
+		for (i = 0; i < NUM_RESULT; i++) {
+			if (!result[i].bad &&
+			    result[i].num_reg == num_reg_good &&
+			    !result[i].lose_cover_sizek) {
+				index_good = i;
+				break;
+			}
+		}
+	}
+
+	return index_good;
+}
+
+
+int __init mtrr_cleanup(unsigned address_bits)
+{
+	unsigned long extra_remove_base, extra_remove_size;
+	unsigned long base, size, def, dummy;
+	mtrr_type type;
+	u64 chunk_size, gran_size;
+	int index_good;
+	int i;
+
+	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+		return 0;
+	rdmsr(MTRRdefType_MSR, def, dummy);
+	def &= 0xff;
+	if (def != MTRR_TYPE_UNCACHABLE)
+		return 0;
+
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
+
+	/* check if we need handle it and can handle it */
+	if (!mtrr_need_cleanup())
+		return 0;
+
+	/* print original var MTRRs at first, for debugging: */
+	printk(KERN_DEBUG "original variable MTRRs\n");
+	print_out_mtrr_range_state();
+
+	memset(range, 0, sizeof(range));
+	extra_remove_size = 0;
+	extra_remove_base = 1 << (32 - PAGE_SHIFT);
+	if (mtrr_tom2)
+		extra_remove_size =
+			(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+					  extra_remove_size);
+	/*
+	 * [0, 1M) should always be coverred by var mtrr with WB
+	 * and fixed mtrrs should take effective before var mtrr for it
+	 */
+	nr_range = add_range_with_merge(range, nr_range, 0,
+					(1ULL<<(20 - PAGE_SHIFT)) - 1);
+	/* sort the ranges */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
+	range_sums = sum_ranges(range, nr_range);
+	printk(KERN_INFO "total RAM coverred: %ldM\n",
+	       range_sums >> (20 - PAGE_SHIFT));
+
+	if (mtrr_chunk_size && mtrr_gran_size) {
+		i = 0;
+		mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
+				      extra_remove_base, extra_remove_size, i);
+
+		mtrr_print_out_one_result(i);
+
+		if (!result[i].bad) {
+			set_var_mtrr_all(address_bits);
+			printk(KERN_DEBUG "New variable MTRRs\n");
+			print_out_mtrr_range_state();
+			return 1;
+		}
+		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+		       "will find optimal one\n");
+	}
+
+	i = 0;
+	memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+	memset(result, 0, sizeof(result));
+	for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+
+		for (chunk_size = gran_size; chunk_size < (1ULL<<32);
+		     chunk_size <<= 1) {
+
+			if (i >= NUM_RESULT)
+				continue;
+
+			mtrr_calc_range_state(chunk_size, gran_size,
+				      extra_remove_base, extra_remove_size, i);
+			if (debug_print) {
+				mtrr_print_out_one_result(i);
+				printk(KERN_INFO "\n");
+			}
+
+			i++;
+		}
+	}
+
+	/* try to find the optimal index */
+	index_good = mtrr_search_optimal_index();
+
+	if (index_good != -1) {
+		printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+		i = index_good;
+		mtrr_print_out_one_result(i);
+
+		/* convert ranges to var ranges state */
+		chunk_size = result[i].chunk_sizek;
+		chunk_size <<= 10;
+		gran_size = result[i].gran_sizek;
+		gran_size <<= 10;
+		x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+		set_var_mtrr_all(address_bits);
+		printk(KERN_DEBUG "New variable MTRRs\n");
+		print_out_mtrr_range_state();
+		return 1;
+	} else {
+		/* print out all */
+		for (i = 0; i < NUM_RESULT; i++)
+			mtrr_print_out_one_result(i);
+	}
+
+	printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
+	printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+	return 0;
+}
+#else
+int __init mtrr_cleanup(unsigned address_bits)
+{
+	return 0;
+}
+#endif
+
+static int disable_mtrr_trim;
+
+static int __init disable_mtrr_trim_setup(char *str)
+{
+	disable_mtrr_trim = 1;
+	return 0;
+}
+early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
+
+/*
+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
+ * for memory >4GB. Check for that here.
+ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
+ * apply to are wrong, but so far we don't know of any such case in the wild.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+int __init amd_special_default_mtrr(void)
+{
+	u32 l, h;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		return 0;
+	if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+		return 0;
+	/* In case some hypervisor doesn't pass SYSCFG through */
+	if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+		return 0;
+	/*
+	 * Memory between 4GB and top of mem is forced WB by this magic bit.
+	 * Reserved before K8RevF, but should be zero there.
+	 */
+	if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
+		 (Tom2Enabled | Tom2ForceMemTypeWB))
+		return 1;
+	return 0;
+}
+
+static u64 __init real_trim_memory(unsigned long start_pfn,
+				   unsigned long limit_pfn)
+{
+	u64 trim_start, trim_size;
+	trim_start = start_pfn;
+	trim_start <<= PAGE_SHIFT;
+	trim_size = limit_pfn;
+	trim_size <<= PAGE_SHIFT;
+	trim_size -= trim_start;
+
+	return e820_update_range(trim_start, trim_size, E820_RAM,
+				E820_RESERVED);
+}
+/**
+ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ * @end_pfn: ending page frame number
+ *
+ * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
+ * memory configurations.  This routine checks that the highest MTRR matches
+ * the end of memory, to make sure the MTRRs having a write back type cover
+ * all of the memory the kernel is intending to use. If not, it'll trim any
+ * memory off the end by adjusting end_pfn, removing it from the kernel's
+ * allocation pools, warning the user with an obnoxious message.
+ */
+int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+	unsigned long i, base, size, highest_pfn = 0, def, dummy;
+	mtrr_type type;
+	u64 total_trim_size;
+
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
+	/*
+	 * Make sure we only trim uncachable memory on machines that
+	 * support the Intel MTRR architecture:
+	 */
+	if (!is_cpu(INTEL) || disable_mtrr_trim)
+		return 0;
+	rdmsr(MTRRdefType_MSR, def, dummy);
+	def &= 0xff;
+	if (def != MTRR_TYPE_UNCACHABLE)
+		return 0;
+
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
+
+	/* Find highest cached pfn */
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_WRBACK)
+			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
+		if (highest_pfn < base + size)
+			highest_pfn = base + size;
+	}
+
+	/* kvm/qemu doesn't have mtrr set right, don't trim them all */
+	if (!highest_pfn) {
+		printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+		return 0;
+	}
+
+	/* check entries number */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		size = range_state[i].size_pfn;
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		num[type]++;
+	}
+
+	/* no entry for WB? */
+	if (!num[MTRR_TYPE_WRBACK])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+		num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	memset(range, 0, sizeof(range));
+	nr_range = 0;
+	if (mtrr_tom2) {
+		range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+		range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
+		if (highest_pfn < range[nr_range].end + 1)
+			highest_pfn = range[nr_range].end + 1;
+		nr_range++;
+	}
+	nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+	total_trim_size = 0;
+	/* check the head */
+	if (range[0].start)
+		total_trim_size += real_trim_memory(0, range[0].start);
+	/* check the holes */
+	for (i = 0; i < nr_range - 1; i++) {
+		if (range[i].end + 1 < range[i+1].start)
+			total_trim_size += real_trim_memory(range[i].end + 1,
+							    range[i+1].start);
+	}
+	/* check the top */
+	i = nr_range - 1;
+	if (range[i].end + 1 < end_pfn)
+		total_trim_size += real_trim_memory(range[i].end + 1,
+							 end_pfn);
+
+	if (total_trim_size) {
+		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
+			" all of memory, losing %lluMB of RAM.\n",
+			total_trim_size >> 20);
+
+		if (!changed_by_mtrr_cleanup)
+			WARN_ON(1);
+
+		printk(KERN_INFO "update e820 for mtrr\n");
+		update_e820();
+
+		return 1;
+	}
+
+	return 0;
+}
+
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0c0a455fe95..0b776c09aff 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,13 +33,31 @@ u64 mtrr_tom2;
 struct mtrr_state_type mtrr_state = {};
 EXPORT_SYMBOL_GPL(mtrr_state);
 
-static int __initdata mtrr_show;
-static int __init mtrr_debug(char *opt)
+/**
+ * BIOS is expected to clear MtrrFixDramModEn bit, see for example
+ * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
+ * Opteron Processors" (26094 Rev. 3.30 February 2006), section
+ * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
+ * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
+ * 0 for operation."
+ */
+static inline void k8_check_syscfg_dram_mod_en(void)
 {
-	mtrr_show = 1;
-	return 0;
+	u32 lo, hi;
+
+	if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+	      (boot_cpu_data.x86 >= 0x0f)))
+		return;
+
+	rdmsr(MSR_K8_SYSCFG, lo, hi);
+	if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
+		printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+		       " not cleared by BIOS, clearing this bit\n",
+		       smp_processor_id());
+		lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
+		mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+	}
 }
-early_param("mtrr.show", mtrr_debug);
 
 /*
  * Returns the effective MTRR type for the region
@@ -174,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs)
 	unsigned int *p = (unsigned int *) frs;
 	int i;
 
+	k8_check_syscfg_dram_mod_en();
+
 	rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
 
 	for (i = 0; i < 2; i++)
@@ -188,18 +208,94 @@ void mtrr_save_fixed_ranges(void *info)
 		get_fixed_ranges(mtrr_state.fixed_ranges);
 }
 
-static void print_fixed(unsigned base, unsigned step, const mtrr_type*types)
+static unsigned __initdata last_fixed_start;
+static unsigned __initdata last_fixed_end;
+static mtrr_type __initdata last_fixed_type;
+
+static void __init print_fixed_last(void)
+{
+	if (!last_fixed_end)
+		return;
+
+	printk(KERN_DEBUG "  %05X-%05X %s\n", last_fixed_start,
+		last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
+
+	last_fixed_end = 0;
+}
+
+static void __init update_fixed_last(unsigned base, unsigned end,
+				       mtrr_type type)
+{
+	last_fixed_start = base;
+	last_fixed_end = end;
+	last_fixed_type = type;
+}
+
+static void __init print_fixed(unsigned base, unsigned step,
+			       const mtrr_type *types)
 {
 	unsigned i;
 
-	for (i = 0; i < 8; ++i, ++types, base += step)
-		printk(KERN_INFO "MTRR %05X-%05X %s\n",
-			base, base + step - 1, mtrr_attrib_to_str(*types));
+	for (i = 0; i < 8; ++i, ++types, base += step) {
+		if (last_fixed_end == 0) {
+			update_fixed_last(base, base + step, *types);
+			continue;
+		}
+		if (last_fixed_end == base && last_fixed_type == *types) {
+			last_fixed_end = base + step;
+			continue;
+		}
+		/* new segments: gap or different type */
+		print_fixed_last();
+		update_fixed_last(base, base + step, *types);
+	}
 }
 
 static void prepare_set(void);
 static void post_set(void);
 
+static void __init print_mtrr_state(void)
+{
+	unsigned int i;
+	int high_width;
+
+	printk(KERN_DEBUG "MTRR default type: %s\n",
+			 mtrr_attrib_to_str(mtrr_state.def_type));
+	if (mtrr_state.have_fixed) {
+		printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
+		       mtrr_state.enabled & 1 ? "en" : "dis");
+		print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
+		for (i = 0; i < 2; ++i)
+			print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
+		for (i = 0; i < 8; ++i)
+			print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
+
+		/* tail */
+		print_fixed_last();
+	}
+	printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
+	       mtrr_state.enabled & 2 ? "en" : "dis");
+	high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
+	for (i = 0; i < num_var_ranges; ++i) {
+		if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
+			printk(KERN_DEBUG "  %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+			       i,
+			       high_width,
+			       mtrr_state.var_ranges[i].base_hi,
+			       mtrr_state.var_ranges[i].base_lo >> 12,
+			       high_width,
+			       mtrr_state.var_ranges[i].mask_hi,
+			       mtrr_state.var_ranges[i].mask_lo >> 12,
+			       mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+		else
+			printk(KERN_DEBUG "  %u disabled\n", i);
+	}
+	if (mtrr_tom2) {
+		printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
+				  mtrr_tom2, mtrr_tom2>>20);
+	}
+}
+
 /*  Grab all of the MTRR state for this CPU into *state  */
 void __init get_mtrr_state(void)
 {
@@ -231,41 +327,9 @@ void __init get_mtrr_state(void)
 		mtrr_tom2 |= low;
 		mtrr_tom2 &= 0xffffff800000ULL;
 	}
-	if (mtrr_show) {
-		int high_width;
-
-		printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
-		if (mtrr_state.have_fixed) {
-			printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
-			       mtrr_state.enabled & 1 ? "en" : "dis");
-			print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
-			for (i = 0; i < 2; ++i)
-				print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
-			for (i = 0; i < 8; ++i)
-				print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
-		}
-		printk(KERN_INFO "MTRR variable ranges %sabled:\n",
-		       mtrr_state.enabled & 2 ? "en" : "dis");
-		high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
-		for (i = 0; i < num_var_ranges; ++i) {
-			if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
-				printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
-				       i,
-				       high_width,
-				       mtrr_state.var_ranges[i].base_hi,
-				       mtrr_state.var_ranges[i].base_lo >> 12,
-				       high_width,
-				       mtrr_state.var_ranges[i].mask_hi,
-				       mtrr_state.var_ranges[i].mask_lo >> 12,
-				       mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
-			else
-				printk(KERN_INFO "MTRR %u disabled\n", i);
-		}
-		if (mtrr_tom2) {
-			printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
-					  mtrr_tom2, mtrr_tom2>>20);
-		}
-	}
+
+	print_mtrr_state();
+
 	mtrr_state_set = 1;
 
 	/* PAT setup for BP. We need to go through sync steps here */
@@ -308,27 +372,10 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
 }
 
 /**
- * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
- * see AMD publication no. 24593, chapter 3.2.1 for more information
- */
-static inline void k8_enable_fixed_iorrs(void)
-{
-	unsigned lo, hi;
-
-	rdmsr(MSR_K8_SYSCFG, lo, hi);
-	mtrr_wrmsr(MSR_K8_SYSCFG, lo
-				| K8_MTRRFIXRANGE_DRAM_ENABLE
-				| K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
-}
-
-/**
  * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
  * @msr: MSR address of the MTTR which should be checked and updated
  * @changed: pointer which indicates whether the MTRR needed to be changed
  * @msrwords: pointer to the MSR values which the MSR should have
- *
- * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
- * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
  */
 static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 {
@@ -337,10 +384,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 	rdmsr(msr, lo, hi);
 
 	if (lo != msrwords[0] || hi != msrwords[1]) {
-		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		    (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
-		    ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
-			k8_enable_fixed_iorrs();
 		mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
 		*changed = true;
 	}
@@ -376,22 +419,31 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 {
 	unsigned int mask_lo, mask_hi, base_lo, base_hi;
 	unsigned int tmp, hi;
+	int cpu;
+
+	/*
+	 * get_mtrr doesn't need to update mtrr_state, also it could be called
+	 * from any cpu, so try to print it out directly.
+	 */
+	cpu = get_cpu();
 
 	rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
+
 	if ((mask_lo & 0x800) == 0) {
 		/*  Invalid (i.e. free) range  */
 		*base = 0;
 		*size = 0;
 		*type = 0;
-		return;
+		goto out_put_cpu;
 	}
 
 	rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
 
-	/* Work out the shifted address mask. */
+	/* Work out the shifted address mask: */
 	tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
 	mask_lo = size_or_mask | tmp;
-	/* Expand tmp with high bits to all 1s*/
+
+	/* Expand tmp with high bits to all 1s: */
 	hi = fls(tmp);
 	if (hi > 0) {
 		tmp |= ~((1<<(hi - 1)) - 1);
@@ -402,11 +454,16 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 		}
 	}
 
-	/* This works correctly if size is a power of two, i.e. a
-	   contiguous range. */
+	/*
+	 * This works correctly if size is a power of two, i.e. a
+	 * contiguous range:
+	 */
 	*size = -mask_lo;
 	*base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
 	*type = base_lo & 0xff;
+
+out_put_cpu:
+	put_cpu();
 }
 
 /**
@@ -419,6 +476,8 @@ static int set_fixed_ranges(mtrr_type * frs)
 	bool changed = false;
 	int block=-1, range;
 
+	k8_check_syscfg_dram_mod_en();
+
 	while (fixed_range_blocks[++block].ranges)
 	    for (range=0; range < fixed_range_blocks[block].ranges; range++)
 		set_fixed_range(fixed_range_blocks[block].base_msr + range,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 4c4214690dd..fb73a52913a 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -377,10 +377,6 @@ static const struct file_operations mtrr_fops = {
 	.release = mtrr_close,
 };
 
-
-static struct proc_dir_entry *proc_root_mtrr;
-
-
 static int mtrr_seq_show(struct seq_file *seq, void *offset)
 {
 	char factor;
@@ -423,11 +419,7 @@ static int __init mtrr_if_init(void)
 	    (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
 		return -ENODEV;
 
-	proc_root_mtrr =
-		proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
-
-	if (proc_root_mtrr)
-		proc_root_mtrr->owner = THIS_MODULE;
+	proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 236a401b825..03cda01f57c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -574,7 +574,7 @@ struct mtrr_value {
 	unsigned long	lsize;
 };
 
-static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
+static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
 
 static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 {
@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 
 	for (i = 0; i < num_var_ranges; i++) {
 		mtrr_if->get(i,
-			     &mtrr_state[i].lbase,
-			     &mtrr_state[i].lsize,
-			     &mtrr_state[i].ltype);
+			     &mtrr_value[i].lbase,
+			     &mtrr_value[i].lsize,
+			     &mtrr_value[i].ltype);
 	}
 	return 0;
 }
@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev)
 	int i;
 
 	for (i = 0; i < num_var_ranges; i++) {
-		if (mtrr_state[i].lsize) 
+		if (mtrr_value[i].lsize)
 			set_mtrr(i,
-				 mtrr_state[i].lbase,
-				 mtrr_state[i].lsize,
-				 mtrr_state[i].ltype);
+				 mtrr_value[i].lbase,
+				 mtrr_value[i].lsize,
+				 mtrr_value[i].ltype);
 	}
 	return 0;
 }
@@ -610,1058 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
 	.resume		= mtrr_restore,
 };
 
-/* should be related to MTRR_VAR_RANGES nums */
-#define RANGE_NUM 256
-
-struct res_range {
-	unsigned long start;
-	unsigned long end;
-};
-
-static int __init
-add_range(struct res_range *range, int nr_range, unsigned long start,
-			      unsigned long end)
-{
-	/* out of slots */
-	if (nr_range >= RANGE_NUM)
-		return nr_range;
-
-	range[nr_range].start = start;
-	range[nr_range].end = end;
-
-	nr_range++;
-
-	return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
-			      unsigned long end)
-{
-	int i;
-
-	/* try to merge it with old one */
-	for (i = 0; i < nr_range; i++) {
-		unsigned long final_start, final_end;
-		unsigned long common_start, common_end;
-
-		if (!range[i].end)
-			continue;
-
-		common_start = max(range[i].start, start);
-		common_end = min(range[i].end, end);
-		if (common_start > common_end + 1)
-			continue;
-
-		final_start = min(range[i].start, start);
-		final_end = max(range[i].end, end);
-
-		range[i].start = final_start;
-		range[i].end =  final_end;
-		return nr_range;
-	}
-
-	/* need to add that */
-	return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
-	int i, j;
-
-	for (j = 0; j < RANGE_NUM; j++) {
-		if (!range[j].end)
-			continue;
-
-		if (start <= range[j].start && end >= range[j].end) {
-			range[j].start = 0;
-			range[j].end = 0;
-			continue;
-		}
-
-		if (start <= range[j].start && end < range[j].end &&
-		    range[j].start < end + 1) {
-			range[j].start = end + 1;
-			continue;
-		}
-
-
-		if (start > range[j].start && end >= range[j].end &&
-		    range[j].end > start - 1) {
-			range[j].end = start - 1;
-			continue;
-		}
-
-		if (start > range[j].start && end < range[j].end) {
-			/* find the new spare */
-			for (i = 0; i < RANGE_NUM; i++) {
-				if (range[i].end == 0)
-					break;
-			}
-			if (i < RANGE_NUM) {
-				range[i].end = range[j].end;
-				range[i].start = end + 1;
-			} else {
-				printk(KERN_ERR "run of slot in ranges\n");
-			}
-			range[j].end = start - 1;
-			continue;
-		}
-	}
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
-	const struct res_range *r1 = x1;
-	const struct res_range *r2 = x2;
-	long start1, start2;
-
-	start1 = r1->start;
-	start2 = r2->start;
-
-	return start1 - start2;
-}
-
-struct var_mtrr_range_state {
-	unsigned long base_pfn;
-	unsigned long size_pfn;
-	mtrr_type type;
-};
-
-static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
-static int __initdata debug_print;
-
-static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
-		       unsigned long extra_remove_base,
-		       unsigned long extra_remove_size)
-{
-	unsigned long i, base, size;
-	mtrr_type type;
-
-	for (i = 0; i < num_var_ranges; i++) {
-		type = range_state[i].type;
-		if (type != MTRR_TYPE_WRBACK)
-			continue;
-		base = range_state[i].base_pfn;
-		size = range_state[i].size_pfn;
-		nr_range = add_range_with_merge(range, nr_range, base,
-						base + size - 1);
-	}
-	if (debug_print) {
-		printk(KERN_DEBUG "After WB checking\n");
-		for (i = 0; i < nr_range; i++)
-			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-				 range[i].start, range[i].end + 1);
-	}
-
-	/* take out UC ranges */
-	for (i = 0; i < num_var_ranges; i++) {
-		type = range_state[i].type;
-		if (type != MTRR_TYPE_UNCACHABLE &&
-		    type != MTRR_TYPE_WRPROT)
-			continue;
-		size = range_state[i].size_pfn;
-		if (!size)
-			continue;
-		base = range_state[i].base_pfn;
-		subtract_range(range, base, base + size - 1);
-	}
-	if (extra_remove_size)
-		subtract_range(range, extra_remove_base,
-				 extra_remove_base + extra_remove_size  - 1);
-
-	/* get new range num */
-	nr_range = 0;
-	for (i = 0; i < RANGE_NUM; i++) {
-		if (!range[i].end)
-			continue;
-		nr_range++;
-	}
-	if  (debug_print) {
-		printk(KERN_DEBUG "After UC checking\n");
-		for (i = 0; i < nr_range; i++)
-			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-				 range[i].start, range[i].end + 1);
-	}
-
-	/* sort the ranges */
-	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-	if  (debug_print) {
-		printk(KERN_DEBUG "After sorting\n");
-		for (i = 0; i < nr_range; i++)
-			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-				 range[i].start, range[i].end + 1);
-	}
-
-	/* clear those is not used */
-	for (i = nr_range; i < RANGE_NUM; i++)
-		memset(&range[i], 0, sizeof(range[i]));
-
-	return nr_range;
-}
-
-static struct res_range __initdata range[RANGE_NUM];
-static int __initdata nr_range;
-
-#ifdef CONFIG_MTRR_SANITIZER
-
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
-{
-	unsigned long sum;
-	int i;
-
-	sum = 0;
-	for (i = 0; i < nr_range; i++)
-		sum += range[i].end + 1 - range[i].start;
-
-	return sum;
-}
-
-static int enable_mtrr_cleanup __initdata =
-	CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
-
-static int __init disable_mtrr_cleanup_setup(char *str)
-{
-	enable_mtrr_cleanup = 0;
-	return 0;
-}
-early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
-
-static int __init enable_mtrr_cleanup_setup(char *str)
-{
-	enable_mtrr_cleanup = 1;
-	return 0;
-}
-early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
-
-static int __init mtrr_cleanup_debug_setup(char *str)
-{
-	debug_print = 1;
-	return 0;
-}
-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
-
-struct var_mtrr_state {
-	unsigned long	range_startk;
-	unsigned long	range_sizek;
-	unsigned long	chunk_sizek;
-	unsigned long	gran_sizek;
-	unsigned int	reg;
-};
-
-static void __init
-set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
-		unsigned char type, unsigned int address_bits)
-{
-	u32 base_lo, base_hi, mask_lo, mask_hi;
-	u64 base, mask;
-
-	if (!sizek) {
-		fill_mtrr_var_range(reg, 0, 0, 0, 0);
-		return;
-	}
-
-	mask = (1ULL << address_bits) - 1;
-	mask &= ~((((u64)sizek) << 10) - 1);
-
-	base  = ((u64)basek) << 10;
-
-	base |= type;
-	mask |= 0x800;
-
-	base_lo = base & ((1ULL<<32) - 1);
-	base_hi = base >> 32;
-
-	mask_lo = mask & ((1ULL<<32) - 1);
-	mask_hi = mask >> 32;
-
-	fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
-}
-
-static void __init
-save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
-		unsigned char type)
-{
-	range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
-	range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
-	range_state[reg].type = type;
-}
-
-static void __init
-set_var_mtrr_all(unsigned int address_bits)
-{
-	unsigned long basek, sizek;
-	unsigned char type;
-	unsigned int reg;
-
-	for (reg = 0; reg < num_var_ranges; reg++) {
-		basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
-		sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
-		type = range_state[reg].type;
-
-		set_var_mtrr(reg, basek, sizek, type, address_bits);
-	}
-}
-
-static unsigned long to_size_factor(unsigned long sizek, char *factorp)
-{
-	char factor;
-	unsigned long base = sizek;
-
-	if (base & ((1<<10) - 1)) {
-		/* not MB alignment */
-		factor = 'K';
-	} else if (base & ((1<<20) - 1)){
-		factor = 'M';
-		base >>= 10;
-	} else {
-		factor = 'G';
-		base >>= 20;
-	}
-
-	*factorp = factor;
-
-	return base;
-}
-
-static unsigned int __init
-range_to_mtrr(unsigned int reg, unsigned long range_startk,
-	      unsigned long range_sizek, unsigned char type)
-{
-	if (!range_sizek || (reg >= num_var_ranges))
-		return reg;
-
-	while (range_sizek) {
-		unsigned long max_align, align;
-		unsigned long sizek;
-
-		/* Compute the maximum size I can make a range */
-		if (range_startk)
-			max_align = ffs(range_startk) - 1;
-		else
-			max_align = 32;
-		align = fls(range_sizek) - 1;
-		if (align > max_align)
-			align = max_align;
-
-		sizek = 1 << align;
-		if (debug_print) {
-			char start_factor = 'K', size_factor = 'K';
-			unsigned long start_base, size_base;
-
-			start_base = to_size_factor(range_startk, &start_factor),
-			size_base = to_size_factor(sizek, &size_factor),
-
-			printk(KERN_DEBUG "Setting variable MTRR %d, "
-				"base: %ld%cB, range: %ld%cB, type %s\n",
-				reg, start_base, start_factor,
-				size_base, size_factor,
-				(type == MTRR_TYPE_UNCACHABLE)?"UC":
-				    ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
-				);
-		}
-		save_var_mtrr(reg++, range_startk, sizek, type);
-		range_startk += sizek;
-		range_sizek -= sizek;
-		if (reg >= num_var_ranges)
-			break;
-	}
-	return reg;
-}
-
-static unsigned __init
-range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
-			unsigned long sizek)
-{
-	unsigned long hole_basek, hole_sizek;
-	unsigned long second_basek, second_sizek;
-	unsigned long range0_basek, range0_sizek;
-	unsigned long range_basek, range_sizek;
-	unsigned long chunk_sizek;
-	unsigned long gran_sizek;
-
-	hole_basek = 0;
-	hole_sizek = 0;
-	second_basek = 0;
-	second_sizek = 0;
-	chunk_sizek = state->chunk_sizek;
-	gran_sizek = state->gran_sizek;
-
-	/* align with gran size, prevent small block used up MTRRs */
-	range_basek = ALIGN(state->range_startk, gran_sizek);
-	if ((range_basek > basek) && basek)
-		return second_sizek;
-	state->range_sizek -= (range_basek - state->range_startk);
-	range_sizek = ALIGN(state->range_sizek, gran_sizek);
-
-	while (range_sizek > state->range_sizek) {
-		range_sizek -= gran_sizek;
-		if (!range_sizek)
-			return 0;
-	}
-	state->range_sizek = range_sizek;
-
-	/* try to append some small hole */
-	range0_basek = state->range_startk;
-	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
-
-	/* no increase */
-	if (range0_sizek == state->range_sizek) {
-		if (debug_print)
-			printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
-				range0_basek<<10,
-				(range0_basek + state->range_sizek)<<10);
-		state->reg = range_to_mtrr(state->reg, range0_basek,
-				state->range_sizek, MTRR_TYPE_WRBACK);
-		return 0;
-	}
-
-	/* only cut back, when it is not the last */
-	if (sizek) {
-		while (range0_basek + range0_sizek > (basek + sizek)) {
-			if (range0_sizek >= chunk_sizek)
-				range0_sizek -= chunk_sizek;
-			else
-				range0_sizek = 0;
-
-			if (!range0_sizek)
-				break;
-		}
-	}
-
-second_try:
-	range_basek = range0_basek + range0_sizek;
-
-	/* one hole in the middle */
-	if (range_basek > basek && range_basek <= (basek + sizek))
-		second_sizek = range_basek - basek;
-
-	if (range0_sizek > state->range_sizek) {
-
-		/* one hole in middle or at end */
-		hole_sizek = range0_sizek - state->range_sizek - second_sizek;
-
-		/* hole size should be less than half of range0 size */
-		if (hole_sizek >= (range0_sizek >> 1) &&
-		    range0_sizek >= chunk_sizek) {
-			range0_sizek -= chunk_sizek;
-			second_sizek = 0;
-			hole_sizek = 0;
-
-			goto second_try;
-		}
-	}
-
-	if (range0_sizek) {
-		if (debug_print)
-			printk(KERN_DEBUG "range0: %016lx - %016lx\n",
-				range0_basek<<10,
-				(range0_basek + range0_sizek)<<10);
-		state->reg = range_to_mtrr(state->reg, range0_basek,
-				range0_sizek, MTRR_TYPE_WRBACK);
-	}
-
-	if (range0_sizek < state->range_sizek) {
-		/* need to handle left over */
-		range_sizek = state->range_sizek - range0_sizek;
-
-		if (debug_print)
-			printk(KERN_DEBUG "range: %016lx - %016lx\n",
-				 range_basek<<10,
-				 (range_basek + range_sizek)<<10);
-		state->reg = range_to_mtrr(state->reg, range_basek,
-				 range_sizek, MTRR_TYPE_WRBACK);
-	}
-
-	if (hole_sizek) {
-		hole_basek = range_basek - hole_sizek - second_sizek;
-		if (debug_print)
-			printk(KERN_DEBUG "hole: %016lx - %016lx\n",
-				 hole_basek<<10,
-				 (hole_basek + hole_sizek)<<10);
-		state->reg = range_to_mtrr(state->reg, hole_basek,
-				 hole_sizek, MTRR_TYPE_UNCACHABLE);
-	}
-
-	return second_sizek;
-}
-
-static void __init
-set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
-		   unsigned long size_pfn)
-{
-	unsigned long basek, sizek;
-	unsigned long second_sizek = 0;
-
-	if (state->reg >= num_var_ranges)
-		return;
-
-	basek = base_pfn << (PAGE_SHIFT - 10);
-	sizek = size_pfn << (PAGE_SHIFT - 10);
-
-	/* See if I can merge with the last range */
-	if ((basek <= 1024) ||
-	    (state->range_startk + state->range_sizek == basek)) {
-		unsigned long endk = basek + sizek;
-		state->range_sizek = endk - state->range_startk;
-		return;
-	}
-	/* Write the range mtrrs */
-	if (state->range_sizek != 0)
-		second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
-
-	/* Allocate an msr */
-	state->range_startk = basek + second_sizek;
-	state->range_sizek  = sizek - second_sizek;
-}
-
-/* mininum size of mtrr block that can take hole */
-static u64 mtrr_chunk_size __initdata = (256ULL<<20);
-
-static int __init parse_mtrr_chunk_size_opt(char *p)
-{
-	if (!p)
-		return -EINVAL;
-	mtrr_chunk_size = memparse(p, &p);
-	return 0;
-}
-early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
-
-/* granity of mtrr of block */
-static u64 mtrr_gran_size __initdata;
-
-static int __init parse_mtrr_gran_size_opt(char *p)
-{
-	if (!p)
-		return -EINVAL;
-	mtrr_gran_size = memparse(p, &p);
-	return 0;
-}
-early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
-
-static int nr_mtrr_spare_reg __initdata =
-				 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
-
-static int __init parse_mtrr_spare_reg(char *arg)
-{
-	if (arg)
-		nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
-	return 0;
-}
-
-early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
-
-static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
-		    u64 chunk_size, u64 gran_size)
-{
-	struct var_mtrr_state var_state;
-	int i;
-	int num_reg;
-
-	var_state.range_startk	= 0;
-	var_state.range_sizek	= 0;
-	var_state.reg		= 0;
-	var_state.chunk_sizek	= chunk_size >> 10;
-	var_state.gran_sizek	= gran_size >> 10;
-
-	memset(range_state, 0, sizeof(range_state));
-
-	/* Write the range etc */
-	for (i = 0; i < nr_range; i++)
-		set_var_mtrr_range(&var_state, range[i].start,
-				   range[i].end - range[i].start + 1);
-
-	/* Write the last range */
-	if (var_state.range_sizek != 0)
-		range_to_mtrr_with_hole(&var_state, 0, 0);
-
-	num_reg = var_state.reg;
-	/* Clear out the extra MTRR's */
-	while (var_state.reg < num_var_ranges) {
-		save_var_mtrr(var_state.reg, 0, 0, 0);
-		var_state.reg++;
-	}
-
-	return num_reg;
-}
-
-struct mtrr_cleanup_result {
-	unsigned long gran_sizek;
-	unsigned long chunk_sizek;
-	unsigned long lose_cover_sizek;
-	unsigned int num_reg;
-	int bad;
-};
-
-/*
- * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
- * chunk size: gran_size, ..., 2G
- * so we need (1+16)*8
- */
-#define NUM_RESULT	136
-#define PSHIFT		(PAGE_SHIFT - 10)
-
-static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
-static unsigned long __initdata min_loss_pfn[RANGE_NUM];
-
-static void __init print_out_mtrr_range_state(void)
-{
-	int i;
-	char start_factor = 'K', size_factor = 'K';
-	unsigned long start_base, size_base;
-	mtrr_type type;
-
-	for (i = 0; i < num_var_ranges; i++) {
-
-		size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
-		if (!size_base)
-			continue;
-
-		size_base = to_size_factor(size_base, &size_factor),
-		start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
-		start_base = to_size_factor(start_base, &start_factor),
-		type = range_state[i].type;
-
-		printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
-			i, start_base, start_factor,
-			size_base, size_factor,
-			(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
-			    ((type == MTRR_TYPE_WRPROT) ? "WP" :
-			     ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
-			);
-	}
-}
-
-static int __init mtrr_need_cleanup(void)
-{
-	int i;
-	mtrr_type type;
-	unsigned long size;
-	/* extra one for all 0 */
-	int num[MTRR_NUM_TYPES + 1];
-
-	/* check entries number */
-	memset(num, 0, sizeof(num));
-	for (i = 0; i < num_var_ranges; i++) {
-		type = range_state[i].type;
-		size = range_state[i].size_pfn;
-		if (type >= MTRR_NUM_TYPES)
-			continue;
-		if (!size)
-			type = MTRR_NUM_TYPES;
-		if (type == MTRR_TYPE_WRPROT)
-			type = MTRR_TYPE_UNCACHABLE;
-		num[type]++;
-	}
-
-	/* check if we got UC entries */
-	if (!num[MTRR_TYPE_UNCACHABLE])
-		return 0;
-
-	/* check if we only had WB and UC */
-	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
-		num_var_ranges - num[MTRR_NUM_TYPES])
-		return 0;
-
-	return 1;
-}
-
-static unsigned long __initdata range_sums;
-static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
-					 unsigned long extra_remove_base,
-					 unsigned long extra_remove_size,
-					 int i)
-{
-	int num_reg;
-	static struct res_range range_new[RANGE_NUM];
-	static int nr_range_new;
-	unsigned long range_sums_new;
-
-	/* convert ranges to var ranges state */
-	num_reg = x86_setup_var_mtrrs(range, nr_range,
-						chunk_size, gran_size);
-
-	/* we got new setting in range_state, check it */
-	memset(range_new, 0, sizeof(range_new));
-	nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
-				extra_remove_base, extra_remove_size);
-	range_sums_new = sum_ranges(range_new, nr_range_new);
-
-	result[i].chunk_sizek = chunk_size >> 10;
-	result[i].gran_sizek = gran_size >> 10;
-	result[i].num_reg = num_reg;
-	if (range_sums < range_sums_new) {
-		result[i].lose_cover_sizek =
-			(range_sums_new - range_sums) << PSHIFT;
-		result[i].bad = 1;
-	} else
-		result[i].lose_cover_sizek =
-			(range_sums - range_sums_new) << PSHIFT;
-
-	/* double check it */
-	if (!result[i].bad && !result[i].lose_cover_sizek) {
-		if (nr_range_new != nr_range ||
-			memcmp(range, range_new, sizeof(range)))
-				result[i].bad = 1;
-	}
-
-	if (!result[i].bad && (range_sums - range_sums_new <
-				min_loss_pfn[num_reg])) {
-		min_loss_pfn[num_reg] =
-			range_sums - range_sums_new;
-	}
-}
-
-static void __init mtrr_print_out_one_result(int i)
-{
-	char gran_factor, chunk_factor, lose_factor;
-	unsigned long gran_base, chunk_base, lose_base;
-
-	gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
-	chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
-	lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
-	printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
-			result[i].bad ? "*BAD*" : " ",
-			gran_base, gran_factor, chunk_base, chunk_factor);
-	printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ld%c\n",
-			result[i].num_reg, result[i].bad ? "-" : "",
-			lose_base, lose_factor);
-}
-
-static int __init mtrr_search_optimal_index(void)
-{
-	int i;
-	int num_reg_good;
-	int index_good;
-
-	if (nr_mtrr_spare_reg >= num_var_ranges)
-		nr_mtrr_spare_reg = num_var_ranges - 1;
-	num_reg_good = -1;
-	for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
-		if (!min_loss_pfn[i])
-			num_reg_good = i;
-	}
-
-	index_good = -1;
-	if (num_reg_good != -1) {
-		for (i = 0; i < NUM_RESULT; i++) {
-			if (!result[i].bad &&
-			    result[i].num_reg == num_reg_good &&
-			    !result[i].lose_cover_sizek) {
-				index_good = i;
-				break;
-			}
-		}
-	}
-
-	return index_good;
-}
-
-
-static int __init mtrr_cleanup(unsigned address_bits)
-{
-	unsigned long extra_remove_base, extra_remove_size;
-	unsigned long base, size, def, dummy;
-	mtrr_type type;
-	u64 chunk_size, gran_size;
-	int index_good;
-	int i;
-
-	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
-		return 0;
-	rdmsr(MTRRdefType_MSR, def, dummy);
-	def &= 0xff;
-	if (def != MTRR_TYPE_UNCACHABLE)
-		return 0;
-
-	/* get it and store it aside */
-	memset(range_state, 0, sizeof(range_state));
-	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
-		range_state[i].base_pfn = base;
-		range_state[i].size_pfn = size;
-		range_state[i].type = type;
-	}
-
-	/* check if we need handle it and can handle it */
-	if (!mtrr_need_cleanup())
-		return 0;
-
-	/* print original var MTRRs at first, for debugging: */
-	printk(KERN_DEBUG "original variable MTRRs\n");
-	print_out_mtrr_range_state();
-
-	memset(range, 0, sizeof(range));
-	extra_remove_size = 0;
-	extra_remove_base = 1 << (32 - PAGE_SHIFT);
-	if (mtrr_tom2)
-		extra_remove_size =
-			(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
-	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
-					  extra_remove_size);
-	/*
-	 * [0, 1M) should always be coverred by var mtrr with WB
-	 * and fixed mtrrs should take effective before var mtrr for it
-	 */
-	nr_range = add_range_with_merge(range, nr_range, 0,
-					(1ULL<<(20 - PAGE_SHIFT)) - 1);
-	/* sort the ranges */
-	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-
-	range_sums = sum_ranges(range, nr_range);
-	printk(KERN_INFO "total RAM coverred: %ldM\n",
-	       range_sums >> (20 - PAGE_SHIFT));
-
-	if (mtrr_chunk_size && mtrr_gran_size) {
-		i = 0;
-		mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
-				      extra_remove_base, extra_remove_size, i);
-
-		mtrr_print_out_one_result(i);
-
-		if (!result[i].bad) {
-			set_var_mtrr_all(address_bits);
-			return 1;
-		}
-		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
-		       "will find optimal one\n");
-	}
-
-	i = 0;
-	memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
-	memset(result, 0, sizeof(result));
-	for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
-
-		for (chunk_size = gran_size; chunk_size < (1ULL<<32);
-		     chunk_size <<= 1) {
-
-			if (i >= NUM_RESULT)
-				continue;
-
-			mtrr_calc_range_state(chunk_size, gran_size,
-				      extra_remove_base, extra_remove_size, i);
-			if (debug_print) {
-				mtrr_print_out_one_result(i);
-				printk(KERN_INFO "\n");
-			}
-
-			i++;
-		}
-	}
-
-	/* try to find the optimal index */
-	index_good = mtrr_search_optimal_index();
-
-	if (index_good != -1) {
-		printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
-		i = index_good;
-		mtrr_print_out_one_result(i);
-
-		/* convert ranges to var ranges state */
-		chunk_size = result[i].chunk_sizek;
-		chunk_size <<= 10;
-		gran_size = result[i].gran_sizek;
-		gran_size <<= 10;
-		x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
-		set_var_mtrr_all(address_bits);
-		printk(KERN_DEBUG "New variable MTRRs\n");
-		print_out_mtrr_range_state();
-		return 1;
-	} else {
-		/* print out all */
-		for (i = 0; i < NUM_RESULT; i++)
-			mtrr_print_out_one_result(i);
-	}
-
-	printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
-	printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
-
-	return 0;
-}
-#else
-static int __init mtrr_cleanup(unsigned address_bits)
-{
-	return 0;
-}
-#endif
-
-static int __initdata changed_by_mtrr_cleanup;
-
-static int disable_mtrr_trim;
-
-static int __init disable_mtrr_trim_setup(char *str)
-{
-	disable_mtrr_trim = 1;
-	return 0;
-}
-early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
-
-/*
- * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
- * for memory >4GB. Check for that here.
- * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
- * apply to are wrong, but so far we don't know of any such case in the wild.
- */
-#define Tom2Enabled (1U << 21)
-#define Tom2ForceMemTypeWB (1U << 22)
-
-int __init amd_special_default_mtrr(void)
-{
-	u32 l, h;
-
-	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
-		return 0;
-	if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
-		return 0;
-	/* In case some hypervisor doesn't pass SYSCFG through */
-	if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
-		return 0;
-	/*
-	 * Memory between 4GB and top of mem is forced WB by this magic bit.
-	 * Reserved before K8RevF, but should be zero there.
-	 */
-	if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
-		 (Tom2Enabled | Tom2ForceMemTypeWB))
-		return 1;
-	return 0;
-}
-
-static u64 __init real_trim_memory(unsigned long start_pfn,
-				   unsigned long limit_pfn)
-{
-	u64 trim_start, trim_size;
-	trim_start = start_pfn;
-	trim_start <<= PAGE_SHIFT;
-	trim_size = limit_pfn;
-	trim_size <<= PAGE_SHIFT;
-	trim_size -= trim_start;
-
-	return e820_update_range(trim_start, trim_size, E820_RAM,
-				E820_RESERVED);
-}
-/**
- * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
- * @end_pfn: ending page frame number
- *
- * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
- * memory configurations.  This routine checks that the highest MTRR matches
- * the end of memory, to make sure the MTRRs having a write back type cover
- * all of the memory the kernel is intending to use. If not, it'll trim any
- * memory off the end by adjusting end_pfn, removing it from the kernel's
- * allocation pools, warning the user with an obnoxious message.
- */
-int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
-{
-	unsigned long i, base, size, highest_pfn = 0, def, dummy;
-	mtrr_type type;
-	u64 total_trim_size;
-
-	/* extra one for all 0 */
-	int num[MTRR_NUM_TYPES + 1];
-	/*
-	 * Make sure we only trim uncachable memory on machines that
-	 * support the Intel MTRR architecture:
-	 */
-	if (!is_cpu(INTEL) || disable_mtrr_trim)
-		return 0;
-	rdmsr(MTRRdefType_MSR, def, dummy);
-	def &= 0xff;
-	if (def != MTRR_TYPE_UNCACHABLE)
-		return 0;
-
-	/* get it and store it aside */
-	memset(range_state, 0, sizeof(range_state));
-	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
-		range_state[i].base_pfn = base;
-		range_state[i].size_pfn = size;
-		range_state[i].type = type;
-	}
-
-	/* Find highest cached pfn */
-	for (i = 0; i < num_var_ranges; i++) {
-		type = range_state[i].type;
-		if (type != MTRR_TYPE_WRBACK)
-			continue;
-		base = range_state[i].base_pfn;
-		size = range_state[i].size_pfn;
-		if (highest_pfn < base + size)
-			highest_pfn = base + size;
-	}
-
-	/* kvm/qemu doesn't have mtrr set right, don't trim them all */
-	if (!highest_pfn) {
-		printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
-		return 0;
-	}
-
-	/* check entries number */
-	memset(num, 0, sizeof(num));
-	for (i = 0; i < num_var_ranges; i++) {
-		type = range_state[i].type;
-		if (type >= MTRR_NUM_TYPES)
-			continue;
-		size = range_state[i].size_pfn;
-		if (!size)
-			type = MTRR_NUM_TYPES;
-		num[type]++;
-	}
-
-	/* no entry for WB? */
-	if (!num[MTRR_TYPE_WRBACK])
-		return 0;
-
-	/* check if we only had WB and UC */
-	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
-		num_var_ranges - num[MTRR_NUM_TYPES])
-		return 0;
-
-	memset(range, 0, sizeof(range));
-	nr_range = 0;
-	if (mtrr_tom2) {
-		range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
-		range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
-		if (highest_pfn < range[nr_range].end + 1)
-			highest_pfn = range[nr_range].end + 1;
-		nr_range++;
-	}
-	nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
-
-	total_trim_size = 0;
-	/* check the head */
-	if (range[0].start)
-		total_trim_size += real_trim_memory(0, range[0].start);
-	/* check the holes */
-	for (i = 0; i < nr_range - 1; i++) {
-		if (range[i].end + 1 < range[i+1].start)
-			total_trim_size += real_trim_memory(range[i].end + 1,
-							    range[i+1].start);
-	}
-	/* check the top */
-	i = nr_range - 1;
-	if (range[i].end + 1 < end_pfn)
-		total_trim_size += real_trim_memory(range[i].end + 1,
-							 end_pfn);
-
-	if (total_trim_size) {
-		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
-			" all of memory, losing %lluMB of RAM.\n",
-			total_trim_size >> 20);
-
-		if (!changed_by_mtrr_cleanup)
-			WARN_ON(1);
-
-		printk(KERN_INFO "update e820 for mtrr\n");
-		update_e820();
-
-		return 1;
-	}
-
-	return 0;
-}
+int __initdata changed_by_mtrr_cleanup;
 
 /**
  * mtrr_bp_init - initialize mtrrs on the boot CPU
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index ffd60409cc6..77f67f7b347 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if;
 
 extern unsigned int num_var_ranges;
 extern u64 mtrr_tom2;
+extern struct mtrr_state_type mtrr_state;
 
 void mtrr_state_warn(void);
 const char *mtrr_attrib_to_str(int x);
@@ -88,3 +89,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
 int amd_init_mtrr(void);
 int cyrix_init_mtrr(void);
 int centaur_init_mtrr(void);
+
+extern int changed_by_mtrr_cleanup;
+extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 01b1244ef1c..f93047fed79 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -7,15 +7,14 @@
 /*
  *	Get CPU information for use by the procfs.
  */
-#ifdef CONFIG_X86_32
 static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
 			      unsigned int cpu)
 {
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
 	if (c->x86_max_cores * smp_num_siblings > 1) {
 		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
 		seq_printf(m, "siblings\t: %d\n",
-			   cpus_weight(per_cpu(cpu_core_map, cpu)));
+			   cpumask_weight(cpu_sibling_mask(cpu)));
 		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
 		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 		seq_printf(m, "apicid\t\t: %d\n", c->apicid);
@@ -24,6 +23,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
 #endif
 }
 
+#ifdef CONFIG_X86_32
 static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
 {
 	/*
@@ -50,22 +50,6 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
 		   c->wp_works_ok ? "yes" : "no");
 }
 #else
-static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
-			      unsigned int cpu)
-{
-#ifdef CONFIG_SMP
-	if (c->x86_max_cores * smp_num_siblings > 1) {
-		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
-		seq_printf(m, "siblings\t: %d\n",
-			   cpus_weight(per_cpu(cpu_core_map, cpu)));
-		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
-		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
-		seq_printf(m, "apicid\t\t: %d\n", c->apicid);
-		seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
-	}
-#endif
-}
-
 static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
 {
 	seq_printf(m,
@@ -159,9 +143,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 static void *c_start(struct seq_file *m, loff_t *pos)
 {
 	if (*pos == 0)	/* just in case, cpu 0 is not the first */
-		*pos = first_cpu(cpu_online_map);
+		*pos = cpumask_first(cpu_online_mask);
 	else
-		*pos = next_cpu_nr(*pos - 1, cpu_online_map);
+		*pos = cpumask_next(*pos - 1, cpu_online_mask);
 	if ((*pos) < nr_cpu_ids)
 		return &cpu_data(*pos);
 	return NULL;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 52b3fefbd5a..bb62b3e5caa 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 #endif
 }
 
-static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
 	.c_vendor	= "Transmeta",
 	.c_ident	= { "GenuineTMx86", "TransmetaCPU" },
 	.c_early_init	= early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index e777f79e096..fd2c37bf7ac 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -8,7 +8,7 @@
  * so no special init takes place.
  */
 
-static struct cpu_dev umc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
 	.c_vendor	= "UMC",
 	.c_ident	= { "UMC UMC UMC" },
 	.c_models = {
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 169a120587b..87b67e3a765 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 
 	spin_unlock_irqrestore(&ds_lock, irq);
 
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
 	ds_resume_pebs(tracer);
 
 	return tracer;
@@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
 
 void ds_exit_thread(struct task_struct *tsk)
 {
-	WARN_ON(tsk->thread.ds_ctx);
 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 87d103ded1c..95ea5fa7d44 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -10,10 +10,12 @@
 #include <linux/kdebug.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
+#include <linux/ftrace.h>
 #include <linux/kexec.h>
 #include <linux/bug.h>
 #include <linux/nmi.h>
 #include <linux/sysfs.h>
+#include <linux/ftrace.h>
 
 #include <asm/stacktrace.h>
 
@@ -195,6 +197,11 @@ unsigned __kprobes long oops_begin(void)
 	int cpu;
 	unsigned long flags;
 
+	/* notify the hw-branch tracer so it may disable tracing and
+	   add the last trace to the trace buffer -
+	   the earlier this happens, the more useful the trace. */
+	trace_hw_branch_oops();
+
 	oops_enter();
 
 	/* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 508bec1cee2..ef2c3563357 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -110,19 +110,50 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
 /*
  * Add a memory region to the kernel e820 map.
  */
-void __init e820_add_region(u64 start, u64 size, int type)
+static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
+					 int type)
 {
-	int x = e820.nr_map;
+	int x = e820x->nr_map;
 
-	if (x == ARRAY_SIZE(e820.map)) {
+	if (x == ARRAY_SIZE(e820x->map)) {
 		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
 		return;
 	}
 
-	e820.map[x].addr = start;
-	e820.map[x].size = size;
-	e820.map[x].type = type;
-	e820.nr_map++;
+	e820x->map[x].addr = start;
+	e820x->map[x].size = size;
+	e820x->map[x].type = type;
+	e820x->nr_map++;
+}
+
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+	__e820_add_region(&e820, start, size, type);
+}
+
+static void __init e820_print_type(u32 type)
+{
+	switch (type) {
+	case E820_RAM:
+	case E820_RESERVED_KERN:
+		printk(KERN_CONT "(usable)");
+		break;
+	case E820_RESERVED:
+		printk(KERN_CONT "(reserved)");
+		break;
+	case E820_ACPI:
+		printk(KERN_CONT "(ACPI data)");
+		break;
+	case E820_NVS:
+		printk(KERN_CONT "(ACPI NVS)");
+		break;
+	case E820_UNUSABLE:
+		printk(KERN_CONT "(unusable)");
+		break;
+	default:
+		printk(KERN_CONT "type %u", type);
+		break;
+	}
 }
 
 void __init e820_print_map(char *who)
@@ -134,27 +165,8 @@ void __init e820_print_map(char *who)
 		       (unsigned long long) e820.map[i].addr,
 		       (unsigned long long)
 		       (e820.map[i].addr + e820.map[i].size));
-		switch (e820.map[i].type) {
-		case E820_RAM:
-		case E820_RESERVED_KERN:
-			printk(KERN_CONT "(usable)\n");
-			break;
-		case E820_RESERVED:
-			printk(KERN_CONT "(reserved)\n");
-			break;
-		case E820_ACPI:
-			printk(KERN_CONT "(ACPI data)\n");
-			break;
-		case E820_NVS:
-			printk(KERN_CONT "(ACPI NVS)\n");
-			break;
-		case E820_UNUSABLE:
-			printk("(unusable)\n");
-			break;
-		default:
-			printk(KERN_CONT "type %u\n", e820.map[i].type);
-			break;
-		}
+		e820_print_type(e820.map[i].type);
+		printk(KERN_CONT "\n");
 	}
 }
 
@@ -221,7 +233,7 @@ void __init e820_print_map(char *who)
  */
 
 int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
-				int *pnr_map)
+			     u32 *pnr_map)
 {
 	struct change_member {
 		struct e820entry *pbios; /* pointer to original bios entry */
@@ -417,11 +429,12 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
 	return __append_e820_map(biosmap, nr_map);
 }
 
-static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
 					u64 size, unsigned old_type,
 					unsigned new_type)
 {
-	int i;
+	u64 end;
+	unsigned int i;
 	u64 real_updated_size = 0;
 
 	BUG_ON(old_type == new_type);
@@ -429,27 +442,55 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
 	if (size > (ULLONG_MAX - start))
 		size = ULLONG_MAX - start;
 
-	for (i = 0; i < e820.nr_map; i++) {
+	end = start + size;
+	printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
+		       (unsigned long long) start,
+		       (unsigned long long) end);
+	e820_print_type(old_type);
+	printk(KERN_CONT " ==> ");
+	e820_print_type(new_type);
+	printk(KERN_CONT "\n");
+
+	for (i = 0; i < e820x->nr_map; i++) {
 		struct e820entry *ei = &e820x->map[i];
 		u64 final_start, final_end;
+		u64 ei_end;
+
 		if (ei->type != old_type)
 			continue;
-		/* totally covered? */
-		if (ei->addr >= start &&
-		    (ei->addr + ei->size) <= (start + size)) {
+
+		ei_end = ei->addr + ei->size;
+		/* totally covered by new range? */
+		if (ei->addr >= start && ei_end <= end) {
 			ei->type = new_type;
 			real_updated_size += ei->size;
 			continue;
 		}
+
+		/* new range is totally covered? */
+		if (ei->addr < start && ei_end > end) {
+			__e820_add_region(e820x, start, size, new_type);
+			__e820_add_region(e820x, end, ei_end - end, ei->type);
+			ei->size = start - ei->addr;
+			real_updated_size += size;
+			continue;
+		}
+
 		/* partially covered */
 		final_start = max(start, ei->addr);
-		final_end = min(start + size, ei->addr + ei->size);
+		final_end = min(end, ei_end);
 		if (final_start >= final_end)
 			continue;
-		e820_add_region(final_start, final_end - final_start,
-					 new_type);
+
+		__e820_add_region(e820x, final_start, final_end - final_start,
+				  new_type);
+
 		real_updated_size += final_end - final_start;
 
+		/*
+		 * left range could be head or tail, so need to update
+		 * size at first.
+		 */
 		ei->size -= final_end - final_start;
 		if (ei->addr < final_start)
 			continue;
@@ -461,13 +502,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
 u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 			     unsigned new_type)
 {
-	return e820_update_range_map(&e820, start, size, old_type, new_type);
+	return __e820_update_range(&e820, start, size, old_type, new_type);
 }
 
 static u64 __init e820_update_range_saved(u64 start, u64 size,
 					  unsigned old_type, unsigned new_type)
 {
-	return e820_update_range_map(&e820_saved, start, size, old_type,
+	return __e820_update_range(&e820_saved, start, size, old_type,
 				     new_type);
 }
 
@@ -511,7 +552,7 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
 
 void __init update_e820(void)
 {
-	int nr_map;
+	u32 nr_map;
 
 	nr_map = e820.nr_map;
 	if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
@@ -522,7 +563,7 @@ void __init update_e820(void)
 }
 static void __init update_e820_saved(void)
 {
-	int nr_map;
+	u32 nr_map;
 
 	nr_map = e820_saved.nr_map;
 	if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
@@ -1020,8 +1061,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
 			continue;
 		return addr;
 	}
-	return -1UL;
 
+	return -1ULL;
 }
 
 /*
@@ -1034,13 +1075,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
 	u64 start;
 
 	start = startt;
-	while (size < sizet)
+	while (size < sizet && (start + 1))
 		start = find_e820_area_size(start, &size, align);
 
 	if (size < sizet)
 		return 0;
 
+#ifdef CONFIG_X86_32
+	if (start >= MAXMEM)
+		return 0;
+	if (start + size > MAXMEM)
+		size = MAXMEM - start;
+#endif
+
 	addr = round_down(start + size - sizet, align);
+	if (addr < start)
+		return 0;
 	e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
 	e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
 	printk(KERN_INFO "update e820 for early_reserve_e820\n");
@@ -1253,7 +1303,7 @@ early_param("memmap", parse_memmap_opt);
 void __init finish_e820_parsing(void)
 {
 	if (userdef) {
-		int nr = e820.nr_map;
+		u32 nr = e820.nr_map;
 
 		if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
 			early_panic("Invalid user supplied memory map");
@@ -1336,7 +1386,7 @@ void __init e820_reserve_resources_late(void)
 char *__init default_machine_specific_memory_setup(void)
 {
 	char *who = "BIOS-e820";
-	int new_nr;
+	u32 new_nr;
 	/*
 	 * Try to copy the BIOS-supplied E820-map.
 	 *
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 639ad98238a..335f049d110 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void)
 	return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
 }
 
-static void dbgp_mdelay(int ms)
+static void __init dbgp_mdelay(int ms)
 {
 	int i;
 
@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size)
 	writel(hi, &ehci_debug->data47);
 }
 
-static void dbgp_get_data(void *buf, int size)
+static void __init dbgp_get_data(void *buf, int size)
 {
 	unsigned char *bytes = buf;
 	u32 lo, hi;
@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
 	return ret;
 }
 
-static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
 				 int size)
 {
 	u32 pids, addr, ctrl;
@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
 	return ret;
 }
 
-static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
-	int value, int index, void *data, int size)
+static int __init dbgp_control_msg(unsigned devnum, int requesttype,
+	int request, int value, int index, void *data, int size)
 {
 	u32 pids, addr, ctrl;
 	struct usb_ctrlrequest req;
@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
 	return 0;
 }
 
-static int ehci_reset_port(int port)
+static int __init ehci_reset_port(int port)
 {
 	u32 portsc;
 	u32 delay_time, delay;
@@ -532,7 +532,7 @@ static int ehci_reset_port(int port)
 	return -EBUSY;
 }
 
-static int ehci_wait_for_port(int port)
+static int __init ehci_wait_for_port(int port)
 {
 	u32 status;
 	int ret, reps;
@@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { }
 
 typedef void (*set_debug_port_t)(int port);
 
-static void default_set_debug_port(int port)
+static void __init default_set_debug_port(int port)
 {
 }
 
-static set_debug_port_t set_debug_port = default_set_debug_port;
+static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
 
-static void nvidia_set_debug_port(int port)
+static void __init nvidia_set_debug_port(int port)
 {
 	u32 dword;
 	dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index b205272ad39..1736acc4d7a 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void)
 	efi_memory_desc_t *md;
 	efi_status_t status;
 	unsigned long size;
-	u64 end, systab, addr, npages;
+	u64 end, systab, addr, npages, end_pfn;
 	void *p, *va;
 
 	efi.systab = NULL;
@@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void)
 		size = md->num_pages << EFI_PAGE_SHIFT;
 		end = md->phys_addr + size;
 
-		if (PFN_UP(end) <= max_low_pfn_mapped)
+		end_pfn = PFN_UP(end);
+		if (end_pfn <= max_low_pfn_mapped
+		    || (end_pfn > (1UL << (32 - PAGE_SHIFT))
+			&& end_pfn <= max_pfn_mapped))
 			va = __va(md->phys_addr);
 		else
 			va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index a4ee29127fd..22c3b7828c5 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void)
 
 void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
 {
-	static unsigned pages_mapped __initdata;
-	unsigned i, pages;
-	unsigned long offset;
+	unsigned long last_map_pfn;
 
-	pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr);
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-
-	if (pages_mapped + pages > MAX_EFI_IO_PAGES)
+	last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
+	if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
 		return NULL;
 
-	for (i = 0; i < pages; i++) {
-		__set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
-			     phys_addr, PAGE_KERNEL);
-		phys_addr += PAGE_SIZE;
-		pages_mapped++;
-	}
-
-	return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
-					     (pages_mapped - pages)) + offset;
+	return (void __iomem *)__va(phys_addr);
 }
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 899e8938e79..c929add475c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -442,8 +442,7 @@ sysenter_past_esp:
 
 	GET_THREAD_INFO(%ebp)
 
-	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz sysenter_audit
 sysenter_do_call:
 	cmpl $(nr_syscalls), %eax
@@ -454,7 +453,7 @@ sysenter_do_call:
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
-	testw $_TIF_ALLWORK_MASK, %cx
+	testl $_TIF_ALLWORK_MASK, %ecx
 	jne sysexit_audit
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
@@ -468,7 +467,7 @@ sysenter_exit:
 
 #ifdef CONFIG_AUDITSYSCALL
 sysenter_audit:
-	testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
 	jnz syscall_trace_entry
 	addl $4,%esp
 	CFI_ADJUST_CFA_OFFSET -4
@@ -485,7 +484,7 @@ sysenter_audit:
 	jmp sysenter_do_call
 
 sysexit_audit:
-	testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
 	jne syscall_exit_work
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)
@@ -498,7 +497,7 @@ sysexit_audit:
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
-	testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
 	jne syscall_exit_work
 	movl PT_EAX(%esp),%eax	/* reload syscall return value */
 	jmp sysenter_exit
@@ -523,8 +522,7 @@ ENTRY(system_call)
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation / emulation
-	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -538,7 +536,7 @@ syscall_exit:
 					# between sampling and the iret
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
-	testw $_TIF_ALLWORK_MASK, %cx	# current->work
+	testl $_TIF_ALLWORK_MASK, %ecx	# current->work
 	jne syscall_exit_work
 
 restore_all:
@@ -673,7 +671,7 @@ END(syscall_trace_entry)
 	# perform syscall exit tracing
 	ALIGN
 syscall_exit_work:
-	testb $_TIF_WORK_SYSCALL_EXIT, %cl
+	testl $_TIF_WORK_SYSCALL_EXIT, %ecx
 	jz work_pending
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 24c7031e23c..3f129d963a0 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -368,6 +368,7 @@ ENTRY(save_rest)
 END(save_rest)
 
 /* save complete stack frame */
+	.pushsection .kprobes.text, "ax"
 ENTRY(save_paranoid)
 	XCPT_FRAME 1 RDI+8
 	cld
@@ -396,6 +397,7 @@ ENTRY(save_paranoid)
 1:	ret
 	CFI_ENDPROC
 END(save_paranoid)
+	.popsection
 
 /*
  * A newly forked process directly context switches into this address.
@@ -416,7 +418,6 @@ ENTRY(ret_from_fork)
 
 	GET_THREAD_INFO(%rcx)
 
-	CFI_REMEMBER_STATE
 	RESTORE_REST
 
 	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
@@ -428,7 +429,6 @@ ENTRY(ret_from_fork)
 	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
 	jmp ret_from_sys_call			# go to the SYSRET fastpath
 
-	CFI_RESTORE_STATE
 	CFI_ENDPROC
 END(ret_from_fork)
 
@@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \
 #endif
 apicinterrupt LOCAL_TIMER_VECTOR \
 	apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt GENERIC_INTERRUPT_VECTOR \
+	generic_interrupt smp_generic_interrupt
 
 #ifdef CONFIG_SMP
 apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 231bdd3c5b1..61df7753212 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 
+#include <asm/cacheflush.h>
 #include <asm/ftrace.h>
 #include <linux/ftrace.h>
 #include <asm/nops.h>
@@ -26,6 +27,18 @@
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
+int ftrace_arch_code_modify_prepare(void)
+{
+	set_kernel_text_rw();
+	return 0;
+}
+
+int ftrace_arch_code_modify_post_process(void)
+{
+	set_kernel_text_ro();
+	return 0;
+}
+
 union ftrace_code_union {
 	char code[MCOUNT_INSN_SIZE];
 	struct {
@@ -66,11 +79,11 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
  *
  * 1) Put the instruction pointer into the IP buffer
  *    and the new code into the "code" buffer.
- * 2) Set a flag that says we are modifying code
- * 3) Wait for any running NMIs to finish.
- * 4) Write the code
- * 5) clear the flag.
- * 6) Wait for any running NMIs to finish.
+ * 2) Wait for any running NMIs to finish and set a flag that says
+ *    we are modifying code, it is done in an atomic operation.
+ * 3) Write the code
+ * 4) clear the flag.
+ * 5) Wait for any running NMIs to finish.
  *
  * If an NMI is executed, the first thing it does is to call
  * "ftrace_nmi_enter". This will check if the flag is set to write
@@ -82,9 +95,9 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
  * are the same as what exists.
  */
 
-static atomic_t in_nmi = ATOMIC_INIT(0);
+#define MOD_CODE_WRITE_FLAG (1 << 31)	/* set when NMI should do the write */
+static atomic_t nmi_running = ATOMIC_INIT(0);
 static int mod_code_status;		/* holds return value of text write */
-static int mod_code_write;		/* set when NMI should do the write */
 static void *mod_code_ip;		/* holds the IP to write to */
 static void *mod_code_newcode;		/* holds the text to write to the IP */
 
@@ -101,6 +114,20 @@ int ftrace_arch_read_dyn_info(char *buf, int size)
 	return r;
 }
 
+static void clear_mod_flag(void)
+{
+	int old = atomic_read(&nmi_running);
+
+	for (;;) {
+		int new = old & ~MOD_CODE_WRITE_FLAG;
+
+		if (old == new)
+			break;
+
+		old = atomic_cmpxchg(&nmi_running, old, new);
+	}
+}
+
 static void ftrace_mod_code(void)
 {
 	/*
@@ -111,37 +138,52 @@ static void ftrace_mod_code(void)
 	 */
 	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
 					     MCOUNT_INSN_SIZE);
+
+	/* if we fail, then kill any new writers */
+	if (mod_code_status)
+		clear_mod_flag();
 }
 
 void ftrace_nmi_enter(void)
 {
-	atomic_inc(&in_nmi);
-	/* Must have in_nmi seen before reading write flag */
-	smp_mb();
-	if (mod_code_write) {
+	if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
+		smp_rmb();
 		ftrace_mod_code();
 		atomic_inc(&nmi_update_count);
 	}
+	/* Must have previous changes seen before executions */
+	smp_mb();
 }
 
 void ftrace_nmi_exit(void)
 {
-	/* Finish all executions before clearing in_nmi */
-	smp_wmb();
-	atomic_dec(&in_nmi);
+	/* Finish all executions before clearing nmi_running */
+	smp_mb();
+	atomic_dec(&nmi_running);
+}
+
+static void wait_for_nmi_and_set_mod_flag(void)
+{
+	if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
+		return;
+
+	do {
+		cpu_relax();
+	} while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
+
+	nmi_wait_count++;
 }
 
 static void wait_for_nmi(void)
 {
-	int waited = 0;
+	if (!atomic_read(&nmi_running))
+		return;
 
-	while (atomic_read(&in_nmi)) {
-		waited = 1;
+	do {
 		cpu_relax();
-	}
+	} while (atomic_read(&nmi_running));
 
-	if (waited)
-		nmi_wait_count++;
+	nmi_wait_count++;
 }
 
 static int
@@ -151,14 +193,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
 	mod_code_newcode = new_code;
 
 	/* The buffers need to be visible before we let NMIs write them */
-	smp_wmb();
-
-	mod_code_write = 1;
-
-	/* Make sure write bit is visible before we wait on NMIs */
 	smp_mb();
 
-	wait_for_nmi();
+	wait_for_nmi_and_set_mod_flag();
 
 	/* Make sure all running NMIs have finished before we write the code */
 	smp_mb();
@@ -166,13 +203,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
 	ftrace_mod_code();
 
 	/* Make sure the write happens before clearing the bit */
-	smp_wmb();
-
-	mod_code_write = 0;
-
-	/* make sure NMIs see the cleared bit */
 	smp_mb();
 
+	clear_mod_flag();
 	wait_for_nmi();
 
 	return mod_code_status;
@@ -368,100 +401,8 @@ int ftrace_disable_ftrace_graph_caller(void)
 	return ftrace_mod_jmp(ip, old_offset, new_offset);
 }
 
-#else /* CONFIG_DYNAMIC_FTRACE */
-
-/*
- * These functions are picked from those used on
- * this page for dynamic ftrace. They have been
- * simplified to ignore all traces in NMI context.
- */
-static atomic_t in_nmi;
-
-void ftrace_nmi_enter(void)
-{
-	atomic_inc(&in_nmi);
-}
-
-void ftrace_nmi_exit(void)
-{
-	atomic_dec(&in_nmi);
-}
-
 #endif /* !CONFIG_DYNAMIC_FTRACE */
 
-/* Add a function return address to the trace stack on thread info.*/
-static int push_return_trace(unsigned long ret, unsigned long long time,
-				unsigned long func, int *depth)
-{
-	int index;
-
-	if (!current->ret_stack)
-		return -EBUSY;
-
-	/* The return trace stack is full */
-	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
-		atomic_inc(&current->trace_overrun);
-		return -EBUSY;
-	}
-
-	index = ++current->curr_ret_stack;
-	barrier();
-	current->ret_stack[index].ret = ret;
-	current->ret_stack[index].func = func;
-	current->ret_stack[index].calltime = time;
-	*depth = index;
-
-	return 0;
-}
-
-/* Retrieve a function return address to the trace stack on thread info.*/
-static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
-{
-	int index;
-
-	index = current->curr_ret_stack;
-
-	if (unlikely(index < 0)) {
-		ftrace_graph_stop();
-		WARN_ON(1);
-		/* Might as well panic, otherwise we have no where to go */
-		*ret = (unsigned long)panic;
-		return;
-	}
-
-	*ret = current->ret_stack[index].ret;
-	trace->func = current->ret_stack[index].func;
-	trace->calltime = current->ret_stack[index].calltime;
-	trace->overrun = atomic_read(&current->trace_overrun);
-	trace->depth = index;
-	barrier();
-	current->curr_ret_stack--;
-
-}
-
-/*
- * Send the trace to the ring-buffer.
- * @return the original return address.
- */
-unsigned long ftrace_return_to_handler(void)
-{
-	struct ftrace_graph_ret trace;
-	unsigned long ret;
-
-	pop_return_trace(&trace, &ret);
-	trace.rettime = cpu_clock(raw_smp_processor_id());
-	ftrace_graph_return(&trace);
-
-	if (unlikely(!ret)) {
-		ftrace_graph_stop();
-		WARN_ON(1);
-		/* Might as well panic. What else to do? */
-		ret = (unsigned long)panic;
-	}
-
-	return ret;
-}
-
 /*
  * Hook the return address and push it in the stack of return addrs
  * in current thread info.
@@ -469,14 +410,13 @@ unsigned long ftrace_return_to_handler(void)
 void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 {
 	unsigned long old;
-	unsigned long long calltime;
 	int faulted;
 	struct ftrace_graph_ent trace;
 	unsigned long return_hooker = (unsigned long)
 				&return_to_handler;
 
 	/* Nmi's are currently unsupported */
-	if (unlikely(atomic_read(&in_nmi)))
+	if (unlikely(in_nmi()))
 		return;
 
 	if (unlikely(atomic_read(&current->tracing_graph_pause)))
@@ -512,17 +452,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		return;
 	}
 
-	if (unlikely(!__kernel_text_address(old))) {
-		ftrace_graph_stop();
-		*parent = old;
-		WARN_ON(1);
-		return;
-	}
-
-	calltime = cpu_clock(raw_smp_processor_id());
-
-	if (push_return_trace(old, calltime,
-				self_addr, &trace.depth) == -EBUSY) {
+	if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) {
 		*parent = old;
 		return;
 	}
@@ -536,3 +466,66 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 	}
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+
+extern unsigned long __start_syscalls_metadata[];
+extern unsigned long __stop_syscalls_metadata[];
+extern unsigned long *sys_call_table;
+
+static struct syscall_metadata **syscalls_metadata;
+
+static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
+{
+	struct syscall_metadata *start;
+	struct syscall_metadata *stop;
+	char str[KSYM_SYMBOL_LEN];
+
+
+	start = (struct syscall_metadata *)__start_syscalls_metadata;
+	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
+	kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
+
+	for ( ; start < stop; start++) {
+		if (start->name && !strcmp(start->name, str))
+			return start;
+	}
+	return NULL;
+}
+
+struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+	if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0)
+		return NULL;
+
+	return syscalls_metadata[nr];
+}
+
+void arch_init_ftrace_syscalls(void)
+{
+	int i;
+	struct syscall_metadata *meta;
+	unsigned long **psys_syscall_table = &sys_call_table;
+	static atomic_t refs;
+
+	if (atomic_inc_return(&refs) != 1)
+		goto end;
+
+	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
+					FTRACE_SYSCALL_MAX, GFP_KERNEL);
+	if (!syscalls_metadata) {
+		WARN_ON(1);
+		return;
+	}
+
+	for (i = 0; i < FTRACE_SYSCALL_MAX; i++) {
+		meta = find_syscall_meta(psys_syscall_table[i]);
+		syscalls_metadata[i] = meta;
+	}
+	return;
+
+	/* Paranoid: avoid overflow */
+end:
+	atomic_dec(&refs);
+}
+#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1fe18..3f8579f8d42 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,7 +18,7 @@ void __init i386_start_kernel(void)
 {
 	reserve_trampoline_memory();
 
-	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* Reserve INITRD */
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
 		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
 	}
 #endif
-	reserve_early(init_pg_tables_start, init_pg_tables_end,
-			"INIT_PG_TABLE");
-
 	reserve_ebda_region();
 
 	/*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f5b27224769..70eaa852c73 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -100,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
 
 	reserve_trampoline_memory();
 
-	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* Reserve INITRD */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c32ca19d591..30683883e0c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -38,42 +38,40 @@
 #define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
 
 /*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
+ * This is how much memory in addition to the memory covered up to
+ * and including _end we need mapped initially.
  * We need:
- *  - one bit for each possible page, but only in low memory, which means
- *     2^32/4096/8 = 128K worst case (4G/4G split.)
- *  - enough space to map all low memory, which means
- *     (2^32/4096) / 1024 pages (worst case, non PAE)
- *     (2^32/4096) / 512 + 4 pages (worst case for PAE)
- *  - a few pages for allocator use before the kernel pagetable has
- *     been set up
+ *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
  *
  * Modulo rounding, each megabyte assigned here requires a kilobyte of
  * memory, which is currently unreclaimed.
  *
  * This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and small than max_low_pfn, otherwise will waste some page table entries
  */
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
-
-/*
- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
- * pagetables from above the 16MB DMA limit, so we'll have to set
- * up pagetables 16MB more (worst-case):
- */
-#ifdef CONFIG_DEBUG_PAGEALLOC
-LOW_PAGES = LOW_PAGES + 0x1000000
-#endif
 
 #if PTRS_PER_PMD > 1
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
 #else
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
 #endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
-ALLOCATOR_SLOP = 4
 
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+/* Enough space to fit pagetables for the low memory linear map */
+MAPPING_BEYOND_END = \
+	PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
+
+/*
+ * Worst-case size of the kernel mapping we need to make:
+ * the worst-case size of the kernel itself, plus the extra we need
+ * to map for the linear map.
+ */
+KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
+
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
+RESERVE_BRK(pagetables, INIT_MAP_SIZE)
 
 /*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
@@ -166,10 +164,10 @@ num_subarch_entries = (. - subarch_entries) / 4
 
 /*
  * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond _end.  The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * tables, which are located immediately beyond __brk_base.  The variable
+ * _brk_end is set up to point to the first "safe" location.
  * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end.
  *
  * Note that the stack is not yet set up!
  */
@@ -190,8 +188,7 @@ default_entry:
 
 	xorl %ebx,%ebx				/* %ebx is kept at zero */
 
-	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
+	movl $pa(__brk_base), %edi
 	movl $pa(swapper_pg_pmd), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -209,14 +206,14 @@ default_entry:
 	loop 11b
 
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables.
+	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
 1:
-	movl %edi,pa(init_pg_tables_end)
+	addl $__PAGE_OFFSET, %edi
+	movl %edi, pa(_brk_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -227,8 +224,7 @@ default_entry:
 
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
-	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
+	movl $pa(__brk_base), %edi
 	movl $pa(swapper_pg_dir), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -242,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	addl $0x1000,%eax
 	loop 11b
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables; the +0x007 is
-	 * the attribute bits
+	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
-	movl %edi,pa(init_pg_tables_end)
+	addl $__PAGE_OFFSET, %edi
+	movl %edi, pa(_brk_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -636,6 +631,7 @@ swapper_pg_fixmap:
 	.fill 1024,4,0
 ENTRY(empty_zero_page)
 	.fill 4096,1,0
+
 /*
  * This starts the data section.
  */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index a00545fe5cd..648b3a2a3a4 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -80,6 +80,7 @@ static inline void hpet_clear_mapping(void)
  */
 static int boot_hpet_disable;
 int hpet_force_user;
+static int hpet_verbose;
 
 static int __init hpet_setup(char *str)
 {
@@ -88,6 +89,8 @@ static int __init hpet_setup(char *str)
 			boot_hpet_disable = 1;
 		if (!strncmp("force", str, 5))
 			hpet_force_user = 1;
+		if (!strncmp("verbose", str, 7))
+			hpet_verbose = 1;
 	}
 	return 1;
 }
@@ -119,6 +122,43 @@ int is_hpet_enabled(void)
 }
 EXPORT_SYMBOL_GPL(is_hpet_enabled);
 
+static void _hpet_print_config(const char *function, int line)
+{
+	u32 i, timers, l, h;
+	printk(KERN_INFO "hpet: %s(%d):\n", function, line);
+	l = hpet_readl(HPET_ID);
+	h = hpet_readl(HPET_PERIOD);
+	timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+	printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h);
+	l = hpet_readl(HPET_CFG);
+	h = hpet_readl(HPET_STATUS);
+	printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h);
+	l = hpet_readl(HPET_COUNTER);
+	h = hpet_readl(HPET_COUNTER+4);
+	printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h);
+
+	for (i = 0; i < timers; i++) {
+		l = hpet_readl(HPET_Tn_CFG(i));
+		h = hpet_readl(HPET_Tn_CFG(i)+4);
+		printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n",
+		       i, l, h);
+		l = hpet_readl(HPET_Tn_CMP(i));
+		h = hpet_readl(HPET_Tn_CMP(i)+4);
+		printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n",
+		       i, l, h);
+		l = hpet_readl(HPET_Tn_ROUTE(i));
+		h = hpet_readl(HPET_Tn_ROUTE(i)+4);
+		printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n",
+		       i, l, h);
+	}
+}
+
+#define hpet_print_config()					\
+do {								\
+	if (hpet_verbose)					\
+		_hpet_print_config(__FUNCTION__, __LINE__);	\
+} while (0)
+
 /*
  * When the hpet driver (/dev/hpet) is enabled, we need to reserve
  * timer 0 and timer 1 in case of RTC emulation.
@@ -191,27 +231,37 @@ static struct clock_event_device hpet_clockevent = {
 	.rating		= 50,
 };
 
-static void hpet_start_counter(void)
+static void hpet_stop_counter(void)
 {
 	unsigned long cfg = hpet_readl(HPET_CFG);
-
 	cfg &= ~HPET_CFG_ENABLE;
 	hpet_writel(cfg, HPET_CFG);
 	hpet_writel(0, HPET_COUNTER);
 	hpet_writel(0, HPET_COUNTER + 4);
+}
+
+static void hpet_start_counter(void)
+{
+	unsigned long cfg = hpet_readl(HPET_CFG);
 	cfg |= HPET_CFG_ENABLE;
 	hpet_writel(cfg, HPET_CFG);
 }
 
+static void hpet_restart_counter(void)
+{
+	hpet_stop_counter();
+	hpet_start_counter();
+}
+
 static void hpet_resume_device(void)
 {
 	force_hpet_resume();
 }
 
-static void hpet_restart_counter(void)
+static void hpet_resume_counter(void)
 {
 	hpet_resume_device();
-	hpet_start_counter();
+	hpet_restart_counter();
 }
 
 static void hpet_enable_legacy_int(void)
@@ -259,29 +309,23 @@ static int hpet_setup_msi_irq(unsigned int irq);
 static void hpet_set_mode(enum clock_event_mode mode,
 			  struct clock_event_device *evt, int timer)
 {
-	unsigned long cfg, cmp, now;
+	unsigned long cfg;
 	uint64_t delta;
 
 	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
+		hpet_stop_counter();
 		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
 		delta >>= evt->shift;
-		now = hpet_readl(HPET_COUNTER);
-		cmp = now + (unsigned long) delta;
 		cfg = hpet_readl(HPET_Tn_CFG(timer));
 		/* Make sure we use edge triggered interrupts */
 		cfg &= ~HPET_TN_LEVEL;
 		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
 		       HPET_TN_SETVAL | HPET_TN_32BIT;
 		hpet_writel(cfg, HPET_Tn_CFG(timer));
-		/*
-		 * The first write after writing TN_SETVAL to the
-		 * config register sets the counter value, the second
-		 * write sets the period.
-		 */
-		hpet_writel(cmp, HPET_Tn_CMP(timer));
-		udelay(1);
 		hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
+		hpet_start_counter();
+		hpet_print_config();
 		break;
 
 	case CLOCK_EVT_MODE_ONESHOT:
@@ -308,6 +352,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
 			irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
 			enable_irq(hdev->irq);
 		}
+		hpet_print_config();
 		break;
 	}
 }
@@ -526,6 +571,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 
 	num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
 	num_timers++; /* Value read out starts from 0 */
+	hpet_print_config();
 
 	hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
 	if (!hpet_devs)
@@ -695,7 +741,7 @@ static struct clocksource clocksource_hpet = {
 	.mask		= HPET_MASK,
 	.shift		= HPET_SHIFT,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-	.resume		= hpet_restart_counter,
+	.resume		= hpet_resume_counter,
 #ifdef CONFIG_X86_64
 	.vread		= vread_hpet,
 #endif
@@ -707,7 +753,7 @@ static int hpet_clocksource_register(void)
 	cycle_t t1;
 
 	/* Start the counter */
-	hpet_start_counter();
+	hpet_restart_counter();
 
 	/* Verify whether hpet counter works */
 	t1 = read_hpet();
@@ -793,6 +839,7 @@ int __init hpet_enable(void)
 	 * information and the number of channels
 	 */
 	id = hpet_readl(HPET_ID);
+	hpet_print_config();
 
 #ifdef CONFIG_HPET_EMULATE_RTC
 	/*
@@ -845,6 +892,7 @@ static __init int hpet_late_init(void)
 		return -ENODEV;
 
 	hpet_reserve_platform_timers(hpet_readl(HPET_ID));
+	hpet_print_config();
 
 	for_each_online_cpu(cpu) {
 		hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0dcd0..f2f8540a7f3 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
 #ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
 		memset(tsk->thread.xstate, 0, xstate_size);
-		finit();
+		finit_task(tsk);
 		set_stopped_child_used_math(tsk);
 		return 0;
 	}
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 10f92fb532f..3475440baa5 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,17 +3,17 @@
  *
  */
 #include <linux/clockchips.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/spinlock.h>
 #include <linux/jiffies.h>
 #include <linux/module.h>
-#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
 
-#include <asm/smp.h>
-#include <asm/delay.h>
 #include <asm/i8253.h>
-#include <asm/io.h>
 #include <asm/hpet.h>
+#include <asm/smp.h>
 
 DEFINE_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
@@ -40,7 +40,7 @@ static void init_pit_timer(enum clock_event_mode mode,
 {
 	spin_lock(&i8253_lock);
 
-	switch(mode) {
+	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
 		/* binary, mode 2, LSB/MSB, ch 0 */
 		outb_pit(0x34, PIT_MODE);
@@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
  * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
  * !using_apic_timer decisions in do_timer_interrupt_hook()
  */
-static struct clock_event_device pit_clockevent = {
+static struct clock_event_device pit_ce = {
 	.name		= "pit",
 	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
 	.set_mode	= init_pit_timer,
@@ -114,15 +114,13 @@ void __init setup_pit_timer(void)
 	 * Start pit with the boot cpu mask and make it global after the
 	 * IO_APIC has been initialized.
 	 */
-	pit_clockevent.cpumask = cpumask_of(smp_processor_id());
-	pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
-				     pit_clockevent.shift);
-	pit_clockevent.max_delta_ns =
-		clockevent_delta2ns(0x7FFF, &pit_clockevent);
-	pit_clockevent.min_delta_ns =
-		clockevent_delta2ns(0xF, &pit_clockevent);
-	clockevents_register_device(&pit_clockevent);
-	global_clock_event = &pit_clockevent;
+	pit_ce.cpumask = cpumask_of(smp_processor_id());
+	pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
+	pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
+	pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
+
+	clockevents_register_device(&pit_ce);
+	global_clock_event = &pit_ce;
 }
 
 #ifndef CONFIG_X86_64
@@ -133,11 +131,11 @@ void __init setup_pit_timer(void)
  */
 static cycle_t pit_read(void)
 {
+	static int old_count;
+	static u32 old_jifs;
 	unsigned long flags;
 	int count;
 	u32 jifs;
-	static int old_count;
-	static u32 old_jifs;
 
 	spin_lock_irqsave(&i8253_lock, flags);
 	/*
@@ -179,9 +177,9 @@ static cycle_t pit_read(void)
 	 * Previous attempts to handle these cases intelligently were
 	 * buggy, so we just do the simple thing now.
 	 */
-	if (count > old_count && jifs == old_jifs) {
+	if (count > old_count && jifs == old_jifs)
 		count = old_count;
-	}
+
 	old_count = count;
 	old_jifs = jifs;
 
@@ -192,13 +190,13 @@ static cycle_t pit_read(void)
 	return (cycle_t)(jifs * LATCH) + count;
 }
 
-static struct clocksource clocksource_pit = {
-	.name	= "pit",
-	.rating = 110,
-	.read	= pit_read,
-	.mask	= CLOCKSOURCE_MASK(32),
-	.mult	= 0,
-	.shift	= 20,
+static struct clocksource pit_cs = {
+	.name		= "pit",
+	.rating		= 110,
+	.read		= pit_read,
+	.mask		= CLOCKSOURCE_MASK(32),
+	.mult		= 0,
+	.shift		= 20,
 };
 
 static void pit_disable_clocksource(void)
@@ -206,9 +204,9 @@ static void pit_disable_clocksource(void)
 	/*
 	 * Use mult to check whether it is registered or not
 	 */
-	if (clocksource_pit.mult) {
-		clocksource_unregister(&clocksource_pit);
-		clocksource_pit.mult = 0;
+	if (pit_cs.mult) {
+		clocksource_unregister(&pit_cs);
+		pit_cs.mult = 0;
 	}
 }
 
@@ -222,13 +220,13 @@ static int __init init_pit_clocksource(void)
 	  * - when local APIC timer is active (PIT is switched off)
 	  */
 	if (num_possible_cpus() > 1 || is_hpet_enabled() ||
-	    pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
+	    pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
 		return 0;
 
-	clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE,
-						   clocksource_pit.shift);
-	return clocksource_register(&clocksource_pit);
+	pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
+
+	return clocksource_register(&pit_cs);
 }
 arch_initcall(init_pit_clocksource);
 
-#endif
+#endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 720d2607aac..a979b5bd2fc 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -7,10 +7,10 @@
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/init.h>
 #include <linux/dmi.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
 
@@ -47,8 +47,7 @@ EXPORT_SYMBOL(native_io_delay);
 static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
 {
 	if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
-		printk(KERN_NOTICE "%s: using 0xed I/O delay port\n",
-			id->ident);
+		pr_notice("%s: using 0xed I/O delay port\n", id->ident);
 		io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
 	}
 
@@ -64,40 +63,40 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
 		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "Compaq Presario V6000",
 		.matches	= {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-			DMI_MATCH(DMI_BOARD_NAME, "30B7")
+			DMI_MATCH(DMI_BOARD_VENDOR,	"Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME,	"30B7")
 		}
 	},
 	{
 		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "HP Pavilion dv9000z",
 		.matches	= {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-			DMI_MATCH(DMI_BOARD_NAME, "30B9")
+			DMI_MATCH(DMI_BOARD_VENDOR,	"Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME,	"30B9")
 		}
 	},
 	{
 		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "HP Pavilion dv6000",
 		.matches	= {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-			DMI_MATCH(DMI_BOARD_NAME, "30B8")
+			DMI_MATCH(DMI_BOARD_VENDOR,	"Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME,	"30B8")
 		}
 	},
 	{
 		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "HP Pavilion tx1000",
 		.matches	= {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-			DMI_MATCH(DMI_BOARD_NAME, "30BF")
+			DMI_MATCH(DMI_BOARD_VENDOR,	"Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME,	"30BF")
 		}
 	},
 	{
 		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "Presario F700",
 		.matches	= {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-			DMI_MATCH(DMI_BOARD_NAME, "30D3")
+			DMI_MATCH(DMI_BOARD_VENDOR,	"Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME,	"30D3")
 		}
 	},
 	{ }
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index e41980a373a..99c4d308f16 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -85,19 +85,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 
 	t->io_bitmap_max = bytes;
 
-#ifdef CONFIG_X86_32
-	/*
-	 * Sets the lazy trigger so that the next I/O operation will
-	 * reload the correct bitmap.
-	 * Reset the owner so that a process switch will not set
-	 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
-	 */
-	tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
-	tss->io_bitmap_owner = NULL;
-#else
 	/* Update the TSS: */
 	memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
-#endif
 
 	put_cpu();
 
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 7c95c8918a8..9c2754302ec 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -15,6 +15,9 @@
 
 atomic_t irq_err_count;
 
+/* Function pointer for generic interrupt vector handling */
+void (*generic_interrupt_extension)(void) = NULL;
+
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
  * each architecture has to answer this themselves.
@@ -42,59 +45,64 @@ void ack_bad_irq(unsigned int irq)
 /*
  * /proc/interrupts printing:
  */
-static int show_other_interrupts(struct seq_file *p)
+static int show_other_interrupts(struct seq_file *p, int prec)
 {
 	int j;
 
-	seq_printf(p, "NMI: ");
+	seq_printf(p, "%*s: ", prec, "NMI");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
 	seq_printf(p, "  Non-maskable interrupts\n");
 #ifdef CONFIG_X86_LOCAL_APIC
-	seq_printf(p, "LOC: ");
+	seq_printf(p, "%*s: ", prec, "LOC");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
 	seq_printf(p, "  Local timer interrupts\n");
+
+	seq_printf(p, "%*s: ", prec, "SPU");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+	seq_printf(p, "  Spurious interrupts\n");
 	seq_printf(p, "CNT: ");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance counter interrupts\n");
 #endif
+	if (generic_interrupt_extension) {
+		seq_printf(p, "PLT: ");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+		seq_printf(p, "  Platform interrupts\n");
+	}
 #ifdef CONFIG_SMP
-	seq_printf(p, "RES: ");
+	seq_printf(p, "%*s: ", prec, "RES");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
 	seq_printf(p, "  Rescheduling interrupts\n");
-	seq_printf(p, "CAL: ");
+	seq_printf(p, "%*s: ", prec, "CAL");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
 	seq_printf(p, "  Function call interrupts\n");
-	seq_printf(p, "TLB: ");
+	seq_printf(p, "%*s: ", prec, "TLB");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
 	seq_printf(p, "  TLB shootdowns\n");
 #endif
 #ifdef CONFIG_X86_MCE
-	seq_printf(p, "TRM: ");
+	seq_printf(p, "%*s: ", prec, "TRM");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
 	seq_printf(p, "  Thermal event interrupts\n");
 # ifdef CONFIG_X86_64
-	seq_printf(p, "THR: ");
+	seq_printf(p, "%*s: ", prec, "THR");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
 	seq_printf(p, "  Threshold APIC interrupts\n");
 # endif
 #endif
-#ifdef CONFIG_X86_LOCAL_APIC
-	seq_printf(p, "SPU: ");
-	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
-	seq_printf(p, "  Spurious interrupts\n");
-#endif
-	seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
-	seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+	seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
 #endif
 	return 0;
 }
@@ -102,19 +110,22 @@ static int show_other_interrupts(struct seq_file *p)
 int show_interrupts(struct seq_file *p, void *v)
 {
 	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j;
+	int i = *(loff_t *) v, j, prec;
 	struct irqaction *action;
 	struct irq_desc *desc;
 
 	if (i > nr_irqs)
 		return 0;
 
+	for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+		j *= 10;
+
 	if (i == nr_irqs)
-		return show_other_interrupts(p);
+		return show_other_interrupts(p, prec);
 
 	/* print header */
 	if (i == 0) {
-		seq_printf(p, "           ");
+		seq_printf(p, "%*s", prec + 8, "");
 		for_each_online_cpu(j)
 			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
@@ -125,23 +136,15 @@ int show_interrupts(struct seq_file *p, void *v)
 		return 0;
 
 	spin_lock_irqsave(&desc->lock, flags);
-#ifndef CONFIG_SMP
-	any_count = kstat_irqs(i);
-#else
 	for_each_online_cpu(j)
 		any_count |= kstat_irqs_cpu(i, j);
-#endif
 	action = desc->action;
 	if (!action && !any_count)
 		goto out;
 
-	seq_printf(p, "%3d: ", i);
-#ifndef CONFIG_SMP
-	seq_printf(p, "%10u ", kstat_irqs(i));
-#else
+	seq_printf(p, "%*d: ", prec, i);
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-#endif
 	seq_printf(p, " %8s", desc->chip->name);
 	seq_printf(p, "-%-8s", desc->name);
 
@@ -166,8 +169,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
+	sum += irq_stats(cpu)->irq_spurious_count;
 	sum += irq_stats(cpu)->apic_perf_irqs;
 #endif
+	if (generic_interrupt_extension)
+		sum += irq_stats(cpu)->generic_irqs;
 #ifdef CONFIG_SMP
 	sum += irq_stats(cpu)->irq_resched_count;
 	sum += irq_stats(cpu)->irq_call_count;
@@ -179,9 +185,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->irq_threshold_count;
 #endif
 #endif
-#ifdef CONFIG_X86_LOCAL_APIC
-	sum += irq_stats(cpu)->irq_spurious_count;
-#endif
 	return sum;
 }
 
@@ -231,4 +234,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	return 1;
 }
 
+/*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+void smp_generic_interrupt(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	ack_APIC_irq();
+
+	exit_idle();
+
+	irq_enter();
+
+	inc_irq_stat(generic_irqs);
+
+	if (generic_interrupt_extension)
+		generic_interrupt_extension();
+
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9dc6b2b2427..3b09634a515 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/uaccess.h>
+#include <linux/percpu.h>
 
 #include <asm/apic.h>
 
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
 union irq_ctx {
 	struct thread_info      tinfo;
 	u32                     stack[THREAD_SIZE/sizeof(u32)];
-};
+} __attribute__((aligned(PAGE_SIZE)));
 
-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
+static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
 
-static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
-static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
 
 static void call_on_stack(void *func, void *stack)
 {
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 	u32 *isp, arg1, arg2;
 
 	curctx = (union irq_ctx *) current_thread_info();
-	irqctx = hardirq_ctx[smp_processor_id()];
+	irqctx = __get_cpu_var(hardirq_ctx);
 
 	/*
 	 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
 {
 	union irq_ctx *irqctx;
 
-	if (hardirq_ctx[cpu])
+	if (per_cpu(hardirq_ctx, cpu))
 		return;
 
-	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+	irqctx = &per_cpu(hardirq_stack, cpu);
 	irqctx->tinfo.task		= NULL;
 	irqctx->tinfo.exec_domain	= NULL;
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
-	hardirq_ctx[cpu] = irqctx;
+	per_cpu(hardirq_ctx, cpu) = irqctx;
 
-	irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
+	irqctx = &per_cpu(softirq_stack, cpu);
 	irqctx->tinfo.task		= NULL;
 	irqctx->tinfo.exec_domain	= NULL;
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= 0;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
-	softirq_ctx[cpu] = irqctx;
+	per_cpu(softirq_ctx, cpu) = irqctx;
 
 	printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
-	       cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
+	       cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu));
 }
 
 void irq_ctx_exit(int cpu)
 {
-	hardirq_ctx[cpu] = NULL;
+	per_cpu(hardirq_ctx, cpu) = NULL;
 }
 
 asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
 
 	if (local_softirq_pending()) {
 		curctx = current_thread_info();
-		irqctx = softirq_ctx[smp_processor_id()];
+		irqctx = __get_cpu_var(softirq_ctx);
 		irqctx->tinfo.task = curctx->task;
 		irqctx->tinfo.previous_esp = current_stack_pointer;
 
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index f3e11cb295c..925d87cfc55 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -50,7 +50,6 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
  */
 static struct irqaction fpu_irq = {
 	.handler = math_error_irq,
-	.mask = CPU_MASK_NONE,
 	.name = "fpu",
 };
 
@@ -83,7 +82,6 @@ void __init init_ISA_irqs(void)
  */
 static struct irqaction irq2 = {
 	.handler = no_action,
-	.mask = CPU_MASK_NONE,
 	.name = "cascade",
 };
 
@@ -160,6 +158,9 @@ static void __init apic_intr_init(void)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
+	/* generic IPI for platform specific use */
+	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 16e1fc68750..665e2ab48ab 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -45,7 +45,6 @@
 
 static struct irqaction irq2 = {
 	.handler = no_action,
-	.mask = CPU_MASK_NONE,
 	.name = "cascade",
 };
 DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -147,6 +146,9 @@ static void __init apic_intr_init(void)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
+	/* generic IPI for platform specific use */
+	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index ff7d3b0124f..e444357375c 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -8,11 +8,11 @@
  */
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
-#include <linux/stat.h>
+#include <linux/module.h>
 #include <linux/init.h>
+#include <linux/stat.h>
 #include <linux/io.h>
 #include <linux/mm.h>
-#include <linux/module.h>
 
 #include <asm/setup.h>
 
@@ -26,9 +26,8 @@ struct setup_data_node {
 	u32 len;
 };
 
-static ssize_t
-setup_data_read(struct file *file, char __user *user_buf, size_t count,
-		loff_t *ppos)
+static ssize_t setup_data_read(struct file *file, char __user *user_buf,
+			       size_t count, loff_t *ppos)
 {
 	struct setup_data_node *node = file->private_data;
 	unsigned long remain;
@@ -39,20 +38,21 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
 
 	if (pos < 0)
 		return -EINVAL;
+
 	if (pos >= node->len)
 		return 0;
 
 	if (count > node->len - pos)
 		count = node->len - pos;
+
 	pa = node->paddr + sizeof(struct setup_data) + pos;
 	pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
 	if (PageHighMem(pg)) {
 		p = ioremap_cache(pa, count);
 		if (!p)
 			return -ENXIO;
-	} else {
+	} else
 		p = __va(pa);
-	}
 
 	remain = copy_to_user(user_buf, p, count);
 
@@ -70,12 +70,13 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
 static int setup_data_open(struct inode *inode, struct file *file)
 {
 	file->private_data = inode->i_private;
+
 	return 0;
 }
 
 static const struct file_operations fops_setup_data = {
-	.read =		setup_data_read,
-	.open =		setup_data_open,
+	.read		= setup_data_read,
+	.open		= setup_data_open,
 };
 
 static int __init
@@ -84,57 +85,50 @@ create_setup_data_node(struct dentry *parent, int no,
 {
 	struct dentry *d, *type, *data;
 	char buf[16];
-	int error;
 
 	sprintf(buf, "%d", no);
 	d = debugfs_create_dir(buf, parent);
-	if (!d) {
-		error = -ENOMEM;
-		goto err_return;
-	}
+	if (!d)
+		return -ENOMEM;
+
 	type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
-	if (!type) {
-		error = -ENOMEM;
+	if (!type)
 		goto err_dir;
-	}
+
 	data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
-	if (!data) {
-		error = -ENOMEM;
+	if (!data)
 		goto err_type;
-	}
+
 	return 0;
 
 err_type:
 	debugfs_remove(type);
 err_dir:
 	debugfs_remove(d);
-err_return:
-	return error;
+	return -ENOMEM;
 }
 
 static int __init create_setup_data_nodes(struct dentry *parent)
 {
 	struct setup_data_node *node;
 	struct setup_data *data;
-	int error, no = 0;
+	int error = -ENOMEM;
 	struct dentry *d;
 	struct page *pg;
 	u64 pa_data;
+	int no = 0;
 
 	d = debugfs_create_dir("setup_data", parent);
-	if (!d) {
-		error = -ENOMEM;
-		goto err_return;
-	}
+	if (!d)
+		return -ENOMEM;
 
 	pa_data = boot_params.hdr.setup_data;
 
 	while (pa_data) {
 		node = kmalloc(sizeof(*node), GFP_KERNEL);
-		if (!node) {
-			error = -ENOMEM;
+		if (!node)
 			goto err_dir;
-		}
+
 		pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
 		if (PageHighMem(pg)) {
 			data = ioremap_cache(pa_data, sizeof(*data));
@@ -143,9 +137,8 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 				error = -ENXIO;
 				goto err_dir;
 			}
-		} else {
+		} else
 			data = __va(pa_data);
-		}
 
 		node->paddr = pa_data;
 		node->type = data->type;
@@ -159,11 +152,11 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 			goto err_dir;
 		no++;
 	}
+
 	return 0;
 
 err_dir:
 	debugfs_remove(d);
-err_return:
 	return error;
 }
 
@@ -175,28 +168,26 @@ static struct debugfs_blob_wrapper boot_params_blob = {
 static int __init boot_params_kdebugfs_init(void)
 {
 	struct dentry *dbp, *version, *data;
-	int error;
+	int error = -ENOMEM;
 
 	dbp = debugfs_create_dir("boot_params", NULL);
-	if (!dbp) {
-		error = -ENOMEM;
-		goto err_return;
-	}
+	if (!dbp)
+		return -ENOMEM;
+
 	version = debugfs_create_x16("version", S_IRUGO, dbp,
 				     &boot_params.hdr.version);
-	if (!version) {
-		error = -ENOMEM;
+	if (!version)
 		goto err_dir;
-	}
+
 	data = debugfs_create_blob("data", S_IRUGO, dbp,
 				   &boot_params_blob);
-	if (!data) {
-		error = -ENOMEM;
+	if (!data)
 		goto err_version;
-	}
+
 	error = create_setup_data_nodes(dbp);
 	if (error)
 		goto err_data;
+
 	return 0;
 
 err_data:
@@ -205,10 +196,9 @@ err_version:
 	debugfs_remove(version);
 err_dir:
 	debugfs_remove(dbp);
-err_return:
 	return error;
 }
-#endif
+#endif /* CONFIG_DEBUG_BOOT_PARAMS */
 
 static int __init arch_kdebugfs_init(void)
 {
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e948b28a5a9..7b5169d2b00 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -193,6 +193,9 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
 	kprobe_opcode_t opcode;
 	kprobe_opcode_t *orig_opcodes = opcodes;
 
+	if (search_exception_tables((unsigned long)opcodes))
+		return 0;	/* Page fault may occur on this address. */
+
 retry:
 	if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
 		return 0;
@@ -635,13 +638,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
 #else
 			"	pushf\n"
 			/*
-			 * Skip cs, ip, orig_ax.
+			 * Skip cs, ip, orig_ax and gs.
 			 * trampoline_handler() will plug in these values
 			 */
-			"	subl $12, %esp\n"
+			"	subl $16, %esp\n"
 			"	pushl %fs\n"
-			"	pushl %ds\n"
 			"	pushl %es\n"
+			"	pushl %ds\n"
 			"	pushl %eax\n"
 			"	pushl %ebp\n"
 			"	pushl %edi\n"
@@ -652,10 +655,10 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
 			"	movl %esp, %eax\n"
 			"	call trampoline_handler\n"
 			/* Move flags to cs */
-			"	movl 52(%esp), %edx\n"
-			"	movl %edx, 48(%esp)\n"
+			"	movl 56(%esp), %edx\n"
+			"	movl %edx, 52(%esp)\n"
 			/* Replace saved flags with true return address. */
-			"	movl %eax, 52(%esp)\n"
+			"	movl %eax, 56(%esp)\n"
 			"	popl %ebx\n"
 			"	popl %ecx\n"
 			"	popl %edx\n"
@@ -663,8 +666,8 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
 			"	popl %edi\n"
 			"	popl %ebp\n"
 			"	popl %eax\n"
-			/* Skip ip, orig_ax, es, ds, fs */
-			"	addl $20, %esp\n"
+			/* Skip ds, es, fs, gs, orig_ax and ip */
+			"	addl $24, %esp\n"
 			"	popf\n"
 #endif
 			"	ret\n");
@@ -688,6 +691,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
 	regs->cs = __KERNEL_CS;
 #else
 	regs->cs = __KERNEL_CS | get_kernel_rpl();
+	regs->gs = 0;
 #endif
 	regs->ip = trampoline_address;
 	regs->orig_ax = ~0UL;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 478bca986ec..33019ddb56b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
 	kvm_mmu_write(ptep, pte_val(pte));
 }
 
-static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
-				pte_t *ptep, pte_t pte)
-{
-	kvm_mmu_write(ptep, pte_val(pte));
-}
-
 static void kvm_pte_clear(struct mm_struct *mm,
 			  unsigned long addr, pte_t *ptep)
 {
@@ -220,7 +214,6 @@ static void paravirt_ops_setup(void)
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
 		pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
-		pv_mmu_ops.set_pte_present = kvm_set_pte_present;
 		pv_mmu_ops.pte_clear = kvm_pte_clear;
 		pv_mmu_ops.pmd_clear = kvm_pmd_clear;
 #endif
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index f5fc8c781a6..e7368c1da01 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
 #include <linux/ftrace.h>
 #include <linux/suspend.h>
 #include <linux/gfp.h>
+#include <linux/io.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/io.h>
 #include <asm/apic.h>
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
 		"\tmovl %%eax,%%fs\n"
 		"\tmovl %%eax,%%gs\n"
 		"\tmovl %%eax,%%ss\n"
-		::: "eax", "memory");
+		: : : "eax", "memory");
 #undef STR
 #undef __STR
 }
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
-		/* We need to put APICs in legacy mode so that we can
+		/*
+		 * We need to put APICs in legacy mode so that we can
 		 * get timer interrupts in second kernel. kexec/kdump
 		 * paths already have calls to disable_IO_APIC() in
 		 * one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
 		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
 						<< PAGE_SHIFT);
 
-	/* The segment registers are funny things, they have both a
+	/*
+	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
 	 * set to a specific selector, the invisible part is loaded
 	 * with from a table in memory.  At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
 	 * segments, before I zap the gdt with an invalid value.
 	 */
 	load_segments();
-	/* The gdt & idt are now invalid.
+	/*
+	 * The gdt & idt are now invalid.
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
-	set_gdt(phys_to_virt(0),0);
-	set_idt(phys_to_virt(0),0);
+	set_gdt(phys_to_virt(0), 0);
+	set_idt(phys_to_virt(0), 0);
 
 	/* now call it */
 	image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6993d51b7fd..89cea4d4467 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,11 +12,47 @@
 #include <linux/reboot.h>
 #include <linux/numa.h>
 #include <linux/ftrace.h>
+#include <linux/io.h>
+#include <linux/suspend.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/io.h>
+
+static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
+				unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	struct page *page;
+	int result = -ENOMEM;
+
+	addr &= PMD_MASK;
+	pgd += pgd_index(addr);
+	if (!pgd_present(*pgd)) {
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page)
+			goto out;
+		pud = (pud_t *)page_address(page);
+		memset(pud, 0, PAGE_SIZE);
+		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+	}
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud)) {
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page)
+			goto out;
+		pmd = (pmd_t *)page_address(page);
+		memset(pmd, 0, PAGE_SIZE);
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+	}
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+	result = 0;
+out:
+	return result;
+}
 
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
@@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
 		}
 		level3p = (pud_t *)page_address(page);
 		result = init_level3_page(image, level3p, addr, last_addr);
-		if (result) {
+		if (result)
 			goto out;
-		}
 		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
 		addr += PGDIR_SIZE;
 	}
@@ -156,6 +191,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 	result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
 	if (result)
 		return result;
+	/*
+	 * image->start may be outside 0 ~ max_pfn, for example when
+	 * jump back to original kernel from kexeced kernel
+	 */
+	result = init_one_level2_page(image, level4p, image->start);
+	if (result)
+		return result;
 	return init_transition_pgtable(image, level4p);
 }
 
@@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
+	int save_ftrace_enabled;
 
-	tracer_disable();
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context)
+		save_processor_state();
+#endif
+
+	save_ftrace_enabled = __ftrace_enabled_save();
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
+	if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+		/*
+		 * We need to put APICs in legacy mode so that we can
+		 * get timer interrupts in second kernel. kexec/kdump
+		 * paths already have calls to disable_IO_APIC() in
+		 * one form or other. kexec jump path also need
+		 * one.
+		 */
+		disable_IO_APIC();
+#endif
+	}
+
 	control_page = page_address(image->control_code_page) + PAGE_SIZE;
-	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
 
 	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
+	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
 	page_list[PA_TABLE_PAGE] =
 	  (unsigned long)__pa(page_address(image->control_code_page));
 
-	/* The segment registers are funny things, they have both a
+	if (image->type == KEXEC_TYPE_DEFAULT)
+		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+						<< PAGE_SHIFT);
+
+	/*
+	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
 	 * set to a specific selector, the invisible part is loaded
 	 * with from a table in memory.  At no other time is the
@@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image)
 	 * segments, before I zap the gdt with an invalid value.
 	 */
 	load_segments();
-	/* The gdt & idt are now invalid.
+	/*
+	 * The gdt & idt are now invalid.
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
-	set_gdt(phys_to_virt(0),0);
-	set_idt(phys_to_virt(0),0);
+	set_gdt(phys_to_virt(0), 0);
+	set_idt(phys_to_virt(0), 0);
 
 	/* now call it */
-	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-			image->start);
+	image->start = relocate_kernel((unsigned long)image->head,
+				       (unsigned long)page_list,
+				       image->start,
+				       image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context)
+		restore_processor_state();
+#endif
+
+	__ftrace_enabled_restore(save_ftrace_enabled);
 }
 
 void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 8815f3c7fec..846510b78a0 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -348,7 +348,6 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
 static struct irqaction mfgptirq  = {
 	.handler = mfgpt_tick,
 	.flags = IRQF_DISABLED | IRQF_NOBALANCING,
-	.mask = CPU_MASK_NONE,
 	.name = "mfgpt-timer"
 };
 
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c25fdb38229..453b5795a5c 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -12,31 +12,30 @@
  *
  *  Licensed under the terms of the GNU General Public
  *  License version 2. See file COPYING for details.
-*/
-
+ */
+#include <linux/platform_device.h>
 #include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
 #include <linux/miscdevice.h>
+#include <linux/firmware.h>
 #include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
+#include <linux/cpumask.h>
+#include <linux/pci_ids.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/cpu.h>
-#include <linux/firmware.h>
-#include <linux/platform_device.h>
 #include <linux/pci.h>
-#include <linux/pci_ids.h>
-#include <linux/uaccess.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
 
-#include <asm/msr.h>
-#include <asm/processor.h>
 #include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
 
 MODULE_DESCRIPTION("AMD Microcode Update Driver");
 MODULE_AUTHOR("Peter Oruba");
@@ -72,8 +71,8 @@ struct microcode_header_amd {
 } __attribute__((packed));
 
 struct microcode_amd {
-	struct microcode_header_amd hdr;
-	unsigned int mpb[0];
+	struct microcode_header_amd	hdr;
+	unsigned int			mpb[0];
 };
 
 #define UCODE_MAX_SIZE			2048
@@ -184,8 +183,8 @@ static int get_ucode_data(void *to, const u8 *from, size_t n)
 	return 0;
 }
 
-static void *get_next_ucode(const u8 *buf, unsigned int size,
-			    unsigned int *mc_size)
+static void *
+get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
 {
 	unsigned int total_size;
 	u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
@@ -223,7 +222,6 @@ static void *get_next_ucode(const u8 *buf, unsigned int size,
 	return mc;
 }
 
-
 static int install_equiv_cpu_table(const u8 *buf)
 {
 	u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
@@ -372,4 +370,3 @@ struct microcode_ops * __init init_amd_microcode(void)
 {
 	return &microcode_amd_ops;
 }
-
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index c9b721ba968..a0f3851ef31 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,67 +70,78 @@
  *		Fix sigmatch() macro to handle old CPUs with pf == 0.
  *		Thanks to Stuart Swales for pointing out this bug.
  */
+#include <linux/platform_device.h>
 #include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/miscdevice.h>
+#include <linux/firmware.h>
 #include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 #include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/uaccess.h>
 #include <linux/vmalloc.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/cpu.h>
-#include <linux/firmware.h>
-#include <linux/platform_device.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
 
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
 #include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
 
 MODULE_DESCRIPTION("Microcode Update Driver");
 MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
 MODULE_LICENSE("GPL");
 
-#define MICROCODE_VERSION 	"2.00"
+#define MICROCODE_VERSION	"2.00"
 
-static struct microcode_ops *microcode_ops;
+static struct microcode_ops	*microcode_ops;
 
 /* no concurrent ->write()s are allowed on /dev/cpu/microcode */
 static DEFINE_MUTEX(microcode_mutex);
 
-struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
+struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];
 EXPORT_SYMBOL_GPL(ucode_cpu_info);
 
 #ifdef CONFIG_MICROCODE_OLD_INTERFACE
+struct update_for_cpu {
+	const void __user	*buf;
+	size_t			size;
+};
+
+static long update_for_cpu(void *_ufc)
+{
+	struct update_for_cpu *ufc = _ufc;
+	int error;
+
+	error = microcode_ops->request_microcode_user(smp_processor_id(),
+						      ufc->buf, ufc->size);
+	if (error < 0)
+		return error;
+	if (!error)
+		microcode_ops->apply_microcode(smp_processor_id());
+	return error;
+}
+
 static int do_microcode_update(const void __user *buf, size_t size)
 {
-	cpumask_t old;
 	int error = 0;
 	int cpu;
-
-	old = current->cpus_allowed;
+	struct update_for_cpu ufc = { .buf = buf, .size = size };
 
 	for_each_online_cpu(cpu) {
 		struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
 		if (!uci->valid)
 			continue;
-
-		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-		error = microcode_ops->request_microcode_user(cpu, buf, size);
+		error = work_on_cpu(cpu, update_for_cpu, &ufc);
 		if (error < 0)
-			goto out;
-		if (!error)
-			microcode_ops->apply_microcode(cpu);
+			break;
 	}
-out:
-	set_cpus_allowed_ptr(current, &old);
 	return error;
 }
 
@@ -198,18 +209,33 @@ static void microcode_dev_exit(void)
 
 MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
 #else
-#define microcode_dev_init() 0
-#define microcode_dev_exit() do { } while (0)
+#define microcode_dev_init()	0
+#define microcode_dev_exit()	do { } while (0)
 #endif
 
 /* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
+static struct platform_device	*microcode_pdev;
+
+static long reload_for_cpu(void *unused)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
+	int err = 0;
+
+	mutex_lock(&microcode_mutex);
+	if (uci->valid) {
+		err = microcode_ops->request_microcode_fw(smp_processor_id(),
+							  &microcode_pdev->dev);
+		if (!err)
+			microcode_ops->apply_microcode(smp_processor_id());
+	}
+	mutex_unlock(&microcode_mutex);
+	return err;
+}
 
 static ssize_t reload_store(struct sys_device *dev,
 			    struct sysdev_attribute *attr,
 			    const char *buf, size_t sz)
 {
-	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 	char *end;
 	unsigned long val = simple_strtoul(buf, &end, 0);
 	int err = 0;
@@ -218,21 +244,9 @@ static ssize_t reload_store(struct sys_device *dev,
 	if (end == buf)
 		return -EINVAL;
 	if (val == 1) {
-		cpumask_t old = current->cpus_allowed;
-
 		get_online_cpus();
-		if (cpu_online(cpu)) {
-			set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-			mutex_lock(&microcode_mutex);
-			if (uci->valid) {
-				err = microcode_ops->request_microcode_fw(cpu,
-						&microcode_pdev->dev);
-				if (!err)
-					microcode_ops->apply_microcode(cpu);
-			}
-			mutex_unlock(&microcode_mutex);
-			set_cpus_allowed_ptr(current, &old);
-		}
+		if (cpu_online(cpu))
+			err = work_on_cpu(cpu, reload_for_cpu, NULL);
 		put_online_cpus();
 	}
 	if (err)
@@ -268,8 +282,8 @@ static struct attribute *mc_default_attrs[] = {
 };
 
 static struct attribute_group mc_attr_group = {
-	.attrs = mc_default_attrs,
-	.name = "microcode",
+	.attrs		= mc_default_attrs,
+	.name		= "microcode",
 };
 
 static void __microcode_fini_cpu(int cpu)
@@ -328,9 +342,9 @@ static int microcode_resume_cpu(int cpu)
 	return 0;
 }
 
-static void microcode_update_cpu(int cpu)
+static long microcode_update_cpu(void *unused)
 {
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
 	int err = 0;
 
 	/*
@@ -338,30 +352,27 @@ static void microcode_update_cpu(int cpu)
 	 * otherwise just request a firmware:
 	 */
 	if (uci->valid) {
-		err = microcode_resume_cpu(cpu);
-	} else {	
-		collect_cpu_info(cpu);
+		err = microcode_resume_cpu(smp_processor_id());
+	} else {
+		collect_cpu_info(smp_processor_id());
 		if (uci->valid && system_state == SYSTEM_RUNNING)
-			err = microcode_ops->request_microcode_fw(cpu,
+			err = microcode_ops->request_microcode_fw(
+					smp_processor_id(),
 					&microcode_pdev->dev);
 	}
 	if (!err)
-		microcode_ops->apply_microcode(cpu);
+		microcode_ops->apply_microcode(smp_processor_id());
+	return err;
 }
 
-static void microcode_init_cpu(int cpu)
+static int microcode_init_cpu(int cpu)
 {
-	cpumask_t old = current->cpus_allowed;
-
-	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-	/* We should bind the task to the CPU */
-	BUG_ON(raw_smp_processor_id() != cpu);
-
+	int err;
 	mutex_lock(&microcode_mutex);
-	microcode_update_cpu(cpu);
+	err = work_on_cpu(cpu, microcode_update_cpu, NULL);
 	mutex_unlock(&microcode_mutex);
 
-	set_cpus_allowed_ptr(current, &old);
+	return err;
 }
 
 static int mc_sysdev_add(struct sys_device *sys_dev)
@@ -379,8 +390,11 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
 	if (err)
 		return err;
 
-	microcode_init_cpu(cpu);
-	return 0;
+	err = microcode_init_cpu(cpu);
+	if (err)
+		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+
+	return err;
 }
 
 static int mc_sysdev_remove(struct sys_device *sys_dev)
@@ -404,14 +418,14 @@ static int mc_sysdev_resume(struct sys_device *dev)
 		return 0;
 
 	/* only CPU 0 will apply ucode here */
-	microcode_update_cpu(0);
+	microcode_update_cpu(NULL);
 	return 0;
 }
 
 static struct sysdev_driver mc_sysdev_driver = {
-	.add = mc_sysdev_add,
-	.remove = mc_sysdev_remove,
-	.resume = mc_sysdev_resume,
+	.add		= mc_sysdev_add,
+	.remove		= mc_sysdev_remove,
+	.resume		= mc_sysdev_resume,
 };
 
 static __cpuinit int
@@ -424,7 +438,9 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		microcode_init_cpu(cpu);
+		if (microcode_init_cpu(cpu))
+			printk(KERN_ERR "microcode: failed to init CPU%d\n",
+			       cpu);
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		pr_debug("microcode: CPU%d added\n", cpu);
@@ -448,7 +464,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 }
 
 static struct notifier_block __refdata mc_cpu_notifier = {
-	.notifier_call = mc_cpu_callback,
+	.notifier_call	= mc_cpu_callback,
 };
 
 static int __init microcode_init(void)
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 5e9f4fc5138..149b9ec7c1a 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,28 +70,28 @@
  *		Fix sigmatch() macro to handle old CPUs with pf == 0.
  *		Thanks to Stuart Swales for pointing out this bug.
  */
+#include <linux/platform_device.h>
 #include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/miscdevice.h>
+#include <linux/firmware.h>
 #include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 #include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/uaccess.h>
 #include <linux/vmalloc.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/cpu.h>
-#include <linux/firmware.h>
-#include <linux/platform_device.h>
-#include <linux/uaccess.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
 
-#include <asm/msr.h>
-#include <asm/processor.h>
 #include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
 
 MODULE_DESCRIPTION("Microcode Update Driver");
 MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -129,12 +129,13 @@ struct extended_sigtable {
 	struct extended_signature sigs[0];
 };
 
-#define DEFAULT_UCODE_DATASIZE 	(2000)
+#define DEFAULT_UCODE_DATASIZE	(2000)
 #define MC_HEADER_SIZE		(sizeof(struct microcode_header_intel))
 #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
 #define EXT_HEADER_SIZE		(sizeof(struct extended_sigtable))
 #define EXT_SIGNATURE_SIZE	(sizeof(struct extended_signature))
 #define DWSIZE			(sizeof(u32))
+
 #define get_totalsize(mc) \
 	(((struct microcode_intel *)mc)->hdr.totalsize ? \
 	 ((struct microcode_intel *)mc)->hdr.totalsize : \
@@ -197,30 +198,31 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
 }
 
 static inline int
-update_match_revision(struct microcode_header_intel *mc_header,	int rev)
+update_match_revision(struct microcode_header_intel *mc_header, int rev)
 {
 	return (mc_header->rev <= rev) ? 0 : 1;
 }
 
 static int microcode_sanity_check(void *mc)
 {
+	unsigned long total_size, data_size, ext_table_size;
 	struct microcode_header_intel *mc_header = mc;
 	struct extended_sigtable *ext_header = NULL;
-	struct extended_signature *ext_sig;
-	unsigned long total_size, data_size, ext_table_size;
 	int sum, orig_sum, ext_sigcount = 0, i;
+	struct extended_signature *ext_sig;
 
 	total_size = get_totalsize(mc_header);
 	data_size = get_datasize(mc_header);
+
 	if (data_size + MC_HEADER_SIZE > total_size) {
 		printk(KERN_ERR "microcode: error! "
-			"Bad data size in microcode data file\n");
+				"Bad data size in microcode data file\n");
 		return -EINVAL;
 	}
 
 	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
 		printk(KERN_ERR "microcode: error! "
-			"Unknown microcode update format\n");
+				"Unknown microcode update format\n");
 		return -EINVAL;
 	}
 	ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
@@ -318,11 +320,15 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
 
 static void apply_microcode(int cpu)
 {
+	struct microcode_intel *mc_intel;
+	struct ucode_cpu_info *uci;
 	unsigned long flags;
 	unsigned int val[2];
-	int cpu_num = raw_smp_processor_id();
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	struct microcode_intel *mc_intel = uci->mc;
+	int cpu_num;
+
+	cpu_num = raw_smp_processor_id();
+	uci = ucode_cpu_info + cpu;
+	mc_intel = uci->mc;
 
 	/* We should bind the task to the CPU */
 	BUG_ON(cpu_num != cpu);
@@ -348,15 +354,17 @@ static void apply_microcode(int cpu)
 	spin_unlock_irqrestore(&microcode_update_lock, flags);
 	if (val[1] != mc_intel->hdr.rev) {
 		printk(KERN_ERR "microcode: CPU%d update from revision "
-			"0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
+				"0x%x to 0x%x failed\n",
+			cpu_num, uci->cpu_sig.rev, val[1]);
 		return;
 	}
 	printk(KERN_INFO "microcode: CPU%d updated from revision "
-	       "0x%x to 0x%x, date = %04x-%02x-%02x \n",
+			 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
 		cpu_num, uci->cpu_sig.rev, val[1],
 		mc_intel->hdr.date & 0xffff,
 		mc_intel->hdr.date >> 24,
 		(mc_intel->hdr.date >> 16) & 0xff);
+
 	uci->cpu_sig.rev = val[1];
 }
 
@@ -404,18 +412,23 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
 		leftover  -= mc_size;
 	}
 
-	if (new_mc) {
-		if (!leftover) {
-			if (uci->mc)
-				vfree(uci->mc);
-			uci->mc = (struct microcode_intel *)new_mc;
-			pr_debug("microcode: CPU%d found a matching microcode update with"
-				 " version 0x%x (current=0x%x)\n",
-				cpu, new_rev, uci->cpu_sig.rev);
-		} else
-			vfree(new_mc);
+	if (!new_mc)
+		goto out;
+
+	if (leftover) {
+		vfree(new_mc);
+		goto out;
 	}
 
+	if (uci->mc)
+		vfree(uci->mc);
+	uci->mc = (struct microcode_intel *)new_mc;
+
+	pr_debug("microcode: CPU%d found a matching microcode update with"
+		 " version 0x%x (current=0x%x)\n",
+			cpu, new_rev, uci->cpu_sig.rev);
+
+ out:
 	return (int)leftover;
 }
 
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 666e43df51f..712d15fdc41 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
         return 0;
 }
 
-static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
+static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
         {
                 .callback = set_check_enable_amd_mmconf,
                 .ident = "Sun Microsystems Machine",
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bda1ba..dce99dca6cf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -109,9 +109,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
 	} else
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
-#endif
-
-#ifdef CONFIG_X86_IO_APIC
 
 static int bad_ioapic(unsigned long address)
 {
@@ -224,8 +221,12 @@ static void __init MP_intsrc_info(struct mpc_intsrc *m)
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
+#else /* CONFIG_X86_IO_APIC */
+static inline void __init MP_bus_info(struct mpc_bus *m) {}
+static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
+static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
+#endif /* CONFIG_X86_IO_APIC */
 
-#endif
 
 static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
 {
@@ -275,6 +276,20 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
 	return 1;
 }
 
+static void skip_entry(unsigned char **ptr, int *count, int size)
+{
+	*ptr += size;
+	*count += size;
+}
+
+static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
+{
+	printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
+		"type %x\n", *mpt);
+	print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
+			1, mpc, mpc->length, 1);
+}
+
 static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 {
 	char str[16];
@@ -310,61 +325,30 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 	while (count < mpc->length) {
 		switch (*mpt) {
 		case MP_PROCESSOR:
-			{
-				struct mpc_cpu *m = (struct mpc_cpu *)mpt;
-				/* ACPI may have already provided this data */
-				if (!acpi_lapic)
-					MP_processor_info(m);
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			/* ACPI may have already provided this data */
+			if (!acpi_lapic)
+				MP_processor_info((struct mpc_cpu *)mpt);
+			skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+			break;
 		case MP_BUS:
-			{
-				struct mpc_bus *m = (struct mpc_bus *)mpt;
-#ifdef CONFIG_X86_IO_APIC
-				MP_bus_info(m);
-#endif
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			MP_bus_info((struct mpc_bus *)mpt);
+			skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+			break;
 		case MP_IOAPIC:
-			{
-#ifdef CONFIG_X86_IO_APIC
-				struct mpc_ioapic *m = (struct mpc_ioapic *)mpt;
-				MP_ioapic_info(m);
-#endif
-				mpt += sizeof(struct mpc_ioapic);
-				count += sizeof(struct mpc_ioapic);
-				break;
-			}
+			MP_ioapic_info((struct mpc_ioapic *)mpt);
+			skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+			break;
 		case MP_INTSRC:
-			{
-#ifdef CONFIG_X86_IO_APIC
-				struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
-				MP_intsrc_info(m);
-#endif
-				mpt += sizeof(struct mpc_intsrc);
-				count += sizeof(struct mpc_intsrc);
-				break;
-			}
+			MP_intsrc_info((struct mpc_intsrc *)mpt);
+			skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+			break;
 		case MP_LINTSRC:
-			{
-				struct mpc_lintsrc *m =
-				    (struct mpc_lintsrc *)mpt;
-				MP_lintsrc_info(m);
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			MP_lintsrc_info((struct mpc_lintsrc *)mpt);
+			skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+			break;
 		default:
 			/* wrong mptable */
-			printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
-			printk(KERN_ERR "type %x\n", *mpt);
-			print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
-					1, mpc, mpc->length, 1);
+			smp_dump_mptable(mpc, mpt);
 			count = mpc->length;
 			break;
 		}
@@ -558,6 +542,68 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 
 static struct mpf_intel *mpf_found;
 
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+	struct mpc_table *mpc;
+	unsigned long size;
+
+	mpc = early_ioremap(physptr, PAGE_SIZE);
+	size = mpc->length;
+	early_iounmap(mpc, PAGE_SIZE);
+	apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + size);
+
+	return size;
+}
+
+static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
+{
+	struct mpc_table *mpc;
+	unsigned long size;
+
+	size = get_mpc_size(mpf->physptr);
+	mpc = early_ioremap(mpf->physptr, size);
+	/*
+	 * Read the physical hardware table.  Anything here will
+	 * override the defaults.
+	 */
+	if (!smp_read_mpc(mpc, early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
+		smp_found_config = 0;
+#endif
+		printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
+			"... disabling SMP support. (tell your hw vendor)\n");
+		early_iounmap(mpc, size);
+		return -1;
+	}
+	early_iounmap(mpc, size);
+
+	if (early)
+		return -1;
+
+#ifdef CONFIG_X86_IO_APIC
+	/*
+	 * If there are no explicit MP IRQ entries, then we are
+	 * broken.  We set up most of the low 16 IO-APIC pins to
+	 * ISA defaults and hope it will work.
+	 */
+	if (!mp_irq_entries) {
+		struct mpc_bus bus;
+
+		printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+		       "using default mptable. (tell your hw vendor)\n");
+
+		bus.type = MP_BUS;
+		bus.busid = 0;
+		memcpy(bus.bustype, "ISA   ", 6);
+		MP_bus_info(&bus);
+
+		construct_default_ioirq_mptable(0);
+	}
+#endif
+
+	return 0;
+}
+
 /*
  * Scan the memory blocks for an SMP configuration block.
  */
@@ -611,45 +657,8 @@ static void __init __get_smp_config(unsigned int early)
 		construct_default_ISA_mptable(mpf->feature1);
 
 	} else if (mpf->physptr) {
-
-		/*
-		 * Read the physical hardware table.  Anything here will
-		 * override the defaults.
-		 */
-		if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
-#ifdef CONFIG_X86_LOCAL_APIC
-			smp_found_config = 0;
-#endif
-			printk(KERN_ERR
-			       "BIOS bug, MP table errors detected!...\n");
-			printk(KERN_ERR "... disabling SMP support. "
-			       "(tell your hw vendor)\n");
-			return;
-		}
-
-		if (early)
+		if (check_physptr(mpf, early))
 			return;
-#ifdef CONFIG_X86_IO_APIC
-		/*
-		 * If there are no explicit MP IRQ entries, then we are
-		 * broken.  We set up most of the low 16 IO-APIC pins to
-		 * ISA defaults and hope it will work.
-		 */
-		if (!mp_irq_entries) {
-			struct mpc_bus bus;
-
-			printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
-			       "using default mptable. "
-			       "(tell your hw vendor)\n");
-
-			bus.type = MP_BUS;
-			bus.busid = 0;
-			memcpy(bus.bustype, "ISA   ", 6);
-			MP_bus_info(&bus);
-
-			construct_default_ioirq_mptable(0);
-		}
-#endif
 	} else
 		BUG();
 
@@ -670,6 +679,31 @@ void __init get_smp_config(void)
 	__get_smp_config(0);
 }
 
+static void smp_reserve_bootmem(struct mpf_intel *mpf)
+{
+	unsigned long size = get_mpc_size(mpf->physptr);
+#ifdef CONFIG_X86_32
+	/*
+	 * We cannot access to MPC table to compute table size yet,
+	 * as only few megabytes from the bottom is mapped now.
+	 * PC-9800's MPC table places on the very last of physical
+	 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
+	 * yields BUG() in reserve_bootmem.
+	 * also need to make sure physptr is below than max_low_pfn
+	 * we don't need reserve the area above max_low_pfn
+	 */
+	unsigned long end = max_low_pfn * PAGE_SIZE;
+
+	if (mpf->physptr < end) {
+		if (mpf->physptr + size > end)
+			size = end - mpf->physptr;
+		reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+	}
+#else
+	reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+#endif
+}
+
 static int __init smp_scan_config(unsigned long base, unsigned long length,
 				  unsigned reserve)
 {
@@ -697,36 +731,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 
 			if (!reserve)
 				return 1;
-			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
-					BOOTMEM_DEFAULT);
-			if (mpf->physptr) {
-				unsigned long size = PAGE_SIZE;
-#ifdef CONFIG_X86_32
-				/*
-				 * We cannot access to MPC table to compute
-				 * table size yet, as only few megabytes from
-				 * the bottom is mapped now.
-				 * PC-9800's MPC table places on the very last
-				 * of physical memory; so that simply reserving
-				 * PAGE_SIZE from mpf->physptr yields BUG()
-				 * in reserve_bootmem.
-				 * also need to make sure physptr is below than
-				 * max_low_pfn
-				 * we don't need reserve the area above max_low_pfn
-				 */
-				unsigned long end = max_low_pfn * PAGE_SIZE;
-
-				if (mpf->physptr < end) {
-					if (mpf->physptr + size > end)
-						size = end - mpf->physptr;
-					reserve_bootmem_generic(mpf->physptr, size,
-							BOOTMEM_DEFAULT);
-				}
-#else
-				reserve_bootmem_generic(mpf->physptr, size,
+			reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
 						BOOTMEM_DEFAULT);
-#endif
-			}
+			if (mpf->physptr)
+				smp_reserve_bootmem(mpf);
 
 			return 1;
 		}
@@ -829,7 +837,57 @@ static int  __init get_MP_intsrc_index(struct mpc_intsrc *m)
 #define SPARE_SLOT_NUM 20
 
 static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
-#endif
+
+static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+{
+	int i;
+
+	apic_printk(APIC_VERBOSE, "OLD ");
+	print_MP_intsrc_info(m);
+
+	i = get_MP_intsrc_index(m);
+	if (i > 0) {
+		assign_to_mpc_intsrc(&mp_irqs[i], m);
+		apic_printk(APIC_VERBOSE, "NEW ");
+		print_mp_irq_info(&mp_irqs[i]);
+		return;
+	}
+	if (!i) {
+		/* legacy, do nothing */
+		return;
+	}
+	if (*nr_m_spare < SPARE_SLOT_NUM) {
+		/*
+		 * not found (-1), or duplicated (-2) are invalid entries,
+		 * we need to use the slot later
+		 */
+		m_spare[*nr_m_spare] = m;
+		*nr_m_spare += 1;
+	}
+}
+#else /* CONFIG_X86_IO_APIC */
+static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
+		      int count)
+{
+	if (!mpc_new_phys) {
+		pr_info("No spare slots, try to append...take your risk, "
+			"new mpc_length %x\n", count);
+	} else {
+		if (count <= mpc_new_length)
+			pr_info("No spare slots, try to append..., "
+				"new mpc_length %x\n", count);
+		else {
+			pr_err("mpc_new_length %lx is too small\n",
+				mpc_new_length);
+			return -1;
+		}
+	}
+
+	return 0;
+}
 
 static int  __init replace_intsrc_all(struct mpc_table *mpc,
 					unsigned long mpc_new_phys,
@@ -837,77 +895,33 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 {
 #ifdef CONFIG_X86_IO_APIC
 	int i;
-	int nr_m_spare = 0;
 #endif
-
 	int count = sizeof(*mpc);
+	int nr_m_spare = 0;
 	unsigned char *mpt = ((unsigned char *)mpc) + count;
 
 	printk(KERN_INFO "mpc_length %x\n", mpc->length);
 	while (count < mpc->length) {
 		switch (*mpt) {
 		case MP_PROCESSOR:
-			{
-				struct mpc_cpu *m = (struct mpc_cpu *)mpt;
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+			break;
 		case MP_BUS:
-			{
-				struct mpc_bus *m = (struct mpc_bus *)mpt;
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+			break;
 		case MP_IOAPIC:
-			{
-				mpt += sizeof(struct mpc_ioapic);
-				count += sizeof(struct mpc_ioapic);
-				break;
-			}
+			skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+			break;
 		case MP_INTSRC:
-			{
-#ifdef CONFIG_X86_IO_APIC
-				struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
-				printk(KERN_INFO "OLD ");
-				print_MP_intsrc_info(m);
-				i = get_MP_intsrc_index(m);
-				if (i > 0) {
-					assign_to_mpc_intsrc(&mp_irqs[i], m);
-					printk(KERN_INFO "NEW ");
-					print_mp_irq_info(&mp_irqs[i]);
-				} else if (!i) {
-					/* legacy, do nothing */
-				} else if (nr_m_spare < SPARE_SLOT_NUM) {
-					/*
-					 * not found (-1), or duplicated (-2)
-					 * are invalid entries,
-					 * we need to use the slot  later
-					 */
-					m_spare[nr_m_spare] = m;
-					nr_m_spare++;
-				}
-#endif
-				mpt += sizeof(struct mpc_intsrc);
-				count += sizeof(struct mpc_intsrc);
-				break;
-			}
+			check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
+			skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+			break;
 		case MP_LINTSRC:
-			{
-				struct mpc_lintsrc *m =
-				    (struct mpc_lintsrc *)mpt;
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+			break;
 		default:
 			/* wrong mptable */
-			printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
-			printk(KERN_ERR "type %x\n", *mpt);
-			print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
-					1, mpc, mpc->length, 1);
+			smp_dump_mptable(mpc, mpt);
 			goto out;
 		}
 	}
@@ -924,23 +938,15 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 			continue;
 
 		if (nr_m_spare > 0) {
-			printk(KERN_INFO "*NEW* found ");
+			apic_printk(APIC_VERBOSE, "*NEW* found\n");
 			nr_m_spare--;
 			assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
 			m_spare[nr_m_spare] = NULL;
 		} else {
 			struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
 			count += sizeof(struct mpc_intsrc);
-			if (!mpc_new_phys) {
-				printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
-			} else {
-				if (count <= mpc_new_length)
-					printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
-				else {
-					printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
-					goto out;
-				}
-			}
+			if (!check_slot(mpc_new_phys, mpc_new_length, count))
+				goto out;
 			assign_to_mpc_intsrc(&mp_irqs[i], m);
 			mpc->length = count;
 			mpt += sizeof(struct mpc_intsrc);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 63dd358d8ee..8e45f446488 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -470,7 +470,6 @@ struct pv_mmu_ops pv_mmu_ops = {
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
 	.set_pte_atomic = native_set_pte_atomic,
-	.set_pte_present = native_set_pte_present,
 	.pte_clear = native_pte_clear,
 	.pmd_clear = native_pmd_clear,
 #endif
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index d28bbdc35e4..755c21e906f 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -380,8 +380,9 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
 	return tbl;
 }
 
-static void calgary_unmap_sg(struct device *dev,
-	struct scatterlist *sglist, int nelems, int direction)
+static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
+			     int nelems,enum dma_data_direction dir,
+			     struct dma_attrs *attrs)
 {
 	struct iommu_table *tbl = find_iommu_table(dev);
 	struct scatterlist *s;
@@ -404,7 +405,8 @@ static void calgary_unmap_sg(struct device *dev,
 }
 
 static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
-	int nelems, int direction)
+			  int nelems, enum dma_data_direction dir,
+			  struct dma_attrs *attrs)
 {
 	struct iommu_table *tbl = find_iommu_table(dev);
 	struct scatterlist *s;
@@ -429,15 +431,14 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
 		s->dma_address = (entry << PAGE_SHIFT) | s->offset;
 
 		/* insert into HW table */
-		tce_build(tbl, entry, npages, vaddr & PAGE_MASK,
-			  direction);
+		tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir);
 
 		s->dma_length = s->length;
 	}
 
 	return nelems;
 error:
-	calgary_unmap_sg(dev, sg, nelems, direction);
+	calgary_unmap_sg(dev, sg, nelems, dir, NULL);
 	for_each_sg(sg, s, nelems, i) {
 		sg->dma_address = bad_dma_address;
 		sg->dma_length = 0;
@@ -445,10 +446,12 @@ error:
 	return 0;
 }
 
-static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
-	size_t size, int direction)
+static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
+				   unsigned long offset, size_t size,
+				   enum dma_data_direction dir,
+				   struct dma_attrs *attrs)
 {
-	void *vaddr = phys_to_virt(paddr);
+	void *vaddr = page_address(page) + offset;
 	unsigned long uaddr;
 	unsigned int npages;
 	struct iommu_table *tbl = find_iommu_table(dev);
@@ -456,17 +459,18 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
 	uaddr = (unsigned long)vaddr;
 	npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
 
-	return iommu_alloc(dev, tbl, vaddr, npages, direction);
+	return iommu_alloc(dev, tbl, vaddr, npages, dir);
 }
 
-static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
-	size_t size, int direction)
+static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
+			       size_t size, enum dma_data_direction dir,
+			       struct dma_attrs *attrs)
 {
 	struct iommu_table *tbl = find_iommu_table(dev);
 	unsigned int npages;
 
-	npages = iommu_num_pages(dma_handle, size, PAGE_SIZE);
-	iommu_free(tbl, dma_handle, npages);
+	npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
+	iommu_free(tbl, dma_addr, npages);
 }
 
 static void* calgary_alloc_coherent(struct device *dev, size_t size,
@@ -515,13 +519,13 @@ static void calgary_free_coherent(struct device *dev, size_t size,
 	free_pages((unsigned long)vaddr, get_order(size));
 }
 
-static struct dma_mapping_ops calgary_dma_ops = {
+static struct dma_map_ops calgary_dma_ops = {
 	.alloc_coherent = calgary_alloc_coherent,
 	.free_coherent = calgary_free_coherent,
-	.map_single = calgary_map_single,
-	.unmap_single = calgary_unmap_single,
 	.map_sg = calgary_map_sg,
 	.unmap_sg = calgary_unmap_sg,
+	.map_page = calgary_map_page,
+	.unmap_page = calgary_unmap_page,
 };
 
 static inline void __iomem * busno_to_bbar(unsigned char num)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index b2542853314..90f5b9ef5de 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -1,4 +1,5 @@
 #include <linux/dma-mapping.h>
+#include <linux/dma-debug.h>
 #include <linux/dmar.h>
 #include <linux/bootmem.h>
 #include <linux/pci.h>
@@ -12,7 +13,7 @@
 
 static int forbid_dac __read_mostly;
 
-struct dma_mapping_ops *dma_ops;
+struct dma_map_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static int iommu_sac_force __read_mostly;
@@ -44,6 +45,9 @@ struct device x86_dma_fallback_dev = {
 };
 EXPORT_SYMBOL(x86_dma_fallback_dev);
 
+/* Number of entries preallocated for DMA-API debugging */
+#define PREALLOC_DMA_DEBUG_ENTRIES       32768
+
 int dma_set_mask(struct device *dev, u64 mask)
 {
 	if (!dev->dma_mask || !dma_supported(dev, mask))
@@ -224,7 +228,7 @@ early_param("iommu", iommu_setup);
 
 int dma_supported(struct device *dev, u64 mask)
 {
-	struct dma_mapping_ops *ops = get_dma_ops(dev);
+	struct dma_map_ops *ops = get_dma_ops(dev);
 
 #ifdef CONFIG_PCI
 	if (mask > 0xffffffff && forbid_dac > 0) {
@@ -265,6 +269,12 @@ EXPORT_SYMBOL(dma_supported);
 
 static int __init pci_iommu_init(void)
 {
+	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+
+#ifdef CONFIG_PCI
+	dma_debug_add_bus(&pci_bus_type);
+#endif
+
 	calgary_iommu_init();
 
 	intel_iommu_init();
@@ -290,8 +300,7 @@ fs_initcall(pci_iommu_init);
 static __devinit void via_no_dac(struct pci_dev *dev)
 {
 	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
-		printk(KERN_INFO
-			"PCI: VIA PCI bridge detected. Disabling DAC.\n");
+		dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
 		forbid_dac = 1;
 	}
 }
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index d5768b1af08..b284b58c035 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -255,10 +255,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
 }
 
 /* Map a single area into the IOMMU */
-static dma_addr_t
-gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
+static dma_addr_t gart_map_page(struct device *dev, struct page *page,
+				unsigned long offset, size_t size,
+				enum dma_data_direction dir,
+				struct dma_attrs *attrs)
 {
 	unsigned long bus;
+	phys_addr_t paddr = page_to_phys(page) + offset;
 
 	if (!dev)
 		dev = &x86_dma_fallback_dev;
@@ -275,8 +278,9 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
 /*
  * Free a DMA mapping.
  */
-static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
-			      size_t size, int direction)
+static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
+			    size_t size, enum dma_data_direction dir,
+			    struct dma_attrs *attrs)
 {
 	unsigned long iommu_page;
 	int npages;
@@ -298,8 +302,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
 /*
  * Wrapper for pci_unmap_single working with scatterlists.
  */
-static void
-gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+			  enum dma_data_direction dir, struct dma_attrs *attrs)
 {
 	struct scatterlist *s;
 	int i;
@@ -307,7 +311,7 @@ gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 	for_each_sg(sg, s, nents, i) {
 		if (!s->dma_length || !s->length)
 			break;
-		gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
+		gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL);
 	}
 }
 
@@ -329,7 +333,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
 			addr = dma_map_area(dev, addr, s->length, dir, 0);
 			if (addr == bad_dma_address) {
 				if (i > 0)
-					gart_unmap_sg(dev, sg, i, dir);
+					gart_unmap_sg(dev, sg, i, dir, NULL);
 				nents = 0;
 				sg[0].dma_length = 0;
 				break;
@@ -400,8 +404,8 @@ dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
  * DMA map all entries in a scatterlist.
  * Merge chunks that have page aligned sizes into a continuous mapping.
  */
-static int
-gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+		       enum dma_data_direction dir, struct dma_attrs *attrs)
 {
 	struct scatterlist *s, *ps, *start_sg, *sgmap;
 	int need = 0, nextneed, i, out, start;
@@ -468,7 +472,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 
 error:
 	flush_gart();
-	gart_unmap_sg(dev, sg, out, dir);
+	gart_unmap_sg(dev, sg, out, dir, NULL);
 
 	/* When it was forced or merged try again in a dumb way */
 	if (force_iommu || iommu_merge) {
@@ -521,7 +525,7 @@ static void
 gart_free_coherent(struct device *dev, size_t size, void *vaddr,
 		   dma_addr_t dma_addr)
 {
-	gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
+	gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
 	free_pages((unsigned long)vaddr, get_order(size));
 }
 
@@ -707,11 +711,11 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	return -1;
 }
 
-static struct dma_mapping_ops gart_dma_ops = {
-	.map_single			= gart_map_single,
-	.unmap_single			= gart_unmap_single,
+static struct dma_map_ops gart_dma_ops = {
 	.map_sg				= gart_map_sg,
 	.unmap_sg			= gart_unmap_sg,
+	.map_page			= gart_map_page,
+	.unmap_page			= gart_unmap_page,
 	.alloc_coherent			= gart_alloc_coherent,
 	.free_coherent			= gart_free_coherent,
 };
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index c70ab5a5d4c..c6d703b3932 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -1,14 +1,14 @@
 /* Fallback functions when the main IOMMU code is not compiled in. This
    code is roughly equivalent to i386. */
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/string.h>
 #include <linux/dma-mapping.h>
 #include <linux/scatterlist.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/mm.h>
 
-#include <asm/iommu.h>
 #include <asm/processor.h>
+#include <asm/iommu.h>
 #include <asm/dma.h>
 
 static int
@@ -25,19 +25,19 @@ check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
 	return 1;
 }
 
-static dma_addr_t
-nommu_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
-	       int direction)
+static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
+				 unsigned long offset, size_t size,
+				 enum dma_data_direction dir,
+				 struct dma_attrs *attrs)
 {
-	dma_addr_t bus = paddr;
+	dma_addr_t bus = page_to_phys(page) + offset;
 	WARN_ON(size == 0);
-	if (!check_addr("map_single", hwdev, bus, size))
-				return bad_dma_address;
+	if (!check_addr("map_single", dev, bus, size))
+		return bad_dma_address;
 	flush_write_buffers();
 	return bus;
 }
 
-
 /* Map a set of buffers described by scatterlist in streaming
  * mode for DMA.  This is the scatter-gather version of the
  * above pci_map_single interface.  Here the scatter gather list
@@ -54,7 +54,8 @@ nommu_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
  * the same here.
  */
 static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
-	       int nents, int direction)
+			int nents, enum dma_data_direction dir,
+			struct dma_attrs *attrs)
 {
 	struct scatterlist *s;
 	int i;
@@ -78,12 +79,12 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
 	free_pages((unsigned long)vaddr, get_order(size));
 }
 
-struct dma_mapping_ops nommu_dma_ops = {
-	.alloc_coherent = dma_generic_alloc_coherent,
-	.free_coherent = nommu_free_coherent,
-	.map_single = nommu_map_single,
-	.map_sg = nommu_map_sg,
-	.is_phys = 1,
+struct dma_map_ops nommu_dma_ops = {
+	.alloc_coherent	= dma_generic_alloc_coherent,
+	.free_coherent	= nommu_free_coherent,
+	.map_sg		= nommu_map_sg,
+	.map_page	= nommu_map_page,
+	.is_phys	= 1,
 };
 
 void __init no_iommu_init(void)
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb.c
index d59c9174766..34f12e9996e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -33,18 +33,11 @@ phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
 	return baddr;
 }
 
-int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
+int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
 {
 	return 0;
 }
 
-static dma_addr_t
-swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
-			int direction)
-{
-	return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
-}
-
 static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 					dma_addr_t *dma_handle, gfp_t flags)
 {
@@ -57,20 +50,20 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
 }
 
-struct dma_mapping_ops swiotlb_dma_ops = {
+struct dma_map_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc_coherent = x86_swiotlb_alloc_coherent,
 	.free_coherent = swiotlb_free_coherent,
-	.map_single = swiotlb_map_single_phys,
-	.unmap_single = swiotlb_unmap_single,
 	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
 	.sync_single_for_device = swiotlb_sync_single_for_device,
 	.sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
 	.sync_single_range_for_device = swiotlb_sync_single_range_for_device,
 	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
 	.sync_sg_for_device = swiotlb_sync_sg_for_device,
-	.map_sg = swiotlb_map_sg,
-	.unmap_sg = swiotlb_unmap_sg,
+	.map_sg = swiotlb_map_sg_attrs,
+	.unmap_sg = swiotlb_unmap_sg_attrs,
+	.map_page = swiotlb_map_page,
+	.unmap_page = swiotlb_unmap_page,
 	.dma_supported = NULL,
 };
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 87b69d4fac1..ca989158e84 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,16 +1,19 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <asm/idle.h>
 #include <linux/smp.h>
+#include <linux/prctl.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
-#include <linux/ftrace.h>
+#include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -19,6 +22,9 @@ EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
 
+DEFINE_TRACE(power_start);
+DEFINE_TRACE(power_end);
+
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	*dst = *src;
@@ -56,6 +62,193 @@ void arch_task_cache_init(void)
 }
 
 /*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+	struct task_struct *me = current;
+	struct thread_struct *t = &me->thread;
+	unsigned long *bp = t->io_bitmap_ptr;
+
+	if (bp) {
+		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+
+		t->io_bitmap_ptr = NULL;
+		clear_thread_flag(TIF_IO_BITMAP);
+		/*
+		 * Careful, clear this in the TSS too:
+		 */
+		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
+		t->io_bitmap_max = 0;
+		put_cpu();
+		kfree(bp);
+	}
+
+	ds_exit_thread(current);
+}
+
+void flush_thread(void)
+{
+	struct task_struct *tsk = current;
+
+#ifdef CONFIG_X86_64
+	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
+		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
+		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
+			clear_tsk_thread_flag(tsk, TIF_IA32);
+		} else {
+			set_tsk_thread_flag(tsk, TIF_IA32);
+			current_thread_info()->status |= TS_COMPAT;
+		}
+	}
+#endif
+
+	clear_tsk_thread_flag(tsk, TIF_DEBUG);
+
+	tsk->thread.debugreg0 = 0;
+	tsk->thread.debugreg1 = 0;
+	tsk->thread.debugreg2 = 0;
+	tsk->thread.debugreg3 = 0;
+	tsk->thread.debugreg6 = 0;
+	tsk->thread.debugreg7 = 0;
+	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+	/*
+	 * Forget coprocessor state..
+	 */
+	tsk->fpu_counter = 0;
+	clear_fpu(tsk);
+	clear_used_math();
+}
+
+static void hard_disable_TSC(void)
+{
+	write_cr4(read_cr4() | X86_CR4_TSD);
+}
+
+void disable_TSC(void)
+{
+	preempt_disable();
+	if (!test_and_set_thread_flag(TIF_NOTSC))
+		/*
+		 * Must flip the CPU state synchronously with
+		 * TIF_NOTSC in the current running context.
+		 */
+		hard_disable_TSC();
+	preempt_enable();
+}
+
+static void hard_enable_TSC(void)
+{
+	write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+
+static void enable_TSC(void)
+{
+	preempt_disable();
+	if (test_and_clear_thread_flag(TIF_NOTSC))
+		/*
+		 * Must flip the CPU state synchronously with
+		 * TIF_NOTSC in the current running context.
+		 */
+		hard_enable_TSC();
+	preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+	unsigned int val;
+
+	if (test_thread_flag(TIF_NOTSC))
+		val = PR_TSC_SIGSEGV;
+	else
+		val = PR_TSC_ENABLE;
+
+	return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+	if (val == PR_TSC_SIGSEGV)
+		disable_TSC();
+	else if (val == PR_TSC_ENABLE)
+		enable_TSC();
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+		      struct tss_struct *tss)
+{
+	struct thread_struct *prev, *next;
+
+	prev = &prev_p->thread;
+	next = &next_p->thread;
+
+	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+		ds_switch_to(prev_p, next_p);
+	else if (next->debugctlmsr != prev->debugctlmsr)
+		update_debugctlmsr(next->debugctlmsr);
+
+	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+		set_debugreg(next->debugreg0, 0);
+		set_debugreg(next->debugreg1, 1);
+		set_debugreg(next->debugreg2, 2);
+		set_debugreg(next->debugreg3, 3);
+		/* no 4 and 5 */
+		set_debugreg(next->debugreg6, 6);
+		set_debugreg(next->debugreg7, 7);
+	}
+
+	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+		/* prev and next are different */
+		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+			hard_disable_TSC();
+		else
+			hard_enable_TSC();
+	}
+
+	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+		/*
+		 * Copy the relevant range of the IO bitmap.
+		 * Normally this is 128 bytes or less:
+		 */
+		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+		       max(prev->io_bitmap_max, next->io_bitmap_max));
+	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+		/*
+		 * Clear any possible leftover bits:
+		 */
+		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+	}
+}
+
+int sys_fork(struct pt_regs *regs)
+{
+	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+int sys_vfork(struct pt_regs *regs)
+{
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
+		       NULL, NULL);
+}
+
+
+/*
  * Idle related variables and functions
  */
 unsigned long boot_option_idle_override = 0;
@@ -135,7 +328,7 @@ void stop_this_cpu(void *dummy)
 	/*
 	 * Remove this CPU:
 	 */
-	cpu_clear(smp_processor_id(), cpu_online_map);
+	set_cpu_online(smp_processor_id(), false);
 	disable_local_APIC();
 
 	for (;;) {
@@ -285,12 +478,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
 	return 1;
 }
 
-static cpumask_t c1e_mask = CPU_MASK_NONE;
+static cpumask_var_t c1e_mask;
 static int c1e_detected;
 
 void c1e_remove_cpu(int cpu)
 {
-	cpu_clear(cpu, c1e_mask);
+	if (c1e_mask != NULL)
+		cpumask_clear_cpu(cpu, c1e_mask);
 }
 
 /*
@@ -319,8 +513,8 @@ static void c1e_idle(void)
 	if (c1e_detected) {
 		int cpu = smp_processor_id();
 
-		if (!cpu_isset(cpu, c1e_mask)) {
-			cpu_set(cpu, c1e_mask);
+		if (!cpumask_test_cpu(cpu, c1e_mask)) {
+			cpumask_set_cpu(cpu, c1e_mask);
 			/*
 			 * Force broadcast so ACPI can not interfere. Needs
 			 * to run with interrupts enabled as it uses
@@ -372,6 +566,15 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 		pm_idle = default_idle;
 }
 
+void __init init_c1e_mask(void)
+{
+	/* If we're using c1e_idle, we need to allocate c1e_mask. */
+	if (pm_idle == c1e_idle) {
+		alloc_cpumask_var(&c1e_mask, GFP_KERNEL);
+		cpumask_clear(c1e_mask);
+	}
+}
+
 static int __init idle_setup(char *str)
 {
 	if (!str)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 646da41a620..76f8f84043a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -230,55 +230,6 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 }
 EXPORT_SYMBOL(kernel_thread);
 
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-	/* The process may have allocated an io port bitmap... nuke it. */
-	if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
-		struct task_struct *tsk = current;
-		struct thread_struct *t = &tsk->thread;
-		int cpu = get_cpu();
-		struct tss_struct *tss = &per_cpu(init_tss, cpu);
-
-		kfree(t->io_bitmap_ptr);
-		t->io_bitmap_ptr = NULL;
-		clear_thread_flag(TIF_IO_BITMAP);
-		/*
-		 * Careful, clear this in the TSS too:
-		 */
-		memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
-		t->io_bitmap_max = 0;
-		tss->io_bitmap_owner = NULL;
-		tss->io_bitmap_max = 0;
-		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
-		put_cpu();
-	}
-
-	ds_exit_thread(current);
-}
-
-void flush_thread(void)
-{
-	struct task_struct *tsk = current;
-
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
-	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
-	/*
-	 * Forget coprocessor state..
-	 */
-	tsk->fpu_counter = 0;
-	clear_fpu(tsk);
-	clear_used_math();
-}
-
 void release_thread(struct task_struct *dead_task)
 {
 	BUG_ON(dead_task->mm);
@@ -294,7 +245,7 @@ void prepare_to_copy(struct task_struct *tsk)
 	unlazy_fpu(tsk);
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
 	unsigned long unused,
 	struct task_struct *p, struct pt_regs *regs)
 {
@@ -366,127 +317,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 }
 EXPORT_SYMBOL_GPL(start_thread);
 
-static void hard_disable_TSC(void)
-{
-	write_cr4(read_cr4() | X86_CR4_TSD);
-}
-
-void disable_TSC(void)
-{
-	preempt_disable();
-	if (!test_and_set_thread_flag(TIF_NOTSC))
-		/*
-		 * Must flip the CPU state synchronously with
-		 * TIF_NOTSC in the current running context.
-		 */
-		hard_disable_TSC();
-	preempt_enable();
-}
-
-static void hard_enable_TSC(void)
-{
-	write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-
-static void enable_TSC(void)
-{
-	preempt_disable();
-	if (test_and_clear_thread_flag(TIF_NOTSC))
-		/*
-		 * Must flip the CPU state synchronously with
-		 * TIF_NOTSC in the current running context.
-		 */
-		hard_enable_TSC();
-	preempt_enable();
-}
-
-int get_tsc_mode(unsigned long adr)
-{
-	unsigned int val;
-
-	if (test_thread_flag(TIF_NOTSC))
-		val = PR_TSC_SIGSEGV;
-	else
-		val = PR_TSC_ENABLE;
-
-	return put_user(val, (unsigned int __user *)adr);
-}
-
-int set_tsc_mode(unsigned int val)
-{
-	if (val == PR_TSC_SIGSEGV)
-		disable_TSC();
-	else if (val == PR_TSC_ENABLE)
-		enable_TSC();
-	else
-		return -EINVAL;
-
-	return 0;
-}
-
-static noinline void
-__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
-		 struct tss_struct *tss)
-{
-	struct thread_struct *prev, *next;
-
-	prev = &prev_p->thread;
-	next = &next_p->thread;
-
-	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
-	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
-		ds_switch_to(prev_p, next_p);
-	else if (next->debugctlmsr != prev->debugctlmsr)
-		update_debugctlmsr(next->debugctlmsr);
-
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg0, 0);
-		set_debugreg(next->debugreg1, 1);
-		set_debugreg(next->debugreg2, 2);
-		set_debugreg(next->debugreg3, 3);
-		/* no 4 and 5 */
-		set_debugreg(next->debugreg6, 6);
-		set_debugreg(next->debugreg7, 7);
-	}
-
-	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
-	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
-		/* prev and next are different */
-		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
-			hard_disable_TSC();
-		else
-			hard_enable_TSC();
-	}
-
-	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
-		/*
-		 * Disable the bitmap via an invalid offset. We still cache
-		 * the previous bitmap owner and the IO bitmap contents:
-		 */
-		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
-		return;
-	}
-
-	if (likely(next == tss->io_bitmap_owner)) {
-		/*
-		 * Previous owner of the bitmap (hence the bitmap content)
-		 * matches the next task, we dont have to do anything but
-		 * to set a valid offset in the TSS:
-		 */
-		tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-		return;
-	}
-	/*
-	 * Lazy TSS's I/O bitmap copy. We set an invalid offset here
-	 * and we let the task to get a GPF in case an I/O instruction
-	 * is performed.  The handler of the GPF will verify that the
-	 * faulting task has a valid I/O bitmap and, it true, does the
-	 * real copy and restart the instruction.  This will save us
-	 * redundant copies when the currently switched task does not
-	 * perform any I/O during its timeslice.
-	 */
-	tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
-}
 
 /*
  *	switch_to(x,yn) should switch tasks from x to y.
@@ -600,11 +430,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	return prev_p;
 }
 
-int sys_fork(struct pt_regs *regs)
-{
-	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
-}
-
 int sys_clone(struct pt_regs *regs)
 {
 	unsigned long clone_flags;
@@ -621,21 +446,6 @@ int sys_clone(struct pt_regs *regs)
 }
 
 /*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-int sys_vfork(struct pt_regs *regs)
-{
-	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL);
-}
-
-/*
  * sys_execve() executes a new program.
  */
 int sys_execve(struct pt_regs *regs)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 836ef6575f0..b751a41392b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -237,61 +237,6 @@ void show_regs(struct pt_regs *regs)
 	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
 }
 
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-	struct task_struct *me = current;
-	struct thread_struct *t = &me->thread;
-
-	if (me->thread.io_bitmap_ptr) {
-		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
-
-		kfree(t->io_bitmap_ptr);
-		t->io_bitmap_ptr = NULL;
-		clear_thread_flag(TIF_IO_BITMAP);
-		/*
-		 * Careful, clear this in the TSS too:
-		 */
-		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
-		t->io_bitmap_max = 0;
-		put_cpu();
-	}
-
-	ds_exit_thread(current);
-}
-
-void flush_thread(void)
-{
-	struct task_struct *tsk = current;
-
-	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
-		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
-		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
-			clear_tsk_thread_flag(tsk, TIF_IA32);
-		} else {
-			set_tsk_thread_flag(tsk, TIF_IA32);
-			current_thread_info()->status |= TS_COMPAT;
-		}
-	}
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
-	tsk->thread.debugreg0 = 0;
-	tsk->thread.debugreg1 = 0;
-	tsk->thread.debugreg2 = 0;
-	tsk->thread.debugreg3 = 0;
-	tsk->thread.debugreg6 = 0;
-	tsk->thread.debugreg7 = 0;
-	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
-	/*
-	 * Forget coprocessor state..
-	 */
-	tsk->fpu_counter = 0;
-	clear_fpu(tsk);
-	clear_used_math();
-}
-
 void release_thread(struct task_struct *dead_task)
 {
 	if (dead_task->mm) {
@@ -333,7 +278,7 @@ void prepare_to_copy(struct task_struct *tsk)
 	unlazy_fpu(tsk);
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
 		unsigned long unused,
 	struct task_struct *p, struct pt_regs *regs)
 {
@@ -425,118 +370,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 }
 EXPORT_SYMBOL_GPL(start_thread);
 
-static void hard_disable_TSC(void)
-{
-	write_cr4(read_cr4() | X86_CR4_TSD);
-}
-
-void disable_TSC(void)
-{
-	preempt_disable();
-	if (!test_and_set_thread_flag(TIF_NOTSC))
-		/*
-		 * Must flip the CPU state synchronously with
-		 * TIF_NOTSC in the current running context.
-		 */
-		hard_disable_TSC();
-	preempt_enable();
-}
-
-static void hard_enable_TSC(void)
-{
-	write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-
-static void enable_TSC(void)
-{
-	preempt_disable();
-	if (test_and_clear_thread_flag(TIF_NOTSC))
-		/*
-		 * Must flip the CPU state synchronously with
-		 * TIF_NOTSC in the current running context.
-		 */
-		hard_enable_TSC();
-	preempt_enable();
-}
-
-int get_tsc_mode(unsigned long adr)
-{
-	unsigned int val;
-
-	if (test_thread_flag(TIF_NOTSC))
-		val = PR_TSC_SIGSEGV;
-	else
-		val = PR_TSC_ENABLE;
-
-	return put_user(val, (unsigned int __user *)adr);
-}
-
-int set_tsc_mode(unsigned int val)
-{
-	if (val == PR_TSC_SIGSEGV)
-		disable_TSC();
-	else if (val == PR_TSC_ENABLE)
-		enable_TSC();
-	else
-		return -EINVAL;
-
-	return 0;
-}
-
-/*
- * This special macro can be used to load a debugging register
- */
-#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
-
-static inline void __switch_to_xtra(struct task_struct *prev_p,
-				    struct task_struct *next_p,
-				    struct tss_struct *tss)
-{
-	struct thread_struct *prev, *next;
-
-	prev = &prev_p->thread,
-	next = &next_p->thread;
-
-	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
-	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
-		ds_switch_to(prev_p, next_p);
-	else if (next->debugctlmsr != prev->debugctlmsr)
-		update_debugctlmsr(next->debugctlmsr);
-
-	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		loaddebug(next, 0);
-		loaddebug(next, 1);
-		loaddebug(next, 2);
-		loaddebug(next, 3);
-		/* no 4 and 5 */
-		loaddebug(next, 6);
-		loaddebug(next, 7);
-	}
-
-	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
-	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
-		/* prev and next are different */
-		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
-			hard_disable_TSC();
-		else
-			hard_enable_TSC();
-	}
-
-	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
-		/*
-		 * Copy the relevant range of the IO bitmap.
-		 * Normally this is 128 bytes or less:
-		 */
-		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
-		       max(prev->io_bitmap_max, next->io_bitmap_max));
-	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
-		/*
-		 * Clear any possible leftover bits:
-		 */
-		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
-	}
-}
-
 /*
  *	switch_to(x,y) should switch tasks from x to y.
  *
@@ -694,11 +527,6 @@ void set_personality_64bit(void)
 	current->personality &= ~READ_IMPLIES_EXEC;
 }
 
-asmlinkage long sys_fork(struct pt_regs *regs)
-{
-	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
-}
-
 asmlinkage long
 sys_clone(unsigned long clone_flags, unsigned long newsp,
 	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
@@ -708,22 +536,6 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
 	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
 }
 
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-asmlinkage long sys_vfork(struct pt_regs *regs)
-{
-	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
-		    NULL, NULL);
-}
-
 unsigned long get_wchan(struct task_struct *p)
 {
 	unsigned long stack;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3d9672e59c1..fe9345c967d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
 #include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/signal.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -685,9 +686,8 @@ static int ptrace_bts_config(struct task_struct *child,
 		if (!cfg.signal)
 			return -EINVAL;
 
-		return -EOPNOTSUPP;
-
 		child->thread.bts_ovfl_signal = cfg.signal;
+		return -EOPNOTSUPP;
 	}
 
 	if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
@@ -1416,6 +1416,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
 	    tracehook_report_syscall_entry(regs))
 		ret = -1L;
 
+	if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
+		ftrace_syscall_enter(regs);
+
 	if (unlikely(current->audit_context)) {
 		if (IS_IA32)
 			audit_syscall_entry(AUDIT_ARCH_I386,
@@ -1439,6 +1442,9 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
 	if (unlikely(current->audit_context))
 		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
 
+	if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
+		ftrace_syscall_exit(regs);
+
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall_exit(regs, 0);
 
@@ -1456,6 +1462,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
 	 * system call instruction.
 	 */
 	if (test_thread_flag(TIF_SINGLESTEP) &&
-	    tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
+	    tracehook_consider_fatal_signal(current, SIGTRAP))
 		send_sigtrap(current, regs, 0, TRAP_BRKPT);
 }
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 309949e9e1c..e95022e4f5d 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
 	if (!force_hpet_address)
 		return;
 
-	if (rcba_base == NULL)
-		BUG();
+	BUG_ON(rcba_base == NULL);
 
 	/* read the Function Disable register, dword mode only */
 	val = readl(rcba_base + 0x3404);
@@ -172,7 +171,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
 			 ich_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
 			 ich_force_enable_hpet);
-
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16,	/* ICH10 */
+			 ich_force_enable_hpet);
 
 static struct pci_dev *cached_dev;
 
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1cc18d439bb..2aef36d8aca 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
 		},
 	},
+	{	/* Handle problems with rebooting on Dell XPS710 */
+		.callback = set_bios_reboot,
+		.ident = "Dell XPS710",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+		},
+	},
 	{ }
 };
 
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 2064d0aa8d2..41235531b11 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -17,7 +17,8 @@
 
 #define PTR(x) (x << 2)
 
-/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
  * ~ control_page + PAGE_SIZE are used as data storage and stack for
  * jumping back
  */
@@ -76,8 +77,10 @@ relocate_kernel:
 	movl	%eax, CP_PA_SWAP_PAGE(%edi)
 	movl	%ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
 
-	/* get physical address of control page now */
-	/* this is impossible after page table switch */
+	/*
+	 * get physical address of control page now
+	 * this is impossible after page table switch
+	 */
 	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
 
 	/* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
 	/* store the start address on the stack */
 	pushl   %edx
 
-	/* Set cr0 to a known state:
+	/*
+	 * Set cr0 to a known state:
 	 *  - Paging disabled
 	 *  - Alignment check disabled
 	 *  - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
 	/* clear cr4 if applicable */
 	testl	%ecx, %ecx
 	jz	1f
-	/* Set cr4 to a known state:
+	/*
+	 * Set cr4 to a known state:
 	 * Setting everything to zero seems safe.
 	 */
 	xorl	%eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
 	call	swap_pages
 	addl	$8, %esp
 
-	/* To be certain of avoiding problems with self-modifying code
+	/*
+	 * To be certain of avoiding problems with self-modifying code
 	 * I need to execute a serializing instruction here.
 	 * So I flush the TLB, it's handy, and not processor dependent.
 	 */
 	xorl	%eax, %eax
 	movl	%eax, %cr3
 
-	/* set all of the registers to known values */
-	/* leave %esp alone */
+	/*
+	 * set all of the registers to known values
+	 * leave %esp alone
+	 */
 
 	testl	%esi, %esi
 	jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index d32cfb27a47..4de8f5b3d47 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -19,29 +19,77 @@
 #define PTR(x) (x << 3)
 #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset)		(KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define RSP			DATA(0x0)
+#define CR0			DATA(0x8)
+#define CR3			DATA(0x10)
+#define CR4			DATA(0x18)
+
+/* other data */
+#define CP_PA_TABLE_PAGE	DATA(0x20)
+#define CP_PA_SWAP_PAGE		DATA(0x28)
+#define CP_PA_BACKUP_PAGES_MAP	DATA(0x30)
+
 	.text
 	.align PAGE_SIZE
 	.code64
 	.globl relocate_kernel
 relocate_kernel:
-	/* %rdi indirection_page
+	/*
+	 * %rdi indirection_page
 	 * %rsi page_list
 	 * %rdx start address
+	 * %rcx preserve_context
 	 */
 
+	/* Save the CPU context, used for jumping back */
+	pushq %rbx
+	pushq %rbp
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	pushq %r15
+	pushf
+
+	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
+	movq	%rsp, RSP(%r11)
+	movq	%cr0, %rax
+	movq	%rax, CR0(%r11)
+	movq	%cr3, %rax
+	movq	%rax, CR3(%r11)
+	movq	%cr4, %rax
+	movq	%rax, CR4(%r11)
+
 	/* zero out flags, and disable interrupts */
 	pushq $0
 	popfq
 
-	/* get physical address of control page now */
-	/* this is impossible after page table switch */
+	/*
+	 * get physical address of control page now
+	 * this is impossible after page table switch
+	 */
 	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8
 
 	/* get physical address of page table now too */
-	movq	PTR(PA_TABLE_PAGE)(%rsi), %rcx
+	movq	PTR(PA_TABLE_PAGE)(%rsi), %r9
+
+	/* get physical address of swap page now */
+	movq	PTR(PA_SWAP_PAGE)(%rsi), %r10
+
+	/* save some information for jumping back */
+	movq	%r9, CP_PA_TABLE_PAGE(%r11)
+	movq	%r10, CP_PA_SWAP_PAGE(%r11)
+	movq	%rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
 
 	/* Switch to the identity mapped page tables */
-	movq	%rcx, %cr3
+	movq	%r9, %cr3
 
 	/* setup a new stack at the end of the physical control page */
 	lea	PAGE_SIZE(%r8), %rsp
@@ -55,7 +103,8 @@ identity_mapped:
 	/* store the start address on the stack */
 	pushq   %rdx
 
-	/* Set cr0 to a known state:
+	/*
+	 * Set cr0 to a known state:
 	 *  - Paging enabled
 	 *  - Alignment check disabled
 	 *  - Write protect disabled
@@ -68,7 +117,8 @@ identity_mapped:
 	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
 	movq	%rax, %cr0
 
-	/* Set cr4 to a known state:
+	/*
+	 * Set cr4 to a known state:
 	 *  - physical address extension enabled
 	 */
 	movq	$X86_CR4_PAE, %rax
@@ -78,9 +128,87 @@ identity_mapped:
 1:
 
 	/* Flush the TLB (needed?) */
-	movq	%rcx, %cr3
+	movq	%r9, %cr3
+
+	movq	%rcx, %r11
+	call	swap_pages
+
+	/*
+	 * To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB by reloading %cr3 here, it's handy,
+	 * and not processor dependent.
+	 */
+	movq	%cr3, %rax
+	movq	%rax, %cr3
+
+	/*
+	 * set all of the registers to known values
+	 * leave %rsp alone
+	 */
+
+	testq	%r11, %r11
+	jnz 1f
+	xorq	%rax, %rax
+	xorq	%rbx, %rbx
+	xorq    %rcx, %rcx
+	xorq    %rdx, %rdx
+	xorq    %rsi, %rsi
+	xorq    %rdi, %rdi
+	xorq    %rbp, %rbp
+	xorq	%r8,  %r8
+	xorq	%r9,  %r9
+	xorq	%r10, %r9
+	xorq	%r11, %r11
+	xorq	%r12, %r12
+	xorq	%r13, %r13
+	xorq	%r14, %r14
+	xorq	%r15, %r15
+
+	ret
+
+1:
+	popq	%rdx
+	leaq	PAGE_SIZE(%r10), %rsp
+	call	*%rdx
+
+	/* get the re-entry point of the peer system */
+	movq	0(%rsp), %rbp
+	call	1f
+1:
+	popq	%r8
+	subq	$(1b - relocate_kernel), %r8
+	movq	CP_PA_SWAP_PAGE(%r8), %r10
+	movq	CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
+	movq	CP_PA_TABLE_PAGE(%r8), %rax
+	movq	%rax, %cr3
+	lea	PAGE_SIZE(%r8), %rsp
+	call	swap_pages
+	movq	$virtual_mapped, %rax
+	pushq	%rax
+	ret
+
+virtual_mapped:
+	movq	RSP(%r8), %rsp
+	movq	CR4(%r8), %rax
+	movq	%rax, %cr4
+	movq	CR3(%r8), %rax
+	movq	CR0(%r8), %r8
+	movq	%rax, %cr3
+	movq	%r8, %cr0
+	movq	%rbp, %rax
+
+	popf
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbp
+	popq	%rbx
+	ret
 
 	/* Do the copies */
+swap_pages:
 	movq	%rdi, %rcx 	/* Put the page_list in %rcx */
 	xorq	%rdi, %rdi
 	xorq	%rsi, %rsi
@@ -112,36 +240,27 @@ identity_mapped:
 	movq	%rcx,   %rsi  /* For ever source page do a copy */
 	andq	$0xfffffffffffff000, %rsi
 
+	movq	%rdi, %rdx
+	movq	%rsi, %rax
+
+	movq	%r10, %rdi
 	movq	$512,   %rcx
 	rep ; movsq
-	jmp	0b
-3:
-
-	/* To be certain of avoiding problems with self-modifying code
-	 * I need to execute a serializing instruction here.
-	 * So I flush the TLB by reloading %cr3 here, it's handy,
-	 * and not processor dependent.
-	 */
-	movq	%cr3, %rax
-	movq	%rax, %cr3
 
-	/* set all of the registers to known values */
-	/* leave %rsp alone */
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	$512,   %rcx
+	rep ; movsq
 
-	xorq	%rax, %rax
-	xorq	%rbx, %rbx
-	xorq    %rcx, %rcx
-	xorq    %rdx, %rdx
-	xorq    %rsi, %rsi
-	xorq    %rdi, %rdi
-	xorq    %rbp, %rbp
-	xorq	%r8,  %r8
-	xorq	%r9,  %r9
-	xorq	%r10, %r9
-	xorq	%r11, %r11
-	xorq	%r12, %r12
-	xorq	%r13, %r13
-	xorq	%r14, %r14
-	xorq	%r15, %r15
+	movq	%rdx, %rdi
+	movq	%r10, %rsi
+	movq	$512,   %rcx
+	rep ; movsq
 
+	lea	PAGE_SIZE(%rax), %rsi
+	jmp	0b
+3:
 	ret
+
+	.globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index dd6f2b71561..5d465b207e7 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -1,14 +1,14 @@
 /*
  * RTC related functions
  */
+#include <linux/platform_device.h>
+#include <linux/mc146818rtc.h>
 #include <linux/acpi.h>
 #include <linux/bcd.h>
-#include <linux/mc146818rtc.h>
-#include <linux/platform_device.h>
 #include <linux/pnp.h>
 
-#include <asm/time.h>
 #include <asm/vsyscall.h>
+#include <asm/time.h>
 
 #ifdef CONFIG_X86_32
 /*
@@ -16,9 +16,9 @@
  * register we are working with.  It is required for NMI access to the
  * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
  */
-volatile unsigned long cmos_lock = 0;
+volatile unsigned long cmos_lock;
 EXPORT_SYMBOL(cmos_lock);
-#endif
+#endif /* CONFIG_X86_32 */
 
 /* For two digit years assume time is always after that */
 #define CMOS_YEARS_OFFS 2000
@@ -38,9 +38,9 @@ EXPORT_SYMBOL(rtc_lock);
  */
 int mach_set_rtc_mmss(unsigned long nowtime)
 {
-	int retval = 0;
 	int real_seconds, real_minutes, cmos_minutes;
 	unsigned char save_control, save_freq_select;
+	int retval = 0;
 
 	 /* tell the clock it's being set */
 	save_control = CMOS_READ(RTC_CONTROL);
@@ -72,8 +72,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 			real_seconds = bin2bcd(real_seconds);
 			real_minutes = bin2bcd(real_minutes);
 		}
-		CMOS_WRITE(real_seconds,RTC_SECONDS);
-		CMOS_WRITE(real_minutes,RTC_MINUTES);
+		CMOS_WRITE(real_seconds, RTC_SECONDS);
+		CMOS_WRITE(real_minutes, RTC_MINUTES);
 	} else {
 		printk(KERN_WARNING
 		       "set_rtc_mmss: can't update from %d to %d\n",
@@ -151,6 +151,7 @@ unsigned char rtc_cmos_read(unsigned char addr)
 	outb(addr, RTC_PORT(0));
 	val = inb(RTC_PORT(1));
 	lock_cmos_suffix(addr);
+
 	return val;
 }
 EXPORT_SYMBOL(rtc_cmos_read);
@@ -166,8 +167,8 @@ EXPORT_SYMBOL(rtc_cmos_write);
 
 static int set_rtc_mmss(unsigned long nowtime)
 {
-	int retval;
 	unsigned long flags;
+	int retval;
 
 	spin_lock_irqsave(&rtc_lock, flags);
 	retval = set_wallclock(nowtime);
@@ -242,6 +243,7 @@ static __init int add_rtc_cmos(void)
 	platform_device_register(&rtc_device);
 	dev_info(&rtc_device.dev,
 		 "registered platform RTC device (no PNP device found)\n");
+
 	return 0;
 }
 device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5b85759e797..b4158439bf6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,8 +112,13 @@
 #define ARCH_SETUP
 #endif
 
+RESERVE_BRK(dmi_alloc, 65536);
+
 unsigned int boot_cpu_id __read_mostly;
 
+static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
+
 #ifdef CONFIG_X86_64
 int default_cpu_present_to_apicid(int mps_cpu)
 {
@@ -158,12 +163,6 @@ static struct resource bss_resource = {
 
 
 #ifdef CONFIG_X86_32
-/* This value is set up by the early boot code to point to the value
-   immediately after the boot time page tables.  It contains a *physical*
-   address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
-
 static struct resource video_ram_resource = {
 	.name	= "Video RAM area",
 	.start	= 0xa0000,
@@ -202,7 +201,9 @@ struct ist_info ist_info;
 #endif
 
 #else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+	.x86_phys_bits = MAX_PHYSMEM_BITS,
+};
 EXPORT_SYMBOL(boot_cpu_data);
 #endif
 
@@ -217,12 +218,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
 int bootloader_type;
 
 /*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
  * Setup options
  */
 struct screen_info screen_info;
@@ -267,6 +262,35 @@ static inline void copy_edd(void)
 }
 #endif
 
+void * __init extend_brk(size_t size, size_t align)
+{
+	size_t mask = align - 1;
+	void *ret;
+
+	BUG_ON(_brk_start == 0);
+	BUG_ON(align & mask);
+
+	_brk_end = (_brk_end + mask) & ~mask;
+	BUG_ON((char *)(_brk_end + size) > __brk_limit);
+
+	ret = (void *)_brk_end;
+	_brk_end += size;
+
+	memset(ret, 0, size);
+
+	return ret;
+}
+
+static void __init reserve_brk(void)
+{
+	if (_brk_end > _brk_start)
+		reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
+	/* Mark brk area as locked down and no longer taking any
+	   new allocations */
+	_brk_start = 0;
+}
+
 #ifdef CONFIG_BLK_DEV_INITRD
 
 #ifdef CONFIG_X86_32
@@ -600,19 +624,7 @@ static int __init setup_elfcorehdr(char *arg)
 early_param("elfcorehdr", setup_elfcorehdr);
 #endif
 
-static int __init default_update_apic(void)
-{
-#ifdef CONFIG_SMP
-	if (!apic->wakeup_cpu)
-		apic->wakeup_cpu = wakeup_secondary_cpu_via_init;
-#endif
-
-	return 0;
-}
-
-static struct x86_quirks default_x86_quirks __initdata = {
-	.update_apic         = default_update_apic,
-};
+static struct x86_quirks default_x86_quirks __initdata;
 
 struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
 
@@ -727,11 +739,7 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
-	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
-	init_mm.brk = (unsigned long) &_end;
-#endif
+	init_mm.brk = _brk_end;
 
 	code_resource.start = virt_to_phys(_text);
 	code_resource.end = virt_to_phys(_etext)-1;
@@ -782,6 +790,9 @@ void __init setup_arch(char **cmdline_p)
 
 	finish_e820_parsing();
 
+	if (efi_enabled)
+		efi_init();
+
 	dmi_scan_machine();
 
 	dmi_check_system(bad_bios_dmi_table);
@@ -801,8 +812,6 @@ void __init setup_arch(char **cmdline_p)
 	insert_resource(&iomem_resource, &data_resource);
 	insert_resource(&iomem_resource, &bss_resource);
 
-	if (efi_enabled)
-		efi_init();
 
 #ifdef CONFIG_X86_32
 	if (ppro_with_ram_bug()) {
@@ -851,6 +860,8 @@ void __init setup_arch(char **cmdline_p)
 	setup_bios_corruption_check();
 #endif
 
+	reserve_brk();
+
 	/* max_pfn_mapped is updated here */
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
 	max_pfn_mapped = max_low_pfn_mapped;
@@ -875,9 +886,7 @@ void __init setup_arch(char **cmdline_p)
 
 	reserve_initrd();
 
-#ifdef CONFIG_X86_64
 	vsmp_init();
-#endif
 
 	io_delay_init();
 
@@ -1040,7 +1049,6 @@ void __init x86_quirk_trap_init(void)
 static struct irqaction irq0  = {
 	.handler = timer_interrupt,
 	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
-	.mask = CPU_MASK_NONE,
 	.name = "timer"
 };
 
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d992e6cff73..3a97a4cf187 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,6 +7,7 @@
 #include <linux/crash_dump.h>
 #include <linux/smp.h>
 #include <linux/topology.h>
+#include <linux/pfn.h>
 #include <asm/sections.h>
 #include <asm/processor.h>
 #include <asm/setup.h>
@@ -41,6 +42,295 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
 
+/*
+ * On x86_64 symbols referenced from code should be reachable using
+ * 32bit relocations.  Reserve space for static percpu variables in
+ * modules so that they are always served from the first chunk which
+ * is located at the percpu segment base.  On x86_32, anything can
+ * address anywhere.  No need to reserve space in the first chunk.
+ */
+#ifdef CONFIG_X86_64
+#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
+#else
+#define PERCPU_FIRST_CHUNK_RESERVE	0
+#endif
+
+/**
+ * pcpu_need_numa - determine percpu allocation needs to consider NUMA
+ *
+ * If NUMA is not configured or there is only one NUMA node available,
+ * there is no reason to consider NUMA.  This function determines
+ * whether percpu allocation should consider NUMA or not.
+ *
+ * RETURNS:
+ * true if NUMA should be considered; otherwise, false.
+ */
+static bool __init pcpu_need_numa(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	pg_data_t *last = NULL;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		int node = early_cpu_to_node(cpu);
+
+		if (node_online(node) && NODE_DATA(node) &&
+		    last && last != NODE_DATA(node))
+			return true;
+
+		last = NODE_DATA(node);
+	}
+#endif
+	return false;
+}
+
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+					unsigned long align)
+{
+	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	int node = early_cpu_to_node(cpu);
+	void *ptr;
+
+	if (!node_online(node) || !NODE_DATA(node)) {
+		ptr = __alloc_bootmem_nopanic(size, align, goal);
+		pr_info("cpu %d has no node %d or node-local memory\n",
+			cpu, node);
+		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+			 cpu, size, __pa(ptr));
+	} else {
+		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
+						   size, align, goal);
+		pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
+			 "%016lx\n", cpu, size, node, __pa(ptr));
+	}
+	return ptr;
+#else
+	return __alloc_bootmem_nopanic(size, align, goal);
+#endif
+}
+
+/*
+ * Remap allocator
+ *
+ * This allocator uses PMD page as unit.  A PMD page is allocated for
+ * each cpu and each is remapped into vmalloc area using PMD mapping.
+ * As PMD page is quite large, only part of it is used for the first
+ * chunk.  Unused part is returned to the bootmem allocator.
+ *
+ * So, the PMD pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk.  The double
+ * mapping does add one more PMD TLB entry pressure but still is much
+ * better than only using 4k mappings while still being NUMA friendly.
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static size_t pcpur_size __initdata;
+static void **pcpur_ptrs __initdata;
+
+static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
+{
+	size_t off = (size_t)pageno << PAGE_SHIFT;
+
+	if (off >= pcpur_size)
+		return NULL;
+
+	return virt_to_page(pcpur_ptrs[cpu] + off);
+}
+
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+	static struct vm_struct vm;
+	size_t ptrs_size, dyn_size;
+	unsigned int cpu;
+	ssize_t ret;
+
+	/*
+	 * If large page isn't supported, there's no benefit in doing
+	 * this.  Also, on non-NUMA, embedding is better.
+	 */
+	if (!cpu_has_pse || !pcpu_need_numa())
+		return -EINVAL;
+
+	/*
+	 * Currently supports only single page.  Supporting multiple
+	 * pages won't be too difficult if it ever becomes necessary.
+	 */
+	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			       PERCPU_DYNAMIC_RESERVE);
+	if (pcpur_size > PMD_SIZE) {
+		pr_warning("PERCPU: static data is larger than large page, "
+			   "can't use large page\n");
+		return -EINVAL;
+	}
+	dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
+
+	/* allocate pointer array and alloc large pages */
+	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
+	pcpur_ptrs = alloc_bootmem(ptrs_size);
+
+	for_each_possible_cpu(cpu) {
+		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
+		if (!pcpur_ptrs[cpu])
+			goto enomem;
+
+		/*
+		 * Only use pcpur_size bytes and give back the rest.
+		 *
+		 * Ingo: The 2MB up-rounding bootmem is needed to make
+		 * sure the partial 2MB page is still fully RAM - it's
+		 * not well-specified to have a PAT-incompatible area
+		 * (unmapped RAM, device memory, etc.) in that hole.
+		 */
+		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+			     PMD_SIZE - pcpur_size);
+
+		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+	}
+
+	/* allocate address and map */
+	vm.flags = VM_ALLOC;
+	vm.size = num_possible_cpus() * PMD_SIZE;
+	vm_area_register_early(&vm, PMD_SIZE);
+
+	for_each_possible_cpu(cpu) {
+		pmd_t *pmd;
+
+		pmd = populate_extra_pmd((unsigned long)vm.addr
+					 + cpu * PMD_SIZE);
+		set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
+				     PAGE_KERNEL_LARGE));
+	}
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Remapped at %p with large pages, static data "
+		"%zu bytes\n", vm.addr, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+				     PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
+				     PMD_SIZE, vm.addr, NULL);
+	goto out_free_ar;
+
+enomem:
+	for_each_possible_cpu(cpu)
+		if (pcpur_ptrs[cpu])
+			free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
+	ret = -ENOMEM;
+out_free_ar:
+	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+	return ret;
+}
+#else
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+	return -EINVAL;
+}
+#endif
+
+/*
+ * Embedding allocator
+ *
+ * The first chunk is sized to just contain the static area plus
+ * module and dynamic reserves and embedded into linear physical
+ * mapping so that it can use PMD mapping without additional TLB
+ * pressure.
+ */
+static ssize_t __init setup_pcpu_embed(size_t static_size)
+{
+	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
+
+	/*
+	 * If large page isn't supported, there's no benefit in doing
+	 * this.  Also, embedding allocation doesn't play well with
+	 * NUMA.
+	 */
+	if (!cpu_has_pse || pcpu_need_numa())
+		return -EINVAL;
+
+	return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+				      reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
+}
+
+/*
+ * 4k page allocator
+ *
+ * This is the basic allocator.  Static percpu area is allocated
+ * page-by-page and most of initialization is done by the generic
+ * setup function.
+ */
+static struct page **pcpu4k_pages __initdata;
+static int pcpu4k_nr_static_pages __initdata;
+
+static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
+{
+	if (pageno < pcpu4k_nr_static_pages)
+		return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
+	return NULL;
+}
+
+static void __init pcpu4k_populate_pte(unsigned long addr)
+{
+	populate_extra_pte(addr);
+}
+
+static ssize_t __init setup_pcpu_4k(size_t static_size)
+{
+	size_t pages_size;
+	unsigned int cpu;
+	int i, j;
+	ssize_t ret;
+
+	pcpu4k_nr_static_pages = PFN_UP(static_size);
+
+	/* unaligned allocations can't be freed, round up to page size */
+	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+			       * sizeof(pcpu4k_pages[0]));
+	pcpu4k_pages = alloc_bootmem(pages_size);
+
+	/* allocate and copy */
+	j = 0;
+	for_each_possible_cpu(cpu)
+		for (i = 0; i < pcpu4k_nr_static_pages; i++) {
+			void *ptr;
+
+			ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
+			if (!ptr)
+				goto enomem;
+
+			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
+			pcpu4k_pages[j++] = virt_to_page(ptr);
+		}
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
+		pcpu4k_nr_static_pages, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
+				     PERCPU_FIRST_CHUNK_RESERVE, -1,
+				     -1, NULL, pcpu4k_populate_pte);
+	goto out_free_ar;
+
+enomem:
+	while (--j >= 0)
+		free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
+	ret = -ENOMEM;
+out_free_ar:
+	free_bootmem(__pa(pcpu4k_pages), pages_size);
+	return ret;
+}
+
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -61,38 +351,35 @@ static inline void setup_percpu_segment(int cpu)
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size;
-	char *ptr;
-	int cpu;
-
-	/* Copy section for each CPU (we discard the original) */
-	size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+	size_t static_size = __per_cpu_end - __per_cpu_start;
+	unsigned int cpu;
+	unsigned long delta;
+	size_t pcpu_unit_size;
+	ssize_t ret;
 
 	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 
-	pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
+	/*
+	 * Allocate percpu area.  If PSE is supported, try to make use
+	 * of large page mappings.  Please read comments on top of
+	 * each allocator for details.
+	 */
+	ret = setup_pcpu_remap(static_size);
+	if (ret < 0)
+		ret = setup_pcpu_embed(static_size);
+	if (ret < 0)
+		ret = setup_pcpu_4k(static_size);
+	if (ret < 0)
+		panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
+		      static_size, ret);
 
-	for_each_possible_cpu(cpu) {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-		ptr = alloc_bootmem_pages(size);
-#else
-		int node = early_cpu_to_node(cpu);
-		if (!node_online(node) || !NODE_DATA(node)) {
-			ptr = alloc_bootmem_pages(size);
-			pr_info("cpu %d has no node %d or node-local memory\n",
-				cpu, node);
-			pr_debug("per cpu data for cpu%d at %016lx\n",
-				 cpu, __pa(ptr));
-		} else {
-			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-			pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
-				cpu, node, __pa(ptr));
-		}
-#endif
+	pcpu_unit_size = ret;
 
-		memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
-		per_cpu_offset(cpu) = ptr - __per_cpu_start;
+	/* alrighty, percpu areas up and running */
+	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+	for_each_possible_cpu(cpu) {
+		per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
 		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
 		per_cpu(cpu_number, cpu) = cpu;
 		setup_percpu_segment(cpu);
@@ -125,8 +412,6 @@ void __init setup_per_cpu_areas(void)
 		 */
 		if (cpu == boot_cpu_id)
 			switch_to_new_gdt(cpu);
-
-		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 	}
 
 	/* indicate the early static arrays will soon be gone */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4d344101806..611615a92c9 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -187,6 +187,77 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 /*
  * Set up a signal frame.
  */
+
+/*
+ * Determine which stack to use..
+ */
+static unsigned long align_sigframe(unsigned long sp)
+{
+#ifdef CONFIG_X86_32
+	/*
+	 * Align the stack pointer according to the i386 ABI,
+	 * i.e. so that on function entry ((sp + 4) & 15) == 0.
+	 */
+	sp = ((sp + 4) & -16ul) - 4;
+#else /* !CONFIG_X86_32 */
+	sp = round_down(sp, 16) - 8;
+#endif
+	return sp;
+}
+
+static inline void __user *
+get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+	     void __user **fpstate)
+{
+	/* Default to using normal stack */
+	unsigned long sp = regs->sp;
+	int onsigstack = on_sig_stack(sp);
+
+#ifdef CONFIG_X86_64
+	/* redzone */
+	sp -= 128;
+#endif /* CONFIG_X86_64 */
+
+	if (!onsigstack) {
+		/* This is the X/Open sanctioned signal stack switching.  */
+		if (ka->sa.sa_flags & SA_ONSTACK) {
+			if (current->sas_ss_size)
+				sp = current->sas_ss_sp + current->sas_ss_size;
+		} else {
+#ifdef CONFIG_X86_32
+			/* This is the legacy signal stack switching. */
+			if ((regs->ss & 0xffff) != __USER_DS &&
+				!(ka->sa.sa_flags & SA_RESTORER) &&
+					ka->sa.sa_restorer)
+				sp = (unsigned long) ka->sa.sa_restorer;
+#endif /* CONFIG_X86_32 */
+		}
+	}
+
+	if (used_math()) {
+		sp -= sig_xstate_size;
+#ifdef CONFIG_X86_64
+		sp = round_down(sp, 64);
+#endif /* CONFIG_X86_64 */
+		*fpstate = (void __user *)sp;
+	}
+
+	sp = align_sigframe(sp - frame_size);
+
+	/*
+	 * If we are on the alternate signal stack and would overflow it, don't.
+	 * Return an always-bogus address instead so we will die with SIGSEGV.
+	 */
+	if (onsigstack && !likely(on_sig_stack(sp)))
+		return (void __user *)-1L;
+
+	/* save i387 state */
+	if (used_math() && save_i387_xstate(*fpstate) < 0)
+		return (void __user *)-1L;
+
+	return (void __user *)sp;
+}
+
 #ifdef CONFIG_X86_32
 static const struct {
 	u16 poplmovl;
@@ -210,54 +281,6 @@ static const struct {
 	0
 };
 
-/*
- * Determine which stack to use..
- */
-static inline void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
-	     void **fpstate)
-{
-	unsigned long sp;
-
-	/* Default to using normal stack */
-	sp = regs->sp;
-
-	/*
-	 * If we are on the alternate signal stack and would overflow it, don't.
-	 * Return an always-bogus address instead so we will die with SIGSEGV.
-	 */
-	if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
-		return (void __user *) -1L;
-
-	/* This is the X/Open sanctioned signal stack switching.  */
-	if (ka->sa.sa_flags & SA_ONSTACK) {
-		if (sas_ss_flags(sp) == 0)
-			sp = current->sas_ss_sp + current->sas_ss_size;
-	} else {
-		/* This is the legacy signal stack switching. */
-		if ((regs->ss & 0xffff) != __USER_DS &&
-			!(ka->sa.sa_flags & SA_RESTORER) &&
-				ka->sa.sa_restorer)
-			sp = (unsigned long) ka->sa.sa_restorer;
-	}
-
-	if (used_math()) {
-		sp = sp - sig_xstate_size;
-		*fpstate = (struct _fpstate *) sp;
-		if (save_i387_xstate(*fpstate) < 0)
-			return (void __user *)-1L;
-	}
-
-	sp -= frame_size;
-	/*
-	 * Align the stack pointer according to the i386 ABI,
-	 * i.e. so that on function entry ((sp + 4) & 15) == 0.
-	 */
-	sp = ((sp + 4) & -16ul) - 4;
-
-	return (void __user *) sp;
-}
-
 static int
 __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
 	      struct pt_regs *regs)
@@ -388,24 +411,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	return 0;
 }
 #else /* !CONFIG_X86_32 */
-/*
- * Determine which stack to use..
- */
-static void __user *
-get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
-{
-	/* Default to using normal stack - redzone*/
-	sp -= 128;
-
-	/* This is the X/Open sanctioned signal stack switching.  */
-	if (ka->sa.sa_flags & SA_ONSTACK) {
-		if (sas_ss_flags(sp) == 0)
-			sp = current->sas_ss_sp + current->sas_ss_size;
-	}
-
-	return (void __user *)round_down(sp - size, 64);
-}
-
 static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 			    sigset_t *set, struct pt_regs *regs)
 {
@@ -414,15 +419,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	int err = 0;
 	struct task_struct *me = current;
 
-	if (used_math()) {
-		fp = get_stack(ka, regs->sp, sig_xstate_size);
-		frame = (void __user *)round_down(
-			(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
-
-		if (save_i387_xstate(fp) < 0)
-			return -EFAULT;
-	} else
-		frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
+	frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		return -EFAULT;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9ce666387f3..58d24ef917d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -101,29 +101,20 @@ EXPORT_SYMBOL(smp_num_siblings);
 DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
 
 /* representing HT siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 
 /* representing HT and core siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_t, cpu_core_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 
 /* Per CPU bogomips and other parameters */
 DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
 
-static atomic_t init_deasserted;
-
-
-/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
+atomic_t init_deasserted;
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
-
-/* which logical CPUs are on which nodes */
-cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly =
-				{ [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
-EXPORT_SYMBOL(node_to_cpumask_map);
 /* which node each logical CPU is on */
 int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
 EXPORT_SYMBOL(cpu_to_node_map);
@@ -132,7 +123,7 @@ EXPORT_SYMBOL(cpu_to_node_map);
 static void map_cpu_to_node(int cpu, int node)
 {
 	printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
-	cpumask_set_cpu(cpu, &node_to_cpumask_map[node]);
+	cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
 	cpu_to_node_map[cpu] = node;
 }
 
@@ -143,7 +134,7 @@ static void unmap_cpu_to_node(int cpu)
 
 	printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
 	for (node = 0; node < MAX_NUMNODES; node++)
-		cpumask_clear_cpu(cpu, &node_to_cpumask_map[node]);
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
 	cpu_to_node_map[cpu] = 0;
 }
 #else /* !(CONFIG_NUMA && CONFIG_X86_32) */
@@ -271,8 +262,6 @@ static void __cpuinit smp_callin(void)
 	cpumask_set_cpu(cpuid, cpu_callin_mask);
 }
 
-static int __cpuinitdata unsafe_smp;
-
 /*
  * Activate a secondary processor.
  */
@@ -307,7 +296,7 @@ notrace static void __cpuinit start_secondary(void *unused)
 	__flush_tlb_all();
 #endif
 
-	/* This must be done before setting cpu_online_map */
+	/* This must be done before setting cpu_online_mask */
 	set_cpu_sibling_map(raw_smp_processor_id());
 	wmb();
 
@@ -340,75 +329,22 @@ notrace static void __cpuinit start_secondary(void *unused)
 	cpu_idle();
 }
 
-static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
+#ifdef CONFIG_CPUMASK_OFFSTACK
+/* In this case, llc_shared_map is a pointer to a cpumask. */
+static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
+				    const struct cpuinfo_x86 *src)
 {
-	/*
-	 * Mask B, Pentium, but not Pentium MMX
-	 */
-	if (c->x86_vendor == X86_VENDOR_INTEL &&
-	    c->x86 == 5 &&
-	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
-	    c->x86_model <= 3)
-		/*
-		 * Remember we have B step Pentia with bugs
-		 */
-		smp_b_stepping = 1;
-
-	/*
-	 * Certain Athlons might work (for various values of 'work') in SMP
-	 * but they are not certified as MP capable.
-	 */
-	if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
-		if (num_possible_cpus() == 1)
-			goto valid_k7;
-
-		/* Athlon 660/661 is valid. */
-		if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
-		    (c->x86_mask == 1)))
-			goto valid_k7;
-
-		/* Duron 670 is valid */
-		if ((c->x86_model == 7) && (c->x86_mask == 0))
-			goto valid_k7;
-
-		/*
-		 * Athlon 662, Duron 671, and Athlon >model 7 have capability
-		 * bit. It's worth noting that the A5 stepping (662) of some
-		 * Athlon XP's have the MP bit set.
-		 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
-		 * more.
-		 */
-		if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
-		    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
-		     (c->x86_model > 7))
-			if (cpu_has_mp)
-				goto valid_k7;
-
-		/* If we get here, not a certified SMP capable AMD system. */
-		unsafe_smp = 1;
-	}
-
-valid_k7:
-	;
+	struct cpumask *llc = dst->llc_shared_map;
+	*dst = *src;
+	dst->llc_shared_map = llc;
 }
-
-static void __cpuinit smp_checks(void)
+#else
+static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
+				    const struct cpuinfo_x86 *src)
 {
-	if (smp_b_stepping)
-		printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
-				    "with B stepping processors.\n");
-
-	/*
-	 * Don't taint if we are running SMP kernel on a single non-MP
-	 * approved Athlon
-	 */
-	if (unsafe_smp && num_online_cpus() > 1) {
-		printk(KERN_INFO "WARNING: This combination of AMD"
-			"processors is not suitable for SMP.\n");
-		add_taint(TAINT_UNSAFE_SMP);
-	}
+	*dst = *src;
 }
+#endif /* CONFIG_CPUMASK_OFFSTACK */
 
 /*
  * The bootstrap kernel entry code has set these up. Save them for
@@ -419,11 +355,10 @@ void __cpuinit smp_store_cpu_info(int id)
 {
 	struct cpuinfo_x86 *c = &cpu_data(id);
 
-	*c = boot_cpu_data;
+	copy_cpuinfo_x86(c, &boot_cpu_data);
 	c->cpu_index = id;
 	if (id != 0)
 		identify_secondary_cpu(c);
-	smp_apply_quirks(c);
 }
 
 
@@ -444,15 +379,15 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 				cpumask_set_cpu(cpu, cpu_sibling_mask(i));
 				cpumask_set_cpu(i, cpu_core_mask(cpu));
 				cpumask_set_cpu(cpu, cpu_core_mask(i));
-				cpumask_set_cpu(i, &c->llc_shared_map);
-				cpumask_set_cpu(cpu, &o->llc_shared_map);
+				cpumask_set_cpu(i, c->llc_shared_map);
+				cpumask_set_cpu(cpu, o->llc_shared_map);
 			}
 		}
 	} else {
 		cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
 	}
 
-	cpumask_set_cpu(cpu, &c->llc_shared_map);
+	cpumask_set_cpu(cpu, c->llc_shared_map);
 
 	if (current_cpu_data.x86_max_cores == 1) {
 		cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
@@ -463,8 +398,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
 		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-			cpumask_set_cpu(i, &c->llc_shared_map);
-			cpumask_set_cpu(cpu, &cpu_data(i).llc_shared_map);
+			cpumask_set_cpu(i, c->llc_shared_map);
+			cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map);
 		}
 		if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
 			cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -502,12 +437,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	if (sched_mc_power_savings || sched_smt_power_savings)
 		return cpu_core_mask(cpu);
 	else
-		return &c->llc_shared_map;
-}
-
-cpumask_t cpu_coregroup_map(int cpu)
-{
-	return *cpu_coregroup_mask(cpu);
+		return c->llc_shared_map;
 }
 
 static void impress_friends(void)
@@ -614,12 +544,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	unsigned long send_status, accept_status = 0;
 	int maxlvt, num_starts, j;
 
-	if (get_uv_system_type() == UV_NON_UNIQUE_APIC) {
-		send_status = uv_wakeup_secondary(phys_apicid, start_eip);
-		atomic_set(&init_deasserted, 1);
-		return send_status;
-	}
-
 	maxlvt = lapic_get_maxlvt();
 
 	/*
@@ -748,7 +672,8 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
- * Returns zero if CPU booted OK, else error code from ->wakeup_cpu.
+ * Returns zero if CPU booted OK, else error code from
+ * ->wakeup_secondary_cpu.
  */
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 {
@@ -835,9 +760,13 @@ do_rest:
 	}
 
 	/*
-	 * Starting actual IPI sequence...
+	 * Kick the secondary CPU. Use the method in the APIC driver
+	 * if it's defined - or use an INIT boot APIC message otherwise:
 	 */
-	boot_error = apic->wakeup_cpu(apicid, start_ip);
+	if (apic->wakeup_secondary_cpu)
+		boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
+	else
+		boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
 
 	if (!boot_error) {
 		/*
@@ -975,9 +904,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
  */
 static __init void disable_smp(void)
 {
-	/* use the read/write pointers to the present and possible maps */
-	cpumask_copy(&cpu_present_map, cpumask_of(0));
-	cpumask_copy(&cpu_possible_map, cpumask_of(0));
+	init_cpu_present(cpumask_of(0));
+	init_cpu_possible(cpumask_of(0));
 	smpboot_clear_io_apic_irqs();
 
 	if (smp_found_config)
@@ -1109,6 +1037,8 @@ static void __init smp_cpu_index_default(void)
  */
 void __init native_smp_prepare_cpus(unsigned int max_cpus)
 {
+	unsigned int i;
+
 	preempt_disable();
 	smp_cpu_index_default();
 	current_cpu_data = boot_cpu_data;
@@ -1122,6 +1052,14 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	boot_cpu_logical_apicid = logical_smp_processor_id();
 #endif
 	current_thread_info()->cpu = 0;  /* needed? */
+	for_each_possible_cpu(i) {
+		alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+		alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+		alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
+		cpumask_clear(per_cpu(cpu_core_map, i));
+		cpumask_clear(per_cpu(cpu_sibling_map, i));
+		cpumask_clear(cpu_data(i).llc_shared_map);
+	}
 	set_cpu_sibling_map(0);
 
 	enable_IR_x2apic();
@@ -1194,7 +1132,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 	pr_debug("Boot done.\n");
 
 	impress_friends();
-	smp_checks();
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
 #endif
@@ -1211,11 +1148,11 @@ early_param("possible_cpus", _setup_possible_cpus);
 
 
 /*
- * cpu_possible_map should be static, it cannot change as cpu's
+ * cpu_possible_mask should be static, it cannot change as cpu's
  * are onlined, or offlined. The reason is per-cpu data-structures
  * are allocated by some modules at init time, and dont expect to
  * do this dynamically on cpu arrival/departure.
- * cpu_present_map on the other hand can change dynamically.
+ * cpu_present_mask on the other hand can change dynamically.
  * In case when cpu_hotplug is not compiled, then we resort to current
  * behaviour, which is cpu_possible == cpu_present.
  * - Ashok Raj
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index b7607c4f204..c3ebbb90137 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -333,3 +333,5 @@ ENTRY(sys_call_table)
 	.long sys_pipe2
 	.long sys_inotify_init1
 	.long sys_perf_counter_open
+	.long sys_preadv
+	.long sys_pwritev
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index 241ec3923f6..5ba343e6184 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -116,7 +116,6 @@ unsigned long __init calibrate_cpu(void)
 static struct irqaction irq0 = {
 	.handler	= timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
-	.mask		= CPU_MASK_NONE,
 	.name		= "timer"
 };
 
@@ -125,7 +124,6 @@ void __init hpet_time_init(void)
 	if (!hpet_enable())
 		setup_pit_timer();
 
-	irq0.mask = cpumask_of_cpu(0);
 	setup_irq(0, &irq0);
 }
 
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f04549afcfe..deb5ebb32c3 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -275,6 +275,8 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
 	return NULL;
 }
 
+static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
+
 /**
  * uv_flush_tlb_others - globally purge translation cache of a virtual
  * address or all TLB's
@@ -304,8 +306,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 					  struct mm_struct *mm,
 					  unsigned long va, unsigned int cpu)
 {
-	static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
-	struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
+	struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
 	int i;
 	int bit;
 	int blade;
@@ -314,8 +315,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	int locals = 0;
 	struct bau_desc *bau_desc;
 
-	WARN_ON(!in_atomic());
-
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 
 	uv_cpu = uv_blade_processor_id();
@@ -752,16 +751,21 @@ static int __init uv_bau_init(void)
 	int node;
 	int nblades;
 	int last_blade;
-	int cur_cpu = 0;
+	int cur_cpu;
 
 	if (!is_uv_system())
 		return 0;
 
+	for_each_possible_cpu(cur_cpu)
+		alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
+				       GFP_KERNEL, cpu_to_node(cur_cpu));
+
 	uv_bau_retry_limit = 1;
 	uv_nshift = uv_hub_info->n_val;
 	uv_mmask = (1UL << uv_hub_info->n_val) - 1;
 	nblades = 0;
 	last_blade = -1;
+	cur_cpu = 0;
 	for_each_online_node(node) {
 		blade = uv_node_to_blade_id(node);
 		if (blade == last_blade)
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 0fcc95a354f..7e4515957a1 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -25,10 +25,10 @@
  *
  * Send feedback to <colpatch@us.ibm.com>
  */
-#include <linux/init.h>
-#include <linux/smp.h>
 #include <linux/nodemask.h>
 #include <linux/mmzone.h>
+#include <linux/init.h>
+#include <linux/smp.h>
 #include <asm/cpu.h>
 
 static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
@@ -47,6 +47,7 @@ int __ref arch_register_cpu(int num)
 	 */
 	if (num)
 		per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
+
 	return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
 }
 EXPORT_SYMBOL(arch_register_cpu);
@@ -56,12 +57,13 @@ void arch_unregister_cpu(int num)
 	unregister_cpu(&per_cpu(cpu_devices, num).cpu);
 }
 EXPORT_SYMBOL(arch_unregister_cpu);
-#else
+#else /* CONFIG_HOTPLUG_CPU */
+
 static int __init arch_register_cpu(int num)
 {
 	return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
 }
-#endif /*CONFIG_HOTPLUG_CPU*/
+#endif /* CONFIG_HOTPLUG_CPU */
 
 static int __init topology_init(void)
 {
@@ -70,11 +72,11 @@ static int __init topology_init(void)
 #ifdef CONFIG_NUMA
 	for_each_online_node(i)
 		register_one_node(i);
-#endif /* CONFIG_NUMA */
+#endif
 
 	for_each_present_cpu(i)
 		arch_register_cpu(i);
+
 	return 0;
 }
-
 subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1dba866967e..2cc162e09c4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -118,47 +118,6 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err)
 	if (!user_mode_vm(regs))
 		die(str, regs, err);
 }
-
-/*
- * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
- * invalid offset set (the LAZY one) and the faulting thread has
- * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
- * we set the offset field correctly and return 1.
- */
-static int lazy_iobitmap_copy(void)
-{
-	struct thread_struct *thread;
-	struct tss_struct *tss;
-	int cpu;
-
-	cpu = get_cpu();
-	tss = &per_cpu(init_tss, cpu);
-	thread = &current->thread;
-
-	if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
-	    thread->io_bitmap_ptr) {
-		memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
-		       thread->io_bitmap_max);
-		/*
-		 * If the previously set map was extending to higher ports
-		 * than the current one, pad extra space with 0xff (no access).
-		 */
-		if (thread->io_bitmap_max < tss->io_bitmap_max) {
-			memset((char *) tss->io_bitmap +
-				thread->io_bitmap_max, 0xff,
-				tss->io_bitmap_max - thread->io_bitmap_max);
-		}
-		tss->io_bitmap_max = thread->io_bitmap_max;
-		tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-		tss->io_bitmap_owner = thread;
-		put_cpu();
-
-		return 1;
-	}
-	put_cpu();
-
-	return 0;
-}
 #endif
 
 static void __kprobes
@@ -309,11 +268,6 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	conditional_sti(regs);
 
 #ifdef CONFIG_X86_32
-	if (lazy_iobitmap_copy()) {
-		/* restart the faulting instruction */
-		return;
-	}
-
 	if (regs->flags & X86_VM_MASK)
 		goto gp_in_vm86;
 #endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 83d53ce5d4c..7a567ebe636 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -17,20 +17,21 @@
 #include <asm/delay.h>
 #include <asm/hypervisor.h>
 
-unsigned int cpu_khz;           /* TSC clocks / usec, not used here */
+unsigned int __read_mostly cpu_khz;	/* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
-unsigned int tsc_khz;
+
+unsigned int __read_mostly tsc_khz;
 EXPORT_SYMBOL(tsc_khz);
 
 /*
  * TSC can be unstable due to cpufreq or due to unsynced TSCs
  */
-static int tsc_unstable;
+static int __read_mostly tsc_unstable;
 
 /* native_sched_clock() is called before tsc_init(), so
    we must start with the TSC soft disabled to prevent
    erroneous rdtsc usage on !cpu_has_tsc processors */
-static int tsc_disabled = -1;
+static int __read_mostly tsc_disabled = -1;
 
 static int tsc_clocksource_reliable;
 /*
@@ -273,30 +274,43 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
  * use the TSC value at the transitions to calculate a pretty
  * good value for the TSC frequencty.
  */
-static inline int pit_expect_msb(unsigned char val)
+static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
 {
-	int count = 0;
+	int count;
+	u64 tsc = 0;
 
 	for (count = 0; count < 50000; count++) {
 		/* Ignore LSB */
 		inb(0x42);
 		if (inb(0x42) != val)
 			break;
+		tsc = get_cycles();
 	}
-	return count > 50;
+	*deltap = get_cycles() - tsc;
+	*tscp = tsc;
+
+	/*
+	 * We require _some_ success, but the quality control
+	 * will be based on the error terms on the TSC values.
+	 */
+	return count > 5;
 }
 
 /*
- * How many MSB values do we want to see? We aim for a
- * 15ms calibration, which assuming a 2us counter read
- * error should give us roughly 150 ppm precision for
- * the calibration.
+ * How many MSB values do we want to see? We aim for
+ * a maximum error rate of 500ppm (in practice the
+ * real error is much smaller), but refuse to spend
+ * more than 25ms on it.
  */
-#define QUICK_PIT_MS 15
-#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+#define MAX_QUICK_PIT_MS 25
+#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
 
 static unsigned long quick_pit_calibrate(void)
 {
+	int i;
+	u64 tsc, delta;
+	unsigned long d1, d2;
+
 	/* Set the Gate high, disable speaker */
 	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
 
@@ -315,45 +329,52 @@ static unsigned long quick_pit_calibrate(void)
 	outb(0xff, 0x42);
 	outb(0xff, 0x42);
 
-	if (pit_expect_msb(0xff)) {
-		int i;
-		u64 t1, t2, delta;
-		unsigned char expect = 0xfe;
-
-		t1 = get_cycles();
-		for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
-			if (!pit_expect_msb(expect))
-				goto failed;
+	/*
+	 * The PIT starts counting at the next edge, so we
+	 * need to delay for a microsecond. The easiest way
+	 * to do that is to just read back the 16-bit counter
+	 * once from the PIT.
+	 */
+	inb(0x42);
+	inb(0x42);
+
+	if (pit_expect_msb(0xff, &tsc, &d1)) {
+		for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
+			if (!pit_expect_msb(0xff-i, &delta, &d2))
+				break;
+
+			/*
+			 * Iterate until the error is less than 500 ppm
+			 */
+			delta -= tsc;
+			if (d1+d2 < delta >> 11)
+				goto success;
 		}
-		t2 = get_cycles();
-
-		/*
-		 * Make sure we can rely on the second TSC timestamp:
-		 */
-		if (!pit_expect_msb(expect))
-			goto failed;
-
-		/*
-		 * Ok, if we get here, then we've seen the
-		 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
-		 * times, and each MSB had many hits, so we never
-		 * had any sudden jumps.
-		 *
-		 * As a result, we can depend on there not being
-		 * any odd delays anywhere, and the TSC reads are
-		 * reliable.
-		 *
-		 * kHz = ticks / time-in-seconds / 1000;
-		 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
-		 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
-		 */
-		delta = (t2 - t1)*PIT_TICK_RATE;
-		do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
-		printk("Fast TSC calibration using PIT\n");
-		return delta;
 	}
-failed:
+	printk("Fast TSC calibration failed\n");
 	return 0;
+
+success:
+	/*
+	 * Ok, if we get here, then we've seen the
+	 * MSB of the PIT decrement 'i' times, and the
+	 * error has shrunk to less than 500 ppm.
+	 *
+	 * As a result, we can depend on there not being
+	 * any odd delays anywhere, and the TSC reads are
+	 * reliable (within the error). We also adjust the
+	 * delta to the middle of the error bars, just
+	 * because it looks nicer.
+	 *
+	 * kHz = ticks / time-in-seconds / 1000;
+	 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
+	 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
+	 */
+	delta += (long)(d2 - d1)/2;
+	delta *= PIT_TICK_RATE;
+	do_div(delta, i*256*1000);
+	printk("Fast TSC calibration using PIT\n");
+	return delta;
 }
 
 /**
@@ -523,8 +544,6 @@ unsigned long native_calibrate_tsc(void)
 	return tsc_pit_min;
 }
 
-#ifdef CONFIG_X86_32
-/* Only called from the Powernow K7 cpu freq driver */
 int recalibrate_cpu_khz(void)
 {
 #ifndef CONFIG_SMP
@@ -546,7 +565,6 @@ int recalibrate_cpu_khz(void)
 
 EXPORT_SYMBOL(recalibrate_cpu_khz);
 
-#endif /* CONFIG_X86_32 */
 
 /* Accelerators for sched_clock()
  * convert from cycles(64bits) => nanoseconds (64bits)
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 00000000000..2ffb6c53326
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,393 @@
+/*
+ * SGI RTC clock/timer routines.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Copyright (c) 2009 Silicon Graphics, Inc.  All Rights Reserved.
+ *  Copyright (c) Dimitri Sivanich
+ */
+#include <linux/clockchips.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+
+#define RTC_NAME		"sgi_rtc"
+
+static cycle_t uv_read_rtc(void);
+static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
+static void uv_rtc_timer_setup(enum clock_event_mode,
+				struct clock_event_device *);
+
+static struct clocksource clocksource_uv = {
+	.name		= RTC_NAME,
+	.rating		= 400,
+	.read		= uv_read_rtc,
+	.mask		= (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
+	.shift		= 10,
+	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static struct clock_event_device clock_event_device_uv = {
+	.name		= RTC_NAME,
+	.features	= CLOCK_EVT_FEAT_ONESHOT,
+	.shift		= 20,
+	.rating		= 400,
+	.irq		= -1,
+	.set_next_event	= uv_rtc_next_event,
+	.set_mode	= uv_rtc_timer_setup,
+	.event_handler	= NULL,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
+
+/* There is one of these allocated per node */
+struct uv_rtc_timer_head {
+	spinlock_t	lock;
+	/* next cpu waiting for timer, local node relative: */
+	int		next_cpu;
+	/* number of cpus on this node: */
+	int		ncpus;
+	struct {
+		int	lcpu;		/* systemwide logical cpu number */
+		u64	expires;	/* next timer expiration for this cpu */
+	} cpu[1];
+};
+
+/*
+ * Access to uv_rtc_timer_head via blade id.
+ */
+static struct uv_rtc_timer_head		**blade_info __read_mostly;
+
+static int				uv_rtc_enable;
+
+/*
+ * Hardware interface routines
+ */
+
+/* Send IPIs to another node */
+static void uv_rtc_send_IPI(int cpu)
+{
+	unsigned long apicid, val;
+	int pnode;
+
+	apicid = cpu_physical_id(cpu);
+	pnode = uv_apicid_to_pnode(apicid);
+	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+	      (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+/* Check for an RTC interrupt pending */
+static int uv_intr_pending(int pnode)
+{
+	return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+		UVH_EVENT_OCCURRED0_RTC1_MASK;
+}
+
+/* Setup interrupt and return non-zero if early expiration occurred. */
+static int uv_setup_intr(int cpu, u64 expires)
+{
+	u64 val;
+	int pnode = uv_cpu_to_pnode(cpu);
+
+	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+		UVH_RTC1_INT_CONFIG_M_MASK);
+	uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
+
+	uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+		UVH_EVENT_OCCURRED0_RTC1_MASK);
+
+	val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+		((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+
+	/* Set configuration */
+	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
+	/* Initialize comparator value */
+	uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
+
+	return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+}
+
+/*
+ * Per-cpu timer tracking routines
+ */
+
+static __init void uv_rtc_deallocate_timers(void)
+{
+	int bid;
+
+	for_each_possible_blade(bid) {
+		kfree(blade_info[bid]);
+	}
+	kfree(blade_info);
+}
+
+/* Allocate per-node list of cpu timer expiration times. */
+static __init int uv_rtc_allocate_timers(void)
+{
+	int cpu;
+
+	blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
+	if (!blade_info)
+		return -ENOMEM;
+	memset(blade_info, 0, uv_possible_blades * sizeof(void *));
+
+	for_each_present_cpu(cpu) {
+		int nid = cpu_to_node(cpu);
+		int bid = uv_cpu_to_blade_id(cpu);
+		int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+		struct uv_rtc_timer_head *head = blade_info[bid];
+
+		if (!head) {
+			head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
+				(uv_blade_nr_possible_cpus(bid) *
+					2 * sizeof(u64)),
+				GFP_KERNEL, nid);
+			if (!head) {
+				uv_rtc_deallocate_timers();
+				return -ENOMEM;
+			}
+			spin_lock_init(&head->lock);
+			head->ncpus = uv_blade_nr_possible_cpus(bid);
+			head->next_cpu = -1;
+			blade_info[bid] = head;
+		}
+
+		head->cpu[bcpu].lcpu = cpu;
+		head->cpu[bcpu].expires = ULLONG_MAX;
+	}
+
+	return 0;
+}
+
+/* Find and set the next expiring timer.  */
+static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
+{
+	u64 lowest = ULLONG_MAX;
+	int c, bcpu = -1;
+
+	head->next_cpu = -1;
+	for (c = 0; c < head->ncpus; c++) {
+		u64 exp = head->cpu[c].expires;
+		if (exp < lowest) {
+			bcpu = c;
+			lowest = exp;
+		}
+	}
+	if (bcpu >= 0) {
+		head->next_cpu = bcpu;
+		c = head->cpu[bcpu].lcpu;
+		if (uv_setup_intr(c, lowest))
+			/* If we didn't set it up in time, trigger */
+			uv_rtc_send_IPI(c);
+	} else {
+		uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+			UVH_RTC1_INT_CONFIG_M_MASK);
+	}
+}
+
+/*
+ * Set expiration time for current cpu.
+ *
+ * Returns 1 if we missed the expiration time.
+ */
+static int uv_rtc_set_timer(int cpu, u64 expires)
+{
+	int pnode = uv_cpu_to_pnode(cpu);
+	int bid = uv_cpu_to_blade_id(cpu);
+	struct uv_rtc_timer_head *head = blade_info[bid];
+	int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+	u64 *t = &head->cpu[bcpu].expires;
+	unsigned long flags;
+	int next_cpu;
+
+	spin_lock_irqsave(&head->lock, flags);
+
+	next_cpu = head->next_cpu;
+	*t = expires;
+	/* Will this one be next to go off? */
+	if (next_cpu < 0 || bcpu == next_cpu ||
+			expires < head->cpu[next_cpu].expires) {
+		head->next_cpu = bcpu;
+		if (uv_setup_intr(cpu, expires)) {
+			*t = ULLONG_MAX;
+			uv_rtc_find_next_timer(head, pnode);
+			spin_unlock_irqrestore(&head->lock, flags);
+			return 1;
+		}
+	}
+
+	spin_unlock_irqrestore(&head->lock, flags);
+	return 0;
+}
+
+/*
+ * Unset expiration time for current cpu.
+ *
+ * Returns 1 if this timer was pending.
+ */
+static int uv_rtc_unset_timer(int cpu)
+{
+	int pnode = uv_cpu_to_pnode(cpu);
+	int bid = uv_cpu_to_blade_id(cpu);
+	struct uv_rtc_timer_head *head = blade_info[bid];
+	int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+	u64 *t = &head->cpu[bcpu].expires;
+	unsigned long flags;
+	int rc = 0;
+
+	spin_lock_irqsave(&head->lock, flags);
+
+	if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+		rc = 1;
+
+	*t = ULLONG_MAX;
+
+	/* Was the hardware setup for this timer? */
+	if (head->next_cpu == bcpu)
+		uv_rtc_find_next_timer(head, pnode);
+
+	spin_unlock_irqrestore(&head->lock, flags);
+
+	return rc;
+}
+
+
+/*
+ * Kernel interface routines.
+ */
+
+/*
+ * Read the RTC.
+ */
+static cycle_t uv_read_rtc(void)
+{
+	return (cycle_t)uv_read_local_mmr(UVH_RTC);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int uv_rtc_next_event(unsigned long delta,
+			     struct clock_event_device *ced)
+{
+	int ced_cpu = cpumask_first(ced->cpumask);
+
+	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+}
+
+/*
+ * Setup the RTC timer in oneshot mode
+ */
+static void uv_rtc_timer_setup(enum clock_event_mode mode,
+			       struct clock_event_device *evt)
+{
+	int ced_cpu = cpumask_first(evt->cpumask);
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+	case CLOCK_EVT_MODE_ONESHOT:
+	case CLOCK_EVT_MODE_RESUME:
+		/* Nothing to do here yet */
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		uv_rtc_unset_timer(ced_cpu);
+		break;
+	}
+}
+
+static void uv_rtc_interrupt(void)
+{
+	struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+	int cpu = smp_processor_id();
+
+	if (!ced || !ced->event_handler)
+		return;
+
+	if (uv_rtc_unset_timer(cpu) != 1)
+		return;
+
+	ced->event_handler(ced);
+}
+
+static int __init uv_enable_rtc(char *str)
+{
+	uv_rtc_enable = 1;
+
+	return 1;
+}
+__setup("uvrtc", uv_enable_rtc);
+
+static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
+{
+	struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+
+	*ced = clock_event_device_uv;
+	ced->cpumask = cpumask_of(smp_processor_id());
+	clockevents_register_device(ced);
+}
+
+static __init int uv_rtc_setup_clock(void)
+{
+	int rc;
+
+	if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
+		return -ENODEV;
+
+	generic_interrupt_extension = uv_rtc_interrupt;
+
+	clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
+				clocksource_uv.shift);
+
+	rc = clocksource_register(&clocksource_uv);
+	if (rc) {
+		generic_interrupt_extension = NULL;
+		return rc;
+	}
+
+	/* Setup and register clockevents */
+	rc = uv_rtc_allocate_timers();
+	if (rc) {
+		clocksource_unregister(&clocksource_uv);
+		generic_interrupt_extension = NULL;
+		return rc;
+	}
+
+	clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
+				NSEC_PER_SEC, clock_event_device_uv.shift);
+
+	clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
+						sn_rtc_cycles_per_second;
+
+	clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
+				(NSEC_PER_SEC / sn_rtc_cycles_per_second);
+
+	rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
+	if (rc) {
+		clocksource_unregister(&clocksource_uv);
+		generic_interrupt_extension = NULL;
+		uv_rtc_deallocate_timers();
+	}
+
+	return rc;
+}
+arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 191a876e9e8..31ffc24eec4 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -578,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = {
 static irqreturn_t piix4_master_intr(int irq, void *dev_id)
 {
 	int realirq;
-	irq_desc_t *desc;
+	struct irq_desc *desc;
 	unsigned long flags;
 
 	spin_lock_irqsave(&i8259A_lock, flags);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 2cc4a90e2cb..95deb9f2211 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
 	vmi_ops.update_pte(ptep, VMI_PAGE_PT);
 }
 
-static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
-	vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
-}
-
 static void vmi_set_pud(pud_t *pudp, pud_t pudval)
 {
 	/* Um, eww */
@@ -750,7 +745,6 @@ static inline int __init activate_vmi(void)
 		pv_mmu_ops.set_pmd = vmi_set_pmd;
 #ifdef CONFIG_X86_PAE
 		pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
-		pv_mmu_ops.set_pte_present = vmi_set_pte_present;
 		pv_mmu_ops.set_pud = vmi_set_pud;
 		pv_mmu_ops.pte_clear = vmi_pte_clear;
 		pv_mmu_ops.pmd_clear = vmi_pmd_clear;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 33a788d5879..d303369a7ba 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -202,7 +202,6 @@ static struct irqaction vmi_clock_action  = {
 	.name 		= "vmi-timer",
 	.handler 	= vmi_timer_interrupt,
 	.flags 		= IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
-	.mask 		= CPU_MASK_ALL,
 };
 
 static void __devinit vmi_time_init_clockevent(void)
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 0d860963f26..62ad500d55f 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,15 +189,24 @@ SECTIONS
 	*(.bss)
 	. = ALIGN(4);
 	__bss_stop = .;
-  	_end = . ;
-	/* This is where the kernel creates the early boot page tables */
+  }
+
+  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
 	. = ALIGN(PAGE_SIZE);
-	pg0 = . ;
+	__brk_base = . ;
+ 	. += 64 * 1024 ;	/* 64k alignment slop space */
+	*(.brk_reservation)	/* areas brk users have reserved */
+	__brk_limit = . ;
+  }
+
+  .end : AT(ADDR(.end) - LOAD_OFFSET) {
+	_end = . ;
   }
 
   /* Sections to be discarded */
   /DISCARD/ : {
 	*(.exitcall.exit)
+	*(.discard)
 	}
 
   STABS_DEBUG
@@ -205,6 +214,12 @@ SECTIONS
   DWARF_DEBUG
 }
 
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+	"kernel image bigger than KERNEL_IMAGE_SIZE")
+
 #ifdef CONFIG_KEXEC
 /* Link time checks */
 #include <asm/kexec.h>
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fbfced6f680..c8742507b03 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -29,8 +29,8 @@ SECTIONS
 {
   . = __START_KERNEL;
   phys_startup_64 = startup_64 - LOAD_OFFSET;
-  _text = .;			/* Text and read-only data */
   .text :  AT(ADDR(.text) - LOAD_OFFSET) {
+	_text = .;			/* Text and read-only data */
 	/* First the code that has to be first for bootstrapping */
 	*(.text.head)
 	_stext = .;
@@ -61,13 +61,13 @@ SECTIONS
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
 	DATA_DATA
 	CONSTRUCTORS
+	_edata = .;			/* End of data section */
 	} :data
 
-  _edata = .;			/* End of data section */
 
-  . = ALIGN(PAGE_SIZE);
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
   .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+	. = ALIGN(PAGE_SIZE);
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
 	*(.data.cacheline_aligned)
   }
   . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
@@ -125,29 +125,29 @@ SECTIONS
 #undef VVIRT_OFFSET
 #undef VVIRT
 
-  . = ALIGN(THREAD_SIZE);	/* init_task */
   .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+	. = ALIGN(THREAD_SIZE);	/* init_task */
 	*(.data.init_task)
   }:data.init
 
-  . = ALIGN(PAGE_SIZE);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+	. = ALIGN(PAGE_SIZE);
 	*(.data.page_aligned)
   }
 
-  /* might get freed after init */
-  . = ALIGN(PAGE_SIZE);
-  __smp_alt_begin = .;
-  __smp_locks = .;
   .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+	/* might get freed after init */
+	. = ALIGN(PAGE_SIZE);
+	__smp_alt_begin = .;
+	__smp_locks = .;
 	*(.smp_locks)
+	__smp_locks_end = .;
+	. = ALIGN(PAGE_SIZE);
+	__smp_alt_end = .;
   }
-  __smp_locks_end = .;
-  . = ALIGN(PAGE_SIZE);
-  __smp_alt_end = .;
 
   . = ALIGN(PAGE_SIZE);		/* Init code and data */
-  __init_begin = .;
+  __init_begin = .;	/* paired with __init_end */
   .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
 	_sinittext = .;
 	INIT_TEXT
@@ -159,40 +159,42 @@ SECTIONS
 	__initdata_end = .;
    }
 
-  . = ALIGN(16);
-  __setup_start = .;
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
-  __setup_end = .;
-  __initcall_start = .;
+  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+	. = ALIGN(16);
+	__setup_start = .;
+	*(.init.setup)
+	__setup_end = .;
+  }
   .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+	__initcall_start = .;
 	INITCALLS
+	__initcall_end = .;
   }
-  __initcall_end = .;
-  __con_initcall_start = .;
   .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+	__con_initcall_start = .;
 	*(.con_initcall.init)
+	__con_initcall_end = .;
   }
-  __con_initcall_end = .;
-  __x86_cpu_dev_start = .;
   .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+	__x86_cpu_dev_start = .;
 	*(.x86_cpu_dev.init)
+	__x86_cpu_dev_end = .;
   }
-  __x86_cpu_dev_end = .;
   SECURITY_INIT
 
   . = ALIGN(8);
   .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-  __parainstructions = .;
+	__parainstructions = .;
        *(.parainstructions)
-  __parainstructions_end = .;
+	__parainstructions_end = .;
   }
 
-  . = ALIGN(8);
-  __alt_instructions = .;
   .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+	. = ALIGN(8);
+	__alt_instructions = .;
 	*(.altinstructions)
+	__alt_instructions_end = .;
   }
-  __alt_instructions_end = .;
   .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
 	*(.altinstr_replacement)
   }
@@ -207,9 +209,11 @@ SECTIONS
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(PAGE_SIZE);
-  __initramfs_start = .;
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
-  __initramfs_end = .;
+  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+	__initramfs_start = .;
+	*(.init.ramfs)
+	__initramfs_end = .;
+  }
 #endif
 
 #ifdef CONFIG_SMP
@@ -229,20 +233,29 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   __init_end = .;
 
-  . = ALIGN(PAGE_SIZE);
-  __nosave_begin = .;
   .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-      *(.data.nosave)
+	. = ALIGN(PAGE_SIZE);
+	__nosave_begin = .;
+	*(.data.nosave)
+	. = ALIGN(PAGE_SIZE);
+	__nosave_end = .;
   } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
-  . = ALIGN(PAGE_SIZE);
-  __nosave_end = .;
 
-  __bss_start = .;		/* BSS */
   .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+	. = ALIGN(PAGE_SIZE);
+	__bss_start = .;		/* BSS */
 	*(.bss.page_aligned)
 	*(.bss)
-	}
-  __bss_stop = .;
+	__bss_stop = .;
+  }
+
+  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+	. = ALIGN(PAGE_SIZE);
+	__brk_base = . ;
+ 	. += 64 * 1024 ;	/* 64k alignment slop space */
+	*(.brk_reservation)	/* areas brk users have reserved */
+	__brk_limit = . ;
+  }
 
   _end = . ;
 
@@ -250,6 +263,7 @@ SECTIONS
   /DISCARD/ : {
 	*(.exitcall.exit)
 	*(.eh_frame)
+	*(.discard)
 	}
 
   STABS_DEBUG
@@ -275,3 +289,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
 ASSERT((per_cpu__irq_stack_union == 0),
         "irq_stack_union is not at start of per-cpu area");
 #endif
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "kexec control code size is too big")
+#endif
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index c609205df59..a1d804bcd48 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -139,6 +139,7 @@ int is_vsmp_box(void)
 		return 0;
 	}
 }
+
 #else
 static void __init detect_vsmp_box(void)
 {
@@ -148,7 +149,6 @@ int is_vsmp_box(void)
 	return 0;
 }
 #endif
-
 void __init vsmp_init(void)
 {
 	detect_vsmp_box();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b81125f0bde..a58504ea78c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -4,6 +4,10 @@
 config HAVE_KVM
        bool
 
+config HAVE_KVM_IRQCHIP
+       bool
+       default y
+
 menuconfig VIRTUALIZATION
 	bool "Virtualization"
 	depends on HAVE_KVM || X86
@@ -55,7 +59,8 @@ config KVM_AMD
 
 config KVM_TRACE
 	bool "KVM trace support"
-	depends on KVM && MARKERS && SYSFS
+	depends on KVM && SYSFS
+	select MARKERS
 	select RELAY
 	select DEBUG_FS
 	default n
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 72bd275a9b5..c13bb92d315 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -201,6 +201,9 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 	if (!atomic_inc_and_test(&pt->pending))
 		set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
 
+	if (!pt->reinject)
+		atomic_set(&pt->pending, 1);
+
 	if (vcpu0 && waitqueue_active(&vcpu0->wq))
 		wake_up_interruptible(&vcpu0->wq);
 
@@ -536,6 +539,16 @@ void kvm_pit_reset(struct kvm_pit *pit)
 	pit->pit_state.irq_ack = 1;
 }
 
+static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
+{
+	struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
+
+	if (!mask) {
+		atomic_set(&pit->pit_state.pit_timer.pending, 0);
+		pit->pit_state.irq_ack = 1;
+	}
+}
+
 struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 {
 	struct kvm_pit *pit;
@@ -545,9 +558,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 	if (!pit)
 		return NULL;
 
-	mutex_lock(&kvm->lock);
 	pit->irq_source_id = kvm_request_irq_source_id(kvm);
-	mutex_unlock(&kvm->lock);
 	if (pit->irq_source_id < 0) {
 		kfree(pit);
 		return NULL;
@@ -580,10 +591,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 	pit_state->irq_ack_notifier.gsi = 0;
 	pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
 	kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
+	pit_state->pit_timer.reinject = true;
 	mutex_unlock(&pit->pit_state.lock);
 
 	kvm_pit_reset(pit);
 
+	pit->mask_notifier.func = pit_mask_notifer;
+	kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+
 	return pit;
 }
 
@@ -592,6 +607,8 @@ void kvm_free_pit(struct kvm *kvm)
 	struct hrtimer *timer;
 
 	if (kvm->arch.vpit) {
+		kvm_unregister_irq_mask_notifier(kvm, 0,
+					       &kvm->arch.vpit->mask_notifier);
 		mutex_lock(&kvm->arch.vpit->pit_state.lock);
 		timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
 		hrtimer_cancel(timer);
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 4178022b97a..6acbe4b505d 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -9,6 +9,7 @@ struct kvm_kpit_timer {
 	s64 period; /* unit: ns */
 	s64 scheduled;
 	atomic_t pending;
+	bool reinject;
 };
 
 struct kvm_kpit_channel_state {
@@ -45,6 +46,7 @@ struct kvm_pit {
 	struct kvm *kvm;
 	struct kvm_kpit_state pit_state;
 	int irq_source_id;
+	struct kvm_irq_mask_notifier mask_notifier;
 };
 
 #define KVM_PIT_BASE_ADDRESS	    0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 179dcb0103f..1ccb50c74f1 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -32,11 +32,13 @@
 #include <linux/kvm_host.h>
 
 static void pic_lock(struct kvm_pic *s)
+	__acquires(&s->lock)
 {
 	spin_lock(&s->lock);
 }
 
 static void pic_unlock(struct kvm_pic *s)
+	__releases(&s->lock)
 {
 	struct kvm *kvm = s->kvm;
 	unsigned acks = s->pending_acks;
@@ -49,7 +51,8 @@ static void pic_unlock(struct kvm_pic *s)
 	spin_unlock(&s->lock);
 
 	while (acks) {
-		kvm_notify_acked_irq(kvm, __ffs(acks));
+		kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)),
+				     __ffs(acks));
 		acks &= acks - 1;
 	}
 
@@ -76,12 +79,13 @@ void kvm_pic_clear_isr_ack(struct kvm *kvm)
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
-static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
+static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
 {
-	int mask;
+	int mask, ret = 1;
 	mask = 1 << irq;
 	if (s->elcr & mask)	/* level triggered */
 		if (level) {
+			ret = !(s->irr & mask);
 			s->irr |= mask;
 			s->last_irr |= mask;
 		} else {
@@ -90,11 +94,15 @@ static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
 		}
 	else	/* edge triggered */
 		if (level) {
-			if ((s->last_irr & mask) == 0)
+			if ((s->last_irr & mask) == 0) {
+				ret = !(s->irr & mask);
 				s->irr |= mask;
+			}
 			s->last_irr |= mask;
 		} else
 			s->last_irr &= ~mask;
+
+	return (s->imr & mask) ? -1 : ret;
 }
 
 /*
@@ -171,16 +179,19 @@ void kvm_pic_update_irq(struct kvm_pic *s)
 	pic_unlock(s);
 }
 
-void kvm_pic_set_irq(void *opaque, int irq, int level)
+int kvm_pic_set_irq(void *opaque, int irq, int level)
 {
 	struct kvm_pic *s = opaque;
+	int ret = -1;
 
 	pic_lock(s);
 	if (irq >= 0 && irq < PIC_NUM_PINS) {
-		pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
+		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
 		pic_update_irq(s);
 	}
 	pic_unlock(s);
+
+	return ret;
 }
 
 /*
@@ -232,7 +243,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 	}
 	pic_update_irq(s);
 	pic_unlock(s);
-	kvm_notify_acked_irq(kvm, irq);
+	kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq);
 
 	return intno;
 }
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 82579ee538d..9f593188129 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -32,6 +32,8 @@
 #include "lapic.h"
 
 #define PIC_NUM_PINS 16
+#define SELECT_PIC(irq) \
+	((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
 
 struct kvm;
 struct kvm_vcpu;
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 8e5ee99551f..ed66e4c078d 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -18,7 +18,6 @@ static const u32 host_save_user_msrs[] = {
 };
 
 #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-#define NUM_DB_REGS 4
 
 struct kvm_vcpu;
 
@@ -29,18 +28,23 @@ struct vcpu_svm {
 	struct svm_cpu_data *svm_data;
 	uint64_t asid_generation;
 
-	unsigned long db_regs[NUM_DB_REGS];
-
 	u64 next_rip;
 
 	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
 	u64 host_gs_base;
 	unsigned long host_cr2;
-	unsigned long host_db_regs[NUM_DB_REGS];
-	unsigned long host_dr6;
-	unsigned long host_dr7;
 
 	u32 *msrpm;
+	struct vmcb *hsave;
+	u64 hsave_msr;
+
+	u64 nested_vmcb;
+
+	/* These are the merged vectors */
+	u32 *nested_msrpm;
+
+	/* gpa pointers to the real vectors */
+	u64 nested_vmcb_msrpm;
 };
 
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2d4477c7147..2a36f7f7c4c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -145,11 +145,20 @@ struct kvm_rmap_desc {
 	struct kvm_rmap_desc *more;
 };
 
-struct kvm_shadow_walk {
-	int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
-		     u64 addr, u64 *spte, int level);
+struct kvm_shadow_walk_iterator {
+	u64 addr;
+	hpa_t shadow_addr;
+	int level;
+	u64 *sptep;
+	unsigned index;
 };
 
+#define for_each_shadow_entry(_vcpu, _addr, _walker)    \
+	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
+	     shadow_walk_okay(&(_walker));			\
+	     shadow_walk_next(&(_walker)))
+
+
 struct kvm_unsync_walk {
 	int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
 };
@@ -343,7 +352,6 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 
 	BUG_ON(!mc->nobjs);
 	p = mc->objects[--mc->nobjs];
-	memset(p, 0, size);
 	return p;
 }
 
@@ -794,10 +802,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&sp->oos_link);
-	ASSERT(is_empty_shadow_page(sp->spt));
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
-	sp->global = 1;
 	sp->parent_pte = parent_pte;
 	--vcpu->kvm->arch.n_free_mmu_pages;
 	return sp;
@@ -983,8 +989,8 @@ struct kvm_mmu_pages {
 	     idx < 512;					\
 	     idx = find_next_bit(bitmap, 512, idx+1))
 
-int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
-		   int idx)
+static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+			 int idx)
 {
 	int i;
 
@@ -1059,7 +1065,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry(sp, node, bucket, hash_link)
-		if (sp->gfn == gfn && !sp->role.metaphysical
+		if (sp->gfn == gfn && !sp->role.direct
 		    && !sp->role.invalid) {
 			pgprintk("%s: found role %x\n",
 				 __func__, sp->role.word);
@@ -1115,8 +1121,9 @@ struct mmu_page_path {
 			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
 			i = mmu_pages_next(&pvec, &parents, i))
 
-int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
-		   int i)
+static int mmu_pages_next(struct kvm_mmu_pages *pvec,
+			  struct mmu_page_path *parents,
+			  int i)
 {
 	int n;
 
@@ -1135,7 +1142,7 @@ int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
 	return n;
 }
 
-void mmu_pages_clear_parents(struct mmu_page_path *parents)
+static void mmu_pages_clear_parents(struct mmu_page_path *parents)
 {
 	struct kvm_mmu_page *sp;
 	unsigned int level = 0;
@@ -1193,7 +1200,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
 					     unsigned level,
-					     int metaphysical,
+					     int direct,
 					     unsigned access,
 					     u64 *parent_pte)
 {
@@ -1204,10 +1211,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *tmp;
 
-	role.word = 0;
-	role.glevels = vcpu->arch.mmu.root_level;
+	role = vcpu->arch.mmu.base_role;
 	role.level = level;
-	role.metaphysical = metaphysical;
+	role.direct = direct;
 	role.access = access;
 	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1242,8 +1248,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
 	sp->gfn = gfn;
 	sp->role = role;
+	sp->global = role.cr4_pge;
 	hlist_add_head(&sp->hash_link, bucket);
-	if (!metaphysical) {
+	if (!direct) {
 		if (rmap_write_protect(vcpu->kvm, gfn))
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		account_shadowed(vcpu->kvm, gfn);
@@ -1255,35 +1262,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	return sp;
 }
 
-static int walk_shadow(struct kvm_shadow_walk *walker,
-		       struct kvm_vcpu *vcpu, u64 addr)
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+			     struct kvm_vcpu *vcpu, u64 addr)
 {
-	hpa_t shadow_addr;
-	int level;
-	int r;
-	u64 *sptep;
-	unsigned index;
-
-	shadow_addr = vcpu->arch.mmu.root_hpa;
-	level = vcpu->arch.mmu.shadow_root_level;
-	if (level == PT32E_ROOT_LEVEL) {
-		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
-		shadow_addr &= PT64_BASE_ADDR_MASK;
-		if (!shadow_addr)
-			return 1;
-		--level;
+	iterator->addr = addr;
+	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
+	iterator->level = vcpu->arch.mmu.shadow_root_level;
+	if (iterator->level == PT32E_ROOT_LEVEL) {
+		iterator->shadow_addr
+			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
+		--iterator->level;
+		if (!iterator->shadow_addr)
+			iterator->level = 0;
 	}
+}
 
-	while (level >= PT_PAGE_TABLE_LEVEL) {
-		index = SHADOW_PT_INDEX(addr, level);
-		sptep = ((u64 *)__va(shadow_addr)) + index;
-		r = walker->entry(walker, vcpu, addr, sptep, level);
-		if (r)
-			return r;
-		shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
-		--level;
-	}
-	return 0;
+static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
+{
+	if (iterator->level < PT_PAGE_TABLE_LEVEL)
+		return false;
+	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
+	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
+	return true;
+}
+
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+	--iterator->level;
 }
 
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
@@ -1388,7 +1395,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
 	kvm_flush_remote_tlbs(kvm);
-	if (!sp->role.invalid && !sp->role.metaphysical)
+	if (!sp->role.invalid && !sp->role.direct)
 		unaccount_shadowed(kvm, sp->gfn);
 	if (sp->unsync)
 		kvm_unlink_unsync_page(kvm, sp);
@@ -1451,7 +1458,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
-		if (sp->gfn == gfn && !sp->role.metaphysical) {
+		if (sp->gfn == gfn && !sp->role.direct) {
 			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
 			r = 1;
@@ -1463,11 +1470,20 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 
 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 {
+	unsigned index;
+	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
+	struct hlist_node *node, *nn;
 
-	while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
-		pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
-		kvm_mmu_zap_page(kvm, sp);
+	index = kvm_page_table_hashfn(gfn);
+	bucket = &kvm->arch.mmu_page_hash[index];
+	hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
+		if (sp->gfn == gfn && !sp->role.direct
+		    && !sp->role.invalid) {
+			pgprintk("%s: zap %lx %x\n",
+				 __func__, gfn, sp->role.word);
+			kvm_mmu_zap_page(kvm, sp);
+		}
 	}
 }
 
@@ -1622,7 +1638,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	/* don't unsync if pagetable is shadowed with multiple roles */
 	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
-		if (s->gfn != sp->gfn || s->role.metaphysical)
+		if (s->gfn != sp->gfn || s->role.direct)
 			continue;
 		if (s->role.word != sp->role.word)
 			return 1;
@@ -1669,8 +1685,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	u64 mt_mask = shadow_mt_mask;
 	struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
 
-	if (!(vcpu->arch.cr4 & X86_CR4_PGE))
-		global = 0;
 	if (!global && sp->global) {
 		sp->global = 0;
 		if (sp->unsync) {
@@ -1777,12 +1791,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			pgprintk("hfn old %lx new %lx\n",
 				 spte_to_pfn(*shadow_pte), pfn);
 			rmap_remove(vcpu->kvm, shadow_pte);
-		} else {
-			if (largepage)
-				was_rmapped = is_large_pte(*shadow_pte);
-			else
-				was_rmapped = 1;
-		}
+		} else
+			was_rmapped = 1;
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
 		      dirty, largepage, global, gfn, pfn, speculative, true)) {
@@ -1820,67 +1830,42 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
-struct direct_shadow_walk {
-	struct kvm_shadow_walk walker;
-	pfn_t pfn;
-	int write;
-	int largepage;
-	int pt_write;
-};
-
-static int direct_map_entry(struct kvm_shadow_walk *_walk,
-			    struct kvm_vcpu *vcpu,
-			    u64 addr, u64 *sptep, int level)
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+			int largepage, gfn_t gfn, pfn_t pfn)
 {
-	struct direct_shadow_walk *walk =
-		container_of(_walk, struct direct_shadow_walk, walker);
+	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
+	int pt_write = 0;
 	gfn_t pseudo_gfn;
-	gfn_t gfn = addr >> PAGE_SHIFT;
-
-	if (level == PT_PAGE_TABLE_LEVEL
-	    || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
-		mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
-			     0, walk->write, 1, &walk->pt_write,
-			     walk->largepage, 0, gfn, walk->pfn, false);
-		++vcpu->stat.pf_fixed;
-		return 1;
-	}
 
-	if (*sptep == shadow_trap_nonpresent_pte) {
-		pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
-		sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
-				      1, ACC_ALL, sptep);
-		if (!sp) {
-			pgprintk("nonpaging_map: ENOMEM\n");
-			kvm_release_pfn_clean(walk->pfn);
-			return -ENOMEM;
+	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
+		if (iterator.level == PT_PAGE_TABLE_LEVEL
+		    || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
+			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
+				     0, write, 1, &pt_write,
+				     largepage, 0, gfn, pfn, false);
+			++vcpu->stat.pf_fixed;
+			break;
 		}
 
-		set_shadow_pte(sptep,
-			       __pa(sp->spt)
-			       | PT_PRESENT_MASK | PT_WRITABLE_MASK
-			       | shadow_user_mask | shadow_x_mask);
-	}
-	return 0;
-}
+		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+			pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
+					      iterator.level - 1,
+					      1, ACC_ALL, iterator.sptep);
+			if (!sp) {
+				pgprintk("nonpaging_map: ENOMEM\n");
+				kvm_release_pfn_clean(pfn);
+				return -ENOMEM;
+			}
 
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			int largepage, gfn_t gfn, pfn_t pfn)
-{
-	int r;
-	struct direct_shadow_walk walker = {
-		.walker = { .entry = direct_map_entry, },
-		.pfn = pfn,
-		.largepage = largepage,
-		.write = write,
-		.pt_write = 0,
-	};
-
-	r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
-	if (r < 0)
-		return r;
-	return walker.pt_write;
+			set_shadow_pte(iterator.sptep,
+				       __pa(sp->spt)
+				       | PT_PRESENT_MASK | PT_WRITABLE_MASK
+				       | shadow_user_mask | shadow_x_mask);
+		}
+	}
+	return pt_write;
 }
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1962,7 +1947,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	int i;
 	gfn_t root_gfn;
 	struct kvm_mmu_page *sp;
-	int metaphysical = 0;
+	int direct = 0;
 
 	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
 
@@ -1971,18 +1956,18 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 
 		ASSERT(!VALID_PAGE(root));
 		if (tdp_enabled)
-			metaphysical = 1;
+			direct = 1;
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-				      PT64_ROOT_LEVEL, metaphysical,
+				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		vcpu->arch.mmu.root_hpa = root;
 		return;
 	}
-	metaphysical = !is_paging(vcpu);
+	direct = !is_paging(vcpu);
 	if (tdp_enabled)
-		metaphysical = 1;
+		direct = 1;
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -1996,7 +1981,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		} else if (vcpu->arch.mmu.root_level == 0)
 			root_gfn = 0;
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
-				      PT32_ROOT_LEVEL, metaphysical,
+				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
@@ -2251,17 +2236,23 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
+	int r;
+
 	ASSERT(vcpu);
 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 	if (!is_paging(vcpu))
-		return nonpaging_init_context(vcpu);
+		r = nonpaging_init_context(vcpu);
 	else if (is_long_mode(vcpu))
-		return paging64_init_context(vcpu);
+		r = paging64_init_context(vcpu);
 	else if (is_pae(vcpu))
-		return paging32E_init_context(vcpu);
+		r = paging32E_init_context(vcpu);
 	else
-		return paging32_init_context(vcpu);
+		r = paging32_init_context(vcpu);
+
+	vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
+
+	return r;
 }
 
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -2492,7 +2483,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
-		if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
+		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
 			continue;
 		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -3130,7 +3121,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 	gfn_t gfn;
 
 	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		if (sp->role.metaphysical)
+		if (sp->role.direct)
 			continue;
 
 		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 258e5d56298..eaab2145f62 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,7 +54,7 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 static inline int is_long_mode(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
-	return vcpu->arch.shadow_efer & EFER_LME;
+	return vcpu->arch.shadow_efer & EFER_LMA;
 #else
 	return 0;
 #endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9fd78b6e17a..6bd70206c56 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,7 +25,6 @@
 #if PTTYPE == 64
 	#define pt_element_t u64
 	#define guest_walker guest_walker64
-	#define shadow_walker shadow_walker64
 	#define FNAME(name) paging##64_##name
 	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
 	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
@@ -42,7 +41,6 @@
 #elif PTTYPE == 32
 	#define pt_element_t u32
 	#define guest_walker guest_walker32
-	#define shadow_walker shadow_walker32
 	#define FNAME(name) paging##32_##name
 	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
 	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
@@ -73,18 +71,6 @@ struct guest_walker {
 	u32 error_code;
 };
 
-struct shadow_walker {
-	struct kvm_shadow_walk walker;
-	struct guest_walker *guest_walker;
-	int user_fault;
-	int write_fault;
-	int largepage;
-	int *ptwrite;
-	pfn_t pfn;
-	u64 *sptep;
-	gpa_t pte_gpa;
-};
-
 static gfn_t gpte_to_gfn(pt_element_t gpte)
 {
 	return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -283,91 +269,79 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
-static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
-				    struct kvm_vcpu *vcpu, u64 addr,
-				    u64 *sptep, int level)
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+			 struct guest_walker *gw,
+			 int user_fault, int write_fault, int largepage,
+			 int *ptwrite, pfn_t pfn)
 {
-	struct shadow_walker *sw =
-		container_of(_sw, struct shadow_walker, walker);
-	struct guest_walker *gw = sw->guest_walker;
 	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *shadow_page;
-	u64 spte;
-	int metaphysical;
+	u64 spte, *sptep;
+	int direct;
 	gfn_t table_gfn;
 	int r;
+	int level;
 	pt_element_t curr_pte;
+	struct kvm_shadow_walk_iterator iterator;
 
-	if (level == PT_PAGE_TABLE_LEVEL
-	    || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
-		mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
-			     sw->user_fault, sw->write_fault,
-			     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
-			     sw->ptwrite, sw->largepage,
-			     gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
-			     gw->gfn, sw->pfn, false);
-		sw->sptep = sptep;
-		return 1;
-	}
+	if (!is_present_pte(gw->ptes[gw->level - 1]))
+		return NULL;
 
-	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
-		return 0;
+	for_each_shadow_entry(vcpu, addr, iterator) {
+		level = iterator.level;
+		sptep = iterator.sptep;
+		if (level == PT_PAGE_TABLE_LEVEL
+		    || (largepage && level == PT_DIRECTORY_LEVEL)) {
+			mmu_set_spte(vcpu, sptep, access,
+				     gw->pte_access & access,
+				     user_fault, write_fault,
+				     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
+				     ptwrite, largepage,
+				     gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
+				     gw->gfn, pfn, false);
+			break;
+		}
 
-	if (is_large_pte(*sptep)) {
-		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
-		kvm_flush_remote_tlbs(vcpu->kvm);
-		rmap_remove(vcpu->kvm, sptep);
-	}
+		if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+			continue;
 
-	if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
-		metaphysical = 1;
-		if (!is_dirty_pte(gw->ptes[level - 1]))
-			access &= ~ACC_WRITE_MASK;
-		table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
-	} else {
-		metaphysical = 0;
-		table_gfn = gw->table_gfn[level - 2];
-	}
-	shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
-				       metaphysical, access, sptep);
-	if (!metaphysical) {
-		r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
-					  &curr_pte, sizeof(curr_pte));
-		if (r || curr_pte != gw->ptes[level - 2]) {
-			kvm_mmu_put_page(shadow_page, sptep);
-			kvm_release_pfn_clean(sw->pfn);
-			sw->sptep = NULL;
-			return 1;
+		if (is_large_pte(*sptep)) {
+			rmap_remove(vcpu->kvm, sptep);
+			set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+			kvm_flush_remote_tlbs(vcpu->kvm);
 		}
-	}
 
-	spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
-		| PT_WRITABLE_MASK | PT_USER_MASK;
-	*sptep = spte;
-	return 0;
-}
-
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-			 struct guest_walker *guest_walker,
-			 int user_fault, int write_fault, int largepage,
-			 int *ptwrite, pfn_t pfn)
-{
-	struct shadow_walker walker = {
-		.walker = { .entry = FNAME(shadow_walk_entry), },
-		.guest_walker = guest_walker,
-		.user_fault = user_fault,
-		.write_fault = write_fault,
-		.largepage = largepage,
-		.ptwrite = ptwrite,
-		.pfn = pfn,
-	};
-
-	if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
-		return NULL;
+		if (level == PT_DIRECTORY_LEVEL
+		    && gw->level == PT_DIRECTORY_LEVEL) {
+			direct = 1;
+			if (!is_dirty_pte(gw->ptes[level - 1]))
+				access &= ~ACC_WRITE_MASK;
+			table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+		} else {
+			direct = 0;
+			table_gfn = gw->table_gfn[level - 2];
+		}
+		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+					       direct, access, sptep);
+		if (!direct) {
+			r = kvm_read_guest_atomic(vcpu->kvm,
+						  gw->pte_gpa[level - 2],
+						  &curr_pte, sizeof(curr_pte));
+			if (r || curr_pte != gw->ptes[level - 2]) {
+				kvm_mmu_put_page(shadow_page, sptep);
+				kvm_release_pfn_clean(pfn);
+				sptep = NULL;
+				break;
+			}
+		}
 
-	walk_shadow(&walker.walker, vcpu, addr);
+		spte = __pa(shadow_page->spt)
+			| PT_PRESENT_MASK | PT_ACCESSED_MASK
+			| PT_WRITABLE_MASK | PT_USER_MASK;
+		*sptep = spte;
+	}
 
-	return walker.sptep;
+	return sptep;
 }
 
 /*
@@ -465,54 +439,56 @@ out_unlock:
 	return 0;
 }
 
-static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
-				      struct kvm_vcpu *vcpu, u64 addr,
-				      u64 *sptep, int level)
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
-	struct shadow_walker *sw =
-		container_of(_sw, struct shadow_walker, walker);
-
-	/* FIXME: properly handle invlpg on large guest pages */
-	if (level == PT_PAGE_TABLE_LEVEL ||
-	    ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
-		struct kvm_mmu_page *sp = page_header(__pa(sptep));
+	struct kvm_shadow_walk_iterator iterator;
+	pt_element_t gpte;
+	gpa_t pte_gpa = -1;
+	int level;
+	u64 *sptep;
+	int need_flush = 0;
 
-		sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
-		sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+	spin_lock(&vcpu->kvm->mmu_lock);
 
-		if (is_shadow_present_pte(*sptep)) {
-			rmap_remove(vcpu->kvm, sptep);
-			if (is_large_pte(*sptep))
-				--vcpu->kvm->stat.lpages;
+	for_each_shadow_entry(vcpu, gva, iterator) {
+		level = iterator.level;
+		sptep = iterator.sptep;
+
+		/* FIXME: properly handle invlpg on large guest pages */
+		if (level == PT_PAGE_TABLE_LEVEL ||
+		    ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
+			struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+			pte_gpa = (sp->gfn << PAGE_SHIFT);
+			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+			if (is_shadow_present_pte(*sptep)) {
+				rmap_remove(vcpu->kvm, sptep);
+				if (is_large_pte(*sptep))
+					--vcpu->kvm->stat.lpages;
+				need_flush = 1;
+			}
+			set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+			break;
 		}
-		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
-		return 1;
-	}
-	if (!is_shadow_present_pte(*sptep))
-		return 1;
-	return 0;
-}
 
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
-{
-	pt_element_t gpte;
-	struct shadow_walker walker = {
-		.walker = { .entry = FNAME(shadow_invlpg_entry), },
-		.pte_gpa = -1,
-	};
+		if (!is_shadow_present_pte(*sptep))
+			break;
+	}
 
-	spin_lock(&vcpu->kvm->mmu_lock);
-	walk_shadow(&walker.walker, vcpu, gva);
+	if (need_flush)
+		kvm_flush_remote_tlbs(vcpu->kvm);
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	if (walker.pte_gpa == -1)
+
+	if (pte_gpa == -1)
 		return;
-	if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
+	if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
 				  sizeof(pt_element_t)))
 		return;
 	if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
 		if (mmu_topup_memory_caches(vcpu))
 			return;
-		kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
+		kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
 				  sizeof(pt_element_t), 0);
 	}
 }
@@ -540,7 +516,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 	pt_element_t pt[256 / sizeof(pt_element_t)];
 	gpa_t pte_gpa;
 
-	if (sp->role.metaphysical
+	if (sp->role.direct
 	    || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
 		nonpaging_prefetch_page(vcpu, sp);
 		return;
@@ -619,7 +595,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 #undef pt_element_t
 #undef guest_walker
-#undef shadow_walker
 #undef FNAME
 #undef PT_BASE_ADDR_MASK
 #undef PT_INDEX
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a9e769e4e25..1821c207819 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -38,9 +38,6 @@ MODULE_LICENSE("GPL");
 #define IOPM_ALLOC_ORDER 2
 #define MSRPM_ALLOC_ORDER 1
 
-#define DR7_GD_MASK (1 << 13)
-#define DR6_BD_MASK (1 << 13)
-
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
 
@@ -50,6 +47,15 @@ MODULE_LICENSE("GPL");
 
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
+/* Turn on to get debugging output*/
+/* #define NESTED_DEBUG */
+
+#ifdef NESTED_DEBUG
+#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
+#else
+#define nsvm_printk(fmt, args...) do {} while(0)
+#endif
+
 /* enable NPT for AMD64 and X86 with PAE */
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
@@ -60,14 +66,29 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
+static int nested = 0;
+module_param(nested, int, S_IRUGO);
+
 static void kvm_reput_irq(struct vcpu_svm *svm);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 
+static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
+static int nested_svm_vmexit(struct vcpu_svm *svm);
+static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
+			     void *arg2, void *opaque);
+static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+				      bool has_error_code, u32 error_code);
+
 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 {
 	return container_of(vcpu, struct vcpu_svm, vcpu);
 }
 
+static inline bool is_nested(struct vcpu_svm *svm)
+{
+	return svm->nested_vmcb;
+}
+
 static unsigned long iopm_base;
 
 struct kvm_ldttss_desc {
@@ -157,32 +178,6 @@ static inline void kvm_write_cr2(unsigned long val)
 	asm volatile ("mov %0, %%cr2" :: "r" (val));
 }
 
-static inline unsigned long read_dr6(void)
-{
-	unsigned long dr6;
-
-	asm volatile ("mov %%dr6, %0" : "=r" (dr6));
-	return dr6;
-}
-
-static inline void write_dr6(unsigned long val)
-{
-	asm volatile ("mov %0, %%dr6" :: "r" (val));
-}
-
-static inline unsigned long read_dr7(void)
-{
-	unsigned long dr7;
-
-	asm volatile ("mov %%dr7, %0" : "=r" (dr7));
-	return dr7;
-}
-
-static inline void write_dr7(unsigned long val)
-{
-	asm volatile ("mov %0, %%dr7" :: "r" (val));
-}
-
 static inline void force_new_asid(struct kvm_vcpu *vcpu)
 {
 	to_svm(vcpu)->asid_generation--;
@@ -198,7 +193,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	if (!npt_enabled && !(efer & EFER_LMA))
 		efer &= ~EFER_LME;
 
-	to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
+	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
 	vcpu->arch.shadow_efer = efer;
 }
 
@@ -207,6 +202,11 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	/* If we are within a nested VM we'd better #VMEXIT and let the
+	   guest handle the exception */
+	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
+		return;
+
 	svm->vmcb->control.event_inj = nr
 		| SVM_EVTINJ_VALID
 		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
@@ -242,7 +242,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	kvm_rip_write(vcpu, svm->next_rip);
 	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 
-	vcpu->arch.interrupt_window_open = 1;
+	vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
 static int has_svm(void)
@@ -250,7 +250,7 @@ static int has_svm(void)
 	const char *msg;
 
 	if (!cpu_has_svm(&msg)) {
-		printk(KERN_INFO "has_svn: %s\n", msg);
+		printk(KERN_INFO "has_svm: %s\n", msg);
 		return 0;
 	}
 
@@ -292,7 +292,7 @@ static void svm_hardware_enable(void *garbage)
 	svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
 	rdmsrl(MSR_EFER, efer);
-	wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
+	wrmsrl(MSR_EFER, efer | EFER_SVME);
 
 	wrmsrl(MSR_VM_HSAVE_PA,
 	       page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
@@ -417,6 +417,14 @@ static __init int svm_hardware_setup(void)
 	if (boot_cpu_has(X86_FEATURE_NX))
 		kvm_enable_efer_bits(EFER_NX);
 
+	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+		kvm_enable_efer_bits(EFER_FFXSR);
+
+	if (nested) {
+		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+		kvm_enable_efer_bits(EFER_SVME);
+	}
+
 	for_each_online_cpu(cpu) {
 		r = svm_cpu_init(cpu);
 		if (r)
@@ -559,7 +567,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
 	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-	save->efer = MSR_EFER_SVME_MASK;
+	save->efer = EFER_SVME;
 	save->dr6 = 0xffff0ff0;
 	save->dr7 = 0x400;
 	save->rflags = 2;
@@ -591,6 +599,9 @@ static void init_vmcb(struct vcpu_svm *svm)
 		save->cr4 = 0;
 	}
 	force_new_asid(&svm->vcpu);
+
+	svm->nested_vmcb = 0;
+	svm->vcpu.arch.hflags = HF_GIF_MASK;
 }
 
 static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -615,6 +626,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	struct vcpu_svm *svm;
 	struct page *page;
 	struct page *msrpm_pages;
+	struct page *hsave_page;
+	struct page *nested_msrpm_pages;
 	int err;
 
 	svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
@@ -637,14 +650,25 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
 	if (!msrpm_pages)
 		goto uninit;
+
+	nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+	if (!nested_msrpm_pages)
+		goto uninit;
+
 	svm->msrpm = page_address(msrpm_pages);
 	svm_vcpu_init_msrpm(svm->msrpm);
 
+	hsave_page = alloc_page(GFP_KERNEL);
+	if (!hsave_page)
+		goto uninit;
+	svm->hsave = page_address(hsave_page);
+
+	svm->nested_msrpm = page_address(nested_msrpm_pages);
+
 	svm->vmcb = page_address(page);
 	clear_page(svm->vmcb);
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
 	svm->asid_generation = 0;
-	memset(svm->db_regs, 0, sizeof(svm->db_regs));
 	init_vmcb(svm);
 
 	fx_init(&svm->vcpu);
@@ -669,6 +693,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 
 	__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
 	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+	__free_page(virt_to_page(svm->hsave));
+	__free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, svm);
 }
@@ -718,6 +744,16 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	to_svm(vcpu)->vmcb->save.rflags = rflags;
 }
 
+static void svm_set_vintr(struct vcpu_svm *svm)
+{
+	svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
+}
+
+static void svm_clear_vintr(struct vcpu_svm *svm)
+{
+	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+}
+
 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
 {
 	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
@@ -760,20 +796,37 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
 	var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
 
-	/*
-	 * SVM always stores 0 for the 'G' bit in the CS selector in
-	 * the VMCB on a VMEXIT. This hurts cross-vendor migration:
-	 * Intel's VMENTRY has a check on the 'G' bit.
-	 */
-	if (seg == VCPU_SREG_CS)
+	switch (seg) {
+	case VCPU_SREG_CS:
+		/*
+		 * SVM always stores 0 for the 'G' bit in the CS selector in
+		 * the VMCB on a VMEXIT. This hurts cross-vendor migration:
+		 * Intel's VMENTRY has a check on the 'G' bit.
+		 */
 		var->g = s->limit > 0xfffff;
-
-	/*
-	 * Work around a bug where the busy flag in the tr selector
-	 * isn't exposed
-	 */
-	if (seg == VCPU_SREG_TR)
+		break;
+	case VCPU_SREG_TR:
+		/*
+		 * Work around a bug where the busy flag in the tr selector
+		 * isn't exposed
+		 */
 		var->type |= 0x2;
+		break;
+	case VCPU_SREG_DS:
+	case VCPU_SREG_ES:
+	case VCPU_SREG_FS:
+	case VCPU_SREG_GS:
+		/*
+		 * The accessed bit must always be set in the segment
+		 * descriptor cache, although it can be cleared in the
+		 * descriptor, the cached bit always remains at 1. Since
+		 * Intel has a check on this, set it here to support
+		 * cross-vendor migration.
+		 */
+		if (!var->unusable)
+			var->type |= 0x1;
+		break;
+	}
 
 	var->unusable = !var->present;
 }
@@ -905,9 +958,37 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 
 }
 
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 {
-	return -EOPNOTSUPP;
+	int old_debug = vcpu->guest_debug;
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	vcpu->guest_debug = dbg->control;
+
+	svm->vmcb->control.intercept_exceptions &=
+		~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+		if (vcpu->guest_debug &
+		    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+			svm->vmcb->control.intercept_exceptions |=
+				1 << DB_VECTOR;
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+			svm->vmcb->control.intercept_exceptions |=
+				1 << BP_VECTOR;
+	} else
+		vcpu->guest_debug = 0;
+
+	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+		svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
+	else
+		svm->vmcb->save.dr7 = vcpu->arch.dr7;
+
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+	else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
+		svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+	return 0;
 }
 
 static int svm_get_irq(struct kvm_vcpu *vcpu)
@@ -949,7 +1030,29 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
 
 static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
 {
-	unsigned long val = to_svm(vcpu)->db_regs[dr];
+	struct vcpu_svm *svm = to_svm(vcpu);
+	unsigned long val;
+
+	switch (dr) {
+	case 0 ... 3:
+		val = vcpu->arch.db[dr];
+		break;
+	case 6:
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+			val = vcpu->arch.dr6;
+		else
+			val = svm->vmcb->save.dr6;
+		break;
+	case 7:
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+			val = vcpu->arch.dr7;
+		else
+			val = svm->vmcb->save.dr7;
+		break;
+	default:
+		val = 0;
+	}
+
 	KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
 	return val;
 }
@@ -959,33 +1062,40 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	*exception = 0;
+	KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
 
-	if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
-		svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
-		svm->vmcb->save.dr6 |= DR6_BD_MASK;
-		*exception = DB_VECTOR;
-		return;
-	}
+	*exception = 0;
 
 	switch (dr) {
 	case 0 ... 3:
-		svm->db_regs[dr] = value;
+		vcpu->arch.db[dr] = value;
+		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+			vcpu->arch.eff_db[dr] = value;
 		return;
 	case 4 ... 5:
-		if (vcpu->arch.cr4 & X86_CR4_DE) {
+		if (vcpu->arch.cr4 & X86_CR4_DE)
 			*exception = UD_VECTOR;
+		return;
+	case 6:
+		if (value & 0xffffffff00000000ULL) {
+			*exception = GP_VECTOR;
 			return;
 		}
-	case 7: {
-		if (value & ~((1ULL << 32) - 1)) {
+		vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
+		return;
+	case 7:
+		if (value & 0xffffffff00000000ULL) {
 			*exception = GP_VECTOR;
 			return;
 		}
-		svm->vmcb->save.dr7 = value;
+		vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
+		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+			svm->vmcb->save.dr7 = vcpu->arch.dr7;
+			vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
+		}
 		return;
-	}
 	default:
+		/* FIXME: Possible case? */
 		printk(KERN_DEBUG "%s: unexpected dr %u\n",
 		       __func__, dr);
 		*exception = UD_VECTOR;
@@ -1031,6 +1141,27 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
 }
 
+static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	if (!(svm->vcpu.guest_debug &
+	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+		kvm_queue_exception(&svm->vcpu, DB_VECTOR);
+		return 1;
+	}
+	kvm_run->exit_reason = KVM_EXIT_DEBUG;
+	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+	kvm_run->debug.arch.exception = DB_VECTOR;
+	return 0;
+}
+
+static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	kvm_run->exit_reason = KVM_EXIT_DEBUG;
+	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+	kvm_run->debug.arch.exception = BP_VECTOR;
+	return 0;
+}
+
 static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	int er;
@@ -1080,7 +1211,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
-	int size, down, in, string, rep;
+	int size, in, string;
 	unsigned port;
 
 	++svm->vcpu.stat.io_exits;
@@ -1099,8 +1230,6 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
 	port = io_info >> 16;
 	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
-	rep = (io_info & SVM_IOIO_REP_MASK) != 0;
-	down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
 
 	skip_emulated_instruction(&svm->vcpu);
 	return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
@@ -1139,6 +1268,567 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int nested_svm_check_permissions(struct vcpu_svm *svm)
+{
+	if (!(svm->vcpu.arch.shadow_efer & EFER_SVME)
+	    || !is_paging(&svm->vcpu)) {
+		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (svm->vmcb->save.cpl) {
+		kvm_inject_gp(&svm->vcpu, 0);
+		return 1;
+	}
+
+       return 0;
+}
+
+static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+				      bool has_error_code, u32 error_code)
+{
+	if (is_nested(svm)) {
+		svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+		svm->vmcb->control.exit_code_hi = 0;
+		svm->vmcb->control.exit_info_1 = error_code;
+		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+		if (nested_svm_exit_handled(svm, false)) {
+			nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
+
+			nested_svm_vmexit(svm);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static inline int nested_svm_intr(struct vcpu_svm *svm)
+{
+	if (is_nested(svm)) {
+		if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+			return 0;
+
+		if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
+			return 0;
+
+		svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+
+		if (nested_svm_exit_handled(svm, false)) {
+			nsvm_printk("VMexit -> INTR\n");
+			nested_svm_vmexit(svm);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
+{
+	struct page *page;
+
+	down_read(&current->mm->mmap_sem);
+	page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(page)) {
+		printk(KERN_INFO "%s: could not find page at 0x%llx\n",
+		       __func__, gpa);
+		kvm_release_page_clean(page);
+		kvm_inject_gp(&svm->vcpu, 0);
+		return NULL;
+	}
+	return page;
+}
+
+static int nested_svm_do(struct vcpu_svm *svm,
+			 u64 arg1_gpa, u64 arg2_gpa, void *opaque,
+			 int (*handler)(struct vcpu_svm *svm,
+					void *arg1,
+					void *arg2,
+					void *opaque))
+{
+	struct page *arg1_page;
+	struct page *arg2_page = NULL;
+	void *arg1;
+	void *arg2 = NULL;
+	int retval;
+
+	arg1_page = nested_svm_get_page(svm, arg1_gpa);
+	if(arg1_page == NULL)
+		return 1;
+
+	if (arg2_gpa) {
+		arg2_page = nested_svm_get_page(svm, arg2_gpa);
+		if(arg2_page == NULL) {
+			kvm_release_page_clean(arg1_page);
+			return 1;
+		}
+	}
+
+	arg1 = kmap_atomic(arg1_page, KM_USER0);
+	if (arg2_gpa)
+		arg2 = kmap_atomic(arg2_page, KM_USER1);
+
+	retval = handler(svm, arg1, arg2, opaque);
+
+	kunmap_atomic(arg1, KM_USER0);
+	if (arg2_gpa)
+		kunmap_atomic(arg2, KM_USER1);
+
+	kvm_release_page_dirty(arg1_page);
+	if (arg2_gpa)
+		kvm_release_page_dirty(arg2_page);
+
+	return retval;
+}
+
+static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
+					void *arg1,
+					void *arg2,
+					void *opaque)
+{
+	struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+	bool kvm_overrides = *(bool *)opaque;
+	u32 exit_code = svm->vmcb->control.exit_code;
+
+	if (kvm_overrides) {
+		switch (exit_code) {
+		case SVM_EXIT_INTR:
+		case SVM_EXIT_NMI:
+			return 0;
+		/* For now we are always handling NPFs when using them */
+		case SVM_EXIT_NPF:
+			if (npt_enabled)
+				return 0;
+			break;
+		/* When we're shadowing, trap PFs */
+		case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+			if (!npt_enabled)
+				return 0;
+			break;
+		default:
+			break;
+		}
+	}
+
+	switch (exit_code) {
+	case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
+		u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
+		if (nested_vmcb->control.intercept_cr_read & cr_bits)
+			return 1;
+		break;
+	}
+	case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
+		u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
+		if (nested_vmcb->control.intercept_cr_write & cr_bits)
+			return 1;
+		break;
+	}
+	case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
+		u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
+		if (nested_vmcb->control.intercept_dr_read & dr_bits)
+			return 1;
+		break;
+	}
+	case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
+		u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
+		if (nested_vmcb->control.intercept_dr_write & dr_bits)
+			return 1;
+		break;
+	}
+	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+		if (nested_vmcb->control.intercept_exceptions & excp_bits)
+			return 1;
+		break;
+	}
+	default: {
+		u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
+		nsvm_printk("exit code: 0x%x\n", exit_code);
+		if (nested_vmcb->control.intercept & exit_bits)
+			return 1;
+	}
+	}
+
+	return 0;
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
+				       void *arg1, void *arg2,
+				       void *opaque)
+{
+	struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+	u8 *msrpm = (u8 *)arg2;
+        u32 t0, t1;
+	u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+	u32 param = svm->vmcb->control.exit_info_1 & 1;
+
+	if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+		return 0;
+
+	switch(msr) {
+	case 0 ... 0x1fff:
+		t0 = (msr * 2) % 8;
+		t1 = msr / 8;
+		break;
+	case 0xc0000000 ... 0xc0001fff:
+		t0 = (8192 + msr - 0xc0000000) * 2;
+		t1 = (t0 / 8);
+		t0 %= 8;
+		break;
+	case 0xc0010000 ... 0xc0011fff:
+		t0 = (16384 + msr - 0xc0010000) * 2;
+		t1 = (t0 / 8);
+		t0 %= 8;
+		break;
+	default:
+		return 1;
+		break;
+	}
+	if (msrpm[t1] & ((1 << param) << t0))
+		return 1;
+
+	return 0;
+}
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
+{
+	bool k = kvm_override;
+
+	switch (svm->vmcb->control.exit_code) {
+	case SVM_EXIT_MSR:
+		return nested_svm_do(svm, svm->nested_vmcb,
+				     svm->nested_vmcb_msrpm, NULL,
+				     nested_svm_exit_handled_msr);
+	default: break;
+	}
+
+	return nested_svm_do(svm, svm->nested_vmcb, 0, &k,
+			     nested_svm_exit_handled_real);
+}
+
+static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
+				  void *arg2, void *opaque)
+{
+	struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+	struct vmcb *hsave = svm->hsave;
+	u64 nested_save[] = { nested_vmcb->save.cr0,
+			      nested_vmcb->save.cr3,
+			      nested_vmcb->save.cr4,
+			      nested_vmcb->save.efer,
+			      nested_vmcb->control.intercept_cr_read,
+			      nested_vmcb->control.intercept_cr_write,
+			      nested_vmcb->control.intercept_dr_read,
+			      nested_vmcb->control.intercept_dr_write,
+			      nested_vmcb->control.intercept_exceptions,
+			      nested_vmcb->control.intercept,
+			      nested_vmcb->control.msrpm_base_pa,
+			      nested_vmcb->control.iopm_base_pa,
+			      nested_vmcb->control.tsc_offset };
+
+	/* Give the current vmcb to the guest */
+	memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb));
+	nested_vmcb->save.cr0 = nested_save[0];
+	if (!npt_enabled)
+		nested_vmcb->save.cr3 = nested_save[1];
+	nested_vmcb->save.cr4 = nested_save[2];
+	nested_vmcb->save.efer = nested_save[3];
+	nested_vmcb->control.intercept_cr_read = nested_save[4];
+	nested_vmcb->control.intercept_cr_write = nested_save[5];
+	nested_vmcb->control.intercept_dr_read = nested_save[6];
+	nested_vmcb->control.intercept_dr_write = nested_save[7];
+	nested_vmcb->control.intercept_exceptions = nested_save[8];
+	nested_vmcb->control.intercept = nested_save[9];
+	nested_vmcb->control.msrpm_base_pa = nested_save[10];
+	nested_vmcb->control.iopm_base_pa = nested_save[11];
+	nested_vmcb->control.tsc_offset = nested_save[12];
+
+	/* We always set V_INTR_MASKING and remember the old value in hflags */
+	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+		nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
+
+	if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) &&
+	    (nested_vmcb->control.int_vector)) {
+		nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n",
+				nested_vmcb->control.int_vector);
+	}
+
+	/* Restore the original control entries */
+	svm->vmcb->control = hsave->control;
+
+	/* Kill any pending exceptions */
+	if (svm->vcpu.arch.exception.pending == true)
+		nsvm_printk("WARNING: Pending Exception\n");
+	svm->vcpu.arch.exception.pending = false;
+
+	/* Restore selected save entries */
+	svm->vmcb->save.es = hsave->save.es;
+	svm->vmcb->save.cs = hsave->save.cs;
+	svm->vmcb->save.ss = hsave->save.ss;
+	svm->vmcb->save.ds = hsave->save.ds;
+	svm->vmcb->save.gdtr = hsave->save.gdtr;
+	svm->vmcb->save.idtr = hsave->save.idtr;
+	svm->vmcb->save.rflags = hsave->save.rflags;
+	svm_set_efer(&svm->vcpu, hsave->save.efer);
+	svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
+	svm_set_cr4(&svm->vcpu, hsave->save.cr4);
+	if (npt_enabled) {
+		svm->vmcb->save.cr3 = hsave->save.cr3;
+		svm->vcpu.arch.cr3 = hsave->save.cr3;
+	} else {
+		kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+	}
+	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
+	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
+	kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
+	svm->vmcb->save.dr7 = 0;
+	svm->vmcb->save.cpl = 0;
+	svm->vmcb->control.exit_int_info = 0;
+
+	svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+	/* Exit nested SVM mode */
+	svm->nested_vmcb = 0;
+
+	return 0;
+}
+
+static int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+	nsvm_printk("VMexit\n");
+	if (nested_svm_do(svm, svm->nested_vmcb, 0,
+			  NULL, nested_svm_vmexit_real))
+		return 1;
+
+	kvm_mmu_reset_context(&svm->vcpu);
+	kvm_mmu_load(&svm->vcpu);
+
+	return 0;
+}
+
+static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1,
+				  void *arg2, void *opaque)
+{
+	int i;
+	u32 *nested_msrpm = (u32*)arg1;
+	for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
+		svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
+	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm);
+
+	return 0;
+}
+
+static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
+			    void *arg2, void *opaque)
+{
+	struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+	struct vmcb *hsave = svm->hsave;
+
+	/* nested_vmcb is our indicator if nested SVM is activated */
+	svm->nested_vmcb = svm->vmcb->save.rax;
+
+	/* Clear internal status */
+	svm->vcpu.arch.exception.pending = false;
+
+	/* Save the old vmcb, so we don't need to pick what we save, but
+	   can restore everything when a VMEXIT occurs */
+	memcpy(hsave, svm->vmcb, sizeof(struct vmcb));
+	/* We need to remember the original CR3 in the SPT case */
+	if (!npt_enabled)
+		hsave->save.cr3 = svm->vcpu.arch.cr3;
+	hsave->save.cr4 = svm->vcpu.arch.cr4;
+	hsave->save.rip = svm->next_rip;
+
+	if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
+		svm->vcpu.arch.hflags |= HF_HIF_MASK;
+	else
+		svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
+
+	/* Load the nested guest state */
+	svm->vmcb->save.es = nested_vmcb->save.es;
+	svm->vmcb->save.cs = nested_vmcb->save.cs;
+	svm->vmcb->save.ss = nested_vmcb->save.ss;
+	svm->vmcb->save.ds = nested_vmcb->save.ds;
+	svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
+	svm->vmcb->save.idtr = nested_vmcb->save.idtr;
+	svm->vmcb->save.rflags = nested_vmcb->save.rflags;
+	svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
+	svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
+	svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
+	if (npt_enabled) {
+		svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
+		svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
+	} else {
+		kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+		kvm_mmu_reset_context(&svm->vcpu);
+	}
+	svm->vmcb->save.cr2 = nested_vmcb->save.cr2;
+	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
+	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
+	kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+	/* In case we don't even reach vcpu_run, the fields are not updated */
+	svm->vmcb->save.rax = nested_vmcb->save.rax;
+	svm->vmcb->save.rsp = nested_vmcb->save.rsp;
+	svm->vmcb->save.rip = nested_vmcb->save.rip;
+	svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
+	svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
+	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+
+	/* We don't want a nested guest to be more powerful than the guest,
+	   so all intercepts are ORed */
+	svm->vmcb->control.intercept_cr_read |=
+		nested_vmcb->control.intercept_cr_read;
+	svm->vmcb->control.intercept_cr_write |=
+		nested_vmcb->control.intercept_cr_write;
+	svm->vmcb->control.intercept_dr_read |=
+		nested_vmcb->control.intercept_dr_read;
+	svm->vmcb->control.intercept_dr_write |=
+		nested_vmcb->control.intercept_dr_write;
+	svm->vmcb->control.intercept_exceptions |=
+		nested_vmcb->control.intercept_exceptions;
+
+	svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+
+	svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+
+	force_new_asid(&svm->vcpu);
+	svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
+	svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
+	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
+	if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
+		nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
+				nested_vmcb->control.int_ctl);
+	}
+	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
+		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
+	else
+		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
+
+	nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
+			nested_vmcb->control.exit_int_info,
+			nested_vmcb->control.int_state);
+
+	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
+	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
+	svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
+	if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
+		nsvm_printk("Injecting Event: 0x%x\n",
+				nested_vmcb->control.event_inj);
+	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
+	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
+
+	svm->vcpu.arch.hflags |= HF_GIF_MASK;
+
+	return 0;
+}
+
+static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+	to_vmcb->save.fs = from_vmcb->save.fs;
+	to_vmcb->save.gs = from_vmcb->save.gs;
+	to_vmcb->save.tr = from_vmcb->save.tr;
+	to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+	to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+	to_vmcb->save.star = from_vmcb->save.star;
+	to_vmcb->save.lstar = from_vmcb->save.lstar;
+	to_vmcb->save.cstar = from_vmcb->save.cstar;
+	to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+	to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+	to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+
+	return 1;
+}
+
+static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
+			     void *arg2, void *opaque)
+{
+	return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb);
+}
+
+static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
+			     void *arg2, void *opaque)
+{
+	return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb);
+}
+
+static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+	skip_emulated_instruction(&svm->vcpu);
+
+	nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload);
+
+	return 1;
+}
+
+static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+	skip_emulated_instruction(&svm->vcpu);
+
+	nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave);
+
+	return 1;
+}
+
+static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	nsvm_printk("VMrun\n");
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+	skip_emulated_instruction(&svm->vcpu);
+
+	if (nested_svm_do(svm, svm->vmcb->save.rax, 0,
+			  NULL, nested_svm_vmrun))
+		return 1;
+
+	if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0,
+		      NULL, nested_svm_vmrun_msrpm))
+		return 1;
+
+	return 1;
+}
+
+static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+	skip_emulated_instruction(&svm->vcpu);
+
+	svm->vcpu.arch.hflags |= HF_GIF_MASK;
+
+	return 1;
+}
+
+static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	if (nested_svm_check_permissions(svm))
+		return 1;
+
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+	skip_emulated_instruction(&svm->vcpu);
+
+	svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+
+	/* After a CLGI no interrupts should come */
+	svm_clear_vintr(svm);
+	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+
+	return 1;
+}
+
 static int invalid_op_interception(struct vcpu_svm *svm,
 				   struct kvm_run *kvm_run)
 {
@@ -1250,6 +1940,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 	case MSR_IA32_LASTINTTOIP:
 		*data = svm->vmcb->save.last_excp_to;
 		break;
+	case MSR_VM_HSAVE_PA:
+		*data = svm->hsave_msr;
+		break;
+	case MSR_VM_CR:
+		*data = 0;
+		break;
+	case MSR_IA32_UCODE_REV:
+		*data = 0x01000065;
+		break;
 	default:
 		return kvm_get_msr_common(vcpu, ecx, data);
 	}
@@ -1344,6 +2043,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
 
 		break;
+	case MSR_VM_HSAVE_PA:
+		svm->hsave_msr = data;
+		break;
 	default:
 		return kvm_set_msr_common(vcpu, ecx, data);
 	}
@@ -1380,7 +2082,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
 {
 	KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
 
-	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 	/*
 	 * If the user space waits to inject interrupts, exit as soon as
@@ -1417,6 +2119,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_WRITE_DR3]			= emulate_on_interception,
 	[SVM_EXIT_WRITE_DR5]			= emulate_on_interception,
 	[SVM_EXIT_WRITE_DR7]			= emulate_on_interception,
+	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
+	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
 	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
 	[SVM_EXIT_EXCP_BASE + PF_VECTOR] 	= pf_interception,
 	[SVM_EXIT_EXCP_BASE + NM_VECTOR] 	= nm_interception,
@@ -1436,12 +2140,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_MSR]				= msr_interception,
 	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
 	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
-	[SVM_EXIT_VMRUN]			= invalid_op_interception,
+	[SVM_EXIT_VMRUN]			= vmrun_interception,
 	[SVM_EXIT_VMMCALL]			= vmmcall_interception,
-	[SVM_EXIT_VMLOAD]			= invalid_op_interception,
-	[SVM_EXIT_VMSAVE]			= invalid_op_interception,
-	[SVM_EXIT_STGI]				= invalid_op_interception,
-	[SVM_EXIT_CLGI]				= invalid_op_interception,
+	[SVM_EXIT_VMLOAD]			= vmload_interception,
+	[SVM_EXIT_VMSAVE]			= vmsave_interception,
+	[SVM_EXIT_STGI]				= stgi_interception,
+	[SVM_EXIT_CLGI]				= clgi_interception,
 	[SVM_EXIT_SKINIT]			= invalid_op_interception,
 	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
 	[SVM_EXIT_MONITOR]			= invalid_op_interception,
@@ -1457,6 +2161,17 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
 		    (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
 
+	if (is_nested(svm)) {
+		nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
+			    exit_code, svm->vmcb->control.exit_info_1,
+			    svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
+		if (nested_svm_exit_handled(svm, true)) {
+			nested_svm_vmexit(svm);
+			nsvm_printk("-> #VMEXIT\n");
+			return 1;
+		}
+	}
+
 	if (npt_enabled) {
 		int mmu_reload = 0;
 		if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -1544,6 +2259,8 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	nested_svm_intr(svm);
+
 	svm_inject_irq(svm, irq);
 }
 
@@ -1589,11 +2306,17 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
 	if (!kvm_cpu_has_interrupt(vcpu))
 		goto out;
 
+	if (nested_svm_intr(svm))
+		goto out;
+
+	if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
+		goto out;
+
 	if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
 	    (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
 	    (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
 		/* unable to deliver irq, set pending irq */
-		vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
+		svm_set_vintr(svm);
 		svm_inject_irq(svm, 0x0);
 		goto out;
 	}
@@ -1615,7 +2338,8 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
 	}
 
 	svm->vcpu.arch.interrupt_window_open =
-		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
+		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+		 (svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
 static void svm_do_inject_vector(struct vcpu_svm *svm)
@@ -1637,9 +2361,13 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb_control_area *control = &svm->vmcb->control;
 
+	if (nested_svm_intr(svm))
+		return;
+
 	svm->vcpu.arch.interrupt_window_open =
 		(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
+		 (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
+		 (svm->vcpu.arch.hflags & HF_GIF_MASK));
 
 	if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
 		/*
@@ -1652,9 +2380,9 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 	 */
 	if (!svm->vcpu.arch.interrupt_window_open &&
 	    (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
-		control->intercept |= 1ULL << INTERCEPT_VINTR;
-	 else
-		control->intercept &= ~(1ULL << INTERCEPT_VINTR);
+		svm_set_vintr(svm);
+	else
+		svm_clear_vintr(svm);
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -1662,22 +2390,6 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
 	return 0;
 }
 
-static void save_db_regs(unsigned long *db_regs)
-{
-	asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
-	asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
-	asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
-	asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
-}
-
-static void load_db_regs(unsigned long *db_regs)
-{
-	asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
-	asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
-	asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
-	asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
-}
-
 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	force_new_asid(vcpu);
@@ -1736,19 +2448,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	gs_selector = kvm_read_gs();
 	ldt_selector = kvm_read_ldt();
 	svm->host_cr2 = kvm_read_cr2();
-	svm->host_dr6 = read_dr6();
-	svm->host_dr7 = read_dr7();
-	svm->vmcb->save.cr2 = vcpu->arch.cr2;
+	if (!is_nested(svm))
+		svm->vmcb->save.cr2 = vcpu->arch.cr2;
 	/* required for live migration with NPT */
 	if (npt_enabled)
 		svm->vmcb->save.cr3 = vcpu->arch.cr3;
 
-	if (svm->vmcb->save.dr7 & 0xff) {
-		write_dr7(0);
-		save_db_regs(svm->host_db_regs);
-		load_db_regs(svm->db_regs);
-	}
-
 	clgi();
 
 	local_irq_enable();
@@ -1824,16 +2529,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 #endif
 		);
 
-	if ((svm->vmcb->save.dr7 & 0xff))
-		load_db_regs(svm->host_db_regs);
-
 	vcpu->arch.cr2 = svm->vmcb->save.cr2;
 	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
 	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
 	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
 
-	write_dr6(svm->host_dr6);
-	write_dr7(svm->host_dr7);
 	kvm_write_cr2(svm->host_cr2);
 
 	kvm_load_fs(fs_selector);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7611af57682..bb481330716 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -91,6 +91,7 @@ struct vcpu_vmx {
 	} rmode;
 	int vpid;
 	bool emulation_required;
+	enum emulation_result invalid_state_emulation_result;
 
 	/* Support for vnmi-less CPUs */
 	int soft_vnmi_blocked;
@@ -189,21 +190,21 @@ static inline int is_page_fault(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
-		(INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+		(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
 }
 
 static inline int is_no_device(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
-		(INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+		(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 }
 
 static inline int is_invalid_opcode(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
-		(INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+		(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
 }
 
 static inline int is_external_interrupt(u32 intr_info)
@@ -480,8 +481,13 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
 	if (!vcpu->fpu_active)
 		eb |= 1u << NM_VECTOR;
-	if (vcpu->guest_debug.enabled)
-		eb |= 1u << DB_VECTOR;
+	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+		if (vcpu->guest_debug &
+		    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+			eb |= 1u << DB_VECTOR;
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+			eb |= 1u << BP_VECTOR;
+	}
 	if (vcpu->arch.rmode.active)
 		eb = ~0;
 	if (vm_need_ept())
@@ -747,29 +753,33 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
-	if (has_error_code)
+	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+	}
 
 	if (vcpu->arch.rmode.active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = nr;
 		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
-		if (nr == BP_VECTOR)
+		if (nr == BP_VECTOR || nr == OF_VECTOR)
 			vmx->rmode.irq.rip++;
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			     nr | INTR_TYPE_SOFT_INTR
-			     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
-			     | INTR_INFO_VALID_MASK);
+		intr_info |= INTR_TYPE_SOFT_INTR;
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
 		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
 		return;
 	}
 
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-		     nr | INTR_TYPE_EXCEPTION
-		     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
-		     | INTR_INFO_VALID_MASK);
+	if (nr == BP_VECTOR || nr == OF_VECTOR) {
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+	} else
+		intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
 }
 
 static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
@@ -856,11 +866,8 @@ static u64 guest_read_tsc(void)
  * writes 'guest_tsc' into guest's timestamp counter "register"
  * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
  */
-static void guest_write_tsc(u64 guest_tsc)
+static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 {
-	u64 host_tsc;
-
-	rdtscll(host_tsc);
 	vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
 }
 
@@ -925,14 +932,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct kvm_msr_entry *msr;
+	u64 host_tsc;
 	int ret = 0;
 
 	switch (msr_index) {
-#ifdef CONFIG_X86_64
 	case MSR_EFER:
 		vmx_load_host_state(vmx);
 		ret = kvm_set_msr_common(vcpu, msr_index, data);
 		break;
+#ifdef CONFIG_X86_64
 	case MSR_FS_BASE:
 		vmcs_writel(GUEST_FS_BASE, data);
 		break;
@@ -950,7 +958,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
 	case MSR_IA32_TIME_STAMP_COUNTER:
-		guest_write_tsc(data);
+		rdtscll(host_tsc);
+		guest_write_tsc(data, host_tsc);
 		break;
 	case MSR_P6_PERFCTR0:
 	case MSR_P6_PERFCTR1:
@@ -999,40 +1008,28 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 	}
 }
 
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 {
-	unsigned long dr7 = 0x400;
-	int old_singlestep;
-
-	old_singlestep = vcpu->guest_debug.singlestep;
-
-	vcpu->guest_debug.enabled = dbg->enabled;
-	if (vcpu->guest_debug.enabled) {
-		int i;
+	int old_debug = vcpu->guest_debug;
+	unsigned long flags;
 
-		dr7 |= 0x200;  /* exact */
-		for (i = 0; i < 4; ++i) {
-			if (!dbg->breakpoints[i].enabled)
-				continue;
-			vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
-			dr7 |= 2 << (i*2);    /* global enable */
-			dr7 |= 0 << (i*4+16); /* execution breakpoint */
-		}
+	vcpu->guest_debug = dbg->control;
+	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
+		vcpu->guest_debug = 0;
 
-		vcpu->guest_debug.singlestep = dbg->singlestep;
-	} else
-		vcpu->guest_debug.singlestep = 0;
-
-	if (old_singlestep && !vcpu->guest_debug.singlestep) {
-		unsigned long flags;
+	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+		vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
+	else
+		vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
 
-		flags = vmcs_readl(GUEST_RFLAGS);
+	flags = vmcs_readl(GUEST_RFLAGS);
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+	else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
 		flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-		vmcs_writel(GUEST_RFLAGS, flags);
-	}
+	vmcs_writel(GUEST_RFLAGS, flags);
 
 	update_exception_bitmap(vcpu);
-	vmcs_writel(GUEST_DR7, dr7);
 
 	return 0;
 }
@@ -1433,6 +1430,29 @@ continue_rmode:
 	init_rmode(vcpu->kvm);
 }
 
+static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+
+	vcpu->arch.shadow_efer = efer;
+	if (!msr)
+		return;
+	if (efer & EFER_LMA) {
+		vmcs_write32(VM_ENTRY_CONTROLS,
+			     vmcs_read32(VM_ENTRY_CONTROLS) |
+			     VM_ENTRY_IA32E_MODE);
+		msr->data = efer;
+	} else {
+		vmcs_write32(VM_ENTRY_CONTROLS,
+			     vmcs_read32(VM_ENTRY_CONTROLS) &
+			     ~VM_ENTRY_IA32E_MODE);
+
+		msr->data = efer & ~EFER_LME;
+	}
+	setup_msrs(vmx);
+}
+
 #ifdef CONFIG_X86_64
 
 static void enter_lmode(struct kvm_vcpu *vcpu)
@@ -1447,13 +1467,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
 			     (guest_tr_ar & ~AR_TYPE_MASK)
 			     | AR_TYPE_BUSY_64_TSS);
 	}
-
 	vcpu->arch.shadow_efer |= EFER_LMA;
-
-	find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
-	vmcs_write32(VM_ENTRY_CONTROLS,
-		     vmcs_read32(VM_ENTRY_CONTROLS)
-		     | VM_ENTRY_IA32E_MODE);
+	vmx_set_efer(vcpu, vcpu->arch.shadow_efer);
 }
 
 static void exit_lmode(struct kvm_vcpu *vcpu)
@@ -1612,30 +1627,6 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	vmcs_writel(GUEST_CR4, hw_cr4);
 }
 
-static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
-
-	vcpu->arch.shadow_efer = efer;
-	if (!msr)
-		return;
-	if (efer & EFER_LMA) {
-		vmcs_write32(VM_ENTRY_CONTROLS,
-				     vmcs_read32(VM_ENTRY_CONTROLS) |
-				     VM_ENTRY_IA32E_MODE);
-		msr->data = efer;
-
-	} else {
-		vmcs_write32(VM_ENTRY_CONTROLS,
-				     vmcs_read32(VM_ENTRY_CONTROLS) &
-				     ~VM_ENTRY_IA32E_MODE);
-
-		msr->data = efer & ~EFER_LME;
-	}
-	setup_msrs(vmx);
-}
-
 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
 	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1653,7 +1644,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
 	var->limit = vmcs_read32(sf->limit);
 	var->selector = vmcs_read16(sf->selector);
 	ar = vmcs_read32(sf->ar_bytes);
-	if (ar & AR_UNUSABLE_MASK)
+	if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
 		ar = 0;
 	var->type = ar & 15;
 	var->s = (ar >> 4) & 1;
@@ -1788,14 +1779,16 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
 	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
 	cs_rpl = cs.selector & SELECTOR_RPL_MASK;
 
+	if (cs.unusable)
+		return false;
 	if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
 		return false;
 	if (!cs.s)
 		return false;
-	if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
+	if (cs.type & AR_TYPE_WRITEABLE_MASK) {
 		if (cs.dpl > cs_rpl)
 			return false;
-	} else if (cs.type & AR_TYPE_CODE_MASK) {
+	} else {
 		if (cs.dpl != cs_rpl)
 			return false;
 	}
@@ -1814,7 +1807,9 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu)
 	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
 	ss_rpl = ss.selector & SELECTOR_RPL_MASK;
 
-	if ((ss.type != 3) || (ss.type != 7))
+	if (ss.unusable)
+		return true;
+	if (ss.type != 3 && ss.type != 7)
 		return false;
 	if (!ss.s)
 		return false;
@@ -1834,6 +1829,8 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
 	vmx_get_segment(vcpu, &var, seg);
 	rpl = var.selector & SELECTOR_RPL_MASK;
 
+	if (var.unusable)
+		return true;
 	if (!var.s)
 		return false;
 	if (!var.present)
@@ -1855,9 +1852,11 @@ static bool tr_valid(struct kvm_vcpu *vcpu)
 
 	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
 
+	if (tr.unusable)
+		return false;
 	if (tr.selector & SELECTOR_TI_MASK)	/* TI = 1 */
 		return false;
-	if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
+	if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
 		return false;
 	if (!tr.present)
 		return false;
@@ -1871,6 +1870,8 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
 
 	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
 
+	if (ldtr.unusable)
+		return true;
 	if (ldtr.selector & SELECTOR_TI_MASK)	/* TI = 1 */
 		return false;
 	if (ldtr.type != 2)
@@ -2112,7 +2113,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
 	u32 host_sysenter_cs, msr_low, msr_high;
 	u32 junk;
-	u64 host_pat;
+	u64 host_pat, tsc_this, tsc_base;
 	unsigned long a;
 	struct descriptor_table dt;
 	int i;
@@ -2240,6 +2241,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
 	vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
 
+	tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
+	rdtscll(tsc_this);
+	if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
+		tsc_base = tsc_this;
+
+	guest_write_tsc(0, tsc_base);
 
 	return 0;
 }
@@ -2319,7 +2326,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		kvm_rip_write(vcpu, 0);
 	kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
 
-	/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
 	vmcs_writel(GUEST_DR7, 0x400);
 
 	vmcs_writel(GUEST_GDTR_BASE, 0);
@@ -2332,8 +2338,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
 	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
-	guest_write_tsc(0);
-
 	/* Special registers */
 	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 
@@ -2486,6 +2490,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 {
 	vmx_update_window_states(vcpu);
 
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+				GUEST_INTR_STATE_STI |
+				GUEST_INTR_STATE_MOV_SS);
+
 	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
 		if (vcpu->arch.interrupt.pending) {
 			enable_nmi_window(vcpu);
@@ -2536,24 +2545,6 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 	return 0;
 }
 
-static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
-{
-	struct kvm_guest_debug *dbg = &vcpu->guest_debug;
-
-	set_debugreg(dbg->bp[0], 0);
-	set_debugreg(dbg->bp[1], 1);
-	set_debugreg(dbg->bp[2], 2);
-	set_debugreg(dbg->bp[3], 3);
-
-	if (dbg->singlestep) {
-		unsigned long flags;
-
-		flags = vmcs_readl(GUEST_RFLAGS);
-		flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
-		vmcs_writel(GUEST_RFLAGS, flags);
-	}
-}
-
 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 				  int vec, u32 err_code)
 {
@@ -2570,9 +2561,17 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 	 *        the required debugging infrastructure rework.
 	 */
 	switch (vec) {
-	case DE_VECTOR:
 	case DB_VECTOR:
+		if (vcpu->guest_debug &
+		    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+			return 0;
+		kvm_queue_exception(vcpu, vec);
+		return 1;
 	case BP_VECTOR:
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+			return 0;
+		/* fall through */
+	case DE_VECTOR:
 	case OF_VECTOR:
 	case BR_VECTOR:
 	case UD_VECTOR:
@@ -2589,8 +2588,8 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 intr_info, error_code;
-	unsigned long cr2, rip;
+	u32 intr_info, ex_no, error_code;
+	unsigned long cr2, rip, dr6;
 	u32 vect_info;
 	enum emulation_result er;
 
@@ -2649,14 +2648,30 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		return 1;
 	}
 
-	if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
-	    (INTR_TYPE_EXCEPTION | 1)) {
+	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+	switch (ex_no) {
+	case DB_VECTOR:
+		dr6 = vmcs_readl(EXIT_QUALIFICATION);
+		if (!(vcpu->guest_debug &
+		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+			vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
+			kvm_queue_exception(vcpu, DB_VECTOR);
+			return 1;
+		}
+		kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
+		/* fall through */
+	case BP_VECTOR:
 		kvm_run->exit_reason = KVM_EXIT_DEBUG;
-		return 0;
+		kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+		kvm_run->debug.arch.exception = ex_no;
+		break;
+	default:
+		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+		kvm_run->ex.exception = ex_no;
+		kvm_run->ex.error_code = error_code;
+		break;
 	}
-	kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
-	kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
-	kvm_run->ex.error_code = error_code;
 	return 0;
 }
 
@@ -2677,7 +2692,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	unsigned long exit_qualification;
-	int size, down, in, string, rep;
+	int size, in, string;
 	unsigned port;
 
 	++vcpu->stat.io_exits;
@@ -2693,8 +2708,6 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	size = (exit_qualification & 7) + 1;
 	in = (exit_qualification & 8) != 0;
-	down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
-	rep = (exit_qualification & 32) != 0;
 	port = exit_qualification >> 16;
 
 	skip_emulated_instruction(vcpu);
@@ -2795,21 +2808,44 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	unsigned long val;
 	int dr, reg;
 
-	/*
-	 * FIXME: this code assumes the host is debugging the guest.
-	 *        need to deal with guest debugging itself too.
-	 */
+	dr = vmcs_readl(GUEST_DR7);
+	if (dr & DR7_GD) {
+		/*
+		 * As the vm-exit takes precedence over the debug trap, we
+		 * need to emulate the latter, either for the host or the
+		 * guest debugging itself.
+		 */
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+			kvm_run->debug.arch.dr6 = vcpu->arch.dr6;
+			kvm_run->debug.arch.dr7 = dr;
+			kvm_run->debug.arch.pc =
+				vmcs_readl(GUEST_CS_BASE) +
+				vmcs_readl(GUEST_RIP);
+			kvm_run->debug.arch.exception = DB_VECTOR;
+			kvm_run->exit_reason = KVM_EXIT_DEBUG;
+			return 0;
+		} else {
+			vcpu->arch.dr7 &= ~DR7_GD;
+			vcpu->arch.dr6 |= DR6_BD;
+			vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+			kvm_queue_exception(vcpu, DB_VECTOR);
+			return 1;
+		}
+	}
+
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	dr = exit_qualification & 7;
-	reg = (exit_qualification >> 8) & 15;
-	if (exit_qualification & 16) {
-		/* mov from dr */
+	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
+	if (exit_qualification & TYPE_MOV_FROM_DR) {
 		switch (dr) {
+		case 0 ... 3:
+			val = vcpu->arch.db[dr];
+			break;
 		case 6:
-			val = 0xffff0ff0;
+			val = vcpu->arch.dr6;
 			break;
 		case 7:
-			val = 0x400;
+			val = vcpu->arch.dr7;
 			break;
 		default:
 			val = 0;
@@ -2817,7 +2853,38 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		kvm_register_write(vcpu, reg, val);
 		KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
 	} else {
-		/* mov to dr */
+		val = vcpu->arch.regs[reg];
+		switch (dr) {
+		case 0 ... 3:
+			vcpu->arch.db[dr] = val;
+			if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+				vcpu->arch.eff_db[dr] = val;
+			break;
+		case 4 ... 5:
+			if (vcpu->arch.cr4 & X86_CR4_DE)
+				kvm_queue_exception(vcpu, UD_VECTOR);
+			break;
+		case 6:
+			if (val & 0xffffffff00000000ULL) {
+				kvm_queue_exception(vcpu, GP_VECTOR);
+				break;
+			}
+			vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+			break;
+		case 7:
+			if (val & 0xffffffff00000000ULL) {
+				kvm_queue_exception(vcpu, GP_VECTOR);
+				break;
+			}
+			vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+			if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+				vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+				vcpu->arch.switch_db_regs =
+					(val & DR7_BP_EN_MASK);
+			}
+			break;
+		}
+		KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
 	}
 	skip_emulated_instruction(vcpu);
 	return 1;
@@ -2968,17 +3035,25 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 	tss_selector = exit_qualification;
 
-	return kvm_task_switch(vcpu, tss_selector, reason);
+	if (!kvm_task_switch(vcpu, tss_selector, reason))
+		return 0;
+
+	/* clear all local breakpoint enable flags */
+	vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
+
+	/*
+	 * TODO: What about debug traps on tss switch?
+	 *       Are we supposed to inject them and update dr6?
+	 */
+
+	return 1;
 }
 
 static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	u64 exit_qualification;
-	enum emulation_result er;
 	gpa_t gpa;
-	unsigned long hva;
 	int gla_validity;
-	int r;
 
 	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
 
@@ -3001,32 +3076,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-	hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
-	if (!kvm_is_error_hva(hva)) {
-		r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
-		if (r < 0) {
-			printk(KERN_ERR "EPT: Not enough memory!\n");
-			return -ENOMEM;
-		}
-		return 1;
-	} else {
-		/* must be MMIO */
-		er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
-
-		if (er == EMULATE_FAIL) {
-			printk(KERN_ERR
-			 "EPT: Fail to handle EPT violation vmexit!er is %d\n",
-			 er);
-			printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
-			 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
-			 (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
-			printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
-				(long unsigned int)exit_qualification);
-			return -ENOTSUPP;
-		} else if (er == EMULATE_DO_MMIO)
-			return 0;
-	}
-	return 1;
+	return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
 }
 
 static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -3046,7 +3096,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 				struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int err;
+	enum emulation_result err = EMULATE_DONE;
 
 	preempt_enable();
 	local_irq_enable();
@@ -3071,10 +3121,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 	local_irq_disable();
 	preempt_disable();
 
-	/* Guest state should be valid now except if we need to
-	 * emulate an MMIO */
-	if (guest_state_valid(vcpu))
-		vmx->emulation_required = 0;
+	vmx->invalid_state_emulation_result = err;
 }
 
 /*
@@ -3123,8 +3170,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	/* If we need to emulate an MMIO from handle_invalid_guest_state
 	 * we just return 0 */
-	if (vmx->emulation_required && emulate_invalid_guest_state)
-		return 0;
+	if (vmx->emulation_required && emulate_invalid_guest_state) {
+		if (guest_state_valid(vcpu))
+			vmx->emulation_required = 0;
+		return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
+	}
 
 	/* Access CR3 don't cause VMExit in paging mode, so we need
 	 * to sync with guest real CR3. */
@@ -3238,7 +3288,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 			vmx->vcpu.arch.nmi_injected = false;
 	}
 	kvm_clear_exception_queue(&vmx->vcpu);
-	if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
+	if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION ||
+				type == INTR_TYPE_SOFT_EXCEPTION)) {
 		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
 			error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
 			kvm_queue_exception_e(&vmx->vcpu, vector, error);
@@ -3259,6 +3310,11 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 
 	vmx_update_window_states(vcpu);
 
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+				GUEST_INTR_STATE_STI |
+				GUEST_INTR_STATE_MOV_SS);
+
 	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
 		if (vcpu->arch.interrupt.pending) {
 			enable_nmi_window(vcpu);
@@ -3347,6 +3403,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	 */
 	vmcs_writel(HOST_CR0, read_cr0());
 
+	set_debugreg(vcpu->arch.dr6, 6);
+
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
@@ -3441,6 +3499,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
 	vcpu->arch.regs_dirty = 0;
 
+	get_debugreg(vcpu->arch.dr6, 6);
+
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
@@ -3595,7 +3655,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.vcpu_put = vmx_vcpu_put,
 
 	.set_guest_debug = set_guest_debug,
-	.guest_debug_pre = kvm_guest_debug_pre,
 	.get_msr = vmx_get_msr,
 	.set_msr = vmx_set_msr,
 	.get_segment_base = vmx_get_segment_base,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 758b7a155ae..8ca100a9eca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -36,6 +36,7 @@
 #include <linux/highmem.h>
 #include <linux/iommu.h>
 #include <linux/intel-iommu.h>
+#include <linux/cpufreq.h>
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
@@ -69,6 +70,8 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
 
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				    struct kvm_cpuid_entry2 __user *entries);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+					      u32 function, u32 index);
 
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -173,6 +176,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 			   u32 error_code)
 {
 	++vcpu->stat.pf_guest;
+
 	if (vcpu->arch.exception.pending) {
 		if (vcpu->arch.exception.nr == PF_VECTOR) {
 			printk(KERN_DEBUG "kvm: inject_page_fault:"
@@ -361,6 +365,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
+	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
 	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 }
@@ -442,6 +447,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
+static inline u32 bit(int bitno)
+{
+	return 1 << (bitno & 31);
+}
+
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -456,7 +466,7 @@ static u32 msrs_to_save[] = {
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
 	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-	MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
+	MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 };
 
 static unsigned num_msrs_to_save;
@@ -481,6 +491,28 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 		return;
 	}
 
+	if (efer & EFER_FFXSR) {
+		struct kvm_cpuid_entry2 *feat;
+
+		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
+			printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	}
+
+	if (efer & EFER_SVME) {
+		struct kvm_cpuid_entry2 *feat;
+
+		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
+			printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
+			kvm_inject_gp(vcpu, 0);
+			return;
+		}
+	}
+
 	kvm_x86_ops->set_efer(vcpu, efer);
 
 	efer &= ~EFER_LMA;
@@ -586,6 +618,8 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
 		 hv_clock->tsc_to_system_mul);
 }
 
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+
 static void kvm_write_guest_time(struct kvm_vcpu *v)
 {
 	struct timespec ts;
@@ -596,9 +630,9 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	if ((!vcpu->time_page))
 		return;
 
-	if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
-		kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
-		vcpu->hv_clock_tsc_khz = tsc_khz;
+	if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
+		kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
+		vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	}
 
 	/* Keep irq disabled to prevent changes to the clock */
@@ -629,6 +663,16 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 }
 
+static int kvm_request_guest_time_update(struct kvm_vcpu *v)
+{
+	struct kvm_vcpu_arch *vcpu = &v->arch;
+
+	if (!vcpu->time_page)
+		return 0;
+	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+	return 1;
+}
+
 static bool msr_mtrr_valid(unsigned msr)
 {
 	switch (msr) {
@@ -722,6 +766,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		break;
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_UCODE_WRITE:
+	case MSR_VM_HSAVE_PA:
 		break;
 	case 0x200 ... 0x2ff:
 		return set_msr_mtrr(vcpu, msr, data);
@@ -758,7 +803,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			vcpu->arch.time_page = NULL;
 		}
 
-		kvm_write_guest_time(vcpu);
+		kvm_request_guest_time_update(vcpu);
 		break;
 	}
 	default:
@@ -843,6 +888,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_LASTBRANCHTOIP:
 	case MSR_IA32_LASTINTFROMIP:
 	case MSR_IA32_LASTINTTOIP:
+	case MSR_VM_HSAVE_PA:
 		data = 0;
 		break;
 	case MSR_MTRRcap:
@@ -967,10 +1013,13 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
 	case KVM_CAP_SET_TSS_ADDR:
 	case KVM_CAP_EXT_CPUID:
+	case KVM_CAP_CLOCKSOURCE:
 	case KVM_CAP_PIT:
 	case KVM_CAP_NOP_IO_DELAY:
 	case KVM_CAP_MP_STATE:
 	case KVM_CAP_SYNC_MMU:
+	case KVM_CAP_REINJECT_CONTROL:
+	case KVM_CAP_IRQ_INJECT_STATUS:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -991,9 +1040,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_IOMMU:
 		r = iommu_found();
 		break;
-	case KVM_CAP_CLOCKSOURCE:
-		r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC);
-		break;
 	default:
 		r = 0;
 		break;
@@ -1044,7 +1090,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
 		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
 			goto out;
 		r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
-			cpuid_arg->entries);
+						      cpuid_arg->entries);
 		if (r)
 			goto out;
 
@@ -1064,7 +1110,7 @@ out:
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	kvm_write_guest_time(vcpu);
+	kvm_request_guest_time_update(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1142,8 +1188,8 @@ out:
 }
 
 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
-				    struct kvm_cpuid2 *cpuid,
-				    struct kvm_cpuid_entry2 __user *entries)
+				     struct kvm_cpuid2 *cpuid,
+				     struct kvm_cpuid_entry2 __user *entries)
 {
 	int r;
 
@@ -1162,8 +1208,8 @@ out:
 }
 
 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
-				    struct kvm_cpuid2 *cpuid,
-				    struct kvm_cpuid_entry2 __user *entries)
+				     struct kvm_cpuid2 *cpuid,
+				     struct kvm_cpuid_entry2 __user *entries)
 {
 	int r;
 
@@ -1172,7 +1218,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 		goto out;
 	r = -EFAULT;
 	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
-			   vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
 		goto out;
 	return 0;
 
@@ -1181,18 +1227,13 @@ out:
 	return r;
 }
 
-static inline u32 bit(int bitno)
-{
-	return 1 << (bitno & 31);
-}
-
 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-			  u32 index)
+			   u32 index)
 {
 	entry->function = function;
 	entry->index = index;
 	cpuid_count(entry->function, entry->index,
-		&entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
 	entry->flags = 0;
 }
 
@@ -1222,15 +1263,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 #ifdef CONFIG_X86_64
 		bit(X86_FEATURE_LM) |
 #endif
+		bit(X86_FEATURE_FXSR_OPT) |
 		bit(X86_FEATURE_MMXEXT) |
 		bit(X86_FEATURE_3DNOWEXT) |
 		bit(X86_FEATURE_3DNOW);
 	const u32 kvm_supported_word3_x86_features =
 		bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
 	const u32 kvm_supported_word6_x86_features =
-		bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
+		bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
+		bit(X86_FEATURE_SVM);
 
-	/* all func 2 cpuid_count() should be called on the same cpu */
+	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
 	do_cpuid_1_ent(entry, function, index);
 	++*nent;
@@ -1304,7 +1347,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 }
 
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-				    struct kvm_cpuid_entry2 __user *entries)
+				     struct kvm_cpuid_entry2 __user *entries)
 {
 	struct kvm_cpuid_entry2 *cpuid_entries;
 	int limit, nent = 0, r = -E2BIG;
@@ -1321,7 +1364,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 	limit = cpuid_entries[0].eax;
 	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
 		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-				&nent, cpuid->nent);
+			     &nent, cpuid->nent);
 	r = -E2BIG;
 	if (nent >= cpuid->nent)
 		goto out_free;
@@ -1330,10 +1373,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 	limit = cpuid_entries[nent - 1].eax;
 	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
 		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			       &nent, cpuid->nent);
+			     &nent, cpuid->nent);
 	r = -EFAULT;
 	if (copy_to_user(entries, cpuid_entries,
-			nent * sizeof(struct kvm_cpuid_entry2)))
+			 nent * sizeof(struct kvm_cpuid_entry2)))
 		goto out_free;
 	cpuid->nent = nent;
 	r = 0;
@@ -1477,7 +1520,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
 			goto out;
 		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
-				cpuid_arg->entries);
+					      cpuid_arg->entries);
 		if (r)
 			goto out;
 		break;
@@ -1490,7 +1533,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
 			goto out;
 		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
-				cpuid_arg->entries);
+					      cpuid_arg->entries);
 		if (r)
 			goto out;
 		r = -EFAULT;
@@ -1710,6 +1753,15 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 	return r;
 }
 
+static int kvm_vm_ioctl_reinject(struct kvm *kvm,
+				 struct kvm_reinject_control *control)
+{
+	if (!kvm->arch.vpit)
+		return -ENXIO;
+	kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+	return 0;
+}
+
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
@@ -1807,13 +1859,26 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			}
 		} else
 			goto out;
+		r = kvm_setup_default_irq_routing(kvm);
+		if (r) {
+			kfree(kvm->arch.vpic);
+			kfree(kvm->arch.vioapic);
+			goto out;
+		}
 		break;
 	case KVM_CREATE_PIT:
+		mutex_lock(&kvm->lock);
+		r = -EEXIST;
+		if (kvm->arch.vpit)
+			goto create_pit_unlock;
 		r = -ENOMEM;
 		kvm->arch.vpit = kvm_create_pit(kvm);
 		if (kvm->arch.vpit)
 			r = 0;
+	create_pit_unlock:
+		mutex_unlock(&kvm->lock);
 		break;
+	case KVM_IRQ_LINE_STATUS:
 	case KVM_IRQ_LINE: {
 		struct kvm_irq_level irq_event;
 
@@ -1821,10 +1886,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (copy_from_user(&irq_event, argp, sizeof irq_event))
 			goto out;
 		if (irqchip_in_kernel(kvm)) {
+			__s32 status;
 			mutex_lock(&kvm->lock);
-			kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-				    irq_event.irq, irq_event.level);
+			status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+					irq_event.irq, irq_event.level);
 			mutex_unlock(&kvm->lock);
+			if (ioctl == KVM_IRQ_LINE_STATUS) {
+				irq_event.status = status;
+				if (copy_to_user(argp, &irq_event,
+							sizeof irq_event))
+					goto out;
+			}
 			r = 0;
 		}
 		break;
@@ -1907,6 +1979,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_REINJECT_CONTROL: {
+		struct kvm_reinject_control control;
+		r =  -EFAULT;
+		if (copy_from_user(&control, argp, sizeof(control)))
+			goto out;
+		r = kvm_vm_ioctl_reinject(kvm, &control);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		;
 	}
@@ -1960,10 +2043,38 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
 	return dev;
 }
 
-int emulator_read_std(unsigned long addr,
-			     void *val,
-			     unsigned int bytes,
-			     struct kvm_vcpu *vcpu)
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			       struct kvm_vcpu *vcpu)
+{
+	void *data = val;
+	int r = X86EMUL_CONTINUE;
+
+	while (bytes) {
+		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+		unsigned offset = addr & (PAGE_SIZE-1);
+		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
+		int ret;
+
+		if (gpa == UNMAPPED_GVA) {
+			r = X86EMUL_PROPAGATE_FAULT;
+			goto out;
+		}
+		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
+		if (ret < 0) {
+			r = X86EMUL_UNHANDLEABLE;
+			goto out;
+		}
+
+		bytes -= toread;
+		data += toread;
+		addr += toread;
+	}
+out:
+	return r;
+}
+
+static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+				struct kvm_vcpu *vcpu)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
@@ -1971,27 +2082,27 @@ int emulator_read_std(unsigned long addr,
 	while (bytes) {
 		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
 		unsigned offset = addr & (PAGE_SIZE-1);
-		unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
+		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
 
 		if (gpa == UNMAPPED_GVA) {
 			r = X86EMUL_PROPAGATE_FAULT;
 			goto out;
 		}
-		ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
+		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
 		if (ret < 0) {
 			r = X86EMUL_UNHANDLEABLE;
 			goto out;
 		}
 
-		bytes -= tocopy;
-		data += tocopy;
-		addr += tocopy;
+		bytes -= towrite;
+		data += towrite;
+		addr += towrite;
 	}
 out:
 	return r;
 }
-EXPORT_SYMBOL_GPL(emulator_read_std);
+
 
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
@@ -2013,8 +2124,8 @@ static int emulator_read_emulated(unsigned long addr,
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 		goto mmio;
 
-	if (emulator_read_std(addr, val, bytes, vcpu)
-			== X86EMUL_CONTINUE)
+	if (kvm_read_guest_virt(addr, val, bytes, vcpu)
+				== X86EMUL_CONTINUE)
 		return X86EMUL_CONTINUE;
 	if (gpa == UNMAPPED_GVA)
 		return X86EMUL_PROPAGATE_FAULT;
@@ -2217,7 +2328,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 
 	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
 
-	emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
+	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
 
 	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
 	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -2225,7 +2336,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
 static struct x86_emulate_ops emulate_ops = {
-	.read_std            = emulator_read_std,
+	.read_std            = kvm_read_guest_virt,
 	.read_emulated       = emulator_read_emulated,
 	.write_emulated      = emulator_write_emulated,
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
@@ -2327,40 +2438,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
 
-static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
-		if (vcpu->arch.pio.guest_pages[i]) {
-			kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
-			vcpu->arch.pio.guest_pages[i] = NULL;
-		}
-}
-
 static int pio_copy_data(struct kvm_vcpu *vcpu)
 {
 	void *p = vcpu->arch.pio_data;
-	void *q;
+	gva_t q = vcpu->arch.pio.guest_gva;
 	unsigned bytes;
-	int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
+	int ret;
 
-	q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
-		 PAGE_KERNEL);
-	if (!q) {
-		free_pio_guest_pages(vcpu);
-		return -ENOMEM;
-	}
-	q += vcpu->arch.pio.guest_page_offset;
 	bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
 	if (vcpu->arch.pio.in)
-		memcpy(q, p, bytes);
+		ret = kvm_write_guest_virt(q, p, bytes, vcpu);
 	else
-		memcpy(p, q, bytes);
-	q -= vcpu->arch.pio.guest_page_offset;
-	vunmap(q);
-	free_pio_guest_pages(vcpu);
-	return 0;
+		ret = kvm_read_guest_virt(q, p, bytes, vcpu);
+	return ret;
 }
 
 int complete_pio(struct kvm_vcpu *vcpu)
@@ -2471,7 +2561,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	vcpu->arch.pio.in = in;
 	vcpu->arch.pio.string = 0;
 	vcpu->arch.pio.down = 0;
-	vcpu->arch.pio.guest_page_offset = 0;
 	vcpu->arch.pio.rep = 0;
 
 	if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
@@ -2499,9 +2588,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 		  gva_t address, int rep, unsigned port)
 {
 	unsigned now, in_page;
-	int i, ret = 0;
-	int nr_pages = 1;
-	struct page *page;
+	int ret = 0;
 	struct kvm_io_device *pio_dev;
 
 	vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2513,7 +2600,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	vcpu->arch.pio.in = in;
 	vcpu->arch.pio.string = 1;
 	vcpu->arch.pio.down = down;
-	vcpu->arch.pio.guest_page_offset = offset_in_page(address);
 	vcpu->arch.pio.rep = rep;
 
 	if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
@@ -2533,15 +2619,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	else
 		in_page = offset_in_page(address) + size;
 	now = min(count, (unsigned long)in_page / size);
-	if (!now) {
-		/*
-		 * String I/O straddles page boundary.  Pin two guest pages
-		 * so that we satisfy atomicity constraints.  Do just one
-		 * transaction to avoid complexity.
-		 */
-		nr_pages = 2;
+	if (!now)
 		now = 1;
-	}
 	if (down) {
 		/*
 		 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
@@ -2556,15 +2635,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
 		kvm_x86_ops->skip_emulated_instruction(vcpu);
 
-	for (i = 0; i < nr_pages; ++i) {
-		page = gva_to_page(vcpu, address + i * PAGE_SIZE);
-		vcpu->arch.pio.guest_pages[i] = page;
-		if (!page) {
-			kvm_inject_gp(vcpu, 0);
-			free_pio_guest_pages(vcpu);
-			return 1;
-		}
-	}
+	vcpu->arch.pio.guest_gva = address;
 
 	pio_dev = vcpu_find_pio_dev(vcpu, port,
 				    vcpu->arch.pio.cur_count,
@@ -2572,7 +2643,11 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	if (!vcpu->arch.pio.in) {
 		/* string PIO write */
 		ret = pio_copy_data(vcpu);
-		if (ret >= 0 && pio_dev) {
+		if (ret == X86EMUL_PROPAGATE_FAULT) {
+			kvm_inject_gp(vcpu, 0);
+			return 1;
+		}
+		if (ret == 0 && pio_dev) {
 			pio_string_write(pio_dev, vcpu);
 			complete_pio(vcpu);
 			if (vcpu->arch.pio.count == 0)
@@ -2587,9 +2662,72 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
 
+static void bounce_off(void *info)
+{
+	/* nothing */
+}
+
+static unsigned int  ref_freq;
+static unsigned long tsc_khz_ref;
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+				     void *data)
+{
+	struct cpufreq_freqs *freq = data;
+	struct kvm *kvm;
+	struct kvm_vcpu *vcpu;
+	int i, send_ipi = 0;
+
+	if (!ref_freq)
+		ref_freq = freq->old;
+
+	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+		return 0;
+	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+		return 0;
+	per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+
+	spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			if (!vcpu)
+				continue;
+			if (vcpu->cpu != freq->cpu)
+				continue;
+			if (!kvm_request_guest_time_update(vcpu))
+				continue;
+			if (vcpu->cpu != smp_processor_id())
+				send_ipi++;
+		}
+	}
+	spin_unlock(&kvm_lock);
+
+	if (freq->old < freq->new && send_ipi) {
+		/*
+		 * We upscale the frequency.  Must make the guest
+		 * doesn't see old kvmclock values while running with
+		 * the new frequency, otherwise we risk the guest sees
+		 * time go backwards.
+		 *
+		 * In case we update the frequency for another cpu
+		 * (which might be in guest context) send an interrupt
+		 * to kick the cpu out of guest context.  Next time
+		 * guest context is entered kvmclock will be updated,
+		 * so the guest will not see stale values.
+		 */
+		smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+	}
+	return 0;
+}
+
+static struct notifier_block kvmclock_cpufreq_notifier_block = {
+        .notifier_call  = kvmclock_cpufreq_notifier
+};
+
 int kvm_arch_init(void *opaque)
 {
-	int r;
+	int r, cpu;
 	struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
 
 	if (kvm_x86_ops) {
@@ -2620,6 +2758,15 @@ int kvm_arch_init(void *opaque)
 	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
+
+	for_each_possible_cpu(cpu)
+		per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+		tsc_khz_ref = tsc_khz;
+		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+					  CPUFREQ_TRANSITION_NOTIFIER);
+	}
+
 	return 0;
 
 out:
@@ -2827,25 +2974,20 @@ static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
 	if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
 		return 0;
 	if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
-		!(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+	    !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
 		return 0;
 	return 1;
 }
 
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+					      u32 function, u32 index)
 {
 	int i;
-	u32 function, index;
-	struct kvm_cpuid_entry2 *e, *best;
+	struct kvm_cpuid_entry2 *best = NULL;
 
-	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
-	best = NULL;
 	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+		struct kvm_cpuid_entry2 *e;
+
 		e = &vcpu->arch.cpuid_entries[i];
 		if (is_matching_cpuid_entry(e, function, index)) {
 			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
@@ -2860,6 +3002,21 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 			if (!best || e->function > best->function)
 				best = e;
 	}
+	return best;
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+	u32 function, index;
+	struct kvm_cpuid_entry2 *best;
+
+	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
+	best = kvm_find_cpuid_entry(vcpu, function, index);
 	if (best) {
 		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
 		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
@@ -2945,6 +3102,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->requests) {
 		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
 			__kvm_migrate_timers(vcpu);
+		if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+			kvm_write_guest_time(vcpu);
 		if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
 			kvm_mmu_sync_roots(vcpu);
 		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
@@ -2979,9 +3138,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		goto out;
 	}
 
-	if (vcpu->guest_debug.enabled)
-		kvm_x86_ops->guest_debug_pre(vcpu);
-
 	vcpu->guest_mode = 1;
 	/*
 	 * Make sure that guest_mode assignment won't happen after
@@ -3002,10 +3158,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	kvm_guest_enter();
 
+	get_debugreg(vcpu->arch.host_dr6, 6);
+	get_debugreg(vcpu->arch.host_dr7, 7);
+	if (unlikely(vcpu->arch.switch_db_regs)) {
+		get_debugreg(vcpu->arch.host_db[0], 0);
+		get_debugreg(vcpu->arch.host_db[1], 1);
+		get_debugreg(vcpu->arch.host_db[2], 2);
+		get_debugreg(vcpu->arch.host_db[3], 3);
+
+		set_debugreg(0, 7);
+		set_debugreg(vcpu->arch.eff_db[0], 0);
+		set_debugreg(vcpu->arch.eff_db[1], 1);
+		set_debugreg(vcpu->arch.eff_db[2], 2);
+		set_debugreg(vcpu->arch.eff_db[3], 3);
+	}
 
 	KVMTRACE_0D(VMENTRY, vcpu, entryexit);
 	kvm_x86_ops->run(vcpu, kvm_run);
 
+	if (unlikely(vcpu->arch.switch_db_regs)) {
+		set_debugreg(0, 7);
+		set_debugreg(vcpu->arch.host_db[0], 0);
+		set_debugreg(vcpu->arch.host_db[1], 1);
+		set_debugreg(vcpu->arch.host_db[2], 2);
+		set_debugreg(vcpu->arch.host_db[3], 3);
+	}
+	set_debugreg(vcpu->arch.host_dr6, 6);
+	set_debugreg(vcpu->arch.host_dr7, 7);
+
 	vcpu->guest_mode = 0;
 	local_irq_enable();
 
@@ -3192,7 +3372,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	/*
 	 * Don't leak debug flags in case they were set for guest debugging
 	 */
-	if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
 
 	vcpu_put(vcpu);
@@ -3811,15 +3991,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
-				    struct kvm_debug_guest *dbg)
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					struct kvm_guest_debug *dbg)
 {
-	int r;
+	int i, r;
 
 	vcpu_load(vcpu);
 
+	if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
+	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
+		for (i = 0; i < KVM_NR_DB_REGS; ++i)
+			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
+		vcpu->arch.switch_db_regs =
+			(dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
+	} else {
+		for (i = 0; i < KVM_NR_DB_REGS; i++)
+			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+		vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
+	}
+
 	r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
 
+	if (dbg->control & KVM_GUESTDBG_INJECT_DB)
+		kvm_queue_exception(vcpu, DB_VECTOR);
+	else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
+		kvm_queue_exception(vcpu, BP_VECTOR);
+
 	vcpu_put(vcpu);
 
 	return r;
@@ -4007,6 +4204,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.nmi_pending = false;
 	vcpu->arch.nmi_injected = false;
 
+	vcpu->arch.switch_db_regs = 0;
+	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+	vcpu->arch.dr6 = DR6_FIXED_1;
+	vcpu->arch.dr7 = DR7_FIXED_1;
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -4100,6 +4302,8 @@ struct  kvm *kvm_arch_create_vm(void)
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
 	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
 
+	rdtscll(kvm->arch.vm_init_tsc);
+
 	return kvm;
 }
 
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d174db7a337..ca91749d208 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -178,7 +178,7 @@ static u32 opcode_table[256] = {
 	0, ImplicitOps | Stack, 0, 0,
 	ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
 	/* 0xC8 - 0xCF */
-	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0,
 	/* 0xD0 - 0xD7 */
 	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
 	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -1136,18 +1136,19 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
-		       struct x86_emulate_ops *ops)
+		       struct x86_emulate_ops *ops,
+		       void *dest, int len)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
 	rc = ops->read_emulated(register_address(c, ss_base(ctxt),
 						 c->regs[VCPU_REGS_RSP]),
-				&c->src.val, c->src.bytes, ctxt->vcpu);
+				dest, len, ctxt->vcpu);
 	if (rc != 0)
 		return rc;
 
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
+	register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
 	return rc;
 }
 
@@ -1157,11 +1158,9 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	c->src.bytes = c->dst.bytes;
-	rc = emulate_pop(ctxt, ops);
+	rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
 	if (rc != 0)
 		return rc;
-	c->dst.val = c->src.val;
 	return 0;
 }
 
@@ -1279,6 +1278,25 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 	return 0;
 }
 
+static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
+			   struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
+	unsigned long cs;
+
+	rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
+	if (rc)
+		return rc;
+	if (c->op_bytes == 4)
+		c->eip = (u32)c->eip;
+	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+	if (rc)
+		return rc;
+	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS);
+	return rc;
+}
+
 static inline int writeback(struct x86_emulate_ctxt *ctxt,
 			    struct x86_emulate_ops *ops)
 {
@@ -1467,11 +1485,9 @@ special_insn:
 		break;
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
-		c->src.bytes = c->op_bytes;
-		rc = emulate_pop(ctxt, ops);
+		rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
 		if (rc != 0)
 			goto done;
-		c->dst.val = c->src.val;
 		break;
 	case 0x63:		/* movsxd */
 		if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1738,6 +1754,11 @@ special_insn:
 	mov:
 		c->dst.val = c->src.val;
 		break;
+	case 0xcb:		/* ret far */
+		rc = emulate_ret_far(ctxt, ops);
+		if (rc)
+			goto done;
+		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
 		c->src.val = 1;
 		emulate_grp2(ctxt);
@@ -1908,11 +1929,16 @@ twobyte_insn:
 			c->dst.type = OP_NONE;
 			break;
 		case 3: /* lidt/vmmcall */
-			if (c->modrm_mod == 3 && c->modrm_rm == 1) {
-				rc = kvm_fix_hypercall(ctxt->vcpu);
-				if (rc)
-					goto done;
-				kvm_emulate_hypercall(ctxt->vcpu);
+			if (c->modrm_mod == 3) {
+				switch (c->modrm_rm) {
+				case 1:
+					rc = kvm_fix_hypercall(ctxt->vcpu);
+					if (rc)
+						goto done;
+					break;
+				default:
+					goto cannot_emulate;
+				}
 			} else {
 				rc = read_descriptor(ctxt, ops, c->src.ptr,
 						     &size, &address,
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index f3a5305b8ad..e94a11e42f9 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -107,7 +107,7 @@ static void async_hcall(unsigned long call, unsigned long arg1,
 	local_irq_save(flags);
 	if (lguest_data.hcall_status[next_call] != 0xFF) {
 		/* Table full, so do normal hcall which will flush table. */
-		hcall(call, arg1, arg2, arg3);
+		kvm_hypercall3(call, arg1, arg2, arg3);
 	} else {
 		lguest_data.hcalls[next_call].arg0 = call;
 		lguest_data.hcalls[next_call].arg1 = arg1;
@@ -134,13 +134,32 @@ static void async_hcall(unsigned long call, unsigned long arg1,
  *
  * So, when we're in lazy mode, we call async_hcall() to store the call for
  * future processing: */
-static void lazy_hcall(unsigned long call,
+static void lazy_hcall1(unsigned long call,
+		       unsigned long arg1)
+{
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+		kvm_hypercall1(call, arg1);
+	else
+		async_hcall(call, arg1, 0, 0);
+}
+
+static void lazy_hcall2(unsigned long call,
+		       unsigned long arg1,
+		       unsigned long arg2)
+{
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+		kvm_hypercall2(call, arg1, arg2);
+	else
+		async_hcall(call, arg1, arg2, 0);
+}
+
+static void lazy_hcall3(unsigned long call,
 		       unsigned long arg1,
 		       unsigned long arg2,
 		       unsigned long arg3)
 {
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, arg3);
+		kvm_hypercall3(call, arg1, arg2, arg3);
 	else
 		async_hcall(call, arg1, arg2, arg3);
 }
@@ -150,7 +169,7 @@ static void lazy_hcall(unsigned long call,
 static void lguest_leave_lazy_mode(void)
 {
 	paravirt_leave_lazy(paravirt_get_lazy_mode());
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+	kvm_hypercall0(LHCALL_FLUSH_ASYNC);
 }
 
 /*G:033
@@ -229,7 +248,7 @@ static void lguest_write_idt_entry(gate_desc *dt,
 	/* Keep the local copy up to date. */
 	native_write_idt_entry(dt, entrynum, g);
 	/* Tell Host about this new entry. */
-	hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
+	kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
 }
 
 /* Changing to a different IDT is very rare: we keep the IDT up-to-date every
@@ -241,7 +260,7 @@ static void lguest_load_idt(const struct desc_ptr *desc)
 	struct desc_struct *idt = (void *)desc->address;
 
 	for (i = 0; i < (desc->size+1)/8; i++)
-		hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+		kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
 }
 
 /*
@@ -261,8 +280,8 @@ static void lguest_load_idt(const struct desc_ptr *desc)
  */
 static void lguest_load_gdt(const struct desc_ptr *desc)
 {
-	BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
-	hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
+	BUG_ON((desc->size + 1) / 8 != GDT_ENTRIES);
+	kvm_hypercall2(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES);
 }
 
 /* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
@@ -272,7 +291,7 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
 				   const void *desc, int type)
 {
 	native_write_gdt_entry(dt, entrynum, desc, type);
-	hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+	kvm_hypercall2(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES);
 }
 
 /* OK, I lied.  There are three "thread local storage" GDT entries which change
@@ -284,7 +303,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
 	 * can't handle us removing entries we're currently using.  So we clear
 	 * the GS register here: if it's needed it'll be reloaded anyway. */
 	lazy_load_gs(0);
-	lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+	lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
 }
 
 /*G:038 That's enough excitement for now, back to ploughing through each of
@@ -348,6 +367,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 		 * flush_tlb_user() for both user and kernel mappings unless
 		 * the Page Global Enable (PGE) feature bit is set. */
 		*dx |= 0x00002000;
+		/* We also lie, and say we're family id 5.  6 or greater
+		 * leads to a rdmsr in early_init_intel which we can't handle.
+		 * Family ID is returned as bits 8-12 in ax. */
+		*ax &= 0xFFFFF0FF;
+		*ax |= 0x00000500;
 		break;
 	case 0x80000000:
 		/* Futureproof this a little: if they ask how much extended
@@ -377,7 +401,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 static unsigned long current_cr0;
 static void lguest_write_cr0(unsigned long val)
 {
-	lazy_hcall(LHCALL_TS, val & X86_CR0_TS, 0, 0);
+	lazy_hcall1(LHCALL_TS, val & X86_CR0_TS);
 	current_cr0 = val;
 }
 
@@ -391,7 +415,7 @@ static unsigned long lguest_read_cr0(void)
  * the vowels have been optimized out. */
 static void lguest_clts(void)
 {
-	lazy_hcall(LHCALL_TS, 0, 0, 0);
+	lazy_hcall1(LHCALL_TS, 0);
 	current_cr0 &= ~X86_CR0_TS;
 }
 
@@ -413,7 +437,7 @@ static bool cr3_changed = false;
 static void lguest_write_cr3(unsigned long cr3)
 {
 	lguest_data.pgdir = cr3;
-	lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
 	cr3_changed = true;
 }
 
@@ -485,11 +509,17 @@ static void lguest_write_cr4(unsigned long val)
  * into a process' address space.  We set the entry then tell the Host the
  * toplevel and address this corresponds to.  The Guest uses one pagetable per
  * process, so we need to tell the Host which one we're changing (mm->pgd). */
+static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
+			       pte_t *ptep)
+{
+	lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+}
+
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pteval)
 {
 	*ptep = pteval;
-	lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
+	lguest_pte_update(mm, addr, ptep);
 }
 
 /* The Guest calls this to set a top-level entry.  Again, we set the entry then
@@ -498,8 +528,8 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 	*pmdp = pmdval;
-	lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
-		   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
+	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
+		   (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
 }
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
@@ -515,7 +545,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 {
 	*ptep = pteval;
 	if (cr3_changed)
-		lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
 /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
@@ -531,7 +561,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 static void lguest_flush_tlb_single(unsigned long addr)
 {
 	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+	lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
 }
 
 /* This is what happens after the Guest has removed a large number of entries.
@@ -539,7 +569,7 @@ static void lguest_flush_tlb_single(unsigned long addr)
  * have changed, ie. virtual addresses below PAGE_OFFSET. */
 static void lguest_flush_tlb_user(void)
 {
-	lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+	lazy_hcall1(LHCALL_FLUSH_TLB, 0);
 }
 
 /* This is called when the kernel page tables have changed.  That's not very
@@ -547,7 +577,7 @@ static void lguest_flush_tlb_user(void)
  * slow), so it's worth separating this from the user flushing above. */
 static void lguest_flush_tlb_kernel(void)
 {
-	lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+	lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
 /*
@@ -594,19 +624,21 @@ static void __init lguest_init_IRQ(void)
 		/* Some systems map "vectors" to interrupts weirdly.  Lguest has
 		 * a straightforward 1 to 1 mapping, so force that here. */
 		__get_cpu_var(vector_irq)[vector] = i;
-		if (vector != SYSCALL_VECTOR) {
-			set_intr_gate(vector,
-				      interrupt[vector-FIRST_EXTERNAL_VECTOR]);
-			set_irq_chip_and_handler_name(i, &lguest_irq_controller,
-						      handle_level_irq,
-						      "level");
-		}
+		if (vector != SYSCALL_VECTOR)
+			set_intr_gate(vector, interrupt[i]);
 	}
 	/* This call is required to set up for 4k stacks, where we have
 	 * separate stacks for hard and soft interrupts. */
 	irq_ctx_init(smp_processor_id());
 }
 
+void lguest_setup_irq(unsigned int irq)
+{
+	irq_to_desc_alloc_cpu(irq, 0);
+	set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
+				      handle_level_irq, "level");
+}
+
 /*
  * Time.
  *
@@ -682,7 +714,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
 	}
 
 	/* Please wake us this far in the future. */
-	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
+	kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta);
 	return 0;
 }
 
@@ -693,7 +725,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode,
 	case CLOCK_EVT_MODE_UNUSED:
 	case CLOCK_EVT_MODE_SHUTDOWN:
 		/* A 0 argument shuts the clock down. */
-		hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);
+		kvm_hypercall0(LHCALL_SET_CLOCKEVENT);
 		break;
 	case CLOCK_EVT_MODE_ONESHOT:
 		/* This is what we expect. */
@@ -768,8 +800,8 @@ static void lguest_time_init(void)
 static void lguest_load_sp0(struct tss_struct *tss,
 			    struct thread_struct *thread)
 {
-	lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0,
-		   THREAD_SIZE/PAGE_SIZE);
+	lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
+		   THREAD_SIZE / PAGE_SIZE);
 }
 
 /* Let's just say, I wouldn't do debugging under a Guest. */
@@ -842,7 +874,7 @@ static void set_lguest_basic_apic_ops(void)
 /* STOP!  Until an interrupt comes in. */
 static void lguest_safe_halt(void)
 {
-	hcall(LHCALL_HALT, 0, 0, 0);
+	kvm_hypercall0(LHCALL_HALT);
 }
 
 /* The SHUTDOWN hypercall takes a string to describe what's happening, and
@@ -852,7 +884,8 @@ static void lguest_safe_halt(void)
  * rather than virtual addresses, so we use __pa() here. */
 static void lguest_power_off(void)
 {
-	hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0);
+	kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
+					LGUEST_SHUTDOWN_POWEROFF);
 }
 
 /*
@@ -862,7 +895,7 @@ static void lguest_power_off(void)
  */
 static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
 {
-	hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0);
+	kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF);
 	/* The hcall won't return, but to keep gcc happy, we're "done". */
 	return NOTIFY_DONE;
 }
@@ -903,7 +936,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
 		len = sizeof(scratch) - 1;
 	scratch[len] = '\0';
 	memcpy(scratch, buf, len);
-	hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0);
+	kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch));
 
 	/* This routine returns the number of bytes actually written. */
 	return len;
@@ -913,7 +946,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
  * Launcher to reboot us. */
 static void lguest_restart(char *reason)
 {
-	hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
+	kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
 }
 
 /*G:050
@@ -1033,6 +1066,8 @@ __init void lguest_init(void)
 	pv_mmu_ops.read_cr3 = lguest_read_cr3;
 	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
 	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+	pv_mmu_ops.pte_update = lguest_pte_update;
+	pv_mmu_ops.pte_update_defer = lguest_pte_update;
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* apic read/write intercepts */
@@ -1051,14 +1086,6 @@ __init void lguest_init(void)
 	 * lguest_init() where the rest of the fairly chaotic boot setup
 	 * occurs. */
 
-	/* The native boot code sets up initial page tables immediately after
-	 * the kernel itself, and sets init_pg_tables_end so they're not
-	 * clobbered.  The Launcher places our initial pagetables somewhere at
-	 * the top of our physical memory, so we don't need extra space: set
-	 * init_pg_tables_end to the end of the kernel. */
-	init_pg_tables_start = __pa(pg0);
-	init_pg_tables_end = __pa(pg0);
-
 	/* As described in head_32.S, we map the first 128M of memory. */
 	max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
 
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 10b9bd35a8f..f7954198947 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -27,8 +27,8 @@ ENTRY(lguest_entry)
 	/* We make the "initialization" hypercall now to tell the Host about
 	 * us, and also find out where it put our page tables. */
 	movl $LHCALL_LGUEST_INIT, %eax
-	movl $lguest_data - __PAGE_OFFSET, %edx
-	int $LGUEST_TRAP_ENTRY
+	movl $lguest_data - __PAGE_OFFSET, %ebx
+	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
 
 	/* Set up the initial stack so we can run C code. */
 	movl $(init_thread_union+THREAD_SIZE),%esp
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c22981fa2f3..ad5441ed1b5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
 #include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
 
 /*
  * memcpy - Copy a memory block.
  *
- * Input:	
- * rdi destination
- * rsi source
- * rdx count
- * 
+ * Input:
+ *  rdi destination
+ *  rsi source
+ *  rdx count
+ *
  * Output:
  * rax original destination
- */	
+ */
 
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
 	ALIGN
 memcpy_c:
 	CFI_STARTPROC
-	movq %rdi,%rax
-	movl %edx,%ecx
-	shrl $3,%ecx
-	andl $7,%edx
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	shrl $3, %ecx
+	andl $7, %edx
 	rep movsq
-	movl %edx,%ecx
+	movl %edx, %ecx
 	rep movsb
 	ret
 	CFI_ENDPROC
@@ -33,99 +41,110 @@ ENDPROC(memcpy_c)
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
-	pushq %rbx
-	CFI_ADJUST_CFA_OFFSET 8
-	CFI_REL_OFFSET rbx, 0
-	movq %rdi,%rax
 
-	movl %edx,%ecx
-	shrl $6,%ecx
+	/*
+	 * Put the number of full 64-byte blocks into %ecx.
+	 * Tail portion is handled at the end:
+	 */
+	movq %rdi, %rax
+	movl %edx, %ecx
+	shrl   $6, %ecx
 	jz .Lhandle_tail
 
 	.p2align 4
 .Lloop_64:
+	/*
+	 * We decrement the loop index here - and the zero-flag is
+	 * checked at the end of the loop (instructions inbetween do
+	 * not change the zero flag):
+	 */
 	decl %ecx
 
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
+	/*
+	 * Move in blocks of 4x16 bytes:
+	 */
+	movq 0*8(%rsi),		%r11
+	movq 1*8(%rsi),		%r8
+	movq %r11,		0*8(%rdi)
+	movq %r8,		1*8(%rdi)
 
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
+	movq 2*8(%rsi),		%r9
+	movq 3*8(%rsi),		%r10
+	movq %r9,		2*8(%rdi)
+	movq %r10,		3*8(%rdi)
 
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
+	movq 4*8(%rsi),		%r11
+	movq 5*8(%rsi),		%r8
+	movq %r11,		4*8(%rdi)
+	movq %r8,		5*8(%rdi)
 
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
+	movq 6*8(%rsi),		%r9
+	movq 7*8(%rsi),		%r10
+	movq %r9,		6*8(%rdi)
+	movq %r10,		7*8(%rdi)
 
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
+	leaq 64(%rsi), %rsi
+	leaq 64(%rdi), %rdi
 
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
 	jnz  .Lloop_64
 
 .Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
+	movl %edx, %ecx
+	andl  $63, %ecx
+	shrl   $3, %ecx
 	jz   .Lhandle_7
+
 	.p2align 4
 .Lloop_8:
 	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi)
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
+	movq (%rsi),		%r8
+	movq %r8,		(%rdi)
+	leaq 8(%rdi),		%rdi
+	leaq 8(%rsi),		%rsi
 	jnz  .Lloop_8
 
 .Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
+	movl %edx, %ecx
+	andl $7, %ecx
+	jz .Lend
+
 	.p2align 4
 .Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi)
+	movb (%rsi), %r8b
+	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
 	decl %ecx
 	jnz .Lloop_1
 
-.Lende:
-	popq %rbx
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE rbx
+.Lend:
 	ret
-.Lfinal:
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
-	/* Some CPUs run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
+	/*
+	 * Some CPUs run faster using the string copy instructions.
+	 * It is also a lot simpler. Use this when possible:
+	 */
 
-	.section .altinstr_replacement,"ax"
+	.section .altinstr_replacement, "ax"
 1:	.byte 0xeb				/* jmp <disp8> */
 	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
 2:
 	.previous
-	.section .altinstructions,"a"
+
+	.section .altinstructions, "a"
 	.align 8
 	.quad memcpy
 	.quad 1b
 	.byte X86_FEATURE_REP_GOOD
-	/* Replace only beginning, memcpy is used to apply alternatives, so it
-	 * is silly to overwrite itself with nops - reboot is only outcome... */
+
+	/*
+	 * Replace only beginning, memcpy is used to apply alternatives,
+	 * so it is silly to overwrite itself with nops - reboot is the
+	 * only outcome...
+	 */
 	.byte 2b - 1b
 	.byte 2b - 1b
 	.previous
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 491e737ce54..aa098708877 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,20 +30,29 @@ static void fclex(void)
 }
 
 /* Needs to be externally visible */
-void finit(void)
+void finit_task(struct task_struct *tsk)
 {
-	control_word = 0x037f;
-	partial_status = 0;
-	top = 0;		/* We don't keep top in the status word internally. */
-	fpu_tag_word = 0xffff;
+	struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
+	struct address *oaddr, *iaddr;
+	soft->cwd = 0x037f;
+	soft->swd = 0;
+	soft->ftop = 0;	/* We don't keep top in the status word internally. */
+	soft->twd = 0xffff;
 	/* The behaviour is different from that detailed in
 	   Section 15.1.6 of the Intel manual */
-	operand_address.offset = 0;
-	operand_address.selector = 0;
-	instruction_address.offset = 0;
-	instruction_address.selector = 0;
-	instruction_address.opcode = 0;
-	no_ip_update = 1;
+	oaddr = (struct address *)&soft->foo;
+	oaddr->offset = 0;
+	oaddr->selector = 0;
+	iaddr = (struct address *)&soft->fip;
+	iaddr->offset = 0;
+	iaddr->selector = 0;
+	iaddr->opcode = 0;
+	soft->no_update = 1;
+}
+
+void finit(void)
+{
+	finit_task(current);
 }
 
 /*
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 2b938a38491..fdd30d08ab5 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,4 +1,4 @@
-obj-y	:=  init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
+obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o gup.o
 
 obj-$(CONFIG_SMP)		+= tlb.o
@@ -14,7 +14,7 @@ obj-$(CONFIG_MMIOTRACE)		+= mmiotrace.o
 mmiotrace-y			:= kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
 
-obj-$(CONFIG_NUMA)		+= numa_$(BITS).o
+obj-$(CONFIG_NUMA)		+= numa.o numa_$(BITS).o
 obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
 
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index bcc079c282d..8126e8d1a2a 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,5 +1,6 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
+#include <linux/swap.h> /* for totalram_pages */
 
 void *kmap(struct page *page)
 {
@@ -18,49 +19,6 @@ void kunmap(struct page *page)
 	kunmap_high(page);
 }
 
-static void debug_kmap_atomic_prot(enum km_type type)
-{
-#ifdef CONFIG_DEBUG_HIGHMEM
-	static unsigned warn_count = 10;
-
-	if (unlikely(warn_count == 0))
-		return;
-
-	if (unlikely(in_interrupt())) {
-		if (in_irq()) {
-			if (type != KM_IRQ0 && type != KM_IRQ1 &&
-			    type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
-			    type != KM_BOUNCE_READ) {
-				WARN_ON(1);
-				warn_count--;
-			}
-		} else if (!irqs_disabled()) {	/* softirq */
-			if (type != KM_IRQ0 && type != KM_IRQ1 &&
-			    type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
-			    type != KM_SKB_SUNRPC_DATA &&
-			    type != KM_SKB_DATA_SOFTIRQ &&
-			    type != KM_BOUNCE_READ) {
-				WARN_ON(1);
-				warn_count--;
-			}
-		}
-	}
-
-	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
-			type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
-		if (!irqs_disabled()) {
-			WARN_ON(1);
-			warn_count--;
-		}
-	} else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
-		if (irq_count() == 0 && !irqs_disabled()) {
-			WARN_ON(1);
-			warn_count--;
-		}
-	}
-#endif
-}
-
 /*
  * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
  * no global lock is needed and because the kmap code must perform a global TLB
@@ -80,7 +38,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 	if (!PageHighMem(page))
 		return page_address(page);
 
-	debug_kmap_atomic_prot(type);
+	debug_kmap_atomic(type);
 
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -120,22 +78,13 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 	pagefault_enable();
 }
 
-/* This is the same as kmap_atomic() but can map memory that doesn't
+/*
+ * This is the same as kmap_atomic() but can map memory that doesn't
  * have a struct page associated with it.
  */
 void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 {
-	enum fixed_addresses idx;
-	unsigned long vaddr;
-
-	pagefault_disable();
-
-	idx = type + KM_TYPE_NR*smp_processor_id();
-	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-	set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
-	arch_flush_lazy_mmu_mode();
-
-	return (void*) vaddr;
+	return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
 }
 EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
 
@@ -156,3 +105,27 @@ EXPORT_SYMBOL(kmap);
 EXPORT_SYMBOL(kunmap);
 EXPORT_SYMBOL(kmap_atomic);
 EXPORT_SYMBOL(kunmap_atomic);
+
+void __init set_highmem_pages_init(void)
+{
+	struct zone *zone;
+	int nid;
+
+	for_each_zone(zone) {
+		unsigned long zone_start_pfn, zone_end_pfn;
+
+		if (!is_highmem(zone))
+			continue;
+
+		zone_start_pfn = zone->zone_start_pfn;
+		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+
+		nid = zone_to_nid(zone);
+		printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+				zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+		add_highpages_with_active_regions(nid, zone_start_pfn,
+				 zone_end_pfn);
+	}
+	totalram_pages += totalhigh_pages;
+}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
new file mode 100644
index 00000000000..fd3da1dda1c
--- /dev/null
+++ b/arch/x86/mm/init.c
@@ -0,0 +1,393 @@
+#include <linux/ioport.h>
+#include <linux/swap.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
+#include <asm/page.h>
+#include <asm/page_types.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/tlbflush.h>
+
+unsigned long __initdata e820_table_start;
+unsigned long __meminitdata e820_table_end;
+unsigned long __meminitdata e820_table_top;
+
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+				= 1
+#endif
+;
+
+static void __init find_early_table_space(unsigned long end, int use_pse,
+					  int use_gbpages)
+{
+	unsigned long puds, pmds, ptes, tables, start;
+
+	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+	if (use_gbpages) {
+		unsigned long extra;
+
+		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+	} else
+		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+	if (use_pse) {
+		unsigned long extra;
+
+		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+		extra += PMD_SIZE;
+#endif
+		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	} else
+		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+	/* for fixmap */
+	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
+	/*
+	 * RED-PEN putting page tables only on node 0 could
+	 * cause a hotspot and fill up ZONE_DMA. The page tables
+	 * need roughly 0.5KB per GB.
+	 */
+#ifdef CONFIG_X86_32
+	start = 0x7000;
+	e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+					tables, PAGE_SIZE);
+#else /* CONFIG_X86_64 */
+	start = 0x8000;
+	e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+#endif
+	if (e820_table_start == -1UL)
+		panic("Cannot find space for the kernel page tables");
+
+	e820_table_start >>= PAGE_SHIFT;
+	e820_table_end = e820_table_start;
+	e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+
+	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+		end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+}
+
+struct map_range {
+	unsigned long start;
+	unsigned long end;
+	unsigned page_size_mask;
+};
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int __meminit save_mr(struct map_range *mr, int nr_range,
+			     unsigned long start_pfn, unsigned long end_pfn,
+			     unsigned long page_size_mask)
+{
+	if (start_pfn < end_pfn) {
+		if (nr_range >= NR_RANGE_MR)
+			panic("run out of range for init_memory_mapping\n");
+		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
+		mr[nr_range].page_size_mask = page_size_mask;
+		nr_range++;
+	}
+
+	return nr_range;
+}
+
+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+	if (direct_gbpages && cpu_has_gbpages)
+		printk(KERN_INFO "Using GB pages for direct mapping\n");
+	else
+		direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+					       unsigned long end)
+{
+	unsigned long page_size_mask = 0;
+	unsigned long start_pfn, end_pfn;
+	unsigned long ret = 0;
+	unsigned long pos;
+
+	struct map_range mr[NR_RANGE_MR];
+	int nr_range, i;
+	int use_pse, use_gbpages;
+
+	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+	if (!after_bootmem)
+		init_gbpages();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+	 * This will simplify cpa(), which otherwise needs to support splitting
+	 * large pages into small in interrupt context, etc.
+	 */
+	use_pse = use_gbpages = 0;
+#else
+	use_pse = cpu_has_pse;
+	use_gbpages = direct_gbpages;
+#endif
+
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_PAE
+	set_nx();
+	if (nx_enabled)
+		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+	/* Enable PSE if available */
+	if (cpu_has_pse)
+		set_in_cr4(X86_CR4_PSE);
+
+	/* Enable PGE if available */
+	if (cpu_has_pge) {
+		set_in_cr4(X86_CR4_PGE);
+		__supported_pte_mask |= _PAGE_GLOBAL;
+	}
+#endif
+
+	if (use_gbpages)
+		page_size_mask |= 1 << PG_LEVEL_1G;
+	if (use_pse)
+		page_size_mask |= 1 << PG_LEVEL_2M;
+
+	memset(mr, 0, sizeof(mr));
+	nr_range = 0;
+
+	/* head if not big page alignment ? */
+	start_pfn = start >> PAGE_SHIFT;
+	pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+	/*
+	 * Don't use a large page for the first 2/4MB of memory
+	 * because there are often fixed size MTRRs in there
+	 * and overlapping MTRRs into large pages can cause
+	 * slowdowns.
+	 */
+	if (pos == 0)
+		end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+	else
+		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+				 << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+			<< (PMD_SHIFT - PAGE_SHIFT);
+#endif
+	if (end_pfn > (end >> PAGE_SHIFT))
+		end_pfn = end >> PAGE_SHIFT;
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+	/* big page (2M) range */
+	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+			 << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+			 << (PUD_SHIFT - PAGE_SHIFT);
+	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+#ifdef CONFIG_X86_64
+	/* big page (1G) range */
+	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+			 << (PUD_SHIFT - PAGE_SHIFT);
+	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask &
+				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+	/* tail is not big page (1G) alignment */
+	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+			 << (PMD_SHIFT - PAGE_SHIFT);
+	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+#endif
+
+	/* tail is not big page (2M) alignment */
+	start_pfn = pos>>PAGE_SHIFT;
+	end_pfn = end>>PAGE_SHIFT;
+	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+	/* try to merge same page size and continuous */
+	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+		unsigned long old_start;
+		if (mr[i].end != mr[i+1].start ||
+		    mr[i].page_size_mask != mr[i+1].page_size_mask)
+			continue;
+		/* move it */
+		old_start = mr[i].start;
+		memmove(&mr[i], &mr[i+1],
+			(nr_range - 1 - i) * sizeof(struct map_range));
+		mr[i--].start = old_start;
+		nr_range--;
+	}
+
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+				mr[i].start, mr[i].end,
+			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+	/*
+	 * Find space for the kernel direct mapping tables.
+	 *
+	 * Later we should allocate these tables in the local node of the
+	 * memory mapped. Unfortunately this is done currently before the
+	 * nodes are discovered.
+	 */
+	if (!after_bootmem)
+		find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_32
+	for (i = 0; i < nr_range; i++)
+		kernel_physical_mapping_init(mr[i].start, mr[i].end,
+					     mr[i].page_size_mask);
+	ret = end;
+#else /* CONFIG_X86_64 */
+	for (i = 0; i < nr_range; i++)
+		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+						   mr[i].page_size_mask);
+#endif
+
+#ifdef CONFIG_X86_32
+	early_ioremap_page_table_range_init();
+
+	load_cr3(swapper_pg_dir);
+#endif
+
+#ifdef CONFIG_X86_64
+	if (!after_bootmem)
+		mmu_cr4_features = read_cr4();
+#endif
+	__flush_tlb_all();
+
+	if (!after_bootmem && e820_table_end > e820_table_start)
+		reserve_early(e820_table_start << PAGE_SHIFT,
+				 e820_table_end << PAGE_SHIFT, "PGTABLE");
+
+	if (!after_bootmem)
+		early_memtest(start, end);
+
+	return ret >> PAGE_SHIFT;
+}
+
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+	if (pagenr <= 256)
+		return 1;
+	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+		return 0;
+	if (!page_is_ram(pagenr))
+		return 1;
+	return 0;
+}
+
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+	unsigned long addr = begin;
+
+	if (addr >= end)
+		return;
+
+	/*
+	 * If debugging page accesses then do not free this memory but
+	 * mark them not present - any buggy init-section access will
+	 * create a kernel page fault:
+	 */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+		begin, PAGE_ALIGN(end));
+	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
+	/*
+	 * We just marked the kernel text read only above, now that
+	 * we are going to free part of that, we need to make that
+	 * writeable first.
+	 */
+	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
+	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+
+	for (; addr < end; addr += PAGE_SIZE) {
+		ClearPageReserved(virt_to_page(addr));
+		init_page_count(virt_to_page(addr));
+		memset((void *)(addr & ~(PAGE_SIZE-1)),
+			POISON_FREE_INITMEM, PAGE_SIZE);
+		free_page(addr);
+		totalram_pages++;
+	}
+#endif
+}
+
+void free_initmem(void)
+{
+	free_init_pages("unused kernel memory",
+			(unsigned long)(&__init_begin),
+			(unsigned long)(&__init_end));
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+	free_init_pages("initrd memory", start, end);
+}
+#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 06708ee94aa..749559ed80f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,8 +49,7 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
-
-unsigned int __VMALLOC_RESERVE = 128 << 20;
+#include <asm/init.h>
 
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
@@ -60,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
-
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
-static int __initdata after_init_bootmem;
+bool __read_mostly __vmalloc_start_set = false;
 
 static __init void *alloc_low_page(void)
 {
-	unsigned long pfn = table_end++;
+	unsigned long pfn = e820_table_end++;
 	void *adr;
 
-	if (pfn >= table_top)
+	if (pfn >= e820_table_top)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = __va(pfn * PAGE_SIZE);
@@ -92,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 
 #ifdef CONFIG_X86_PAE
 	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-		if (after_init_bootmem)
+		if (after_bootmem)
 			pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
 		else
 			pmd_table = (pmd_t *)alloc_low_page();
@@ -119,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
 		pte_t *page_table = NULL;
 
-		if (after_init_bootmem) {
+		if (after_bootmem) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 			page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
@@ -137,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	return pte_offset_kernel(pmd, 0);
 }
 
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+	int pgd_idx = pgd_index(vaddr);
+	int pmd_idx = pmd_index(vaddr);
+
+	return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	int pte_idx = pte_index(vaddr);
+	pmd_t *pmd;
+
+	pmd = populate_extra_pmd(vaddr);
+	return one_page_table_init(pmd) + pte_idx;
+}
+
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 					   unsigned long vaddr, pte_t *lastpte)
 {
@@ -153,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 	if (pmd_idx_kmap_begin != pmd_idx_kmap_end
 	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
 	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-	    && ((__pa(pte) >> PAGE_SHIFT) < table_start
-		|| (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+	    && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+		|| (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
 		pte_t *newpte;
 		int i;
 
-		BUG_ON(after_init_bootmem);
+		BUG_ON(after_bootmem);
 		newpte = alloc_low_page();
 		for (i = 0; i < PTRS_PER_PTE; i++)
 			set_pte(newpte + i, pte[i]);
@@ -227,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
  * of max_low_pfn pages, by creating page tables starting from address
  * PAGE_OFFSET:
  */
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
-						unsigned long start_pfn,
-						unsigned long end_pfn,
-						int use_pse)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask)
 {
+	int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+	unsigned long start_pfn, end_pfn;
+	pgd_t *pgd_base = swapper_pg_dir;
 	int pgd_idx, pmd_idx, pte_ofs;
 	unsigned long pfn;
 	pgd_t *pgd;
@@ -240,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
 	unsigned pages_2m, pages_4k;
 	int mapping_iter;
 
+	start_pfn = start >> PAGE_SHIFT;
+	end_pfn = end >> PAGE_SHIFT;
+
 	/*
 	 * First iteration will setup identity mapping using large/small pages
 	 * based on use_pse, with other attributes same as set by
@@ -354,26 +371,6 @@ repeat:
 		mapping_iter = 2;
 		goto repeat;
 	}
-}
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
-	if (pagenr <= 256)
-		return 1;
-	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
-		return 0;
-	if (!page_is_ram(pagenr))
-		return 1;
 	return 0;
 }
 
@@ -469,22 +466,10 @@ void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
 	work_with_active_regions(nid, add_highpages_work_fn, &data);
 }
 
-#ifndef CONFIG_NUMA
-static void __init set_highmem_pages_init(void)
-{
-	add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
-	totalram_pages += totalhigh_pages;
-}
-#endif /* !CONFIG_NUMA */
-
 #else
 static inline void permanent_kmaps_init(pgd_t *pgd_base)
 {
 }
-static inline void set_highmem_pages_init(void)
-{
-}
 #endif /* CONFIG_HIGHMEM */
 
 void __init native_pagetable_setup_start(pgd_t *base)
@@ -542,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
  * be partially populated, and so it avoids stomping on any existing
  * mappings.
  */
-static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
+void __init early_ioremap_page_table_range_init(void)
 {
+	pgd_t *pgd_base = swapper_pg_dir;
 	unsigned long vaddr, end;
 
 	/*
@@ -638,7 +624,7 @@ static int __init noexec_setup(char *str)
 }
 early_param("noexec", noexec_setup);
 
-static void __init set_nx(void)
+void __init set_nx(void)
 {
 	unsigned int v[4], l, h;
 
@@ -790,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
 #ifdef CONFIG_FLATMEM
 	max_mapnr = num_physpages;
 #endif
+	__vmalloc_start_set = true;
+
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
 			pages_to_mb(max_low_pfn));
 
@@ -811,176 +799,66 @@ static void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }
 
+static unsigned long __init setup_node_bootmem(int nodeid,
+				 unsigned long start_pfn,
+				 unsigned long end_pfn,
+				 unsigned long bootmap)
+{
+	unsigned long bootmap_size;
+
+	/* don't touch min_low_pfn */
+	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+					 bootmap >> PAGE_SHIFT,
+					 start_pfn, end_pfn);
+	printk(KERN_INFO "  node %d low ram: %08lx - %08lx\n",
+		nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+	printk(KERN_INFO "  node %d bootmap %08lx - %08lx\n",
+		 nodeid, bootmap, bootmap + bootmap_size);
+	free_bootmem_with_active_regions(nodeid, end_pfn);
+	early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+	return bootmap + bootmap_size;
+}
+
 void __init setup_bootmem_allocator(void)
 {
-	int i;
+	int nodeid;
 	unsigned long bootmap_size, bootmap;
 	/*
 	 * Initialize the boot-time allocator (with low memory only):
 	 */
 	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
-	bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
-				 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+	bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
 				 PAGE_SIZE);
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
 	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
 
-	/* don't touch min_low_pfn */
-	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
-					 min_low_pfn, max_low_pfn);
 	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
 		 max_pfn_mapped<<PAGE_SHIFT);
-	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
-		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
-	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
-		 bootmap, bootmap + bootmap_size);
-	for_each_online_node(i)
-		free_bootmem_with_active_regions(i, max_low_pfn);
-	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-
-	after_init_bootmem = 1;
-}
-
-static void __init find_early_table_space(unsigned long end, int use_pse)
-{
-	unsigned long puds, pmds, ptes, tables, start;
-
-	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-	tables = PAGE_ALIGN(puds * sizeof(pud_t));
-
-	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-	tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
+	printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
 
-	if (use_pse) {
-		unsigned long extra;
+	for_each_online_node(nodeid) {
+		 unsigned long start_pfn, end_pfn;
 
-		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-		extra += PMD_SIZE;
-		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	} else
-		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
-	tables += PAGE_ALIGN(ptes * sizeof(pte_t));
-
-	/* for fixmap */
-	tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
-
-	/*
-	 * RED-PEN putting page tables only on node 0 could
-	 * cause a hotspot and fill up ZONE_DMA. The page tables
-	 * need roughly 0.5KB per GB.
-	 */
-	start = 0x7000;
-	table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
-					tables, PAGE_SIZE);
-	if (table_start == -1UL)
-		panic("Cannot find space for the kernel page tables");
-
-	table_start >>= PAGE_SHIFT;
-	table_end = table_start;
-	table_top = table_start + (tables>>PAGE_SHIFT);
-
-	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, table_start << PAGE_SHIFT,
-		(table_start << PAGE_SHIFT) + tables);
-}
-
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-						unsigned long end)
-{
-	pgd_t *pgd_base = swapper_pg_dir;
-	unsigned long start_pfn, end_pfn;
-	unsigned long big_page_start;
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/*
-	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-	 * This will simplify cpa(), which otherwise needs to support splitting
-	 * large pages into small in interrupt context, etc.
-	 */
-	int use_pse = 0;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+		start_pfn = node_start_pfn[nodeid];
+		end_pfn = node_end_pfn[nodeid];
+		if (start_pfn > max_low_pfn)
+			continue;
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
 #else
-	int use_pse = cpu_has_pse;
-#endif
-
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 */
-	if (!after_init_bootmem)
-		find_early_table_space(end, use_pse);
-
-#ifdef CONFIG_X86_PAE
-	set_nx();
-	if (nx_enabled)
-		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+		start_pfn = 0;
+		end_pfn = max_low_pfn;
 #endif
-
-	/* Enable PSE if available */
-	if (cpu_has_pse)
-		set_in_cr4(X86_CR4_PSE);
-
-	/* Enable PGE if available */
-	if (cpu_has_pge) {
-		set_in_cr4(X86_CR4_PGE);
-		__supported_pte_mask |= _PAGE_GLOBAL;
+		bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
+						 bootmap);
 	}
 
-	/*
-	 * Don't use a large page for the first 2/4MB of memory
-	 * because there are often fixed size MTRRs in there
-	 * and overlapping MTRRs into large pages can cause
-	 * slowdowns.
-	 */
-	big_page_start = PMD_SIZE;
-
-	if (start < big_page_start) {
-		start_pfn = start >> PAGE_SHIFT;
-		end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
-	} else {
-		/* head is not big page alignment ? */
-		start_pfn = start >> PAGE_SHIFT;
-		end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
-				 << (PMD_SHIFT - PAGE_SHIFT);
-	}
-	if (start_pfn < end_pfn)
-		kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
-
-	/* big page range */
-	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
-			 << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < (big_page_start >> PAGE_SHIFT))
-		start_pfn =  big_page_start >> PAGE_SHIFT;
-	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn)
-		kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
-					     use_pse);
-
-	/* tail is not big page alignment ? */
-	start_pfn = end_pfn;
-	if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
-		end_pfn = end >> PAGE_SHIFT;
-		if (start_pfn < end_pfn)
-			kernel_physical_mapping_init(pgd_base, start_pfn,
-							 end_pfn, 0);
-	}
-
-	early_ioremap_page_table_range_init(pgd_base);
-
-	load_cr3(swapper_pg_dir);
-
-	__flush_tlb_all();
-
-	if (!after_init_bootmem)
-		reserve_early(table_start << PAGE_SHIFT,
-				 table_end << PAGE_SHIFT, "PGTABLE");
-
-	if (!after_init_bootmem)
-		early_memtest(start, end);
-
-	return end >> PAGE_SHIFT;
+	after_bootmem = 1;
 }
 
-
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
  * already mapped by head.S.
@@ -1176,17 +1054,47 @@ static noinline int do_test_wp_bit(void)
 const int rodata_test_data = 0xC3;
 EXPORT_SYMBOL_GPL(rodata_test_data);
 
+static int kernel_set_to_readonly;
+
+void set_kernel_text_rw(void)
+{
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long size = PFN_ALIGN(_etext) - start;
+
+	if (!kernel_set_to_readonly)
+		return;
+
+	pr_debug("Set kernel text: %lx - %lx for read write\n",
+		 start, start+size);
+
+	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long size = PFN_ALIGN(_etext) - start;
+
+	if (!kernel_set_to_readonly)
+		return;
+
+	pr_debug("Set kernel text: %lx - %lx for read only\n",
+		 start, start+size);
+
+	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
 	unsigned long size = PFN_ALIGN(_etext) - start;
 
-#ifndef CONFIG_DYNAMIC_FTRACE
-	/* Dynamic tracing modifies the kernel text section */
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 	printk(KERN_INFO "Write protecting the kernel text: %luk\n",
 		size >> 10);
 
+	kernel_set_to_readonly = 1;
+
 #ifdef CONFIG_CPA_DEBUG
 	printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
 		start, start+size);
@@ -1195,7 +1103,6 @@ void mark_rodata_ro(void)
 	printk(KERN_INFO "Testing CPA: write protecting again\n");
 	set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
 #endif
-#endif /* CONFIG_DYNAMIC_FTRACE */
 
 	start += size;
 	size = (unsigned long)__end_rodata - start;
@@ -1214,52 +1121,6 @@ void mark_rodata_ro(void)
 }
 #endif
 
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
-{
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/*
-	 * If debugging page accesses then do not free this memory but
-	 * mark them not present - any buggy init-section access will
-	 * create a kernel page fault:
-	 */
-	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
-		begin, PAGE_ALIGN(end));
-	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
-#else
-	unsigned long addr;
-
-	/*
-	 * We just marked the kernel text read only above, now that
-	 * we are going to free part of that, we need to make that
-	 * writeable first.
-	 */
-	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
-
-	for (addr = begin; addr < end; addr += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
-		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-		free_page(addr);
-		totalram_pages++;
-	}
-	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-#endif
-}
-
-void free_initmem(void)
-{
-	free_init_pages("unused kernel memory",
-			(unsigned long)(&__init_begin),
-			(unsigned long)(&__init_end));
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-	free_init_pages("initrd memory", start, end);
-}
-#endif
-
 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 				   int flags)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b1352250096..1753e8020df 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
 #include <asm/kdebug.h>
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
+#include <asm/init.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
-				= 1
-#endif
-;
-
 static int __init parse_direct_gbpages_off(char *arg)
 {
 	direct_gbpages = 0;
@@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on);
  * around without checking the pgd every time.
  */
 
-int after_bootmem;
-
 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-static int do_not_nx __cpuinitdata;
+static int disable_nx __cpuinitdata;
 
 /*
  * noexec=on|off
@@ -107,9 +100,9 @@ static int __init nonx_setup(char *str)
 		return -EINVAL;
 	if (!strncmp(str, "on", 2)) {
 		__supported_pte_mask |= _PAGE_NX;
-		do_not_nx = 0;
+		disable_nx = 0;
 	} else if (!strncmp(str, "off", 3)) {
-		do_not_nx = 1;
+		disable_nx = 1;
 		__supported_pte_mask &= ~_PAGE_NX;
 	}
 	return 0;
@@ -121,7 +114,7 @@ void __cpuinit check_efer(void)
 	unsigned long efer;
 
 	rdmsrl(MSR_EFER, efer);
-	if (!(efer & EFER_NX) || do_not_nx)
+	if (!(efer & EFER_NX) || disable_nx)
 		__supported_pte_mask &= ~_PAGE_NX;
 }
 
@@ -168,34 +161,51 @@ static __ref void *spp_getpage(void)
 	return ptr;
 }
 
-void
-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
 {
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	if (pgd_none(*pgd)) {
+		pud_t *pud = (pud_t *)spp_getpage();
+		pgd_populate(&init_mm, pgd, pud);
+		if (pud != pud_offset(pgd, 0))
+			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+			       pud, pud_offset(pgd, 0));
+	}
+	return pud_offset(pgd, vaddr);
+}
 
-	pud = pud_page + pud_index(vaddr);
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
 	if (pud_none(*pud)) {
-		pmd = (pmd_t *) spp_getpage();
+		pmd_t *pmd = (pmd_t *) spp_getpage();
 		pud_populate(&init_mm, pud, pmd);
-		if (pmd != pmd_offset(pud, 0)) {
+		if (pmd != pmd_offset(pud, 0))
 			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
-				pmd, pmd_offset(pud, 0));
-			return;
-		}
+			       pmd, pmd_offset(pud, 0));
 	}
-	pmd = pmd_offset(pud, vaddr);
+	return pmd_offset(pud, vaddr);
+}
+
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
 	if (pmd_none(*pmd)) {
-		pte = (pte_t *) spp_getpage();
+		pte_t *pte = (pte_t *) spp_getpage();
 		pmd_populate_kernel(&init_mm, pmd, pte);
-		if (pte != pte_offset_kernel(pmd, 0)) {
+		if (pte != pte_offset_kernel(pmd, 0))
 			printk(KERN_ERR "PAGETABLE BUG #02!\n");
-			return;
-		}
 	}
+	return pte_offset_kernel(pmd, vaddr);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pud = pud_page + pud_index(vaddr);
+	pmd = fill_pmd(pud, vaddr);
+	pte = fill_pte(pmd, vaddr);
 
-	pte = pte_offset_kernel(pmd, vaddr);
 	set_pte(pte, new_pte);
 
 	/*
@@ -205,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	__flush_tlb_one(vaddr);
 }
 
-void
-set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 {
 	pgd_t *pgd;
 	pud_t *pud_page;
@@ -223,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	set_pte_vaddr_pud(pud_page, vaddr, pteval);
 }
 
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+
+	pgd = pgd_offset_k(vaddr);
+	pud = fill_pud(pgd, vaddr);
+	return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	pmd_t *pmd;
+
+	pmd = populate_extra_pmd(vaddr);
+	return fill_pte(pmd, vaddr);
+}
+
 /*
  * Create large page table mappings for a range of physical addresses.
  */
@@ -291,13 +318,9 @@ void __init cleanup_highmap(void)
 	}
 }
 
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
 static __ref void *alloc_low_page(unsigned long *phys)
 {
-	unsigned long pfn = table_end++;
+	unsigned long pfn = e820_table_end++;
 	void *adr;
 
 	if (after_bootmem) {
@@ -307,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
 		return adr;
 	}
 
-	if (pfn >= table_top)
+	if (pfn >= e820_table_top)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -547,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return phys_pud_init(pud, addr, end, page_size_mask);
 }
 
-static void __init find_early_table_space(unsigned long end, int use_pse,
-					  int use_gbpages)
-{
-	unsigned long puds, pmds, ptes, tables, start;
-
-	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-	if (use_gbpages) {
-		unsigned long extra;
-		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
-		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-	} else
-		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
-	if (use_pse) {
-		unsigned long extra;
-		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	} else
-		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-	/*
-	 * RED-PEN putting page tables only on node 0 could
-	 * cause a hotspot and fill up ZONE_DMA. The page tables
-	 * need roughly 0.5KB per GB.
-	 */
-	start = 0x8000;
-	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
-	if (table_start == -1UL)
-		panic("Cannot find space for the kernel page tables");
-
-	table_start >>= PAGE_SHIFT;
-	table_end = table_start;
-	table_top = table_start + (tables >> PAGE_SHIFT);
-
-	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-static void __init init_gbpages(void)
-{
-	if (direct_gbpages && cpu_has_gbpages)
-		printk(KERN_INFO "Using GB pages for direct mapping\n");
-	else
-		direct_gbpages = 0;
-}
-
-static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
-						unsigned long end,
-						unsigned long page_size_mask)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask)
 {
 
 	unsigned long next, last_map_addr = end;
@@ -635,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
 	return last_map_addr;
 }
 
-struct map_range {
-	unsigned long start;
-	unsigned long end;
-	unsigned page_size_mask;
-};
-
-#define NR_RANGE_MR 5
-
-static int save_mr(struct map_range *mr, int nr_range,
-		   unsigned long start_pfn, unsigned long end_pfn,
-		   unsigned long page_size_mask)
-{
-
-	if (start_pfn < end_pfn) {
-		if (nr_range >= NR_RANGE_MR)
-			panic("run out of range for init_memory_mapping\n");
-		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
-		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
-		mr[nr_range].page_size_mask = page_size_mask;
-		nr_range++;
-	}
-
-	return nr_range;
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-					       unsigned long end)
-{
-	unsigned long last_map_addr = 0;
-	unsigned long page_size_mask = 0;
-	unsigned long start_pfn, end_pfn;
-	unsigned long pos;
-
-	struct map_range mr[NR_RANGE_MR];
-	int nr_range, i;
-	int use_pse, use_gbpages;
-
-	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 *
-	 * Later we should allocate these tables in the local node of the
-	 * memory mapped. Unfortunately this is done currently before the
-	 * nodes are discovered.
-	 */
-	if (!after_bootmem)
-		init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/*
-	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-	 * This will simplify cpa(), which otherwise needs to support splitting
-	 * large pages into small in interrupt context, etc.
-	 */
-	use_pse = use_gbpages = 0;
-#else
-	use_pse = cpu_has_pse;
-	use_gbpages = direct_gbpages;
-#endif
-
-	if (use_gbpages)
-		page_size_mask |= 1 << PG_LEVEL_1G;
-	if (use_pse)
-		page_size_mask |= 1 << PG_LEVEL_2M;
-
-	memset(mr, 0, sizeof(mr));
-	nr_range = 0;
-
-	/* head if not big page alignment ?*/
-	start_pfn = start >> PAGE_SHIFT;
-	pos = start_pfn << PAGE_SHIFT;
-	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
-			<< (PMD_SHIFT - PAGE_SHIFT);
-	if (end_pfn > (end >> PAGE_SHIFT))
-		end_pfn = end >> PAGE_SHIFT;
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* big page (2M) range*/
-	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-			 << (PMD_SHIFT - PAGE_SHIFT);
-	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-			 << (PUD_SHIFT - PAGE_SHIFT);
-	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
-		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask & (1<<PG_LEVEL_2M));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* big page (1G) range */
-	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-			 << (PUD_SHIFT - PAGE_SHIFT);
-	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask &
-				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* tail is not big page (1G) alignment */
-	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-			 << (PMD_SHIFT - PAGE_SHIFT);
-	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask & (1<<PG_LEVEL_2M));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* tail is not big page (2M) alignment */
-	start_pfn = pos>>PAGE_SHIFT;
-	end_pfn = end>>PAGE_SHIFT;
-	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
-	/* try to merge same page size and continuous */
-	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
-		unsigned long old_start;
-		if (mr[i].end != mr[i+1].start ||
-		    mr[i].page_size_mask != mr[i+1].page_size_mask)
-			continue;
-		/* move it */
-		old_start = mr[i].start;
-		memmove(&mr[i], &mr[i+1],
-			 (nr_range - 1 - i) * sizeof (struct map_range));
-		mr[i--].start = old_start;
-		nr_range--;
-	}
-
-	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
-				mr[i].start, mr[i].end,
-			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
-			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
-	if (!after_bootmem)
-		find_early_table_space(end, use_pse, use_gbpages);
-
-	for (i = 0; i < nr_range; i++)
-		last_map_addr = kernel_physical_mapping_init(
-					mr[i].start, mr[i].end,
-					mr[i].page_size_mask);
-
-	if (!after_bootmem)
-		mmu_cr4_features = read_cr4();
-	__flush_tlb_all();
-
-	if (!after_bootmem && table_end > table_start)
-		reserve_early(table_start << PAGE_SHIFT,
-				 table_end << PAGE_SHIFT, "PGTABLE");
-
-	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
-			 last_map_addr, end);
-
-	if (!after_bootmem)
-		early_memtest(start, end);
-
-	return last_map_addr >> PAGE_SHIFT;
-}
-
 #ifndef CONFIG_NUMA
 void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -876,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
-	if (pagenr <= 256)
-		return 1;
-	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
-		return 0;
-	if (!page_is_ram(pagenr))
-		return 1;
-	return 0;
-}
-
-
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
 			 kcore_modules, kcore_vsyscall;
 
@@ -947,46 +730,39 @@ void __init mem_init(void)
 		initsize >> 10);
 }
 
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+static int kernel_set_to_readonly;
+
+void set_kernel_text_rw(void)
 {
-	unsigned long addr = begin;
+	unsigned long start = PFN_ALIGN(_stext);
+	unsigned long end = PFN_ALIGN(__start_rodata);
 
-	if (addr >= end)
+	if (!kernel_set_to_readonly)
 		return;
 
-	/*
-	 * If debugging page accesses then do not free this memory but
-	 * mark them not present - any buggy init-section access will
-	 * create a kernel page fault:
-	 */
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
-		begin, PAGE_ALIGN(end));
-	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
-#else
-	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-
-	for (; addr < end; addr += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
-		memset((void *)(addr & ~(PAGE_SIZE-1)),
-			POISON_FREE_INITMEM, PAGE_SIZE);
-		free_page(addr);
-		totalram_pages++;
-	}
-#endif
+	pr_debug("Set kernel text: %lx - %lx for read write\n",
+		 start, end);
+
+	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
 }
 
-void free_initmem(void)
+void set_kernel_text_ro(void)
 {
-	free_init_pages("unused kernel memory",
-			(unsigned long)(&__init_begin),
-			(unsigned long)(&__init_end));
-}
+	unsigned long start = PFN_ALIGN(_stext);
+	unsigned long end = PFN_ALIGN(__start_rodata);
 
-#ifdef CONFIG_DEBUG_RODATA
-const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
+	if (!kernel_set_to_readonly)
+		return;
+
+	pr_debug("Set kernel text: %lx - %lx for read only\n",
+		 start, end);
+
+	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+}
 
 void mark_rodata_ro(void)
 {
@@ -994,15 +770,12 @@ void mark_rodata_ro(void)
 	unsigned long rodata_start =
 		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
 
-#ifdef CONFIG_DYNAMIC_FTRACE
-	/* Dynamic tracing modifies the kernel text section */
-	start = rodata_start;
-#endif
-
 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 	       (end - start) >> 10);
 	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 
+	kernel_set_to_readonly = 1;
+
 	/*
 	 * The rodata section (but not the kernel text!) should also be
 	 * not-executable.
@@ -1022,13 +795,6 @@ void mark_rodata_ro(void)
 
 #endif
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-	free_init_pages("initrd memory", start, end);
-}
-#endif
-
 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 				   int flags)
 {
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 04102d42ff4..8056545e2d3 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -19,10 +19,11 @@
 #include <asm/iomap.h>
 #include <asm/pat.h>
 #include <linux/module.h>
+#include <linux/highmem.h>
 
 int is_io_mapping_possible(resource_size_t base, unsigned long size)
 {
-#ifndef CONFIG_X86_PAE
+#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
 	/* There is no way to map greater than 1 << 32 address without PAE */
 	if (base + size > 0x100000000ULL)
 		return 0;
@@ -31,16 +32,28 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
 }
 EXPORT_SYMBOL_GPL(is_io_mapping_possible);
 
-/* Map 'pfn' using fixed map 'type' and protections 'prot'
- */
-void *
-iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
 {
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
 	pagefault_disable();
 
+	debug_kmap_atomic(type);
+	idx = type + KM_TYPE_NR * smp_processor_id();
+	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+	set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
+	arch_flush_lazy_mmu_mode();
+
+	return (void *)vaddr;
+}
+
+/*
+ * Map 'pfn' using fixed map 'type' and protections 'prot'
+ */
+void *
+iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+{
 	/*
 	 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
 	 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
@@ -50,12 +63,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
 	if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
 		prot = PAGE_KERNEL_UC_MINUS;
 
-	idx = type + KM_TYPE_NR*smp_processor_id();
-	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-	set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
-	arch_flush_lazy_mmu_mode();
-
-	return (void*) vaddr;
+	return kmap_atomic_prot_pfn(pfn, type, prot);
 }
 EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
 
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 433f7bd4648..0dfa09d69e8 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -22,13 +22,17 @@
 #include <asm/pgalloc.h>
 #include <asm/pat.h>
 
-#ifdef CONFIG_X86_64
-
-static inline int phys_addr_valid(unsigned long addr)
+static inline int phys_addr_valid(resource_size_t addr)
 {
-	return addr < (1UL << boot_cpu_data.x86_phys_bits);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+	return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+	return 1;
+#endif
 }
 
+#ifdef CONFIG_X86_64
+
 unsigned long __phys_addr(unsigned long x)
 {
 	if (x >= __START_KERNEL_map) {
@@ -38,8 +42,7 @@ unsigned long __phys_addr(unsigned long x)
 	} else {
 		VIRTUAL_BUG_ON(x < PAGE_OFFSET);
 		x -= PAGE_OFFSET;
-		VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
-					!phys_addr_valid(x));
+		VIRTUAL_BUG_ON(!phys_addr_valid(x));
 	}
 	return x;
 }
@@ -56,10 +59,8 @@ bool __virt_addr_valid(unsigned long x)
 		if (x < PAGE_OFFSET)
 			return false;
 		x -= PAGE_OFFSET;
-		if (system_state == SYSTEM_BOOTING ?
-				x > MAXMEM : !phys_addr_valid(x)) {
+		if (!phys_addr_valid(x))
 			return false;
-		}
 	}
 
 	return pfn_valid(x >> PAGE_SHIFT);
@@ -68,18 +69,12 @@ EXPORT_SYMBOL(__virt_addr_valid);
 
 #else
 
-static inline int phys_addr_valid(unsigned long addr)
-{
-	return 1;
-}
-
 #ifdef CONFIG_DEBUG_VIRTUAL
 unsigned long __phys_addr(unsigned long x)
 {
-	/* VMALLOC_* aren't constants; not available at the boot time */
+	/* VMALLOC_* aren't constants  */
 	VIRTUAL_BUG_ON(x < PAGE_OFFSET);
-	VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
-		is_vmalloc_addr((void *) x));
+	VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
 	return x - PAGE_OFFSET;
 }
 EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +84,9 @@ bool __virt_addr_valid(unsigned long x)
 {
 	if (x < PAGE_OFFSET)
 		return false;
-	if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+	if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+		return false;
+	if (x >= FIXADDR_START)
 		return false;
 	return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
 }
@@ -508,13 +505,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
 	return &bm_pte[pte_index(addr)];
 }
 
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
 void __init early_ioremap_init(void)
 {
 	pmd_t *pmd;
+	int i;
 
 	if (early_ioremap_debug)
 		printk(KERN_INFO "early_ioremap_init()\n");
 
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
 	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
 	memset(bm_pte, 0, sizeof(bm_pte));
 	pmd_populate_kernel(&init_mm, pmd, bm_pte);
@@ -581,6 +584,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
 
 static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
 static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+
 static int __init check_early_ioremap_leak(void)
 {
 	int count = 0;
@@ -602,7 +606,8 @@ static int __init check_early_ioremap_leak(void)
 }
 late_initcall(check_early_ioremap_leak);
 
-static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
+static void __init __iomem *
+__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
 {
 	unsigned long offset, last_addr;
 	unsigned int nrpages;
@@ -668,9 +673,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
 		--nrpages;
 	}
 	if (early_ioremap_debug)
-		printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+		printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
 
-	prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0));
+	prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
 	return prev_map[slot];
 }
 
@@ -738,8 +743,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
 	}
 	prev_map[slot] = NULL;
 }
-
-void __this_fixmap_does_not_exist(void)
-{
-	WARN_ON(1);
-}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 9f205030d9a..4f115e00486 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -310,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
 
 	if (!ctx->active) {
-		pr_warning("kmmio: spurious debug trap on CPU %d.\n",
+		pr_debug("kmmio: spurious debug trap on CPU %d.\n",
 							smp_processor_id());
 		goto out;
 	}
@@ -451,23 +451,24 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
 
 static void remove_kmmio_fault_pages(struct rcu_head *head)
 {
-	struct kmmio_delayed_release *dr = container_of(
-						head,
-						struct kmmio_delayed_release,
-						rcu);
+	struct kmmio_delayed_release *dr =
+		container_of(head, struct kmmio_delayed_release, rcu);
 	struct kmmio_fault_page *p = dr->release_list;
 	struct kmmio_fault_page **prevp = &dr->release_list;
 	unsigned long flags;
+
 	spin_lock_irqsave(&kmmio_lock, flags);
 	while (p) {
-		if (!p->count)
+		if (!p->count) {
 			list_del_rcu(&p->list);
-		else
+			prevp = &p->release_next;
+		} else {
 			*prevp = p->release_next;
-		prevp = &p->release_next;
+		}
 		p = p->release_next;
 	}
 	spin_unlock_irqrestore(&kmmio_lock, flags);
+
 	/* This is the real RCU destroy call. */
 	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
 }
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 0bcd7883d03..605c8be0621 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg)
 {
 	if (arg)
 		memtest_pattern = simple_strtoul(arg, NULL, 0);
+	else
+		memtest_pattern = ARRAY_SIZE(patterns);
+
 	return 0;
 }
 
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 2c4baa88f2c..c9342ed8b40 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -378,27 +378,34 @@ static void clear_trace_list(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static cpumask_t downed_cpus;
+static cpumask_var_t downed_cpus;
 
 static void enter_uniprocessor(void)
 {
 	int cpu;
 	int err;
 
+	if (downed_cpus == NULL &&
+	    !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
+		pr_notice(NAME "Failed to allocate mask\n");
+		goto out;
+	}
+
 	get_online_cpus();
-	downed_cpus = cpu_online_map;
-	cpu_clear(first_cpu(cpu_online_map), downed_cpus);
+	cpumask_copy(downed_cpus, cpu_online_mask);
+	cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
 	if (num_online_cpus() > 1)
 		pr_notice(NAME "Disabling non-boot CPUs...\n");
 	put_online_cpus();
 
-	for_each_cpu_mask(cpu, downed_cpus) {
+	for_each_cpu(cpu, downed_cpus) {
 		err = cpu_down(cpu);
 		if (!err)
 			pr_info(NAME "CPU%d is down.\n", cpu);
 		else
 			pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
 	}
+out:
 	if (num_online_cpus() > 1)
 		pr_warning(NAME "multiple CPUs still online, "
 						"may miss events.\n");
@@ -411,10 +418,10 @@ static void __ref leave_uniprocessor(void)
 	int cpu;
 	int err;
 
-	if (cpus_weight(downed_cpus) == 0)
+	if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
 		return;
 	pr_notice(NAME "Re-enabling CPUs...\n");
-	for_each_cpu_mask(cpu, downed_cpus) {
+	for_each_cpu(cpu, downed_cpus) {
 		err = cpu_up(cpu);
 		if (!err)
 			pr_info(NAME "enabled CPU%d.\n", cpu);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
new file mode 100644
index 00000000000..550df481acc
--- /dev/null
+++ b/arch/x86/mm/numa.c
@@ -0,0 +1,67 @@
+/* Common code for 32 and 64-bit NUMA */
+#include <linux/topology.h>
+#include <linux/module.h>
+#include <linux/bootmem.h>
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
+#endif
+
+/*
+ * Which logical CPUs are on which nodes
+ */
+cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+void __init setup_node_to_cpumask_map(void)
+{
+	unsigned int node, num = 0;
+
+	/* setup nr_node_ids if not done yet */
+	if (nr_node_ids == MAX_NUMNODES) {
+		for_each_node_mask(node, node_possible_map)
+			num = node;
+		nr_node_ids = num + 1;
+	}
+
+	/* allocate the map */
+	for (node = 0; node < nr_node_ids; node++)
+		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
+
+	/* cpumask_of_node() will now work */
+	pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+}
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const struct cpumask *cpumask_of_node(int node)
+{
+	if (node >= nr_node_ids) {
+		printk(KERN_WARNING
+			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+			node, nr_node_ids);
+		dump_stack();
+		return cpu_none_mask;
+	}
+	if (node_to_cpumask_map[node] == NULL) {
+		printk(KERN_WARNING
+			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
+			node);
+		dump_stack();
+		return cpu_online_mask;
+	}
+	return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 3957cd6d645..3daefa04ace 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -416,39 +416,14 @@ void __init initmem_init(unsigned long start_pfn,
 	for_each_online_node(nid)
 		propagate_e820_map_node(nid);
 
-	for_each_online_node(nid)
+	for_each_online_node(nid) {
 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+	}
 
-	NODE_DATA(0)->bdata = &bootmem_node_data[0];
 	setup_bootmem_allocator();
 }
 
-void __init set_highmem_pages_init(void)
-{
-#ifdef CONFIG_HIGHMEM
-	struct zone *zone;
-	int nid;
-
-	for_each_zone(zone) {
-		unsigned long zone_start_pfn, zone_end_pfn;
-
-		if (!is_highmem(zone))
-			continue;
-
-		zone_start_pfn = zone->zone_start_pfn;
-		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
-
-		nid = zone_to_nid(zone);
-		printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
-				zone->name, nid, zone_start_pfn, zone_end_pfn);
-
-		add_highpages_with_active_regions(nid, zone_start_pfn,
-				 zone_end_pfn);
-	}
-	totalram_pages += totalhigh_pages;
-#endif
-}
-
 #ifdef CONFIG_MEMORY_HOTPLUG
 static int paddr_to_nid(u64 addr)
 {
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 64c9cf043cd..d73aaa89237 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,12 +20,6 @@
 #include <asm/acpi.h>
 #include <asm/k8.h>
 
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-# define DBG(x...) printk(KERN_DEBUG x)
-#else
-# define DBG(x...)
-#endif
-
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
@@ -49,12 +43,6 @@ DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
 
 /*
- * Which logical CPUs are on which nodes
- */
-cpumask_t *node_to_cpumask_map;
-EXPORT_SYMBOL(node_to_cpumask_map);
-
-/*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
  * 1 if OK
@@ -661,36 +649,6 @@ void __init init_cpu_to_node(void)
 #endif
 
 
-/*
- * Allocate node_to_cpumask_map based on number of available nodes
- * Requires node_possible_map to be valid.
- *
- * Note: node_to_cpumask() is not valid until after this is done.
- * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
- */
-void __init setup_node_to_cpumask_map(void)
-{
-	unsigned int node, num = 0;
-	cpumask_t *map;
-
-	/* setup nr_node_ids if not done yet */
-	if (nr_node_ids == MAX_NUMNODES) {
-		for_each_node_mask(node, node_possible_map)
-			num = node;
-		nr_node_ids = num + 1;
-	}
-
-	/* allocate the map */
-	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
-	DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
-
-	pr_debug("Node to cpumask map at %p for %d nodes\n",
-		 map, nr_node_ids);
-
-	/* node_to_cpumask() will now work */
-	node_to_cpumask_map = map;
-}
-
 void __cpuinit numa_set_node(int cpu, int node)
 {
 	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
@@ -723,12 +681,12 @@ void __cpuinit numa_clear_node(int cpu)
 
 void __cpuinit numa_add_cpu(int cpu)
 {
-	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 }
 
 void __cpuinit numa_remove_cpu(int cpu)
 {
-	cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 }
 
 #else /* CONFIG_DEBUG_PER_CPU_MAPS */
@@ -739,20 +697,20 @@ void __cpuinit numa_remove_cpu(int cpu)
 static void __cpuinit numa_set_cpumask(int cpu, int enable)
 {
 	int node = early_cpu_to_node(cpu);
-	cpumask_t *mask;
+	struct cpumask *mask;
 	char buf[64];
 
-	if (node_to_cpumask_map == NULL) {
-		printk(KERN_ERR "node_to_cpumask_map NULL\n");
+	mask = node_to_cpumask_map[node];
+	if (mask == NULL) {
+		printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
 		dump_stack();
 		return;
 	}
 
-	mask = &node_to_cpumask_map[node];
 	if (enable)
-		cpu_set(cpu, *mask);
+		cpumask_set_cpu(cpu, mask);
 	else
-		cpu_clear(cpu, *mask);
+		cpumask_clear_cpu(cpu, mask);
 
 	cpulist_scnprintf(buf, sizeof(buf), mask);
 	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
@@ -799,59 +757,6 @@ int early_cpu_to_node(int cpu)
 	return per_cpu(x86_cpu_to_node_map, cpu);
 }
 
-
-/* empty cpumask */
-static const cpumask_t cpu_mask_none;
-
-/*
- * Returns a pointer to the bitmask of CPUs on Node 'node'.
- */
-const cpumask_t *cpumask_of_node(int node)
-{
-	if (node_to_cpumask_map == NULL) {
-		printk(KERN_WARNING
-			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
-			node);
-		dump_stack();
-		return (const cpumask_t *)&cpu_online_map;
-	}
-	if (node >= nr_node_ids) {
-		printk(KERN_WARNING
-			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
-			node, nr_node_ids);
-		dump_stack();
-		return &cpu_mask_none;
-	}
-	return &node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(cpumask_of_node);
-
-/*
- * Returns a bitmask of CPUs on Node 'node'.
- *
- * Side note: this function creates the returned cpumask on the stack
- * so with a high NR_CPUS count, excessive stack space is used.  The
- * node_to_cpumask_ptr function should be used whenever possible.
- */
-cpumask_t node_to_cpumask(int node)
-{
-	if (node_to_cpumask_map == NULL) {
-		printk(KERN_WARNING
-			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
-		dump_stack();
-		return cpu_online_map;
-	}
-	if (node >= nr_node_ids) {
-		printk(KERN_WARNING
-			"node_to_cpumask(%d): node > nr_node_ids(%d)\n",
-			node, nr_node_ids);
-		dump_stack();
-		return cpu_mask_none;
-	}
-	return node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(node_to_cpumask);
-
 /*
  * --------- end of debug versions of the numa functions ---------
  */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8253bc97587..d71e1b636ce 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,7 @@
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/setup.h>
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/proto.h>
@@ -33,6 +34,7 @@ struct cpa_data {
 	unsigned long	pfn;
 	unsigned	force_split : 1;
 	int		curpage;
+	struct page	**pages;
 };
 
 /*
@@ -45,6 +47,7 @@ static DEFINE_SPINLOCK(cpa_lock);
 
 #define CPA_FLUSHTLB 1
 #define CPA_ARRAY 2
+#define CPA_PAGES_ARRAY 4
 
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -95,7 +98,7 @@ static inline unsigned long highmap_start_pfn(void)
 
 static inline unsigned long highmap_end_pfn(void)
 {
-	return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+	return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
 }
 
 #endif
@@ -201,10 +204,10 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
 	}
 }
 
-static void cpa_flush_array(unsigned long *start, int numpages, int cache)
+static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+			    int in_flags, struct page **pages)
 {
 	unsigned int i, level;
-	unsigned long *addr;
 
 	BUG_ON(irqs_disabled());
 
@@ -225,14 +228,22 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache)
 	 * will cause all other CPUs to flush the same
 	 * cachelines:
 	 */
-	for (i = 0, addr = start; i < numpages; i++, addr++) {
-		pte_t *pte = lookup_address(*addr, &level);
+	for (i = 0; i < numpages; i++) {
+		unsigned long addr;
+		pte_t *pte;
+
+		if (in_flags & CPA_PAGES_ARRAY)
+			addr = (unsigned long)page_address(pages[i]);
+		else
+			addr = start[i];
+
+		pte = lookup_address(addr, &level);
 
 		/*
 		 * Only flush present addresses:
 		 */
 		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
-			clflush_cache_range((void *) *addr, PAGE_SIZE);
+			clflush_cache_range((void *)addr, PAGE_SIZE);
 	}
 }
 
@@ -522,6 +533,17 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	 * primary protection behavior:
 	 */
 	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+
+	/*
+	 * Intel Atom errata AAH41 workaround.
+	 *
+	 * The real fix should be in hw or in a microcode update, but
+	 * we also probabilistically try to reduce the window of having
+	 * a large TLB mixed with 4K TLBs while instruction fetches are
+	 * going on.
+	 */
+	__flush_tlb_all();
+
 	base = NULL;
 
 out_unlock:
@@ -573,7 +595,9 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
 	unsigned int level;
 	pte_t *kpte, old_pte;
 
-	if (cpa->flags & CPA_ARRAY)
+	if (cpa->flags & CPA_PAGES_ARRAY)
+		address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+	else if (cpa->flags & CPA_ARRAY)
 		address = cpa->vaddr[cpa->curpage];
 	else
 		address = *cpa->vaddr;
@@ -676,7 +700,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	 * No need to redo, when the primary call touched the direct
 	 * mapping already:
 	 */
-	if (cpa->flags & CPA_ARRAY)
+	if (cpa->flags & CPA_PAGES_ARRAY)
+		vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+	else if (cpa->flags & CPA_ARRAY)
 		vaddr = cpa->vaddr[cpa->curpage];
 	else
 		vaddr = *cpa->vaddr;
@@ -687,7 +713,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 		alias_cpa = *cpa;
 		temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
 		alias_cpa.vaddr = &temp_cpa_vaddr;
-		alias_cpa.flags &= ~CPA_ARRAY;
+		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
 
 
 		ret = __change_page_attr_set_clr(&alias_cpa, 0);
@@ -700,7 +726,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	 * No need to redo, when the primary call touched the high
 	 * mapping already:
 	 */
-	if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
+	if (within(vaddr, (unsigned long) _text, _brk_end))
 		return 0;
 
 	/*
@@ -713,7 +739,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	alias_cpa = *cpa;
 	temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
 	alias_cpa.vaddr = &temp_cpa_vaddr;
-	alias_cpa.flags &= ~CPA_ARRAY;
+	alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
 
 	/*
 	 * The high mapping range is imprecise, so ignore the return value.
@@ -734,7 +760,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 		 */
 		cpa->numpages = numpages;
 		/* for array changes, we can't use large page */
-		if (cpa->flags & CPA_ARRAY)
+		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
 			cpa->numpages = 1;
 
 		if (!debug_pagealloc)
@@ -758,7 +784,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 		 */
 		BUG_ON(cpa->numpages > numpages);
 		numpages -= cpa->numpages;
-		if (cpa->flags & CPA_ARRAY)
+		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
 			cpa->curpage++;
 		else
 			*cpa->vaddr += cpa->numpages * PAGE_SIZE;
@@ -775,7 +801,8 @@ static inline int cache_attr(pgprot_t attr)
 
 static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 				    pgprot_t mask_set, pgprot_t mask_clr,
-				    int force_split, int array)
+				    int force_split, int in_flag,
+				    struct page **pages)
 {
 	struct cpa_data cpa;
 	int ret, cache, checkalias;
@@ -790,15 +817,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 		return 0;
 
 	/* Ensure we are PAGE_SIZE aligned */
-	if (!array) {
-		if (*addr & ~PAGE_MASK) {
-			*addr &= PAGE_MASK;
-			/*
-			 * People should not be passing in unaligned addresses:
-			 */
-			WARN_ON_ONCE(1);
-		}
-	} else {
+	if (in_flag & CPA_ARRAY) {
 		int i;
 		for (i = 0; i < numpages; i++) {
 			if (addr[i] & ~PAGE_MASK) {
@@ -806,6 +825,18 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 				WARN_ON_ONCE(1);
 			}
 		}
+	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
+		/*
+		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
+		 * No need to cehck in that case
+		 */
+		if (*addr & ~PAGE_MASK) {
+			*addr &= PAGE_MASK;
+			/*
+			 * People should not be passing in unaligned addresses:
+			 */
+			WARN_ON_ONCE(1);
+		}
 	}
 
 	/* Must avoid aliasing mappings in the highmem code */
@@ -821,6 +852,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 	arch_flush_lazy_mmu_mode();
 
 	cpa.vaddr = addr;
+	cpa.pages = pages;
 	cpa.numpages = numpages;
 	cpa.mask_set = mask_set;
 	cpa.mask_clr = mask_clr;
@@ -828,8 +860,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 	cpa.curpage = 0;
 	cpa.force_split = force_split;
 
-	if (array)
-		cpa.flags |= CPA_ARRAY;
+	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
+		cpa.flags |= in_flag;
 
 	/* No alias checking for _NX bit modifications */
 	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -855,9 +887,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 	 * wbindv):
 	 */
 	if (!ret && cpu_has_clflush) {
-		if (cpa.flags & CPA_ARRAY)
-			cpa_flush_array(addr, numpages, cache);
-		else
+		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+			cpa_flush_array(addr, numpages, cache,
+					cpa.flags, pages);
+		} else
 			cpa_flush_range(*addr, numpages, cache);
 	} else
 		cpa_flush_all(cache);
@@ -877,14 +910,28 @@ static inline int change_page_attr_set(unsigned long *addr, int numpages,
 				       pgprot_t mask, int array)
 {
 	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
-		array);
+		(array ? CPA_ARRAY : 0), NULL);
 }
 
 static inline int change_page_attr_clear(unsigned long *addr, int numpages,
 					 pgprot_t mask, int array)
 {
 	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
-		array);
+		(array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int cpa_set_pages_array(struct page **pages, int numpages,
+				       pgprot_t mask)
+{
+	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
+		CPA_PAGES_ARRAY, pages);
+}
+
+static inline int cpa_clear_pages_array(struct page **pages, int numpages,
+					 pgprot_t mask)
+{
+	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
+		CPA_PAGES_ARRAY, pages);
 }
 
 int _set_memory_uc(unsigned long addr, int numpages)
@@ -1032,7 +1079,7 @@ int set_memory_np(unsigned long addr, int numpages)
 int set_memory_4k(unsigned long addr, int numpages)
 {
 	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
-					__pgprot(0), 1, 0);
+					__pgprot(0), 1, 0, NULL);
 }
 
 int set_pages_uc(struct page *page, int numpages)
@@ -1043,6 +1090,35 @@ int set_pages_uc(struct page *page, int numpages)
 }
 EXPORT_SYMBOL(set_pages_uc);
 
+int set_pages_array_uc(struct page **pages, int addrinarray)
+{
+	unsigned long start;
+	unsigned long end;
+	int i;
+	int free_idx;
+
+	for (i = 0; i < addrinarray; i++) {
+		start = (unsigned long)page_address(pages[i]);
+		end = start + PAGE_SIZE;
+		if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
+			goto err_out;
+	}
+
+	if (cpa_set_pages_array(pages, addrinarray,
+			__pgprot(_PAGE_CACHE_UC_MINUS)) == 0) {
+		return 0; /* Success */
+	}
+err_out:
+	free_idx = i;
+	for (i = 0; i < free_idx; i++) {
+		start = (unsigned long)page_address(pages[i]);
+		end = start + PAGE_SIZE;
+		free_memtype(start, end);
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(set_pages_array_uc);
+
 int set_pages_wb(struct page *page, int numpages)
 {
 	unsigned long addr = (unsigned long)page_address(page);
@@ -1051,6 +1127,26 @@ int set_pages_wb(struct page *page, int numpages)
 }
 EXPORT_SYMBOL(set_pages_wb);
 
+int set_pages_array_wb(struct page **pages, int addrinarray)
+{
+	int retval;
+	unsigned long start;
+	unsigned long end;
+	int i;
+
+	retval = cpa_clear_pages_array(pages, addrinarray,
+			__pgprot(_PAGE_CACHE_MASK));
+
+	for (i = 0; i < addrinarray; i++) {
+		start = (unsigned long)page_address(pages[i]);
+		end = start + PAGE_SIZE;
+		free_memtype(start, end);
+	}
+
+	return retval;
+}
+EXPORT_SYMBOL(set_pages_array_wb);
+
 int set_pages_x(struct page *page, int numpages)
 {
 	unsigned long addr = (unsigned long)page_address(page);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2ed37158012..640339ee4fb 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -677,10 +677,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
 
 	/*
-	 * reserve_pfn_range() doesn't support RAM pages.
+	 * reserve_pfn_range() doesn't support RAM pages. Maintain the current
+	 * behavior with RAM pages by returning success.
 	 */
 	if (is_ram != 0)
-		return -EINVAL;
+		return 0;
 
 	ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
 	if (ret)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 86f2ffc43c3..5b7c7c8464f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -313,6 +313,24 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
 	return young;
 }
 
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve - size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void __init reserve_top_address(unsigned long reserve)
+{
+#ifdef CONFIG_X86_32
+	BUG_ON(fixmaps_set > 0);
+	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
+	       (int)-reserve);
+	__FIXADDR_TOP = -reserve - PAGE_SIZE;
+	__VMALLOC_RESERVE += reserve;
+#endif
+}
+
 int fixmaps_set;
 
 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 0951db9ee51..46c8834aedc 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,6 +20,8 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
+unsigned int __VMALLOC_RESERVE = 128 << 20;
+
 /*
  * Associate a virtual page frame with a given physical page frame 
  * and protection flags for that frame.
@@ -48,7 +50,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	}
 	pte = pte_offset_kernel(pmd, vaddr);
 	if (pte_val(pteval))
-		set_pte_present(&init_mm, vaddr, pte, pteval);
+		set_pte_at(&init_mm, vaddr, pte, pteval);
 	else
 		pte_clear(&init_mm, vaddr, pte);
 
@@ -97,22 +99,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
 unsigned long __FIXADDR_TOP = 0xfffff000;
 EXPORT_SYMBOL(__FIXADDR_TOP);
 
-/**
- * reserve_top_address - reserves a hole in the top of kernel address space
- * @reserve - size of hole to reserve
- *
- * Can be used to relocate the fixmap area and poke a hole in the top
- * of kernel address space to make room for a hypervisor.
- */
-void __init reserve_top_address(unsigned long reserve)
-{
-	BUG_ON(fixmaps_set > 0);
-	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
-	       (int)-reserve);
-	__FIXADDR_TOP = -reserve - PAGE_SIZE;
-	__VMALLOC_RESERVE += reserve;
-}
-
 /*
  * vmalloc=size forces the vmalloc area to be exactly 'size'
  * bytes. This can be used to increase (or decrease) the
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 574c8bc95ef..c7d272b8574 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -116,6 +116,36 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 	reserve_early(phys, phys + length, "ACPI SLIT");
 }
 
+/* Callback for Proximity Domain -> x2APIC mapping */
+void __init
+acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
+{
+	int pxm, node;
+	int apic_id;
+
+	if (srat_disabled())
+		return;
+	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
+		bad_srat();
+		return;
+	}
+	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+		return;
+	pxm = pa->proximity_domain;
+	node = setup_node(pxm);
+	if (node < 0) {
+		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+		bad_srat();
+		return;
+	}
+
+	apic_id = pa->apic_id;
+	apicid_to_node[apic_id] = node;
+	acpi_numa = 1;
+	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+	       pxm, apic_id, node);
+}
+
 /* Callback for Proximity Domain -> LAPIC mapping */
 void __init
 acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a654d59e448..821e97017e9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -187,11 +187,6 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 		       cpumask, cpumask_of(smp_processor_id()));
 
 	/*
-	 * Make the above memory operations globally visible before
-	 * sending the IPI.
-	 */
-	smp_mb();
-	/*
 	 * We have to send the IPI only to
 	 * CPUs affected.
 	 */
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 4c4a51c90bc..819b131fd75 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -380,7 +380,7 @@ static unsigned int get_stagger(void)
 {
 #ifdef CONFIG_SMP
 	int cpu = smp_processor_id();
-	return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu)));
+	return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map));
 #endif
 	return 0;
 }
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 82d22fc601a..8c362b96b64 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -90,7 +90,7 @@ static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
 	return 0;
 }
 
-static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = {
+static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitconst = {
 /*
  * Systems where PCI IO resource ISA alignment can be skipped
  * when the ISA enable bit in the bridge control is not set
@@ -183,7 +183,7 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d)
 }
 #endif
 
-static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
+static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
 #ifdef __i386__
 /*
  * Laptops which need pci=assign-busses to see Cardbus cards
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index f6adf2c6d75..aaf26ae58cd 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -69,11 +69,12 @@ void early_dump_pci_device(u8 bus, u8 slot, u8 func)
 	int j;
 	u32 val;
 
-	printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func);
+	printk(KERN_INFO "pci 0000:%02x:%02x.%d config space:",
+	       bus, slot, func);
 
 	for (i = 0; i < 256; i += 4) {
 		if (!(i & 0x0f))
-			printk("\n%04x:",i);
+			printk("\n  %02x:",i);
 
 		val = read_pci_config(bus, slot, func, i);
 		for (j = 0; j < 4; j++) {
@@ -96,20 +97,22 @@ void early_dump_pci_devices(void)
 			for (func = 0; func < 8; func++) {
 				u32 class;
 				u8 type;
+
 				class = read_pci_config(bus, slot, func,
 							PCI_CLASS_REVISION);
 				if (class == 0xffffffff)
-					break;
+					continue;
 
 				early_dump_pci_device(bus, slot, func);
 
-				/* No multi-function device? */
-				type = read_pci_config_byte(bus, slot, func,
+				if (func == 0) {
+					type = read_pci_config_byte(bus, slot,
+								    func,
 							       PCI_HEADER_TYPE);
-				if (!(type & 0x80))
-					break;
+					if (!(type & 0x80))
+						break;
+				}
 			}
 		}
 	}
 }
-
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 7d388d5cf54..6dd89555fbf 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -356,7 +356,7 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
 DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
 
 
-static struct dmi_system_id __devinitdata msi_k8t_dmi_table[] = {
+static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = {
 	{
 		.ident = "MSI-K8T-Neo2Fir",
 		.matches = {
@@ -413,7 +413,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
  */
 static u16 toshiba_line_size;
 
-static struct dmi_system_id __devinitdata toshiba_ohci1394_dmi_table[] = {
+static const struct dmi_system_id __devinitconst toshiba_ohci1394_dmi_table[] = {
 	{
 		.ident = "Toshiba PS5 based laptop",
 		.matches = {
@@ -495,26 +495,6 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015,
 			  pci_siemens_interrupt_controller);
 
 /*
- * Regular PCI devices have 256 bytes, but AMD Family 10h/11h CPUs have
- * 4096 bytes configuration space for each function of their processor
- * configuration space.
- */
-static void amd_cpu_pci_cfg_space_size(struct pci_dev *dev)
-{
-	dev->cfg_size = pci_cfg_space_size_ext(dev);
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1300, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1301, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1302, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1303, amd_cpu_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1304, amd_cpu_pci_cfg_space_size);
-
-/*
  * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from
  * confusing the PCI engine:
  */
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5ead808dd70..f1817f71e00 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -258,24 +258,7 @@ void pcibios_set_master(struct pci_dev *dev)
 	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
 }
 
-static void pci_unmap_page_range(struct vm_area_struct *vma)
-{
-	u64 addr = (u64)vma->vm_pgoff << PAGE_SHIFT;
-	free_memtype(addr, addr + vma->vm_end - vma->vm_start);
-}
-
-static void pci_track_mmap_page_range(struct vm_area_struct *vma)
-{
-	u64 addr = (u64)vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long flags = pgprot_val(vma->vm_page_prot)
-						& _PAGE_CACHE_MASK;
-
-	reserve_memtype(addr, addr + vma->vm_end - vma->vm_start, flags, NULL);
-}
-
 static struct vm_operations_struct pci_mmap_ops = {
-	.open  = pci_track_mmap_page_range,
-	.close = pci_unmap_page_range,
 	.access = generic_access_phys,
 };
 
@@ -283,11 +266,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			enum pci_mmap_state mmap_state, int write_combine)
 {
 	unsigned long prot;
-	u64 addr = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long len = vma->vm_end - vma->vm_start;
-	unsigned long flags;
-	unsigned long new_flags;
-	int retval;
 
 	/* I/O space cannot be accessed via normal processor loads and
 	 * stores on this platform.
@@ -308,27 +286,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 
 	vma->vm_page_prot = __pgprot(prot);
 
-	flags = pgprot_val(vma->vm_page_prot) & _PAGE_CACHE_MASK;
-	retval = reserve_memtype(addr, addr + len, flags, &new_flags);
-	if (retval)
-		return retval;
-
-	if (flags != new_flags) {
-		if (!is_new_memtype_allowed(flags, new_flags)) {
-			free_memtype(addr, addr+len);
-			return -EINVAL;
-		}
-		flags = new_flags;
-	}
-
-	if (((vma->vm_pgoff < max_low_pfn_mapped) ||
-	     (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
-	      vma->vm_pgoff < max_pfn_mapped)) &&
-	    ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
-		free_memtype(addr, addr + len);
-		return -EINVAL;
-	}
-
 	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
 			       vma->vm_end - vma->vm_start,
 			       vma->vm_page_prot))
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index f1065b129e9..4061bb0f267 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -50,8 +50,6 @@ static int __init pci_legacy_init(void)
 	if (pci_root_bus)
 		pci_bus_add_devices(pci_root_bus);
 
-	pcibios_fixup_peer_bridges();
-
 	return 0;
 }
 
@@ -67,6 +65,7 @@ int __init pci_subsys_init(void)
 	pci_visws_init();
 #endif
 	pci_legacy_init();
+	pcibios_fixup_peer_bridges();
 	pcibios_irq_init();
 	pcibios_init();
 
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 89bf9242c80..905bb526b13 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
+#include <linux/sort.h>
 #include <asm/e820.h>
 #include <asm/pci_x86.h>
 
@@ -24,24 +25,49 @@
 /* Indicate if the mmcfg resources have been placed into the resource table. */
 static int __initdata pci_mmcfg_resources_inserted;
 
+static __init int extend_mmcfg(int num)
+{
+	struct acpi_mcfg_allocation *new;
+	int new_num = pci_mmcfg_config_num + num;
+
+	new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL);
+	if (!new)
+		return -1;
+
+	if (pci_mmcfg_config) {
+		memcpy(new, pci_mmcfg_config,
+			 sizeof(pci_mmcfg_config[0]) * new_num);
+		kfree(pci_mmcfg_config);
+	}
+	pci_mmcfg_config = new;
+
+	return 0;
+}
+
+static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end)
+{
+	int i = pci_mmcfg_config_num;
+
+	pci_mmcfg_config_num++;
+	pci_mmcfg_config[i].address = addr;
+	pci_mmcfg_config[i].pci_segment = segment;
+	pci_mmcfg_config[i].start_bus_number = start;
+	pci_mmcfg_config[i].end_bus_number = end;
+}
+
 static const char __init *pci_mmcfg_e7520(void)
 {
 	u32 win;
 	raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0xce, 2, &win);
 
 	win = win & 0xf000;
-	if(win == 0x0000 || win == 0xf000)
-		pci_mmcfg_config_num = 0;
-	else {
-		pci_mmcfg_config_num = 1;
-		pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
-		if (!pci_mmcfg_config)
-			return NULL;
-		pci_mmcfg_config[0].address = win << 16;
-		pci_mmcfg_config[0].pci_segment = 0;
-		pci_mmcfg_config[0].start_bus_number = 0;
-		pci_mmcfg_config[0].end_bus_number = 255;
-	}
+	if (win == 0x0000 || win == 0xf000)
+		return NULL;
+
+	if (extend_mmcfg(1) == -1)
+		return NULL;
+
+	fill_one_mmcfg(win << 16, 0, 0, 255);
 
 	return "Intel Corporation E7520 Memory Controller Hub";
 }
@@ -50,13 +76,11 @@ static const char __init *pci_mmcfg_intel_945(void)
 {
 	u32 pciexbar, mask = 0, len = 0;
 
-	pci_mmcfg_config_num = 1;
-
 	raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0x48, 4, &pciexbar);
 
 	/* Enable bit */
 	if (!(pciexbar & 1))
-		pci_mmcfg_config_num = 0;
+		return NULL;
 
 	/* Size bits */
 	switch ((pciexbar >> 1) & 3) {
@@ -73,28 +97,23 @@ static const char __init *pci_mmcfg_intel_945(void)
 		len  = 0x04000000U;
 		break;
 	default:
-		pci_mmcfg_config_num = 0;
+		return NULL;
 	}
 
 	/* Errata #2, things break when not aligned on a 256Mb boundary */
 	/* Can only happen in 64M/128M mode */
 
 	if ((pciexbar & mask) & 0x0fffffffU)
-		pci_mmcfg_config_num = 0;
+		return NULL;
 
 	/* Don't hit the APIC registers and their friends */
 	if ((pciexbar & mask) >= 0xf0000000U)
-		pci_mmcfg_config_num = 0;
-
-	if (pci_mmcfg_config_num) {
-		pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
-		if (!pci_mmcfg_config)
-			return NULL;
-		pci_mmcfg_config[0].address = pciexbar & mask;
-		pci_mmcfg_config[0].pci_segment = 0;
-		pci_mmcfg_config[0].start_bus_number = 0;
-		pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1;
-	}
+		return NULL;
+
+	if (extend_mmcfg(1) == -1)
+		return NULL;
+
+	fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1);
 
 	return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
 }
@@ -138,22 +157,77 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
 		busnbits = 8;
 	}
 
-	pci_mmcfg_config_num = (1 << segnbits);
-	pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]) *
-				   pci_mmcfg_config_num, GFP_KERNEL);
-	if (!pci_mmcfg_config)
+	if (extend_mmcfg(1 << segnbits) == -1)
 		return NULL;
 
-	for (i = 0; i < (1 << segnbits); i++) {
-		pci_mmcfg_config[i].address = base + (1<<28) * i;
-		pci_mmcfg_config[i].pci_segment = i;
-		pci_mmcfg_config[i].start_bus_number = 0;
-		pci_mmcfg_config[i].end_bus_number = (1 << busnbits) - 1;
-	}
+	for (i = 0; i < (1 << segnbits); i++)
+		fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1);
 
 	return "AMD Family 10h NB";
 }
 
+static bool __initdata mcp55_checked;
+static const char __init *pci_mmcfg_nvidia_mcp55(void)
+{
+	int bus;
+	int mcp55_mmconf_found = 0;
+
+	static const u32 extcfg_regnum		= 0x90;
+	static const u32 extcfg_regsize		= 4;
+	static const u32 extcfg_enable_mask	= 1<<31;
+	static const u32 extcfg_start_mask	= 0xff<<16;
+	static const int extcfg_start_shift	= 16;
+	static const u32 extcfg_size_mask	= 0x3<<28;
+	static const int extcfg_size_shift	= 28;
+	static const int extcfg_sizebus[]	= {0x100, 0x80, 0x40, 0x20};
+	static const u32 extcfg_base_mask[]	= {0x7ff8, 0x7ffc, 0x7ffe, 0x7fff};
+	static const int extcfg_base_lshift	= 25;
+
+	/*
+	 * do check if amd fam10h already took over
+	 */
+	if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked)
+		return NULL;
+
+	mcp55_checked = true;
+	for (bus = 0; bus < 256; bus++) {
+		u64 base;
+		u32 l, extcfg;
+		u16 vendor, device;
+		int start, size_index, end;
+
+		raw_pci_ops->read(0, bus, PCI_DEVFN(0, 0), 0, 4, &l);
+		vendor = l & 0xffff;
+		device = (l >> 16) & 0xffff;
+
+		if (PCI_VENDOR_ID_NVIDIA != vendor || 0x0369 != device)
+			continue;
+
+		raw_pci_ops->read(0, bus, PCI_DEVFN(0, 0), extcfg_regnum,
+				  extcfg_regsize, &extcfg);
+
+		if (!(extcfg & extcfg_enable_mask))
+			continue;
+
+		if (extend_mmcfg(1) == -1)
+			continue;
+
+		size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift;
+		base = extcfg & extcfg_base_mask[size_index];
+		/* base could > 4G */
+		base <<= extcfg_base_lshift;
+		start = (extcfg & extcfg_start_mask) >> extcfg_start_shift;
+		end = start + extcfg_sizebus[size_index] - 1;
+		fill_one_mmcfg(base, 0, start, end);
+		mcp55_mmconf_found++;
+	}
+
+	if (!mcp55_mmconf_found)
+		return NULL;
+
+	return "nVidia MCP55";
+}
+
 struct pci_mmcfg_hostbridge_probe {
 	u32 bus;
 	u32 devfn;
@@ -171,8 +245,52 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
 	  0x1200, pci_mmcfg_amd_fam10h },
 	{ 0xff, PCI_DEVFN(0, 0), PCI_VENDOR_ID_AMD,
 	  0x1200, pci_mmcfg_amd_fam10h },
+	{ 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_NVIDIA,
+	  0x0369, pci_mmcfg_nvidia_mcp55 },
 };
 
+static int __init cmp_mmcfg(const void *x1, const void *x2)
+{
+	const typeof(pci_mmcfg_config[0]) *m1 = x1;
+	const typeof(pci_mmcfg_config[0]) *m2 = x2;
+	int start1, start2;
+
+	start1 = m1->start_bus_number;
+	start2 = m2->start_bus_number;
+
+	return start1 - start2;
+}
+
+static void __init pci_mmcfg_check_end_bus_number(void)
+{
+	int i;
+	typeof(pci_mmcfg_config[0]) *cfg, *cfgx;
+
+	/* sort them at first */
+	sort(pci_mmcfg_config, pci_mmcfg_config_num,
+		 sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL);
+
+	/* last one*/
+	if (pci_mmcfg_config_num > 0) {
+		i = pci_mmcfg_config_num - 1;
+		cfg = &pci_mmcfg_config[i];
+		if (cfg->end_bus_number < cfg->start_bus_number)
+			cfg->end_bus_number = 255;
+	}
+
+	/* don't overlap please */
+	for (i = 0; i < pci_mmcfg_config_num - 1; i++) {
+		cfg = &pci_mmcfg_config[i];
+		cfgx = &pci_mmcfg_config[i+1];
+
+		if (cfg->end_bus_number < cfg->start_bus_number)
+			cfg->end_bus_number = 255;
+
+		if (cfg->end_bus_number >= cfgx->start_bus_number)
+			cfg->end_bus_number = cfgx->start_bus_number - 1;
+	}
+}
+
 static int __init pci_mmcfg_check_hostbridge(void)
 {
 	u32 l;
@@ -186,31 +304,33 @@ static int __init pci_mmcfg_check_hostbridge(void)
 
 	pci_mmcfg_config_num = 0;
 	pci_mmcfg_config = NULL;
-	name = NULL;
 
-	for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
+	for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
 		bus =  pci_mmcfg_probes[i].bus;
 		devfn = pci_mmcfg_probes[i].devfn;
 		raw_pci_ops->read(0, bus, devfn, 0, 4, &l);
 		vendor = l & 0xffff;
 		device = (l >> 16) & 0xffff;
 
+		name = NULL;
 		if (pci_mmcfg_probes[i].vendor == vendor &&
 		    pci_mmcfg_probes[i].device == device)
 			name = pci_mmcfg_probes[i].probe();
-	}
 
-	if (name) {
-		printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n",
-		       name, pci_mmcfg_config_num ? "with" : "without");
+		if (name)
+			printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n",
+			       name);
 	}
 
-	return name != NULL;
+	/* some end_bus_number is crazy, fix it */
+	pci_mmcfg_check_end_bus_number();
+
+	return pci_mmcfg_config_num != 0;
 }
 
 static void __init pci_mmcfg_insert_resources(void)
 {
-#define PCI_MMCFG_RESOURCE_NAME_LEN 19
+#define PCI_MMCFG_RESOURCE_NAME_LEN 24
 	int i;
 	struct resource *res;
 	char *names;
@@ -228,9 +348,10 @@ static void __init pci_mmcfg_insert_resources(void)
 		struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
 		num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
 		res->name = names;
-		snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
-			 cfg->pci_segment);
-		res->start = cfg->address;
+		snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN,
+			 "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment,
+			 cfg->start_bus_number, cfg->end_bus_number);
+		res->start = cfg->address + (cfg->start_bus_number << 20);
 		res->end = res->start + (num_buses << 20) - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		insert_resource(&iomem_resource, res);
@@ -354,8 +475,6 @@ static void __init pci_mmcfg_reject_broken(int early)
 	    (pci_mmcfg_config[0].address == 0))
 		return;
 
-	cfg = &pci_mmcfg_config[0];
-
 	for (i = 0; i < pci_mmcfg_config_num; i++) {
 		int valid = 0;
 		u64 addr, size;
@@ -423,10 +542,10 @@ static void __init __pci_mmcfg_init(int early)
 			known_bridge = 1;
 	}
 
-	if (!known_bridge) {
+	if (!known_bridge)
 		acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
-		pci_mmcfg_reject_broken(early);
-	}
+
+	pci_mmcfg_reject_broken(early);
 
 	if ((pci_mmcfg_config_num == 0) ||
 	    (pci_mmcfg_config == NULL) ||
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 30007ffc8e1..94349f8b2f9 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -112,13 +112,18 @@ static struct pci_raw_ops pci_mmcfg = {
 static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
 {
 	void __iomem *addr;
-	u32 size;
-
-	size = (cfg->end_bus_number + 1) << 20;
-	addr = ioremap_nocache(cfg->address, size);
+	u64 start, size;
+
+	start = cfg->start_bus_number;
+	start <<= 20;
+	start += cfg->address;
+	size = cfg->end_bus_number + 1 - cfg->start_bus_number;
+	size <<= 20;
+	addr = ioremap_nocache(start, size);
 	if (addr) {
 		printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
-		       cfg->address, cfg->address + size - 1);
+		       start, start + size - 1);
+		addr -= cfg->start_bus_number << 20;
 	}
 	return addr;
 }
@@ -157,7 +162,7 @@ void __init pci_mmcfg_arch_free(void)
 
 	for (i = 0; i < pci_mmcfg_config_num; ++i) {
 		if (pci_mmcfg_virt[i].virt) {
-			iounmap(pci_mmcfg_virt[i].virt);
+			iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20));
 			pci_mmcfg_virt[i].virt = NULL;
 			pci_mmcfg_virt[i].cfg = NULL;
 		}
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 9ff4d5b55ad..58b32db3312 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,2 +1,7 @@
+# __restore_processor_state() restores %gs after S3 resume and so should not
+# itself be stack-protected
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_cpu_$(BITS).o	:= $(nostackp)
+
 obj-$(CONFIG_PM_SLEEP)		+= cpu_$(BITS).o
 obj-$(CONFIG_HIBERNATION)	+= hibernate_$(BITS).o hibernate_asm_$(BITS).o
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index 274d06082f4..ce702c5b3a2 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -12,6 +12,7 @@
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/xcr.h>
+#include <asm/suspend.h>
 
 static struct saved_context saved_context;
 
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index e3b6cf70d62..5343540f260 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -15,6 +15,7 @@
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
 #include <asm/xcr.h>
+#include <asm/suspend.h>
 
 static void fix_processor_context(void);
 
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 6dd000dd793..65fdc86e923 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -14,6 +14,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
+#include <asm/suspend.h>
 
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c52f4034c7f..82cd39a6cbd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
 
 	vcpup = &per_cpu(xen_vcpu_info, cpu);
 
-	info.mfn = virt_to_mfn(vcpup);
+	info.mfn = arbitrary_virt_to_mfn(vcpup);
 	info.offset = offset_in_page(vcpup);
 
 	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 	frames = mcs.args;
 
 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-		frames[f] = virt_to_mfn(va);
+		frames[f] = arbitrary_virt_to_mfn((void *)va);
+
 		make_lowmem_page_readonly((void *)va);
+		make_lowmem_page_readonly(mfn_to_virt(frames[f]));
 	}
 
 	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
 				unsigned int cpu, unsigned int i)
 {
 	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
 	struct multicall_space mc = __xen_mc_entry(0);
 
 	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 		break;
 
 	default: {
-		xmaddr_t maddr = virt_to_machine(&dt[entry]);
+		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
 
 		xen_mc_flush();
 		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 319bd40a57c..db3802fb7b8 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	p2m_top[topidx][idx] = mfn;
 }
 
+unsigned long arbitrary_virt_to_mfn(void *vaddr)
+{
+	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
+
+	return PFN_DOWN(maddr.maddr);
+}
+
 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
 	unsigned long address = (unsigned long)vaddr;
@@ -1716,9 +1723,9 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
 {
 	pmd_t *kernel_pmd;
 
-	init_pg_tables_start = __pa(pgd);
-	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+				  xen_start_info->nr_pt_frames * PAGE_SIZE +
+				  512*1024);
 
 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
 	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1863,7 +1870,6 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 #ifdef CONFIG_X86_PAE
 	.set_pte_atomic = xen_set_pte_atomic,
-	.set_pte_present = xen_set_pte_at,
 	.pte_clear = xen_pte_clear,
 	.pmd_clear = xen_pmd_clear,
 #endif	/* CONFIG_X86_PAE */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 035582ae815..585a6e33083 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void)
 		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
 		if (rc >= 0) {
 			num_processors++;
-			cpu_set(i, cpu_possible_map);
+			set_cpu_possible(i, true);
 		}
 	}
 }
@@ -197,7 +197,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 	while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
 		for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
 			continue;
-		cpu_clear(cpu, cpu_possible_map);
+		set_cpu_possible(cpu, false);
 	}
 
 	for_each_possible_cpu (cpu) {
@@ -210,7 +210,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 		if (IS_ERR(idle))
 			panic("failed fork for CPU %d", cpu);
 
-		cpu_set(cpu, cpu_present_map);
+		set_cpu_present(cpu, true);
 	}
 }
 
@@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
 	struct vcpu_guest_context *ctxt;
 	struct desc_struct *gdt;
+	unsigned long gdt_mfn;
 
 	if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
 		return 0;
@@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->ldt_ents = 0;
 
 	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+
+	gdt_mfn = arbitrary_virt_to_mfn(gdt);
 	make_lowmem_page_readonly(gdt);
+	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
 
-	ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+	ctxt->gdt_frames[0] = gdt_mfn;
 	ctxt->gdt_ents      = GDT_ENTRIES;
 
 	ctxt->user_regs.cs = __KERNEL_CS;