author    Ingo Molnar <mingo@elte.hu>  2008-10-28 16:54:49 +0100
committer Ingo Molnar <mingo@elte.hu>  2008-10-28 16:54:49 +0100
commit    d1a76187a5be4f89c6cb19d800cb5fb7aac735c5 (patch)
tree      2fac3ffbfffc7560eeef8364b541d0d7a0057920 /arch/x86
parent    c7e78cff6b7518212247fb20b1dc6411540dc9af (diff)
parent    0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into core/locking
Conflicts:
	arch/um/include/asm/system.h
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 222
-rw-r--r--  arch/x86/Kconfig.cpu | 149
-rw-r--r--  arch/x86/Kconfig.debug | 13
-rw-r--r--  arch/x86/Makefile | 6
-rw-r--r--  arch/x86/Makefile_32.cpu | 6
-rw-r--r--  arch/x86/boot/Makefile | 5
-rw-r--r--  arch/x86/boot/compressed/Makefile | 8
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 5
-rw-r--r--  arch/x86/boot/compressed/misc.c | 12
-rw-r--r--  arch/x86/boot/compressed/relocs.c | 2
-rw-r--r--  arch/x86/boot/cpu.c | 17
-rw-r--r--  arch/x86/boot/edd.c | 7
-rw-r--r--  arch/x86/boot/header.S | 1
-rw-r--r--  arch/x86/boot/mkcpustr.c | 40
-rw-r--r--  arch/x86/boot/video-bios.c | 4
-rw-r--r--  arch/x86/boot/video-vesa.c | 15
-rw-r--r--  arch/x86/configs/i386_defconfig | 22
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 31
-rw-r--r--  arch/x86/crypto/Makefile | 2
-rw-r--r--  arch/x86/crypto/crc32c-intel.c | 197
-rw-r--r--  arch/x86/ia32/ia32_aout.c | 11
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 138
-rw-r--r--  arch/x86/ia32/ia32entry.S | 30
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 108
-rw-r--r--  arch/x86/include/asm/Kbuild | 24
-rw-r--r--  arch/x86/include/asm/a.out-core.h | 73
-rw-r--r--  arch/x86/include/asm/a.out.h | 20
-rw-r--r--  arch/x86/include/asm/acpi.h | 178
-rw-r--r--  arch/x86/include/asm/agp.h | 35
-rw-r--r--  arch/x86/include/asm/alternative-asm.h | 22
-rw-r--r--  arch/x86/include/asm/alternative.h | 183
-rw-r--r--  arch/x86/include/asm/amd_iommu.h | 35
-rw-r--r--  arch/x86/include/asm/amd_iommu_types.h | 404
-rw-r--r--  arch/x86/include/asm/apic.h | 199
-rw-r--r--  arch/x86/include/asm/apicdef.h | 417
-rw-r--r--  arch/x86/include/asm/arch_hooks.h | 26
-rw-r--r--  arch/x86/include/asm/asm.h | 47
-rw-r--r--  arch/x86/include/asm/atomic.h | 5
-rw-r--r--  arch/x86/include/asm/atomic_32.h | 259
-rw-r--r--  arch/x86/include/asm/atomic_64.h | 473
-rw-r--r--  arch/x86/include/asm/auxvec.h | 12
-rw-r--r--  arch/x86/include/asm/bigsmp/apic.h | 139
-rw-r--r--  arch/x86/include/asm/bigsmp/apicdef.h | 13
-rw-r--r--  arch/x86/include/asm/bigsmp/ipi.h | 25
-rw-r--r--  arch/x86/include/asm/bios_ebda.h | 36
-rw-r--r--  arch/x86/include/asm/bitops.h | 451
-rw-r--r--  arch/x86/include/asm/boot.h | 26
-rw-r--r--  arch/x86/include/asm/bootparam.h | 111
-rw-r--r--  arch/x86/include/asm/bug.h | 39
-rw-r--r--  arch/x86/include/asm/bugs.h | 12
-rw-r--r--  arch/x86/include/asm/byteorder.h | 81
-rw-r--r--  arch/x86/include/asm/cache.h | 20
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 118
-rw-r--r--  arch/x86/include/asm/calgary.h | 72
-rw-r--r--  arch/x86/include/asm/calling.h | 170
-rw-r--r--  arch/x86/include/asm/checksum.h | 5
-rw-r--r--  arch/x86/include/asm/checksum_32.h | 189
-rw-r--r--  arch/x86/include/asm/checksum_64.h | 191
-rw-r--r--  arch/x86/include/asm/cmpxchg.h | 5
-rw-r--r--  arch/x86/include/asm/cmpxchg_32.h | 344
-rw-r--r--  arch/x86/include/asm/cmpxchg_64.h | 185
-rw-r--r--  arch/x86/include/asm/compat.h | 218
-rw-r--r--  arch/x86/include/asm/cpu.h | 20
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 271
-rw-r--r--  arch/x86/include/asm/cputime.h | 1
-rw-r--r--  arch/x86/include/asm/current.h | 39
-rw-r--r--  arch/x86/include/asm/debugreg.h | 70
-rw-r--r--  arch/x86/include/asm/delay.h | 31
-rw-r--r--  arch/x86/include/asm/desc.h | 409
-rw-r--r--  arch/x86/include/asm/desc_defs.h | 95
-rw-r--r--  arch/x86/include/asm/device.h | 16
-rw-r--r--  arch/x86/include/asm/div64.h | 60
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 308
-rw-r--r--  arch/x86/include/asm/dma.h | 318
-rw-r--r--  arch/x86/include/asm/dmi.h | 26
-rw-r--r--  arch/x86/include/asm/ds.h | 238
-rw-r--r--  arch/x86/include/asm/dwarf2.h | 61
-rw-r--r--  arch/x86/include/asm/e820.h | 146
-rw-r--r--  arch/x86/include/asm/edac.h | 18
-rw-r--r--  arch/x86/include/asm/efi.h | 110
-rw-r--r--  arch/x86/include/asm/elf.h | 336
-rw-r--r--  arch/x86/include/asm/emergency-restart.h | 18
-rw-r--r--  arch/x86/include/asm/errno.h | 1
-rw-r--r--  arch/x86/include/asm/es7000/apic.h | 193
-rw-r--r--  arch/x86/include/asm/es7000/apicdef.h | 13
-rw-r--r--  arch/x86/include/asm/es7000/ipi.h | 24
-rw-r--r--  arch/x86/include/asm/es7000/mpparse.h | 30
-rw-r--r--  arch/x86/include/asm/es7000/wakecpu.h | 59
-rw-r--r--  arch/x86/include/asm/fb.h | 21
-rw-r--r--  arch/x86/include/asm/fcntl.h | 1
-rw-r--r--  arch/x86/include/asm/fixmap.h | 68
-rw-r--r--  arch/x86/include/asm/fixmap_32.h | 123
-rw-r--r--  arch/x86/include/asm/fixmap_64.h | 83
-rw-r--r--  arch/x86/include/asm/floppy.h | 281
-rw-r--r--  arch/x86/include/asm/frame.h | 27
-rw-r--r--  arch/x86/include/asm/ftrace.h | 24
-rw-r--r--  arch/x86/include/asm/futex.h | 140
-rw-r--r--  arch/x86/include/asm/gart.h | 73
-rw-r--r--  arch/x86/include/asm/genapic.h | 5
-rw-r--r--  arch/x86/include/asm/genapic_32.h | 126
-rw-r--r--  arch/x86/include/asm/genapic_64.h | 58
-rw-r--r--  arch/x86/include/asm/geode.h | 253
-rw-r--r--  arch/x86/include/asm/gpio.h | 56
-rw-r--r--  arch/x86/include/asm/hardirq.h | 11
-rw-r--r--  arch/x86/include/asm/hardirq_32.h | 28
-rw-r--r--  arch/x86/include/asm/hardirq_64.h | 23
-rw-r--r--  arch/x86/include/asm/highmem.h | 82
-rw-r--r--  arch/x86/include/asm/hpet.h | 114
-rw-r--r--  arch/x86/include/asm/hugetlb.h | 93
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 131
-rw-r--r--  arch/x86/include/asm/hypertransport.h | 45
-rw-r--r--  arch/x86/include/asm/i387.h | 400
-rw-r--r--  arch/x86/include/asm/i8253.h | 18
-rw-r--r--  arch/x86/include/asm/i8259.h | 63
-rw-r--r--  arch/x86/include/asm/ia32.h | 170
-rw-r--r--  arch/x86/include/asm/ia32_unistd.h | 18
-rw-r--r--  arch/x86/include/asm/idle.h | 16
-rw-r--r--  arch/x86/include/asm/intel_arch_perfmon.h | 31
-rw-r--r--  arch/x86/include/asm/io.h | 91
-rw-r--r--  arch/x86/include/asm/io_32.h | 284
-rw-r--r--  arch/x86/include/asm/io_64.h | 244
-rw-r--r--  arch/x86/include/asm/io_apic.h | 204
-rw-r--r--  arch/x86/include/asm/ioctl.h | 1
-rw-r--r--  arch/x86/include/asm/ioctls.h | 94
-rw-r--r--  arch/x86/include/asm/iommu.h | 50
-rw-r--r--  arch/x86/include/asm/ipcbuf.h | 28
-rw-r--r--  arch/x86/include/asm/ipi.h | 138
-rw-r--r--  arch/x86/include/asm/irq.h | 50
-rw-r--r--  arch/x86/include/asm/irq_regs.h | 5
-rw-r--r--  arch/x86/include/asm/irq_regs_32.h | 29
-rw-r--r--  arch/x86/include/asm/irq_regs_64.h | 1
-rw-r--r--  arch/x86/include/asm/irq_remapping.h | 8
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 164
-rw-r--r--  arch/x86/include/asm/irqflags.h | 211
-rw-r--r--  arch/x86/include/asm/ist.h | 34
-rw-r--r--  arch/x86/include/asm/k8.h | 15
-rw-r--r--  arch/x86/include/asm/kdebug.h | 37
-rw-r--r--  arch/x86/include/asm/kexec.h | 175
-rw-r--r--  arch/x86/include/asm/kgdb.h | 79
-rw-r--r--  arch/x86/include/asm/kmap_types.h | 29
-rw-r--r--  arch/x86/include/asm/kprobes.h | 88
-rw-r--r--  arch/x86/include/asm/kvm.h | 211
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 752
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 147
-rw-r--r--  arch/x86/include/asm/kvm_x86_emulate.h | 184
-rw-r--r--  arch/x86/include/asm/ldt.h | 40
-rw-r--r--  arch/x86/include/asm/lguest.h | 94
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h | 71
-rw-r--r--  arch/x86/include/asm/linkage.h | 61
-rw-r--r--  arch/x86/include/asm/local.h | 235
-rw-r--r--  arch/x86/include/asm/mach-default/apm.h | 73
-rw-r--r--  arch/x86/include/asm/mach-default/do_timer.h | 16
-rw-r--r--  arch/x86/include/asm/mach-default/entry_arch.h | 36
-rw-r--r--  arch/x86/include/asm/mach-default/mach_apic.h | 156
-rw-r--r--  arch/x86/include/asm/mach-default/mach_apicdef.h | 24
-rw-r--r--  arch/x86/include/asm/mach-default/mach_ipi.h | 64
-rw-r--r--  arch/x86/include/asm/mach-default/mach_mpparse.h | 17
-rw-r--r--  arch/x86/include/asm/mach-default/mach_mpspec.h | 12
-rw-r--r--  arch/x86/include/asm/mach-default/mach_timer.h | 48
-rw-r--r--  arch/x86/include/asm/mach-default/mach_traps.h | 33
-rw-r--r--  arch/x86/include/asm/mach-default/mach_wakecpu.h | 42
-rw-r--r--  arch/x86/include/asm/mach-default/pci-functions.h | 19
-rw-r--r--  arch/x86/include/asm/mach-default/setup_arch.h | 3
-rw-r--r--  arch/x86/include/asm/mach-default/smpboot_hooks.h | 59
-rw-r--r--  arch/x86/include/asm/mach-generic/gpio.h | 15
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_apic.h | 33
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_apicdef.h | 11
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_ipi.h | 10
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_mpparse.h | 10
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_mpspec.h | 12
-rw-r--r--  arch/x86/include/asm/mach-rdc321x/gpio.h | 60
-rw-r--r--  arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h | 12
-rw-r--r--  arch/x86/include/asm/mach-voyager/do_timer.h | 17
-rw-r--r--  arch/x86/include/asm/mach-voyager/entry_arch.h | 26
-rw-r--r--  arch/x86/include/asm/mach-voyager/setup_arch.h | 12
-rw-r--r--  arch/x86/include/asm/math_emu.h | 31
-rw-r--r--  arch/x86/include/asm/mc146818rtc.h | 104
-rw-r--r--  arch/x86/include/asm/mca.h | 43
-rw-r--r--  arch/x86/include/asm/mca_dma.h | 201
-rw-r--r--  arch/x86/include/asm/mce.h | 130
-rw-r--r--  arch/x86/include/asm/microcode.h | 47
-rw-r--r--  arch/x86/include/asm/mman.h | 20
-rw-r--r--  arch/x86/include/asm/mmconfig.h | 12
-rw-r--r--  arch/x86/include/asm/mmu.h | 26
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 37
-rw-r--r--  arch/x86/include/asm/mmu_context_32.h | 56
-rw-r--r--  arch/x86/include/asm/mmu_context_64.h | 54
-rw-r--r--  arch/x86/include/asm/mmx.h | 14
-rw-r--r--  arch/x86/include/asm/mmzone.h | 5
-rw-r--r--  arch/x86/include/asm/mmzone_32.h | 134
-rw-r--r--  arch/x86/include/asm/mmzone_64.h | 51
-rw-r--r--  arch/x86/include/asm/module.h | 80
-rw-r--r--  arch/x86/include/asm/mpspec.h | 145
-rw-r--r--  arch/x86/include/asm/mpspec_def.h | 180
-rw-r--r--  arch/x86/include/asm/msgbuf.h | 39
-rw-r--r--  arch/x86/include/asm/msidef.h | 55
-rw-r--r--  arch/x86/include/asm/msr-index.h | 332
-rw-r--r--  arch/x86/include/asm/msr.h | 247
-rw-r--r--  arch/x86/include/asm/mtrr.h | 173
-rw-r--r--  arch/x86/include/asm/mutex.h | 5
-rw-r--r--  arch/x86/include/asm/mutex_32.h | 125
-rw-r--r--  arch/x86/include/asm/mutex_64.h | 100
-rw-r--r--  arch/x86/include/asm/nmi.h | 81
-rw-r--r--  arch/x86/include/asm/nops.h | 118
-rw-r--r--  arch/x86/include/asm/numa.h | 5
-rw-r--r--  arch/x86/include/asm/numa_32.h | 11
-rw-r--r--  arch/x86/include/asm/numa_64.h | 43
-rw-r--r--  arch/x86/include/asm/numaq.h | 169
-rw-r--r--  arch/x86/include/asm/numaq/apic.h | 136
-rw-r--r--  arch/x86/include/asm/numaq/apicdef.h | 14
-rw-r--r--  arch/x86/include/asm/numaq/ipi.h | 25
-rw-r--r--  arch/x86/include/asm/numaq/mpparse.h | 7
-rw-r--r--  arch/x86/include/asm/numaq/wakecpu.h | 43
-rw-r--r--  arch/x86/include/asm/olpc.h | 132
-rw-r--r--  arch/x86/include/asm/page.h | 209
-rw-r--r--  arch/x86/include/asm/page_32.h | 136
-rw-r--r--  arch/x86/include/asm/page_64.h | 105
-rw-r--r--  arch/x86/include/asm/param.h | 22
-rw-r--r--  arch/x86/include/asm/paravirt.h | 1650
-rw-r--r--  arch/x86/include/asm/parport.h | 10
-rw-r--r--  arch/x86/include/asm/pat.h | 22
-rw-r--r--  arch/x86/include/asm/pci-direct.h | 21
-rw-r--r--  arch/x86/include/asm/pci.h | 114
-rw-r--r--  arch/x86/include/asm/pci_32.h | 34
-rw-r--r--  arch/x86/include/asm/pci_64.h | 66
-rw-r--r--  arch/x86/include/asm/pda.h | 137
-rw-r--r--  arch/x86/include/asm/percpu.h | 218
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 114
-rw-r--r--  arch/x86/include/asm/pgtable-2level-defs.h | 20
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 79
-rw-r--r--  arch/x86/include/asm/pgtable-3level-defs.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 175
-rw-r--r--  arch/x86/include/asm/pgtable.h | 562
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 191
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 285
-rw-r--r--  arch/x86/include/asm/poll.h | 1
-rw-r--r--  arch/x86/include/asm/posix_types.h | 13
-rw-r--r--  arch/x86/include/asm/posix_types_32.h | 85
-rw-r--r--  arch/x86/include/asm/posix_types_64.h | 119
-rw-r--r--  arch/x86/include/asm/prctl.h | 10
-rw-r--r--  arch/x86/include/asm/processor-cyrix.h | 38
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 100
-rw-r--r--  arch/x86/include/asm/processor.h | 936
-rw-r--r--  arch/x86/include/asm/proto.h | 32
-rw-r--r--  arch/x86/include/asm/ptrace-abi.h | 145
-rw-r--r--  arch/x86/include/asm/ptrace.h | 280
-rw-r--r--  arch/x86/include/asm/pvclock-abi.h | 42
-rw-r--r--  arch/x86/include/asm/pvclock.h | 14
-rw-r--r--  arch/x86/include/asm/reboot.h | 21
-rw-r--r--  arch/x86/include/asm/reboot_fixups.h | 6
-rw-r--r--  arch/x86/include/asm/required-features.h | 82
-rw-r--r--  arch/x86/include/asm/resource.h | 1
-rw-r--r--  arch/x86/include/asm/resume-trace.h | 21
-rw-r--r--  arch/x86/include/asm/rio.h | 63
-rw-r--r--  arch/x86/include/asm/rtc.h | 1
-rw-r--r--  arch/x86/include/asm/rwlock.h | 8
-rw-r--r--  arch/x86/include/asm/rwsem.h | 265
-rw-r--r--  arch/x86/include/asm/scatterlist.h | 33
-rw-r--r--  arch/x86/include/asm/seccomp.h | 5
-rw-r--r--  arch/x86/include/asm/seccomp_32.h | 17
-rw-r--r--  arch/x86/include/asm/seccomp_64.h | 25
-rw-r--r--  arch/x86/include/asm/sections.h | 1
-rw-r--r--  arch/x86/include/asm/segment.h | 209
-rw-r--r--  arch/x86/include/asm/sembuf.h | 24
-rw-r--r--  arch/x86/include/asm/serial.h | 29
-rw-r--r--  arch/x86/include/asm/setup.h | 105
-rw-r--r--  arch/x86/include/asm/shmbuf.h | 51
-rw-r--r--  arch/x86/include/asm/shmparam.h | 6
-rw-r--r--  arch/x86/include/asm/sigcontext.h | 284
-rw-r--r--  arch/x86/include/asm/sigcontext32.h | 75
-rw-r--r--  arch/x86/include/asm/siginfo.h | 10
-rw-r--r--  arch/x86/include/asm/signal.h | 262
-rw-r--r--  arch/x86/include/asm/smp.h | 229
-rw-r--r--  arch/x86/include/asm/socket.h | 57
-rw-r--r--  arch/x86/include/asm/sockios.h | 13
-rw-r--r--  arch/x86/include/asm/sparsemem.h | 34
-rw-r--r--  arch/x86/include/asm/spinlock.h | 364
-rw-r--r--  arch/x86/include/asm/spinlock_types.h | 20
-rw-r--r--  arch/x86/include/asm/srat.h | 39
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 21
-rw-r--r--  arch/x86/include/asm/stat.h | 114
-rw-r--r--  arch/x86/include/asm/statfs.h | 12
-rw-r--r--  arch/x86/include/asm/string.h | 5
-rw-r--r--  arch/x86/include/asm/string_32.h | 326
-rw-r--r--  arch/x86/include/asm/string_64.h | 60
-rw-r--r--  arch/x86/include/asm/summit/apic.h | 184
-rw-r--r--  arch/x86/include/asm/summit/apicdef.h | 13
-rw-r--r--  arch/x86/include/asm/summit/ipi.h | 25
-rw-r--r--  arch/x86/include/asm/summit/mpparse.h | 109
-rw-r--r--  arch/x86/include/asm/suspend.h | 5
-rw-r--r--  arch/x86/include/asm/suspend_32.h | 51
-rw-r--r--  arch/x86/include/asm/suspend_64.h | 52
-rw-r--r--  arch/x86/include/asm/swiotlb.h | 58
-rw-r--r--  arch/x86/include/asm/sync_bitops.h | 130
-rw-r--r--  arch/x86/include/asm/syscall.h | 213
-rw-r--r--  arch/x86/include/asm/syscalls.h | 93
-rw-r--r--  arch/x86/include/asm/system.h | 425
-rw-r--r--  arch/x86/include/asm/system_64.h | 22
-rw-r--r--  arch/x86/include/asm/tce.h | 48
-rw-r--r--  arch/x86/include/asm/termbits.h | 198
-rw-r--r--  arch/x86/include/asm/termios.h | 113
-rw-r--r--  arch/x86/include/asm/therm_throt.h | 9
-rw-r--r--  arch/x86/include/asm/thread_info.h | 264
-rw-r--r--  arch/x86/include/asm/time.h | 63
-rw-r--r--  arch/x86/include/asm/timer.h | 66
-rw-r--r--  arch/x86/include/asm/timex.h | 19
-rw-r--r--  arch/x86/include/asm/tlb.h | 11
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 178
-rw-r--r--  arch/x86/include/asm/topology.h | 258
-rw-r--r--  arch/x86/include/asm/trampoline.h | 21
-rw-r--r--  arch/x86/include/asm/traps.h | 81
-rw-r--r--  arch/x86/include/asm/tsc.h | 62
-rw-r--r--  arch/x86/include/asm/types.h | 36
-rw-r--r--  arch/x86/include/asm/uaccess.h | 456
-rw-r--r--  arch/x86/include/asm/uaccess_32.h | 218
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 208
-rw-r--r--  arch/x86/include/asm/ucontext.h | 18
-rw-r--r--  arch/x86/include/asm/unaligned.h | 14
-rw-r--r--  arch/x86/include/asm/unistd.h | 13
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 379
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 693
-rw-r--r--  arch/x86/include/asm/unwind.h | 13
-rw-r--r--  arch/x86/include/asm/user.h | 5
-rw-r--r--  arch/x86/include/asm/user32.h | 70
-rw-r--r--  arch/x86/include/asm/user_32.h | 131
-rw-r--r--  arch/x86/include/asm/user_64.h | 137
-rw-r--r--  arch/x86/include/asm/uv/bios.h | 94
-rw-r--r--  arch/x86/include/asm/uv/uv_bau.h | 332
-rw-r--r--  arch/x86/include/asm/uv/uv_hub.h | 354
-rw-r--r--  arch/x86/include/asm/uv/uv_irq.h | 36
-rw-r--r--  arch/x86/include/asm/uv/uv_mmrs.h | 1295
-rw-r--r--  arch/x86/include/asm/vdso.h | 47
-rw-r--r--  arch/x86/include/asm/vga.h | 20
-rw-r--r--  arch/x86/include/asm/vgtod.h | 29
-rw-r--r--  arch/x86/include/asm/vic.h | 61
-rw-r--r--  arch/x86/include/asm/visws/cobalt.h | 125
-rw-r--r--  arch/x86/include/asm/visws/lithium.h | 53
-rw-r--r--  arch/x86/include/asm/visws/piix4.h | 107
-rw-r--r--  arch/x86/include/asm/visws/sgivw.h | 5
-rw-r--r--  arch/x86/include/asm/vm86.h | 208
-rw-r--r--  arch/x86/include/asm/vmi.h | 263
-rw-r--r--  arch/x86/include/asm/vmi_time.h | 98
-rw-r--r--  arch/x86/include/asm/voyager.h | 528
-rw-r--r--  arch/x86/include/asm/vsyscall.h | 44
-rw-r--r--  arch/x86/include/asm/xcr.h | 49
-rw-r--r--  arch/x86/include/asm/xen/events.h | 24
-rw-r--r--  arch/x86/include/asm/xen/grant_table.h | 7
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 527
-rw-r--r--  arch/x86/include/asm/xen/hypervisor.h | 82
-rw-r--r--  arch/x86/include/asm/xen/interface.h | 175
-rw-r--r--  arch/x86/include/asm/xen/interface_32.h | 97
-rw-r--r--  arch/x86/include/asm/xen/interface_64.h | 159
-rw-r--r--  arch/x86/include/asm/xen/page.h | 165
-rw-r--r--  arch/x86/include/asm/xor.h | 5
-rw-r--r--  arch/x86/include/asm/xor_32.h | 888
-rw-r--r--  arch/x86/include/asm/xor_64.h | 361
-rw-r--r--  arch/x86/include/asm/xsave.h | 118
-rw-r--r--  arch/x86/kernel/Makefile | 23
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 56
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 7
-rw-r--r--  arch/x86/kernel/alternative.c | 10
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 359
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 196
-rw-r--r--  arch/x86/kernel/aperture_64.c | 6
-rw-r--r--  arch/x86/kernel/apic.c (renamed from arch/x86/kernel/apic_32.c) | 1038
-rw-r--r--  arch/x86/kernel/apic_64.c | 1390
-rw-r--r--  arch/x86/kernel/apm_32.c | 4
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 2
-rw-r--r--  arch/x86/kernel/bios_uv.c | 137
-rw-r--r--  arch/x86/kernel/cpu/.gitignore | 1
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 34
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 88
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 548
-rw-r--r--  arch/x86/kernel/cpu/amd_64.c | 224
-rw-r--r--  arch/x86/kernel/cpu/centaur.c | 4
-rw-r--r--  arch/x86/kernel/cpu/centaur_64.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cmpxchg.c | 72
-rw-r--r--  arch/x86/kernel/cpu/common.c | 1001
-rw-r--r--  arch/x86/kernel/cpu/common_64.c | 712
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 19
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 16
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/elanfreq.c | 42
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 41
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 44
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 23
-rw-r--r--  arch/x86/kernel/cpu/feature_names.c | 84
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 365
-rw-r--r--  arch/x86/kernel/cpu/intel_64.c | 95
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 169
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mkcapflags.pl | 32
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 7
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 276
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 97
-rw-r--r--  arch/x86/kernel/cpu/powerflags.c | 20
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 6
-rw-r--r--  arch/x86/kernel/cpu/transmeta.c | 32
-rw-r--r--  arch/x86/kernel/cpu/umc.c | 3
-rw-r--r--  arch/x86/kernel/cpuid.c | 5
-rw-r--r--  arch/x86/kernel/crash_dump_32.c | 3
-rw-r--r--  arch/x86/kernel/crash_dump_64.c | 14
-rw-r--r--  arch/x86/kernel/doublefault_32.c | 2
-rw-r--r--  arch/x86/kernel/ds.c | 954
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 449
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 575
-rw-r--r--  arch/x86/kernel/e820.c | 32
-rw-r--r--  arch/x86/kernel/early-quirks.c | 115
-rw-r--r--  arch/x86/kernel/early_printk.c | 748
-rw-r--r--  arch/x86/kernel/efi.c | 10
-rw-r--r--  arch/x86/kernel/entry_32.S | 36
-rw-r--r--  arch/x86/kernel/entry_64.S | 45
-rw-r--r--  arch/x86/kernel/es7000_32.c (renamed from arch/x86/mach-es7000/es7000plat.c) | 115
-rw-r--r--  arch/x86/kernel/ftrace.c | 124
-rw-r--r--  arch/x86/kernel/genapic_64.c | 88
-rw-r--r--  arch/x86/kernel/genapic_flat_64.c | 64
-rw-r--r--  arch/x86/kernel/genx2apic_cluster.c | 159
-rw-r--r--  arch/x86/kernel/genx2apic_phys.c | 154
-rw-r--r--  arch/x86/kernel/genx2apic_uv_x.c | 138
-rw-r--r--  arch/x86/kernel/head.c | 1
-rw-r--r--  arch/x86/kernel/head64.c | 5
-rw-r--r--  arch/x86/kernel/head_32.S | 34
-rw-r--r--  arch/x86/kernel/head_64.S | 4
-rw-r--r--  arch/x86/kernel/hpet.c | 459
-rw-r--r--  arch/x86/kernel/i387.c | 168
-rw-r--r--  arch/x86/kernel/i8259.c | 24
-rw-r--r--  arch/x86/kernel/io_apic.c (renamed from arch/x86/kernel/io_apic_64.c) | 2071
-rw-r--r--  arch/x86/kernel/io_apic_32.c | 2901
-rw-r--r--  arch/x86/kernel/ioport.c | 1
-rw-r--r--  arch/x86/kernel/ipi.c | 3
-rw-r--r--  arch/x86/kernel/irq.c | 189
-rw-r--r--  arch/x86/kernel/irq_32.c | 194
-rw-r--r--  arch/x86/kernel/irq_64.c | 169
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 84
-rw-r--r--  arch/x86/kernel/irqinit_64.c | 69
-rw-r--r--  arch/x86/kernel/k8.c | 5
-rw-r--r--  arch/x86/kernel/kgdb.c | 50
-rw-r--r--  arch/x86/kernel/kvm.c | 2
-rw-r--r--  arch/x86/kernel/kvmclock.c | 30
-rw-r--r--  arch/x86/kernel/ldt.c | 10
-rw-r--r--  arch/x86/kernel/microcode.c | 853
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 435
-rw-r--r--  arch/x86/kernel/microcode_core.c | 508
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 480
-rw-r--r--  arch/x86/kernel/mpparse.c | 2
-rw-r--r--  arch/x86/kernel/msr.c | 4
-rw-r--r--  arch/x86/kernel/nmi.c | 11
-rw-r--r--  arch/x86/kernel/numaq_32.c | 7
-rw-r--r--  arch/x86/kernel/olpc.c | 6
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c | 37
-rw-r--r--  arch/x86/kernel/paravirt.c | 30
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 2
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 36
-rw-r--r--  arch/x86/kernel/pci-dma.c | 199
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 170
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 10
-rw-r--r--  arch/x86/kernel/pcspeaker.c | 13
-rw-r--r--  arch/x86/kernel/process.c | 21
-rw-r--r--  arch/x86/kernel/process_32.c | 105
-rw-r--r--  arch/x86/kernel/process_64.c | 204
-rw-r--r--  arch/x86/kernel/ptrace.c | 522
-rw-r--r--  arch/x86/kernel/pvclock.c | 12
-rw-r--r--  arch/x86/kernel/quirks.c | 44
-rw-r--r--  arch/x86/kernel/reboot.c | 6
-rw-r--r--  arch/x86/kernel/rtc.c | 42
-rw-r--r--  arch/x86/kernel/setup.c | 236
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 28
-rw-r--r--  arch/x86/kernel/sigframe.h | 19
-rw-r--r--  arch/x86/kernel/signal_32.c | 273
-rw-r--r--  arch/x86/kernel/signal_64.c | 362
-rw-r--r--  arch/x86/kernel/smp.c | 6
-rw-r--r--  arch/x86/kernel/smpboot.c | 238
-rw-r--r--  arch/x86/kernel/summit_32.c | 2
-rw-r--r--  arch/x86/kernel/sys_i386_32.c | 2
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 44
-rw-r--r--  arch/x86/kernel/syscall_64.c | 4
-rw-r--r--  arch/x86/kernel/time_32.c | 8
-rw-r--r--  arch/x86/kernel/time_64.c | 23
-rw-r--r--  arch/x86/kernel/tlb_32.c | 8
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 2
-rw-r--r--  arch/x86/kernel/tls.c | 1
-rw-r--r--  arch/x86/kernel/traps.c (renamed from arch/x86/kernel/traps_32.c) | 888
-rw-r--r--  arch/x86/kernel/traps_64.c | 1212
-rw-r--r--  arch/x86/kernel/tsc.c | 290
-rw-r--r--  arch/x86/kernel/uv_irq.c | 79
-rw-r--r--  arch/x86/kernel/uv_sysfs.c | 72
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 48
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kernel/vmi_32.c | 16
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 3
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S | 9
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S | 9
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 2
-rw-r--r--  arch/x86/kernel/xsave.c | 345
-rw-r--r--  arch/x86/kvm/Makefile | 5
-rw-r--r--  arch/x86/kvm/i8254.c | 87
-rw-r--r--  arch/x86/kvm/i8254.h | 7
-rw-r--r--  arch/x86/kvm/i8259.c | 53
-rw-r--r--  arch/x86/kvm/irq.c | 3
-rw-r--r--  arch/x86/kvm/irq.h | 6
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 32
-rw-r--r--  arch/x86/kvm/lapic.c | 49
-rw-r--r--  arch/x86/kvm/mmu.c | 680
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 249
-rw-r--r--  arch/x86/kvm/svm.c | 156
-rw-r--r--  arch/x86/kvm/vmx.c | 712
-rw-r--r--  arch/x86/kvm/vmx.h | 18
-rw-r--r--  arch/x86/kvm/x86.c | 554
-rw-r--r--  arch/x86/kvm/x86.h | 22
-rw-r--r--  arch/x86/kvm/x86_emulate.c | 170
-rw-r--r--  arch/x86/lguest/boot.c | 40
-rw-r--r--  arch/x86/lib/Makefile | 3
-rw-r--r--  arch/x86/lib/msr-on-cpu.c | 78
-rw-r--r--  arch/x86/lib/string_32.c | 42
-rw-r--r--  arch/x86/lib/strstr_32.c | 6
-rw-r--r--  arch/x86/lib/usercopy_32.c | 7
-rw-r--r--  arch/x86/mach-default/setup.c | 19
-rw-r--r--  arch/x86/mach-es7000/Makefile | 5
-rw-r--r--  arch/x86/mach-es7000/es7000.h | 114
-rw-r--r--  arch/x86/mach-generic/Makefile | 1
-rw-r--r--  arch/x86/mach-generic/bigsmp.c | 13
-rw-r--r--  arch/x86/mach-generic/es7000.c | 47
-rw-r--r--  arch/x86/mach-generic/numaq.c | 26
-rw-r--r--  arch/x86/mach-generic/summit.c | 25
-rw-r--r--  arch/x86/mach-voyager/voyager_smp.c | 6
-rw-r--r--  arch/x86/mm/Makefile | 6
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 4
-rw-r--r--  arch/x86/mm/fault.c | 67
-rw-r--r--  arch/x86/mm/gup.c | 10
-rw-r--r--  arch/x86/mm/highmem_32.c | 1
-rw-r--r--  arch/x86/mm/init_32.c | 90
-rw-r--r--  arch/x86/mm/init_64.c | 184
-rw-r--r--  arch/x86/mm/ioremap.c | 197
-rw-r--r--  arch/x86/mm/memtest.c | 7
-rw-r--r--  arch/x86/mm/mmio-mod.c | 87
-rw-r--r--  arch/x86/mm/numa_32.c (renamed from arch/x86/mm/discontig_32.c) | 2
-rw-r--r--  arch/x86/mm/numa_64.c | 10
-rw-r--r--  arch/x86/mm/pageattr-test.c | 9
-rw-r--r--  arch/x86/mm/pageattr.c | 476
-rw-r--r--  arch/x86/mm/pat.c | 132
-rw-r--r--  arch/x86/mm/pf_in.c | 121
-rw-r--r--  arch/x86/mm/pgtable.c | 6
-rw-r--r--  arch/x86/mm/pgtable_32.c | 3
-rw-r--r--  arch/x86/mm/srat_64.c | 2
-rw-r--r--  arch/x86/mm/testmmiotrace.c | 4
-rw-r--r--  arch/x86/oprofile/Makefile | 2
-rw-r--r--  arch/x86/oprofile/backtrace.c | 3
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 204
-rw-r--r--  arch/x86/oprofile/op_counter.h | 18
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 546
-rw-r--r--  arch/x86/oprofile/op_model_athlon.c | 190
-rw-r--r--  arch/x86/oprofile/op_model_p4.c | 207
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 120
-rw-r--r--  arch/x86/oprofile/op_x86_model.h | 17
-rw-r--r--  arch/x86/pci/acpi.c | 5
-rw-r--r--  arch/x86/pci/amd_bus.c | 2
-rw-r--r--  arch/x86/pci/fixup.c | 28
-rw-r--r--  arch/x86/pci/i386.c | 3
-rw-r--r--  arch/x86/pci/irq.c | 86
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 12
-rw-r--r--  arch/x86/power/cpu_32.c | 7
-rw-r--r--  arch/x86/power/cpu_64.c | 7
-rw-r--r--  arch/x86/power/hibernate_asm_32.S | 14
-rw-r--r--  arch/x86/xen/Kconfig | 12
-rw-r--r--  arch/x86/xen/Makefile | 12
-rw-r--r--  arch/x86/xen/debugfs.c | 123
-rw-r--r--  arch/x86/xen/debugfs.h | 10
-rw-r--r--  arch/x86/xen/enlighten.c | 318
-rw-r--r--  arch/x86/xen/irq.c | 141
-rw-r--r--  arch/x86/xen/mmu.c | 315
-rw-r--r--  arch/x86/xen/mmu.h | 3
-rw-r--r--  arch/x86/xen/multicalls.c | 115
-rw-r--r--  arch/x86/xen/smp.c | 245
-rw-r--r--  arch/x86/xen/spinlock.c | 428
-rw-r--r--  arch/x86/xen/time.c | 23
-rw-r--r--  arch/x86/xen/xen-asm_32.S | 2
-rw-r--r--  arch/x86/xen/xen-asm_64.S | 22
-rw-r--r--  arch/x86/xen/xen-ops.h | 8
586 files changed, 58376 insertions(+), 14873 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ed92864d132..350bee1d54d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,6 +18,7 @@ config X86_64
### Arch settings
config X86
def_bool y
+ select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
@@ -25,10 +26,12 @@ config X86
select HAVE_KPROBES
select ARCH_WANT_OPTIONAL_GPIOLIB
select HAVE_KRETPROBES
+ select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE
select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
select HAVE_ARCH_KGDB if !X86_VOYAGER
+ select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -37,10 +40,6 @@ config ARCH_DEFCONFIG
default "arch/x86/configs/i386_defconfig" if X86_32
default "arch/x86/configs/x86_64_defconfig" if X86_64
-
-config GENERIC_LOCKBREAK
- def_bool n
-
config GENERIC_TIME
def_bool y
@@ -93,7 +92,7 @@ config GENERIC_HWEIGHT
def_bool y
config GENERIC_GPIO
- def_bool n
+ bool
config ARCH_MAY_HAVE_PC_FDC
def_bool y
@@ -104,12 +103,6 @@ config RWSEM_GENERIC_SPINLOCK
config RWSEM_XCHGADD_ALGORITHM
def_bool X86_XADD
-config ARCH_HAS_ILOG2_U32
- def_bool n
-
-config ARCH_HAS_ILOG2_U64
- def_bool n
-
config ARCH_HAS_CPU_IDLE_WAIT
def_bool y
@@ -123,6 +116,9 @@ config GENERIC_TIME_VSYSCALL
config ARCH_HAS_CPU_RELAX
def_bool y
+config ARCH_HAS_DEFAULT_IDLE
+ def_bool y
+
config ARCH_HAS_CACHE_LINE_SIZE
def_bool y
@@ -151,9 +147,6 @@ config AUDIT_ARCH
bool
default X86_64
-config ARCH_SUPPORTS_AOUT
- def_bool y
-
config ARCH_SUPPORTS_OPTIMIZED_INLINING
def_bool y
@@ -204,6 +197,7 @@ config X86_TRAMPOLINE
config KTIME_SCALAR
def_bool X86_32
source "init/Kconfig"
+source "kernel/Kconfig.freezer"
menu "Processor type and features"
@@ -553,6 +547,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
config AMD_IOMMU
bool "AMD IOMMU support"
select SWIOTLB
+ select PCI_MSI
depends on X86_64 && PCI && ACPI
help
With this option you can enable support for AMD IOMMU hardware in
@@ -758,9 +753,8 @@ config I8K
Say N otherwise.
config X86_REBOOTFIXUPS
- def_bool n
- prompt "Enable X86 board specific fixups for reboot"
- depends on X86_32 && X86
+ bool "Enable X86 board specific fixups for reboot"
+ depends on X86_32
---help---
This enables chipset and/or board specific fixups to be done
in order to get reboot to work correctly. This is only needed on
@@ -776,23 +770,45 @@ config X86_REBOOTFIXUPS
Say N otherwise.
config MICROCODE
- tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
+ tristate "/dev/cpu/microcode - microcode support"
select FW_LOADER
---help---
If you say Y here, you will be able to update the microcode on
- Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II,
- Pentium III, Pentium 4, Xeon etc. You will obviously need the
- actual microcode binary data itself which is not shipped with the
- Linux kernel.
+ certain Intel and AMD processors. The Intel support is for the
+ IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
+ Pentium 4, Xeon etc. The AMD support is for family 0x10 and
+ 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
+ You will obviously need the actual microcode binary data itself
+ which is not shipped with the Linux kernel.
- For latest news and information on obtaining all the required
- ingredients for this driver, check:
- <http://www.urbanmyth.org/microcode/>.
+ This option selects the general module only; you need to select
+ at least one vendor specific module as well.
To compile this driver as a module, choose M here: the
module will be called microcode.
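(Editorial usage sketch, not part of this patch: built modular, the loader is brought in with

    modprobe microcode

after which the vendor-specific microcode blob is requested through the firmware loader (FW_LOADER, typically via udev), since it is not shipped with the kernel.)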
-config MICROCODE_OLD_INTERFACE
+config MICROCODE_INTEL
+ bool "Intel microcode patch loading support"
+ depends on MICROCODE
+ default MICROCODE
+ select FW_LOADER
+ ---help---
+ This option enables microcode patch loading support for Intel
+ processors.
+
+ For latest news and information on obtaining all the required
+ Intel ingredients for this driver, check:
+ <http://www.urbanmyth.org/microcode/>.
+
+config MICROCODE_AMD
+ bool "AMD microcode patch loading support"
+ depends on MICROCODE
+ select FW_LOADER
+ ---help---
+ If you select this option, microcode patch loading support for AMD
+ processors will be enabled.
+
+ config MICROCODE_OLD_INTERFACE
def_bool y
depends on MICROCODE
@@ -922,16 +938,17 @@ config HIGHMEM
depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
config X86_PAE
- def_bool n
- prompt "PAE (Physical Address Extension) Support"
+ bool "PAE (Physical Address Extension) Support"
depends on X86_32 && !HIGHMEM4G
- select RESOURCES_64BIT
help
PAE is required for NX support, and furthermore enables
larger swapspace support for non-overcommit purposes. It
has the cost of more pagetable lookup overhead, and also
consumes more pagetable space per process.
+config ARCH_PHYS_ADDR_T_64BIT
+ def_bool X86_64 || X86_PAE
+
# Common NUMA Features
config NUMA
bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
@@ -1020,7 +1037,7 @@ config HAVE_ARCH_ALLOC_REMAP
config ARCH_FLATMEM_ENABLE
def_bool y
- depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA
+ depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
config ARCH_DISCONTIGMEM_ENABLE
def_bool y
@@ -1036,7 +1053,7 @@ config ARCH_SPARSEMEM_DEFAULT
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
+ depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
@@ -1059,6 +1076,56 @@ config HIGHPTE
low memory. Setting this option will put user-space page table
entries in high memory.
+config X86_CHECK_BIOS_CORRUPTION
+ bool "Check for low memory corruption"
+ help
+ Periodically check for memory corruption in low memory, which
+ is suspected to be caused by BIOS. Even when enabled in the
+ configuration, it is disabled at runtime. Enable it by
+ setting "memory_corruption_check=1" on the kernel command
+ line. By default it scans the low 64k of memory every 60
+ seconds; see the memory_corruption_check_size and
+ memory_corruption_check_period parameters in
+ Documentation/kernel-parameters.txt to adjust this.
+
+ When enabled with the default parameters, this option has
+ almost no overhead, as it reserves a relatively small amount
+ of memory and scans it infrequently. It both detects corruption
+ and prevents it from affecting the running system.
+
+ It is, however, intended as a diagnostic tool; if repeatable
+ BIOS-originated corruption always affects the same memory,
+ you can use memmap= to prevent the kernel from using that
+ memory.
+
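(Editorial usage sketch based on the parameters named in the help text above; the values shown are the documented defaults. Booting with

    memory_corruption_check=1 memory_corruption_check_size=64K memory_corruption_check_period=60

scans the low 64K every 60 seconds. If repeatable corruption shows up at a fixed range, that range can be excluded from kernel use with memmap=, e.g. memmap=64K$0 for the whole low 64K; see Documentation/kernel-parameters.txt for the exact syntax.)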
+config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+ bool "Set the default setting of memory_corruption_check"
+ depends on X86_CHECK_BIOS_CORRUPTION
+ default y
+ help
+ Set whether the default state of memory_corruption_check is
+ on or off.
+
+config X86_RESERVE_LOW_64K
+ bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
+ default y
+ help
+ Reserve the first 64K of physical RAM on BIOSes that are known
+ to potentially corrupt that memory range. A number of BIOSes are
+ known to utilize this area during suspend/resume, so it must not
+ be used by the kernel.
+
+ Set this to N if you are absolutely sure that you trust the BIOS
+ to get all its memory reservations and usages right.
+
+ If you have doubts about the BIOS (e.g. suspend/resume does not
+ work or there are kernel crashes after certain hardware hotplug
+ events) and it's not AMI or Phoenix, then you might want to enable
+ X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
+ corruption patterns.
+
+ Say Y if unsure.
+
config MATH_EMULATION
bool
prompt "Math emulation" if X86_32
@@ -1117,10 +1184,10 @@ config MTRR
You can safely say Y even if your machine doesn't have MTRRs, you'll
just add about 9 KB to your kernel.
- See <file:Documentation/mtrr.txt> for more information.
+ See <file:Documentation/x86/mtrr.txt> for more information.
config MTRR_SANITIZER
- bool
+ def_bool y
prompt "MTRR cleanup support"
depends on MTRR
help
@@ -1131,7 +1198,7 @@ config MTRR_SANITIZER
The largest mtrr entry size for a continuous block can be set with
mtrr_chunk_size.
- If unsure, say N.
+ If unsure, say Y.
config MTRR_SANITIZER_ENABLE_DEFAULT
int "MTRR cleanup enable value (0-1)"
@@ -1166,8 +1233,7 @@ config X86_PAT
If unsure, say Y.
config EFI
- def_bool n
- prompt "EFI runtime service support"
+ bool "EFI runtime service support"
depends on ACPI
---help---
This enables the kernel to use EFI runtime services that are
@@ -1180,18 +1246,9 @@ config EFI
resultant kernel should continue to boot on existing non-EFI
platforms.
-config IRQBALANCE
- def_bool y
- prompt "Enable kernel irq balancing"
- depends on X86_32 && SMP && X86_IO_APIC
- help
- The default yes will allow the kernel to do irq load balancing.
- Saying no will keep the kernel from doing irq load balancing.
-
config SECCOMP
def_bool y
prompt "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
help
This kernel feature is useful for number crunching applications
that may need to compute untrusted bytecode during their
@@ -1199,7 +1256,7 @@ config SECCOMP
the process as file descriptors supporting the read/write
syscalls, it's possible to isolate those applications in
their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
+ enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
and the task is only allowed to execute a few safe syscalls
defined by each seccomp mode.
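As an illustration of the prctl() interface the new help text refers to (an editorial sketch, not part of this patch), a process enters strict-mode seccomp like this:

    /* Minimal strict-mode seccomp example; editorial illustration only. */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/prctl.h>       /* PR_SET_SECCOMP */
    #include <linux/seccomp.h>   /* SECCOMP_MODE_STRICT */

    int main(void)
    {
    	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) != 0) {
    		perror("prctl(PR_SET_SECCOMP)");
    		return 1;
    	}
    	/* From here on only read(), write(), _exit() and sigreturn()
    	 * are permitted; any other syscall kills the task with SIGKILL. */
    	write(1, "sandboxed\n", 10);
    	_exit(0);
    }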
@@ -1356,14 +1413,14 @@ config PHYSICAL_ALIGN
Don't change this unless you know what you are doing.
config HOTPLUG_CPU
- bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)"
- depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
+ bool "Support for hot-pluggable CPUs"
+ depends on SMP && HOTPLUG && !X86_VOYAGER
---help---
- Say Y here to experiment with turning CPUs off and on, and to
- enable suspend on SMP systems. CPUs can be controlled through
- /sys/devices/system/cpu.
- Say N if you want to disable CPU hotplug and don't need to
- suspend.
+ Say Y here to allow turning CPUs off and on. CPUs can be
+ controlled through /sys/devices/system/cpu.
+ ( Note: power management support will enable this option
+ automatically on SMP systems. )
+ Say N if you want to disable CPU hotplug.
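(Editorial illustration of the sysfs interface mentioned above, not part of the patch:

    echo 0 > /sys/devices/system/cpu/cpu1/online    # take CPU 1 offline
    echo 1 > /sys/devices/system/cpu/cpu1/online    # bring it back online

cpu0 exposes no 'online' file on x86 at this point, as the boot CPU cannot be offlined.)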
config COMPAT_VDSO
def_bool y
@@ -1378,6 +1435,51 @@ config COMPAT_VDSO
If unsure, say Y.
+config CMDLINE_BOOL
+ bool "Built-in kernel command line"
+ default n
+ help
+ Allow for specifying boot arguments to the kernel at
+ build time. On some systems (e.g. embedded ones), it is
+ necessary or convenient to provide some or all of the
+ kernel boot arguments with the kernel itself (that is,
+ to not rely on the boot loader to provide them.)
+
+ To compile command line arguments into the kernel,
+ set this option to 'Y', then fill in
+ the boot arguments in CONFIG_CMDLINE.
+
+ Systems with fully functional boot loaders (i.e. non-embedded)
+ should leave this option set to 'N'.
+
+config CMDLINE
+ string "Built-in kernel command string"
+ depends on CMDLINE_BOOL
+ default ""
+ help
+ Enter arguments here that should be compiled into the kernel
+ image and used at boot time. If the boot loader provides a
+ command line at boot time, it is appended to this string to
+ form the full kernel command line, when the system boots.
+
+ However, you can use the CONFIG_CMDLINE_OVERRIDE option to
+ change this behavior.
+
+ In most cases, the command line (whether built-in or provided
+ by the boot loader) should specify the device for the root
+ file system.
+
+config CMDLINE_OVERRIDE
+ bool "Built-in command line overrides boot loader arguments"
+ default n
+ depends on CMDLINE_BOOL
+ help
+ Set this option to 'Y' to have the kernel ignore the boot loader
+ command line, and use ONLY the built-in command line.
+
+ This is used to work around broken boot loaders. This should
+ be set to 'N' under normal conditions.
+
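(Editorial configuration sketch for the three options above; the command-line values are purely illustrative:

    CONFIG_CMDLINE_BOOL=y
    CONFIG_CMDLINE="console=ttyS0,115200 root=/dev/sda1 ro"
    # CONFIG_CMDLINE_OVERRIDE is not set

With CMDLINE_OVERRIDE unset, whatever the boot loader passes is appended to the built-in string; setting it to 'Y' makes the kernel ignore the boot loader's arguments entirely.)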
endmenu
config ARCH_ENABLE_MEMORY_HOTPLUG
@@ -1536,6 +1638,8 @@ source "arch/x86/kernel/cpu/cpufreq/Kconfig"
source "drivers/cpuidle/Kconfig"
+source "drivers/idle/Kconfig"
+
endmenu
@@ -1643,6 +1747,14 @@ config DMAR_FLOPPY_WA
workaround will set up a 1:1 mapping for the first
16M to make floppy (an ISA device) work.
+config INTR_REMAP
+ bool "Support for Interrupt Remapping (EXPERIMENTAL)"
+ depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
+ help
+ Supports Interrupt remapping for IO-APIC and MSI devices.
+ To use x2apic mode in CPUs which support x2APIC enhancements or
+ to support platforms with CPUs having > 8 bit APIC IDs, say Y.
+
source "drivers/pci/pcie/Kconfig"
source "drivers/pci/Kconfig"
@@ -1759,7 +1871,7 @@ config IA32_EMULATION
config IA32_AOUT
tristate "IA32 a.out support"
- depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT
+ depends on IA32_EMULATION
help
Support old a.out binaries in the 32bit emulation.
@@ -1773,7 +1885,7 @@ config COMPAT_FOR_U64_ALIGNMENT
config SYSVIPC_COMPAT
def_bool y
- depends on X86_64 && COMPAT && SYSVIPC
+ depends on COMPAT && SYSVIPC
endmenu
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index b225219c448..0b7c4a3f065 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -38,8 +38,7 @@ config M386
- "Crusoe" for the Transmeta Crusoe series.
- "Efficeon" for the Transmeta Efficeon series.
- "Winchip-C6" for original IDT Winchip.
- - "Winchip-2" for IDT Winchip 2.
- - "Winchip-2A" for IDT Winchips with 3dNow! capabilities.
+ - "Winchip-2" for IDT Winchips with 3dNow! capabilities.
- "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
- "Geode GX/LX" For AMD Geode GX and LX processors.
- "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
@@ -194,19 +193,11 @@ config MWINCHIPC6
treat this chip as a 586TSC with some extended instructions
and alignment requirements.
-config MWINCHIP2
- bool "Winchip-2"
- depends on X86_32
- help
- Select this for an IDT Winchip-2. Linux and GCC
- treat this chip as a 586TSC with some extended instructions
- and alignment requirements.
-
config MWINCHIP3D
- bool "Winchip-2A/Winchip-3"
+ bool "Winchip-2/Winchip-2A/Winchip-3"
depends on X86_32
help
- Select this for an IDT Winchip-2A or 3. Linux and GCC
+ Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
treat this chip as a 586TSC with some extended instructions
and alignment requirements. Also enable out of order memory
stores for this CPU, which can increase performance of some
@@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC
default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
- default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+ default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
config X86_XADD
@@ -360,7 +351,7 @@ config X86_POPAD_OK
config X86_ALIGNMENT_16
def_bool y
- depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
config X86_INTEL_USERCOPY
def_bool y
@@ -368,7 +359,7 @@ config X86_INTEL_USERCOPY
config X86_USE_PPRO_CHECKSUM
def_bool y
- depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
config X86_USE_3DNOW
def_bool y
@@ -376,7 +367,7 @@ config X86_USE_3DNOW
config X86_OOSTORE
def_bool y
- depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
+ depends on (MWINCHIP3D || MWINCHIPC6) && MTRR
#
# P6_NOPs are a relatively minor optimization that require a family >=
@@ -396,7 +387,7 @@ config X86_P6_NOP
config X86_TSC
def_bool y
- depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
+ depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
config X86_CMPXCHG64
def_bool y
@@ -406,7 +397,7 @@ config X86_CMPXCHG64
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64)
+ depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64)
config X86_MINIMUM_CPU_FAMILY
int
@@ -417,4 +408,124 @@ config X86_MINIMUM_CPU_FAMILY
config X86_DEBUGCTLMSR
def_bool y
- depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
+ depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
+
+menuconfig PROCESSOR_SELECT
+ bool "Supported processor vendors" if EMBEDDED
+ help
+ This lets you choose what x86 vendor support code your kernel
+ will include.
+
+config CPU_SUP_INTEL
+ default y
+ bool "Support Intel processors" if PROCESSOR_SELECT
+ help
+ This enables detection, tunings and quirks for Intel processors
+
+ You need this enabled if you want your kernel to run on an
+ Intel CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on an Intel
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_CYRIX_32
+ default y
+ bool "Support Cyrix processors" if PROCESSOR_SELECT
+ depends on !64BIT
+ help
+ This enables detection, tunings and quirks for Cyrix processors
+
+ You need this enabled if you want your kernel to run on a
+ Cyrix CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a Cyrix
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_AMD
+ default y
+ bool "Support AMD processors" if PROCESSOR_SELECT
+ help
+ This enables detection, tunings and quirks for AMD processors
+
+ You need this enabled if you want your kernel to run on an
+ AMD CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on an AMD
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_CENTAUR_32
+ default y
+ bool "Support Centaur processors" if PROCESSOR_SELECT
+ depends on !64BIT
+ help
+ This enables detection, tunings and quirks for Centaur processors
+
+ You need this enabled if you want your kernel to run on a
+ Centaur CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a Centaur
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_CENTAUR_64
+ default y
+ bool "Support Centaur processors" if PROCESSOR_SELECT
+ depends on 64BIT
+ help
+ This enables detection, tunings and quirks for Centaur processors
+
+ You need this enabled if you want your kernel to run on a
+ Centaur CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a Centaur
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_TRANSMETA_32
+ default y
+ bool "Support Transmeta processors" if PROCESSOR_SELECT
+ depends on !64BIT
+ help
+ This enables detection, tunings and quirks for Transmeta processors
+
+ You need this enabled if you want your kernel to run on a
+ Transmeta CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a Transmeta
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_UMC_32
+ default y
+ bool "Support UMC processors" if PROCESSOR_SELECT
+ depends on !64BIT
+ help
+ This enables detection, tunings and quirks for UMC processors
+
+ You need this enabled if you want your kernel to run on a
+ UMC CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a UMC
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config X86_DS
+ bool "Debug Store support"
+ default y
+ help
+ Add support for Debug Store.
+ This allows the kernel to provide a memory buffer to the hardware
+ to store various profiling and tracing events.
+
+config X86_PTRACE_BTS
+ bool "ptrace interface to Branch Trace Store"
+ default y
+ depends on (X86_DS && X86_DEBUGCTLMSR)
+ help
+ Add a ptrace interface to allow collecting an execution trace
+ of the traced task.
+ This collects control flow changes in a (cyclic) buffer and allows
+ debuggers to fill in the gaps and show an execution trace of the debuggee.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 092f019e033..2a3dfbd5e67 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,19 @@ config EARLY_PRINTK
with klogd/syslogd or the X server. You should normally say N here,
unless you want to debug such a crash.
+config EARLY_PRINTK_DBGP
+ bool "Early printk via EHCI debug port"
+ default n
+ depends on EARLY_PRINTK && PCI
+ help
+ Write kernel log output directly into the EHCI debug port.
+
+ This is useful for kernel debugging when your machine crashes very
+ early before the console code is initialized. For normal operation
+ it is not recommended because it looks ugly and doesn't cooperate
+ with klogd/syslogd or the X server. You should normally say N here,
+ unless you want to debug such a crash. You need a USB debug device.
+
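(Editorial usage note: with this option built in, output over the EHCI debug port is requested on the kernel command line with

    earlyprintk=dbgp

or earlyprintk=dbgp,keep to keep the console active after the real console initializes.)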
config DEBUG_STACKOVERFLOW
bool "Check for stack overflows"
depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index f5631da585b..d1a47adb5ae 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -110,16 +110,16 @@ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
mcore-y := arch/x86/mach-default/
# Voyager subarch support
-mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager
+mflags-$(CONFIG_X86_VOYAGER) := -Iarch/x86/include/asm/mach-voyager
mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
# generic subarchitecture
-mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
+mflags-$(CONFIG_X86_GENERICARCH):= -Iarch/x86/include/asm/mach-generic
fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
# default subarch .h files
-mflags-y += -Iinclude/asm-x86/mach-default
+mflags-y += -Iarch/x86/include/asm/mach-default
# 64 bit does not support subarch support - clear sub arch variables
fcore-$(CONFIG_X86_64) :=
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index e372b584e91..80177ec052f 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
-cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586)
cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
@@ -45,3 +44,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
# cpu entries
cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
+# Bug fix for binutils: this option is required in order to keep
+# binutils from generating NOPL instructions against our will.
+ifneq ($(CONFIG_X86_P6_NOP),y)
+cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,)
+endif
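(Editorial aside: $(call cc-option,<flag>) expands to <flag> only if the compiler accepts it, so older toolchains whose assembler lacks -mtune=generic32 simply get no extra flag. The probe can be reproduced by hand with something like

    echo | gcc -Wa,-mtune=generic32 -x c -c -o /dev/null -

which fails on such toolchains and succeeds on newer ones.)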
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 7ee102f9c4f..cd48c721001 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
KBUILD_CFLAGS += $(call cc-option,-m32)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-$(obj)/zImage: IMAGE_OFFSET := 0x1000
$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK)
-$(obj)/bzImage: IMAGE_OFFSET := 0x100000
$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
$(obj)/bzImage: BUILDFLAGS := -b
@@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
$(call if_changed,objcopy)
$(obj)/compressed/vmlinux: FORCE
- $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
+ $(Q)$(MAKE) $(build)=$(obj)/compressed $@
# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
FDARGS =
@@ -181,6 +179,7 @@ isoimage: $(BOOTIMAGE)
mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
-no-emul-boot -boot-load-size 4 -boot-info-table \
$(obj)/isoimage
+ isohybrid $(obj)/image.iso 2>/dev/null || true
rm -rf $(obj)/isoimage
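(Editorial note: isohybrid, from the syslinux package, adds an MBR to the ISO so the same image can also be written raw to a USB disk; the '|| true' keeps the build working where the tool is absent. A hypothetical manual equivalent:

    isohybrid image.iso
    dd if=image.iso of=/dev/sdX bs=1M    # /dev/sdX is a placeholder target device
)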
zlilo: $(BOOTIMAGE)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 92fdd35bd93..1771c804e02 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE
$(call if_changed,objcopy)
-ifeq ($(CONFIG_X86_32),y)
-targets += vmlinux.bin.all vmlinux.relocs
-hostprogs-y := relocs
+targets += vmlinux.bin.all vmlinux.relocs relocs
+hostprogs-$(CONFIG_X86_32) += relocs
quiet_cmd_relocs = RELOCS $@
cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@
$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
$(call if_changed,relocbin)
+ifeq ($(CONFIG_X86_32),y)
+
ifdef CONFIG_RELOCATABLE
$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
$(call if_changed,gzip)
@@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
endif
-
$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
$(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index ba7736cf2ec..29c5fbf0839 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -137,14 +137,15 @@ relocated:
*/
movl output_len(%ebx), %eax
pushl %eax
+ # push arguments for decompress_kernel:
pushl %ebp # output address
movl input_len(%ebx), %eax
pushl %eax # input_len
leal input_data(%ebx), %eax
pushl %eax # input_data
leal boot_heap(%ebx), %eax
- pushl %eax # heap area as third argument
- pushl %esi # real mode pointer as second arg
+ pushl %eax # heap area
+ pushl %esi # real mode pointer
call decompress_kernel
addl $20, %esp
popl %ecx
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 9fea7370647..da062216948 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -16,7 +16,7 @@
*/
#undef CONFIG_PARAVIRT
#ifdef CONFIG_X86_32
-#define _ASM_DESC_H_ 1
+#define _ASM_X86_DESC_H 1
#endif
#ifdef CONFIG_X86_64
@@ -27,7 +27,7 @@
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
-#include <asm/io.h>
+#include <linux/io.h>
#include <asm/page.h>
#include <asm/boot.h>
#include <asm/bootparam.h>
@@ -251,7 +251,7 @@ static void __putstr(int error, const char *s)
y--;
}
} else {
- vidmem [(x + cols * y) * 2] = c;
+ vidmem[(x + cols * y) * 2] = c;
if (++x >= cols) {
x = 0;
if (++y >= lines) {
@@ -277,7 +277,8 @@ static void *memset(void *s, int c, unsigned n)
int i;
char *ss = s;
- for (i = 0; i < n; i++) ss[i] = c;
+ for (i = 0; i < n; i++)
+ ss[i] = c;
return s;
}
@@ -287,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n)
const char *s = src;
char *d = dest;
- for (i = 0; i < n; i++) d[i] = s[i];
+ for (i = 0; i < n; i++)
+ d[i] = s[i];
return dest;
}
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index a1310c52fc0..857e492c571 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -492,7 +492,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
continue;
}
sh_symtab = sec_symtab->symtab;
- sym_strtab = sec->link->strtab;
+ sym_strtab = sec_symtab->link->strtab;
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
Elf32_Rel *rel;
Elf32_Sym *sym;
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 75298fe2edc..6ec6bb6e995 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -59,17 +59,18 @@ int validate_cpu(void)
u32 e = err_flags[i];
for (j = 0; j < 32; j++) {
- int n = (i << 5)+j;
- if (*msg_strs < n) {
+ if (msg_strs[0] < i ||
+ (msg_strs[0] == i && msg_strs[1] < j)) {
/* Skip to the next string */
- do {
- msg_strs++;
- } while (*msg_strs);
- msg_strs++;
+ msg_strs += 2;
+ while (*msg_strs++)
+ ;
}
if (e & 1) {
- if (*msg_strs == n && msg_strs[1])
- printf("%s ", msg_strs+1);
+ if (msg_strs[0] == i &&
+ msg_strs[1] == j &&
+ msg_strs[2])
+ printf("%s ", msg_strs+2);
else
printf("%d:%d ", i, j);
}
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index d93cbc6464d..1aae8f3e5ca 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -41,6 +41,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
char *mbrbuf_ptr, *mbrbuf_end;
u32 buf_base, mbr_base;
extern char _end[];
+ u16 mbr_magic;
sector_size = ei->params.bytes_per_sector;
if (!sector_size)
@@ -58,11 +59,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
return -1;
+ memset(mbrbuf_ptr, 0, sector_size);
if (read_mbr(devno, mbrbuf_ptr))
return -1;
*mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
- return 0;
+ mbr_magic = *(u16 *)&mbrbuf_ptr[510];
+
+ /* check for valid MBR magic */
+ return mbr_magic == 0xAA55 ? 0 : -1;
}
static int get_edd_info(u8 devno, struct edd_info *ei)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index af86e431acf..b993062e9a5 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
/* to be loaded */
ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
-SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
#ifndef SVGA_MODE
#define SVGA_MODE ASK_VGA
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index bbe76953bae..8ef60f20b37 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -15,33 +15,33 @@
#include <stdio.h>
-#include "../kernel/cpu/feature_names.c"
-
-#if NCAPFLAGS > 8
-# error "Need to adjust the boot code handling of CPUID strings"
-#endif
+#include "../kernel/cpu/capflags.c"
int main(void)
{
- int i;
+ int i, j;
const char *str;
printf("static const char x86_cap_strs[] = \n");
- for (i = 0; i < NCAPINTS*32; i++) {
- str = x86_cap_flags[i];
-
- if (i == NCAPINTS*32-1) {
- /* The last entry must be unconditional; this
- also consumes the compiler-added null character */
- if (!str)
- str = "";
- printf("\t\"\\x%02x\"\"%s\"\n", i, str);
- } else if (str) {
- printf("#if REQUIRED_MASK%d & (1 << %d)\n"
- "\t\"\\x%02x\"\"%s\\0\"\n"
- "#endif\n",
- i >> 5, i & 31, i, str);
+ for (i = 0; i < NCAPINTS; i++) {
+ for (j = 0; j < 32; j++) {
+ str = x86_cap_flags[i*32+j];
+
+ if (i == NCAPINTS-1 && j == 31) {
+ /* The last entry must be unconditional; this
+ also consumes the compiler-added null
+ character */
+ if (!str)
+ str = "";
+ printf("\t\"\\x%02x\\x%02x\"\"%s\"\n",
+ i, j, str);
+ } else if (str) {
+ printf("#if REQUIRED_MASK%d & (1 << %d)\n"
+ "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n"
+ "#endif\n",
+ i, j, i, j, str);
+ }
}
}
printf("\t;\n");
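
For reference, the generated table now begins along these lines (an
illustrative excerpt; the actual entries depend on the REQUIRED_MASK* bits):

	static const char x86_cap_strs[] = 
	#if REQUIRED_MASK0 & (1 << 0)
		"\x00\x00""fpu\0"
	#endif
	...
		;
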
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 49f26aaaebc..3fa979c9c36 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -17,7 +17,7 @@
#include "boot.h"
#include "video.h"
-__videocard video_bios;
+static __videocard video_bios;
/* Set a conventional BIOS mode */
static int set_bios_mode(u8 mode);
@@ -119,7 +119,7 @@ static int bios_probe(void)
return nmodes;
}
-__videocard video_bios =
+static __videocard video_bios =
{
.card_name = "BIOS",
.probe = bios_probe,
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 401ad998ad0..75115849af3 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -20,7 +20,7 @@
static struct vesa_general_info vginfo;
static struct vesa_mode_info vminfo;
-__videocard video_vesa;
+static __videocard video_vesa;
#ifndef _WAKEUP
static void vesa_store_mode_params_graphics(void);
@@ -88,14 +88,11 @@ static int vesa_probe(void)
(vminfo.memory_layout == 4 ||
vminfo.memory_layout == 6) &&
vminfo.memory_planes == 1) {
-#ifdef CONFIG_FB
+#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
/* Graphics mode, color, linear frame buffer
supported. Only register the mode if
the framebuffer is configured, however,
- otherwise the user will be left without a screen.
- We don't require CONFIG_FB_VESA, however, since
- some of the other framebuffer drivers can use
- this mode-setting, too. */
+ otherwise the user will be left without a screen. */
mi = GET_HEAP(struct mode_info, 1);
mi->mode = mode + VIDEO_FIRST_VESA;
mi->depth = vminfo.bpp;
@@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode)
if ((vminfo.mode_attr & 0x15) == 0x05) {
/* It's a supported text mode */
is_graphic = 0;
+#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
} else if ((vminfo.mode_attr & 0x99) == 0x99) {
/* It's a graphics mode with linear frame buffer */
is_graphic = 1;
vesa_mode |= 0x4000; /* Request linear frame buffer */
+#endif
} else {
return -1; /* Invalid mode */
}
@@ -224,7 +223,7 @@ static void vesa_store_pm_info(void)
static void vesa_store_mode_params_graphics(void)
{
/* Tell the kernel we're in VESA graphics mode */
- boot_params.screen_info.orig_video_isVGA = 0x23;
+ boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
/* Mode parameters */
boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
@@ -294,7 +293,7 @@ void vesa_store_edid(void)
#endif /* not _WAKEUP */
-__videocard video_vesa =
+static __videocard video_vesa =
{
.card_name = "VESA",
.probe = vesa_probe,
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 104275e191a..13b8c86ae98 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.27-rc4
-# Mon Aug 25 15:04:00 2008
+# Linux kernel version: 2.6.27-rc5
+# Wed Sep 3 17:23:09 2008
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
@@ -202,7 +202,7 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
# CONFIG_M586 is not set
# CONFIG_M586TSC is not set
# CONFIG_M586MMX is not set
-# CONFIG_M686 is not set
+CONFIG_M686=y
# CONFIG_MPENTIUMII is not set
# CONFIG_MPENTIUMIII is not set
# CONFIG_MPENTIUMM is not set
@@ -213,7 +213,6 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
# CONFIG_MCRUSOE is not set
# CONFIG_MEFFICEON is not set
# CONFIG_MWINCHIPC6 is not set
-# CONFIG_MWINCHIP2 is not set
# CONFIG_MWINCHIP3D is not set
# CONFIG_MGEODEGX1 is not set
# CONFIG_MGEODE_LX is not set
@@ -221,13 +220,14 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
# CONFIG_MVIAC3_2 is not set
# CONFIG_MVIAC7 is not set
# CONFIG_MPSC is not set
-CONFIG_MCORE2=y
+# CONFIG_MCORE2 is not set
# CONFIG_GENERIC_CPU is not set
CONFIG_X86_GENERIC=y
CONFIG_X86_CPU=y
CONFIG_X86_CMPXCHG=y
CONFIG_X86_L1_CACHE_SHIFT=7
CONFIG_X86_XADD=y
+# CONFIG_X86_PPRO_FENCE is not set
CONFIG_X86_WP_WORKS_OK=y
CONFIG_X86_INVLPG=y
CONFIG_X86_BSWAP=y
@@ -235,14 +235,15 @@ CONFIG_X86_POPAD_OK=y
CONFIG_X86_INTEL_USERCOPY=y
CONFIG_X86_USE_PPRO_CHECKSUM=y
CONFIG_X86_TSC=y
+CONFIG_X86_CMOV=y
CONFIG_X86_MINIMUM_CPU_FAMILY=4
CONFIG_X86_DEBUGCTLMSR=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_DMI=y
# CONFIG_IOMMU_HELPER is not set
-CONFIG_NR_CPUS=4
-# CONFIG_SCHED_SMT is not set
+CONFIG_NR_CPUS=64
+CONFIG_SCHED_SMT=y
CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
@@ -254,7 +255,8 @@ CONFIG_VM86=y
# CONFIG_TOSHIBA is not set
# CONFIG_I8K is not set
CONFIG_X86_REBOOTFIXUPS=y
-# CONFIG_MICROCODE is not set
+CONFIG_MICROCODE=y
+CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
# CONFIG_NOHIGHMEM is not set
@@ -285,7 +287,6 @@ CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
CONFIG_X86_PAT=y
CONFIG_EFI=y
-# CONFIG_IRQBALANCE is not set
CONFIG_SECCOMP=y
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
@@ -1532,7 +1533,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
CONFIG_VGA_CONSOLE=y
CONFIG_VGACON_SOFT_SCROLLBACK=y
CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
-CONFIG_VIDEO_SELECT=y
CONFIG_DUMMY_CONSOLE=y
# CONFIG_FRAMEBUFFER_CONSOLE is not set
CONFIG_LOGO=y
@@ -2115,7 +2115,7 @@ CONFIG_IO_DELAY_0X80=y
CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
# CONFIG_CPA_DEBUG is not set
-# CONFIG_OPTIMIZE_INLINING is not set
+CONFIG_OPTIMIZE_INLINING=y
#
# Security options
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 678c8acefe0..f0a03d7a7d6 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.27-rc4
-# Mon Aug 25 14:40:46 2008
+# Linux kernel version: 2.6.27-rc5
+# Wed Sep 3 17:13:39 2008
#
CONFIG_64BIT=y
# CONFIG_X86_32 is not set
@@ -210,7 +210,6 @@ CONFIG_X86_PC=y
# CONFIG_MCRUSOE is not set
# CONFIG_MEFFICEON is not set
# CONFIG_MWINCHIPC6 is not set
-# CONFIG_MWINCHIP2 is not set
# CONFIG_MWINCHIP3D is not set
# CONFIG_MGEODEGX1 is not set
# CONFIG_MGEODE_LX is not set
@@ -218,17 +217,14 @@ CONFIG_X86_PC=y
# CONFIG_MVIAC3_2 is not set
# CONFIG_MVIAC7 is not set
# CONFIG_MPSC is not set
-CONFIG_MCORE2=y
-# CONFIG_GENERIC_CPU is not set
+# CONFIG_MCORE2 is not set
+CONFIG_GENERIC_CPU=y
CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=64
-CONFIG_X86_INTERNODE_CACHE_BYTES=64
+CONFIG_X86_L1_CACHE_BYTES=128
+CONFIG_X86_INTERNODE_CACHE_BYTES=128
CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=6
+CONFIG_X86_L1_CACHE_SHIFT=7
CONFIG_X86_WP_WORKS_OK=y
-CONFIG_X86_INTEL_USERCOPY=y
-CONFIG_X86_USE_PPRO_CHECKSUM=y
-CONFIG_X86_P6_NOP=y
CONFIG_X86_TSC=y
CONFIG_X86_CMPXCHG64=y
CONFIG_X86_CMOV=y
@@ -243,9 +239,8 @@ CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
CONFIG_AMD_IOMMU=y
CONFIG_SWIOTLB=y
CONFIG_IOMMU_HELPER=y
-# CONFIG_MAXSMP is not set
-CONFIG_NR_CPUS=4
-# CONFIG_SCHED_SMT is not set
+CONFIG_NR_CPUS=64
+CONFIG_SCHED_SMT=y
CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
@@ -254,7 +249,8 @@ CONFIG_X86_LOCAL_APIC=y
CONFIG_X86_IO_APIC=y
# CONFIG_X86_MCE is not set
# CONFIG_I8K is not set
-# CONFIG_MICROCODE is not set
+CONFIG_MICROCODE=y
+CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
CONFIG_NUMA=y
@@ -290,7 +286,7 @@ CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
-# CONFIG_X86_PAT is not set
+CONFIG_X86_PAT=y
CONFIG_EFI=y
CONFIG_SECCOMP=y
# CONFIG_HZ_100 is not set
@@ -1508,7 +1504,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
CONFIG_VGA_CONSOLE=y
CONFIG_VGACON_SOFT_SCROLLBACK=y
CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
-CONFIG_VIDEO_SELECT=y
CONFIG_DUMMY_CONSOLE=y
# CONFIG_FRAMEBUFFER_CONSOLE is not set
CONFIG_LOGO=y
@@ -2089,7 +2084,7 @@ CONFIG_IO_DELAY_0X80=y
CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
# CONFIG_CPA_DEBUG is not set
-# CONFIG_OPTIMIZE_INLINING is not set
+CONFIG_OPTIMIZE_INLINING=y
#
# Security options
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3874c2de540..903de4aa509 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
+
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
new file mode 100644
index 00000000000..070afc5b6c9
--- /dev/null
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -0,0 +1,197 @@
+/*
+ * Use the hardware-provided CRC32 instruction to accelerate CRC32C computation.
+ * CRC32C polynomial: 0x1EDC6F41 (BE) / 0x82F63B78 (LE)
+ * CRC32 is a new instruction in Intel SSE4.2; the reference can be found at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2A: Instruction Set Reference, A-M
+ *
+ * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com>
+ * Copyright (c) 2008 Kent Liu <kent.liu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <crypto/internal/hash.h>
+
+#include <asm/cpufeature.h>
+
+#define CHKSUM_BLOCK_SIZE 1
+#define CHKSUM_DIGEST_SIZE 4
+
+#define SCALE_F sizeof(unsigned long)
+
+#ifdef CONFIG_X86_64
+#define REX_PRE "0x48, "
+#else
+#define REX_PRE
+#endif
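+
+/*
+ * The .byte sequences below hand-encode the SSE4.2 CRC32 instruction,
+ * presumably because assemblers of this vintage may not know the
+ * mnemonic. Decoded, they are:
+ *
+ *	0xf2 0x0f 0x38 0xf0 0xf1		crc32b %cl,  %esi
+ *	0xf2 0x48 0x0f 0x38 0xf1 0xf1		crc32q %rcx, %rsi  (x86-64)
+ */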
+
+static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
+{
+ while (length--) {
+ __asm__ __volatile__(
+ ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
+ :"=S"(crc)
+ :"0"(crc), "c"(*data)
+ );
+ data++;
+ }
+
+ return crc;
+}
+
+static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
+{
+ unsigned int iquotient = len / SCALE_F;
+ unsigned int iremainder = len % SCALE_F;
+ unsigned long *ptmp = (unsigned long *)p;
+
+ while (iquotient--) {
+ __asm__ __volatile__(
+ ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
+ :"=S"(crc)
+ :"0"(crc), "c"(*ptmp)
+ );
+ ptmp++;
+ }
+
+ if (iremainder)
+ crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
+ iremainder);
+
+ return crc;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and a flexible XOR policy.
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
+ unsigned int keylen)
+{
+ u32 *mctx = crypto_ahash_ctx(hash);
+
+ if (keylen != sizeof(u32)) {
+ crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = le32_to_cpup((__le32 *)key);
+ return 0;
+}
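+
+/*
+ * A hypothetical usage sketch: to resume from a previously emitted
+ * digest "prev" (which final() has already XORed with ~0), undo that
+ * inversion before seeding:
+ *
+ *	__le32 seed = cpu_to_le32(~prev);
+ *	crypto_ahash_setkey(tfm, (u8 *)&seed, sizeof(seed));
+ */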
+
+static int crc32c_intel_init(struct ahash_request *req)
+{
+ u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+ u32 *crcp = ahash_request_ctx(req);
+
+ *crcp = *mctx;
+
+ return 0;
+}
+
+static int crc32c_intel_update(struct ahash_request *req)
+{
+ struct crypto_hash_walk walk;
+ u32 *crcp = ahash_request_ctx(req);
+ u32 crc = *crcp;
+ int nbytes;
+
+ for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
+ nbytes = crypto_hash_walk_done(&walk, 0))
+ crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
+
+ *crcp = crc;
+ return 0;
+}
+
+static int crc32c_intel_final(struct ahash_request *req)
+{
+ u32 *crcp = ahash_request_ctx(req);
+
+ *(__le32 *)req->result = ~cpu_to_le32p(crcp);
+ return 0;
+}
+
+static int crc32c_intel_digest(struct ahash_request *req)
+{
+ struct crypto_hash_walk walk;
+ u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+ u32 crc = *mctx;
+ int nbytes;
+
+ for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
+ nbytes = crypto_hash_walk_done(&walk, 0))
+ crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
+
+ *(__le32 *)req->result = ~cpu_to_le32(crc);
+ return 0;
+}
+
+static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
+{
+ u32 *key = crypto_tfm_ctx(tfm);
+
+ *key = ~0;
+
+ tfm->crt_ahash.reqsize = sizeof(u32);
+
+ return 0;
+}
+
+static struct crypto_alg alg = {
+ .cra_name = "crc32c",
+ .cra_driver_name = "crc32c-intel",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_AHASH,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_alignmask = 3,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(alg.cra_list),
+ .cra_init = crc32c_intel_cra_init,
+ .cra_type = &crypto_ahash_type,
+ .cra_u = {
+ .ahash = {
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .setkey = crc32c_intel_setkey,
+ .init = crc32c_intel_init,
+ .update = crc32c_intel_update,
+ .final = crc32c_intel_final,
+ .digest = crc32c_intel_digest,
+ }
+ }
+};
+
+
+static int __init crc32c_intel_mod_init(void)
+{
+ if (cpu_has_xmm4_2)
+ return crypto_register_alg(&alg);
+ else
+ return -ENODEV;
+}
+
+static void __exit crc32c_intel_mod_fini(void)
+{
+ crypto_unregister_alg(&alg);
+}
+
+module_init(crc32c_intel_mod_init);
+module_exit(crc32c_intel_mod_fini);
+
+MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
+MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS("crc32c");
+MODULE_ALIAS("crc32c-intel");
+
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index a0e1dbe67dc..127ec3f0721 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
dump->regs.ax = regs->ax;
dump->regs.ds = current->thread.ds;
dump->regs.es = current->thread.es;
- asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
- asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
+ savesegment(fs, fs);
+ dump->regs.fs = fs;
+ savesegment(gs, gs);
+ dump->regs.gs = gs;
dump->regs.orig_ax = regs->orig_ax;
dump->regs.ip = regs->ip;
dump->regs.cs = regs->cs;
@@ -430,8 +432,9 @@ beyond_if:
current->mm->start_stack =
(unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
/* start thread */
- asm volatile("movl %0,%%fs" :: "r" (0)); \
- asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
+ loadsegment(fs, 0);
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
load_gs_index(0);
(regs)->ip = ex.a_entry;
(regs)->sp = current->mm->start_stack;
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 20af4c79579..4bc02b23674 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -179,9 +179,10 @@ struct sigframe
u32 pretcode;
int sig;
struct sigcontext_ia32 sc;
- struct _fpstate_ia32 fpstate;
+ struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
unsigned int extramask[_COMPAT_NSIG_WORDS-1];
char retcode[8];
+ /* fp state follows here */
};
struct rt_sigframe
@@ -192,8 +193,8 @@ struct rt_sigframe
u32 puc;
compat_siginfo_t info;
struct ucontext_ia32 uc;
- struct _fpstate_ia32 fpstate;
char retcode[8];
+ /* fp state follows here */
};
#define COPY(x) { \
@@ -206,7 +207,7 @@ struct rt_sigframe
{ unsigned int cur; \
unsigned short pre; \
err |= __get_user(pre, &sc->seg); \
- asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \
+ savesegment(seg, cur); \
pre |= mask; \
if (pre != cur) loadsegment(seg, pre); }
@@ -215,7 +216,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
unsigned int *peax)
{
unsigned int tmpflags, gs, oldgs, err = 0;
- struct _fpstate_ia32 __user *buf;
+ void __user *buf;
u32 tmp;
/* Always make any pending restarted system calls return -EINTR */
@@ -235,7 +236,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
*/
err |= __get_user(gs, &sc->gs);
gs |= 3;
- asm("movl %%gs,%0" : "=r" (oldgs));
+ savesegment(gs, oldgs);
if (gs != oldgs)
load_gs_index(gs);
@@ -259,26 +260,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
err |= __get_user(tmp, &sc->fpstate);
buf = compat_ptr(tmp);
- if (buf) {
- if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
- goto badframe;
- err |= restore_i387_ia32(buf);
- } else {
- struct task_struct *me = current;
-
- if (used_math()) {
- clear_fpu(me);
- clear_used_math();
- }
- }
+ err |= restore_i387_xstate_ia32(buf);
err |= __get_user(tmp, &sc->ax);
*peax = tmp;
return err;
-
-badframe:
- return 1;
}
asmlinkage long sys32_sigreturn(struct pt_regs *regs)
@@ -350,46 +337,42 @@ badframe:
*/
static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
- struct _fpstate_ia32 __user *fpstate,
+ void __user *fpstate,
struct pt_regs *regs, unsigned int mask)
{
int tmp, err = 0;
- tmp = 0;
- __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(gs, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
- __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(fs, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
- __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(ds, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
- __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(es, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->es);
- err |= __put_user((u32)regs->di, &sc->di);
- err |= __put_user((u32)regs->si, &sc->si);
- err |= __put_user((u32)regs->bp, &sc->bp);
- err |= __put_user((u32)regs->sp, &sc->sp);
- err |= __put_user((u32)regs->bx, &sc->bx);
- err |= __put_user((u32)regs->dx, &sc->dx);
- err |= __put_user((u32)regs->cx, &sc->cx);
- err |= __put_user((u32)regs->ax, &sc->ax);
- err |= __put_user((u32)regs->cs, &sc->cs);
- err |= __put_user((u32)regs->ss, &sc->ss);
+ err |= __put_user(regs->di, &sc->di);
+ err |= __put_user(regs->si, &sc->si);
+ err |= __put_user(regs->bp, &sc->bp);
+ err |= __put_user(regs->sp, &sc->sp);
+ err |= __put_user(regs->bx, &sc->bx);
+ err |= __put_user(regs->dx, &sc->dx);
+ err |= __put_user(regs->cx, &sc->cx);
+ err |= __put_user(regs->ax, &sc->ax);
+ err |= __put_user(regs->cs, &sc->cs);
+ err |= __put_user(regs->ss, &sc->ss);
err |= __put_user(current->thread.trap_no, &sc->trapno);
err |= __put_user(current->thread.error_code, &sc->err);
- err |= __put_user((u32)regs->ip, &sc->ip);
- err |= __put_user((u32)regs->flags, &sc->flags);
- err |= __put_user((u32)regs->sp, &sc->sp_at_signal);
+ err |= __put_user(regs->ip, &sc->ip);
+ err |= __put_user(regs->flags, &sc->flags);
+ err |= __put_user(regs->sp, &sc->sp_at_signal);
- tmp = save_i387_ia32(fpstate);
+ tmp = save_i387_xstate_ia32(fpstate);
if (tmp < 0)
err = -EFAULT;
- else {
- clear_used_math();
- stts();
+ else
err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
&sc->fpstate);
- }
/* non-iBCS2 extensions.. */
err |= __put_user(mask, &sc->oldmask);
@@ -402,7 +385,8 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
* Determine which stack to use..
*/
static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
- size_t frame_size)
+ size_t frame_size,
+ void **fpstate)
{
unsigned long sp;
@@ -421,6 +405,11 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
ka->sa.sa_restorer)
sp = (unsigned long) ka->sa.sa_restorer;
+ if (used_math()) {
+ sp = sp - sig_xstate_ia32_size;
+ *fpstate = (struct _fpstate_ia32 *) sp;
+ }
+
sp -= frame_size;
/* Align the stack pointer according to the i386 ABI,
* i.e. so that on function entry ((sp + 4) & 15) == 0. */
@@ -434,6 +423,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
struct sigframe __user *frame;
void __user *restorer;
int err = 0;
+ void __user *fpstate = NULL;
/* copy_to_user optimizes that into a single 8 byte store */
static const struct {
@@ -448,25 +438,21 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
0,
};
- frame = get_sigframe(ka, regs, sizeof(*frame));
+ frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- goto give_sigsegv;
+ return -EFAULT;
- err |= __put_user(sig, &frame->sig);
- if (err)
- goto give_sigsegv;
+ if (__put_user(sig, &frame->sig))
+ return -EFAULT;
- err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs,
- set->sig[0]);
- if (err)
- goto give_sigsegv;
+ if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
+ return -EFAULT;
if (_COMPAT_NSIG_WORDS > 1) {
- err |= __copy_to_user(frame->extramask, &set->sig[1],
- sizeof(frame->extramask));
- if (err)
- goto give_sigsegv;
+ if (__copy_to_user(frame->extramask, &set->sig[1],
+ sizeof(frame->extramask)))
+ return -EFAULT;
}
if (ka->sa.sa_flags & SA_RESTORER) {
@@ -487,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
*/
err |= __copy_to_user(frame->retcode, &code, 8);
if (err)
- goto give_sigsegv;
+ return -EFAULT;
/* Set up registers for signal handler */
regs->sp = (unsigned long) frame;
@@ -498,8 +484,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
regs->dx = 0;
regs->cx = 0;
- asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
- asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
regs->cs = __USER32_CS;
regs->ss = __USER32_DS;
@@ -510,10 +496,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
#endif
return 0;
-
-give_sigsegv:
- force_sigsegv(sig, current);
- return -EFAULT;
}
int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
@@ -522,6 +504,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
struct rt_sigframe __user *frame;
void __user *restorer;
int err = 0;
+ void __user *fpstate = NULL;
/* __copy_to_user optimizes that into a single 8 byte store */
static const struct {
@@ -537,30 +520,33 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
0,
};
- frame = get_sigframe(ka, regs, sizeof(*frame));
+ frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- goto give_sigsegv;
+ return -EFAULT;
err |= __put_user(sig, &frame->sig);
err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
err |= copy_siginfo_to_user32(&frame->info, info);
if (err)
- goto give_sigsegv;
+ return -EFAULT;
/* Create the ucontext. */
- err |= __put_user(0, &frame->uc.uc_flags);
+ if (cpu_has_xsave)
+ err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ err |= __put_user(0, &frame->uc.uc_flags);
err |= __put_user(0, &frame->uc.uc_link);
err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
err |= __put_user(sas_ss_flags(regs->sp),
&frame->uc.uc_stack.ss_flags);
err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+ err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
if (err)
- goto give_sigsegv;
+ return -EFAULT;
if (ka->sa.sa_flags & SA_RESTORER)
restorer = ka->sa.sa_restorer;
@@ -575,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
*/
err |= __copy_to_user(frame->retcode, &code, 8);
if (err)
- goto give_sigsegv;
+ return -EFAULT;
/* Set up registers for signal handler */
regs->sp = (unsigned long) frame;
@@ -591,8 +577,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->dx = (unsigned long) &frame->info;
regs->cx = (unsigned long) &frame->uc;
- asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
- asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
regs->cs = __USER32_CS;
regs->ss = __USER32_DS;
@@ -603,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
#endif
return 0;
-
-give_sigsegv:
- force_sigsegv(sig, current);
- return -EFAULT;
}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ffc1bb4fed7..256b00b6189 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -39,11 +39,11 @@
.endm
/* clobbers %eax */
- .macro CLEAR_RREGS
+ .macro CLEAR_RREGS _r9=rax
xorl %eax,%eax
movq %rax,R11(%rsp)
movq %rax,R10(%rsp)
- movq %rax,R9(%rsp)
+ movq %\_r9,R9(%rsp)
movq %rax,R8(%rsp)
.endm
@@ -52,11 +52,10 @@
* We don't reload %eax because syscall_trace_enter() returned
* the value it wants us to use in the table lookup.
*/
- .macro LOAD_ARGS32 offset
- movl \offset(%rsp),%r11d
- movl \offset+8(%rsp),%r10d
+ .macro LOAD_ARGS32 offset, _r9=0
+ .if \_r9
movl \offset+16(%rsp),%r9d
- movl \offset+24(%rsp),%r8d
+ .endif
movl \offset+40(%rsp),%ecx
movl \offset+48(%rsp),%edx
movl \offset+56(%rsp),%esi
@@ -145,7 +144,7 @@ ENTRY(ia32_sysenter_target)
SAVE_ARGS 0,0,1
/* no need to do an access_ok check here because rbp has been
32bit zero extended */
-1: movl (%rbp),%r9d
+1: movl (%rbp),%ebp
.section __ex_table,"a"
.quad 1b,ia32_badarg
.previous
@@ -157,7 +156,7 @@ ENTRY(ia32_sysenter_target)
cmpl $(IA32_NR_syscalls-1),%eax
ja ia32_badsys
sysenter_do_call:
- IA32_ARG_FIXUP 1
+ IA32_ARG_FIXUP
sysenter_dispatch:
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
@@ -234,20 +233,17 @@ sysexit_audit:
#endif
sysenter_tracesys:
- xchgl %r9d,%ebp
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
jz sysenter_auditsys
#endif
SAVE_REST
CLEAR_RREGS
- movq %r9,R9(%rsp)
movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
- xchgl %ebp,%r9d
cmpl $(IA32_NR_syscalls-1),%eax
ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
jmp sysenter_do_call
@@ -314,9 +310,9 @@ ENTRY(ia32_cstar_target)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
CFI_REMEMBER_STATE
jnz cstar_tracesys
-cstar_do_call:
cmpl $IA32_NR_syscalls-1,%eax
ja ia32_badsys
+cstar_do_call:
IA32_ARG_FIXUP 1
cstar_dispatch:
call *ia32_sys_call_table(,%rax,8)
@@ -357,15 +353,13 @@ cstar_tracesys:
#endif
xchgl %r9d,%ebp
SAVE_REST
- CLEAR_RREGS
- movq %r9,R9(%rsp)
+ CLEAR_RREGS r9
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
+ LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
RESTORE_REST
xchgl %ebp,%r9d
- movl RSP-ARGOFFSET(%rsp), %r8d
cmpl $(IA32_NR_syscalls-1),%eax
ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
jmp cstar_do_call
@@ -577,8 +571,8 @@ ia32_sys_call_table:
.quad compat_sys_setrlimit /* 75 */
.quad compat_sys_old_getrlimit /* old_getrlimit */
.quad compat_sys_getrusage
- .quad sys32_gettimeofday
- .quad sys32_settimeofday
+ .quad compat_sys_gettimeofday
+ .quad compat_sys_settimeofday
.quad sys_getgroups16 /* 80 */
.quad sys_setgroups16
.quad sys32_old_select
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index d3c64088b98..2e09dcd3c0a 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -49,41 +49,6 @@
#define AA(__x) ((unsigned long)(__x))
-int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
-{
- compat_ino_t ino;
-
- typeof(ubuf->st_uid) uid = 0;
- typeof(ubuf->st_gid) gid = 0;
- SET_UID(uid, kbuf->uid);
- SET_GID(gid, kbuf->gid);
- if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev))
- return -EOVERFLOW;
- if (kbuf->size >= 0x7fffffff)
- return -EOVERFLOW;
- ino = kbuf->ino;
- if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
- return -EOVERFLOW;
- if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
- __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
- __put_user(ino, &ubuf->st_ino) ||
- __put_user(kbuf->mode, &ubuf->st_mode) ||
- __put_user(kbuf->nlink, &ubuf->st_nlink) ||
- __put_user(uid, &ubuf->st_uid) ||
- __put_user(gid, &ubuf->st_gid) ||
- __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
- __put_user(kbuf->size, &ubuf->st_size) ||
- __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) ||
- __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
- __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
- __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
- __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
- __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
- __put_user(kbuf->blksize, &ubuf->st_blksize) ||
- __put_user(kbuf->blocks, &ubuf->st_blocks))
- return -EFAULT;
- return 0;
-}
asmlinkage long sys32_truncate64(char __user *filename,
unsigned long offset_low,
@@ -402,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
return 0;
}
-static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i)
-{
- int err = -EFAULT;
-
- if (access_ok(VERIFY_READ, i, sizeof(*i))) {
- err = __get_user(o->tv_sec, &i->tv_sec);
- err |= __get_user(o->tv_usec, &i->tv_usec);
- }
- return err;
-}
-
-static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
-{
- int err = -EFAULT;
-
- if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
- err = __put_user(i->tv_sec, &o->tv_sec);
- err |= __put_user(i->tv_usec, &o->tv_usec);
- }
- return err;
-}
-
asmlinkage long sys32_alarm(unsigned int seconds)
{
return alarm_setitimer(seconds);
}
-/*
- * Translations due to time_t size differences. Which affects all
- * sorts of things, like timeval and itimerval.
- */
-asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
-{
- if (tv) {
- struct timeval ktv;
-
- do_gettimeofday(&ktv);
- if (put_tv32(tv, &ktv))
- return -EFAULT;
- }
- if (tz) {
- if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
- return -EFAULT;
- }
- return 0;
-}
-
-asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
-{
- struct timeval ktv;
- struct timespec kts;
- struct timezone ktz;
-
- if (tv) {
- if (get_tv32(&ktv, tv))
- return -EFAULT;
- kts.tv_sec = ktv.tv_sec;
- kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC;
- }
- if (tz) {
- if (copy_from_user(&ktz, tz, sizeof(ktz)))
- return -EFAULT;
- }
-
- return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
-}
-
struct sel_arg_struct {
unsigned int n;
unsigned int inp;
@@ -556,15 +457,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
return ret;
}
-/* These are here just in case some old ia32 binary calls it. */
-asmlinkage long sys32_pause(void)
-{
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- return -ERESTARTNOHAND;
-}
-
-
#ifdef CONFIG_SYSCTL_SYSCALL
struct sysctl_ia32 {
unsigned int name;
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
new file mode 100644
index 00000000000..4a8e80cdcfa
--- /dev/null
+++ b/arch/x86/include/asm/Kbuild
@@ -0,0 +1,24 @@
+include include/asm-generic/Kbuild.asm
+
+header-y += boot.h
+header-y += bootparam.h
+header-y += debugreg.h
+header-y += ldt.h
+header-y += msr-index.h
+header-y += prctl.h
+header-y += ptrace-abi.h
+header-y += sigcontext32.h
+header-y += ucontext.h
+header-y += processor-flags.h
+
+unifdef-y += e820.h
+unifdef-y += ist.h
+unifdef-y += mce.h
+unifdef-y += msr.h
+unifdef-y += mtrr.h
+unifdef-y += posix_types_32.h
+unifdef-y += posix_types_64.h
+unifdef-y += unistd_32.h
+unifdef-y += unistd_64.h
+unifdef-y += vm86.h
+unifdef-y += vsyscall.h
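+
+# header-y headers are exported to userspace verbatim; unifdef-y headers
+# are run through unifdef first so that __KERNEL__-only sections are
+# stripped from the exported copy.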
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
new file mode 100644
index 00000000000..37822206083
--- /dev/null
+++ b/arch/x86/include/asm/a.out-core.h
@@ -0,0 +1,73 @@
+/* a.out coredump register dumper
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _ASM_X86_A_OUT_CORE_H
+#define _ASM_X86_A_OUT_CORE_H
+
+#ifdef __KERNEL__
+#ifdef CONFIG_X86_32
+
+#include <linux/user.h>
+#include <linux/elfcore.h>
+
+/*
+ * fill in the user structure for an a.out core dump
+ */
+static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
+{
+ u16 gs;
+
+/* changed the size calculations - should hopefully work better. lbt */
+ dump->magic = CMAGIC;
+ dump->start_code = 0;
+ dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
+ dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT;
+ dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1)))
+ >> PAGE_SHIFT;
+ dump->u_dsize -= dump->u_tsize;
+ dump->u_ssize = 0;
+ dump->u_debugreg[0] = current->thread.debugreg0;
+ dump->u_debugreg[1] = current->thread.debugreg1;
+ dump->u_debugreg[2] = current->thread.debugreg2;
+ dump->u_debugreg[3] = current->thread.debugreg3;
+ dump->u_debugreg[4] = 0;
+ dump->u_debugreg[5] = 0;
+ dump->u_debugreg[6] = current->thread.debugreg6;
+ dump->u_debugreg[7] = current->thread.debugreg7;
+
+ if (dump->start_stack < TASK_SIZE)
+ dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
+ >> PAGE_SHIFT;
+
+ dump->regs.bx = regs->bx;
+ dump->regs.cx = regs->cx;
+ dump->regs.dx = regs->dx;
+ dump->regs.si = regs->si;
+ dump->regs.di = regs->di;
+ dump->regs.bp = regs->bp;
+ dump->regs.ax = regs->ax;
+ dump->regs.ds = (u16)regs->ds;
+ dump->regs.es = (u16)regs->es;
+ dump->regs.fs = (u16)regs->fs;
+ savesegment(gs, gs);
+ dump->regs.orig_ax = regs->orig_ax;
+ dump->regs.ip = regs->ip;
+ dump->regs.cs = (u16)regs->cs;
+ dump->regs.flags = regs->flags;
+ dump->regs.sp = regs->sp;
+ dump->regs.ss = (u16)regs->ss;
+
+ dump->u_fpvalid = dump_fpu(regs, &dump->i387);
+}
+
+#endif /* CONFIG_X86_32 */
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_A_OUT_CORE_H */
diff --git a/arch/x86/include/asm/a.out.h b/arch/x86/include/asm/a.out.h
new file mode 100644
index 00000000000..4684f97a5bb
--- /dev/null
+++ b/arch/x86/include/asm/a.out.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_A_OUT_H
+#define _ASM_X86_A_OUT_H
+
+struct exec
+{
+ unsigned int a_info; /* Use macros N_MAGIC, etc for access */
+ unsigned a_text; /* length of text, in bytes */
+ unsigned a_data; /* length of data, in bytes */
+ unsigned a_bss; /* length of uninitialized data area for file, in bytes */
+ unsigned a_syms; /* length of symbol table data in file, in bytes */
+ unsigned a_entry; /* start address */
+ unsigned a_trsize; /* length of relocation info for text, in bytes */
+ unsigned a_drsize; /* length of relocation info for data, in bytes */
+};
+
+#define N_TRSIZE(a) ((a).a_trsize)
+#define N_DRSIZE(a) ((a).a_drsize)
+#define N_SYMSIZE(a) ((a).a_syms)
+
+#endif /* _ASM_X86_A_OUT_H */
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
new file mode 100644
index 00000000000..8d676d8ecde
--- /dev/null
+++ b/arch/x86/include/asm/acpi.h
@@ -0,0 +1,178 @@
+#ifndef _ASM_X86_ACPI_H
+#define _ASM_X86_ACPI_H
+
+/*
+ * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <acpi/pdc_intel.h>
+
+#include <asm/numa.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/mpspec.h>
+
+#define COMPILER_DEPENDENT_INT64 long long
+#define COMPILER_DEPENDENT_UINT64 unsigned long long
+
+/*
+ * Calling conventions:
+ *
+ * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads)
+ * ACPI_EXTERNAL_XFACE - External ACPI interfaces
+ * ACPI_INTERNAL_XFACE - Internal ACPI interfaces
+ * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces
+ */
+#define ACPI_SYSTEM_XFACE
+#define ACPI_EXTERNAL_XFACE
+#define ACPI_INTERNAL_XFACE
+#define ACPI_INTERNAL_VAR_XFACE
+
+/* Asm macros */
+
+#define ACPI_ASM_MACROS
+#define BREAKPOINT3
+#define ACPI_DISABLE_IRQS() local_irq_disable()
+#define ACPI_ENABLE_IRQS() local_irq_enable()
+#define ACPI_FLUSH_CPU_CACHE() wbinvd()
+
+int __acpi_acquire_global_lock(unsigned int *lock);
+int __acpi_release_global_lock(unsigned int *lock);
+
+#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
+ ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
+
+#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
+ ((Acq) = __acpi_release_global_lock(&facs->global_lock))
+
+/*
+ * Math helper asm macros
+ */
+#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
+ asm("divl %2;" \
+ : "=a"(q32), "=d"(r32) \
+ : "r"(d32), \
+ "0"(n_lo), "1"(n_hi))
+
+
+#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
+ asm("shrl $1,%2 ;" \
+ "rcrl $1,%3;" \
+ : "=r"(n_hi), "=r"(n_lo) \
+ : "0"(n_hi), "1"(n_lo))
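+
+/*
+ * Worked example: dividing 0x00000001_00000000 by 2 loads n_hi:n_lo
+ * into %edx:%eax, so divl yields q32 = 0x80000000 and r32 = 0. The
+ * quotient must fit in 32 bits, otherwise divl raises #DE.
+ */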
+
+#ifdef CONFIG_ACPI
+extern int acpi_lapic;
+extern int acpi_ioapic;
+extern int acpi_noirq;
+extern int acpi_strict;
+extern int acpi_disabled;
+extern int acpi_ht;
+extern int acpi_pci_disabled;
+extern int acpi_skip_timer_override;
+extern int acpi_use_timer_override;
+
+extern u8 acpi_sci_flags;
+extern int acpi_sci_override_gsi;
+void acpi_pic_sci_set_trigger(unsigned int, u16);
+
+static inline void disable_acpi(void)
+{
+ acpi_disabled = 1;
+ acpi_ht = 0;
+ acpi_pci_disabled = 1;
+ acpi_noirq = 1;
+}
+
+/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
+#define FIX_ACPI_PAGES 4
+
+extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
+
+static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
+static inline void acpi_disable_pci(void)
+{
+ acpi_pci_disabled = 1;
+ acpi_noirq_set();
+}
+extern int acpi_irq_balance_set(char *str);
+
+/* routines for saving/restoring kernel state */
+extern int acpi_save_state_mem(void);
+extern void acpi_restore_state_mem(void);
+
+extern unsigned long acpi_wakeup_address;
+
+/* early initialization routine */
+extern void acpi_reserve_bootmem(void);
+
+/*
+ * Check if the CPU can handle C2 and deeper
+ */
+static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
+{
+ /*
+ * Early models (<=5) of AMD Opterons are not supposed to go into
+ * C2 state.
+ *
+ * Steppings 0x0A and later are good
+ */
+ if (boot_cpu_data.x86 == 0x0F &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86_model <= 0x05 &&
+ boot_cpu_data.x86_mask < 0x0A)
+ return 1;
+ else if (boot_cpu_has(X86_FEATURE_AMDC1E))
+ return 1;
+ else
+ return max_cstate;
+}
+
+#else /* !CONFIG_ACPI */
+
+#define acpi_lapic 0
+#define acpi_ioapic 0
+static inline void acpi_noirq_set(void) { }
+static inline void acpi_disable_pci(void) { }
+static inline void disable_acpi(void) { }
+
+#endif /* !CONFIG_ACPI */
+
+#define ARCH_HAS_POWER_INIT 1
+
+struct bootnode;
+
+#ifdef CONFIG_ACPI_NUMA
+extern int acpi_numa;
+extern int acpi_scan_nodes(unsigned long start, unsigned long end);
+#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
+ int num_nodes);
+#else
+static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
+ int num_nodes)
+{
+}
+#endif
+
+#define acpi_unlazy_tlb(x) leave_mm(x)
+
+#endif /* _ASM_X86_ACPI_H */
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
new file mode 100644
index 00000000000..9825cd64c9b
--- /dev/null
+++ b/arch/x86/include/asm/agp.h
@@ -0,0 +1,35 @@
+#ifndef _ASM_X86_AGP_H
+#define _ASM_X86_AGP_H
+
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Functions to keep the agpgart mappings coherent with the MMU. The
+ * GART gives the CPU a physical alias of pages in memory. The alias
+ * region is mapped uncacheable. Make sure there are no conflicting
+ * mappings with different cacheability attributes for the same
+ * page. This avoids data corruption on some CPUs.
+ */
+
+#define map_page_into_agp(page) set_pages_uc(page, 1)
+#define unmap_page_from_agp(page) set_pages_wb(page, 1)
+
+/*
+ * Could use CLFLUSH here if the cpu supports it. But then it would
+ * need to be called for each cacheline of the whole page so it may
+ * not be worth it. Would need a page for it.
+ */
+#define flush_agp_cache() wbinvd()
+
+/* Convert a physical address to an address suitable for the GART. */
+#define phys_to_gart(x) (x)
+#define gart_to_phys(x) (x)
+
+/* GATT allocation. Returns/accepts GATT kernel virtual address. */
+#define alloc_gatt_pages(order) \
+ ((char *)__get_free_pages(GFP_KERNEL, (order)))
+#define free_gatt_pages(table, order) \
+ free_pages((unsigned long)(table), (order))
+
+#endif /* _ASM_X86_AGP_H */
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
new file mode 100644
index 00000000000..e2077d343c3
--- /dev/null
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -0,0 +1,22 @@
+#ifdef __ASSEMBLY__
+
+#ifdef CONFIG_X86_32
+# define X86_ALIGN .long
+#else
+# define X86_ALIGN .quad
+#endif
+
+#ifdef CONFIG_SMP
+ .macro LOCK_PREFIX
+1: lock
+ .section .smp_locks,"a"
+ .align 4
+ X86_ALIGN 1b
+ .previous
+ .endm
+#else
+ .macro LOCK_PREFIX
+ .endm
+#endif
+
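+/*
+ * Illustrative use from assembly (hypothetical call site):
+ *
+ *	LOCK_PREFIX
+ *	cmpxchgl %edx, (%ecx)
+ *
+ * On SMP builds this emits the lock prefix and records its address in
+ * the .smp_locks section so it can be patched away when running on a
+ * single CPU; on UP builds the macro expands to nothing.
+ */
+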
+#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
new file mode 100644
index 00000000000..f6aa18eadf7
--- /dev/null
+++ b/arch/x86/include/asm/alternative.h
@@ -0,0 +1,183 @@
+#ifndef _ASM_X86_ALTERNATIVE_H
+#define _ASM_X86_ALTERNATIVE_H
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <asm/asm.h>
+
+/*
+ * Alternative inline assembly for SMP.
+ *
+ * The LOCK_PREFIX macro defined here replaces the LOCK and
+ * LOCK_PREFIX macros used everywhere in the source tree.
+ *
+ * SMP alternatives use the same data structures as the other
+ * alternatives and the X86_FEATURE_UP flag to indicate the case of a
+ * UP system running an SMP kernel. The existing apply_alternatives()
+ * works fine for patching an SMP kernel for UP.
+ *
+ * The SMP alternative tables can be kept after boot and contain both
+ * UP and SMP versions of the instructions to allow switching back to
+ * SMP at runtime, when hotplugging in a new CPU, which is especially
+ * useful in virtualized environments.
+ *
+ * The very common lock prefix is handled as a special case in a
+ * separate table which is a pure address list without replacement ptr
+ * and size information. That keeps the table sizes small.
+ */
+
+#ifdef CONFIG_SMP
+#define LOCK_PREFIX \
+ ".section .smp_locks,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR "661f\n" /* address */ \
+ ".previous\n" \
+ "661:\n\tlock; "
+
+#else /* ! CONFIG_SMP */
+#define LOCK_PREFIX ""
+#endif
+
+/* This must be included *after* the definition of LOCK_PREFIX */
+#include <asm/cpufeature.h>
+
+struct alt_instr {
+ u8 *instr; /* original instruction */
+ u8 *replacement;
+ u8 cpuid; /* cpuid bit set for replacement */
+ u8 instrlen; /* length of original instruction */
+ u8 replacementlen; /* length of new instruction, <= instrlen */
+ u8 pad1;
+#ifdef CONFIG_X86_64
+ u32 pad2;
+#endif
+};
+
+extern void alternative_instructions(void);
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+
+struct module;
+
+#ifdef CONFIG_SMP
+extern void alternatives_smp_module_add(struct module *mod, char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end);
+extern void alternatives_smp_module_del(struct module *mod);
+extern void alternatives_smp_switch(int smp);
+#else
+static inline void alternatives_smp_module_add(struct module *mod, char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end) {}
+static inline void alternatives_smp_module_del(struct module *mod) {}
+static inline void alternatives_smp_switch(int smp) {}
+#endif /* CONFIG_SMP */
+
+const unsigned char *const *find_nop_table(void);
+
+/*
+ * Alternative instructions for different CPU types or capabilities.
+ *
+ * This makes it possible to use optimized instructions even on
+ * generic binary kernels.
+ *
+ * The length of oldinstr must be greater than or equal to the length
+ * of newinstr; the patched site is padded with nops as needed.
+ *
+ * For non-barrier-like inlines please define new variants
+ * without volatile and the memory clobber.
+ */
+#define alternative(oldinstr, newinstr, feature) \
+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
+ ".section .altinstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR "661b\n" /* label */ \
+ _ASM_PTR "663f\n" /* new instruction */ \
+ " .byte %c0\n" /* feature bit */ \
+ " .byte 662b-661b\n" /* sourcelen */ \
+ " .byte 664f-663f\n" /* replacementlen */ \
+ ".previous\n" \
+ ".section .altinstr_replacement,\"ax\"\n" \
+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
+ ".previous" :: "i" (feature) : "memory")
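+
+/*
+ * Illustrative use, modelled on the 32-bit mb() barrier: run a locked
+ * add on CPUs without SSE2, but patch in the shorter MFENCE where it
+ * is available:
+ *
+ *	alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2);
+ */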
+
+/*
+ * Alternative inline assembly with input.
+ *
+ * Peculiarities:
+ * No memory clobber here.
+ * Argument numbers start with 1.
+ * It is best to use constraints that are fixed size (like (%1) ... "r").
+ * If you use variable-sized constraints like "m" or "g" in the
+ * replacement, make sure to pad to the worst-case length.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input...) \
+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
+ ".section .altinstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR "661b\n" /* label */ \
+ _ASM_PTR "663f\n" /* new instruction */ \
+ " .byte %c0\n" /* feature bit */ \
+ " .byte 662b-661b\n" /* sourcelen */ \
+ " .byte 664f-663f\n" /* replacementlen */ \
+ ".previous\n" \
+ ".section .altinstr_replacement,\"ax\"\n" \
+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
+ ".previous" :: "i" (feature), ##input)
+
+/* Like alternative_input, but with a single output argument */
+#define alternative_io(oldinstr, newinstr, feature, output, input...) \
+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
+ ".section .altinstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR "661b\n" /* label */ \
+ _ASM_PTR "663f\n" /* new instruction */ \
+ " .byte %c[feat]\n" /* feature bit */ \
+ " .byte 662b-661b\n" /* sourcelen */ \
+ " .byte 664f-663f\n" /* replacementlen */ \
+ ".previous\n" \
+ ".section .altinstr_replacement,\"ax\"\n" \
+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
+ ".previous" : output : [feat] "i" (feature), ##input)
+
+/*
+ * Use this macro if you need more than one output parameter
+ * in alternative_io.
+ */
+#define ASM_OUTPUT2(a, b) a, b
+
+struct paravirt_patch_site;
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end);
+#else
+static inline void apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end)
+{}
+#define __parainstructions NULL
+#define __parainstructions_end NULL
+#endif
+
+extern void add_nops(void *insns, unsigned int len);
+
+/*
+ * Clear and restore the kernel write-protection flag on the local CPU.
+ * Allows the kernel to edit read-only pages.
+ * Side-effect: any interrupt handler running between save and restore will have
+ * the ability to write to read-only pages.
+ *
+ * Warning:
+ * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and
+ * no thread can be preempted in the instructions being modified (no iret to an
+ * invalid instruction possible) or if the instructions are changed from a
+ * consistent state to another consistent state atomically.
+ * More care must be taken when modifying code in the SMP case because of
+ * Intel's errata.
+ * On the local CPU you need to be protected against NMI or MCE handlers
+ * seeing an inconsistent instruction while you patch.
+ * The _early version expects the memory to already be RW.
+ */
+
+extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void *text_poke_early(void *addr, const void *opcode, size_t len);
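+
+/*
+ * Hypothetical usage sketch: patch a site to a 5-byte near jump
+ * (0xe9 plus a rel32 displacement of zero), with the caller providing
+ * whatever serialization against concurrent patching is required:
+ *
+ *	unsigned char jmp[5] = { 0xe9, 0x00, 0x00, 0x00, 0x00 };
+ *	text_poke(site_addr, jmp, sizeof(jmp));
+ */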
+
+#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
new file mode 100644
index 00000000000..f712344329b
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ * Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_H
+#define _ASM_X86_AMD_IOMMU_H
+
+#include <linux/irqreturn.h>
+
+#ifdef CONFIG_AMD_IOMMU
+extern int amd_iommu_init(void);
+extern int amd_iommu_init_dma_ops(void);
+extern void amd_iommu_detect(void);
+extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+#else
+static inline int amd_iommu_init(void) { return -ENODEV; }
+static inline void amd_iommu_detect(void) { }
+#endif
+
+#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
new file mode 100644
index 00000000000..1a30c0440c6
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ * Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
+#define _ASM_X86_AMD_IOMMU_TYPES_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+/*
+ * some size calculation constants
+ */
+#define DEV_TABLE_ENTRY_SIZE 32
+#define ALIAS_TABLE_ENTRY_SIZE 2
+#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
+
+/* Length of the MMIO region for the AMD IOMMU */
+#define MMIO_REGION_LENGTH 0x4000
+
+/* Capability offsets used by the driver */
+#define MMIO_CAP_HDR_OFFSET 0x00
+#define MMIO_RANGE_OFFSET 0x0c
+#define MMIO_MISC_OFFSET 0x10
+
+/* Masks, shifts and macros to parse the device range capability */
+#define MMIO_RANGE_LD_MASK 0xff000000
+#define MMIO_RANGE_FD_MASK 0x00ff0000
+#define MMIO_RANGE_BUS_MASK 0x0000ff00
+#define MMIO_RANGE_LD_SHIFT 24
+#define MMIO_RANGE_FD_SHIFT 16
+#define MMIO_RANGE_BUS_SHIFT 8
+#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
+#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
+#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
+#define MMIO_MSI_NUM(x) ((x) & 0x1f)
+
+/* Flag masks for the AMD IOMMU exclusion range */
+#define MMIO_EXCL_ENABLE_MASK 0x01ULL
+#define MMIO_EXCL_ALLOW_MASK 0x02ULL
+
+/* Used offsets into the MMIO space */
+#define MMIO_DEV_TABLE_OFFSET 0x0000
+#define MMIO_CMD_BUF_OFFSET 0x0008
+#define MMIO_EVT_BUF_OFFSET 0x0010
+#define MMIO_CONTROL_OFFSET 0x0018
+#define MMIO_EXCL_BASE_OFFSET 0x0020
+#define MMIO_EXCL_LIMIT_OFFSET 0x0028
+#define MMIO_CMD_HEAD_OFFSET 0x2000
+#define MMIO_CMD_TAIL_OFFSET 0x2008
+#define MMIO_EVT_HEAD_OFFSET 0x2010
+#define MMIO_EVT_TAIL_OFFSET 0x2018
+#define MMIO_STATUS_OFFSET 0x2020
+
+/* MMIO status bits */
+#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
+
+/* event logging constants */
+#define EVENT_ENTRY_SIZE 0x10
+#define EVENT_TYPE_SHIFT 28
+#define EVENT_TYPE_MASK 0xf
+#define EVENT_TYPE_ILL_DEV 0x1
+#define EVENT_TYPE_IO_FAULT 0x2
+#define EVENT_TYPE_DEV_TAB_ERR 0x3
+#define EVENT_TYPE_PAGE_TAB_ERR 0x4
+#define EVENT_TYPE_ILL_CMD 0x5
+#define EVENT_TYPE_CMD_HARD_ERR 0x6
+#define EVENT_TYPE_IOTLB_INV_TO 0x7
+#define EVENT_TYPE_INV_DEV_REQ 0x8
+#define EVENT_DEVID_MASK 0xffff
+#define EVENT_DEVID_SHIFT 0
+#define EVENT_DOMID_MASK 0xffff
+#define EVENT_DOMID_SHIFT 0
+#define EVENT_FLAGS_MASK 0xfff
+#define EVENT_FLAGS_SHIFT 0x10
+
+/* feature control bits */
+#define CONTROL_IOMMU_EN 0x00ULL
+#define CONTROL_HT_TUN_EN 0x01ULL
+#define CONTROL_EVT_LOG_EN 0x02ULL
+#define CONTROL_EVT_INT_EN 0x03ULL
+#define CONTROL_COMWAIT_EN 0x04ULL
+#define CONTROL_PASSPW_EN 0x08ULL
+#define CONTROL_RESPASSPW_EN 0x09ULL
+#define CONTROL_COHERENT_EN 0x0aULL
+#define CONTROL_ISOC_EN 0x0bULL
+#define CONTROL_CMDBUF_EN 0x0cULL
+#define CONTROL_PPFLOG_EN 0x0dULL
+#define CONTROL_PPFINT_EN 0x0eULL
+
+/* command specific defines */
+#define CMD_COMPL_WAIT 0x01
+#define CMD_INV_DEV_ENTRY 0x02
+#define CMD_INV_IOMMU_PAGES 0x03
+
+#define CMD_COMPL_WAIT_STORE_MASK 0x01
+#define CMD_COMPL_WAIT_INT_MASK 0x02
+#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
+#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
+
+#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
+
+/* macros and definitions for device table entries */
+#define DEV_ENTRY_VALID 0x00
+#define DEV_ENTRY_TRANSLATION 0x01
+#define DEV_ENTRY_IR 0x3d
+#define DEV_ENTRY_IW 0x3e
+#define DEV_ENTRY_NO_PAGE_FAULT 0x62
+#define DEV_ENTRY_EX 0x67
+#define DEV_ENTRY_SYSMGT1 0x68
+#define DEV_ENTRY_SYSMGT2 0x69
+#define DEV_ENTRY_INIT_PASS 0xb8
+#define DEV_ENTRY_EINT_PASS 0xb9
+#define DEV_ENTRY_NMI_PASS 0xba
+#define DEV_ENTRY_LINT0_PASS 0xbe
+#define DEV_ENTRY_LINT1_PASS 0xbf
+#define DEV_ENTRY_MODE_MASK 0x07
+#define DEV_ENTRY_MODE_SHIFT 0x09
+
+/* constants to configure the command buffer */
+#define CMD_BUFFER_SIZE 8192
+#define CMD_BUFFER_ENTRIES 512
+#define MMIO_CMD_SIZE_SHIFT 56
+#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
+
+/* constants for event buffer handling */
+#define EVT_BUFFER_SIZE 8192 /* 512 entries */
+#define EVT_LEN_MASK (0x9ULL << 56)
+
+#define PAGE_MODE_1_LEVEL 0x01
+#define PAGE_MODE_2_LEVEL 0x02
+#define PAGE_MODE_3_LEVEL 0x03
+
+#define IOMMU_PDE_NL_0 0x000ULL
+#define IOMMU_PDE_NL_1 0x200ULL
+#define IOMMU_PDE_NL_2 0x400ULL
+#define IOMMU_PDE_NL_3 0x600ULL
+
+#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL)
+#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL)
+#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL)
+
+#define IOMMU_MAP_SIZE_L1 (1ULL << 21)
+#define IOMMU_MAP_SIZE_L2 (1ULL << 30)
+#define IOMMU_MAP_SIZE_L3 (1ULL << 39)
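+
+/*
+ * Worked example: with 4 KiB pages and 9 bits of index per level, the
+ * DMA address 0x40201000 decomposes as
+ *
+ *	IOMMU_PTE_L2_INDEX = (0x40201000 >> 30) & 0x1ff = 1
+ *	IOMMU_PTE_L1_INDEX = (0x40201000 >> 21) & 0x1ff = 1
+ *	IOMMU_PTE_L0_INDEX = (0x40201000 >> 12) & 0x1ff = 1
+ *
+ * i.e. entry 1 at each of the three page-table levels.
+ */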
+
+#define IOMMU_PTE_P (1ULL << 0)
+#define IOMMU_PTE_TV (1ULL << 1)
+#define IOMMU_PTE_U (1ULL << 59)
+#define IOMMU_PTE_FC (1ULL << 60)
+#define IOMMU_PTE_IR (1ULL << 61)
+#define IOMMU_PTE_IW (1ULL << 62)
+
+#define IOMMU_L1_PDE(address) \
+ ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
+#define IOMMU_L2_PDE(address) \
+ ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
+
+#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
+#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
+#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
+#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
+
+#define IOMMU_PROT_MASK 0x03
+#define IOMMU_PROT_IR 0x01
+#define IOMMU_PROT_IW 0x02
+
+/* IOMMU capabilities */
+#define IOMMU_CAP_IOTLB 24
+#define IOMMU_CAP_NPCACHE 26
+
+#define MAX_DOMAIN_ID 65536
+
+/* FIXME: move this macro to <linux/pci.h> */
+#define PCI_BUS(x) (((x) >> 8) & 0xff)
+
+/*
+ * This structure contains generic data for IOMMU protection domains
+ * independent of their use.
+ */
+struct protection_domain {
+	spinlock_t lock;	/* mostly used to lock the page table */
+ u16 id; /* the domain id written to the device table */
+ int mode; /* paging mode (0-6 levels) */
+ u64 *pt_root; /* page table root pointer */
+ void *priv; /* private data */
+};
+
+/*
+ * Data container for a dma_ops specific protection domain
+ */
+struct dma_ops_domain {
+ struct list_head list;
+
+ /* generic protection domain information */
+ struct protection_domain domain;
+
+ /* size of the aperture for the mappings */
+ unsigned long aperture_size;
+
+	/* address at which we start searching for free addresses */
+ unsigned long next_bit;
+
+ /* address allocation bitmap */
+ unsigned long *bitmap;
+
+ /*
+ * Array of PTE pages for the aperture. In this array we save all the
+ * leaf pages of the domain page table used for the aperture. This way
+ * we don't need to walk the page table to find a specific PTE. We can
+ * just calculate its address in constant time.
+ */
+ u64 **pte_pages;
+
+	/* This will be set to true when the TLB needs to be flushed */
+ bool need_flush;
+
+ /*
+ * if this is a preallocated domain, keep the device for which it was
+ * preallocated in this variable
+ */
+ u16 target_dev;
+};
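+
+/*
+ * Illustrative sketch, not part of this patch, of the constant-time
+ * PTE lookup that the pte_pages comment above describes. It assumes
+ * the aperture starts at IOVA 0 and returns NULL when the leaf page
+ * has not been allocated yet.
+ */
+static inline u64 *example_aperture_pte(struct dma_ops_domain *dom,
+					unsigned long address)
+{
+	u64 *pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+
+	return pte ? pte + IOMMU_PTE_L0_INDEX(address) : NULL;
+}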
+
+/*
+ * Structure where we save information about one hardware AMD IOMMU in the
+ * system.
+ */
+struct amd_iommu {
+ struct list_head list;
+
+ /* locks the accesses to the hardware */
+ spinlock_t lock;
+
+ /* Pointer to PCI device of this IOMMU */
+ struct pci_dev *dev;
+
+ /*
+	 * Capability pointer. There can be more than one IOMMU per PCI
+	 * device function if the function exposes more than one AMD IOMMU
+	 * capability pointer.
+ */
+ u16 cap_ptr;
+
+ /* physical address of MMIO space */
+ u64 mmio_phys;
+ /* virtual address of MMIO space */
+ u8 *mmio_base;
+
+ /* capabilities of that IOMMU read from ACPI */
+ u32 cap;
+
+ /* pci domain of this IOMMU */
+ u16 pci_seg;
+
+	/* first device this IOMMU handles. Read from PCI */
+	u16 first_device;
+	/* last device this IOMMU handles. Read from PCI */
+ u16 last_device;
+
+ /* start of exclusion range of that IOMMU */
+ u64 exclusion_start;
+ /* length of exclusion range of that IOMMU */
+ u64 exclusion_length;
+
+ /* command buffer virtual address */
+ u8 *cmd_buf;
+ /* size of command buffer */
+ u32 cmd_buf_size;
+
+ /* event buffer virtual address */
+ u8 *evt_buf;
+ /* size of event buffer */
+ u32 evt_buf_size;
+ /* MSI number for event interrupt */
+ u16 evt_msi_num;
+
+	/* if non-zero, we need to send a completion wait command */
+ int need_sync;
+
+ /* true if interrupts for this IOMMU are already enabled */
+ bool int_enabled;
+
+ /* default dma_ops domain for that IOMMU */
+ struct dma_ops_domain *default_dom;
+};
+
+/*
+ * List of all IOMMUs in the system. This list is not locked because it is
+ * only written and read at driver initialization or suspend time
+ */
+extern struct list_head amd_iommu_list;
+
+/*
+ * Structure defining one entry in the device table
+ */
+struct dev_table_entry {
+ u32 data[8];
+};
+
+/*
+ * One entry for unity mappings parsed out of the ACPI table.
+ */
+struct unity_map_entry {
+ struct list_head list;
+
+	/* first device id this entry is used for (inclusive) */
+	u16 devid_start;
+	/* last device id this entry is used for (inclusive) */
+	u16 devid_end;
+
+	/* first address to unity map (inclusive) */
+	u64 address_start;
+	/* last address to unity map (inclusive) */
+	u64 address_end;
+
+ /* required protection */
+ int prot;
+};
+
+/*
+ * List of all unity mappings. It is not locked because at runtime it is only
+ * read. It is created at ACPI table parsing time.
+ */
+extern struct list_head amd_iommu_unity_map;
+
+/*
+ * Data structures for device handling
+ */
+
+/*
+ * Device table used by hardware. Read and write accesses by software are
+ * locked with the amd_iommu_pd_table lock.
+ */
+extern struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * Alias table to map requestor ids to device ids. Not locked because it is
+ * only read at runtime.
+ */
+extern u16 *amd_iommu_alias_table;
+
+/*
+ * Reverse lookup table to find the IOMMU which translates a specific device.
+ */
+extern struct amd_iommu **amd_iommu_rlookup_table;
+
+/* size of the dma_ops aperture as power of 2 */
+extern unsigned amd_iommu_aperture_order;
+
+/* largest PCI device id we expect translation requests for */
+extern u16 amd_iommu_last_bdf;
+
+/* data structures for protection domain handling */
+extern struct protection_domain **amd_iommu_pd_table;
+
+/* allocation bitmap for domain ids */
+extern unsigned long *amd_iommu_pd_alloc_bitmap;
+
+/* will be 1 if device isolation is enabled */
+extern int amd_iommu_isolate;
+
+/*
+ * If true, the addresses will be flushed at unmap time, not when
+ * they are reused
+ */
+extern bool amd_iommu_unmap_flush;
+
+/* takes a PCI device id and prints it out in a readable form */
+static inline void print_devid(u16 devid, int nl)
+{
+ int bus = devid >> 8;
+ int dev = devid >> 3 & 0x1f;
+ int fn = devid & 0x07;
+
+ printk("%02x:%02x.%x", bus, dev, fn);
+ if (nl)
+ printk("\n");
+}
+
+/* takes bus and device/function and returns the device id
+ * FIXME: should that be in generic PCI code? */
+static inline u16 calc_devid(u8 bus, u8 devfn)
+{
+ return (((u16)bus) << 8) | devfn;
+}
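+
+/*
+ * Illustrative only, not part of this patch: made-up values for bus
+ * 0x02, device 0x03, function 1 round-trip through the helpers above
+ * (PCI_DEVFN() is assumed from <linux/pci.h>):
+ *
+ *	calc_devid(0x02, PCI_DEVFN(0x03, 1)) == 0x0219
+ *	PCI_BUS(0x0219)                      == 0x02
+ */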
+
+#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
new file mode 100644
index 00000000000..3b1510b4fc5
--- /dev/null
+++ b/arch/x86/include/asm/apic.h
@@ -0,0 +1,199 @@
+#ifndef _ASM_X86_APIC_H
+#define _ASM_X86_APIC_H
+
+#include <linux/pm.h>
+#include <linux/delay.h>
+
+#include <asm/alternative.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+#define ARCH_APICTIMER_STOPS_ON_C3 1
+
+/*
+ * Debugging macros
+ */
+#define APIC_QUIET 0
+#define APIC_VERBOSE 1
+#define APIC_DEBUG 2
+
+/*
+ * Define the default level of output to be very little
+ * This can be turned up by using apic=verbose for more
+ * information and apic=debug for _lots_ of information.
+ * apic_verbosity is defined in apic.c
+ */
+#define apic_printk(v, s, a...) do { \
+ if ((v) <= apic_verbosity) \
+ printk(s, ##a); \
+ } while (0)
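+
+/*
+ * Illustrative only, not part of this patch: a message printed only
+ * when booting with apic=verbose or apic=debug:
+ *
+ *	apic_printk(APIC_VERBOSE, "mapped APIC to %#lx\n", APIC_BASE);
+ */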
+
+
+extern void generic_apic_probe(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+extern unsigned int apic_verbosity;
+extern int local_apic_timer_c2_ok;
+
+extern int disable_apic;
+/*
+ * Basic functions accessing APICs.
+ */
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define setup_boot_clock setup_boot_APIC_clock
+#define setup_secondary_clock setup_secondary_APIC_clock
+#endif
+
+extern int is_vsmp_box(void);
+extern void xapic_wait_icr_idle(void);
+extern u32 safe_xapic_wait_icr_idle(void);
+extern u64 xapic_icr_read(void);
+extern void xapic_icr_write(u32, u32);
+extern int setup_profiling_timer(unsigned int);
+
+static inline void native_apic_mem_write(u32 reg, u32 v)
+{
+ volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
+
+ alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
+ ASM_OUTPUT2("=r" (v), "=m" (*addr)),
+ ASM_OUTPUT2("0" (v), "m" (*addr)));
+}
+
+static inline u32 native_apic_mem_read(u32 reg)
+{
+ return *((volatile u32 *)(APIC_BASE + reg));
+}
+
+static inline void native_apic_msr_write(u32 reg, u32 v)
+{
+ if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
+ reg == APIC_LVR)
+ return;
+
+ wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0);
+}
+
+static inline u32 native_apic_msr_read(u32 reg)
+{
+ u32 low, high;
+
+ if (reg == APIC_DFR)
+ return -1;
+
+ rdmsr(APIC_BASE_MSR + (reg >> 4), low, high);
+ return low;
+}
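+
+/*
+ * Illustrative only, not part of this patch: the "reg >> 4" above maps
+ * a 16-byte-aligned xAPIC MMIO offset onto the x2APIC MSR range, e.g.
+ * APIC_ICR (0x300) -> APIC_BASE_MSR + 0x30 == MSR 0x830.
+ */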
+
+#ifndef CONFIG_X86_32
+extern int x2apic, x2apic_preenabled;
+extern void check_x2apic(void);
+extern void enable_x2apic(void);
+extern void enable_IR_x2apic(void);
+extern void x2apic_icr_write(u32 low, u32 id);
+static inline int x2apic_enabled(void)
+{
+ int msr, msr2;
+
+ if (!cpu_has_x2apic)
+ return 0;
+
+ rdmsr(MSR_IA32_APICBASE, msr, msr2);
+ if (msr & X2APIC_ENABLE)
+ return 1;
+ return 0;
+}
+#else
+#define x2apic_enabled() 0
+#endif
+
+struct apic_ops {
+ u32 (*read)(u32 reg);
+ void (*write)(u32 reg, u32 v);
+ u64 (*icr_read)(void);
+ void (*icr_write)(u32 low, u32 high);
+ void (*wait_icr_idle)(void);
+ u32 (*safe_wait_icr_idle)(void);
+};
+
+extern struct apic_ops *apic_ops;
+
+#define apic_read (apic_ops->read)
+#define apic_write (apic_ops->write)
+#define apic_icr_read (apic_ops->icr_read)
+#define apic_icr_write (apic_ops->icr_write)
+#define apic_wait_icr_idle (apic_ops->wait_icr_idle)
+#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle)
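+
+/*
+ * Illustrative sketch, not part of this patch: an xAPIC ops instance
+ * built from the accessors above (the variable name is made up):
+ *
+ *	static struct apic_ops example_xapic_ops = {
+ *		.read			= native_apic_mem_read,
+ *		.write			= native_apic_mem_write,
+ *		.icr_read		= xapic_icr_read,
+ *		.icr_write		= xapic_icr_write,
+ *		.wait_icr_idle		= xapic_wait_icr_idle,
+ *		.safe_wait_icr_idle	= safe_xapic_wait_icr_idle,
+ *	};
+ */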
+
+extern int get_physical_broadcast(void);
+
+#ifdef CONFIG_X86_64
+static inline void ack_x2APIC_irq(void)
+{
+ /* Docs say use 0 for future compatibility */
+ native_apic_msr_write(APIC_EOI, 0);
+}
+#endif
+
+
+static inline void ack_APIC_irq(void)
+{
+ /*
+ * ack_APIC_irq() actually gets compiled as a single instruction
+ * ... yummie.
+ */
+
+ /* Docs say use 0 for future compatibility */
+ apic_write(APIC_EOI, 0);
+}
+
+extern int lapic_get_maxlvt(void);
+extern void clear_local_APIC(void);
+extern void connect_bsp_APIC(void);
+extern void disconnect_bsp_APIC(int virt_wire_setup);
+extern void disable_local_APIC(void);
+extern void lapic_shutdown(void);
+extern int verify_local_APIC(void);
+extern void cache_APIC_registers(void);
+extern void sync_Arb_IDs(void);
+extern void init_bsp_APIC(void);
+extern void setup_local_APIC(void);
+extern void end_local_APIC_setup(void);
+extern void init_apic_mappings(void);
+extern void setup_boot_APIC_clock(void);
+extern void setup_secondary_APIC_clock(void);
+extern int APIC_init_uniprocessor(void);
+extern void enable_NMI_through_LVT0(void);
+
+/*
+ * On 32bit this is mach-xxx local
+ */
+#ifdef CONFIG_X86_64
+extern void early_init_lapic_mapping(void);
+extern int apic_is_clustered_box(void);
+#else
+static inline int apic_is_clustered_box(void)
+{
+ return 0;
+}
+#endif
+
+extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
+extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
+
+
+#else /* !CONFIG_X86_LOCAL_APIC */
+static inline void lapic_shutdown(void) { }
+#define local_apic_timer_c2_ok 1
+static inline void init_apic_mappings(void) { }
+
+#endif /* !CONFIG_X86_LOCAL_APIC */
+
+#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
new file mode 100644
index 00000000000..63134e31e8b
--- /dev/null
+++ b/arch/x86/include/asm/apicdef.h
@@ -0,0 +1,417 @@
+#ifndef _ASM_X86_APICDEF_H
+#define _ASM_X86_APICDEF_H
+
+/*
+ * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
+ *
+ * Alan Cox <Alan.Cox@linux.org>, 1995.
+ * Ingo Molnar <mingo@redhat.com>, 1999, 2000
+ */
+
+#define APIC_DEFAULT_PHYS_BASE 0xfee00000
+
+#define APIC_ID 0x20
+
+#define APIC_LVR 0x30
+#define APIC_LVR_MASK 0xFF00FF
+#define GET_APIC_VERSION(x) ((x) & 0xFFu)
+#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu)
+#ifdef CONFIG_X86_32
+# define APIC_INTEGRATED(x) ((x) & 0xF0u)
+#else
+# define APIC_INTEGRATED(x) (1)
+#endif
+#define APIC_XAPIC(x) ((x) >= 0x14)
+#define APIC_TASKPRI 0x80
+#define APIC_TPRI_MASK 0xFFu
+#define APIC_ARBPRI 0x90
+#define APIC_ARBPRI_MASK 0xFFu
+#define APIC_PROCPRI 0xA0
+#define APIC_EOI 0xB0
+#define APIC_EIO_ACK 0x0
+#define APIC_RRR 0xC0
+#define APIC_LDR 0xD0
+#define APIC_LDR_MASK (0xFFu << 24)
+#define GET_APIC_LOGICAL_ID(x) (((x) >> 24) & 0xFFu)
+#define SET_APIC_LOGICAL_ID(x) (((x) << 24))
+#define APIC_ALL_CPUS 0xFFu
+#define APIC_DFR 0xE0
+#define APIC_DFR_CLUSTER 0x0FFFFFFFul
+#define APIC_DFR_FLAT 0xFFFFFFFFul
+#define APIC_SPIV 0xF0
+#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
+#define APIC_SPIV_APIC_ENABLED (1 << 8)
+#define APIC_ISR 0x100
+#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. */
+#define APIC_TMR 0x180
+#define APIC_IRR 0x200
+#define APIC_ESR 0x280
+#define APIC_ESR_SEND_CS 0x00001
+#define APIC_ESR_RECV_CS 0x00002
+#define APIC_ESR_SEND_ACC 0x00004
+#define APIC_ESR_RECV_ACC 0x00008
+#define APIC_ESR_SENDILL 0x00020
+#define APIC_ESR_RECVILL 0x00040
+#define APIC_ESR_ILLREGA 0x00080
+#define APIC_ICR 0x300
+#define APIC_DEST_SELF 0x40000
+#define APIC_DEST_ALLINC 0x80000
+#define APIC_DEST_ALLBUT 0xC0000
+#define APIC_ICR_RR_MASK 0x30000
+#define APIC_ICR_RR_INVALID 0x00000
+#define APIC_ICR_RR_INPROG 0x10000
+#define APIC_ICR_RR_VALID 0x20000
+#define APIC_INT_LEVELTRIG 0x08000
+#define APIC_INT_ASSERT 0x04000
+#define APIC_ICR_BUSY 0x01000
+#define APIC_DEST_LOGICAL 0x00800
+#define APIC_DEST_PHYSICAL 0x00000
+#define APIC_DM_FIXED 0x00000
+#define APIC_DM_LOWEST 0x00100
+#define APIC_DM_SMI 0x00200
+#define APIC_DM_REMRD 0x00300
+#define APIC_DM_NMI 0x00400
+#define APIC_DM_INIT 0x00500
+#define APIC_DM_STARTUP 0x00600
+#define APIC_DM_EXTINT 0x00700
+#define APIC_VECTOR_MASK 0x000FF
+#define APIC_ICR2 0x310
+#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF)
+#define SET_APIC_DEST_FIELD(x) ((x) << 24)
+#define APIC_LVTT 0x320
+#define APIC_LVTTHMR 0x330
+#define APIC_LVTPC 0x340
+#define APIC_LVT0 0x350
+#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18)
+#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3)
+#define SET_APIC_TIMER_BASE(x) (((x) << 18))
+#define APIC_TIMER_BASE_CLKIN 0x0
+#define APIC_TIMER_BASE_TMBASE 0x1
+#define APIC_TIMER_BASE_DIV 0x2
+#define APIC_LVT_TIMER_PERIODIC (1 << 17)
+#define APIC_LVT_MASKED (1 << 16)
+#define APIC_LVT_LEVEL_TRIGGER (1 << 15)
+#define APIC_LVT_REMOTE_IRR (1 << 14)
+#define APIC_INPUT_POLARITY (1 << 13)
+#define APIC_SEND_PENDING (1 << 12)
+#define APIC_MODE_MASK 0x700
+#define GET_APIC_DELIVERY_MODE(x) (((x) >> 8) & 0x7)
+#define SET_APIC_DELIVERY_MODE(x, y) (((x) & ~0x700) | ((y) << 8))
+#define APIC_MODE_FIXED 0x0
+#define APIC_MODE_NMI 0x4
+#define APIC_MODE_EXTINT 0x7
+#define APIC_LVT1 0x360
+#define APIC_LVTERR 0x370
+#define APIC_TMICT 0x380
+#define APIC_TMCCT 0x390
+#define APIC_TDCR 0x3E0
+#define APIC_SELF_IPI 0x3F0
+#define APIC_TDR_DIV_TMBASE (1 << 2)
+#define APIC_TDR_DIV_1 0xB
+#define APIC_TDR_DIV_2 0x0
+#define APIC_TDR_DIV_4 0x1
+#define APIC_TDR_DIV_8 0x2
+#define APIC_TDR_DIV_16 0x3
+#define APIC_TDR_DIV_32 0x8
+#define APIC_TDR_DIV_64 0x9
+#define APIC_TDR_DIV_128 0xA
+#define APIC_EILVT0 0x500
+#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
+#define APIC_EILVT_NR_AMD_10H 4
+#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
+#define APIC_EILVT_MSG_FIX 0x0
+#define APIC_EILVT_MSG_SMI 0x2
+#define APIC_EILVT_MSG_NMI 0x4
+#define APIC_EILVT_MSG_EXT 0x7
+#define APIC_EILVT_MASKED (1 << 16)
+#define APIC_EILVT1 0x510
+#define APIC_EILVT2 0x520
+#define APIC_EILVT3 0x530
+
+#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
+#define APIC_BASE_MSR 0x800
+#define X2APIC_ENABLE (1UL << 10)
+
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
+
+/*
+ * All x86-64 systems are xAPIC compatible.
+ * In the following, "apicid" is a physical APIC ID.
+ */
+#define XAPIC_DEST_CPUS_SHIFT 4
+#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
+#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
+#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
+#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
+#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK)
+#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
+
+/*
+ * the local APIC register structure, memory mapped. Not terribly well
+ * tested, but we might eventually use it in the future - the reason we
+ * cannot use it right now is the P5 APIC: it has an erratum whereby it
+ * can only take 32-bit reads and writes, not 8-bit ones ...
+ */
+#define u32 unsigned int
+
+struct local_apic {
+
+/*000*/ struct { u32 __reserved[4]; } __reserved_01;
+
+/*010*/ struct { u32 __reserved[4]; } __reserved_02;
+
+/*020*/ struct { /* APIC ID Register */
+ u32 __reserved_1 : 24,
+ phys_apic_id : 4,
+ __reserved_2 : 4;
+ u32 __reserved[3];
+ } id;
+
+/*030*/ const
+ struct { /* APIC Version Register */
+ u32 version : 8,
+ __reserved_1 : 8,
+ max_lvt : 8,
+ __reserved_2 : 8;
+ u32 __reserved[3];
+ } version;
+
+/*040*/ struct { u32 __reserved[4]; } __reserved_03;
+
+/*050*/ struct { u32 __reserved[4]; } __reserved_04;
+
+/*060*/ struct { u32 __reserved[4]; } __reserved_05;
+
+/*070*/ struct { u32 __reserved[4]; } __reserved_06;
+
+/*080*/ struct { /* Task Priority Register */
+ u32 priority : 8,
+ __reserved_1 : 24;
+ u32 __reserved_2[3];
+ } tpr;
+
+/*090*/ const
+ struct { /* Arbitration Priority Register */
+ u32 priority : 8,
+ __reserved_1 : 24;
+ u32 __reserved_2[3];
+ } apr;
+
+/*0A0*/ const
+ struct { /* Processor Priority Register */
+ u32 priority : 8,
+ __reserved_1 : 24;
+ u32 __reserved_2[3];
+ } ppr;
+
+/*0B0*/ struct { /* End Of Interrupt Register */
+ u32 eoi;
+ u32 __reserved[3];
+ } eoi;
+
+/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
+
+/*0D0*/ struct { /* Logical Destination Register */
+ u32 __reserved_1 : 24,
+ logical_dest : 8;
+ u32 __reserved_2[3];
+ } ldr;
+
+/*0E0*/ struct { /* Destination Format Register */
+ u32 __reserved_1 : 28,
+ model : 4;
+ u32 __reserved_2[3];
+ } dfr;
+
+/*0F0*/ struct { /* Spurious Interrupt Vector Register */
+ u32 spurious_vector : 8,
+ apic_enabled : 1,
+ focus_cpu : 1,
+ __reserved_2 : 22;
+ u32 __reserved_3[3];
+ } svr;
+
+/*100*/ struct { /* In Service Register */
+/*170*/ u32 bitfield;
+ u32 __reserved[3];
+ } isr [8];
+
+/*180*/ struct { /* Trigger Mode Register */
+/*1F0*/ u32 bitfield;
+ u32 __reserved[3];
+ } tmr [8];
+
+/*200*/ struct { /* Interrupt Request Register */
+/*270*/ u32 bitfield;
+ u32 __reserved[3];
+ } irr [8];
+
+/*280*/ union { /* Error Status Register */
+ struct {
+ u32 send_cs_error : 1,
+ receive_cs_error : 1,
+ send_accept_error : 1,
+ receive_accept_error : 1,
+ __reserved_1 : 1,
+ send_illegal_vector : 1,
+ receive_illegal_vector : 1,
+ illegal_register_address : 1,
+ __reserved_2 : 24;
+ u32 __reserved_3[3];
+ } error_bits;
+ struct {
+ u32 errors;
+ u32 __reserved_3[3];
+ } all_errors;
+ } esr;
+
+/*290*/ struct { u32 __reserved[4]; } __reserved_08;
+
+/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
+
+/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
+
+/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
+
+/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
+
+/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
+
+/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
+
+/*300*/ struct { /* Interrupt Command Register 1 */
+ u32 vector : 8,
+ delivery_mode : 3,
+ destination_mode : 1,
+ delivery_status : 1,
+ __reserved_1 : 1,
+ level : 1,
+ trigger : 1,
+ __reserved_2 : 2,
+ shorthand : 2,
+ __reserved_3 : 12;
+ u32 __reserved_4[3];
+ } icr1;
+
+/*310*/ struct { /* Interrupt Command Register 2 */
+ union {
+ u32 __reserved_1 : 24,
+ phys_dest : 4,
+ __reserved_2 : 4;
+ u32 __reserved_3 : 24,
+ logical_dest : 8;
+ } dest;
+ u32 __reserved_4[3];
+ } icr2;
+
+/*320*/ struct { /* LVT - Timer */
+ u32 vector : 8,
+ __reserved_1 : 4,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ timer_mode : 1,
+ __reserved_3 : 14;
+ u32 __reserved_4[3];
+ } lvt_timer;
+
+/*330*/ struct { /* LVT - Thermal Sensor */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ __reserved_3 : 15;
+ u32 __reserved_4[3];
+ } lvt_thermal;
+
+/*340*/ struct { /* LVT - Performance Counter */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ __reserved_3 : 15;
+ u32 __reserved_4[3];
+ } lvt_pc;
+
+/*350*/ struct { /* LVT - LINT0 */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ polarity : 1,
+ remote_irr : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15;
+ u32 __reserved_3[3];
+ } lvt_lint0;
+
+/*360*/ struct { /* LVT - LINT1 */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ polarity : 1,
+ remote_irr : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15;
+ u32 __reserved_3[3];
+ } lvt_lint1;
+
+/*370*/ struct { /* LVT - Error */
+ u32 vector : 8,
+ __reserved_1 : 4,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ __reserved_3 : 15;
+ u32 __reserved_4[3];
+ } lvt_error;
+
+/*380*/ struct { /* Timer Initial Count Register */
+ u32 initial_count;
+ u32 __reserved_2[3];
+ } timer_icr;
+
+/*390*/ const
+ struct { /* Timer Current Count Register */
+ u32 curr_count;
+ u32 __reserved_2[3];
+ } timer_ccr;
+
+/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
+
+/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
+
+/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
+
+/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
+
+/*3E0*/ struct { /* Timer Divide Configuration Register */
+ u32 divisor : 4,
+ __reserved_1 : 28;
+ u32 __reserved_2[3];
+ } timer_dcr;
+
+/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
+
+} __attribute__ ((packed));
+
+#undef u32
+
+#ifdef CONFIG_X86_32
+ #define BAD_APICID 0xFFu
+#else
+ #define BAD_APICID 0xFFFFu
+#endif
+#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/arch_hooks.h b/arch/x86/include/asm/arch_hooks.h
new file mode 100644
index 00000000000..cbd4957838a
--- /dev/null
+++ b/arch/x86/include/asm/arch_hooks.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_ARCH_HOOKS_H
+#define _ASM_X86_ARCH_HOOKS_H
+
+#include <linux/interrupt.h>
+
+/*
+ * arch/x86/include/asm/arch_hooks.h
+ *
+ * define the architecture specific hooks
+ */
+
+/* these aren't arch hooks, they are generic routines
+ * that can be used by the hooks */
+extern void init_ISA_irqs(void);
+extern irqreturn_t timer_interrupt(int irq, void *dev_id);
+
+/* these are the defined hooks */
+extern void intr_init_hook(void);
+extern void pre_intr_init_hook(void);
+extern void pre_setup_arch_hook(void);
+extern void trap_init_hook(void);
+extern void pre_time_init_hook(void);
+extern void time_init_hook(void);
+extern void mca_nmi_hook(void);
+
+#endif /* _ASM_X86_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
new file mode 100644
index 00000000000..56be78f582f
--- /dev/null
+++ b/arch/x86/include/asm/asm.h
@@ -0,0 +1,47 @@
+#ifndef _ASM_X86_ASM_H
+#define _ASM_X86_ASM_H
+
+#ifdef __ASSEMBLY__
+# define __ASM_FORM(x) x
+# define __ASM_EX_SEC .section __ex_table
+#else
+# define __ASM_FORM(x) " " #x " "
+# define __ASM_EX_SEC " .section __ex_table,\"a\"\n"
+#endif
+
+#ifdef CONFIG_X86_32
+# define __ASM_SEL(a,b) __ASM_FORM(a)
+#else
+# define __ASM_SEL(a,b) __ASM_FORM(b)
+#endif
+
+#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q)
+#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg)
+
+#define _ASM_PTR __ASM_SEL(.long, .quad)
+#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8)
+
+#define _ASM_MOV __ASM_SIZE(mov)
+#define _ASM_INC __ASM_SIZE(inc)
+#define _ASM_DEC __ASM_SIZE(dec)
+#define _ASM_ADD __ASM_SIZE(add)
+#define _ASM_SUB __ASM_SIZE(sub)
+#define _ASM_XADD __ASM_SIZE(xadd)
+
+#define _ASM_AX __ASM_REG(ax)
+#define _ASM_BX __ASM_REG(bx)
+#define _ASM_CX __ASM_REG(cx)
+#define _ASM_DX __ASM_REG(dx)
+#define _ASM_SP __ASM_REG(sp)
+#define _ASM_BP __ASM_REG(bp)
+#define _ASM_SI __ASM_REG(si)
+#define _ASM_DI __ASM_REG(di)
+
+/* Exception table entry */
+# define _ASM_EXTABLE(from,to) \
+ __ASM_EX_SEC \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR #from "," #to "\n" \
+ " .previous\n"
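+
+/*
+ * Illustrative sketch, not part of this patch: a faultable load whose
+ * fixup simply resumes at the next instruction (val and ptr are
+ * made-up names; val is left undefined if the load faults):
+ *
+ *	asm volatile("1:	movl (%1), %0\n"
+ *		     "2:\n"
+ *		     _ASM_EXTABLE(1b, 2b)
+ *		     : "=r" (val)
+ *		     : "r" (ptr));
+ */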
+
+#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
new file mode 100644
index 00000000000..4e1b8873c47
--- /dev/null
+++ b/arch/x86/include/asm/atomic.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "atomic_32.h"
+#else
+# include "atomic_64.h"
+#endif
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
new file mode 100644
index 00000000000..ad5b9f6ecdd
--- /dev/null
+++ b/arch/x86/include/asm/atomic_32.h
@@ -0,0 +1,259 @@
+#ifndef _ASM_X86_ATOMIC_32_H
+#define _ASM_X86_ATOMIC_32_H
+
+#include <linux/compiler.h>
+#include <asm/processor.h>
+#include <asm/cmpxchg.h>
+
+/*
+ * Atomic operations that C can't guarantee us. Useful for
+ * resource counting etc.
+ */
+
+/*
+ * Make sure gcc doesn't try to be clever and move things around
+ * on us. We need to use _exactly_ the address the user gave us,
+ * not some alias that contains the same information.
+ */
+typedef struct {
+ int counter;
+} atomic_t;
+
+#define ATOMIC_INIT(i) { (i) }
+
+/**
+ * atomic_read - read atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically reads the value of @v.
+ */
+#define atomic_read(v) ((v)->counter)
+
+/**
+ * atomic_set - set atomic variable
+ * @v: pointer of type atomic_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+#define atomic_set(v, i) (((v)->counter) = (i))
+
+/**
+ * atomic_add - add integer to atomic variable
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v.
+ */
+static inline void atomic_add(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i));
+}
+
+/**
+ * atomic_sub - subtract integer from atomic variable
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline void atomic_sub(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "subl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i));
+}
+
+/**
+ * atomic_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic_sub_and_test(int i, atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
+ : "+m" (v->counter), "=qm" (c)
+ : "ir" (i) : "memory");
+ return c;
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic_inc(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "incl %0"
+ : "+m" (v->counter));
+}
+
+/**
+ * atomic_dec - decrement atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static inline void atomic_dec(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "decl %0"
+ : "+m" (v->counter));
+}
+
+/**
+ * atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic_dec_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "decl %0; sete %1"
+ : "+m" (v->counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic_inc_and_test - increment and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic_inc_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "incl %0; sete %1"
+ : "+m" (v->counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic_add_negative - add and test if negative
+ * @v: pointer of type atomic_t
+ * @i: integer value to add
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int atomic_add_negative(int i, atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
+ : "+m" (v->counter), "=qm" (c)
+ : "ir" (i) : "memory");
+ return c;
+}
+
+/**
+ * atomic_add_return - add integer and return
+ * @v: pointer of type atomic_t
+ * @i: integer value to add
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+ int __i;
+#ifdef CONFIG_M386
+ unsigned long flags;
+ if (unlikely(boot_cpu_data.x86 <= 3))
+ goto no_xadd;
+#endif
+ /* Modern 486+ processor */
+ __i = i;
+ asm volatile(LOCK_PREFIX "xaddl %0, %1"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
+
+#ifdef CONFIG_M386
+no_xadd: /* Legacy 386 processor */
+ local_irq_save(flags);
+ __i = atomic_read(v);
+ atomic_set(v, i + __i);
+ local_irq_restore(flags);
+ return i + __i;
+#endif
+}
+
+/**
+ * atomic_sub_return - subtract integer and return
+ * @v: pointer of type atomic_t
+ * @i: integer value to subtract
+ *
+ * Atomically subtracts @i from @v and returns @v - @i
+ */
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+ return atomic_add_return(-i, v);
+}
+
+#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
+
+/**
+ * atomic_add_unless - add unless the number is already a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as @v was not already @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+ int c, old;
+ c = atomic_read(v);
+ for (;;) {
+ if (unlikely(c == (u)))
+ break;
+ old = atomic_cmpxchg((v), c, c + (a));
+ if (likely(old == c))
+ break;
+ c = old;
+ }
+ return c != (u);
+}
+
+#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
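+
+/*
+ * Illustrative only, not part of this patch: the classic use of
+ * atomic_inc_not_zero() is taking a reference only while the object
+ * is still live (obj and refcnt are made-up names):
+ *
+ *	if (!atomic_inc_not_zero(&obj->refcnt))
+ *		return NULL;	(object is already being freed)
+ */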
+
+#define atomic_inc_return(v) (atomic_add_return(1, v))
+#define atomic_dec_return(v) (atomic_sub_return(1, v))
+
+/* These are x86-specific, used by some header files */
+#define atomic_clear_mask(mask, addr) \
+ asm volatile(LOCK_PREFIX "andl %0,%1" \
+ : : "r" (~(mask)), "m" (*(addr)) : "memory")
+
+#define atomic_set_mask(mask, addr) \
+ asm volatile(LOCK_PREFIX "orl %0,%1" \
+ : : "r" (mask), "m" (*(addr)) : "memory")
+
+/* Atomic operations are already serializing on x86 */
+#define smp_mb__before_atomic_dec() barrier()
+#define smp_mb__after_atomic_dec() barrier()
+#define smp_mb__before_atomic_inc() barrier()
+#define smp_mb__after_atomic_inc() barrier()
+
+#include <asm-generic/atomic.h>
+#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
new file mode 100644
index 00000000000..279d2a731f3
--- /dev/null
+++ b/arch/x86/include/asm/atomic_64.h
@@ -0,0 +1,473 @@
+#ifndef _ASM_X86_ATOMIC_64_H
+#define _ASM_X86_ATOMIC_64_H
+
+#include <asm/alternative.h>
+#include <asm/cmpxchg.h>
+
+/* atomic_t should be a 32-bit signed type */
+
+/*
+ * Atomic operations that C can't guarantee us. Useful for
+ * resource counting etc.
+ */
+
+/*
+ * Make sure gcc doesn't try to be clever and move things around
+ * on us. We need to use _exactly_ the address the user gave us,
+ * not some alias that contains the same information.
+ */
+typedef struct {
+ int counter;
+} atomic_t;
+
+#define ATOMIC_INIT(i) { (i) }
+
+/**
+ * atomic_read - read atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically reads the value of @v.
+ */
+#define atomic_read(v) ((v)->counter)
+
+/**
+ * atomic_set - set atomic variable
+ * @v: pointer of type atomic_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+#define atomic_set(v, i) (((v)->counter) = (i))
+
+/**
+ * atomic_add - add integer to atomic variable
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v.
+ */
+static inline void atomic_add(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0"
+ : "=m" (v->counter)
+ : "ir" (i), "m" (v->counter));
+}
+
+/**
+ * atomic_sub - subtract integer from atomic variable
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline void atomic_sub(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "subl %1,%0"
+ : "=m" (v->counter)
+ : "ir" (i), "m" (v->counter));
+}
+
+/**
+ * atomic_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic_sub_and_test(int i, atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "ir" (i), "m" (v->counter) : "memory");
+ return c;
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic_inc(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "incl %0"
+ : "=m" (v->counter)
+ : "m" (v->counter));
+}
+
+/**
+ * atomic_dec - decrement atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static inline void atomic_dec(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "decl %0"
+ : "=m" (v->counter)
+ : "m" (v->counter));
+}
+
+/**
+ * atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic_dec_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "decl %0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "m" (v->counter) : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic_inc_and_test - increment and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic_inc_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "incl %0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "m" (v->counter) : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic_add_negative - add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int atomic_add_negative(int i, atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "ir" (i), "m" (v->counter) : "memory");
+ return c;
+}
+
+/**
+ * atomic_add_return - add and return
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+ int __i = i;
+ asm volatile(LOCK_PREFIX "xaddl %0, %1"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
+}
+
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+ return atomic_add_return(-i, v);
+}
+
+#define atomic_inc_return(v) (atomic_add_return(1, v))
+#define atomic_dec_return(v) (atomic_sub_return(1, v))
+
+/* A 64-bit atomic type */
+
+typedef struct {
+ long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(i) { (i) }
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+#define atomic64_read(v) ((v)->counter)
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @v: pointer to type atomic64_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+#define atomic64_set(v, i) (((v)->counter) = (i))
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v.
+ */
+static inline void atomic64_add(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "addq %1,%0"
+ : "=m" (v->counter)
+ : "er" (i), "m" (v->counter));
+}
+
+/**
+ * atomic64_sub - subtract integer from atomic64 variable
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline void atomic64_sub(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "subq %1,%0"
+ : "=m" (v->counter)
+ : "er" (i), "m" (v->counter));
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_sub_and_test(long i, atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "er" (i), "m" (v->counter) : "memory");
+ return c;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic64_inc(atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "incq %0"
+ : "=m" (v->counter)
+ : "m" (v->counter));
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static inline void atomic64_dec(atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "decq %0"
+ : "=m" (v->counter)
+ : "m" (v->counter));
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "decq %0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "m" (v->counter) : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "incq %0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "m" (v->counter) : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int atomic64_add_negative(long i, atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "er" (i), "m" (v->counter) : "memory");
+ return c;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline long atomic64_add_return(long i, atomic64_t *v)
+{
+ long __i = i;
+ asm volatile(LOCK_PREFIX "xaddq %0, %1;"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
+}
+
+static inline long atomic64_sub_return(long i, atomic64_t *v)
+{
+ return atomic64_add_return(-i, v);
+}
+
+#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
+#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
+
+#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
+#define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
+
+#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
+
+/**
+ * atomic_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as @v was not @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+ int c, old;
+ c = atomic_read(v);
+ for (;;) {
+ if (unlikely(c == (u)))
+ break;
+ old = atomic_cmpxchg((v), c, c + (a));
+ if (likely(old == c))
+ break;
+ c = old;
+ }
+ return c != (u);
+}
+
+#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
+
+/**
+ * atomic64_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic64_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as @v was not @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
+{
+ long c, old;
+ c = atomic64_read(v);
+ for (;;) {
+ if (unlikely(c == (u)))
+ break;
+ old = atomic64_cmpxchg((v), c, c + (a));
+ if (likely(old == c))
+ break;
+ c = old;
+ }
+ return c != (u);
+}
+
+/**
+ * atomic_inc_short - increment of a short integer
+ * @v: pointer to type short int
+ *
+ * Atomically adds 1 to @v
+ * Returns the new value of @v
+ */
+static inline short int atomic_inc_short(short int *v)
+{
+ asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
+ return *v;
+}
+
+/**
+ * atomic_or_long - OR of two long integers
+ * @v1: pointer to type unsigned long
+ * @v2: value of type unsigned long
+ *
+ * Atomically ORs @v2 into @v1
+ */
+static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
+{
+ asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
+}
+
+#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
+
+/* These are x86-specific, used by some header files */
+#define atomic_clear_mask(mask, addr) \
+ asm volatile(LOCK_PREFIX "andl %0,%1" \
+ : : "r" (~(mask)), "m" (*(addr)) : "memory")
+
+#define atomic_set_mask(mask, addr) \
+ asm volatile(LOCK_PREFIX "orl %0,%1" \
+ : : "r" ((unsigned)(mask)), "m" (*(addr)) \
+ : "memory")
+
+/* Atomic operations are already serializing on x86 */
+#define smp_mb__before_atomic_dec() barrier()
+#define smp_mb__after_atomic_dec() barrier()
+#define smp_mb__before_atomic_inc() barrier()
+#define smp_mb__after_atomic_inc() barrier()
+
+#include <asm-generic/atomic.h>
+#endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h
new file mode 100644
index 00000000000..1316b4c3542
--- /dev/null
+++ b/arch/x86/include/asm/auxvec.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_AUXVEC_H
+#define _ASM_X86_AUXVEC_H
+/*
+ * Architecture-neutral AT_ values in 0-17, leave some room
+ * for more of them, start the x86-specific ones at 32.
+ */
+#ifdef __i386__
+#define AT_SYSINFO 32
+#endif
+#define AT_SYSINFO_EHDR 33
+
+#endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h
new file mode 100644
index 00000000000..1d9543b9d35
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/apic.h
@@ -0,0 +1,139 @@
+#ifndef __ASM_MACH_APIC_H
+#define __ASM_MACH_APIC_H
+
+#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu))
+#define esr_disable (1)
+
+static inline int apic_id_registered(void)
+{
+ return (1);
+}
+
+static inline cpumask_t target_cpus(void)
+{
+#ifdef CONFIG_SMP
+ return cpu_online_map;
+#else
+ return cpumask_of_cpu(0);
+#endif
+}
+
+#undef APIC_DEST_LOGICAL
+#define APIC_DEST_LOGICAL 0
+#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+#define INT_DELIVERY_MODE (dest_Fixed)
+#define INT_DEST_MODE (0) /* phys delivery to target proc */
+#define NO_BALANCE_IRQ (0)
+#define WAKE_SECONDARY_VIA_INIT
+
+
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return (0);
+}
+
+static inline unsigned long check_apicid_present(int bit)
+{
+ return (1);
+}
+
+static inline unsigned long calculate_ldr(int cpu)
+{
+ unsigned long val, id;
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ id = xapic_phys_to_log_apicid(cpu);
+ val |= SET_APIC_LOGICAL_ID(id);
+ return val;
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends setting DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static inline void init_apic_ldr(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+static inline void setup_apic_routing(void)
+{
+ printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
+ "Physflat", nr_ioapics);
+}
+
+static inline int multi_timer_check(int apic, int irq)
+{
+ return (0);
+}
+
+static inline int apicid_to_node(int logical_apicid)
+{
+ return apicid_2_node[hard_smp_processor_id()];
+}
+
+static inline int cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < NR_CPUS)
+ return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
+
+ return BAD_APICID;
+}
+
+static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
+{
+ return physid_mask_of_physid(phys_apicid);
+}
+
+extern u8 cpu_2_logical_apicid[];
+/* Mapping from cpu number to logical apicid */
+static inline int cpu_to_logical_apicid(int cpu)
+{
+ if (cpu >= NR_CPUS)
+ return BAD_APICID;
+ return cpu_physical_id(cpu);
+}
+
+static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0xFFL);
+}
+
+static inline void setup_portio_remap(void)
+{
+}
+
+static inline void enable_apic_mode(void)
+{
+}
+
+static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return (1);
+}
+
+/* As we are using a single CPU as destination, pick only one CPU here */
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int cpu;
+ int apicid;
+
+ cpu = first_cpu(cpumask);
+ apicid = cpu_to_logical_apicid(cpu);
+ return apicid;
+}
+
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+#endif /* __ASM_MACH_APIC_H */
diff --git a/arch/x86/include/asm/bigsmp/apicdef.h b/arch/x86/include/asm/bigsmp/apicdef.h
new file mode 100644
index 00000000000..392c3f5ef2f
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/apicdef.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_MACH_APICDEF_H
+#define __ASM_MACH_APICDEF_H
+
+#define APIC_ID_MASK (0xFF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+ return (((x)>>24)&0xFF);
+}
+
+#define GET_APIC_ID(x) get_apic_id(x)
+
+#endif
diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h
new file mode 100644
index 00000000000..9404c535b7e
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/ipi.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_MACH_IPI_H
+#define __ASM_MACH_IPI_H
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector);
+
+static inline void send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_sequence(mask, vector);
+}
+
+static inline void send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ send_IPI_mask(mask, vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+ send_IPI_mask(cpu_online_map, vector);
+}
+
+#endif /* __ASM_MACH_IPI_H */
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
new file mode 100644
index 00000000000..3c7521063d3
--- /dev/null
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_BIOS_EBDA_H
+#define _ASM_X86_BIOS_EBDA_H
+
+#include <asm/io.h>
+
+/*
+ * there is a real-mode segmented pointer at 0x40E pointing to
+ * the 4K EBDA area.
+ */
+static inline unsigned int get_bios_ebda(void)
+{
+ unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
+ address <<= 4;
+ return address; /* 0 means none */
+}
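+
+/*
+ * Illustrative only, not part of this patch: a made-up BDA word of
+ * 0x9fc0 at 0x40E yields an EBDA base address of 0x9fc00.
+ */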
+
+void reserve_ebda_region(void);
+
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+/*
+ * This is obviously not a great place for this, but we want to be
+ * able to scatter it around anywhere in the kernel.
+ */
+void check_for_bios_corruption(void);
+void start_periodic_check_for_corruption(void);
+#else
+static inline void check_for_bios_corruption(void)
+{
+}
+
+static inline void start_periodic_check_for_corruption(void)
+{
+}
+#endif
+
+#endif /* _ASM_X86_BIOS_EBDA_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
new file mode 100644
index 00000000000..36001032271
--- /dev/null
+++ b/arch/x86/include/asm/bitops.h
@@ -0,0 +1,451 @@
+#ifndef _ASM_X86_BITOPS_H
+#define _ASM_X86_BITOPS_H
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ */
+
+#ifndef _LINUX_BITOPS_H
+#error only <linux/bitops.h> can be included directly
+#endif
+
+#include <linux/compiler.h>
+#include <asm/alternative.h>
+
+/*
+ * These have to be done with inline assembly: that way the bit-setting
+ * is guaranteed to be atomic. All bit operations return 0 if the bit
+ * was cleared before the operation and != 0 if it was not.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
+/* Technically wrong, but this avoids compilation errors on some gcc
+ versions. */
+#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
+#else
+#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
+#endif
+
+#define ADDR BITOP_ADDR(addr)
+
+/*
+ * We do the locked ops that don't return the old value as
+ * a mask operation on a byte.
+ */
+#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
+#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3))
+#define CONST_MASK(nr) (1 << ((nr) & 7))
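+
+/*
+ * Illustrative only, not part of this patch: for a constant nr of 13,
+ * CONST_MASK_ADDR(13, addr) addresses the byte at addr + 1 and
+ * CONST_MASK(13) evaluates to 1 << 5 == 0x20.
+ */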
+
+/**
+ * set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered. See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note: there are no guarantees that this function will not be reordered
+ * on non x86 architectures, so if you are writing portable code,
+ * make sure not to rely on its reordering guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void set_bit(unsigned int nr, volatile unsigned long *addr)
+{
+ if (IS_IMMEDIATE(nr)) {
+ asm volatile(LOCK_PREFIX "orb %1,%0"
+ : CONST_MASK_ADDR(nr, addr)
+ : "iq" ((u8)CONST_MASK(nr))
+ : "memory");
+ } else {
+ asm volatile(LOCK_PREFIX "bts %1,%0"
+ : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
+ }
+}
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __set_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
+}
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and may not be reordered. However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static inline void clear_bit(int nr, volatile unsigned long *addr)
+{
+ if (IS_IMMEDIATE(nr)) {
+ asm volatile(LOCK_PREFIX "andb %1,%0"
+ : CONST_MASK_ADDR(nr, addr)
+ : "iq" ((u8)~CONST_MASK(nr)));
+ } else {
+ asm volatile(LOCK_PREFIX "btr %1,%0"
+ : BITOP_ADDR(addr)
+ : "Ir" (nr));
+ }
+}
+
+/*
+ * clear_bit_unlock - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock.
+ */
+static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
+{
+ barrier();
+ clear_bit(nr, addr);
+}
+
+static inline void __clear_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
+}
+
+/*
+ * __clear_bit_unlock - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * __clear_bit() is non-atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock if no other CPUs can concurrently
+ * modify other bits in the word.
+ *
+ * No memory barrier is required here, because x86 cannot reorder stores past
+ * older loads. Same principle as spin_unlock.
+ */
+static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
+{
+ barrier();
+ __clear_bit(nr, addr);
+}
+
+#define smp_mb__before_clear_bit() barrier()
+#define smp_mb__after_clear_bit() barrier()
+
+/**
+ * __change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __change_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
+}
+
+/**
+ * change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * change_bit() is atomic and may not be reordered.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void change_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr));
+}
+
+/**
+ * test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
+ "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+
+ return oldbit;
+}
+
+/**
+ * test_and_set_bit_lock - Set a bit and return its old value for lock
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This is the same as test_and_set_bit on x86.
+ */
+static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr)
+{
+ return test_and_set_bit(nr, addr);
+}
+
+/**
+ * __test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two instances of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm("bts %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit), ADDR
+ : "Ir" (nr));
+ return oldbit;
+}
+
+/**
+ * test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+
+ return oldbit;
+}
+
+/**
+ * __test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two instances of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("btr %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit), ADDR
+ : "Ir" (nr));
+ return oldbit;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("btc %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit), ADDR
+ : "Ir" (nr) : "memory");
+
+ return oldbit;
+}
+
+/**
+ * test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+
+ return oldbit;
+}
+
+static inline int constant_test_bit(int nr, const volatile unsigned long *addr)
+{
+ return ((1UL << (nr % BITS_PER_LONG)) &
+ (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
+
+static inline int variable_test_bit(int nr, volatile const unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("bt %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit)
+ : "m" (*(unsigned long *)addr), "Ir" (nr));
+
+ return oldbit;
+}
+
+#if 0 /* Fool kernel-doc since it doesn't do macros yet */
+/**
+ * test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static int test_bit(int nr, const volatile unsigned long *addr);
+#endif
+
+#define test_bit(nr, addr) \
+ (__builtin_constant_p((nr)) \
+ ? constant_test_bit((nr), (addr)) \
+ : variable_test_bit((nr), (addr)))
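+
+/*
+ * Dispatch sketch (illustrative): with a compile-time-constant bit
+ * number, test_bit() resolves to constant_test_bit() and folds to a
+ * simple mask test; a runtime bit number goes through the BT
+ * instruction in variable_test_bit().  "flags" and "nr" are
+ * hypothetical.
+ */
+#if 0
+static int example(unsigned long *flags, int nr)
+{
+	int a = test_bit(3, flags);	/* constant: inlined mask test */
+	int b = test_bit(nr, flags);	/* variable: uses BT */
+
+	return a + b;
+}
+#endif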
+
+/**
+ * __ffs - find first set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ asm("bsf %1,%0"
+ : "=r" (word)
+ : "rm" (word));
+ return word;
+}
+
+/**
+ * ffz - find first zero bit in word
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static inline unsigned long ffz(unsigned long word)
+{
+ asm("bsf %1,%0"
+ : "=r" (word)
+ : "r" (~word));
+ return word;
+}
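+
+/*
+ * Worked examples: __ffs(0x08) == 3 (bit numbering starts at 0), and
+ * ffz(0x07) == 3, the first clear bit.  Both are undefined for the
+ * degenerate inputs noted above.
+ */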
+
+/**
+ * __fls - find last set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no set bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __fls(unsigned long word)
+{
+ asm("bsr %1,%0"
+ : "=r" (word)
+ : "rm" (word));
+ return word;
+}
+
+#ifdef __KERNEL__
+/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, and therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+static inline int ffs(int x)
+{
+ int r;
+#ifdef CONFIG_X86_CMOV
+ asm("bsfl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=r" (r) : "rm" (x), "r" (-1));
+#else
+ asm("bsfl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r + 1;
+}
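+
+/*
+ * Worked examples: ffs(0) == 0, ffs(1) == 1, ffs(0x8000) == 16;
+ * positions are 1-based, unlike __ffs() above.
+ */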
+
+/**
+ * fls - find last set bit in word
+ * @x: the word to search
+ *
+ * This is defined in a similar way to the libc and compiler builtin
+ * ffs, but returns the position of the most significant set bit.
+ *
+ * fls(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 32.
+ */
+static inline int fls(int x)
+{
+ int r;
+#ifdef CONFIG_X86_CMOV
+ asm("bsrl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=&r" (r) : "rm" (x), "rm" (-1));
+#else
+ asm("bsrl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r + 1;
+}
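+
+/*
+ * Worked examples: fls(0) == 0, fls(1) == 1, fls(0x80000000) == 32;
+ * positions are 1-based, counting from the least significant bit.
+ */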
+#endif /* __KERNEL__ */
+
+#undef ADDR
+
+#ifdef __KERNEL__
+
+#include <asm-generic/bitops/sched.h>
+
+#define ARCH_HAS_FAST_MULTIPLIER 1
+
+#include <asm-generic/bitops/hweight.h>
+
+#endif /* __KERNEL__ */
+
+#include <asm-generic/bitops/fls64.h>
+
+#ifdef __KERNEL__
+
+#include <asm-generic/bitops/ext2-non-atomic.h>
+
+#define ext2_set_bit_atomic(lock, nr, addr) \
+ test_and_set_bit((nr), (unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_bit((nr), (unsigned long *)(addr))
+
+#include <asm-generic/bitops/minix.h>
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
new file mode 100644
index 00000000000..dd61616cb73
--- /dev/null
+++ b/arch/x86/include/asm/boot.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_BOOT_H
+#define _ASM_X86_BOOT_H
+
+/* Don't touch these, unless you really know what you're doing. */
+#define DEF_SYSSEG 0x1000
+#define DEF_SYSSIZE 0x7F00
+
+/* Internal svga startup constants */
+#define NORMAL_VGA 0xffff /* 80x25 mode */
+#define EXTENDED_VGA 0xfffe /* 80x50 mode */
+#define ASK_VGA 0xfffd /* ask for it at bootup */
+
+/* Physical address where kernel should be loaded. */
+#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+ + (CONFIG_PHYSICAL_ALIGN - 1)) \
+ & ~(CONFIG_PHYSICAL_ALIGN - 1))
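+
+/*
+ * Worked example (illustrative config values): with
+ * CONFIG_PHYSICAL_START 0x100000 and CONFIG_PHYSICAL_ALIGN 0x200000,
+ * (0x100000 + 0x1fffff) & ~0x1fffff == 0x200000, i.e. the load address
+ * is rounded up to the next alignment boundary.
+ */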
+
+#ifdef CONFIG_X86_64
+#define BOOT_HEAP_SIZE 0x7000
+#define BOOT_STACK_SIZE 0x4000
+#else
+#define BOOT_HEAP_SIZE 0x4000
+#define BOOT_STACK_SIZE 0x1000
+#endif
+
+#endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
new file mode 100644
index 00000000000..433adaebf9b
--- /dev/null
+++ b/arch/x86/include/asm/bootparam.h
@@ -0,0 +1,111 @@
+#ifndef _ASM_X86_BOOTPARAM_H
+#define _ASM_X86_BOOTPARAM_H
+
+#include <linux/types.h>
+#include <linux/screen_info.h>
+#include <linux/apm_bios.h>
+#include <linux/edd.h>
+#include <asm/e820.h>
+#include <asm/ist.h>
+#include <video/edid.h>
+
+/* setup data types */
+#define SETUP_NONE 0
+#define SETUP_E820_EXT 1
+
+/* extensible setup data list node */
+struct setup_data {
+ __u64 next;
+ __u32 type;
+ __u32 len;
+ __u8 data[0];
+};
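+
+/*
+ * Traversal sketch (illustrative): the list is chained through
+ * physical addresses in ->next, so each node has to be mapped before
+ * it is read.  early_ioremap()/early_iounmap() are one plausible
+ * mapping helper pair; boot_params is assumed to be in scope.
+ */
+#if 0
+	u64 pa = boot_params.hdr.setup_data;
+
+	while (pa) {
+		struct setup_data *sd = early_ioremap(pa, sizeof(*sd));
+		/* ... inspect sd->type, sd->len, sd->data ... */
+		pa = sd->next;
+		early_iounmap(sd, sizeof(*sd));
+	}
+#endif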
+
+struct setup_header {
+ __u8 setup_sects;
+ __u16 root_flags;
+ __u32 syssize;
+ __u16 ram_size;
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+ __u16 vid_mode;
+ __u16 root_dev;
+ __u16 boot_flag;
+ __u16 jump;
+ __u32 header;
+ __u16 version;
+ __u32 realmode_swtch;
+ __u16 start_sys;
+ __u16 kernel_version;
+ __u8 type_of_loader;
+ __u8 loadflags;
+#define LOADED_HIGH (1<<0)
+#define QUIET_FLAG (1<<5)
+#define KEEP_SEGMENTS (1<<6)
+#define CAN_USE_HEAP (1<<7)
+ __u16 setup_move_size;
+ __u32 code32_start;
+ __u32 ramdisk_image;
+ __u32 ramdisk_size;
+ __u32 bootsect_kludge;
+ __u16 heap_end_ptr;
+ __u16 _pad1;
+ __u32 cmd_line_ptr;
+ __u32 initrd_addr_max;
+ __u32 kernel_alignment;
+ __u8 relocatable_kernel;
+ __u8 _pad2[3];
+ __u32 cmdline_size;
+ __u32 hardware_subarch;
+ __u64 hardware_subarch_data;
+ __u32 payload_offset;
+ __u32 payload_length;
+ __u64 setup_data;
+} __attribute__((packed));
+
+struct sys_desc_table {
+ __u16 length;
+ __u8 table[14];
+};
+
+struct efi_info {
+ __u32 efi_loader_signature;
+ __u32 efi_systab;
+ __u32 efi_memdesc_size;
+ __u32 efi_memdesc_version;
+ __u32 efi_memmap;
+ __u32 efi_memmap_size;
+ __u32 efi_systab_hi;
+ __u32 efi_memmap_hi;
+};
+
+/* The so-called "zeropage" */
+struct boot_params {
+ struct screen_info screen_info; /* 0x000 */
+ struct apm_bios_info apm_bios_info; /* 0x040 */
+ __u8 _pad2[12]; /* 0x054 */
+ struct ist_info ist_info; /* 0x060 */
+ __u8 _pad3[16]; /* 0x070 */
+ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
+ __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
+ struct sys_desc_table sys_desc_table; /* 0x0a0 */
+ __u8 _pad4[144]; /* 0x0b0 */
+ struct edid_info edid_info; /* 0x140 */
+ struct efi_info efi_info; /* 0x1c0 */
+ __u32 alt_mem_k; /* 0x1e0 */
+ __u32 scratch; /* Scratch field! */ /* 0x1e4 */
+ __u8 e820_entries; /* 0x1e8 */
+ __u8 eddbuf_entries; /* 0x1e9 */
+ __u8 edd_mbr_sig_buf_entries; /* 0x1ea */
+ __u8 _pad6[6]; /* 0x1eb */
+ struct setup_header hdr; /* setup header */ /* 0x1f1 */
+ __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)];
+ __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */
+ struct e820entry e820_map[E820MAX]; /* 0x2d0 */
+ __u8 _pad8[48]; /* 0xcd0 */
+ struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */
+ __u8 _pad9[276]; /* 0xeec */
+} __attribute__((packed));
+
+#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
new file mode 100644
index 00000000000..3def2065fce
--- /dev/null
+++ b/arch/x86/include/asm/bug.h
@@ -0,0 +1,39 @@
+#ifndef _ASM_X86_BUG_H
+#define _ASM_X86_BUG_H
+
+#ifdef CONFIG_BUG
+#define HAVE_ARCH_BUG
+
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+
+#ifdef CONFIG_X86_32
+# define __BUG_C0 "2:\t.long 1b, %c0\n"
+#else
+# define __BUG_C0 "2:\t.quad 1b, %c0\n"
+#endif
+
+#define BUG() \
+do { \
+ asm volatile("1:\tud2\n" \
+ ".pushsection __bug_table,\"a\"\n" \
+ __BUG_C0 \
+ "\t.word %c1, 0\n" \
+ "\t.org 2b+%c2\n" \
+ ".popsection" \
+ : : "i" (__FILE__), "i" (__LINE__), \
+ "i" (sizeof(struct bug_entry))); \
+ for (;;) ; \
+} while (0)
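+
+/*
+ * Layout sketch: each BUG() site emits one struct bug_entry into the
+ * __bug_table section: the address of the ud2 (label 1b), the file
+ * name (%c0 == __FILE__) and the line number (%c1 == __LINE__), which
+ * the trap handler uses to print the verbose report.
+ */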
+
+#else
+#define BUG() \
+do { \
+ asm volatile("ud2"); \
+ for (;;) ; \
+} while (0)
+#endif
+
+#endif /* CONFIG_BUG */
+
+#include <asm-generic/bug.h>
+#endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h
new file mode 100644
index 00000000000..08abf639075
--- /dev/null
+++ b/arch/x86/include/asm/bugs.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_BUGS_H
+#define _ASM_X86_BUGS_H
+
+extern void check_bugs(void);
+
+#if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32)
+int ppro_with_ram_bug(void);
+#else
+static inline int ppro_with_ram_bug(void) { return 0; }
+#endif
+
+#endif /* _ASM_X86_BUGS_H */
diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h
new file mode 100644
index 00000000000..e02ae2d89ac
--- /dev/null
+++ b/arch/x86/include/asm/byteorder.h
@@ -0,0 +1,81 @@
+#ifndef _ASM_X86_BYTEORDER_H
+#define _ASM_X86_BYTEORDER_H
+
+#include <asm/types.h>
+#include <linux/compiler.h>
+
+#ifdef __GNUC__
+
+#ifdef __i386__
+
+static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
+{
+#ifdef CONFIG_X86_BSWAP
+ asm("bswap %0" : "=r" (x) : "0" (x));
+#else
+ asm("xchgb %b0,%h0\n\t" /* swap lower bytes */
+ "rorl $16,%0\n\t" /* swap words */
+ "xchgb %b0,%h0" /* swap higher bytes */
+ : "=q" (x)
+ : "0" (x));
+#endif
+ return x;
+}
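+
+/*
+ * Worked example: ___arch__swab32(0x12345678) == 0x78563412, i.e. the
+ * four bytes are mirrored end for end.
+ */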
+
+static inline __attribute_const__ __u64 ___arch__swab64(__u64 val)
+{
+ union {
+ struct {
+ __u32 a;
+ __u32 b;
+ } s;
+ __u64 u;
+ } v;
+ v.u = val;
+#ifdef CONFIG_X86_BSWAP
+ asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
+ : "=r" (v.s.a), "=r" (v.s.b)
+ : "0" (v.s.a), "1" (v.s.b));
+#else
+ v.s.a = ___arch__swab32(v.s.a);
+ v.s.b = ___arch__swab32(v.s.b);
+ asm("xchgl %0,%1"
+ : "=r" (v.s.a), "=r" (v.s.b)
+ : "0" (v.s.a), "1" (v.s.b));
+#endif
+ return v.u;
+}
+
+#else /* __i386__ */
+
+static inline __attribute_const__ __u64 ___arch__swab64(__u64 x)
+{
+ asm("bswapq %0"
+ : "=r" (x)
+ : "0" (x));
+ return x;
+}
+
+static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
+{
+ asm("bswapl %0"
+ : "=r" (x)
+ : "0" (x));
+ return x;
+}
+
+#endif
+
+/* Do not define swab16. Gcc is smart enough to recognize "C" version and
+ convert it into rotation or exhange. */
+
+#define __arch__swab64(x) ___arch__swab64(x)
+#define __arch__swab32(x) ___arch__swab32(x)
+
+#define __BYTEORDER_HAS_U64__
+
+#endif /* __GNUC__ */
+
+#include <linux/byteorder/little_endian.h>
+
+#endif /* _ASM_X86_BYTEORDER_H */
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
new file mode 100644
index 00000000000..5d367caa0e3
--- /dev/null
+++ b/arch/x86/include/asm/cache.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_CACHE_H
+#define _ASM_X86_CACHE_H
+
+/* L1 cache line size */
+#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT)
+#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
+
+#define __read_mostly __attribute__((__section__(".data.read_mostly")))
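+
+/*
+ * Usage sketch (illustrative): annotate rarely-written, frequently-read
+ * data so writers elsewhere do not bounce its cache line:
+ *
+ *	static int my_tunable __read_mostly = 1;
+ */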
+
+#ifdef CONFIG_X86_VSMP
+/* vSMP Internode cacheline shift */
+#define INTERNODE_CACHE_SHIFT (12)
+#ifdef CONFIG_SMP
+#define __cacheline_aligned_in_smp \
+ __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
+ __attribute__((__section__(".data.page_aligned")))
+#endif
+#endif
+
+#endif /* _ASM_X86_CACHE_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
new file mode 100644
index 00000000000..2f8466540fb
--- /dev/null
+++ b/arch/x86/include/asm/cacheflush.h
@@ -0,0 +1,118 @@
+#ifndef _ASM_X86_CACHEFLUSH_H
+#define _ASM_X86_CACHEFLUSH_H
+
+/* Keep includes the same across arches. */
+#include <linux/mm.h>
+
+/* Caches aren't brain-dead on Intel. */
+#define flush_cache_all() do { } while (0)
+#define flush_cache_mm(mm) do { } while (0)
+#define flush_cache_dup_mm(mm) do { } while (0)
+#define flush_cache_range(vma, start, end) do { } while (0)
+#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
+#define flush_dcache_page(page) do { } while (0)
+#define flush_dcache_mmap_lock(mapping) do { } while (0)
+#define flush_dcache_mmap_unlock(mapping) do { } while (0)
+#define flush_icache_range(start, end) do { } while (0)
+#define flush_icache_page(vma, pg) do { } while (0)
+#define flush_icache_user_range(vma, pg, adr, len) do { } while (0)
+#define flush_cache_vmap(start, end) do { } while (0)
+#define flush_cache_vunmap(start, end) do { } while (0)
+
+#define copy_to_user_page(vma, page, vaddr, dst, src, len) \
+ memcpy((dst), (src), (len))
+#define copy_from_user_page(vma, page, vaddr, dst, src, len) \
+ memcpy((dst), (src), (len))
+
+#define PG_non_WB PG_arch_1
+PAGEFLAG(NonWB, non_WB)
+
+/*
+ * The set_memory_* API can be used to change various attributes of a virtual
+ * address range. The attributes include:
+ * Cacheability : UnCached, WriteCombining, WriteBack
+ * Executability : eXecutable, NoteXecutable
+ * Read/Write : ReadOnly, ReadWrite
+ * Presence : NotPresent
+ *
+ * Within a category, the attributes are mutually exclusive.
+ *
+ * The implementation of this API will take care of various aspects that
+ * are associated with changing such attributes, such as:
+ * - Flushing TLBs
+ * - Flushing CPU caches
+ * - Making sure aliases of the memory behind the mapping don't violate
+ * coherency rules as defined by the CPU in the system.
+ *
+ * What this API does not do:
+ * - Provide exclusion between various callers - including callers that
+ * operate on other mappings of the same physical page
+ * - Restore default attributes when a page is freed
+ * - Guarantee that other mappings of the requested memory are in any
+ * particular state, beyond not violating the coherency rules of the
+ * CPU in your system. Do not depend on any effects on other mappings;
+ * other CPUs may have more relaxed rules.
+ * The caller is required to take care of these.
+ */
+
+int _set_memory_uc(unsigned long addr, int numpages);
+int _set_memory_wc(unsigned long addr, int numpages);
+int _set_memory_wb(unsigned long addr, int numpages);
+int set_memory_uc(unsigned long addr, int numpages);
+int set_memory_wc(unsigned long addr, int numpages);
+int set_memory_wb(unsigned long addr, int numpages);
+int set_memory_x(unsigned long addr, int numpages);
+int set_memory_nx(unsigned long addr, int numpages);
+int set_memory_ro(unsigned long addr, int numpages);
+int set_memory_rw(unsigned long addr, int numpages);
+int set_memory_np(unsigned long addr, int numpages);
+int set_memory_4k(unsigned long addr, int numpages);
+
+int set_memory_array_uc(unsigned long *addr, int addrinarray);
+int set_memory_array_wb(unsigned long *addr, int addrinarray);
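+
+/*
+ * Usage sketch (illustrative): make a kernel buffer uncached while a
+ * device owns it, then restore write-back.  "buf" and "npages" are
+ * hypothetical.
+ */
+#if 0
+	set_memory_uc((unsigned long)buf, npages);
+	/* ... hand the buffer to the device ... */
+	set_memory_wb((unsigned long)buf, npages);
+#endif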
+
+/*
+ * For legacy compatibility with the old APIs, a few functions
+ * are provided that work on a "struct page".
+ * These functions operate ONLY on the 1:1 kernel mapping of the
+ * memory that the struct page represents, and internally just
+ * call the set_memory_* function. See the description of the
+ * set_memory_* function for more details on conventions.
+ *
+ * These APIs should be considered *deprecated* and are likely going to
+ * be removed in the future.
+ * The reason for this is the implicit operation on the 1:1 mapping only,
+ * making this not a generally useful API.
+ *
+ * Specifically, many users of the old APIs had a virtual address,
+ * called virt_to_page() or vmalloc_to_page() on that address to
+ * get a struct page* that the old API required.
+ * To convert these cases, use set_memory_*() on the original
+ * virtual address; do not use these functions.
+ */
+
+int set_pages_uc(struct page *page, int numpages);
+int set_pages_wb(struct page *page, int numpages);
+int set_pages_x(struct page *page, int numpages);
+int set_pages_nx(struct page *page, int numpages);
+int set_pages_ro(struct page *page, int numpages);
+int set_pages_rw(struct page *page, int numpages);
+
+
+void clflush_cache_range(void *addr, unsigned int size);
+
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+extern const int rodata_test_data;
+#endif
+
+#ifdef CONFIG_DEBUG_RODATA_TEST
+int rodata_test(void);
+#else
+static inline int rodata_test(void)
+{
+ return 0;
+}
+#endif
+
+#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
new file mode 100644
index 00000000000..b03bedb62aa
--- /dev/null
+++ b/arch/x86/include/asm/calgary.h
@@ -0,0 +1,72 @@
+/*
+ * Derived from include/asm-powerpc/iommu.h
+ *
+ * Copyright IBM Corporation, 2006-2007
+ *
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_CALGARY_H
+#define _ASM_X86_CALGARY_H
+
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/timer.h>
+#include <asm/types.h>
+
+struct iommu_table {
+ struct cal_chipset_ops *chip_ops; /* chipset specific funcs */
+ unsigned long it_base; /* mapped address of tce table */
+ unsigned long it_hint; /* Hint for next alloc */
+ unsigned long *it_map; /* A simple allocation bitmap for now */
+ void __iomem *bbar; /* Bridge BAR */
+ u64 tar_val; /* Table Address Register */
+ struct timer_list watchdog_timer;
+ spinlock_t it_lock; /* Protects it_map */
+ unsigned int it_size; /* Size of iommu table in entries */
+ unsigned char it_busno; /* Bus number this table belongs to */
+};
+
+struct cal_chipset_ops {
+ void (*handle_quirks)(struct iommu_table *tbl, struct pci_dev *dev);
+ void (*tce_cache_blast)(struct iommu_table *tbl);
+ void (*dump_error_regs)(struct iommu_table *tbl);
+};
+
+#define TCE_TABLE_SIZE_UNSPECIFIED ~0
+#define TCE_TABLE_SIZE_64K 0
+#define TCE_TABLE_SIZE_128K 1
+#define TCE_TABLE_SIZE_256K 2
+#define TCE_TABLE_SIZE_512K 3
+#define TCE_TABLE_SIZE_1M 4
+#define TCE_TABLE_SIZE_2M 5
+#define TCE_TABLE_SIZE_4M 6
+#define TCE_TABLE_SIZE_8M 7
+
+extern int use_calgary;
+
+#ifdef CONFIG_CALGARY_IOMMU
+extern int calgary_iommu_init(void);
+extern void detect_calgary(void);
+#else
+static inline int calgary_iommu_init(void) { return 1; }
+static inline void detect_calgary(void) { return; }
+#endif
+
+#endif /* _ASM_X86_CALGARY_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
new file mode 100644
index 00000000000..2bc162e0ec6
--- /dev/null
+++ b/arch/x86/include/asm/calling.h
@@ -0,0 +1,170 @@
+/*
+ * Some macros to handle stack frames in assembly.
+ */
+
+#define R15 0
+#define R14 8
+#define R13 16
+#define R12 24
+#define RBP 32
+#define RBX 40
+
+/* arguments: interrupts/non-tracing syscalls only save up to here */
+#define R11 48
+#define R10 56
+#define R9 64
+#define R8 72
+#define RAX 80
+#define RCX 88
+#define RDX 96
+#define RSI 104
+#define RDI 112
+#define ORIG_RAX 120 /* + error_code */
+/* end of arguments */
+
+/* cpu exception frame or undefined in case of fast syscall. */
+#define RIP 128
+#define CS 136
+#define EFLAGS 144
+#define RSP 152
+#define SS 160
+
+#define ARGOFFSET R11
+#define SWFRAME ORIG_RAX
+
+ .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
+ subq $9*8+\addskip, %rsp
+ CFI_ADJUST_CFA_OFFSET 9*8+\addskip
+ movq %rdi, 8*8(%rsp)
+ CFI_REL_OFFSET rdi, 8*8
+ movq %rsi, 7*8(%rsp)
+ CFI_REL_OFFSET rsi, 7*8
+ movq %rdx, 6*8(%rsp)
+ CFI_REL_OFFSET rdx, 6*8
+ .if \norcx
+ .else
+ movq %rcx, 5*8(%rsp)
+ CFI_REL_OFFSET rcx, 5*8
+ .endif
+ movq %rax, 4*8(%rsp)
+ CFI_REL_OFFSET rax, 4*8
+ .if \nor891011
+ .else
+ movq %r8, 3*8(%rsp)
+ CFI_REL_OFFSET r8, 3*8
+ movq %r9, 2*8(%rsp)
+ CFI_REL_OFFSET r9, 2*8
+ movq %r10, 1*8(%rsp)
+ CFI_REL_OFFSET r10, 1*8
+ movq %r11, (%rsp)
+ CFI_REL_OFFSET r11, 0*8
+ .endif
+ .endm
+
+#define ARG_SKIP 9*8
+
+ .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
+ skipr8910=0, skiprdx=0
+ .if \skipr11
+ .else
+ movq (%rsp), %r11
+ CFI_RESTORE r11
+ .endif
+ .if \skipr8910
+ .else
+ movq 1*8(%rsp), %r10
+ CFI_RESTORE r10
+ movq 2*8(%rsp), %r9
+ CFI_RESTORE r9
+ movq 3*8(%rsp), %r8
+ CFI_RESTORE r8
+ .endif
+ .if \skiprax
+ .else
+ movq 4*8(%rsp), %rax
+ CFI_RESTORE rax
+ .endif
+ .if \skiprcx
+ .else
+ movq 5*8(%rsp), %rcx
+ CFI_RESTORE rcx
+ .endif
+ .if \skiprdx
+ .else
+ movq 6*8(%rsp), %rdx
+ CFI_RESTORE rdx
+ .endif
+ movq 7*8(%rsp), %rsi
+ CFI_RESTORE rsi
+ movq 8*8(%rsp), %rdi
+ CFI_RESTORE rdi
+ .if ARG_SKIP+\addskip > 0
+ addq $ARG_SKIP+\addskip, %rsp
+ CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
+ .endif
+ .endm
+
+ .macro LOAD_ARGS offset, skiprax=0
+ movq \offset(%rsp), %r11
+ movq \offset+8(%rsp), %r10
+ movq \offset+16(%rsp), %r9
+ movq \offset+24(%rsp), %r8
+ movq \offset+40(%rsp), %rcx
+ movq \offset+48(%rsp), %rdx
+ movq \offset+56(%rsp), %rsi
+ movq \offset+64(%rsp), %rdi
+ .if \skiprax
+ .else
+ movq \offset+72(%rsp), %rax
+ .endif
+ .endm
+
+#define REST_SKIP 6*8
+
+ .macro SAVE_REST
+ subq $REST_SKIP, %rsp
+ CFI_ADJUST_CFA_OFFSET REST_SKIP
+ movq %rbx, 5*8(%rsp)
+ CFI_REL_OFFSET rbx, 5*8
+ movq %rbp, 4*8(%rsp)
+ CFI_REL_OFFSET rbp, 4*8
+ movq %r12, 3*8(%rsp)
+ CFI_REL_OFFSET r12, 3*8
+ movq %r13, 2*8(%rsp)
+ CFI_REL_OFFSET r13, 2*8
+ movq %r14, 1*8(%rsp)
+ CFI_REL_OFFSET r14, 1*8
+ movq %r15, (%rsp)
+ CFI_REL_OFFSET r15, 0*8
+ .endm
+
+ .macro RESTORE_REST
+ movq (%rsp), %r15
+ CFI_RESTORE r15
+ movq 1*8(%rsp), %r14
+ CFI_RESTORE r14
+ movq 2*8(%rsp), %r13
+ CFI_RESTORE r13
+ movq 3*8(%rsp), %r12
+ CFI_RESTORE r12
+ movq 4*8(%rsp), %rbp
+ CFI_RESTORE rbp
+ movq 5*8(%rsp), %rbx
+ CFI_RESTORE rbx
+ addq $REST_SKIP, %rsp
+ CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
+ .endm
+
+ .macro SAVE_ALL
+ SAVE_ARGS
+ SAVE_REST
+ .endm
+
+ .macro RESTORE_ALL addskip=0
+ RESTORE_REST
+ RESTORE_ARGS 0, \addskip
+ .endm
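+
+/*
+ * Usage sketch (illustrative, not a real entry point): save the
+ * argument registers on entry, call into C, restore before returning.
+ * my_stub and do_work are made-up names.
+ */
+#if 0
+my_stub:
+	SAVE_ARGS
+	call do_work
+	RESTORE_ARGS
+	ret
+#endif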
+
+ .macro icebp
+ .byte 0xf1
+ .endm
diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h
new file mode 100644
index 00000000000..848850fd7d6
--- /dev/null
+++ b/arch/x86/include/asm/checksum.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "checksum_32.h"
+#else
+# include "checksum_64.h"
+#endif
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
new file mode 100644
index 00000000000..7c5ef8b14d9
--- /dev/null
+++ b/arch/x86/include/asm/checksum_32.h
@@ -0,0 +1,189 @@
+#ifndef _ASM_X86_CHECKSUM_32_H
+#define _ASM_X86_CHECKSUM_32_H
+
+#include <linux/in6.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+asmlinkage __wsum csum_partial(const void *buff, int len, __wsum sum);
+
+/*
+ * the same as csum_partial, but copies from src while it
+ * checksums, and handles user-space pointer exceptions correctly, when needed.
+ *
+ * here it is even more important to align src and dst on a 32-bit (or
+ * even better, a 64-bit) boundary
+ */
+
+asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst,
+ int len, __wsum sum,
+ int *src_err_ptr, int *dst_err_ptr);
+
+/*
+ * Note: when you get a NULL pointer exception here, this means someone
+ * passed in an incorrect kernel address to one of these functions.
+ *
+ * If you use these functions directly please don't forget the
+ * access_ok().
+ */
+static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst,
+ int len, __wsum sum)
+{
+ return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
+}
+
+static inline __wsum csum_partial_copy_from_user(const void __user *src,
+ void *dst,
+ int len, __wsum sum,
+ int *err_ptr)
+{
+ might_sleep();
+ return csum_partial_copy_generic((__force void *)src, dst,
+ len, sum, err_ptr, NULL);
+}
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksum on 4 octet boundaries.
+ *
+ * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
+ * Arnt Gulbrandsen.
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+ unsigned int sum;
+
+ asm volatile("movl (%1), %0 ;\n"
+ "subl $4, %2 ;\n"
+ "jbe 2f ;\n"
+ "addl 4(%1), %0 ;\n"
+ "adcl 8(%1), %0 ;\n"
+ "adcl 12(%1), %0;\n"
+ "1: adcl 16(%1), %0 ;\n"
+ "lea 4(%1), %1 ;\n"
+ "decl %2 ;\n"
+ "jne 1b ;\n"
+ "adcl $0, %0 ;\n"
+ "movl %0, %2 ;\n"
+ "shrl $16, %0 ;\n"
+ "addw %w2, %w0 ;\n"
+ "adcl $0, %0 ;\n"
+ "notl %0 ;\n"
+ "2: ;\n"
+ /* Since the input registers which are loaded with iph and ihl
+ are modified, we must also specify them as outputs, or gcc
+ will assume they contain their original values. */
+ : "=r" (sum), "=r" (iph), "=r" (ihl)
+ : "1" (iph), "2" (ihl)
+ : "memory");
+ return (__force __sum16)sum;
+}
+
+/*
+ * Fold a partial checksum
+ */
+
+static inline __sum16 csum_fold(__wsum sum)
+{
+ asm("addl %1, %0 ;\n"
+ "adcl $0xffff, %0 ;\n"
+ : "=r" (sum)
+ : "r" ((__force u32)sum << 16),
+ "0" ((__force u32)sum & 0xffff0000));
+ return (__force __sum16)(~(__force u32)sum >> 16);
+}
+
+static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
+ unsigned short len,
+ unsigned short proto,
+ __wsum sum)
+{
+ asm("addl %1, %0 ;\n"
+ "adcl %2, %0 ;\n"
+ "adcl %3, %0 ;\n"
+ "adcl $0, %0 ;\n"
+ : "=r" (sum)
+ : "g" (daddr), "g"(saddr),
+ "g" ((len + proto) << 8), "0" (sum));
+ return sum;
+}
+
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented
+ */
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ unsigned short len,
+ unsigned short proto,
+ __wsum sum)
+{
+ return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
+}
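+
+/*
+ * Usage sketch (illustrative): fill in a UDP checksum by feeding the
+ * payload sum into the pseudo header.  udph, saddr, daddr and len are
+ * hypothetical locals.
+ */
+#if 0
+	__wsum csum = csum_partial(udph, len, 0);
+
+	udph->check = csum_tcpudp_magic(saddr, daddr, len,
+					IPPROTO_UDP, csum);
+#endif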
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+
+static inline __sum16 ip_compute_csum(const void *buff, int len)
+{
+ return csum_fold(csum_partial(buff, len, 0));
+}
+
+#define _HAVE_ARCH_IPV6_CSUM
+static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, unsigned short proto,
+ __wsum sum)
+{
+ asm("addl 0(%1), %0 ;\n"
+ "adcl 4(%1), %0 ;\n"
+ "adcl 8(%1), %0 ;\n"
+ "adcl 12(%1), %0 ;\n"
+ "adcl 0(%2), %0 ;\n"
+ "adcl 4(%2), %0 ;\n"
+ "adcl 8(%2), %0 ;\n"
+ "adcl 12(%2), %0 ;\n"
+ "adcl %3, %0 ;\n"
+ "adcl %4, %0 ;\n"
+ "adcl $0, %0 ;\n"
+ : "=&r" (sum)
+ : "r" (saddr), "r" (daddr),
+ "r" (htonl(len)), "r" (htonl(proto)), "0" (sum));
+
+ return csum_fold(sum);
+}
+
+/*
+ * Copy and checksum to user
+ */
+#define HAVE_CSUM_COPY_USER
+static inline __wsum csum_and_copy_to_user(const void *src,
+ void __user *dst,
+ int len, __wsum sum,
+ int *err_ptr)
+{
+ might_sleep();
+ if (access_ok(VERIFY_WRITE, dst, len))
+ return csum_partial_copy_generic(src, (__force void *)dst,
+ len, sum, NULL, err_ptr);
+
+ if (len)
+ *err_ptr = -EFAULT;
+
+ return (__force __wsum)-1; /* invalid checksum */
+}
+
+#endif /* _ASM_X86_CHECKSUM_32_H */
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
new file mode 100644
index 00000000000..9bfdc41629e
--- /dev/null
+++ b/arch/x86/include/asm/checksum_64.h
@@ -0,0 +1,191 @@
+#ifndef _ASM_X86_CHECKSUM_64_H
+#define _ASM_X86_CHECKSUM_64_H
+
+/*
+ * Checksums for x86-64
+ * Copyright 2002 by Andi Kleen, SuSE Labs
+ * with some code from asm-x86/checksum.h
+ */
+
+#include <linux/compiler.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+
+/**
+ * csum_fold - Fold and invert a 32bit checksum.
+ * sum: 32bit unfolded sum
+ *
+ * Fold a 32bit running checksum to 16bit and invert it. This is usually
+ * the last step before putting a checksum into a packet.
+ * Make sure not to mix with 64bit checksums.
+ */
+static inline __sum16 csum_fold(__wsum sum)
+{
+ asm(" addl %1,%0\n"
+ " adcl $0xffff,%0"
+ : "=r" (sum)
+ : "r" ((__force u32)sum << 16),
+ "0" ((__force u32)sum & 0xffff0000));
+ return (__force __sum16)(~(__force u32)sum >> 16);
+}
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksum on 4 octet boundaries.
+ *
+ * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
+ * Arnt Gulbrandsen.
+ */
+
+/**
+ * ip_fast_csum - Compute the IPv4 header checksum efficiently.
+ * @iph: ipv4 header
+ * @ihl: length of header / 4
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+ unsigned int sum;
+
+ asm(" movl (%1), %0\n"
+ " subl $4, %2\n"
+ " jbe 2f\n"
+ " addl 4(%1), %0\n"
+ " adcl 8(%1), %0\n"
+ " adcl 12(%1), %0\n"
+ "1: adcl 16(%1), %0\n"
+ " lea 4(%1), %1\n"
+ " decl %2\n"
+ " jne 1b\n"
+ " adcl $0, %0\n"
+ " movl %0, %2\n"
+ " shrl $16, %0\n"
+ " addw %w2, %w0\n"
+ " adcl $0, %0\n"
+ " notl %0\n"
+ "2:"
+ /* Since the input registers which are loaded with iph and ihl
+ are modified, we must also specify them as outputs, or gcc
+ will assume they contain their original values. */
+ : "=r" (sum), "=r" (iph), "=r" (ihl)
+ : "1" (iph), "2" (ihl)
+ : "memory");
+ return (__force __sum16)sum;
+}
+
+/**
+ * csum_tcpudp_nofold - Compute an IPv4 pseudo header checksum.
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ * @proto: ip protocol of packet
+ * @sum: initial sum to be added in (32bit unfolded)
+ *
+ * Returns the pseudo header checksum of the input data. The result
+ * is 32bit unfolded.
+ */
+static inline __wsum
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
+ unsigned short proto, __wsum sum)
+{
+ asm(" addl %1, %0\n"
+ " adcl %2, %0\n"
+ " adcl %3, %0\n"
+ " adcl $0, %0\n"
+ : "=r" (sum)
+ : "g" (daddr), "g" (saddr),
+ "g" ((len + proto)<<8), "0" (sum));
+ return sum;
+}
+
+
+/**
+ * csum_tcpudp_magic - Compute an IPv4 pseudo header checksum.
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ * @proto: ip protocol of packet
+ * @sum: initial sum to be added in (32bit unfolded)
+ *
+ * Returns the 16bit pseudo header checksum of the input data, already
+ * complemented and ready to be filled in.
+ */
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ unsigned short len,
+ unsigned short proto, __wsum sum)
+{
+ return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
+}
+
+/**
+ * csum_partial - Compute an internet checksum.
+ * @buff: buffer to be checksummed
+ * @len: length of buffer.
+ * @sum: initial sum to be added in (32bit unfolded)
+ *
+ * Returns the 32bit unfolded internet checksum of the buffer.
+ * Before filling it in, it needs to be csum_fold()'ed.
+ * buff should be aligned to a 64bit boundary if possible.
+ */
+extern __wsum csum_partial(const void *buff, int len, __wsum sum);
+
+#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
+#define HAVE_CSUM_COPY_USER 1
+
+
+/* Do not call this directly. Use the wrappers below */
+extern __wsum csum_partial_copy_generic(const void *src, const void *dst,
+ int len, __wsum sum,
+ int *src_err_ptr, int *dst_err_ptr);
+
+
+extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
+ int len, __wsum isum, int *errp);
+extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst,
+ int len, __wsum isum, int *errp);
+extern __wsum csum_partial_copy_nocheck(const void *src, void *dst,
+ int len, __wsum sum);
+
+/* Old names. To be removed. */
+#define csum_and_copy_to_user csum_partial_copy_to_user
+#define csum_and_copy_from_user csum_partial_copy_from_user
+
+/**
+ * ip_compute_csum - Compute a 16bit IP checksum.
+ * @buff: buffer address.
+ * @len: length of buffer.
+ *
+ * Returns the 16bit folded/inverted checksum of the passed buffer.
+ * Ready to fill in.
+ */
+extern __sum16 ip_compute_csum(const void *buff, int len);
+
+/**
+ * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header.
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ * @proto: protocol of packet
+ * @sum: initial sum (32bit unfolded) to be added in
+ *
+ * Computes an IPv6 pseudo header checksum. This sum is added to the
+ * checksum of UDP/TCP packets and covers some link layer information.
+ * Returns the unfolded 32bit checksum.
+ */
+
+struct in6_addr;
+
+#define _HAVE_ARCH_IPV6_CSUM 1
+extern __sum16
+csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
+ __u32 len, unsigned short proto, __wsum sum);
+
+static inline unsigned add32_with_carry(unsigned a, unsigned b)
+{
+ asm("addl %2,%0\n\t"
+ "adcl $0,%0"
+ : "=r" (a)
+ : "0" (a), "r" (b));
+ return a;
+}
+
+#endif /* _ASM_X86_CHECKSUM_64_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
new file mode 100644
index 00000000000..a460fa088d4
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "cmpxchg_32.h"
+#else
+# include "cmpxchg_64.h"
+#endif
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
new file mode 100644
index 00000000000..82ceb788a98
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -0,0 +1,344 @@
+#ifndef _ASM_X86_CMPXCHG_32_H
+#define _ASM_X86_CMPXCHG_32_H
+
+#include <linux/bitops.h> /* for LOCK_PREFIX */
+
+/*
+ * Note: if you use set64_bit(), __cmpxchg64(), or their variants,
+ * you need to test for the feature in boot_cpu_data.
+ */
+
+#define xchg(ptr, v) \
+ ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr))))
+
+struct __xchg_dummy {
+ unsigned long a[100];
+};
+#define __xg(x) ((struct __xchg_dummy *)(x))
+
+/*
+ * The semantics of CMPXCHG8B are a bit strange; this is why
+ * there is a loop and the loading of %%eax and %%edx has to
+ * be inside. This inlines well in most cases, the cached
+ * cost is around ~38 cycles. (in the future we might want
+ * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
+ * might have an implicit FPU-save as a cost, so it's not
+ * clear which path to go.)
+ *
+ * cmpxchg8b must be used with the lock prefix here to allow
+ * the instruction to be executed atomically, see page 3-102
+ * of the instruction set reference 24319102.pdf. We need
+ * the reader side to see the coherent 64bit value.
+ */
+static inline void __set_64bit(unsigned long long *ptr,
+ unsigned int low, unsigned int high)
+{
+ asm volatile("\n1:\t"
+ "movl (%0), %%eax\n\t"
+ "movl 4(%0), %%edx\n\t"
+ LOCK_PREFIX "cmpxchg8b (%0)\n\t"
+ "jnz 1b"
+ : /* no outputs */
+ : "D"(ptr),
+ "b"(low),
+ "c"(high)
+ : "ax", "dx", "memory");
+}
+
+static inline void __set_64bit_constant(unsigned long long *ptr,
+ unsigned long long value)
+{
+ __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
+}
+
+#define ll_low(x) *(((unsigned int *)&(x)) + 0)
+#define ll_high(x) *(((unsigned int *)&(x)) + 1)
+
+static inline void __set_64bit_var(unsigned long long *ptr,
+ unsigned long long value)
+{
+ __set_64bit(ptr, ll_low(value), ll_high(value));
+}
+
+#define set_64bit(ptr, value) \
+ (__builtin_constant_p((value)) \
+ ? __set_64bit_constant((ptr), (value)) \
+ : __set_64bit_var((ptr), (value)))
+
+#define _set_64bit(ptr, value) \
+ (__builtin_constant_p(value) \
+ ? __set_64bit(ptr, (unsigned int)(value), \
+ (unsigned int)((value) >> 32)) \
+ : __set_64bit(ptr, ll_low((value)), ll_high((value))))
+
+/*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
+ * Note 2: xchg has a side effect, so the volatile attribute is necessary,
+ * but strictly speaking the constraints are imprecise: *ptr is also an
+ * output argument. --ANK
+ */
+static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
+ int size)
+{
+ switch (size) {
+ case 1:
+ asm volatile("xchgb %b0,%1"
+ : "=q" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ case 2:
+ asm volatile("xchgw %w0,%1"
+ : "=r" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ case 4:
+ asm volatile("xchgl %0,%1"
+ : "=r" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ }
+ return x;
+}
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+
+#ifdef CONFIG_X86_CMPXCHG
+#define __HAVE_ARCH_CMPXCHG 1
+#define cmpxchg(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))))
+#define sync_cmpxchg(ptr, o, n) \
+ ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))))
+#define cmpxchg_local(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))))
+#endif
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define cmpxchg64(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
+ (unsigned long long)(n)))
+#define cmpxchg64_local(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
+ (unsigned long long)(n)))
+#endif
+
+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
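+
+/*
+ * Usage sketch (illustrative): the canonical cmpxchg retry loop for a
+ * lock-free read-modify-write of a hypothetical counter.
+ */
+#if 0
+static void my_lockfree_inc(unsigned long *counter)
+{
+	unsigned long old;
+
+	do {
+		old = *counter;
+	} while (cmpxchg(counter, old, old + 1) != old);
+}
+#endif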
+
+/*
+ * Always use locked operations when touching memory shared with a
+ * hypervisor, since the system may be SMP even if the guest kernel
+ * isn't.
+ */
+static inline unsigned long __sync_cmpxchg(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile("lock; cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile("lock; cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile("lock; cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+static inline unsigned long __cmpxchg_local(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile("cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile("cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile("cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+static inline unsigned long long __cmpxchg64(volatile void *ptr,
+ unsigned long long old,
+ unsigned long long new)
+{
+ unsigned long long prev;
+ asm volatile(LOCK_PREFIX "cmpxchg8b %3"
+ : "=A"(prev)
+ : "b"((unsigned long)new),
+ "c"((unsigned long)(new >> 32)),
+ "m"(*__xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+}
+
+static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
+ unsigned long long old,
+ unsigned long long new)
+{
+ unsigned long long prev;
+ asm volatile("cmpxchg8b %3"
+ : "=A"(prev)
+ : "b"((unsigned long)new),
+ "c"((unsigned long)(new >> 32)),
+ "m"(*__xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+}
+
+#ifndef CONFIG_X86_CMPXCHG
+/*
+ * Building a kernel capable of running on an 80386. It may be necessary to
+ * simulate the cmpxchg on the 80386 CPU. For that purpose we define
+ * a function for each of the sizes we support.
+ */
+
+extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
+extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
+extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
+
+static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
+ unsigned long new, int size)
+{
+ switch (size) {
+ case 1:
+ return cmpxchg_386_u8(ptr, old, new);
+ case 2:
+ return cmpxchg_386_u16(ptr, old, new);
+ case 4:
+ return cmpxchg_386_u32(ptr, old, new);
+ }
+ return old;
+}
+
+#define cmpxchg(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ if (likely(boot_cpu_data.x86 > 3)) \
+ __ret = (__typeof__(*(ptr)))__cmpxchg((ptr), \
+ (unsigned long)(o), (unsigned long)(n), \
+ sizeof(*(ptr))); \
+ else \
+ __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \
+ (unsigned long)(o), (unsigned long)(n), \
+ sizeof(*(ptr))); \
+ __ret; \
+})
+#define cmpxchg_local(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ if (likely(boot_cpu_data.x86 > 3)) \
+ __ret = (__typeof__(*(ptr)))__cmpxchg_local((ptr), \
+ (unsigned long)(o), (unsigned long)(n), \
+ sizeof(*(ptr))); \
+ else \
+ __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \
+ (unsigned long)(o), (unsigned long)(n), \
+ sizeof(*(ptr))); \
+ __ret; \
+})
+#endif
+
+#ifndef CONFIG_X86_CMPXCHG64
+/*
+ * Building a kernel capable of running on an 80386 or 80486. It may be
+ * necessary to simulate the cmpxchg8b on the 80386 and 80486 CPU.
+ */
+
+extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
+
+#define cmpxchg64(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ if (likely(boot_cpu_data.x86 > 4)) \
+ __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \
+ (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ else \
+ __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
+ (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ __ret; \
+})
+#define cmpxchg64_local(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ if (likely(boot_cpu_data.x86 > 4)) \
+ __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \
+ (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ else \
+ __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
+ (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ __ret; \
+})
+
+#endif
+
+#endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
new file mode 100644
index 00000000000..52de72e0de8
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -0,0 +1,185 @@
+#ifndef _ASM_X86_CMPXCHG_64_H
+#define _ASM_X86_CMPXCHG_64_H
+
+#include <asm/alternative.h> /* Provides LOCK_PREFIX */
+
+#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
+ (ptr), sizeof(*(ptr))))
+
+#define __xg(x) ((volatile long *)(x))
+
+static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
+{
+ *ptr = val;
+}
+
+#define _set_64bit set_64bit
+
+/*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
+ * Note 2: xchg has a side effect, so the volatile attribute is necessary,
+ * but strictly speaking the constraints are imprecise: *ptr is also an
+ * output argument. --ANK
+ */
+static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
+ int size)
+{
+ switch (size) {
+ case 1:
+ asm volatile("xchgb %b0,%1"
+ : "=q" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ case 2:
+ asm volatile("xchgw %w0,%1"
+ : "=r" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ case 4:
+ asm volatile("xchgl %k0,%1"
+ : "=r" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ case 8:
+ asm volatile("xchgq %0,%1"
+ : "=r" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ }
+ return x;
+}
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+
+#define __HAVE_ARCH_CMPXCHG 1
+
+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 8:
+ asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+/*
+ * Always use locked operations when touching memory shared with a
+ * hypervisor, since the system may be SMP even if the guest kernel
+ * isn't.
+ */
+static inline unsigned long __sync_cmpxchg(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile("lock; cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile("lock; cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile("lock; cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+static inline unsigned long __cmpxchg_local(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile("cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile("cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile("cmpxchgl %k1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 8:
+ asm volatile("cmpxchgq %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+#define cmpxchg(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
+ (unsigned long)(n), sizeof(*(ptr))))
+#define cmpxchg64(ptr, o, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ cmpxchg((ptr), (o), (n)); \
+})
+#define cmpxchg_local(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))))
+#define sync_cmpxchg(ptr, o, n) \
+ ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))))
+#define cmpxchg64_local(ptr, o, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ cmpxchg_local((ptr), (o), (n)); \
+})
+
+#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
new file mode 100644
index 00000000000..9a9c7bdc923
--- /dev/null
+++ b/arch/x86/include/asm/compat.h
@@ -0,0 +1,218 @@
+#ifndef _ASM_X86_COMPAT_H
+#define _ASM_X86_COMPAT_H
+
+/*
+ * Architecture specific compatibility types
+ */
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <asm/user32.h>
+
+#define COMPAT_USER_HZ 100
+
+typedef u32 compat_size_t;
+typedef s32 compat_ssize_t;
+typedef s32 compat_time_t;
+typedef s32 compat_clock_t;
+typedef s32 compat_pid_t;
+typedef u16 __compat_uid_t;
+typedef u16 __compat_gid_t;
+typedef u32 __compat_uid32_t;
+typedef u32 __compat_gid32_t;
+typedef u16 compat_mode_t;
+typedef u32 compat_ino_t;
+typedef u16 compat_dev_t;
+typedef s32 compat_off_t;
+typedef s64 compat_loff_t;
+typedef u16 compat_nlink_t;
+typedef u16 compat_ipc_pid_t;
+typedef s32 compat_daddr_t;
+typedef u32 compat_caddr_t;
+typedef __kernel_fsid_t compat_fsid_t;
+typedef s32 compat_timer_t;
+typedef s32 compat_key_t;
+
+typedef s32 compat_int_t;
+typedef s32 compat_long_t;
+typedef s64 __attribute__((aligned(4))) compat_s64;
+typedef u32 compat_uint_t;
+typedef u32 compat_ulong_t;
+typedef u64 __attribute__((aligned(4))) compat_u64;
+
+struct compat_timespec {
+ compat_time_t tv_sec;
+ s32 tv_nsec;
+};
+
+struct compat_timeval {
+ compat_time_t tv_sec;
+ s32 tv_usec;
+};
+
+struct compat_stat {
+ compat_dev_t st_dev;
+ u16 __pad1;
+ compat_ino_t st_ino;
+ compat_mode_t st_mode;
+ compat_nlink_t st_nlink;
+ __compat_uid_t st_uid;
+ __compat_gid_t st_gid;
+ compat_dev_t st_rdev;
+ u16 __pad2;
+ u32 st_size;
+ u32 st_blksize;
+ u32 st_blocks;
+ u32 st_atime;
+ u32 st_atime_nsec;
+ u32 st_mtime;
+ u32 st_mtime_nsec;
+ u32 st_ctime;
+ u32 st_ctime_nsec;
+ u32 __unused4;
+ u32 __unused5;
+};
+
+struct compat_flock {
+ short l_type;
+ short l_whence;
+ compat_off_t l_start;
+ compat_off_t l_len;
+ compat_pid_t l_pid;
+};
+
+#define F_GETLK64 12 /* using 'struct flock64' */
+#define F_SETLK64 13
+#define F_SETLKW64 14
+
+/*
+ * IA32 uses 4 byte alignment for 64 bit quantities,
+ * so we need to pack this structure.
+ */
+struct compat_flock64 {
+ short l_type;
+ short l_whence;
+ compat_loff_t l_start;
+ compat_loff_t l_len;
+ compat_pid_t l_pid;
+} __attribute__((packed));
+
+struct compat_statfs {
+ int f_type;
+ int f_bsize;
+ int f_blocks;
+ int f_bfree;
+ int f_bavail;
+ int f_files;
+ int f_ffree;
+ compat_fsid_t f_fsid;
+ int f_namelen; /* SunOS ignores this field. */
+ int f_frsize;
+ int f_spare[5];
+};
+
+#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff
+#define COMPAT_RLIM_INFINITY 0xffffffff
+
+typedef u32 compat_old_sigset_t; /* at least 32 bits */
+
+#define _COMPAT_NSIG 64
+#define _COMPAT_NSIG_BPW 32
+
+typedef u32 compat_sigset_word;
+
+#define COMPAT_OFF_T_MAX 0x7fffffff
+#define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL
+
+struct compat_ipc64_perm {
+ compat_key_t key;
+ __compat_uid32_t uid;
+ __compat_gid32_t gid;
+ __compat_uid32_t cuid;
+ __compat_gid32_t cgid;
+ unsigned short mode;
+ unsigned short __pad1;
+ unsigned short seq;
+ unsigned short __pad2;
+ compat_ulong_t unused1;
+ compat_ulong_t unused2;
+};
+
+struct compat_semid64_ds {
+ struct compat_ipc64_perm sem_perm;
+ compat_time_t sem_otime;
+ compat_ulong_t __unused1;
+ compat_time_t sem_ctime;
+ compat_ulong_t __unused2;
+ compat_ulong_t sem_nsems;
+ compat_ulong_t __unused3;
+ compat_ulong_t __unused4;
+};
+
+struct compat_msqid64_ds {
+ struct compat_ipc64_perm msg_perm;
+ compat_time_t msg_stime;
+ compat_ulong_t __unused1;
+ compat_time_t msg_rtime;
+ compat_ulong_t __unused2;
+ compat_time_t msg_ctime;
+ compat_ulong_t __unused3;
+ compat_ulong_t msg_cbytes;
+ compat_ulong_t msg_qnum;
+ compat_ulong_t msg_qbytes;
+ compat_pid_t msg_lspid;
+ compat_pid_t msg_lrpid;
+ compat_ulong_t __unused4;
+ compat_ulong_t __unused5;
+};
+
+struct compat_shmid64_ds {
+ struct compat_ipc64_perm shm_perm;
+ compat_size_t shm_segsz;
+ compat_time_t shm_atime;
+ compat_ulong_t __unused1;
+ compat_time_t shm_dtime;
+ compat_ulong_t __unused2;
+ compat_time_t shm_ctime;
+ compat_ulong_t __unused3;
+ compat_pid_t shm_cpid;
+ compat_pid_t shm_lpid;
+ compat_ulong_t shm_nattch;
+ compat_ulong_t __unused4;
+ compat_ulong_t __unused5;
+};
+
+/*
+ * The type of struct elf_prstatus.pr_reg in compatible core dumps.
+ */
+typedef struct user_regs_struct32 compat_elf_gregset_t;
+
+/*
+ * A pointer passed in from user mode. This should not
+ * be used for syscall parameters, just declare them
+ * as pointers because the syscall entry code will have
+ * appropriately converted them already.
+ */
+typedef u32 compat_uptr_t;
+
+static inline void __user *compat_ptr(compat_uptr_t uptr)
+{
+ return (void __user *)(unsigned long)uptr;
+}
+
+static inline compat_uptr_t ptr_to_compat(void __user *uptr)
+{
+ return (u32)(unsigned long)uptr;
+}
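+
+/*
+ * Usage sketch (illustrative): a compat syscall receives a 32-bit user
+ * pointer as compat_uptr_t and widens it with compat_ptr() before any
+ * uaccess.  my_compat_syscall and up32 are hypothetical.
+ */
+#if 0
+static long my_compat_syscall(compat_uptr_t up32)
+{
+	void __user *p = compat_ptr(up32);
+
+	/* ... copy_from_user(buf, p, n) etc. ... */
+	return 0;
+}
+#endif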
+
+static inline void __user *compat_alloc_user_space(long len)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ return (void __user *)regs->sp - len;
+}
+
+static inline int is_compat_task(void)
+{
+ return current_thread_info()->status & TS_COMPAT;
+}
+
+#endif /* _ASM_X86_COMPAT_H */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
new file mode 100644
index 00000000000..bae482df603
--- /dev/null
+++ b/arch/x86/include/asm/cpu.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_CPU_H
+#define _ASM_X86_CPU_H
+
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/topology.h>
+#include <linux/nodemask.h>
+#include <linux/percpu.h>
+
+struct x86_cpu {
+ struct cpu cpu;
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern int arch_register_cpu(int num);
+extern void arch_unregister_cpu(int);
+#endif
+
+DECLARE_PER_CPU(int, cpu_state);
+#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
new file mode 100644
index 00000000000..f73e95d75b4
--- /dev/null
+++ b/arch/x86/include/asm/cpufeature.h
@@ -0,0 +1,271 @@
+/*
+ * Defines x86 CPU feature bits
+ */
+#ifndef _ASM_X86_CPUFEATURE_H
+#define _ASM_X86_CPUFEATURE_H
+
+#include <asm/required-features.h>
+
+#define NCAPINTS 9 /* N 32-bit words worth of info */
+
+/*
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name. If the string is "",
+ * this feature bit is not displayed in /proc/cpuinfo at all.
+ */
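+
+/*
+ * Encoding note: a feature number is (word * 32 + bit); for example
+ * X86_FEATURE_NX below is word 1 (CPUID level 0x80000001, edx), bit 20.
+ */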
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
+#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
+#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */
+#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions */
+ /* (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLSH (0*32+19) /* "clflush" CLFLUSH instruction */
+#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM (0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */
+
+/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+/* Don't duplicate feature flags which are redundant with Intel! */
+#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP (1*32+19) /* MP Capable. */
+#define X86_FEATURE_NX (1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */
+#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
+#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
+
+/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */
+
+/* Other features, Linux-defined mapping, word 3 */
+/* This range is used for feature bits which conflict or are synthesized */
+#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
+/* cpu types for specific tunings: */
+#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */
+#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
+#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FDP */
+#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
+#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
+#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */
+#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
+#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
+#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
+#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */
+#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */
+#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
+#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX (4*32+ 6) /* Safer mode */
+#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CID (4*32+10) /* Context ID */
+#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
+#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */
+#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
+#define X86_FEATURE_AES (4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
+#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
+
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */
+
+/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
+#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If set, HyperThreading is not valid */
+#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */
+#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
+#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
+
+/*
+ * Auxiliary flags: Linux defined - For features scattered in various
+ * CPUID levels like 0x6, 0xA etc
+ */
+#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
+
+/* Virtualization flags: Linux defined */
+#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
+
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+
+#include <linux/bitops.h>
+
+extern const char * const x86_cap_flags[NCAPINTS*32];
+extern const char * const x86_power_flags[32];
+
+#define test_cpu_cap(c, bit) \
+ test_bit(bit, (unsigned long *)((c)->x86_capability))
+
+#define cpu_has(c, bit) \
+ (__builtin_constant_p(bit) && \
+ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
+ (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
+ (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
+ (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \
+ (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
+ (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
+ (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
+ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \
+ ? 1 : \
+ test_cpu_cap(c, bit))
+
+#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
+
+#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
+#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
+#define setup_clear_cpu_cap(bit) do { \
+ clear_cpu_cap(&boot_cpu_data, bit); \
+ set_bit(bit, (unsigned long *)cleared_cpu_caps); \
+} while (0)
+#define setup_force_cpu_cap(bit) do { \
+ set_cpu_cap(&boot_cpu_data, bit); \
+ clear_bit(bit, (unsigned long *)cleared_cpu_caps); \
+} while (0)
+
+#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
+#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME)
+#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
+#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
+#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
+#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE)
+#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
+#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
+#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
+#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR)
+#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
+#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR)
+#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
+#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
+#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
+#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
+#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
+#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
+#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
+#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
+#define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
+#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE)
+#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
+#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT)
+#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
+#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2)
+#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN)
+#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE)
+#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN)
+#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM)
+#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN)
+#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS)
+#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS)
+#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH)
+#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS)
+#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES)
+#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
+#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT)
+#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1)
+#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
+#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
+#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
+
+#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
+# define cpu_has_invlpg 1
+#else
+# define cpu_has_invlpg (boot_cpu_data.x86 > 3)
+#endif
+
+#ifdef CONFIG_X86_64
+
+#undef cpu_has_vme
+#define cpu_has_vme 0
+
+#undef cpu_has_pae
+#define cpu_has_pae ___BUG___
+
+#undef cpu_has_mp
+#define cpu_has_mp 1
+
+#undef cpu_has_k6_mtrr
+#define cpu_has_k6_mtrr 0
+
+#undef cpu_has_cyrix_arr
+#define cpu_has_cyrix_arr 0
+
+#undef cpu_has_centaur_mcr
+#define cpu_has_centaur_mcr 0
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
+
+#endif /* _ASM_X86_CPUFEATURE_H */
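
The cpu_has() macro above folds to a compile-time 1 whenever the queried bit is covered by the REQUIRED_MASK* words the kernel was configured with, and only falls back to a runtime test_bit() otherwise. A minimal usage sketch (the init function is hypothetical; it assumes <asm/cpufeature.h>, <linux/kernel.h> and <linux/errno.h>):

	/* Hypothetical example, not part of this patch. */
	static int __init example_check_features(void)
	{
		if (!cpu_has_fxsr)	/* may fold to a build-time constant */
			return -ENODEV;

		/* Runtime query against a specific cpuinfo_x86. */
		if (cpu_has(&boot_cpu_data, X86_FEATURE_XMM2))
			printk(KERN_INFO "SSE2 available\n");

		return 0;
	}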
diff --git a/arch/x86/include/asm/cputime.h b/arch/x86/include/asm/cputime.h
new file mode 100644
index 00000000000..6d68ad7e0ea
--- /dev/null
+++ b/arch/x86/include/asm/cputime.h
@@ -0,0 +1 @@
+#include <asm-generic/cputime.h>
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
new file mode 100644
index 00000000000..0930b4f8d67
--- /dev/null
+++ b/arch/x86/include/asm/current.h
@@ -0,0 +1,39 @@
+#ifndef _ASM_X86_CURRENT_H
+#define _ASM_X86_CURRENT_H
+
+#ifdef CONFIG_X86_32
+#include <linux/compiler.h>
+#include <asm/percpu.h>
+
+struct task_struct;
+
+DECLARE_PER_CPU(struct task_struct *, current_task);
+static __always_inline struct task_struct *get_current(void)
+{
+ return x86_read_percpu(current_task);
+}
+
+#else /* X86_32 */
+
+#ifndef __ASSEMBLY__
+#include <asm/pda.h>
+
+struct task_struct;
+
+static __always_inline struct task_struct *get_current(void)
+{
+ return read_pda(pcurrent);
+}
+
+#else /* __ASSEMBLY__ */
+
+#include <asm/asm-offsets.h>
+#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* X86_32 */
+
+#define current get_current()
+
+#endif /* _ASM_X86_CURRENT_H */
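
For illustration, the current macro defined above lets generic code name the running task without caring whether it comes from a per-cpu variable (32-bit) or the PDA (64-bit). A minimal sketch, assuming <asm/current.h> and <linux/sched.h> (the function is hypothetical):

	/* Hypothetical example, not part of this patch. */
	void example_report_task(void)
	{
		printk(KERN_INFO "pid %d comm %s\n",
		       current->pid, current->comm);
	}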
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
new file mode 100644
index 00000000000..3ea6f37be9e
--- /dev/null
+++ b/arch/x86/include/asm/debugreg.h
@@ -0,0 +1,70 @@
+#ifndef _ASM_X86_DEBUGREG_H
+#define _ASM_X86_DEBUGREG_H
+
+
+/* Indicate the register numbers for a number of the specific
+ debug registers. Registers 0-3 contain the addresses we wish to trap on */
+#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */
+#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */
+
+#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */
+#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */
+
+/* Define a few things for the status register. We can use this to determine
+ which debugging register was responsible for the trap. The other bits
+ are either reserved or not of interest to us. */
+
+#define DR_TRAP0 (0x1) /* db0 */
+#define DR_TRAP1 (0x2) /* db1 */
+#define DR_TRAP2 (0x4) /* db2 */
+#define DR_TRAP3 (0x8) /* db3 */
+
+#define DR_STEP (0x4000) /* single-step */
+#define DR_SWITCH (0x8000) /* task switch */
+
+/* Now define a bunch of things for manipulating the control register.
+ The top two bytes of the control register consist of 4 fields of 4
+ bits - each field corresponds to one of the four debug registers,
+ and indicates what types of access we trap on, and how large the data
+ field is that we are looking at */
+
+#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */
+#define DR_CONTROL_SIZE 4 /* 4 control bits per register */
+
+#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */
+#define DR_RW_WRITE (0x1)
+#define DR_RW_READ (0x3)
+
+#define DR_LEN_1 (0x0) /* Settings for data length to trap on */
+#define DR_LEN_2 (0x4)
+#define DR_LEN_4 (0xC)
+#define DR_LEN_8 (0x8)
+
+/* The low byte to the control register determine which registers are
+ enabled. There are 4 fields of two bits. One bit is "local", meaning
+ that the processor will reset the bit after a task switch and the other
+ is global meaning that we have to explicitly reset the bit. With linux,
+ you can use either one, since we explicitly zero the register when we enter
+ kernel mode. */
+
+#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
+#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
+#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
+
+#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
+#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */
+
+/* The second byte to the control register has a few special things.
+ We can slow the instruction pipeline for instructions coming via the
+ gdt or the ldt if we want to. I am not sure why this is an advantage */
+
+#ifdef __i386__
+#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */
+#else
+#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */
+#endif
+
+#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
+#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
+
+#endif /* _ASM_X86_DEBUGREG_H */
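
To make the DR7 layout above concrete, here is a minimal sketch that composes a control value arming debug register 0 as a 4-byte write watchpoint with its local-enable bit set; the function is hypothetical and uses only the constants defined in this header:

	/* Hypothetical example, not part of this patch. */
	static unsigned long example_dr7_write_watch0(void)
	{
		unsigned long dr7 = 0;

		/* 4-bit type/length field for register 0 starts at bit 16. */
		dr7 |= (DR_RW_WRITE | DR_LEN_4)
			<< (DR_CONTROL_SHIFT + 0 * DR_CONTROL_SIZE);

		/* Local-enable bit for register 0 in the low byte. */
		dr7 |= 1UL << (DR_LOCAL_ENABLE_SHIFT + 0 * DR_ENABLE_SIZE);

		return dr7;
	}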
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
new file mode 100644
index 00000000000..409a649204a
--- /dev/null
+++ b/arch/x86/include/asm/delay.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_DELAY_H
+#define _ASM_X86_DELAY_H
+
+/*
+ * Copyright (C) 1993 Linus Torvalds
+ *
+ * Delay routines calling functions in arch/x86/lib/delay.c
+ */
+
+/* Undefined functions to get compile-time errors */
+extern void __bad_udelay(void);
+extern void __bad_ndelay(void);
+
+extern void __udelay(unsigned long usecs);
+extern void __ndelay(unsigned long nsecs);
+extern void __const_udelay(unsigned long xloops);
+extern void __delay(unsigned long loops);
+
+/* 0x10c7 is 2**32 / 1000000 (rounded up) */
+#define udelay(n) (__builtin_constant_p(n) ? \
+ ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
+ __udelay(n))
+
+/* 0x5 is 2**32 / 1000000000 (rounded up) */
+#define ndelay(n) (__builtin_constant_p(n) ? \
+ ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
+ __ndelay(n))
+
+void use_tsc_delay(void);
+
+#endif /* _ASM_X86_DELAY_H */
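
The 0x10c7 constant comes from fixed-point scaling: 2**32 / 1000000 = 4294.97, rounded up to 4295 = 0x10c7, so a constant udelay(n) reduces to a single multiply inside __const_udelay(). A minimal sketch, assuming <linux/delay.h>:

	/* Hypothetical example, not part of this patch. */
	static void example_short_wait(void)
	{
		udelay(10);	/* folds to __const_udelay(10 * 0x10c7) */

		/* A constant argument above 20000 would instead reference
		 * __bad_udelay() and fail at link time, by design. */
	}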
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
new file mode 100644
index 00000000000..e6b82b17b07
--- /dev/null
+++ b/arch/x86/include/asm/desc.h
@@ -0,0 +1,409 @@
+#ifndef _ASM_X86_DESC_H
+#define _ASM_X86_DESC_H
+
+#ifndef __ASSEMBLY__
+#include <asm/desc_defs.h>
+#include <asm/ldt.h>
+#include <asm/mmu.h>
+#include <linux/smp.h>
+
+static inline void fill_ldt(struct desc_struct *desc,
+ const struct user_desc *info)
+{
+ desc->limit0 = info->limit & 0x0ffff;
+ desc->base0 = info->base_addr & 0x0000ffff;
+
+ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
+ desc->type = (info->read_exec_only ^ 1) << 1;
+ desc->type |= info->contents << 2;
+ desc->s = 1;
+ desc->dpl = 0x3;
+ desc->p = info->seg_not_present ^ 1;
+ desc->limit = (info->limit & 0xf0000) >> 16;
+ desc->avl = info->useable;
+ desc->d = info->seg_32bit;
+ desc->g = info->limit_in_pages;
+ desc->base2 = (info->base_addr & 0xff000000) >> 24;
+ /*
+ * Don't allow setting of the lm bit. It is useless anyway
+ * because 64bit system calls require __USER_CS:
+ */
+ desc->l = 0;
+}
+
+extern struct desc_ptr idt_descr;
+extern gate_desc idt_table[];
+
+struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU(struct gdt_page, gdt_page);
+
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+ return per_cpu(gdt_page, cpu).gdt;
+}
+
+#ifdef CONFIG_X86_64
+
+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate->offset_low = PTR_LOW(func);
+ gate->segment = __KERNEL_CS;
+ gate->ist = ist;
+ gate->p = 1;
+ gate->dpl = dpl;
+ gate->zero0 = 0;
+ gate->zero1 = 0;
+ gate->type = type;
+ gate->offset_middle = PTR_MIDDLE(func);
+ gate->offset_high = PTR_HIGH(func);
+}
+
+#else
+static inline void pack_gate(gate_desc *gate, unsigned char type,
+ unsigned long base, unsigned dpl, unsigned flags,
+ unsigned short seg)
+{
+ gate->a = (seg << 16) | (base & 0xffff);
+ gate->b = (base & 0xffff0000) |
+ (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+}
+
+#endif
+
+static inline int desc_empty(const void *ptr)
+{
+ const u32 *desc = ptr;
+ return !(desc[0] | desc[1]);
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(dtr) native_load_gdt(dtr)
+#define load_idt(dtr) native_load_idt(dtr)
+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) native_store_gdt(dtr)
+#define store_idt(dtr) native_store_idt(dtr)
+#define store_tr(tr) (tr = native_store_tr())
+#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
+
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc) \
+ native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) \
+ native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) \
+ native_write_idt_entry(dt, entry, g)
+
+static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
+{
+}
+
+static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
+{
+}
+#endif /* CONFIG_PARAVIRT */
+
+static inline void native_write_idt_entry(gate_desc *idt, int entry,
+ const gate_desc *gate)
+{
+ memcpy(&idt[entry], gate, sizeof(*gate));
+}
+
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
+ const void *desc)
+{
+ memcpy(&ldt[entry], desc, 8);
+}
+
+static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
+ const void *desc, int type)
+{
+ unsigned int size;
+ switch (type) {
+ case DESC_TSS:
+ size = sizeof(tss_desc);
+ break;
+ case DESC_LDT:
+ size = sizeof(ldt_desc);
+ break;
+ default:
+ size = sizeof(struct desc_struct);
+ break;
+ }
+ memcpy(&gdt[entry], desc, size);
+}
+
+static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
+ unsigned long limit, unsigned char type,
+ unsigned char flags)
+{
+ desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
+ desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+ (limit & 0x000f0000) | ((type & 0xff) << 8) |
+ ((flags & 0xf) << 20);
+ desc->p = 1;
+}
+
+
+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
+ unsigned type, unsigned size)
+{
+#ifdef CONFIG_X86_64
+ struct ldttss_desc64 *desc = d;
+ memset(desc, 0, sizeof(*desc));
+ desc->limit0 = size & 0xFFFF;
+ desc->base0 = PTR_LOW(addr);
+ desc->base1 = PTR_MIDDLE(addr) & 0xFF;
+ desc->type = type;
+ desc->p = 1;
+ desc->limit1 = (size >> 16) & 0xF;
+ desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
+ desc->base3 = PTR_HIGH(addr);
+#else
+ pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+#endif
+}
+
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+{
+ struct desc_struct *d = get_cpu_gdt_table(cpu);
+ tss_desc tss;
+
+ /*
+ * sizeof(unsigned long) coming from an extra "long" at the end
+ * of the iobitmap. See tss_struct definition in processor.h
+ *
+ * -1? seg base+limit should be pointing to the address of the
+ * last valid byte
+ */
+ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
+ sizeof(unsigned long) - 1);
+ write_gdt_entry(d, entry, &tss, DESC_TSS);
+}
+
+#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
+static inline void native_set_ldt(const void *addr, unsigned int entries)
+{
+ if (likely(entries == 0))
+ asm volatile("lldt %w0"::"q" (0));
+ else {
+ unsigned cpu = smp_processor_id();
+ ldt_desc ldt;
+
+ set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
+ entries * LDT_ENTRY_SIZE - 1);
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+ &ldt, DESC_LDT);
+ asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+ }
+}
+
+static inline void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+ asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+ asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+ asm volatile("sidt %0":"=m" (*dtr));
+}
+
+static inline unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+ asm volatile("str %0":"=r" (tr));
+ return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ unsigned int i;
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+}
+
+#define _LDT_empty(info) \
+ ((info)->base_addr == 0 && \
+ (info)->limit == 0 && \
+ (info)->contents == 0 && \
+ (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && \
+ (info)->limit_in_pages == 0 && \
+ (info)->seg_not_present == 1 && \
+ (info)->useable == 0)
+
+#ifdef CONFIG_X86_64
+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
+#else
+#define LDT_empty(info) (_LDT_empty(info))
+#endif
+
+static inline void clear_LDT(void)
+{
+ set_ldt(NULL, 0);
+}
+
+/*
+ * load one particular LDT into the current CPU
+ */
+static inline void load_LDT_nolock(mm_context_t *pc)
+{
+ set_ldt(pc->ldt, pc->size);
+}
+
+static inline void load_LDT(mm_context_t *pc)
+{
+ preempt_disable();
+ load_LDT_nolock(pc);
+ preempt_enable();
+}
+
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
+{
+ return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
+}
+
+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
+{
+ return desc->limit0 | (desc->limit << 16);
+}
+
+static inline void _set_gate(int gate, unsigned type, void *addr,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate_desc s;
+ pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+ /*
+ * does not need to be atomic because it is only done once at
+ * setup time
+ */
+ write_idt_entry(idt_table, gate, &s);
+}
+
+/*
+ * This needs to use 'idt_table' rather than 'idt', and
+ * thus use the _nonmapped_ version of the IDT, as the
+ * Pentium F0 0F bugfix can have resulted in the mapped
+ * IDT being write-protected.
+ */
+static inline void set_intr_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
+}
+
+#define SYS_VECTOR_FREE 0
+#define SYS_VECTOR_ALLOCED 1
+
+extern int first_system_vector;
+extern char system_vectors[];
+
+static inline void alloc_system_vector(int vector)
+{
+ if (system_vectors[vector] == SYS_VECTOR_FREE) {
+ system_vectors[vector] = SYS_VECTOR_ALLOCED;
+ if (first_system_vector > vector)
+ first_system_vector = vector;
+ } else
+ BUG();
+}
+
+static inline void alloc_intr_gate(unsigned int n, void *addr)
+{
+ alloc_system_vector(n);
+ set_intr_gate(n, addr);
+}
+
+/*
+ * This routine sets up an interrupt gate at directory privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_system_trap_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_trap_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
+}
+
+static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
+}
+
+static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
+}
+
+static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
+}
+
+#else
+/*
+ * GET_DESC_BASE reads the descriptor base of the specified segment.
+ *
+ * Args:
+ * idx - descriptor index
+ * gdt - GDT pointer
+ * base - 32bit register to which the base will be written
+ * lo_w - lo word of the "base" register
+ * lo_b - lo byte of the "base" register
+ * hi_b - hi byte of the low word of the "base" register
+ *
+ * Example:
+ * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
+ */
+#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
+ movb idx * 8 + 4(gdt), lo_b; \
+ movb idx * 8 + 7(gdt), hi_b; \
+ shll $16, base; \
+ movw idx * 8 + 2(gdt), lo_w;
+
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_DESC_H */
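
As a usage sketch for the gate helpers above (the vector numbers and handler are hypothetical; real callers live in the trap-initialization code):

	/* Hypothetical example, not part of this patch. */
	extern void example_handler(void);	/* asm entry point */

	static void __init example_install_vectors(void)
	{
		/* DPL 0 interrupt gate: kernel-only entry. */
		set_intr_gate(0x40, example_handler);

		/* DPL 3 trap gate: reachable via int $0x41 from user mode. */
		set_system_trap_gate(0x41, example_handler);
	}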
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
new file mode 100644
index 00000000000..a6adefa28b9
--- /dev/null
+++ b/arch/x86/include/asm/desc_defs.h
@@ -0,0 +1,95 @@
+/* Written 2000 by Andi Kleen */
+#ifndef _ASM_X86_DESC_DEFS_H
+#define _ASM_X86_DESC_DEFS_H
+
+/*
+ * Segment descriptor structure definitions, usable from both x86_64 and i386
+ * archs.
+ */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/*
+ * FIXME: Accessing the desc_struct through its fields is more elegant,
+ * and should be the one valid thing to do. However, a lot of open code
+ * still touches the a and b accessors, and doing this allows us to do it
+ * incrementally. We keep the signature as a struct, rather than a union,
+ * so we can get rid of it transparently in the future -- glommer
+ */
+/* 8 byte segment descriptor */
+struct desc_struct {
+ union {
+ struct {
+ unsigned int a;
+ unsigned int b;
+ };
+ struct {
+ u16 limit0;
+ u16 base0;
+ unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+ unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
+ };
+ };
+} __attribute__((packed));
+
+enum {
+ GATE_INTERRUPT = 0xE,
+ GATE_TRAP = 0xF,
+ GATE_CALL = 0xC,
+ GATE_TASK = 0x5,
+};
+
+/* 16byte gate */
+struct gate_struct64 {
+ u16 offset_low;
+ u16 segment;
+ unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
+ u16 offset_middle;
+ u32 offset_high;
+ u32 zero1;
+} __attribute__((packed));
+
+#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF)
+#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
+#define PTR_HIGH(x) ((unsigned long long)(x) >> 32)
+
+enum {
+ DESC_TSS = 0x9,
+ DESC_LDT = 0x2,
+ DESCTYPE_S = 0x10, /* !system */
+};
+
+/* LDT or TSS descriptor in the GDT. 16 bytes. */
+struct ldttss_desc64 {
+ u16 limit0;
+ u16 base0;
+ unsigned base1 : 8, type : 5, dpl : 2, p : 1;
+ unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+ u32 base3;
+ u32 zero1;
+} __attribute__((packed));
+
+#ifdef CONFIG_X86_64
+typedef struct gate_struct64 gate_desc;
+typedef struct ldttss_desc64 ldt_desc;
+typedef struct ldttss_desc64 tss_desc;
+#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32))
+#define gate_segment(g) ((g).segment)
+#else
+typedef struct desc_struct gate_desc;
+typedef struct desc_struct ldt_desc;
+typedef struct desc_struct tss_desc;
+#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff))
+#define gate_segment(g) ((g).a >> 16)
+#endif
+
+struct desc_ptr {
+ unsigned short size;
+ unsigned long address;
+} __attribute__((packed)) ;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_DESC_DEFS_H */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
new file mode 100644
index 00000000000..3c034f48fdb
--- /dev/null
+++ b/arch/x86/include/asm/device.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_DEVICE_H
+#define _ASM_X86_DEVICE_H
+
+struct dev_archdata {
+#ifdef CONFIG_ACPI
+ void *acpi_handle;
+#endif
+#ifdef CONFIG_X86_64
+struct dma_mapping_ops *dma_ops;
+#endif
+#ifdef CONFIG_DMAR
+ void *iommu; /* hook for IOMMU specific extension */
+#endif
+};
+
+#endif /* _ASM_X86_DEVICE_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
new file mode 100644
index 00000000000..9a2d644c08e
--- /dev/null
+++ b/arch/x86/include/asm/div64.h
@@ -0,0 +1,60 @@
+#ifndef _ASM_X86_DIV64_H
+#define _ASM_X86_DIV64_H
+
+#ifdef CONFIG_X86_32
+
+#include <linux/types.h>
+
+/*
+ * do_div() is NOT a C function. It wants to return
+ * two values (the quotient and the remainder), but
+ * since that doesn't work very well in C, what it
+ * does is:
+ *
+ * - modifies the 64-bit dividend _in_place_
+ * - returns the 32-bit remainder
+ *
+ * This ends up being the most efficient "calling
+ * convention" on x86.
+ */
+#define do_div(n, base) \
+({ \
+ unsigned long __upper, __low, __high, __mod, __base; \
+ __base = (base); \
+ asm("":"=a" (__low), "=d" (__high) : "A" (n)); \
+ __upper = __high; \
+ if (__high) { \
+ __upper = __high % (__base); \
+ __high = __high / (__base); \
+ } \
+ asm("divl %2":"=a" (__low), "=d" (__mod) \
+ : "rm" (__base), "0" (__low), "1" (__upper)); \
+ asm("":"=A" (n) : "a" (__low), "d" (__high)); \
+ __mod; \
+})
+
+static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
+{
+ union {
+ u64 v64;
+ u32 v32[2];
+ } d = { dividend };
+ u32 upper;
+
+ upper = d.v32[1];
+ d.v32[1] = 0;
+ if (upper >= divisor) {
+ d.v32[1] = upper / divisor;
+ upper %= divisor;
+ }
+ asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
+ "rm" (divisor), "0" (d.v32[0]), "1" (upper));
+ return d.v64;
+}
+#define div_u64_rem div_u64_rem
+
+#else
+# include <asm-generic/div64.h>
+#endif /* CONFIG_X86_32 */
+
+#endif /* _ASM_X86_DIV64_H */
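
To make the calling conventions concrete: do_div() rewrites its 64-bit dividend in place and evaluates to the 32-bit remainder, while div_u64_rem() returns the quotient and stores the remainder through a pointer. A minimal sketch, assuming <linux/math64.h> and <linux/types.h>:

	/* Hypothetical example, not part of this patch. */
	static void example_div(void)
	{
		u64 bytes = 10000000000ULL;
		u32 rem;
		u64 pages;

		pages = div_u64_rem(bytes, 4096, &rem);

		/* do_div(): 'bytes' holds the quotient afterwards. */
		rem = do_div(bytes, 1000);
	}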
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
new file mode 100644
index 00000000000..4a5397bfce2
--- /dev/null
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -0,0 +1,308 @@
+#ifndef _ASM_X86_DMA_MAPPING_H
+#define _ASM_X86_DMA_MAPPING_H
+
+/*
+ * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
+ * documentation.
+ */
+
+#include <linux/scatterlist.h>
+#include <asm/io.h>
+#include <asm/swiotlb.h>
+#include <asm-generic/dma-coherent.h>
+
+extern dma_addr_t bad_dma_address;
+extern int iommu_merge;
+extern struct device x86_dma_fallback_dev;
+extern int panic_on_overflow;
+
+struct dma_mapping_ops {
+ int (*mapping_error)(struct device *dev,
+ dma_addr_t dma_addr);
+ void* (*alloc_coherent)(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp);
+ void (*free_coherent)(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+ dma_addr_t (*map_single)(struct device *hwdev, phys_addr_t ptr,
+ size_t size, int direction);
+ void (*unmap_single)(struct device *dev, dma_addr_t addr,
+ size_t size, int direction);
+ void (*sync_single_for_cpu)(struct device *hwdev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+ void (*sync_single_for_device)(struct device *hwdev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+ void (*sync_single_range_for_cpu)(struct device *hwdev,
+ dma_addr_t dma_handle, unsigned long offset,
+ size_t size, int direction);
+ void (*sync_single_range_for_device)(struct device *hwdev,
+ dma_addr_t dma_handle, unsigned long offset,
+ size_t size, int direction);
+ void (*sync_sg_for_cpu)(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int direction);
+ void (*sync_sg_for_device)(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int direction);
+ int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+ void (*unmap_sg)(struct device *hwdev,
+ struct scatterlist *sg, int nents,
+ int direction);
+ int (*dma_supported)(struct device *hwdev, u64 mask);
+ int is_phys;
+};
+
+extern struct dma_mapping_ops *dma_ops;
+
+static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
+{
+#ifdef CONFIG_X86_32
+ return dma_ops;
+#else
+ if (unlikely(!dev) || !dev->archdata.dma_ops)
+ return dma_ops;
+ else
+ return dev->archdata.dma_ops;
+#endif /* CONFIG_X86_32 */
+}
+
+/* Make sure we keep the same behaviour */
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+#ifdef CONFIG_X86_32
+ return 0;
+#else
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+ if (ops->mapping_error)
+ return ops->mapping_error(dev, dma_addr);
+
+ return (dma_addr == bad_dma_address);
+#endif
+}
+
+#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+#define dma_is_consistent(d, h) (1)
+
+extern int dma_supported(struct device *hwdev, u64 mask);
+extern int dma_set_mask(struct device *dev, u64 mask);
+
+extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_addr, gfp_t flag);
+
+static inline dma_addr_t
+dma_map_single(struct device *hwdev, void *ptr, size_t size,
+ int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ return ops->map_single(hwdev, virt_to_phys(ptr), size, direction);
+}
+
+static inline void
+dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
+ int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->unmap_single)
+ ops->unmap_single(dev, addr, size, direction);
+}
+
+static inline int
+dma_map_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ return ops->map_sg(hwdev, sg, nents, direction);
+}
+
+static inline void
+dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
+ int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->unmap_sg)
+ ops->unmap_sg(hwdev, sg, nents, direction);
+}
+
+static inline void
+dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_single_for_cpu)
+ ops->sync_single_for_cpu(hwdev, dma_handle, size, direction);
+ flush_write_buffers();
+}
+
+static inline void
+dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_single_for_device)
+ ops->sync_single_for_device(hwdev, dma_handle, size, direction);
+ flush_write_buffers();
+}
+
+static inline void
+dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
+ unsigned long offset, size_t size, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_single_range_for_cpu)
+ ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
+ size, direction);
+ flush_write_buffers();
+}
+
+static inline void
+dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
+ unsigned long offset, size_t size,
+ int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_single_range_for_device)
+ ops->sync_single_range_for_device(hwdev, dma_handle,
+ offset, size, direction);
+ flush_write_buffers();
+}
+
+static inline void
+dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+ int nelems, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_sg_for_cpu)
+ ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
+ flush_write_buffers();
+}
+
+static inline void
+dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+ int nelems, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_sg_for_device)
+ ops->sync_sg_for_device(hwdev, sg, nelems, direction);
+
+ flush_write_buffers();
+}
+
+static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
+ size_t offset, size_t size,
+ int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ return ops->map_single(dev, page_to_phys(page) + offset,
+ size, direction);
+}
+
+static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, int direction)
+{
+ dma_unmap_single(dev, addr, size, direction);
+}
+
+static inline void
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+ enum dma_data_direction dir)
+{
+ flush_write_buffers();
+}
+
+static inline int dma_get_cache_alignment(void)
+{
+ /* no easy way to get cache size on all x86, so return the
+ * maximum possible, to be safe */
+ return boot_cpu_data.x86_clflush_size;
+}
+
+static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
+ gfp_t gfp)
+{
+ unsigned long dma_mask = 0;
+
+ dma_mask = dev->coherent_dma_mask;
+ if (!dma_mask)
+ dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
+
+ return dma_mask;
+}
+
+static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
+{
+#ifdef CONFIG_X86_64
+ unsigned long dma_mask = dma_alloc_coherent_mask(dev, gfp);
+
+ if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA))
+ gfp |= GFP_DMA32;
+#endif
+ return gfp;
+}
+
+static inline void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+ void *memory;
+
+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+ if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
+ return memory;
+
+ if (!dev) {
+ dev = &x86_dma_fallback_dev;
+ gfp |= GFP_DMA;
+ }
+
+ if (!is_device_dma_capable(dev))
+ return NULL;
+
+ if (!ops->alloc_coherent)
+ return NULL;
+
+ return ops->alloc_coherent(dev, size, dma_handle,
+ dma_alloc_coherent_gfp_flags(dev, gfp));
+}
+
+static inline void dma_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t bus)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+ WARN_ON(irqs_disabled()); /* for portability */
+
+ if (dma_release_from_coherent(dev, get_order(size), vaddr))
+ return;
+
+ if (ops->free_coherent)
+ ops->free_coherent(dev, size, vaddr, bus);
+}
+
+#endif /* _ASM_X86_DMA_MAPPING_H */
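
A sketch of the consumer side of these wrappers, following the usual coherent-plus-streaming pattern (device, sizes and error handling are hypothetical; assumes <linux/dma-mapping.h>):

	/* Hypothetical example, not part of this patch. */
	static int example_setup_dma(struct device *dev, void *buf, size_t len)
	{
		dma_addr_t ring_dma, buf_dma;
		void *ring;

		ring = dma_alloc_coherent(dev, PAGE_SIZE, &ring_dma, GFP_KERNEL);
		if (!ring)
			return -ENOMEM;

		buf_dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
		if (dma_mapping_error(dev, buf_dma)) {
			dma_free_coherent(dev, PAGE_SIZE, ring, ring_dma);
			return -EIO;
		}

		/* ... hand ring_dma/buf_dma to the device, run I/O ... */

		dma_unmap_single(dev, buf_dma, len, DMA_TO_DEVICE);
		dma_free_coherent(dev, PAGE_SIZE, ring, ring_dma);
		return 0;
	}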
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
new file mode 100644
index 00000000000..ca1098a7e58
--- /dev/null
+++ b/arch/x86/include/asm/dma.h
@@ -0,0 +1,318 @@
+/*
+ * linux/include/asm/dma.h: Defines for using and allocating dma channels.
+ * Written by Hennus Bergman, 1992.
+ * High DMA channel support & info by Hannu Savolainen
+ * and John Boyd, Nov. 1992.
+ */
+
+#ifndef _ASM_X86_DMA_H
+#define _ASM_X86_DMA_H
+
+#include <linux/spinlock.h> /* And spinlocks */
+#include <asm/io.h> /* need byte IO */
+#include <linux/delay.h>
+
+#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
+#define dma_outb outb_p
+#else
+#define dma_outb outb
+#endif
+
+#define dma_inb inb
+
+/*
+ * NOTES about DMA transfers:
+ *
+ * controller 1: channels 0-3, byte operations, ports 00-1F
+ * controller 2: channels 4-7, word operations, ports C0-DF
+ *
+ * - ALL registers are 8 bits only, regardless of transfer size
+ * - channel 4 is not used - cascades 1 into 2.
+ * - channels 0-3 are byte - addresses/counts are for physical bytes
+ * - channels 5-7 are word - addresses/counts are for physical words
+ * - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
+ * - transfer count loaded to registers is 1 less than actual count
+ * - controller 2 offsets are all even (2x offsets for controller 1)
+ * - page registers for 5-7 don't use data bit 0, represent 128K pages
+ * - page registers for 0-3 use bit 0, represent 64K pages
+ *
+ * DMA transfers are limited to the lower 16MB of _physical_ memory.
+ * Note that addresses loaded into registers must be _physical_ addresses,
+ * not logical addresses (which may differ if paging is active).
+ *
+ * Address mapping for channels 0-3:
+ *
+ * A23 ... A16 A15 ... A8 A7 ... A0 (Physical addresses)
+ * | ... | | ... | | ... |
+ * | ... | | ... | | ... |
+ * | ... | | ... | | ... |
+ * P7 ... P0 A7 ... A0 A7 ... A0
+ * | Page | Addr MSB | Addr LSB | (DMA registers)
+ *
+ * Address mapping for channels 5-7:
+ *
+ * A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0 (Physical addresses)
+ * | ... | \ \ ... \ \ \ ... \ \
+ * | ... | \ \ ... \ \ \ ... \ (not used)
+ * | ... | \ \ ... \ \ \ ... \
+ * P7 ... P1 (0) A7 A6 ... A0 A7 A6 ... A0
+ * | Page | Addr MSB | Addr LSB | (DMA registers)
+ *
+ * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
+ * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
+ * the hardware level, so odd-byte transfers aren't possible).
+ *
+ * Transfer count (_not # bytes_) is limited to 64K, represented as actual
+ * count - 1 : 64K => 0xFFFF, 1 => 0x0000. Thus, count is always 1 or more,
+ * and up to 128K bytes may be transferred on channels 5-7 in one operation.
+ *
+ */
+
+#define MAX_DMA_CHANNELS 8
+
+#ifdef CONFIG_X86_32
+
+/* The maximum address that we can perform a DMA transfer to on this platform */
+#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
+
+#else
+
+/* 16MB ISA DMA zone */
+#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
+
+/* 4GB broken PCI/AGP hardware bus master zone */
+#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
+
+/* Compat define for old dma zone */
+#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
+
+#endif
+
+/* 8237 DMA controllers */
+#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
+#define IO_DMA2_BASE 0xC0 /* 16 bit master DMA, ch 4(=slave input)..7 */
+
+/* DMA controller registers */
+#define DMA1_CMD_REG 0x08 /* command register (w) */
+#define DMA1_STAT_REG 0x08 /* status register (r) */
+#define DMA1_REQ_REG 0x09 /* request register (w) */
+#define DMA1_MASK_REG 0x0A /* single-channel mask (w) */
+#define DMA1_MODE_REG 0x0B /* mode register (w) */
+#define DMA1_CLEAR_FF_REG 0x0C /* clear pointer flip-flop (w) */
+#define DMA1_TEMP_REG 0x0D /* Temporary Register (r) */
+#define DMA1_RESET_REG 0x0D /* Master Clear (w) */
+#define DMA1_CLR_MASK_REG 0x0E /* Clear Mask */
+#define DMA1_MASK_ALL_REG 0x0F /* all-channels mask (w) */
+
+#define DMA2_CMD_REG 0xD0 /* command register (w) */
+#define DMA2_STAT_REG 0xD0 /* status register (r) */
+#define DMA2_REQ_REG 0xD2 /* request register (w) */
+#define DMA2_MASK_REG 0xD4 /* single-channel mask (w) */
+#define DMA2_MODE_REG 0xD6 /* mode register (w) */
+#define DMA2_CLEAR_FF_REG 0xD8 /* clear pointer flip-flop (w) */
+#define DMA2_TEMP_REG 0xDA /* Temporary Register (r) */
+#define DMA2_RESET_REG 0xDA /* Master Clear (w) */
+#define DMA2_CLR_MASK_REG 0xDC /* Clear Mask */
+#define DMA2_MASK_ALL_REG 0xDE /* all-channels mask (w) */
+
+#define DMA_ADDR_0 0x00 /* DMA address registers */
+#define DMA_ADDR_1 0x02
+#define DMA_ADDR_2 0x04
+#define DMA_ADDR_3 0x06
+#define DMA_ADDR_4 0xC0
+#define DMA_ADDR_5 0xC4
+#define DMA_ADDR_6 0xC8
+#define DMA_ADDR_7 0xCC
+
+#define DMA_CNT_0 0x01 /* DMA count registers */
+#define DMA_CNT_1 0x03
+#define DMA_CNT_2 0x05
+#define DMA_CNT_3 0x07
+#define DMA_CNT_4 0xC2
+#define DMA_CNT_5 0xC6
+#define DMA_CNT_6 0xCA
+#define DMA_CNT_7 0xCE
+
+#define DMA_PAGE_0 0x87 /* DMA page registers */
+#define DMA_PAGE_1 0x83
+#define DMA_PAGE_2 0x81
+#define DMA_PAGE_3 0x82
+#define DMA_PAGE_5 0x8B
+#define DMA_PAGE_6 0x89
+#define DMA_PAGE_7 0x8A
+
+/* I/O to memory, no autoinit, increment, single mode */
+#define DMA_MODE_READ 0x44
+/* memory to I/O, no autoinit, increment, single mode */
+#define DMA_MODE_WRITE 0x48
+/* pass thru DREQ->HRQ, DACK<-HLDA only */
+#define DMA_MODE_CASCADE 0xC0
+
+#define DMA_AUTOINIT 0x10
+
+
+extern spinlock_t dma_spin_lock;
+
+static inline unsigned long claim_dma_lock(void)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&dma_spin_lock, flags);
+ return flags;
+}
+
+static inline void release_dma_lock(unsigned long flags)
+{
+ spin_unlock_irqrestore(&dma_spin_lock, flags);
+}
+
+/* enable/disable a specific DMA channel */
+static inline void enable_dma(unsigned int dmanr)
+{
+ if (dmanr <= 3)
+ dma_outb(dmanr, DMA1_MASK_REG);
+ else
+ dma_outb(dmanr & 3, DMA2_MASK_REG);
+}
+
+static inline void disable_dma(unsigned int dmanr)
+{
+ if (dmanr <= 3)
+ dma_outb(dmanr | 4, DMA1_MASK_REG);
+ else
+ dma_outb((dmanr & 3) | 4, DMA2_MASK_REG);
+}
+
+/* Clear the 'DMA Pointer Flip Flop'.
+ * Write 0 for LSB/MSB, 1 for MSB/LSB access.
+ * Use this once to initialize the FF to a known state.
+ * After that, keep track of it. :-)
+ * --- In order to do that, the DMA routines below should ---
+ * --- only be used while holding the DMA lock ! ---
+ */
+static inline void clear_dma_ff(unsigned int dmanr)
+{
+ if (dmanr <= 3)
+ dma_outb(0, DMA1_CLEAR_FF_REG);
+ else
+ dma_outb(0, DMA2_CLEAR_FF_REG);
+}
+
+/* set mode (above) for a specific DMA channel */
+static inline void set_dma_mode(unsigned int dmanr, char mode)
+{
+ if (dmanr <= 3)
+ dma_outb(mode | dmanr, DMA1_MODE_REG);
+ else
+ dma_outb(mode | (dmanr & 3), DMA2_MODE_REG);
+}
+
+/* Set only the page register bits of the transfer address.
+ * This is used for successive transfers when we know the contents of
+ * the lower 16 bits of the DMA current address register, but a 64k boundary
+ * may have been crossed.
+ */
+static inline void set_dma_page(unsigned int dmanr, char pagenr)
+{
+ switch (dmanr) {
+ case 0:
+ dma_outb(pagenr, DMA_PAGE_0);
+ break;
+ case 1:
+ dma_outb(pagenr, DMA_PAGE_1);
+ break;
+ case 2:
+ dma_outb(pagenr, DMA_PAGE_2);
+ break;
+ case 3:
+ dma_outb(pagenr, DMA_PAGE_3);
+ break;
+ case 5:
+ dma_outb(pagenr & 0xfe, DMA_PAGE_5);
+ break;
+ case 6:
+ dma_outb(pagenr & 0xfe, DMA_PAGE_6);
+ break;
+ case 7:
+ dma_outb(pagenr & 0xfe, DMA_PAGE_7);
+ break;
+ }
+}
+
+
+/* Set transfer address & page bits for specific DMA channel.
+ * Assumes dma flipflop is clear.
+ */
+static inline void set_dma_addr(unsigned int dmanr, unsigned int a)
+{
+ set_dma_page(dmanr, a>>16);
+ if (dmanr <= 3) {
+ dma_outb(a & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
+ dma_outb((a >> 8) & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
+ } else {
+ dma_outb((a >> 1) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
+ dma_outb((a >> 9) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
+ }
+}
+
+
+/* Set transfer size (max 64k for DMA0..3, 128k for DMA5..7) for
+ * a specific DMA channel.
+ * You must ensure the parameters are valid.
+ * NOTE: from a manual: "the number of transfers is one more
+ * than the initial word count"! This is taken into account.
+ * Assumes dma flip-flop is clear.
+ * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
+ */
+static inline void set_dma_count(unsigned int dmanr, unsigned int count)
+{
+ count--;
+ if (dmanr <= 3) {
+ dma_outb(count & 0xff, ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
+ dma_outb((count >> 8) & 0xff,
+ ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
+ } else {
+ dma_outb((count >> 1) & 0xff,
+ ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
+ dma_outb((count >> 9) & 0xff,
+ ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
+ }
+}
+
+
+/* Get DMA residue count. After a DMA transfer, this
+ * should return zero. Reading this while a DMA transfer is
+ * still in progress will return unpredictable results.
+ * If called before the channel has been used, it may return 1.
+ * Otherwise, it returns the number of _bytes_ left to transfer.
+ *
+ * Assumes DMA flip-flop is clear.
+ */
+static inline int get_dma_residue(unsigned int dmanr)
+{
+ unsigned int io_port;
+ /* using short to get 16-bit wrap around */
+ unsigned short count;
+
+ io_port = (dmanr <= 3) ? ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE
+ : ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE;
+
+ count = 1 + dma_inb(io_port);
+ count += dma_inb(io_port) << 8;
+
+ return (dmanr <= 3) ? count : (count << 1);
+}
+
+
+/* These are in kernel/dma.c: */
+extern int request_dma(unsigned int dmanr, const char *device_id);
+extern void free_dma(unsigned int dmanr);
+
+/* From PCI */
+
+#ifdef CONFIG_PCI
+extern int isa_dma_bridge_buggy;
+#else
+#define isa_dma_bridge_buggy (0)
+#endif
+
+#endif /* _ASM_X86_DMA_H */
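
The register helpers above are used in a fixed sequence under the DMA spinlock. A minimal sketch for starting a device-to-memory transfer on an 8-bit channel (channel, buffer address and length are hypothetical; the buffer must sit below 16MB physical):

	/* Hypothetical example, not part of this patch. */
	static int example_start_isa_dma(unsigned int chan,
					 unsigned int phys, unsigned int len)
	{
		unsigned long flags;

		if (request_dma(chan, "example"))
			return -EBUSY;

		flags = claim_dma_lock();
		disable_dma(chan);
		clear_dma_ff(chan);		/* known flip-flop state */
		set_dma_mode(chan, DMA_MODE_READ);
		set_dma_addr(chan, phys);	/* physical address */
		set_dma_count(chan, len);	/* bytes */
		enable_dma(chan);
		release_dma_lock(flags);

		return 0;
	}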
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
new file mode 100644
index 00000000000..bc68212c6bc
--- /dev/null
+++ b/arch/x86/include/asm/dmi.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_DMI_H
+#define _ASM_X86_DMI_H
+
+#include <asm/io.h>
+
+#define DMI_MAX_DATA 2048
+
+extern int dmi_alloc_index;
+extern char dmi_alloc_data[DMI_MAX_DATA];
+
+/* This is so early that there is no good way to allocate dynamic memory.
+   Allocate data in a BSS array. */
+static inline void *dmi_alloc(unsigned len)
+{
+ int idx = dmi_alloc_index;
+ if ((dmi_alloc_index + len) > DMI_MAX_DATA)
+ return NULL;
+ dmi_alloc_index += len;
+ return dmi_alloc_data + idx;
+}
+
+/* Use early IO mappings for DMI because it's initialized early */
+#define dmi_ioremap early_ioremap
+#define dmi_iounmap early_iounmap
+
+#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
new file mode 100644
index 00000000000..72c5a190bf4
--- /dev/null
+++ b/arch/x86/include/asm/ds.h
@@ -0,0 +1,238 @@
+/*
+ * Debug Store (DS) support
+ *
+ * This provides a low-level interface to the hardware's Debug Store
+ * feature that is used for branch trace store (BTS) and
+ * precise-event based sampling (PEBS).
+ *
+ * It manages:
+ * - per-thread and per-cpu allocation of BTS and PEBS
+ * - buffer memory allocation (optional)
+ * - buffer overflow handling
+ * - buffer access
+ *
+ * It assumes:
+ * - get_task_struct on all parameter tasks
+ * - current is allowed to trace parameter tasks
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ */
+
+#ifndef _ASM_X86_DS_H
+#define _ASM_X86_DS_H
+
+#ifdef CONFIG_X86_DS
+
+#include <linux/types.h>
+#include <linux/init.h>
+
+
+struct task_struct;
+
+/*
+ * Request BTS or PEBS
+ *
+ * Due to alignment constraints, the actual buffer may be slightly
+ * smaller than the requested or provided buffer.
+ *
+ * Returns 0 on success; -Eerrno otherwise
+ *
+ * task: the task to request recording for;
+ * NULL for per-cpu recording on the current cpu
+ * base: the base pointer for the (non-pageable) buffer;
+ * NULL if buffer allocation requested
+ * size: the size of the requested or provided buffer
+ * ovfl: pointer to a function to be called on buffer overflow;
+ * NULL if cyclic buffer requested
+ */
+typedef void (*ds_ovfl_callback_t)(struct task_struct *);
+extern int ds_request_bts(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl);
+extern int ds_request_pebs(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl);
+
+/*
+ * Release BTS or PEBS resources
+ *
+ * Frees buffers allocated on ds_request.
+ *
+ * Returns 0 on success; -Eerrno otherwise
+ *
+ * task: the task to release resources for;
+ * NULL to release resources for the current cpu
+ */
+extern int ds_release_bts(struct task_struct *task);
+extern int ds_release_pebs(struct task_struct *task);
+
+/*
+ * Return the (array) index of the write pointer.
+ * (assuming an array of BTS/PEBS records)
+ *
+ * Returns -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ * pos (out): if not NULL, will hold the result
+ */
+extern int ds_get_bts_index(struct task_struct *task, size_t *pos);
+extern int ds_get_pebs_index(struct task_struct *task, size_t *pos);
+
+/*
+ * Return the (array) index one record beyond the end of the array.
+ * (assuming an array of BTS/PEBS records)
+ *
+ * Returns -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ * pos (out): if not NULL, will hold the result
+ */
+extern int ds_get_bts_end(struct task_struct *task, size_t *pos);
+extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
+
+/*
+ * Provide a pointer to the BTS/PEBS record at parameter index.
+ * (assuming an array of BTS/PEBS records)
+ *
+ * The pointer points directly into the buffer. The user is
+ * responsible for copying the record.
+ *
+ * Returns the size of a single record on success; -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ * index: the index of the requested record
+ * record (out): pointer to the requested record
+ */
+extern int ds_access_bts(struct task_struct *task,
+ size_t index, const void **record);
+extern int ds_access_pebs(struct task_struct *task,
+ size_t index, const void **record);
+
+/*
+ * Write one or more BTS/PEBS records at the write pointer index and
+ * advance the write pointer.
+ *
+ * If size is not a multiple of the record size, trailing bytes are
+ * zeroed out.
+ *
+ * May result in one or more overflow notifications.
+ *
+ * If called during overflow handling, that is, with index >=
+ * interrupt threshold, the write will wrap around.
+ *
+ * An overflow notification is given if and when the interrupt
+ * threshold is reached during or after the write.
+ *
+ * Returns the number of bytes written or -Eerrno.
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ * buffer: the buffer to write
+ * size: the size of the buffer
+ */
+extern int ds_write_bts(struct task_struct *task,
+ const void *buffer, size_t size);
+extern int ds_write_pebs(struct task_struct *task,
+ const void *buffer, size_t size);
+
+/*
+ * Same as ds_write_bts/pebs, but omit ownership checks.
+ *
+ * This is needed to have some other task than the owner of the
+ * BTS/PEBS buffer or the parameter task itself write into the
+ * respective buffer.
+ */
+extern int ds_unchecked_write_bts(struct task_struct *task,
+ const void *buffer, size_t size);
+extern int ds_unchecked_write_pebs(struct task_struct *task,
+ const void *buffer, size_t size);
+
+/*
+ * Reset the write pointer of the BTS/PEBS buffer.
+ *
+ * Returns 0 on success; -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ */
+extern int ds_reset_bts(struct task_struct *task);
+extern int ds_reset_pebs(struct task_struct *task);
+
+/*
+ * Clear the BTS/PEBS buffer and reset the write pointer.
+ * The entire buffer will be zeroed out.
+ *
+ * Returns 0 on success; -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ */
+extern int ds_clear_bts(struct task_struct *task);
+extern int ds_clear_pebs(struct task_struct *task);
+
+/*
+ * Provide the PEBS counter reset value.
+ *
+ * Returns 0 on success; -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ * value (out): the counter reset value
+ */
+extern int ds_get_pebs_reset(struct task_struct *task, u64 *value);
+
+/*
+ * Set the PEBS counter reset value.
+ *
+ * Returns 0 on success; -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ * value: the new counter reset value
+ */
+extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
+
+/*
+ * Initialization
+ */
+struct cpuinfo_x86;
+extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
+
+
+
+/*
+ * The DS context - part of struct thread_struct.
+ */
+struct ds_context {
+ /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
+ unsigned char *ds;
+ /* the owner of the BTS and PEBS configuration, respectively */
+ struct task_struct *owner[2];
+ /* buffer overflow notification function for BTS and PEBS */
+ ds_ovfl_callback_t callback[2];
+ /* the original buffer address */
+ void *buffer[2];
+ /* the number of allocated pages for on-request allocated buffers */
+ unsigned int pages[2];
+ /* use count */
+ unsigned long count;
+ /* a pointer to the context location inside the thread_struct
+ * or the per_cpu context array */
+ struct ds_context **this;
+ /* a pointer to the task owning this context, or NULL, if the
+ * context is owned by a cpu */
+ struct task_struct *task;
+};
+
+/* called by exit_thread() to free leftover contexts */
+extern void ds_free(struct ds_context *context);
+
+#else /* CONFIG_X86_DS */
+
+#define ds_init_intel(config) do {} while (0)
+
+#endif /* CONFIG_X86_DS */
+#endif /* _ASM_X86_DS_H */
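Taken together, the accessors above suggest a simple consumption loop. A
minimal sketch, illustrative only and not part of this patch: it assumes a
kernel context with CONFIG_X86_DS set and a BTS buffer already allocated
for the task via the request API declared earlier in this header, and uses
pr_debug() from <linux/kernel.h>:

static void drain_bts(struct task_struct *task)
{
	const void *record;
	size_t idx;

	/* Walk records until ds_access_bts() returns -Eerrno. */
	for (idx = 0; ds_access_bts(task, idx, &record) >= 0; idx++)
		pr_debug("BTS record %zu at %p\n", idx, record);

	/* Discard the collected trace and rewind the write pointer. */
	ds_clear_bts(task);
}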
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
new file mode 100644
index 00000000000..804b6e6be92
--- /dev/null
+++ b/arch/x86/include/asm/dwarf2.h
@@ -0,0 +1,61 @@
+#ifndef _ASM_X86_DWARF2_H
+#define _ASM_X86_DWARF2_H
+
+#ifndef __ASSEMBLY__
+#warning "asm/dwarf2.h should only be included in pure assembly files"
+#endif
+
+/*
+ Macros for dwarf2 CFI unwind table entries.
+ See "as.info" for details on these pseudo ops. Unfortunately
+ they are only supported in very new binutils, so define them
+ away for older versions.
+ */
+
+#ifdef CONFIG_AS_CFI
+
+#define CFI_STARTPROC .cfi_startproc
+#define CFI_ENDPROC .cfi_endproc
+#define CFI_DEF_CFA .cfi_def_cfa
+#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
+#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
+#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
+#define CFI_OFFSET .cfi_offset
+#define CFI_REL_OFFSET .cfi_rel_offset
+#define CFI_REGISTER .cfi_register
+#define CFI_RESTORE .cfi_restore
+#define CFI_REMEMBER_STATE .cfi_remember_state
+#define CFI_RESTORE_STATE .cfi_restore_state
+#define CFI_UNDEFINED .cfi_undefined
+
+#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
+#define CFI_SIGNAL_FRAME .cfi_signal_frame
+#else
+#define CFI_SIGNAL_FRAME
+#endif
+
+#else
+
+/* Due to the structure of pre-existing code, don't use assembler line
+ comment character # to ignore the arguments. Instead, use a dummy macro. */
+.macro cfi_ignore a=0, b=0, c=0, d=0
+.endm
+
+#define CFI_STARTPROC cfi_ignore
+#define CFI_ENDPROC cfi_ignore
+#define CFI_DEF_CFA cfi_ignore
+#define CFI_DEF_CFA_REGISTER cfi_ignore
+#define CFI_DEF_CFA_OFFSET cfi_ignore
+#define CFI_ADJUST_CFA_OFFSET cfi_ignore
+#define CFI_OFFSET cfi_ignore
+#define CFI_REL_OFFSET cfi_ignore
+#define CFI_REGISTER cfi_ignore
+#define CFI_RESTORE cfi_ignore
+#define CFI_REMEMBER_STATE cfi_ignore
+#define CFI_RESTORE_STATE cfi_ignore
+#define CFI_UNDEFINED cfi_ignore
+#define CFI_SIGNAL_FRAME cfi_ignore
+
+#endif
+
+#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
new file mode 100644
index 00000000000..3d8ceddbd40
--- /dev/null
+++ b/arch/x86/include/asm/e820.h
@@ -0,0 +1,146 @@
+#ifndef _ASM_X86_E820_H
+#define _ASM_X86_E820_H
+#define E820MAP 0x2d0 /* our map */
+#define E820MAX 128 /* number of entries in E820MAP */
+
+/*
+ * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the
+ * constrained space in the zeropage. If we have more nodes than
+ * that, and if we've booted off EFI firmware, then the EFI tables
+ * passed us from the EFI firmware can list more nodes. Size our
+ * internal memory map tables to have room for these additional
+ * nodes, based on up to three entries per node for which the
+ * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT),
+ * plus E820MAX, allowing space for the possible duplicate E820
+ * entries that might need room in the same arrays, prior to the
+ * call to sanitize_e820_map() to remove duplicates. The allowance
+ * of three memory map entries per node is "enough" entries for
+ * the initial hardware platform motivating this mechanism to make
+ * use of additional EFI map entries. Future platforms may want
+ * to allow more than three entries per node or otherwise refine
+ * this size.
+ */
+
+/*
+ * Odd: 'make headers_check' complains about numa.h if I try
+ * to collapse the next two #ifdef lines to a single line:
+ * #if defined(__KERNEL__) && defined(CONFIG_EFI)
+ */
+#ifdef __KERNEL__
+#ifdef CONFIG_EFI
+#include <linux/numa.h>
+#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES)
+#else /* ! CONFIG_EFI */
+#define E820_X_MAX E820MAX
+#endif
+#else /* ! __KERNEL__ */
+#define E820_X_MAX E820MAX
+#endif
+
+#define E820NR 0x1e8 /* # entries in E820MAP */
+
+#define E820_RAM 1
+#define E820_RESERVED 2
+#define E820_ACPI 3
+#define E820_NVS 4
+#define E820_UNUSABLE 5
+
+/* reserved RAM used by kernel itself */
+#define E820_RESERVED_KERN 128
+
+#ifndef __ASSEMBLY__
+struct e820entry {
+ __u64 addr; /* start of memory segment */
+ __u64 size; /* size of memory segment */
+ __u32 type; /* type of memory segment */
+} __attribute__((packed));
+
+struct e820map {
+ __u32 nr_map;
+ struct e820entry map[E820_X_MAX];
+};
+
+#ifdef __KERNEL__
+/* see comment in arch/x86/kernel/e820.c */
+extern struct e820map e820;
+extern struct e820map e820_saved;
+
+extern unsigned long pci_mem_start;
+extern int e820_any_mapped(u64 start, u64 end, unsigned type);
+extern int e820_all_mapped(u64 start, u64 end, unsigned type);
+extern void e820_add_region(u64 start, u64 size, int type);
+extern void e820_print_map(char *who);
+extern int
+sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
+extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type);
+extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
+ int checktype);
+extern void update_e820(void);
+extern void e820_setup_gap(void);
+extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
+ unsigned long start_addr, unsigned long long end_addr);
+struct setup_data;
+extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
+
+#if defined(CONFIG_X86_64) || \
+ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+extern void e820_mark_nosave_regions(unsigned long limit_pfn);
+#else
+static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+}
+#endif
+
+#ifdef CONFIG_MEMTEST
+extern void early_memtest(unsigned long start, unsigned long end);
+#else
+static inline void early_memtest(unsigned long start, unsigned long end)
+{
+}
+#endif
+
+extern unsigned long end_user_pfn;
+
+extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
+extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
+extern void reserve_early(u64 start, u64 end, char *name);
+extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
+extern void free_early(u64 start, u64 end);
+extern void early_res_to_bootmem(u64 start, u64 end);
+extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+
+extern unsigned long e820_end_of_ram_pfn(void);
+extern unsigned long e820_end_of_low_ram_pfn(void);
+extern int e820_find_active_region(const struct e820entry *ei,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ unsigned long *ei_startpfn,
+ unsigned long *ei_endpfn);
+extern void e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn);
+extern u64 e820_hole_size(u64 start, u64 end);
+extern void finish_e820_parsing(void);
+extern void e820_reserve_resources(void);
+extern void e820_reserve_resources_late(void);
+extern void setup_memory_map(void);
+extern char *default_machine_specific_memory_setup(void);
+extern char *machine_specific_memory_setup(void);
+extern char *memory_setup(void);
+#endif /* __KERNEL__ */
+#endif /* __ASSEMBLY__ */
+
+#define ISA_START_ADDRESS 0xa0000
+#define ISA_END_ADDRESS 0x100000
+#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
+
+#define BIOS_BEGIN 0x000a0000
+#define BIOS_END 0x00100000
+
+#ifdef __KERNEL__
+#include <linux/ioport.h>
+
+#define HIGH_MEMORY (1024*1024)
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_E820_H */
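As an aside, the struct e820map layout above makes consumers
straightforward. A hypothetical walker, shown only to illustrate the types
(the real accounting lives in arch/x86/kernel/e820.c):

static u64 e820_total_ram_bytes(void)
{
	u64 total = 0;
	__u32 i;

	for (i = 0; i < e820.nr_map; i++) {
		const struct e820entry *entry = &e820.map[i];

		if (entry->type == E820_RAM)	/* usable RAM only */
			total += entry->size;
	}
	return total;
}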
diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h
new file mode 100644
index 00000000000..e9b57ecc70c
--- /dev/null
+++ b/arch/x86/include/asm/edac.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_EDAC_H
+#define _ASM_X86_EDAC_H
+
+/* ECC atomic, DMA, SMP and interrupt safe scrub function */
+
+static inline void atomic_scrub(void *va, u32 size)
+{
+ u32 i, *virt_addr = va;
+
+ /*
+ * Very carefully read and write to memory atomically so we
+ * are interrupt, DMA and SMP safe.
+ */
+ for (i = 0; i < size / 4; i++, virt_addr++)
+ asm volatile("lock; addl $0, %0"::"m" (*virt_addr));
+}
+
+#endif /* _ASM_X86_EDAC_H */
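The "lock; addl $0" above works because a locked read-modify-write of zero
forces the memory controller to read each word and write it back unchanged,
regenerating its ECC bits. A hedged usage sketch (the helper name is
hypothetical; real callers are the EDAC drivers, and PAGE_SIZE comes from
asm/page.h):

static void scrub_one_page(void *virt)
{
	/* Rewrite every 32-bit word in the page to refresh its ECC. */
	atomic_scrub(virt, PAGE_SIZE);
}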
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
new file mode 100644
index 00000000000..a2e545c91c3
--- /dev/null
+++ b/arch/x86/include/asm/efi.h
@@ -0,0 +1,110 @@
+#ifndef _ASM_X86_EFI_H
+#define _ASM_X86_EFI_H
+
+#ifdef CONFIG_X86_32
+
+extern unsigned long asmlinkage efi_call_phys(void *, ...);
+
+#define efi_call_phys0(f) efi_call_phys(f)
+#define efi_call_phys1(f, a1) efi_call_phys(f, a1)
+#define efi_call_phys2(f, a1, a2) efi_call_phys(f, a1, a2)
+#define efi_call_phys3(f, a1, a2, a3) efi_call_phys(f, a1, a2, a3)
+#define efi_call_phys4(f, a1, a2, a3, a4) \
+ efi_call_phys(f, a1, a2, a3, a4)
+#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
+ efi_call_phys(f, a1, a2, a3, a4, a5)
+#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
+ efi_call_phys(f, a1, a2, a3, a4, a5, a6)
+/*
+ * Wrap all the virtual calls in a way that forces the parameters on the stack.
+ */
+
+#define efi_call_virt(f, args...) \
+ ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
+
+#define efi_call_virt0(f) efi_call_virt(f)
+#define efi_call_virt1(f, a1) efi_call_virt(f, a1)
+#define efi_call_virt2(f, a1, a2) efi_call_virt(f, a1, a2)
+#define efi_call_virt3(f, a1, a2, a3) efi_call_virt(f, a1, a2, a3)
+#define efi_call_virt4(f, a1, a2, a3, a4) \
+ efi_call_virt(f, a1, a2, a3, a4)
+#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
+ efi_call_virt(f, a1, a2, a3, a4, a5)
+#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
+ efi_call_virt(f, a1, a2, a3, a4, a5, a6)
+
+#define efi_ioremap(addr, size) ioremap_cache(addr, size)
+
+#else /* !CONFIG_X86_32 */
+
+#define MAX_EFI_IO_PAGES 100
+
+extern u64 efi_call0(void *fp);
+extern u64 efi_call1(void *fp, u64 arg1);
+extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
+extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3);
+extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4);
+extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3,
+ u64 arg4, u64 arg5);
+extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
+ u64 arg4, u64 arg5, u64 arg6);
+
+#define efi_call_phys0(f) \
+ efi_call0((void *)(f))
+#define efi_call_phys1(f, a1) \
+ efi_call1((void *)(f), (u64)(a1))
+#define efi_call_phys2(f, a1, a2) \
+ efi_call2((void *)(f), (u64)(a1), (u64)(a2))
+#define efi_call_phys3(f, a1, a2, a3) \
+ efi_call3((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3))
+#define efi_call_phys4(f, a1, a2, a3, a4) \
+ efi_call4((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
+ (u64)(a4))
+#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
+ efi_call5((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
+ (u64)(a4), (u64)(a5))
+#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
+ efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
+ (u64)(a4), (u64)(a5), (u64)(a6))
+
+#define efi_call_virt0(f) \
+ efi_call0((void *)(efi.systab->runtime->f))
+#define efi_call_virt1(f, a1) \
+ efi_call1((void *)(efi.systab->runtime->f), (u64)(a1))
+#define efi_call_virt2(f, a1, a2) \
+ efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2))
+#define efi_call_virt3(f, a1, a2, a3) \
+ efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ (u64)(a3))
+#define efi_call_virt4(f, a1, a2, a3, a4) \
+ efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ (u64)(a3), (u64)(a4))
+#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
+ efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ (u64)(a3), (u64)(a4), (u64)(a5))
+#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
+ efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
+
+extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
+
+#endif /* CONFIG_X86_32 */
+
+extern void efi_reserve_early(void);
+extern void efi_call_phys_prelog(void);
+extern void efi_call_phys_epilog(void);
+
+#ifndef CONFIG_EFI
+/*
+ * If EFI is not configured, have the EFI calls return -ENOSYS.
+ */
+#define efi_call0(_f) (-ENOSYS)
+#define efi_call1(_f, _a1) (-ENOSYS)
+#define efi_call2(_f, _a1, _a2) (-ENOSYS)
+#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS)
+#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS)
+#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
+#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
+#endif /* CONFIG_EFI */
+
+#endif /* _ASM_X86_EFI_H */
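A sketch of how a caller might use the virtual-call wrappers above,
assuming the runtime services have been mapped and efi.systab is valid; the
function name and the NULL capabilities argument are illustrative, not from
this patch, and efi_time_t/efi_status_t come from <linux/efi.h>:

static efi_status_t read_firmware_time(efi_time_t *tm)
{
	/* On 64-bit this expands to efi_call2() on runtime->get_time. */
	return efi_call_virt2(get_time, tm, NULL);
}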
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
new file mode 100644
index 00000000000..40ca1bea791
--- /dev/null
+++ b/arch/x86/include/asm/elf.h
@@ -0,0 +1,336 @@
+#ifndef _ASM_X86_ELF_H
+#define _ASM_X86_ELF_H
+
+/*
+ * ELF register definitions..
+ */
+
+#include <asm/ptrace.h>
+#include <asm/user.h>
+#include <asm/auxvec.h>
+
+typedef unsigned long elf_greg_t;
+
+#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+typedef struct user_i387_struct elf_fpregset_t;
+
+#ifdef __i386__
+
+typedef struct user_fxsr_struct elf_fpxregset_t;
+
+#define R_386_NONE 0
+#define R_386_32 1
+#define R_386_PC32 2
+#define R_386_GOT32 3
+#define R_386_PLT32 4
+#define R_386_COPY 5
+#define R_386_GLOB_DAT 6
+#define R_386_JMP_SLOT 7
+#define R_386_RELATIVE 8
+#define R_386_GOTOFF 9
+#define R_386_GOTPC 10
+#define R_386_NUM 11
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_CLASS ELFCLASS32
+#define ELF_DATA ELFDATA2LSB
+#define ELF_ARCH EM_386
+
+#else
+
+/* x86-64 relocation types */
+#define R_X86_64_NONE 0 /* No reloc */
+#define R_X86_64_64 1 /* Direct 64 bit */
+#define R_X86_64_PC32 2 /* PC relative 32 bit signed */
+#define R_X86_64_GOT32 3 /* 32 bit GOT entry */
+#define R_X86_64_PLT32 4 /* 32 bit PLT address */
+#define R_X86_64_COPY 5 /* Copy symbol at runtime */
+#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */
+#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */
+#define R_X86_64_RELATIVE 8 /* Adjust by program base */
+#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative
+ offset to GOT */
+#define R_X86_64_32 10 /* Direct 32 bit zero extended */
+#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
+#define R_X86_64_16 12 /* Direct 16 bit zero extended */
+#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */
+#define R_X86_64_8 14 /* Direct 8 bit sign extended */
+#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */
+
+#define R_X86_64_NUM 16
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_CLASS ELFCLASS64
+#define ELF_DATA ELFDATA2LSB
+#define ELF_ARCH EM_X86_64
+
+#endif
+
+#include <asm/vdso.h>
+
+extern unsigned int vdso_enabled;
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch_ia32(x) \
+ (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#ifdef CONFIG_X86_32
+#include <asm/desc.h>
+
+#define elf_check_arch(x) elf_check_arch_ia32(x)
+
+/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx
+ contains a pointer to a function which might be registered using `atexit'.
+ This provides a means for the dynamic linker to call DT_FINI functions
+ for shared libraries that have been loaded before the code runs.
+
+ A value of 0 tells us we have no such handler.
+
+ We might as well make sure everything else is cleared too (except for %esp),
+ just to make things more deterministic.
+ */
+#define ELF_PLAT_INIT(_r, load_addr) \
+ do { \
+ _r->bx = 0; _r->cx = 0; _r->dx = 0; \
+ _r->si = 0; _r->di = 0; _r->bp = 0; \
+ _r->ax = 0; \
+} while (0)
+
+/*
+ * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
+ * now struct user_regs_struct; they are different)
+ */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+do { \
+ pr_reg[0] = regs->bx; \
+ pr_reg[1] = regs->cx; \
+ pr_reg[2] = regs->dx; \
+ pr_reg[3] = regs->si; \
+ pr_reg[4] = regs->di; \
+ pr_reg[5] = regs->bp; \
+ pr_reg[6] = regs->ax; \
+ pr_reg[7] = regs->ds & 0xffff; \
+ pr_reg[8] = regs->es & 0xffff; \
+ pr_reg[9] = regs->fs & 0xffff; \
+ savesegment(gs, pr_reg[10]); \
+ pr_reg[11] = regs->orig_ax; \
+ pr_reg[12] = regs->ip; \
+ pr_reg[13] = regs->cs & 0xffff; \
+ pr_reg[14] = regs->flags; \
+ pr_reg[15] = regs->sp; \
+ pr_reg[16] = regs->ss & 0xffff; \
+} while (0);
+
+#define ELF_PLATFORM (utsname()->machine)
+#define set_personality_64bit() do { } while (0)
+
+#else /* CONFIG_X86_32 */
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x) \
+ ((x)->e_machine == EM_X86_64)
+
+#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
+
+static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
+{
+ loadsegment(fs, 0);
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
+ load_gs_index(0);
+ regs->ip = ip;
+ regs->sp = sp;
+ regs->flags = X86_EFLAGS_IF;
+ regs->cs = __USER32_CS;
+ regs->ss = __USER32_DS;
+}
+
+static inline void elf_common_init(struct thread_struct *t,
+ struct pt_regs *regs, const u16 ds)
+{
+ regs->ax = regs->bx = regs->cx = regs->dx = 0;
+ regs->si = regs->di = regs->bp = 0;
+ regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
+ regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
+ t->fs = t->gs = 0;
+ t->fsindex = t->gsindex = 0;
+ t->ds = t->es = ds;
+}
+
+#define ELF_PLAT_INIT(_r, load_addr) \
+do { \
+ elf_common_init(&current->thread, _r, 0); \
+ clear_thread_flag(TIF_IA32); \
+} while (0)
+
+#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
+ elf_common_init(&current->thread, regs, __USER_DS)
+
+#define compat_start_thread(regs, ip, sp) \
+do { \
+ start_ia32_thread(regs, ip, sp); \
+ set_fs(USER_DS); \
+} while (0)
+
+#define COMPAT_SET_PERSONALITY(ex) \
+do { \
+ if (test_thread_flag(TIF_IA32)) \
+ clear_thread_flag(TIF_ABI_PENDING); \
+ else \
+ set_thread_flag(TIF_ABI_PENDING); \
+ current->personality |= force_personality32; \
+} while (0)
+
+#define COMPAT_ELF_PLATFORM ("i686")
+
+/*
+ * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
+ * now struct user_regs_struct; they are different). Assumes current is the process
+ * getting dumped.
+ */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+do { \
+ unsigned v; \
+ (pr_reg)[0] = (regs)->r15; \
+ (pr_reg)[1] = (regs)->r14; \
+ (pr_reg)[2] = (regs)->r13; \
+ (pr_reg)[3] = (regs)->r12; \
+ (pr_reg)[4] = (regs)->bp; \
+ (pr_reg)[5] = (regs)->bx; \
+ (pr_reg)[6] = (regs)->r11; \
+ (pr_reg)[7] = (regs)->r10; \
+ (pr_reg)[8] = (regs)->r9; \
+ (pr_reg)[9] = (regs)->r8; \
+ (pr_reg)[10] = (regs)->ax; \
+ (pr_reg)[11] = (regs)->cx; \
+ (pr_reg)[12] = (regs)->dx; \
+ (pr_reg)[13] = (regs)->si; \
+ (pr_reg)[14] = (regs)->di; \
+ (pr_reg)[15] = (regs)->orig_ax; \
+ (pr_reg)[16] = (regs)->ip; \
+ (pr_reg)[17] = (regs)->cs; \
+ (pr_reg)[18] = (regs)->flags; \
+ (pr_reg)[19] = (regs)->sp; \
+ (pr_reg)[20] = (regs)->ss; \
+ (pr_reg)[21] = current->thread.fs; \
+ (pr_reg)[22] = current->thread.gs; \
+ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
+ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
+ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
+ asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \
+} while (0);
+
+/* I'm not sure if we can use '-' here */
+#define ELF_PLATFORM ("x86_64")
+extern void set_personality_64bit(void);
+extern unsigned int sysctl_vsyscall32;
+extern int force_personality32;
+
+#endif /* !CONFIG_X86_32 */
+
+#define CORE_DUMP_USE_REGSET
+#define USE_ELF_CORE_DUMP
+#define ELF_EXEC_PAGESIZE 4096
+
+/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
+ use of this is to invoke "./ld.so someprog" to test out a new version of
+ the loader. We need to make sure that it is out of the way of the program
+ that it will "exec", and that there is sufficient room for the brk. */
+
+#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
+
+/* This yields a mask that user programs can use to figure out what
+ instruction set this CPU supports. This could be done in user space,
+ but it's not easy, and we've already done it here. */
+
+#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
+
+/* This yields a string that ld.so will use to load implementation
+ specific libraries for optimization. This is more specific in
+ intent than poking at uname or /proc/cpuinfo.
+
+ For the moment, we have only optimizations for the Intel generations,
+ but that could change... */
+
+#define SET_PERSONALITY(ex) set_personality_64bit()
+
+/*
+ * An executable for which elf_read_implies_exec() returns TRUE will
+ * have the READ_IMPLIES_EXEC personality flag set automatically.
+ */
+#define elf_read_implies_exec(ex, executable_stack) \
+ (executable_stack != EXSTACK_DISABLE_X)
+
+struct task_struct;
+
+#define ARCH_DLINFO_IA32(vdso_enabled) \
+do { \
+ if (vdso_enabled) { \
+ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
+ } \
+} while (0)
+
+#ifdef CONFIG_X86_32
+
+#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
+
+#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
+
+/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
+
+#else /* CONFIG_X86_32 */
+
+#define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */
+
+/* 1GB for 64bit, 8MB for 32bit */
+#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff)
+
+#define ARCH_DLINFO \
+do { \
+ if (vdso_enabled) \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, \
+ (unsigned long)current->mm->context.vdso); \
+} while (0)
+
+#define AT_SYSINFO 32
+
+#define COMPAT_ARCH_DLINFO ARCH_DLINFO_IA32(sysctl_vsyscall32)
+
+#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
+
+#endif /* !CONFIG_X86_32 */
+
+#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
+
+#define VDSO_ENTRY \
+ ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall))
+
+struct linux_binprm;
+
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+ int executable_stack);
+
+extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
+#define compat_arch_setup_additional_pages syscall32_setup_pages
+
+extern unsigned long arch_randomize_brk(struct mm_struct *mm);
+#define arch_randomize_brk arch_randomize_brk
+
+#endif /* _ASM_X86_ELF_H */
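For illustration, this is roughly how a loader consults the arch check
defined above; the helper is hypothetical (the real logic lives in
fs/binfmt_elf.c) and assumes ELFMAG/SELFMAG from <linux/elf.h>:

static int elf_loadable(const struct elfhdr *hdr)
{
	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0)
		return 0;		/* not an ELF image at all */
	return elf_check_arch(hdr);	/* right e_machine for this kernel? */
}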
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
new file mode 100644
index 00000000000..94826cf8745
--- /dev/null
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_EMERGENCY_RESTART_H
+#define _ASM_X86_EMERGENCY_RESTART_H
+
+enum reboot_type {
+ BOOT_TRIPLE = 't',
+ BOOT_KBD = 'k',
+#ifdef CONFIG_X86_32
+ BOOT_BIOS = 'b',
+#endif
+ BOOT_ACPI = 'a',
+ BOOT_EFI = 'e'
+};
+
+extern enum reboot_type reboot_type;
+
+extern void machine_emergency_restart(void);
+
+#endif /* _ASM_X86_EMERGENCY_RESTART_H */
diff --git a/arch/x86/include/asm/errno.h b/arch/x86/include/asm/errno.h
new file mode 100644
index 00000000000..4c82b503d92
--- /dev/null
+++ b/arch/x86/include/asm/errno.h
@@ -0,0 +1 @@
+#include <asm-generic/errno.h>
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
new file mode 100644
index 00000000000..380f0b4f17e
--- /dev/null
+++ b/arch/x86/include/asm/es7000/apic.h
@@ -0,0 +1,193 @@
+#ifndef __ASM_ES7000_APIC_H
+#define __ASM_ES7000_APIC_H
+
+#define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu)
+#define esr_disable (1)
+
+static inline int apic_id_registered(void)
+{
+ return (1);
+}
+
+static inline cpumask_t target_cpus(void)
+{
+#if defined CONFIG_ES7000_CLUSTERED_APIC
+ return CPU_MASK_ALL;
+#else
+ return cpumask_of_cpu(smp_processor_id());
+#endif
+}
+
+#if defined CONFIG_ES7000_CLUSTERED_APIC
+#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+#define INT_DELIVERY_MODE (dest_LowestPrio)
+#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */
+#define NO_BALANCE_IRQ (1)
+#undef WAKE_SECONDARY_VIA_INIT
+#define WAKE_SECONDARY_VIA_MIP
+#else
+#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+#define INT_DELIVERY_MODE (dest_Fixed)
+#define INT_DEST_MODE (0) /* phys delivery to target procs */
+#define NO_BALANCE_IRQ (0)
+#undef APIC_DEST_LOGICAL
+#define APIC_DEST_LOGICAL 0x0
+#define WAKE_SECONDARY_VIA_INIT
+#endif
+
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return 0;
+}
+static inline unsigned long check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+#define apicid_cluster(apicid) (apicid & 0xF0)
+
+static inline unsigned long calculate_ldr(int cpu)
+{
+ unsigned long id;
+ id = xapic_phys_to_log_apicid(cpu);
+ return (SET_APIC_LOGICAL_ID(id));
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends setting DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static inline void init_apic_ldr(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+#ifndef CONFIG_X86_GENERICARCH
+extern void enable_apic_mode(void);
+#endif
+
+extern int apic_version [MAX_APICS];
+static inline void setup_apic_routing(void)
+{
+ int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
+ printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
+ (apic_version[apic] == 0x14) ?
+ "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]);
+}
+
+static inline int multi_timer_check(int apic, int irq)
+{
+ return 0;
+}
+
+static inline int apicid_to_node(int logical_apicid)
+{
+ return 0;
+}
+
+
+static inline int cpu_present_to_apicid(int mps_cpu)
+{
+ if (!mps_cpu)
+ return boot_cpu_physical_apicid;
+ else if (mps_cpu < NR_CPUS)
+ return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
+{
+ static int id = 0;
+ physid_mask_t mask;
+ mask = physid_mask_of_physid(id);
+ ++id;
+ return mask;
+}
+
+extern u8 cpu_2_logical_apicid[];
+/* Mapping from cpu number to logical apicid */
+static inline int cpu_to_logical_apicid(int cpu)
+{
+#ifdef CONFIG_SMP
+ if (cpu >= NR_CPUS)
+ return BAD_APICID;
+ return (int)cpu_2_logical_apicid[cpu];
+#else
+ return logical_smp_processor_id();
+#endif
+}
+
+static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0xff);
+}
+
+
+static inline void setup_portio_remap(void)
+{
+}
+
+extern unsigned int boot_cpu_physical_apicid;
+static inline int check_phys_apicid_present(int cpu_physical_apicid)
+{
+ boot_cpu_physical_apicid = read_apic_id();
+ return (1);
+}
+
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int num_bits_set;
+ int cpus_found = 0;
+ int cpu;
+ int apicid;
+
+ num_bits_set = cpus_weight(cpumask);
+ /* Return id to all */
+ if (num_bits_set == NR_CPUS)
+#if defined CONFIG_ES7000_CLUSTERED_APIC
+ return 0xFF;
+#else
+ return cpu_to_logical_apicid(0);
+#endif
+ /*
+ * The cpus in the mask must all be in the same APIC cluster. If they
+ * are not on the same apicid cluster, return the default value of
+ * TARGET_CPUS.
+ */
+ cpu = first_cpu(cpumask);
+ apicid = cpu_to_logical_apicid(cpu);
+ while (cpus_found < num_bits_set) {
+ if (cpu_isset(cpu, cpumask)) {
+ int new_apicid = cpu_to_logical_apicid(cpu);
+ if (apicid_cluster(apicid) !=
+ apicid_cluster(new_apicid)){
+ printk ("%s: Not a valid mask!\n", __func__);
+#if defined CONFIG_ES7000_CLUSTERED_APIC
+ return 0xFF;
+#else
+ return cpu_to_logical_apicid(0);
+#endif
+ }
+ apicid = new_apicid;
+ cpus_found++;
+ }
+ cpu++;
+ }
+ return apicid;
+}
+
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+#endif /* __ASM_ES7000_APIC_H */
diff --git a/arch/x86/include/asm/es7000/apicdef.h b/arch/x86/include/asm/es7000/apicdef.h
new file mode 100644
index 00000000000..8b234a3cb85
--- /dev/null
+++ b/arch/x86/include/asm/es7000/apicdef.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_ES7000_APICDEF_H
+#define __ASM_ES7000_APICDEF_H
+
+#define APIC_ID_MASK (0xFF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+ return (((x)>>24)&0xFF);
+}
+
+#define GET_APIC_ID(x) get_apic_id(x)
+
+#endif
diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h
new file mode 100644
index 00000000000..632a955fcc0
--- /dev/null
+++ b/arch/x86/include/asm/es7000/ipi.h
@@ -0,0 +1,24 @@
+#ifndef __ASM_ES7000_IPI_H
+#define __ASM_ES7000_IPI_H
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector);
+
+static inline void send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_sequence(mask, vector);
+}
+
+static inline void send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+ if (!cpus_empty(mask))
+ send_IPI_mask(mask, vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+ send_IPI_mask(cpu_online_map, vector);
+}
+
+#endif /* __ASM_ES7000_IPI_H */
diff --git a/arch/x86/include/asm/es7000/mpparse.h b/arch/x86/include/asm/es7000/mpparse.h
new file mode 100644
index 00000000000..ed5a3caae14
--- /dev/null
+++ b/arch/x86/include/asm/es7000/mpparse.h
@@ -0,0 +1,30 @@
+#ifndef __ASM_ES7000_MPPARSE_H
+#define __ASM_ES7000_MPPARSE_H
+
+#include <linux/acpi.h>
+
+extern int parse_unisys_oem (char *oemptr);
+extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
+extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
+extern void setup_unisys(void);
+
+#ifndef CONFIG_X86_GENERICARCH
+extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
+extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid);
+#endif
+
+#ifdef CONFIG_ACPI
+
+static inline int es7000_check_dsdt(void)
+{
+ struct acpi_table_header header;
+
+ if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
+ !strncmp(header.oem_id, "UNISYS", 6))
+ return 1;
+ return 0;
+}
+#endif
+
+#endif /* __ASM_ES7000_MPPARSE_H */
diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h
new file mode 100644
index 00000000000..3ffc5a7bf66
--- /dev/null
+++ b/arch/x86/include/asm/es7000/wakecpu.h
@@ -0,0 +1,59 @@
+#ifndef __ASM_ES7000_WAKECPU_H
+#define __ASM_ES7000_WAKECPU_H
+
+/*
+ * This file copes with machines that wake up secondary CPUs by the
+ * INIT, INIT, STARTUP sequence.
+ */
+
+#ifdef CONFIG_ES7000_CLUSTERED_APIC
+#define WAKE_SECONDARY_VIA_MIP
+#else
+#define WAKE_SECONDARY_VIA_INIT
+#endif
+
+#ifdef WAKE_SECONDARY_VIA_MIP
+extern int es7000_start_cpu(int cpu, unsigned long eip);
+static inline int
+wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+{
+ int boot_error = 0;
+ boot_error = es7000_start_cpu(phys_apicid, start_eip);
+ return boot_error;
+}
+#endif
+
+#define TRAMPOLINE_LOW phys_to_virt(0x467)
+#define TRAMPOLINE_HIGH phys_to_virt(0x469)
+
+#define boot_cpu_apicid boot_cpu_physical_apicid
+
+static inline void wait_for_init_deassert(atomic_t *deassert)
+{
+#ifdef WAKE_SECONDARY_VIA_INIT
+ while (!atomic_read(deassert))
+ cpu_relax();
+#endif
+ return;
+}
+
+/* Nothing to do for most platforms, since cleared by the INIT cycle */
+static inline void smp_callin_clear_local_apic(void)
+{
+}
+
+static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
+{
+}
+
+static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
+{
+}
+
+#if APIC_DEBUG
+ #define inquire_remote_apic(apicid) __inquire_remote_apic(apicid)
+#else
+ #define inquire_remote_apic(apicid) {}
+#endif
+
+#endif /* __ASM_ES7000_WAKECPU_H */
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
new file mode 100644
index 00000000000..53018464aea
--- /dev/null
+++ b/arch/x86/include/asm/fb.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_FB_H
+#define _ASM_X86_FB_H
+
+#include <linux/fb.h>
+#include <linux/fs.h>
+#include <asm/page.h>
+
+static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
+ unsigned long off)
+{
+ if (boot_cpu_data.x86 > 3)
+ pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
+}
+
+#ifdef CONFIG_X86_32
+extern int fb_is_primary_device(struct fb_info *info);
+#else
+static inline int fb_is_primary_device(struct fb_info *info) { return 0; }
+#endif
+
+#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fcntl.h b/arch/x86/include/asm/fcntl.h
new file mode 100644
index 00000000000..46ab12db573
--- /dev/null
+++ b/arch/x86/include/asm/fcntl.h
@@ -0,0 +1 @@
+#include <asm-generic/fcntl.h>
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
new file mode 100644
index 00000000000..8668a94f850
--- /dev/null
+++ b/arch/x86/include/asm/fixmap.h
@@ -0,0 +1,68 @@
+#ifndef _ASM_X86_FIXMAP_H
+#define _ASM_X86_FIXMAP_H
+
+#ifdef CONFIG_X86_32
+# include "fixmap_32.h"
+#else
+# include "fixmap_64.h"
+#endif
+
+extern int fixmaps_set;
+
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
+void native_set_fixmap(enum fixed_addresses idx,
+ unsigned long phys, pgprot_t flags);
+
+#ifndef CONFIG_PARAVIRT
+static inline void __set_fixmap(enum fixed_addresses idx,
+ unsigned long phys, pgprot_t flags)
+{
+ native_set_fixmap(idx, phys, flags);
+}
+#endif
+
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL)
+
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#define clear_fixmap(idx) \
+ __set_fixmap(idx, 0, __pgprot(0))
+
+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-dereference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+ /*
+ * This branch gets completely eliminated after inlining,
+ * except when someone tries to use fixaddr indices in an
+ * illegal way (such as mixing up address types or using
+ * out-of-range indices).
+ *
+ * If it doesn't get removed, the linker will complain
+ * loudly with a reasonably clear error message.
+ */
+ if (idx >= __end_of_fixed_addresses)
+ __this_fixmap_does_not_exist();
+
+ return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+ return __virt_to_fix(vaddr);
+}
+#endif /* _ASM_X86_FIXMAP_H */
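Putting the helpers together: a hypothetical user maps a physical page at a
fixed slot and translates the index back to a virtual address. FIX_EXAMPLE
stands in for a real enum fixed_addresses entry, which a user would add to
the per-arch enum; this sketch is not part of the patch:

static void __iomem *map_example_page(unsigned long phys)
{
	/* Uncached mapping, e.g. for a device register page. */
	set_fixmap_nocache(FIX_EXAMPLE, phys);
	return (void __iomem *)fix_to_virt(FIX_EXAMPLE);
}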
diff --git a/arch/x86/include/asm/fixmap_32.h b/arch/x86/include/asm/fixmap_32.h
new file mode 100644
index 00000000000..09f29ab5c13
--- /dev/null
+++ b/arch/x86/include/asm/fixmap_32.h
@@ -0,0 +1,123 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#ifndef _ASM_X86_FIXMAP_32_H
+#define _ASM_X86_FIXMAP_32_H
+
+
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
+#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
+
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#ifdef CONFIG_HIGHMEM
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#endif
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process. We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also, this lets us do fail-safe vmalloc(): we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx, phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
+ FIX_HOLE,
+ FIX_VDSO,
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_X86_LOCAL_APIC
+ FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+ FIX_CO_CPU, /* Cobalt timer */
+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
+#endif
+#ifdef CONFIG_X86_F00F_BUG
+ FIX_F00F_IDT, /* Virtual mapping for IDT */
+#endif
+#ifdef CONFIG_X86_CYCLONE_TIMER
+ FIX_CYCLONE_TIMER, /* Cyclone timer register */
+#endif
+#ifdef CONFIG_HIGHMEM
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#endif
+#ifdef CONFIG_PCI_MMCONFIG
+ FIX_PCIE_MCFG,
+#endif
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#endif
+ __end_of_permanent_fixed_addresses,
+ /*
+ * 256 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * We round it up to the next 256 pages boundary so that we
+ * can have a single pgd entry and a single pte table:
+ */
+#define NR_FIX_BTMAPS 64
+#define FIX_BTMAPS_SLOTS 4
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+ (__end_of_permanent_fixed_addresses & 255),
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+ FIX_WP_TEST,
+#ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
+ __end_of_fixed_addresses
+};
+
+extern void reserve_top_address(unsigned long reserve);
+
+
+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
+
+#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
+#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_FIXMAP_32_H */
diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h
new file mode 100644
index 00000000000..00a30ab9b1a
--- /dev/null
+++ b/arch/x86/include/asm/fixmap_64.h
@@ -0,0 +1,83 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ */
+
+#ifndef _ASM_X86_FIXMAP_64_H
+#define _ASM_X86_FIXMAP_64_H
+
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#include <asm/vsyscall.h>
+#include <asm/efi.h>
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+
+enum fixed_addresses {
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VSYSCALL_HPET,
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+ FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+ FIX_EFI_IO_MAP_LAST_PAGE,
+ FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
+ + MAX_EFI_IO_PAGES - 1,
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#endif
+ __end_of_permanent_fixed_addresses,
+#ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
+ /*
+ * 256 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * We round it up to the next 256 pages boundary so that we
+ * can have a single pgd entry and a single pte table:
+ */
+#define NR_FIX_BTMAPS 64
+#define FIX_BTMAPS_SLOTS 4
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+ (__end_of_permanent_fixed_addresses & 255),
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+ __end_of_fixed_addresses
+};
+
+#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
+#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+
+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
+#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
+#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
+
+#endif /* _ASM_X86_FIXMAP_64_H */
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
new file mode 100644
index 00000000000..dbe82a5c5ea
--- /dev/null
+++ b/arch/x86/include/asm/floppy.h
@@ -0,0 +1,281 @@
+/*
+ * Architecture specific parts of the Floppy driver
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1995
+ */
+#ifndef _ASM_X86_FLOPPY_H
+#define _ASM_X86_FLOPPY_H
+
+#include <linux/vmalloc.h>
+
+/*
+ * The DMA channel used by the floppy controller cannot access data at
+ * addresses >= 16MB
+ *
+ * We went back to the 1MB limit, as some people had problems with the floppy
+ * driver otherwise. It doesn't matter much for performance anyway, as most
+ * floppy accesses go through the track buffer.
+ */
+#define _CROSS_64KB(a, s, vdma) \
+ (!(vdma) && \
+ ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
+
+#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1)
+
+
+#define SW fd_routine[use_virtual_dma & 1]
+#define CSW fd_routine[can_use_virtual_dma & 1]
+
+
+#define fd_inb(port) inb_p(port)
+#define fd_outb(value, port) outb_p(value, port)
+
+#define fd_request_dma() CSW._request_dma(FLOPPY_DMA, "floppy")
+#define fd_free_dma() CSW._free_dma(FLOPPY_DMA)
+#define fd_enable_irq() enable_irq(FLOPPY_IRQ)
+#define fd_disable_irq() disable_irq(FLOPPY_IRQ)
+#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL)
+#define fd_get_dma_residue() SW._get_dma_residue(FLOPPY_DMA)
+#define fd_dma_mem_alloc(size) SW._dma_mem_alloc(size)
+#define fd_dma_setup(addr, size, mode, io) SW._dma_setup(addr, size, mode, io)
+
+#define FLOPPY_CAN_FALLBACK_ON_NODMA
+
+static int virtual_dma_count;
+static int virtual_dma_residue;
+static char *virtual_dma_addr;
+static int virtual_dma_mode;
+static int doing_pdma;
+
+static irqreturn_t floppy_hardint(int irq, void *dev_id)
+{
+ unsigned char st;
+
+#undef TRACE_FLPY_INT
+
+#ifdef TRACE_FLPY_INT
+ static int calls;
+ static int bytes;
+ static int dma_wait;
+#endif
+ if (!doing_pdma)
+ return floppy_interrupt(irq, dev_id);
+
+#ifdef TRACE_FLPY_INT
+ if (!calls)
+ bytes = virtual_dma_count;
+#endif
+
+ {
+ int lcount;
+ char *lptr;
+
+ st = 1;
+ for (lcount = virtual_dma_count, lptr = virtual_dma_addr;
+ lcount; lcount--, lptr++) {
+ st = inb(virtual_dma_port + 4) & 0xa0;
+ if (st != 0xa0)
+ break;
+ if (virtual_dma_mode)
+ outb_p(*lptr, virtual_dma_port + 5);
+ else
+ *lptr = inb_p(virtual_dma_port + 5);
+ }
+ virtual_dma_count = lcount;
+ virtual_dma_addr = lptr;
+ st = inb(virtual_dma_port + 4);
+ }
+
+#ifdef TRACE_FLPY_INT
+ calls++;
+#endif
+ if (st == 0x20)
+ return IRQ_HANDLED;
+ if (!(st & 0x20)) {
+ virtual_dma_residue += virtual_dma_count;
+ virtual_dma_count = 0;
+#ifdef TRACE_FLPY_INT
+ printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
+ virtual_dma_count, virtual_dma_residue, calls, bytes,
+ dma_wait);
+ calls = 0;
+ dma_wait = 0;
+#endif
+ doing_pdma = 0;
+ floppy_interrupt(irq, dev_id);
+ return IRQ_HANDLED;
+ }
+#ifdef TRACE_FLPY_INT
+ if (!virtual_dma_count)
+ dma_wait++;
+#endif
+ return IRQ_HANDLED;
+}
+
+static void fd_disable_dma(void)
+{
+ if (!(can_use_virtual_dma & 1))
+ disable_dma(FLOPPY_DMA);
+ doing_pdma = 0;
+ virtual_dma_residue += virtual_dma_count;
+ virtual_dma_count = 0;
+}
+
+static int vdma_request_dma(unsigned int dmanr, const char *device_id)
+{
+ return 0;
+}
+
+static void vdma_nop(unsigned int dummy)
+{
+}
+
+
+static int vdma_get_dma_residue(unsigned int dummy)
+{
+ return virtual_dma_count + virtual_dma_residue;
+}
+
+
+static int fd_request_irq(void)
+{
+ if (can_use_virtual_dma)
+ return request_irq(FLOPPY_IRQ, floppy_hardint,
+ IRQF_DISABLED, "floppy", NULL);
+ else
+ return request_irq(FLOPPY_IRQ, floppy_interrupt,
+ IRQF_DISABLED, "floppy", NULL);
+}
+
+static unsigned long dma_mem_alloc(unsigned long size)
+{
+ return __get_dma_pages(GFP_KERNEL|__GFP_NORETRY, get_order(size));
+}
+
+
+static unsigned long vdma_mem_alloc(unsigned long size)
+{
+ return (unsigned long)vmalloc(size);
+
+}
+
+#define nodma_mem_alloc(size) vdma_mem_alloc(size)
+
+static void _fd_dma_mem_free(unsigned long addr, unsigned long size)
+{
+ if ((unsigned long)addr >= (unsigned long)high_memory)
+ vfree((void *)addr);
+ else
+ free_pages(addr, get_order(size));
+}
+
+#define fd_dma_mem_free(addr, size) _fd_dma_mem_free(addr, size)
+
+static void _fd_chose_dma_mode(char *addr, unsigned long size)
+{
+ if (can_use_virtual_dma == 2) {
+ if ((unsigned long)addr >= (unsigned long)high_memory ||
+ isa_virt_to_bus(addr) >= 0x1000000 ||
+ _CROSS_64KB(addr, size, 0))
+ use_virtual_dma = 1;
+ else
+ use_virtual_dma = 0;
+ } else {
+ use_virtual_dma = can_use_virtual_dma & 1;
+ }
+}
+
+#define fd_chose_dma_mode(addr, size) _fd_chose_dma_mode(addr, size)
+
+
+static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
+{
+ doing_pdma = 1;
+ virtual_dma_port = io;
+ virtual_dma_mode = (mode == DMA_MODE_WRITE);
+ virtual_dma_addr = addr;
+ virtual_dma_count = size;
+ virtual_dma_residue = 0;
+ return 0;
+}
+
+static int hard_dma_setup(char *addr, unsigned long size, int mode, int io)
+{
+#ifdef FLOPPY_SANITY_CHECK
+ if (CROSS_64KB(addr, size)) {
+ printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size);
+ return -1;
+ }
+#endif
+ /* actual, physical DMA */
+ doing_pdma = 0;
+ clear_dma_ff(FLOPPY_DMA);
+ set_dma_mode(FLOPPY_DMA, mode);
+ set_dma_addr(FLOPPY_DMA, isa_virt_to_bus(addr));
+ set_dma_count(FLOPPY_DMA, size);
+ enable_dma(FLOPPY_DMA);
+ return 0;
+}
+
+static struct fd_routine_l {
+ int (*_request_dma)(unsigned int dmanr, const char *device_id);
+ void (*_free_dma)(unsigned int dmanr);
+ int (*_get_dma_residue)(unsigned int dummy);
+ unsigned long (*_dma_mem_alloc)(unsigned long size);
+ int (*_dma_setup)(char *addr, unsigned long size, int mode, int io);
+} fd_routine[] = {
+ {
+ request_dma,
+ free_dma,
+ get_dma_residue,
+ dma_mem_alloc,
+ hard_dma_setup
+ },
+ {
+ vdma_request_dma,
+ vdma_nop,
+ vdma_get_dma_residue,
+ vdma_mem_alloc,
+ vdma_dma_setup
+ }
+};
+
+
+static int FDC1 = 0x3f0;
+static int FDC2 = -1;
+
+/*
+ * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
+ * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
+ * coincides with another rtc CMOS user. Paul G.
+ */
+#define FLOPPY0_TYPE \
+({ \
+ unsigned long flags; \
+ unsigned char val; \
+ spin_lock_irqsave(&rtc_lock, flags); \
+ val = (CMOS_READ(0x10) >> 4) & 15; \
+ spin_unlock_irqrestore(&rtc_lock, flags); \
+ val; \
+})
+
+#define FLOPPY1_TYPE \
+({ \
+ unsigned long flags; \
+ unsigned char val; \
+ spin_lock_irqsave(&rtc_lock, flags); \
+ val = CMOS_READ(0x10) & 15; \
+ spin_unlock_irqrestore(&rtc_lock, flags); \
+ val; \
+})
+
+#define N_FDC 2
+#define N_DRIVE 8
+
+#define EXTRA_FLOPPY_PARAMS
+
+#endif /* _ASM_X86_FLOPPY_H */
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
new file mode 100644
index 00000000000..06850a7194e
--- /dev/null
+++ b/arch/x86/include/asm/frame.h
@@ -0,0 +1,27 @@
+#ifdef __ASSEMBLY__
+
+#include <asm/dwarf2.h>
+
+/* The annotation hides the frame from the unwinder and makes it look
+ like an ordinary ebp save/restore. This avoids some special cases for
+ the frame pointer code later. */
+#ifdef CONFIG_FRAME_POINTER
+ .macro FRAME
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp,0
+ movl %esp,%ebp
+ .endm
+ .macro ENDFRAME
+ popl %ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebp
+ .endm
+#else
+ .macro FRAME
+ .endm
+ .macro ENDFRAME
+ .endm
+#endif
+
+#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
new file mode 100644
index 00000000000..47f7e65e6c1
--- /dev/null
+++ b/arch/x86/include/asm/ftrace.h
@@ -0,0 +1,24 @@
+#ifndef _ASM_X86_FTRACE_H
+#define _ASM_X86_FTRACE_H
+
+#ifdef CONFIG_FTRACE
+#define MCOUNT_ADDR ((long)(mcount))
+#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void mcount(void);
+
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ /*
+ * A call to mcount is encoded as "e8 <4-byte offset>".
+ * addr points to the 4-byte offset, and the caller of this
+ * function wants a pointer to the e8 opcode. Simply subtract one.
+ */
+ return addr - 1;
+}
+#endif
+
+#endif /* CONFIG_FTRACE */
+
+#endif /* _ASM_X86_FTRACE_H */
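To make the arithmetic concrete, a worked example with invented addresses
(not from the patch). For a probed call site

	c01000a0:  e8 5b 04 00 00    call mcount

the recorded addr is 0xc01000a1, the first byte of the 4-byte offset;
ftrace_call_adjust(0xc01000a1) returns 0xc01000a0, the address of the e8
opcode itself.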
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
new file mode 100644
index 00000000000..1f11ce44e95
--- /dev/null
+++ b/arch/x86/include/asm/futex.h
@@ -0,0 +1,140 @@
+#ifndef _ASM_X86_FUTEX_H
+#define _ASM_X86_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <linux/uaccess.h>
+
+#include <asm/asm.h>
+#include <asm/errno.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
+ asm volatile("1:\t" insn "\n" \
+ "2:\t.section .fixup,\"ax\"\n" \
+ "3:\tmov\t%3, %1\n" \
+ "\tjmp\t2b\n" \
+ "\t.previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
+ : "i" (-EFAULT), "0" (oparg), "1" (0))
+
+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
+ asm volatile("1:\tmovl %2, %0\n" \
+ "\tmovl\t%0, %3\n" \
+ "\t" insn "\n" \
+ "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \
+ "\tjnz\t1b\n" \
+ "3:\t.section .fixup,\"ax\"\n" \
+ "4:\tmov\t%5, %1\n" \
+ "\tjmp\t3b\n" \
+ "\t.previous\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ : "=&a" (oldval), "=&r" (ret), \
+ "+m" (*uaddr), "=&r" (tem) \
+ : "r" (oparg), "i" (-EFAULT), "1" (0))
+
+static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+{
+ int op = (encoded_op >> 28) & 7;
+ int cmp = (encoded_op >> 24) & 15;
+ int oparg = (encoded_op << 8) >> 20;
+ int cmparg = (encoded_op << 20) >> 20;
+ int oldval = 0, ret, tem;
+
+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+ oparg = 1 << oparg;
+
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+ return -EFAULT;
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
+ /* Real i386 machines can only support FUTEX_OP_SET */
+ if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3)
+ return -ENOSYS;
+#endif
+
+ pagefault_disable();
+
+ switch (op) {
+ case FUTEX_OP_SET:
+ __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_ADD:
+ __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
+ uaddr, oparg);
+ break;
+ case FUTEX_OP_OR:
+ __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_ANDN:
+ __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
+ break;
+ case FUTEX_OP_XOR:
+ __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ pagefault_enable();
+
+ if (!ret) {
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ:
+ ret = (oldval == cmparg);
+ break;
+ case FUTEX_OP_CMP_NE:
+ ret = (oldval != cmparg);
+ break;
+ case FUTEX_OP_CMP_LT:
+ ret = (oldval < cmparg);
+ break;
+ case FUTEX_OP_CMP_GE:
+ ret = (oldval >= cmparg);
+ break;
+ case FUTEX_OP_CMP_LE:
+ ret = (oldval <= cmparg);
+ break;
+ case FUTEX_OP_CMP_GT:
+ ret = (oldval > cmparg);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+ }
+ return ret;
+}
+
+static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
+ int newval)
+{
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
+ /* Real i386 machines have no cmpxchg instruction */
+ if (boot_cpu_data.x86 == 3)
+ return -ENOSYS;
+#endif
+
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+ return -EFAULT;
+
+ asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n"
+ "2:\t.section .fixup, \"ax\"\n"
+ "3:\tmov %2, %0\n"
+ "\tjmp 2b\n"
+ "\t.previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : "=a" (oldval), "+m" (*uaddr)
+ : "i" (-EFAULT), "r" (newval), "0" (oldval)
+ : "memory"
+ );
+
+ return oldval;
+}
+
+#endif
+#endif /* _ASM_X86_FUTEX_H */
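The decode at the top of futex_atomic_op_inuser() implies the following
packing, which mirrors the FUTEX_OP() macro from <linux/futex.h>; the
helper below is a sketch for illustration only:

static int encode_futex_op(int op, int cmp, int oparg, int cmparg)
{
	/* op: bits 28-31, cmp: bits 24-27, oparg: bits 12-23,
	 * cmparg: bits 0-11 (oparg/cmparg are sign-extended on decode). */
	return ((op & 0xf) << 28) | ((cmp & 0xf) << 24) |
	       ((oparg & 0xfff) << 12) | (cmparg & 0xfff);
}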
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
new file mode 100644
index 00000000000..74252264433
--- /dev/null
+++ b/arch/x86/include/asm/gart.h
@@ -0,0 +1,73 @@
+#ifndef _ASM_X86_GART_H
+#define _ASM_X86_GART_H
+
+#include <asm/e820.h>
+
+extern void set_up_gart_resume(u32, u32);
+
+extern int fallback_aper_order;
+extern int fallback_aper_force;
+extern int fix_aperture;
+
+/* PTE bits. */
+#define GPTE_VALID 1
+#define GPTE_COHERENT 2
+
+/* Aperture control register bits. */
+#define GARTEN (1<<0)
+#define DISGARTCPU (1<<4)
+#define DISGARTIO (1<<5)
+
+/* GART cache control register bits. */
+#define INVGART (1<<0)
+#define GARTPTEERR (1<<1)
+
+/* K8 On-cpu GART registers */
+#define AMD64_GARTAPERTURECTL 0x90
+#define AMD64_GARTAPERTUREBASE 0x94
+#define AMD64_GARTTABLEBASE 0x98
+#define AMD64_GARTCACHECTL 0x9c
+#define AMD64_GARTEN (1<<0)
+
+extern int agp_amd64_init(void);
+
+static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
+{
+ u32 tmp, ctl;
+
+ /* address of the mappings table */
+ addr >>= 12;
+ tmp = (u32) addr<<4;
+ tmp &= ~0xf;
+ pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
+
+ /* Enable GART translation for this hammer. */
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
+ ctl |= GARTEN;
+ ctl &= ~(DISGARTCPU | DISGARTIO);
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
+static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
+{
+ if (!aper_base)
+ return 0;
+
+ if (aper_base + aper_size > 0x100000000ULL) {
+ printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n");
+ return 0;
+ }
+ if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+ printk(KERN_INFO "Aperture pointing to e820 RAM. Ignoring.\n");
+ return 0;
+ }
+ if (aper_size < min_size) {
+ printk(KERN_INFO "Aperture too small (%d MB) than (%d MB)\n",
+ aper_size>>20, min_size>>20);
+ return 0;
+ }
+
+ return 1;
+}
+
+#endif /* _ASM_X86_GART_H */
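A hedged sketch combining the two helpers above; dev, the aperture
geometry and the PTE table address would come from probing the
northbridge and are assumed here, as is -ENODEV from <linux/errno.h>:

static int gart_bringup(struct pci_dev *dev, u64 aper_base, u32 aper_size,
			u64 table_phys)
{
	if (!aperture_valid(aper_base, aper_size, 32U << 20))
		return -ENODEV;	/* insist on a 32 MB minimum here */

	enable_gart_translation(dev, table_phys);	/* table_phys: GART PTEs */
	return 0;
}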
diff --git a/arch/x86/include/asm/genapic.h b/arch/x86/include/asm/genapic.h
new file mode 100644
index 00000000000..d48bee663a6
--- /dev/null
+++ b/arch/x86/include/asm/genapic.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "genapic_32.h"
+#else
+# include "genapic_64.h"
+#endif
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
new file mode 100644
index 00000000000..5cbd4fcc06f
--- /dev/null
+++ b/arch/x86/include/asm/genapic_32.h
@@ -0,0 +1,126 @@
+#ifndef _ASM_X86_GENAPIC_32_H
+#define _ASM_X86_GENAPIC_32_H
+
+#include <asm/mpspec.h>
+
+/*
+ * Generic APIC driver interface.
+ *
+ * A straightforward mapping of the APIC-related parts of the
+ * x86 subarchitecture interface to a dynamic object.
+ *
+ * This is used by the "generic" x86 subarchitecture.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ */
+
+struct mpc_config_bus;
+struct mp_config_table;
+struct mpc_config_processor;
+
+struct genapic {
+ char *name;
+ int (*probe)(void);
+
+ int (*apic_id_registered)(void);
+ cpumask_t (*target_cpus)(void);
+ int int_delivery_mode;
+ int int_dest_mode;
+ int ESR_DISABLE;
+ int apic_destination_logical;
+ unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
+ unsigned long (*check_apicid_present)(int apicid);
+ int no_balance_irq;
+ int no_ioapic_check;
+ void (*init_apic_ldr)(void);
+ physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
+
+ void (*setup_apic_routing)(void);
+ int (*multi_timer_check)(int apic, int irq);
+ int (*apicid_to_node)(int logical_apicid);
+ int (*cpu_to_logical_apicid)(int cpu);
+ int (*cpu_present_to_apicid)(int mps_cpu);
+ physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
+ void (*setup_portio_remap)(void);
+ int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
+ void (*enable_apic_mode)(void);
+ u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb);
+
+ /* mpparse */
+	/* When one of the next two hooks returns 1, the kernel switches
+	   to this genapic; essentially they are additional probe
+	   functions. */
+ int (*mps_oem_check)(struct mp_config_table *mpc, char *oem,
+ char *productid);
+ int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
+
+ unsigned (*get_apic_id)(unsigned long x);
+ unsigned long apic_id_mask;
+ unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
+ cpumask_t (*vector_allocation_domain)(int cpu);
+
+#ifdef CONFIG_SMP
+ /* ipi */
+ void (*send_IPI_mask)(cpumask_t mask, int vector);
+ void (*send_IPI_allbutself)(int vector);
+ void (*send_IPI_all)(int vector);
+#endif
+};
+
+#define APICFUNC(x) .x = x,
+
+/* More functions could probably be marked IPIFUNC and save some space
+ in UP GENERICARCH kernels, but I don't have the nerve right now
+ to untangle this mess. -AK */
+#ifdef CONFIG_SMP
+#define IPIFUNC(x) APICFUNC(x)
+#else
+#define IPIFUNC(x)
+#endif
+
+#define APIC_INIT(aname, aprobe) \
+{ \
+ .name = aname, \
+ .probe = aprobe, \
+ .int_delivery_mode = INT_DELIVERY_MODE, \
+ .int_dest_mode = INT_DEST_MODE, \
+ .no_balance_irq = NO_BALANCE_IRQ, \
+ .ESR_DISABLE = esr_disable, \
+ .apic_destination_logical = APIC_DEST_LOGICAL, \
+ APICFUNC(apic_id_registered) \
+ APICFUNC(target_cpus) \
+ APICFUNC(check_apicid_used) \
+ APICFUNC(check_apicid_present) \
+ APICFUNC(init_apic_ldr) \
+ APICFUNC(ioapic_phys_id_map) \
+ APICFUNC(setup_apic_routing) \
+ APICFUNC(multi_timer_check) \
+ APICFUNC(apicid_to_node) \
+ APICFUNC(cpu_to_logical_apicid) \
+ APICFUNC(cpu_present_to_apicid) \
+ APICFUNC(apicid_to_cpu_present) \
+ APICFUNC(setup_portio_remap) \
+ APICFUNC(check_phys_apicid_present) \
+ APICFUNC(mps_oem_check) \
+ APICFUNC(get_apic_id) \
+ .apic_id_mask = APIC_ID_MASK, \
+ APICFUNC(cpu_mask_to_apicid) \
+ APICFUNC(vector_allocation_domain) \
+ APICFUNC(acpi_madt_oem_check) \
+ IPIFUNC(send_IPI_mask) \
+ IPIFUNC(send_IPI_allbutself) \
+ IPIFUNC(send_IPI_all) \
+ APICFUNC(enable_apic_mode) \
+ APICFUNC(phys_pkg_id) \
+}
+
+extern struct genapic *genapic;
+
+enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
+#define get_uv_system_type() UV_NONE
+#define is_uv_system() 0
+#define uv_wakeup_secondary(a, b) 1
+#define uv_system_init() do {} while (0)
+
+
+#endif /* _ASM_X86_GENAPIC_32_H */
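
For reference, a subarchitecture driver instantiates this table by
pulling in its mach headers (which define INT_DELIVERY_MODE,
target_cpus() and the other names APIC_INIT expands to) and then
expanding the macro. A sketch along the lines of
arch/x86/mach-generic/bigsmp.c; the mach_mpparse.h include for the OEM
check hooks is an assumption here:

#include <asm/bigsmp/apicdef.h>
#include <asm/bigsmp/apic.h>
#include <asm/bigsmp/ipi.h>
#include <asm/mach-default/mach_mpparse.h>
#include <asm/genapic.h>

static int probe_bigsmp_example(void)
{
	return 0;	/* would return 1 once this subarch is detected */
}

struct genapic apic_bigsmp_example = APIC_INIT("bigsmp", probe_bigsmp_example);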
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
new file mode 100644
index 00000000000..13c4e96199e
--- /dev/null
+++ b/arch/x86/include/asm/genapic_64.h
@@ -0,0 +1,58 @@
+#ifndef _ASM_X86_GENAPIC_64_H
+#define _ASM_X86_GENAPIC_64_H
+
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch data struct.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+
+struct genapic {
+ char *name;
+ int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
+ u32 int_delivery_mode;
+ u32 int_dest_mode;
+ int (*apic_id_registered)(void);
+ cpumask_t (*target_cpus)(void);
+ cpumask_t (*vector_allocation_domain)(int cpu);
+ void (*init_apic_ldr)(void);
+ /* ipi */
+ void (*send_IPI_mask)(cpumask_t mask, int vector);
+ void (*send_IPI_allbutself)(int vector);
+ void (*send_IPI_all)(int vector);
+ void (*send_IPI_self)(int vector);
+ /* */
+ unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
+ unsigned int (*phys_pkg_id)(int index_msb);
+ unsigned int (*get_apic_id)(unsigned long x);
+ unsigned long (*set_apic_id)(unsigned int id);
+ unsigned long apic_id_mask;
+};
+
+extern struct genapic *genapic;
+
+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+extern struct genapic apic_x2apic_cluster;
+extern struct genapic apic_x2apic_phys;
+extern int acpi_madt_oem_check(char *, char *);
+
+extern void apic_send_IPI_self(int vector);
+enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
+extern enum uv_system_type get_uv_system_type(void);
+extern int is_uv_system(void);
+
+extern struct genapic apic_x2apic_uv_x;
+DECLARE_PER_CPU(int, x2apic_extra_bits);
+extern void uv_cpu_init(void);
+extern void uv_system_init(void);
+extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
+
+extern void setup_apic_routing(void);
+
+#endif /* _ASM_X86_GENAPIC_64_H */
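
On 64-bit every caller dispatches through the global pointer, so e.g.
an IPI broadcast is just an indirect call (sketch):

#include <asm/genapic.h>

static void kick_all_cpus_example(int vector)
{
	/* Uses whichever driver (flat, physflat, x2apic, uv) probing picked. */
	genapic->send_IPI_all(vector);
}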
diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h
new file mode 100644
index 00000000000..ad3c2ed7548
--- /dev/null
+++ b/arch/x86/include/asm/geode.h
@@ -0,0 +1,253 @@
+/*
+ * AMD Geode definitions
+ * Copyright (C) 2006, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_X86_GEODE_H
+#define _ASM_X86_GEODE_H
+
+#include <asm/processor.h>
+#include <linux/io.h>
+
+/* Generic southbridge functions */
+
+#define GEODE_DEV_PMS 0
+#define GEODE_DEV_ACPI 1
+#define GEODE_DEV_GPIO 2
+#define GEODE_DEV_MFGPT 3
+
+extern int geode_get_dev_base(unsigned int dev);
+
+/* Useful macros */
+#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS)
+#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI)
+#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO)
+#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT)
+
+/* MSRS */
+
+#define MSR_GLIU_P2D_RO0 0x10000029
+
+#define MSR_LX_GLD_MSR_CONFIG 0x48002001
+#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data
+ * sheet has the wrong value */
+#define MSR_GLCP_SYS_RSTPLL 0x4C000014
+#define MSR_GLCP_DOTPLL 0x4C000015
+
+#define MSR_LBAR_SMB 0x5140000B
+#define MSR_LBAR_GPIO 0x5140000C
+#define MSR_LBAR_MFGPT 0x5140000D
+#define MSR_LBAR_ACPI 0x5140000E
+#define MSR_LBAR_PMS 0x5140000F
+
+#define MSR_DIVIL_SOFT_RESET 0x51400017
+
+#define MSR_PIC_YSEL_LOW 0x51400020
+#define MSR_PIC_YSEL_HIGH 0x51400021
+#define MSR_PIC_ZSEL_LOW 0x51400022
+#define MSR_PIC_ZSEL_HIGH 0x51400023
+#define MSR_PIC_IRQM_LPC 0x51400025
+
+#define MSR_MFGPT_IRQ 0x51400028
+#define MSR_MFGPT_NR 0x51400029
+#define MSR_MFGPT_SETUP 0x5140002B
+
+#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */
+
+#define MSR_GX_GLD_MSR_CONFIG 0xC0002001
+#define MSR_GX_MSR_PADSEL 0xC0002011
+
+/* Resource Sizes */
+
+#define LBAR_GPIO_SIZE 0xFF
+#define LBAR_MFGPT_SIZE 0x40
+#define LBAR_ACPI_SIZE 0x40
+#define LBAR_PMS_SIZE 0x80
+
+/* ACPI registers (PMS block) */
+
+/*
+ * PM1_EN is only valid for 16 bit reads when VSA is enabled.
+ * When VSA is not enabled, *always* read both PM1_STS and PM1_EN
+ * with a 32 bit read at offset 0x0
+ */
+
+#define PM1_STS 0x00
+#define PM1_EN 0x02
+#define PM1_CNT 0x08
+#define PM2_CNT 0x0C
+#define PM_TMR 0x10
+#define PM_GPE0_STS 0x18
+#define PM_GPE0_EN 0x1C
+
+/* PMC registers (PMS block) */
+
+#define PM_SSD 0x00
+#define PM_SCXA 0x04
+#define PM_SCYA 0x08
+#define PM_OUT_SLPCTL 0x0C
+#define PM_SCLK 0x10
+#define PM_SED 0x14 /* assumed 0x14 from the 4-byte register stride; original read 0x1 */
+#define PM_SCXD 0x18
+#define PM_SCYD 0x1C
+#define PM_IN_SLPCTL 0x20
+#define PM_WKD 0x30
+#define PM_WKXD 0x34
+#define PM_RD 0x38
+#define PM_WKXA 0x3C
+#define PM_FSD 0x40
+#define PM_TSD 0x44
+#define PM_PSD 0x48
+#define PM_NWKD 0x4C
+#define PM_AWKD 0x50
+#define PM_SSC 0x54
+
+/* VSA2 magic values */
+
+#define VSA_VRC_INDEX 0xAC1C
+#define VSA_VRC_DATA 0xAC1E
+#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */
+#define VSA_VR_SIGNATURE 0x0003
+#define VSA_VR_MEM_SIZE 0x0200
+#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */
+#define GSW_VSA_SIG 0x534d /* General Software signature */
+/* GPIO */
+
+#define GPIO_OUTPUT_VAL 0x00
+#define GPIO_OUTPUT_ENABLE 0x04
+#define GPIO_OUTPUT_OPEN_DRAIN 0x08
+#define GPIO_OUTPUT_INVERT 0x0C
+#define GPIO_OUTPUT_AUX1 0x10
+#define GPIO_OUTPUT_AUX2 0x14
+#define GPIO_PULL_UP 0x18
+#define GPIO_PULL_DOWN 0x1C
+#define GPIO_INPUT_ENABLE 0x20
+#define GPIO_INPUT_INVERT 0x24
+#define GPIO_INPUT_FILTER 0x28
+#define GPIO_INPUT_EVENT_COUNT 0x2C
+#define GPIO_READ_BACK 0x30
+#define GPIO_INPUT_AUX1 0x34
+#define GPIO_EVENTS_ENABLE 0x38
+#define GPIO_LOCK_ENABLE 0x3C
+#define GPIO_POSITIVE_EDGE_EN 0x40
+#define GPIO_NEGATIVE_EDGE_EN 0x44
+#define GPIO_POSITIVE_EDGE_STS 0x48
+#define GPIO_NEGATIVE_EDGE_STS 0x4C
+
+#define GPIO_MAP_X 0xE0
+#define GPIO_MAP_Y 0xE4
+#define GPIO_MAP_Z 0xE8
+#define GPIO_MAP_W 0xEC
+
+static inline u32 geode_gpio(unsigned int nr)
+{
+ BUG_ON(nr > 28);
+ return 1 << nr;
+}
+
+extern void geode_gpio_set(u32, unsigned int);
+extern void geode_gpio_clear(u32, unsigned int);
+extern int geode_gpio_isset(u32, unsigned int);
+extern void geode_gpio_setup_event(unsigned int, int, int);
+extern void geode_gpio_set_irq(unsigned int, unsigned int);
+
+static inline void geode_gpio_event_irq(unsigned int gpio, int pair)
+{
+ geode_gpio_setup_event(gpio, pair, 0);
+}
+
+static inline void geode_gpio_event_pme(unsigned int gpio, int pair)
+{
+ geode_gpio_setup_event(gpio, pair, 1);
+}
+
+/* Specific geode tests */
+
+static inline int is_geode_gx(void)
+{
+ return ((boot_cpu_data.x86_vendor == X86_VENDOR_NSC) &&
+ (boot_cpu_data.x86 == 5) &&
+ (boot_cpu_data.x86_model == 5));
+}
+
+static inline int is_geode_lx(void)
+{
+ return ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ (boot_cpu_data.x86 == 5) &&
+ (boot_cpu_data.x86_model == 10));
+}
+
+static inline int is_geode(void)
+{
+ return (is_geode_gx() || is_geode_lx());
+}
+
+#ifdef CONFIG_MGEODE_LX
+extern int geode_has_vsa2(void);
+#else
+static inline int geode_has_vsa2(void)
+{
+ return 0;
+}
+#endif
+
+/* MFGPTs */
+
+#define MFGPT_MAX_TIMERS 8
+#define MFGPT_TIMER_ANY (-1)
+
+#define MFGPT_DOMAIN_WORKING 1
+#define MFGPT_DOMAIN_STANDBY 2
+#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY)
+
+#define MFGPT_CMP1 0
+#define MFGPT_CMP2 1
+
+#define MFGPT_EVENT_IRQ 0
+#define MFGPT_EVENT_NMI 1
+#define MFGPT_EVENT_RESET 3
+
+#define MFGPT_REG_CMP1 0
+#define MFGPT_REG_CMP2 2
+#define MFGPT_REG_COUNTER 4
+#define MFGPT_REG_SETUP 6
+
+#define MFGPT_SETUP_CNTEN (1 << 15)
+#define MFGPT_SETUP_CMP2 (1 << 14)
+#define MFGPT_SETUP_CMP1 (1 << 13)
+#define MFGPT_SETUP_SETUP (1 << 12)
+#define MFGPT_SETUP_STOPEN (1 << 11)
+#define MFGPT_SETUP_EXTEN (1 << 10)
+#define MFGPT_SETUP_REVEN (1 << 5)
+#define MFGPT_SETUP_CLKSEL (1 << 4)
+
+static inline void geode_mfgpt_write(int timer, u16 reg, u16 value)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
+ outw(value, base + reg + (timer * 8));
+}
+
+static inline u16 geode_mfgpt_read(int timer, u16 reg)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
+ return inw(base + reg + (timer * 8));
+}
+
+extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable);
+extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable);
+extern int geode_mfgpt_alloc_timer(int timer, int domain);
+
+#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1)
+#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0)
+
+#ifdef CONFIG_GEODE_MFGPT_TIMER
+extern int __init mfgpt_timer_setup(void);
+#else
+static inline int mfgpt_timer_setup(void) { return 0; }
+#endif
+
+#endif /* _ASM_X86_GEODE_H */
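
As an illustration of the MFGPT accessors above, arming a timer boils
down to a counter reset, a compare value, and a setup write (a sketch;
it assumes the timer was already allocated with
geode_mfgpt_alloc_timer() and the tick count is arbitrary):

#include <linux/types.h>
#include <asm/geode.h>

static void mfgpt_arm_example(int timer, u16 ticks)
{
	/* Reset the counter and set the first compare point. */
	geode_mfgpt_write(timer, MFGPT_REG_COUNTER, 0);
	geode_mfgpt_write(timer, MFGPT_REG_CMP1, ticks);

	/* Start counting, with CMP1 events enabled. */
	geode_mfgpt_write(timer, MFGPT_REG_SETUP,
			  MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP1);
}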
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
new file mode 100644
index 00000000000..49dbfdfa50f
--- /dev/null
+++ b/arch/x86/include/asm/gpio.h
@@ -0,0 +1,56 @@
+/*
+ * Generic GPIO API implementation for x86.
+ *
+ * Derived from the generic GPIO API for powerpc:
+ *
+ * Copyright (c) 2007-2008 MontaVista Software, Inc.
+ *
+ * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _ASM_X86_GPIO_H
+#define _ASM_X86_GPIO_H
+
+#include <asm-generic/gpio.h>
+
+#ifdef CONFIG_GPIOLIB
+
+/*
+ * Just call gpiolib.
+ */
+static inline int gpio_get_value(unsigned int gpio)
+{
+ return __gpio_get_value(gpio);
+}
+
+static inline void gpio_set_value(unsigned int gpio, int value)
+{
+ __gpio_set_value(gpio, value);
+}
+
+static inline int gpio_cansleep(unsigned int gpio)
+{
+ return __gpio_cansleep(gpio);
+}
+
+/*
+ * Not implemented yet.
+ */
+static inline int gpio_to_irq(unsigned int gpio)
+{
+ return -ENOSYS;
+}
+
+static inline int irq_to_gpio(unsigned int irq)
+{
+ return -EINVAL;
+}
+
+#endif /* CONFIG_GPIOLIB */
+
+#endif /* _ASM_X86_GPIO_H */
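
With CONFIG_GPIOLIB set, these wrappers give drivers the generic API,
so the usual request/direction/value pattern applies (sketch; GPIO 42
and its label are made up):

#include <linux/gpio.h>

static int gpio_example(void)
{
	int err = gpio_request(42, "example");
	if (err)
		return err;

	gpio_direction_output(42, 0);
	gpio_set_value(42, 1);		/* ends up in __gpio_set_value() */
	gpio_free(42);
	return 0;
}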
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
new file mode 100644
index 00000000000..000787df66e
--- /dev/null
+++ b/arch/x86/include/asm/hardirq.h
@@ -0,0 +1,11 @@
+#ifdef CONFIG_X86_32
+# include "hardirq_32.h"
+#else
+# include "hardirq_64.h"
+#endif
+
+extern u64 arch_irq_stat_cpu(unsigned int cpu);
+#define arch_irq_stat_cpu arch_irq_stat_cpu
+
+extern u64 arch_irq_stat(void);
+#define arch_irq_stat arch_irq_stat
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
new file mode 100644
index 00000000000..5ca135e72f2
--- /dev/null
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -0,0 +1,28 @@
+#ifndef _ASM_X86_HARDIRQ_32_H
+#define _ASM_X86_HARDIRQ_32_H
+
+#include <linux/threads.h>
+#include <linux/irq.h>
+
+typedef struct {
+ unsigned int __softirq_pending;
+ unsigned long idle_timestamp;
+ unsigned int __nmi_count; /* arch dependent */
+ unsigned int apic_timer_irqs; /* arch dependent */
+ unsigned int irq0_irqs;
+ unsigned int irq_resched_count;
+ unsigned int irq_call_count;
+ unsigned int irq_tlb_count;
+ unsigned int irq_thermal_count;
+ unsigned int irq_spurious_count;
+} ____cacheline_aligned irq_cpustat_t;
+
+DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
+
+#define __ARCH_IRQ_STAT
+#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
+
+void ack_bad_irq(unsigned int irq);
+#include <linux/irq_cpustat.h>
+
+#endif /* _ASM_X86_HARDIRQ_32_H */
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
new file mode 100644
index 00000000000..1ba381fc51d
--- /dev/null
+++ b/arch/x86/include/asm/hardirq_64.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_X86_HARDIRQ_64_H
+#define _ASM_X86_HARDIRQ_64_H
+
+#include <linux/threads.h>
+#include <linux/irq.h>
+#include <asm/pda.h>
+#include <asm/apic.h>
+
+/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
+#define MAX_HARDIRQS_PER_CPU NR_VECTORS
+
+#define __ARCH_IRQ_STAT 1
+
+#define local_softirq_pending() read_pda(__softirq_pending)
+
+#define __ARCH_SET_SOFTIRQ_PENDING 1
+
+#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
+#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
+
+extern void ack_bad_irq(unsigned int irq);
+
+#endif /* _ASM_X86_HARDIRQ_64_H */
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
new file mode 100644
index 00000000000..a3b3b7c3027
--- /dev/null
+++ b/arch/x86/include/asm/highmem.h
@@ -0,0 +1,82 @@
+/*
+ * highmem.h: virtual kernel memory mappings for high memory
+ *
+ * Used in CONFIG_HIGHMEM systems for memory pages which
+ * are not addressable by direct kernel virtual addresses.
+ *
+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
+ * Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#ifndef _ASM_X86_HIGHMEM_H
+#define _ASM_X86_HIGHMEM_H
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#include <asm/tlbflush.h>
+#include <asm/paravirt.h>
+
+/* declarations for highmem.c */
+extern unsigned long highstart_pfn, highend_pfn;
+
+extern pte_t *kmap_pte;
+extern pgprot_t kmap_prot;
+extern pte_t *pkmap_page_table;
+
+/*
+ * Right now we initialize only a single pte table. It can be extended
+ * easily; subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+/*
+ * Ordering is:
+ *
+ * FIXADDR_TOP
+ * fixed_addresses
+ * FIXADDR_START
+ * temp fixed addresses
+ * FIXADDR_BOOT_START
+ * Persistent kmap area
+ * PKMAP_BASE
+ * VMALLOC_END
+ * Vmalloc area
+ * VMALLOC_START
+ * high_memory
+ */
+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+
+extern void *kmap_high(struct page *page);
+extern void kunmap_high(struct page *page);
+
+void *kmap(struct page *page);
+void kunmap(struct page *page);
+void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
+void *kmap_atomic(struct page *page, enum km_type type);
+void kunmap_atomic(void *kvaddr, enum km_type type);
+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+struct page *kmap_atomic_to_page(void *ptr);
+
+#ifndef CONFIG_PARAVIRT
+#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
+#endif
+
+#define flush_cache_kmaps() do { } while (0)
+
+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_HIGHMEM_H */
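
A typical use of the atomic kmap slots declared above is a short,
non-sleeping mapping window (sketch; KM_USER0 is one of the km_type
slots from <asm/kmap_types.h>):

#include <linux/highmem.h>
#include <linux/string.h>

static void zero_highpage_example(struct page *page)
{
	void *vaddr = kmap_atomic(page, KM_USER0);

	memset(vaddr, 0, PAGE_SIZE);	/* must not sleep while mapped */
	kunmap_atomic(vaddr, KM_USER0);
}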
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
new file mode 100644
index 00000000000..1c22cb05ad6
--- /dev/null
+++ b/arch/x86/include/asm/hpet.h
@@ -0,0 +1,114 @@
+#ifndef _ASM_X86_HPET_H
+#define _ASM_X86_HPET_H
+
+#include <linux/msi.h>
+
+#ifdef CONFIG_HPET_TIMER
+
+#define HPET_MMAP_SIZE 1024
+
+#define HPET_ID 0x000
+#define HPET_PERIOD 0x004
+#define HPET_CFG 0x010
+#define HPET_STATUS 0x020
+#define HPET_COUNTER 0x0f0
+
+#define HPET_Tn_CFG(n) (0x100 + 0x20 * n)
+#define HPET_Tn_CMP(n) (0x108 + 0x20 * n)
+#define HPET_Tn_ROUTE(n) (0x110 + 0x20 * n)
+
+#define HPET_T0_CFG 0x100
+#define HPET_T0_CMP 0x108
+#define HPET_T0_ROUTE 0x110
+#define HPET_T1_CFG 0x120
+#define HPET_T1_CMP 0x128
+#define HPET_T1_ROUTE 0x130
+#define HPET_T2_CFG 0x140
+#define HPET_T2_CMP 0x148
+#define HPET_T2_ROUTE 0x150
+
+#define HPET_ID_REV 0x000000ff
+#define HPET_ID_NUMBER 0x00001f00
+#define HPET_ID_64BIT 0x00002000
+#define HPET_ID_LEGSUP 0x00008000
+#define HPET_ID_VENDOR 0xffff0000
+#define HPET_ID_NUMBER_SHIFT 8
+#define HPET_ID_VENDOR_SHIFT 16
+
+#define HPET_ID_VENDOR_8086 0x8086
+
+#define HPET_CFG_ENABLE 0x001
+#define HPET_CFG_LEGACY 0x002
+#define HPET_LEGACY_8254 2
+#define HPET_LEGACY_RTC 8
+
+#define HPET_TN_LEVEL 0x0002
+#define HPET_TN_ENABLE 0x0004
+#define HPET_TN_PERIODIC 0x0008
+#define HPET_TN_PERIODIC_CAP 0x0010
+#define HPET_TN_64BIT_CAP 0x0020
+#define HPET_TN_SETVAL 0x0040
+#define HPET_TN_32BIT 0x0100
+#define HPET_TN_ROUTE 0x3e00
+#define HPET_TN_FSB 0x4000
+#define HPET_TN_FSB_CAP 0x8000
+#define HPET_TN_ROUTE_SHIFT 9
+
+/* Max HPET period is 10^8 femtoseconds, as given in the HPET spec */
+#define HPET_MAX_PERIOD 100000000UL
+/*
+ * Min HPET period is 10^5 femtoseconds, just for safety. If it is less than
+ * this, the 32 bit HPET counter wraps around in less than 0.5 sec.
+ */
+#define HPET_MIN_PERIOD 100000UL
+
+/* hpet memory map physical address */
+extern unsigned long hpet_address;
+extern unsigned long force_hpet_address;
+extern int hpet_force_user;
+extern int is_hpet_enabled(void);
+extern int hpet_enable(void);
+extern void hpet_disable(void);
+extern unsigned long hpet_readl(unsigned long a);
+extern void force_hpet_resume(void);
+
+extern void hpet_msi_unmask(unsigned int irq);
+extern void hpet_msi_mask(unsigned int irq);
+extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
+extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
+
+#ifdef CONFIG_PCI_MSI
+extern int arch_setup_hpet_msi(unsigned int irq);
+#else
+static inline int arch_setup_hpet_msi(unsigned int irq)
+{
+ return -EINVAL;
+}
+#endif
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+
+#include <linux/interrupt.h>
+
+typedef irqreturn_t (*rtc_irq_handler)(int interrupt, void *cookie);
+extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
+extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
+extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+ unsigned char sec);
+extern int hpet_set_periodic_freq(unsigned long freq);
+extern int hpet_rtc_dropped_irq(void);
+extern int hpet_rtc_timer_init(void);
+extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
+extern int hpet_register_irq_handler(rtc_irq_handler handler);
+extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
+
+#endif /* CONFIG_HPET_EMULATE_RTC */
+
+#else /* CONFIG_HPET_TIMER */
+
+static inline int hpet_enable(void) { return 0; }
+static inline int is_hpet_enabled(void) { return 0; }
+#define hpet_readl(a) 0
+
+#endif
+#endif /* _ASM_X86_HPET_H */
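
The ID register layout decodes with the masks and shifts above; note
the NUMBER field holds the index of the last timer, hence the +1
(sketch; assumes hpet_enable() succeeded so hpet_readl() is usable):

#include <linux/kernel.h>
#include <asm/hpet.h>

static void hpet_report_example(void)
{
	unsigned long id = hpet_readl(HPET_ID);
	unsigned int timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
	unsigned int vendor = (id & HPET_ID_VENDOR) >> HPET_ID_VENDOR_SHIFT;

	printk(KERN_INFO "HPET: %u timers, vendor 0x%04x\n", timers, vendor);
}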
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
new file mode 100644
index 00000000000..439a9acc132
--- /dev/null
+++ b/arch/x86/include/asm/hugetlb.h
@@ -0,0 +1,93 @@
+#ifndef _ASM_X86_HUGETLB_H
+#define _ASM_X86_HUGETLB_H
+
+#include <asm/page.h>
+
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+ unsigned long addr,
+ unsigned long len) {
+ return 0;
+}
+
+/*
+ * If the arch doesn't supply something else, assume that hugepage
+ * size aligned regions are ok without further preparation.
+ */
+static inline int prepare_hugepage_range(struct file *file,
+ unsigned long addr, unsigned long len)
+{
+ struct hstate *h = hstate_file(file);
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+ if (addr & ~huge_page_mask(h))
+ return -EINVAL;
+ return 0;
+}
+
+static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
+}
+
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor,
+ unsigned long ceiling)
+{
+ free_pgd_range(tlb, addr, end, floor, ceiling);
+}
+
+static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ set_pte_at(mm, addr, ptep, pte);
+}
+
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ return ptep_get_and_clear(mm, addr, ptep);
+}
+
+static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+}
+
+static inline int huge_pte_none(pte_t pte)
+{
+ return pte_none(pte);
+}
+
+static inline pte_t huge_pte_wrprotect(pte_t pte)
+{
+ return pte_wrprotect(pte);
+}
+
+static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ ptep_set_wrprotect(mm, addr, ptep);
+}
+
+static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
+{
+ return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+}
+
+static inline pte_t huge_ptep_get(pte_t *ptep)
+{
+ return *ptep;
+}
+
+static inline int arch_prepare_hugepage(struct page *page)
+{
+ return 0;
+}
+
+static inline void arch_release_hugepage(struct page *page)
+{
+}
+
+#endif /* _ASM_X86_HUGETLB_H */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
new file mode 100644
index 00000000000..b97aecb0b61
--- /dev/null
+++ b/arch/x86/include/asm/hw_irq.h
@@ -0,0 +1,131 @@
+#ifndef _ASM_X86_HW_IRQ_H
+#define _ASM_X86_HW_IRQ_H
+
+/*
+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
+ *
+ * moved some of the old arch/i386/kernel/irq.h to here. VY
+ *
+ * IRQ/IPI changes taken from work by Thomas Radke
+ * <tomsoft@informatik.tu-chemnitz.de>
+ *
+ * hacked by Andi Kleen for x86-64.
+ * unified by tglx
+ */
+
+#include <asm/irq_vectors.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/smp.h>
+
+#include <asm/atomic.h>
+#include <asm/irq.h>
+#include <asm/sections.h>
+
+#define platform_legacy_irq(irq) ((irq) < 16)
+
+/* Interrupt handlers registered during init_IRQ */
+extern void apic_timer_interrupt(void);
+extern void error_interrupt(void);
+extern void spurious_interrupt(void);
+extern void thermal_interrupt(void);
+extern void reschedule_interrupt(void);
+
+extern void invalidate_interrupt(void);
+extern void invalidate_interrupt0(void);
+extern void invalidate_interrupt1(void);
+extern void invalidate_interrupt2(void);
+extern void invalidate_interrupt3(void);
+extern void invalidate_interrupt4(void);
+extern void invalidate_interrupt5(void);
+extern void invalidate_interrupt6(void);
+extern void invalidate_interrupt7(void);
+
+extern void irq_move_cleanup_interrupt(void);
+extern void threshold_interrupt(void);
+
+extern void call_function_interrupt(void);
+extern void call_function_single_interrupt(void);
+
+/* PIC specific functions */
+extern void disable_8259A_irq(unsigned int irq);
+extern void enable_8259A_irq(unsigned int irq);
+extern int i8259A_irq_pending(unsigned int irq);
+extern void make_8259A_irq(unsigned int irq);
+extern void init_8259A(int aeoi);
+
+/* IOAPIC */
+#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
+extern unsigned long io_apic_irqs;
+
+extern void init_VISWS_APIC_irqs(void);
+extern void setup_IO_APIC(void);
+extern void disable_IO_APIC(void);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+extern void setup_ioapic_dest(void);
+
+#ifdef CONFIG_X86_64
+extern void enable_IO_APIC(void);
+#endif
+
+/* IPI functions */
+#ifdef CONFIG_X86_32
+extern void send_IPI_self(int vector);
+#endif
+extern void send_IPI(int dest, int vector);
+
+/* Statistics */
+extern atomic_t irq_err_count;
+extern atomic_t irq_mis_count;
+
+/* EISA */
+extern void eisa_set_level_irq(unsigned int irq);
+
+/* Voyager functions */
+extern asmlinkage void vic_cpi_interrupt(void);
+extern asmlinkage void vic_sys_interrupt(void);
+extern asmlinkage void vic_cmn_interrupt(void);
+extern asmlinkage void qic_timer_interrupt(void);
+extern asmlinkage void qic_invalidate_interrupt(void);
+extern asmlinkage void qic_reschedule_interrupt(void);
+extern asmlinkage void qic_enable_irq_interrupt(void);
+extern asmlinkage void qic_call_function_interrupt(void);
+
+/* SMP */
+extern void smp_apic_timer_interrupt(struct pt_regs *);
+extern void smp_spurious_interrupt(struct pt_regs *);
+extern void smp_error_interrupt(struct pt_regs *);
+#ifdef CONFIG_X86_SMP
+extern void smp_reschedule_interrupt(struct pt_regs *);
+extern void smp_call_function_interrupt(struct pt_regs *);
+extern void smp_call_function_single_interrupt(struct pt_regs *);
+#ifdef CONFIG_X86_32
+extern void smp_invalidate_interrupt(struct pt_regs *);
+#else
+extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
+#endif
+#endif
+
+#ifdef CONFIG_X86_32
+extern void (*const interrupt[NR_VECTORS])(void);
+#endif
+
+typedef int vector_irq_t[NR_VECTORS];
+DECLARE_PER_CPU(vector_irq_t, vector_irq);
+
+#ifdef CONFIG_X86_IO_APIC
+extern void lock_vector_lock(void);
+extern void unlock_vector_lock(void);
+extern void __setup_vector_irq(int cpu);
+#else
+static inline void lock_vector_lock(void) {}
+static inline void unlock_vector_lock(void) {}
+static inline void __setup_vector_irq(int cpu) {}
+#endif
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_HW_IRQ_H */
diff --git a/arch/x86/include/asm/hypertransport.h b/arch/x86/include/asm/hypertransport.h
new file mode 100644
index 00000000000..334b1a885d9
--- /dev/null
+++ b/arch/x86/include/asm/hypertransport.h
@@ -0,0 +1,45 @@
+#ifndef _ASM_X86_HYPERTRANSPORT_H
+#define _ASM_X86_HYPERTRANSPORT_H
+
+/*
+ * Constants for x86 Hypertransport Interrupts.
+ */
+
+#define HT_IRQ_LOW_BASE 0xf8000000
+
+#define HT_IRQ_LOW_VECTOR_SHIFT 16
+#define HT_IRQ_LOW_VECTOR_MASK 0x00ff0000
+#define HT_IRQ_LOW_VECTOR(v) \
+ (((v) << HT_IRQ_LOW_VECTOR_SHIFT) & HT_IRQ_LOW_VECTOR_MASK)
+
+#define HT_IRQ_LOW_DEST_ID_SHIFT 8
+#define HT_IRQ_LOW_DEST_ID_MASK 0x0000ff00
+#define HT_IRQ_LOW_DEST_ID(v) \
+ (((v) << HT_IRQ_LOW_DEST_ID_SHIFT) & HT_IRQ_LOW_DEST_ID_MASK)
+
+#define HT_IRQ_LOW_DM_PHYSICAL 0x0000000
+#define HT_IRQ_LOW_DM_LOGICAL 0x0000040
+
+#define HT_IRQ_LOW_RQEOI_EDGE 0x0000000
+#define HT_IRQ_LOW_RQEOI_LEVEL 0x0000020
+
+
+#define HT_IRQ_LOW_MT_FIXED 0x0000000
+#define HT_IRQ_LOW_MT_ARBITRATED 0x0000004
+#define HT_IRQ_LOW_MT_SMI 0x0000008
+#define HT_IRQ_LOW_MT_NMI 0x000000c
+#define HT_IRQ_LOW_MT_INIT 0x0000010
+#define HT_IRQ_LOW_MT_STARTUP 0x0000014
+#define HT_IRQ_LOW_MT_EXTINT 0x0000018
+#define HT_IRQ_LOW_MT_LINT1 0x000008c
+#define HT_IRQ_LOW_MT_LINT0 0x0000098
+
+#define HT_IRQ_LOW_IRQ_MASKED 0x0000001
+
+
+#define HT_IRQ_HIGH_DEST_ID_SHIFT 0
+#define HT_IRQ_HIGH_DEST_ID_MASK 0x00ffffff
+#define HT_IRQ_HIGH_DEST_ID(v) \
+ ((((v) >> 8) << HT_IRQ_HIGH_DEST_ID_SHIFT) & HT_IRQ_HIGH_DEST_ID_MASK)
+
+#endif /* _ASM_X86_HYPERTRANSPORT_H */
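
These fields compose the HT interrupt "low" word by OR-ing the
encodings onto the base (a sketch with a made-up vector and APIC ID;
leaving HT_IRQ_LOW_IRQ_MASKED clear means the interrupt is unmasked):

#include <linux/types.h>
#include <asm/hypertransport.h>

static u32 ht_irq_low_example(u8 vector, u8 apic_id)
{
	/* Fixed delivery, edge-triggered EOI, physical destination. */
	return HT_IRQ_LOW_BASE |
	       HT_IRQ_LOW_VECTOR(vector) |
	       HT_IRQ_LOW_DEST_ID(apic_id) |
	       HT_IRQ_LOW_DM_PHYSICAL |
	       HT_IRQ_LOW_RQEOI_EDGE |
	       HT_IRQ_LOW_MT_FIXED;
}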
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
new file mode 100644
index 00000000000..48f0004db8c
--- /dev/null
+++ b/arch/x86/include/asm/i387.h
@@ -0,0 +1,400 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_I387_H
+#define _ASM_X86_I387_H
+
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/regset.h>
+#include <linux/hardirq.h>
+#include <asm/asm.h>
+#include <asm/processor.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/uaccess.h>
+#include <asm/xsave.h>
+
+extern unsigned int sig_xstate_size;
+extern void fpu_init(void);
+extern void mxcsr_feature_mask_init(void);
+extern int init_fpu(struct task_struct *child);
+extern asmlinkage void math_state_restore(void);
+extern void init_thread_xstate(void);
+extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
+
+extern user_regset_active_fn fpregs_active, xfpregs_active;
+extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set;
+
+extern struct _fpx_sw_bytes fx_sw_reserved;
+#ifdef CONFIG_IA32_EMULATION
+extern unsigned int sig_xstate_ia32_size;
+extern struct _fpx_sw_bytes fx_sw_reserved_ia32;
+struct _fpstate_ia32;
+struct _xstate_ia32;
+extern int save_i387_xstate_ia32(void __user *buf);
+extern int restore_i387_xstate_ia32(void __user *buf);
+#endif
+
+#define X87_FSW_ES (1 << 7) /* Exception Summary */
+
+#ifdef CONFIG_X86_64
+
+/* Ignore delayed exceptions from user space */
+static inline void tolerant_fwait(void)
+{
+ asm volatile("1: fwait\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b));
+}
+
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
+{
+ int err;
+
+ asm volatile("1: rex64/fxrstor (%[fx])\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err)
+#if 0 /* See comment in __save_init_fpu() below. */
+ : [fx] "r" (fx), "m" (*fx), "0" (0));
+#else
+ : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
+#endif
+ return err;
+}
+
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_XSAVE)
+ return xrstor_checking(&tsk->thread.xstate->xsave);
+ else
+ return fxrstor_checking(&tsk->thread.xstate->fxsave);
+}
+
+/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
+   is pending. Clear the x87 state here by setting it to fixed
+   values. The kernel data segment can sometimes be 0 and sometimes
+   the new user value; both should be ok.
+   Use the PDA as the safe address because it should already be in L1. */
+static inline void clear_fpu_state(struct task_struct *tsk)
+{
+ struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
+ struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+
+ /*
+ * xsave header may indicate the init state of the FP.
+ */
+ if ((task_thread_info(tsk)->status & TS_XSAVE) &&
+ !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
+ return;
+
+ if (unlikely(fx->swd & X87_FSW_ES))
+ asm volatile("fnclex");
+ alternative_input(ASM_NOP8 ASM_NOP2,
+ " emms\n" /* clear stack tags */
+ " fildl %%gs:0", /* load to clear state */
+ X86_FEATURE_FXSAVE_LEAK);
+}
+
+static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
+{
+ int err;
+
+ asm volatile("1: rex64/fxsave (%[fx])\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err), "=m" (*fx)
+#if 0 /* See comment in __fxsave_clear() below. */
+ : [fx] "r" (fx), "0" (0));
+#else
+ : [fx] "cdaSDb" (fx), "0" (0));
+#endif
+ if (unlikely(err) &&
+ __clear_user(fx, sizeof(struct i387_fxsave_struct)))
+ err = -EFAULT;
+ /* No need to clear here because the caller clears USED_MATH */
+ return err;
+}
+
+static inline void fxsave(struct task_struct *tsk)
+{
+ /* Using "rex64; fxsave %0" is broken because, if the memory operand
+ uses any extended registers for addressing, a second REX prefix
+ will be generated (to the assembler, rex64 followed by semicolon
+ is a separate instruction), and hence the 64-bitness is lost. */
+#if 0
+ /* Using "fxsaveq %0" would be the ideal choice, but is only supported
+ starting with gas 2.16. */
+ __asm__ __volatile__("fxsaveq %0"
+ : "=m" (tsk->thread.xstate->fxsave));
+#elif 0
+ /* Using, as a workaround, the properly prefixed form below isn't
+ accepted by any binutils version so far released, complaining that
+ the same type of prefix is used twice if an extended register is
+ needed for addressing (fix submitted to mainline 2005-11-21). */
+ __asm__ __volatile__("rex64/fxsave %0"
+ : "=m" (tsk->thread.xstate->fxsave));
+#else
+ /* This, however, we can work around by forcing the compiler to select
+ an addressing mode that doesn't require extended registers. */
+ __asm__ __volatile__("rex64/fxsave (%1)"
+ : "=m" (tsk->thread.xstate->fxsave)
+ : "cdaSDb" (&tsk->thread.xstate->fxsave));
+#endif
+}
+
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_XSAVE)
+ xsave(tsk);
+ else
+ fxsave(tsk);
+
+ clear_fpu_state(tsk);
+ task_thread_info(tsk)->status &= ~TS_USEDFPU;
+}
+
+#else /* CONFIG_X86_32 */
+
+extern void finit(void);
+
+static inline void tolerant_fwait(void)
+{
+ asm volatile("fnclex ; fwait");
+}
+
+static inline void restore_fpu(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_XSAVE) {
+ xrstor_checking(&tsk->thread.xstate->xsave);
+ return;
+ }
+ /*
+ * The "nop" is needed to make the instructions the same
+ * length.
+ */
+ alternative_input(
+ "nop ; frstor %1",
+ "fxrstor %1",
+ X86_FEATURE_FXSR,
+ "m" (tsk->thread.xstate->fxsave));
+}
+
+/* We need a safe address that is cheap to find and that is already
+ in L1 during context switch. The best choices are unfortunately
+ different for UP and SMP */
+#ifdef CONFIG_SMP
+#define safe_address (__per_cpu_offset[0])
+#else
+#define safe_address (kstat_cpu(0).cpustat.user)
+#endif
+
+/*
+ * These must be called with preempt disabled
+ */
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_XSAVE) {
+ struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
+ struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+
+ xsave(tsk);
+
+ /*
+ * xsave header may indicate the init state of the FP.
+ */
+ if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
+ goto end;
+
+ if (unlikely(fx->swd & X87_FSW_ES))
+ asm volatile("fnclex");
+
+ /*
+ * we can do a simple return here or be paranoid :)
+ */
+ goto clear_state;
+ }
+
+	/* Use more nops than strictly needed in case the compiler
+	   emits slightly different code */
+ alternative_input(
+ "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
+ "fxsave %[fx]\n"
+ "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
+ X86_FEATURE_FXSR,
+ [fx] "m" (tsk->thread.xstate->fxsave),
+ [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory");
+clear_state:
+ /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+ is pending. Clear the x87 state here by setting it to fixed
+	   values. safe_address is an arbitrary variable that should be in L1 */
+ alternative_input(
+ GENERIC_NOP8 GENERIC_NOP2,
+ "emms\n\t" /* clear stack tags */
+ "fildl %[addr]", /* set F?P to defined value */
+ X86_FEATURE_FXSAVE_LEAK,
+ [addr] "m" (safe_address));
+end:
+ task_thread_info(tsk)->status &= ~TS_USEDFPU;
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Signal frame handlers...
+ */
+extern int save_i387_xstate(void __user *buf);
+extern int restore_i387_xstate(void __user *buf);
+
+static inline void __unlazy_fpu(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_USEDFPU) {
+ __save_init_fpu(tsk);
+ stts();
+ } else
+ tsk->fpu_counter = 0;
+}
+
+static inline void __clear_fpu(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_USEDFPU) {
+ tolerant_fwait();
+ task_thread_info(tsk)->status &= ~TS_USEDFPU;
+ stts();
+ }
+}
+
+static inline void kernel_fpu_begin(void)
+{
+ struct thread_info *me = current_thread_info();
+ preempt_disable();
+ if (me->status & TS_USEDFPU)
+ __save_init_fpu(me->task);
+ else
+ clts();
+}
+
+static inline void kernel_fpu_end(void)
+{
+ stts();
+ preempt_enable();
+}
+
+/*
+ * Some instructions, like VIA's padlock instructions, generate a spurious
+ * DNA fault but don't modify the SSE registers. These instructions also
+ * get used from interrupt context. To prevent them from interacting
+ * wrongly with other user/kernel FPU usage, they should only be used
+ * within the scope of irq_ts_save()/irq_ts_restore().
+ */
+static inline int irq_ts_save(void)
+{
+ /*
+	 * If we are in process context, we are ok to take a spurious DNA fault:
+	 * doing clts() there would require preemption to be disabled or some
+	 * heavy lifting like kernel_fpu_begin().
+ */
+ if (!in_interrupt())
+ return 0;
+
+ if (read_cr0() & X86_CR0_TS) {
+ clts();
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline void irq_ts_restore(int TS_state)
+{
+ if (TS_state)
+ stts();
+}
+
+#ifdef CONFIG_X86_64
+
+static inline void save_init_fpu(struct task_struct *tsk)
+{
+ __save_init_fpu(tsk);
+ stts();
+}
+
+#define unlazy_fpu __unlazy_fpu
+#define clear_fpu __clear_fpu
+
+#else /* CONFIG_X86_32 */
+
+/*
+ * These disable preemption on their own and are safe
+ */
+static inline void save_init_fpu(struct task_struct *tsk)
+{
+ preempt_disable();
+ __save_init_fpu(tsk);
+ stts();
+ preempt_enable();
+}
+
+static inline void unlazy_fpu(struct task_struct *tsk)
+{
+ preempt_disable();
+ __unlazy_fpu(tsk);
+ preempt_enable();
+}
+
+static inline void clear_fpu(struct task_struct *tsk)
+{
+ preempt_disable();
+ __clear_fpu(tsk);
+ preempt_enable();
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * i387 state interaction
+ */
+static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
+{
+ if (cpu_has_fxsr) {
+ return tsk->thread.xstate->fxsave.cwd;
+ } else {
+ return (unsigned short)tsk->thread.xstate->fsave.cwd;
+ }
+}
+
+static inline unsigned short get_fpu_swd(struct task_struct *tsk)
+{
+ if (cpu_has_fxsr) {
+ return tsk->thread.xstate->fxsave.swd;
+ } else {
+ return (unsigned short)tsk->thread.xstate->fsave.swd;
+ }
+}
+
+static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
+{
+ if (cpu_has_xmm) {
+ return tsk->thread.xstate->fxsave.mxcsr;
+ } else {
+ return MXCSR_DEFAULT;
+ }
+}
+
+#endif /* _ASM_X86_I387_H */
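
The intended pattern for using FPU/SSE instructions in kernel code
built on these helpers is to bracket the region with
kernel_fpu_begin()/kernel_fpu_end(), which also disables preemption
(sketch):

#include <asm/i387.h>

static void fpu_region_example(void)
{
	kernel_fpu_begin();	/* saves live user FPU state, clears CR0.TS */
	/* ... SSE/MMX instructions may run here; sleeping is not allowed ... */
	kernel_fpu_end();	/* sets CR0.TS again and reenables preemption */
}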
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
new file mode 100644
index 00000000000..1edbf89680f
--- /dev/null
+++ b/arch/x86/include/asm/i8253.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_I8253_H
+#define _ASM_X86_I8253_H
+
+/* i8253A PIT registers */
+#define PIT_MODE 0x43
+#define PIT_CH0 0x40
+#define PIT_CH2 0x42
+
+extern spinlock_t i8253_lock;
+
+extern struct clock_event_device *global_clock_event;
+
+extern void setup_pit_timer(void);
+
+#define inb_pit inb_p
+#define outb_pit outb_p
+
+#endif /* _ASM_X86_I8253_H */
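
Programming channel 0 through these definitions follows the classic
mode-then-count sequence under i8253_lock (sketch; 0x34 selects
channel 0, lobyte/hibyte access, mode 2, and the count is
caller-supplied):

#include <linux/types.h>
#include <linux/spinlock.h>
#include <asm/i8253.h>

static void pit_program_example(u16 count)
{
	unsigned long flags;

	spin_lock_irqsave(&i8253_lock, flags);
	outb_pit(0x34, PIT_MODE);		/* rate generator on channel 0 */
	outb_pit(count & 0xff, PIT_CH0);	/* low byte first */
	outb_pit(count >> 8, PIT_CH0);		/* then high byte */
	spin_unlock_irqrestore(&i8253_lock, flags);
}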
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
new file mode 100644
index 00000000000..58d7091eeb1
--- /dev/null
+++ b/arch/x86/include/asm/i8259.h
@@ -0,0 +1,63 @@
+#ifndef _ASM_X86_I8259_H
+#define _ASM_X86_I8259_H
+
+#include <linux/delay.h>
+
+extern unsigned int cached_irq_mask;
+
+#define __byte(x, y) (((unsigned char *)&(y))[x])
+#define cached_master_mask (__byte(0, cached_irq_mask))
+#define cached_slave_mask (__byte(1, cached_irq_mask))
+
+/* i8259A PIC registers */
+#define PIC_MASTER_CMD 0x20
+#define PIC_MASTER_IMR 0x21
+#define PIC_MASTER_ISR PIC_MASTER_CMD
+#define PIC_MASTER_POLL PIC_MASTER_ISR
+#define PIC_MASTER_OCW3 PIC_MASTER_ISR
+#define PIC_SLAVE_CMD 0xa0
+#define PIC_SLAVE_IMR 0xa1
+
+/* i8259A PIC related values */
+#define PIC_CASCADE_IR 2
+#define MASTER_ICW4_DEFAULT 0x01
+#define SLAVE_ICW4_DEFAULT 0x01
+#define PIC_ICW4_AEOI 2
+
+extern spinlock_t i8259A_lock;
+
+extern void init_8259A(int auto_eoi);
+extern void enable_8259A_irq(unsigned int irq);
+extern void disable_8259A_irq(unsigned int irq);
+extern unsigned int startup_8259A_irq(unsigned int irq);
+
+/* the PIC may need a careful delay on some platforms, hence specific calls */
+static inline unsigned char inb_pic(unsigned int port)
+{
+ unsigned char value = inb(port);
+
+ /*
+	 * the delay for some accesses to the PIC on the motherboard or in
+	 * the chipset must be at least one microsecond, so be safe here:
+ */
+ udelay(2);
+
+ return value;
+}
+
+static inline void outb_pic(unsigned char value, unsigned int port)
+{
+ outb(value, port);
+ /*
+	 * the delay for some accesses to the PIC on the motherboard or in
+	 * the chipset must be at least one microsecond, so be safe here:
+ */
+ udelay(2);
+}
+
+extern struct irq_chip i8259A_chip;
+
+extern void mask_8259A(void);
+extern void unmask_8259A(void);
+
+#endif /* _ASM_X86_I8259_H */
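
To show how the cached-mask bookkeeping and the delayed accessors fit
together, masking a line on the master PIC updates both the cache and
the IMR (a sketch; real callers go through disable_8259A_irq()):

#include <linux/spinlock.h>
#include <asm/io.h>
#include <asm/i8259.h>

static void pic_mask_example(unsigned int irq)	/* master PIC lines 0..7 */
{
	unsigned long flags;

	spin_lock_irqsave(&i8259A_lock, flags);
	cached_master_mask |= 1 << irq;
	outb_pic(cached_master_mask, PIC_MASTER_IMR);
	spin_unlock_irqrestore(&i8259A_lock, flags);
}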
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
new file mode 100644
index 00000000000..97989c0e534
--- /dev/null
+++ b/arch/x86/include/asm/ia32.h
@@ -0,0 +1,170 @@
+#ifndef _ASM_X86_IA32_H
+#define _ASM_X86_IA32_H
+
+
+#ifdef CONFIG_IA32_EMULATION
+
+#include <linux/compat.h>
+
+/*
+ * 32 bit structures for IA32 support.
+ */
+
+#include <asm/sigcontext32.h>
+
+/* signal.h */
+struct sigaction32 {
+ unsigned int sa_handler; /* Really a pointer, but need to deal
+ with 32 bits */
+ unsigned int sa_flags;
+ unsigned int sa_restorer; /* Another 32 bit pointer */
+ compat_sigset_t sa_mask; /* A 32 bit mask */
+};
+
+struct old_sigaction32 {
+ unsigned int sa_handler; /* Really a pointer, but need to deal
+ with 32 bits */
+ compat_old_sigset_t sa_mask; /* A 32 bit mask */
+ unsigned int sa_flags;
+ unsigned int sa_restorer; /* Another 32 bit pointer */
+};
+
+typedef struct sigaltstack_ia32 {
+ unsigned int ss_sp;
+ int ss_flags;
+ unsigned int ss_size;
+} stack_ia32_t;
+
+struct ucontext_ia32 {
+ unsigned int uc_flags;
+ unsigned int uc_link;
+ stack_ia32_t uc_stack;
+ struct sigcontext_ia32 uc_mcontext;
+ compat_sigset_t uc_sigmask; /* mask last for extensibility */
+};
+
+/* This matches struct stat64 in glibc2.2, hence the absolutely
+ * insane amounts of padding around dev_t's.
+ */
+struct stat64 {
+ unsigned long long st_dev;
+ unsigned char __pad0[4];
+
+#define STAT64_HAS_BROKEN_ST_INO 1
+ unsigned int __st_ino;
+
+ unsigned int st_mode;
+ unsigned int st_nlink;
+
+ unsigned int st_uid;
+ unsigned int st_gid;
+
+ unsigned long long st_rdev;
+ unsigned char __pad3[4];
+
+ long long st_size;
+ unsigned int st_blksize;
+
+	long long st_blocks;	/* Number of 512-byte blocks allocated */
+
+ unsigned st_atime;
+ unsigned st_atime_nsec;
+ unsigned st_mtime;
+ unsigned st_mtime_nsec;
+ unsigned st_ctime;
+ unsigned st_ctime_nsec;
+
+ unsigned long long st_ino;
+} __attribute__((packed));
+
+typedef struct compat_siginfo {
+ int si_signo;
+ int si_errno;
+ int si_code;
+
+ union {
+ int _pad[((128 / sizeof(int)) - 3)];
+
+ /* kill() */
+ struct {
+ unsigned int _pid; /* sender's pid */
+ unsigned int _uid; /* sender's uid */
+ } _kill;
+
+ /* POSIX.1b timers */
+ struct {
+ compat_timer_t _tid; /* timer id */
+ int _overrun; /* overrun count */
+ compat_sigval_t _sigval; /* same as below */
+ int _sys_private; /* not to be passed to user */
+ int _overrun_incr; /* amount to add to overrun */
+ } _timer;
+
+ /* POSIX.1b signals */
+ struct {
+ unsigned int _pid; /* sender's pid */
+ unsigned int _uid; /* sender's uid */
+ compat_sigval_t _sigval;
+ } _rt;
+
+ /* SIGCHLD */
+ struct {
+ unsigned int _pid; /* which child */
+ unsigned int _uid; /* sender's uid */
+ int _status; /* exit code */
+ compat_clock_t _utime;
+ compat_clock_t _stime;
+ } _sigchld;
+
+ /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
+ struct {
+ unsigned int _addr; /* faulting insn/memory ref. */
+ } _sigfault;
+
+ /* SIGPOLL */
+ struct {
+ int _band; /* POLL_IN, POLL_OUT, POLL_MSG */
+ int _fd;
+ } _sigpoll;
+ } _sifields;
+} compat_siginfo_t;
+
+struct sigframe32 {
+ u32 pretcode;
+ int sig;
+ struct sigcontext_ia32 sc;
+ struct _fpstate_ia32 fpstate;
+ unsigned int extramask[_COMPAT_NSIG_WORDS-1];
+};
+
+struct rt_sigframe32 {
+ u32 pretcode;
+ int sig;
+ u32 pinfo;
+ u32 puc;
+ compat_siginfo_t info;
+ struct ucontext_ia32 uc;
+ struct _fpstate_ia32 fpstate;
+};
+
+struct ustat32 {
+ __u32 f_tfree;
+ compat_ino_t f_tinode;
+ char f_fname[6];
+ char f_fpack[6];
+};
+
+#define IA32_STACK_TOP IA32_PAGE_OFFSET
+
+#ifdef __KERNEL__
+struct linux_binprm;
+extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
+ unsigned long stack_top, int exec_stack);
+struct mm_struct;
+extern void ia32_pick_mmap_layout(struct mm_struct *mm);
+
+#endif
+
+#endif /* CONFIG_IA32_EMULATION */
+
+#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
new file mode 100644
index 00000000000..976f6ecd2ce
--- /dev/null
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_IA32_UNISTD_H
+#define _ASM_X86_IA32_UNISTD_H
+
+/*
+ * This file contains the system call numbers of the ia32 port,
+ * this is for the kernel only.
+ * Only add syscalls here where some part of the kernel needs to know
+ * the number. This should otherwise be in sync with asm-x86/unistd_32.h. -AK
+ */
+
+#define __NR_ia32_restart_syscall 0
+#define __NR_ia32_exit 1
+#define __NR_ia32_read 3
+#define __NR_ia32_write 4
+#define __NR_ia32_sigreturn 119
+#define __NR_ia32_rt_sigreturn 173
+
+#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
new file mode 100644
index 00000000000..44c89c3a23e
--- /dev/null
+++ b/arch/x86/include/asm/idle.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_IDLE_H
+#define _ASM_X86_IDLE_H
+
+#define IDLE_START 1
+#define IDLE_END 2
+
+struct notifier_block;
+void idle_notifier_register(struct notifier_block *n);
+void idle_notifier_unregister(struct notifier_block *n);
+
+void enter_idle(void);
+void exit_idle(void);
+
+void c1e_remove_cpu(int cpu);
+
+#endif /* _ASM_X86_IDLE_H */
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
new file mode 100644
index 00000000000..fa0fd068bc2
--- /dev/null
+++ b/arch/x86/include/asm/intel_arch_perfmon.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
+#define _ASM_X86_INTEL_ARCH_PERFMON_H
+
+#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+union cpuid10_eax {
+ struct {
+ unsigned int version_id:8;
+ unsigned int num_counters:8;
+ unsigned int bit_width:8;
+ unsigned int mask_length:8;
+ } split;
+ unsigned int full;
+};
+
+#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
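
union cpuid10_eax overlays the EAX output of CPUID leaf 0xA, so
discovering the architectural counters looks roughly like this (sketch):

#include <asm/processor.h>
#include <asm/intel_arch_perfmon.h>

static unsigned int perfmon_counters_example(void)
{
	union cpuid10_eax eax;
	unsigned int ebx, ecx, edx;

	cpuid(0xa, &eax.full, &ebx, &ecx, &edx);
	if (eax.split.version_id == 0)
		return 0;	/* no architectural perfmon */
	return eax.split.num_counters;
}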
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
new file mode 100644
index 00000000000..5618a103f39
--- /dev/null
+++ b/arch/x86/include/asm/io.h
@@ -0,0 +1,91 @@
+#ifndef _ASM_X86_IO_H
+#define _ASM_X86_IO_H
+
+#define ARCH_HAS_IOREMAP_WC
+
+#include <linux/compiler.h>
+
+#define build_mmio_read(name, size, type, reg, barrier) \
+static inline type name(const volatile void __iomem *addr) \
+{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
+:"m" (*(volatile type __force *)addr) barrier); return ret; }
+
+#define build_mmio_write(name, size, type, reg, barrier) \
+static inline void name(type val, volatile void __iomem *addr) \
+{ asm volatile("mov" size " %0,%1": :reg (val), \
+"m" (*(volatile type __force *)addr) barrier); }
+
+build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
+build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
+build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
+
+build_mmio_read(__readb, "b", unsigned char, "=q", )
+build_mmio_read(__readw, "w", unsigned short, "=r", )
+build_mmio_read(__readl, "l", unsigned int, "=r", )
+
+build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
+build_mmio_write(writew, "w", unsigned short, "r", :"memory")
+build_mmio_write(writel, "l", unsigned int, "r", :"memory")
+
+build_mmio_write(__writeb, "b", unsigned char, "q", )
+build_mmio_write(__writew, "w", unsigned short, "r", )
+build_mmio_write(__writel, "l", unsigned int, "r", )
+
+#define readb_relaxed(a) __readb(a)
+#define readw_relaxed(a) __readw(a)
+#define readl_relaxed(a) __readl(a)
+#define __raw_readb __readb
+#define __raw_readw __readw
+#define __raw_readl __readl
+
+#define __raw_writeb __writeb
+#define __raw_writew __writew
+#define __raw_writel __writel
+
+#define mmiowb() barrier()
+
+#ifdef CONFIG_X86_64
+build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_read(__readq, "q", unsigned long, "=r", )
+build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+build_mmio_write(__writeq, "q", unsigned long, "r", )
+
+#define readq_relaxed(a) __readq(a)
+#define __raw_readq __readq
+#define __raw_writeq writeq
+
+/* Let people know we have them */
+#define readq readq
+#define writeq writeq
+#endif
+
+extern int iommu_bio_merge;
+
+#ifdef CONFIG_X86_32
+# include "io_32.h"
+#else
+# include "io_64.h"
+#endif
+
+extern void *xlate_dev_mem_ptr(unsigned long phys);
+extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
+
+extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+ unsigned long prot_val);
+extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
+
+/*
+ * early_ioremap() and early_iounmap() are for temporary early boot-time
+ * mappings, before the real ioremap() is functional.
+ * A boot-time mapping is currently limited to at most 16 pages.
+ */
+extern void early_ioremap_init(void);
+extern void early_ioremap_clear(void);
+extern void early_ioremap_reset(void);
+extern void *early_ioremap(unsigned long offset, unsigned long size);
+extern void *early_memremap(unsigned long offset, unsigned long size);
+extern void early_iounmap(void *addr, unsigned long size);
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+
+
+#endif /* _ASM_X86_IO_H */
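
The build_mmio_* expansions above yield the familiar accessors, so
driver MMIO ends up as ioremap plus readl/writel (a sketch; the BAR
address and register offsets are invented):

#include <linux/types.h>
#include <asm/io.h>

static u32 mmio_example(resource_size_t bar_phys)
{
	void __iomem *regs = ioremap(bar_phys, 0x100);
	u32 status;

	if (!regs)
		return 0;
	writel(0x1, regs + 0x04);	/* hypothetical enable register */
	status = readl(regs + 0x08);	/* hypothetical status register */
	iounmap(regs);
	return status;
}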
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
new file mode 100644
index 00000000000..d8e242e1b39
--- /dev/null
+++ b/arch/x86/include/asm/io_32.h
@@ -0,0 +1,284 @@
+#ifndef _ASM_X86_IO_32_H
+#define _ASM_X86_IO_32_H
+
+#include <linux/string.h>
+#include <linux/compiler.h>
+
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) trying to avoid writing the same thing
+ * over and over again with slight variations and possibly making a
+ * mistake somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ * Linus
+ */
+
+ /*
+ * A bit simplified and optimized by Jan Hubicka
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+ *
+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+ * isa_read[wl] and isa_write[wl] fixed
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+#define IO_SPACE_LIMIT 0xffff
+
+#define XQUAD_PORTIO_BASE 0xfe400000
+#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
+
+#ifdef __KERNEL__
+
+#include <asm-generic/iomap.h>
+
+#include <linux/vmalloc.h>
+
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+#define xlate_dev_kmem_ptr(p) p
+
+/**
+ * virt_to_phys - map virtual addresses to physical
+ * @address: address to remap
+ *
+ * The returned physical address is the physical (CPU) mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses directly mapped or allocated via kmalloc.
+ *
+ * This function does not give bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline unsigned long virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+
+/**
+ * phys_to_virt - map physical address to virtual
+ * @address: address to remap
+ *
+ * The returned virtual address is a current CPU mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses that have a kernel mapping
+ *
+ * This function does not handle bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline void *phys_to_virt(unsigned long address)
+{
+ return __va(address);
+}
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+
+/**
+ * ioremap - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * If the area you are trying to map is a PCI BAR you should have a
+ * look at pci_iomap().
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
+ unsigned long prot_val);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ */
+#define isa_virt_to_bus virt_to_phys
+#define isa_page_to_bus page_to_phys
+#define isa_bus_to_virt phys_to_virt
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus virt_to_phys
+#define bus_to_virt phys_to_virt
+
+static inline void
+memset_io(volatile void __iomem *addr, unsigned char val, int count)
+{
+ memset((void __force *)addr, val, count);
+}
+
+static inline void
+memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
+{
+ __memcpy(dst, (const void __force *)src, count);
+}
+
+static inline void
+memcpy_toio(volatile void __iomem *dst, const void *src, int count)
+{
+ __memcpy((void __force *)dst, src, count);
+}
+
+/*
+ * ISA space is 'always mapped' on a typical x86 system, so there is no
+ * need to explicitly ioremap() it. The fact that the ISA IO space is
+ * mapped to PAGE_OFFSET is pure coincidence - it does not mean ISA
+ * values are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite close):
+ */
+#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
+
+/*
+ * Cache management
+ *
+ * This is needed for two cases:
+ * 1. Out of order aware processors
+ * 2. Accidentally out of order processors (PPro errata #51)
+ */
+
+#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+
+static inline void flush_write_buffers(void)
+{
+ asm volatile("lock; addl $0,0(%%esp)": : :"memory");
+}
+
+#else
+
+#define flush_write_buffers() do { } while (0)
+
+#endif
+
+#endif /* __KERNEL__ */
+
+extern void native_io_delay(void);
+
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
+#else
+
+static inline void slow_down_io(void)
+{
+ native_io_delay();
+#ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
+#endif
+}
+
+#endif
+
+#define __BUILDIO(bwl, bw, type) \
+static inline void out##bwl(unsigned type value, int port) \
+{ \
+ out##bwl##_local(value, port); \
+} \
+ \
+static inline unsigned type in##bwl(int port) \
+{ \
+ return in##bwl##_local(port); \
+}
+
+#define BUILDIO(bwl, bw, type) \
+static inline void out##bwl##_local(unsigned type value, int port) \
+{ \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+} \
+ \
+static inline unsigned type in##bwl##_local(int port) \
+{ \
+ unsigned type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+} \
+ \
+static inline void out##bwl##_local_p(unsigned type value, int port) \
+{ \
+ out##bwl##_local(value, port); \
+ slow_down_io(); \
+} \
+ \
+static inline unsigned type in##bwl##_local_p(int port) \
+{ \
+ unsigned type value = in##bwl##_local(port); \
+ slow_down_io(); \
+ return value; \
+} \
+ \
+__BUILDIO(bwl, bw, type) \
+ \
+static inline void out##bwl##_p(unsigned type value, int port) \
+{ \
+ out##bwl(value, port); \
+ slow_down_io(); \
+} \
+ \
+static inline unsigned type in##bwl##_p(int port) \
+{ \
+ unsigned type value = in##bwl(port); \
+ slow_down_io(); \
+ return value; \
+} \
+ \
+static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+{ \
+ asm volatile("rep; outs" #bwl \
+ : "+S"(addr), "+c"(count) : "d"(port)); \
+} \
+ \
+static inline void ins##bwl(int port, void *addr, unsigned long count) \
+{ \
+ asm volatile("rep; ins" #bwl \
+ : "+D"(addr), "+c"(count) : "d"(port)); \
+}
+
+BUILDIO(b, b, char)
+BUILDIO(w, w, short)
+BUILDIO(l, , int)
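+
+/*
+ * The three BUILDIO() expansions above generate inb/inw/inl and
+ * outb/outw/outl, their _p ("pausing") variants and the rep-string
+ * forms. A sketch (with a hypothetical u16 divisor, not a complete
+ * init sequence) of reprogramming PIT channel 0 through them:
+ *
+ *	outb(0x36, 0x43);
+ *	outb(divisor & 0xff, 0x40);
+ *	outb(divisor >> 8, 0x40);
+ */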
+
+#endif /* _ASM_X86_IO_32_H */
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
new file mode 100644
index 00000000000..fea325a1122
--- /dev/null
+++ b/arch/x86/include/asm/io_64.h
@@ -0,0 +1,244 @@
+#ifndef _ASM_X86_IO_64_H
+#define _ASM_X86_IO_64_H
+
+
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) avoid writing the same thing over and
+ * over again with slight variations and possibly making a mistake
+ * somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ * Linus
+ */
+
+/*
+ * A bit simplified and optimized by Jan Hubicka
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+ *
+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+ * isa_read[wl] and isa_write[wl] fixed
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+extern void native_io_delay(void);
+
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
+#else
+
+static inline void slow_down_io(void)
+{
+ native_io_delay();
+#ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
+#endif
+}
+#endif
+
+/*
+ * Talk about misusing macros...
+ */
+#define __OUT1(s, x) \
+static inline void out##s(unsigned x value, unsigned short port) {
+
+#define __OUT2(s, s1, s2) \
+asm volatile ("out" #s " %" s1 "0,%" s2 "1"
+
+#ifndef REALLY_SLOW_IO
+#define REALLY_SLOW_IO
+#define UNSET_REALLY_SLOW_IO
+#endif
+
+#define __OUT(s, s1, x) \
+ __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
+ } \
+ __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
+ slow_down_io(); \
+}
+
+#define __IN1(s) \
+static inline RETURN_TYPE in##s(unsigned short port) \
+{ \
+ RETURN_TYPE _v;
+
+#define __IN2(s, s1, s2) \
+ asm volatile ("in" #s " %" s2 "1,%" s1 "0"
+
+#define __IN(s, s1, i...) \
+ __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
+ return _v; \
+ } \
+ __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
+ slow_down_io(); \
+ return _v; }
+
+#ifdef UNSET_REALLY_SLOW_IO
+#undef REALLY_SLOW_IO
+#endif
+
+#define __INS(s) \
+static inline void ins##s(unsigned short port, void *addr, \
+ unsigned long count) \
+{ \
+ asm volatile ("rep ; ins" #s \
+ : "=D" (addr), "=c" (count) \
+ : "d" (port), "0" (addr), "1" (count)); \
+}
+
+#define __OUTS(s) \
+static inline void outs##s(unsigned short port, const void *addr, \
+ unsigned long count) \
+{ \
+ asm volatile ("rep ; outs" #s \
+ : "=S" (addr), "=c" (count) \
+ : "d" (port), "0" (addr), "1" (count)); \
+}
+
+#define RETURN_TYPE unsigned char
+__IN(b, "")
+#undef RETURN_TYPE
+#define RETURN_TYPE unsigned short
+__IN(w, "")
+#undef RETURN_TYPE
+#define RETURN_TYPE unsigned int
+__IN(l, "")
+#undef RETURN_TYPE
+
+__OUT(b, "b", char)
+__OUT(w, "w", short)
+__OUT(l, , int)
+
+__INS(b)
+__INS(w)
+__INS(l)
+
+__OUTS(b)
+__OUTS(w)
+__OUTS(l)
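+
+/*
+ * As on 32-bit, the expansions above provide inb/inw/inl, outb/outw/outl,
+ * their _p variants and the string forms. A sketch of reading one
+ * 512-byte sector from the legacy IDE data port with the string form
+ * ("buf" is a hypothetical buffer):
+ *
+ *	u16 buf[256];
+ *	insw(0x1f0, buf, 256);
+ */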
+
+#define IO_SPACE_LIMIT 0xffff
+
+#if defined(__KERNEL__) && defined(__x86_64__)
+
+#include <linux/vmalloc.h>
+
+#ifndef __i386__
+/*
+ * Change virtual addresses to physical addresses and vice versa.
+ * These are pretty trivial.
+ */
+static inline unsigned long virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+
+static inline void *phys_to_virt(unsigned long address)
+{
+ return __va(address);
+}
+#endif
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+
+#include <asm-generic/iomap.h>
+
+/*
+ * This one maps high address device memory and turns off caching for that
+ * area. It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
+ unsigned long prot_val);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ */
+#define isa_virt_to_bus virt_to_phys
+#define isa_page_to_bus page_to_phys
+#define isa_bus_to_virt phys_to_virt
+
+/*
+ * However, PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus virt_to_phys
+#define bus_to_virt phys_to_virt
+
+void __memcpy_fromio(void *, unsigned long, unsigned);
+void __memcpy_toio(unsigned long, const void *, unsigned);
+
+static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
+ unsigned len)
+{
+ __memcpy_fromio(to, (unsigned long)from, len);
+}
+
+static inline void memcpy_toio(volatile void __iomem *to, const void *from,
+ unsigned len)
+{
+ __memcpy_toio((unsigned long)to, from, len);
+}
+
+void memset_io(volatile void __iomem *a, int b, size_t c);
+
+/*
+ * ISA space is 'always mapped' on a typical x86 system, so there is no
+ * need to explicitly ioremap() it. The fact that the ISA IO space is
+ * mapped to PAGE_OFFSET is pure coincidence - it does not mean ISA
+ * values are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite close):
+ */
+#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
+
+#define flush_write_buffers()
+
+#define BIO_VMERGE_BOUNDARY iommu_bio_merge
+
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+#define xlate_dev_kmem_ptr(p) p
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_IO_64_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
new file mode 100644
index 00000000000..6afd9933a7d
--- /dev/null
+++ b/arch/x86/include/asm/io_apic.h
@@ -0,0 +1,204 @@
+#ifndef _ASM_X86_IO_APIC_H
+#define _ASM_X86_IO_APIC_H
+
+#include <linux/types.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/irq_vectors.h>
+
+/*
+ * Intel IO-APIC support for SMP and UP systems.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
+ */
+
+/* I/O Unit Redirection Table */
+#define IO_APIC_REDIR_VECTOR_MASK 0x000FF
+#define IO_APIC_REDIR_DEST_LOGICAL 0x00800
+#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000
+#define IO_APIC_REDIR_SEND_PENDING (1 << 12)
+#define IO_APIC_REDIR_REMOTE_IRR (1 << 14)
+#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15)
+#define IO_APIC_REDIR_MASKED (1 << 16)
+
+/*
+ * The structure of the IO-APIC:
+ */
+union IO_APIC_reg_00 {
+ u32 raw;
+ struct {
+ u32 __reserved_2 : 14,
+ LTS : 1,
+ delivery_type : 1,
+ __reserved_1 : 8,
+ ID : 8;
+ } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_01 {
+ u32 raw;
+ struct {
+ u32 version : 8,
+ __reserved_2 : 7,
+ PRQ : 1,
+ entries : 8,
+ __reserved_1 : 8;
+ } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_02 {
+ u32 raw;
+ struct {
+ u32 __reserved_2 : 24,
+ arbitration : 4,
+ __reserved_1 : 4;
+ } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_03 {
+ u32 raw;
+ struct {
+ u32 boot_DT : 1,
+ __reserved_1 : 31;
+ } __attribute__ ((packed)) bits;
+};
+
+enum ioapic_irq_destination_types {
+ dest_Fixed = 0,
+ dest_LowestPrio = 1,
+ dest_SMI = 2,
+ dest__reserved_1 = 3,
+ dest_NMI = 4,
+ dest_INIT = 5,
+ dest__reserved_2 = 6,
+ dest_ExtINT = 7
+};
+
+struct IO_APIC_route_entry {
+ __u32 vector : 8,
+ delivery_mode : 3, /* 000: FIXED
+ * 001: lowest prio
+ * 111: ExtINT
+ */
+ dest_mode : 1, /* 0: physical, 1: logical */
+ delivery_status : 1,
+ polarity : 1,
+ irr : 1,
+ trigger : 1, /* 0: edge, 1: level */
+ mask : 1, /* 0: enabled, 1: disabled */
+ __reserved_2 : 15;
+
+ __u32 __reserved_3 : 24,
+ dest : 8;
+} __attribute__ ((packed));
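+
+/*
+ * A sketch of how an entry might be filled in before being written to
+ * the redirection table (the values are illustrative only):
+ *
+ *	struct IO_APIC_route_entry entry;
+ *
+ *	memset(&entry, 0, sizeof(entry));
+ *	entry.vector = IRQ0_VECTOR;
+ *	entry.delivery_mode = dest_Fixed;
+ *	entry.dest_mode = 0;	(physical)
+ *	entry.trigger = 0;	(edge)
+ *	entry.mask = 1;		(start out masked)
+ */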
+
+struct IR_IO_APIC_route_entry {
+ __u64 vector : 8,
+ zero : 3,
+ index2 : 1,
+ delivery_status : 1,
+ polarity : 1,
+ irr : 1,
+ trigger : 1,
+ mask : 1,
+ reserved : 31,
+ format : 1,
+ index : 15;
+} __attribute__ ((packed));
+
+#ifdef CONFIG_X86_IO_APIC
+
+/*
+ * # of IO-APICs and # of IRQ routing registers
+ */
+extern int nr_ioapics;
+extern int nr_ioapic_registers[MAX_IO_APICS];
+
+/*
+ * MP-BIOS irq configuration table structures:
+ */
+
+#define MP_MAX_IOAPIC_PIN 127
+
+struct mp_config_ioapic {
+ unsigned long mp_apicaddr;
+ unsigned int mp_apicid;
+ unsigned char mp_type;
+ unsigned char mp_apicver;
+ unsigned char mp_flags;
+};
+
+struct mp_config_intsrc {
+ unsigned int mp_dstapic;
+ unsigned char mp_type;
+ unsigned char mp_irqtype;
+ unsigned short mp_irqflag;
+ unsigned char mp_srcbus;
+ unsigned char mp_srcbusirq;
+ unsigned char mp_dstirq;
+};
+
+/* I/O APIC entries */
+extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
+
+/* # of MP IRQ source entries */
+extern int mp_irq_entries;
+
+/* MP IRQ source entries */
+extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* non-0 if default (table-less) MP configuration */
+extern int mpc_default_type;
+
+/* Older SiS APIC requires we rewrite the index register */
+extern int sis_apic_bug;
+
+/* 1 if "noapic" boot option passed */
+extern int skip_ioapic_setup;
+
+/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
+extern int timer_through_8259;
+
+static inline void disable_ioapic_setup(void)
+{
+ skip_ioapic_setup = 1;
+}
+
+/*
+ * If we use the IO-APIC for IRQ routing, disable automatic
+ * assignment of PCI IRQ's.
+ */
+#define io_apic_assign_pci_irqs \
+ (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
+
+#ifdef CONFIG_ACPI
+extern int io_apic_get_unique_id(int ioapic, int apic_id);
+extern int io_apic_get_version(int ioapic);
+extern int io_apic_get_redir_entries(int ioapic);
+extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
+ int edge_level, int active_high_low);
+#endif /* CONFIG_ACPI */
+
+extern int (*ioapic_renumber_irq)(int ioapic, int irq);
+extern void ioapic_init_mappings(void);
+
+#ifdef CONFIG_X86_64
+extern int save_mask_IO_APIC_setup(void);
+extern void restore_IO_APIC_setup(void);
+extern void reinit_intr_remapped_IO_APIC(int);
+#endif
+
+extern int probe_nr_irqs(void);
+
+#else /* !CONFIG_X86_IO_APIC */
+#define io_apic_assign_pci_irqs 0
+static const int timer_through_8259 = 0;
+static inline void ioapic_init_mappings(void) { }
+
+static inline int probe_nr_irqs(void)
+{
+ return NR_IRQS;
+}
+#endif
+
+#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/ioctl.h b/arch/x86/include/asm/ioctl.h
new file mode 100644
index 00000000000..b279fe06dfe
--- /dev/null
+++ b/arch/x86/include/asm/ioctl.h
@@ -0,0 +1 @@
+#include <asm-generic/ioctl.h>
diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h
new file mode 100644
index 00000000000..0d5b23b7b06
--- /dev/null
+++ b/arch/x86/include/asm/ioctls.h
@@ -0,0 +1,94 @@
+#ifndef _ASM_X86_IOCTLS_H
+#define _ASM_X86_IOCTLS_H
+
+#include <asm/ioctl.h>
+
+/* 0x54 is just a magic number to make these relatively unique ('T') */
+
+#define TCGETS 0x5401
+#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */
+#define TCSETSW 0x5403
+#define TCSETSF 0x5404
+#define TCGETA 0x5405
+#define TCSETA 0x5406
+#define TCSETAW 0x5407
+#define TCSETAF 0x5408
+#define TCSBRK 0x5409
+#define TCXONC 0x540A
+#define TCFLSH 0x540B
+#define TIOCEXCL 0x540C
+#define TIOCNXCL 0x540D
+#define TIOCSCTTY 0x540E
+#define TIOCGPGRP 0x540F
+#define TIOCSPGRP 0x5410
+#define TIOCOUTQ 0x5411
+#define TIOCSTI 0x5412
+#define TIOCGWINSZ 0x5413
+#define TIOCSWINSZ 0x5414
+#define TIOCMGET 0x5415
+#define TIOCMBIS 0x5416
+#define TIOCMBIC 0x5417
+#define TIOCMSET 0x5418
+#define TIOCGSOFTCAR 0x5419
+#define TIOCSSOFTCAR 0x541A
+#define FIONREAD 0x541B
+#define TIOCINQ FIONREAD
+#define TIOCLINUX 0x541C
+#define TIOCCONS 0x541D
+#define TIOCGSERIAL 0x541E
+#define TIOCSSERIAL 0x541F
+#define TIOCPKT 0x5420
+#define FIONBIO 0x5421
+#define TIOCNOTTY 0x5422
+#define TIOCSETD 0x5423
+#define TIOCGETD 0x5424
+#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */
+/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */
+#define TIOCSBRK 0x5427 /* BSD compatibility */
+#define TIOCCBRK 0x5428 /* BSD compatibility */
+#define TIOCGSID 0x5429 /* Return the session ID of FD */
+#define TCGETS2 _IOR('T', 0x2A, struct termios2)
+#define TCSETS2 _IOW('T', 0x2B, struct termios2)
+#define TCSETSW2 _IOW('T', 0x2C, struct termios2)
+#define TCSETSF2 _IOW('T', 0x2D, struct termios2)
+#define TIOCGRS485 0x542E
+#define TIOCSRS485 0x542F
+#define TIOCGPTN _IOR('T', 0x30, unsigned int)
+ /* Get Pty Number (of pty-mux device) */
+#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */
+#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */
+#define TCSETX 0x5433
+#define TCSETXF 0x5434
+#define TCSETXW 0x5435
+
+#define FIONCLEX 0x5450
+#define FIOCLEX 0x5451
+#define FIOASYNC 0x5452
+#define TIOCSERCONFIG 0x5453
+#define TIOCSERGWILD 0x5454
+#define TIOCSERSWILD 0x5455
+#define TIOCGLCKTRMIOS 0x5456
+#define TIOCSLCKTRMIOS 0x5457
+#define TIOCSERGSTRUCT 0x5458 /* For debugging only */
+#define TIOCSERGETLSR 0x5459 /* Get line status register */
+#define TIOCSERGETMULTI 0x545A /* Get multiport config */
+#define TIOCSERSETMULTI 0x545B /* Set multiport config */
+
+#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */
+#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */
+#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */
+#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */
+#define FIOQSIZE 0x5460
+
+/* Used for packet mode */
+#define TIOCPKT_DATA 0
+#define TIOCPKT_FLUSHREAD 1
+#define TIOCPKT_FLUSHWRITE 2
+#define TIOCPKT_STOP 4
+#define TIOCPKT_START 8
+#define TIOCPKT_NOSTOP 16
+#define TIOCPKT_DOSTOP 32
+
+#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */
+
+#endif /* _ASM_X86_IOCTLS_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
new file mode 100644
index 00000000000..98e28ea8cd1
--- /dev/null
+++ b/arch/x86/include/asm/iommu.h
@@ -0,0 +1,50 @@
+#ifndef _ASM_X86_IOMMU_H
+#define _ASM_X86_IOMMU_H
+
+extern void pci_iommu_shutdown(void);
+extern void no_iommu_init(void);
+extern struct dma_mapping_ops nommu_dma_ops;
+extern int force_iommu, no_iommu;
+extern int iommu_detected;
+extern int dmar_disabled;
+extern int forbid_dac;
+
+extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
+
+/* 10 seconds */
+#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
+
+#ifdef CONFIG_GART_IOMMU
+extern int gart_iommu_aperture;
+extern int gart_iommu_aperture_allowed;
+extern int gart_iommu_aperture_disabled;
+
+extern void early_gart_iommu_check(void);
+extern void gart_iommu_init(void);
+extern void gart_iommu_shutdown(void);
+extern void __init gart_parse_options(char *);
+extern void gart_iommu_hole_init(void);
+
+#else
+#define gart_iommu_aperture 0
+#define gart_iommu_aperture_allowed 0
+#define gart_iommu_aperture_disabled 1
+
+static inline void early_gart_iommu_check(void)
+{
+}
+static inline void gart_iommu_init(void)
+{
+}
+static inline void gart_iommu_shutdown(void)
+{
+}
+static inline void gart_parse_options(char *options)
+{
+}
+static inline void gart_iommu_hole_init(void)
+{
+}
+#endif
+
+#endif /* _ASM_X86_IOMMU_H */
diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h
new file mode 100644
index 00000000000..ee678fd5159
--- /dev/null
+++ b/arch/x86/include/asm/ipcbuf.h
@@ -0,0 +1,28 @@
+#ifndef _ASM_X86_IPCBUF_H
+#define _ASM_X86_IPCBUF_H
+
+/*
+ * The ipc64_perm structure for the x86 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 32-bit mode_t and seq
+ * - 2 miscellaneous 32-bit values
+ */
+
+struct ipc64_perm {
+ __kernel_key_t key;
+ __kernel_uid32_t uid;
+ __kernel_gid32_t gid;
+ __kernel_uid32_t cuid;
+ __kernel_gid32_t cgid;
+ __kernel_mode_t mode;
+ unsigned short __pad1;
+ unsigned short seq;
+ unsigned short __pad2;
+ unsigned long __unused1;
+ unsigned long __unused2;
+};
+
+#endif /* _ASM_X86_IPCBUF_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
new file mode 100644
index 00000000000..f89dffb28aa
--- /dev/null
+++ b/arch/x86/include/asm/ipi.h
@@ -0,0 +1,138 @@
+#ifndef _ASM_X86_IPI_H
+#define _ASM_X86_IPI_H
+
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC InterProcessor Interrupt code.
+ *
+ * Moved to include file by James Cleverdon from
+ * arch/x86-64/kernel/smp.c
+ *
+ * Copyrights from kernel/smp.c:
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ */
+
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/smp.h>
+
+/*
+ * The following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
+
+static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
+ unsigned int dest)
+{
+ unsigned int icr = shortcut | dest;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
+
+static inline int __prepare_ICR2(unsigned int mask)
+{
+ return SET_APIC_DEST_FIELD(mask);
+}
+
+static inline void __xapic_wait_icr_idle(void)
+{
+ while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
+
+static inline void __send_IPI_shortcut(unsigned int shortcut, int vector,
+ unsigned int dest)
+{
+ /*
+ * Subtle. In the case of the 'never do double writes' workaround
+ * we have to lock out interrupts to be safe. As we don't care
+ * about the value read, we use an atomic rmw access to avoid costly
+ * cli/sti. Otherwise we use an even cheaper single atomic write
+ * to the APIC.
+ */
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ __xapic_wait_icr_idle();
+
+ /*
+ * No need to touch the target chip field
+ */
+ cfg = __prepare_ICR(shortcut, vector, dest);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ native_apic_mem_write(APIC_ICR, cfg);
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+static inline void __send_IPI_dest_field(unsigned int mask, int vector,
+ unsigned int dest)
+{
+ unsigned long cfg;
+
+ /*
+ * Wait for idle.
+ */
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ __xapic_wait_icr_idle();
+
+ /*
+ * prepare target chip field
+ */
+ cfg = __prepare_ICR2(mask);
+ native_apic_mem_write(APIC_ICR2, cfg);
+
+ /*
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector, dest);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ native_apic_mem_write(APIC_ICR, cfg);
+}
+
+static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
+{
+ unsigned long flags;
+ unsigned long query_cpu;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicast to each CPU instead.
+ * - mbligh
+ */
+ local_irq_save(flags);
+ for_each_cpu_mask_nr(query_cpu, mask) {
+ __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
+ vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
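+
+/*
+ * A sketch of how a genapic implementation might use this helper, e.g.
+ * to deliver the reschedule vector to a set of CPUs in physical mode:
+ *
+ *	send_IPI_mask_sequence(mask, RESCHEDULE_VECTOR);
+ */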
+
+#endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
new file mode 100644
index 00000000000..bae0eda9548
--- /dev/null
+++ b/arch/x86/include/asm/irq.h
@@ -0,0 +1,50 @@
+#ifndef _ASM_X86_IRQ_H
+#define _ASM_X86_IRQ_H
+/*
+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
+ *
+ * IRQ/IPI changes taken from work by Thomas Radke
+ * <tomsoft@informatik.tu-chemnitz.de>
+ */
+
+#include <asm/apicdef.h>
+#include <asm/irq_vectors.h>
+
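+/*
+ * On the PC, IRQ 2 is the 8259 cascade and never triggers on its own;
+ * interrupts routed there actually arrive on IRQ 9, so
+ * irq_canonicalize(2) == 9 while every other IRQ is returned unchanged.
+ */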
+static inline int irq_canonicalize(int irq)
+{
+ return ((irq == 2) ? 9 : irq);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+# define ARCH_HAS_NMI_WATCHDOG
+#endif
+
+#ifdef CONFIG_4KSTACKS
+ extern void irq_ctx_init(int cpu);
+ extern void irq_ctx_exit(int cpu);
+# define __ARCH_HAS_DO_SOFTIRQ
+#else
+# define irq_ctx_init(cpu) do { } while (0)
+# define irq_ctx_exit(cpu) do { } while (0)
+# ifdef CONFIG_X86_64
+# define __ARCH_HAS_DO_SOFTIRQ
+# endif
+#endif
+
+#ifdef CONFIG_IRQBALANCE
+extern int irqbalance_disable(char *str);
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <linux/cpumask.h>
+extern void fixup_irqs(cpumask_t map);
+#endif
+
+extern unsigned int do_IRQ(struct pt_regs *regs);
+extern void init_IRQ(void);
+extern void native_init_IRQ(void);
+
+/* Interrupt vector management */
+extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
+
+#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
new file mode 100644
index 00000000000..89c898ab298
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "irq_regs_32.h"
+#else
+# include "irq_regs_64.h"
+#endif
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
new file mode 100644
index 00000000000..af2f02d27fc
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs_32.h
@@ -0,0 +1,29 @@
+/*
+ * Per-cpu current frame pointer - the location of the last exception frame on
+ * the stack, stored in the per-cpu area.
+ *
+ * Jeremy Fitzhardinge <jeremy@goop.org>
+ */
+#ifndef _ASM_X86_IRQ_REGS_32_H
+#define _ASM_X86_IRQ_REGS_32_H
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU(struct pt_regs *, irq_regs);
+
+static inline struct pt_regs *get_irq_regs(void)
+{
+ return x86_read_percpu(irq_regs);
+}
+
+static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
+{
+ struct pt_regs *old_regs;
+
+ old_regs = get_irq_regs();
+ x86_write_percpu(irq_regs, new_regs);
+
+ return old_regs;
+}
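+
+/*
+ * A sketch of the usual pattern in an interrupt entry path (not copied
+ * from do_IRQ itself):
+ *
+ *	struct pt_regs *old_regs = set_irq_regs(regs);
+ *
+ *	...handle the interrupt...
+ *
+ *	set_irq_regs(old_regs);
+ */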
+
+#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_64.h b/arch/x86/include/asm/irq_regs_64.h
new file mode 100644
index 00000000000..3dd9c0b7027
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs_64.h
@@ -0,0 +1 @@
+#include <asm-generic/irq_regs.h>
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
new file mode 100644
index 00000000000..20e1fd588db
--- /dev/null
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_X86_IRQ_REMAPPING_H
+#define _ASM_X86_IRQ_REMAPPING_H
+
+extern int x2apic;
+
+#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
+
+#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
new file mode 100644
index 00000000000..d843ed0e9b2
--- /dev/null
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -0,0 +1,164 @@
+#ifndef _ASM_X86_IRQ_VECTORS_H
+#define _ASM_X86_IRQ_VECTORS_H
+
+#include <linux/threads.h>
+
+#define NMI_VECTOR 0x02
+
+/*
+ * IDT vectors usable for external interrupt sources start
+ * at 0x20:
+ */
+#define FIRST_EXTERNAL_VECTOR 0x20
+
+#ifdef CONFIG_X86_32
+# define SYSCALL_VECTOR 0x80
+#else
+# define IA32_SYSCALL_VECTOR 0x80
+#endif
+
+/*
+ * Reserve the lowest usable priority level 0x20 - 0x2f for triggering
+ * cleanup after irq migration.
+ */
+#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+
+/*
+ * Vectors 0x30-0x3f are used for ISA interrupts.
+ */
+#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
+#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
+#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
+#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
+#define IRQ4_VECTOR (IRQ0_VECTOR + 4)
+#define IRQ5_VECTOR (IRQ0_VECTOR + 5)
+#define IRQ6_VECTOR (IRQ0_VECTOR + 6)
+#define IRQ7_VECTOR (IRQ0_VECTOR + 7)
+#define IRQ8_VECTOR (IRQ0_VECTOR + 8)
+#define IRQ9_VECTOR (IRQ0_VECTOR + 9)
+#define IRQ10_VECTOR (IRQ0_VECTOR + 10)
+#define IRQ11_VECTOR (IRQ0_VECTOR + 11)
+#define IRQ12_VECTOR (IRQ0_VECTOR + 12)
+#define IRQ13_VECTOR (IRQ0_VECTOR + 13)
+#define IRQ14_VECTOR (IRQ0_VECTOR + 14)
+#define IRQ15_VECTOR (IRQ0_VECTOR + 15)
+
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ * Some of the following vectors are 'rare'; they are merged
+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
+ * TLB, reschedule and local APIC vectors are performance-critical.
+ *
+ * Vectors 0xf0-0xfa are free (reserved for future Linux use).
+ */
+#ifdef CONFIG_X86_32
+
+# define SPURIOUS_APIC_VECTOR 0xff
+# define ERROR_APIC_VECTOR 0xfe
+# define INVALIDATE_TLB_VECTOR 0xfd
+# define RESCHEDULE_VECTOR 0xfc
+# define CALL_FUNCTION_VECTOR 0xfb
+# define CALL_FUNCTION_SINGLE_VECTOR 0xfa
+# define THERMAL_APIC_VECTOR 0xf0
+
+#else
+
+#define SPURIOUS_APIC_VECTOR 0xff
+#define ERROR_APIC_VECTOR 0xfe
+#define RESCHEDULE_VECTOR 0xfd
+#define CALL_FUNCTION_VECTOR 0xfc
+#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
+#define THERMAL_APIC_VECTOR 0xfa
+#define THRESHOLD_APIC_VECTOR 0xf9
+#define UV_BAU_MESSAGE 0xf8
+#define INVALIDATE_TLB_VECTOR_END 0xf7
+#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
+
+#define NUM_INVALIDATE_TLB_VECTORS 8
+
+#endif
+
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR 0xef
+
+/*
+ * First APIC vector available to drivers (vectors 0x30-0xee): we
+ * start at 0x31 (0x41) to spread out vectors evenly between priority
+ * levels (0x80 is the syscall vector).
+ */
+#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
+
+#define NR_VECTORS 256
+
+#define FPU_IRQ 13
+
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
+#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
+
+#ifdef CONFIG_X86_64
+# if NR_CPUS < MAX_IO_APICS
+# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
+# else
+# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
+# endif
+
+#elif !defined(CONFIG_X86_VOYAGER)
+
+# if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT) || defined(CONFIG_X86_VISWS)
+
+# define NR_IRQS 224
+
+# else /* IO_APIC || PARAVIRT */
+
+# define NR_IRQS 16
+
+# endif
+
+#else /* !VISWS && !VOYAGER */
+
+# define NR_IRQS 224
+
+#endif /* VISWS */
+
+/* Voyager specific defines */
+/* These define the CPIs we use in linux */
+#define VIC_CPI_LEVEL0 0
+#define VIC_CPI_LEVEL1 1
+/* now the fake CPIs */
+#define VIC_TIMER_CPI 2
+#define VIC_INVALIDATE_CPI 3
+#define VIC_RESCHEDULE_CPI 4
+#define VIC_ENABLE_IRQ_CPI 5
+#define VIC_CALL_FUNCTION_CPI 6
+#define VIC_CALL_FUNCTION_SINGLE_CPI 7
+
+/* Now the QIC CPIs: Since we don't need the two initial levels,
+ * these are 2 less than the VIC CPIs */
+#define QIC_CPI_OFFSET 1
+#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET)
+#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET)
+#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET)
+#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET)
+#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET)
+#define QIC_CALL_FUNCTION_SINGLE_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET)
+
+#define VIC_START_FAKE_CPI VIC_TIMER_CPI
+#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI
+
+/* this is the SYS_INT CPI. */
+#define VIC_SYS_INT 8
+#define VIC_CMN_INT 15
+
+/* This is the boot CPI for alternate processors. It gets overwritten
+ * by the above once the system has activated all available processors */
+#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0
+#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8)
+
+
+#endif /* _ASM_X86_IRQ_VECTORS_H */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
new file mode 100644
index 00000000000..2bdab21f089
--- /dev/null
+++ b/arch/x86/include/asm/irqflags.h
@@ -0,0 +1,211 @@
+#ifndef _X86_IRQFLAGS_H_
+#define _X86_IRQFLAGS_H_
+
+#include <asm/processor-flags.h>
+
+#ifndef __ASSEMBLY__
+/*
+ * Interrupt control:
+ */
+
+static inline unsigned long native_save_fl(void)
+{
+ unsigned long flags;
+
+ asm volatile("# __raw_save_flags\n\t"
+ "pushf ; pop %0"
+ : "=g" (flags)
+ : /* no input */
+ : "memory");
+
+ return flags;
+}
+
+static inline void native_restore_fl(unsigned long flags)
+{
+ asm volatile("push %0 ; popf"
+ : /* no output */
+ :"g" (flags)
+ :"memory", "cc");
+}
+
+static inline void native_irq_disable(void)
+{
+ asm volatile("cli": : :"memory");
+}
+
+static inline void native_irq_enable(void)
+{
+ asm volatile("sti": : :"memory");
+}
+
+static inline void native_safe_halt(void)
+{
+ asm volatile("sti; hlt": : :"memory");
+}
+
+static inline void native_halt(void)
+{
+ asm volatile("hlt": : :"memory");
+}
+
+#endif
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#ifndef __ASSEMBLY__
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ return native_save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+ native_restore_fl(flags);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ native_irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ native_irq_enable();
+}
+
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static inline void raw_safe_halt(void)
+{
+ native_safe_halt();
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+static inline void halt(void)
+{
+ native_halt();
+}
+
+/*
+ * For spinlocks, etc:
+ */
+static inline unsigned long __raw_local_irq_save(void)
+{
+ unsigned long flags = __raw_local_save_flags();
+
+ raw_local_irq_disable();
+
+ return flags;
+}
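+
+/*
+ * A sketch of the save/restore pattern these primitives support
+ * (drivers normally use the local_irq_save()/local_irq_restore()
+ * wrappers instead):
+ *
+ *	unsigned long flags;
+ *
+ *	raw_local_irq_save(flags);
+ *	...critical section...
+ *	raw_local_irq_restore(flags);
+ */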
+#else
+
+#define ENABLE_INTERRUPTS(x) sti
+#define DISABLE_INTERRUPTS(x) cli
+
+#ifdef CONFIG_X86_64
+#define SWAPGS swapgs
+/*
+ * Currently paravirt can't handle swapgs nicely when we
+ * don't have a stack we can rely on (such as a user space
+ * stack). So we either find a way around these or just fault
+ * and emulate if a guest tries to call swapgs directly.
+ *
+ * Either way, this is a good way to document that we don't
+ * have a reliable stack. x86_64 only.
+ */
+#define SWAPGS_UNSAFE_STACK swapgs
+
+#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
+
+#define INTERRUPT_RETURN iretq
+#define USERGS_SYSRET64 \
+ swapgs; \
+ sysretq;
+#define USERGS_SYSRET32 \
+ swapgs; \
+ sysretl
+#define ENABLE_INTERRUPTS_SYSEXIT32 \
+ swapgs; \
+ sti; \
+ sysexit
+
+#else
+#define INTERRUPT_RETURN iret
+#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+#define GET_CR0_INTO_EAX movl %cr0, %eax
+#endif
+
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+ do { (flags) = __raw_local_save_flags(); } while (0)
+
+#define raw_local_irq_save(flags) \
+ do { (flags) = __raw_local_irq_save(); } while (0)
+
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+ return !(flags & X86_EFLAGS_IF);
+}
+
+static inline int raw_irqs_disabled(void)
+{
+ unsigned long flags = __raw_local_save_flags();
+
+ return raw_irqs_disabled_flags(flags);
+}
+
+#else
+
+#ifdef CONFIG_X86_64
+#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
+ TRACE_IRQS_ON; \
+ sti; \
+ SAVE_REST; \
+ LOCKDEP_SYS_EXIT; \
+ RESTORE_REST; \
+ cli; \
+ TRACE_IRQS_OFF;
+
+#else
+#define ARCH_LOCKDEP_SYS_EXIT \
+ pushl %eax; \
+ pushl %ecx; \
+ pushl %edx; \
+ call lockdep_sys_exit; \
+ popl %edx; \
+ popl %ecx; \
+ popl %eax;
+
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
+# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
+#else
+# define TRACE_IRQS_ON
+# define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
+# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
+# else
+# define LOCKDEP_SYS_EXIT
+# define LOCKDEP_SYS_EXIT_IRQ
+# endif
+
+#endif /* __ASSEMBLY__ */
+#endif
diff --git a/arch/x86/include/asm/ist.h b/arch/x86/include/asm/ist.h
new file mode 100644
index 00000000000..7e5dff1de0e
--- /dev/null
+++ b/arch/x86/include/asm/ist.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_IST_H
+#define _ASM_X86_IST_H
+
+/*
+ * Include file for the interface to IST BIOS
+ * Copyright 2002 Andy Grover <andrew.grover@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+
+#include <linux/types.h>
+
+struct ist_info {
+ __u32 signature;
+ __u32 command;
+ __u32 event;
+ __u32 perf_level;
+};
+
+#ifdef __KERNEL__
+
+extern struct ist_info ist_info;
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_IST_H */
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
new file mode 100644
index 00000000000..54c8cc53b24
--- /dev/null
+++ b/arch/x86/include/asm/k8.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_X86_K8_H
+#define _ASM_X86_K8_H
+
+#include <linux/pci.h>
+
+extern struct pci_device_id k8_nb_ids[];
+
+extern int early_is_k8_nb(u32 value);
+extern struct pci_dev **k8_northbridges;
+extern int num_k8_northbridges;
+extern int cache_k8_northbridges(void);
+extern void k8_flush_garts(void);
+extern int k8_scan_nodes(unsigned long start, unsigned long end);
+
+#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
new file mode 100644
index 00000000000..fa7c0b97476
--- /dev/null
+++ b/arch/x86/include/asm/kdebug.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_X86_KDEBUG_H
+#define _ASM_X86_KDEBUG_H
+
+#include <linux/notifier.h>
+
+struct pt_regs;
+
+/* Grossly misnamed. */
+enum die_val {
+ DIE_OOPS = 1,
+ DIE_INT3,
+ DIE_DEBUG,
+ DIE_PANIC,
+ DIE_NMI,
+ DIE_DIE,
+ DIE_NMIWATCHDOG,
+ DIE_KERNELDEBUG,
+ DIE_TRAP,
+ DIE_GPF,
+ DIE_CALL,
+ DIE_NMI_IPI,
+ DIE_PAGE_FAULT,
+ DIE_NMIUNKNOWN,
+};
+
+extern void printk_address(unsigned long address, int reliable);
+extern void die(const char *, struct pt_regs *,long);
+extern int __must_check __die(const char *, struct pt_regs *, long);
+extern void show_registers(struct pt_regs *regs);
+extern void show_trace(struct task_struct *t, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp);
+extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_regs(struct pt_regs *regs);
+extern unsigned long oops_begin(void);
+extern void oops_end(unsigned long, struct pt_regs *, int signr);
+
+#endif /* _ASM_X86_KDEBUG_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
new file mode 100644
index 00000000000..a1f22771a15
--- /dev/null
+++ b/arch/x86/include/asm/kexec.h
@@ -0,0 +1,175 @@
+#ifndef _ASM_X86_KEXEC_H
+#define _ASM_X86_KEXEC_H
+
+#ifdef CONFIG_X86_32
+# define PA_CONTROL_PAGE 0
+# define VA_CONTROL_PAGE 1
+# define PA_PGD 2
+# define VA_PGD 3
+# define PA_PTE_0 4
+# define VA_PTE_0 5
+# define PA_PTE_1 6
+# define VA_PTE_1 7
+# define PA_SWAP_PAGE 8
+# ifdef CONFIG_X86_PAE
+# define PA_PMD_0 9
+# define VA_PMD_0 10
+# define PA_PMD_1 11
+# define VA_PMD_1 12
+# define PAGES_NR 13
+# else
+# define PAGES_NR 9
+# endif
+#else
+# define PA_CONTROL_PAGE 0
+# define VA_CONTROL_PAGE 1
+# define PA_PGD 2
+# define VA_PGD 3
+# define PA_PUD_0 4
+# define VA_PUD_0 5
+# define PA_PMD_0 6
+# define VA_PMD_0 7
+# define PA_PTE_0 8
+# define VA_PTE_0 9
+# define PA_PUD_1 10
+# define VA_PUD_1 11
+# define PA_PMD_1 12
+# define VA_PMD_1 13
+# define PA_PTE_1 14
+# define VA_PTE_1 15
+# define PA_TABLE_PAGE 16
+# define PAGES_NR 17
+#endif
+
+#ifdef CONFIG_X86_32
+# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
+#endif
+
+#ifndef __ASSEMBLY__
+
+#include <linux/string.h>
+
+#include <asm/page.h>
+#include <asm/ptrace.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT is the maximum page get_free_page can
+ * return, i.e. the highest page that is mapped directly into kernel
+ * memory, so that kmap is not required.
+ *
+ * So far x86_64 is limited to 40 physical address bits.
+ */
+#ifdef CONFIG_X86_32
+/* Maximum physical address we can use pages from */
+# define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+# define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+/* Maximum address we can use for the control code buffer */
+# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+
+# define KEXEC_CONTROL_PAGE_SIZE 4096
+
+/* The native architecture */
+# define KEXEC_ARCH KEXEC_ARCH_386
+
+/* We can also handle crash dumps from a 64-bit kernel. */
+# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
+#else
+/* Maximum physical address we can use pages from */
+# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+/* Maximum address we can reach in physical address mode */
+# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+/* Maximum address we can use for the control pages */
+# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+
+/* Allocate one page for the pdp and the second for the code */
+# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
+
+/* The native architecture */
+# define KEXEC_ARCH KEXEC_ARCH_X86_64
+#endif
+
+/*
+ * The CPU does not save ss and sp on the stack if execution is already
+ * running in kernel mode at the time of the NMI. This code fixes
+ * that up.
+ */
+static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
+ struct pt_regs *oldregs)
+{
+#ifdef CONFIG_X86_32
+ newregs->sp = (unsigned long)&(oldregs->sp);
+ asm volatile("xorl %%eax, %%eax\n\t"
+ "movw %%ss, %%ax\n\t"
+ :"=a"(newregs->ss));
+#endif
+}
+
+/*
+ * This function is responsible for capturing register state if we come
+ * in via panic; otherwise it just fixes up ss and sp if we come in via
+ * a kernel-mode exception.
+ */
+static inline void crash_setup_regs(struct pt_regs *newregs,
+ struct pt_regs *oldregs)
+{
+ if (oldregs) {
+ memcpy(newregs, oldregs, sizeof(*newregs));
+ crash_fixup_ss_esp(newregs, oldregs);
+ } else {
+#ifdef CONFIG_X86_32
+ asm volatile("movl %%ebx,%0" : "=m"(newregs->bx));
+ asm volatile("movl %%ecx,%0" : "=m"(newregs->cx));
+ asm volatile("movl %%edx,%0" : "=m"(newregs->dx));
+ asm volatile("movl %%esi,%0" : "=m"(newregs->si));
+ asm volatile("movl %%edi,%0" : "=m"(newregs->di));
+ asm volatile("movl %%ebp,%0" : "=m"(newregs->bp));
+ asm volatile("movl %%eax,%0" : "=m"(newregs->ax));
+ asm volatile("movl %%esp,%0" : "=m"(newregs->sp));
+ asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
+ asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
+ asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds));
+ asm volatile("movl %%es, %%eax;" :"=a"(newregs->es));
+ asm volatile("pushfl; popl %0" :"=m"(newregs->flags));
+#else
+ asm volatile("movq %%rbx,%0" : "=m"(newregs->bx));
+ asm volatile("movq %%rcx,%0" : "=m"(newregs->cx));
+ asm volatile("movq %%rdx,%0" : "=m"(newregs->dx));
+ asm volatile("movq %%rsi,%0" : "=m"(newregs->si));
+ asm volatile("movq %%rdi,%0" : "=m"(newregs->di));
+ asm volatile("movq %%rbp,%0" : "=m"(newregs->bp));
+ asm volatile("movq %%rax,%0" : "=m"(newregs->ax));
+ asm volatile("movq %%rsp,%0" : "=m"(newregs->sp));
+ asm volatile("movq %%r8,%0" : "=m"(newregs->r8));
+ asm volatile("movq %%r9,%0" : "=m"(newregs->r9));
+ asm volatile("movq %%r10,%0" : "=m"(newregs->r10));
+ asm volatile("movq %%r11,%0" : "=m"(newregs->r11));
+ asm volatile("movq %%r12,%0" : "=m"(newregs->r12));
+ asm volatile("movq %%r13,%0" : "=m"(newregs->r13));
+ asm volatile("movq %%r14,%0" : "=m"(newregs->r14));
+ asm volatile("movq %%r15,%0" : "=m"(newregs->r15));
+ asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
+ asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
+ asm volatile("pushfq; popq %0" :"=m"(newregs->flags));
+#endif
+ newregs->ip = (unsigned long)current_text_addr();
+ }
+}
+
+#ifdef CONFIG_X86_32
+asmlinkage unsigned long
+relocate_kernel(unsigned long indirection_page,
+ unsigned long control_page,
+ unsigned long start_address,
+ unsigned int has_pae,
+ unsigned int preserve_context);
+#else
+NORET_TYPE void
+relocate_kernel(unsigned long indirection_page,
+ unsigned long page_list,
+ unsigned long start_address) ATTRIB_NORET;
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
new file mode 100644
index 00000000000..e6c6c808489
--- /dev/null
+++ b/arch/x86/include/asm/kgdb.h
@@ -0,0 +1,79 @@
+#ifndef _ASM_X86_KGDB_H
+#define _ASM_X86_KGDB_H
+
+/*
+ * Copyright (C) 2001-2004 Amit S. Kale
+ * Copyright (C) 2008 Wind River Systems, Inc.
+ */
+
+/*
+ * BUFMAX defines the maximum number of characters in the inbound/outbound
+ * buffers. At least NUMREGBYTES*2 are needed for register packets;
+ * a longer buffer is needed to list all threads.
+ */
+#define BUFMAX 1024
+
+/*
+ * Note that this register image is in a different order than
+ * the register image that Linux produces at interrupt time.
+ *
+ * Linux's register image is defined by struct pt_regs in ptrace.h.
+ * Just why GDB uses a different order is a historical mystery.
+ */
+#ifdef CONFIG_X86_32
+enum regnames {
+ GDB_AX, /* 0 */
+ GDB_CX, /* 1 */
+ GDB_DX, /* 2 */
+ GDB_BX, /* 3 */
+ GDB_SP, /* 4 */
+ GDB_BP, /* 5 */
+ GDB_SI, /* 6 */
+ GDB_DI, /* 7 */
+ GDB_PC, /* 8 also known as eip */
+ GDB_PS, /* 9 also known as eflags */
+ GDB_CS, /* 10 */
+ GDB_SS, /* 11 */
+ GDB_DS, /* 12 */
+ GDB_ES, /* 13 */
+ GDB_FS, /* 14 */
+ GDB_GS, /* 15 */
+};
+#define NUMREGBYTES ((GDB_GS+1)*4)
+#else /* ! CONFIG_X86_32 */
+enum regnames64 {
+ GDB_AX, /* 0 */
+ GDB_BX, /* 1 */
+ GDB_CX, /* 2 */
+ GDB_DX, /* 3 */
+ GDB_SI, /* 4 */
+ GDB_DI, /* 5 */
+ GDB_BP, /* 6 */
+ GDB_SP, /* 7 */
+ GDB_R8, /* 8 */
+ GDB_R9, /* 9 */
+ GDB_R10, /* 10 */
+ GDB_R11, /* 11 */
+ GDB_R12, /* 12 */
+ GDB_R13, /* 13 */
+ GDB_R14, /* 14 */
+ GDB_R15, /* 15 */
+ GDB_PC, /* 16 */
+};
+
+enum regnames32 {
+ GDB_PS = 34,
+ GDB_CS,
+ GDB_SS,
+};
+#define NUMREGBYTES ((GDB_SS+1)*4)
+#endif /* CONFIG_X86_32 */
+
+static inline void arch_kgdb_breakpoint(void)
+{
+ asm(" int $3");
+}
+#define BREAK_INSTR_SIZE 1
+#define CACHE_FLUSH_IS_SAFE 1
+
+#endif /* _ASM_X86_KGDB_H */
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
new file mode 100644
index 00000000000..5759c165a5c
--- /dev/null
+++ b/arch/x86/include/asm/kmap_types.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_KMAP_TYPES_H
+#define _ASM_X86_KMAP_TYPES_H
+
+#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
+# define D(n) __KM_FENCE_##n ,
+#else
+# define D(n)
+#endif
+
+enum km_type {
+D(0) KM_BOUNCE_READ,
+D(1) KM_SKB_SUNRPC_DATA,
+D(2) KM_SKB_DATA_SOFTIRQ,
+D(3) KM_USER0,
+D(4) KM_USER1,
+D(5) KM_BIO_SRC_IRQ,
+D(6) KM_BIO_DST_IRQ,
+D(7) KM_PTE0,
+D(8) KM_PTE1,
+D(9) KM_IRQ0,
+D(10) KM_IRQ1,
+D(11) KM_SOFTIRQ0,
+D(12) KM_SOFTIRQ1,
+D(13) KM_TYPE_NR
+};
+
+#undef D
+
+#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
new file mode 100644
index 00000000000..4fe681de1e7
--- /dev/null
+++ b/arch/x86/include/asm/kprobes.h
@@ -0,0 +1,88 @@
+#ifndef _ASM_X86_KPROBES_H
+#define _ASM_X86_KPROBES_H
+/*
+ * Kernel Probes (KProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * See arch/x86/kernel/kprobes.c for x86 kprobes history.
+ */
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+
+#define __ARCH_WANT_KPROBES_INSN_SLOT
+
+struct pt_regs;
+struct kprobe;
+
+typedef u8 kprobe_opcode_t;
+#define BREAKPOINT_INSTRUCTION 0xcc
+#define RELATIVEJUMP_INSTRUCTION 0xe9
+#define MAX_INSN_SIZE 16
+#define MAX_STACK_SIZE 64
+#define MIN_STACK_SIZE(ADDR) \
+ (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
+ THREAD_SIZE - (unsigned long)(ADDR))) \
+ ? (MAX_STACK_SIZE) \
+ : (((unsigned long)current_thread_info()) + \
+ THREAD_SIZE - (unsigned long)(ADDR)))
+
+#define flush_insn_slot(p) do { } while (0)
+
+extern const int kretprobe_blacklist_size;
+
+void arch_remove_kprobe(struct kprobe *p);
+void kretprobe_trampoline(void);
+
+/* Architecture-specific copy of the original instruction */
+struct arch_specific_insn {
+ /* copy of the original instruction */
+ kprobe_opcode_t *insn;
+ /*
+ * boostable = -1: This instruction type is not boostable.
+ * boostable = 0: This instruction type is boostable.
+ * boostable = 1: This instruction has been boosted: we have
+ * added a relative jump after the instruction copy in insn,
+ * so no single-step and fixup are needed (unless there's
+ * a post_handler or break_handler).
+ */
+ int boostable;
+};
+
+struct prev_kprobe {
+ struct kprobe *kp;
+ unsigned long status;
+ unsigned long old_flags;
+ unsigned long saved_flags;
+};
+
+/* per-cpu kprobe control block */
+struct kprobe_ctlblk {
+ unsigned long kprobe_status;
+ unsigned long kprobe_old_flags;
+ unsigned long kprobe_saved_flags;
+ unsigned long *jprobe_saved_sp;
+ struct pt_regs jprobe_saved_regs;
+ kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
+ struct prev_kprobe prev_kprobe;
+};
+
+extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
+extern int kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data);
+#endif /* _ASM_X86_KPROBES_H */
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
new file mode 100644
index 00000000000..b95162af0bf
--- /dev/null
+++ b/arch/x86/include/asm/kvm.h
@@ -0,0 +1,211 @@
+#ifndef _ASM_X86_KVM_H
+#define _ASM_X86_KVM_H
+
+/*
+ * KVM x86 specific structures and definitions
+ */
+
+#include <asm/types.h>
+#include <linux/ioctl.h>
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+struct kvm_memory_alias {
+ __u32 slot; /* this has a different namespace than memory slots */
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 target_phys_addr;
+};
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+ __u8 last_irr; /* edge detection */
+ __u8 irr; /* interrupt request register */
+ __u8 imr; /* interrupt mask register */
+ __u8 isr; /* interrupt service register */
+ __u8 priority_add; /* highest irq priority */
+ __u8 irq_base;
+ __u8 read_reg_select;
+ __u8 poll;
+ __u8 special_mask;
+ __u8 init_state;
+ __u8 auto_eoi;
+ __u8 rotate_on_auto_eoi;
+ __u8 special_fully_nested_mode;
+ __u8 init4; /* true if 4 byte init */
+ __u8 elcr; /* PIIX edge/trigger selection */
+ __u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS 24
+struct kvm_ioapic_state {
+ __u64 base_address;
+ __u32 ioregsel;
+ __u32 id;
+ __u32 irr;
+ __u32 pad;
+ union {
+ __u64 bits;
+ struct {
+ __u8 vector;
+ __u8 delivery_mode:3;
+ __u8 dest_mode:1;
+ __u8 delivery_status:1;
+ __u8 polarity:1;
+ __u8 remote_irr:1;
+ __u8 trig_mode:1;
+ __u8 mask:1;
+ __u8 reserve:7;
+ __u8 reserved[4];
+ __u8 dest_id;
+ } fields;
+ } redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER 0
+#define KVM_IRQCHIP_PIC_SLAVE 1
+#define KVM_IRQCHIP_IOAPIC 2
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+ __u64 rax, rbx, rcx, rdx;
+ __u64 rsi, rdi, rsp, rbp;
+ __u64 r8, r9, r10, r11;
+ __u64 r12, r13, r14, r15;
+ __u64 rip, rflags;
+};
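+
+/*
+ * A userspace sketch (with hypothetical vcpu_fd and entry_point) of
+ * reading and updating these registers through the vcpu ioctls:
+ *
+ *	struct kvm_regs regs;
+ *
+ *	ioctl(vcpu_fd, KVM_GET_REGS, &regs);
+ *	regs.rip = entry_point;
+ *	ioctl(vcpu_fd, KVM_SET_REGS, &regs);
+ */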
+
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+ char regs[KVM_APIC_REG_SIZE];
+};
+
+struct kvm_segment {
+ __u64 base;
+ __u32 limit;
+ __u16 selector;
+ __u8 type;
+ __u8 present, dpl, db, s, l, g, avl;
+ __u8 unusable;
+ __u8 padding;
+};
+
+struct kvm_dtable {
+ __u64 base;
+ __u16 limit;
+ __u16 padding[3];
+};
+
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+ /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+ __u8 fpr[8][16];
+ __u16 fcw;
+ __u16 fsw;
+ __u8 ftwx; /* in fxsave format */
+ __u8 pad1;
+ __u16 last_opcode;
+ __u64 last_ip;
+ __u64 last_dp;
+ __u8 xmm[16][16];
+ __u32 mxcsr;
+ __u32 pad2;
+};
+
+struct kvm_msr_entry {
+ __u32 index;
+ __u32 reserved;
+ __u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 pad;
+
+ struct kvm_msr_entry entries[0];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 indices[0];
+};
+
+
+struct kvm_cpuid_entry {
+ __u32 function;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+ __u32 function;
+ __u32 index;
+ __u32 flags;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding[3];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
+
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry2 entries[0];
+};
+
+/* for KVM_GET_PIT and KVM_SET_PIT */
+struct kvm_pit_channel_state {
+ __u32 count; /* can be 65536 */
+ __u16 latched_count;
+ __u8 count_latched;
+ __u8 status_latched;
+ __u8 status;
+ __u8 read_state;
+ __u8 write_state;
+ __u8 write_latch;
+ __u8 rw_mode;
+ __u8 mode;
+ __u8 bcd;
+ __u8 gate;
+ __s64 count_load_time;
+};
+
+struct kvm_pit_state {
+ struct kvm_pit_channel_state channels[3];
+};
+#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
new file mode 100644
index 00000000000..65679d00633
--- /dev/null
+++ b/arch/x86/include/asm/kvm_host.h
@@ -0,0 +1,752 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This header defines architecture specific interfaces, x86 version
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _ASM_X86_KVM_HOST_H
+#define _ASM_X86_KVM_HOST_H
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/pvclock-abi.h>
+#include <asm/desc.h>
+
+#define KVM_MAX_VCPUS 16
+#define KVM_MEMORY_SLOTS 32
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+
+#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
+#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
+ 0xFFFFFF0000000000ULL)
+
+#define KVM_GUEST_CR0_MASK \
+ (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
+ | X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON \
+ (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
+ | X86_CR0_MP)
+#define KVM_GUEST_CR4_MASK \
+ (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
+#define INVALID_PAGE (~(hpa_t)0)
+#define UNMAPPED_GVA (~(gpa_t)0)
+
+/* shadow tables are PAE even on non-PAE hosts */
+#define KVM_HPAGE_SHIFT 21
+#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
+#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
+
+#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+
+#define DE_VECTOR 0
+#define DB_VECTOR 1
+#define BP_VECTOR 3
+#define OF_VECTOR 4
+#define BR_VECTOR 5
+#define UD_VECTOR 6
+#define NM_VECTOR 7
+#define DF_VECTOR 8
+#define TS_VECTOR 10
+#define NP_VECTOR 11
+#define SS_VECTOR 12
+#define GP_VECTOR 13
+#define PF_VECTOR 14
+#define MF_VECTOR 16
+#define MC_VECTOR 18
+
+#define SELECTOR_TI_MASK (1 << 2)
+#define SELECTOR_RPL_MASK 0x03
+
+#define IOPL_SHIFT 12
+
+#define KVM_ALIAS_SLOTS 4
+
+#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_MMU_HASH_SHIFT 10
+#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
+#define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_NR_VAR_MTRR 8
+
+extern spinlock_t kvm_lock;
+extern struct list_head vm_list;
+
+struct kvm_vcpu;
+struct kvm;
+
+enum kvm_reg {
+ VCPU_REGS_RAX = 0,
+ VCPU_REGS_RCX = 1,
+ VCPU_REGS_RDX = 2,
+ VCPU_REGS_RBX = 3,
+ VCPU_REGS_RSP = 4,
+ VCPU_REGS_RBP = 5,
+ VCPU_REGS_RSI = 6,
+ VCPU_REGS_RDI = 7,
+#ifdef CONFIG_X86_64
+ VCPU_REGS_R8 = 8,
+ VCPU_REGS_R9 = 9,
+ VCPU_REGS_R10 = 10,
+ VCPU_REGS_R11 = 11,
+ VCPU_REGS_R12 = 12,
+ VCPU_REGS_R13 = 13,
+ VCPU_REGS_R14 = 14,
+ VCPU_REGS_R15 = 15,
+#endif
+ VCPU_REGS_RIP,
+ NR_VCPU_REGS
+};
+
+enum {
+ VCPU_SREG_ES,
+ VCPU_SREG_CS,
+ VCPU_SREG_SS,
+ VCPU_SREG_DS,
+ VCPU_SREG_FS,
+ VCPU_SREG_GS,
+ VCPU_SREG_TR,
+ VCPU_SREG_LDTR,
+};
+
+#include <asm/kvm_x86_emulate.h>
+
+#define KVM_NR_MEM_OBJS 40
+
+struct kvm_guest_debug {
+ int enabled;
+ unsigned long bp[4];
+ int singlestep;
+};
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+struct kvm_mmu_memory_cache {
+ int nobjs;
+ void *objects[KVM_NR_MEM_OBJS];
+};
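A minimal sketch of how such a cache is consumed on the fault path (editorial illustration, not part of this patch; the helper name is hypothetical):

/* Pop a preallocated object: the cache is topped up before the mmu lock is
 * taken, so nothing on the fault path itself needs to allocate. */
static inline void *example_mmu_cache_pop(struct kvm_mmu_memory_cache *mc)
{
	BUG_ON(!mc->nobjs);			/* assumes <linux/kernel.h> */
	return mc->objects[--mc->nobjs];
}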
+
+#define NR_PTE_CHAIN_ENTRIES 5
+
+struct kvm_pte_chain {
+ u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
+ struct hlist_node link;
+};
+
+/*
+ * kvm_mmu_page_role, below, is defined as:
+ *
+ * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
+ * bits 4:7 - page table level for this shadow (1-4)
+ * bits 8:9 - page table quadrant for 2-level guests
+ * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
+ * bits 17:19 - common access permissions for all ptes in this shadow page
+ */
+union kvm_mmu_page_role {
+ unsigned word;
+ struct {
+ unsigned glevels:4;
+ unsigned level:4;
+ unsigned quadrant:2;
+ unsigned pad_for_nice_hex_output:6;
+ unsigned metaphysical:1;
+ unsigned access:3;
+ unsigned invalid:1;
+ };
+};
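A short sketch of what the role packing buys (illustration only, not part of this patch): two shadow pages for the same gfn are distinct whenever any role bit differs, so both can coexist in the hash table.

static inline int example_roles_differ(void)
{
	union kvm_mmu_page_role a = { .word = 0 }, b = { .word = 0 };

	/* two PAE quadrants shadowing the same 32-bit guest page table */
	a.glevels = 2; a.level = 1; a.quadrant = 0;
	b.glevels = 2; b.level = 1; b.quadrant = 1;

	/* (gfn, a) and (gfn, b) therefore name two distinct shadow pages */
	return a.word != b.word;
}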
+
+struct kvm_mmu_page {
+ struct list_head link;
+ struct hlist_node hash_link;
+
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ gfn_t gfn;
+ union kvm_mmu_page_role role;
+
+ u64 *spt;
+ /* hold the gfn of each spte inside spt */
+ gfn_t *gfns;
+ unsigned long slot_bitmap; /* One bit set per slot which has memory
+ * in this shadow page.
+ */
+ int multimapped; /* More than one parent_pte? */
+ int root_count; /* Currently serving as active root */
+ bool unsync;
+ bool unsync_children;
+ union {
+ u64 *parent_pte; /* !multimapped */
+ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
+ };
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+};
+
+struct kvm_pv_mmu_op_buffer {
+ void *ptr;
+ unsigned len;
+ unsigned processed;
+ char buf[512] __aligned(sizeof(long));
+};
+
+/*
+ * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
+ * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
+ * mode.
+ */
+struct kvm_mmu {
+ void (*new_cr3)(struct kvm_vcpu *vcpu);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+ void (*free)(struct kvm_vcpu *vcpu);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+ void (*prefetch_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page);
+ int (*sync_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp);
+ void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
+ hpa_t root_hpa;
+ int root_level;
+ int shadow_root_level;
+
+ u64 *pae_root;
+};
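A sketch of the abstraction in use (editorial, not part of this patch; it assumes the usual embedding of struct kvm_vcpu_arch, defined below, as vcpu->arch):

static inline gpa_t example_translate(struct kvm_vcpu *vcpu, gva_t gva)
{
	/* dispatches to the 2-, 3- or 4-level implementation */
	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
}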
+
+struct kvm_vcpu_arch {
+ u64 host_tsc;
+ int interrupt_window_open;
+ unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
+ DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
+ /*
+ * rip and regs accesses must go through
+ * kvm_{register,rip}_{read,write} functions.
+ */
+ unsigned long regs[NR_VCPU_REGS];
+ u32 regs_avail;
+ u32 regs_dirty;
+
+ unsigned long cr0;
+ unsigned long cr2;
+ unsigned long cr3;
+ unsigned long cr4;
+ unsigned long cr8;
+ u64 pdptrs[4]; /* pae */
+ u64 shadow_efer;
+ u64 apic_base;
+ struct kvm_lapic *apic; /* kernel irqchip context */
+ int mp_state;
+ int sipi_vector;
+ u64 ia32_misc_enable_msr;
+ bool tpr_access_reporting;
+
+ struct kvm_mmu mmu;
+ /* only needed in kvm_pv_mmu_op() path, but it's hot so
+ * put it here to avoid allocation */
+ struct kvm_pv_mmu_op_buffer mmu_op_buffer;
+
+ struct kvm_mmu_memory_cache mmu_pte_chain_cache;
+ struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+ struct kvm_mmu_memory_cache mmu_page_cache;
+ struct kvm_mmu_memory_cache mmu_page_header_cache;
+
+ gfn_t last_pt_write_gfn;
+ int last_pt_write_count;
+ u64 *last_pte_updated;
+ gfn_t last_pte_gfn;
+
+ struct {
+ gfn_t gfn; /* presumed gfn during guest pte update */
+ pfn_t pfn; /* pfn corresponding to that gfn */
+ int largepage;
+ unsigned long mmu_seq;
+ } update_pte;
+
+ struct i387_fxsave_struct host_fx_image;
+ struct i387_fxsave_struct guest_fx_image;
+
+ gva_t mmio_fault_cr2;
+ struct kvm_pio_request pio;
+ void *pio_data;
+
+ struct kvm_queued_exception {
+ bool pending;
+ bool has_error_code;
+ u8 nr;
+ u32 error_code;
+ } exception;
+
+ struct kvm_queued_interrupt {
+ bool pending;
+ u8 nr;
+ } interrupt;
+
+ struct {
+ int active;
+ u8 save_iopl;
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } tr, es, ds, fs, gs;
+ } rmode;
+ int halt_request; /* real mode on Intel only */
+
+ int cpuid_nent;
+ struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+ /* emulate context */
+
+ struct x86_emulate_ctxt emulate_ctxt;
+
+ gpa_t time;
+ struct pvclock_vcpu_time_info hv_clock;
+ unsigned int hv_clock_tsc_khz;
+ unsigned int time_offset;
+ struct page *time_page;
+
+ bool nmi_pending;
+ bool nmi_injected;
+
+ u64 mtrr[0x100];
+};
+
+struct kvm_mem_alias {
+ gfn_t base_gfn;
+ unsigned long npages;
+ gfn_t target_gfn;
+};
+
+struct kvm_arch {
+ int naliases;
+ struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
+
+ unsigned int n_free_mmu_pages;
+ unsigned int n_requested_mmu_pages;
+ unsigned int n_alloc_mmu_pages;
+ /*
+ * Hash table of struct kvm_mmu_page.
+ */
+ struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+ struct list_head active_mmu_pages;
+ struct list_head assigned_dev_head;
+ struct dmar_domain *intel_iommu_domain;
+ struct kvm_pic *vpic;
+ struct kvm_ioapic *vioapic;
+ struct kvm_pit *vpit;
+ struct hlist_head irq_ack_notifier_list;
+
+ int round_robin_prev_vcpu;
+ unsigned int tss_addr;
+ struct page *apic_access_page;
+
+ gpa_t wall_clock;
+
+ struct page *ept_identity_pagetable;
+ bool ept_identity_pagetable_done;
+};
+
+struct kvm_vm_stat {
+ u32 mmu_shadow_zapped;
+ u32 mmu_pte_write;
+ u32 mmu_pte_updated;
+ u32 mmu_pde_zapped;
+ u32 mmu_flooded;
+ u32 mmu_recycled;
+ u32 mmu_cache_miss;
+ u32 mmu_unsync;
+ u32 remote_tlb_flush;
+ u32 lpages;
+};
+
+struct kvm_vcpu_stat {
+ u32 pf_fixed;
+ u32 pf_guest;
+ u32 tlb_flush;
+ u32 invlpg;
+
+ u32 exits;
+ u32 io_exits;
+ u32 mmio_exits;
+ u32 signal_exits;
+ u32 irq_window_exits;
+ u32 nmi_window_exits;
+ u32 halt_exits;
+ u32 halt_wakeup;
+ u32 request_irq_exits;
+ u32 irq_exits;
+ u32 host_state_reload;
+ u32 efer_reload;
+ u32 fpu_reload;
+ u32 insn_emulation;
+ u32 insn_emulation_fail;
+ u32 hypercalls;
+ u32 irq_injections;
+};
+
+struct descriptor_table {
+ u16 limit;
+ unsigned long base;
+} __attribute__((packed));
+
+struct kvm_x86_ops {
+ int (*cpu_has_kvm_support)(void); /* __init */
+ int (*disabled_by_bios)(void); /* __init */
+ void (*hardware_enable)(void *dummy); /* __init */
+ void (*hardware_disable)(void *dummy);
+ void (*check_processor_compatibility)(void *rtn);
+ int (*hardware_setup)(void); /* __init */
+ void (*hardware_unsetup)(void); /* __exit */
+ bool (*cpu_has_accelerated_tpr)(void);
+
+ /* Create, but do not attach this VCPU */
+ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
+ void (*vcpu_free)(struct kvm_vcpu *vcpu);
+ int (*vcpu_reset)(struct kvm_vcpu *vcpu);
+
+ void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+ void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+ void (*vcpu_put)(struct kvm_vcpu *vcpu);
+
+ int (*set_guest_debug)(struct kvm_vcpu *vcpu,
+ struct kvm_debug_guest *dbg);
+ void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
+ int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+ int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+ u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
+ void (*get_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ int (*get_cpl)(struct kvm_vcpu *vcpu);
+ void (*set_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+ void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
+ void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
+ void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
+ int *exception);
+ void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+ unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+ void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+
+ void (*tlb_flush)(struct kvm_vcpu *vcpu);
+
+ void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
+ int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ void (*patch_hypercall)(struct kvm_vcpu *vcpu,
+ unsigned char *hypercall_addr);
+ int (*get_irq)(struct kvm_vcpu *vcpu);
+ void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+ void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code);
+ bool (*exception_injected)(struct kvm_vcpu *vcpu);
+ void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
+ void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
+ struct kvm_run *run);
+
+ int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+ int (*get_tdp_level)(void);
+};
+
+extern struct kvm_x86_ops *kvm_x86_ops;
+
+int kvm_mmu_module_init(void);
+void kvm_mmu_module_exit(void);
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_mmu_create(struct kvm_vcpu *vcpu);
+int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+void kvm_mmu_set_base_ptes(u64 base_pte);
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+ u64 dirty_mask, u64 nx_mask, u64 x_mask);
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_zap_all(struct kvm *kvm);
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes);
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+ gpa_t addr, unsigned long *ret);
+
+extern bool tdp_enabled;
+
+enum emulation_result {
+ EMULATE_DONE, /* no further processing */
+ EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
+ EMULATE_FAIL, /* can't emulate this instruction */
+};
+
+#define EMULTYPE_NO_DECODE (1 << 0)
+#define EMULTYPE_TRAP_UD (1 << 1)
+int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ unsigned long cr2, u16 error_code, int emulation_type);
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
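A hedged sketch of how an exit handler is expected to act on the result codes above (the helper and the "example" context string are illustrative, not part of this patch):

static inline int example_handle_emulation(struct kvm_vcpu *vcpu,
					   struct kvm_run *run,
					   unsigned long cr2, u16 ec)
{
	switch (emulate_instruction(vcpu, run, cr2, ec, 0)) {
	case EMULATE_DONE:
		return 1;	/* keep running in the kernel loop */
	case EMULATE_DO_MMIO:
		return 0;	/* exit to userspace; run->mmio is filled in */
	default:
		kvm_report_emulation_failure(vcpu, "example");
		return 0;
	}
}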
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+ unsigned long *rflags);
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
+ unsigned long *rflags);
+void kvm_enable_efer_bits(u64);
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+
+struct x86_emulate_ctxt;
+
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+ int size, unsigned port);
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+ int size, unsigned long count, int down,
+ gva_t address, int rep, unsigned port);
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
+int emulate_clts(struct kvm_vcpu *vcpu);
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest);
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long value);
+
+void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+ int type_bits, int seg);
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
+
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
+ u32 error_code);
+
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+
+void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+
+void fx_init(struct kvm_vcpu *vcpu);
+
+int emulator_read_std(unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+int emulator_write_emulated(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+unsigned long segment_base(u16 selector);
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes);
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+
+void kvm_enable_tdp(void);
+void kvm_disable_tdp(void);
+
+int complete_pio(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline u16 kvm_read_fs(void)
+{
+ u16 seg;
+ asm("mov %%fs, %0" : "=g"(seg));
+ return seg;
+}
+
+static inline u16 kvm_read_gs(void)
+{
+ u16 seg;
+ asm("mov %%gs, %0" : "=g"(seg));
+ return seg;
+}
+
+static inline u16 kvm_read_ldt(void)
+{
+ u16 ldt;
+ asm("sldt %0" : "=g"(ldt));
+ return ldt;
+}
+
+static inline void kvm_load_fs(u16 sel)
+{
+ asm("mov %0, %%fs" : : "rm"(sel));
+}
+
+static inline void kvm_load_gs(u16 sel)
+{
+ asm("mov %0, %%gs" : : "rm"(sel));
+}
+
+static inline void kvm_load_ldt(u16 sel)
+{
+ asm("lldt %0" : : "rm"(sel));
+}
+
+static inline void kvm_get_idt(struct descriptor_table *table)
+{
+ asm("sidt %0" : "=m"(*table));
+}
+
+static inline void kvm_get_gdt(struct descriptor_table *table)
+{
+ asm("sgdt %0" : "=m"(*table));
+}
+
+static inline unsigned long kvm_read_tr_base(void)
+{
+ u16 tr;
+ asm("str %0" : "=g"(tr));
+ return segment_base(tr);
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_msr(unsigned long msr)
+{
+ u64 value;
+
+ rdmsrl(msr, value);
+ return value;
+}
+#endif
+
+static inline void kvm_fx_save(struct i387_fxsave_struct *image)
+{
+ asm("fxsave (%0)":: "r" (image));
+}
+
+static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
+{
+ asm("fxrstor (%0)":: "r" (image));
+}
+
+static inline void kvm_fx_finit(void)
+{
+ asm("finit");
+}
+
+static inline u32 get_rdx_init_val(void)
+{
+ return 0x600; /* P6 family */
+}
+
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+{
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+}
+
+#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
+
+#define MSR_IA32_TIME_STAMP_COUNTER 0x010
+
+#define TSS_IOPB_BASE_OFFSET 0x66
+#define TSS_BASE_SIZE 0x68
+#define TSS_IOPB_SIZE (65536 / 8)
+#define TSS_REDIRECTION_SIZE (256 / 8)
+#define RMODE_TSS_SIZE \
+ (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
+
+enum {
+ TASK_SWITCH_CALL = 0,
+ TASK_SWITCH_IRET = 1,
+ TASK_SWITCH_JMP = 2,
+ TASK_SWITCH_GATE = 3,
+};
+
+/*
+ * Hardware virtualization extension instructions may fault if a
+ * reboot turns off virtualization while processes are running.
+ * Trap the fault and ignore the instruction if that happens.
+ */
+asmlinkage void kvm_handle_fault_on_reboot(void);
+
+#define __kvm_handle_fault_on_reboot(insn) \
+ "666: " insn "\n\t" \
+ ".pushsection .fixup, \"ax\" \n" \
+ "667: \n\t" \
+ __ASM_SIZE(push) " $666b \n\t" \
+ "jmp kvm_handle_fault_on_reboot \n\t" \
+ ".popsection \n\t" \
+ ".pushsection __ex_table, \"a\" \n\t" \
+ _ASM_PTR " 666b, 667b \n\t" \
+ ".popsection"
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+
+#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
new file mode 100644
index 00000000000..b8a3305ae09
--- /dev/null
+++ b/arch/x86/include/asm/kvm_para.h
@@ -0,0 +1,147 @@
+#ifndef _ASM_X86_KVM_PARA_H
+#define _ASM_X86_KVM_PARA_H
+
+/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
+ * should be used to determine whether a VM is running under KVM.
+ */
+#define KVM_CPUID_SIGNATURE 0x40000000
+
+/* This CPUID returns a feature bitmap in eax. Before enabling a particular
+ * paravirtualization, the appropriate feature bit should be checked.
+ */
+#define KVM_CPUID_FEATURES 0x40000001
+#define KVM_FEATURE_CLOCKSOURCE 0
+#define KVM_FEATURE_NOP_IO_DELAY 1
+#define KVM_FEATURE_MMU_OP 2
+
+#define MSR_KVM_WALL_CLOCK 0x11
+#define MSR_KVM_SYSTEM_TIME 0x12
+
+#define KVM_MAX_MMU_OP_BATCH 32
+
+/* Operations for KVM_HC_MMU_OP */
+#define KVM_MMU_OP_WRITE_PTE 1
+#define KVM_MMU_OP_FLUSH_TLB 2
+#define KVM_MMU_OP_RELEASE_PT 3
+
+/* Payload for KVM_HC_MMU_OP */
+struct kvm_mmu_op_header {
+ __u32 op;
+ __u32 pad;
+};
+
+struct kvm_mmu_op_write_pte {
+ struct kvm_mmu_op_header header;
+ __u64 pte_phys;
+ __u64 pte_val;
+};
+
+struct kvm_mmu_op_flush_tlb {
+ struct kvm_mmu_op_header header;
+};
+
+struct kvm_mmu_op_release_pt {
+ struct kvm_mmu_op_header header;
+ __u64 pt_phys;
+};
+
+#ifdef __KERNEL__
+#include <asm/processor.h>
+
+extern void kvmclock_init(void);
+
+/* This instruction is vmcall. On non-VT architectures, it will generate a
+ * trap that we will then rewrite to the appropriate instruction.
+ */
+#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the
+ * vmmcall instruction. The hypervisor may replace it with something else,
+ * but only these instructions are guaranteed to be supported.
+ *
+ * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ * The hypercall number should be placed in rax and the return value will be
+ * placed in rax. No other registers will be clobbered unless explicitly
+ * noted by the particular hypercall.
+ */
+
+static inline long kvm_hypercall0(unsigned int nr)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+ unsigned long p2)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3,
+ unsigned long p4)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
+ : "memory");
+ return ret;
+}
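A minimal usage sketch (not part of this patch; KVM_HC_EXAMPLE_NOP is a purely hypothetical hypercall number used only for illustration):

#define KVM_HC_EXAMPLE_NOP 0	/* hypothetical, for illustration only */

static inline long example_hypercall(unsigned long arg)
{
	/* nr travels in rax ("a"), the single argument in rbx ("b") */
	return kvm_hypercall1(KVM_HC_EXAMPLE_NOP, arg);
}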
+
+static inline int kvm_para_available(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ char signature[13];
+
+ cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
+ memcpy(signature + 0, &ebx, 4);
+ memcpy(signature + 4, &ecx, 4);
+ memcpy(signature + 8, &edx, 4);
+ signature[12] = 0;
+
+ if (strcmp(signature, "KVMKVMKVM") == 0)
+ return 1;
+
+ return 0;
+}
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+ return cpuid_eax(KVM_CPUID_FEATURES);
+}
+
+#endif
+
+#endif /* _ASM_X86_KVM_PARA_H */
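A sketch of the guest-side probe these helpers enable (editorial illustration, not part of this patch): detect KVM first, then test a feature bit before relying on the paravirtual facility.

static inline int example_has_kvmclock(void)
{
	if (!kvm_para_available())
		return 0;
	return kvm_arch_para_features() & (1 << KVM_FEATURE_CLOCKSOURCE);
}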
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
new file mode 100644
index 00000000000..25179a29f20
--- /dev/null
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -0,0 +1,184 @@
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef _ASM_X86_KVM_X86_EMULATE_H
+#define _ASM_X86_KVM_X86_EMULATE_H
+
+struct x86_emulate_ctxt;
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ * 1. The emulator isn't very smart about emulated vs. standard memory.
+ * 'Emulated memory' access addresses should be checked for sanity.
+ * 'Normal memory' accesses may fault, and the caller must arrange to
+ * detect and handle reentrancy into the emulator via recursive faults.
+ * Accesses may be unaligned and may cross page boundaries.
+ * 2. If the access fails (cannot emulate, or a standard access faults) then
+ * it is up to the memop to propagate the fault to the guest VM via
+ * some out-of-band mechanism, unknown to the emulator. The memop signals
+ * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ * then immediately bail.
+ * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ * cmpxchg8b_emulated need support 8-byte accesses.
+ * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE 0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE 1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
+struct x86_emulate_ops {
+ /*
+ * read_std: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch, stack operations, and others.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_std)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu);
+
+ /*
+ * read_emulated: Read bytes from emulated/special memory area.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_emulated)(unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+ /*
+ * write_emulated: Write bytes to emulated/special memory area.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [IN ] Value to write to memory (low-order bytes used as
+ * required).
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_emulated)(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+ /*
+ * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
+ * emulated/special memory area.
+ * @addr: [IN ] Linear address to access.
+ * @old: [IN ] Value expected to be current at @addr.
+ * @new: [IN ] Value to write to @addr.
+ * @bytes: [IN ] Number of bytes to access using CMPXCHG.
+ */
+ int (*cmpxchg_emulated)(unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+};
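A hedged sketch of a read_std backend (illustration only, not part of this patch): translate the linear address through the current MMU context, then copy from guest memory. kvm_read_guest() is assumed from the KVM core, and page-crossing accesses, which a real callback must split, are ignored here.

static int example_read_std(unsigned long addr, void *val,
			    unsigned int bytes, struct kvm_vcpu *vcpu)
{
	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

	if (gpa == UNMAPPED_GVA)
		return X86EMUL_PROPAGATE_FAULT;
	if (kvm_read_guest(vcpu->kvm, gpa, val, bytes))
		return X86EMUL_UNHANDLEABLE;
	return X86EMUL_CONTINUE;
}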
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+ enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+ unsigned int bytes;
+ unsigned long val, orig_val, *ptr;
+};
+
+struct fetch_cache {
+ u8 data[15];
+ unsigned long start;
+ unsigned long end;
+};
+
+struct decode_cache {
+ u8 twobyte;
+ u8 b;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ u8 op_bytes;
+ u8 ad_bytes;
+ u8 rex_prefix;
+ struct operand src;
+ struct operand dst;
+ bool has_seg_override;
+ u8 seg_override;
+ unsigned int d;
+ unsigned long regs[NR_VCPU_REGS];
+ unsigned long eip;
+ /* modrm */
+ u8 modrm;
+ u8 modrm_mod;
+ u8 modrm_reg;
+ u8 modrm_rm;
+ u8 use_modrm_ea;
+ bool rip_relative;
+ unsigned long modrm_ea;
+ void *modrm_ptr;
+ unsigned long modrm_val;
+ struct fetch_cache fetch;
+};
+
+struct x86_emulate_ctxt {
+ /* Register state before/after emulation. */
+ struct kvm_vcpu *vcpu;
+
+ /* Emulated guest EFLAGS (input/output of the emulation). */
+ unsigned long eflags;
+
+ /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+ int mode;
+
+ u32 cs_base;
+
+ /* decode cache */
+
+ struct decode_cache decode;
+};
+
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX 1
+#define REPNE_PREFIX 2
+
+/* Execution mode, passed to the emulator. */
+#define X86EMUL_MODE_REAL 0 /* Real mode. */
+#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
+#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
+#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
+
+/* Host execution mode. */
+#if defined(__i386__)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
+#elif defined(CONFIG_X86_64)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
+#endif
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
+
+#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/asm/ldt.h
new file mode 100644
index 00000000000..46727eb37bf
--- /dev/null
+++ b/arch/x86/include/asm/ldt.h
@@ -0,0 +1,40 @@
+/*
+ * ldt.h
+ *
+ * Definitions of structures used with the modify_ldt system call.
+ */
+#ifndef _ASM_X86_LDT_H
+#define _ASM_X86_LDT_H
+
+/* Maximum number of LDT entries supported. */
+#define LDT_ENTRIES 8192
+/* The size of each LDT entry. */
+#define LDT_ENTRY_SIZE 8
+
+#ifndef __ASSEMBLY__
+/*
+ * Note: on 64-bit, the base and limit are ignored, and you cannot set
+ * DS/ES/CS to non-default values if you still want to make syscalls.
+ * This call is therefore mostly useful for 32-bit mode.
+ */
+struct user_desc {
+ unsigned int entry_number;
+ unsigned int base_addr;
+ unsigned int limit;
+ unsigned int seg_32bit:1;
+ unsigned int contents:2;
+ unsigned int read_exec_only:1;
+ unsigned int limit_in_pages:1;
+ unsigned int seg_not_present:1;
+ unsigned int useable:1;
+#ifdef __x86_64__
+ unsigned int lm:1;
+#endif
+};
+
+#define MODIFY_LDT_CONTENTS_DATA 0
+#define MODIFY_LDT_CONTENTS_STACK 1
+#define MODIFY_LDT_CONTENTS_CODE 2
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_LDT_H */
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
new file mode 100644
index 00000000000..d28a507cef3
--- /dev/null
+++ b/arch/x86/include/asm/lguest.h
@@ -0,0 +1,94 @@
+#ifndef _ASM_X86_LGUEST_H
+#define _ASM_X86_LGUEST_H
+
+#define GDT_ENTRY_LGUEST_CS 10
+#define GDT_ENTRY_LGUEST_DS 11
+#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
+#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
+
+#ifndef __ASSEMBLY__
+#include <asm/desc.h>
+
+#define GUEST_PL 1
+
+/* Every guest maps the core switcher code. */
+#define SHARED_SWITCHER_PAGES \
+ DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
+/* Pages for switcher itself, then two pages per cpu */
+#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
+
+/* We map at -4M for ease of mapping into the guest (one PTE page). */
+#define SWITCHER_ADDR 0xFFC00000
+
+/* Found in switcher.S */
+extern unsigned long default_idt_entries[];
+
+/* Declarations for definitions in lguest_guest.S */
+extern char lguest_noirq_start[], lguest_noirq_end[];
+extern const char lgstart_cli[], lgend_cli[];
+extern const char lgstart_sti[], lgend_sti[];
+extern const char lgstart_popf[], lgend_popf[];
+extern const char lgstart_pushf[], lgend_pushf[];
+extern const char lgstart_iret[], lgend_iret[];
+
+extern void lguest_iret(void);
+extern void lguest_init(void);
+
+struct lguest_regs {
+ /* Manually saved part. */
+ unsigned long eax, ebx, ecx, edx;
+ unsigned long esi, edi, ebp;
+ unsigned long gs;
+ unsigned long fs, ds, es;
+ unsigned long trapnum, errcode;
+ /* Trap pushed part */
+ unsigned long eip;
+ unsigned long cs;
+ unsigned long eflags;
+ unsigned long esp;
+ unsigned long ss;
+};
+
+/* This is a guest-specific page, mapped read-only into the guest. */
+struct lguest_ro_state {
+ /* Host information we need to restore when we switch back. */
+ u32 host_cr3;
+ struct desc_ptr host_idt_desc;
+ struct desc_ptr host_gdt_desc;
+ u32 host_sp;
+
+ /* Fields which are used when guest is running. */
+ struct desc_ptr guest_idt_desc;
+ struct desc_ptr guest_gdt_desc;
+ struct x86_hw_tss guest_tss;
+ struct desc_struct guest_idt[IDT_ENTRIES];
+ struct desc_struct guest_gdt[GDT_ENTRIES];
+};
+
+struct lg_cpu_arch {
+ /* The GDT entries copied into lguest_ro_state when running. */
+ struct desc_struct gdt[GDT_ENTRIES];
+
+ /* The IDT entries: some copied into lguest_ro_state when running. */
+ struct desc_struct idt[IDT_ENTRIES];
+
+ /* The address of the last guest-visible pagefault (ie. cr2). */
+ unsigned long last_pagefault;
+};
+
+static inline void lguest_set_ts(void)
+{
+ u32 cr0;
+
+ cr0 = read_cr0();
+ if (!(cr0 & 8))		/* 8 == X86_CR0_TS */
+ write_cr0(cr0 | 8);
+}
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } })
+#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } })
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
new file mode 100644
index 00000000000..43894428c3c
--- /dev/null
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -0,0 +1,71 @@
+/* Architecture specific portion of the lguest hypercalls */
+#ifndef _ASM_X86_LGUEST_HCALL_H
+#define _ASM_X86_LGUEST_HCALL_H
+
+#define LHCALL_FLUSH_ASYNC 0
+#define LHCALL_LGUEST_INIT 1
+#define LHCALL_SHUTDOWN 2
+#define LHCALL_LOAD_GDT 3
+#define LHCALL_NEW_PGTABLE 4
+#define LHCALL_FLUSH_TLB 5
+#define LHCALL_LOAD_IDT_ENTRY 6
+#define LHCALL_SET_STACK 7
+#define LHCALL_TS 8
+#define LHCALL_SET_CLOCKEVENT 9
+#define LHCALL_HALT 10
+#define LHCALL_SET_PTE 14
+#define LHCALL_SET_PMD 15
+#define LHCALL_LOAD_TLS 16
+#define LHCALL_NOTIFY 17
+
+#define LGUEST_TRAP_ENTRY 0x1F
+
+/* Shutdown reasons passed as an argument to LHCALL_SHUTDOWN */
+#define LGUEST_SHUTDOWN_POWEROFF 1
+#define LGUEST_SHUTDOWN_RESTART 2
+
+#ifndef __ASSEMBLY__
+#include <asm/hw_irq.h>
+
+/*G:031 But first, how does our Guest contact the Host to ask for privileged
+ * operations? There are two ways: the direct way is to make a "hypercall",
+ * to make requests of the Host Itself.
+ *
+ * Our hypercall mechanism uses the highest unused trap code (traps 32 and
+ * above are used by real hardware interrupts). Fifteen hypercalls are
+ * available: the hypercall number is put in the %eax register, and the
+ * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
+ * value makes sense, it's returned in %eax.
+ *
+ * Grossly invalid calls result in Sudden Death at the hands of the vengeful
+ * Host, rather than returning failure. This reflects Winston Churchill's
+ * definition of a gentleman: "someone who is only rude intentionally". */
+static inline unsigned long
+hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ /* "int" is the Intel instruction to trigger a trap. */
+ asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+ /* The call in %eax (aka "a") might be overwritten */
+ : "=a"(call)
+ /* The arguments are in %eax, %edx, %ebx & %ecx */
+ : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
+ /* "memory" means this might write somewhere in memory.
+ * This isn't true for all calls, but it's safe to tell
+ * gcc that it might happen so it doesn't get clever. */
+ : "memory");
+ return call;
+}
+/*:*/
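A usage sketch (not part of this patch): per the convention above, the call number rides in %eax and the arguments in %edx, %ebx and %ecx; LHCALL_TS takes the new TS value as its first argument.

static inline void example_lguest_set_ts(unsigned long ts)
{
	hcall(LHCALL_TS, ts, 0, 0);
}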
+
+/* Can't use our min() macro here: needs to be a constant */
+#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS : 32)
+
+#define LHCALL_RING_SIZE 64
+struct hcall_args {
+ /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
+ unsigned long arg0, arg2, arg3, arg1;
+};
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
new file mode 100644
index 00000000000..f61ee8f937e
--- /dev/null
+++ b/arch/x86/include/asm/linkage.h
@@ -0,0 +1,61 @@
+#ifndef _ASM_X86_LINKAGE_H
+#define _ASM_X86_LINKAGE_H
+
+#undef notrace
+#define notrace __attribute__((no_instrument_function))
+
+#ifdef CONFIG_X86_64
+#define __ALIGN .p2align 4,,15
+#define __ALIGN_STR ".p2align 4,,15"
+#endif
+
+#ifdef CONFIG_X86_32
+#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
+/*
+ * For 32-bit UML - mark functions implemented in assembly that use
+ * regparm input parameters:
+ */
+#define asmregparm __attribute__((regparm(3)))
+
+/*
+ * Make sure the compiler doesn't do anything stupid with the
+ * arguments on the stack - they are owned by the *caller*, not
+ * the callee. This just fools gcc into not spilling into them,
+ * and keeps it from doing tailcall recursion and/or using the
+ * stack slots for temporaries, since they are live and "used"
+ * all the way to the end of the function.
+ *
+ * NOTE! On x86-64, all the arguments are in registers, so this
+ * only matters on a 32-bit kernel.
+ */
+#define asmlinkage_protect(n, ret, args...) \
+ __asmlinkage_protect##n(ret, ##args)
+#define __asmlinkage_protect_n(ret, args...) \
+ __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args)
+#define __asmlinkage_protect0(ret) \
+ __asmlinkage_protect_n(ret)
+#define __asmlinkage_protect1(ret, arg1) \
+ __asmlinkage_protect_n(ret, "g" (arg1))
+#define __asmlinkage_protect2(ret, arg1, arg2) \
+ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2))
+#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \
+ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3))
+#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \
+ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
+ "g" (arg4))
+#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
+ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
+ "g" (arg4), "g" (arg5))
+#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \
+ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
+ "g" (arg4), "g" (arg5), "g" (arg6))
+
+#endif
+
+#ifdef CONFIG_X86_ALIGNMENT_16
+#define __ALIGN .align 16,0x90
+#define __ALIGN_STR ".align 16,0x90"
+#endif
+
+#endif /* _ASM_X86_LINKAGE_H */
+
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
new file mode 100644
index 00000000000..47b9b6f1905
--- /dev/null
+++ b/arch/x86/include/asm/local.h
@@ -0,0 +1,235 @@
+#ifndef _ASM_X86_LOCAL_H
+#define _ASM_X86_LOCAL_H
+
+#include <linux/percpu.h>
+
+#include <asm/system.h>
+#include <asm/atomic.h>
+#include <asm/asm.h>
+
+typedef struct {
+ atomic_long_t a;
+} local_t;
+
+#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) }
+
+#define local_read(l) atomic_long_read(&(l)->a)
+#define local_set(l, i) atomic_long_set(&(l)->a, (i))
+
+static inline void local_inc(local_t *l)
+{
+ asm volatile(_ASM_INC "%0"
+ : "+m" (l->a.counter));
+}
+
+static inline void local_dec(local_t *l)
+{
+ asm volatile(_ASM_DEC "%0"
+ : "+m" (l->a.counter));
+}
+
+static inline void local_add(long i, local_t *l)
+{
+ asm volatile(_ASM_ADD "%1,%0"
+ : "+m" (l->a.counter)
+ : "ir" (i));
+}
+
+static inline void local_sub(long i, local_t *l)
+{
+ asm volatile(_ASM_SUB "%1,%0"
+ : "+m" (l->a.counter)
+ : "ir" (i));
+}
+
+/**
+ * local_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @l: pointer to type local_t
+ *
+ * Atomically subtracts @i from @l and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int local_sub_and_test(long i, local_t *l)
+{
+ unsigned char c;
+
+ asm volatile(_ASM_SUB "%2,%0; sete %1"
+ : "+m" (l->a.counter), "=qm" (c)
+ : "ir" (i) : "memory");
+ return c;
+}
+
+/**
+ * local_dec_and_test - decrement and test
+ * @l: pointer to type local_t
+ *
+ * Atomically decrements @l by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int local_dec_and_test(local_t *l)
+{
+ unsigned char c;
+
+ asm volatile(_ASM_DEC "%0; sete %1"
+ : "+m" (l->a.counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+/**
+ * local_inc_and_test - increment and test
+ * @l: pointer to type local_t
+ *
+ * Atomically increments @l by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int local_inc_and_test(local_t *l)
+{
+ unsigned char c;
+
+ asm volatile(_ASM_INC "%0; sete %1"
+ : "+m" (l->a.counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+/**
+ * local_add_negative - add and test if negative
+ * @i: integer value to add
+ * @l: pointer to type local_t
+ *
+ * Atomically adds @i to @l and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int local_add_negative(long i, local_t *l)
+{
+ unsigned char c;
+
+ asm volatile(_ASM_ADD "%2,%0; sets %1"
+ : "+m" (l->a.counter), "=qm" (c)
+ : "ir" (i) : "memory");
+ return c;
+}
+
+/**
+ * local_add_return - add and return
+ * @i: integer value to add
+ * @l: pointer to type local_t
+ *
+ * Atomically adds @i to @l and returns @i + @l
+ */
+static inline long local_add_return(long i, local_t *l)
+{
+ long __i;
+#ifdef CONFIG_M386
+ unsigned long flags;
+ if (unlikely(boot_cpu_data.x86 <= 3))
+ goto no_xadd;
+#endif
+ /* Modern 486+ processor */
+ __i = i;
+ asm volatile(_ASM_XADD "%0, %1;"
+ : "+r" (i), "+m" (l->a.counter)
+ : : "memory");
+ return i + __i;
+
+#ifdef CONFIG_M386
+no_xadd: /* Legacy 386 processor */
+ local_irq_save(flags);
+ __i = local_read(l);
+ local_set(l, i + __i);
+ local_irq_restore(flags);
+ return i + __i;
+#endif
+}
+
+static inline long local_sub_return(long i, local_t *l)
+{
+ return local_add_return(-i, l);
+}
+
+#define local_inc_return(l) (local_add_return(1, l))
+#define local_dec_return(l) (local_sub_return(1, l))
+
+#define local_cmpxchg(l, o, n) \
+ (cmpxchg_local(&((l)->a.counter), (o), (n)))
+/* Always has a lock prefix */
+#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
+
+/**
+ * local_add_unless - add unless the number is a given value
+ * @l: pointer of type local_t
+ * @a: the amount to add to l...
+ * @u: ...unless l is equal to u.
+ *
+ * Atomically adds @a to @l, so long as it was not @u.
+ * Returns non-zero if @l was not @u, and zero otherwise.
+ */
+#define local_add_unless(l, a, u) \
+({ \
+ long c, old; \
+ c = local_read((l)); \
+ for (;;) { \
+ if (unlikely(c == (u))) \
+ break; \
+ old = local_cmpxchg((l), c, c + (a)); \
+ if (likely(old == c)) \
+ break; \
+ c = old; \
+ } \
+ c != (u); \
+})
+#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
+
+/* On x86_32, these are no better than the atomic variants.
+ * On x86-64 these are better than the atomic variants on SMP kernels
+ * because they don't use a lock prefix.
+ */
+#define __local_inc(l) local_inc(l)
+#define __local_dec(l) local_dec(l)
+#define __local_add(i, l) local_add((i), (l))
+#define __local_sub(i, l) local_sub((i), (l))
+
+/* Use these for per-cpu local_t variables: on some archs they are
+ * much more efficient than these naive implementations. Note they take
+ * a variable, not an address.
+ *
+ * X86_64: This could be done better if we moved the per cpu data directly
+ * after GS.
+ */
+
+/* We need to disable preemption for the cpu-local counters; otherwise we
+ * could still access a variable of a previous CPU in a non-atomic way. */
+#define cpu_local_wrap_v(l) \
+({ \
+ local_t res__; \
+ preempt_disable(); \
+ res__ = (l); \
+ preempt_enable(); \
+ res__; \
+})
+#define cpu_local_wrap(l) \
+({ \
+ preempt_disable(); \
+ (l); \
+ preempt_enable(); \
+})
+
+#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
+#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
+#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
+#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
+#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
+#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
+
+#define __cpu_local_inc(l) cpu_local_inc((l))
+#define __cpu_local_dec(l) cpu_local_dec((l))
+#define __cpu_local_add(i, l) cpu_local_add((i), (l))
+#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
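A usage sketch (not part of this patch): a per-cpu event counter. Note the wrappers take the per-cpu variable itself, not its address.

static DEFINE_PER_CPU(local_t, example_events) = LOCAL_INIT(0);

static inline void example_count_event(void)
{
	cpu_local_inc(example_events);	/* preempt-safe via cpu_local_wrap */
}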
+
+#endif /* _ASM_X86_LOCAL_H */
diff --git a/arch/x86/include/asm/mach-default/apm.h b/arch/x86/include/asm/mach-default/apm.h
new file mode 100644
index 00000000000..20370c6db74
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/apm.h
@@ -0,0 +1,73 @@
+/*
+ * Machine specific APM BIOS functions for generic.
+ * Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp>
+ */
+
+#ifndef _ASM_X86_MACH_DEFAULT_APM_H
+#define _ASM_X86_MACH_DEFAULT_APM_H
+
+#ifdef APM_ZERO_SEGS
+# define APM_DO_ZERO_SEGS \
+ "pushl %%ds\n\t" \
+ "pushl %%es\n\t" \
+ "xorl %%edx, %%edx\n\t" \
+ "mov %%dx, %%ds\n\t" \
+ "mov %%dx, %%es\n\t" \
+ "mov %%dx, %%fs\n\t" \
+ "mov %%dx, %%gs\n\t"
+# define APM_DO_POP_SEGS \
+ "popl %%es\n\t" \
+ "popl %%ds\n\t"
+#else
+# define APM_DO_ZERO_SEGS
+# define APM_DO_POP_SEGS
+#endif
+
+static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in,
+ u32 *eax, u32 *ebx, u32 *ecx,
+ u32 *edx, u32 *esi)
+{
+ /*
+ * N.B. We do NOT need a cld after the BIOS call
+ * because we always save and restore the flags.
+ */
+ __asm__ __volatile__(APM_DO_ZERO_SEGS
+ "pushl %%edi\n\t"
+ "pushl %%ebp\n\t"
+ "lcall *%%cs:apm_bios_entry\n\t"
+ "setc %%al\n\t"
+ "popl %%ebp\n\t"
+ "popl %%edi\n\t"
+ APM_DO_POP_SEGS
+ : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx),
+ "=S" (*esi)
+ : "a" (func), "b" (ebx_in), "c" (ecx_in)
+ : "memory", "cc");
+}
+
+static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in,
+ u32 ecx_in, u32 *eax)
+{
+ int cx, dx, si;
+ u8 error;
+
+ /*
+ * N.B. We do NOT need a cld after the BIOS call
+ * because we always save and restore the flags.
+ */
+ __asm__ __volatile__(APM_DO_ZERO_SEGS
+ "pushl %%edi\n\t"
+ "pushl %%ebp\n\t"
+ "lcall *%%cs:apm_bios_entry\n\t"
+ "setc %%bl\n\t"
+ "popl %%ebp\n\t"
+ "popl %%edi\n\t"
+ APM_DO_POP_SEGS
+ : "=a" (*eax), "=b" (error), "=c" (cx), "=d" (dx),
+ "=S" (si)
+ : "a" (func), "b" (ebx_in), "c" (ecx_in)
+ : "memory", "cc");
+ return error;
+}
+
+#endif /* _ASM_X86_MACH_DEFAULT_APM_H */
diff --git a/arch/x86/include/asm/mach-default/do_timer.h b/arch/x86/include/asm/mach-default/do_timer.h
new file mode 100644
index 00000000000..23ecda0b28a
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/do_timer.h
@@ -0,0 +1,16 @@
+/* defines for inline arch setup functions */
+#include <linux/clockchips.h>
+
+#include <asm/i8259.h>
+#include <asm/i8253.h>
+
+/**
+ * do_timer_interrupt_hook - hook into timer tick
+ *
+ * Call the PIT clock event handler; see asm/i8253.h.
+ */
+
+static inline void do_timer_interrupt_hook(void)
+{
+ global_clock_event->event_handler(global_clock_event);
+}
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
new file mode 100644
index 00000000000..6b1add8e31d
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -0,0 +1,36 @@
+/*
+ * This file is designed to contain the BUILD_INTERRUPT specifications for
+ * all of the extra named interrupt vectors used by the architecture.
+ * Usually these are the Inter-Processor Interrupts (IPIs).
+ */
+
+/*
+ * The following vectors are part of the Linux architecture; there is no
+ * hardware IRQ pin equivalent for them. They are triggered by us through
+ * the ICC as IPIs.
+ */
+#ifdef CONFIG_X86_SMP
+BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
+BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
+BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
+BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
+BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
+#endif
+
+/*
+ * Every Pentium local APIC has two 'local interrupts', each with a
+ * soft-definable vector: one is a timer interrupt, the other signals
+ * error counter overflow. Linux uses the local APIC timer interrupt to
+ * get a much simpler SMP time architecture:
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
+BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
+BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+
+#ifdef CONFIG_X86_MCE_P4THERMAL
+BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h
new file mode 100644
index 00000000000..ff3a6c236c0
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_apic.h
@@ -0,0 +1,156 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_APIC_H
+#define _ASM_X86_MACH_DEFAULT_MACH_APIC_H
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+#include <mach_apicdef.h>
+#include <asm/smp.h>
+
+#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+
+static inline cpumask_t target_cpus(void)
+{
+#ifdef CONFIG_SMP
+ return cpu_online_map;
+#else
+ return cpumask_of_cpu(0);
+#endif
+}
+
+#define NO_BALANCE_IRQ (0)
+#define esr_disable (0)
+
+#ifdef CONFIG_X86_64
+#include <asm/genapic.h>
+#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
+#define INT_DEST_MODE (genapic->int_dest_mode)
+#define TARGET_CPUS (genapic->target_cpus())
+#define apic_id_registered (genapic->apic_id_registered)
+#define init_apic_ldr (genapic->init_apic_ldr)
+#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define phys_pkg_id (genapic->phys_pkg_id)
+#define vector_allocation_domain (genapic->vector_allocation_domain)
+#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID)))
+#define send_IPI_self (genapic->send_IPI_self)
+extern void setup_apic_routing(void);
+#else
+#define INT_DELIVERY_MODE dest_LowestPrio
+#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
+#define TARGET_CPUS (target_cpus())
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends setting DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static inline void init_apic_ldr(void)
+{
+ unsigned long val;
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
+}
+
+static inline int apic_id_registered(void)
+{
+ return physid_isset(read_apic_id(), phys_cpu_present_map);
+}
+
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ return cpus_addr(cpumask)[0];
+}
+
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static inline void setup_apic_routing(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
+ "Flat", nr_ioapics);
+#endif
+}
+
+static inline int apicid_to_node(int logical_apicid)
+{
+#ifdef CONFIG_SMP
+ return apicid_2_node[hard_smp_processor_id()];
+#else
+ return 0;
+#endif
+}
+
+static inline cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+#endif
+
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return physid_isset(apicid, bitmap);
+}
+
+static inline unsigned long check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ return phys_map;
+}
+
+static inline int multi_timer_check(int apic, int irq)
+{
+ return 0;
+}
+
+/* Mapping from cpu number to logical apicid */
+static inline int cpu_to_logical_apicid(int cpu)
+{
+ return 1 << cpu;
+}
+
+static inline int cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < NR_CPUS && cpu_present(mps_cpu))
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
+{
+ return physid_mask_of_physid(phys_apicid);
+}
+
+static inline void setup_portio_remap(void)
+{
+}
+
+static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map);
+}
+
+static inline void enable_apic_mode(void)
+{
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_apicdef.h b/arch/x86/include/asm/mach-default/mach_apicdef.h
new file mode 100644
index 00000000000..53179936d6c
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_apicdef.h
@@ -0,0 +1,24 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
+#define _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
+
+#include <asm/apic.h>
+
+#ifdef CONFIG_X86_64
+#define APIC_ID_MASK (genapic->apic_id_mask)
+#define GET_APIC_ID(x) (genapic->get_apic_id(x))
+#define SET_APIC_ID(x) (genapic->set_apic_id(x))
+#else
+#define APIC_ID_MASK (0xF<<24)
+static inline unsigned get_apic_id(unsigned long x)
+{
+ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+ if (APIC_XAPIC(ver))
+ return (((x)>>24)&0xFF);
+ else
+ return (((x)>>24)&0xF);
+}
+
+#define GET_APIC_ID(x) get_apic_id(x)
+#endif
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h
new file mode 100644
index 00000000000..fabca01ebac
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_ipi.h
@@ -0,0 +1,64 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_IPI_H
+#define _ASM_X86_MACH_DEFAULT_MACH_IPI_H
+
+/* Avoid include hell */
+#define NMI_VECTOR 0x02
+
+void send_IPI_mask_bitmask(cpumask_t mask, int vector);
+void __send_IPI_shortcut(unsigned int shortcut, int vector);
+
+extern int no_broadcast;
+
+#ifdef CONFIG_X86_64
+#include <asm/genapic.h>
+#define send_IPI_mask (genapic->send_IPI_mask)
+#else
+static inline void send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_bitmask(mask, vector);
+}
+#endif
+
+static inline void __local_send_IPI_allbutself(int vector)
+{
+ if (no_broadcast || vector == NMI_VECTOR) {
+ cpumask_t mask = cpu_online_map;
+
+ cpu_clear(smp_processor_id(), mask);
+ send_IPI_mask(mask, vector);
+ } else
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
+}
+
+static inline void __local_send_IPI_all(int vector)
+{
+ if (no_broadcast || vector == NMI_VECTOR)
+ send_IPI_mask(cpu_online_map, vector);
+ else
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
+}
+
+#ifdef CONFIG_X86_64
+#define send_IPI_allbutself (genapic->send_IPI_allbutself)
+#define send_IPI_all (genapic->send_IPI_all)
+#else
+static inline void send_IPI_allbutself(int vector)
+{
+ /*
+ * If there are no other CPUs in the system then we get an APIC send
+ * error if we try to broadcast, so avoid sending IPIs in this case.
+ */
+ if (num_online_cpus() < 2)
+ return;
+
+ __local_send_IPI_allbutself(vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+ __local_send_IPI_all(vector);
+}
+#endif
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpparse.h b/arch/x86/include/asm/mach-default/mach_mpparse.h
new file mode 100644
index 00000000000..8c1ea21238a
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_mpparse.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
+#define _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
+
+static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid)
+{
+ return 0;
+}
+
+/* Hook from generic ACPI tables.c */
+static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return 0;
+}
+
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpspec.h b/arch/x86/include/asm/mach-default/mach_mpspec.h
new file mode 100644
index 00000000000..e85ede686be
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_mpspec.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
+#define _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
+
+#define MAX_IRQ_SOURCES 256
+
+#if CONFIG_BASE_SMALL == 0
+#define MAX_MP_BUSSES 256
+#else
+#define MAX_MP_BUSSES 32
+#endif
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_timer.h b/arch/x86/include/asm/mach-default/mach_timer.h
new file mode 100644
index 00000000000..853728519ae
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_timer.h
@@ -0,0 +1,48 @@
+/*
+ * Machine specific calibrate_tsc() for generic.
+ * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
+ */
+/* ------ Calibrate the TSC -------
+ * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
+ * Too much 64-bit arithmetic here to do this cleanly in C, and for
+ * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
+ * output busy loop as low as possible. We avoid reading the CTC registers
+ * directly because of the awkward 8-bit access mechanism of the 82C54
+ * device.
+ */
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_TIMER_H
+#define _ASM_X86_MACH_DEFAULT_MACH_TIMER_H
+
+#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
+#define CALIBRATE_LATCH \
+ ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
+
+static inline void mach_prepare_counter(void)
+{
+ /* Set the Gate high, disable speaker */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ /*
+ * Now let's take care of CTC channel 2
+ *
+ * Set the Gate high, program CTC channel 2 for mode 0,
+ * (interrupt on terminal count mode), binary count,
+ * load 5 * LATCH count, (LSB and MSB) to begin countdown.
+ *
+ * Some devices need a delay here.
+ */
+ outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
+ outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
+ outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
+}
+
+static inline void mach_countup(unsigned long *count_p)
+{
+ unsigned long count = 0;
+ do {
+ count++;
+ } while ((inb_p(0x61) & 0x20) == 0);
+ *count_p = count;
+}
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_TIMER_H */
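For illustration, a minimal sketch of how mach_prepare_counter() and
mach_countup() might be paired to calibrate the TSC. The example_ helper
name is hypothetical, and rdtscll() from <asm/msr.h> is assumed:

/* Hypothetical sketch; not part of this patch. */
static unsigned long __init example_calibrate_tsc(void)
{
	unsigned long long start, end;
	unsigned long loops;

	mach_prepare_counter();		/* program PIT channel 2 countdown */
	rdtscll(start);
	mach_countup(&loops);		/* busy-wait until terminal count */
	rdtscll(end);

	/* TSC ticks elapsed over roughly CALIBRATE_TIME_MSEC milliseconds */
	return (unsigned long)(end - start);
}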
diff --git a/arch/x86/include/asm/mach-default/mach_traps.h b/arch/x86/include/asm/mach-default/mach_traps.h
new file mode 100644
index 00000000000..f7920601e47
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_traps.h
@@ -0,0 +1,33 @@
+/*
+ * Machine specific NMI handling for generic.
+ * Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp>
+ */
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
+#define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
+
+#include <asm/mc146818rtc.h>
+
+static inline unsigned char get_nmi_reason(void)
+{
+ return inb(0x61);
+}
+
+static inline void reassert_nmi(void)
+{
+ int old_reg = -1;
+
+ if (do_i_have_lock_cmos())
+ old_reg = current_lock_cmos_reg();
+ else
+ lock_cmos(0); /* register doesn't matter here */
+ outb(0x8f, 0x70);
+ inb(0x71); /* dummy */
+ outb(0x0f, 0x70);
+ inb(0x71); /* dummy */
+ if (old_reg >= 0)
+ outb(old_reg, 0x70);
+ else
+ unlock_cmos();
+}
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H */
diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h
new file mode 100644
index 00000000000..d5c0b826a4f
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h
@@ -0,0 +1,42 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
+#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
+
+/*
+ * This file copes with machines that wake up secondary CPUs via the
+ * INIT, INIT, STARTUP sequence.
+ */
+
+#define WAKE_SECONDARY_VIA_INIT
+
+#define TRAMPOLINE_LOW phys_to_virt(0x467)
+#define TRAMPOLINE_HIGH phys_to_virt(0x469)
+
+#define boot_cpu_apicid boot_cpu_physical_apicid
+
+static inline void wait_for_init_deassert(atomic_t *deassert)
+{
+ while (!atomic_read(deassert))
+ cpu_relax();
+}
+
+/* Nothing to do for most platforms, since cleared by the INIT cycle */
+static inline void smp_callin_clear_local_apic(void)
+{
+}
+
+static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
+{
+}
+
+static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
+{
+}
+
+#if APIC_DEBUG
+ #define inquire_remote_apic(apicid) __inquire_remote_apic(apicid)
+#else
+ #define inquire_remote_apic(apicid) {}
+#endif
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/mach-default/pci-functions.h b/arch/x86/include/asm/mach-default/pci-functions.h
new file mode 100644
index 00000000000..ed0bab42735
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/pci-functions.h
@@ -0,0 +1,19 @@
+/*
+ * PCI BIOS function numbering for conventional PCI BIOS
+ * systems
+ */
+
+#define PCIBIOS_PCI_FUNCTION_ID 0xb1XX
+#define PCIBIOS_PCI_BIOS_PRESENT 0xb101
+#define PCIBIOS_FIND_PCI_DEVICE 0xb102
+#define PCIBIOS_FIND_PCI_CLASS_CODE 0xb103
+#define PCIBIOS_GENERATE_SPECIAL_CYCLE 0xb106
+#define PCIBIOS_READ_CONFIG_BYTE 0xb108
+#define PCIBIOS_READ_CONFIG_WORD 0xb109
+#define PCIBIOS_READ_CONFIG_DWORD 0xb10a
+#define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b
+#define PCIBIOS_WRITE_CONFIG_WORD 0xb10c
+#define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d
+#define PCIBIOS_GET_ROUTING_OPTIONS 0xb10e
+#define PCIBIOS_SET_PCI_HW_INT 0xb10f
+
diff --git a/arch/x86/include/asm/mach-default/setup_arch.h b/arch/x86/include/asm/mach-default/setup_arch.h
new file mode 100644
index 00000000000..38846208b54
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/setup_arch.h
@@ -0,0 +1,3 @@
+/* Hook to call BIOS initialisation function */
+
+/* no action for generic */
diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h
new file mode 100644
index 00000000000..dbab36d64d4
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h
@@ -0,0 +1,59 @@
+/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws
+ * which needs to alter them. */
+
+static inline void smpboot_clear_io_apic_irqs(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ io_apic_irqs = 0;
+#endif
+}
+
+static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
+{
+ CMOS_WRITE(0xa, 0xf);
+ local_flush_tlb();
+ pr_debug("1.\n");
+ *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
+ pr_debug("2.\n");
+ *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
+ pr_debug("3.\n");
+}
+
+static inline void smpboot_restore_warm_reset_vector(void)
+{
+ /*
+ * Install writable page 0 entry to set BIOS data area.
+ */
+ local_flush_tlb();
+
+ /*
+ * Paranoid: Set warm reset code and vector here back
+ * to default values.
+ */
+ CMOS_WRITE(0, 0xf);
+
+ *((volatile long *) phys_to_virt(0x467)) = 0;
+}
+
+static inline void __init smpboot_setup_io_apic(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Here we can be sure that there is an IO-APIC in the system. Let's
+ * go and set it up:
+ */
+ if (!skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+ else {
+ nr_ioapics = 0;
+ localise_nmi_watchdog();
+ }
+#endif
+}
+
+static inline void smpboot_clear_io_apic(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ nr_ioapics = 0;
+#endif
+}
diff --git a/arch/x86/include/asm/mach-generic/gpio.h b/arch/x86/include/asm/mach-generic/gpio.h
new file mode 100644
index 00000000000..995c45efdb3
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/gpio.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_X86_MACH_GENERIC_GPIO_H
+#define _ASM_X86_MACH_GENERIC_GPIO_H
+
+int gpio_request(unsigned gpio, const char *label);
+void gpio_free(unsigned gpio);
+int gpio_direction_input(unsigned gpio);
+int gpio_direction_output(unsigned gpio, int value);
+int gpio_get_value(unsigned gpio);
+void gpio_set_value(unsigned gpio, int value);
+int gpio_to_irq(unsigned gpio);
+int irq_to_gpio(unsigned irq);
+
+#include <asm-generic/gpio.h> /* cansleep wrappers */
+
+#endif /* _ASM_X86_MACH_GENERIC_GPIO_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h
new file mode 100644
index 00000000000..5180bd7478f
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_apic.h
@@ -0,0 +1,33 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_APIC_H
+#define _ASM_X86_MACH_GENERIC_MACH_APIC_H
+
+#include <asm/genapic.h>
+
+#define esr_disable (genapic->ESR_DISABLE)
+#define NO_BALANCE_IRQ (genapic->no_balance_irq)
+#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
+#define INT_DEST_MODE (genapic->int_dest_mode)
+#undef APIC_DEST_LOGICAL
+#define APIC_DEST_LOGICAL (genapic->apic_destination_logical)
+#define TARGET_CPUS (genapic->target_cpus())
+#define apic_id_registered (genapic->apic_id_registered)
+#define init_apic_ldr (genapic->init_apic_ldr)
+#define ioapic_phys_id_map (genapic->ioapic_phys_id_map)
+#define setup_apic_routing (genapic->setup_apic_routing)
+#define multi_timer_check (genapic->multi_timer_check)
+#define apicid_to_node (genapic->apicid_to_node)
+#define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid)
+#define cpu_present_to_apicid (genapic->cpu_present_to_apicid)
+#define apicid_to_cpu_present (genapic->apicid_to_cpu_present)
+#define setup_portio_remap (genapic->setup_portio_remap)
+#define check_apicid_present (genapic->check_apicid_present)
+#define check_phys_apicid_present (genapic->check_phys_apicid_present)
+#define check_apicid_used (genapic->check_apicid_used)
+#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define vector_allocation_domain (genapic->vector_allocation_domain)
+#define enable_apic_mode (genapic->enable_apic_mode)
+#define phys_pkg_id (genapic->phys_pkg_id)
+
+extern void generic_bigsmp_probe(void);
+
+#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apicdef.h b/arch/x86/include/asm/mach-generic/mach_apicdef.h
new file mode 100644
index 00000000000..68041f3802f
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_apicdef.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
+#define _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
+
+#ifndef APIC_DEFINITION
+#include <asm/genapic.h>
+
+#define GET_APIC_ID (genapic->get_apic_id)
+#define APIC_ID_MASK (genapic->apic_id_mask)
+#endif
+
+#endif /* _ASM_X86_MACH_GENERIC_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_ipi.h b/arch/x86/include/asm/mach-generic/mach_ipi.h
new file mode 100644
index 00000000000..ffd637e3c3d
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_ipi.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_IPI_H
+#define _ASM_X86_MACH_GENERIC_MACH_IPI_H
+
+#include <asm/genapic.h>
+
+#define send_IPI_mask (genapic->send_IPI_mask)
+#define send_IPI_allbutself (genapic->send_IPI_allbutself)
+#define send_IPI_all (genapic->send_IPI_all)
+
+#endif /* _ASM_X86_MACH_GENERIC_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpparse.h b/arch/x86/include/asm/mach-generic/mach_mpparse.h
new file mode 100644
index 00000000000..048f1d46853
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_mpparse.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
+#define _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
+
+extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid);
+
+extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
+
+#endif /* _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpspec.h b/arch/x86/include/asm/mach-generic/mach_mpspec.h
new file mode 100644
index 00000000000..bbab5ccfd4f
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_mpspec.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
+#define _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
+
+#define MAX_IRQ_SOURCES 256
+
+/* Summit or generic (i.e. installer) kernels need lots of bus entries. */
+/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
+#define MAX_MP_BUSSES 260
+
+extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid);
+#endif /* _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-rdc321x/gpio.h b/arch/x86/include/asm/mach-rdc321x/gpio.h
new file mode 100644
index 00000000000..c210ab5788b
--- /dev/null
+++ b/arch/x86/include/asm/mach-rdc321x/gpio.h
@@ -0,0 +1,60 @@
+#ifndef _ASM_X86_MACH_RDC321X_GPIO_H
+#define _ASM_X86_MACH_RDC321X_GPIO_H
+
+#include <linux/kernel.h>
+
+extern int rdc_gpio_get_value(unsigned gpio);
+extern void rdc_gpio_set_value(unsigned gpio, int value);
+extern int rdc_gpio_direction_input(unsigned gpio);
+extern int rdc_gpio_direction_output(unsigned gpio, int value);
+extern int rdc_gpio_request(unsigned gpio, const char *label);
+extern void rdc_gpio_free(unsigned gpio);
+extern void __init rdc321x_gpio_setup(void);
+
+/* Wrappers for the arch-neutral GPIO API */
+
+static inline int gpio_request(unsigned gpio, const char *label)
+{
+ return rdc_gpio_request(gpio, label);
+}
+
+static inline void gpio_free(unsigned gpio)
+{
+ might_sleep();
+ rdc_gpio_free(gpio);
+}
+
+static inline int gpio_direction_input(unsigned gpio)
+{
+ return rdc_gpio_direction_input(gpio);
+}
+
+static inline int gpio_direction_output(unsigned gpio, int value)
+{
+ return rdc_gpio_direction_output(gpio, value);
+}
+
+static inline int gpio_get_value(unsigned gpio)
+{
+ return rdc_gpio_get_value(gpio);
+}
+
+static inline void gpio_set_value(unsigned gpio, int value)
+{
+ rdc_gpio_set_value(gpio, value);
+}
+
+static inline int gpio_to_irq(unsigned gpio)
+{
+ return gpio;
+}
+
+static inline int irq_to_gpio(unsigned irq)
+{
+ return irq;
+}
+
+/* For cansleep */
+#include <asm-generic/gpio.h>
+
+#endif /* _ASM_X86_MACH_RDC321X_GPIO_H */
diff --git a/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h b/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
new file mode 100644
index 00000000000..c8e9c8bed3d
--- /dev/null
+++ b/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
@@ -0,0 +1,12 @@
+#define PFX "rdc321x: "
+
+/* General purpose configuration and data registers */
+#define RDC3210_CFGREG_ADDR 0x0CF8
+#define RDC3210_CFGREG_DATA 0x0CFC
+
+#define RDC321X_GPIO_CTRL_REG1 0x48
+#define RDC321X_GPIO_CTRL_REG2 0x84
+#define RDC321X_GPIO_DATA_REG1 0x4c
+#define RDC321X_GPIO_DATA_REG2 0x88
+
+#define RDC321X_MAX_GPIO 58
diff --git a/arch/x86/include/asm/mach-voyager/do_timer.h b/arch/x86/include/asm/mach-voyager/do_timer.h
new file mode 100644
index 00000000000..9e5a459fd15
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/do_timer.h
@@ -0,0 +1,17 @@
+/* defines for inline arch setup functions */
+#include <linux/clockchips.h>
+
+#include <asm/voyager.h>
+#include <asm/i8253.h>
+
+/**
+ * do_timer_interrupt_hook - hook into timer tick
+ *
+ * Call the PIT clock event handler; see asm/i8253.h.
+ **/
+static inline void do_timer_interrupt_hook(void)
+{
+ global_clock_event->event_handler(global_clock_event);
+ voyager_timer_interrupt();
+}
+
diff --git a/arch/x86/include/asm/mach-voyager/entry_arch.h b/arch/x86/include/asm/mach-voyager/entry_arch.h
new file mode 100644
index 00000000000..ae52624b593
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/entry_arch.h
@@ -0,0 +1,26 @@
+/* -*- mode: c; c-basic-offset: 8 -*- */
+
+/* Copyright (C) 2002
+ *
+ * Author: James.Bottomley@HansenPartnership.com
+ *
+ * linux/arch/i386/voyager/entry_arch.h
+ *
+ * This file builds the VIC and QIC CPI gates
+ */
+
+/* initialise the voyager interrupt gates
+ *
+ * This uses the macros in irq.h to set up assembly jump gates. The
+ * calls are then redirected to the same routines with an smp_ prefix. */
+BUILD_INTERRUPT(vic_sys_interrupt, VIC_SYS_INT)
+BUILD_INTERRUPT(vic_cmn_interrupt, VIC_CMN_INT)
+BUILD_INTERRUPT(vic_cpi_interrupt, VIC_CPI_LEVEL0);
+
+/* do all the QIC interrupts */
+BUILD_INTERRUPT(qic_timer_interrupt, QIC_TIMER_CPI);
+BUILD_INTERRUPT(qic_invalidate_interrupt, QIC_INVALIDATE_CPI);
+BUILD_INTERRUPT(qic_reschedule_interrupt, QIC_RESCHEDULE_CPI);
+BUILD_INTERRUPT(qic_enable_irq_interrupt, QIC_ENABLE_IRQ_CPI);
+BUILD_INTERRUPT(qic_call_function_interrupt, QIC_CALL_FUNCTION_CPI);
+BUILD_INTERRUPT(qic_call_function_single_interrupt, QIC_CALL_FUNCTION_SINGLE_CPI);
diff --git a/arch/x86/include/asm/mach-voyager/setup_arch.h b/arch/x86/include/asm/mach-voyager/setup_arch.h
new file mode 100644
index 00000000000..71729ca05cd
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/setup_arch.h
@@ -0,0 +1,12 @@
+#include <asm/voyager.h>
+#include <asm/setup.h>
+#define VOYAGER_BIOS_INFO ((struct voyager_bios_info *) \
+ (&boot_params.apm_bios_info))
+
+/* Hook to call BIOS initialisation function */
+
+/* for voyager, pass the voyager BIOS/SUS info area to the detection
+ * routines */
+
+#define ARCH_SETUP voyager_detect(VOYAGER_BIOS_INFO);
+
diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
new file mode 100644
index 00000000000..5a65b107ad5
--- /dev/null
+++ b/arch/x86/include/asm/math_emu.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_MATH_EMU_H
+#define _ASM_X86_MATH_EMU_H
+
+/*
+ * This structure matches the layout of the data saved to the stack
+ * following a device-not-present interrupt, part of which is saved
+ * automatically by the 80386/80486.
+ */
+struct info {
+ long ___orig_eip;
+ long ___ebx;
+ long ___ecx;
+ long ___edx;
+ long ___esi;
+ long ___edi;
+ long ___ebp;
+ long ___eax;
+ long ___ds;
+ long ___es;
+ long ___fs;
+ long ___orig_eax;
+ long ___eip;
+ long ___cs;
+ long ___eflags;
+ long ___esp;
+ long ___ss;
+ long ___vm86_es; /* This and the following only in vm86 mode */
+ long ___vm86_ds;
+ long ___vm86_fs;
+ long ___vm86_gs;
+};
+#endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
new file mode 100644
index 00000000000..01fdf5674e2
--- /dev/null
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -0,0 +1,104 @@
+/*
+ * Machine dependent access functions for RTC registers.
+ */
+#ifndef _ASM_X86_MC146818RTC_H
+#define _ASM_X86_MC146818RTC_H
+
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <linux/mc146818rtc.h>
+
+#ifndef RTC_PORT
+#define RTC_PORT(x) (0x70 + (x))
+#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
+#endif
+
+#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
+/*
+ * This lock provides nmi access to the CMOS/RTC registers. It has some
+ * special properties. It is owned by a CPU and stores the index register
+ * currently being accessed (if owned). The idea here is that it works
+ * like a normal lock (normally). However, in an NMI, the NMI code will
+ * first check whether its own CPU holds the lock, which would mean the
+ * NMI interrupted that CPU during a read/write of the device. If so, it
+ * goes ahead and performs the access and then restores the index
+ * register. If not, it locks normally.
+ *
+ * Note that since we are working with NMIs, we need this lock even in
+ * a non-SMP machine just to mark that the lock is owned.
+ *
+ * This only works with compare-and-swap. There is no other way to
+ * atomically claim the lock and set the owner.
+ */
+#include <linux/smp.h>
+extern volatile unsigned long cmos_lock;
+
+/*
+ * All of these below must be called with interrupts off, preempt
+ * disabled, etc.
+ */
+
+static inline void lock_cmos(unsigned char reg)
+{
+ unsigned long new;
+ new = ((smp_processor_id() + 1) << 8) | reg;
+ for (;;) {
+ if (cmos_lock) {
+ cpu_relax();
+ continue;
+ }
+ if (__cmpxchg(&cmos_lock, 0, new, sizeof(cmos_lock)) == 0)
+ return;
+ }
+}
+
+static inline void unlock_cmos(void)
+{
+ cmos_lock = 0;
+}
+
+static inline int do_i_have_lock_cmos(void)
+{
+ return (cmos_lock >> 8) == (smp_processor_id() + 1);
+}
+
+static inline unsigned char current_lock_cmos_reg(void)
+{
+ return cmos_lock & 0xff;
+}
+
+#define lock_cmos_prefix(reg) \
+ do { \
+ unsigned long cmos_flags; \
+ local_irq_save(cmos_flags); \
+ lock_cmos(reg)
+
+#define lock_cmos_suffix(reg) \
+ unlock_cmos(); \
+ local_irq_restore(cmos_flags); \
+ } while (0)
+#else
+#define lock_cmos_prefix(reg) do {} while (0)
+#define lock_cmos_suffix(reg) do {} while (0)
+#define lock_cmos(reg)
+#define unlock_cmos()
+#define do_i_have_lock_cmos() 0
+#define current_lock_cmos_reg() 0
+#endif
+
+/*
+ * All currently supported machines access the RTC index register via
+ * an ISA port access, but the way to access the data register differs ...
+ */
+#define CMOS_READ(addr) rtc_cmos_read(addr)
+#define CMOS_WRITE(val, addr) rtc_cmos_write(val, addr)
+unsigned char rtc_cmos_read(unsigned char addr);
+void rtc_cmos_write(unsigned char val, unsigned char addr);
+
+extern int mach_set_rtc_mmss(unsigned long nowtime);
+extern unsigned long mach_get_cmos_time(void);
+
+#define RTC_IRQ 8
+
+#endif /* _ASM_X86_MC146818RTC_H */
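For illustration, a minimal sketch of an NMI-safe CMOS read built on the
lock_cmos_prefix()/lock_cmos_suffix() pair above; the example_ helper name
is hypothetical:

/* Hypothetical sketch; not part of this patch. */
static inline unsigned char example_cmos_read(unsigned char reg)
{
	unsigned char val;

	lock_cmos_prefix(reg);
	outb_p(reg, RTC_PORT(0));	/* select the index register */
	val = inb_p(RTC_PORT(1));	/* read the data register */
	lock_cmos_suffix(reg);

	return val;
}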
diff --git a/arch/x86/include/asm/mca.h b/arch/x86/include/asm/mca.h
new file mode 100644
index 00000000000..eedbb6cc1ef
--- /dev/null
+++ b/arch/x86/include/asm/mca.h
@@ -0,0 +1,43 @@
+/* -*- mode: c; c-basic-offset: 8 -*- */
+
+/* Platform specific MCA defines */
+#ifndef _ASM_X86_MCA_H
+#define _ASM_X86_MCA_H
+
+/* Maximal number of MCA slots - actually, some machines have less, but
+ * they all have sufficient number of POS registers to cover 8.
+ */
+#define MCA_MAX_SLOT_NR 8
+
+/* Most machines have only one MCA bus. The only multiple bus machines
+ * I know have at most two */
+#define MAX_MCA_BUSSES 2
+
+#define MCA_PRIMARY_BUS 0
+#define MCA_SECONDARY_BUS 1
+
+/* Dummy slot numbers on primary MCA for integrated functions */
+#define MCA_INTEGSCSI (MCA_MAX_SLOT_NR)
+#define MCA_INTEGVIDEO (MCA_MAX_SLOT_NR+1)
+#define MCA_MOTHERBOARD (MCA_MAX_SLOT_NR+2)
+
+/* Dummy POS values for integrated functions */
+#define MCA_DUMMY_POS_START 0x10000
+#define MCA_INTEGSCSI_POS (MCA_DUMMY_POS_START+1)
+#define MCA_INTEGVIDEO_POS (MCA_DUMMY_POS_START+2)
+#define MCA_MOTHERBOARD_POS (MCA_DUMMY_POS_START+3)
+
+/* MCA registers */
+
+#define MCA_MOTHERBOARD_SETUP_REG 0x94
+#define MCA_ADAPTER_SETUP_REG 0x96
+#define MCA_POS_REG(n) (0x100+(n))
+
+#define MCA_ENABLED 0x01 /* POS 2, set if adapter enabled */
+
+/* Max number of adapters, including both slots and various integrated
+ * things.
+ */
+#define MCA_NUMADAPTERS (MCA_MAX_SLOT_NR+3)
+
+#endif /* _ASM_X86_MCA_H */
diff --git a/arch/x86/include/asm/mca_dma.h b/arch/x86/include/asm/mca_dma.h
new file mode 100644
index 00000000000..45271aef82d
--- /dev/null
+++ b/arch/x86/include/asm/mca_dma.h
@@ -0,0 +1,201 @@
+#ifndef _ASM_X86_MCA_DMA_H
+#define _ASM_X86_MCA_DMA_H
+
+#include <asm/io.h>
+#include <linux/ioport.h>
+
+/*
+ * Microchannel specific DMA stuff. DMA on an MCA machine is fairly similar to
+ * standard PC dma, but it certainly has its quirks. DMA register addresses
+ * are in a different place and there are some added functions. Most of this
+ * should be pretty obvious on inspection. Note that the user must divide
+ * count by 2 when using 16-bit dma; that is not handled by these functions.
+ *
+ * Ramen Noodles are yummy.
+ *
+ * 1998 Tymm Twillman <tymm@computer.org>
+ */
+
+/*
+ * Registers that are used by the DMA controller; FN is the function register
+ * (tell the controller what to do) and EXE is the execution register (how
+ * to do it)
+ */
+
+#define MCA_DMA_REG_FN 0x18
+#define MCA_DMA_REG_EXE 0x1A
+
+/*
+ * Functions that the DMA controller can do
+ */
+
+#define MCA_DMA_FN_SET_IO 0x00
+#define MCA_DMA_FN_SET_ADDR 0x20
+#define MCA_DMA_FN_GET_ADDR 0x30
+#define MCA_DMA_FN_SET_COUNT 0x40
+#define MCA_DMA_FN_GET_COUNT 0x50
+#define MCA_DMA_FN_GET_STATUS 0x60
+#define MCA_DMA_FN_SET_MODE 0x70
+#define MCA_DMA_FN_SET_ARBUS 0x80
+#define MCA_DMA_FN_MASK 0x90
+#define MCA_DMA_FN_RESET_MASK 0xA0
+#define MCA_DMA_FN_MASTER_CLEAR 0xD0
+
+/*
+ * Modes (used by setting MCA_DMA_FN_MODE in the function register)
+ *
+ * Note that the MODE_READ is read from memory (write to device), and
+ * MODE_WRITE is vice-versa.
+ */
+
+#define MCA_DMA_MODE_XFER 0x04 /* read by default */
+#define MCA_DMA_MODE_READ 0x04 /* same as XFER */
+#define MCA_DMA_MODE_WRITE 0x08 /* OR with MODE_XFER to use */
+#define MCA_DMA_MODE_IO 0x01 /* DMA from IO register */
+#define MCA_DMA_MODE_16 0x40 /* 16 bit xfers */
+
+
+/**
+ * mca_enable_dma - enable DMA on a channel
+ * @dmanr: DMA channel
+ *
+ * Enable the MCA bus DMA on a channel. This can be called from
+ * IRQ context.
+ */
+
+static inline void mca_enable_dma(unsigned int dmanr)
+{
+ outb(MCA_DMA_FN_RESET_MASK | dmanr, MCA_DMA_REG_FN);
+}
+
+/**
+ * mca_disable_dma - disable DMA on a channel
+ * @dmanr: DMA channel
+ *
+ * Disable the MCA bus DMA on a channel. This can be called from
+ * IRQ context.
+ */
+
+static inline void mca_disable_dma(unsigned int dmanr)
+{
+ outb(MCA_DMA_FN_MASK | dmanr, MCA_DMA_REG_FN);
+}
+
+/**
+ * mca_set_dma_addr - load a 24bit DMA address
+ * @dmanr: DMA channel
+ * @a: 24bit bus address
+ *
+ * Load the address register in the DMA controller. This has a 24bit
+ * limitation (16Mb).
+ */
+
+static inline void mca_set_dma_addr(unsigned int dmanr, unsigned int a)
+{
+ outb(MCA_DMA_FN_SET_ADDR | dmanr, MCA_DMA_REG_FN);
+ outb(a & 0xff, MCA_DMA_REG_EXE);
+ outb((a >> 8) & 0xff, MCA_DMA_REG_EXE);
+ outb((a >> 16) & 0xff, MCA_DMA_REG_EXE);
+}
+
+/**
+ * mca_get_dma_addr - read back a 24bit DMA address
+ * @dmanr: DMA channel
+ *
+ * Read the address register in the DMA controller. This has a 24bit
+ * limitation (16Mb). The return is a bus address.
+ */
+
+static inline unsigned int mca_get_dma_addr(unsigned int dmanr)
+{
+ unsigned int addr;
+
+ outb(MCA_DMA_FN_GET_ADDR | dmanr, MCA_DMA_REG_FN);
+ addr = inb(MCA_DMA_REG_EXE);
+ addr |= inb(MCA_DMA_REG_EXE) << 8;
+ addr |= inb(MCA_DMA_REG_EXE) << 16;
+
+ return addr;
+}
+
+/**
+ * mca_set_dma_count - load a 16bit transfer count
+ * @dmanr: DMA channel
+ * @count: count
+ *
+ * Set the DMA count for this channel. This can be up to 64Kbytes.
+ * Setting a count of zero will not do what you expect.
+ */
+
+static inline void mca_set_dma_count(unsigned int dmanr, unsigned int count)
+{
+ count--; /* transfers one more than count -- correct for this */
+
+ outb(MCA_DMA_FN_SET_COUNT | dmanr, MCA_DMA_REG_FN);
+ outb(count & 0xff, MCA_DMA_REG_EXE);
+ outb((count >> 8) & 0xff, MCA_DMA_REG_EXE);
+}
+
+/**
+ * mca_get_dma_residue - get the remaining bytes to transfer
+ * @dmanr: DMA channel
+ *
+ * This function returns the number of bytes left to transfer
+ * on this DMA channel.
+ */
+
+static inline unsigned int mca_get_dma_residue(unsigned int dmanr)
+{
+ unsigned short count;
+
+ outb(MCA_DMA_FN_GET_COUNT | dmanr, MCA_DMA_REG_FN);
+ count = 1 + inb(MCA_DMA_REG_EXE);
+ count += inb(MCA_DMA_REG_EXE) << 8;
+
+ return count;
+}
+
+/**
+ * mca_set_dma_io - set the port for an I/O transfer
+ * @dmanr: DMA channel
+ * @io_addr: an I/O port number
+ *
+ * Unlike the ISA bus DMA controllers, DMA on the MCA bus can transfer
+ * to or from an I/O port.
+ */
+
+static inline void mca_set_dma_io(unsigned int dmanr, unsigned int io_addr)
+{
+ /*
+ * DMA from a port address -- set the io address
+ */
+
+ outb(MCA_DMA_FN_SET_IO | dmanr, MCA_DMA_REG_FN);
+ outb(io_addr & 0xff, MCA_DMA_REG_EXE);
+ outb((io_addr >> 8) & 0xff, MCA_DMA_REG_EXE);
+}
+
+/**
+ * mca_set_dma_mode - set the DMA mode
+ * @dmanr: DMA channel
+ * @mode: mode to set
+ *
+ * The DMA controller supports several modes. The mode values you can
+ * set are:
+ *
+ * %MCA_DMA_MODE_READ when reading from the DMA device.
+ *
+ * %MCA_DMA_MODE_WRITE when writing to the DMA device.
+ *
+ * %MCA_DMA_MODE_IO to do DMA to or from an I/O port.
+ *
+ * %MCA_DMA_MODE_16 to do 16bit transfers.
+ */
+
+static inline void mca_set_dma_mode(unsigned int dmanr, unsigned int mode)
+{
+ outb(MCA_DMA_FN_SET_MODE | dmanr, MCA_DMA_REG_FN);
+ outb(mode, MCA_DMA_REG_EXE);
+}
+
+#endif /* _ASM_X86_MCA_DMA_H */
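For illustration, a minimal sketch of a typical MCA DMA setup sequence
using the helpers above, for a 16-bit transfer from an I/O port; the
example_ helper name is hypothetical. Note the count is given in
transfers, so a byte length is halved in 16-bit mode, as the header
comment warns:

/* Hypothetical sketch; not part of this patch. */
static inline void example_mca_dma_setup(unsigned int dmanr,
					 unsigned int io_port,
					 unsigned int bus_addr,
					 unsigned int bytes)
{
	mca_disable_dma(dmanr);
	mca_set_dma_mode(dmanr, MCA_DMA_MODE_READ | MCA_DMA_MODE_IO |
				MCA_DMA_MODE_16);
	mca_set_dma_io(dmanr, io_port);
	mca_set_dma_addr(dmanr, bus_addr);
	mca_set_dma_count(dmanr, bytes / 2);	/* count in 16-bit words */
	mca_enable_dma(dmanr);
}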
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
new file mode 100644
index 00000000000..1d6e17c2f23
--- /dev/null
+++ b/arch/x86/include/asm/mce.h
@@ -0,0 +1,130 @@
+#ifndef _ASM_X86_MCE_H
+#define _ASM_X86_MCE_H
+
+#ifdef __x86_64__
+
+#include <asm/ioctls.h>
+#include <asm/types.h>
+
+/*
+ * Machine Check support for x86
+ */
+
+#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
+
+#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
+#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
+#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */
+
+#define MCI_STATUS_VAL (1UL<<63) /* valid error */
+#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */
+#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */
+#define MCI_STATUS_EN (1UL<<60) /* error enabled */
+#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */
+
+/* Fields are zero when not available */
+struct mce {
+ __u64 status;
+ __u64 misc;
+ __u64 addr;
+ __u64 mcgstatus;
+ __u64 ip;
+ __u64 tsc; /* cpu time stamp counter */
+ __u64 res1; /* for future extension */
+	__u64 res2;	/* ditto */
+ __u8 cs; /* code segment */
+ __u8 bank; /* machine check bank */
+ __u8 cpu; /* cpu that raised the error */
+ __u8 finished; /* entry is valid */
+ __u32 pad;
+};
+
+/*
+ * This structure contains all data related to the MCE log. Also
+ * carries a signature to make it easier to find from external
+ * debugging tools. Each entry is only valid when its finished flag
+ * is set.
+ */
+
+#define MCE_LOG_LEN 32
+
+struct mce_log {
+ char signature[12]; /* "MACHINECHECK" */
+ unsigned len; /* = MCE_LOG_LEN */
+ unsigned next;
+ unsigned flags;
+ unsigned pad0;
+ struct mce entry[MCE_LOG_LEN];
+};
+
+#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
+
+#define MCE_LOG_SIGNATURE "MACHINECHECK"
+
+#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
+#define MCE_GET_LOG_LEN _IOR('M', 2, int)
+#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int)
+
+/* Software defined banks */
+#define MCE_EXTENDED_BANK 128
+#define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0)
+
+#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */
+#define K8_MCE_THRESHOLD_BANK_0 (K8_MCE_THRESHOLD_BASE + 0 * 9)
+#define K8_MCE_THRESHOLD_BANK_1 (K8_MCE_THRESHOLD_BASE + 1 * 9)
+#define K8_MCE_THRESHOLD_BANK_2 (K8_MCE_THRESHOLD_BASE + 2 * 9)
+#define K8_MCE_THRESHOLD_BANK_3 (K8_MCE_THRESHOLD_BASE + 3 * 9)
+#define K8_MCE_THRESHOLD_BANK_4 (K8_MCE_THRESHOLD_BASE + 4 * 9)
+#define K8_MCE_THRESHOLD_BANK_5 (K8_MCE_THRESHOLD_BASE + 5 * 9)
+#define K8_MCE_THRESHOLD_DRAM_ECC (K8_MCE_THRESHOLD_BANK_4 + 0)
+
+#endif /* __x86_64__ */
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_X86_32
+extern int mce_disabled;
+#else /* CONFIG_X86_32 */
+
+#include <asm/atomic.h>
+
+void mce_log(struct mce *m);
+DECLARE_PER_CPU(struct sys_device, device_mce);
+extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+
+#ifdef CONFIG_X86_MCE_INTEL
+void mce_intel_feature_init(struct cpuinfo_x86 *c);
+#else
+static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+void mce_amd_feature_init(struct cpuinfo_x86 *c);
+#else
+static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
+#endif
+
+void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+
+extern atomic_t mce_entry;
+
+extern void do_machine_check(struct pt_regs *, long);
+extern int mce_notify_user(void);
+
+#endif /* !CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_MCE
+extern void mcheck_init(struct cpuinfo_x86 *c);
+#else
+#define mcheck_init(c) do { } while (0)
+#endif
+extern void stop_mce(void);
+extern void restart_mce(void);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_MCE_H */
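For illustration, a minimal sketch of scanning a struct mce_log for valid
records, honouring the rule above that an entry is only valid once its
finished flag is set; the example_ helper name is hypothetical:

/* Hypothetical sketch; not part of this patch. */
static void example_scan_mce_log(struct mce_log *log)
{
	unsigned int i;

	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &log->entry[i];

		if (!m->finished)
			continue;		/* slot not yet valid */
		if (m->status & MCI_STATUS_VAL) {
			/* process the record: m->addr, m->bank, ... */
		}
	}
}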
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
new file mode 100644
index 00000000000..c882664716c
--- /dev/null
+++ b/arch/x86/include/asm/microcode.h
@@ -0,0 +1,47 @@
+#ifndef _ASM_X86_MICROCODE_H
+#define _ASM_X86_MICROCODE_H
+
+struct cpu_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int rev;
+};
+
+struct device;
+
+struct microcode_ops {
+ int (*request_microcode_user) (int cpu, const void __user *buf, size_t size);
+ int (*request_microcode_fw) (int cpu, struct device *device);
+
+ void (*apply_microcode) (int cpu);
+
+ int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
+ void (*microcode_fini_cpu) (int cpu);
+};
+
+struct ucode_cpu_info {
+ struct cpu_signature cpu_sig;
+ int valid;
+ void *mc;
+};
+extern struct ucode_cpu_info ucode_cpu_info[];
+
+#ifdef CONFIG_MICROCODE_INTEL
+extern struct microcode_ops * __init init_intel_microcode(void);
+#else
+static inline struct microcode_ops * __init init_intel_microcode(void)
+{
+ return NULL;
+}
+#endif /* CONFIG_MICROCODE_INTEL */
+
+#ifdef CONFIG_MICROCODE_AMD
+extern struct microcode_ops * __init init_amd_microcode(void);
+#else
+static inline struct microcode_ops * __init init_amd_microcode(void)
+{
+ return NULL;
+}
+#endif
+
+#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
new file mode 100644
index 00000000000..90bc4108a4f
--- /dev/null
+++ b/arch/x86/include/asm/mman.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_MMAN_H
+#define _ASM_X86_MMAN_H
+
+#include <asm-generic/mman.h>
+
+#define MAP_32BIT 0x40 /* only give out 32bit addresses */
+
+#define MAP_GROWSDOWN 0x0100 /* stack-like segment */
+#define MAP_DENYWRITE 0x0800 /* ETXTBSY */
+#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */
+#define MAP_LOCKED 0x2000 /* pages are locked */
+#define MAP_NORESERVE 0x4000 /* don't check for reservations */
+#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
+#define MAP_NONBLOCK 0x10000 /* do not block on IO */
+#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
+
+#define MCL_CURRENT 1 /* lock all current mappings */
+#define MCL_FUTURE 2 /* lock all future mappings */
+
+#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/asm/mmconfig.h b/arch/x86/include/asm/mmconfig.h
new file mode 100644
index 00000000000..9b119da1d10
--- /dev/null
+++ b/arch/x86/include/asm/mmconfig.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_MMCONFIG_H
+#define _ASM_X86_MMCONFIG_H
+
+#ifdef CONFIG_PCI_MMCONFIG
+extern void __cpuinit fam10h_check_enable_mmcfg(void);
+extern void __cpuinit check_enable_amd_mmconf_dmi(void);
+#else
+static inline void fam10h_check_enable_mmcfg(void) { }
+static inline void check_enable_amd_mmconf_dmi(void) { }
+#endif
+
+#endif /* _ASM_X86_MMCONFIG_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
new file mode 100644
index 00000000000..80a1dee5bea
--- /dev/null
+++ b/arch/x86/include/asm/mmu.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_MMU_H
+#define _ASM_X86_MMU_H
+
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+/*
+ * The x86 doesn't have an MMU context as such, but
+ * we put the segment information here.
+ */
+typedef struct {
+ void *ldt;
+ int size;
+ struct mutex lock;
+ void *vdso;
+} mm_context_t;
+
+#ifdef CONFIG_SMP
+void leave_mm(int cpu);
+#else
+static inline void leave_mm(int cpu)
+{
+}
+#endif
+
+#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
new file mode 100644
index 00000000000..8aeeb3fd73d
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_X86_MMU_CONTEXT_H
+#define _ASM_X86_MMU_CONTEXT_H
+
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/paravirt.h>
+#ifndef CONFIG_PARAVIRT
+#include <asm-generic/mm_hooks.h>
+
+static inline void paravirt_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+}
+#endif /* !CONFIG_PARAVIRT */
+
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+
+#ifdef CONFIG_X86_32
+# include "mmu_context_32.h"
+#else
+# include "mmu_context_64.h"
+#endif
+
+#define activate_mm(prev, next) \
+do { \
+ paravirt_activate_mm((prev), (next)); \
+ switch_mm((prev), (next), NULL); \
+} while (0)
+
+#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
new file mode 100644
index 00000000000..8e10015781f
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context_32.h
@@ -0,0 +1,56 @@
+#ifndef _ASM_X86_MMU_CONTEXT_32_H
+#define _ASM_X86_MMU_CONTEXT_32_H
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#ifdef CONFIG_SMP
+ unsigned cpu = smp_processor_id();
+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
+#endif
+}
+
+static inline void switch_mm(struct mm_struct *prev,
+ struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ int cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
+ /* stop flush ipis for the previous mm */
+ cpu_clear(cpu, prev->cpu_vm_mask);
+#ifdef CONFIG_SMP
+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
+ per_cpu(cpu_tlbstate, cpu).active_mm = next;
+#endif
+ cpu_set(cpu, next->cpu_vm_mask);
+
+ /* Re-load page tables */
+ load_cr3(next->pgd);
+
+ /*
+ * load the LDT, if the LDT is different:
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt))
+ load_LDT_nolock(&next->context);
+ }
+#ifdef CONFIG_SMP
+ else {
+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
+ BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
+
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload %cr3.
+ */
+ load_cr3(next->pgd);
+ load_LDT_nolock(&next->context);
+ }
+ }
+#endif
+}
+
+#define deactivate_mm(tsk, mm) \
+ asm("movl %0,%%gs": :"r" (0));
+
+#endif /* _ASM_X86_MMU_CONTEXT_32_H */
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
new file mode 100644
index 00000000000..677d36e9540
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context_64.h
@@ -0,0 +1,54 @@
+#ifndef _ASM_X86_MMU_CONTEXT_64_H
+#define _ASM_X86_MMU_CONTEXT_64_H
+
+#include <asm/pda.h>
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#ifdef CONFIG_SMP
+ if (read_pda(mmu_state) == TLBSTATE_OK)
+ write_pda(mmu_state, TLBSTATE_LAZY);
+#endif
+}
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+ if (likely(prev != next)) {
+ /* stop flush ipis for the previous mm */
+ cpu_clear(cpu, prev->cpu_vm_mask);
+#ifdef CONFIG_SMP
+ write_pda(mmu_state, TLBSTATE_OK);
+ write_pda(active_mm, next);
+#endif
+ cpu_set(cpu, next->cpu_vm_mask);
+ load_cr3(next->pgd);
+
+ if (unlikely(next->context.ldt != prev->context.ldt))
+ load_LDT_nolock(&next->context);
+ }
+#ifdef CONFIG_SMP
+ else {
+ write_pda(mmu_state, TLBSTATE_OK);
+		BUG_ON(read_pda(active_mm) != next);
+		if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+			/* We were in lazy tlb mode and leave_mm disabled
+			 * tlb flush IPI delivery. We must reload CR3
+			 * to make sure we are not using freed page tables.
+ */
+ load_cr3(next->pgd);
+ load_LDT_nolock(&next->context);
+ }
+ }
+#endif
+}
+
+#define deactivate_mm(tsk, mm) \
+do { \
+ load_gs_index(0); \
+ asm volatile("movl %0,%%fs"::"r"(0)); \
+} while (0)
+
+#endif /* _ASM_X86_MMU_CONTEXT_64_H */
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
new file mode 100644
index 00000000000..5cbf3135b97
--- /dev/null
+++ b/arch/x86/include/asm/mmx.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_MMX_H
+#define _ASM_X86_MMX_H
+
+/*
+ * MMX 3Dnow! helper operations
+ */
+
+#include <linux/types.h>
+
+extern void *_mmx_memcpy(void *to, const void *from, size_t size);
+extern void mmx_clear_page(void *page);
+extern void mmx_copy_page(void *to, void *from);
+
+#endif /* _ASM_X86_MMX_H */
diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h
new file mode 100644
index 00000000000..64217ea16a3
--- /dev/null
+++ b/arch/x86/include/asm/mmzone.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "mmzone_32.h"
+#else
+# include "mmzone_64.h"
+#endif
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
new file mode 100644
index 00000000000..485bdf059ff
--- /dev/null
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -0,0 +1,134 @@
+/*
+ * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002
+ *
+ */
+
+#ifndef _ASM_X86_MMZONE_32_H
+#define _ASM_X86_MMZONE_32_H
+
+#include <asm/smp.h>
+
+#ifdef CONFIG_NUMA
+extern struct pglist_data *node_data[];
+#define NODE_DATA(nid) (node_data[nid])
+
+#include <asm/numaq.h>
+/* summit or generic arch */
+#include <asm/srat.h>
+
+extern int get_memcfg_numa_flat(void);
+/*
+ * This allows the kernel to be compiled for any one NUMA
+ * architecture and still fall back to the flat function
+ * if that architecture's probe fails.
+ */
+static inline void get_memcfg_numa(void)
+{
+ if (get_memcfg_numaq())
+ return;
+ if (get_memcfg_from_srat())
+ return;
+ get_memcfg_numa_flat();
+}
+
+extern int early_pfn_to_nid(unsigned long pfn);
+
+#else /* !CONFIG_NUMA */
+
+#define get_memcfg_numa get_memcfg_numa_flat
+
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DISCONTIGMEM
+
+/*
+ * generic node memory support, the following assumptions apply:
+ *
+ * 1) memory comes in 64Mb contiguous chunks which are either present or not
+ * 2) we will not have more than 64Gb in total
+ *
+ * for now assume that 64Gb is max amount of RAM for whole system
+ * 64Gb / 4096bytes/page = 16777216 pages
+ */
+#define MAX_NR_PAGES 16777216
+#define MAX_ELEMENTS 1024
+#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
+
+extern s8 physnode_map[];
+
+static inline int pfn_to_nid(unsigned long pfn)
+{
+#ifdef CONFIG_NUMA
+ return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);
+#else
+ return 0;
+#endif
+}
+
+/*
+ * Following are macros that each NUMA implementation must define.
+ */
+
+#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid) \
+({ \
+ pg_data_t *__pgdat = NODE_DATA(nid); \
+ __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
+})
+
+static inline int pfn_valid(int pfn)
+{
+ int nid = pfn_to_nid(pfn);
+
+ if (nid >= 0)
+ return (pfn < node_end_pfn(nid));
+ return 0;
+}
+
+#endif /* CONFIG_DISCONTIGMEM */
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+
+/*
+ * Following are macros that are specific to this NUMA platform.
+ */
+#define reserve_bootmem(addr, size, flags) \
+ reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
+#define alloc_bootmem(x) \
+ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_nopanic(x) \
+ __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
+ __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_low(x) \
+ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_pages(x) \
+ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_pages_nopanic(x) \
+ __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
+ __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_low_pages(x) \
+ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
+#define alloc_bootmem_node(pgdat, x) \
+({ \
+ struct pglist_data __maybe_unused \
+ *__alloc_bootmem_node__pgdat = (pgdat); \
+ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
+ __pa(MAX_DMA_ADDRESS)); \
+})
+#define alloc_bootmem_pages_node(pgdat, x) \
+({ \
+ struct pglist_data __maybe_unused \
+ *__alloc_bootmem_node__pgdat = (pgdat); \
+ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
+ __pa(MAX_DMA_ADDRESS)); \
+})
+#define alloc_bootmem_low_pages_node(pgdat, x) \
+({ \
+ struct pglist_data __maybe_unused \
+ *__alloc_bootmem_node__pgdat = (pgdat); \
+ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
+})
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
+
+#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
new file mode 100644
index 00000000000..a5b3817d4b9
--- /dev/null
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -0,0 +1,51 @@
+/* K8 NUMA support */
+/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */
+/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */
+#ifndef _ASM_X86_MMZONE_64_H
+#define _ASM_X86_MMZONE_64_H
+
+#ifdef CONFIG_NUMA
+
+#include <linux/mmdebug.h>
+
+#include <asm/smp.h>
+
+/* Simple perfect hash to map physical addresses to node numbers */
+struct memnode {
+ int shift;
+ unsigned int mapsize;
+ s16 *map;
+ s16 embedded_map[64 - 8];
+} ____cacheline_aligned; /* total size = 128 bytes */
+extern struct memnode memnode;
+#define memnode_shift memnode.shift
+#define memnodemap memnode.map
+#define memnodemapsize memnode.mapsize
+
+extern struct pglist_data *node_data[];
+
+static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
+{
+ unsigned nid;
+ VIRTUAL_BUG_ON(!memnodemap);
+ nid = memnodemap[addr >> memnode_shift];
+ VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
+ return nid;
+}
+
+#define NODE_DATA(nid) (node_data[nid])
+
+#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
+ NODE_DATA(nid)->node_spanned_pages)
+
+extern int early_pfn_to_nid(unsigned long pfn);
+
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
+#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
+#endif
+
+#endif
+#endif /* _ASM_X86_MMZONE_64_H */
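For illustration, a one-line sketch of how the memnode perfect hash is
typically consumed: map a physical address to its node, then to that
node's pglist_data. The example_ helper name is hypothetical:

/* Hypothetical sketch; not part of this patch. */
static inline struct pglist_data *example_pgdat_of_addr(unsigned long addr)
{
	return NODE_DATA(phys_to_nid(addr));
}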
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
new file mode 100644
index 00000000000..47d62743c4d
--- /dev/null
+++ b/arch/x86/include/asm/module.h
@@ -0,0 +1,80 @@
+#ifndef _ASM_X86_MODULE_H
+#define _ASM_X86_MODULE_H
+
+/* x86_32/64 are simple */
+struct mod_arch_specific {};
+
+#ifdef CONFIG_X86_32
+# define Elf_Shdr Elf32_Shdr
+# define Elf_Sym Elf32_Sym
+# define Elf_Ehdr Elf32_Ehdr
+#else
+# define Elf_Shdr Elf64_Shdr
+# define Elf_Sym Elf64_Sym
+# define Elf_Ehdr Elf64_Ehdr
+#endif
+
+#ifdef CONFIG_X86_64
+/* X86_64 does not define MODULE_PROC_FAMILY */
+#elif defined CONFIG_M386
+#define MODULE_PROC_FAMILY "386 "
+#elif defined CONFIG_M486
+#define MODULE_PROC_FAMILY "486 "
+#elif defined CONFIG_M586
+#define MODULE_PROC_FAMILY "586 "
+#elif defined CONFIG_M586TSC
+#define MODULE_PROC_FAMILY "586TSC "
+#elif defined CONFIG_M586MMX
+#define MODULE_PROC_FAMILY "586MMX "
+#elif defined CONFIG_MCORE2
+#define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_M686
+#define MODULE_PROC_FAMILY "686 "
+#elif defined CONFIG_MPENTIUMII
+#define MODULE_PROC_FAMILY "PENTIUMII "
+#elif defined CONFIG_MPENTIUMIII
+#define MODULE_PROC_FAMILY "PENTIUMIII "
+#elif defined CONFIG_MPENTIUMM
+#define MODULE_PROC_FAMILY "PENTIUMM "
+#elif defined CONFIG_MPENTIUM4
+#define MODULE_PROC_FAMILY "PENTIUM4 "
+#elif defined CONFIG_MK6
+#define MODULE_PROC_FAMILY "K6 "
+#elif defined CONFIG_MK7
+#define MODULE_PROC_FAMILY "K7 "
+#elif defined CONFIG_MK8
+#define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_X86_ELAN
+#define MODULE_PROC_FAMILY "ELAN "
+#elif defined CONFIG_MCRUSOE
+#define MODULE_PROC_FAMILY "CRUSOE "
+#elif defined CONFIG_MEFFICEON
+#define MODULE_PROC_FAMILY "EFFICEON "
+#elif defined CONFIG_MWINCHIPC6
+#define MODULE_PROC_FAMILY "WINCHIPC6 "
+#elif defined CONFIG_MWINCHIP3D
+#define MODULE_PROC_FAMILY "WINCHIP3D "
+#elif defined CONFIG_MCYRIXIII
+#define MODULE_PROC_FAMILY "CYRIXIII "
+#elif defined CONFIG_MVIAC3_2
+#define MODULE_PROC_FAMILY "VIAC3-2 "
+#elif defined CONFIG_MVIAC7
+#define MODULE_PROC_FAMILY "VIAC7 "
+#elif defined CONFIG_MGEODEGX1
+#define MODULE_PROC_FAMILY "GEODEGX1 "
+#elif defined CONFIG_MGEODE_LX
+#define MODULE_PROC_FAMILY "GEODE "
+#else
+#error unknown processor family
+#endif
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_4KSTACKS
+# define MODULE_STACKSIZE "4KSTACKS "
+# else
+# define MODULE_STACKSIZE ""
+# endif
+# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
+#endif
+
+#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
new file mode 100644
index 00000000000..91885c28f66
--- /dev/null
+++ b/arch/x86/include/asm/mpspec.h
@@ -0,0 +1,145 @@
+#ifndef _ASM_X86_MPSPEC_H
+#define _ASM_X86_MPSPEC_H
+
+#include <linux/init.h>
+
+#include <asm/mpspec_def.h>
+
+extern int apic_version[MAX_APICS];
+
+#ifdef CONFIG_X86_32
+#include <mach_mpspec.h>
+
+extern unsigned int def_to_bigsmp;
+extern u8 apicid_2_node[];
+extern int pic_mode;
+
+#ifdef CONFIG_X86_NUMAQ
+extern int mp_bus_id_to_node[MAX_MP_BUSSES];
+extern int mp_bus_id_to_local[MAX_MP_BUSSES];
+extern int quad_local_to_mp_bus_id[NR_CPUS/4][4];
+#endif
+
+#define MAX_APICID 256
+
+#else
+
+#define MAX_MP_BUSSES 256
+/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
+
+#endif
+
+extern void early_find_smp_config(void);
+extern void early_get_smp_config(void);
+
+#if defined(CONFIG_MCA) || defined(CONFIG_EISA)
+extern int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
+extern unsigned int boot_cpu_physical_apicid;
+extern unsigned int max_physical_apicid;
+extern int smp_found_config;
+extern int mpc_default_type;
+extern unsigned long mp_lapic_addr;
+
+extern void find_smp_config(void);
+extern void get_smp_config(void);
+#ifdef CONFIG_X86_MPPARSE
+extern void early_reserve_e820_mpc_new(void);
+#else
+static inline void early_reserve_e820_mpc_new(void) { }
+#endif
+
+void __cpuinit generic_processor_info(int apicid, int version);
+#ifdef CONFIG_ACPI
+extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
+extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+ u32 gsi);
+extern void mp_config_acpi_legacy_irqs(void);
+extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+#ifdef CONFIG_X86_IO_APIC
+extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+ u32 gsi, int triggering, int polarity);
+#else
+static inline int
+mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+ u32 gsi, int triggering, int polarity)
+{
+ return 0;
+}
+#endif
+#endif /* CONFIG_ACPI */
+
+#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
+
+struct physid_mask {
+ unsigned long mask[PHYSID_ARRAY_SIZE];
+};
+
+typedef struct physid_mask physid_mask_t;
+
+#define physid_set(physid, map) set_bit(physid, (map).mask)
+#define physid_clear(physid, map) clear_bit(physid, (map).mask)
+#define physid_isset(physid, map) test_bit(physid, (map).mask)
+#define physid_test_and_set(physid, map) \
+ test_and_set_bit(physid, (map).mask)
+
+#define physids_and(dst, src1, src2) \
+ bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+
+#define physids_or(dst, src1, src2) \
+ bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+
+#define physids_clear(map) \
+ bitmap_zero((map).mask, MAX_APICS)
+
+#define physids_complement(dst, src) \
+ bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+
+#define physids_empty(map) \
+ bitmap_empty((map).mask, MAX_APICS)
+
+#define physids_equal(map1, map2) \
+ bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+
+#define physids_weight(map) \
+ bitmap_weight((map).mask, MAX_APICS)
+
+#define physids_shift_right(d, s, n) \
+ bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+
+#define physids_shift_left(d, s, n) \
+ bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+
+#define physids_coerce(map) ((map).mask[0])
+
+#define physids_promote(physids) \
+ ({ \
+ physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
+ __physid_mask.mask[0] = physids; \
+ __physid_mask; \
+ })
+
+/* Note: will create very large stack frames if physid_mask_t is big */
+#define physid_mask_of_physid(physid) \
+ ({ \
+ physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
+ physid_set(physid, __physid_mask); \
+ __physid_mask; \
+ })
+
+static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
+{
+ physids_clear(*map);
+ physid_set(physid, *map);
+}
+
+#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
+#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
+
+extern physid_mask_t phys_cpu_present_map;
+
+#endif /* _ASM_X86_MPSPEC_H */
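For illustration, a minimal sketch of the physid mask helpers above:
build a mask holding one APIC ID, then test membership. The example_
helper name is hypothetical:

/* Hypothetical sketch; not part of this patch. */
static inline int example_physid_demo(int apicid)
{
	physid_mask_t map;

	physid_set_mask_of_physid(apicid, &map);
	return physid_isset(apicid, map);	/* true by construction */
}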
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
new file mode 100644
index 00000000000..e3ace7d1d35
--- /dev/null
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -0,0 +1,180 @@
+#ifndef _ASM_X86_MPSPEC_DEF_H
+#define _ASM_X86_MPSPEC_DEF_H
+
+/*
+ * Structure definitions for SMP machines following the
+ * Intel Multiprocessing Specification 1.1 and 1.4.
+ */
+
+/*
+ * This tag identifies where the SMP configuration
+ * information is.
+ */
+
+#define SMP_MAGIC_IDENT (('_'<<24) | ('P'<<16) | ('M'<<8) | '_')
+
+#ifdef CONFIG_X86_32
+# define MAX_MPC_ENTRY 1024
+# define MAX_APICS 256
+#else
+# if NR_CPUS <= 255
+# define MAX_APICS 255
+# else
+# define MAX_APICS 32768
+# endif
+#endif
+
+struct intel_mp_floating {
+ char mpf_signature[4]; /* "_MP_" */
+ unsigned int mpf_physptr; /* Configuration table address */
+ unsigned char mpf_length; /* Our length (paragraphs) */
+ unsigned char mpf_specification;/* Specification version */
+ unsigned char mpf_checksum; /* Checksum (makes sum 0) */
+ unsigned char mpf_feature1; /* Standard or configuration ? */
+ unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */
+ unsigned char mpf_feature3; /* Unused (0) */
+ unsigned char mpf_feature4; /* Unused (0) */
+ unsigned char mpf_feature5; /* Unused (0) */
+};
+
+#define MPC_SIGNATURE "PCMP"
+
+struct mp_config_table {
+ char mpc_signature[4];
+ unsigned short mpc_length; /* Size of table */
+ char mpc_spec; /* 0x01 */
+ char mpc_checksum;
+ char mpc_oem[8];
+ char mpc_productid[12];
+ unsigned int mpc_oemptr; /* 0 if not present */
+ unsigned short mpc_oemsize; /* 0 if not present */
+ unsigned short mpc_oemcount;
+ unsigned int mpc_lapic; /* APIC address */
+ unsigned int reserved;
+};
+
+/* Followed by entries */
+
+#define MP_PROCESSOR 0
+#define MP_BUS 1
+#define MP_IOAPIC 2
+#define MP_INTSRC 3
+#define MP_LINTSRC 4
+/* Used by IBM NUMA-Q to describe node locality */
+#define MP_TRANSLATION 192
+
+#define CPU_ENABLED 1 /* Processor is available */
+#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */
+
+#define CPU_STEPPING_MASK 0x000F
+#define CPU_MODEL_MASK 0x00F0
+#define CPU_FAMILY_MASK 0x0F00
+
+struct mpc_config_processor {
+ unsigned char mpc_type;
+ unsigned char mpc_apicid; /* Local APIC number */
+	unsigned char mpc_apicver;	/* Its version */
+ unsigned char mpc_cpuflag;
+ unsigned int mpc_cpufeature;
+ unsigned int mpc_featureflag; /* CPUID feature value */
+ unsigned int mpc_reserved[2];
+};
+
+struct mpc_config_bus {
+ unsigned char mpc_type;
+ unsigned char mpc_busid;
+ unsigned char mpc_bustype[6];
+};
+
+/* List of Bus Type string values, Intel MP Spec. */
+#define BUSTYPE_EISA "EISA"
+#define BUSTYPE_ISA "ISA"
+#define BUSTYPE_INTERN "INTERN" /* Internal BUS */
+#define BUSTYPE_MCA "MCA"
+#define BUSTYPE_VL "VL" /* Local bus */
+#define BUSTYPE_PCI "PCI"
+#define BUSTYPE_PCMCIA "PCMCIA"
+#define BUSTYPE_CBUS "CBUS"
+#define BUSTYPE_CBUSII "CBUSII"
+#define BUSTYPE_FUTURE "FUTURE"
+#define BUSTYPE_MBI "MBI"
+#define BUSTYPE_MBII "MBII"
+#define BUSTYPE_MPI "MPI"
+#define BUSTYPE_MPSA "MPSA"
+#define BUSTYPE_NUBUS "NUBUS"
+#define BUSTYPE_TC "TC"
+#define BUSTYPE_VME "VME"
+#define BUSTYPE_XPRESS "XPRESS"
+
+#define MPC_APIC_USABLE 0x01
+
+struct mpc_config_ioapic {
+ unsigned char mpc_type;
+ unsigned char mpc_apicid;
+ unsigned char mpc_apicver;
+ unsigned char mpc_flags;
+ unsigned int mpc_apicaddr;
+};
+
+struct mpc_config_intsrc {
+ unsigned char mpc_type;
+ unsigned char mpc_irqtype;
+ unsigned short mpc_irqflag;
+ unsigned char mpc_srcbus;
+ unsigned char mpc_srcbusirq;
+ unsigned char mpc_dstapic;
+ unsigned char mpc_dstirq;
+};
+
+enum mp_irq_source_types {
+ mp_INT = 0,
+ mp_NMI = 1,
+ mp_SMI = 2,
+ mp_ExtINT = 3
+};
+
+#define MP_IRQDIR_DEFAULT 0
+#define MP_IRQDIR_HIGH 1
+#define MP_IRQDIR_LOW 3
+
+#define MP_APIC_ALL 0xFF
+
+struct mpc_config_lintsrc {
+ unsigned char mpc_type;
+ unsigned char mpc_irqtype;
+ unsigned short mpc_irqflag;
+ unsigned char mpc_srcbusid;
+ unsigned char mpc_srcbusirq;
+ unsigned char mpc_destapic;
+ unsigned char mpc_destapiclint;
+};
+
+#define MPC_OEM_SIGNATURE "_OEM"
+
+struct mp_config_oemtable {
+ char oem_signature[4];
+ unsigned short oem_length; /* Size of table */
+ char oem_rev; /* 0x01 */
+ char oem_checksum;
+ char mpc_oem[8];
+};
+
+/*
+ * Default configurations
+ *
+ * 1 2 CPU ISA 82489DX
+ * 2 2 CPU EISA 82489DX neither IRQ 0 timer nor IRQ 13 DMA chaining
+ * 3 2 CPU EISA 82489DX
+ * 4 2 CPU MCA 82489DX
+ * 5 2 CPU ISA+PCI
+ * 6 2 CPU EISA+PCI
+ * 7 2 CPU MCA+PCI
+ */
+
+enum mp_bustype {
+ MP_BUS_ISA = 1,
+ MP_BUS_EISA,
+ MP_BUS_PCI,
+ MP_BUS_MCA,
+};
+#endif /* _ASM_X86_MPSPEC_DEF_H */
diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h
new file mode 100644
index 00000000000..7e4e9481f51
--- /dev/null
+++ b/arch/x86/include/asm/msgbuf.h
@@ -0,0 +1,39 @@
+#ifndef _ASM_X86_MSGBUF_H
+#define _ASM_X86_MSGBUF_H
+
+/*
+ * The msqid64_ds structure for i386 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space on i386 is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ *
+ * Pad space on x86-64 is left for:
+ * - 2 miscellaneous 64-bit values
+ */
+struct msqid64_ds {
+ struct ipc64_perm msg_perm;
+ __kernel_time_t msg_stime; /* last msgsnd time */
+#ifdef __i386__
+ unsigned long __unused1;
+#endif
+ __kernel_time_t msg_rtime; /* last msgrcv time */
+#ifdef __i386__
+ unsigned long __unused2;
+#endif
+ __kernel_time_t msg_ctime; /* last change time */
+#ifdef __i386__
+ unsigned long __unused3;
+#endif
+ unsigned long msg_cbytes; /* current number of bytes on queue */
+ unsigned long msg_qnum; /* number of messages in queue */
+ unsigned long msg_qbytes; /* max number of bytes on queue */
+ __kernel_pid_t msg_lspid; /* pid of last msgsnd */
+ __kernel_pid_t msg_lrpid; /* last receive pid */
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+
+#endif /* _ASM_X86_MSGBUF_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
new file mode 100644
index 00000000000..6706b3006f1
--- /dev/null
+++ b/arch/x86/include/asm/msidef.h
@@ -0,0 +1,55 @@
+#ifndef _ASM_X86_MSIDEF_H
+#define _ASM_X86_MSIDEF_H
+
+/*
+ * Constants for Intel APIC based MSI messages.
+ */
+
+/*
+ * Shifts for MSI data
+ */
+
+#define MSI_DATA_VECTOR_SHIFT 0
+#define MSI_DATA_VECTOR_MASK 0x000000ff
+#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \
+ MSI_DATA_VECTOR_MASK)
+
+#define MSI_DATA_DELIVERY_MODE_SHIFT 8
+#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
+#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
+
+#define MSI_DATA_LEVEL_SHIFT 14
+#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
+#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT)
+
+#define MSI_DATA_TRIGGER_SHIFT 15
+#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT)
+#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT)
+
+/*
+ * Shift/mask fields for msi address
+ */
+
+#define MSI_ADDR_BASE_HI 0
+#define MSI_ADDR_BASE_LO 0xfee00000
+
+#define MSI_ADDR_DEST_MODE_SHIFT 2
+#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT)
+#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT)
+
+#define MSI_ADDR_REDIRECTION_SHIFT 3
+#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
+ /* dedicated cpu */
+#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+ /* lowest priority */
+
+#define MSI_ADDR_DEST_ID_SHIFT 12
+#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
+#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
+ MSI_ADDR_DEST_ID_MASK)
+
+#define MSI_ADDR_IR_EXT_INT (1 << 4)
+#define MSI_ADDR_IR_SHV (1 << 3)
+#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13)
+#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5)
+#endif /* _ASM_X86_MSIDEF_H */
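For illustration, here is how a fixed-delivery, edge-triggered MSI message
for a physical destination would be composed from the fields above (a hedged
sketch, not part of this patch; the helper name and the u32/u8 types from
<linux/types.h> are assumptions):

	/* Sketch: compose the MSI address/data pair for one vector. */
	static void compose_msi_msg_sketch(unsigned int dest_apicid, u8 vector,
					   u32 *addr_lo, u32 *data)
	{
		*addr_lo = MSI_ADDR_BASE_LO |		/* 0xfee00000 window */
			   MSI_ADDR_DEST_MODE_PHYSICAL |
			   MSI_ADDR_REDIRECTION_CPU |	/* dedicated cpu, no redirection */
			   MSI_ADDR_DEST_ID(dest_apicid);

		*data = MSI_DATA_TRIGGER_EDGE |
			MSI_DATA_LEVEL_ASSERT |
			MSI_DATA_DELIVERY_FIXED |
			MSI_DATA_VECTOR(vector);
	}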
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
new file mode 100644
index 00000000000..e38859d577a
--- /dev/null
+++ b/arch/x86/include/asm/msr-index.h
@@ -0,0 +1,332 @@
+#ifndef _ASM_X86_MSR_INDEX_H
+#define _ASM_X86_MSR_INDEX_H
+
+/* CPU model specific register (MSR) numbers */
+
+/* x86-64 specific MSRs */
+#define MSR_EFER 0xc0000080 /* extended feature register */
+#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */
+#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */
+#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */
+#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */
+#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
+#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
+#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
+
+/* EFER bits: */
+#define _EFER_SCE 0 /* SYSCALL/SYSRET */
+#define _EFER_LME 8 /* Long mode enable */
+#define _EFER_LMA 10 /* Long mode active (read-only) */
+#define _EFER_NX 11 /* No execute enable */
+
+#define EFER_SCE (1<<_EFER_SCE)
+#define EFER_LME (1<<_EFER_LME)
+#define EFER_LMA (1<<_EFER_LMA)
+#define EFER_NX (1<<_EFER_NX)
+
+/* Intel MSRs. Some also available on other CPUs */
+#define MSR_IA32_PERFCTR0 0x000000c1
+#define MSR_IA32_PERFCTR1 0x000000c2
+#define MSR_FSB_FREQ 0x000000cd
+
+#define MSR_MTRRcap 0x000000fe
+#define MSR_IA32_BBL_CR_CTL 0x00000119
+
+#define MSR_IA32_SYSENTER_CS 0x00000174
+#define MSR_IA32_SYSENTER_ESP 0x00000175
+#define MSR_IA32_SYSENTER_EIP 0x00000176
+
+#define MSR_IA32_MCG_CAP 0x00000179
+#define MSR_IA32_MCG_STATUS 0x0000017a
+#define MSR_IA32_MCG_CTL 0x0000017b
+
+#define MSR_IA32_PEBS_ENABLE 0x000003f1
+#define MSR_IA32_DS_AREA 0x00000600
+#define MSR_IA32_PERF_CAPABILITIES 0x00000345
+
+#define MSR_MTRRfix64K_00000 0x00000250
+#define MSR_MTRRfix16K_80000 0x00000258
+#define MSR_MTRRfix16K_A0000 0x00000259
+#define MSR_MTRRfix4K_C0000 0x00000268
+#define MSR_MTRRfix4K_C8000 0x00000269
+#define MSR_MTRRfix4K_D0000 0x0000026a
+#define MSR_MTRRfix4K_D8000 0x0000026b
+#define MSR_MTRRfix4K_E0000 0x0000026c
+#define MSR_MTRRfix4K_E8000 0x0000026d
+#define MSR_MTRRfix4K_F0000 0x0000026e
+#define MSR_MTRRfix4K_F8000 0x0000026f
+#define MSR_MTRRdefType 0x000002ff
+
+#define MSR_IA32_CR_PAT 0x00000277
+
+#define MSR_IA32_DEBUGCTLMSR 0x000001d9
+#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
+#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
+#define MSR_IA32_LASTINTFROMIP 0x000001dd
+#define MSR_IA32_LASTINTTOIP 0x000001de
+
+/* DEBUGCTLMSR bits (others vary by model): */
+#define _DEBUGCTLMSR_LBR 0 /* last branch recording */
+#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */
+
+#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR)
+#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF)
+
+#define MSR_IA32_MC0_CTL 0x00000400
+#define MSR_IA32_MC0_STATUS 0x00000401
+#define MSR_IA32_MC0_ADDR 0x00000402
+#define MSR_IA32_MC0_MISC 0x00000403
+
+#define MSR_P6_PERFCTR0 0x000000c1
+#define MSR_P6_PERFCTR1 0x000000c2
+#define MSR_P6_EVNTSEL0 0x00000186
+#define MSR_P6_EVNTSEL1 0x00000187
+
+/* AMD64 MSRs. Not complete. See the architecture manual for a more
+ complete list. */
+
+#define MSR_AMD64_NB_CFG 0xc001001f
+#define MSR_AMD64_IBSFETCHCTL 0xc0011030
+#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
+#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
+#define MSR_AMD64_IBSOPCTL 0xc0011033
+#define MSR_AMD64_IBSOPRIP 0xc0011034
+#define MSR_AMD64_IBSOPDATA 0xc0011035
+#define MSR_AMD64_IBSOPDATA2 0xc0011036
+#define MSR_AMD64_IBSOPDATA3 0xc0011037
+#define MSR_AMD64_IBSDCLINAD 0xc0011038
+#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
+#define MSR_AMD64_IBSCTL 0xc001103a
+
+/* Fam 10h MSRs */
+#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
+#define FAM10H_MMIO_CONF_ENABLE (1<<0)
+#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
+#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
+#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
+#define FAM10H_MMIO_CONF_BASE_SHIFT 20
+
+/* K8 MSRs */
+#define MSR_K8_TOP_MEM1 0xc001001a
+#define MSR_K8_TOP_MEM2 0xc001001d
+#define MSR_K8_SYSCFG 0xc0010010
+#define MSR_K8_HWCR 0xc0010015
+#define MSR_K8_INT_PENDING_MSG 0xc0010055
+/* C1E active bits in int pending message */
+#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
+#define MSR_K8_TSEG_ADDR 0xc0010112
+#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */
+#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */
+#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */
+
+/* K7 MSRs */
+#define MSR_K7_EVNTSEL0 0xc0010000
+#define MSR_K7_PERFCTR0 0xc0010004
+#define MSR_K7_EVNTSEL1 0xc0010001
+#define MSR_K7_PERFCTR1 0xc0010005
+#define MSR_K7_EVNTSEL2 0xc0010002
+#define MSR_K7_PERFCTR2 0xc0010006
+#define MSR_K7_EVNTSEL3 0xc0010003
+#define MSR_K7_PERFCTR3 0xc0010007
+#define MSR_K7_CLK_CTL 0xc001001b
+#define MSR_K7_HWCR 0xc0010015
+#define MSR_K7_FID_VID_CTL 0xc0010041
+#define MSR_K7_FID_VID_STATUS 0xc0010042
+
+/* K6 MSRs */
+#define MSR_K6_EFER 0xc0000080
+#define MSR_K6_STAR 0xc0000081
+#define MSR_K6_WHCR 0xc0000082
+#define MSR_K6_UWCCR 0xc0000085
+#define MSR_K6_EPMR 0xc0000086
+#define MSR_K6_PSOR 0xc0000087
+#define MSR_K6_PFIR 0xc0000088
+
+/* Centaur-Hauls/IDT defined MSRs. */
+#define MSR_IDT_FCR1 0x00000107
+#define MSR_IDT_FCR2 0x00000108
+#define MSR_IDT_FCR3 0x00000109
+#define MSR_IDT_FCR4 0x0000010a
+
+#define MSR_IDT_MCR0 0x00000110
+#define MSR_IDT_MCR1 0x00000111
+#define MSR_IDT_MCR2 0x00000112
+#define MSR_IDT_MCR3 0x00000113
+#define MSR_IDT_MCR4 0x00000114
+#define MSR_IDT_MCR5 0x00000115
+#define MSR_IDT_MCR6 0x00000116
+#define MSR_IDT_MCR7 0x00000117
+#define MSR_IDT_MCR_CTRL 0x00000120
+
+/* VIA Cyrix defined MSRs*/
+#define MSR_VIA_FCR 0x00001107
+#define MSR_VIA_LONGHAUL 0x0000110a
+#define MSR_VIA_RNG 0x0000110b
+#define MSR_VIA_BCR2 0x00001147
+
+/* Transmeta defined MSRs */
+#define MSR_TMTA_LONGRUN_CTRL 0x80868010
+#define MSR_TMTA_LONGRUN_FLAGS 0x80868011
+#define MSR_TMTA_LRTI_READOUT 0x80868018
+#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a
+
+/* Intel defined MSRs. */
+#define MSR_IA32_P5_MC_ADDR 0x00000000
+#define MSR_IA32_P5_MC_TYPE 0x00000001
+#define MSR_IA32_TSC 0x00000010
+#define MSR_IA32_PLATFORM_ID 0x00000017
+#define MSR_IA32_EBL_CR_POWERON 0x0000002a
+#define MSR_IA32_FEATURE_CONTROL 0x0000003a
+
+#define FEATURE_CONTROL_LOCKED (1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED (1<<2)
+
+#define MSR_IA32_APICBASE 0x0000001b
+#define MSR_IA32_APICBASE_BSP (1<<8)
+#define MSR_IA32_APICBASE_ENABLE (1<<11)
+#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
+
+#define MSR_IA32_UCODE_WRITE 0x00000079
+#define MSR_IA32_UCODE_REV 0x0000008b
+
+#define MSR_IA32_PERF_STATUS 0x00000198
+#define MSR_IA32_PERF_CTL 0x00000199
+
+#define MSR_IA32_MPERF 0x000000e7
+#define MSR_IA32_APERF 0x000000e8
+
+#define MSR_IA32_THERM_CONTROL 0x0000019a
+#define MSR_IA32_THERM_INTERRUPT 0x0000019b
+#define MSR_IA32_THERM_STATUS 0x0000019c
+#define MSR_IA32_MISC_ENABLE 0x000001a0
+
+/* Intel Model 6 */
+#define MSR_P6_EVNTSEL0 0x00000186
+#define MSR_P6_EVNTSEL1 0x00000187
+
+/* P4/Xeon+ specific */
+#define MSR_IA32_MCG_EAX 0x00000180
+#define MSR_IA32_MCG_EBX 0x00000181
+#define MSR_IA32_MCG_ECX 0x00000182
+#define MSR_IA32_MCG_EDX 0x00000183
+#define MSR_IA32_MCG_ESI 0x00000184
+#define MSR_IA32_MCG_EDI 0x00000185
+#define MSR_IA32_MCG_EBP 0x00000186
+#define MSR_IA32_MCG_ESP 0x00000187
+#define MSR_IA32_MCG_EFLAGS 0x00000188
+#define MSR_IA32_MCG_EIP 0x00000189
+#define MSR_IA32_MCG_RESERVED 0x0000018a
+
+/* Pentium IV performance counter MSRs */
+#define MSR_P4_BPU_PERFCTR0 0x00000300
+#define MSR_P4_BPU_PERFCTR1 0x00000301
+#define MSR_P4_BPU_PERFCTR2 0x00000302
+#define MSR_P4_BPU_PERFCTR3 0x00000303
+#define MSR_P4_MS_PERFCTR0 0x00000304
+#define MSR_P4_MS_PERFCTR1 0x00000305
+#define MSR_P4_MS_PERFCTR2 0x00000306
+#define MSR_P4_MS_PERFCTR3 0x00000307
+#define MSR_P4_FLAME_PERFCTR0 0x00000308
+#define MSR_P4_FLAME_PERFCTR1 0x00000309
+#define MSR_P4_FLAME_PERFCTR2 0x0000030a
+#define MSR_P4_FLAME_PERFCTR3 0x0000030b
+#define MSR_P4_IQ_PERFCTR0 0x0000030c
+#define MSR_P4_IQ_PERFCTR1 0x0000030d
+#define MSR_P4_IQ_PERFCTR2 0x0000030e
+#define MSR_P4_IQ_PERFCTR3 0x0000030f
+#define MSR_P4_IQ_PERFCTR4 0x00000310
+#define MSR_P4_IQ_PERFCTR5 0x00000311
+#define MSR_P4_BPU_CCCR0 0x00000360
+#define MSR_P4_BPU_CCCR1 0x00000361
+#define MSR_P4_BPU_CCCR2 0x00000362
+#define MSR_P4_BPU_CCCR3 0x00000363
+#define MSR_P4_MS_CCCR0 0x00000364
+#define MSR_P4_MS_CCCR1 0x00000365
+#define MSR_P4_MS_CCCR2 0x00000366
+#define MSR_P4_MS_CCCR3 0x00000367
+#define MSR_P4_FLAME_CCCR0 0x00000368
+#define MSR_P4_FLAME_CCCR1 0x00000369
+#define MSR_P4_FLAME_CCCR2 0x0000036a
+#define MSR_P4_FLAME_CCCR3 0x0000036b
+#define MSR_P4_IQ_CCCR0 0x0000036c
+#define MSR_P4_IQ_CCCR1 0x0000036d
+#define MSR_P4_IQ_CCCR2 0x0000036e
+#define MSR_P4_IQ_CCCR3 0x0000036f
+#define MSR_P4_IQ_CCCR4 0x00000370
+#define MSR_P4_IQ_CCCR5 0x00000371
+#define MSR_P4_ALF_ESCR0 0x000003ca
+#define MSR_P4_ALF_ESCR1 0x000003cb
+#define MSR_P4_BPU_ESCR0 0x000003b2
+#define MSR_P4_BPU_ESCR1 0x000003b3
+#define MSR_P4_BSU_ESCR0 0x000003a0
+#define MSR_P4_BSU_ESCR1 0x000003a1
+#define MSR_P4_CRU_ESCR0 0x000003b8
+#define MSR_P4_CRU_ESCR1 0x000003b9
+#define MSR_P4_CRU_ESCR2 0x000003cc
+#define MSR_P4_CRU_ESCR3 0x000003cd
+#define MSR_P4_CRU_ESCR4 0x000003e0
+#define MSR_P4_CRU_ESCR5 0x000003e1
+#define MSR_P4_DAC_ESCR0 0x000003a8
+#define MSR_P4_DAC_ESCR1 0x000003a9
+#define MSR_P4_FIRM_ESCR0 0x000003a4
+#define MSR_P4_FIRM_ESCR1 0x000003a5
+#define MSR_P4_FLAME_ESCR0 0x000003a6
+#define MSR_P4_FLAME_ESCR1 0x000003a7
+#define MSR_P4_FSB_ESCR0 0x000003a2
+#define MSR_P4_FSB_ESCR1 0x000003a3
+#define MSR_P4_IQ_ESCR0 0x000003ba
+#define MSR_P4_IQ_ESCR1 0x000003bb
+#define MSR_P4_IS_ESCR0 0x000003b4
+#define MSR_P4_IS_ESCR1 0x000003b5
+#define MSR_P4_ITLB_ESCR0 0x000003b6
+#define MSR_P4_ITLB_ESCR1 0x000003b7
+#define MSR_P4_IX_ESCR0 0x000003c8
+#define MSR_P4_IX_ESCR1 0x000003c9
+#define MSR_P4_MOB_ESCR0 0x000003aa
+#define MSR_P4_MOB_ESCR1 0x000003ab
+#define MSR_P4_MS_ESCR0 0x000003c0
+#define MSR_P4_MS_ESCR1 0x000003c1
+#define MSR_P4_PMH_ESCR0 0x000003ac
+#define MSR_P4_PMH_ESCR1 0x000003ad
+#define MSR_P4_RAT_ESCR0 0x000003bc
+#define MSR_P4_RAT_ESCR1 0x000003bd
+#define MSR_P4_SAAT_ESCR0 0x000003ae
+#define MSR_P4_SAAT_ESCR1 0x000003af
+#define MSR_P4_SSU_ESCR0 0x000003be
+#define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */
+
+#define MSR_P4_TBPU_ESCR0 0x000003c2
+#define MSR_P4_TBPU_ESCR1 0x000003c3
+#define MSR_P4_TC_ESCR0 0x000003c4
+#define MSR_P4_TC_ESCR1 0x000003c5
+#define MSR_P4_U2L_ESCR0 0x000003b0
+#define MSR_P4_U2L_ESCR1 0x000003b1
+
+/* Intel Core-based CPU performance counters */
+#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
+#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
+#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b
+#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d
+#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e
+#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390
+
+/* Geode defined MSRs */
+#define MSR_GEODE_BUSCONT_CONF0 0x00001900
+
+/* Intel VT MSRs */
+#define MSR_IA32_VMX_BASIC 0x00000480
+#define MSR_IA32_VMX_PINBASED_CTLS 0x00000481
+#define MSR_IA32_VMX_PROCBASED_CTLS 0x00000482
+#define MSR_IA32_VMX_EXIT_CTLS 0x00000483
+#define MSR_IA32_VMX_ENTRY_CTLS 0x00000484
+#define MSR_IA32_VMX_MISC 0x00000485
+#define MSR_IA32_VMX_CR0_FIXED0 0x00000486
+#define MSR_IA32_VMX_CR0_FIXED1 0x00000487
+#define MSR_IA32_VMX_CR4_FIXED0 0x00000488
+#define MSR_IA32_VMX_CR4_FIXED1 0x00000489
+#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
+#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
+#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
+
+#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
new file mode 100644
index 00000000000..46be2fa7ac2
--- /dev/null
+++ b/arch/x86/include/asm/msr.h
@@ -0,0 +1,247 @@
+#ifndef _ASM_X86_MSR_H
+#define _ASM_X86_MSR_H
+
+#include <asm/msr-index.h>
+
+#ifndef __ASSEMBLY__
+# include <linux/types.h>
+#endif
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+
+#include <asm/asm.h>
+#include <asm/errno.h>
+
+static inline unsigned long long native_read_tscp(unsigned int *aux)
+{
+ unsigned long low, high;
+ asm volatile(".byte 0x0f,0x01,0xf9"
+ : "=a" (low), "=d" (high), "=c" (*aux));
+ return low | ((u64)high << 32);
+}
+
+/*
+ * i386 calling convention returns a 64-bit value in edx:eax, while
+ * x86_64 returns it in rax.  Also, the "A" constraint does not really
+ * mean rdx:rax on x86_64, so we need specialized behaviour for each
+ * architecture.
+ */
+#ifdef CONFIG_X86_64
+#define DECLARE_ARGS(val, low, high) unsigned low, high
+#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
+#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
+#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
+#else
+#define DECLARE_ARGS(val, low, high) unsigned long long val
+#define EAX_EDX_VAL(val, low, high) (val)
+#define EAX_EDX_ARGS(val, low, high) "A" (val)
+#define EAX_EDX_RET(val, low, high) "=A" (val)
+#endif
+
+static inline unsigned long long native_read_msr(unsigned int msr)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+ return EAX_EDX_VAL(val, low, high);
+}
+
+static inline unsigned long long native_read_msr_safe(unsigned int msr,
+ int *err)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("2: rdmsr ; xor %[err],%[err]\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: mov %[fault],%[err] ; jmp 1b\n\t"
+ ".previous\n\t"
+ _ASM_EXTABLE(2b, 3b)
+ : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
+ : "c" (msr), [fault] "i" (-EFAULT));
+ return EAX_EDX_VAL(val, low, high);
+}
+
+static inline unsigned long long native_read_msr_amd_safe(unsigned int msr,
+ int *err)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("2: rdmsr ; xor %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: mov %3,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ _ASM_EXTABLE(2b, 3b)
+ : "=r" (*err), EAX_EDX_RET(val, low, high)
+ : "c" (msr), "D" (0x9c5a203a), "i" (-EFAULT));
+ return EAX_EDX_VAL(val, low, high);
+}
+
+static inline void native_write_msr(unsigned int msr,
+ unsigned low, unsigned high)
+{
+ asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+}
+
+static inline int native_write_msr_safe(unsigned int msr,
+ unsigned low, unsigned high)
+{
+ int err;
+ asm volatile("2: wrmsr ; xor %[err],%[err]\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: mov %[fault],%[err] ; jmp 1b\n\t"
+ ".previous\n\t"
+ _ASM_EXTABLE(2b, 3b)
+ : [err] "=a" (err)
+ : "c" (msr), "0" (low), "d" (high),
+ [fault] "i" (-EFAULT)
+ : "memory");
+ return err;
+}
+
+extern unsigned long long native_read_tsc(void);
+
+static __always_inline unsigned long long __native_read_tsc(void)
+{
+ DECLARE_ARGS(val, low, high);
+
+ rdtsc_barrier();
+ asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
+ rdtsc_barrier();
+
+ return EAX_EDX_VAL(val, low, high);
+}
+
+static inline unsigned long long native_read_pmc(int counter)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+ return EAX_EDX_VAL(val, low, high);
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#include <linux/errno.h>
+/*
+ * Access to machine-specific registers (available on 586 and better only)
+ * Note: the rd* operations modify the parameters directly (without using
+ * pointer indirection), which allows gcc to optimize better.
+ */
+
+#define rdmsr(msr, val1, val2) \
+do { \
+ u64 __val = native_read_msr((msr)); \
+ (val1) = (u32)__val; \
+ (val2) = (u32)(__val >> 32); \
+} while (0)
+
+static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
+{
+ native_write_msr(msr, low, high);
+}
+
+#define rdmsrl(msr, val) \
+ ((val) = native_read_msr((msr)))
+
+#define wrmsrl(msr, val) \
+ native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32))
+
+/* wrmsr with exception handling */
+static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
+{
+ return native_write_msr_safe(msr, low, high);
+}
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr, p1, p2) \
+({ \
+ int __err; \
+ u64 __val = native_read_msr_safe((msr), &__err); \
+ (*p1) = (u32)__val; \
+ (*p2) = (u32)(__val >> 32); \
+ __err; \
+})
+
+static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
+{
+ int err;
+
+ *p = native_read_msr_safe(msr, &err);
+ return err;
+}
+static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+{
+ int err;
+
+ *p = native_read_msr_amd_safe(msr, &err);
+ return err;
+}
+
+#define rdtscl(low) \
+ ((low) = (u32)native_read_tsc())
+
+#define rdtscll(val) \
+ ((val) = native_read_tsc())
+
+#define rdpmc(counter, low, high) \
+do { \
+ u64 _l = native_read_pmc((counter)); \
+ (low) = (u32)_l; \
+ (high) = (u32)(_l >> 32); \
+} while (0)
+
+#define rdtscp(low, high, aux) \
+do { \
+ unsigned long long _val = native_read_tscp(&(aux)); \
+ (low) = (u32)_val; \
+ (high) = (u32)(_val >> 32); \
+} while (0)
+
+#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
+
+#endif /* !CONFIG_PARAVIRT */
+
+
+#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
+ (u32)((val) >> 32))
+
+#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2))
+
+#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)
+
+#ifdef CONFIG_SMP
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+#else /* CONFIG_SMP */
+static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ rdmsr(msr_no, *l, *h);
+ return 0;
+}
+static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ wrmsr(msr_no, l, h);
+ return 0;
+}
+static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
+ u32 *l, u32 *h)
+{
+ return rdmsr_safe(msr_no, l, h);
+}
+static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ return wrmsr_safe(msr_no, l, h);
+}
+#endif /* CONFIG_SMP */
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+
+#endif /* _ASM_X86_MSR_H */
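As a usage sketch of the accessors above (kernel context assumed; the
function name is hypothetical, MSR_IA32_MISC_ENABLE comes from the
msr-index.h hunk earlier in this patch):

	static void msr_usage_sketch(void)
	{
		u32 lo, hi;
		u64 val;

		rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);	/* split 32-bit halves */
		rdmsrl(MSR_IA32_MISC_ENABLE, val);	/* full 64-bit value */

		/* the _safe variants return non-zero if the access faults */
		if (wrmsr_safe(MSR_IA32_MISC_ENABLE, lo, hi))
			printk("MSR not implemented on this CPU\n");
	}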
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
new file mode 100644
index 00000000000..7c1e4258b31
--- /dev/null
+++ b/arch/x86/include/asm/mtrr.h
@@ -0,0 +1,173 @@
+/* Generic MTRR (Memory Type Range Register) ioctls.
+
+ Copyright (C) 1997-1999 Richard Gooch
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Richard Gooch may be reached by email at rgooch@atnf.csiro.au
+ The postal address is:
+ Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
+*/
+#ifndef _ASM_X86_MTRR_H
+#define _ASM_X86_MTRR_H
+
+#include <linux/ioctl.h>
+#include <linux/errno.h>
+
+#define MTRR_IOCTL_BASE 'M'
+
+struct mtrr_sentry {
+ unsigned long base; /* Base address */
+ unsigned int size; /* Size of region */
+ unsigned int type; /* Type of region */
+};
+
+/* Warning: this structure has a different field order on x86-64 than
+   on i386.  The 32-bit emulation code takes care of that.  But you
+   need to use this layout for 64-bit, otherwise your X server
+   will break. */
+
+#ifdef __i386__
+struct mtrr_gentry {
+ unsigned int regnum; /* Register number */
+ unsigned long base; /* Base address */
+ unsigned int size; /* Size of region */
+ unsigned int type; /* Type of region */
+};
+
+#else /* __i386__ */
+
+struct mtrr_gentry {
+ unsigned long base; /* Base address */
+ unsigned int size; /* Size of region */
+ unsigned int regnum; /* Register number */
+ unsigned int type; /* Type of region */
+};
+#endif /* !__i386__ */
+
+/* These are the various ioctls */
+#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
+#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
+#define MTRRIOC_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry)
+#define MTRRIOC_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry)
+#define MTRRIOC_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry)
+#define MTRRIOC_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry)
+#define MTRRIOC_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry)
+#define MTRRIOC_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry)
+#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
+#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry)
+
+/* These are the region types */
+#define MTRR_TYPE_UNCACHABLE 0
+#define MTRR_TYPE_WRCOMB 1
+/*#define MTRR_TYPE_ 2*/
+/*#define MTRR_TYPE_ 3*/
+#define MTRR_TYPE_WRTHROUGH 4
+#define MTRR_TYPE_WRPROT 5
+#define MTRR_TYPE_WRBACK 6
+#define MTRR_NUM_TYPES 7
+
+#ifdef __KERNEL__
+
+/* The following functions are for use by other drivers */
+# ifdef CONFIG_MTRR
+extern u8 mtrr_type_lookup(u64 addr, u64 end);
+extern void mtrr_save_fixed_ranges(void *);
+extern void mtrr_save_state(void);
+extern int mtrr_add(unsigned long base, unsigned long size,
+ unsigned int type, bool increment);
+extern int mtrr_add_page(unsigned long base, unsigned long size,
+ unsigned int type, bool increment);
+extern int mtrr_del(int reg, unsigned long base, unsigned long size);
+extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
+extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
+extern void mtrr_ap_init(void);
+extern void mtrr_bp_init(void);
+extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
+extern int amd_special_default_mtrr(void);
+# else
+static inline u8 mtrr_type_lookup(u64 addr, u64 end)
+{
+ /*
+ * Return no-MTRRs:
+ */
+ return 0xff;
+}
+#define mtrr_save_fixed_ranges(arg) do {} while (0)
+#define mtrr_save_state() do {} while (0)
+static inline int mtrr_add(unsigned long base, unsigned long size,
+ unsigned int type, bool increment)
+{
+ return -ENODEV;
+}
+static inline int mtrr_add_page(unsigned long base, unsigned long size,
+ unsigned int type, bool increment)
+{
+ return -ENODEV;
+}
+static inline int mtrr_del(int reg, unsigned long base, unsigned long size)
+{
+ return -ENODEV;
+}
+static inline int mtrr_del_page(int reg, unsigned long base, unsigned long size)
+{
+ return -ENODEV;
+}
+static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+ return 0;
+}
+static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
+{
+}
+
+#define mtrr_ap_init() do {} while (0)
+#define mtrr_bp_init() do {} while (0)
+# endif
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct mtrr_sentry32 {
+ compat_ulong_t base; /* Base address */
+ compat_uint_t size; /* Size of region */
+ compat_uint_t type; /* Type of region */
+};
+
+struct mtrr_gentry32 {
+ compat_ulong_t regnum; /* Register number */
+ compat_uint_t base; /* Base address */
+ compat_uint_t size; /* Size of region */
+ compat_uint_t type; /* Type of region */
+};
+
+#define MTRR_IOCTL_BASE 'M'
+
+#define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
+#define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32)
+#define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32)
+#define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32)
+#define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32)
+#define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32)
+#define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32)
+#define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32)
+#define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32)
+#define MTRRIOC32_KILL_PAGE_ENTRY \
+ _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32)
+#endif /* CONFIG_COMPAT */
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_MTRR_H */
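A typical in-kernel consumer is a framebuffer driver marking its aperture
write-combining; a hedged sketch (fb_base, fb_size, and the function name
are hypothetical):

	/* Sketch: enable write-combining for a framebuffer aperture. */
	static int fb_enable_wc_sketch(unsigned long fb_base, unsigned long fb_size)
	{
		/* mtrr_add() returns the allocated register number, or -errno */
		return mtrr_add(fb_base, fb_size, MTRR_TYPE_WRCOMB, true);
	}

	/* ... and on teardown: mtrr_del(reg, fb_base, fb_size); */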
diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h
new file mode 100644
index 00000000000..a731b9c573a
--- /dev/null
+++ b/arch/x86/include/asm/mutex.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "mutex_32.h"
+#else
+# include "mutex_64.h"
+#endif
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
new file mode 100644
index 00000000000..03f90c8a5a7
--- /dev/null
+++ b/arch/x86/include/asm/mutex_32.h
@@ -0,0 +1,125 @@
+/*
+ * Assembly implementation of the mutex fastpath, based on atomic
+ * decrement/increment.
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#ifndef _ASM_X86_MUTEX_32_H
+#define _ASM_X86_MUTEX_32_H
+
+#include <asm/alternative.h>
+
+/**
+ * __mutex_fastpath_lock - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
+ * wasn't 1 originally. This function MUST leave the value lower than 1
+ * even when the "1" assertion wasn't true.
+ */
+#define __mutex_fastpath_lock(count, fail_fn) \
+do { \
+ unsigned int dummy; \
+ \
+ typecheck(atomic_t *, count); \
+ typecheck_fn(void (*)(atomic_t *), fail_fn); \
+ \
+ asm volatile(LOCK_PREFIX " decl (%%eax)\n" \
+ " jns 1f \n" \
+ " call " #fail_fn "\n" \
+ "1:\n" \
+ : "=a" (dummy) \
+ : "a" (count) \
+ : "memory", "ecx", "edx"); \
+} while (0)
+
+
+/**
+ * __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
+ * wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns
+ */
+static inline int __mutex_fastpath_lock_retval(atomic_t *count,
+ int (*fail_fn)(atomic_t *))
+{
+ if (unlikely(atomic_dec_return(count) < 0))
+ return fail_fn(count);
+ else
+ return 0;
+}
+
+/**
+ * __mutex_fastpath_unlock - try to promote the mutex from 0 to 1
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 0
+ *
+ * Try to promote the mutex from 0 to 1. If it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value
+ * to 1, or to set it to a value lower than 1.
+ *
+ * If the implementation sets it to a value of lower than 1, the
+ * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs
+ * to return 0 otherwise.
+ */
+#define __mutex_fastpath_unlock(count, fail_fn) \
+do { \
+ unsigned int dummy; \
+ \
+ typecheck(atomic_t *, count); \
+ typecheck_fn(void (*)(atomic_t *), fail_fn); \
+ \
+ asm volatile(LOCK_PREFIX " incl (%%eax)\n" \
+ " jg 1f\n" \
+ " call " #fail_fn "\n" \
+ "1:\n" \
+ : "=a" (dummy) \
+ : "a" (count) \
+ : "memory", "ecx", "edx"); \
+} while (0)
+
+#define __mutex_slowpath_needs_to_unlock() 1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ * @count: pointer of type atomic_t
+ * @fail_fn: fallback function
+ *
+ * Change the count from 1 to a value lower than 1, and return 0 (failure)
+ * if it wasn't 1 originally, or return 1 (success) otherwise. This function
+ * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
+ * Additionally, if the value was < 0 originally, this function must not
+ * leave it at 0 on failure.
+ */
+static inline int __mutex_fastpath_trylock(atomic_t *count,
+ int (*fail_fn)(atomic_t *))
+{
+ /*
+ * We have two variants here. The cmpxchg based one is the best one
+	 * because it never induces a false contention state. It is included
+ * here because architectures using the inc/dec algorithms over the
+ * xchg ones are much more likely to support cmpxchg natively.
+ *
+ * If not we fall back to the spinlock based variant - that is
+ * just as efficient (and simpler) as a 'destructive' probing of
+ * the mutex state would be.
+ */
+#ifdef __HAVE_ARCH_CMPXCHG
+ if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+ return 1;
+ return 0;
+#else
+ return fail_fn(count);
+#endif
+}
+
+#endif /* _ASM_X86_MUTEX_32_H */
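For context, the generic mutex code is the intended caller of these hooks;
simplified from kernel/mutex.c of this era (a sketch, not part of the patch):

	void __sched mutex_lock(struct mutex *lock)
	{
		might_sleep();
		/* the fastpath is the 1->0 transition from 'unlocked' */
		__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
	}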
diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
new file mode 100644
index 00000000000..68a87b0f8e2
--- /dev/null
+++ b/arch/x86/include/asm/mutex_64.h
@@ -0,0 +1,100 @@
+/*
+ * Assembly implementation of the mutex fastpath, based on atomic
+ * decrement/increment.
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#ifndef _ASM_X86_MUTEX_64_H
+#define _ASM_X86_MUTEX_64_H
+
+/**
+ * __mutex_fastpath_lock - decrement and call function if negative
+ * @v: pointer of type atomic_t
+ * @fail_fn: function to call if the result is negative
+ *
+ * Atomically decrements @v and calls <fail_fn> if the result is negative.
+ */
+#define __mutex_fastpath_lock(v, fail_fn) \
+do { \
+ unsigned long dummy; \
+ \
+ typecheck(atomic_t *, v); \
+ typecheck_fn(void (*)(atomic_t *), fail_fn); \
+ \
+ asm volatile(LOCK_PREFIX " decl (%%rdi)\n" \
+ " jns 1f \n" \
+ " call " #fail_fn "\n" \
+ "1:" \
+ : "=D" (dummy) \
+ : "D" (v) \
+ : "rax", "rsi", "rdx", "rcx", \
+ "r8", "r9", "r10", "r11", "memory"); \
+} while (0)
+
+/**
+ * __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns
+ */
+static inline int __mutex_fastpath_lock_retval(atomic_t *count,
+ int (*fail_fn)(atomic_t *))
+{
+ if (unlikely(atomic_dec_return(count) < 0))
+ return fail_fn(count);
+ else
+ return 0;
+}
+
+/**
+ * __mutex_fastpath_unlock - increment and call function if nonpositive
+ * @v: pointer of type atomic_t
+ * @fail_fn: function to call if the result is nonpositive
+ *
+ * Atomically increments @v and calls <fail_fn> if the result is nonpositive.
+ */
+#define __mutex_fastpath_unlock(v, fail_fn) \
+do { \
+ unsigned long dummy; \
+ \
+ typecheck(atomic_t *, v); \
+ typecheck_fn(void (*)(atomic_t *), fail_fn); \
+ \
+ asm volatile(LOCK_PREFIX " incl (%%rdi)\n" \
+ " jg 1f\n" \
+ " call " #fail_fn "\n" \
+ "1:" \
+ : "=D" (dummy) \
+ : "D" (v) \
+ : "rax", "rsi", "rdx", "rcx", \
+ "r8", "r9", "r10", "r11", "memory"); \
+} while (0)
+
+#define __mutex_slowpath_needs_to_unlock() 1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ * @count: pointer of type atomic_t
+ * @fail_fn: fallback function
+ *
+ * Change the count from 1 to 0 and return 1 (success), or return 0 (failure)
+ * if it wasn't 1 originally. [the fallback function is never used on
+ * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.]
+ */
+static inline int __mutex_fastpath_trylock(atomic_t *count,
+ int (*fail_fn)(atomic_t *))
+{
+ if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+ return 1;
+ else
+ return 0;
+}
+
+#endif /* _ASM_X86_MUTEX_64_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
new file mode 100644
index 00000000000..c45a0a568df
--- /dev/null
+++ b/arch/x86/include/asm/nmi.h
@@ -0,0 +1,81 @@
+#ifndef _ASM_X86_NMI_H
+#define _ASM_X86_NMI_H
+
+#include <linux/pm.h>
+#include <asm/irq.h>
+#include <asm/io.h>
+
+#ifdef ARCH_HAS_NMI_WATCHDOG
+
+/**
+ * do_nmi_callback
+ *
+ * Check to see if a callback exists and execute it.  Return 1
+ * if a handler exists and handled the NMI successfully.
+ */
+int do_nmi_callback(struct pt_regs *regs, int cpu);
+
+extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
+extern int check_nmi_watchdog(void);
+extern int nmi_watchdog_enabled;
+extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
+extern int avail_to_resrv_perfctr_nmi(unsigned int);
+extern int reserve_perfctr_nmi(unsigned int);
+extern void release_perfctr_nmi(unsigned int);
+extern int reserve_evntsel_nmi(unsigned int);
+extern void release_evntsel_nmi(unsigned int);
+
+extern void setup_apic_nmi_watchdog(void *);
+extern void stop_apic_nmi_watchdog(void *);
+extern void disable_timer_nmi_watchdog(void);
+extern void enable_timer_nmi_watchdog(void);
+extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
+extern void cpu_nmi_set_wd_enabled(void);
+
+extern atomic_t nmi_active;
+extern unsigned int nmi_watchdog;
+#define NMI_NONE 0
+#define NMI_IO_APIC 1
+#define NMI_LOCAL_APIC 2
+#define NMI_INVALID 3
+
+struct ctl_table;
+struct file;
+extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
+ void __user *, size_t *, loff_t *);
+extern int unknown_nmi_panic;
+
+void __trigger_all_cpu_backtrace(void);
+#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+
+static inline void localise_nmi_watchdog(void)
+{
+ if (nmi_watchdog == NMI_IO_APIC)
+ nmi_watchdog = NMI_LOCAL_APIC;
+}
+
+/* check if nmi_watchdog is active (ie was specified at boot) */
+static inline int nmi_watchdog_active(void)
+{
+ /*
+ * actually it should be:
+ * return (nmi_watchdog == NMI_LOCAL_APIC ||
+ * nmi_watchdog == NMI_IO_APIC)
+	 * but since they are powers of two we can use a
+	 * cheaper test --cvg
+ */
+ return nmi_watchdog & 0x3;
+}
+#endif
+
+void lapic_watchdog_stop(void);
+int lapic_watchdog_init(unsigned nmi_hz);
+int lapic_wd_event(unsigned nmi_hz);
+unsigned lapic_adjust_nmi_hz(unsigned hz);
+int lapic_watchdog_ok(void);
+void disable_lapic_nmi_watchdog(void);
+void enable_lapic_nmi_watchdog(void);
+void stop_nmi(void);
+void restart_nmi(void);
+
+#endif /* _ASM_X86_NMI_H */
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
new file mode 100644
index 00000000000..ad2668ee1aa
--- /dev/null
+++ b/arch/x86/include/asm/nops.h
@@ -0,0 +1,118 @@
+#ifndef _ASM_X86_NOPS_H
+#define _ASM_X86_NOPS_H
+
+/* Define nops for use with alternative() */
+
+/* generic versions from gas
+ 1: nop
+ the following instructions are NOT nops in 64-bit mode,
+ for 64-bit mode use K8 or P6 nops instead
+ 2: movl %esi,%esi
+ 3: leal 0x00(%esi),%esi
+ 4: leal 0x00(,%esi,1),%esi
+ 6: leal 0x00000000(%esi),%esi
+ 7: leal 0x00000000(,%esi,1),%esi
+*/
+#define GENERIC_NOP1 ".byte 0x90\n"
+#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
+#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
+#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
+#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
+#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
+#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
+#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
+
+/* Opteron 64bit nops
+ 1: nop
+ 2: osp nop
+ 3: osp osp nop
+ 4: osp osp osp nop
+*/
+#define K8_NOP1 GENERIC_NOP1
+#define K8_NOP2 ".byte 0x66,0x90\n"
+#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
+#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
+#define K8_NOP5 K8_NOP3 K8_NOP2
+#define K8_NOP6 K8_NOP3 K8_NOP3
+#define K8_NOP7 K8_NOP4 K8_NOP3
+#define K8_NOP8 K8_NOP4 K8_NOP4
+
+/* K7 nops
+ uses eax dependencies (arbitrary choice)
+ 1: nop
+ 2: movl %eax,%eax
+ 3: leal (,%eax,1),%eax
+ 4: leal 0x00(,%eax,1),%eax
+ 6: leal 0x00000000(%eax),%eax
+ 7: leal 0x00000000(,%eax,1),%eax
+*/
+#define K7_NOP1 GENERIC_NOP1
+#define K7_NOP2 ".byte 0x8b,0xc0\n"
+#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
+#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
+#define K7_NOP5 K7_NOP4 ASM_NOP1
+#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
+#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
+#define K7_NOP8 K7_NOP7 ASM_NOP1
+
+/* P6 nops
+ uses eax dependencies (Intel-recommended choice)
+ 1: nop
+ 2: osp nop
+ 3: nopl (%eax)
+ 4: nopl 0x00(%eax)
+ 5: nopl 0x00(%eax,%eax,1)
+ 6: osp nopl 0x00(%eax,%eax,1)
+ 7: nopl 0x00000000(%eax)
+ 8: nopl 0x00000000(%eax,%eax,1)
+*/
+#define P6_NOP1 GENERIC_NOP1
+#define P6_NOP2 ".byte 0x66,0x90\n"
+#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
+#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
+#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
+#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
+#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
+#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
+
+#if defined(CONFIG_MK7)
+#define ASM_NOP1 K7_NOP1
+#define ASM_NOP2 K7_NOP2
+#define ASM_NOP3 K7_NOP3
+#define ASM_NOP4 K7_NOP4
+#define ASM_NOP5 K7_NOP5
+#define ASM_NOP6 K7_NOP6
+#define ASM_NOP7 K7_NOP7
+#define ASM_NOP8 K7_NOP8
+#elif defined(CONFIG_X86_P6_NOP)
+#define ASM_NOP1 P6_NOP1
+#define ASM_NOP2 P6_NOP2
+#define ASM_NOP3 P6_NOP3
+#define ASM_NOP4 P6_NOP4
+#define ASM_NOP5 P6_NOP5
+#define ASM_NOP6 P6_NOP6
+#define ASM_NOP7 P6_NOP7
+#define ASM_NOP8 P6_NOP8
+#elif defined(CONFIG_X86_64)
+#define ASM_NOP1 K8_NOP1
+#define ASM_NOP2 K8_NOP2
+#define ASM_NOP3 K8_NOP3
+#define ASM_NOP4 K8_NOP4
+#define ASM_NOP5 K8_NOP5
+#define ASM_NOP6 K8_NOP6
+#define ASM_NOP7 K8_NOP7
+#define ASM_NOP8 K8_NOP8
+#else
+#define ASM_NOP1 GENERIC_NOP1
+#define ASM_NOP2 GENERIC_NOP2
+#define ASM_NOP3 GENERIC_NOP3
+#define ASM_NOP4 GENERIC_NOP4
+#define ASM_NOP5 GENERIC_NOP5
+#define ASM_NOP6 GENERIC_NOP6
+#define ASM_NOP7 GENERIC_NOP7
+#define ASM_NOP8 GENERIC_NOP8
+#endif
+
+#define ASM_NOP_MAX 8
+
+#endif /* _ASM_X86_NOPS_H */
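Usage sketch: the ASM_NOPx strings are spliced into inline assembly, so the
kernel emits the optimal nop encoding for the configured CPU; the
alternative() machinery likewise pads replaced instructions with them
(the function name below is hypothetical):

	static inline void ten_byte_pad_sketch(void)
	{
		/* emits 8 + 2 bytes of architecture-appropriate nops */
		asm volatile(ASM_NOP8 ASM_NOP2);
	}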
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
new file mode 100644
index 00000000000..27da400d313
--- /dev/null
+++ b/arch/x86/include/asm/numa.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "numa_32.h"
+#else
+# include "numa_64.h"
+#endif
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
new file mode 100644
index 00000000000..e9f5db79624
--- /dev/null
+++ b/arch/x86/include/asm/numa_32.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_X86_NUMA_32_H
+#define _ASM_X86_NUMA_32_H
+
+extern int pxm_to_nid(int pxm);
+extern void numa_remove_cpu(int cpu);
+
+#ifdef CONFIG_NUMA
+extern void set_highmem_pages_init(void);
+#endif
+
+#endif /* _ASM_X86_NUMA_32_H */
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
new file mode 100644
index 00000000000..064ed6df4cb
--- /dev/null
+++ b/arch/x86/include/asm/numa_64.h
@@ -0,0 +1,43 @@
+#ifndef _ASM_X86_NUMA_64_H
+#define _ASM_X86_NUMA_64_H
+
+#include <linux/nodemask.h>
+#include <asm/apicdef.h>
+
+struct bootnode {
+ u64 start;
+ u64 end;
+};
+
+extern int compute_hash_shift(struct bootnode *nodes, int numblks,
+ int *nodeids);
+
+#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
+
+extern void numa_init_array(void);
+extern int numa_off;
+
+extern void srat_reserve_add_area(int nodeid);
+extern int hotadd_percent;
+
+extern s16 apicid_to_node[MAX_LOCAL_APIC];
+
+extern unsigned long numa_free_all_bootmem(void);
+extern void setup_node_bootmem(int nodeid, unsigned long start,
+ unsigned long end);
+
+#ifdef CONFIG_NUMA
+extern void __init init_cpu_to_node(void);
+extern void __cpuinit numa_set_node(int cpu, int node);
+extern void __cpuinit numa_clear_node(int cpu);
+extern void __cpuinit numa_add_cpu(int cpu);
+extern void __cpuinit numa_remove_cpu(int cpu);
+#else
+static inline void init_cpu_to_node(void) { }
+static inline void numa_set_node(int cpu, int node) { }
+static inline void numa_clear_node(int cpu) { }
+static inline void numa_add_cpu(int cpu) { }
+static inline void numa_remove_cpu(int cpu) { }
+#endif
+
+#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
new file mode 100644
index 00000000000..1e8bd30b4c1
--- /dev/null
+++ b/arch/x86/include/asm/numaq.h
@@ -0,0 +1,169 @@
+/*
+ * Written by: Patricia Gaughen, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <gone@us.ibm.com>
+ */
+
+#ifndef _ASM_X86_NUMAQ_H
+#define _ASM_X86_NUMAQ_H
+
+#ifdef CONFIG_X86_NUMAQ
+
+extern int found_numaq;
+extern int get_memcfg_numaq(void);
+
+/*
+ * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
+ * pieces that describe the firmware-provided system configuration data area.
+ */
+#define SYS_CFG_DATA_PRIV_ADDR 0x0009d000 /* place for scd in private
+ quad space */
+
+/*
+ * Communication area for each processor on lynxer-processor tests.
+ *
+ * NOTE: If you change the size of this eachproc structure you need
+ * to change the definition for EACH_QUAD_SIZE.
+ */
+struct eachquadmem {
+ unsigned int priv_mem_start; /* Starting address of this */
+ /* quad's private memory. */
+ /* This is always 0. */
+ /* In MB. */
+ unsigned int priv_mem_size; /* Size of this quad's */
+ /* private memory. */
+ /* In MB. */
+ unsigned int low_shrd_mem_strp_start;/* Starting address of this */
+ /* quad's low shared block */
+ /* (untranslated). */
+ /* In MB. */
+ unsigned int low_shrd_mem_start; /* Starting address of this */
+ /* quad's low shared memory */
+ /* (untranslated). */
+ /* In MB. */
+ unsigned int low_shrd_mem_size; /* Size of this quad's low */
+ /* shared memory. */
+ /* In MB. */
+ unsigned int lmmio_copb_start; /* Starting address of this */
+ /* quad's local memory */
+ /* mapped I/O in the */
+ /* compatibility OPB. */
+ /* In MB. */
+ unsigned int lmmio_copb_size; /* Size of this quad's local */
+ /* memory mapped I/O in the */
+ /* compatibility OPB. */
+ /* In MB. */
+ unsigned int lmmio_nopb_start; /* Starting address of this */
+ /* quad's local memory */
+ /* mapped I/O in the */
+ /* non-compatibility OPB. */
+ /* In MB. */
+ unsigned int lmmio_nopb_size; /* Size of this quad's local */
+ /* memory mapped I/O in the */
+ /* non-compatibility OPB. */
+ /* In MB. */
+ unsigned int io_apic_0_start; /* Starting address of I/O */
+ /* APIC 0. */
+ unsigned int io_apic_0_sz; /* Size I/O APIC 0. */
+ unsigned int io_apic_1_start; /* Starting address of I/O */
+ /* APIC 1. */
+ unsigned int io_apic_1_sz; /* Size I/O APIC 1. */
+ unsigned int hi_shrd_mem_start; /* Starting address of this */
+ /* quad's high shared memory.*/
+ /* In MB. */
+ unsigned int hi_shrd_mem_size; /* Size of this quad's high */
+ /* shared memory. */
+ /* In MB. */
+ unsigned int mps_table_addr; /* Address of this quad's */
+ /* MPS tables from BIOS, */
+ /* in system space.*/
+ unsigned int lcl_MDC_pio_addr; /* Port-I/O address for */
+ /* local access of MDC. */
+ unsigned int rmt_MDC_mmpio_addr; /* MM-Port-I/O address for */
+ /* remote access of MDC. */
+ unsigned int mm_port_io_start; /* Starting address of this */
+ /* quad's memory mapped Port */
+ /* I/O space. */
+ unsigned int mm_port_io_size; /* Size of this quad's memory*/
+ /* mapped Port I/O space. */
+ unsigned int mm_rmt_io_apic_start; /* Starting address of this */
+ /* quad's memory mapped */
+ /* remote I/O APIC space. */
+ unsigned int mm_rmt_io_apic_size; /* Size of this quad's memory*/
+ /* mapped remote I/O APIC */
+ /* space. */
+ unsigned int mm_isa_start; /* Starting address of this */
+ /* quad's memory mapped ISA */
+ /* space (contains MDC */
+ /* memory space). */
+ unsigned int mm_isa_size; /* Size of this quad's memory*/
+ /* mapped ISA space (contains*/
+ /* MDC memory space). */
+ unsigned int rmt_qmi_addr; /* Remote addr to access QMI.*/
+ unsigned int lcl_qmi_addr; /* Local addr to access QMI. */
+};
+
+/*
+ * Note: This structure must NOT be changed unless the multiproc and
+ * OS are changed to reflect the new structure.
+ */
+struct sys_cfg_data {
+ unsigned int quad_id;
+ unsigned int bsp_proc_id; /* Boot Strap Processor in this quad. */
+ unsigned int scd_version; /* Version number of this table. */
+ unsigned int first_quad_id;
+ unsigned int quads_present31_0; /* 1 bit for each quad */
+ unsigned int quads_present63_32; /* 1 bit for each quad */
+ unsigned int config_flags;
+ unsigned int boot_flags;
+ unsigned int csr_start_addr; /* Absolute value (not in MB) */
+ unsigned int csr_size; /* Absolute value (not in MB) */
+ unsigned int lcl_apic_start_addr; /* Absolute value (not in MB) */
+ unsigned int lcl_apic_size; /* Absolute value (not in MB) */
+ unsigned int low_shrd_mem_base; /* 0 or 512MB or 1GB */
+ unsigned int low_shrd_mem_quad_offset; /* 0,128M,256M,512M,1G */
+ /* may not be totally populated */
+ unsigned int split_mem_enbl; /* 0 for no low shared memory */
+ unsigned int mmio_sz; /* Size of total system memory mapped I/O */
+ /* (in MB). */
+ unsigned int quad_spin_lock; /* Spare location used for quad */
+ /* bringup. */
+ unsigned int nonzero55; /* For checksumming. */
+ unsigned int nonzeroaa; /* For checksumming. */
+ unsigned int scd_magic_number;
+ unsigned int system_type;
+ unsigned int checksum;
+ /*
+ * memory configuration area for each quad
+ */
+ struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */
+};
+
+void numaq_tsc_disable(void);
+
+#else
+static inline int get_memcfg_numaq(void)
+{
+ return 0;
+}
+#endif /* CONFIG_X86_NUMAQ */
+#endif /* _ASM_X86_NUMAQ_H */
+
diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h
new file mode 100644
index 00000000000..0bf2a06b7a4
--- /dev/null
+++ b/arch/x86/include/asm/numaq/apic.h
@@ -0,0 +1,136 @@
+#ifndef __ASM_NUMAQ_APIC_H
+#define __ASM_NUMAQ_APIC_H
+
+#include <asm/io.h>
+#include <linux/mmzone.h>
+#include <linux/nodemask.h>
+
+#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+
+static inline cpumask_t target_cpus(void)
+{
+ return CPU_MASK_ALL;
+}
+
+#define NO_BALANCE_IRQ (1)
+#define esr_disable (1)
+
+#define INT_DELIVERY_MODE dest_LowestPrio
+#define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */
+
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return physid_isset(apicid, bitmap);
+}
+static inline unsigned long check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+#define apicid_cluster(apicid) (apicid & 0xF0)
+
+static inline int apic_id_registered(void)
+{
+ return 1;
+}
+
+static inline void init_apic_ldr(void)
+{
+ /* Already done in NUMA-Q firmware */
+}
+
+static inline void setup_apic_routing(void)
+{
+ printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
+ "NUMA-Q", nr_ioapics);
+}
+
+/*
+ * Skip adding the timer int on secondary nodes, which causes
+ * a small but painful rift in the time-space continuum.
+ */
+static inline int multi_timer_check(int apic, int irq)
+{
+ return apic != 0 && irq == 0;
+}
+
+static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* We don't have a good way to do this yet - hack */
+ return physids_promote(0xFUL);
+}
+
+/* Mapping from cpu number to logical apicid */
+extern u8 cpu_2_logical_apicid[];
+static inline int cpu_to_logical_apicid(int cpu)
+{
+ if (cpu >= NR_CPUS)
+ return BAD_APICID;
+ return (int)cpu_2_logical_apicid[cpu];
+}
+
+/*
+ * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
+ * cpu to APIC ID relation to properly interact with the intelligent
+ * mode of the cluster controller.
+ */
+static inline int cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < 60)
+ return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
+ else
+ return BAD_APICID;
+}
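+/*
+ * Worked example (illustrative): mps_cpu 5 is CPU 1 (5 & 0x3) within
+ * quad 1 (5 >> 2), so its APIC ID is (1 << 4) | (1 << 1) = 0x12.
+ */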
+
+static inline int apicid_to_node(int logical_apicid)
+{
+ return logical_apicid >> 4;
+}
+
+static inline physid_mask_t apicid_to_cpu_present(int logical_apicid)
+{
+ int node = apicid_to_node(logical_apicid);
+ int cpu = __ffs(logical_apicid & 0xf);
+
+ return physid_mask_of_physid(cpu + 4*node);
+}
+
+extern void *xquad_portio;
+
+static inline void setup_portio_remap(void)
+{
+ int num_quads = num_online_nodes();
+
+ if (num_quads <= 1)
+ return;
+
+ printk("Remapping cross-quad port I/O for %d quads\n", num_quads);
+ xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
+ printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
+ (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
+}
+
+static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return (1);
+}
+
+static inline void enable_apic_mode(void)
+{
+}
+
+/*
+ * We use physical apicids here, not logical, so just return the default
+ * physical broadcast to stop people from breaking us
+ */
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ return (int) 0xF;
+}
+
+/* No NUMA-Q box has an HT CPU, but it can't hurt to use the default code. */
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+#endif /* __ASM_NUMAQ_APIC_H */
diff --git a/arch/x86/include/asm/numaq/apicdef.h b/arch/x86/include/asm/numaq/apicdef.h
new file mode 100644
index 00000000000..e012a46cc22
--- /dev/null
+++ b/arch/x86/include/asm/numaq/apicdef.h
@@ -0,0 +1,14 @@
+#ifndef __ASM_NUMAQ_APICDEF_H
+#define __ASM_NUMAQ_APICDEF_H
+
+
+#define APIC_ID_MASK (0xF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+ return (((x)>>24)&0x0F);
+}
+
+#define GET_APIC_ID(x) get_apic_id(x)
+
+#endif
diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h
new file mode 100644
index 00000000000..935588d286c
--- /dev/null
+++ b/arch/x86/include/asm/numaq/ipi.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_NUMAQ_IPI_H
+#define __ASM_NUMAQ_IPI_H
+
+void send_IPI_mask_sequence(cpumask_t, int vector);
+
+static inline void send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_sequence(mask, vector);
+}
+
+static inline void send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ send_IPI_mask(mask, vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+ send_IPI_mask(cpu_online_map, vector);
+}
+
+#endif /* __ASM_NUMAQ_IPI_H */
diff --git a/arch/x86/include/asm/numaq/mpparse.h b/arch/x86/include/asm/numaq/mpparse.h
new file mode 100644
index 00000000000..252292e077b
--- /dev/null
+++ b/arch/x86/include/asm/numaq/mpparse.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_NUMAQ_MPPARSE_H
+#define __ASM_NUMAQ_MPPARSE_H
+
+extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid);
+
+#endif /* __ASM_NUMAQ_MPPARSE_H */
diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h
new file mode 100644
index 00000000000..c577bda5b1c
--- /dev/null
+++ b/arch/x86/include/asm/numaq/wakecpu.h
@@ -0,0 +1,43 @@
+#ifndef __ASM_NUMAQ_WAKECPU_H
+#define __ASM_NUMAQ_WAKECPU_H
+
+/* This file copes with machines that wake up secondary CPUs by NMIs */
+
+#define WAKE_SECONDARY_VIA_NMI
+
+#define TRAMPOLINE_LOW phys_to_virt(0x8)
+#define TRAMPOLINE_HIGH phys_to_virt(0xa)
+
+#define boot_cpu_apicid boot_cpu_logical_apicid
+
+/* We don't do anything here because we use NMIs to boot instead */
+static inline void wait_for_init_deassert(atomic_t *deassert)
+{
+}
+
+/*
+ * Because we use NMIs rather than the INIT-STARTUP sequence to
+ * bootstrap the CPUs, the APIC may be in a weird state. Kick it.
+ */
+static inline void smp_callin_clear_local_apic(void)
+{
+ clear_local_APIC();
+}
+
+static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
+{
+ printk("Storing NMI vector\n");
+ *high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
+ *low = *((volatile unsigned short *) TRAMPOLINE_LOW);
+}
+
+static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
+{
+ printk("Restoring NMI vector\n");
+ *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high;
+ *((volatile unsigned short *) TRAMPOLINE_LOW) = *low;
+}
+
+#define inquire_remote_apic(apicid) {}
+
+#endif /* __ASM_NUMAQ_WAKECPU_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
new file mode 100644
index 00000000000..834a30295fa
--- /dev/null
+++ b/arch/x86/include/asm/olpc.h
@@ -0,0 +1,132 @@
+/* OLPC machine specific definitions */
+
+#ifndef _ASM_X86_OLPC_H
+#define _ASM_X86_OLPC_H
+
+#include <asm/geode.h>
+
+struct olpc_platform_t {
+ int flags;
+ uint32_t boardrev;
+ int ecver;
+};
+
+#define OLPC_F_PRESENT 0x01
+#define OLPC_F_DCON 0x02
+#define OLPC_F_VSA 0x04
+
+#ifdef CONFIG_OLPC
+
+extern struct olpc_platform_t olpc_platform_info;
+
+/*
+ * OLPC board IDs contain the major build number within the mask 0x0ff0,
+ * and the minor build number within 0x000f.  Pre-builds have a minor
+ * number less than 8, and normal builds start at 8. For example, 0x0B10
+ * is a PreB1, and 0x0C18 is a C1.
+ */
+
+static inline uint32_t olpc_board(uint8_t id)
+{
+ return (id << 4) | 0x8;
+}
+
+static inline uint32_t olpc_board_pre(uint8_t id)
+{
+ return id << 4;
+}
+
+static inline int machine_is_olpc(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_PRESENT) ? 1 : 0;
+}
+
+/*
+ * The DCON is OLPC's Display Controller. It has a number of unique
+ * features that we might want to take advantage of.
+ */
+static inline int olpc_has_dcon(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_DCON) ? 1 : 0;
+}
+
+/*
+ * The VSA is software from AMD that typical Geode BIOSes will include.
+ * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does
+ * not include the VSA; instead, PCI is emulated by the kernel.
+ *
+ * The VSA is described further in arch/x86/pci/olpc.c.
+ */
+static inline int olpc_has_vsa(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0;
+}
+
+/*
+ * The "Mass Production" version of OLPC's XO is identified as being model
+ * C2. During the prototype phase, the following models (in chronological
+ * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
+ * were based on Geode GX CPUs, and models after that were based upon
+ * Geode LX CPUs. There were also some hand-assembled models floating
+ * around, referred to as PreB1, PreB2, etc.
+ */
+static inline int olpc_board_at_least(uint32_t rev)
+{
+ return olpc_platform_info.boardrev >= rev;
+}
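+/*
+ * Usage sketch (illustrative): a driver that requires B2-or-later
+ * hardware can bail out early with
+ *
+ *	if (!olpc_board_at_least(olpc_board(0xb2)))
+ *		return -ENODEV;
+ */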
+
+#else
+
+static inline int machine_is_olpc(void)
+{
+ return 0;
+}
+
+static inline int olpc_has_dcon(void)
+{
+ return 0;
+}
+
+static inline int olpc_has_vsa(void)
+{
+ return 0;
+}
+
+#endif
+
+/* EC related functions */
+
+extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
+ unsigned char *outbuf, size_t outlen);
+
+extern int olpc_ec_mask_set(uint8_t bits);
+extern int olpc_ec_mask_unset(uint8_t bits);
+
+/* EC commands */
+
+#define EC_FIRMWARE_REV 0x08
+
+/* SCI source values */
+
+#define EC_SCI_SRC_EMPTY 0x00
+#define EC_SCI_SRC_GAME 0x01
+#define EC_SCI_SRC_BATTERY 0x02
+#define EC_SCI_SRC_BATSOC 0x04
+#define EC_SCI_SRC_BATERR 0x08
+#define EC_SCI_SRC_EBOOK 0x10
+#define EC_SCI_SRC_WLAN 0x20
+#define EC_SCI_SRC_ACPWR 0x40
+#define EC_SCI_SRC_ALL 0x7F
+
+/* GPIO assignments */
+
+#define OLPC_GPIO_MIC_AC geode_gpio(1)
+#define OLPC_GPIO_DCON_IRQ geode_gpio(7)
+#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
+#define OLPC_GPIO_SMB_CLK geode_gpio(14)
+#define OLPC_GPIO_SMB_DATA geode_gpio(15)
+#define OLPC_GPIO_WORKAUX geode_gpio(24)
+#define OLPC_GPIO_LID geode_gpio(26)
+#define OLPC_GPIO_ECSCI geode_gpio(27)
+
+#endif /* _ASM_X86_OLPC_H */
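A hedged usage sketch of the helpers above: a driver that requires a
mass-production C1 or later board. check_c1_or_later() is an illustrative
name; the encoding follows the comment in the header, so olpc_board(0xc1)
yields 0x0C18:

static int check_c1_or_later(void)
{
	if (!machine_is_olpc())
		return -ENODEV;
	/* olpc_board(0xc1) == 0x0C18, the C1 build described above */
	if (!olpc_board_at_least(olpc_board(0xc1)))
		return -ENODEV;
	return 0;
}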
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
new file mode 100644
index 00000000000..e9873a2e869
--- /dev/null
+++ b/arch/x86/include/asm/page.h
@@ -0,0 +1,209 @@
+#ifndef _ASM_X86_PAGE_H
+#define _ASM_X86_PAGE_H
+
+#include <linux/const.h>
+
+/* PAGE_SHIFT determines the page size */
+#define PAGE_SHIFT 12
+#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
+#define PAGE_MASK (~(PAGE_SIZE-1))
+
+#ifdef __KERNEL__
+
+#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
+
+/* Cast PAGE_MASK to a signed type so that it is sign-extended if
+ virtual addresses are 32-bits but physical addresses are larger
+ (ie, 32-bit PAE). */
+#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
+
+/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
+#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
+
+/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
+#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
+
+#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
+
+#define HPAGE_SHIFT PMD_SHIFT
+#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+
+#define HUGE_MAX_HSTATE 2
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#endif
+
+#ifdef CONFIG_X86_64
+#include <asm/page_64.h>
+#else
+#include <asm/page_32.h>
+#endif /* CONFIG_X86_64 */
+
+#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
+
+#define VM_DATA_DEFAULT_FLAGS \
+ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+
+#ifndef __ASSEMBLY__
+
+typedef struct { pgdval_t pgd; } pgd_t;
+typedef struct { pgprotval_t pgprot; } pgprot_t;
+
+extern int page_is_ram(unsigned long pagenr);
+extern int pagerange_is_ram(unsigned long start, unsigned long end);
+extern int devmem_is_allowed(unsigned long pagenr);
+extern void map_devmem(unsigned long pfn, unsigned long size,
+ pgprot_t vma_prot);
+extern void unmap_devmem(unsigned long pfn, unsigned long size,
+ pgprot_t vma_prot);
+
+extern unsigned long max_low_pfn_mapped;
+extern unsigned long max_pfn_mapped;
+
+struct page;
+
+static inline void clear_user_page(void *page, unsigned long vaddr,
+ struct page *pg)
+{
+ clear_page(page);
+}
+
+static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
+ struct page *topage)
+{
+ copy_page(to, from);
+}
+
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
+static inline pgd_t native_make_pgd(pgdval_t val)
+{
+ return (pgd_t) { val };
+}
+
+static inline pgdval_t native_pgd_val(pgd_t pgd)
+{
+ return pgd.pgd;
+}
+
+#if PAGETABLE_LEVELS >= 3
+#if PAGETABLE_LEVELS == 4
+typedef struct { pudval_t pud; } pud_t;
+
+static inline pud_t native_make_pud(pudval_t val)
+{
+ return (pud_t) { val };
+}
+
+static inline pudval_t native_pud_val(pud_t pud)
+{
+ return pud.pud;
+}
+#else /* PAGETABLE_LEVELS == 3 */
+#include <asm-generic/pgtable-nopud.h>
+
+static inline pudval_t native_pud_val(pud_t pud)
+{
+ return native_pgd_val(pud.pgd);
+}
+#endif /* PAGETABLE_LEVELS == 4 */
+
+typedef struct { pmdval_t pmd; } pmd_t;
+
+static inline pmd_t native_make_pmd(pmdval_t val)
+{
+ return (pmd_t) { val };
+}
+
+static inline pmdval_t native_pmd_val(pmd_t pmd)
+{
+ return pmd.pmd;
+}
+#else /* PAGETABLE_LEVELS == 2 */
+#include <asm-generic/pgtable-nopmd.h>
+
+static inline pmdval_t native_pmd_val(pmd_t pmd)
+{
+ return native_pgd_val(pmd.pud.pgd);
+}
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+static inline pte_t native_make_pte(pteval_t val)
+{
+ return (pte_t) { .pte = val };
+}
+
+static inline pteval_t native_pte_val(pte_t pte)
+{
+ return pte.pte;
+}
+
+static inline pteval_t native_pte_flags(pte_t pte)
+{
+ return native_pte_val(pte) & PTE_FLAGS_MASK;
+}
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define pgd_val(x) native_pgd_val(x)
+#define __pgd(x) native_make_pgd(x)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_val(x) native_pud_val(x)
+#define __pud(x) native_make_pud(x)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pmd_val(x) native_pmd_val(x)
+#define __pmd(x) native_make_pmd(x)
+#endif
+
+#define pte_val(x) native_pte_val(x)
+#define pte_flags(x) native_pte_flags(x)
+#define __pte(x) native_make_pte(x)
+
+#endif /* CONFIG_PARAVIRT */
+
+#define __pa(x) __phys_addr((unsigned long)(x))
+#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
+/* __pa_symbol should be used for C visible symbols.
+ This seems to be the official gcc blessed way to do such arithmetic. */
+#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
+
+#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
+
+#define __boot_va(x) __va(x)
+#define __boot_pa(x) __pa(x)
+
+/*
+ * virt_to_page(kaddr) returns a valid pointer if and only if
+ * virt_addr_valid(kaddr) returns true.
+ */
+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
+extern bool __virt_addr_valid(unsigned long kaddr);
+#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
+
+#endif /* __ASSEMBLY__ */
+
+#include <asm-generic/memory_model.h>
+#include <asm-generic/page.h>
+
+#define __HAVE_ARCH_GATE_AREA 1
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_PAGE_H */
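As a sketch of how the PTE_PFN_MASK/PTE_FLAGS_MASK split above is consumed,
this is essentially what a pfn accessor boils down to; pte_pfn_sketch is an
illustrative name, not the kernel's pte_pfn():

static unsigned long pte_pfn_sketch(pte_t pte)
{
	/* PTE_PFN_MASK keeps the frame-number bits, PTE_FLAGS_MASK the rest */
	return (native_pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
}

The resulting pfn round-trips through the address macros: __va(pfn <<
PAGE_SHIFT) is exactly what pfn_to_kaddr() expands to.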
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
new file mode 100644
index 00000000000..bcde0d7b432
--- /dev/null
+++ b/arch/x86/include/asm/page_32.h
@@ -0,0 +1,136 @@
+#ifndef _ASM_X86_PAGE_32_H
+#define _ASM_X86_PAGE_32_H
+
+/*
+ * This handles the memory map.
+ *
+ * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
+ * a virtual address space of one gigabyte, which limits the
+ * amount of physical memory you can use to about 950MB.
+ *
+ * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
+ * and CONFIG_HIGHMEM64G options in the kernel configuration.
+ */
+#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
+
+#ifdef CONFIG_4KSTACKS
+#define THREAD_ORDER 0
+#else
+#define THREAD_ORDER 1
+#endif
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+
+#define STACKFAULT_STACK 0
+#define DOUBLEFAULT_STACK 1
+#define NMI_STACK 0
+#define DEBUG_STACK 0
+#define MCE_STACK 0
+#define N_EXCEPTION_STACKS 1
+
+#ifdef CONFIG_X86_PAE
+/* 44=32+12, the limit we can fit into an unsigned long pfn */
+#define __PHYSICAL_MASK_SHIFT 44
+#define __VIRTUAL_MASK_SHIFT 32
+#define PAGETABLE_LEVELS 3
+
+#ifndef __ASSEMBLY__
+typedef u64 pteval_t;
+typedef u64 pmdval_t;
+typedef u64 pudval_t;
+typedef u64 pgdval_t;
+typedef u64 pgprotval_t;
+
+typedef union {
+ struct {
+ unsigned long pte_low, pte_high;
+ };
+ pteval_t pte;
+} pte_t;
+#endif /* __ASSEMBLY__ */
+#else /* !CONFIG_X86_PAE */
+#define __PHYSICAL_MASK_SHIFT 32
+#define __VIRTUAL_MASK_SHIFT 32
+#define PAGETABLE_LEVELS 2
+
+#ifndef __ASSEMBLY__
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+typedef unsigned long pudval_t;
+typedef unsigned long pgdval_t;
+typedef unsigned long pgprotval_t;
+
+typedef union {
+ pteval_t pte;
+ pteval_t pte_low;
+} pte_t;
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_X86_PAE */
+
+#ifndef __ASSEMBLY__
+typedef struct page *pgtable_t;
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#endif
+
+#ifndef __ASSEMBLY__
+#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET)
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern unsigned long __phys_addr(unsigned long);
+#else
+#define __phys_addr(x) __phys_addr_nodebug(x)
+#endif
+#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn) ((pfn) < max_mapnr)
+#endif /* CONFIG_FLATMEM */
+
+extern int nx_enabled;
+
+/*
+ * This much address space is reserved for vmalloc() and iomap()
+ * as well as fixmap mappings.
+ */
+extern unsigned int __VMALLOC_RESERVE;
+extern int sysctl_legacy_va_layout;
+
+extern void find_low_pfn_range(void);
+extern unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end);
+extern void initmem_init(unsigned long, unsigned long);
+extern void free_initmem(void);
+extern void setup_bootmem_allocator(void);
+
+
+#ifdef CONFIG_X86_USE_3DNOW
+#include <asm/mmx.h>
+
+static inline void clear_page(void *page)
+{
+ mmx_clear_page(page);
+}
+
+static inline void copy_page(void *to, void *from)
+{
+ mmx_copy_page(to, from);
+}
+#else /* !CONFIG_X86_USE_3DNOW */
+#include <linux/string.h>
+
+static inline void clear_page(void *page)
+{
+ memset(page, 0, PAGE_SIZE);
+}
+
+static inline void copy_page(void *to, void *from)
+{
+ memcpy(to, from, PAGE_SIZE);
+}
+#endif /* CONFIG_X86_USE_3DNOW */
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PAGE_32_H */
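A minimal sketch of what the PAE pte_t union buys: the two 32-bit halves
alias the 64-bit value, which is what lets PAE helpers write the halves
separately when a single 64-bit store is not available. pae_compose_pte is
an invented name for illustration:

static pte_t pae_compose_pte(u32 low, u32 high)
{
	pte_t pte;

	pte.pte_low = low;
	pte.pte_high = high;
	/* now pte.pte == ((u64)high << 32) | low */
	return pte;
}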
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
new file mode 100644
index 00000000000..5ebca29f44f
--- /dev/null
+++ b/arch/x86/include/asm/page_64.h
@@ -0,0 +1,105 @@
+#ifndef _ASM_X86_PAGE_64_H
+#define _ASM_X86_PAGE_64_H
+
+#define PAGETABLE_LEVELS 4
+
+#define THREAD_ORDER 1
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#define CURRENT_MASK (~(THREAD_SIZE - 1))
+
+#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+
+#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
+#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
+
+#define IRQSTACK_ORDER 2
+#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
+
+#define STACKFAULT_STACK 1
+#define DOUBLEFAULT_STACK 2
+#define NMI_STACK 3
+#define DEBUG_STACK 4
+#define MCE_STACK 5
+#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
+
+#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
+
+/*
+ * Set __PAGE_OFFSET to the most negative possible address +
+ * PGDIR_SIZE*16 (pgd slot 272). The gap leaves room for a
+ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
+ * what Xen requires.
+ */
+#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
+
+#define __PHYSICAL_START CONFIG_PHYSICAL_START
+#define __KERNEL_ALIGN 0x200000
+
+/*
+ * Make sure kernel is aligned to 2MB address. Catching it at compile
+ * time is better. Change your config file and compile the kernel
+ * for a 2MB aligned address (CONFIG_PHYSICAL_START)
+ */
+#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
+#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
+#endif
+
+#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
+#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
+
+/* See Documentation/x86_64/mm.txt for a description of the memory map. */
+#define __PHYSICAL_MASK_SHIFT 46
+#define __VIRTUAL_MASK_SHIFT 48
+
+/*
+ * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
+ * arch/x86/kernel/head_64.S), and it is mapped here:
+ */
+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
+
+#ifndef __ASSEMBLY__
+void clear_page(void *page);
+void copy_page(void *to, void *from);
+
+/* duplicated to the one in bootmem.h */
+extern unsigned long max_pfn;
+extern unsigned long phys_base;
+
+extern unsigned long __phys_addr(unsigned long);
+#define __phys_reloc_hide(x) (x)
+
+/*
+ * These are used to make use of C type-checking..
+ */
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+typedef unsigned long pudval_t;
+typedef unsigned long pgdval_t;
+typedef unsigned long pgprotval_t;
+
+typedef struct page *pgtable_t;
+
+typedef struct { pteval_t pte; } pte_t;
+
+#define vmemmap ((struct page *)VMEMMAP_START)
+
+extern unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end);
+
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void free_initmem(void);
+
+extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
+extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
+
+#endif /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn) ((pfn) < max_pfn)
+#endif
+
+
+#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h
new file mode 100644
index 00000000000..6f0d0422f4c
--- /dev/null
+++ b/arch/x86/include/asm/param.h
@@ -0,0 +1,22 @@
+#ifndef _ASM_X86_PARAM_H
+#define _ASM_X86_PARAM_H
+
+#ifdef __KERNEL__
+# define HZ CONFIG_HZ /* Internal kernel timer frequency */
+# define USER_HZ 100 /* some user interfaces are */
+# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */
+#endif
+
+#ifndef HZ
+#define HZ 100
+#endif
+
+#define EXEC_PAGESIZE 4096
+
+#ifndef NOGROUP
+#define NOGROUP (-1)
+#endif
+
+#define MAXHOSTNAMELEN 64 /* max length of hostname */
+
+#endif /* _ASM_X86_PARAM_H */
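The HZ/USER_HZ split means kernel-internal jiffies must be rescaled before
being reported through tick-based ABIs such as times(). A simplified sketch,
assuming HZ is an exact multiple of USER_HZ (the kernel's jiffies_to_clock_t()
also handles the general case):

static inline unsigned long jiffies_to_user_ticks(unsigned long j)
{
	return j / (HZ / USER_HZ);	/* e.g. HZ=1000 -> divide by 10 */
}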
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
new file mode 100644
index 00000000000..ba3e2ff6aed
--- /dev/null
+++ b/arch/x86/include/asm/paravirt.h
@@ -0,0 +1,1650 @@
+#ifndef _ASM_X86_PARAVIRT_H
+#define _ASM_X86_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/page.h>
+#include <asm/asm.h>
+
+/* Bitmask of what can be clobbered: usually at least eax. */
+#define CLBR_NONE 0
+#define CLBR_EAX (1 << 0)
+#define CLBR_ECX (1 << 1)
+#define CLBR_EDX (1 << 2)
+
+#ifdef CONFIG_X86_64
+#define CLBR_RSI (1 << 3)
+#define CLBR_RDI (1 << 4)
+#define CLBR_R8 (1 << 5)
+#define CLBR_R9 (1 << 6)
+#define CLBR_R10 (1 << 7)
+#define CLBR_R11 (1 << 8)
+#define CLBR_ANY ((1 << 9) - 1)
+#include <asm/desc_defs.h>
+#else
+/* CLBR_ANY should match all regs the platform has. For i386, that's all of them */
+#define CLBR_ANY ((1 << 3) - 1)
+#endif /* X86_64 */
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/cpumask.h>
+#include <asm/kmap_types.h>
+#include <asm/desc_defs.h>
+
+struct page;
+struct thread_struct;
+struct desc_ptr;
+struct tss_struct;
+struct mm_struct;
+struct desc_struct;
+
+/* general info */
+struct pv_info {
+ unsigned int kernel_rpl;
+ int shared_kernel_pmd;
+ int paravirt_enabled;
+ const char *name;
+};
+
+struct pv_init_ops {
+ /*
+ * Patch may replace one of the defined code sequences with
+ * arbitrary code, subject to the same register constraints.
+ * This generally means the code is not free to clobber any
+ * registers other than EAX. The patch function should return
+ * the number of bytes of code generated, as we nop-pad the
+ * rest in generic code.
+ */
+ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
+ unsigned long addr, unsigned len);
+
+ /* Basic arch-specific setup */
+ void (*arch_setup)(void);
+ char *(*memory_setup)(void);
+ void (*post_allocator_init)(void);
+
+ /* Print a banner to identify the environment */
+ void (*banner)(void);
+};
+
+
+struct pv_lazy_ops {
+ /* Set deferred update mode, used for batching operations. */
+ void (*enter)(void);
+ void (*leave)(void);
+};
+
+struct pv_time_ops {
+ void (*time_init)(void);
+
+ /* Get and set time of day */
+ unsigned long (*get_wallclock)(void);
+ int (*set_wallclock)(unsigned long);
+
+ unsigned long long (*sched_clock)(void);
+ unsigned long (*get_tsc_khz)(void);
+};
+
+struct pv_cpu_ops {
+ /* hooks for various privileged instructions */
+ unsigned long (*get_debugreg)(int regno);
+ void (*set_debugreg)(int regno, unsigned long value);
+
+ void (*clts)(void);
+
+ unsigned long (*read_cr0)(void);
+ void (*write_cr0)(unsigned long);
+
+ unsigned long (*read_cr4_safe)(void);
+ unsigned long (*read_cr4)(void);
+ void (*write_cr4)(unsigned long);
+
+#ifdef CONFIG_X86_64
+ unsigned long (*read_cr8)(void);
+ void (*write_cr8)(unsigned long);
+#endif
+
+ /* Segment descriptor handling */
+ void (*load_tr_desc)(void);
+ void (*load_gdt)(const struct desc_ptr *);
+ void (*load_idt)(const struct desc_ptr *);
+ void (*store_gdt)(struct desc_ptr *);
+ void (*store_idt)(struct desc_ptr *);
+ void (*set_ldt)(const void *desc, unsigned entries);
+ unsigned long (*store_tr)(void);
+ void (*load_tls)(struct thread_struct *t, unsigned int cpu);
+#ifdef CONFIG_X86_64
+ void (*load_gs_index)(unsigned int idx);
+#endif
+ void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
+ const void *desc);
+ void (*write_gdt_entry)(struct desc_struct *,
+ int entrynum, const void *desc, int size);
+ void (*write_idt_entry)(gate_desc *,
+ int entrynum, const gate_desc *gate);
+ void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
+ void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
+
+ void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+
+ void (*set_iopl_mask)(unsigned mask);
+
+ void (*wbinvd)(void);
+ void (*io_delay)(void);
+
+ /* cpuid emulation, mostly so that caps bits can be disabled */
+ void (*cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ /* MSR, PMC and TSC operations.
+    read_msr sets *err to 0 or -EFAULT; write_msr returns 0 or -EFAULT. */
+ u64 (*read_msr_amd)(unsigned int msr, int *err);
+ u64 (*read_msr)(unsigned int msr, int *err);
+ int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
+
+ u64 (*read_tsc)(void);
+ u64 (*read_pmc)(int counter);
+ unsigned long long (*read_tscp)(unsigned int *aux);
+
+ /*
+ * Atomically enable interrupts and return to userspace. This
+ * is only ever used to return to 32-bit processes; in a
+ * 64-bit kernel, it's used for 32-on-64 compat processes, but
+ * never native 64-bit processes. (Jump, not call.)
+ */
+ void (*irq_enable_sysexit)(void);
+
+ /*
+ * Switch to usermode gs and return to 64-bit usermode using
+ * sysret. Only used in 64-bit kernels to return to 64-bit
+ * processes. Usermode register state, including %rsp, must
+ * already be restored.
+ */
+ void (*usergs_sysret64)(void);
+
+ /*
+ * Switch to usermode gs and return to 32-bit usermode using
+ * sysret. Used to return to 32-on-64 compat processes.
+ * Other usermode register state, including %esp, must already
+ * be restored.
+ */
+ void (*usergs_sysret32)(void);
+
+ /* Normal iret. Jump to this with the standard iret stack
+ frame set up. */
+ void (*iret)(void);
+
+ void (*swapgs)(void);
+
+ struct pv_lazy_ops lazy_mode;
+};
+
+struct pv_irq_ops {
+ void (*init_IRQ)(void);
+
+ /*
+ * Get/set interrupt state. save_fl and restore_fl are only
+ * expected to use X86_EFLAGS_IF; all other bits
+ * returned from save_fl are undefined, and may be ignored by
+ * restore_fl.
+ */
+ unsigned long (*save_fl)(void);
+ void (*restore_fl)(unsigned long);
+ void (*irq_disable)(void);
+ void (*irq_enable)(void);
+ void (*safe_halt)(void);
+ void (*halt)(void);
+
+#ifdef CONFIG_X86_64
+ void (*adjust_exception_frame)(void);
+#endif
+};
+
+struct pv_apic_ops {
+#ifdef CONFIG_X86_LOCAL_APIC
+ void (*setup_boot_clock)(void);
+ void (*setup_secondary_clock)(void);
+
+ void (*startup_ipi_hook)(int phys_apicid,
+ unsigned long start_eip,
+ unsigned long start_esp);
+#endif
+};
+
+struct pv_mmu_ops {
+ /*
+ * Called before/after init_mm pagetable setup. setup_start
+ * may reset %cr3, and may pre-install parts of the pagetable;
+ * pagetable setup is expected to preserve any existing
+ * mapping.
+ */
+ void (*pagetable_setup_start)(pgd_t *pgd_base);
+ void (*pagetable_setup_done)(pgd_t *pgd_base);
+
+ unsigned long (*read_cr2)(void);
+ void (*write_cr2)(unsigned long);
+
+ unsigned long (*read_cr3)(void);
+ void (*write_cr3)(unsigned long);
+
+ /*
+ * Hooks for intercepting the creation/use/destruction of an
+ * mm_struct.
+ */
+ void (*activate_mm)(struct mm_struct *prev,
+ struct mm_struct *next);
+ void (*dup_mmap)(struct mm_struct *oldmm,
+ struct mm_struct *mm);
+ void (*exit_mmap)(struct mm_struct *mm);
+
+
+ /* TLB operations */
+ void (*flush_tlb_user)(void);
+ void (*flush_tlb_kernel)(void);
+ void (*flush_tlb_single)(unsigned long addr);
+ void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
+ unsigned long va);
+
+ /* Hooks for allocating and freeing a pagetable top-level */
+ int (*pgd_alloc)(struct mm_struct *mm);
+ void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
+
+ /*
+ * Hooks for allocating/releasing pagetable pages when they're
+ * attached to a pagetable
+ */
+ void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
+ void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
+ void (*release_pte)(unsigned long pfn);
+ void (*release_pmd)(unsigned long pfn);
+ void (*release_pud)(unsigned long pfn);
+
+ /* Pagetable manipulation functions */
+ void (*set_pte)(pte_t *ptep, pte_t pteval);
+ void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval);
+ void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+ void (*pte_update)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep);
+ void (*pte_update_defer)(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep);
+
+ pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep);
+ void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+
+ pteval_t (*pte_val)(pte_t);
+ pteval_t (*pte_flags)(pte_t);
+ pte_t (*make_pte)(pteval_t pte);
+
+ pgdval_t (*pgd_val)(pgd_t);
+ pgd_t (*make_pgd)(pgdval_t pgd);
+
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+ void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
+ void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+ void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep);
+ void (*pmd_clear)(pmd_t *pmdp);
+
+#endif /* CONFIG_X86_PAE */
+
+ void (*set_pud)(pud_t *pudp, pud_t pudval);
+
+ pmdval_t (*pmd_val)(pmd_t);
+ pmd_t (*make_pmd)(pmdval_t pmd);
+
+#if PAGETABLE_LEVELS == 4
+ pudval_t (*pud_val)(pud_t);
+ pud_t (*make_pud)(pudval_t pud);
+
+ void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
+#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+#ifdef CONFIG_HIGHPTE
+ void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
+#endif
+
+ struct pv_lazy_ops lazy_mode;
+
+ /* dom0 ops */
+
+ /* Sometimes the physical address is a pfn, and sometimes it's
+ an mfn. We can tell which is which from the index. */
+ void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
+ unsigned long phys, pgprot_t flags);
+};
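To illustrate how a backend consumes this struct, a hypothetical hypervisor
port would override just the hooks it cares about and leave the rest at their
native defaults. The myhv_* names are invented for the sketch, and it assumes
the native_set_pte() helper from pgtable.h:

static void myhv_set_pte(pte_t *ptep, pte_t pteval)
{
	/* e.g. queue the update for the hypervisor; here just fall through */
	native_set_pte(ptep, pteval);
}

static void __init myhv_init_mmu_ops(void)
{
	pv_mmu_ops.set_pte = myhv_set_pte;
	/* all other pv_mmu_ops members keep their native defaults */
}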
+
+struct raw_spinlock;
+struct pv_lock_ops {
+ int (*spin_is_locked)(struct raw_spinlock *lock);
+ int (*spin_is_contended)(struct raw_spinlock *lock);
+ void (*spin_lock)(struct raw_spinlock *lock);
+ void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags);
+ int (*spin_trylock)(struct raw_spinlock *lock);
+ void (*spin_unlock)(struct raw_spinlock *lock);
+};
+
+/* This contains all the paravirt structures: we get a convenient
+ * number for each function using the offset which we use to indicate
+ * what to patch. */
+struct paravirt_patch_template {
+ struct pv_init_ops pv_init_ops;
+ struct pv_time_ops pv_time_ops;
+ struct pv_cpu_ops pv_cpu_ops;
+ struct pv_irq_ops pv_irq_ops;
+ struct pv_apic_ops pv_apic_ops;
+ struct pv_mmu_ops pv_mmu_ops;
+ struct pv_lock_ops pv_lock_ops;
+};
+
+extern struct pv_info pv_info;
+extern struct pv_init_ops pv_init_ops;
+extern struct pv_time_ops pv_time_ops;
+extern struct pv_cpu_ops pv_cpu_ops;
+extern struct pv_irq_ops pv_irq_ops;
+extern struct pv_apic_ops pv_apic_ops;
+extern struct pv_mmu_ops pv_mmu_ops;
+extern struct pv_lock_ops pv_lock_ops;
+
+#define PARAVIRT_PATCH(x) \
+ (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
+
+#define paravirt_type(op) \
+ [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
+ [paravirt_opptr] "m" (op)
+#define paravirt_clobber(clobber) \
+ [paravirt_clobber] "i" (clobber)
+
+/*
+ * Generate some code, and mark it as patchable by the
+ * apply_paravirt() alternate instruction patcher.
+ */
+#define _paravirt_alt(insn_string, type, clobber) \
+ "771:\n\t" insn_string "\n" "772:\n" \
+ ".pushsection .parainstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR " 771b\n" \
+ " .byte " type "\n" \
+ " .byte 772b-771b\n" \
+ " .short " clobber "\n" \
+ ".popsection\n"
+
+/* Generate patchable code, with the default asm parameters. */
+#define paravirt_alt(insn_string) \
+ _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
+
+/* Simple instruction patching code. */
+#define DEF_NATIVE(ops, name, code) \
+ extern const char start_##ops##_##name[], end_##ops##_##name[]; \
+ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
+
+unsigned paravirt_patch_nop(void);
+unsigned paravirt_patch_ignore(unsigned len);
+unsigned paravirt_patch_call(void *insnbuf,
+ const void *target, u16 tgt_clobbers,
+ unsigned long addr, u16 site_clobbers,
+ unsigned len);
+unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
+ unsigned long addr, unsigned len);
+unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
+ unsigned long addr, unsigned len);
+
+unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
+ const char *start, const char *end);
+
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len);
+
+int paravirt_disable_iospace(void);
+
+/*
+ * This generates an indirect call based on the operation type number.
+ * The type number, computed in PARAVIRT_PATCH, is derived from the
+ * offset into the paravirt_patch_template structure, and can therefore be
+ * freely converted back into a structure offset.
+ */
+#define PARAVIRT_CALL "call *%[paravirt_opptr];"
+
+/*
+ * These macros are intended to wrap calls through one of the paravirt
+ * ops structs, so that they can be later identified and patched at
+ * runtime.
+ *
+ * Normally, a call to a pv_op function is a simple indirect call:
+ * (pv_op_struct.operations)(args...).
+ *
+ * Unfortunately, this is a relatively slow operation for modern CPUs,
+ * because it cannot necessarily determine what the destination
+ * address is. In this case, the address is a runtime constant, so at
+ * the very least we can patch the call to be a simple direct call, or
+ * ideally, patch an inline implementation into the callsite. (Direct
+ * calls are essentially free, because the call and return addresses
+ * are completely predictable.)
+ *
+ * For i386, these macros rely on the standard gcc "regparm(3)" calling
+ * convention, in which the first three arguments are placed in %eax,
+ * %edx, %ecx (in that order), and the remaining arguments are placed
+ * on the stack. All caller-save registers (eax,edx,ecx) are expected
+ * to be modified (either clobbered or used for return values).
+ * x86_64, on the other hand, already specifies a register-based calling
+ * convention, returning in %rax, with parameters going in %rdi, %rsi,
+ * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
+ * special handling for dealing with 4 arguments, unlike i386.
+ * However, x86_64 also has to clobber all caller-saved registers, which
+ * unfortunately amounts to quite a few (r8 - r11).
+ *
+ * The call instruction itself is marked by placing its start address
+ * and size into the .parainstructions section, so that
+ * apply_paravirt() in arch/i386/kernel/alternative.c can do the
+ * appropriate patching under the control of the backend pv_init_ops
+ * implementation.
+ *
+ * Unfortunately there's no way to get gcc to generate the args setup
+ * for the call, and then allow the call itself to be generated by an
+ * inline asm. Because of this, we must do the complete arg setup and
+ * return value handling from within these macros. This is fairly
+ * cumbersome.
+ *
+ * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
+ * It could be extended to more arguments, but there would be little
+ * to be gained from that. For each number of arguments, there are
+ * the two VCALL and CALL variants for void and non-void functions.
+ *
+ * When there is a return value, the invoker of the macro must specify
+ * the return type. The macro then uses sizeof() on that type to
+ * determine whether it's a 32- or 64-bit value, and places the return
+ * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
+ * 64-bit). On x86_64 machines, it simply returns in %rax regardless of
+ * the return value size.
+ *
+ * On i386, 64-bit arguments are passed as a pair of adjacent 32-bit
+ * arguments, in low,high order.
+ *
+ * Small structures are passed and returned in registers. The macro
+ * calling convention can't directly deal with this, so the wrapper
+ * functions must handle it.
+ *
+ * These PVOP_* macros are only defined within this header. This
+ * means that all uses must be wrapped in inline functions. This also
+ * makes sure the incoming and outgoing types are always correct.
+ */
+#ifdef CONFIG_X86_32
+#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx
+#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
+#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
+ "=c" (__ecx)
+#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
+#define EXTRA_CLOBBERS
+#define VEXTRA_CLOBBERS
+#else
+#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx
+#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
+#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
+ "=S" (__esi), "=d" (__edx), \
+ "=c" (__ecx)
+
+#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
+
+#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
+#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
+#endif
+
+#ifdef CONFIG_PARAVIRT_DEBUG
+#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
+#else
+#define PVOP_TEST_NULL(op) ((void)op)
+#endif
+
+#define __PVOP_CALL(rettype, op, pre, post, ...) \
+ ({ \
+ rettype __ret; \
+ PVOP_CALL_ARGS; \
+ PVOP_TEST_NULL(op); \
+ /* This is 32-bit specific, but is okay in 64-bit */ \
+ /* since this condition will never hold */ \
+ if (sizeof(rettype) > sizeof(unsigned long)) { \
+ asm volatile(pre \
+ paravirt_alt(PARAVIRT_CALL) \
+ post \
+ : PVOP_CALL_CLOBBERS \
+ : paravirt_type(op), \
+ paravirt_clobber(CLBR_ANY), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" EXTRA_CLOBBERS); \
+ __ret = (rettype)((((u64)__edx) << 32) | __eax); \
+ } else { \
+ asm volatile(pre \
+ paravirt_alt(PARAVIRT_CALL) \
+ post \
+ : PVOP_CALL_CLOBBERS \
+ : paravirt_type(op), \
+ paravirt_clobber(CLBR_ANY), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" EXTRA_CLOBBERS); \
+ __ret = (rettype)__eax; \
+ } \
+ __ret; \
+ })
+#define __PVOP_VCALL(op, pre, post, ...) \
+ ({ \
+ PVOP_VCALL_ARGS; \
+ PVOP_TEST_NULL(op); \
+ asm volatile(pre \
+ paravirt_alt(PARAVIRT_CALL) \
+ post \
+ : PVOP_VCALL_CLOBBERS \
+ : paravirt_type(op), \
+ paravirt_clobber(CLBR_ANY), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" VEXTRA_CLOBBERS); \
+ })
+
+#define PVOP_CALL0(rettype, op) \
+ __PVOP_CALL(rettype, op, "", "")
+#define PVOP_VCALL0(op) \
+ __PVOP_VCALL(op, "", "")
+
+#define PVOP_CALL1(rettype, op, arg1) \
+ __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
+#define PVOP_VCALL1(op, arg1) \
+ __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))
+
+#define PVOP_CALL2(rettype, op, arg1, arg2) \
+ __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
+ "1" ((unsigned long)(arg2)))
+#define PVOP_VCALL2(op, arg1, arg2) \
+ __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
+ "1" ((unsigned long)(arg2)))
+
+#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
+ __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
+ "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+#define PVOP_VCALL3(op, arg1, arg2, arg3) \
+ __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
+ "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+
+/* 4-argument calls are the only place i386 and x86_64 differ: on x86_64
+ * all four arguments fit in registers, so it is much simpler. */
+#ifdef CONFIG_X86_32
+#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
+ __PVOP_CALL(rettype, op, \
+ "push %[_arg4];", "lea 4(%%esp),%%esp;", \
+ "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
+ "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
+#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
+ __PVOP_VCALL(op, \
+ "push %[_arg4];", "lea 4(%%esp),%%esp;", \
+ "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
+ "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
+#else
+#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
+ __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
+ "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
+ "3"((unsigned long)(arg4)))
+#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
+ __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
+ "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
+ "3"((unsigned long)(arg4)))
+#endif
+
+static inline int paravirt_enabled(void)
+{
+ return pv_info.paravirt_enabled;
+}
+
+static inline void load_sp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
+}
+
+#define ARCH_SETUP pv_init_ops.arch_setup();
+static inline unsigned long get_wallclock(void)
+{
+ return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
+}
+
+static inline int set_wallclock(unsigned long nowtime)
+{
+ return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
+}
+
+static inline void (*choose_time_init(void))(void)
+{
+ return pv_time_ops.time_init;
+}
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+static inline unsigned long paravirt_get_debugreg(int reg)
+{
+ return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
+}
+#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
+static inline void set_debugreg(unsigned long val, int reg)
+{
+ PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
+}
+
+static inline void clts(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.clts);
+}
+
+static inline unsigned long read_cr0(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
+}
+
+static inline void write_cr0(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+ return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
+}
+
+static inline void write_cr2(unsigned long x)
+{
+ PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
+}
+
+static inline unsigned long read_cr3(void)
+{
+ return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
+}
+
+static inline void write_cr3(unsigned long x)
+{
+ PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
+}
+
+static inline unsigned long read_cr4(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
+}
+static inline unsigned long read_cr4_safe(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
+}
+
+static inline void write_cr4(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_cr8(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
+}
+
+static inline void write_cr8(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
+}
+#endif
+
+static inline void raw_safe_halt(void)
+{
+ PVOP_VCALL0(pv_irq_ops.safe_halt);
+}
+
+static inline void halt(void)
+{
+ PVOP_VCALL0(pv_irq_ops.halt);
+}
+
+static inline void wbinvd(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.wbinvd);
+}
+
+#define get_kernel_rpl() (pv_info.kernel_rpl)
+
+static inline u64 paravirt_read_msr(unsigned msr, int *err)
+{
+ return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
+}
+static inline u64 paravirt_read_msr_amd(unsigned msr, int *err)
+{
+ return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err);
+}
+static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
+{
+ return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
+}
+
+/* These should all do BUG_ON(_err), but our headers are too tangled. */
+#define rdmsr(msr, val1, val2) \
+do { \
+ int _err; \
+ u64 _l = paravirt_read_msr(msr, &_err); \
+ val1 = (u32)_l; \
+ val2 = _l >> 32; \
+} while (0)
+
+#define wrmsr(msr, val1, val2) \
+do { \
+ paravirt_write_msr(msr, val1, val2); \
+} while (0)
+
+#define rdmsrl(msr, val) \
+do { \
+ int _err; \
+ val = paravirt_read_msr(msr, &_err); \
+} while (0)
+
+#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
+#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b)
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr, a, b) \
+({ \
+ int _err; \
+ u64 _l = paravirt_read_msr(msr, &_err); \
+ (*a) = (u32)_l; \
+ (*b) = _l >> 32; \
+ _err; \
+})
+
+static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
+{
+ int err;
+
+ *p = paravirt_read_msr(msr, &err);
+ return err;
+}
+static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+{
+ int err;
+
+ *p = paravirt_read_msr_amd(msr, &err);
+ return err;
+}
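A short usage sketch of the fault-tolerant variant above, probing an MSR that
may not exist on every CPU. MSR number 0x17 is IA32_PLATFORM_ID; the function
name is illustrative:

static int read_platform_id(u64 *val)
{
	u32 lo, hi;

	if (rdmsr_safe(0x17, &lo, &hi))		/* nonzero if the MSR faults */
		return -EIO;
	*val = ((u64)hi << 32) | lo;
	return 0;
}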
+
+static inline u64 paravirt_read_tsc(void)
+{
+ return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
+}
+
+#define rdtscl(low) \
+do { \
+ u64 _l = paravirt_read_tsc(); \
+ low = (int)_l; \
+} while (0)
+
+#define rdtscll(val) (val = paravirt_read_tsc())
+
+static inline unsigned long long paravirt_sched_clock(void)
+{
+ return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
+}
+#define calibrate_tsc() (pv_time_ops.get_tsc_khz())
+
+static inline unsigned long long paravirt_read_pmc(int counter)
+{
+ return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
+}
+
+#define rdpmc(counter, low, high) \
+do { \
+ u64 _l = paravirt_read_pmc(counter); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while (0)
+
+static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
+{
+ return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
+}
+
+#define rdtscp(low, high, aux) \
+do { \
+ unsigned int __aux; \
+ unsigned long __val = paravirt_rdtscp(&__aux); \
+ (low) = (u32)__val; \
+ (high) = (u32)(__val >> 32); \
+ (aux) = __aux; \
+} while (0)
+
+#define rdtscpll(val, aux) \
+do { \
+ unsigned long __aux; \
+ val = paravirt_rdtscp(&__aux); \
+ (aux) = __aux; \
+} while (0)
+
+static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
+{
+ PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
+}
+
+static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
+{
+ PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries);
+}
+
+static inline void load_TR_desc(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
+}
+static inline void load_gdt(const struct desc_ptr *dtr)
+{
+ PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
+}
+static inline void load_idt(const struct desc_ptr *dtr)
+{
+ PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
+}
+static inline void set_ldt(const void *addr, unsigned entries)
+{
+ PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
+}
+static inline void store_gdt(struct desc_ptr *dtr)
+{
+ PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
+}
+static inline void store_idt(struct desc_ptr *dtr)
+{
+ PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
+}
+static inline unsigned long paravirt_store_tr(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
+}
+#define store_tr(tr) ((tr) = paravirt_store_tr())
+static inline void load_TLS(struct thread_struct *t, unsigned cpu)
+{
+ PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
+}
+
+#ifdef CONFIG_X86_64
+static inline void load_gs_index(unsigned int gs)
+{
+ PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs);
+}
+#endif
+
+static inline void write_ldt_entry(struct desc_struct *dt, int entry,
+ const void *desc)
+{
+ PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
+}
+
+static inline void write_gdt_entry(struct desc_struct *dt, int entry,
+ void *desc, int type)
+{
+ PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
+}
+
+static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
+{
+ PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
+}
+static inline void set_iopl_mask(unsigned mask)
+{
+ PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
+}
+
+/* The paravirtualized I/O functions */
+static inline void slow_down_io(void)
+{
+ pv_cpu_ops.io_delay();
+#ifdef REALLY_SLOW_IO
+ pv_cpu_ops.io_delay();
+ pv_cpu_ops.io_delay();
+ pv_cpu_ops.io_delay();
+#endif
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static inline void setup_boot_clock(void)
+{
+ PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
+}
+
+static inline void setup_secondary_clock(void)
+{
+ PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
+}
+#endif
+
+static inline void paravirt_post_allocator_init(void)
+{
+ if (pv_init_ops.post_allocator_init)
+ (*pv_init_ops.post_allocator_init)();
+}
+
+static inline void paravirt_pagetable_setup_start(pgd_t *base)
+{
+ (*pv_mmu_ops.pagetable_setup_start)(base);
+}
+
+static inline void paravirt_pagetable_setup_done(pgd_t *base)
+{
+ (*pv_mmu_ops.pagetable_setup_done)(base);
+}
+
+#ifdef CONFIG_SMP
+static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
+ unsigned long start_esp)
+{
+ PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
+ phys_apicid, start_eip, start_esp);
+}
+#endif
+
+static inline void paravirt_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+ PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
+}
+
+static inline void arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+ PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+ PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
+}
+
+static inline void __flush_tlb(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
+}
+static inline void __flush_tlb_global(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
+}
+static inline void __flush_tlb_single(unsigned long addr)
+{
+ PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
+}
+
+static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+ unsigned long va)
+{
+ PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
+}
+
+static inline int paravirt_pgd_alloc(struct mm_struct *mm)
+{
+ return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
+}
+
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd);
+}
+
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn);
+}
+static inline void paravirt_release_pte(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pte, pfn);
+}
+
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
+}
+
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+ unsigned long start, unsigned long count)
+{
+ PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
+}
+static inline void paravirt_release_pmd(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
+}
+
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn);
+}
+static inline void paravirt_release_pud(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
+}
+
+#ifdef CONFIG_HIGHPTE
+static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
+{
+ unsigned long ret;
+ ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
+ return (void *)ret;
+}
+#endif
+
+static inline void pte_update(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
+}
+
+static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
+}
+
+static inline pte_t __pte(pteval_t val)
+{
+ pteval_t ret;
+
+ if (sizeof(pteval_t) > sizeof(long))
+ ret = PVOP_CALL2(pteval_t,
+ pv_mmu_ops.make_pte,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALL1(pteval_t,
+ pv_mmu_ops.make_pte,
+ val);
+
+ return (pte_t) { .pte = ret };
+}
+
+static inline pteval_t pte_val(pte_t pte)
+{
+ pteval_t ret;
+
+ if (sizeof(pteval_t) > sizeof(long))
+ ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
+ pte.pte, (u64)pte.pte >> 32);
+ else
+ ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
+ pte.pte);
+
+ return ret;
+}
+
+static inline pteval_t pte_flags(pte_t pte)
+{
+ pteval_t ret;
+
+ if (sizeof(pteval_t) > sizeof(long))
+ ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
+ pte.pte, (u64)pte.pte >> 32);
+ else
+ ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags,
+ pte.pte);
+
+#ifdef CONFIG_PARAVIRT_DEBUG
+ BUG_ON(ret & PTE_PFN_MASK);
+#endif
+ return ret;
+}
+
+static inline pgd_t __pgd(pgdval_t val)
+{
+ pgdval_t ret;
+
+ if (sizeof(pgdval_t) > sizeof(long))
+ ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
+ val);
+
+ return (pgd_t) { ret };
+}
+
+static inline pgdval_t pgd_val(pgd_t pgd)
+{
+ pgdval_t ret;
+
+ if (sizeof(pgdval_t) > sizeof(long))
+ ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
+ pgd.pgd, (u64)pgd.pgd >> 32);
+ else
+ ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
+ pgd.pgd);
+
+ return ret;
+}
+
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pteval_t ret;
+
+ ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start,
+ mm, addr, ptep);
+
+ return (pte_t) { .pte = ret };
+}
+
+static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ if (sizeof(pteval_t) > sizeof(long))
+ /* 5 arg words */
+ pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte);
+ else
+ PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit,
+ mm, addr, ptep, pte.pte);
+}
+
+static inline void set_pte(pte_t *ptep, pte_t pte)
+{
+ if (sizeof(pteval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pte, ptep,
+ pte.pte, (u64)pte.pte >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pte, ptep,
+ pte.pte);
+}
+
+static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ if (sizeof(pteval_t) > sizeof(long))
+ /* 5 arg words */
+ pv_mmu_ops.set_pte_at(mm, addr, ptep, pte);
+ else
+ PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
+}
+
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ pmdval_t val = native_pmd_val(pmd);
+
+ if (sizeof(pmdval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
+}
+
+#if PAGETABLE_LEVELS >= 3
+static inline pmd_t __pmd(pmdval_t val)
+{
+ pmdval_t ret;
+
+ if (sizeof(pmdval_t) > sizeof(long))
+ ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
+ val);
+
+ return (pmd_t) { ret };
+}
+
+static inline pmdval_t pmd_val(pmd_t pmd)
+{
+ pmdval_t ret;
+
+ if (sizeof(pmdval_t) > sizeof(long))
+ ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
+ pmd.pmd, (u64)pmd.pmd >> 32);
+ else
+ ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
+ pmd.pmd);
+
+ return ret;
+}
+
+static inline void set_pud(pud_t *pudp, pud_t pud)
+{
+ pudval_t val = native_pud_val(pud);
+
+ if (sizeof(pudval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
+ val, (u64)val >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
+ val);
+}
+#if PAGETABLE_LEVELS == 4
+static inline pud_t __pud(pudval_t val)
+{
+ pudval_t ret;
+
+ if (sizeof(pudval_t) > sizeof(long))
+ ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
+ val);
+
+ return (pud_t) { ret };
+}
+
+static inline pudval_t pud_val(pud_t pud)
+{
+ pudval_t ret;
+
+ if (sizeof(pudval_t) > sizeof(long))
+ ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
+ pud.pud, (u64)pud.pud >> 32);
+ else
+ ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
+ pud.pud);
+
+ return ret;
+}
+
+static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ pgdval_t val = native_pgd_val(pgd);
+
+ if (sizeof(pgdval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
+ val, (u64)val >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
+ val);
+}
+
+static inline void pgd_clear(pgd_t *pgdp)
+{
+ set_pgd(pgdp, __pgd(0));
+}
+
+static inline void pud_clear(pud_t *pudp)
+{
+ set_pud(pudp, __pud(0));
+}
+
+#endif /* PAGETABLE_LEVELS == 4 */
+
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+#ifdef CONFIG_X86_PAE
+/* Special-case pte-setting operations for PAE, which can't update a
+ 64-bit pte atomically */
+static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
+ pte.pte, pte.pte >> 32);
+}
+
+static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ /* 5 arg words */
+ pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+ PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
+}
+#else /* !CONFIG_X86_PAE */
+static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ set_pte(ptep, pte);
+}
+
+static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ set_pte(ptep, pte);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ set_pte_at(mm, addr, ptep, __pte(0));
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+ set_pmd(pmdp, __pmd(0));
+}
+#endif /* CONFIG_X86_PAE */
+
+/* Lazy mode for batching updates / context switch */
+enum paravirt_lazy_mode {
+ PARAVIRT_LAZY_NONE,
+ PARAVIRT_LAZY_MMU,
+ PARAVIRT_LAZY_CPU,
+};
+
+enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
+void paravirt_enter_lazy_cpu(void);
+void paravirt_leave_lazy_cpu(void);
+void paravirt_enter_lazy_mmu(void);
+void paravirt_leave_lazy_mmu(void);
+void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
+
+#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE
+static inline void arch_enter_lazy_cpu_mode(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
+}
+
+static inline void arch_leave_lazy_cpu_mode(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
+}
+
+static inline void arch_flush_lazy_cpu_mode(void)
+{
+ if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
+ arch_leave_lazy_cpu_mode();
+ arch_enter_lazy_cpu_mode();
+ }
+}
+
+
+#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
+}
+
+static inline void arch_leave_lazy_mmu_mode(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
+}
+
+static inline void arch_flush_lazy_mmu_mode(void)
+{
+ if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
+ arch_leave_lazy_mmu_mode();
+ arch_enter_lazy_mmu_mode();
+ }
+}
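A sketch of the intended usage pattern for the lazy-mode hooks, assuming a
caller updating a run of consecutive ptes; the function and its arguments are
illustrative:

static void set_pte_range_sketch(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte, int nr)
{
	int i;

	arch_enter_lazy_mmu_mode();	/* backend may queue the updates */
	for (i = 0; i < nr; i++)
		set_pte_at(mm, addr + i * PAGE_SIZE, ptep + i, pte);
	arch_leave_lazy_mmu_mode();	/* queued updates are flushed here */
}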
+
+static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
+ unsigned long phys, pgprot_t flags)
+{
+ pv_mmu_ops.set_fixmap(idx, phys, flags);
+}
+
+void _paravirt_nop(void);
+#define paravirt_nop ((void *)_paravirt_nop)
+
+void paravirt_use_bytelocks(void);
+
+#ifdef CONFIG_SMP
+
+static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
+{
+ return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
+}
+
+static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
+{
+ return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
+}
+
+static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
+{
+ PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
+}
+
+static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock,
+ unsigned long flags)
+{
+ PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
+}
+
+static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock)
+{
+ return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
+}
+
+static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
+{
+ PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
+}
+
+#endif
+
+/* These all sit in the .parainstructions section to tell us what to patch. */
+struct paravirt_patch_site {
+ u8 *instr; /* original instructions */
+ u8 instrtype; /* type of this instruction */
+ u8 len; /* length of original instruction */
+ u16 clobbers; /* what registers you may clobber */
+};
+
+extern struct paravirt_patch_site __parainstructions[],
+ __parainstructions_end[];
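Roughly how apply_paravirt() in arch/x86/kernel/alternative.c consumes these
records; this is a simplified sketch, and the real function also nop-pads the
buffer and bounds-checks the site length:

static void apply_paravirt_sketch(struct paravirt_patch_site *start,
				  struct paravirt_patch_site *end)
{
	struct paravirt_patch_site *p;
	char insnbuf[16];

	for (p = start; p < end; p++) {
		unsigned used;

		used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
					 (unsigned long)p->instr, p->len);
		/* pad insnbuf out to p->len with nops, then copy it
		 * over the original instructions at p->instr */
		(void)used;
	}
}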
+
+#ifdef CONFIG_X86_32
+#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
+#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
+#define PV_FLAGS_ARG "0"
+#define PV_EXTRA_CLOBBERS
+#define PV_VEXTRA_CLOBBERS
+#else
+/* We save some registers, but not all of them - that would be too much.
+ * We clobber all caller-saved registers except the argument register. */
+#define PV_SAVE_REGS "pushq %%rdi;"
+#define PV_RESTORE_REGS "popq %%rdi;"
+#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
+#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
+#define PV_FLAGS_ARG "D"
+#endif
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ unsigned long f;
+
+ asm volatile(paravirt_alt(PV_SAVE_REGS
+ PARAVIRT_CALL
+ PV_RESTORE_REGS)
+ : "=a"(f)
+ : paravirt_type(pv_irq_ops.save_fl),
+ paravirt_clobber(CLBR_EAX)
+ : "memory", "cc" PV_VEXTRA_CLOBBERS);
+ return f;
+}
+
+static inline void raw_local_irq_restore(unsigned long f)
+{
+ asm volatile(paravirt_alt(PV_SAVE_REGS
+ PARAVIRT_CALL
+ PV_RESTORE_REGS)
+ : "=a"(f)
+ : PV_FLAGS_ARG(f),
+ paravirt_type(pv_irq_ops.restore_fl),
+ paravirt_clobber(CLBR_EAX)
+ : "memory", "cc" PV_EXTRA_CLOBBERS);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ asm volatile(paravirt_alt(PV_SAVE_REGS
+ PARAVIRT_CALL
+ PV_RESTORE_REGS)
+ :
+ : paravirt_type(pv_irq_ops.irq_disable),
+ paravirt_clobber(CLBR_EAX)
+ : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ asm volatile(paravirt_alt(PV_SAVE_REGS
+ PARAVIRT_CALL
+ PV_RESTORE_REGS)
+ :
+ : paravirt_type(pv_irq_ops.irq_enable),
+ paravirt_clobber(CLBR_EAX)
+ : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+ unsigned long f;
+
+ f = __raw_local_save_flags();
+ raw_local_irq_disable();
+ return f;
+}
+
+
+/* Make sure as little as possible of this mess escapes. */
+#undef PARAVIRT_CALL
+#undef __PVOP_CALL
+#undef __PVOP_VCALL
+#undef PVOP_VCALL0
+#undef PVOP_CALL0
+#undef PVOP_VCALL1
+#undef PVOP_CALL1
+#undef PVOP_VCALL2
+#undef PVOP_CALL2
+#undef PVOP_VCALL3
+#undef PVOP_CALL3
+#undef PVOP_VCALL4
+#undef PVOP_CALL4
+
+#else /* __ASSEMBLY__ */
+
+#define _PVSITE(ptype, clobbers, ops, word, algn) \
+771:; \
+ ops; \
+772:; \
+ .pushsection .parainstructions,"a"; \
+ .align algn; \
+ word 771b; \
+ .byte ptype; \
+ .byte 772b-771b; \
+ .short clobbers; \
+ .popsection
+
+
+#ifdef CONFIG_X86_64
+#define PV_SAVE_REGS \
+ push %rax; \
+ push %rcx; \
+ push %rdx; \
+ push %rsi; \
+ push %rdi; \
+ push %r8; \
+ push %r9; \
+ push %r10; \
+ push %r11
+#define PV_RESTORE_REGS \
+ pop %r11; \
+ pop %r10; \
+ pop %r9; \
+ pop %r8; \
+ pop %rdi; \
+ pop %rsi; \
+ pop %rdx; \
+ pop %rcx; \
+ pop %rax
+#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
+#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
+#define PARA_INDIRECT(addr) *addr(%rip)
+#else
+#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx
+#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
+#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
+#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
+#define PARA_INDIRECT(addr) *%cs:addr
+#endif
+
+#define INTERRUPT_RETURN \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret))
+
+#define DISABLE_INTERRUPTS(clobbers) \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
+ PV_SAVE_REGS; \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
+ PV_RESTORE_REGS;)
+
+#define ENABLE_INTERRUPTS(clobbers) \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
+ PV_SAVE_REGS; \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
+ PV_RESTORE_REGS;)
+
+#define USERGS_SYSRET32 \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
+ CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
+
+#ifdef CONFIG_X86_32
+#define GET_CR0_INTO_EAX \
+ push %ecx; push %edx; \
+ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
+ pop %edx; pop %ecx
+
+#define ENABLE_INTERRUPTS_SYSEXIT \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
+ CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
+
+
+#else /* !CONFIG_X86_32 */
+
+/*
+ * If swapgs is used while the userspace stack is still current,
+ * there's no way to call a pvop. The PV replacement *must* be
+ * inlined, or the swapgs instruction must be trapped and emulated.
+ */
+#define SWAPGS_UNSAFE_STACK \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
+ swapgs)
+
+#define SWAPGS \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
+ PV_SAVE_REGS; \
+ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \
+ PV_RESTORE_REGS \
+ )
+
+#define GET_CR2_INTO_RCX \
+ call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); \
+ movq %rax, %rcx; \
+ xorq %rax, %rax;
+
+#define PARAVIRT_ADJUST_EXCEPTION_FRAME \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
+ CLBR_NONE, \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))
+
+#define USERGS_SYSRET64 \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
+ CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#define ENABLE_INTERRUPTS_SYSEXIT32 \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
+ CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
+#endif /* CONFIG_X86_32 */
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/parport.h b/arch/x86/include/asm/parport.h
new file mode 100644
index 00000000000..3c4ffeb467e
--- /dev/null
+++ b/arch/x86/include/asm/parport.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_PARPORT_H
+#define _ASM_X86_PARPORT_H
+
+static int __devinit parport_pc_find_isa_ports(int autoirq, int autodma);
+static int __devinit parport_pc_find_nonpci_ports(int autoirq, int autodma)
+{
+ return parport_pc_find_isa_ports(autoirq, autodma);
+}
+
+#endif /* _ASM_X86_PARPORT_H */
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
new file mode 100644
index 00000000000..b8493b3b989
--- /dev/null
+++ b/arch/x86/include/asm/pat.h
@@ -0,0 +1,22 @@
+#ifndef _ASM_X86_PAT_H
+#define _ASM_X86_PAT_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_X86_PAT
+extern int pat_enabled;
+extern void validate_pat_support(struct cpuinfo_x86 *c);
+#else
+static const int pat_enabled;
+static inline void validate_pat_support(struct cpuinfo_x86 *c) { }
+#endif
+
+extern void pat_init(void);
+
+extern int reserve_memtype(u64 start, u64 end,
+ unsigned long req_type, unsigned long *ret_type);
+extern int free_memtype(u64 start, u64 end);
+
+extern void pat_disable(char *reason);
+
+#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
new file mode 100644
index 00000000000..b1e7a45d868
--- /dev/null
+++ b/arch/x86/include/asm/pci-direct.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_PCI_DIRECT_H
+#define _ASM_X86_PCI_DIRECT_H
+
+#include <linux/types.h>
+
+/* Direct PCI access. This is used for PCI accesses in early boot before
+ the PCI subsystem works. */
+
+extern u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset);
+extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset);
+extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset);
+extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val);
+extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val);
+extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val);
+
+extern int early_pci_allowed(void);
+
+extern unsigned int pci_early_dump_regs;
+extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
+extern void early_dump_pci_devices(void);
+#endif /* _ASM_X86_PCI_DIRECT_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
new file mode 100644
index 00000000000..875b38edf19
--- /dev/null
+++ b/arch/x86/include/asm/pci.h
@@ -0,0 +1,114 @@
+#ifndef _ASM_X86_PCI_H
+#define _ASM_X86_PCI_H
+
+#include <linux/mm.h> /* for struct page */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <asm/scatterlist.h>
+#include <asm/io.h>
+
+#ifdef __KERNEL__
+
+struct pci_sysdata {
+ int domain; /* PCI domain */
+ int node; /* NUMA node */
+#ifdef CONFIG_X86_64
+ void *iommu; /* IOMMU private data */
+#endif
+};
+
+extern int pci_routeirq;
+
+/* scan a bus after allocating a pci_sysdata for it */
+extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
+ int node);
+extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
+
+static inline int pci_domain_nr(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ return sd->domain;
+}
+
+static inline int pci_proc_domain(struct pci_bus *bus)
+{
+ return pci_domain_nr(bus);
+}
+
+
+/* Can be used to override the logic in pci_scan_bus for skipping
+ already-configured bus numbers - to be used for buggy BIOSes
+ or architectures with incomplete PCI setup by the loader */
+
+#ifdef CONFIG_PCI
+extern unsigned int pcibios_assign_all_busses(void);
+#else
+#define pcibios_assign_all_busses() 0
+#endif
+#define pcibios_scan_all_fns(a, b) 0
+
+extern unsigned long pci_mem_start;
+#define PCIBIOS_MIN_IO 0x1000
+#define PCIBIOS_MIN_MEM (pci_mem_start)
+
+#define PCIBIOS_MIN_CARDBUS_IO 0x4000
+
+void pcibios_config_init(void);
+struct pci_bus *pcibios_scan_root(int bus);
+
+void pcibios_set_master(struct pci_dev *dev);
+void pcibios_penalize_isa_irq(int irq, int active);
+struct irq_routing_table *pcibios_get_irq_routing_table(void);
+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+
+
+#define HAVE_PCI_MMAP
+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+ enum pci_mmap_state mmap_state,
+ int write_combine);
+
+
+#ifdef CONFIG_PCI
+extern void early_quirks(void);
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+ enum pci_dma_burst_strategy *strat,
+ unsigned long *strategy_parameter)
+{
+ *strat = PCI_DMA_BURST_INFINITY;
+ *strategy_parameter = ~0UL;
+}
+#else
+static inline void early_quirks(void) { }
+#endif
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_32
+# include "pci_32.h"
+#else
+# include "pci_64.h"
+#endif
+
+/* implement the pci_ DMA API in terms of the generic device dma_ one */
+#include <asm-generic/pci-dma-compat.h>
+
+/* generic pci stuff */
+#include <asm-generic/pci.h>
+
+#ifdef CONFIG_NUMA
+/* Returns the node based on pci bus */
+static inline int __pcibus_to_node(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->node;
+}
+
+static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return node_to_cpumask(__pcibus_to_node(bus));
+}
+#endif
+
+#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/asm/pci_32.h b/arch/x86/include/asm/pci_32.h
new file mode 100644
index 00000000000..6f1213a6ef4
--- /dev/null
+++ b/arch/x86/include/asm/pci_32.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_PCI_32_H
+#define _ASM_X86_PCI_32_H
+
+
+#ifdef __KERNEL__
+
+
+/* Dynamic DMA mapping stuff.
+ * i386 has everything mapped statically.
+ */
+
+struct pci_dev;
+
+/* The PCI address space does equal the physical memory
+ * address space. The networking and block device layers use
+ * this boolean for bounce buffer decisions.
+ */
+#define PCI_DMA_BUS_IS_PHYS (1)
+
+/* pci_unmap_{page,single} is a nop so... */
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
+#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
+#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ do { break; } while (pci_unmap_len(PTR, LEN_NAME))
+
+
+#endif /* __KERNEL__ */
+
+
+#endif /* _ASM_X86_PCI_32_H */
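The zero-size-array trick above is worth spelling out: DECLARE_PCI_UNMAP_ADDR
adds no bytes to a driver structure, and the set/get macros reduce to sizeof()
expressions inside dead code, so the bookkeeping calls cost nothing when
unmapping is a no-op. A stand-alone sketch (GNU C, since zero-length arrays
are a gcc extension; ring_slot and the dma_addr_t typedef are invented here):

#include <stdio.h>

typedef unsigned int dma_addr_t;

#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)	dma_addr_t ADDR_NAME[0];
#define pci_unmap_addr(PTR, ADDR_NAME)		sizeof((PTR)->ADDR_NAME)
#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
	do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))

struct ring_slot {
	void *buf;
	DECLARE_PCI_UNMAP_ADDR(mapping)		/* adds zero bytes */
};

int main(void)
{
	struct ring_slot s = { 0 };

	pci_unmap_addr_set(&s, mapping, 0x1234);	/* compiles away */
	printf("sizeof(struct ring_slot) = %zu\n", sizeof(s));
	/* same size as a struct holding only the buf pointer */
	return 0;
}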
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
new file mode 100644
index 00000000000..5b28995d664
--- /dev/null
+++ b/arch/x86/include/asm/pci_64.h
@@ -0,0 +1,66 @@
+#ifndef _ASM_X86_PCI_64_H
+#define _ASM_X86_PCI_64_H
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_CALGARY_IOMMU
+static inline void *pci_iommu(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ return sd->iommu;
+}
+
+static inline void set_pci_iommu(struct pci_bus *bus, void *val)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ sd->iommu = val;
+}
+#endif /* CONFIG_CALGARY_IOMMU */
+
+extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
+ int reg, int len, u32 *value);
+extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
+ int reg, int len, u32 value);
+
+extern void dma32_reserve_bootmem(void);
+extern void pci_iommu_alloc(void);
+
+/* The PCI address space does equal the physical memory
+ * address space. The networking and block device layers use
+ * this boolean for bounce buffer decisions.
+ *
+ * On AMD64 this mostly holds, but we set it to zero if a hardware
+ * IOMMU (gart) or a software IOMMU (swiotlb) is available.
+ */
+#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
+
+#if defined(CONFIG_GART_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
+ dma_addr_t ADDR_NAME;
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
+ __u32 LEN_NAME;
+#define pci_unmap_addr(PTR, ADDR_NAME) \
+ ((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ (((PTR)->ADDR_NAME) = (VAL))
+#define pci_unmap_len(PTR, LEN_NAME) \
+ ((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ (((PTR)->LEN_NAME) = (VAL))
+
+#else
+/* No IOMMU */
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
+#define pci_unmap_addr(PTR, ADDR_NAME) (0)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
+#define pci_unmap_len(PTR, LEN_NAME) (0)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
+
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
new file mode 100644
index 00000000000..2fbfff88df3
--- /dev/null
+++ b/arch/x86/include/asm/pda.h
@@ -0,0 +1,137 @@
+#ifndef _ASM_X86_PDA_H
+#define _ASM_X86_PDA_H
+
+#ifndef __ASSEMBLY__
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <asm/page.h>
+
+/* Per-processor data structure. %gs points to it while the kernel runs */
+struct x8664_pda {
+ struct task_struct *pcurrent; /* 0 Current process */
+ unsigned long data_offset; /* 8 Per cpu data offset from linker
+ address */
+ unsigned long kernelstack; /* 16 top of kernel stack for current */
+ unsigned long oldrsp; /* 24 user rsp for system call */
+ int irqcount; /* 32 Irq nesting counter. Starts at -1 */
+ unsigned int cpunumber; /* 36 Logical CPU number */
+#ifdef CONFIG_CC_STACKPROTECTOR
+ unsigned long stack_canary; /* 40 stack canary value */
+ /* gcc-ABI: this canary MUST be at
+ offset 40!!! */
+#endif
+ char *irqstackptr;
+ short nodenumber; /* number of current node (32k max) */
+ short in_bootmem; /* pda lives in bootmem */
+ unsigned int __softirq_pending;
+ unsigned int __nmi_count; /* number of NMIs on this CPU */
+ short mmu_state;
+ short isidle;
+ struct mm_struct *active_mm;
+ unsigned apic_timer_irqs;
+ unsigned irq0_irqs;
+ unsigned irq_resched_count;
+ unsigned irq_call_count;
+ unsigned irq_tlb_count;
+ unsigned irq_thermal_count;
+ unsigned irq_threshold_count;
+ unsigned irq_spurious_count;
+} ____cacheline_aligned_in_smp;
+
+extern struct x8664_pda **_cpu_pda;
+extern void pda_init(int);
+
+#define cpu_pda(i) (_cpu_pda[i])
+
+/*
+ * There is no fast way to get the base address of the PDA; all the accesses
+ * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
+ */
+extern void __bad_pda_field(void) __attribute__((noreturn));
+
+/*
+ * proxy_pda doesn't actually exist, but tell gcc it is accessed for
+ * all PDA accesses so it gets read/write dependencies right.
+ */
+extern struct x8664_pda _proxy_pda;
+
+#define pda_offset(field) offsetof(struct x8664_pda, field)
+
+#define pda_to_op(op, field, val) \
+do { \
+ typedef typeof(_proxy_pda.field) T__; \
+ if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
+ switch (sizeof(_proxy_pda.field)) { \
+ case 2: \
+ asm(op "w %1,%%gs:%c2" : \
+ "+m" (_proxy_pda.field) : \
+ "ri" ((T__)val), \
+ "i"(pda_offset(field))); \
+ break; \
+ case 4: \
+ asm(op "l %1,%%gs:%c2" : \
+ "+m" (_proxy_pda.field) : \
+ "ri" ((T__)val), \
+ "i" (pda_offset(field))); \
+ break; \
+ case 8: \
+ asm(op "q %1,%%gs:%c2": \
+ "+m" (_proxy_pda.field) : \
+ "ri" ((T__)val), \
+ "i"(pda_offset(field))); \
+ break; \
+ default: \
+ __bad_pda_field(); \
+ } \
+} while (0)
+
+#define pda_from_op(op, field) \
+({ \
+ typeof(_proxy_pda.field) ret__; \
+ switch (sizeof(_proxy_pda.field)) { \
+ case 2: \
+ asm(op "w %%gs:%c1,%0" : \
+ "=r" (ret__) : \
+ "i" (pda_offset(field)), \
+ "m" (_proxy_pda.field)); \
+ break; \
+ case 4: \
+ asm(op "l %%gs:%c1,%0": \
+ "=r" (ret__): \
+ "i" (pda_offset(field)), \
+ "m" (_proxy_pda.field)); \
+ break; \
+ case 8: \
+ asm(op "q %%gs:%c1,%0": \
+ "=r" (ret__) : \
+ "i" (pda_offset(field)), \
+ "m" (_proxy_pda.field)); \
+ break; \
+ default: \
+ __bad_pda_field(); \
+ } \
+ ret__; \
+})
+
+#define read_pda(field) pda_from_op("mov", field)
+#define write_pda(field, val) pda_to_op("mov", field, val)
+#define add_pda(field, val) pda_to_op("add", field, val)
+#define sub_pda(field, val) pda_to_op("sub", field, val)
+#define or_pda(field, val) pda_to_op("or", field, val)
+
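The shape of pda_from_op above is the classic sizeof()-dispatch statement
expression: typeof() captures the field's type, and the switch picks the
matching operand width (movw/movl/movq in the real thing). A user-space
analogue -- a plain struct and memcpy standing in for the %gs-relative load;
demo_pda, the_pda and demo_read_pda are invented names -- keeps the same
skeleton (gcc, for typeof and statement expressions):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct demo_pda {
	unsigned long kernelstack;	/* 8-byte field */
	unsigned int cpunumber;		/* 4-byte field */
	short isidle;			/* 2-byte field */
};

static struct demo_pda the_pda = { 0x1000, 2, 1 };

#define demo_read_pda(field)						\
({									\
	typeof(the_pda.field) ret__;					\
	switch (sizeof(the_pda.field)) {				\
	case 2:	/* kernel: movw %gs:off,%reg */				\
	case 4:	/* kernel: movl %gs:off,%reg */				\
	case 8:	/* kernel: movq %gs:off,%reg */				\
		memcpy(&ret__, (char *)&the_pda +			\
		       offsetof(struct demo_pda, field),		\
		       sizeof(ret__));					\
		break;							\
	}								\
	ret__;								\
})

int main(void)
{
	printf("kernelstack=%#lx cpu=%u isidle=%d\n",
	       demo_read_pda(kernelstack),
	       demo_read_pda(cpunumber),
	       demo_read_pda(isidle));
	return 0;
}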
+/* This is not atomic against other CPUs -- CPU preemption needs to be off */
+#define test_and_clear_bit_pda(bit, field) \
+({ \
+ int old__; \
+ asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
+ : "=r" (old__), "+m" (_proxy_pda.field) \
+ : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
+ old__; \
+})
+
+#endif
+
+#define PDA_STACKOFFSET (5*8)
+
+#endif /* _ASM_X86_PDA_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
new file mode 100644
index 00000000000..ece72053ba6
--- /dev/null
+++ b/arch/x86/include/asm/percpu.h
@@ -0,0 +1,218 @@
+#ifndef _ASM_X86_PERCPU_H
+#define _ASM_X86_PERCPU_H
+
+#ifdef CONFIG_X86_64
+#include <linux/compiler.h>
+
+/* Same as asm-generic/percpu.h, except that we store the per-cpu offset
+ in the PDA. Longer term, the PDA and every per-cpu variable should
+ simply be placed in a single section and referenced directly
+ from %gs. */
+
+#ifdef CONFIG_SMP
+#include <asm/pda.h>
+
+#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
+#define __my_cpu_offset read_pda(data_offset)
+
+#define per_cpu_offset(x) (__per_cpu_offset(x))
+
+#endif
+#include <asm-generic/percpu.h>
+
+DECLARE_PER_CPU(struct x8664_pda, pda);
+
+/*
+ * These are supposed to be implemented as a single instruction which
+ * operates on the per-cpu data base segment. x86-64 doesn't have
+ * that yet, so this is a fairly inefficient workaround for the
+ * meantime. The single instruction would be atomic with respect to
+ * preemption and interrupts, so ideally we would disable interrupts
+ * here to achieve the same effect. However, because these accessors
+ * can be used from within interrupt-disable/enable paths, we can't
+ * actually disable interrupts; disabling preemption is enough.
+ */
+#define x86_read_percpu(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp; \
+ preempt_disable(); \
+ __tmp = __get_cpu_var(var); \
+ preempt_enable(); \
+ __tmp; \
+ })
+
+#define x86_write_percpu(var, val) \
+ do { \
+ preempt_disable(); \
+ __get_cpu_var(var) = (val); \
+ preempt_enable(); \
+ } while(0)
+
+#else /* CONFIG_X86_64 */
+
+#ifdef __ASSEMBLY__
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ * var - variable name
+ * reg - 32bit register
+ *
+ * The resulting address is stored in the "reg" argument.
+ *
+ * Example:
+ * PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+#define PER_CPU(var, reg) \
+ movl %fs:per_cpu__##this_cpu_off, reg; \
+ lea per_cpu__##var(reg), reg
+#define PER_CPU_VAR(var) %fs:per_cpu__##var
+#else /* ! SMP */
+#define PER_CPU(var, reg) \
+ movl $per_cpu__##var, reg
+#define PER_CPU_VAR(var) per_cpu__##var
+#endif /* SMP */
+
+#else /* ...!ASSEMBLY */
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ * var - variable name
+ * cpu - 32bit register containing the current CPU number
+ *
+ * The resulting address is stored in the "cpu" argument.
+ *
+ * Example:
+ * PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+
+#define __my_cpu_offset x86_read_percpu(this_cpu_off)
+
+/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
+#define __percpu_seg "%%fs:"
+
+#else /* !SMP */
+
+#define __percpu_seg ""
+
+#endif /* SMP */
+
+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
+/* For arch-specific code, we can use direct single-insn ops (they
+ * don't give an lvalue though). */
+extern void __bad_percpu_size(void);
+
+#define percpu_to_op(op, var, val) \
+do { \
+ typedef typeof(var) T__; \
+ if (0) { \
+ T__ tmp__; \
+ tmp__ = (val); \
+ } \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b %1,"__percpu_seg"%0" \
+ : "+m" (var) \
+ : "ri" ((T__)val)); \
+ break; \
+ case 2: \
+ asm(op "w %1,"__percpu_seg"%0" \
+ : "+m" (var) \
+ : "ri" ((T__)val)); \
+ break; \
+ case 4: \
+ asm(op "l %1,"__percpu_seg"%0" \
+ : "+m" (var) \
+ : "ri" ((T__)val)); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+} while (0)
+
+#define percpu_from_op(op, var) \
+({ \
+ typeof(var) ret__; \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b "__percpu_seg"%1,%0" \
+ : "=r" (ret__) \
+ : "m" (var)); \
+ break; \
+ case 2: \
+ asm(op "w "__percpu_seg"%1,%0" \
+ : "=r" (ret__) \
+ : "m" (var)); \
+ break; \
+ case 4: \
+ asm(op "l "__percpu_seg"%1,%0" \
+ : "=r" (ret__) \
+ : "m" (var)); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ ret__; \
+})
+
+#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
+#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
+#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
+#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
+#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
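One detail worth noting in percpu_to_op (and pda_to_op earlier) is the
"if (0) { T__ tmp__; tmp__ = (val); }" line: the dead branch forces the
compiler to check that val is assignment-compatible with the variable's type
while emitting no code at all. Stripped to its essence (gcc, for typeof;
checked_store is an invented name):

#include <stdio.h>

#define checked_store(var, val)						\
do {									\
	typedef typeof(var) T__;					\
	if (0) { T__ tmp__; tmp__ = (val); }	/* type check, no code */ \
	(var) = (val);							\
} while (0)

int main(void)
{
	long counter = 0;

	checked_store(counter, 42L);	/* fine */
	/* checked_store(counter, "oops"); -- would draw a compile-time
	 * diagnostic, since "oops" is not assignable to long */
	printf("counter = %ld\n", counter);
	return 0;
}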
+#endif /* !__ASSEMBLY__ */
+#endif /* !CONFIG_X86_64 */
+
+#ifdef CONFIG_SMP
+
+/*
+ * Define the "EARLY_PER_CPU" macros. These are used for some per_cpu
+ * variables that are initialized and accessed before there are per_cpu
+ * areas allocated.
+ */
+
+#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
+ DEFINE_PER_CPU(_type, _name) = _initvalue; \
+ __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
+ { [0 ... NR_CPUS-1] = _initvalue }; \
+ __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
+ EXPORT_PER_CPU_SYMBOL(_name)
+
+#define DECLARE_EARLY_PER_CPU(_type, _name) \
+ DECLARE_PER_CPU(_type, _name); \
+ extern __typeof__(_type) *_name##_early_ptr; \
+ extern __typeof__(_type) _name##_early_map[]
+
+#define early_per_cpu_ptr(_name) (_name##_early_ptr)
+#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
+#define early_per_cpu(_name, _cpu) \
+ (early_per_cpu_ptr(_name) ? \
+ early_per_cpu_ptr(_name)[_cpu] : \
+ per_cpu(_name, _cpu))
+
+#else /* !CONFIG_SMP */
+#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
+ DEFINE_PER_CPU(_type, _name) = _initvalue
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
+ EXPORT_PER_CPU_SYMBOL(_name)
+
+#define DECLARE_EARLY_PER_CPU(_type, _name) \
+ DECLARE_PER_CPU(_type, _name)
+
+#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
+#define early_per_cpu_ptr(_name) NULL
+/* no early_per_cpu_map() */
+
+#endif /* !CONFIG_SMP */
+
+#endif /* _ASM_X86_PERCPU_H */
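The EARLY_PER_CPU machinery boils down to one pointer test: reads go through
the __initdata early map until the real per-cpu areas exist, at which point
the _early_ptr is cleared and per_cpu() takes over. A stand-alone sketch of
that selection logic (plain arrays stand in for the per-cpu areas; every name
here is invented):

#include <stdio.h>

#define NR_CPUS 4

static int demo_early_map[NR_CPUS] = { 10, 11, 12, 13 };
static int *demo_early_ptr = demo_early_map;	/* cleared once boot is done */
static int demo_percpu[NR_CPUS];		/* stands in for per_cpu() */

#define early_demo(cpu) \
	(demo_early_ptr ? demo_early_ptr[cpu] : demo_percpu[cpu])

int main(void)
{
	int i;

	printf("early: cpu1 = %d\n", early_demo(1));	/* early map */

	for (i = 0; i < NR_CPUS; i++)	/* "per-cpu areas" come up */
		demo_percpu[i] = demo_early_map[i];
	demo_early_ptr = NULL;

	printf("late:  cpu1 = %d\n", early_demo(1));	/* per-cpu area */
	return 0;
}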
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
new file mode 100644
index 00000000000..cb7c151a8bf
--- /dev/null
+++ b/arch/x86/include/asm/pgalloc.h
@@ -0,0 +1,114 @@
+#ifndef _ASM_X86_PGALLOC_H
+#define _ASM_X86_PGALLOC_H
+
+#include <linux/threads.h>
+#include <linux/mm.h> /* for struct page */
+#include <linux/pagemap.h>
+
+static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm)
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+ unsigned long start, unsigned long count) {}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_release_pte(unsigned long pfn) {}
+static inline void paravirt_release_pmd(unsigned long pfn) {}
+static inline void paravirt_release_pud(unsigned long pfn) {}
+#endif
+
+/*
+ * Allocate and free page tables.
+ */
+extern pgd_t *pgd_alloc(struct mm_struct *);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+
+/* Should really implement gc for free page table pages. This could be
+ done with a reference count in struct page. */
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
+ free_page((unsigned long)pte);
+}
+
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+ __free_page(pte);
+}
+
+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
+static inline void pmd_populate_kernel(struct mm_struct *mm,
+ pmd_t *pmd, pte_t *pte)
+{
+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+ struct page *pte)
+{
+ unsigned long pfn = page_to_pfn(pte);
+
+ paravirt_alloc_pte(mm, pfn);
+ set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+
+#if PAGETABLE_LEVELS > 2
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+ free_page((unsigned long)pmd);
+}
+
+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+#ifdef CONFIG_X86_PAE
+extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
+#else /* !CONFIG_X86_PAE */
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+}
+#endif /* CONFIG_X86_PAE */
+
+#if PAGETABLE_LEVELS > 3
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+ free_page((unsigned long)pud);
+}
+
+extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level-defs.h b/arch/x86/include/asm/pgtable-2level-defs.h
new file mode 100644
index 00000000000..d77db8990ea
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-2level-defs.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H
+#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H
+
+#define SHARED_KERNEL_PMD 0
+
+/*
+ * traditional i386 two-level paging structure:
+ */
+
+#define PGDIR_SHIFT 22
+#define PTRS_PER_PGD 1024
+
+/*
+ * the i386 is two-level, so we don't really have any
+ * PMD directory physically.
+ */
+
+#define PTRS_PER_PTE 1024
+
+#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
new file mode 100644
index 00000000000..b17edfd2362
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -0,0 +1,79 @@
+#ifndef _ASM_X86_PGTABLE_2LEVEL_H
+#define _ASM_X86_PGTABLE_2LEVEL_H
+
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
+
+/*
+ * Certain architectures need to do special things when PTEs
+ * within a page table are directly modified. Thus, the following
+ * hook is made available.
+ */
+static inline void native_set_pte(pte_t *ptep , pte_t pte)
+{
+ *ptep = pte;
+}
+
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ *pmdp = pmd;
+}
+
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+static inline void native_set_pte_present(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+static inline void native_pmd_clear(pmd_t *pmdp)
+{
+ native_set_pmd(pmdp, __pmd(0));
+}
+
+static inline void native_pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *xp)
+{
+ *xp = native_make_pte(0);
+}
+
+#ifdef CONFIG_SMP
+static inline pte_t native_ptep_get_and_clear(pte_t *xp)
+{
+ return __pte(xchg(&xp->pte_low, 0));
+}
+#else
+#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
+#endif
+
+#define pte_none(x) (!(x).pte_low)
+
+/*
+ * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
+ * into this range:
+ */
+#define PTE_FILE_MAX_BITS 29
+
+#define pte_to_pgoff(pte) \
+ ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5))
+
+#define pgoff_to_pte(off) \
+ ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \
+ (((off) >> 5) << 8) + _PAGE_FILE })
+
+/* Encode and de-code a swap entry */
+#define __swp_type(x) (((x).val >> 1) & 0x1f)
+#define __swp_offset(x) ((x).val >> 8)
+#define __swp_entry(type, offset) \
+ ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
+#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
+#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+
+#endif /* _ASM_X86_PGTABLE_2LEVEL_H */
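The swap-entry encoding above is easy to sanity-check in isolation: the type
sits in bits 1-5 and the offset from bit 8 up, leaving bit 0 (_PAGE_PRESENT)
clear and skipping bits 6-7, which the PTE format claims for dirty/PSE. A
stand-alone round-trip check:

#include <assert.h>
#include <stdio.h>

typedef struct { unsigned long val; } swp_entry_t;

#define __swp_type(x)		(((x).val >> 1) & 0x1f)
#define __swp_offset(x)		((x).val >> 8)
#define __swp_entry(type, offset) \
	((swp_entry_t) { ((type) << 1) | ((offset) << 8) })

int main(void)
{
	swp_entry_t e = __swp_entry(7, 123456);

	assert(__swp_type(e) == 7);
	assert(__swp_offset(e) == 123456);
	assert((e.val & 0xc1) == 0);	/* bits 0, 6, 7 stay clear */
	printf("entry = %#lx\n", e.val);
	return 0;
}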
diff --git a/arch/x86/include/asm/pgtable-3level-defs.h b/arch/x86/include/asm/pgtable-3level-defs.h
new file mode 100644
index 00000000000..62561367653
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-3level-defs.h
@@ -0,0 +1,28 @@
+#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+
+#ifdef CONFIG_PARAVIRT
+#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
+#else
+#define SHARED_KERNEL_PMD 1
+#endif
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 30
+#define PTRS_PER_PGD 4
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
new file mode 100644
index 00000000000..fb16cec702e
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -0,0 +1,175 @@
+#ifndef _ASM_X86_PGTABLE_3LEVEL_H
+#define _ASM_X86_PGTABLE_3LEVEL_H
+
+/*
+ * Intel Physical Address Extension (PAE) Mode - three-level page
+ * tables on PPro+ CPUs.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %p(%08lx%08lx).\n", \
+ __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
+#define pmd_ERROR(e) \
+ printk("%s:%d: bad pmd %p(%016Lx).\n", \
+ __FILE__, __LINE__, &(e), pmd_val(e))
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %p(%016Lx).\n", \
+ __FILE__, __LINE__, &(e), pgd_val(e))
+
+static inline int pud_none(pud_t pud)
+{
+ return pud_val(pud) == 0;
+}
+
+static inline int pud_bad(pud_t pud)
+{
+ return (pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+
+static inline int pud_present(pud_t pud)
+{
+ return pud_val(pud) & _PAGE_PRESENT;
+}
+
+/* Rules for using set_pte: the pte being assigned *must* be
+ * either not present or in a state where the hardware will
+ * not attempt to update the pte. In places where this is
+ * not possible, use pte_get_and_clear to obtain the old pte
+ * value and then use set_pte to update it. -ben
+ */
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+/*
+ * Since this is only called on user PTEs, and the page fault handler
+ * must handle the already racy situation of simultaneous page faults,
+ * we are justified in merely clearing the PTE present bit, followed
+ * by a set. The ordering here is important.
+ */
+static inline void native_set_pte_present(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
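The ordering rule these setters follow can be seen with plain stores: because
the present bit (bit 0) lives in pte_low, clearing the low word first means a
racing lookup observes the old pte, "none", or the new pte -- never the old
pte_high paired with the new pte_low. An illustrative stand-alone walk-through
(sequential stores stand in for the smp_wmb()-separated ones):

#include <stdio.h>
#include <stdint.h>

typedef struct { uint32_t pte_low, pte_high; } pte_t;

#define pte_present(p) ((p).pte_low & 1)

static void show(const char *when, pte_t p)
{
	printf("%-10s low=%#x high=%#x present=%d\n",
	       when, p.pte_low, p.pte_high, pte_present(p));
}

int main(void)
{
	pte_t pte  = { .pte_low = 0x1000 | 1, .pte_high = 0x5 };
	pte_t next = { .pte_low = 0x2000 | 1, .pte_high = 0x7 };

	show("before", pte);
	pte.pte_low = 0;		/* entry goes not-present */
	show("mid-update", pte);
	pte.pte_high = next.pte_high;	/* safe while not-present */
	pte.pte_low = next.pte_low;	/* becomes present again */
	show("after", pte);
	return 0;
}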
+
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
+}
+
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd));
+}
+
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
+{
+ set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
+}
+
+/*
+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
+ * entry, so clear the bottom half first and enforce ordering with a compiler
+ * barrier.
+ */
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = 0;
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+ u32 *tmp = (u32 *)pmd;
+ *tmp = 0;
+ smp_wmb();
+ *(tmp + 1) = 0;
+}
+
+static inline void pud_clear(pud_t *pudp)
+{
+ unsigned long pgd;
+
+ set_pud(pudp, __pud(0));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ *
+ * Make sure the pud entry we're updating is within the
+ * current pgd to avoid unnecessary TLB flushes.
+ */
+ pgd = read_cr3();
+ if (__pa(pudp) >= pgd && __pa(pudp) <
+ (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
+ write_cr3(pgd);
+}
+
+#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
+
+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
+
+
+/* Find an entry in the second-level page table.. */
+#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \
+ pmd_index(address))
+
+#ifdef CONFIG_SMP
+static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
+{
+ pte_t res;
+
+ /* xchg acts as a barrier before the setting of the high bits */
+ res.pte_low = xchg(&ptep->pte_low, 0);
+ res.pte_high = ptep->pte_high;
+ ptep->pte_high = 0;
+
+ return res;
+}
+#else
+#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
+#endif
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t a, pte_t b)
+{
+ return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
+}
+
+static inline int pte_none(pte_t pte)
+{
+ return !pte.pte_low && !pte.pte_high;
+}
+
+/*
+ * Bits 0, 6 and 7 are taken in the low part of the pte,
+ * put the 32 bits of offset into the high part.
+ */
+#define pte_to_pgoff(pte) ((pte).pte_high)
+#define pgoff_to_pte(off) \
+ ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
+#define PTE_FILE_MAX_BITS 32
+
+/* Encode and de-code a swap entry */
+#define __swp_type(x) (((x).val) & 0x1f)
+#define __swp_offset(x) ((x).val >> 5)
+#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
+#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
+#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
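In contrast to the 2-level layout, the PAE swap entry lives entirely in
pte_high, so the present bit in the low word is simply zero and the offset
gets 27 of the high word's 32 bits. A stand-alone round-trip check:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t val; } swp_entry_t;	/* stored in pte_high */

#define __swp_type(x)		(((x).val) & 0x1f)
#define __swp_offset(x)		((x).val >> 5)
#define __swp_entry(type, offset) \
	((swp_entry_t) { (type) | ((offset) << 5) })

int main(void)
{
	swp_entry_t e = __swp_entry(3, 99999);

	assert(__swp_type(e) == 3);
	assert(__swp_offset(e) == 99999);
	printf("pte_high = %#x\n", e.val);	/* pte_low stays 0 */
	return 0;
}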
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
new file mode 100644
index 00000000000..c012f3b1167
--- /dev/null
+++ b/arch/x86/include/asm/pgtable.h
@@ -0,0 +1,562 @@
+#ifndef _ASM_X86_PGTABLE_H
+#define _ASM_X86_PGTABLE_H
+
+#define FIRST_USER_ADDRESS 0
+
+#define _PAGE_BIT_PRESENT 0 /* is present */
+#define _PAGE_BIT_RW 1 /* writeable */
+#define _PAGE_BIT_USER 2 /* userspace addressable */
+#define _PAGE_BIT_PWT 3 /* page write through */
+#define _PAGE_BIT_PCD 4 /* page cache disabled */
+#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
+#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+#define _PAGE_BIT_FILE 6
+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
+#define _PAGE_BIT_PAT 7 /* on 4KB pages */
+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
+#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
+#define _PAGE_BIT_UNUSED3 11
+#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+
+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
+#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
+#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
+#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
+#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
+#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
+#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define __HAVE_ARCH_PTE_SPECIAL
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#else
+#define _PAGE_NX (_AT(pteval_t, 0))
+#endif
+
+/* If _PAGE_PRESENT is clear, we use these: */
+#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping,
+ * saved PTE; unset:swap */
+#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
+ pte_present gives true */
+
+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
+ _PAGE_DIRTY)
+
+/* Set of bits not changed in pte_modify */
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
+#define _PAGE_CACHE_WB (0)
+#define _PAGE_CACHE_WC (_PAGE_PWT)
+#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
+#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
+
+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+
+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
+ _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+#define PAGE_COPY PAGE_COPY_NOEXEC
+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+
+#define __PAGE_KERNEL_EXEC \
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
+#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
+
+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+
+#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
+
+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+
+#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
+#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
+
+/* xwr */
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_EXEC
+#define __P101 PAGE_READONLY_EXEC
+#define __P110 PAGE_COPY_EXEC
+#define __P111 PAGE_COPY_EXEC
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_EXEC
+#define __S101 PAGE_READONLY_EXEC
+#define __S110 PAGE_SHARED_EXEC
+#define __S111 PAGE_SHARED_EXEC
+
+/*
+ * early identity mapping pte attrib macros.
+ */
+#ifdef CONFIG_X86_64
+#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#else
+/*
+ * PDE_IDENT_ATTR includes the USER bit. As the PDE and PTE protection
+ * bits are combined, this allows the user to access the high-address-mapped
+ * VDSO in the presence of CONFIG_COMPAT_VDSO.
+ */
+#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
+#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
+#endif
+
+#ifndef __ASSEMBLY__
+
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+
+extern spinlock_t pgd_lock;
+extern struct list_head pgd_list;
+
+/*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not..
+ */
+static inline int pte_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_DIRTY;
+}
+
+static inline int pte_young(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_ACCESSED;
+}
+
+static inline int pte_write(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_RW;
+}
+
+static inline int pte_file(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_FILE;
+}
+
+static inline int pte_huge(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_PSE;
+}
+
+static inline int pte_global(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_GLOBAL;
+}
+
+static inline int pte_exec(pte_t pte)
+{
+ return !(pte_flags(pte) & _PAGE_NX);
+}
+
+static inline int pte_special(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SPECIAL;
+}
+
+static inline unsigned long pte_pfn(pte_t pte)
+{
+ return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+#define pte_page(pte) pfn_to_page(pte_pfn(pte))
+
+static inline int pmd_large(pmd_t pte)
+{
+ return (pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_RW);
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_NX);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_PSE);
+}
+
+static inline pte_t pte_clrhuge(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_PSE);
+}
+
+static inline pte_t pte_mkglobal(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_GLOBAL);
+}
+
+static inline pte_t pte_clrglobal(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_GLOBAL);
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_SPECIAL);
+}
+
+extern pteval_t __supported_pte_mask;
+
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+ return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
+ pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+{
+ return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
+ pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+ pteval_t val = pte_val(pte);
+
+ /*
+ * Keep the pfn and the bits in _PAGE_CHG_MASK, and take the
+ * remaining protection bits (NX included, when supported)
+ * from the newprot:
+ */
+ val &= _PAGE_CHG_MASK;
+ val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
+
+ return __pte(val);
+}
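A cut-down, stand-alone model of pte_modify() makes the masking concrete: the
pfn and the CPU-maintained accessed/dirty bits ride through unchanged while
the protection bits are replaced wholesale. (The PCD/PWT and special bits
that _PAGE_CHG_MASK also preserves, and __supported_pte_mask, are omitted
here for brevity.)

#include <assert.h>
#include <stdio.h>

typedef unsigned long pteval_t;

#define _PAGE_PRESENT	(1UL << 0)
#define _PAGE_RW	(1UL << 1)
#define _PAGE_ACCESSED	(1UL << 5)
#define _PAGE_DIRTY	(1UL << 6)
#define PTE_PFN_MASK	(~0xfffUL)	/* 4K pages, no NX in this demo */
#define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)

static pteval_t demo_pte_modify(pteval_t pte, pteval_t newprot)
{
	pteval_t val = pte & _PAGE_CHG_MASK;

	return val | (newprot & ~_PAGE_CHG_MASK);
}

int main(void)
{
	pteval_t pte = 0x1234000 | _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY;
	pteval_t ro  = demo_pte_modify(pte, _PAGE_PRESENT); /* to read-only */

	assert((ro & PTE_PFN_MASK) == 0x1234000);	/* pfn preserved */
	assert(ro & _PAGE_DIRTY);			/* dirty preserved */
	assert(!(ro & _PAGE_RW));			/* write bit dropped */
	printf("%#lx -> %#lx\n", pte, ro);
	return 0;
}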
+
+/* mprotect needs to preserve PAT bits when updating vm_page_prot */
+#define pgprot_modify pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+ pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
+ pgprotval_t addbits = pgprot_val(newprot);
+ return __pgprot(preservebits | addbits);
+}
+
+#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
+
+#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+
+#ifndef __ASSEMBLY__
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+struct file;
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot);
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot);
+#endif
+
+/* Install a pte for a particular vaddr in kernel space. */
+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+
+#ifdef CONFIG_X86_32
+extern void native_pagetable_setup_start(pgd_t *base);
+extern void native_pagetable_setup_done(pgd_t *base);
+#else
+static inline void native_pagetable_setup_start(pgd_t *base) {}
+static inline void native_pagetable_setup_done(pgd_t *base) {}
+#endif
+
+struct seq_file;
+extern void arch_report_meminfo(struct seq_file *m);
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+#define set_pte(ptep, pte) native_set_pte(ptep, pte)
+#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
+
+#define set_pte_present(mm, addr, ptep, pte) \
+ native_set_pte_present(mm, addr, ptep, pte)
+#define set_pte_atomic(ptep, pte) \
+ native_set_pte_atomic(ptep, pte)
+
+#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
+#define pgd_clear(pgd) native_pgd_clear(pgd)
+#endif
+
+#ifndef set_pud
+# define set_pud(pudp, pud) native_set_pud(pudp, pud)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pud_clear(pud) native_pud_clear(pud)
+#endif
+
+#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
+#define pmd_clear(pmd) native_pmd_clear(pmd)
+
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)
+
+static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
+{
+ native_pagetable_setup_start(base);
+}
+
+static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
+{
+ native_pagetable_setup_done(base);
+}
+#endif /* CONFIG_PARAVIRT */
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_X86_32
+# include "pgtable_32.h"
+#else
+# include "pgtable_64.h"
+#endif
+
+/*
+ * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
+ *
+ * this macro returns the index of the entry in the pgd page which would
+ * control the given virtual address
+ */
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+
+/*
+ * pgd_offset() returns a (pgd_t *)
+ * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
+ */
+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+/*
+ * a shortcut which implies the use of the kernel's pgd, instead
+ * of a process's
+ */
+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
+
+
+#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
+#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
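With the 4-level constants from pgtable_64.h further down (PGDIR_SHIFT 39,
PTRS_PER_PGD 512), pgd_index() is just a shift and mask. A worked stand-alone
example, using as sample input what is assumed here to have been the x86-64
direct-mapping base of this era:

#include <stdio.h>

#define PGDIR_SHIFT	39
#define PTRS_PER_PGD	512
#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

int main(void)
{
	unsigned long addr = 0xffff880000000000UL;	/* assumed base */

	printf("pgd_index(%#lx) = %lu\n", addr, pgd_index(addr)); /* 272 */
	return 0;
}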
+
+#ifndef __ASSEMBLY__
+
+enum {
+ PG_LEVEL_NONE,
+ PG_LEVEL_4K,
+ PG_LEVEL_2M,
+ PG_LEVEL_1G,
+ PG_LEVEL_NUM
+};
+
+#ifdef CONFIG_PROC_FS
+extern void update_page_count(int level, unsigned long pages);
+#else
+static inline void update_page_count(int level, unsigned long pages) { }
+#endif
+
+/*
+ * Helper function that returns the kernel pagetable entry controlling
+ * the virtual address 'address'. NULL means no pagetable entry present.
+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+ * as a pte too.
+ */
+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+
+/* local pte updates need not use xchg for locking */
+static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
+{
+ pte_t res = *ptep;
+
+ /* Pure native function needs no input for mm, addr */
+ native_pte_clear(NULL, 0, ptep);
+ return res;
+}
+
+static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep , pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+#ifndef CONFIG_PARAVIRT
+/*
+ * Rules for using pte_update - it must be called after any PTE update which
+ * has not been done using the set_pte / clear_pte interfaces. It is used by
+ * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
+ * updates should either be sets, clears, or set_pte_atomic for P->P
+ * transitions, which means this hook should only be called for user PTEs.
+ * This hook implies a P->P protection or access change has taken place, which
+ * requires a subsequent TLB flush. The notification can optionally be delayed
+ * until the TLB flush event by using the pte_update_defer form of the
+ * interface, but care must be taken to assure that the flush happens while
+ * still holding the same page table lock so that the shadow and primary pages
+ * do not become out of sync on SMP.
+ */
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)
+#endif
+
+/*
+ * We only update the dirty/accessed state if we set
+ * the dirty bit by hand in the kernel, since the hardware
+ * will do the accessed bit for us, and we don't want to
+ * race with other CPUs that might be updating the dirty
+ * bit at the same time.
+ */
+struct vm_area_struct;
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pte_t pte = native_ptep_get_and_clear(ptep);
+ pte_update(mm, addr, ptep);
+ return pte;
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ int full)
+{
+ pte_t pte;
+ if (full) {
+ /*
+ * Full address destruction in progress; paravirt does not
+ * care about updates and native needs no locking
+ */
+ pte = native_local_ptep_get_and_clear(ptep);
+ } else {
+ pte = ptep_get_and_clear(mm, addr, ptep);
+ }
+ return pte;
+}
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
+ pte_update(mm, addr, ptep);
+}
+
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ * dst - pointer to pgd range anywhere on a pgd page
+ * src - ""
+ * count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
+ */
+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+{
+ memcpy(dst, src, count * sizeof(pgd_t));
+}
+
+
+#include <asm-generic/pgtable.h>
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
new file mode 100644
index 00000000000..f9d5889b336
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -0,0 +1,191 @@
+#ifndef _ASM_X86_PGTABLE_32_H
+#define _ASM_X86_PGTABLE_32_H
+
+
+/*
+ * The Linux memory management assumes a three-level page table setup. On
+ * the i386, we use that, but "fold" the mid level into the top-level page
+ * table, so that we physically have the same two-level page table as the
+ * i386 mmu expects.
+ *
+ * This file contains the functions and defines necessary to modify and use
+ * the i386 page table tree.
+ */
+#ifndef __ASSEMBLY__
+#include <asm/processor.h>
+#include <asm/fixmap.h>
+#include <linux/threads.h>
+#include <asm/paravirt.h>
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+struct mm_struct;
+struct vm_area_struct;
+
+extern pgd_t swapper_pg_dir[1024];
+
+static inline void pgtable_cache_init(void) { }
+static inline void check_pgt_cache(void) { }
+void paging_init(void);
+
+extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
+
+/*
+ * The Linux x86 paging architecture is 'compile-time dual-mode'; it
+ * implements both the traditional 2-level x86 page tables and the
+ * newer 3-level PAE-mode page tables.
+ */
+#ifdef CONFIG_X86_PAE
+# include <asm/pgtable-3level-defs.h>
+# define PMD_SIZE (1UL << PMD_SHIFT)
+# define PMD_MASK (~(PMD_SIZE - 1))
+#else
+# include <asm/pgtable-2level-defs.h>
+#endif
+
+#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+/* Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 8MB value just means that there will be an 8MB "hole" after the
+ * physical memory until the kernel virtual memory starts. That means that
+ * any out-of-bounds memory accesses will hopefully be caught.
+ * The vmalloc() routines leave a hole of 4kB between each vmalloced
+ * area for the same reason. ;)
+ */
+#define VMALLOC_OFFSET (8 * 1024 * 1024)
+#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+
+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
+ & PMD_MASK)
+
+#ifdef CONFIG_HIGHMEM
+# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
+#else
+# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
+#endif
+
+#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
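The resulting layout is simple arithmetic. Assuming the usual 3G/1G split
(PAGE_OFFSET at 0xC0000000) and 512MB of lowmem -- both assumptions for the
example, not values from this header -- the numbers come out as:

#include <stdio.h>

#define VMALLOC_OFFSET (8 * 1024 * 1024)

int main(void)
{
	unsigned long page_offset = 0xC0000000UL;	/* assumed split */
	unsigned long high_memory = page_offset + (512UL << 20);

	printf("VMALLOC_START = %#lx\n", high_memory + VMALLOC_OFFSET);
	/* prints 0xe0800000: the 8MB guard hole sits between lowmem
	 * and the vmalloc area */
	return 0;
}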
+
+/*
+ * Define this if things work differently on an i386 and an i486:
+ * it will (on an i486) warn about kernel memory accesses that are
+ * done without an 'access_ok(VERIFY_WRITE,..)'
+ */
+#undef TEST_ACCESS_OK
+
+/* The boot page tables (all created as a single array) */
+extern unsigned long pg0[];
+
+#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
+
+/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE is enabled */
+#define pmd_none(x) (!(unsigned long)pmd_val((x)))
+#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
+#define pmd_bad(x) ((pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+
+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
+
+#ifdef CONFIG_X86_PAE
+# include <asm/pgtable-3level.h>
+#else
+# include <asm/pgtable-2level.h>
+#endif
+
+/*
+ * Macro to mark a page protection value as "uncacheable".
+ * On processors which do not support it, this is a no-op.
+ */
+#define pgprot_noncached(prot) \
+ ((boot_cpu_data.x86 > 3) \
+ ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
+ : (prot))
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+
+
+static inline int pud_large(pud_t pud) { return 0; }
+
+/*
+ * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
+ *
+ * this macro returns the index of the entry in the pmd page which would
+ * control the given virtual address
+ */
+#define pmd_index(address) \
+ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+
+/*
+ * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
+ *
+ * this macro returns the index of the entry in the pte page which would
+ * control the given virtual address
+ */
+#define pte_index(address) \
+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_offset_kernel(dir, address) \
+ ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
+
+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
+
+#define pmd_page_vaddr(pmd) \
+ ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
+
+#if defined(CONFIG_HIGHPTE)
+#define pte_offset_map(dir, address) \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
+ pte_index((address)))
+#define pte_offset_map_nested(dir, address) \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
+ pte_index((address)))
+#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
+#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
+#else
+#define pte_offset_map(dir, address) \
+ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
+#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
+#define pte_unmap(pte) do { } while (0)
+#define pte_unmap_nested(pte) do { } while (0)
+#endif
+
+/* Clear a kernel PTE and flush it from the TLB */
+#define kpte_clear_flush(ptep, vaddr) \
+do { \
+ pte_clear(&init_mm, (vaddr), (ptep)); \
+ __flush_tlb_one((vaddr)); \
+} while (0)
+
+/*
+ * The i386 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+#define update_mmu_cache(vma, address, pte) do { } while (0)
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * kern_addr_valid() is (1) for FLATMEM and (0) for
+ * SPARSEMEM and DISCONTIGMEM
+ */
+#ifdef CONFIG_FLATMEM
+#define kern_addr_valid(addr) (1)
+#else
+#define kern_addr_valid(kaddr) (0)
+#endif
+
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ remap_pfn_range(vma, vaddr, pfn, size, prot)
+
+#endif /* _ASM_X86_PGTABLE_32_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
new file mode 100644
index 00000000000..545a0e042bb
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -0,0 +1,285 @@
+#ifndef _ASM_X86_PGTABLE_64_H
+#define _ASM_X86_PGTABLE_64_H
+
+#include <linux/const.h>
+#ifndef __ASSEMBLY__
+
+/*
+ * This file contains the functions and defines necessary to modify and use
+ * the x86-64 page table tree.
+ */
+#include <asm/processor.h>
+#include <linux/bitops.h>
+#include <linux/threads.h>
+#include <asm/pda.h>
+
+extern pud_t level3_kernel_pgt[512];
+extern pud_t level3_ident_pgt[512];
+extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
+extern pgd_t init_level4_pgt[];
+
+#define swapper_pg_dir init_level4_pgt
+
+extern void paging_init(void);
+
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 39
+#define PTRS_PER_PGD 512
+
+/*
+ * 3rd level page
+ */
+#define PUD_SHIFT 30
+#define PTRS_PER_PUD 512
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#ifndef __ASSEMBLY__
+
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %p(%016lx).\n", \
+ __FILE__, __LINE__, &(e), pte_val(e))
+#define pmd_ERROR(e) \
+ printk("%s:%d: bad pmd %p(%016lx).\n", \
+ __FILE__, __LINE__, &(e), pmd_val(e))
+#define pud_ERROR(e) \
+ printk("%s:%d: bad pud %p(%016lx).\n", \
+ __FILE__, __LINE__, &(e), pud_val(e))
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %p(%016lx).\n", \
+ __FILE__, __LINE__, &(e), pgd_val(e))
+
+#define pgd_none(x) (!pgd_val(x))
+#define pud_none(x) (!pud_val(x))
+
+struct mm_struct;
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
+
+
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ *ptep = native_make_pte(0);
+}
+
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
+{
+ *ptep = pte;
+}
+
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+static inline pte_t native_ptep_get_and_clear(pte_t *xp)
+{
+#ifdef CONFIG_SMP
+ return native_make_pte(xchg(&xp->pte, 0));
+#else
+	/* This is native_local_ptep_get_and_clear(),
+	   duplicated here because of a cyclic header dependency */
+ pte_t ret = *xp;
+ native_pte_clear(NULL, 0, xp);
+ return ret;
+#endif
+}
+
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ *pmdp = pmd;
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+ native_set_pmd(pmd, native_make_pmd(0));
+}
+
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
+{
+ *pudp = pud;
+}
+
+static inline void native_pud_clear(pud_t *pud)
+{
+ native_set_pud(pud, native_make_pud(0));
+}
+
+static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ *pgdp = pgd;
+}
+
+static inline void native_pgd_clear(pgd_t *pgd)
+{
+ native_set_pgd(pgd, native_make_pgd(0));
+}
+
+#define pte_same(a, b) ((a).pte == (b).pte)
+
+#endif /* !__ASSEMBLY__ */
+
+#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_MASK (~(PMD_SIZE - 1))
+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+
+#define MAXMEM _AC(0x00003fffffffffff, UL)
+#define VMALLOC_START _AC(0xffffc20000000000, UL)
+#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
+#define VMEMMAP_START _AC(0xffffe20000000000, UL)
+#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
+#define MODULES_END _AC(0xffffffffff000000, UL)
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
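+
+/*
+ * Illustrative note, not part of the original header: with these
+ * constants the direct mapping covers up to 2^46 bytes (64 TB of
+ * physical memory), the vmalloc area spans 2^45 bytes (32 TB), and
+ * MODULES_LEN works out to 0x5f000000 bytes (~1.5 GB).
+ */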
+
+#ifndef __ASSEMBLY__
+
+static inline int pgd_bad(pgd_t pgd)
+{
+ return (pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+}
+
+static inline int pud_bad(pud_t pud)
+{
+ return (pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+}
+
+static inline int pmd_bad(pmd_t pmd)
+{
+ return (pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+}
+
+#define pte_none(x) (!pte_val((x)))
+#define pte_present(x) (pte_val((x)) & (_PAGE_PRESENT | _PAGE_PROTNONE))
+
+#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
+
+/*
+ * Macro to mark a page protection value as "uncacheable".
+ */
+#define pgprot_noncached(prot) \
+ (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+
+/*
+ * Level 4 access.
+ */
+#define pgd_page_vaddr(pgd) \
+ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
+#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
+#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
+static inline int pgd_large(pgd_t pgd) { return 0; }
+#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
+
+/* PUD - Level3 access */
+/* to find an entry in a page-table-directory. */
+#define pud_page_vaddr(pud) \
+ ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
+#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+#define pud_offset(pgd, address) \
+ ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
+#define pud_present(pud) (pud_val((pud)) & _PAGE_PRESENT)
+
+static inline int pud_large(pud_t pte)
+{
+ return (pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+/* PMD - Level 2 access */
+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
+
+#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
+ pmd_index(address))
+#define pmd_none(x) (!pmd_val((x)))
+#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
+#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
+#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+
+#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
+ _PAGE_FILE })
+#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
+
+/* PTE - Level 1 access. */
+
+/* page, protection -> pte */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
+
+#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
+ pte_index((address)))
+
+/* x86-64 always has all page tables mapped. */
+#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
+#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
+#define pte_unmap(pte) /* NOP */
+#define pte_unmap_nested(pte) /* NOP */
+
+#define update_mmu_cache(vma, address, pte) do { } while (0)
+
+extern int direct_gbpages;
+
+/* Encode and de-code a swap entry */
+#define __swp_type(x) (((x).val >> 1) & 0x3f)
+#define __swp_offset(x) ((x).val >> 8)
+#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \
+ ((offset) << 8) })
+#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
+#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
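+
+/*
+ * Illustrative sketch, not part of the original header: round-tripping
+ * a swap entry through the encode/de-code macros above, for a
+ * hypothetical swap type 2 and offset 0x1000:
+ *
+ *	swp_entry_t e = __swp_entry(2, 0x1000);
+ *	e.val           == (2 << 1) | (0x1000 << 8) == 0x100004
+ *	__swp_type(e)   == (0x100004 >> 1) & 0x3f   == 2
+ *	__swp_offset(e) == 0x100004 >> 8            == 0x1000
+ *
+ * __swp_entry_to_pte(e) then yields a pte whose raw value is e.val.
+ */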
+
+extern int kern_addr_valid(unsigned long addr);
+extern void cleanup_highmap(void);
+
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ remap_pfn_range(vma, vaddr, pfn, size, prot)
+
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+
+#define pgtable_cache_init() do { } while (0)
+#define check_pgt_cache() do { } while (0)
+
+#define PAGE_AGP PAGE_KERNEL_NOCACHE
+#define HAVE_PAGE_AGP 1
+
+/* fs/proc/kcore.c */
+#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
+#define kc_offset_to_vaddr(o) \
+ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
+ ? ((o) | ~__VIRTUAL_MASK) \
+ : (o))
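+
+/*
+ * Illustrative note, not part of the original header: assuming
+ * __VIRTUAL_MASK_SHIFT == 48 as on x86-64 here, kc_offset_to_vaddr()
+ * sign-extends bit 47: a hypothetical offset 0x0000800000000000 maps
+ * back to the canonical address 0xffff800000000000, while
+ * 0x0000100000000000 is returned unchanged.
+ */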
+
+#define __HAVE_ARCH_PTE_SAME
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/poll.h b/arch/x86/include/asm/poll.h
new file mode 100644
index 00000000000..c98509d3149
--- /dev/null
+++ b/arch/x86/include/asm/poll.h
@@ -0,0 +1 @@
+#include <asm-generic/poll.h>
diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h
new file mode 100644
index 00000000000..bb7133dc155
--- /dev/null
+++ b/arch/x86/include/asm/posix_types.h
@@ -0,0 +1,13 @@
+#ifdef __KERNEL__
+# ifdef CONFIG_X86_32
+# include "posix_types_32.h"
+# else
+# include "posix_types_64.h"
+# endif
+#else
+# ifdef __i386__
+# include "posix_types_32.h"
+# else
+# include "posix_types_64.h"
+# endif
+#endif
diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h
new file mode 100644
index 00000000000..f7d9adf82e5
--- /dev/null
+++ b/arch/x86/include/asm/posix_types_32.h
@@ -0,0 +1,85 @@
+#ifndef _ASM_X86_POSIX_TYPES_32_H
+#define _ASM_X86_POSIX_TYPES_32_H
+
+/*
+ * This file is generally used by user-level software, so you need to
+ * be a little careful about namespace pollution etc. Also, we cannot
+ * assume GCC is being used.
+ */
+
+typedef unsigned long __kernel_ino_t;
+typedef unsigned short __kernel_mode_t;
+typedef unsigned short __kernel_nlink_t;
+typedef long __kernel_off_t;
+typedef int __kernel_pid_t;
+typedef unsigned short __kernel_ipc_pid_t;
+typedef unsigned short __kernel_uid_t;
+typedef unsigned short __kernel_gid_t;
+typedef unsigned int __kernel_size_t;
+typedef int __kernel_ssize_t;
+typedef int __kernel_ptrdiff_t;
+typedef long __kernel_time_t;
+typedef long __kernel_suseconds_t;
+typedef long __kernel_clock_t;
+typedef int __kernel_timer_t;
+typedef int __kernel_clockid_t;
+typedef int __kernel_daddr_t;
+typedef char * __kernel_caddr_t;
+typedef unsigned short __kernel_uid16_t;
+typedef unsigned short __kernel_gid16_t;
+typedef unsigned int __kernel_uid32_t;
+typedef unsigned int __kernel_gid32_t;
+
+typedef unsigned short __kernel_old_uid_t;
+typedef unsigned short __kernel_old_gid_t;
+typedef unsigned short __kernel_old_dev_t;
+
+#ifdef __GNUC__
+typedef long long __kernel_loff_t;
+#endif
+
+typedef struct {
+ int val[2];
+} __kernel_fsid_t;
+
+#if defined(__KERNEL__)
+
+#undef __FD_SET
+#define __FD_SET(fd,fdsetp) \
+ asm volatile("btsl %1,%0": \
+ "+m" (*(__kernel_fd_set *)(fdsetp)) \
+ : "r" ((int)(fd)))
+
+#undef __FD_CLR
+#define __FD_CLR(fd,fdsetp) \
+ asm volatile("btrl %1,%0": \
+ "+m" (*(__kernel_fd_set *)(fdsetp)) \
+ : "r" ((int) (fd)))
+
+#undef __FD_ISSET
+#define __FD_ISSET(fd,fdsetp) \
+ (__extension__ \
+ ({ \
+ unsigned char __result; \
+ asm volatile("btl %1,%2 ; setb %0" \
+ : "=q" (__result) \
+ : "r" ((int)(fd)), \
+ "m" (*(__kernel_fd_set *)(fdsetp))); \
+ __result; \
+}))
+
+#undef __FD_ZERO
+#define __FD_ZERO(fdsetp) \
+do { \
+ int __d0, __d1; \
+ asm volatile("cld ; rep ; stosl" \
+ : "=m" (*(__kernel_fd_set *)(fdsetp)), \
+ "=&c" (__d0), "=&D" (__d1) \
+ : "a" (0), "1" (__FDSET_LONGS), \
+ "2" ((__kernel_fd_set *)(fdsetp)) \
+ : "memory"); \
+} while (0)
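+
+/*
+ * Illustrative sketch, not part of the original header: typical use of
+ * the fd_set helpers above (hypothetical snippet):
+ *
+ *	__kernel_fd_set set;
+ *
+ *	__FD_ZERO(&set);		clear all bits (rep stosl)
+ *	__FD_SET(5, &set);		set bit 5 (btsl)
+ *	if (__FD_ISSET(5, &set))	test bit 5 (btl + setb)
+ *		...;
+ *	__FD_CLR(5, &set);		clear bit 5 (btrl)
+ */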
+
+#endif /* defined(__KERNEL__) */
+
+#endif /* _ASM_X86_POSIX_TYPES_32_H */
diff --git a/arch/x86/include/asm/posix_types_64.h b/arch/x86/include/asm/posix_types_64.h
new file mode 100644
index 00000000000..eb8d2d92b63
--- /dev/null
+++ b/arch/x86/include/asm/posix_types_64.h
@@ -0,0 +1,119 @@
+#ifndef _ASM_X86_POSIX_TYPES_64_H
+#define _ASM_X86_POSIX_TYPES_64_H
+
+/*
+ * This file is generally used by user-level software, so you need to
+ * be a little careful about namespace pollution etc. Also, we cannot
+ * assume GCC is being used.
+ */
+
+typedef unsigned long __kernel_ino_t;
+typedef unsigned int __kernel_mode_t;
+typedef unsigned long __kernel_nlink_t;
+typedef long __kernel_off_t;
+typedef int __kernel_pid_t;
+typedef int __kernel_ipc_pid_t;
+typedef unsigned int __kernel_uid_t;
+typedef unsigned int __kernel_gid_t;
+typedef unsigned long __kernel_size_t;
+typedef long __kernel_ssize_t;
+typedef long __kernel_ptrdiff_t;
+typedef long __kernel_time_t;
+typedef long __kernel_suseconds_t;
+typedef long __kernel_clock_t;
+typedef int __kernel_timer_t;
+typedef int __kernel_clockid_t;
+typedef int __kernel_daddr_t;
+typedef char * __kernel_caddr_t;
+typedef unsigned short __kernel_uid16_t;
+typedef unsigned short __kernel_gid16_t;
+
+#ifdef __GNUC__
+typedef long long __kernel_loff_t;
+#endif
+
+typedef struct {
+ int val[2];
+} __kernel_fsid_t;
+
+typedef unsigned short __kernel_old_uid_t;
+typedef unsigned short __kernel_old_gid_t;
+typedef __kernel_uid_t __kernel_uid32_t;
+typedef __kernel_gid_t __kernel_gid32_t;
+
+typedef unsigned long __kernel_old_dev_t;
+
+#ifdef __KERNEL__
+
+#undef __FD_SET
+static inline void __FD_SET(unsigned long fd, __kernel_fd_set *fdsetp)
+{
+ unsigned long _tmp = fd / __NFDBITS;
+ unsigned long _rem = fd % __NFDBITS;
+ fdsetp->fds_bits[_tmp] |= (1UL<<_rem);
+}
+
+#undef __FD_CLR
+static inline void __FD_CLR(unsigned long fd, __kernel_fd_set *fdsetp)
+{
+ unsigned long _tmp = fd / __NFDBITS;
+ unsigned long _rem = fd % __NFDBITS;
+ fdsetp->fds_bits[_tmp] &= ~(1UL<<_rem);
+}
+
+#undef __FD_ISSET
+static inline int __FD_ISSET(unsigned long fd, __const__ __kernel_fd_set *p)
+{
+ unsigned long _tmp = fd / __NFDBITS;
+ unsigned long _rem = fd % __NFDBITS;
+ return (p->fds_bits[_tmp] & (1UL<<_rem)) != 0;
+}
+
+/*
+ * This will unroll the loop for the normal constant cases (8 or 32 longs,
+ * for 256 and 1024-bit fd_sets respectively)
+ */
+#undef __FD_ZERO
+static inline void __FD_ZERO(__kernel_fd_set *p)
+{
+ unsigned long *tmp = p->fds_bits;
+ int i;
+
+ if (__builtin_constant_p(__FDSET_LONGS)) {
+ switch (__FDSET_LONGS) {
+ case 32:
+ tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
+ tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
+ tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
+ tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
+ tmp[16] = 0; tmp[17] = 0; tmp[18] = 0; tmp[19] = 0;
+ tmp[20] = 0; tmp[21] = 0; tmp[22] = 0; tmp[23] = 0;
+ tmp[24] = 0; tmp[25] = 0; tmp[26] = 0; tmp[27] = 0;
+ tmp[28] = 0; tmp[29] = 0; tmp[30] = 0; tmp[31] = 0;
+ return;
+ case 16:
+ tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
+ tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
+ tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
+ tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
+ return;
+ case 8:
+ tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
+ tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
+ return;
+ case 4:
+ tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
+ return;
+ }
+ }
+ i = __FDSET_LONGS;
+ while (i) {
+ i--;
+ *tmp = 0;
+ tmp++;
+ }
+}
+
+#endif /* defined(__KERNEL__) */
+
+#endif /* _ASM_X86_POSIX_TYPES_64_H */
diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h
new file mode 100644
index 00000000000..fe681147a4f
--- /dev/null
+++ b/arch/x86/include/asm/prctl.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_PRCTL_H
+#define _ASM_X86_PRCTL_H
+
+#define ARCH_SET_GS 0x1001
+#define ARCH_SET_FS 0x1002
+#define ARCH_GET_FS 0x1003
+#define ARCH_GET_GS 0x1004
+
+
+#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h
new file mode 100644
index 00000000000..1198f2a0e42
--- /dev/null
+++ b/arch/x86/include/asm/processor-cyrix.h
@@ -0,0 +1,38 @@
+/*
+ * NSC/Cyrix CPU indexed register access. These must be inline
+ * functions rather than macros to ensure correct access ordering.
+ * The access order is always 0x22 (= offset), then 0x23 (= value).
+ *
+ * When using the old macros a line like
+ * setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+ * gets expanded to:
+ * do {
+ * outb((CX86_CCR2), 0x22);
+ * outb((({
+ * outb((CX86_CCR2), 0x22);
+ * inb(0x23);
+ * }) | 0x88), 0x23);
+ * } while (0);
+ *
+ * which in fact violates the access order (= 0x22, 0x22, 0x23, 0x23).
+ */
+
+static inline u8 getCx86(u8 reg)
+{
+ outb(reg, 0x22);
+ return inb(0x23);
+}
+
+static inline void setCx86(u8 reg, u8 data)
+{
+ outb(reg, 0x22);
+ outb(data, 0x23);
+}
+
+#define getCx86_old(reg) ({ outb((reg), 0x22); inb(0x23); })
+
+#define setCx86_old(reg, data) do { \
+ outb((reg), 0x22); \
+ outb((data), 0x23); \
+} while (0)
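+
+/*
+ * Illustrative note, not part of the original header: with the inline
+ * versions above, the problematic line from the comment at the top,
+ *
+ *	setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+ *
+ * evaluates getCx86() completely (0x22 write, 0x23 read) before
+ * setCx86() starts its own 0x22/0x23 pair, preserving the required
+ * 0x22, 0x23, 0x22, 0x23 ordering.
+ */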
+
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
new file mode 100644
index 00000000000..7a3e836eb2a
--- /dev/null
+++ b/arch/x86/include/asm/processor-flags.h
@@ -0,0 +1,100 @@
+#ifndef _ASM_X86_PROCESSOR_FLAGS_H
+#define _ASM_X86_PROCESSOR_FLAGS_H
+/* Various flags defined: can be included from assembler. */
+
+/*
+ * EFLAGS bits
+ */
+#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
+#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
+#define X86_EFLAGS_AF	0x00000010 /* Auxiliary carry Flag */
+#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
+#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
+#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
+#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
+#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
+#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
+#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
+#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
+#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
+#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
+#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
+#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
+#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
+#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
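+
+/*
+ * Illustrative note, not part of the original header: these masks are
+ * meant to be tested against a saved flags word, e.g. the hypothetical
+ *
+ *	if (regs->flags & X86_EFLAGS_IF)
+ *		...;	interrupts were enabled when the trap happened
+ *
+ * IOPL is a two-bit field; extract it as
+ * (flags & X86_EFLAGS_IOPL) >> 12.
+ */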
+
+/*
+ * Basic CPU control in CR0
+ */
+#define X86_CR0_PE 0x00000001 /* Protection Enable */
+#define X86_CR0_MP 0x00000002 /* Monitor Coprocessor */
+#define X86_CR0_EM 0x00000004 /* Emulation */
+#define X86_CR0_TS 0x00000008 /* Task Switched */
+#define X86_CR0_ET 0x00000010 /* Extension Type */
+#define X86_CR0_NE 0x00000020 /* Numeric Error */
+#define X86_CR0_WP 0x00010000 /* Write Protect */
+#define X86_CR0_AM 0x00040000 /* Alignment Mask */
+#define X86_CR0_NW 0x20000000 /* Not Write-through */
+#define X86_CR0_CD 0x40000000 /* Cache Disable */
+#define X86_CR0_PG 0x80000000 /* Paging */
+
+/*
+ * Paging options in CR3
+ */
+#define X86_CR3_PWT 0x00000008 /* Page Write Through */
+#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
+
+/*
+ * Intel CPU features in CR4
+ */
+#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */
+#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */
+#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */
+#define X86_CR4_DE 0x00000008 /* enable debugging extensions */
+#define X86_CR4_PSE 0x00000010 /* enable page size extensions */
+#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */
+#define X86_CR4_MCE 0x00000040 /* Machine check enable */
+#define X86_CR4_PGE 0x00000080 /* enable global pages */
+#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */
+#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
+#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
+#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
+#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
+
+/*
+ * x86-64 Task Priority Register, CR8
+ */
+#define X86_CR8_TPR 0x0000000F /* task priority register */
+
+/*
+ * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h>
+ */
+
+/*
+ * NSC/Cyrix CPU configuration register indexes
+ */
+#define CX86_PCR0 0x20
+#define CX86_GCR 0xb8
+#define CX86_CCR0 0xc0
+#define CX86_CCR1 0xc1
+#define CX86_CCR2 0xc2
+#define CX86_CCR3 0xc3
+#define CX86_CCR4 0xe8
+#define CX86_CCR5 0xe9
+#define CX86_CCR6 0xea
+#define CX86_CCR7 0xeb
+#define CX86_PCR1 0xf0
+#define CX86_DIR0 0xfe
+#define CX86_DIR1 0xff
+#define CX86_ARR_BASE 0xc4
+#define CX86_RCR_BASE 0xdc
+
+#ifdef __KERNEL__
+#ifdef CONFIG_VM86
+#define X86_VM_MASK X86_EFLAGS_VM
+#else
+#define X86_VM_MASK 0 /* No VM86 support */
+#endif
+#endif
+
+#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
new file mode 100644
index 00000000000..5ca01e38326
--- /dev/null
+++ b/arch/x86/include/asm/processor.h
@@ -0,0 +1,936 @@
+#ifndef _ASM_X86_PROCESSOR_H
+#define _ASM_X86_PROCESSOR_H
+
+#include <asm/processor-flags.h>
+
+/* Forward declaration, a strange C thing */
+struct task_struct;
+struct mm_struct;
+
+#include <asm/vm86.h>
+#include <asm/math_emu.h>
+#include <asm/segment.h>
+#include <asm/types.h>
+#include <asm/sigcontext.h>
+#include <asm/current.h>
+#include <asm/cpufeature.h>
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/percpu.h>
+#include <asm/msr.h>
+#include <asm/desc_defs.h>
+#include <asm/nops.h>
+#include <asm/ds.h>
+
+#include <linux/personality.h>
+#include <linux/cpumask.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+
+/*
+ * Default implementation of macro that returns current
+ * instruction pointer ("program counter").
+ */
+static inline void *current_text_addr(void)
+{
+ void *pc;
+
+ asm volatile("mov $1f, %0; 1:":"=r" (pc));
+
+ return pc;
+}
+
+#ifdef CONFIG_X86_VSMP
+# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
+# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
+#else
+# define ARCH_MIN_TASKALIGN 16
+# define ARCH_MIN_MMSTRUCT_ALIGN 0
+#endif
+
+/*
+ * CPU type and hardware bug flags. Kept separately for each CPU.
+ * Members of this structure are referenced in head.S, so think twice
+ * before touching them. [mj]
+ */
+
+struct cpuinfo_x86 {
+ __u8 x86; /* CPU family */
+ __u8 x86_vendor; /* CPU vendor */
+ __u8 x86_model;
+ __u8 x86_mask;
+#ifdef CONFIG_X86_32
+ char wp_works_ok; /* It doesn't on 386's */
+
+ /* Problems on some 486Dx4's and old 386's: */
+ char hlt_works_ok;
+ char hard_math;
+ char rfu;
+ char fdiv_bug;
+ char f00f_bug;
+ char coma_bug;
+ char pad0;
+#else
+	/* Number of 4K pages in DTLB/ITLB combined (in pages): */
+ int x86_tlbsize;
+ __u8 x86_virt_bits;
+ __u8 x86_phys_bits;
+#endif
+ /* CPUID returned core id bits: */
+ __u8 x86_coreid_bits;
+ /* Max extended CPUID function supported: */
+ __u32 extended_cpuid_level;
+ /* Maximum supported CPUID level, -1=no CPUID: */
+ int cpuid_level;
+ __u32 x86_capability[NCAPINTS];
+ char x86_vendor_id[16];
+ char x86_model_id[64];
+ /* in KB - valid for CPUS which support this call: */
+ int x86_cache_size;
+ int x86_cache_alignment; /* In bytes */
+ int x86_power;
+ unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+ /* cpus sharing the last level cache: */
+ cpumask_t llc_shared_map;
+#endif
+ /* cpuid returned max cores value: */
+ u16 x86_max_cores;
+ u16 apicid;
+ u16 initial_apicid;
+ u16 x86_clflush_size;
+#ifdef CONFIG_SMP
+ /* number of cores as seen by the OS: */
+ u16 booted_cores;
+ /* Physical processor id: */
+ u16 phys_proc_id;
+ /* Core id: */
+ u16 cpu_core_id;
+ /* Index into per_cpu list: */
+ u16 cpu_index;
+#endif
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define X86_VENDOR_INTEL 0
+#define X86_VENDOR_CYRIX 1
+#define X86_VENDOR_AMD 2
+#define X86_VENDOR_UMC 3
+#define X86_VENDOR_CENTAUR 5
+#define X86_VENDOR_TRANSMETA 7
+#define X86_VENDOR_NSC 8
+#define X86_VENDOR_NUM 9
+
+#define X86_VENDOR_UNKNOWN 0xff
+
+/*
+ * capabilities of CPUs
+ */
+extern struct cpuinfo_x86 boot_cpu_data;
+extern struct cpuinfo_x86 new_cpu_data;
+
+extern struct tss_struct doublefault_tss;
+extern __u32 cleared_cpu_caps[NCAPINTS];
+
+#ifdef CONFIG_SMP
+DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+#define cpu_data(cpu) per_cpu(cpu_info, cpu)
+#define current_cpu_data __get_cpu_var(cpu_info)
+#else
+#define cpu_data(cpu) boot_cpu_data
+#define current_cpu_data boot_cpu_data
+#endif
+
+extern const struct seq_operations cpuinfo_op;
+
+static inline int hlt_works(int cpu)
+{
+#ifdef CONFIG_X86_32
+ return cpu_data(cpu).hlt_works_ok;
+#else
+ return 1;
+#endif
+}
+
+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+
+extern void cpu_detect(struct cpuinfo_x86 *c);
+
+extern struct pt_regs *idle_regs(struct pt_regs *);
+
+extern void early_cpu_init(void);
+extern void identify_boot_cpu(void);
+extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+extern void print_cpu_info(struct cpuinfo_x86 *);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+extern void detect_extended_topology(struct cpuinfo_x86 *c);
+extern void detect_ht(struct cpuinfo_x86 *c);
+
+static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+static inline void load_cr3(pgd_t *pgdir)
+{
+ write_cr3(__pa(pgdir));
+}
+
+#ifdef CONFIG_X86_32
+/* This is the TSS defined by the hardware. */
+struct x86_hw_tss {
+ unsigned short back_link, __blh;
+ unsigned long sp0;
+ unsigned short ss0, __ss0h;
+ unsigned long sp1;
+ /* ss1 caches MSR_IA32_SYSENTER_CS: */
+ unsigned short ss1, __ss1h;
+ unsigned long sp2;
+ unsigned short ss2, __ss2h;
+ unsigned long __cr3;
+ unsigned long ip;
+ unsigned long flags;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long bx;
+ unsigned long sp;
+ unsigned long bp;
+ unsigned long si;
+ unsigned long di;
+ unsigned short es, __esh;
+ unsigned short cs, __csh;
+ unsigned short ss, __ssh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+ unsigned short ldt, __ldth;
+ unsigned short trace;
+ unsigned short io_bitmap_base;
+
+} __attribute__((packed));
+#else
+struct x86_hw_tss {
+ u32 reserved1;
+ u64 sp0;
+ u64 sp1;
+ u64 sp2;
+ u64 reserved2;
+ u64 ist[7];
+ u32 reserved3;
+ u32 reserved4;
+ u16 reserved5;
+ u16 io_bitmap_base;
+
+} __attribute__((packed)) ____cacheline_aligned;
+#endif
+
+/*
+ * IO-bitmap sizes:
+ */
+#define IO_BITMAP_BITS 65536
+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+#define INVALID_IO_BITMAP_OFFSET 0x8000
+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
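+
+/*
+ * Illustrative note, not part of the original header: 65536 ports at
+ * one bit each give IO_BITMAP_BYTES == 8192, i.e. IO_BITMAP_LONGS is
+ * 2048 on 32-bit or 1024 on 64-bit for the io_bitmap array below.
+ */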
+
+struct tss_struct {
+ /*
+ * The hardware state:
+ */
+ struct x86_hw_tss x86_tss;
+
+ /*
+ * The extra 1 is there because the CPU will access an
+ * additional byte beyond the end of the IO permission
+ * bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+ /*
+ * Cache the current maximum and the last task that used the bitmap:
+ */
+ unsigned long io_bitmap_max;
+ struct thread_struct *io_bitmap_owner;
+
+ /*
+ * .. and then another 0x100 bytes for the emergency kernel stack:
+ */
+ unsigned long stack[64];
+
+} ____cacheline_aligned;
+
+DECLARE_PER_CPU(struct tss_struct, init_tss);
+
+/*
+ * Save the original ist values for checking stack pointers during debugging
+ */
+struct orig_ist {
+ unsigned long ist[7];
+};
+
+#define MXCSR_DEFAULT 0x1f80
+
+struct i387_fsave_struct {
+ u32 cwd; /* FPU Control Word */
+ u32 swd; /* FPU Status Word */
+ u32 twd; /* FPU Tag Word */
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Pointer Offset */
+ u32 fos; /* FPU Operand Pointer Selector */
+
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+
+ /* Software status information [not touched by FSAVE ]: */
+ u32 status;
+};
+
+struct i387_fxsave_struct {
+ u16 cwd; /* Control Word */
+ u16 swd; /* Status Word */
+ u16 twd; /* Tag Word */
+ u16 fop; /* Last Instruction Opcode */
+ union {
+ struct {
+ u64 rip; /* Instruction Pointer */
+ u64 rdp; /* Data Pointer */
+ };
+ struct {
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Offset */
+ u32 fos; /* FPU Operand Selector */
+ };
+ };
+ u32 mxcsr; /* MXCSR Register State */
+ u32 mxcsr_mask; /* MXCSR Mask */
+
+ /* 8*16 bytes for each FP-reg = 128 bytes: */
+ u32 st_space[32];
+
+ /* 16*16 bytes for each XMM-reg = 256 bytes: */
+ u32 xmm_space[64];
+
+ u32 padding[12];
+
+ union {
+ u32 padding1[12];
+ u32 sw_reserved[12];
+ };
+
+} __attribute__((aligned(16)));
+
+struct i387_soft_struct {
+ u32 cwd;
+ u32 swd;
+ u32 twd;
+ u32 fip;
+ u32 fcs;
+ u32 foo;
+ u32 fos;
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+ u8 ftop;
+ u8 changed;
+ u8 lookahead;
+ u8 no_update;
+ u8 rm;
+ u8 alimit;
+ struct info *info;
+ u32 entry_eip;
+};
+
+struct xsave_hdr_struct {
+ u64 xstate_bv;
+ u64 reserved1[2];
+ u64 reserved2[5];
+} __attribute__((packed));
+
+struct xsave_struct {
+ struct i387_fxsave_struct i387;
+ struct xsave_hdr_struct xsave_hdr;
+ /* new processor state extensions will go here */
+} __attribute__ ((packed, aligned (64)));
+
+union thread_xstate {
+ struct i387_fsave_struct fsave;
+ struct i387_fxsave_struct fxsave;
+ struct i387_soft_struct soft;
+ struct xsave_struct xsave;
+};
+
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+extern void print_cpu_info(struct cpuinfo_x86 *);
+extern unsigned int xstate_size;
+extern void free_thread_xstate(struct task_struct *);
+extern struct kmem_cache *task_xstate_cachep;
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+struct thread_struct {
+ /* Cached TLS descriptors: */
+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+ unsigned long sp0;
+ unsigned long sp;
+#ifdef CONFIG_X86_32
+ unsigned long sysenter_cs;
+#else
+ unsigned long usersp; /* Copy from PDA */
+ unsigned short es;
+ unsigned short ds;
+ unsigned short fsindex;
+ unsigned short gsindex;
+#endif
+ unsigned long ip;
+ unsigned long fs;
+ unsigned long gs;
+ /* Hardware debugging registers: */
+ unsigned long debugreg0;
+ unsigned long debugreg1;
+ unsigned long debugreg2;
+ unsigned long debugreg3;
+ unsigned long debugreg6;
+ unsigned long debugreg7;
+ /* Fault info: */
+ unsigned long cr2;
+ unsigned long trap_no;
+ unsigned long error_code;
+ /* floating point and extended processor state */
+ union thread_xstate *xstate;
+#ifdef CONFIG_X86_32
+ /* Virtual 86 mode info */
+ struct vm86_struct __user *vm86_info;
+ unsigned long screen_bitmap;
+ unsigned long v86flags;
+ unsigned long v86mask;
+ unsigned long saved_sp0;
+ unsigned int saved_fs;
+ unsigned int saved_gs;
+#endif
+ /* IO permissions: */
+ unsigned long *io_bitmap_ptr;
+ unsigned long iopl;
+ /* Max allowed port in the bitmap, in bytes: */
+ unsigned io_bitmap_max;
+/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
+ unsigned long debugctlmsr;
+#ifdef CONFIG_X86_DS
+/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
+ struct ds_context *ds_ctx;
+#endif /* CONFIG_X86_DS */
+#ifdef CONFIG_X86_PTRACE_BTS
+/* the signal to send on a bts buffer overflow */
+ unsigned int bts_ovfl_signal;
+#endif /* CONFIG_X86_PTRACE_BTS */
+};
+
+static inline unsigned long native_get_debugreg(int regno)
+{
+ unsigned long val = 0; /* Damn you, gcc! */
+
+ switch (regno) {
+ case 0:
+ asm("mov %%db0, %0" :"=r" (val));
+ break;
+ case 1:
+ asm("mov %%db1, %0" :"=r" (val));
+ break;
+ case 2:
+ asm("mov %%db2, %0" :"=r" (val));
+ break;
+ case 3:
+ asm("mov %%db3, %0" :"=r" (val));
+ break;
+ case 6:
+ asm("mov %%db6, %0" :"=r" (val));
+ break;
+ case 7:
+ asm("mov %%db7, %0" :"=r" (val));
+ break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static inline void native_set_debugreg(int regno, unsigned long value)
+{
+ switch (regno) {
+ case 0:
+ asm("mov %0, %%db0" ::"r" (value));
+ break;
+ case 1:
+ asm("mov %0, %%db1" ::"r" (value));
+ break;
+ case 2:
+ asm("mov %0, %%db2" ::"r" (value));
+ break;
+ case 3:
+ asm("mov %0, %%db3" ::"r" (value));
+ break;
+ case 6:
+ asm("mov %0, %%db6" ::"r" (value));
+ break;
+ case 7:
+ asm("mov %0, %%db7" ::"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void native_set_iopl_mask(unsigned mask)
+{
+#ifdef CONFIG_X86_32
+ unsigned int reg;
+
+ asm volatile ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+#endif
+}
+
+static inline void
+native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+{
+ tss->x86_tss.sp0 = thread->sp0;
+#ifdef CONFIG_X86_32
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+ tss->x86_tss.ss1 = thread->sysenter_cs;
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+ }
+#endif
+}
+
+static inline void native_swapgs(void)
+{
+#ifdef CONFIG_X86_64
+ asm volatile("swapgs" ::: "memory");
+#endif
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __cpuid native_cpuid
+#define paravirt_enabled() 0
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+ (var) = native_get_debugreg(register)
+#define set_debugreg(value, register) \
+ native_set_debugreg(register, value)
+
+static inline void load_sp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ native_load_sp0(tss, thread);
+}
+
+#define set_iopl_mask native_set_iopl_mask
+#endif /* CONFIG_PARAVIRT */
+
+/*
+ * Save the cr4 feature set we're using (i.e.
+ * Pentium 4MB enable and PPro Global page
+ * enable), so that any CPUs that boot up
+ * after us can get the correct flags.
+ */
+extern unsigned long mmu_cr4_features;
+
+static inline void set_in_cr4(unsigned long mask)
+{
+ unsigned cr4;
+
+ mmu_cr4_features |= mask;
+ cr4 = read_cr4();
+ cr4 |= mask;
+ write_cr4(cr4);
+}
+
+static inline void clear_in_cr4(unsigned long mask)
+{
+ unsigned cr4;
+
+ mmu_cr4_features &= ~mask;
+ cr4 = read_cr4();
+ cr4 &= ~mask;
+ write_cr4(cr4);
+}
+
+typedef struct {
+ unsigned long seg;
+} mm_segment_t;
+
+
+/*
+ * create a kernel thread without removing it from tasklists
+ */
+extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+
+/* Free all resources held by a thread. */
+extern void release_thread(struct task_struct *);
+
+/* Prepare to copy thread state - unlazy all lazy state */
+extern void prepare_to_copy(struct task_struct *tsk);
+
+unsigned long get_wchan(struct task_struct *p);
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx,
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(unsigned int op, int count,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return eax;
+}
+
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ebx;
+}
+
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ecx;
+}
+
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return edx;
+}
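+
+/*
+ * Illustrative sketch, not part of the original header: a hypothetical
+ * helper showing how the accessors above combine. CPUID leaf 1 returns
+ * the family in eax bits 11:8, plus an extended family field in bits
+ * 27:20 that is added in for family 0xf parts.
+ */
+static inline unsigned int example_cpu_family(void)
+{
+	unsigned int eax = cpuid_eax(1);	/* leaf 1: version info */
+	unsigned int family = (eax >> 8) & 0xf;
+
+	if (family == 0xf)			/* extended family in use */
+		family += (eax >> 20) & 0xff;
+	return family;
+}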
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+ asm volatile("rep; nop" ::: "memory");
+}
+
+static inline void cpu_relax(void)
+{
+ rep_nop();
+}
+
+/* Stop speculative execution: */
+static inline void sync_core(void)
+{
+ int tmp;
+
+ asm volatile("cpuid" : "=a" (tmp) : "0" (1)
+ : "ebx", "ecx", "edx", "memory");
+}
+
+static inline void __monitor(const void *eax, unsigned long ecx,
+ unsigned long edx)
+{
+ /* "monitor %eax, %ecx, %edx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc8;"
+ :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+ /* "mwait %eax, %ecx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
+
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+ trace_hardirqs_on();
+ /* "mwait %eax, %ecx;" */
+ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
+
+extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+
+extern void select_idle_routine(const struct cpuinfo_x86 *c);
+
+extern unsigned long boot_option_idle_override;
+extern unsigned long idle_halt;
+extern unsigned long idle_nomwait;
+
+/*
+ * On systems with caches, caches must be flushed as the absolute
+ * last instruction before going into a suspended halt. Otherwise,
+ * dirty data can linger in the cache and become stale on resume,
+ * leading to strange errors.
+ *
+ * Perform a variety of operations to guarantee that the compiler
+ * will not reorder instructions. wbinvd itself is serializing,
+ * so the processor will not reorder.
+ *
+ * Systems without caches can just go into halt.
+ */
+static inline void wbinvd_halt(void)
+{
+ mb();
+ /* check for clflush to determine if wbinvd is legal */
+ if (cpu_has_clflush)
+ asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
+ else
+ while (1)
+ halt();
+}
+
+extern void enable_sep_cpu(void);
+extern int sysenter_setup(void);
+
+/* Defined in head.S */
+extern struct desc_ptr early_gdt_descr;
+
+extern void cpu_set_gdt(int);
+extern void switch_to_new_gdt(void);
+extern void cpu_init(void);
+extern void init_gdt(int cpu);
+
+static inline void update_debugctlmsr(unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return;
+#endif
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+}
+
+/*
+ * from system description table in BIOS. Mostly for MCA use, but
+ * others may find it useful:
+ */
+extern unsigned int machine_id;
+extern unsigned int machine_submodel_id;
+extern unsigned int BIOS_revision;
+
+/* Boot loader type from the setup header: */
+extern int bootloader_type;
+
+extern char ignore_fpu_irq;
+
+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+
+#ifdef CONFIG_X86_32
+# define BASE_PREFETCH ASM_NOP4
+# define ARCH_HAS_PREFETCH
+#else
+# define BASE_PREFETCH "prefetcht0 (%1)"
+#endif
+
+/*
+ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
+ *
+ * It's not worth caring about 3dnow prefetches for the K6
+ * because they are microcoded there and very slow.
+ */
+static inline void prefetch(const void *x)
+{
+ alternative_input(BASE_PREFETCH,
+ "prefetchnta (%1)",
+ X86_FEATURE_XMM,
+ "r" (x));
+}
+
+/*
+ * 3dnow prefetch to get an exclusive cache line.
+ * Useful for spinlocks to avoid one state transition in the
+ * cache coherency protocol:
+ */
+static inline void prefetchw(const void *x)
+{
+ alternative_input(BASE_PREFETCH,
+ "prefetchw (%1)",
+ X86_FEATURE_3DNOW,
+ "r" (x));
+}
+
+static inline void spin_lock_prefetch(const void *x)
+{
+ prefetchw(x);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * User space process size: 3GB (default).
+ */
+#define TASK_SIZE PAGE_OFFSET
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX STACK_TOP
+
+#define INIT_THREAD { \
+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .vm86_info = NULL, \
+ .sysenter_cs = __KERNEL_CS, \
+ .io_bitmap_ptr = NULL, \
+ .fs = __KERNEL_PERCPU, \
+}
+
+/*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+#define INIT_TSS { \
+ .x86_tss = { \
+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .ss0 = __KERNEL_DS, \
+ .ss1 = __KERNEL_CS, \
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
+ }, \
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
+}
+
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
+#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
+#define KSTK_TOP(info) \
+({ \
+ unsigned long *__ptr = (unsigned long *)(info); \
+ (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
+})
+
+/*
+ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * This is necessary to guarantee that the entire "struct pt_regs"
+ * is accessible even if the CPU hasn't stored the SS/ESP registers
+ * on the stack (the interrupt gate does not save these registers
+ * when switching to the same privilege ring).
+ * Therefore beware: accessing the ss/esp fields of the
+ * "struct pt_regs" is possible, but they may contain
+ * completely wrong values.
+ */
+#define task_pt_regs(task) \
+({ \
+ struct pt_regs *__regs__; \
+ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+ __regs__ - 1; \
+})
+
+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+
+#else
+/*
+ * User space process size: 47 bits minus one guard page.
+ */
+#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
+ 0xc0000000 : 0xFFFFe000)
+
+#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE64)
+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE64)
+
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX TASK_SIZE64
+
+#define INIT_THREAD { \
+ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+#define INIT_TSS { \
+ .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+/*
+ * Return saved PC of a blocked thread.
+ * What is this good for? It will always be the scheduler or ret_from_fork.
+ */
+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
+
+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
+#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
+#endif /* CONFIG_X86_64 */
+
+extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp);
+
+/*
+ * This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+
+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+
+/* Get/set a process' ability to use the timestamp counter instruction */
+#define GET_TSC_CTL(adr) get_tsc_mode((adr))
+#define SET_TSC_CTL(val) set_tsc_mode((val))
+
+extern int get_tsc_mode(unsigned long adr);
+extern int set_tsc_mode(unsigned int val);
+
+#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
new file mode 100644
index 00000000000..d6a22f92ba7
--- /dev/null
+++ b/arch/x86/include/asm/proto.h
@@ -0,0 +1,32 @@
+#ifndef _ASM_X86_PROTO_H
+#define _ASM_X86_PROTO_H
+
+#include <asm/ldt.h>
+
+/* misc architecture specific prototypes */
+
+extern void early_idt_handler(void);
+
+extern void system_call(void);
+extern void syscall_init(void);
+
+extern void ia32_syscall(void);
+extern void ia32_cstar_target(void);
+extern void ia32_sysenter_target(void);
+
+extern void syscall32_cpu_init(void);
+
+extern void check_efer(void);
+
+#ifdef CONFIG_X86_BIOS_REBOOT
+extern int reboot_force;
+#else
+static const int reboot_force = 0;
+#endif
+
+long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
+
+#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1))
+#define round_down(x, y) ((x) & ~((y) - 1))
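+
+/*
+ * Illustrative note, not part of the original header: these only work
+ * when y is a power of two, e.g. round_up(0x1234, 0x1000) == 0x2000
+ * and round_down(0x1234, 0x1000) == 0x1000; the mask trick relies on
+ * y - 1 being a contiguous run of low bits.
+ */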
+
+#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
new file mode 100644
index 00000000000..25f1bb8fc62
--- /dev/null
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -0,0 +1,145 @@
+#ifndef _ASM_X86_PTRACE_ABI_H
+#define _ASM_X86_PTRACE_ABI_H
+
+#ifdef __i386__
+
+#define EBX 0
+#define ECX 1
+#define EDX 2
+#define ESI 3
+#define EDI 4
+#define EBP 5
+#define EAX 6
+#define DS 7
+#define ES 8
+#define FS 9
+#define GS 10
+#define ORIG_EAX 11
+#define EIP 12
+#define CS 13
+#define EFL 14
+#define UESP 15
+#define SS 16
+#define FRAME_SIZE 17
+
+#else /* __i386__ */
+
+#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+#define R15 0
+#define R14 8
+#define R13 16
+#define R12 24
+#define RBP 32
+#define RBX 40
+/* arguments: interrupts/non-tracing syscalls only save up to here */
+#define R11 48
+#define R10 56
+#define R9 64
+#define R8 72
+#define RAX 80
+#define RCX 88
+#define RDX 96
+#define RSI 104
+#define RDI 112
+#define ORIG_RAX 120 /* = ERROR */
+/* end of arguments */
+/* cpu exception frame or undefined in case of fast syscall. */
+#define RIP 128
+#define CS 136
+#define EFLAGS 144
+#define RSP 152
+#define SS 160
+#define ARGOFFSET R11
+#endif /* __ASSEMBLY__ */
+
+/* top of stack page */
+#define FRAME_SIZE 168
+
+#endif /* !__i386__ */
+
+/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
+#define PTRACE_GETREGS 12
+#define PTRACE_SETREGS 13
+#define PTRACE_GETFPREGS 14
+#define PTRACE_SETFPREGS 15
+#define PTRACE_GETFPXREGS 18
+#define PTRACE_SETFPXREGS 19
+
+#define PTRACE_OLDSETOPTIONS 21
+
+/* only useful for accessing 32-bit programs / kernels */
+#define PTRACE_GET_THREAD_AREA 25
+#define PTRACE_SET_THREAD_AREA 26
+
+#ifdef __x86_64__
+# define PTRACE_ARCH_PRCTL 30
+#endif
+
+#define PTRACE_SYSEMU 31
+#define PTRACE_SYSEMU_SINGLESTEP 32
+
+#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
+
+#ifdef CONFIG_X86_PTRACE_BTS
+
+#ifndef __ASSEMBLY__
+#include <asm/types.h>
+
+/* configuration/status structure used in PTRACE_BTS_CONFIG and
+ PTRACE_BTS_STATUS commands.
+*/
+struct ptrace_bts_config {
+ /* requested or actual size of BTS buffer in bytes */
+ __u32 size;
+ /* bitmask of below flags */
+ __u32 flags;
+ /* buffer overflow signal */
+ __u32 signal;
+ /* actual size of bts_struct in bytes */
+ __u32 bts_size;
+};
+#endif /* __ASSEMBLY__ */
+
+#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
+#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
+#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG<signal> on buffer overflow
+ instead of wrapping around */
+#define PTRACE_BTS_O_ALLOC 0x8 /* (re)allocate buffer */
+
+#define PTRACE_BTS_CONFIG 40
+/* Configure branch trace recording.
+ ADDR points to a struct ptrace_bts_config.
+ DATA gives the size of that buffer.
+ A new buffer is allocated, if requested in the flags.
+ An overflow signal may only be requested for new buffers.
+ Returns the number of bytes read.
+*/
+#define PTRACE_BTS_STATUS 41
+/* Return the current configuration in a struct ptrace_bts_config
+ pointed to by ADDR; DATA gives the size of that buffer.
+ Returns the number of bytes written.
+*/
+#define PTRACE_BTS_SIZE 42
+/* Return the number of available BTS records for draining.
+ DATA and ADDR are ignored.
+*/
+#define PTRACE_BTS_GET 43
+/* Get a single BTS record.
+ DATA defines the index into the BTS array, where 0 is the newest
+ entry, and higher indices refer to older entries.
+ ADDR is pointing to struct bts_struct (see asm/ds.h).
+*/
+#define PTRACE_BTS_CLEAR 44
+/* Clear the BTS buffer.
+ DATA and ADDR are ignored.
+*/
+#define PTRACE_BTS_DRAIN 45
+/* Read all available BTS records and clear the buffer.
+ ADDR points to an array of struct bts_struct.
+ DATA gives the size of that buffer.
+ BTS records are read from oldest to newest.
+ Returns number of BTS records drained.
+*/
+#endif /* CONFIG_X86_PTRACE_BTS */
+
+#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
new file mode 100644
index 00000000000..d1531c8480b
--- /dev/null
+++ b/arch/x86/include/asm/ptrace.h
@@ -0,0 +1,280 @@
+#ifndef _ASM_X86_PTRACE_H
+#define _ASM_X86_PTRACE_H
+
+#include <linux/compiler.h> /* For __user */
+#include <asm/ptrace-abi.h>
+#include <asm/processor-flags.h>
+
+#ifdef __KERNEL__
+#include <asm/ds.h> /* the DS BTS struct is used for ptrace too */
+#include <asm/segment.h>
+#endif
+
+#ifndef __ASSEMBLY__
+
+#ifdef __i386__
+/* this struct defines the way the registers are stored on the
+ stack during a system call. */
+
+#ifndef __KERNEL__
+
+struct pt_regs {
+ long ebx;
+ long ecx;
+ long edx;
+ long esi;
+ long edi;
+ long ebp;
+ long eax;
+ int xds;
+ int xes;
+ int xfs;
+ /* int gs; */
+ long orig_eax;
+ long eip;
+ int xcs;
+ long eflags;
+ long esp;
+ int xss;
+};
+
+#else /* __KERNEL__ */
+
+struct pt_regs {
+ unsigned long bx;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long bp;
+ unsigned long ax;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ /* int gs; */
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+};
+
+#endif /* __KERNEL__ */
+
+#else /* __i386__ */
+
+#ifndef __KERNEL__
+
+struct pt_regs {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long rbp;
+ unsigned long rbx;
+/* arguments: non-interrupt/non-tracing syscalls only save up to here */
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long rax;
+ unsigned long rcx;
+ unsigned long rdx;
+ unsigned long rsi;
+ unsigned long rdi;
+ unsigned long orig_rax;
+/* end of arguments */
+/* cpu exception frame or undefined */
+ unsigned long rip;
+ unsigned long cs;
+ unsigned long eflags;
+ unsigned long rsp;
+ unsigned long ss;
+/* top of stack page */
+};
+
+#else /* __KERNEL__ */
+
+struct pt_regs {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long bp;
+ unsigned long bx;
+/* arguments: non-interrupt/non-tracing syscalls only save up to here */
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long orig_ax;
+/* end of arguments */
+/* cpu exception frame or undefined */
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+/* top of stack page */
+};
+
+#endif /* __KERNEL__ */
+#endif /* !__i386__ */
+
+
+#ifdef CONFIG_X86_PTRACE_BTS
+/* a branch trace record entry
+ *
+ * In order to unify the interface between various processor versions,
+ * we use the below data structure for all processors.
+ */
+enum bts_qualifier {
+ BTS_INVALID = 0,
+ BTS_BRANCH,
+ BTS_TASK_ARRIVES,
+ BTS_TASK_DEPARTS
+};
+
+struct bts_struct {
+ __u64 qualifier;
+ union {
+ /* BTS_BRANCH */
+ struct {
+ __u64 from_ip;
+ __u64 to_ip;
+ } lbr;
+ /* BTS_TASK_ARRIVES or
+ BTS_TASK_DEPARTS */
+ __u64 jiffies;
+ } variant;
+};
+#endif /* CONFIG_X86_PTRACE_BTS */
+
+#ifdef __KERNEL__
+
+#include <linux/init.h>
+
+struct cpuinfo_x86;
+struct task_struct;
+
+#ifdef CONFIG_X86_PTRACE_BTS
+extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *);
+extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
+#else
+#define ptrace_bts_init_intel(config) do {} while (0)
+#endif /* CONFIG_X86_PTRACE_BTS */
+
+extern unsigned long profile_pc(struct pt_regs *regs);
+
+extern unsigned long
+convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
+extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+ int error_code, int si_code);
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+
+extern long syscall_trace_enter(struct pt_regs *);
+extern void syscall_trace_leave(struct pt_regs *);
+
+static inline unsigned long regs_return_value(struct pt_regs *regs)
+{
+ return regs->ax;
+}
+
+/*
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value. This tricky test checks that with
+ * one comparison. Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
+static inline int user_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+#else
+ return !!(regs->cs & 3);
+#endif
+}
+
+static inline int user_mode_vm(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
+ USER_RPL;
+#else
+ return user_mode(regs);
+#endif
+}
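+
+/*
+ * Illustrative note, not part of the original header: worked examples
+ * of the one-comparison trick above on 32-bit, where USER_RPL == 3 and
+ * X86_VM_MASK == X86_EFLAGS_VM (0x20000) with CONFIG_VM86:
+ *
+ *	kernel mode:	cs RPL 0, VM clear	-> 0 | 0       <  3
+ *	user mode:	cs RPL 3, VM clear	-> 3 | 0       >= 3
+ *	v8086 mode:	cs RPL any, VM set	-> x | 0x20000 >= 3
+ *
+ * so user mode and v8086 mode both pass with a single compare.
+ */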
+
+static inline int v8086_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ return (regs->flags & X86_VM_MASK);
+#else
+ return 0; /* No V86 mode support in long mode */
+#endif
+}
+
+/*
+ * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
+ * when it traps. So regs will be the current sp.
+ *
+ * This is valid only for kernel mode traps.
+ */
+static inline unsigned long kernel_trap_sp(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ return (unsigned long)regs;
+#else
+ return regs->sp;
+#endif
+}
+
+static inline unsigned long instruction_pointer(struct pt_regs *regs)
+{
+ return regs->ip;
+}
+
+static inline unsigned long frame_pointer(struct pt_regs *regs)
+{
+ return regs->bp;
+}
+
+static inline unsigned long user_stack_pointer(struct pt_regs *regs)
+{
+ return regs->sp;
+}
+
+/*
+ * These are defined as per linux/ptrace.h, which see.
+ */
+#define arch_has_single_step() (1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
+
+extern void user_enable_block_step(struct task_struct *);
+#ifdef CONFIG_X86_DEBUGCTLMSR
+#define arch_has_block_step() (1)
+#else
+#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
+#endif
+
+struct user_desc;
+extern int do_get_thread_area(struct task_struct *p, int idx,
+ struct user_desc __user *info);
+extern int do_set_thread_area(struct task_struct *p, int idx,
+ struct user_desc __user *info, int can_allocate);
+
+#define __ARCH_WANT_COMPAT_SYS_PTRACE
+
+#endif /* __KERNEL__ */
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
new file mode 100644
index 00000000000..6d93508f262
--- /dev/null
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -0,0 +1,42 @@
+#ifndef _ASM_X86_PVCLOCK_ABI_H
+#define _ASM_X86_PVCLOCK_ABI_H
+#ifndef __ASSEMBLY__
+
+/*
+ * These structs MUST NOT be changed.
+ * They are the ABI between hypervisor and guest OS.
+ * Both Xen and KVM are using this.
+ *
+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
+ * of the last update. So the guest can use the tsc delta to get a
+ * more precise system time. There is one per virtual cpu.
+ *
+ * pvclock_wall_clock references the point in time when the system
+ * time was zero (usually boot time), thus the guest calculates the
+ * current wall clock by adding the system time.
+ *
+ * Protocol for the "version" fields is: the hypervisor raises it (making
+ * it odd) before it starts updating the fields and raises it again
+ * (making it even) when it is done. Thus the guest can make sure the
+ * time values it got are consistent by checking the version before
+ * and after reading them.
+ */
+
+struct pvclock_vcpu_time_info {
+ u32 version;
+ u32 pad0;
+ u64 tsc_timestamp;
+ u64 system_time;
+ u32 tsc_to_system_mul;
+ s8 tsc_shift;
+ u8 pad[3];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+ u32 version;
+ u32 sec;
+ u32 nsec;
+} __attribute__((__packed__));
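+
+/*
+ * Illustrative sketch, not part of the original header: the guest-side
+ * read loop implied by the version protocol above (pseudo-code; the
+ * real reader behind pvclock_clocksource_read() also folds in the
+ * tsc delta since tsc_timestamp):
+ *
+ *	do {
+ *		version = src->version;
+ *		rmb();
+ *		time = src->system_time;
+ *		tsc  = src->tsc_timestamp;
+ *		rmb();
+ *	} while ((src->version & 1) || version != src->version);
+ */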
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
new file mode 100644
index 00000000000..53235fd5f8c
--- /dev/null
+++ b/arch/x86/include/asm/pvclock.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_PVCLOCK_H
+#define _ASM_X86_PVCLOCK_H
+
+#include <linux/clocksource.h>
+#include <asm/pvclock-abi.h>
+
+/* some helper functions for xen and kvm pv clock sources */
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
+ struct pvclock_vcpu_time_info *vcpu,
+ struct timespec *ts);
+
+#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
new file mode 100644
index 00000000000..df7710354f8
--- /dev/null
+++ b/arch/x86/include/asm/reboot.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_REBOOT_H
+#define _ASM_X86_REBOOT_H
+
+struct pt_regs;
+
+struct machine_ops {
+ void (*restart)(char *cmd);
+ void (*halt)(void);
+ void (*power_off)(void);
+ void (*shutdown)(void);
+ void (*crash_shutdown)(struct pt_regs *);
+ void (*emergency_restart)(void);
+};
+
+extern struct machine_ops machine_ops;
+
+void native_machine_crash_shutdown(struct pt_regs *regs);
+void native_machine_shutdown(void);
+void machine_real_restart(const unsigned char *code, int length);
+
+#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/reboot_fixups.h b/arch/x86/include/asm/reboot_fixups.h
new file mode 100644
index 00000000000..765debe4c54
--- /dev/null
+++ b/arch/x86/include/asm/reboot_fixups.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_REBOOT_FIXUPS_H
+#define _ASM_X86_REBOOT_FIXUPS_H
+
+extern void mach_reboot_fixups(void);
+
+#endif /* _ASM_X86_REBOOT_FIXUPS_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
new file mode 100644
index 00000000000..d5cd6c58688
--- /dev/null
+++ b/arch/x86/include/asm/required-features.h
@@ -0,0 +1,82 @@
+#ifndef _ASM_X86_REQUIRED_FEATURES_H
+#define _ASM_X86_REQUIRED_FEATURES_H
+
+/* Define the minimum CPUID feature set for the kernel. These bits are
+ checked really early to actually display a visible error message before
+ the kernel dies. Make sure to assign features to the proper mask!
+
+ Some requirements that are not in CPUID yet are also covered by
+ CONFIG_X86_MINIMUM_CPU_FAMILY, which is checked too.
+
+ The real information is in arch/x86/Kconfig.cpu; this just converts
+ the CONFIGs into a bitmask. */
+
+#ifndef CONFIG_MATH_EMULATION
+# define NEED_FPU (1<<(X86_FEATURE_FPU & 31))
+#else
+# define NEED_FPU 0
+#endif
+
+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+# define NEED_PAE (1<<(X86_FEATURE_PAE & 31))
+#else
+# define NEED_PAE 0
+#endif
+
+#ifdef CONFIG_X86_CMPXCHG64
+# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31))
+#else
+# define NEED_CX8 0
+#endif
+
+#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64)
+# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31))
+#else
+# define NEED_CMOV 0
+#endif
+
+#ifdef CONFIG_X86_USE_3DNOW
+# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31))
+#else
+# define NEED_3DNOW 0
+#endif
+
+#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
+# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
+#else
+# define NEED_NOPL 0
+#endif
+
+#ifdef CONFIG_X86_64
+#define NEED_PSE 0
+#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
+#define NEED_PGE (1<<(X86_FEATURE_PGE & 31))
+#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
+#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
+#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
+#define NEED_LM (1<<(X86_FEATURE_LM & 31))
+#else
+#define NEED_PSE 0
+#define NEED_MSR 0
+#define NEED_PGE 0
+#define NEED_FXSR 0
+#define NEED_XMM 0
+#define NEED_XMM2 0
+#define NEED_LM 0
+#endif
+
+#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\
+ NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\
+ NEED_XMM|NEED_XMM2)
+#define SSE_MASK (NEED_XMM|NEED_XMM2)
+
+#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW)
+
+#define REQUIRED_MASK2 0
+#define REQUIRED_MASK3 (NEED_NOPL)
+#define REQUIRED_MASK4 0
+#define REQUIRED_MASK5 0
+#define REQUIRED_MASK6 0
+#define REQUIRED_MASK7 0
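+
+/*
+ * Illustrative use (the real check lives in the early boot code): each
+ * REQUIRED_MASKn word is compared against the matching CPUID capability
+ * word, and any missing bit is fatal.  cpu_caps and report_missing() are
+ * placeholders, not definitions from this file:
+ *
+ *	u32 missing = REQUIRED_MASK0 & ~cpu_caps[0];
+ *	if (missing)
+ *		report_missing(missing);
+ */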
+
+#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/resource.h b/arch/x86/include/asm/resource.h
new file mode 100644
index 00000000000..04bc4db8921
--- /dev/null
+++ b/arch/x86/include/asm/resource.h
@@ -0,0 +1 @@
+#include <asm-generic/resource.h>
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/resume-trace.h
new file mode 100644
index 00000000000..3ff1c2cb1da
--- /dev/null
+++ b/arch/x86/include/asm/resume-trace.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_RESUME_TRACE_H
+#define _ASM_X86_RESUME_TRACE_H
+
+#include <asm/asm.h>
+
+#define TRACE_RESUME(user) \
+do { \
+ if (pm_trace_enabled) { \
+ const void *tracedata; \
+ asm volatile(_ASM_MOV " $1f,%0\n" \
+ ".section .tracedata,\"a\"\n" \
+ "1:\t.word %c1\n\t" \
+ _ASM_PTR " %c2\n" \
+ ".previous" \
+ :"=r" (tracedata) \
+ : "i" (__LINE__), "i" (__FILE__)); \
+ generate_resume_trace(tracedata, user); \
+ } \
+} while (0)
+
+#endif /* _ASM_X86_RESUME_TRACE_H */
diff --git a/arch/x86/include/asm/rio.h b/arch/x86/include/asm/rio.h
new file mode 100644
index 00000000000..97bab6388a9
--- /dev/null
+++ b/arch/x86/include/asm/rio.h
@@ -0,0 +1,63 @@
+/*
+ * Derived from include/asm-x86/mach-summit/mach_mpparse.h
+ * and include/asm-x86/mach-default/bios_ebda.h
+ *
+ * Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ */
+
+#ifndef _ASM_X86_RIO_H
+#define _ASM_X86_RIO_H
+
+#define RIO_TABLE_VERSION 3
+
+struct rio_table_hdr {
+ u8 version; /* Version number of this data structure */
+ u8 num_scal_dev; /* # of Scalability devices */
+ u8 num_rio_dev; /* # of RIO I/O devices */
+} __attribute__((packed));
+
+struct scal_detail {
+ u8 node_id; /* Scalability Node ID */
+ u32 CBAR; /* Address of 1MB register space */
+ u8 port0node; /* Node ID port connected to: 0xFF=None */
+ u8 port0port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 port1node; /* Node ID port connected to: 0xFF = None */
+ u8 port1port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 port2node; /* Node ID port connected to: 0xFF = None */
+ u8 port2port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 chassis_num; /* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+ u8 node_id; /* RIO Node ID */
+ u32 BBAR; /* Address of 1MB register space */
+ u8 type; /* Type of device */
+ u8 owner_id; /* Node ID of Hurricane that owns this */
+ /* node */
+ u8 port0node; /* Node ID port connected to: 0xFF=None */
+ u8 port0port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 port1node; /* Node ID port connected to: 0xFF=None */
+ u8 port1port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 first_slot; /* Lowest slot number below this Calgary */
+ u8 status; /* Bit 0 = 1 : the XAPIC is used */
+ /* = 0 : the XAPIC is not used, ie: */
+ /* ints fwded to another XAPIC */
+ /* Bits1:7 Reserved */
+ u8 WP_index; /* instance index - lower ones have */
+ /* lower slot numbers/PCI bus numbers */
+ u8 chassis_num; /* 1 based Chassis number */
+} __attribute__((packed));
+
+enum {
+ HURR_SCALABILTY = 0, /* Hurricane Scalability info */
+ HURR_RIOIB = 2, /* Hurricane RIOIB info */
+ COMPAT_CALGARY = 4, /* Compatibility Calgary */
+ ALT_CALGARY = 5, /* Second Planar Calgary */
+};
+
+#endif /* _ASM_X86_RIO_H */
diff --git a/arch/x86/include/asm/rtc.h b/arch/x86/include/asm/rtc.h
new file mode 100644
index 00000000000..f71c3b0ed36
--- /dev/null
+++ b/arch/x86/include/asm/rtc.h
@@ -0,0 +1 @@
+#include <asm-generic/rtc.h>
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
new file mode 100644
index 00000000000..6a8c0d64510
--- /dev/null
+++ b/arch/x86/include/asm/rwlock.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_X86_RWLOCK_H
+#define _ASM_X86_RWLOCK_H
+
+#define RW_LOCK_BIAS 0x01000000
+
+/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
+
+#endif /* _ASM_X86_RWLOCK_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
new file mode 100644
index 00000000000..ca7517d3377
--- /dev/null
+++ b/arch/x86/include/asm/rwsem.h
@@ -0,0 +1,265 @@
+/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ *
+ * Derived from asm-x86/semaphore.h
+ *
+ *
+ * The MSW of the count is the negated number of active writers and waiting
+ * lockers, and the LSW is the total number of active locks.
+ *
+ * The lock count is initialized to 0 (no active and no waiting lockers).
+ *
+ * When a writer subtracts WRITE_BIAS, the count becomes 0xffff0001 for the
+ * case of an uncontended lock. This can be determined because XADD returns
+ * the old value. Readers increment by 1 and see a positive value when
+ * uncontended, negative if there are writers (and possibly readers) waiting,
+ * in which case the reader goes to sleep.
+ *
+ * The value of WAITING_BIAS supports up to 32766 waiting processes. This can
+ * be extended to 65534 by manually checking the whole MSW rather than relying
+ * on the S flag.
+ *
+ * The value of ACTIVE_BIAS supports up to 65535 active processes.
+ *
+ * This should be totally fair - if anything is waiting, a process that wants
+ * a lock will go to the back of the queue. When the currently active lock is
+ * released, if there's a writer at the front of the queue, then that and only
+ * that will be woken up; if there's a bunch of consecutive readers at the
+ * front, then they'll all be woken up, but no other readers will be.
+ */
+
+#ifndef _ASM_X86_RWSEM_H
+#define _ASM_X86_RWSEM_H
+
+#ifndef _LINUX_RWSEM_H
+#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
+#endif
+
+#ifdef __KERNEL__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/lockdep.h>
+
+struct rwsem_waiter;
+
+extern asmregparm struct rw_semaphore *
+ rwsem_down_read_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *
+ rwsem_down_write_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *
+ rwsem_wake(struct rw_semaphore *);
+extern asmregparm struct rw_semaphore *
+ rwsem_downgrade_wake(struct rw_semaphore *sem);
+
+/*
+ * the semaphore definition
+ */
+
+#define RWSEM_UNLOCKED_VALUE 0x00000000
+#define RWSEM_ACTIVE_BIAS 0x00000001
+#define RWSEM_ACTIVE_MASK 0x0000ffff
+#define RWSEM_WAITING_BIAS (-0x00010000)
+#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
+#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
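+
+/*
+ * Illustrative arithmetic (not an additional definition): starting from
+ * RWSEM_UNLOCKED_VALUE, an uncontended writer's xadd of
+ * RWSEM_ACTIVE_WRITE_BIAS returns 0x00000000 and leaves the count at
+ * 0xffff0001; an uncontended reader's increment leaves 0x00000001.  A
+ * reader arriving while the writer holds the lock sees 0xffff0002,
+ * which is negative, so it goes to sleep.
+ */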
+
+struct rw_semaphore {
+ signed long count;
+ spinlock_t wait_lock;
+ struct list_head wait_list;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
+#else
+# define __RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
+
+#define __RWSEM_INITIALIZER(name) \
+{ \
+ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
+ LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
+}
+
+#define DECLARE_RWSEM(name) \
+ struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+
+extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key);
+
+#define init_rwsem(sem) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __init_rwsem((sem), #sem, &__key); \
+} while (0)
+
+/*
+ * lock for reading
+ */
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ asm volatile("# beginning down_read\n\t"
+ LOCK_PREFIX " incl (%%eax)\n\t"
+ /* adds 0x00000001, returns the old value */
+ " jns 1f\n"
+ " call call_rwsem_down_read_failed\n"
+ "1:\n\t"
+ "# ending down_read\n\t"
+ : "+m" (sem->count)
+ : "a" (sem)
+ : "memory", "cc");
+}
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ __s32 result, tmp;
+ asm volatile("# beginning __down_read_trylock\n\t"
+ " movl %0,%1\n\t"
+ "1:\n\t"
+ " movl %1,%2\n\t"
+ " addl %3,%2\n\t"
+ " jle 2f\n\t"
+ LOCK_PREFIX " cmpxchgl %2,%0\n\t"
+ " jnz 1b\n\t"
+ "2:\n\t"
+ "# ending __down_read_trylock\n\t"
+ : "+m" (sem->count), "=&a" (result), "=&r" (tmp)
+ : "i" (RWSEM_ACTIVE_READ_BIAS)
+ : "memory", "cc");
+ return result >= 0 ? 1 : 0;
+}
+
+/*
+ * lock for writing
+ */
+static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+ int tmp;
+
+ tmp = RWSEM_ACTIVE_WRITE_BIAS;
+ asm volatile("# beginning down_write\n\t"
+ LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
+ /* subtract 0x0000ffff, returns the old value */
+ " testl %%edx,%%edx\n\t"
+ /* was the count 0 before? */
+ " jz 1f\n"
+ " call call_rwsem_down_write_failed\n"
+ "1:\n"
+ "# ending down_write"
+ : "+m" (sem->count), "=d" (tmp)
+ : "a" (sem), "1" (tmp)
+ : "memory", "cc");
+}
+
+static inline void __down_write(struct rw_semaphore *sem)
+{
+ __down_write_nested(sem, 0);
+}
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ signed long ret = cmpxchg(&sem->count,
+ RWSEM_UNLOCKED_VALUE,
+ RWSEM_ACTIVE_WRITE_BIAS);
+ if (ret == RWSEM_UNLOCKED_VALUE)
+ return 1;
+ return 0;
+}
+
+/*
+ * unlock after reading
+ */
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
+ asm volatile("# beginning __up_read\n\t"
+ LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
+ /* subtracts 1, returns the old value */
+ " jns 1f\n\t"
+ " call call_rwsem_wake\n"
+ "1:\n"
+ "# ending __up_read\n"
+ : "+m" (sem->count), "=d" (tmp)
+ : "a" (sem), "1" (tmp)
+ : "memory", "cc");
+}
+
+/*
+ * unlock after writing
+ */
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ asm volatile("# beginning __up_write\n\t"
+ " movl %2,%%edx\n\t"
+ LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t"
+ /* tries to transition
+ 0xffff0001 -> 0x00000000 */
+ " jz 1f\n"
+ " call call_rwsem_wake\n"
+ "1:\n\t"
+ "# ending __up_write\n"
+ : "+m" (sem->count)
+ : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS)
+ : "memory", "cc", "edx");
+}
+
+/*
+ * downgrade write lock to read lock
+ */
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ asm volatile("# beginning __downgrade_write\n\t"
+ LOCK_PREFIX " addl %2,(%%eax)\n\t"
+ /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
+ " jns 1f\n\t"
+ " call call_rwsem_downgrade_wake\n"
+ "1:\n\t"
+ "# ending __downgrade_write\n"
+ : "+m" (sem->count)
+ : "a" (sem), "i" (-RWSEM_WAITING_BIAS)
+ : "memory", "cc");
+}
+
+/*
+ * implement atomic add functionality
+ */
+static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0"
+ : "+m" (sem->count)
+ : "ir" (delta));
+}
+
+/*
+ * implement exchange and add functionality
+ */
+static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
+{
+ int tmp = delta;
+
+ asm volatile(LOCK_PREFIX "xadd %0,%1"
+ : "+r" (tmp), "+m" (sem->count)
+ : : "memory");
+
+ return tmp + delta;
+}
+
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+ return (sem->count != 0);
+}
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
new file mode 100644
index 00000000000..263d397d2ee
--- /dev/null
+++ b/arch/x86/include/asm/scatterlist.h
@@ -0,0 +1,33 @@
+#ifndef _ASM_X86_SCATTERLIST_H
+#define _ASM_X86_SCATTERLIST_H
+
+#include <asm/types.h>
+
+struct scatterlist {
+#ifdef CONFIG_DEBUG_SG
+ unsigned long sg_magic;
+#endif
+ unsigned long page_link;
+ unsigned int offset;
+ unsigned int length;
+ dma_addr_t dma_address;
+ unsigned int dma_length;
+};
+
+#define ARCH_HAS_SG_CHAIN
+#define ISA_DMA_THRESHOLD (0x00ffffff)
+
+/*
+ * These macros should be used after a pci_map_sg call has been done
+ * to get bus addresses of each of the SG entries and their lengths.
+ * You should only work with the number of sg entries pci_map_sg
+ * returns.
+ */
+#define sg_dma_address(sg) ((sg)->dma_address)
+#ifdef CONFIG_X86_32
+# define sg_dma_len(sg) ((sg)->length)
+#else
+# define sg_dma_len(sg) ((sg)->dma_length)
+#endif
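+
+/*
+ * Typical (illustrative) driver-side use; pci_map_sg() may coalesce
+ * entries, so only the returned count is walked.  program_dma() is a
+ * placeholder for device-specific setup:
+ *
+ *	int i, n = pci_map_sg(pdev, sg, nents, direction);
+ *	for (i = 0; i < n; i++)
+ *		program_dma(dev, sg_dma_address(&sg[i]), sg_dma_len(&sg[i]));
+ */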
+
+#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
new file mode 100644
index 00000000000..c62e58a5a90
--- /dev/null
+++ b/arch/x86/include/asm/seccomp.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "seccomp_32.h"
+#else
+# include "seccomp_64.h"
+#endif
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
new file mode 100644
index 00000000000..a6ad87b352c
--- /dev/null
+++ b/arch/x86/include/asm/seccomp_32.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_X86_SECCOMP_32_H
+#define _ASM_X86_SECCOMP_32_H
+
+#include <linux/thread_info.h>
+
+#ifdef TIF_32BIT
+#error "unexpected TIF_32BIT on i386"
+#endif
+
+#include <linux/unistd.h>
+
+#define __NR_seccomp_read __NR_read
+#define __NR_seccomp_write __NR_write
+#define __NR_seccomp_exit __NR_exit
+#define __NR_seccomp_sigreturn __NR_sigreturn
+
+#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
new file mode 100644
index 00000000000..4171bb794e9
--- /dev/null
+++ b/arch/x86/include/asm/seccomp_64.h
@@ -0,0 +1,25 @@
+#ifndef _ASM_X86_SECCOMP_64_H
+#define _ASM_X86_SECCOMP_64_H
+
+#include <linux/thread_info.h>
+
+#ifdef TIF_32BIT
+#error "unexpected TIF_32BIT on x86_64"
+#else
+#define TIF_32BIT TIF_IA32
+#endif
+
+#include <linux/unistd.h>
+#include <asm/ia32_unistd.h>
+
+#define __NR_seccomp_read __NR_read
+#define __NR_seccomp_write __NR_write
+#define __NR_seccomp_exit __NR_exit
+#define __NR_seccomp_sigreturn __NR_rt_sigreturn
+
+#define __NR_seccomp_read_32 __NR_ia32_read
+#define __NR_seccomp_write_32 __NR_ia32_write
+#define __NR_seccomp_exit_32 __NR_ia32_exit
+#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
+
+#endif /* _ASM_X86_SECCOMP_64_H */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
new file mode 100644
index 00000000000..2b8c5160388
--- /dev/null
+++ b/arch/x86/include/asm/sections.h
@@ -0,0 +1 @@
+#include <asm-generic/sections.h>
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
new file mode 100644
index 00000000000..1dc1b51ac62
--- /dev/null
+++ b/arch/x86/include/asm/segment.h
@@ -0,0 +1,209 @@
+#ifndef _ASM_X86_SEGMENT_H
+#define _ASM_X86_SEGMENT_H
+
+/* Constructor for a conventional segment GDT (or LDT) entry */
+/* This is a macro so it can be used in initializers */
+#define GDT_ENTRY(flags, base, limit) \
+ ((((base) & 0xff000000ULL) << (56-24)) | \
+ (((flags) & 0x0000f0ffULL) << 40) | \
+ (((limit) & 0x000f0000ULL) << (48-16)) | \
+ (((base) & 0x00ffffffULL) << 16) | \
+ (((limit) & 0x0000ffffULL)))
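+
+/*
+ * Usage sketch with illustrative values: a flat 4 GiB 32-bit code
+ * segment would be GDT_ENTRY(0xc09a, 0, 0xfffff) -- base 0, 20-bit
+ * limit of 0xfffff with 4K granularity, and flags selecting a present,
+ * ring-0, readable code segment.
+ */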
+
+/* Simple and small GDT entries for booting only */
+
+#define GDT_ENTRY_BOOT_CS 2
+#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
+
+#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
+#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
+
+#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
+#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
+
+#ifdef CONFIG_X86_32
+/*
+ * The layout of the per-CPU GDT under Linux:
+ *
+ * 0 - null
+ * 1 - reserved
+ * 2 - reserved
+ * 3 - reserved
+ *
+ * 4 - unused <==== new cacheline
+ * 5 - unused
+ *
+ * ------- start of TLS (Thread-Local Storage) segments:
+ *
+ * 6 - TLS segment #1 [ glibc's TLS segment ]
+ * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
+ * 8 - TLS segment #3
+ * 9 - reserved
+ * 10 - reserved
+ * 11 - reserved
+ *
+ * ------- start of kernel segments:
+ *
+ * 12 - kernel code segment <==== new cacheline
+ * 13 - kernel data segment
+ * 14 - default user CS
+ * 15 - default user DS
+ * 16 - TSS
+ * 17 - LDT
+ * 18 - PNPBIOS support (16->32 gate)
+ * 19 - PNPBIOS support
+ * 20 - PNPBIOS support
+ * 21 - PNPBIOS support
+ * 22 - PNPBIOS support
+ * 23 - APM BIOS support
+ * 24 - APM BIOS support
+ * 25 - APM BIOS support
+ *
+ * 26 - ESPFIX small SS
+ * 27 - per-cpu [ offset to per-cpu data area ]
+ * 28 - unused
+ * 29 - unused
+ * 30 - unused
+ * 31 - TSS for double fault handler
+ */
+#define GDT_ENTRY_TLS_MIN 6
+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+
+#define GDT_ENTRY_DEFAULT_USER_CS 14
+
+#define GDT_ENTRY_DEFAULT_USER_DS 15
+
+#define GDT_ENTRY_KERNEL_BASE 12
+
+#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
+
+#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
+
+#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
+#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
+
+#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
+#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
+
+#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+
+#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
+#ifdef CONFIG_SMP
+#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
+#else
+#define __KERNEL_PERCPU 0
+#endif
+
+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
+
+/*
+ * The GDT has 32 entries
+ */
+#define GDT_ENTRIES 32
+
+/* The PnP BIOS entries in the GDT */
+#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
+#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
+#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
+#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
+#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
+
+/* The PnP BIOS selectors */
+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
+
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK 0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK 0x4
+
+/* User mode is privilege level 3 */
+#define USER_RPL 0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT 0x4
+#define SEGMENT_GDT 0x0
+
+/*
+ * Matching rules for certain types of segments.
+ */
+
+/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
+#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
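+
+/*
+ * Worked example: GDT_ENTRY_PNPBIOS_BASE is 18, so PNP_CS32 is 0x90 and
+ * PNP_CS16 is 0x98; masking with 0xf4 clears the RPL bits and bit 3 and
+ * maps both code selectors onto 0x90 for the comparison.
+ */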
+
+
+#else
+#include <asm/cache.h>
+
+#define GDT_ENTRY_KERNEL32_CS 1
+#define GDT_ENTRY_KERNEL_CS 2
+#define GDT_ENTRY_KERNEL_DS 3
+
+#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
+
+/*
+ * We cannot use the same code segment descriptor for user and kernel
+ * mode -- not even in long flat mode, because of the different DPL. /kkeil
+ * The segment offset needs to contain an RPL. Grr. -AK
+ * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
+ */
+#define GDT_ENTRY_DEFAULT_USER32_CS 4
+#define GDT_ENTRY_DEFAULT_USER_DS 5
+#define GDT_ENTRY_DEFAULT_USER_CS 6
+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
+#define __USER32_DS __USER_DS
+
+#define GDT_ENTRY_TSS 8 /* needs two entries */
+#define GDT_ENTRY_LDT 10 /* needs two entries */
+#define GDT_ENTRY_TLS_MIN 12
+#define GDT_ENTRY_TLS_MAX 14
+
+#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
+#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
+
+/* TLS indexes for 64bit - hardcoded in arch_prctl */
+#define FS_TLS 0
+#define GS_TLS 1
+
+#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
+#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
+
+#define GDT_ENTRIES 16
+
+#endif
+
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
+#ifndef CONFIG_PARAVIRT
+#define get_kernel_rpl() 0
+#endif
+
+/* User mode is privilege level 3 */
+#define USER_RPL 0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT 0x4
+#define SEGMENT_GDT 0x0
+
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK 0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK 0x4
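+
+/*
+ * A minimal decoding sketch using the masks above (illustrative only,
+ * not helpers used elsewhere in the kernel):
+ *
+ *	index = sel >> 3;			/. descriptor table entry ./
+ *	in_ldt = (sel & SEGMENT_TI_MASK) == SEGMENT_LDT;
+ *	rpl = sel & SEGMENT_RPL_MASK;		/. 3 for user selectors ./
+ */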
+
+#define IDT_ENTRIES 256
+#define NUM_EXCEPTION_VECTORS 32
+#define GDT_SIZE (GDT_ENTRIES * 8)
+#define GDT_ENTRY_TLS_ENTRIES 3
+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
+#endif
+#endif
+
+#endif /* _ASM_X86_SEGMENT_H */
diff --git a/arch/x86/include/asm/sembuf.h b/arch/x86/include/asm/sembuf.h
new file mode 100644
index 00000000000..ee50c801f7b
--- /dev/null
+++ b/arch/x86/include/asm/sembuf.h
@@ -0,0 +1,24 @@
+#ifndef _ASM_X86_SEMBUF_H
+#define _ASM_X86_SEMBUF_H
+
+/*
+ * The semid64_ds structure for x86 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ */
+struct semid64_ds {
+ struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
+ __kernel_time_t sem_otime; /* last semop time */
+ unsigned long __unused1;
+ __kernel_time_t sem_ctime; /* last change time */
+ unsigned long __unused2;
+ unsigned long sem_nsems; /* no. of semaphores in array */
+ unsigned long __unused3;
+ unsigned long __unused4;
+};
+
+#endif /* _ASM_X86_SEMBUF_H */
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
new file mode 100644
index 00000000000..628c801535e
--- /dev/null
+++ b/arch/x86/include/asm/serial.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_SERIAL_H
+#define _ASM_X86_SERIAL_H
+
+/*
+ * This assumes you have a 1.8432 MHz clock for your UART.
+ *
+ * It'd be nice if someone built a serial card with a 24.576 MHz
+ * clock, since the 16550A is capable of handling a top speed of 1.5
+ * megabits/second; but this requires the faster clock.
+ */
+#define BASE_BAUD ( 1843200 / 16 )
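+
+/*
+ * Illustrative divisor arithmetic: the UART divides BASE_BAUD (115200)
+ * by the programmed divisor, so 115200 baud uses divisor 1 and 9600
+ * baud uses divisor 12.
+ */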
+
+/* Standard COM flags (except for COM4, because of the 8514 problem) */
+#ifdef CONFIG_SERIAL_DETECT_IRQ
+#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
+#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ)
+#else
+#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST)
+#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF
+#endif
+
+#define SERIAL_PORT_DFNS \
+ /* UART CLK PORT IRQ FLAGS */ \
+ { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \
+ { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \
+ { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \
+ { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
+
+#endif /* _ASM_X86_SERIAL_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
new file mode 100644
index 00000000000..f12d3723746
--- /dev/null
+++ b/arch/x86/include/asm/setup.h
@@ -0,0 +1,105 @@
+#ifndef _ASM_X86_SETUP_H
+#define _ASM_X86_SETUP_H
+
+#define COMMAND_LINE_SIZE 2048
+
+#ifndef __ASSEMBLY__
+
+/* Interrupt control for vSMPowered x86_64 systems */
+void vsmp_init(void);
+
+#ifdef CONFIG_X86_VISWS
+extern void visws_early_detect(void);
+extern int is_visws_box(void);
+#else
+static inline void visws_early_detect(void) { }
+static inline int is_visws_box(void) { return 0; }
+#endif
+
+/*
+ * Any setup quirks to be performed?
+ */
+struct mpc_config_processor;
+struct mpc_config_bus;
+struct mp_config_oemtable;
+struct x86_quirks {
+ int (*arch_pre_time_init)(void);
+ int (*arch_time_init)(void);
+ int (*arch_pre_intr_init)(void);
+ int (*arch_intr_init)(void);
+ int (*arch_trap_init)(void);
+ char * (*arch_memory_setup)(void);
+ int (*mach_get_smp_config)(unsigned int early);
+ int (*mach_find_smp_config)(unsigned int reserve);
+
+ int *mpc_record;
+ int (*mpc_apic_id)(struct mpc_config_processor *m);
+ void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name);
+ void (*mpc_oem_pci_bus)(struct mpc_config_bus *m);
+ void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
+ unsigned short oemsize);
+ int (*setup_ioapic_ids)(void);
+};
+
+extern struct x86_quirks *x86_quirks;
+extern unsigned long saved_video_mode;
+
+#ifndef CONFIG_PARAVIRT
+#define paravirt_post_allocator_init() do {} while (0)
+#endif
+#endif /* __ASSEMBLY__ */
+
+#ifdef __KERNEL__
+
+#ifdef __i386__
+
+#include <linux/pfn.h>
+/*
+ * Reserved space for vmalloc and iomap - defined in asm/page.h
+ */
+#define MAXMEM_PFN PFN_DOWN(MAXMEM)
+#define MAX_NONPAE_PFN (1 << 20)
+
+#endif /* __i386__ */
+
+#define PARAM_SIZE 4096 /* sizeof(struct boot_params) */
+
+#define OLD_CL_MAGIC 0xA33F
+#define OLD_CL_ADDRESS 0x020 /* Relative to real mode data */
+#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
+
+#ifndef __ASSEMBLY__
+#include <asm/bootparam.h>
+
+#ifndef _SETUP
+
+/*
+ * This is set up by the setup-routine at boot-time
+ */
+extern struct boot_params boot_params;
+
+/*
+ * Do NOT EVER look at the BIOS memory size location.
+ * It does not work on many machines.
+ */
+#define LOWMEMSIZE() (0x9f000)
+
+#ifdef __i386__
+
+void __init i386_start_kernel(void);
+extern void probe_roms(void);
+
+extern unsigned long init_pg_tables_start;
+extern unsigned long init_pg_tables_end;
+
+#else
+void __init x86_64_init_pda(void);
+void __init x86_64_start_kernel(char *real_mode);
+void __init x86_64_start_reservations(char *real_mode_data);
+
+#endif /* __i386__ */
+#endif /* _SETUP */
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_SETUP_H */
diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h
new file mode 100644
index 00000000000..b51413b7497
--- /dev/null
+++ b/arch/x86/include/asm/shmbuf.h
@@ -0,0 +1,51 @@
+#ifndef _ASM_X86_SHMBUF_H
+#define _ASM_X86_SHMBUF_H
+
+/*
+ * The shmid64_ds structure for x86 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space on 32 bit is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ *
+ * Pad space on 64 bit is left for:
+ * - 2 miscellaneous 64-bit values
+ */
+
+struct shmid64_ds {
+ struct ipc64_perm shm_perm; /* operation perms */
+ size_t shm_segsz; /* size of segment (bytes) */
+ __kernel_time_t shm_atime; /* last attach time */
+#ifdef __i386__
+ unsigned long __unused1;
+#endif
+ __kernel_time_t shm_dtime; /* last detach time */
+#ifdef __i386__
+ unsigned long __unused2;
+#endif
+ __kernel_time_t shm_ctime; /* last change time */
+#ifdef __i386__
+ unsigned long __unused3;
+#endif
+ __kernel_pid_t shm_cpid; /* pid of creator */
+ __kernel_pid_t shm_lpid; /* pid of last operator */
+ unsigned long shm_nattch; /* no. of current attaches */
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+
+struct shminfo64 {
+ unsigned long shmmax;
+ unsigned long shmmin;
+ unsigned long shmmni;
+ unsigned long shmseg;
+ unsigned long shmall;
+ unsigned long __unused1;
+ unsigned long __unused2;
+ unsigned long __unused3;
+ unsigned long __unused4;
+};
+
+#endif /* _ASM_X86_SHMBUF_H */
diff --git a/arch/x86/include/asm/shmparam.h b/arch/x86/include/asm/shmparam.h
new file mode 100644
index 00000000000..0880cf0917b
--- /dev/null
+++ b/arch/x86/include/asm/shmparam.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_SHMPARAM_H
+#define _ASM_X86_SHMPARAM_H
+
+#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */
+
+#endif /* _ASM_X86_SHMPARAM_H */
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
new file mode 100644
index 00000000000..0afcb5e58ac
--- /dev/null
+++ b/arch/x86/include/asm/sigcontext.h
@@ -0,0 +1,284 @@
+#ifndef _ASM_X86_SIGCONTEXT_H
+#define _ASM_X86_SIGCONTEXT_H
+
+#include <linux/compiler.h>
+#include <asm/types.h>
+
+#define FP_XSTATE_MAGIC1 0x46505853U
+#define FP_XSTATE_MAGIC2 0x46505845U
+#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
+
+/*
+ * bytes 464..511 in the current 512-byte layout of the fxsave/fxrstor
+ * frame are reserved for SW usage. On CPUs supporting xsave/xrstor,
+ * these bytes are used to extend the fpstate pointer in the sigcontext,
+ * which now includes the extended state information along with the
+ * fpstate information.
+ *
+ * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved
+ * area and FP_XSTATE_MAGIC2 at the end of the memory layout
+ * (at offset extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the
+ * presence of the extended state information in the memory layout
+ * pointed to by the fpstate pointer in the sigcontext.
+ */
+struct _fpx_sw_bytes {
+ __u32 magic1; /* FP_XSTATE_MAGIC1 */
+ __u32 extended_size; /* total size of the layout referred by
+ * fpstate pointer in the sigcontext.
+ */
+ __u64 xstate_bv;
+ /* feature bit mask (including fp/sse/extended
+ * state) that is present in the memory
+ * layout.
+ */
+ __u32 xstate_size; /* actual xsave state size, based on the
+ * features saved in the layout.
+ * 'extended_size' will be greater than
+ * 'xstate_size'.
+ */
+ __u32 padding[7]; /* for future use. */
+};
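+
+/*
+ * A minimal detection sketch following the description above
+ * (illustrative; "fp" stands for the fpstate pointer from a sigcontext):
+ *
+ *	struct _fpx_sw_bytes *sw = (struct _fpx_sw_bytes *)&fp->sw_reserved;
+ *	int extended = sw->magic1 == FP_XSTATE_MAGIC1 &&
+ *		*(__u32 *)((void *)fp + sw->extended_size -
+ *			   FP_XSTATE_MAGIC2_SIZE) == FP_XSTATE_MAGIC2;
+ */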
+
+#ifdef __i386__
+/*
+ * As documented in the iBCS2 standard..
+ *
+ * The first part of "struct _fpstate" is just the normal i387
+ * hardware setup, the extra "status" word is used to save the
+ * coprocessor status word before entering the handler.
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * The FPU state data structure has had to grow to accommodate the
+ * extended FPU state required by the Streaming SIMD Extensions.
+ * There is no documented standard to accomplish this at the moment.
+ */
+struct _fpreg {
+ unsigned short significand[4];
+ unsigned short exponent;
+};
+
+struct _fpxreg {
+ unsigned short significand[4];
+ unsigned short exponent;
+ unsigned short padding[3];
+};
+
+struct _xmmreg {
+ unsigned long element[4];
+};
+
+struct _fpstate {
+ /* Regular FPU environment */
+ unsigned long cw;
+ unsigned long sw;
+ unsigned long tag;
+ unsigned long ipoff;
+ unsigned long cssel;
+ unsigned long dataoff;
+ unsigned long datasel;
+ struct _fpreg _st[8];
+ unsigned short status;
+ unsigned short magic; /* 0xffff = regular FPU data only */
+
+ /* FXSR FPU environment */
+ unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */
+ unsigned long mxcsr;
+ unsigned long reserved;
+ struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
+ struct _xmmreg _xmm[8];
+ unsigned long padding1[44];
+
+ union {
+ unsigned long padding2[12];
+ struct _fpx_sw_bytes sw_reserved; /* represents the extended
+ * state info */
+ };
+};
+
+#define X86_FXSR_MAGIC 0x0000
+
+#ifdef __KERNEL__
+struct sigcontext {
+ unsigned short gs, __gsh;
+ unsigned short fs, __fsh;
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned long di;
+ unsigned long si;
+ unsigned long bp;
+ unsigned long sp;
+ unsigned long bx;
+ unsigned long dx;
+ unsigned long cx;
+ unsigned long ax;
+ unsigned long trapno;
+ unsigned long err;
+ unsigned long ip;
+ unsigned short cs, __csh;
+ unsigned long flags;
+ unsigned long sp_at_signal;
+ unsigned short ss, __ssh;
+
+ /*
+ * fpstate is really (struct _fpstate *) or (struct _xstate *)
+ * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+ * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+	 * of the extended memory layout. See the comments at the definition of
+ * (struct _fpx_sw_bytes)
+ */
+ void __user *fpstate; /* zero when no FPU/extended context */
+ unsigned long oldmask;
+ unsigned long cr2;
+};
+#else /* __KERNEL__ */
+/*
+ * User-space might still rely on the old definition:
+ */
+struct sigcontext {
+ unsigned short gs, __gsh;
+ unsigned short fs, __fsh;
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned long edi;
+ unsigned long esi;
+ unsigned long ebp;
+ unsigned long esp;
+ unsigned long ebx;
+ unsigned long edx;
+ unsigned long ecx;
+ unsigned long eax;
+ unsigned long trapno;
+ unsigned long err;
+ unsigned long eip;
+ unsigned short cs, __csh;
+ unsigned long eflags;
+ unsigned long esp_at_signal;
+ unsigned short ss, __ssh;
+ struct _fpstate __user *fpstate;
+ unsigned long oldmask;
+ unsigned long cr2;
+};
+#endif /* !__KERNEL__ */
+
+#else /* __i386__ */
+
+/* FXSAVE frame */
+/* Note: reserved1/2 may someday contain valuable data. Always save/restore
+ them when you change signal frames. */
+struct _fpstate {
+ __u16 cwd;
+ __u16 swd;
+ __u16 twd; /* Note this is not the same as the
+ 32bit/x87/FSAVE twd */
+ __u16 fop;
+ __u64 rip;
+ __u64 rdp;
+ __u32 mxcsr;
+ __u32 mxcsr_mask;
+ __u32 st_space[32]; /* 8*16 bytes for each FP-reg */
+ __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */
+ __u32 reserved2[12];
+ union {
+ __u32 reserved3[12];
+ struct _fpx_sw_bytes sw_reserved; /* represents the extended
+ * state information */
+ };
+};
+
+#ifdef __KERNEL__
+struct sigcontext {
+ unsigned long r8;
+ unsigned long r9;
+ unsigned long r10;
+ unsigned long r11;
+ unsigned long r12;
+ unsigned long r13;
+ unsigned long r14;
+ unsigned long r15;
+ unsigned long di;
+ unsigned long si;
+ unsigned long bp;
+ unsigned long bx;
+ unsigned long dx;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long sp;
+ unsigned long ip;
+ unsigned long flags;
+ unsigned short cs;
+ unsigned short gs;
+ unsigned short fs;
+ unsigned short __pad0;
+ unsigned long err;
+ unsigned long trapno;
+ unsigned long oldmask;
+ unsigned long cr2;
+
+ /*
+ * fpstate is really (struct _fpstate *) or (struct _xstate *)
+ * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+ * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+	 * of the extended memory layout. See the comments at the definition of
+ * (struct _fpx_sw_bytes)
+ */
+ void __user *fpstate; /* zero when no FPU/extended context */
+ unsigned long reserved1[8];
+};
+#else /* __KERNEL__ */
+/*
+ * User-space might still rely on the old definition:
+ */
+struct sigcontext {
+ unsigned long r8;
+ unsigned long r9;
+ unsigned long r10;
+ unsigned long r11;
+ unsigned long r12;
+ unsigned long r13;
+ unsigned long r14;
+ unsigned long r15;
+ unsigned long rdi;
+ unsigned long rsi;
+ unsigned long rbp;
+ unsigned long rbx;
+ unsigned long rdx;
+ unsigned long rax;
+ unsigned long rcx;
+ unsigned long rsp;
+ unsigned long rip;
+ unsigned long eflags; /* RFLAGS */
+ unsigned short cs;
+ unsigned short gs;
+ unsigned short fs;
+ unsigned short __pad0;
+ unsigned long err;
+ unsigned long trapno;
+ unsigned long oldmask;
+ unsigned long cr2;
+ struct _fpstate __user *fpstate; /* zero when no FPU context */
+ unsigned long reserved1[8];
+};
+#endif /* !__KERNEL__ */
+
+#endif /* !__i386__ */
+
+struct _xsave_hdr {
+ __u64 xstate_bv;
+ __u64 reserved1[2];
+ __u64 reserved2[5];
+};
+
+/*
+ * Extended state pointed by the fpstate pointer in the sigcontext.
+ * In addition to the fpstate, information encoded in the xstate_hdr
+ * indicates the presence of other extended state information
+ * supported by the processor and OS.
+ */
+struct _xstate {
+ struct _fpstate fpstate;
+ struct _xsave_hdr xstate_hdr;
+ /* new processor state extensions go here */
+};
+
+#endif /* _ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/include/asm/sigcontext32.h b/arch/x86/include/asm/sigcontext32.h
new file mode 100644
index 00000000000..6126188cf3a
--- /dev/null
+++ b/arch/x86/include/asm/sigcontext32.h
@@ -0,0 +1,75 @@
+#ifndef _ASM_X86_SIGCONTEXT32_H
+#define _ASM_X86_SIGCONTEXT32_H
+
+/* signal context for 32bit programs. */
+
+#define X86_FXSR_MAGIC 0x0000
+
+struct _fpreg {
+ unsigned short significand[4];
+ unsigned short exponent;
+};
+
+struct _fpxreg {
+ unsigned short significand[4];
+ unsigned short exponent;
+ unsigned short padding[3];
+};
+
+struct _xmmreg {
+ __u32 element[4];
+};
+
+/* FSAVE frame with extensions */
+struct _fpstate_ia32 {
+ /* Regular FPU environment */
+ __u32 cw;
+ __u32 sw;
+ __u32 tag; /* not compatible to 64bit twd */
+ __u32 ipoff;
+ __u32 cssel;
+ __u32 dataoff;
+ __u32 datasel;
+ struct _fpreg _st[8];
+ unsigned short status;
+ unsigned short magic; /* 0xffff = regular FPU data only */
+
+ /* FXSR FPU environment */
+ __u32 _fxsr_env[6];
+ __u32 mxcsr;
+ __u32 reserved;
+ struct _fpxreg _fxsr_st[8];
+ struct _xmmreg _xmm[8]; /* It's actually 16 */
+ __u32 padding[44];
+ union {
+ __u32 padding2[12];
+ struct _fpx_sw_bytes sw_reserved;
+ };
+};
+
+struct sigcontext_ia32 {
+ unsigned short gs, __gsh;
+ unsigned short fs, __fsh;
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned int di;
+ unsigned int si;
+ unsigned int bp;
+ unsigned int sp;
+ unsigned int bx;
+ unsigned int dx;
+ unsigned int cx;
+ unsigned int ax;
+ unsigned int trapno;
+ unsigned int err;
+ unsigned int ip;
+ unsigned short cs, __csh;
+ unsigned int flags;
+ unsigned int sp_at_signal;
+ unsigned short ss, __ssh;
+ unsigned int fpstate; /* really (struct _fpstate_ia32 *) */
+ unsigned int oldmask;
+ unsigned int cr2;
+};
+
+#endif /* _ASM_X86_SIGCONTEXT32_H */
diff --git a/arch/x86/include/asm/siginfo.h b/arch/x86/include/asm/siginfo.h
new file mode 100644
index 00000000000..fc1aa553564
--- /dev/null
+++ b/arch/x86/include/asm/siginfo.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_SIGINFO_H
+#define _ASM_X86_SIGINFO_H
+
+#ifdef __x86_64__
+# define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int))
+#endif
+
+#include <asm-generic/siginfo.h>
+
+#endif /* _ASM_X86_SIGINFO_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
new file mode 100644
index 00000000000..96ac44f275d
--- /dev/null
+++ b/arch/x86/include/asm/signal.h
@@ -0,0 +1,262 @@
+#ifndef _ASM_X86_SIGNAL_H
+#define _ASM_X86_SIGNAL_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/compiler.h>
+
+/* Avoid too many header ordering problems. */
+struct siginfo;
+
+#ifdef __KERNEL__
+#include <linux/linkage.h>
+
+/* Most things should be clean enough to redefine this at will, if care
+ is taken to make libc match. */
+
+#define _NSIG 64
+
+#ifdef __i386__
+# define _NSIG_BPW 32
+#else
+# define _NSIG_BPW 64
+#endif
+
+#define _NSIG_WORDS (_NSIG / _NSIG_BPW)
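+
+/* Worked example: with _NSIG = 64 this yields two 32-bit words on i386
+   and a single 64-bit word on x86-64. */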
+
+typedef unsigned long old_sigset_t; /* at least 32 bits */
+
+typedef struct {
+ unsigned long sig[_NSIG_WORDS];
+} sigset_t;
+
+#else
+/* Here we must cater to libcs that poke about in kernel headers. */
+
+#define NSIG 32
+typedef unsigned long sigset_t;
+
+#endif /* __KERNEL__ */
+#endif /* __ASSEMBLY__ */
+
+#define SIGHUP 1
+#define SIGINT 2
+#define SIGQUIT 3
+#define SIGILL 4
+#define SIGTRAP 5
+#define SIGABRT 6
+#define SIGIOT 6
+#define SIGBUS 7
+#define SIGFPE 8
+#define SIGKILL 9
+#define SIGUSR1 10
+#define SIGSEGV 11
+#define SIGUSR2 12
+#define SIGPIPE 13
+#define SIGALRM 14
+#define SIGTERM 15
+#define SIGSTKFLT 16
+#define SIGCHLD 17
+#define SIGCONT 18
+#define SIGSTOP 19
+#define SIGTSTP 20
+#define SIGTTIN 21
+#define SIGTTOU 22
+#define SIGURG 23
+#define SIGXCPU 24
+#define SIGXFSZ 25
+#define SIGVTALRM 26
+#define SIGPROF 27
+#define SIGWINCH 28
+#define SIGIO 29
+#define SIGPOLL SIGIO
+/*
+#define SIGLOST 29
+*/
+#define SIGPWR 30
+#define SIGSYS 31
+#define SIGUNUSED 31
+
+/* These should not be considered constants from userland. */
+#define SIGRTMIN 32
+#define SIGRTMAX _NSIG
+
+/*
+ * SA_FLAGS values:
+ *
+ * SA_ONSTACK indicates that a registered stack_t will be used.
+ * SA_RESTART flag to get restarting signals (which were the default long ago)
+ * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
+ * SA_RESETHAND clears the handler when the signal is delivered.
+ * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
+ * SA_NODEFER prevents the current signal from being masked in the handler.
+ *
+ * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
+ * Unix names RESETHAND and NODEFER respectively.
+ */
+#define SA_NOCLDSTOP 0x00000001u
+#define SA_NOCLDWAIT 0x00000002u
+#define SA_SIGINFO 0x00000004u
+#define SA_ONSTACK 0x08000000u
+#define SA_RESTART 0x10000000u
+#define SA_NODEFER 0x40000000u
+#define SA_RESETHAND 0x80000000u
+
+#define SA_NOMASK SA_NODEFER
+#define SA_ONESHOT SA_RESETHAND
+
+#define SA_RESTORER 0x04000000
+
+/*
+ * sigaltstack controls
+ */
+#define SS_ONSTACK 1
+#define SS_DISABLE 2
+
+#define MINSIGSTKSZ 2048
+#define SIGSTKSZ 8192
+
+#include <asm-generic/signal.h>
+
+#ifndef __ASSEMBLY__
+
+#ifdef __i386__
+# ifdef __KERNEL__
+struct old_sigaction {
+ __sighandler_t sa_handler;
+ old_sigset_t sa_mask;
+ unsigned long sa_flags;
+ __sigrestore_t sa_restorer;
+};
+
+struct sigaction {
+ __sighandler_t sa_handler;
+ unsigned long sa_flags;
+ __sigrestore_t sa_restorer;
+ sigset_t sa_mask; /* mask last for extensibility */
+};
+
+struct k_sigaction {
+ struct sigaction sa;
+};
+
+extern void do_notify_resume(struct pt_regs *, void *, __u32);
+
+# else /* __KERNEL__ */
+/* Here we must cater to libcs that poke about in kernel headers. */
+
+struct sigaction {
+ union {
+ __sighandler_t _sa_handler;
+ void (*_sa_sigaction)(int, struct siginfo *, void *);
+ } _u;
+ sigset_t sa_mask;
+ unsigned long sa_flags;
+ void (*sa_restorer)(void);
+};
+
+#define sa_handler _u._sa_handler
+#define sa_sigaction _u._sa_sigaction
+
+# endif /* ! __KERNEL__ */
+#else /* __i386__ */
+
+struct sigaction {
+ __sighandler_t sa_handler;
+ unsigned long sa_flags;
+ __sigrestore_t sa_restorer;
+ sigset_t sa_mask; /* mask last for extensibility */
+};
+
+struct k_sigaction {
+ struct sigaction sa;
+};
+
+#endif /* !__i386__ */
+
+typedef struct sigaltstack {
+ void __user *ss_sp;
+ int ss_flags;
+ size_t ss_size;
+} stack_t;
+
+#ifdef __KERNEL__
+#include <asm/sigcontext.h>
+
+#ifdef __i386__
+
+#define __HAVE_ARCH_SIG_BITOPS
+
+#define sigaddset(set,sig) \
+ (__builtin_constant_p(sig) \
+ ? __const_sigaddset((set), (sig)) \
+ : __gen_sigaddset((set), (sig)))
+
+static inline void __gen_sigaddset(sigset_t *set, int _sig)
+{
+ asm("btsl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc");
+}
+
+static inline void __const_sigaddset(sigset_t *set, int _sig)
+{
+ unsigned long sig = _sig - 1;
+ set->sig[sig / _NSIG_BPW] |= 1 << (sig % _NSIG_BPW);
+}
+
+#define sigdelset(set, sig) \
+ (__builtin_constant_p(sig) \
+ ? __const_sigdelset((set), (sig)) \
+ : __gen_sigdelset((set), (sig)))
+
+
+static inline void __gen_sigdelset(sigset_t *set, int _sig)
+{
+ asm("btrl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc");
+}
+
+static inline void __const_sigdelset(sigset_t *set, int _sig)
+{
+ unsigned long sig = _sig - 1;
+ set->sig[sig / _NSIG_BPW] &= ~(1 << (sig % _NSIG_BPW));
+}
+
+static inline int __const_sigismember(sigset_t *set, int _sig)
+{
+ unsigned long sig = _sig - 1;
+ return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW));
+}
+
+static inline int __gen_sigismember(sigset_t *set, int _sig)
+{
+ int ret;
+ asm("btl %2,%1\n\tsbbl %0,%0"
+ : "=r"(ret) : "m"(*set), "Ir"(_sig-1) : "cc");
+ return ret;
+}
+
+#define sigismember(set, sig) \
+ (__builtin_constant_p(sig) \
+ ? __const_sigismember((set), (sig)) \
+ : __gen_sigismember((set), (sig)))
+
+static inline int sigfindinword(unsigned long word)
+{
+ asm("bsfl %1,%0" : "=r"(word) : "rm"(word) : "cc");
+ return word;
+}
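+
+/*
+ * Illustrative use of the constant/variable dispatch above: a constant
+ * signal number folds to the pure C helper at compile time, while a
+ * runtime value goes through the bts/btr/bt instructions.
+ *
+ *	sigset_t set = { };
+ *	sigaddset(&set, SIGINT);	/. constant: __const_sigaddset ./
+ *	sigaddset(&set, sig);		/. variable: __gen_sigaddset ./
+ */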
+
+struct pt_regs;
+
+#else /* __i386__ */
+
+#undef __HAVE_ARCH_SIG_BITOPS
+
+#endif /* !__i386__ */
+
+#define ptrace_signal_deliver(regs, cookie) do { } while (0)
+
+#endif /* __KERNEL__ */
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_SIGNAL_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
new file mode 100644
index 00000000000..2766021aef8
--- /dev/null
+++ b/arch/x86/include/asm/smp.h
@@ -0,0 +1,229 @@
+#ifndef _ASM_X86_SMP_H
+#define _ASM_X86_SMP_H
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <asm/percpu.h>
+
+/*
+ * We need the APIC definitions automatically as part of 'smp.h'
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+# include <asm/mpspec.h>
+# include <asm/apic.h>
+# ifdef CONFIG_X86_IO_APIC
+# include <asm/io_apic.h>
+# endif
+#endif
+#include <asm/pda.h>
+#include <asm/thread_info.h>
+
+extern cpumask_t cpu_callout_map;
+extern cpumask_t cpu_initialized;
+extern cpumask_t cpu_callin_map;
+
+extern void (*mtrr_hook)(void);
+extern void zap_low_mappings(void);
+
+extern int __cpuinit get_local_pda(int cpu);
+
+extern int smp_num_siblings;
+extern unsigned int num_processors;
+extern cpumask_t cpu_initialized;
+
+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+DECLARE_PER_CPU(u16, cpu_llc_id);
+#ifdef CONFIG_X86_32
+DECLARE_PER_CPU(int, cpu_number);
+#endif
+
+DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+
+/* Static state in head.S used to set up a CPU */
+extern struct {
+ void *sp;
+ unsigned short ss;
+} stack_start;
+
+struct smp_ops {
+ void (*smp_prepare_boot_cpu)(void);
+ void (*smp_prepare_cpus)(unsigned max_cpus);
+ void (*smp_cpus_done)(unsigned max_cpus);
+
+ void (*smp_send_stop)(void);
+ void (*smp_send_reschedule)(int cpu);
+
+ int (*cpu_up)(unsigned cpu);
+ int (*cpu_disable)(void);
+ void (*cpu_die)(unsigned int cpu);
+ void (*play_dead)(void);
+
+ void (*send_call_func_ipi)(cpumask_t mask);
+ void (*send_call_func_single_ipi)(int cpu);
+};
+
+/* Globals due to paravirt */
+extern void set_cpu_sibling_map(int cpu);
+
+#ifdef CONFIG_SMP
+#ifndef CONFIG_PARAVIRT
+#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
+#endif
+extern struct smp_ops smp_ops;
+
+static inline void smp_send_stop(void)
+{
+ smp_ops.smp_send_stop();
+}
+
+static inline void smp_prepare_boot_cpu(void)
+{
+ smp_ops.smp_prepare_boot_cpu();
+}
+
+static inline void smp_prepare_cpus(unsigned int max_cpus)
+{
+ smp_ops.smp_prepare_cpus(max_cpus);
+}
+
+static inline void smp_cpus_done(unsigned int max_cpus)
+{
+ smp_ops.smp_cpus_done(max_cpus);
+}
+
+static inline int __cpu_up(unsigned int cpu)
+{
+ return smp_ops.cpu_up(cpu);
+}
+
+static inline int __cpu_disable(void)
+{
+ return smp_ops.cpu_disable();
+}
+
+static inline void __cpu_die(unsigned int cpu)
+{
+ smp_ops.cpu_die(cpu);
+}
+
+static inline void play_dead(void)
+{
+ smp_ops.play_dead();
+}
+
+static inline void smp_send_reschedule(int cpu)
+{
+ smp_ops.smp_send_reschedule(cpu);
+}
+
+static inline void arch_send_call_function_single_ipi(int cpu)
+{
+ smp_ops.send_call_func_single_ipi(cpu);
+}
+
+static inline void arch_send_call_function_ipi(cpumask_t mask)
+{
+ smp_ops.send_call_func_ipi(mask);
+}
+
+void cpu_disable_common(void);
+void native_smp_prepare_boot_cpu(void);
+void native_smp_prepare_cpus(unsigned int max_cpus);
+void native_smp_cpus_done(unsigned int max_cpus);
+int native_cpu_up(unsigned int cpunum);
+int native_cpu_disable(void);
+void native_cpu_die(unsigned int cpu);
+void native_play_dead(void);
+void play_dead_common(void);
+
+void native_send_call_func_ipi(cpumask_t mask);
+void native_send_call_func_single_ipi(int cpu);
+
+extern void prefill_possible_map(void);
+
+void smp_store_cpu_info(int id);
+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+
+/* We don't mark CPUs online until __cpu_up(), so we need another measure */
+static inline int num_booting_cpus(void)
+{
+ return cpus_weight(cpu_callout_map);
+}
+#else
+static inline void prefill_possible_map(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+extern unsigned disabled_cpus __cpuinitdata;
+
+#ifdef CONFIG_X86_32_SMP
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+extern int safe_smp_processor_id(void);
+
+#elif defined(CONFIG_X86_64_SMP)
+#define raw_smp_processor_id() read_pda(cpunumber)
+
+#define stack_smp_processor_id() \
+({ \
+ struct thread_info *ti; \
+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
+ ti->cpu; \
+})
+#define safe_smp_processor_id() smp_processor_id()
+
+#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
+#define cpu_physical_id(cpu) boot_cpu_physical_apicid
+#define safe_smp_processor_id() 0
+#define stack_smp_processor_id() 0
+#endif
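+
+/*
+ * Note (illustrative): stack_smp_processor_id() relies on thread_info
+ * sitting at the bottom of the kernel stack, so masking %rsp with
+ * CURRENT_MASK rounds down to the stack base where ti->cpu lives.
+ */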
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+#ifndef CONFIG_X86_64
+static inline int logical_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+}
+
+#include <mach_apicdef.h>
+static inline unsigned int read_apic_id(void)
+{
+ unsigned int reg;
+
+ reg = *(u32 *)(APIC_BASE + APIC_ID);
+
+ return GET_APIC_ID(reg);
+}
+#endif
+
+
+# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64)
+extern int hard_smp_processor_id(void);
+# else
+#include <mach_apicdef.h>
+static inline int hard_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return read_apic_id();
+}
+# endif /* APIC_DEFINITION */
+
+#else /* CONFIG_X86_LOCAL_APIC */
+
+# ifndef CONFIG_SMP
+# define hard_smp_processor_id() 0
+# endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h
new file mode 100644
index 00000000000..8ab9cc8b2ec
--- /dev/null
+++ b/arch/x86/include/asm/socket.h
@@ -0,0 +1,57 @@
+#ifndef _ASM_X86_SOCKET_H
+#define _ASM_X86_SOCKET_H
+
+#include <asm/sockios.h>
+
+/* For setsockopt(2) */
+#define SOL_SOCKET 1
+
+#define SO_DEBUG 1
+#define SO_REUSEADDR 2
+#define SO_TYPE 3
+#define SO_ERROR 4
+#define SO_DONTROUTE 5
+#define SO_BROADCAST 6
+#define SO_SNDBUF 7
+#define SO_RCVBUF 8
+#define SO_SNDBUFFORCE 32
+#define SO_RCVBUFFORCE 33
+#define SO_KEEPALIVE 9
+#define SO_OOBINLINE 10
+#define SO_NO_CHECK 11
+#define SO_PRIORITY 12
+#define SO_LINGER 13
+#define SO_BSDCOMPAT 14
+/* To add: #define SO_REUSEPORT 15 */
+#define SO_PASSCRED 16
+#define SO_PEERCRED 17
+#define SO_RCVLOWAT 18
+#define SO_SNDLOWAT 19
+#define SO_RCVTIMEO 20
+#define SO_SNDTIMEO 21
+
+/* Security levels - as per NRL IPv6 - don't actually do anything */
+#define SO_SECURITY_AUTHENTICATION 22
+#define SO_SECURITY_ENCRYPTION_TRANSPORT 23
+#define SO_SECURITY_ENCRYPTION_NETWORK 24
+
+#define SO_BINDTODEVICE 25
+
+/* Socket filtering */
+#define SO_ATTACH_FILTER 26
+#define SO_DETACH_FILTER 27
+
+#define SO_PEERNAME 28
+#define SO_TIMESTAMP 29
+#define SCM_TIMESTAMP SO_TIMESTAMP
+
+#define SO_ACCEPTCONN 30
+
+#define SO_PEERSEC 31
+#define SO_PASSSEC 34
+#define SO_TIMESTAMPNS 35
+#define SCM_TIMESTAMPNS SO_TIMESTAMPNS
+
+#define SO_MARK 36
+
+#endif /* _ASM_X86_SOCKET_H */
diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h
new file mode 100644
index 00000000000..49cc72b5d3c
--- /dev/null
+++ b/arch/x86/include/asm/sockios.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_SOCKIOS_H
+#define _ASM_X86_SOCKIOS_H
+
+/* Socket-level I/O control calls. */
+#define FIOSETOWN 0x8901
+#define SIOCSPGRP 0x8902
+#define FIOGETOWN 0x8903
+#define SIOCGPGRP 0x8904
+#define SIOCATMARK 0x8905
+#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */
+#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */
+
+#endif /* _ASM_X86_SOCKIOS_H */
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
new file mode 100644
index 00000000000..be44f7dab39
--- /dev/null
+++ b/arch/x86/include/asm/sparsemem.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_SPARSEMEM_H
+#define _ASM_X86_SPARSEMEM_H
+
+#ifdef CONFIG_SPARSEMEM
+/*
+ * generic non-linear memory support:
+ *
+ * 1) we will not split memory into more chunks than will fit into the flags
+ * field of the struct page
+ *
+ * SECTION_SIZE_BITS 2^n: size of each section
+ * MAX_PHYSADDR_BITS 2^n: max size of physical address space
+ * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space
+ *
+ */
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+# define SECTION_SIZE_BITS 29
+# define MAX_PHYSADDR_BITS 36
+# define MAX_PHYSMEM_BITS 36
+# else
+# define SECTION_SIZE_BITS 26
+# define MAX_PHYSADDR_BITS 32
+# define MAX_PHYSMEM_BITS 32
+# endif
+#else /* CONFIG_X86_32 */
+# define SECTION_SIZE_BITS 27 /* matt - 128 MB per section is convenient right now */
+# define MAX_PHYSADDR_BITS 44
+# define MAX_PHYSMEM_BITS 44
+#endif
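+
+/*
+ * Worked sizes: 2^26 = 64 MB sections on classic 32-bit, 2^29 = 512 MB
+ * sections with PAE (36-bit / 64 GB physical), and 2^27 = 128 MB
+ * sections in a 44-bit / 16 TB physical space on 64-bit.
+ */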
+
+#endif /* CONFIG_SPARSEMEM */
+#endif /* _ASM_X86_SPARSEMEM_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
new file mode 100644
index 00000000000..d17c91981da
--- /dev/null
+++ b/arch/x86/include/asm/spinlock.h
@@ -0,0 +1,364 @@
+#ifndef _ASM_X86_SPINLOCK_H
+#define _ASM_X86_SPINLOCK_H
+
+#include <asm/atomic.h>
+#include <asm/rwlock.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+#include <asm/paravirt.h>
+/*
+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
+ *
+ * Simple spin lock operations. There are two variants, one clears IRQ's
+ * on the local processor, one does not.
+ *
+ * These are fair FIFO ticket locks, which are currently limited to 256
+ * CPUs.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
+ */
+
+#ifdef CONFIG_X86_32
+# define LOCK_PTR_REG "a"
+# define REG_PTR_MODE "k"
+#else
+# define LOCK_PTR_REG "D"
+# define REG_PTR_MODE "q"
+#endif
+
+#if defined(CONFIG_X86_32) && \
+ (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
+/*
+ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
+ * (PPro errata 66, 92)
+ */
+# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
+#else
+# define UNLOCK_LOCK_PREFIX
+#endif
+
+/*
+ * Ticket locks are conceptually two parts, one indicating the current head of
+ * the queue, and the other indicating the current tail. The lock is acquired
+ * by atomically noting the tail and incrementing it by one (thus adding
+ * ourself to the queue and noting our position), then waiting until the head
+ * becomes equal to the initial value of the tail.
+ *
+ * We use an xadd covering *both* parts of the lock, to increment the tail and
+ * also load the position of the head, which takes care of memory ordering
+ * issues and should be optimal for the uncontended case. Note the tail must be
+ * in the high part, because a wide xadd increment of the low part would carry
+ * up and contaminate the high part.
+ *
+ * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
+ * save some instructions and make the code more elegant. There really isn't
+ * much between them in performance though, especially as locks are out of line.
+ */
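The same idea, sketched in portable C for illustration (not part of the
patch: ticket_lock_sketch is a made-up name, GCC's __sync_fetch_and_add
stands in for the hand-coded xadd, and the NR_CPUS < 256 layout is assumed):

	static inline void ticket_lock_sketch(unsigned short *slock)
	{
		/* take a ticket: atomically bump the tail (high byte),
		 * keeping the pre-increment value as our ticket */
		unsigned short old = __sync_fetch_and_add(slock, 0x0100);
		unsigned char my_ticket = old >> 8;

		/* spin until the head (low byte - x86 is little-endian,
		 * so it is the first byte in memory) reaches our ticket */
		while (*(volatile unsigned char *)slock != my_ticket)
			;	/* cpu_relax() in the real code */
	}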
+#if (NR_CPUS < 256)
+#define TICKET_SHIFT 8
+
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
+{
+ short inc = 0x0100;
+
+ asm volatile (
+ LOCK_PREFIX "xaddw %w0, %1\n"
+ "1:\t"
+ "cmpb %h0, %b0\n\t"
+ "je 2f\n\t"
+ "rep ; nop\n\t"
+ "movb %1, %b0\n\t"
+ /* don't need lfence here, because loads are in-order */
+ "jmp 1b\n"
+ "2:"
+ : "+Q" (inc), "+m" (lock->slock)
+ :
+ : "memory", "cc");
+}
+
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
+{
+ int tmp, new;
+
+ asm volatile("movzwl %2, %0\n\t"
+ "cmpb %h0,%b0\n\t"
+ "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
+ "jne 1f\n\t"
+ LOCK_PREFIX "cmpxchgw %w1,%2\n\t"
+ "1:"
+ "sete %b1\n\t"
+ "movzbl %b1,%0\n\t"
+ : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
+ :
+ : "memory", "cc");
+
+ return tmp;
+}
+
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
+{
+ asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
+ : "+m" (lock->slock)
+ :
+ : "memory", "cc");
+}
+#else
+#define TICKET_SHIFT 16
+
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
+{
+ int inc = 0x00010000;
+ int tmp;
+
+ asm volatile(LOCK_PREFIX "xaddl %0, %1\n"
+ "movzwl %w0, %2\n\t"
+ "shrl $16, %0\n\t"
+ "1:\t"
+ "cmpl %0, %2\n\t"
+ "je 2f\n\t"
+ "rep ; nop\n\t"
+ "movzwl %1, %2\n\t"
+ /* don't need lfence here, because loads are in-order */
+ "jmp 1b\n"
+ "2:"
+ : "+r" (inc), "+m" (lock->slock), "=&r" (tmp)
+ :
+ : "memory", "cc");
+}
+
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
+{
+ int tmp;
+ int new;
+
+ asm volatile("movl %2,%0\n\t"
+ "movl %0,%1\n\t"
+ "roll $16, %0\n\t"
+ "cmpl %0,%1\n\t"
+ "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
+ "jne 1f\n\t"
+ LOCK_PREFIX "cmpxchgl %1,%2\n\t"
+ "1:"
+ "sete %b1\n\t"
+ "movzbl %b1,%0\n\t"
+ : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
+ :
+ : "memory", "cc");
+
+ return tmp;
+}
+
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
+{
+ asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
+ : "+m" (lock->slock)
+ :
+ : "memory", "cc");
+}
+#endif
+
+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
+{
+ int tmp = ACCESS_ONCE(lock->slock);
+
+ return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
+}
+
+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
+{
+ int tmp = ACCESS_ONCE(lock->slock);
+
+ return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
+}
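To see what these compute, take TICKET_SHIFT == 8 and slock == 0x0503, i.e.
tail 5, head 3: is_locked evaluates (5 ^ 3) & 0xff == 6, non-zero, so the
lock is held; is_contended evaluates (5 - 3) & 0xff == 2, which is > 1, so
someone is spinning behind the current holder.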
+
+#ifdef CONFIG_PARAVIRT
+/*
+ * Define a virtualization-friendly, old-style lock-byte lock, for use in
+ * pv_lock_ops if desired.
+ *
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure. It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+struct __byte_spinlock {
+ s8 lock;
+ s8 spinners;
+};
+
+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ return bl->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ return bl->spinners != 0;
+}
+
+static inline void __byte_spin_lock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ s8 val = 1;
+
+ asm("1: xchgb %1, %0\n"
+ " test %1,%1\n"
+ " jz 3f\n"
+ " " LOCK_PREFIX "incb %2\n"
+ "2: rep;nop\n"
+ " cmpb $1, %0\n"
+ " je 2b\n"
+ " " LOCK_PREFIX "decb %2\n"
+ " jmp 1b\n"
+ "3:"
+ : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
+}
+
+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ u8 old = 1;
+
+ asm("xchgb %1,%0"
+ : "+m" (bl->lock), "+q" (old) : : "memory");
+
+ return old == 0;
+}
+
+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ smp_wmb();
+ bl->lock = 0;
+}
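For comparison, the trylock's xchgb can be written with a GCC builtin (an
illustrative sketch only; byte_trylock_sketch is a made-up name):

	static inline int byte_trylock_sketch(struct __byte_spinlock *bl)
	{
		/* atomically exchange 1 into the lock byte; an old value
		 * of 0 means the lock was free and is now ours */
		return __sync_lock_test_and_set(&bl->lock, 1) == 0;
	}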
+#else /* !CONFIG_PARAVIRT */
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+ return __ticket_spin_is_locked(lock);
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+ return __ticket_spin_is_contended(lock);
+}
+
+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+ __ticket_spin_lock(lock);
+}
+
+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+ return __ticket_spin_trylock(lock);
+}
+
+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+ __ticket_spin_unlock(lock);
+}
+
+static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
+ unsigned long flags)
+{
+ __raw_spin_lock(lock);
+}
+
+#endif /* CONFIG_PARAVIRT */
+
+static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
+{
+ while (__raw_spin_is_locked(lock))
+ cpu_relax();
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ *
+ * On x86, we implement read-write locks as a 32-bit counter
+ * with the high bit (sign) being the "contended" bit.
+ */
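Concretely, with RW_LOCK_BIAS == 0x01000000 (the value asm/rwlock.h uses on
x86): the counter starts at the bias, each reader subtracts 1 and a writer
subtracts the whole bias. A positive counter means readers may still enter, a
counter equal to the bias means the lock is completely free (what
__raw_write_can_lock checks), and a negative result - the sign bit - sends
the failing path to __read_lock_failed/__write_lock_failed.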
+
+/**
+ * read_can_lock - would read_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int __raw_read_can_lock(raw_rwlock_t *lock)
+{
+ return (int)(lock)->lock > 0;
+}
+
+/**
+ * write_can_lock - would write_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int __raw_write_can_lock(raw_rwlock_t *lock)
+{
+ return (lock)->lock == RW_LOCK_BIAS;
+}
+
+static inline void __raw_read_lock(raw_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
+ "jns 1f\n"
+ "call __read_lock_failed\n\t"
+ "1:\n"
+ ::LOCK_PTR_REG (rw) : "memory");
+}
+
+static inline void __raw_write_lock(raw_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
+ "jz 1f\n"
+ "call __write_lock_failed\n\t"
+ "1:\n"
+ ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
+}
+
+static inline int __raw_read_trylock(raw_rwlock_t *lock)
+{
+ atomic_t *count = (atomic_t *)lock;
+
+ atomic_dec(count);
+ if (atomic_read(count) >= 0)
+ return 1;
+ atomic_inc(count);
+ return 0;
+}
+
+static inline int __raw_write_trylock(raw_rwlock_t *lock)
+{
+ atomic_t *count = (atomic_t *)lock;
+
+ if (atomic_sub_and_test(RW_LOCK_BIAS, count))
+ return 1;
+ atomic_add(RW_LOCK_BIAS, count);
+ return 0;
+}
+
+static inline void __raw_read_unlock(raw_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
+}
+
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX "addl %1, %0"
+ : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
+}
+
+#define _raw_spin_relax(lock) cpu_relax()
+#define _raw_read_relax(lock) cpu_relax()
+#define _raw_write_relax(lock) cpu_relax()
+
+#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
new file mode 100644
index 00000000000..845f81c8709
--- /dev/null
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_SPINLOCK_TYPES_H
+#define _ASM_X86_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct raw_spinlock {
+ unsigned int slock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED { 0 }
+
+typedef struct {
+ unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
+
+#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
new file mode 100644
index 00000000000..b508d639d1a
--- /dev/null
+++ b/arch/x86/include/asm/srat.h
@@ -0,0 +1,39 @@
+/*
+ * Some of the code in this file has been gleaned from the 64 bit
+ * discontigmem support code base.
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to Pat Gaughen <gone@us.ibm.com>
+ */
+
+#ifndef _ASM_X86_SRAT_H
+#define _ASM_X86_SRAT_H
+
+#ifdef CONFIG_ACPI_NUMA
+extern int get_memcfg_from_srat(void);
+#else
+static inline int get_memcfg_from_srat(void)
+{
+ return 0;
+}
+#endif
+
+#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
new file mode 100644
index 00000000000..f517944b2b1
--- /dev/null
+++ b/arch/x86/include/asm/stacktrace.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_STACKTRACE_H
+#define _ASM_X86_STACKTRACE_H
+
+extern int kstack_depth_to_print;
+
+/* Generic stack tracer with callbacks */
+
+struct stacktrace_ops {
+ void (*warning)(void *data, char *msg);
+ /* msg must contain %s for the symbol */
+ void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
+ void (*address)(void *data, unsigned long address, int reliable);
+ /* On negative return stop dumping */
+ int (*stack)(void *data, char *name);
+};
+
+void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data);
+
+#endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/include/asm/stat.h b/arch/x86/include/asm/stat.h
new file mode 100644
index 00000000000..e0b1d9bbcbc
--- /dev/null
+++ b/arch/x86/include/asm/stat.h
@@ -0,0 +1,114 @@
+#ifndef _ASM_X86_STAT_H
+#define _ASM_X86_STAT_H
+
+#define STAT_HAVE_NSEC 1
+
+#ifdef __i386__
+struct stat {
+ unsigned long st_dev;
+ unsigned long st_ino;
+ unsigned short st_mode;
+ unsigned short st_nlink;
+ unsigned short st_uid;
+ unsigned short st_gid;
+ unsigned long st_rdev;
+ unsigned long st_size;
+ unsigned long st_blksize;
+ unsigned long st_blocks;
+ unsigned long st_atime;
+ unsigned long st_atime_nsec;
+ unsigned long st_mtime;
+ unsigned long st_mtime_nsec;
+ unsigned long st_ctime;
+ unsigned long st_ctime_nsec;
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+
+#define STAT64_HAS_BROKEN_ST_INO 1
+
+/* This matches struct stat64 in glibc2.1, hence the absolutely
+ * insane amounts of padding around dev_t's.
+ */
+struct stat64 {
+ unsigned long long st_dev;
+ unsigned char __pad0[4];
+
+ unsigned long __st_ino;
+
+ unsigned int st_mode;
+ unsigned int st_nlink;
+
+ unsigned long st_uid;
+ unsigned long st_gid;
+
+ unsigned long long st_rdev;
+ unsigned char __pad3[4];
+
+ long long st_size;
+ unsigned long st_blksize;
+
+ /* Number of 512-byte blocks allocated. */
+ unsigned long long st_blocks;
+
+ unsigned long st_atime;
+ unsigned long st_atime_nsec;
+
+ unsigned long st_mtime;
+ unsigned int st_mtime_nsec;
+
+ unsigned long st_ctime;
+ unsigned long st_ctime_nsec;
+
+ unsigned long long st_ino;
+};
+
+#else /* __i386__ */
+
+struct stat {
+ unsigned long st_dev;
+ unsigned long st_ino;
+ unsigned long st_nlink;
+
+ unsigned int st_mode;
+ unsigned int st_uid;
+ unsigned int st_gid;
+ unsigned int __pad0;
+ unsigned long st_rdev;
+ long st_size;
+ long st_blksize;
+ long st_blocks; /* Number of 512-byte blocks allocated. */
+
+ unsigned long st_atime;
+ unsigned long st_atime_nsec;
+ unsigned long st_mtime;
+ unsigned long st_mtime_nsec;
+ unsigned long st_ctime;
+ unsigned long st_ctime_nsec;
+ long __unused[3];
+};
+#endif
+
+/* for 32-bit emulation and 32-bit kernels */
+struct __old_kernel_stat {
+ unsigned short st_dev;
+ unsigned short st_ino;
+ unsigned short st_mode;
+ unsigned short st_nlink;
+ unsigned short st_uid;
+ unsigned short st_gid;
+ unsigned short st_rdev;
+#ifdef __i386__
+ unsigned long st_size;
+ unsigned long st_atime;
+ unsigned long st_mtime;
+ unsigned long st_ctime;
+#else
+ unsigned int st_size;
+ unsigned int st_atime;
+ unsigned int st_mtime;
+ unsigned int st_ctime;
+#endif
+};
+
+#endif /* _ASM_X86_STAT_H */
diff --git a/arch/x86/include/asm/statfs.h b/arch/x86/include/asm/statfs.h
new file mode 100644
index 00000000000..2d0adbf99a8
--- /dev/null
+++ b/arch/x86/include/asm/statfs.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_STATFS_H
+#define _ASM_X86_STATFS_H
+
+/*
+ * We need compat_statfs64 to be packed, because the i386 ABI won't
+ * add padding at the end to bring it to a multiple of 8 bytes, but
+ * the x86_64 ABI will.
+ */
+#define ARCH_PACK_COMPAT_STATFS64 __attribute__((packed,aligned(4)))
+
+#include <asm-generic/statfs.h>
+#endif /* _ASM_X86_STATFS_H */
diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
new file mode 100644
index 00000000000..6dfd6d9373a
--- /dev/null
+++ b/arch/x86/include/asm/string.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "string_32.h"
+#else
+# include "string_64.h"
+#endif
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
new file mode 100644
index 00000000000..0e0e3ba827f
--- /dev/null
+++ b/arch/x86/include/asm/string_32.h
@@ -0,0 +1,326 @@
+#ifndef _ASM_X86_STRING_32_H
+#define _ASM_X86_STRING_32_H
+
+#ifdef __KERNEL__
+
+/* Let gcc decide whether to inline or use the out-of-line functions */
+
+#define __HAVE_ARCH_STRCPY
+extern char *strcpy(char *dest, const char *src);
+
+#define __HAVE_ARCH_STRNCPY
+extern char *strncpy(char *dest, const char *src, size_t count);
+
+#define __HAVE_ARCH_STRCAT
+extern char *strcat(char *dest, const char *src);
+
+#define __HAVE_ARCH_STRNCAT
+extern char *strncat(char *dest, const char *src, size_t count);
+
+#define __HAVE_ARCH_STRCMP
+extern int strcmp(const char *cs, const char *ct);
+
+#define __HAVE_ARCH_STRNCMP
+extern int strncmp(const char *cs, const char *ct, size_t count);
+
+#define __HAVE_ARCH_STRCHR
+extern char *strchr(const char *s, int c);
+
+#define __HAVE_ARCH_STRLEN
+extern size_t strlen(const char *s);
+
+static __always_inline void *__memcpy(void *to, const void *from, size_t n)
+{
+ int d0, d1, d2;
+ asm volatile("rep ; movsl\n\t"
+ "movl %4,%%ecx\n\t"
+ "andl $3,%%ecx\n\t"
+ "jz 1f\n\t"
+ "rep ; movsb\n\t"
+ "1:"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
+ : "memory");
+ return to;
+}
+
+/*
+ * This looks ugly, but the compiler can optimize it totally,
+ * as the count is constant.
+ */
+static __always_inline void *__constant_memcpy(void *to, const void *from,
+ size_t n)
+{
+ long esi, edi;
+ if (!n)
+ return to;
+
+ switch (n) {
+ case 1:
+ *(char *)to = *(char *)from;
+ return to;
+ case 2:
+ *(short *)to = *(short *)from;
+ return to;
+ case 4:
+ *(int *)to = *(int *)from;
+ return to;
+
+ case 3:
+ *(short *)to = *(short *)from;
+ *((char *)to + 2) = *((char *)from + 2);
+ return to;
+ case 5:
+ *(int *)to = *(int *)from;
+ *((char *)to + 4) = *((char *)from + 4);
+ return to;
+ case 6:
+ *(int *)to = *(int *)from;
+ *((short *)to + 2) = *((short *)from + 2);
+ return to;
+ case 8:
+ *(int *)to = *(int *)from;
+ *((int *)to + 1) = *((int *)from + 1);
+ return to;
+ }
+
+ esi = (long)from;
+ edi = (long)to;
+ if (n >= 5 * 4) {
+ /* large block: use rep prefix */
+ int ecx;
+ asm volatile("rep ; movsl"
+ : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
+ : "0" (n / 4), "1" (edi), "2" (esi)
+ : "memory"
+ );
+ } else {
+ /* small block: don't clobber ecx + smaller code */
+ if (n >= 4 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ if (n >= 3 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ if (n >= 2 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ if (n >= 1 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ }
+ switch (n % 4) {
+ /* tail */
+ case 0:
+ return to;
+ case 1:
+ asm volatile("movsb"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ return to;
+ case 2:
+ asm volatile("movsw"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ return to;
+ default:
+ asm volatile("movsw\n\tmovsb"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ return to;
+ }
+}
+
+#define __HAVE_ARCH_MEMCPY
+
+#ifdef CONFIG_X86_USE_3DNOW
+
+#include <asm/mmx.h>
+
+/*
+ * This CPU favours 3DNow! strongly (e.g. AMD Athlon)
+ */
+
+static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
+{
+ if (len < 512)
+ return __constant_memcpy(to, from, len);
+ return _mmx_memcpy(to, from, len);
+}
+
+static inline void *__memcpy3d(void *to, const void *from, size_t len)
+{
+ if (len < 512)
+ return __memcpy(to, from, len);
+ return _mmx_memcpy(to, from, len);
+}
+
+#define memcpy(t, f, n) \
+ (__builtin_constant_p((n)) \
+ ? __constant_memcpy3d((t), (f), (n)) \
+ : __memcpy3d((t), (f), (n)))
+
+#else
+
+/*
+ * No 3D Now!
+ */
+
+#define memcpy(t, f, n) \
+ (__builtin_constant_p((n)) \
+ ? __constant_memcpy((t), (f), (n)) \
+ : __memcpy((t), (f), (n)))
+
+#endif
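In use, __builtin_constant_p() resolves the dispatch at compile time; a
sketch (dst, src, buf, in and len are placeholders assumed to be in scope):

	memcpy(&dst, &src, 8);	/* constant 8: __constant_memcpy emits two
				 * 32-bit moves - no call, no loop */
	memcpy(buf, in, len);	/* run-time length: __memcpy's rep ; movs */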
+
+#define __HAVE_ARCH_MEMMOVE
+void *memmove(void *dest, const void *src, size_t n);
+
+#define memcmp __builtin_memcmp
+
+#define __HAVE_ARCH_MEMCHR
+extern void *memchr(const void *cs, int c, size_t count);
+
+static inline void *__memset_generic(void *s, char c, size_t count)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosb"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (c), "1" (s), "0" (count)
+ : "memory");
+ return s;
+}
+
+/* we might want to write optimized versions of these later */
+#define __constant_count_memset(s, c, count) __memset_generic((s), (c), (count))
+
+/*
+ * memset(x, 0, y) is a reasonably common thing to do, so we want to fill
+ * things 32 bits at a time even when we don't know the size of the
+ * area at compile-time..
+ */
+static __always_inline
+void *__constant_c_memset(void *s, unsigned long c, size_t count)
+{
+ int d0, d1;
+ asm volatile("rep ; stosl\n\t"
+ "testb $2,%b3\n\t"
+ "je 1f\n\t"
+ "stosw\n"
+ "1:\ttestb $1,%b3\n\t"
+ "je 2f\n\t"
+ "stosb\n"
+ "2:"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (c), "q" (count), "0" (count/4), "1" ((long)s)
+ : "memory");
+ return s;
+}
+
+/* Added by Gertjan van Wingerde to make minix and sysv modules work */
+#define __HAVE_ARCH_STRNLEN
+extern size_t strnlen(const char *s, size_t count);
+/* end of additional stuff */
+
+#define __HAVE_ARCH_STRSTR
+extern char *strstr(const char *cs, const char *ct);
+
+/*
+ * This looks horribly ugly, but the compiler can optimize it totally,
+ * as by now we know that both pattern and count are constant.
+ */
+static __always_inline
+void *__constant_c_and_count_memset(void *s, unsigned long pattern,
+ size_t count)
+{
+ switch (count) {
+ case 0:
+ return s;
+ case 1:
+ *(unsigned char *)s = pattern & 0xff;
+ return s;
+ case 2:
+ *(unsigned short *)s = pattern & 0xffff;
+ return s;
+ case 3:
+ *(unsigned short *)s = pattern & 0xffff;
+ *((unsigned char *)s + 2) = pattern & 0xff;
+ return s;
+ case 4:
+ *(unsigned long *)s = pattern;
+ return s;
+ }
+
+#define COMMON(x) \
+ asm volatile("rep ; stosl" \
+ x \
+ : "=&c" (d0), "=&D" (d1) \
+ : "a" (eax), "0" (count/4), "1" ((long)s) \
+ : "memory")
+
+ {
+ int d0, d1;
+#if __GNUC__ == 4 && __GNUC_MINOR__ == 0
+ /* Workaround for broken gcc 4.0 */
+ register unsigned long eax asm("%eax") = pattern;
+#else
+ unsigned long eax = pattern;
+#endif
+
+ switch (count % 4) {
+ case 0:
+ COMMON("");
+ return s;
+ case 1:
+ COMMON("\n\tstosb");
+ return s;
+ case 2:
+ COMMON("\n\tstosw");
+ return s;
+ default:
+ COMMON("\n\tstosw\n\tstosb");
+ return s;
+ }
+ }
+
+#undef COMMON
+}
+
+#define __constant_c_x_memset(s, c, count) \
+ (__builtin_constant_p(count) \
+ ? __constant_c_and_count_memset((s), (c), (count)) \
+ : __constant_c_memset((s), (c), (count)))
+
+#define __memset(s, c, count) \
+ (__builtin_constant_p(count) \
+ ? __constant_count_memset((s), (c), (count)) \
+ : __memset_generic((s), (c), (count)))
+
+#define __HAVE_ARCH_MEMSET
+#define memset(s, c, count) \
+ (__builtin_constant_p(c) \
+ ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
+ (count)) \
+ : __memset((s), (c), (count)))
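The 0x01010101UL multiply just replicates the byte across the word:
memset(p, 0xab, n) fills with the pattern 0xabababab, and memset(p, 0, 64)
reaches __constant_c_and_count_memset(p, 0, 64), i.e. sixteen stosl
iterations and no byte/word tail.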
+
+/*
+ * find the first occurrence of byte 'c', or 1 past the area if none
+ */
+#define __HAVE_ARCH_MEMSCAN
+extern void *memscan(void *addr, int c, size_t size);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_STRING_32_H */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
new file mode 100644
index 00000000000..2afe164bf1e
--- /dev/null
+++ b/arch/x86/include/asm/string_64.h
@@ -0,0 +1,60 @@
+#ifndef _ASM_X86_STRING_64_H
+#define _ASM_X86_STRING_64_H
+
+#ifdef __KERNEL__
+
+/* Written 2002 by Andi Kleen */
+
+/* Only used for special circumstances. Stolen from i386/string.h */
+static __always_inline void *__inline_memcpy(void *to, const void *from, size_t n)
+{
+ unsigned long d0, d1, d2;
+ asm volatile("rep ; movsl\n\t"
+ "testb $2,%b4\n\t"
+ "je 1f\n\t"
+ "movsw\n"
+ "1:\ttestb $1,%b4\n\t"
+ "je 2f\n\t"
+ "movsb\n"
+ "2:"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from)
+ : "memory");
+ return to;
+}
+
+/* Even with __builtin_ the compiler may decide to use the out-of-line
+   function. */
+
+#define __HAVE_ARCH_MEMCPY 1
+#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
+extern void *memcpy(void *to, const void *from, size_t len);
+#else
+extern void *__memcpy(void *to, const void *from, size_t len);
+#define memcpy(dst, src, len) \
+({ \
+ size_t __len = (len); \
+ void *__ret; \
+ if (__builtin_constant_p(len) && __len >= 64) \
+ __ret = __memcpy((dst), (src), __len); \
+ else \
+ __ret = __builtin_memcpy((dst), (src), __len); \
+ __ret; \
+})
+#endif
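The version check reflects how far the compiler can be trusted: gcc 4.3 and
later are left to choose between their own inline expansion and the
out-of-line memcpy, while older compilers get the macro above, which routes
large constant copies (>= 64 bytes) to the kernel's __memcpy and everything
else to __builtin_memcpy.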
+
+#define __HAVE_ARCH_MEMSET
+void *memset(void *s, int c, size_t n);
+
+#define __HAVE_ARCH_MEMMOVE
+void *memmove(void *dest, const void *src, size_t count);
+
+int memcmp(const void *cs, const void *ct, size_t count);
+size_t strlen(const char *s);
+char *strcpy(char *dest, const char *src);
+char *strcat(char *dest, const char *src);
+int strcmp(const char *cs, const char *ct);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h
new file mode 100644
index 00000000000..9b3070f1c2a
--- /dev/null
+++ b/arch/x86/include/asm/summit/apic.h
@@ -0,0 +1,184 @@
+#ifndef __ASM_SUMMIT_APIC_H
+#define __ASM_SUMMIT_APIC_H
+
+#include <asm/smp.h>
+
+#define esr_disable (1)
+#define NO_BALANCE_IRQ (0)
+
+/* In clustered mode, the high nibble of APIC ID is a cluster number.
+ * The low nibble is a 4-bit bitmap. */
+#define XAPIC_DEST_CPUS_SHIFT 4
+#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
+#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
+
+#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+
+static inline cpumask_t target_cpus(void)
+{
+ /* CPU_MASK_ALL (0xff) has undefined behaviour with
+ * dest_LowestPrio mode logical clustered apic interrupt routing.
+ * Just start on cpu 0; IRQ balancing will spread the load.
+ */
+ return cpumask_of_cpu(0);
+}
+
+#define INT_DELIVERY_MODE (dest_LowestPrio)
+#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
+
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return 0;
+}
+
+/* we don't use the phys_cpu_present_map to indicate apicid presence */
+static inline unsigned long check_apicid_present(int bit)
+{
+ return 1;
+}
+
+#define apicid_cluster(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
+
+extern u8 cpu_2_logical_apicid[];
+
+static inline void init_apic_ldr(void)
+{
+ unsigned long val, id;
+ int count = 0;
+ u8 my_id = (u8)hard_smp_processor_id();
+ u8 my_cluster = (u8)apicid_cluster(my_id);
+#ifdef CONFIG_SMP
+ u8 lid;
+ int i;
+
+ /* Create logical APIC IDs by counting CPUs already in cluster. */
+ for (count = 0, i = NR_CPUS; --i >= 0; ) {
+ lid = cpu_2_logical_apicid[i];
+ if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster)
+ ++count;
+ }
+#endif
+ /* We only have a 4-bit-wide bitmap in cluster mode. If a deranged
+ * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
+ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
+ id = my_cluster | (1UL << count);
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(id);
+ apic_write(APIC_LDR, val);
+}
+
+static inline int multi_timer_check(int apic, int irq)
+{
+ return 0;
+}
+
+static inline int apic_id_registered(void)
+{
+ return 1;
+}
+
+static inline void setup_apic_routing(void)
+{
+ printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
+ nr_ioapics);
+}
+
+static inline int apicid_to_node(int logical_apicid)
+{
+#ifdef CONFIG_SMP
+ return apicid_2_node[hard_smp_processor_id()];
+#else
+ return 0;
+#endif
+}
+
+/* Mapping from cpu number to logical apicid */
+static inline int cpu_to_logical_apicid(int cpu)
+{
+#ifdef CONFIG_SMP
+ if (cpu >= NR_CPUS)
+ return BAD_APICID;
+ return (int)cpu_2_logical_apicid[cpu];
+#else
+ return logical_smp_processor_id();
+#endif
+}
+
+static inline int cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < NR_CPUS)
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_id_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0x0F);
+}
+
+static inline physid_mask_t apicid_to_cpu_present(int apicid)
+{
+ return physid_mask_of_physid(0);
+}
+
+static inline void setup_portio_remap(void)
+{
+}
+
+static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return 1;
+}
+
+static inline void enable_apic_mode(void)
+{
+}
+
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int num_bits_set;
+ int cpus_found = 0;
+ int cpu;
+ int apicid;
+
+ num_bits_set = cpus_weight(cpumask);
+ /* Return id to all */
+ if (num_bits_set == NR_CPUS)
+ return (int) 0xFF;
+ /*
+ * The cpus in the mask must all be in the same APIC cluster. If they
+ * are not, return the default value of TARGET_CPUS.
+ */
+ cpu = first_cpu(cpumask);
+ apicid = cpu_to_logical_apicid(cpu);
+ while (cpus_found < num_bits_set) {
+ if (cpu_isset(cpu, cpumask)) {
+ int new_apicid = cpu_to_logical_apicid(cpu);
+ if (apicid_cluster(apicid) !=
+ apicid_cluster(new_apicid)){
+ printk("%s: Not a valid mask!\n", __func__);
+ return 0xFF;
+ }
+ apicid = apicid | new_apicid;
+ cpus_found++;
+ }
+ cpu++;
+ }
+ return apicid;
+}
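Worked example (made-up IDs): two CPUs with logical APIC IDs 0x21 and 0x22
share cluster 0x20, so the loop ORs them into 0x23 and one IPI reaches both;
add a CPU from cluster 0x30 to the mask and the cluster check trips, printing
the warning and returning 0xFF (broadcast).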
+
+/* cpuid returns the value latched in the HW at reset, not the APIC ID
+ * register's value. For any box whose BIOS changes APIC IDs, like
+ * clustered APIC systems, we must use hard_smp_processor_id.
+ *
+ * See Intel's IA-32 Software Developer's Manual, Vol. 2, under CPUID.
+ */
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb;
+}
+
+#endif /* __ASM_SUMMIT_APIC_H */
diff --git a/arch/x86/include/asm/summit/apicdef.h b/arch/x86/include/asm/summit/apicdef.h
new file mode 100644
index 00000000000..f3fbca1f61c
--- /dev/null
+++ b/arch/x86/include/asm/summit/apicdef.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_SUMMIT_APICDEF_H
+#define __ASM_SUMMIT_APICDEF_H
+
+#define APIC_ID_MASK (0xFF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+ return (x>>24)&0xFF;
+}
+
+#define GET_APIC_ID(x) get_apic_id(x)
+
+#endif
diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h
new file mode 100644
index 00000000000..53bd1e7bd7b
--- /dev/null
+++ b/arch/x86/include/asm/summit/ipi.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_SUMMIT_IPI_H
+#define __ASM_SUMMIT_IPI_H
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector);
+
+static inline void send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_sequence(mask, vector);
+}
+
+static inline void send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ send_IPI_mask(mask, vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+ send_IPI_mask(cpu_online_map, vector);
+}
+
+#endif /* __ASM_SUMMIT_IPI_H */
diff --git a/arch/x86/include/asm/summit/mpparse.h b/arch/x86/include/asm/summit/mpparse.h
new file mode 100644
index 00000000000..013ce6fab2d
--- /dev/null
+++ b/arch/x86/include/asm/summit/mpparse.h
@@ -0,0 +1,109 @@
+#ifndef __ASM_SUMMIT_MPPARSE_H
+#define __ASM_SUMMIT_MPPARSE_H
+
+#include <asm/tsc.h>
+
+extern int use_cyclone;
+
+#ifdef CONFIG_X86_SUMMIT_NUMA
+extern void setup_summit(void);
+#else
+#define setup_summit() {}
+#endif
+
+static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid)
+{
+ if (!strncmp(oem, "IBM ENSW", 8) &&
+ (!strncmp(productid, "VIGIL SMP", 9)
+ || !strncmp(productid, "EXA", 3)
+ || !strncmp(productid, "RUTHLESS SMP", 12))){
+ mark_tsc_unstable("Summit based system");
+ use_cyclone = 1; /* enable cyclone-timer */
+ setup_summit();
+ return 1;
+ }
+ return 0;
+}
+
+/* Hook from generic ACPI tables.c */
+static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strncmp(oem_id, "IBM", 3) &&
+ (!strncmp(oem_table_id, "SERVIGIL", 8)
+ || !strncmp(oem_table_id, "EXA", 3))){
+ mark_tsc_unstable("Summit based system");
+ use_cyclone = 1; /* enable cyclone-timer */
+ setup_summit();
+ return 1;
+ }
+ return 0;
+}
+
+struct rio_table_hdr {
+ unsigned char version; /* Version number of this data structure */
+ /* Version 3 adds chassis_num & WP_index */
+ unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
+ unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
+} __attribute__((packed));
+
+struct scal_detail {
+ unsigned char node_id; /* Scalability Node ID */
+ unsigned long CBAR; /* Address of 1MB register space */
+ unsigned char port0node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port1node; /* Node ID port connected to: 0xFF = None */
+ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port2node; /* Node ID port connected to: 0xFF = None */
+ unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+ unsigned char node_id; /* RIO Node ID */
+ unsigned long BBAR; /* Address of 1MB register space */
+ unsigned char type; /* Type of device */
+ unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
+ /* For CYC: Node ID of Twister that owns this CYC */
+ unsigned char port0node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port1node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
+ /* For CYC: 0 */
+ unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
+ /* = 0 : the XAPIC is not used, ie:*/
+ /* ints fwded to another XAPIC */
+ /* Bits1:7 Reserved */
+ /* For CYC: Bits0:7 Reserved */
+ unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
+ /* lower slot numbers/PCI bus numbers */
+ /* For CYC: No meaning */
+ unsigned char chassis_num; /* 1 based Chassis number */
+ /* For LookOut WPEGs this field indicates the */
+ /* Expansion Chassis #, enumerated from Boot */
+ /* Node WPEG external port, then Boot Node CYC */
+ /* external port, then Next Vigil chassis WPEG */
+ /* external port, etc. */
+ /* Shared Lookouts have only 1 chassis number (the */
+ /* first one assigned) */
+} __attribute__((packed));
+
+
+typedef enum {
+ CompatTwister = 0, /* Compatibility Twister */
+ AltTwister = 1, /* Alternate Twister of internal 8-way */
+ CompatCyclone = 2, /* Compatibility Cyclone */
+ AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
+ CompatWPEG = 4, /* Compatibility WPEG */
+ AltWPEG = 5, /* Second Planar WPEG */
+ LookOutAWPEG = 6, /* LookOut WPEG */
+ LookOutBWPEG = 7, /* LookOut WPEG */
+} node_type;
+
+static inline int is_WPEG(struct rio_detail *rio)
+{
+ return (rio->type == CompatWPEG || rio->type == AltWPEG ||
+ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
+}
+
+#endif /* __ASM_SUMMIT_MPPARSE_H */
diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h
new file mode 100644
index 00000000000..9bd521fe457
--- /dev/null
+++ b/arch/x86/include/asm/suspend.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "suspend_32.h"
+#else
+# include "suspend_64.h"
+#endif
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
new file mode 100644
index 00000000000..a5074bd0f8b
--- /dev/null
+++ b/arch/x86/include/asm/suspend_32.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2001-2002 Pavel Machek <pavel@suse.cz>
+ * Based on code
+ * Copyright 2001 Patrick Mochel <mochel@osdl.org>
+ */
+#ifndef _ASM_X86_SUSPEND_32_H
+#define _ASM_X86_SUSPEND_32_H
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+
+static inline int arch_prepare_suspend(void) { return 0; }
+
+/* image of the saved processor state */
+struct saved_context {
+ u16 es, fs, gs, ss;
+ unsigned long cr0, cr2, cr3, cr4;
+ struct desc_ptr gdt;
+ struct desc_ptr idt;
+ u16 ldt;
+ u16 tss;
+ unsigned long tr;
+ unsigned long safety;
+ unsigned long return_address;
+} __attribute__((packed));
+
+#ifdef CONFIG_ACPI
+extern unsigned long saved_eip;
+extern unsigned long saved_esp;
+extern unsigned long saved_ebp;
+extern unsigned long saved_ebx;
+extern unsigned long saved_esi;
+extern unsigned long saved_edi;
+
+static inline void acpi_save_register_state(unsigned long return_point)
+{
+ saved_eip = return_point;
+ asm volatile("movl %%esp,%0" : "=m" (saved_esp));
+ asm volatile("movl %%ebp,%0" : "=m" (saved_ebp));
+ asm volatile("movl %%ebx,%0" : "=m" (saved_ebx));
+ asm volatile("movl %%edi,%0" : "=m" (saved_edi));
+ asm volatile("movl %%esi,%0" : "=m" (saved_esi));
+}
+
+#define acpi_restore_register_state() do {} while (0)
+
+/* routines for saving/restoring kernel state */
+extern int acpi_save_state_mem(void);
+#endif
+
+#endif /* _ASM_X86_SUSPEND_32_H */
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
new file mode 100644
index 00000000000..06284f42b75
--- /dev/null
+++ b/arch/x86/include/asm/suspend_64.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2001-2003 Pavel Machek <pavel@suse.cz>
+ * Based on code
+ * Copyright 2001 Patrick Mochel <mochel@osdl.org>
+ */
+#ifndef _ASM_X86_SUSPEND_64_H
+#define _ASM_X86_SUSPEND_64_H
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+
+static inline int arch_prepare_suspend(void)
+{
+ return 0;
+}
+
+/*
+ * Image of the saved processor state, used by the low level ACPI suspend to
+ * RAM code and by the low level hibernation code.
+ *
+ * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that
+ * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c,
+ * still work as required.
+ */
+struct saved_context {
+ struct pt_regs regs;
+ u16 ds, es, fs, gs, ss;
+ unsigned long gs_base, gs_kernel_base, fs_base;
+ unsigned long cr0, cr2, cr3, cr4, cr8;
+ unsigned long efer;
+ u16 gdt_pad;
+ u16 gdt_limit;
+ unsigned long gdt_base;
+ u16 idt_pad;
+ u16 idt_limit;
+ unsigned long idt_base;
+ u16 ldt;
+ u16 tss;
+ unsigned long tr;
+ unsigned long safety;
+ unsigned long return_address;
+} __attribute__((packed));
+
+#define loaddebug(thread,register) \
+ set_debugreg((thread)->debugreg##register, register)
+
+/* routines for saving/restoring kernel state */
+extern int acpi_save_state_mem(void);
+extern char core_restore_code;
+extern char restore_registers;
+
+#endif /* _ASM_X86_SUSPEND_64_H */
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
new file mode 100644
index 00000000000..51fb2c76ad7
--- /dev/null
+++ b/arch/x86/include/asm/swiotlb.h
@@ -0,0 +1,58 @@
+#ifndef _ASM_X86_SWIOTLB_H
+#define _ASM_X86_SWIOTLB_H
+
+#include <asm/dma-mapping.h>
+
+/* SWIOTLB interface */
+
+extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr,
+ size_t size, int dir);
+extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags);
+extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
+ dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_for_device(struct device *hwdev,
+ dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev,
+ dma_addr_t dev_addr,
+ unsigned long offset,
+ size_t size, int dir);
+extern void swiotlb_sync_single_range_for_device(struct device *hwdev,
+ dma_addr_t dev_addr,
+ unsigned long offset,
+ size_t size, int dir);
+extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int dir);
+extern void swiotlb_sync_sg_for_device(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int dir);
+extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
+extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
+extern void swiotlb_init(void);
+
+extern int swiotlb_force;
+
+#ifdef CONFIG_SWIOTLB
+extern int swiotlb;
+extern void pci_swiotlb_init(void);
+#else
+#define swiotlb 0
+static inline void pci_swiotlb_init(void)
+{
+}
+#endif
+
+static inline void dma_mark_clean(void *addr, size_t size) {}
+
+#endif /* _ASM_X86_SWIOTLB_H */
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
new file mode 100644
index 00000000000..9d09b4073b6
--- /dev/null
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -0,0 +1,130 @@
+#ifndef _ASM_X86_SYNC_BITOPS_H
+#define _ASM_X86_SYNC_BITOPS_H
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ */
+
+/*
+ * These have to be done with inline assembly: that way the bit-setting
+ * is guaranteed to be atomic. All bit operations return 0 if the bit
+ * was cleared before the operation and != 0 if it was not.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+#define ADDR (*(volatile long *)addr)
+
+/**
+ * sync_set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered. See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void sync_set_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("lock; btsl %1,%0"
+ : "+m" (ADDR)
+ : "Ir" (nr)
+ : "memory");
+}
+
+/**
+ * sync_clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * sync_clear_bit() is atomic and may not be reordered. However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static inline void sync_clear_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("lock; btrl %1,%0"
+ : "+m" (ADDR)
+ : "Ir" (nr)
+ : "memory");
+}
+
+/**
+ * sync_change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * sync_change_bit() is atomic and may not be reordered.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void sync_change_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("lock; btcl %1,%0"
+ : "+m" (ADDR)
+ : "Ir" (nr)
+ : "memory");
+}
+
+/**
+ * sync_test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR)
+ : "Ir" (nr) : "memory");
+ return oldbit;
+}
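The btsl/sbbl pairing is the standard carry-flag trick: bts copies the bit's
old value into CF, and sbbl %0,%0 computes oldbit = oldbit - oldbit - CF,
i.e. 0 if the bit was clear and -1 if it was set, so the result is non-zero
exactly when the bit was already set. The two functions below reuse the same
pattern with btr and btc.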
+
+/**
+ * sync_test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR)
+ : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+/**
+ * sync_test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR)
+ : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+#define sync_test_bit(nr, addr) test_bit(nr, addr)
+
+#undef ADDR
+
+#endif /* _ASM_X86_SYNC_BITOPS_H */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
new file mode 100644
index 00000000000..d82f39bb790
--- /dev/null
+++ b/arch/x86/include/asm/syscall.h
@@ -0,0 +1,213 @@
+/*
+ * Access to user system call parameters and results
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * See asm-generic/syscall.h for descriptions of what we must do here.
+ */
+
+#ifndef _ASM_X86_SYSCALL_H
+#define _ASM_X86_SYSCALL_H
+
+#include <linux/sched.h>
+#include <linux/err.h>
+
+static inline long syscall_get_nr(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ /*
+ * We always sign-extend a -1 value being set here,
+ * so this is always either -1L or a syscall number.
+ */
+ return regs->orig_ax;
+}
+
+static inline void syscall_rollback(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ regs->ax = regs->orig_ax;
+}
+
+static inline long syscall_get_error(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ unsigned long error = regs->ax;
+#ifdef CONFIG_IA32_EMULATION
+ /*
+ * TS_COMPAT is set for 32-bit syscall entries and then
+ * remains set until we return to user mode.
+ */
+ if (task_thread_info(task)->status & TS_COMPAT)
+ /*
+ * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
+ * and will match correctly in comparisons.
+ */
+ error = (long) (int) error;
+#endif
+ return IS_ERR_VALUE(error) ? error : 0;
+}
+
+static inline long syscall_get_return_value(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ return regs->ax;
+}
+
+static inline void syscall_set_return_value(struct task_struct *task,
+ struct pt_regs *regs,
+ int error, long val)
+{
+ regs->ax = (long) error ?: val;
+}
+
+#ifdef CONFIG_X86_32
+
+static inline void syscall_get_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ unsigned long *args)
+{
+ BUG_ON(i + n > 6);
+ memcpy(args, &regs->bx + i, n * sizeof(args[0]));
+}
+
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ const unsigned long *args)
+{
+ BUG_ON(i + n > 6);
+ memcpy(&regs->bx + i, args, n * sizeof(args[0]));
+}
+
+#else /* CONFIG_X86_64 */
+
+static inline void syscall_get_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+ if (task_thread_info(task)->status & TS_COMPAT)
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ *args++ = regs->bx;
+ case 1:
+ if (!n--) break;
+ *args++ = regs->cx;
+ case 2:
+ if (!n--) break;
+ *args++ = regs->dx;
+ case 3:
+ if (!n--) break;
+ *args++ = regs->si;
+ case 4:
+ if (!n--) break;
+ *args++ = regs->di;
+ case 5:
+ if (!n--) break;
+ *args++ = regs->bp;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+ else
+# endif
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ *args++ = regs->di;
+ case 1:
+ if (!n--) break;
+ *args++ = regs->si;
+ case 2:
+ if (!n--) break;
+ *args++ = regs->dx;
+ case 3:
+ if (!n--) break;
+ *args++ = regs->r10;
+ case 4:
+ if (!n--) break;
+ *args++ = regs->r8;
+ case 5:
+ if (!n--) break;
+ *args++ = regs->r9;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+}
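The case labels fall through on purpose: entering at case i copies arguments
i, i+1, ... until n runs out. For a native 64-bit task, for example (an
illustrative call, assuming task and regs are in scope):

	unsigned long args[2];

	syscall_get_arguments(task, regs, 1, 2, args);
	/* args[0] = regs->si (argument 1), args[1] = regs->dx (argument 2) */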
+
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ const unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+ if (task_thread_info(task)->status & TS_COMPAT)
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ regs->bx = *args++;
+ case 1:
+ if (!n--) break;
+ regs->cx = *args++;
+ case 2:
+ if (!n--) break;
+ regs->dx = *args++;
+ case 3:
+ if (!n--) break;
+ regs->si = *args++;
+ case 4:
+ if (!n--) break;
+ regs->di = *args++;
+ case 5:
+ if (!n--) break;
+ regs->bp = *args++;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+ else
+# endif
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ regs->di = *args++;
+ case 1:
+ if (!n--) break;
+ regs->si = *args++;
+ case 2:
+ if (!n--) break;
+ regs->dx = *args++;
+ case 3:
+ if (!n--) break;
+ regs->r10 = *args++;
+ case 4:
+ if (!n--) break;
+ regs->r8 = *args++;
+ case 5:
+ if (!n--) break;
+ regs->r9 = *args++;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+}
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* _ASM_X86_SYSCALL_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
new file mode 100644
index 00000000000..87803da4401
--- /dev/null
+++ b/arch/x86/include/asm/syscalls.h
@@ -0,0 +1,93 @@
+/*
+ * syscalls.h - Linux syscall interfaces (arch-specific)
+ *
+ * Copyright (c) 2008 Jaswinder Singh
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#ifndef _ASM_X86_SYSCALLS_H
+#define _ASM_X86_SYSCALLS_H
+
+#include <linux/compiler.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/signal.h>
+
+/* Common in X86_32 and X86_64 */
+/* kernel/ioport.c */
+asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
+
+/* X86_32 only */
+#ifdef CONFIG_X86_32
+/* kernel/process_32.c */
+asmlinkage int sys_fork(struct pt_regs);
+asmlinkage int sys_clone(struct pt_regs);
+asmlinkage int sys_vfork(struct pt_regs);
+asmlinkage int sys_execve(struct pt_regs);
+
+/* kernel/signal_32.c */
+asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
+asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
+ struct old_sigaction __user *);
+asmlinkage int sys_sigaltstack(unsigned long);
+asmlinkage unsigned long sys_sigreturn(unsigned long);
+asmlinkage int sys_rt_sigreturn(unsigned long);
+
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned long);
+
+/* kernel/ldt.c */
+asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+
+/* kernel/sys_i386_32.c */
+asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+struct mmap_arg_struct;
+asmlinkage int old_mmap(struct mmap_arg_struct __user *);
+struct sel_arg_struct;
+asmlinkage int old_select(struct sel_arg_struct __user *);
+asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
+struct old_utsname;
+asmlinkage int sys_uname(struct old_utsname __user *);
+struct oldold_utsname;
+asmlinkage int sys_olduname(struct oldold_utsname __user *);
+
+/* kernel/tls.c */
+asmlinkage int sys_set_thread_area(struct user_desc __user *);
+asmlinkage int sys_get_thread_area(struct user_desc __user *);
+
+/* kernel/vm86_32.c */
+asmlinkage int sys_vm86old(struct pt_regs);
+asmlinkage int sys_vm86(struct pt_regs);
+
+#else /* CONFIG_X86_32 */
+
+/* X86_64 only */
+/* kernel/process_64.c */
+asmlinkage long sys_fork(struct pt_regs *);
+asmlinkage long sys_clone(unsigned long, unsigned long,
+ void __user *, void __user *,
+ struct pt_regs *);
+asmlinkage long sys_vfork(struct pt_regs *);
+asmlinkage long sys_execve(char __user *, char __user * __user *,
+ char __user * __user *,
+ struct pt_regs *);
+
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
+
+/* kernel/signal_64.c */
+asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
+ struct pt_regs *);
+asmlinkage long sys_rt_sigreturn(struct pt_regs *);
+
+/* kernel/sys_x86_64.c */
+asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+struct new_utsname;
+asmlinkage long sys_uname(struct new_utsname __user *);
+
+#endif /* CONFIG_X86_32 */
+#endif /* _ASM_X86_SYSCALLS_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
new file mode 100644
index 00000000000..2ed3f0f44ff
--- /dev/null
+++ b/arch/x86/include/asm/system.h
@@ -0,0 +1,425 @@
+#ifndef _ASM_X86_SYSTEM_H
+#define _ASM_X86_SYSTEM_H
+
+#include <asm/asm.h>
+#include <asm/segment.h>
+#include <asm/cpufeature.h>
+#include <asm/cmpxchg.h>
+#include <asm/nops.h>
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+
+/* entries in ARCH_DLINFO: */
+#ifdef CONFIG_IA32_EMULATION
+# define AT_VECTOR_SIZE_ARCH 2
+#else
+# define AT_VECTOR_SIZE_ARCH 1
+#endif
+
+#ifdef CONFIG_X86_32
+
+struct task_struct; /* one of the stranger aspects of C forward declarations */
+struct task_struct *__switch_to(struct task_struct *prev,
+ struct task_struct *next);
+
+/*
+ * Saving eflags is important. Not only does it switch IOPL between tasks,
+ * it also protects other tasks from NT leaking through sysenter etc.
+ */
+#define switch_to(prev, next, last) \
+do { \
+ /* \
+ * Context-switching clobbers all registers, so we clobber \
+ * them explicitly, via unused output variables. \
+ * (EAX and EBP are not listed because EBP is saved/restored \
+ * explicitly for wchan access and EAX is the return value of \
+ * __switch_to()) \
+ */ \
+ unsigned long ebx, ecx, edx, esi, edi; \
+ \
+ asm volatile("pushfl\n\t" /* save flags */ \
+ "pushl %%ebp\n\t" /* save EBP */ \
+ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
+ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
+ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
+ "pushl %[next_ip]\n\t" /* restore EIP */ \
+ "jmp __switch_to\n" /* regparm call */ \
+ "1:\t" \
+ "popl %%ebp\n\t" /* restore EBP */ \
+ "popfl\n" /* restore flags */ \
+ \
+ /* output parameters */ \
+ : [prev_sp] "=m" (prev->thread.sp), \
+ [prev_ip] "=m" (prev->thread.ip), \
+ "=a" (last), \
+ \
+ /* clobbered output registers: */ \
+ "=b" (ebx), "=c" (ecx), "=d" (edx), \
+ "=S" (esi), "=D" (edi) \
+ \
+ /* input parameters: */ \
+ : [next_sp] "m" (next->thread.sp), \
+ [next_ip] "m" (next->thread.ip), \
+ \
+ /* regparm parameters for __switch_to(): */ \
+ [prev] "a" (prev), \
+ [next] "d" (next) \
+ \
+ : /* reloaded segment registers */ \
+ "memory"); \
+} while (0)
+
+/*
+ * disable hlt during certain critical i/o operations
+ */
+#define HAVE_DISABLE_HLT
+#else
+#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
+#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
+
+/* frame pointer must be last for get_wchan */
+#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
+
+#define __EXTRA_CLOBBER \
+ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
+ "r12", "r13", "r14", "r15"
+
+/* Save and restore flags, to clear and thus handle the leaking NT flag */
+#define switch_to(prev, next, last) \
+ asm volatile(SAVE_CONTEXT \
+ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
+ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
+ "call __switch_to\n\t" \
+ ".globl thread_return\n" \
+ "thread_return:\n\t" \
+ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
+ "movq %P[thread_info](%%rsi),%%r8\n\t" \
+ LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
+ "movq %%rax,%%rdi\n\t" \
+ "jc ret_from_fork\n\t" \
+ RESTORE_CONTEXT \
+ : "=a" (last) \
+ : [next] "S" (next), [prev] "D" (prev), \
+ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
+ [ti_flags] "i" (offsetof(struct thread_info, flags)), \
+ [tif_fork] "i" (TIF_FORK), \
+ [thread_info] "i" (offsetof(struct task_struct, stack)), \
+ [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
+ : "memory", "cc" __EXTRA_CLOBBER)
+#endif
+
+#ifdef __KERNEL__
+#define _set_base(addr, base) do { unsigned long __pr; \
+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+ "rorl $16,%%edx\n\t" \
+ "movb %%dl,%2\n\t" \
+ "movb %%dh,%3" \
+ :"=&d" (__pr) \
+ :"m" (*((addr)+2)), \
+ "m" (*((addr)+4)), \
+ "m" (*((addr)+7)), \
+ "0" (base) \
+ ); } while (0)
+
+#define _set_limit(addr, limit) do { unsigned long __lr; \
+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+ "rorl $16,%%edx\n\t" \
+ "movb %2,%%dh\n\t" \
+ "andb $0xf0,%%dh\n\t" \
+ "orb %%dh,%%dl\n\t" \
+ "movb %%dl,%2" \
+ :"=&d" (__lr) \
+ :"m" (*(addr)), \
+ "m" (*((addr)+6)), \
+ "0" (limit) \
+ ); } while (0)
+
+#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
+#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
+
+extern void native_load_gs_index(unsigned);
+
+/*
+ * Load a segment. Fall back on loading the zero
+ * segment if something goes wrong..
+ */
+#define loadsegment(seg, value) \
+ asm volatile("\n" \
+ "1:\t" \
+ "movl %k0,%%" #seg "\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3:\t" \
+ "movl %k1, %%" #seg "\n\t" \
+ "jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b,3b) \
+ : :"r" (value), "r" (0) : "memory")
+
+
+/*
+ * Save a segment register away
+ */
+#define savesegment(seg, value) \
+ asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
+
+static inline unsigned long get_limit(unsigned long segment)
+{
+ unsigned long __limit;
+ asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
+ return __limit + 1;
+}
+
+static inline void native_clts(void)
+{
+ asm volatile("clts");
+}
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads/stores around it, which can hurt performance. The solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization.
+ */
+static unsigned long __force_order;
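Mechanically, every read below names "=m" (__force_order) as an extra output
and every write names "m" (__force_order) as an input, so the compiler sees a
data dependency threading through all control-register accesses and keeps
them in program order without a full memory clobber.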
+
+static inline unsigned long native_read_cr0(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr0(unsigned long val)
+{
+ asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr2(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr2(unsigned long val)
+{
+ asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr3(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr3(unsigned long val)
+{
+ asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr4(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline unsigned long native_read_cr4_safe(void)
+{
+ unsigned long val;
+	/* This could fault if %cr4 does not exist. On x86_64, %cr4 always
+	 * exists, so it can never fault. */
+#ifdef CONFIG_X86_32
+ asm volatile("1: mov %%cr4, %0\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b)
+ : "=r" (val), "=m" (__force_order) : "0" (0));
+#else
+ val = native_read_cr4();
+#endif
+ return val;
+}
+
+static inline void native_write_cr4(unsigned long val)
+{
+ asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long native_read_cr8(void)
+{
+ unsigned long cr8;
+ asm volatile("movq %%cr8,%0" : "=r" (cr8));
+ return cr8;
+}
+
+static inline void native_write_cr8(unsigned long val)
+{
+ asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
+}
+#endif
+
+static inline void native_wbinvd(void)
+{
+ asm volatile("wbinvd": : :"memory");
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define read_cr0() (native_read_cr0())
+#define write_cr0(x) (native_write_cr0(x))
+#define read_cr2() (native_read_cr2())
+#define write_cr2(x) (native_write_cr2(x))
+#define read_cr3() (native_read_cr3())
+#define write_cr3(x) (native_write_cr3(x))
+#define read_cr4() (native_read_cr4())
+#define read_cr4_safe() (native_read_cr4_safe())
+#define write_cr4(x) (native_write_cr4(x))
+#define wbinvd() (native_wbinvd())
+#ifdef CONFIG_X86_64
+#define read_cr8() (native_read_cr8())
+#define write_cr8(x) (native_write_cr8(x))
+#define load_gs_index native_load_gs_index
+#endif
+
+/* Clear the 'TS' bit */
+#define clts() (native_clts())
+
+#endif/* CONFIG_PARAVIRT */
+
+#define stts() write_cr0(read_cr0() | X86_CR0_TS)
+
+#endif /* __KERNEL__ */
+
+static inline void clflush(volatile void *__p)
+{
+ asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+#define nop() asm volatile ("nop")
+
+void disable_hlt(void);
+void enable_hlt(void);
+
+void cpu_idle_wait(void);
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+
+void default_idle(void);
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#else
+#define mb() asm volatile("mfence":::"memory")
+#define rmb() asm volatile("lfence":::"memory")
+#define wmb() asm volatile("sfence" ::: "memory")
+#endif
+
+/**
+ * read_barrier_depends - Flush all pending reads that subsequent reads
+ * depend on.
+ *
+ * No data-dependent reads from memory-like regions are ever reordered
+ * over this barrier. All reads preceding this primitive are guaranteed
+ * to access memory (but not necessarily other CPUs' caches) before any
+ * reads following this primitive that depend on the data returned by
+ * any of the preceding reads. This primitive is much lighter weight than
+ * rmb() on most CPUs, and is never heavier weight than is
+ * rmb().
+ *
+ * These ordering constraints are respected by both the local CPU
+ * and the compiler.
+ *
+ * Ordering is not guaranteed by anything other than these primitives,
+ * not even by data dependencies. See the documentation for
+ * memory_barrier() for examples and URLs to more information.
+ *
+ * For example, the following code would force ordering (the initial
+ * value of "a" is zero, "b" is one, and "p" is "&a"):
+ *
+ * <programlisting>
+ * CPU 0 CPU 1
+ *
+ * b = 2;
+ * memory_barrier();
+ * p = &b; q = p;
+ * read_barrier_depends();
+ * d = *q;
+ * </programlisting>
+ *
+ * because the read of "*q" depends on the read of "p" and these
+ * two reads are separated by a read_barrier_depends(). However,
+ * the following code, with the same initial values for "a" and "b":
+ *
+ * <programlisting>
+ * CPU 0 CPU 1
+ *
+ * a = 2;
+ * memory_barrier();
+ * b = 3; y = b;
+ * read_barrier_depends();
+ * x = a;
+ * </programlisting>
+ *
+ * does not enforce ordering, since there is no data dependency between
+ * the read of "a" and the read of "b". Therefore, on some CPUs, such
+ * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
+ * in cases like this where there are no data dependencies.
+ **/
+
+#define read_barrier_depends() do { } while (0)
+
+#ifdef CONFIG_SMP
+#define smp_mb() mb()
+#ifdef CONFIG_X86_PPRO_FENCE
+# define smp_rmb() rmb()
+#else
+# define smp_rmb() barrier()
+#endif
+#ifdef CONFIG_X86_OOSTORE
+# define smp_wmb() wmb()
+#else
+# define smp_wmb() barrier()
+#endif
+#define smp_read_barrier_depends() read_barrier_depends()
+#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#else
+#define smp_mb() barrier()
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#define smp_read_barrier_depends() do { } while (0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif
+
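The smp_*() pairs above are typically used to publish data behind a flag: the producer orders its stores with smp_wmb() and the consumer orders its loads with smp_rmb(). A minimal sketch with hypothetical shared variables:

/* Producer (one CPU): make the payload visible before the flag. */
shared_data = compute_payload();	/* hypothetical payload store */
smp_wmb();				/* payload store before flag store */
shared_ready = 1;

/* Consumer (another CPU): read the flag before the payload. */
while (!shared_ready)
	cpu_relax();			/* spin politely */
smp_rmb();				/* flag load before payload load */
consume(shared_data);			/* guaranteed to see the payload */
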
+/*
+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
+ * code region.
+ *
+ * (Could use a three-way alternative() for this if there were one.)
+ */
+static inline void rdtsc_barrier(void)
+{
+ alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+}
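
For example, bracketing the RDTSC reads with rdtsc_barrier() pins down a measured region (sketch; get_cycles() is the TSC accessor from <asm/tsc.h>, do_measured_work() is hypothetical):

unsigned long long t0, t1;

rdtsc_barrier();		/* nothing before this leaks past the read */
t0 = get_cycles();
do_measured_work();
rdtsc_barrier();		/* the region cannot slip past this point */
t1 = get_cycles();
/* t1 - t0 now brackets the cycles spent in do_measured_work() */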
+
+#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
new file mode 100644
index 00000000000..1159e091ad0
--- /dev/null
+++ b/arch/x86/include/asm/system_64.h
@@ -0,0 +1,22 @@
+#ifndef _ASM_X86_SYSTEM_64_H
+#define _ASM_X86_SYSTEM_64_H
+
+#include <asm/segment.h>
+#include <asm/cmpxchg.h>
+
+
+static inline unsigned long read_cr8(void)
+{
+ unsigned long cr8;
+ asm volatile("movq %%cr8,%0" : "=r" (cr8));
+ return cr8;
+}
+
+static inline void write_cr8(unsigned long val)
+{
+ asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
+}
+
+#include <linux/irqflags.h>
+
+#endif /* _ASM_X86_SYSTEM_64_H */
diff --git a/arch/x86/include/asm/tce.h b/arch/x86/include/asm/tce.h
new file mode 100644
index 00000000000..7a6677c1a71
--- /dev/null
+++ b/arch/x86/include/asm/tce.h
@@ -0,0 +1,48 @@
+/*
+ * This file is derived from asm-powerpc/tce.h.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_TCE_H
+#define _ASM_X86_TCE_H
+
+extern unsigned int specified_table_size;
+struct iommu_table;
+
+#define TCE_ENTRY_SIZE 8 /* in bytes */
+
+#define TCE_READ_SHIFT 0
+#define TCE_WRITE_SHIFT 1
+#define TCE_HUBID_SHIFT 2 /* unused */
+#define TCE_RSVD_SHIFT 8 /* unused */
+#define TCE_RPN_SHIFT 12
+#define TCE_UNUSED_SHIFT 48 /* unused */
+
+#define TCE_RPN_MASK 0x0000fffffffff000ULL
+
+extern void tce_build(struct iommu_table *tbl, unsigned long index,
+ unsigned int npages, unsigned long uaddr, int direction);
+extern void tce_free(struct iommu_table *tbl, long index, unsigned int npages);
+extern void * __init alloc_tce_table(void);
+extern void __init free_tce_table(void *tbl);
+extern int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar);
+
+#endif /* _ASM_X86_TCE_H */
diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h
new file mode 100644
index 00000000000..af1b70ea440
--- /dev/null
+++ b/arch/x86/include/asm/termbits.h
@@ -0,0 +1,198 @@
+#ifndef _ASM_X86_TERMBITS_H
+#define _ASM_X86_TERMBITS_H
+
+#include <linux/posix_types.h>
+
+typedef unsigned char cc_t;
+typedef unsigned int speed_t;
+typedef unsigned int tcflag_t;
+
+#define NCCS 19
+struct termios {
+ tcflag_t c_iflag; /* input mode flags */
+ tcflag_t c_oflag; /* output mode flags */
+ tcflag_t c_cflag; /* control mode flags */
+ tcflag_t c_lflag; /* local mode flags */
+ cc_t c_line; /* line discipline */
+ cc_t c_cc[NCCS]; /* control characters */
+};
+
+struct termios2 {
+ tcflag_t c_iflag; /* input mode flags */
+ tcflag_t c_oflag; /* output mode flags */
+ tcflag_t c_cflag; /* control mode flags */
+ tcflag_t c_lflag; /* local mode flags */
+ cc_t c_line; /* line discipline */
+ cc_t c_cc[NCCS]; /* control characters */
+ speed_t c_ispeed; /* input speed */
+ speed_t c_ospeed; /* output speed */
+};
+
+struct ktermios {
+ tcflag_t c_iflag; /* input mode flags */
+ tcflag_t c_oflag; /* output mode flags */
+ tcflag_t c_cflag; /* control mode flags */
+ tcflag_t c_lflag; /* local mode flags */
+ cc_t c_line; /* line discipline */
+ cc_t c_cc[NCCS]; /* control characters */
+ speed_t c_ispeed; /* input speed */
+ speed_t c_ospeed; /* output speed */
+};
+
+/* c_cc characters */
+#define VINTR 0
+#define VQUIT 1
+#define VERASE 2
+#define VKILL 3
+#define VEOF 4
+#define VTIME 5
+#define VMIN 6
+#define VSWTC 7
+#define VSTART 8
+#define VSTOP 9
+#define VSUSP 10
+#define VEOL 11
+#define VREPRINT 12
+#define VDISCARD 13
+#define VWERASE 14
+#define VLNEXT 15
+#define VEOL2 16
+
+/* c_iflag bits */
+#define IGNBRK 0000001
+#define BRKINT 0000002
+#define IGNPAR 0000004
+#define PARMRK 0000010
+#define INPCK 0000020
+#define ISTRIP 0000040
+#define INLCR 0000100
+#define IGNCR 0000200
+#define ICRNL 0000400
+#define IUCLC 0001000
+#define IXON 0002000
+#define IXANY 0004000
+#define IXOFF 0010000
+#define IMAXBEL 0020000
+#define IUTF8 0040000
+
+/* c_oflag bits */
+#define OPOST 0000001
+#define OLCUC 0000002
+#define ONLCR 0000004
+#define OCRNL 0000010
+#define ONOCR 0000020
+#define ONLRET 0000040
+#define OFILL 0000100
+#define OFDEL 0000200
+#define NLDLY 0000400
+#define NL0 0000000
+#define NL1 0000400
+#define CRDLY 0003000
+#define CR0 0000000
+#define CR1 0001000
+#define CR2 0002000
+#define CR3 0003000
+#define TABDLY 0014000
+#define TAB0 0000000
+#define TAB1 0004000
+#define TAB2 0010000
+#define TAB3 0014000
+#define XTABS 0014000
+#define BSDLY 0020000
+#define BS0 0000000
+#define BS1 0020000
+#define VTDLY 0040000
+#define VT0 0000000
+#define VT1 0040000
+#define FFDLY 0100000
+#define FF0 0000000
+#define FF1 0100000
+
+/* c_cflag bit meaning */
+#define CBAUD 0010017
+#define B0 0000000 /* hang up */
+#define B50 0000001
+#define B75 0000002
+#define B110 0000003
+#define B134 0000004
+#define B150 0000005
+#define B200 0000006
+#define B300 0000007
+#define B600 0000010
+#define B1200 0000011
+#define B1800 0000012
+#define B2400 0000013
+#define B4800 0000014
+#define B9600 0000015
+#define B19200 0000016
+#define B38400 0000017
+#define EXTA B19200
+#define EXTB B38400
+#define CSIZE 0000060
+#define CS5 0000000
+#define CS6 0000020
+#define CS7 0000040
+#define CS8 0000060
+#define CSTOPB 0000100
+#define CREAD 0000200
+#define PARENB 0000400
+#define PARODD 0001000
+#define HUPCL 0002000
+#define CLOCAL 0004000
+#define CBAUDEX 0010000
+#define BOTHER 0010000 /* non standard rate */
+#define B57600 0010001
+#define B115200 0010002
+#define B230400 0010003
+#define B460800 0010004
+#define B500000 0010005
+#define B576000 0010006
+#define B921600 0010007
+#define B1000000 0010010
+#define B1152000 0010011
+#define B1500000 0010012
+#define B2000000 0010013
+#define B2500000 0010014
+#define B3000000 0010015
+#define B3500000 0010016
+#define B4000000 0010017
+#define CIBAUD 002003600000 /* input baud rate */
+#define CMSPAR 010000000000 /* mark or space (stick) parity */
+#define CRTSCTS 020000000000 /* flow control */
+
+#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */
+
+/* c_lflag bits */
+#define ISIG 0000001
+#define ICANON 0000002
+#define XCASE 0000004
+#define ECHO 0000010
+#define ECHOE 0000020
+#define ECHOK 0000040
+#define ECHONL 0000100
+#define NOFLSH 0000200
+#define TOSTOP 0000400
+#define ECHOCTL 0001000
+#define ECHOPRT 0002000
+#define ECHOKE 0004000
+#define FLUSHO 0010000
+#define PENDIN 0040000
+#define IEXTEN 0100000
+
+/* tcflow() and TCXONC use these */
+#define TCOOFF 0
+#define TCOON 1
+#define TCIOFF 2
+#define TCION 3
+
+/* tcflush() and TCFLSH use these */
+#define TCIFLUSH 0
+#define TCOFLUSH 1
+#define TCIOFLUSH 2
+
+/* tcsetattr uses these */
+#define TCSANOW 0
+#define TCSADRAIN 1
+#define TCSAFLUSH 2
+
+#endif /* _ASM_X86_TERMBITS_H */
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
new file mode 100644
index 00000000000..f72956331c4
--- /dev/null
+++ b/arch/x86/include/asm/termios.h
@@ -0,0 +1,113 @@
+#ifndef _ASM_X86_TERMIOS_H
+#define _ASM_X86_TERMIOS_H
+
+#include <asm/termbits.h>
+#include <asm/ioctls.h>
+
+struct winsize {
+ unsigned short ws_row;
+ unsigned short ws_col;
+ unsigned short ws_xpixel;
+ unsigned short ws_ypixel;
+};
+
+#define NCC 8
+struct termio {
+ unsigned short c_iflag; /* input mode flags */
+ unsigned short c_oflag; /* output mode flags */
+ unsigned short c_cflag; /* control mode flags */
+ unsigned short c_lflag; /* local mode flags */
+ unsigned char c_line; /* line discipline */
+ unsigned char c_cc[NCC]; /* control characters */
+};
+
+/* modem lines */
+#define TIOCM_LE 0x001
+#define TIOCM_DTR 0x002
+#define TIOCM_RTS 0x004
+#define TIOCM_ST 0x008
+#define TIOCM_SR 0x010
+#define TIOCM_CTS 0x020
+#define TIOCM_CAR 0x040
+#define TIOCM_RNG 0x080
+#define TIOCM_DSR 0x100
+#define TIOCM_CD TIOCM_CAR
+#define TIOCM_RI TIOCM_RNG
+#define TIOCM_OUT1 0x2000
+#define TIOCM_OUT2 0x4000
+#define TIOCM_LOOP 0x8000
+
+/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
+
+#ifdef __KERNEL__
+
+#include <asm/uaccess.h>
+
+/* intr=^C quit=^\ erase=del kill=^U
+ eof=^D vtime=\0 vmin=\1 sxtc=\0
+ start=^Q stop=^S susp=^Z eol=\0
+ reprint=^R discard=^U werase=^W lnext=^V
+ eol2=\0
+*/
+#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"
+
+/*
+ * Translate a "termio" structure into a "termios". Ugh.
+ */
+#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \
+ unsigned short __tmp; \
+ get_user(__tmp,&(termio)->x); \
+ *(unsigned short *) &(termios)->x = __tmp; \
+}
+
+static inline int user_termio_to_kernel_termios(struct ktermios *termios,
+ struct termio __user *termio)
+{
+ SET_LOW_TERMIOS_BITS(termios, termio, c_iflag);
+ SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
+ SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
+ SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
+ return copy_from_user(termios->c_cc, termio->c_cc, NCC);
+}
+
+/*
+ * Translate a "termios" structure into a "termio". Ugh.
+ */
+static inline int kernel_termios_to_user_termio(struct termio __user *termio,
+ struct ktermios *termios)
+{
+ put_user((termios)->c_iflag, &(termio)->c_iflag);
+ put_user((termios)->c_oflag, &(termio)->c_oflag);
+ put_user((termios)->c_cflag, &(termio)->c_cflag);
+ put_user((termios)->c_lflag, &(termio)->c_lflag);
+ put_user((termios)->c_line, &(termio)->c_line);
+ return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC);
+}
+
+static inline int user_termios_to_kernel_termios(struct ktermios *k,
+ struct termios2 __user *u)
+{
+ return copy_from_user(k, u, sizeof(struct termios2));
+}
+
+static inline int kernel_termios_to_user_termios(struct termios2 __user *u,
+ struct ktermios *k)
+{
+ return copy_to_user(u, k, sizeof(struct termios2));
+}
+
+static inline int user_termios_to_kernel_termios_1(struct ktermios *k,
+ struct termios __user *u)
+{
+ return copy_from_user(k, u, sizeof(struct termios));
+}
+
+static inline int kernel_termios_to_user_termios_1(struct termios __user *u,
+ struct ktermios *k)
+{
+ return copy_to_user(u, k, sizeof(struct termios));
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_TERMIOS_H */
diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h
new file mode 100644
index 00000000000..c62349ee786
--- /dev/null
+++ b/arch/x86/include/asm/therm_throt.h
@@ -0,0 +1,9 @@
+#ifndef _ASM_X86_THERM_THROT_H
+#define _ASM_X86_THERM_THROT_H
+
+#include <asm/atomic.h>
+
+extern atomic_t therm_throt_en;
+int therm_throt_process(int curr);
+
+#endif /* _ASM_X86_THERM_THROT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
new file mode 100644
index 00000000000..e44d379faad
--- /dev/null
+++ b/arch/x86/include/asm/thread_info.h
@@ -0,0 +1,264 @@
+/* thread_info.h: low-level thread information
+ *
+ * Copyright (C) 2002 David Howells (dhowells@redhat.com)
+ * - Incorporating suggestions made by Linus Torvalds and Dave Miller
+ */
+
+#ifndef _ASM_X86_THREAD_INFO_H
+#define _ASM_X86_THREAD_INFO_H
+
+#include <linux/compiler.h>
+#include <asm/page.h>
+#include <asm/types.h>
+
+/*
+ * low level task data that entry.S needs immediate access to
+ * - this struct should fit entirely inside of one cache line
+ * - this struct shares the supervisor stack pages
+ */
+#ifndef __ASSEMBLY__
+struct task_struct;
+struct exec_domain;
+#include <asm/processor.h>
+
+struct thread_info {
+ struct task_struct *task; /* main task structure */
+ struct exec_domain *exec_domain; /* execution domain */
+ unsigned long flags; /* low level flags */
+ __u32 status; /* thread synchronous flags */
+ __u32 cpu; /* current CPU */
+ int preempt_count; /* 0 => preemptable,
+ <0 => BUG */
+ mm_segment_t addr_limit;
+ struct restart_block restart_block;
+ void __user *sysenter_return;
+#ifdef CONFIG_X86_32
+ unsigned long previous_esp; /* ESP of the previous stack in
+ case of nested (IRQ) stacks
+ */
+ __u8 supervisor_stack[0];
+#endif
+};
+
+#define INIT_THREAD_INFO(tsk) \
+{ \
+ .task = &tsk, \
+ .exec_domain = &default_exec_domain, \
+ .flags = 0, \
+ .cpu = 0, \
+ .preempt_count = 1, \
+ .addr_limit = KERNEL_DS, \
+ .restart_block = { \
+ .fn = do_no_restart_syscall, \
+ }, \
+}
+
+#define init_thread_info (init_thread_union.thread_info)
+#define init_stack (init_thread_union.stack)
+
+#else /* !__ASSEMBLY__ */
+
+#include <asm/asm-offsets.h>
+
+#endif
+
+/*
+ * thread information flags
+ * - these are process state flags that various assembly files
+ * may need to access
+ * - pending work-to-be-done flags are in LSW
+ * - other flags in MSW
+ * Warning: layout of LSW is hardcoded in entry.S
+ */
+#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
+#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
+#define TIF_SIGPENDING 2 /* signal pending */
+#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
+#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
+#define TIF_IRET 5 /* force IRET */
+#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
+#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
+#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
+#define TIF_NOTSC 16 /* TSC is not accessible in userland */
+#define TIF_IA32 17 /* 32bit process */
+#define TIF_FORK 18 /* ret_from_fork */
+#define TIF_ABI_PENDING 19
+#define TIF_MEMDIE 20
+#define TIF_DEBUG 21 /* uses debug registers */
+#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
+#define TIF_FREEZE 23 /* is freezing for suspend */
+#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
+#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
+#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
+#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
+
+#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
+#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
+#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
+#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
+#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
+#define _TIF_IRET (1 << TIF_IRET)
+#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
+#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
+#define _TIF_NOTSC (1 << TIF_NOTSC)
+#define _TIF_IA32 (1 << TIF_IA32)
+#define _TIF_FORK (1 << TIF_FORK)
+#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING)
+#define _TIF_DEBUG (1 << TIF_DEBUG)
+#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
+#define _TIF_FREEZE (1 << TIF_FREEZE)
+#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
+#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
+#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
+#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
+
+/* work to do in syscall_trace_enter() */
+#define _TIF_WORK_SYSCALL_ENTRY \
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \
+ _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
+
+/* work to do in syscall_trace_leave() */
+#define _TIF_WORK_SYSCALL_EXIT \
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)
+
+/* work to do on interrupt/exception return */
+#define _TIF_WORK_MASK \
+ (0x0000FFFF & \
+ ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \
+ _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
+
+/* work to do on any return to user space */
+#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
+
+/* Only used for 64 bit */
+#define _TIF_DO_NOTIFY_MASK \
+ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+
+/* flags to check in __switch_to() */
+#define _TIF_WORK_CTXSW \
+ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
+ _TIF_NOTSC)
+
+#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
+
+#define PREEMPT_ACTIVE 0x10000000
+
+/* thread information allocation */
+#ifdef CONFIG_DEBUG_STACK_USAGE
+#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO)
+#else
+#define THREAD_FLAGS GFP_KERNEL
+#endif
+
+#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+
+#define alloc_thread_info(tsk) \
+ ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER))
+
+#ifdef CONFIG_X86_32
+
+#define STACK_WARN (THREAD_SIZE/8)
+/*
+ * macros/functions for gaining access to the thread information structure
+ *
+ * preempt_count needs to be 1 initially, until the scheduler is functional.
+ */
+#ifndef __ASSEMBLY__
+
+
+/* how to get the current stack pointer from C */
+register unsigned long current_stack_pointer asm("esp") __used;
+
+/* how to get the thread information struct from C */
+static inline struct thread_info *current_thread_info(void)
+{
+ return (struct thread_info *)
+ (current_stack_pointer & ~(THREAD_SIZE - 1));
+}
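
For example, assuming 8 KiB kernel stacks (THREAD_SIZE == 0x2000), masking off the low bits of %esp lands on the stack base where thread_info lives:

	esp              = 0xc1234567
	~(THREAD_SIZE-1) = 0xffffe000
	esp & mask       = 0xc1234000	/* == (struct thread_info *) for this task */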
+
+#else /* !__ASSEMBLY__ */
+
+/* how to get the thread information struct from ASM */
+#define GET_THREAD_INFO(reg) \
+ movl $-THREAD_SIZE, reg; \
+ andl %esp, reg
+
+/* use this one if reg already contains %esp */
+#define GET_THREAD_INFO_WITH_ESP(reg) \
+ andl $-THREAD_SIZE, reg
+
+#endif
+
+#else /* X86_32 */
+
+#include <asm/pda.h>
+
+/*
+ * macros/functions for gaining access to the thread information structure
+ * preempt_count needs to be 1 initially, until the scheduler is functional.
+ */
+#ifndef __ASSEMBLY__
+static inline struct thread_info *current_thread_info(void)
+{
+ struct thread_info *ti;
+ ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
+ return ti;
+}
+
+/* do not use in interrupt context */
+static inline struct thread_info *stack_thread_info(void)
+{
+ struct thread_info *ti;
+ asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1)));
+ return ti;
+}
+
+#else /* !__ASSEMBLY__ */
+
+/* how to get the thread information struct from ASM */
+#define GET_THREAD_INFO(reg) \
+ movq %gs:pda_kernelstack,reg ; \
+ subq $(THREAD_SIZE-PDA_STACKOFFSET),reg
+
+#endif
+
+#endif /* !X86_32 */
+
+/*
+ * Thread-synchronous status.
+ *
+ * This is different from the flags in that nobody else
+ * ever touches our thread-synchronous status, so we don't
+ * have to worry about atomic accesses.
+ */
+#define TS_USEDFPU 0x0001 /* FPU was used by this task
+ this quantum (SMP) */
+#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
+#define TS_POLLING 0x0004 /* true if in idle loop
+ and not sleeping */
+#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
+#define TS_XSAVE 0x0010 /* Use xsave/xrstor */
+
+#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
+
+#ifndef __ASSEMBLY__
+#define HAVE_SET_RESTORE_SIGMASK 1
+static inline void set_restore_sigmask(void)
+{
+ struct thread_info *ti = current_thread_info();
+ ti->status |= TS_RESTORE_SIGMASK;
+ set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
+}
+#endif /* !__ASSEMBLY__ */
+
+#ifndef __ASSEMBLY__
+extern void arch_task_cache_init(void);
+extern void free_thread_info(struct thread_info *ti);
+extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
+#define arch_task_cache_init arch_task_cache_init
+#endif
+#endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
new file mode 100644
index 00000000000..50c733aac42
--- /dev/null
+++ b/arch/x86/include/asm/time.h
@@ -0,0 +1,63 @@
+#ifndef _ASM_X86_TIME_H
+#define _ASM_X86_TIME_H
+
+extern void hpet_time_init(void);
+
+#include <asm/mc146818rtc.h>
+#ifdef CONFIG_X86_32
+#include <linux/efi.h>
+
+static inline unsigned long native_get_wallclock(void)
+{
+ unsigned long retval;
+
+ if (efi_enabled)
+ retval = efi_get_time();
+ else
+ retval = mach_get_cmos_time();
+
+ return retval;
+}
+
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+ int retval;
+
+ if (efi_enabled)
+ retval = efi_set_rtc_mmss(nowtime);
+ else
+ retval = mach_set_rtc_mmss(nowtime);
+
+ return retval;
+}
+
+#else
+extern void native_time_init_hook(void);
+
+static inline unsigned long native_get_wallclock(void)
+{
+ return mach_get_cmos_time();
+}
+
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+ return mach_set_rtc_mmss(nowtime);
+}
+
+#endif
+
+extern void time_init(void);
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define get_wallclock() native_get_wallclock()
+#define set_wallclock(x) native_set_wallclock(x)
+#define choose_time_init() hpet_time_init
+
+#endif /* CONFIG_PARAVIRT */
+
+extern unsigned long __init calibrate_cpu(void);
+
+#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
new file mode 100644
index 00000000000..2bb6a835c45
--- /dev/null
+++ b/arch/x86/include/asm/timer.h
@@ -0,0 +1,66 @@
+#ifndef _ASM_X86_TIMER_H
+#define _ASM_X86_TIMER_H
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <linux/percpu.h>
+
+#define TICK_SIZE (tick_nsec / 1000)
+
+unsigned long long native_sched_clock(void);
+unsigned long native_calibrate_tsc(void);
+
+#ifdef CONFIG_X86_32
+extern int timer_ack;
+extern int recalibrate_cpu_khz(void);
+#endif /* CONFIG_X86_32 */
+
+extern int no_timer_check;
+
+#ifndef CONFIG_PARAVIRT
+#define calibrate_tsc() native_calibrate_tsc()
+#endif
+
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ *
+ * We can use a kHz divisor instead of MHz to keep better precision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+DECLARE_PER_CPU(unsigned long, cyc2ns);
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+{
+ return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ unsigned long long ns;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ns = __cycles_2_ns(cyc);
+ local_irq_restore(flags);
+
+ return ns;
+}
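
As a worked instance of the scaling math above (assumed 2 GHz part): cpu_khz = 2,000,000, so the per-cpu factor is cyc2ns = (10^6 << 10) / cpu_khz = 1,024,000,000 / 2,000,000 = 512, and __cycles_2_ns() computes cyc * 512 >> 10 = cyc / 2, i.e. 0.5 ns per cycle, as expected for a 2 GHz clock.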
+
+#endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
new file mode 100644
index 00000000000..1287dc1347d
--- /dev/null
+++ b/arch/x86/include/asm/timex.h
@@ -0,0 +1,19 @@
+/* x86 architecture timex specifications */
+#ifndef _ASM_X86_TIMEX_H
+#define _ASM_X86_TIMEX_H
+
+#include <asm/processor.h>
+#include <asm/tsc.h>
+
+#ifdef CONFIG_X86_ELAN
+# define PIT_TICK_RATE 1189200 /* AMD Elan has different frequency! */
+#elif defined(CONFIG_X86_RDC321X)
+# define PIT_TICK_RATE 1041667 /* Underlying HZ for R8610 */
+#else
+# define PIT_TICK_RATE 1193182 /* Underlying HZ */
+#endif
+#define CLOCK_TICK_RATE PIT_TICK_RATE
+
+#define ARCH_HAS_READ_CURRENT_TIMER
+
+#endif /* _ASM_X86_TIMEX_H */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
new file mode 100644
index 00000000000..829215fef9e
--- /dev/null
+++ b/arch/x86/include/asm/tlb.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_X86_TLB_H
+#define _ASM_X86_TLB_H
+
+#define tlb_start_vma(tlb, vma) do { } while (0)
+#define tlb_end_vma(tlb, vma) do { } while (0)
+#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
+#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+
+#include <asm-generic/tlb.h>
+
+#endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
new file mode 100644
index 00000000000..0e7bbb54911
--- /dev/null
+++ b/arch/x86/include/asm/tlbflush.h
@@ -0,0 +1,178 @@
+#ifndef _ASM_X86_TLBFLUSH_H
+#define _ASM_X86_TLBFLUSH_H
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __flush_tlb() __native_flush_tlb()
+#define __flush_tlb_global() __native_flush_tlb_global()
+#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+#endif
+
+static inline void __native_flush_tlb(void)
+{
+ write_cr3(read_cr3());
+}
+
+static inline void __native_flush_tlb_global(void)
+{
+ unsigned long flags;
+ unsigned long cr4;
+
+ /*
+ * Read-modify-write to CR4 - protect it from preemption and
+ * from interrupts. (Use the raw variant because this code can
+ * be called from deep inside debugging code.)
+ */
+ raw_local_irq_save(flags);
+
+ cr4 = read_cr4();
+ /* clear PGE */
+ write_cr4(cr4 & ~X86_CR4_PGE);
+ /* write old PGE again and flush TLBs */
+ write_cr4(cr4);
+
+ raw_local_irq_restore(flags);
+}
+
+static inline void __native_flush_tlb_single(unsigned long addr)
+{
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+}
+
+static inline void __flush_tlb_all(void)
+{
+ if (cpu_has_pge)
+ __flush_tlb_global();
+ else
+ __flush_tlb();
+}
+
+static inline void __flush_tlb_one(unsigned long addr)
+{
+ if (cpu_has_invlpg)
+ __flush_tlb_single(addr);
+ else
+ __flush_tlb();
+}
+
+#ifdef CONFIG_X86_32
+# define TLB_FLUSH_ALL 0xffffffff
+#else
+# define TLB_FLUSH_ALL -1ULL
+#endif
+
+/*
+ * TLB flushing:
+ *
+ * - flush_tlb() flushes the current mm struct TLBs
+ * - flush_tlb_all() flushes all processes TLBs
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+ * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus
+ *
+ * ..but the i386 has somewhat limited tlb flushing capabilities,
+ * and page-granular flushes are available only on i486 and up.
+ *
+ * x86-64 can only flush individual pages or full VMs. For a range flush
+ * we always do the full VM. It might be worth trying whether a few
+ * INVLPGs in a row are a win for small ranges.
+ */
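
As a usage sketch of the interface above, code that rewrites kernel page-table entries flushes the stale translations afterwards (hypothetical function; kernel context assumed):

/* Sketch: after modifying kernel PTEs covering [start, end),
 * drop any stale TLB entries before relying on the new mapping. */
static void example_remap_flush(unsigned long start, unsigned long end)
{
	/* ... rewrite the kernel page tables for [start, end) ... */
	flush_tlb_kernel_range(start, end);
}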
+
+#ifndef CONFIG_SMP
+
+#define flush_tlb() __flush_tlb()
+#define flush_tlb_all() __flush_tlb_all()
+#define local_flush_tlb() __flush_tlb()
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+ if (mm == current->active_mm)
+ __flush_tlb();
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ if (vma->vm_mm == current->active_mm)
+ __flush_tlb_one(addr);
+}
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ if (vma->vm_mm == current->active_mm)
+ __flush_tlb();
+}
+
+static inline void native_flush_tlb_others(const cpumask_t *cpumask,
+ struct mm_struct *mm,
+ unsigned long va)
+{
+}
+
+static inline void reset_lazy_tlbstate(void)
+{
+}
+
+#else /* SMP */
+
+#include <asm/smp.h>
+
+#define local_flush_tlb() __flush_tlb()
+
+extern void flush_tlb_all(void);
+extern void flush_tlb_current_task(void);
+extern void flush_tlb_mm(struct mm_struct *);
+extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+
+#define flush_tlb() flush_tlb_current_task()
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ flush_tlb_mm(vma->vm_mm);
+}
+
+void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
+ unsigned long va);
+
+#define TLBSTATE_OK 1
+#define TLBSTATE_LAZY 2
+
+#ifdef CONFIG_X86_32
+struct tlb_state {
+ struct mm_struct *active_mm;
+ int state;
+ char __cacheline_padding[L1_CACHE_BYTES-8];
+};
+DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+
+void reset_lazy_tlbstate(void);
+#else
+static inline void reset_lazy_tlbstate(void)
+{
+}
+#endif
+
+#endif /* SMP */
+
+#ifndef CONFIG_PARAVIRT
+#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va)
+#endif
+
+static inline void flush_tlb_kernel_range(unsigned long start,
+ unsigned long end)
+{
+ flush_tlb_all();
+}
+
+#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
new file mode 100644
index 00000000000..90ac7718469
--- /dev/null
+++ b/arch/x86/include/asm/topology.h
@@ -0,0 +1,258 @@
+/*
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#ifndef _ASM_X86_TOPOLOGY_H
+#define _ASM_X86_TOPOLOGY_H
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_HT
+# define ENABLE_TOPO_DEFINES
+# endif
+#else
+# ifdef CONFIG_SMP
+# define ENABLE_TOPO_DEFINES
+# endif
+#endif
+
+/* Node not present */
+#define NUMA_NO_NODE (-1)
+
+#ifdef CONFIG_NUMA
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+
+#ifdef CONFIG_X86_32
+
+/* Mappings between node number and cpus on that node. */
+extern cpumask_t node_to_cpumask_map[];
+
+/* Mappings between logical cpu number and node number */
+extern int cpu_to_node_map[];
+
+/* Returns the number of the node containing CPU 'cpu' */
+static inline int cpu_to_node(int cpu)
+{
+ return cpu_to_node_map[cpu];
+}
+#define early_cpu_to_node(cpu) cpu_to_node(cpu)
+
+/* Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used. The
+ * node_to_cpumask_ptr function should be used whenever possible.
+ */
+static inline cpumask_t node_to_cpumask(int node)
+{
+ return node_to_cpumask_map[node];
+}
+
+#else /* CONFIG_X86_64 */
+
+/* Mappings between node number and cpus on that node. */
+extern cpumask_t *node_to_cpumask_map;
+
+/* Mappings between logical cpu number and node number */
+DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
+
+/* Returns the number of the current Node. */
+#define numa_node_id() read_pda(nodenumber)
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+extern int cpu_to_node(int cpu);
+extern int early_cpu_to_node(int cpu);
+extern const cpumask_t *_node_to_cpumask_ptr(int node);
+extern cpumask_t node_to_cpumask(int node);
+
+#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+/* Returns the number of the node containing CPU 'cpu' */
+static inline int cpu_to_node(int cpu)
+{
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+/* Same function but used if called before per_cpu areas are setup */
+static inline int early_cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map))
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
+static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+{
+ return &node_to_cpumask_map[node];
+}
+
+/* Returns a bitmask of CPUs on Node 'node'. */
+static inline cpumask_t node_to_cpumask(int node)
+{
+ return node_to_cpumask_map[node];
+}
+
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+/* Replace default node_to_cpumask_ptr with optimized version */
+#define node_to_cpumask_ptr(v, node) \
+ const cpumask_t *v = _node_to_cpumask_ptr(node)
+
+#define node_to_cpumask_ptr_next(v, node) \
+ v = _node_to_cpumask_ptr(node)
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Returns the number of the node containing Node 'node'. This
+ * architecture is flat, so it is a pretty simple function!
+ */
+#define parent_node(node) (node)
+
+#define pcibus_to_node(bus) __pcibus_to_node(bus)
+#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
+
+#ifdef CONFIG_X86_32
+extern unsigned long node_start_pfn[];
+extern unsigned long node_end_pfn[];
+extern unsigned long node_remap_size[];
+#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
+
+# define SD_CACHE_NICE_TRIES 1
+# define SD_IDLE_IDX 1
+# define SD_NEWIDLE_IDX 2
+# define SD_FORKEXEC_IDX 0
+
+#else
+
+# define SD_CACHE_NICE_TRIES 2
+# define SD_IDLE_IDX 2
+# define SD_NEWIDLE_IDX 2
+# define SD_FORKEXEC_IDX 1
+
+#endif
+
+/* sched_domains SD_NODE_INIT for NUMAQ machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_nice_tries = SD_CACHE_NICE_TRIES, \
+ .busy_idx = 3, \
+ .idle_idx = SD_IDLE_IDX, \
+ .newidle_idx = SD_NEWIDLE_IDX, \
+ .wake_idx = 1, \
+ .forkexec_idx = SD_FORKEXEC_IDX, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_BALANCE_FORK \
+ | SD_SERIALIZE \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+}
+
+#ifdef CONFIG_X86_64_ACPI_NUMA
+extern int __node_distance(int, int);
+#define node_distance(a, b) __node_distance(a, b)
+#endif
+
+#else /* !CONFIG_NUMA */
+
+#define numa_node_id() 0
+#define cpu_to_node(cpu) 0
+#define early_cpu_to_node(cpu) 0
+
+static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+{
+ return &cpu_online_map;
+}
+static inline cpumask_t node_to_cpumask(int node)
+{
+ return cpu_online_map;
+}
+static inline int node_to_first_cpu(int node)
+{
+ return first_cpu(cpu_online_map);
+}
+
+/* Replace default node_to_cpumask_ptr with optimized version */
+#define node_to_cpumask_ptr(v, node) \
+ const cpumask_t *v = _node_to_cpumask_ptr(node)
+
+#define node_to_cpumask_ptr_next(v, node) \
+ v = _node_to_cpumask_ptr(node)
+#endif
+
+#include <asm-generic/topology.h>
+
+#ifdef CONFIG_NUMA
+/* Returns the number of the first CPU on Node 'node'. */
+static inline int node_to_first_cpu(int node)
+{
+ node_to_cpumask_ptr(mask, node);
+ return first_cpu(*mask);
+}
+#endif
+
+extern cpumask_t cpu_coregroup_map(int cpu);
+
+#ifdef ENABLE_TOPO_DEFINES
+#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
+#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
+#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
+#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
+
+/* indicates that pointers to the topology cpumask_t maps are valid */
+#define arch_provides_topology_pointers yes
+#endif
+
+static inline void arch_fix_phys_package_id(int num, u32 slot)
+{
+}
+
+struct pci_bus;
+void set_pci_bus_resources_arch_default(struct pci_bus *b);
+
+#ifdef CONFIG_SMP
+#define mc_capable() (boot_cpu_data.x86_max_cores > 1)
+#define smt_capable() (smp_num_siblings > 1)
+#endif
+
+#ifdef CONFIG_NUMA
+extern int get_mp_bus_to_node(int busnum);
+extern void set_mp_bus_to_node(int busnum, int node);
+#else
+static inline int get_mp_bus_to_node(int busnum)
+{
+ return 0;
+}
+static inline void set_mp_bus_to_node(int busnum, int node)
+{
+}
+#endif
+
+#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
new file mode 100644
index 00000000000..fa0d79facdb
--- /dev/null
+++ b/arch/x86/include/asm/trampoline.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_TRAMPOLINE_H
+#define _ASM_X86_TRAMPOLINE_H
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Trampoline 80x86 program as an array.
+ */
+extern const unsigned char trampoline_data[];
+extern const unsigned char trampoline_end[];
+extern unsigned char *trampoline_base;
+
+extern unsigned long init_rsp;
+extern unsigned long initial_code;
+
+#define TRAMPOLINE_BASE 0x6000
+extern unsigned long setup_trampoline(void);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_TRAMPOLINE_H */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
new file mode 100644
index 00000000000..45dee286e45
--- /dev/null
+++ b/arch/x86/include/asm/traps.h
@@ -0,0 +1,81 @@
+#ifndef _ASM_X86_TRAPS_H
+#define _ASM_X86_TRAPS_H
+
+#include <asm/debugreg.h>
+
+#ifdef CONFIG_X86_32
+#define dotraplinkage
+#else
+#define dotraplinkage asmlinkage
+#endif
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+#ifdef CONFIG_X86_64
+asmlinkage void double_fault(void);
+#endif
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void spurious_interrupt_bug(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void alignment_check(void);
+#ifdef CONFIG_X86_MCE
+asmlinkage void machine_check(void);
+#endif /* CONFIG_X86_MCE */
+asmlinkage void simd_coprocessor_error(void);
+
+dotraplinkage void do_divide_error(struct pt_regs *, long);
+dotraplinkage void do_debug(struct pt_regs *, long);
+dotraplinkage void do_nmi(struct pt_regs *, long);
+dotraplinkage void do_int3(struct pt_regs *, long);
+dotraplinkage void do_overflow(struct pt_regs *, long);
+dotraplinkage void do_bounds(struct pt_regs *, long);
+dotraplinkage void do_invalid_op(struct pt_regs *, long);
+dotraplinkage void do_device_not_available(struct pt_regs *, long);
+dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
+dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
+dotraplinkage void do_segment_not_present(struct pt_regs *, long);
+dotraplinkage void do_stack_segment(struct pt_regs *, long);
+dotraplinkage void do_general_protection(struct pt_regs *, long);
+dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
+dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
+dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
+dotraplinkage void do_alignment_check(struct pt_regs *, long);
+#ifdef CONFIG_X86_MCE
+dotraplinkage void do_machine_check(struct pt_regs *, long);
+#endif
+dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
+#ifdef CONFIG_X86_32
+dotraplinkage void do_iret_error(struct pt_regs *, long);
+#endif
+
+static inline int get_si_code(unsigned long condition)
+{
+ if (condition & DR_STEP)
+ return TRAP_TRACE;
+ else if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3))
+ return TRAP_HWBKPT;
+ else
+ return TRAP_BRKPT;
+}
+
+extern int panic_on_unrecovered_nmi;
+extern int kstack_depth_to_print;
+
+#ifdef CONFIG_X86_32
+void math_error(void __user *);
+unsigned long patch_espfix_desc(unsigned long, unsigned long);
+asmlinkage void math_emulate(long);
+#endif
+
+#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
new file mode 100644
index 00000000000..38ae163cc91
--- /dev/null
+++ b/arch/x86/include/asm/tsc.h
@@ -0,0 +1,62 @@
+/*
+ * x86 TSC related functions
+ */
+#ifndef _ASM_X86_TSC_H
+#define _ASM_X86_TSC_H
+
+#include <asm/processor.h>
+
+#define NS_SCALE 10 /* 2^10, carefully chosen */
+#define US_SCALE 32 /* 2^32, arbitrarily chosen */
+
+/*
+ * Standard way to access the cycle counter.
+ */
+typedef unsigned long long cycles_t;
+
+extern unsigned int cpu_khz;
+extern unsigned int tsc_khz;
+
+extern void disable_TSC(void);
+
+static inline cycles_t get_cycles(void)
+{
+ unsigned long long ret = 0;
+
+#ifndef CONFIG_X86_TSC
+ if (!cpu_has_tsc)
+ return 0;
+#endif
+ rdtscll(ret);
+
+ return ret;
+}
+
+static __always_inline cycles_t vget_cycles(void)
+{
+ /*
+	 * We only do VDSOs on TSC-capable CPUs, so this shouldn't
+ * access boot_cpu_data (which is not VDSO-safe):
+ */
+#ifndef CONFIG_X86_TSC
+ if (!cpu_has_tsc)
+ return 0;
+#endif
+ return (cycles_t)__native_read_tsc();
+}
+
+extern void tsc_init(void);
+extern void mark_tsc_unstable(char *reason);
+extern int unsynchronized_tsc(void);
+int check_tsc_unstable(void);
+
+/*
+ * Boot-time check whether the TSCs are synchronized across
+ * all CPUs/cores:
+ */
+extern void check_tsc_sync_source(int cpu);
+extern void check_tsc_sync_target(void);
+
+extern int notsc_setup(char *);
+
+#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
new file mode 100644
index 00000000000..e6f73632007
--- /dev/null
+++ b/arch/x86/include/asm/types.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_TYPES_H
+#define _ASM_X86_TYPES_H
+
+#include <asm-generic/int-ll64.h>
+
+#ifndef __ASSEMBLY__
+
+typedef unsigned short umode_t;
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * These aren't exported outside the kernel to avoid name space clashes
+ */
+#ifdef __KERNEL__
+
+#ifdef CONFIG_X86_32
+# define BITS_PER_LONG 32
+#else
+# define BITS_PER_LONG 64
+#endif
+
+#ifndef __ASSEMBLY__
+
+typedef u64 dma64_addr_t;
+#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G)
+/* DMA addresses come in 32-bit and 64-bit flavours. */
+typedef u64 dma_addr_t;
+#else
+typedef u32 dma_addr_t;
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_TYPES_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
new file mode 100644
index 00000000000..99192bb55a5
--- /dev/null
+++ b/arch/x86/include/asm/uaccess.h
@@ -0,0 +1,456 @@
+#ifndef _ASM_X86_UACCESS_H
+#define _ASM_X86_UACCESS_H
+/*
+ * User space memory access functions
+ */
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/thread_info.h>
+#include <linux/prefetch.h>
+#include <linux/string.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
+#define VERIFY_READ 0
+#define VERIFY_WRITE 1
+
+/*
+ * The fs value determines whether argument validity checking should be
+ * performed or not. If get_fs() == USER_DS, checking is performed, with
+ * get_fs() == KERNEL_DS, checking is bypassed.
+ *
+ * For historical reasons, these macros are grossly misnamed.
+ */
+
+#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
+
+#define KERNEL_DS MAKE_MM_SEG(-1UL)
+#define USER_DS MAKE_MM_SEG(PAGE_OFFSET)
+
+#define get_ds() (KERNEL_DS)
+#define get_fs() (current_thread_info()->addr_limit)
+#define set_fs(x) (current_thread_info()->addr_limit = (x))
+
+#define segment_eq(a, b) ((a).seg == (b).seg)
+
+#define __addr_ok(addr) \
+ ((unsigned long __force)(addr) < \
+ (current_thread_info()->addr_limit.seg))
+
+/*
+ * Test whether a block of memory is a valid user space address.
+ * Returns 0 if the range is valid, nonzero otherwise.
+ *
+ * This is equivalent to the following test:
+ * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
+ *
+ * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
+ */
+
+#define __range_not_ok(addr, size) \
+({ \
+ unsigned long flag, roksum; \
+ __chk_user_ptr(addr); \
+ asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \
+ : "=&r" (flag), "=r" (roksum) \
+ : "1" (addr), "g" ((long)(size)), \
+ "rm" (current_thread_info()->addr_limit.seg)); \
+ flag; \
+})
+
+/**
+ * access_ok: - Checks if a user space pointer is valid
+ * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
+ * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
+ * to write to a block, it is always safe to read from it.
+ * @addr: User space pointer to start of block to check
+ * @size: Size of block to check
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Checks if a pointer to a block of memory in user space is valid.
+ *
+ * Returns true (nonzero) if the memory block may be valid, false (zero)
+ * if it is definitely invalid.
+ *
+ * Note that, depending on architecture, this function probably just
+ * checks that the pointer is in the user space range - after calling
+ * this function, memory access functions may still return -EFAULT.
+ */
+#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
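
A common pattern validates the whole user range once with access_ok() and then uses the cheaper unchecked accessors defined further down (sketch with hypothetical names; the accesses can still fault and must be checked):

if (!access_ok(VERIFY_READ, ubuf, len))
	return -EFAULT;
for (i = 0; i < len; i++) {
	u8 c;
	if (__get_user(c, ubuf + i))	/* unchecked variant, may fault */
		return -EFAULT;
	process_byte(c);		/* hypothetical consumer */
}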
+
+/*
+ * The exception table consists of pairs of addresses: the first is the
+ * address of an instruction that is allowed to fault, and the second is
+ * the address at which the program should continue. No registers are
+ * modified, so it is entirely up to the continuation code to figure out
+ * what to do.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path. This means when everything is well,
+ * we don't even have to jump over them. Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry {
+ unsigned long insn, fixup;
+};
+
+extern int fixup_exception(struct pt_regs *regs);
+
+/*
+ * These are the main single-value transfer routines. They automatically
+ * use the right size if we just have the right pointer type.
+ *
+ * This gets kind of ugly. We want to return _two_ values in "get_user()"
+ * and yet we don't want to do any pointers, because that is too much
+ * of a performance impact. Thus we have a few rather ugly macros here,
+ * and hide all the ugliness from the user.
+ *
+ * The "__xxx" versions of the user access functions are versions that
+ * do not verify the address space, that must have been done previously
+ * with a separate "access_ok()" call (this is used when we do multiple
+ * accesses to the same area of user memory).
+ */
+
+extern int __get_user_1(void);
+extern int __get_user_2(void);
+extern int __get_user_4(void);
+extern int __get_user_8(void);
+extern int __get_user_bad(void);
+
+#define __get_user_x(size, ret, x, ptr) \
+ asm volatile("call __get_user_" #size \
+ : "=a" (ret),"=d" (x) \
+ : "0" (ptr)) \
+
+/* Careful: we have to cast the result to the type of the pointer
+ * for sign reasons */
+
+/**
+ * get_user: - Get a simple variable from user space.
+ * @x: Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+#ifdef CONFIG_X86_32
+#define __get_user_8(__ret_gu, __val_gu, ptr) \
+ __get_user_x(X, __ret_gu, __val_gu, ptr)
+#else
+#define __get_user_8(__ret_gu, __val_gu, ptr) \
+ __get_user_x(8, __ret_gu, __val_gu, ptr)
+#endif
+
+#define get_user(x, ptr) \
+({ \
+ int __ret_gu; \
+ unsigned long __val_gu; \
+ __chk_user_ptr(ptr); \
+ might_fault(); \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ __get_user_x(1, __ret_gu, __val_gu, ptr); \
+ break; \
+ case 2: \
+ __get_user_x(2, __ret_gu, __val_gu, ptr); \
+ break; \
+ case 4: \
+ __get_user_x(4, __ret_gu, __val_gu, ptr); \
+ break; \
+ case 8: \
+ __get_user_8(__ret_gu, __val_gu, ptr); \
+ break; \
+ default: \
+ __get_user_x(X, __ret_gu, __val_gu, ptr); \
+ break; \
+ } \
+ (x) = (__typeof__(*(ptr)))__val_gu; \
+ __ret_gu; \
+})
+
+#define __put_user_x(size, x, ptr, __ret_pu) \
+ asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
+ :"0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
+
+
+
+#ifdef CONFIG_X86_32
+#define __put_user_u64(x, addr, err) \
+ asm volatile("1: movl %%eax,0(%2)\n" \
+ "2: movl %%edx,4(%2)\n" \
+ "3:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "4: movl %3,%0\n" \
+ " jmp 3b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ : "=r" (err) \
+ : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err))
+
+#define __put_user_x8(x, ptr, __ret_pu) \
+ asm volatile("call __put_user_8" : "=a" (__ret_pu) \
+ : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
+#else
+#define __put_user_u64(x, ptr, retval) \
+ __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT)
+#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
+#endif
+
+extern void __put_user_bad(void);
+
+/*
+ * Strange magic calling convention: pointer in %ecx,
+ * value in %eax(:%edx), return value in %eax. clobbers %rbx
+ */
+extern void __put_user_1(void);
+extern void __put_user_2(void);
+extern void __put_user_4(void);
+extern void __put_user_8(void);
+
+#ifdef CONFIG_X86_WP_WORKS_OK
+
+/**
+ * put_user: - Write a simple value into user space.
+ * @x: Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+#define put_user(x, ptr) \
+({ \
+ int __ret_pu; \
+ __typeof__(*(ptr)) __pu_val; \
+ __chk_user_ptr(ptr); \
+ might_fault(); \
+ __pu_val = x; \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ __put_user_x(1, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 2: \
+ __put_user_x(2, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 4: \
+ __put_user_x(4, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 8: \
+ __put_user_x8(__pu_val, ptr, __ret_pu); \
+ break; \
+ default: \
+ __put_user_x(X, __pu_val, ptr, __ret_pu); \
+ break; \
+ } \
+ __ret_pu; \
+})
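
As a usage sketch pairing the two macros (hypothetical helper; kernel context assumed), get_user() and put_user() each move one scalar across the user boundary and return 0 on success or -EFAULT on error:

static int example_double_it(int __user *uptr)
{
	int val;

	if (get_user(val, uptr))	/* copy in; nonzero == fault */
		return -EFAULT;
	val *= 2;
	if (put_user(val, uptr))	/* copy back out */
		return -EFAULT;
	return 0;
}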
+
+#define __put_user_size(x, ptr, size, retval, errret) \
+do { \
+ retval = 0; \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __put_user_asm(x, ptr, retval, "b", "b", "iq", errret); \
+ break; \
+ case 2: \
+ __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \
+ break; \
+ case 4: \
+ __put_user_asm(x, ptr, retval, "l", "k", "ir", errret);\
+ break; \
+ case 8: \
+ __put_user_u64((__typeof__(*ptr))(x), ptr, retval); \
+ break; \
+ default: \
+ __put_user_bad(); \
+ } \
+} while (0)
+
+#else
+
+#define __put_user_size(x, ptr, size, retval, errret) \
+do { \
+ __typeof__(*(ptr))__pus_tmp = x; \
+ retval = 0; \
+ \
+ if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, size) != 0)) \
+ retval = errret; \
+} while (0)
+
+#define put_user(x, ptr) \
+({ \
+ int __ret_pu; \
+ __typeof__(*(ptr))__pus_tmp = x; \
+ __ret_pu = 0; \
+ if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, \
+ sizeof(*(ptr))) != 0)) \
+ __ret_pu = -EFAULT; \
+ __ret_pu; \
+})
+#endif
+
+#ifdef CONFIG_X86_32
+#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad()
+#else
+#define __get_user_asm_u64(x, ptr, retval, errret) \
+ __get_user_asm(x, ptr, retval, "q", "", "=r", errret)
+#endif
+
+#define __get_user_size(x, ptr, size, retval, errret) \
+do { \
+ retval = 0; \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __get_user_asm(x, ptr, retval, "b", "b", "=q", errret); \
+ break; \
+ case 2: \
+ __get_user_asm(x, ptr, retval, "w", "w", "=r", errret); \
+ break; \
+ case 4: \
+ __get_user_asm(x, ptr, retval, "l", "k", "=r", errret); \
+ break; \
+ case 8: \
+ __get_user_asm_u64(x, ptr, retval, errret); \
+ break; \
+ default: \
+ (x) = __get_user_bad(); \
+ } \
+} while (0)
+
+#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile("1: mov"itype" %2,%"rtype"1\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " xor"itype" %"rtype"1,%"rtype"1\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r" (err), ltype(x) \
+ : "m" (__m(addr)), "i" (errret), "0" (err))
+
+#define __put_user_nocheck(x, ptr, size) \
+({ \
+ long __pu_err; \
+ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
+ __pu_err; \
+})
+
+#define __get_user_nocheck(x, ptr, size) \
+({ \
+ long __gu_err; \
+ unsigned long __gu_val; \
+ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
+ __gu_err; \
+})
+
+/* FIXME: this hack is definitely wrong -AK */
+struct __large_struct { unsigned long buf[100]; };
+#define __m(x) (*(struct __large_struct __user *)(x))
+
+/*
+ * Tell gcc we read from memory instead of writing: this is because
+ * we do not write to any memory gcc knows about, so there are no
+ * aliasing issues.
+ */
+#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile("1: mov"itype" %"rtype"1,%2\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r"(err) \
+ : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
+/**
+ * __get_user: - Get a simple variable from user space, with less checking.
+ * @x: Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+
+#define __get_user(x, ptr) \
+ __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
+/**
+ * __put_user: - Write a simple value into user space, with less checking.
+ * @x: Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+
+#define __put_user(x, ptr) \
+ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
+
+#define __get_user_unaligned __get_user
+#define __put_user_unaligned __put_user
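
The double-underscore variants drop the access_ok() check and leave it to
the caller, which lets a single range check cover several accesses. A
minimal sketch under that contract; read_pair() is hypothetical:

	#include <linux/errno.h>
	#include <linux/uaccess.h>

	/* Hypothetical: read two adjacent ints after a single range check. */
	static long read_pair(const int __user *uptr, int *a, int *b)
	{
		if (!access_ok(VERIFY_READ, uptr, 2 * sizeof(int)))
			return -EFAULT;

		/* The range is validated, so the unchecked accessors are safe. */
		if (__get_user(*a, uptr) || __get_user(*b, uptr + 1))
			return -EFAULT;

		return 0;
	}
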
+
+/*
+ * movsl can be slow when source and dest are not both 8-byte aligned
+ */
+#ifdef CONFIG_X86_INTEL_USERCOPY
+extern struct movsl_mask {
+ int mask;
+} ____cacheline_aligned_in_smp movsl_mask;
+#endif
+
+#define ARCH_HAS_NOCACHE_UACCESS 1
+
+#ifdef CONFIG_X86_32
+# include "uaccess_32.h"
+#else
+# define ARCH_HAS_SEARCH_EXTABLE
+# include "uaccess_64.h"
+#endif
+
+#endif /* _ASM_X86_UACCESS_H */
+
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
new file mode 100644
index 00000000000..5e06259e90e
--- /dev/null
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -0,0 +1,218 @@
+#ifndef _ASM_X86_UACCESS_32_H
+#define _ASM_X86_UACCESS_32_H
+
+/*
+ * User space memory access functions
+ */
+#include <linux/errno.h>
+#include <linux/thread_info.h>
+#include <linux/prefetch.h>
+#include <linux/string.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
+unsigned long __must_check __copy_to_user_ll
+ (void __user *to, const void *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll
+ (void *to, const void __user *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll_nozero
+ (void *to, const void __user *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll_nocache
+ (void *to, const void __user *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll_nocache_nozero
+ (void *to, const void __user *from, unsigned long n);
+
+/**
+ * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
+ * @to: Destination address, in user space.
+ * @from: Source address, in kernel space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only.
+ *
+ * Copy data from kernel space to user space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ * The caller should also make sure he pins the user space address
+ * so that the we don't result in page fault and sleep.
+ *
+ * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
+ * we return the initial request size (1, 2 or 4), as copy_*_user should do.
+ * If a store crosses a page boundary and gets a fault, the x86 will not write
+ * anything, so this is accurate.
+ */
+
+static __always_inline unsigned long __must_check
+__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
+{
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __put_user_size(*(u8 *)from, (u8 __user *)to,
+ 1, ret, 1);
+ return ret;
+ case 2:
+ __put_user_size(*(u16 *)from, (u16 __user *)to,
+ 2, ret, 2);
+ return ret;
+ case 4:
+ __put_user_size(*(u32 *)from, (u32 __user *)to,
+ 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_to_user_ll(to, from, n);
+}
+
+/**
+ * __copy_to_user: - Copy a block of data into user space, with less checking.
+ * @to: Destination address, in user space.
+ * @from: Source address, in kernel space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Copy data from kernel space to user space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ */
+static __always_inline unsigned long __must_check
+__copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+ might_fault();
+ return __copy_to_user_inatomic(to, from, n);
+}
+
+static __always_inline unsigned long
+__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
+{
+	/* Avoid zeroing the tail if the copy fails.
+ * If 'n' is constant and 1, 2, or 4, we do still zero on a failure,
+ * but as the zeroing behaviour is only significant when n is not
+ * constant, that shouldn't be a problem.
+ */
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __get_user_size(*(u8 *)to, from, 1, ret, 1);
+ return ret;
+ case 2:
+ __get_user_size(*(u16 *)to, from, 2, ret, 2);
+ return ret;
+ case 4:
+ __get_user_size(*(u32 *)to, from, 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_from_user_ll_nozero(to, from, n);
+}
+
+/**
+ * __copy_from_user: - Copy a block of data from user space, with less checking.
+ * @to: Destination address, in kernel space.
+ * @from: Source address, in user space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Copy data from user space to kernel space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ *
+ * If some data could not be copied, this function will pad the copied
+ * data to the requested size using zero bytes.
+ *
+ * An alternate version - __copy_from_user_inatomic() - may be called from
+ * atomic context and will fail rather than sleep. In this case the
+ * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h
+ * for explanation of why this is needed.
+ */
+static __always_inline unsigned long
+__copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+ might_fault();
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __get_user_size(*(u8 *)to, from, 1, ret, 1);
+ return ret;
+ case 2:
+ __get_user_size(*(u16 *)to, from, 2, ret, 2);
+ return ret;
+ case 4:
+ __get_user_size(*(u32 *)to, from, 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_from_user_ll(to, from, n);
+}
+
+static __always_inline unsigned long __copy_from_user_nocache(void *to,
+ const void __user *from, unsigned long n)
+{
+ might_fault();
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __get_user_size(*(u8 *)to, from, 1, ret, 1);
+ return ret;
+ case 2:
+ __get_user_size(*(u16 *)to, from, 2, ret, 2);
+ return ret;
+ case 4:
+ __get_user_size(*(u32 *)to, from, 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_from_user_ll_nocache(to, from, n);
+}
+
+static __always_inline unsigned long
+__copy_from_user_inatomic_nocache(void *to, const void __user *from,
+ unsigned long n)
+{
+ return __copy_from_user_ll_nocache_nozero(to, from, n);
+}
+
+unsigned long __must_check copy_to_user(void __user *to,
+ const void *from, unsigned long n);
+unsigned long __must_check copy_from_user(void *to,
+ const void __user *from,
+ unsigned long n);
+long __must_check strncpy_from_user(char *dst, const char __user *src,
+ long count);
+long __must_check __strncpy_from_user(char *dst,
+ const char __user *src, long count);
+
+/**
+ * strlen_user: - Get the size of a string in user space.
+ * @str: The string to measure.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Get the size of a NUL-terminated string in user space.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ * On exception, returns 0.
+ *
+ * If there is a limit on the length of a valid string, you may wish to
+ * consider using strnlen_user() instead.
+ */
+#define strlen_user(str) strnlen_user(str, LONG_MAX)
+
+long strnlen_user(const char __user *str, long n);
+unsigned long __must_check clear_user(void __user *mem, unsigned long len);
+unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
+
+#endif /* _ASM_X86_UACCESS_32_H */
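
To round out the picture, a hypothetical read-side handler built on the
checking copy_to_user() declared above; demo_read() and its message are
illustrative:

	#include <linux/errno.h>
	#include <linux/types.h>
	#include <linux/uaccess.h>

	/* Hypothetical read path: hand a bounded block back to user space. */
	static ssize_t demo_read(char __user *ubuf, size_t len)
	{
		static const char msg[] = "hello\n";

		if (len > sizeof(msg))
			len = sizeof(msg);

		/* copy_to_user() returns the number of bytes left uncopied. */
		if (copy_to_user(ubuf, msg, len))
			return -EFAULT;

		return len;
	}
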
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
new file mode 100644
index 00000000000..543ba883cc6
--- /dev/null
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -0,0 +1,208 @@
+#ifndef _ASM_X86_UACCESS_64_H
+#define _ASM_X86_UACCESS_64_H
+
+/*
+ * User space memory access functions
+ */
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/prefetch.h>
+#include <linux/lockdep.h>
+#include <asm/page.h>
+
+/*
+ * Copy To/From Userspace
+ */
+
+/* Handles exceptions in both to and from, but doesn't do access_ok */
+__must_check unsigned long
+copy_user_generic(void *to, const void *from, unsigned len);
+
+__must_check unsigned long
+copy_to_user(void __user *to, const void *from, unsigned len);
+__must_check unsigned long
+copy_from_user(void *to, const void __user *from, unsigned len);
+__must_check unsigned long
+copy_in_user(void __user *to, const void __user *from, unsigned len);
+
+static __always_inline __must_check
+int __copy_from_user(void *dst, const void __user *src, unsigned size)
+{
+ int ret = 0;
+
+ might_fault();
+ if (!__builtin_constant_p(size))
+ return copy_user_generic(dst, (__force void *)src, size);
+ switch (size) {
+ case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
+ ret, "b", "b", "=q", 1);
+ return ret;
+ case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
+ ret, "w", "w", "=r", 2);
+ return ret;
+ case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
+ ret, "l", "k", "=r", 4);
+ return ret;
+ case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 8);
+ return ret;
+ case 10:
+ __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+			       ret, "q", "", "=r", 10);
+ if (unlikely(ret))
+ return ret;
+ __get_user_asm(*(u16 *)(8 + (char *)dst),
+ (u16 __user *)(8 + (char __user *)src),
+ ret, "w", "w", "=r", 2);
+ return ret;
+ case 16:
+ __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 16);
+ if (unlikely(ret))
+ return ret;
+ __get_user_asm(*(u64 *)(8 + (char *)dst),
+ (u64 __user *)(8 + (char __user *)src),
+ ret, "q", "", "=r", 8);
+ return ret;
+ default:
+ return copy_user_generic(dst, (__force void *)src, size);
+ }
+}
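
The switch above fires only when the compiler can prove that size is a
constant, so a fixed-size copy such as this hypothetical 16-byte header
inlines to two movq loads instead of calling copy_user_generic();
struct demo_hdr and fetch_hdr() are illustrative:

	#include <linux/types.h>
	#include <linux/uaccess.h>

	/* Hypothetical fixed-size header; sizeof == 16, a compile-time constant. */
	struct demo_hdr {
		u64 seq;
		u64 stamp;
	};

	static int fetch_hdr(struct demo_hdr *dst, const struct demo_hdr __user *src)
	{
		/* Hits 'case 16' above: two inlined loads, no function call. */
		return __copy_from_user(dst, src, sizeof(*dst));
	}
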
+
+static __always_inline __must_check
+int __copy_to_user(void __user *dst, const void *src, unsigned size)
+{
+ int ret = 0;
+
+ might_fault();
+ if (!__builtin_constant_p(size))
+ return copy_user_generic((__force void *)dst, src, size);
+ switch (size) {
+ case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
+ ret, "b", "b", "iq", 1);
+ return ret;
+ case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ return ret;
+ case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
+ ret, "l", "k", "ir", 4);
+ return ret;
+ case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
+			      ret, "q", "", "er", 8);
+ return ret;
+ case 10:
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
+			       ret, "q", "", "er", 10);
+ if (unlikely(ret))
+ return ret;
+ asm("":::"memory");
+		__put_user_asm(*(u16 *)(8 + (char *)src),
+			       (u16 __user *)(8 + (char __user *)dst),
+			       ret, "w", "w", "ir", 2);
+ return ret;
+ case 16:
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
+			       ret, "q", "", "er", 16);
+ if (unlikely(ret))
+ return ret;
+ asm("":::"memory");
+		__put_user_asm(*(u64 *)(8 + (char *)src),
+			       (u64 __user *)(8 + (char __user *)dst),
+			       ret, "q", "", "er", 8);
+ return ret;
+ default:
+ return copy_user_generic((__force void *)dst, src, size);
+ }
+}
+
+static __always_inline __must_check
+int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
+{
+ int ret = 0;
+
+ might_fault();
+ if (!__builtin_constant_p(size))
+ return copy_user_generic((__force void *)dst,
+ (__force void *)src, size);
+ switch (size) {
+ case 1: {
+ u8 tmp;
+ __get_user_asm(tmp, (u8 __user *)src,
+ ret, "b", "b", "=q", 1);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u8 __user *)dst,
+ ret, "b", "b", "iq", 1);
+ return ret;
+ }
+ case 2: {
+ u16 tmp;
+ __get_user_asm(tmp, (u16 __user *)src,
+ ret, "w", "w", "=r", 2);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ return ret;
+ }
+
+ case 4: {
+ u32 tmp;
+ __get_user_asm(tmp, (u32 __user *)src,
+ ret, "l", "k", "=r", 4);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u32 __user *)dst,
+ ret, "l", "k", "ir", 4);
+ return ret;
+ }
+ case 8: {
+ u64 tmp;
+ __get_user_asm(tmp, (u64 __user *)src,
+ ret, "q", "", "=r", 8);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u64 __user *)dst,
+				       ret, "q", "", "er", 8);
+ return ret;
+ }
+ default:
+ return copy_user_generic((__force void *)dst,
+ (__force void *)src, size);
+ }
+}
+
+__must_check long
+strncpy_from_user(char *dst, const char __user *src, long count);
+__must_check long
+__strncpy_from_user(char *dst, const char __user *src, long count);
+__must_check long strnlen_user(const char __user *str, long n);
+__must_check long __strnlen_user(const char __user *str, long n);
+__must_check long strlen_user(const char __user *str);
+__must_check unsigned long clear_user(void __user *mem, unsigned long len);
+__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
+
+__must_check long __copy_from_user_inatomic(void *dst, const void __user *src,
+ unsigned size);
+
+static __must_check __always_inline int
+__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
+{
+ return copy_user_generic((__force void *)dst, src, size);
+}
+
+extern long __copy_user_nocache(void *dst, const void __user *src,
+ unsigned size, int zerorest);
+
+static inline int __copy_from_user_nocache(void *dst, const void __user *src,
+ unsigned size)
+{
+ might_sleep();
+ return __copy_user_nocache(dst, src, size, 1);
+}
+
+static inline int __copy_from_user_inatomic_nocache(void *dst,
+ const void __user *src,
+ unsigned size)
+{
+ return __copy_user_nocache(dst, src, size, 0);
+}
+
+unsigned long
+copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+
+#endif /* _ASM_X86_UACCESS_64_H */
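
A hedged sketch of where the nocache variants pay off: bulk copies whose
destination will not be re-read soon, such as staging data bound for disk.
Note the zerorest argument above: the sleeping wrapper zero-fills the
uncopied tail, the inatomic one does not. stage_user_data() is hypothetical:

	#include <linux/uaccess.h>

	/*
	 * Hypothetical: stage a large user buffer into a kernel page that
	 * will be written out rather than read again soon.
	 */
	static int stage_user_data(void *kaddr, const char __user *ubuf,
				   unsigned bytes)
	{
		/* Non-temporal stores keep the copy from evicting hot cache lines. */
		return __copy_from_user_nocache(kaddr, ubuf, bytes);
	}
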
diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h
new file mode 100644
index 00000000000..87324cf439d
--- /dev/null
+++ b/arch/x86/include/asm/ucontext.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_UCONTEXT_H
+#define _ASM_X86_UCONTEXT_H
+
+#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state
+ * information in the memory layout pointed
+ * by the fpstate pointer in the ucontext's
+ * sigcontext struct (uc_mcontext).
+ */
+
+struct ucontext {
+ unsigned long uc_flags;
+ struct ucontext *uc_link;
+ stack_t uc_stack;
+ struct sigcontext uc_mcontext;
+ sigset_t uc_sigmask; /* mask last for extensibility */
+};
+
+#endif /* _ASM_X86_UCONTEXT_H */
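
For illustration only: a user-space SA_SIGINFO handler receives this
structure through its third argument (glibc's ucontext_t mirrors the
layout) and can test UC_FP_XSTATE to learn whether the fpstate area of
uc_mcontext carries the extended xsave image. A hedged sketch, not derived
from the patch:

	#include <signal.h>
	#include <string.h>
	#include <ucontext.h>
	#include <unistd.h>

	#ifndef UC_FP_XSTATE
	#define UC_FP_XSTATE 0x1	/* mirrors the flag defined above */
	#endif

	static void handler(int sig, siginfo_t *info, void *uc_void)
	{
		ucontext_t *uc = uc_void;

		if (uc->uc_flags & UC_FP_XSTATE)
			write(2, "xsave layout\n", 13);	/* async-signal-safe */
	}

	int main(void)
	{
		struct sigaction sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGUSR1, &sa, NULL);
		raise(SIGUSR1);
		return 0;
	}
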
diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h
new file mode 100644
index 00000000000..a7bd416b476
--- /dev/null
+++ b/arch/x86/include/asm/unaligned.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_UNALIGNED_H
+#define _ASM_X86_UNALIGNED_H
+
+/*
+ * The x86 can do unaligned accesses itself.
+ */
+
+#include <linux/unaligned/access_ok.h>
+#include <linux/unaligned/generic.h>
+
+#define get_unaligned __get_unaligned_le
+#define put_unaligned __put_unaligned_le
+
+#endif /* _ASM_X86_UNALIGNED_H */
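
Since x86 tolerates unaligned loads, these accessors cost nothing here,
yet they keep the same code correct on strict-alignment architectures. A
hypothetical fragment (demo_parse_len() is illustrative):

	#include <linux/types.h>
	#include <asm/unaligned.h>

	/* Hypothetical: pull a 32-bit length field from an odd packet offset. */
	static u32 demo_parse_len(const u8 *pkt)
	{
		/* A plain load on x86; byte-wise access on strict-alignment CPUs. */
		return get_unaligned((const u32 *)(pkt + 1));
	}
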
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
new file mode 100644
index 00000000000..2a58ed3e51d
--- /dev/null
+++ b/arch/x86/include/asm/unistd.h
@@ -0,0 +1,13 @@
+#ifdef __KERNEL__
+# ifdef CONFIG_X86_32
+# include "unistd_32.h"
+# else
+# include "unistd_64.h"
+# endif
+#else
+# ifdef __i386__
+# include "unistd_32.h"
+# else
+# include "unistd_64.h"
+# endif
+#endif
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
new file mode 100644
index 00000000000..f2bba78430a
--- /dev/null
+++ b/arch/x86/include/asm/unistd_32.h
@@ -0,0 +1,379 @@
+#ifndef _ASM_X86_UNISTD_32_H
+#define _ASM_X86_UNISTD_32_H
+
+/*
+ * This file contains the system call numbers.
+ */
+
+#define __NR_restart_syscall 0
+#define __NR_exit 1
+#define __NR_fork 2
+#define __NR_read 3
+#define __NR_write 4
+#define __NR_open 5
+#define __NR_close 6
+#define __NR_waitpid 7
+#define __NR_creat 8
+#define __NR_link 9
+#define __NR_unlink 10
+#define __NR_execve 11
+#define __NR_chdir 12
+#define __NR_time 13
+#define __NR_mknod 14
+#define __NR_chmod 15
+#define __NR_lchown 16
+#define __NR_break 17
+#define __NR_oldstat 18
+#define __NR_lseek 19
+#define __NR_getpid 20
+#define __NR_mount 21
+#define __NR_umount 22
+#define __NR_setuid 23
+#define __NR_getuid 24
+#define __NR_stime 25
+#define __NR_ptrace 26
+#define __NR_alarm 27
+#define __NR_oldfstat 28
+#define __NR_pause 29
+#define __NR_utime 30
+#define __NR_stty 31
+#define __NR_gtty 32
+#define __NR_access 33
+#define __NR_nice 34
+#define __NR_ftime 35
+#define __NR_sync 36
+#define __NR_kill 37
+#define __NR_rename 38
+#define __NR_mkdir 39
+#define __NR_rmdir 40
+#define __NR_dup 41
+#define __NR_pipe 42
+#define __NR_times 43
+#define __NR_prof 44
+#define __NR_brk 45
+#define __NR_setgid 46
+#define __NR_getgid 47
+#define __NR_signal 48
+#define __NR_geteuid 49
+#define __NR_getegid 50
+#define __NR_acct 51
+#define __NR_umount2 52
+#define __NR_lock 53
+#define __NR_ioctl 54
+#define __NR_fcntl 55
+#define __NR_mpx 56
+#define __NR_setpgid 57
+#define __NR_ulimit 58
+#define __NR_oldolduname 59
+#define __NR_umask 60
+#define __NR_chroot 61
+#define __NR_ustat 62
+#define __NR_dup2 63
+#define __NR_getppid 64
+#define __NR_getpgrp 65
+#define __NR_setsid 66
+#define __NR_sigaction 67
+#define __NR_sgetmask 68
+#define __NR_ssetmask 69
+#define __NR_setreuid 70
+#define __NR_setregid 71
+#define __NR_sigsuspend 72
+#define __NR_sigpending 73
+#define __NR_sethostname 74
+#define __NR_setrlimit 75
+#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */
+#define __NR_getrusage 77
+#define __NR_gettimeofday 78
+#define __NR_settimeofday 79
+#define __NR_getgroups 80
+#define __NR_setgroups 81
+#define __NR_select 82
+#define __NR_symlink 83
+#define __NR_oldlstat 84
+#define __NR_readlink 85
+#define __NR_uselib 86
+#define __NR_swapon 87
+#define __NR_reboot 88
+#define __NR_readdir 89
+#define __NR_mmap 90
+#define __NR_munmap 91
+#define __NR_truncate 92
+#define __NR_ftruncate 93
+#define __NR_fchmod 94
+#define __NR_fchown 95
+#define __NR_getpriority 96
+#define __NR_setpriority 97
+#define __NR_profil 98
+#define __NR_statfs 99
+#define __NR_fstatfs 100
+#define __NR_ioperm 101
+#define __NR_socketcall 102
+#define __NR_syslog 103
+#define __NR_setitimer 104
+#define __NR_getitimer 105
+#define __NR_stat 106
+#define __NR_lstat 107
+#define __NR_fstat 108
+#define __NR_olduname 109
+#define __NR_iopl 110
+#define __NR_vhangup 111
+#define __NR_idle 112
+#define __NR_vm86old 113
+#define __NR_wait4 114
+#define __NR_swapoff 115
+#define __NR_sysinfo 116
+#define __NR_ipc 117
+#define __NR_fsync 118
+#define __NR_sigreturn 119
+#define __NR_clone 120
+#define __NR_setdomainname 121
+#define __NR_uname 122
+#define __NR_modify_ldt 123
+#define __NR_adjtimex 124
+#define __NR_mprotect 125
+#define __NR_sigprocmask 126
+#define __NR_create_module 127
+#define __NR_init_module 128
+#define __NR_delete_module 129
+#define __NR_get_kernel_syms 130
+#define __NR_quotactl 131
+#define __NR_getpgid 132
+#define __NR_fchdir 133
+#define __NR_bdflush 134
+#define __NR_sysfs 135
+#define __NR_personality 136
+#define __NR_afs_syscall 137 /* Syscall for Andrew File System */
+#define __NR_setfsuid 138
+#define __NR_setfsgid 139
+#define __NR__llseek 140
+#define __NR_getdents 141
+#define __NR__newselect 142
+#define __NR_flock 143
+#define __NR_msync 144
+#define __NR_readv 145
+#define __NR_writev 146
+#define __NR_getsid 147
+#define __NR_fdatasync 148
+#define __NR__sysctl 149
+#define __NR_mlock 150
+#define __NR_munlock 151
+#define __NR_mlockall 152
+#define __NR_munlockall 153
+#define __NR_sched_setparam 154
+#define __NR_sched_getparam 155
+#define __NR_sched_setscheduler 156
+#define __NR_sched_getscheduler 157
+#define __NR_sched_yield 158
+#define __NR_sched_get_priority_max 159
+#define __NR_sched_get_priority_min 160
+#define __NR_sched_rr_get_interval 161
+#define __NR_nanosleep 162
+#define __NR_mremap 163
+#define __NR_setresuid 164
+#define __NR_getresuid 165
+#define __NR_vm86 166
+#define __NR_query_module 167
+#define __NR_poll 168
+#define __NR_nfsservctl 169
+#define __NR_setresgid 170
+#define __NR_getresgid 171
+#define __NR_prctl 172
+#define __NR_rt_sigreturn 173
+#define __NR_rt_sigaction 174
+#define __NR_rt_sigprocmask 175
+#define __NR_rt_sigpending 176
+#define __NR_rt_sigtimedwait 177
+#define __NR_rt_sigqueueinfo 178
+#define __NR_rt_sigsuspend 179
+#define __NR_pread64 180
+#define __NR_pwrite64 181
+#define __NR_chown 182
+#define __NR_getcwd 183
+#define __NR_capget 184
+#define __NR_capset 185
+#define __NR_sigaltstack 186
+#define __NR_sendfile 187
+#define __NR_getpmsg 188 /* some people actually want streams */
+#define __NR_putpmsg 189 /* some people actually want streams */
+#define __NR_vfork 190
+#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */
+#define __NR_mmap2 192
+#define __NR_truncate64 193
+#define __NR_ftruncate64 194
+#define __NR_stat64 195
+#define __NR_lstat64 196
+#define __NR_fstat64 197
+#define __NR_lchown32 198
+#define __NR_getuid32 199
+#define __NR_getgid32 200
+#define __NR_geteuid32 201
+#define __NR_getegid32 202
+#define __NR_setreuid32 203
+#define __NR_setregid32 204
+#define __NR_getgroups32 205
+#define __NR_setgroups32 206
+#define __NR_fchown32 207
+#define __NR_setresuid32 208
+#define __NR_getresuid32 209
+#define __NR_setresgid32 210
+#define __NR_getresgid32 211
+#define __NR_chown32 212
+#define __NR_setuid32 213
+#define __NR_setgid32 214
+#define __NR_setfsuid32 215
+#define __NR_setfsgid32 216
+#define __NR_pivot_root 217
+#define __NR_mincore 218
+#define __NR_madvise 219
+#define __NR_madvise1 219 /* delete when C lib stub is removed */
+#define __NR_getdents64 220
+#define __NR_fcntl64 221
+/* 223 is unused */
+#define __NR_gettid 224
+#define __NR_readahead 225
+#define __NR_setxattr 226
+#define __NR_lsetxattr 227
+#define __NR_fsetxattr 228
+#define __NR_getxattr 229
+#define __NR_lgetxattr 230
+#define __NR_fgetxattr 231
+#define __NR_listxattr 232
+#define __NR_llistxattr 233
+#define __NR_flistxattr 234
+#define __NR_removexattr 235
+#define __NR_lremovexattr 236
+#define __NR_fremovexattr 237
+#define __NR_tkill 238
+#define __NR_sendfile64 239
+#define __NR_futex 240
+#define __NR_sched_setaffinity 241
+#define __NR_sched_getaffinity 242
+#define __NR_set_thread_area 243
+#define __NR_get_thread_area 244
+#define __NR_io_setup 245
+#define __NR_io_destroy 246
+#define __NR_io_getevents 247
+#define __NR_io_submit 248
+#define __NR_io_cancel 249
+#define __NR_fadvise64 250
+/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
+#define __NR_exit_group 252
+#define __NR_lookup_dcookie 253
+#define __NR_epoll_create 254
+#define __NR_epoll_ctl 255
+#define __NR_epoll_wait 256
+#define __NR_remap_file_pages 257
+#define __NR_set_tid_address 258
+#define __NR_timer_create 259
+#define __NR_timer_settime (__NR_timer_create+1)
+#define __NR_timer_gettime (__NR_timer_create+2)
+#define __NR_timer_getoverrun (__NR_timer_create+3)
+#define __NR_timer_delete (__NR_timer_create+4)
+#define __NR_clock_settime (__NR_timer_create+5)
+#define __NR_clock_gettime (__NR_timer_create+6)
+#define __NR_clock_getres (__NR_timer_create+7)
+#define __NR_clock_nanosleep (__NR_timer_create+8)
+#define __NR_statfs64 268
+#define __NR_fstatfs64 269
+#define __NR_tgkill 270
+#define __NR_utimes 271
+#define __NR_fadvise64_64 272
+#define __NR_vserver 273
+#define __NR_mbind 274
+#define __NR_get_mempolicy 275
+#define __NR_set_mempolicy 276
+#define __NR_mq_open 277
+#define __NR_mq_unlink (__NR_mq_open+1)
+#define __NR_mq_timedsend (__NR_mq_open+2)
+#define __NR_mq_timedreceive (__NR_mq_open+3)
+#define __NR_mq_notify (__NR_mq_open+4)
+#define __NR_mq_getsetattr (__NR_mq_open+5)
+#define __NR_kexec_load 283
+#define __NR_waitid 284
+/* #define __NR_sys_setaltroot 285 */
+#define __NR_add_key 286
+#define __NR_request_key 287
+#define __NR_keyctl 288
+#define __NR_ioprio_set 289
+#define __NR_ioprio_get 290
+#define __NR_inotify_init 291
+#define __NR_inotify_add_watch 292
+#define __NR_inotify_rm_watch 293
+#define __NR_migrate_pages 294
+#define __NR_openat 295
+#define __NR_mkdirat 296
+#define __NR_mknodat 297
+#define __NR_fchownat 298
+#define __NR_futimesat 299
+#define __NR_fstatat64 300
+#define __NR_unlinkat 301
+#define __NR_renameat 302
+#define __NR_linkat 303
+#define __NR_symlinkat 304
+#define __NR_readlinkat 305
+#define __NR_fchmodat 306
+#define __NR_faccessat 307
+#define __NR_pselect6 308
+#define __NR_ppoll 309
+#define __NR_unshare 310
+#define __NR_set_robust_list 311
+#define __NR_get_robust_list 312
+#define __NR_splice 313
+#define __NR_sync_file_range 314
+#define __NR_tee 315
+#define __NR_vmsplice 316
+#define __NR_move_pages 317
+#define __NR_getcpu 318
+#define __NR_epoll_pwait 319
+#define __NR_utimensat 320
+#define __NR_signalfd 321
+#define __NR_timerfd_create 322
+#define __NR_eventfd 323
+#define __NR_fallocate 324
+#define __NR_timerfd_settime 325
+#define __NR_timerfd_gettime 326
+#define __NR_signalfd4 327
+#define __NR_eventfd2 328
+#define __NR_epoll_create1 329
+#define __NR_dup3 330
+#define __NR_pipe2 331
+#define __NR_inotify_init1 332
+
+#ifdef __KERNEL__
+
+#define __ARCH_WANT_IPC_PARSE_VERSION
+#define __ARCH_WANT_OLD_READDIR
+#define __ARCH_WANT_OLD_STAT
+#define __ARCH_WANT_STAT64
+#define __ARCH_WANT_SYS_ALARM
+#define __ARCH_WANT_SYS_GETHOSTNAME
+#define __ARCH_WANT_SYS_PAUSE
+#define __ARCH_WANT_SYS_SGETMASK
+#define __ARCH_WANT_SYS_SIGNAL
+#define __ARCH_WANT_SYS_TIME
+#define __ARCH_WANT_SYS_UTIME
+#define __ARCH_WANT_SYS_WAITPID
+#define __ARCH_WANT_SYS_SOCKETCALL
+#define __ARCH_WANT_SYS_FADVISE64
+#define __ARCH_WANT_SYS_GETPGRP
+#define __ARCH_WANT_SYS_LLSEEK
+#define __ARCH_WANT_SYS_NICE
+#define __ARCH_WANT_SYS_OLD_GETRLIMIT
+#define __ARCH_WANT_SYS_OLDUMOUNT
+#define __ARCH_WANT_SYS_SIGPENDING
+#define __ARCH_WANT_SYS_SIGPROCMASK
+#define __ARCH_WANT_SYS_RT_SIGACTION
+#define __ARCH_WANT_SYS_RT_SIGSUSPEND
+
+/*
+ * "Conditional" syscalls
+ *
+ * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
+ * but it doesn't work on all toolchains, so we just do it by hand
+ */
+#ifndef cond_syscall
+#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_UNISTD_32_H */
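
These constants are the raw 32-bit syscall ABI; from user space a call
with no dedicated libc wrapper can be reached through syscall(2). An
illustrative stand-alone program, using only standard headers; nothing
here comes from the patch:

	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		/* __NR_gettid is 224 in the table above; glibc exposes the
		 * number as SYS_gettid but ships no gettid() wrapper. */
		long tid = syscall(SYS_gettid);

		printf("tid = %ld\n", tid);
		return 0;
	}
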
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
new file mode 100644
index 00000000000..834b2c1d89f
--- /dev/null
+++ b/arch/x86/include/asm/unistd_64.h
@@ -0,0 +1,693 @@
+#ifndef _ASM_X86_UNISTD_64_H
+#define _ASM_X86_UNISTD_64_H
+
+#ifndef __SYSCALL
+#define __SYSCALL(a, b)
+#endif
+
+/*
+ * This file contains the system call numbers.
+ *
+ * Note: holes are not allowed.
+ */
+
+/* at least 8 syscalls per cacheline */
+#define __NR_read 0
+__SYSCALL(__NR_read, sys_read)
+#define __NR_write 1
+__SYSCALL(__NR_write, sys_write)
+#define __NR_open 2
+__SYSCALL(__NR_open, sys_open)
+#define __NR_close 3
+__SYSCALL(__NR_close, sys_close)
+#define __NR_stat 4
+__SYSCALL(__NR_stat, sys_newstat)
+#define __NR_fstat 5
+__SYSCALL(__NR_fstat, sys_newfstat)
+#define __NR_lstat 6
+__SYSCALL(__NR_lstat, sys_newlstat)
+#define __NR_poll 7
+__SYSCALL(__NR_poll, sys_poll)
+
+#define __NR_lseek 8
+__SYSCALL(__NR_lseek, sys_lseek)
+#define __NR_mmap 9
+__SYSCALL(__NR_mmap, sys_mmap)
+#define __NR_mprotect 10
+__SYSCALL(__NR_mprotect, sys_mprotect)
+#define __NR_munmap 11
+__SYSCALL(__NR_munmap, sys_munmap)
+#define __NR_brk 12
+__SYSCALL(__NR_brk, sys_brk)
+#define __NR_rt_sigaction 13
+__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction)
+#define __NR_rt_sigprocmask 14
+__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask)
+#define __NR_rt_sigreturn 15
+__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn)
+
+#define __NR_ioctl 16
+__SYSCALL(__NR_ioctl, sys_ioctl)
+#define __NR_pread64 17
+__SYSCALL(__NR_pread64, sys_pread64)
+#define __NR_pwrite64 18
+__SYSCALL(__NR_pwrite64, sys_pwrite64)
+#define __NR_readv 19
+__SYSCALL(__NR_readv, sys_readv)
+#define __NR_writev 20
+__SYSCALL(__NR_writev, sys_writev)
+#define __NR_access 21
+__SYSCALL(__NR_access, sys_access)
+#define __NR_pipe 22
+__SYSCALL(__NR_pipe, sys_pipe)
+#define __NR_select 23
+__SYSCALL(__NR_select, sys_select)
+
+#define __NR_sched_yield 24
+__SYSCALL(__NR_sched_yield, sys_sched_yield)
+#define __NR_mremap 25
+__SYSCALL(__NR_mremap, sys_mremap)
+#define __NR_msync 26
+__SYSCALL(__NR_msync, sys_msync)
+#define __NR_mincore 27
+__SYSCALL(__NR_mincore, sys_mincore)
+#define __NR_madvise 28
+__SYSCALL(__NR_madvise, sys_madvise)
+#define __NR_shmget 29
+__SYSCALL(__NR_shmget, sys_shmget)
+#define __NR_shmat 30
+__SYSCALL(__NR_shmat, sys_shmat)
+#define __NR_shmctl 31
+__SYSCALL(__NR_shmctl, sys_shmctl)
+
+#define __NR_dup 32
+__SYSCALL(__NR_dup, sys_dup)
+#define __NR_dup2 33
+__SYSCALL(__NR_dup2, sys_dup2)
+#define __NR_pause 34
+__SYSCALL(__NR_pause, sys_pause)
+#define __NR_nanosleep 35
+__SYSCALL(__NR_nanosleep, sys_nanosleep)
+#define __NR_getitimer 36
+__SYSCALL(__NR_getitimer, sys_getitimer)
+#define __NR_alarm 37
+__SYSCALL(__NR_alarm, sys_alarm)
+#define __NR_setitimer 38
+__SYSCALL(__NR_setitimer, sys_setitimer)
+#define __NR_getpid 39
+__SYSCALL(__NR_getpid, sys_getpid)
+
+#define __NR_sendfile 40
+__SYSCALL(__NR_sendfile, sys_sendfile64)
+#define __NR_socket 41
+__SYSCALL(__NR_socket, sys_socket)
+#define __NR_connect 42
+__SYSCALL(__NR_connect, sys_connect)
+#define __NR_accept 43
+__SYSCALL(__NR_accept, sys_accept)
+#define __NR_sendto 44
+__SYSCALL(__NR_sendto, sys_sendto)
+#define __NR_recvfrom 45
+__SYSCALL(__NR_recvfrom, sys_recvfrom)
+#define __NR_sendmsg 46
+__SYSCALL(__NR_sendmsg, sys_sendmsg)
+#define __NR_recvmsg 47
+__SYSCALL(__NR_recvmsg, sys_recvmsg)
+
+#define __NR_shutdown 48
+__SYSCALL(__NR_shutdown, sys_shutdown)
+#define __NR_bind 49
+__SYSCALL(__NR_bind, sys_bind)
+#define __NR_listen 50
+__SYSCALL(__NR_listen, sys_listen)
+#define __NR_getsockname 51
+__SYSCALL(__NR_getsockname, sys_getsockname)
+#define __NR_getpeername 52
+__SYSCALL(__NR_getpeername, sys_getpeername)
+#define __NR_socketpair 53
+__SYSCALL(__NR_socketpair, sys_socketpair)
+#define __NR_setsockopt 54
+__SYSCALL(__NR_setsockopt, sys_setsockopt)
+#define __NR_getsockopt 55
+__SYSCALL(__NR_getsockopt, sys_getsockopt)
+
+#define __NR_clone 56
+__SYSCALL(__NR_clone, stub_clone)
+#define __NR_fork 57
+__SYSCALL(__NR_fork, stub_fork)
+#define __NR_vfork 58
+__SYSCALL(__NR_vfork, stub_vfork)
+#define __NR_execve 59
+__SYSCALL(__NR_execve, stub_execve)
+#define __NR_exit 60
+__SYSCALL(__NR_exit, sys_exit)
+#define __NR_wait4 61
+__SYSCALL(__NR_wait4, sys_wait4)
+#define __NR_kill 62
+__SYSCALL(__NR_kill, sys_kill)
+#define __NR_uname 63
+__SYSCALL(__NR_uname, sys_uname)
+
+#define __NR_semget 64
+__SYSCALL(__NR_semget, sys_semget)
+#define __NR_semop 65
+__SYSCALL(__NR_semop, sys_semop)
+#define __NR_semctl 66
+__SYSCALL(__NR_semctl, sys_semctl)
+#define __NR_shmdt 67
+__SYSCALL(__NR_shmdt, sys_shmdt)
+#define __NR_msgget 68
+__SYSCALL(__NR_msgget, sys_msgget)
+#define __NR_msgsnd 69
+__SYSCALL(__NR_msgsnd, sys_msgsnd)
+#define __NR_msgrcv 70
+__SYSCALL(__NR_msgrcv, sys_msgrcv)
+#define __NR_msgctl 71
+__SYSCALL(__NR_msgctl, sys_msgctl)
+
+#define __NR_fcntl 72
+__SYSCALL(__NR_fcntl, sys_fcntl)
+#define __NR_flock 73
+__SYSCALL(__NR_flock, sys_flock)
+#define __NR_fsync 74
+__SYSCALL(__NR_fsync, sys_fsync)
+#define __NR_fdatasync 75
+__SYSCALL(__NR_fdatasync, sys_fdatasync)
+#define __NR_truncate 76
+__SYSCALL(__NR_truncate, sys_truncate)
+#define __NR_ftruncate 77
+__SYSCALL(__NR_ftruncate, sys_ftruncate)
+#define __NR_getdents 78
+__SYSCALL(__NR_getdents, sys_getdents)
+#define __NR_getcwd 79
+__SYSCALL(__NR_getcwd, sys_getcwd)
+
+#define __NR_chdir 80
+__SYSCALL(__NR_chdir, sys_chdir)
+#define __NR_fchdir 81
+__SYSCALL(__NR_fchdir, sys_fchdir)
+#define __NR_rename 82
+__SYSCALL(__NR_rename, sys_rename)
+#define __NR_mkdir 83
+__SYSCALL(__NR_mkdir, sys_mkdir)
+#define __NR_rmdir 84
+__SYSCALL(__NR_rmdir, sys_rmdir)
+#define __NR_creat 85
+__SYSCALL(__NR_creat, sys_creat)
+#define __NR_link 86
+__SYSCALL(__NR_link, sys_link)
+#define __NR_unlink 87
+__SYSCALL(__NR_unlink, sys_unlink)
+
+#define __NR_symlink 88
+__SYSCALL(__NR_symlink, sys_symlink)
+#define __NR_readlink 89
+__SYSCALL(__NR_readlink, sys_readlink)
+#define __NR_chmod 90
+__SYSCALL(__NR_chmod, sys_chmod)
+#define __NR_fchmod 91
+__SYSCALL(__NR_fchmod, sys_fchmod)
+#define __NR_chown 92
+__SYSCALL(__NR_chown, sys_chown)
+#define __NR_fchown 93
+__SYSCALL(__NR_fchown, sys_fchown)
+#define __NR_lchown 94
+__SYSCALL(__NR_lchown, sys_lchown)
+#define __NR_umask 95
+__SYSCALL(__NR_umask, sys_umask)
+
+#define __NR_gettimeofday 96
+__SYSCALL(__NR_gettimeofday, sys_gettimeofday)
+#define __NR_getrlimit 97
+__SYSCALL(__NR_getrlimit, sys_getrlimit)
+#define __NR_getrusage 98
+__SYSCALL(__NR_getrusage, sys_getrusage)
+#define __NR_sysinfo 99
+__SYSCALL(__NR_sysinfo, sys_sysinfo)
+#define __NR_times 100
+__SYSCALL(__NR_times, sys_times)
+#define __NR_ptrace 101
+__SYSCALL(__NR_ptrace, sys_ptrace)
+#define __NR_getuid 102
+__SYSCALL(__NR_getuid, sys_getuid)
+#define __NR_syslog 103
+__SYSCALL(__NR_syslog, sys_syslog)
+
+/* at the very end, the stuff that never runs during the benchmarks */
+#define __NR_getgid 104
+__SYSCALL(__NR_getgid, sys_getgid)
+#define __NR_setuid 105
+__SYSCALL(__NR_setuid, sys_setuid)
+#define __NR_setgid 106
+__SYSCALL(__NR_setgid, sys_setgid)
+#define __NR_geteuid 107
+__SYSCALL(__NR_geteuid, sys_geteuid)
+#define __NR_getegid 108
+__SYSCALL(__NR_getegid, sys_getegid)
+#define __NR_setpgid 109
+__SYSCALL(__NR_setpgid, sys_setpgid)
+#define __NR_getppid 110
+__SYSCALL(__NR_getppid, sys_getppid)
+#define __NR_getpgrp 111
+__SYSCALL(__NR_getpgrp, sys_getpgrp)
+
+#define __NR_setsid 112
+__SYSCALL(__NR_setsid, sys_setsid)
+#define __NR_setreuid 113
+__SYSCALL(__NR_setreuid, sys_setreuid)
+#define __NR_setregid 114
+__SYSCALL(__NR_setregid, sys_setregid)
+#define __NR_getgroups 115
+__SYSCALL(__NR_getgroups, sys_getgroups)
+#define __NR_setgroups 116
+__SYSCALL(__NR_setgroups, sys_setgroups)
+#define __NR_setresuid 117
+__SYSCALL(__NR_setresuid, sys_setresuid)
+#define __NR_getresuid 118
+__SYSCALL(__NR_getresuid, sys_getresuid)
+#define __NR_setresgid 119
+__SYSCALL(__NR_setresgid, sys_setresgid)
+
+#define __NR_getresgid 120
+__SYSCALL(__NR_getresgid, sys_getresgid)
+#define __NR_getpgid 121
+__SYSCALL(__NR_getpgid, sys_getpgid)
+#define __NR_setfsuid 122
+__SYSCALL(__NR_setfsuid, sys_setfsuid)
+#define __NR_setfsgid 123
+__SYSCALL(__NR_setfsgid, sys_setfsgid)
+#define __NR_getsid 124
+__SYSCALL(__NR_getsid, sys_getsid)
+#define __NR_capget 125
+__SYSCALL(__NR_capget, sys_capget)
+#define __NR_capset 126
+__SYSCALL(__NR_capset, sys_capset)
+
+#define __NR_rt_sigpending 127
+__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending)
+#define __NR_rt_sigtimedwait 128
+__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
+#define __NR_rt_sigqueueinfo 129
+__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
+#define __NR_rt_sigsuspend 130
+__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
+#define __NR_sigaltstack 131
+__SYSCALL(__NR_sigaltstack, stub_sigaltstack)
+#define __NR_utime 132
+__SYSCALL(__NR_utime, sys_utime)
+#define __NR_mknod 133
+__SYSCALL(__NR_mknod, sys_mknod)
+
+/* Only needed for a.out */
+#define __NR_uselib 134
+__SYSCALL(__NR_uselib, sys_ni_syscall)
+#define __NR_personality 135
+__SYSCALL(__NR_personality, sys_personality)
+
+#define __NR_ustat 136
+__SYSCALL(__NR_ustat, sys_ustat)
+#define __NR_statfs 137
+__SYSCALL(__NR_statfs, sys_statfs)
+#define __NR_fstatfs 138
+__SYSCALL(__NR_fstatfs, sys_fstatfs)
+#define __NR_sysfs 139
+__SYSCALL(__NR_sysfs, sys_sysfs)
+
+#define __NR_getpriority 140
+__SYSCALL(__NR_getpriority, sys_getpriority)
+#define __NR_setpriority 141
+__SYSCALL(__NR_setpriority, sys_setpriority)
+#define __NR_sched_setparam 142
+__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
+#define __NR_sched_getparam 143
+__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
+#define __NR_sched_setscheduler 144
+__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
+#define __NR_sched_getscheduler 145
+__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
+#define __NR_sched_get_priority_max 146
+__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
+#define __NR_sched_get_priority_min 147
+__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
+#define __NR_sched_rr_get_interval 148
+__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval)
+
+#define __NR_mlock 149
+__SYSCALL(__NR_mlock, sys_mlock)
+#define __NR_munlock 150
+__SYSCALL(__NR_munlock, sys_munlock)
+#define __NR_mlockall 151
+__SYSCALL(__NR_mlockall, sys_mlockall)
+#define __NR_munlockall 152
+__SYSCALL(__NR_munlockall, sys_munlockall)
+
+#define __NR_vhangup 153
+__SYSCALL(__NR_vhangup, sys_vhangup)
+
+#define __NR_modify_ldt 154
+__SYSCALL(__NR_modify_ldt, sys_modify_ldt)
+
+#define __NR_pivot_root 155
+__SYSCALL(__NR_pivot_root, sys_pivot_root)
+
+#define __NR__sysctl 156
+__SYSCALL(__NR__sysctl, sys_sysctl)
+
+#define __NR_prctl 157
+__SYSCALL(__NR_prctl, sys_prctl)
+#define __NR_arch_prctl 158
+__SYSCALL(__NR_arch_prctl, sys_arch_prctl)
+
+#define __NR_adjtimex 159
+__SYSCALL(__NR_adjtimex, sys_adjtimex)
+
+#define __NR_setrlimit 160
+__SYSCALL(__NR_setrlimit, sys_setrlimit)
+
+#define __NR_chroot 161
+__SYSCALL(__NR_chroot, sys_chroot)
+
+#define __NR_sync 162
+__SYSCALL(__NR_sync, sys_sync)
+
+#define __NR_acct 163
+__SYSCALL(__NR_acct, sys_acct)
+
+#define __NR_settimeofday 164
+__SYSCALL(__NR_settimeofday, sys_settimeofday)
+
+#define __NR_mount 165
+__SYSCALL(__NR_mount, sys_mount)
+#define __NR_umount2 166
+__SYSCALL(__NR_umount2, sys_umount)
+
+#define __NR_swapon 167
+__SYSCALL(__NR_swapon, sys_swapon)
+#define __NR_swapoff 168
+__SYSCALL(__NR_swapoff, sys_swapoff)
+
+#define __NR_reboot 169
+__SYSCALL(__NR_reboot, sys_reboot)
+
+#define __NR_sethostname 170
+__SYSCALL(__NR_sethostname, sys_sethostname)
+#define __NR_setdomainname 171
+__SYSCALL(__NR_setdomainname, sys_setdomainname)
+
+#define __NR_iopl 172
+__SYSCALL(__NR_iopl, stub_iopl)
+#define __NR_ioperm 173
+__SYSCALL(__NR_ioperm, sys_ioperm)
+
+#define __NR_create_module 174
+__SYSCALL(__NR_create_module, sys_ni_syscall)
+#define __NR_init_module 175
+__SYSCALL(__NR_init_module, sys_init_module)
+#define __NR_delete_module 176
+__SYSCALL(__NR_delete_module, sys_delete_module)
+#define __NR_get_kernel_syms 177
+__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall)
+#define __NR_query_module 178
+__SYSCALL(__NR_query_module, sys_ni_syscall)
+
+#define __NR_quotactl 179
+__SYSCALL(__NR_quotactl, sys_quotactl)
+
+#define __NR_nfsservctl 180
+__SYSCALL(__NR_nfsservctl, sys_nfsservctl)
+
+/* reserved for LiS/STREAMS */
+#define __NR_getpmsg 181
+__SYSCALL(__NR_getpmsg, sys_ni_syscall)
+#define __NR_putpmsg 182
+__SYSCALL(__NR_putpmsg, sys_ni_syscall)
+
+/* reserved for AFS */
+#define __NR_afs_syscall 183
+__SYSCALL(__NR_afs_syscall, sys_ni_syscall)
+
+/* reserved for tux */
+#define __NR_tuxcall 184
+__SYSCALL(__NR_tuxcall, sys_ni_syscall)
+
+#define __NR_security 185
+__SYSCALL(__NR_security, sys_ni_syscall)
+
+#define __NR_gettid 186
+__SYSCALL(__NR_gettid, sys_gettid)
+
+#define __NR_readahead 187
+__SYSCALL(__NR_readahead, sys_readahead)
+#define __NR_setxattr 188
+__SYSCALL(__NR_setxattr, sys_setxattr)
+#define __NR_lsetxattr 189
+__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
+#define __NR_fsetxattr 190
+__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
+#define __NR_getxattr 191
+__SYSCALL(__NR_getxattr, sys_getxattr)
+#define __NR_lgetxattr 192
+__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
+#define __NR_fgetxattr 193
+__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
+#define __NR_listxattr 194
+__SYSCALL(__NR_listxattr, sys_listxattr)
+#define __NR_llistxattr 195
+__SYSCALL(__NR_llistxattr, sys_llistxattr)
+#define __NR_flistxattr 196
+__SYSCALL(__NR_flistxattr, sys_flistxattr)
+#define __NR_removexattr 197
+__SYSCALL(__NR_removexattr, sys_removexattr)
+#define __NR_lremovexattr 198
+__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
+#define __NR_fremovexattr 199
+__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
+#define __NR_tkill 200
+__SYSCALL(__NR_tkill, sys_tkill)
+#define __NR_time 201
+__SYSCALL(__NR_time, sys_time)
+#define __NR_futex 202
+__SYSCALL(__NR_futex, sys_futex)
+#define __NR_sched_setaffinity 203
+__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity)
+#define __NR_sched_getaffinity 204
+__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity)
+#define __NR_set_thread_area 205
+__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */
+#define __NR_io_setup 206
+__SYSCALL(__NR_io_setup, sys_io_setup)
+#define __NR_io_destroy 207
+__SYSCALL(__NR_io_destroy, sys_io_destroy)
+#define __NR_io_getevents 208
+__SYSCALL(__NR_io_getevents, sys_io_getevents)
+#define __NR_io_submit 209
+__SYSCALL(__NR_io_submit, sys_io_submit)
+#define __NR_io_cancel 210
+__SYSCALL(__NR_io_cancel, sys_io_cancel)
+#define __NR_get_thread_area 211
+__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */
+#define __NR_lookup_dcookie 212
+__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie)
+#define __NR_epoll_create 213
+__SYSCALL(__NR_epoll_create, sys_epoll_create)
+#define __NR_epoll_ctl_old 214
+__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall)
+#define __NR_epoll_wait_old 215
+__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall)
+#define __NR_remap_file_pages 216
+__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
+#define __NR_getdents64 217
+__SYSCALL(__NR_getdents64, sys_getdents64)
+#define __NR_set_tid_address 218
+__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
+#define __NR_restart_syscall 219
+__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
+#define __NR_semtimedop 220
+__SYSCALL(__NR_semtimedop, sys_semtimedop)
+#define __NR_fadvise64 221
+__SYSCALL(__NR_fadvise64, sys_fadvise64)
+#define __NR_timer_create 222
+__SYSCALL(__NR_timer_create, sys_timer_create)
+#define __NR_timer_settime 223
+__SYSCALL(__NR_timer_settime, sys_timer_settime)
+#define __NR_timer_gettime 224
+__SYSCALL(__NR_timer_gettime, sys_timer_gettime)
+#define __NR_timer_getoverrun 225
+__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
+#define __NR_timer_delete 226
+__SYSCALL(__NR_timer_delete, sys_timer_delete)
+#define __NR_clock_settime 227
+__SYSCALL(__NR_clock_settime, sys_clock_settime)
+#define __NR_clock_gettime 228
+__SYSCALL(__NR_clock_gettime, sys_clock_gettime)
+#define __NR_clock_getres 229
+__SYSCALL(__NR_clock_getres, sys_clock_getres)
+#define __NR_clock_nanosleep 230
+__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep)
+#define __NR_exit_group 231
+__SYSCALL(__NR_exit_group, sys_exit_group)
+#define __NR_epoll_wait 232
+__SYSCALL(__NR_epoll_wait, sys_epoll_wait)
+#define __NR_epoll_ctl 233
+__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
+#define __NR_tgkill 234
+__SYSCALL(__NR_tgkill, sys_tgkill)
+#define __NR_utimes 235
+__SYSCALL(__NR_utimes, sys_utimes)
+#define __NR_vserver 236
+__SYSCALL(__NR_vserver, sys_ni_syscall)
+#define __NR_mbind 237
+__SYSCALL(__NR_mbind, sys_mbind)
+#define __NR_set_mempolicy 238
+__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
+#define __NR_get_mempolicy 239
+__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
+#define __NR_mq_open 240
+__SYSCALL(__NR_mq_open, sys_mq_open)
+#define __NR_mq_unlink 241
+__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
+#define __NR_mq_timedsend 242
+__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend)
+#define __NR_mq_timedreceive 243
+__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive)
+#define __NR_mq_notify 244
+__SYSCALL(__NR_mq_notify, sys_mq_notify)
+#define __NR_mq_getsetattr 245
+__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
+#define __NR_kexec_load 246
+__SYSCALL(__NR_kexec_load, sys_kexec_load)
+#define __NR_waitid 247
+__SYSCALL(__NR_waitid, sys_waitid)
+#define __NR_add_key 248
+__SYSCALL(__NR_add_key, sys_add_key)
+#define __NR_request_key 249
+__SYSCALL(__NR_request_key, sys_request_key)
+#define __NR_keyctl 250
+__SYSCALL(__NR_keyctl, sys_keyctl)
+#define __NR_ioprio_set 251
+__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
+#define __NR_ioprio_get 252
+__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
+#define __NR_inotify_init 253
+__SYSCALL(__NR_inotify_init, sys_inotify_init)
+#define __NR_inotify_add_watch 254
+__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
+#define __NR_inotify_rm_watch 255
+__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
+#define __NR_migrate_pages 256
+__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
+#define __NR_openat 257
+__SYSCALL(__NR_openat, sys_openat)
+#define __NR_mkdirat 258
+__SYSCALL(__NR_mkdirat, sys_mkdirat)
+#define __NR_mknodat 259
+__SYSCALL(__NR_mknodat, sys_mknodat)
+#define __NR_fchownat 260
+__SYSCALL(__NR_fchownat, sys_fchownat)
+#define __NR_futimesat 261
+__SYSCALL(__NR_futimesat, sys_futimesat)
+#define __NR_newfstatat 262
+__SYSCALL(__NR_newfstatat, sys_newfstatat)
+#define __NR_unlinkat 263
+__SYSCALL(__NR_unlinkat, sys_unlinkat)
+#define __NR_renameat 264
+__SYSCALL(__NR_renameat, sys_renameat)
+#define __NR_linkat 265
+__SYSCALL(__NR_linkat, sys_linkat)
+#define __NR_symlinkat 266
+__SYSCALL(__NR_symlinkat, sys_symlinkat)
+#define __NR_readlinkat 267
+__SYSCALL(__NR_readlinkat, sys_readlinkat)
+#define __NR_fchmodat 268
+__SYSCALL(__NR_fchmodat, sys_fchmodat)
+#define __NR_faccessat 269
+__SYSCALL(__NR_faccessat, sys_faccessat)
+#define __NR_pselect6 270
+__SYSCALL(__NR_pselect6, sys_pselect6)
+#define __NR_ppoll 271
+__SYSCALL(__NR_ppoll, sys_ppoll)
+#define __NR_unshare 272
+__SYSCALL(__NR_unshare, sys_unshare)
+#define __NR_set_robust_list 273
+__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
+#define __NR_get_robust_list 274
+__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
+#define __NR_splice 275
+__SYSCALL(__NR_splice, sys_splice)
+#define __NR_tee 276
+__SYSCALL(__NR_tee, sys_tee)
+#define __NR_sync_file_range 277
+__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
+#define __NR_vmsplice 278
+__SYSCALL(__NR_vmsplice, sys_vmsplice)
+#define __NR_move_pages 279
+__SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_utimensat 280
+__SYSCALL(__NR_utimensat, sys_utimensat)
+#define __IGNORE_getcpu /* implemented as a vsyscall */
+#define __NR_epoll_pwait 281
+__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
+#define __NR_signalfd 282
+__SYSCALL(__NR_signalfd, sys_signalfd)
+#define __NR_timerfd_create 283
+__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
+#define __NR_eventfd 284
+__SYSCALL(__NR_eventfd, sys_eventfd)
+#define __NR_fallocate 285
+__SYSCALL(__NR_fallocate, sys_fallocate)
+#define __NR_timerfd_settime 286
+__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
+#define __NR_timerfd_gettime 287
+__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
+#define __NR_paccept 288
+__SYSCALL(__NR_paccept, sys_paccept)
+#define __NR_signalfd4 289
+__SYSCALL(__NR_signalfd4, sys_signalfd4)
+#define __NR_eventfd2 290
+__SYSCALL(__NR_eventfd2, sys_eventfd2)
+#define __NR_epoll_create1 291
+__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
+#define __NR_dup3 292
+__SYSCALL(__NR_dup3, sys_dup3)
+#define __NR_pipe2 293
+__SYSCALL(__NR_pipe2, sys_pipe2)
+#define __NR_inotify_init1 294
+__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
+
+#ifndef __NO_STUBS
+#define __ARCH_WANT_OLD_READDIR
+#define __ARCH_WANT_OLD_STAT
+#define __ARCH_WANT_SYS_ALARM
+#define __ARCH_WANT_SYS_GETHOSTNAME
+#define __ARCH_WANT_SYS_PAUSE
+#define __ARCH_WANT_SYS_SGETMASK
+#define __ARCH_WANT_SYS_SIGNAL
+#define __ARCH_WANT_SYS_UTIME
+#define __ARCH_WANT_SYS_WAITPID
+#define __ARCH_WANT_SYS_SOCKETCALL
+#define __ARCH_WANT_SYS_FADVISE64
+#define __ARCH_WANT_SYS_GETPGRP
+#define __ARCH_WANT_SYS_LLSEEK
+#define __ARCH_WANT_SYS_NICE
+#define __ARCH_WANT_SYS_OLD_GETRLIMIT
+#define __ARCH_WANT_SYS_OLDUMOUNT
+#define __ARCH_WANT_SYS_SIGPENDING
+#define __ARCH_WANT_SYS_SIGPROCMASK
+#define __ARCH_WANT_SYS_RT_SIGACTION
+#define __ARCH_WANT_SYS_RT_SIGSUSPEND
+#define __ARCH_WANT_SYS_TIME
+#define __ARCH_WANT_COMPAT_SYS_TIME
+#endif /* __NO_STUBS */
+
+#ifdef __KERNEL__
+/*
+ * "Conditional" syscalls
+ *
+ * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
+ * but it doesn't work on all toolchains, so we just do it by hand
+ */
+#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_UNISTD_64_H */
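
Every entry above funnels through __SYSCALL(), which is a no-op by default
(see the top of the file), precisely so the kernel can re-include this
header with the macro redefined and stamp out the dispatch table. The
sketch below is modeled on arch/x86/kernel/syscall_64.c of this era; treat
the details (in particular __NR_syscall_max, which comes from
asm-offsets.h) as approximate rather than a verbatim copy:

	#include <linux/linkage.h>
	#include <asm/asm-offsets.h>		/* __NR_syscall_max */

	#define __NO_STUBS

	/* Pass 1: declare a prototype for every handler named in the table. */
	#define __SYSCALL(nr, sym) extern asmlinkage void sym(void);
	#undef _ASM_X86_UNISTD_64_H		/* defeat the include guard */
	#include <asm/unistd_64.h>
	#undef __SYSCALL

	/* Pass 2: emit one designated initializer per slot. */
	#define __SYSCALL(nr, sym) [nr] = sym,
	#undef _ASM_X86_UNISTD_64_H

	typedef void (*sys_call_ptr_t)(void);

	extern void sys_ni_syscall(void);

	const sys_call_ptr_t sys_call_table[__NR_syscall_max + 1] = {
		[0 ... __NR_syscall_max] = sys_ni_syscall,	/* default: -ENOSYS */
	#include <asm/unistd_64.h>
	};
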
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
new file mode 100644
index 00000000000..8b064bd9c55
--- /dev/null
+++ b/arch/x86/include/asm/unwind.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_UNWIND_H
+#define _ASM_X86_UNWIND_H
+
+#define UNW_PC(frame) ((void)(frame), 0UL)
+#define UNW_SP(frame) ((void)(frame), 0UL)
+#define UNW_FP(frame) ((void)(frame), 0UL)
+
+static inline int arch_unw_user_mode(const void *info)
+{
+ return 0;
+}
+
+#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
new file mode 100644
index 00000000000..999873b22e7
--- /dev/null
+++ b/arch/x86/include/asm/user.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "user_32.h"
+#else
+# include "user_64.h"
+#endif
diff --git a/arch/x86/include/asm/user32.h b/arch/x86/include/asm/user32.h
new file mode 100644
index 00000000000..14cbb73ebcb
--- /dev/null
+++ b/arch/x86/include/asm/user32.h
@@ -0,0 +1,70 @@
+#ifndef _ASM_X86_USER32_H
+#define _ASM_X86_USER32_H
+
+/* IA32 compatible user structures for ptrace.
+ * These should be used for 32bit coredumps too. */
+
+struct user_i387_ia32_struct {
+ u32 cwd;
+ u32 swd;
+ u32 twd;
+ u32 fip;
+ u32 fcs;
+ u32 foo;
+ u32 fos;
+ u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+};
+
+/* FSAVE frame with extensions */
+struct user32_fxsr_struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd; /* not compatible to 64bit twd */
+ unsigned short fop;
+ int fip;
+ int fcs;
+ int foo;
+ int fos;
+ int mxcsr;
+ int reserved;
+ int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ int xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+ int padding[56];
+};
+
+struct user_regs_struct32 {
+ __u32 ebx, ecx, edx, esi, edi, ebp, eax;
+ unsigned short ds, __ds, es, __es;
+ unsigned short fs, __fs, gs, __gs;
+ __u32 orig_eax, eip;
+ unsigned short cs, __cs;
+ __u32 eflags, esp;
+ unsigned short ss, __ss;
+};
+
+struct user32 {
+ struct user_regs_struct32 regs; /* Where the registers are actually stored */
+ int u_fpvalid; /* True if math co-processor being used. */
+ /* for this mess. Not yet used. */
+ struct user_i387_ia32_struct i387; /* Math Co-processor registers. */
+/* The rest of this junk is to help gdb figure out what goes where */
+ __u32 u_tsize; /* Text segment size (pages). */
+ __u32 u_dsize; /* Data segment size (pages). */
+ __u32 u_ssize; /* Stack segment size (pages). */
+ __u32 start_code; /* Starting virtual address of text. */
+ __u32 start_stack; /* Starting virtual address of stack area.
+ This is actually the bottom of the stack,
+ the top of the stack is always found in the
+ esp register. */
+ __u32 signal; /* Signal that caused the core dump. */
+	int reserved;			/* No longer used */
+ __u32 u_ar0; /* Used by gdb to help find the values for */
+ /* the registers. */
+ __u32 u_fpstate; /* Math Co-processor pointer. */
+ __u32 magic; /* To uniquely identify a core file */
+ char u_comm[32]; /* User command that was responsible */
+ int u_debugreg[8];
+};
+
+#endif /* _ASM_X86_USER32_H */
diff --git a/arch/x86/include/asm/user_32.h b/arch/x86/include/asm/user_32.h
new file mode 100644
index 00000000000..bebfd864401
--- /dev/null
+++ b/arch/x86/include/asm/user_32.h
@@ -0,0 +1,131 @@
+#ifndef _ASM_X86_USER_32_H
+#define _ASM_X86_USER_32_H
+
+#include <asm/page.h>
+/* Core file format: The core file is written in such a way that gdb
+ can understand it and provide useful information to the user (under
+ linux we use the 'trad-core' bfd). There are quite a number of
+ obstacles to being able to view the contents of the floating point
+ registers, and until these are solved you will not be able to view the
+ contents of them. Actually, you can read in the core file and look at
+ the contents of the user struct to find out what the floating point
+ registers contain.
+ The actual file contents are as follows:
+ UPAGE: 1 page consisting of a user struct that tells gdb what is present
+ in the file. Directly after this is a copy of the task_struct, which
+ is currently not used by gdb, but it may come in useful at some point.
+ All of the registers are stored as part of the upage. The upage should
+ always be only one page.
+ DATA: The data area is stored. We use current->end_text to
+ current->brk to pick up all of the user variables, plus any memory
+ that may have been malloced. No attempt is made to determine if a page
+ is demand-zero or if a page is totally unused, we just cover the entire
+ range. All of the addresses are rounded in such a way that an integral
+ number of pages is written.
+ STACK: We need the stack information in order to get a meaningful
+ backtrace. We need to write the data from (esp) to
+ current->start_stack, so we round each of these off in order to be able
+ to write an integer number of pages.
+ The minimum core file size is 3 pages, or 12288 bytes.
+*/
+
+/*
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for
+ * interacting with the FXSR-format floating point environment. Floating
+ * point data can be accessed in the regular format in the usual manner,
+ * and both the standard and SIMD floating point data can be accessed via
+ * the new ptrace requests. In either case, changes to the FPU environment
+ * will be reflected in the task's state as expected.
+ */
+
+struct user_i387_struct {
+ long cwd;
+ long swd;
+ long twd;
+ long fip;
+ long fcs;
+ long foo;
+ long fos;
+ long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+};
+
+struct user_fxsr_struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd;
+ unsigned short fop;
+ long fip;
+ long fcs;
+ long foo;
+ long fos;
+ long mxcsr;
+ long reserved;
+ long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+ long padding[56];
+};
+
+/*
+ * This is the old layout of "struct pt_regs", and
+ * is still the layout used by user mode (the new
+ * pt_regs doesn't have all registers as the kernel
+ * doesn't use the extra segment registers)
+ */
+struct user_regs_struct {
+ unsigned long bx;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long bp;
+ unsigned long ax;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ unsigned long gs;
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+};
+
+/* When the kernel dumps core, it starts by dumping the user struct -
+ this will be used by gdb to figure out where the data and stack segments
+ are within the file, and what virtual addresses to use. */
+struct user {
+/* We start with the registers, to mimic the way that "memory" is returned
+ from the ptrace(3,...) function. */
+ struct user_regs_struct regs; /* Where the registers are actually stored */
+/* ptrace does not yet supply these. Someday.... */
+ int u_fpvalid; /* True if math co-processor being used. */
+ /* for this mess. Not yet used. */
+ struct user_i387_struct i387; /* Math Co-processor registers. */
+/* The rest of this junk is to help gdb figure out what goes where */
+ unsigned long int u_tsize; /* Text segment size (pages). */
+ unsigned long int u_dsize; /* Data segment size (pages). */
+ unsigned long int u_ssize; /* Stack segment size (pages). */
+ unsigned long start_code; /* Starting virtual address of text. */
+ unsigned long start_stack; /* Starting virtual address of stack area.
+ This is actually the bottom of the stack,
+ the top of the stack is always found in the
+ esp register. */
+ long int signal; /* Signal that caused the core dump. */
+ int reserved; /* No longer used */
+ unsigned long u_ar0; /* Used by gdb to help find the values for */
+ /* the registers. */
+ struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */
+ unsigned long magic; /* To uniquely identify a core file */
+ char u_comm[32]; /* User command that was responsible */
+ int u_debugreg[8];
+};
+#define NBPG PAGE_SIZE
+#define UPAGES 1
+#define HOST_TEXT_START_ADDR (u.start_code)
+#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
+
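+/*
+ * Illustrative sketch, not part of the original header: with 4K pages
+ * the minimum dump described at the top of this file is the upage plus
+ * one data page and one stack page. "min_core_bytes" is a hypothetical
+ * helper for that arithmetic only.
+ */
+static inline unsigned long min_core_bytes(void)
+{
+ return (UPAGES + 2) * NBPG; /* 3 pages = 12288 bytes at 4K */
+}
+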
+#endif /* _ASM_X86_USER_32_H */
diff --git a/arch/x86/include/asm/user_64.h b/arch/x86/include/asm/user_64.h
new file mode 100644
index 00000000000..faf2cd3e0d7
--- /dev/null
+++ b/arch/x86/include/asm/user_64.h
@@ -0,0 +1,137 @@
+#ifndef _ASM_X86_USER_64_H
+#define _ASM_X86_USER_64_H
+
+#include <asm/types.h>
+#include <asm/page.h>
+/* Core file format: The core file is written in such a way that gdb
+ can understand it and provide useful information to the user.
+ There are quite a number of obstacles to being able to view the
+ contents of the floating point registers, and until these are
+ solved you will not be able to view the contents of them.
+ Actually, you can read in the core file and look at the contents of
+ the user struct to find out what the floating point registers
+ contain.
+
+ The actual file contents are as follows:
+ UPAGE: 1 page consisting of a user struct that tells gdb what is present
+ in the file. Directly after this is a copy of the task_struct, which
+ is currently not used by gdb, but it may come in useful at some point.
+ All of the registers are stored as part of the upage. The upage should
+ always be only one page.
+ DATA: The data area is stored. We use current->end_text to
+ current->brk to pick up all of the user variables, plus any memory
+ that may have been malloced. No attempt is made to determine if a page
+ is demand-zero or if a page is totally unused, we just cover the entire
+ range. All of the addresses are rounded in such a way that an integral
+ number of pages is written.
+ STACK: We need the stack information in order to get a meaningful
+ backtrace. We need to write the data from (esp) to
+ current->start_stack, so we round each of these off in order to be able
+ to write an integer number of pages.
+ The minimum core file size is 3 pages, or 12288 bytes. */
+
+/*
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for
+ * interacting with the FXSR-format floating point environment. Floating
+ * point data can be accessed in the regular format in the usual manner,
+ * and both the standard and SIMD floating point data can be accessed via
+ * the new ptrace requests. In either case, changes to the FPU environment
+ * will be reflected in the task's state as expected.
+ *
+ * x86-64 support by Andi Kleen.
+ */
+
+/* This matches the 64bit FXSAVE format as defined by AMD. It is the same
+ as the 32bit format defined by Intel, except that the selector:offset pairs
+ for data and eip are replaced with flat 64bit pointers. */
+struct user_i387_struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd; /* Note this is not the same as
+ the 32bit/x87/FSAVE twd */
+ unsigned short fop;
+ __u64 rip;
+ __u64 rdp;
+ __u32 mxcsr;
+ __u32 mxcsr_mask;
+ __u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+ __u32 padding[24];
+};
+
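+/*
+ * Size sketch, not part of the original header: the fields above sum
+ * to the architectural 512-byte FXSAVE image (8 + 16 + 8 + 128 + 256 +
+ * 96 bytes). A self-contained compile-time check:
+ */
+typedef char user_i387_struct_is_512_bytes
+ [(sizeof(struct user_i387_struct) == 512) ? 1 : -1];
+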
+/*
+ * Segment register layout in coredumps.
+ */
+struct user_regs_struct {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long bp;
+ unsigned long bx;
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+ unsigned long fs_base;
+ unsigned long gs_base;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ unsigned long gs;
+};
+
+/* When the kernel dumps core, it starts by dumping the user struct -
+ this will be used by gdb to figure out where the data and stack segments
+ are within the file, and what virtual addresses to use. */
+
+struct user {
+/* We start with the registers, to mimic the way that "memory" is returned
+ from the ptrace(3,...) function. */
+ struct user_regs_struct regs; /* Where the registers are actually stored */
+/* ptrace does not yet supply these. Someday.... */
+ int u_fpvalid; /* True if math co-processor being used. */
+ /* for this mess. Not yet used. */
+ int pad0;
+ struct user_i387_struct i387; /* Math Co-processor registers. */
+/* The rest of this junk is to help gdb figure out what goes where */
+ unsigned long int u_tsize; /* Text segment size (pages). */
+ unsigned long int u_dsize; /* Data segment size (pages). */
+ unsigned long int u_ssize; /* Stack segment size (pages). */
+ unsigned long start_code; /* Starting virtual address of text. */
+ unsigned long start_stack; /* Starting virtual address of stack area.
+ This is actually the bottom of the stack,
+ the top of the stack is always found in the
+ esp register. */
+ long int signal; /* Signal that caused the core dump. */
+ int reserved; /* No longer used */
+ int pad1;
+ unsigned long u_ar0; /* Used by gdb to help find the values for */
+ /* the registers. */
+ struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */
+ unsigned long magic; /* To uniquely identify a core file */
+ char u_comm[32]; /* User command that was responsible */
+ unsigned long u_debugreg[8];
+ unsigned long error_code; /* CPU error code or 0 */
+ unsigned long fault_address; /* CR3 or 0 */
+};
+#define NBPG PAGE_SIZE
+#define UPAGES 1
+#define HOST_TEXT_START_ADDR (u.start_code)
+#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
+
+#endif /* _ASM_X86_USER_64_H */
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
new file mode 100644
index 00000000000..d931d3b7e6f
--- /dev/null
+++ b/arch/x86/include/asm/uv/bios.h
@@ -0,0 +1,94 @@
+#ifndef _ASM_X86_UV_BIOS_H
+#define _ASM_X86_UV_BIOS_H
+
+/*
+ * UV BIOS layer definitions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson
+ */
+
+#include <linux/rtc.h>
+
+/*
+ * Values for the BIOS calls. The value is passed as the first argument
+ * in the BIOS call. Passing any other value in the first argument will result
+ * in a BIOS_STATUS_UNIMPLEMENTED return status.
+ */
+enum uv_bios_cmd {
+ UV_BIOS_COMMON,
+ UV_BIOS_GET_SN_INFO,
+ UV_BIOS_FREQ_BASE
+};
+
+/*
+ * Status values returned from a BIOS call.
+ */
+enum {
+ BIOS_STATUS_SUCCESS = 0,
+ BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
+ BIOS_STATUS_EINVAL = -EINVAL,
+ BIOS_STATUS_UNAVAIL = -EBUSY
+};
+
+/*
+ * The UV system table describes specific firmware
+ * capabilities available to the Linux kernel at runtime.
+ */
+struct uv_systab {
+ char signature[4]; /* must be "UVST" */
+ u32 revision; /* distinguish different firmware revs */
+ u64 function; /* BIOS runtime callback function ptr */
+};
+
+enum {
+ BIOS_FREQ_BASE_PLATFORM = 0,
+ BIOS_FREQ_BASE_INTERVAL_TIMER = 1,
+ BIOS_FREQ_BASE_REALTIME_CLOCK = 2
+};
+
+union partition_info_u {
+ u64 val;
+ struct {
+ u64 hub_version : 8,
+ partition_id : 16,
+ coherence_id : 16,
+ region_size : 24;
+ };
+};
+
+/*
+ * BIOS calls have 6 parameters
+ */
+extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+
+extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
+extern s64 uv_bios_freq_base(u64, u64 *);
+
+extern void uv_bios_init(void);
+
+extern int uv_type;
+extern long sn_partition_id;
+extern long uv_coherency_id;
+extern long uv_region_size;
+#define partition_coherence_id() (uv_coherency_id)
+
+extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
+
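+/*
+ * Illustrative use of the wrappers above (a sketch, not kernel code):
+ * query the real time clock base frequency and return 0 on any
+ * BIOS_STATUS_* error. "uv_rtc_freq_example" is a hypothetical name.
+ */
+static inline u64 uv_rtc_freq_example(void)
+{
+ u64 freq = 0;
+
+ if (uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &freq) !=
+     BIOS_STATUS_SUCCESS)
+  return 0;
+ return freq;
+}
+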
+#endif /* _ASM_X86_UV_BIOS_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
new file mode 100644
index 00000000000..e2363253bbb
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -0,0 +1,332 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV Broadcast Assist Unit definitions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_BAU_H
+#define _ASM_X86_UV_UV_BAU_H
+
+#include <linux/bitmap.h>
+#define BITSPERBYTE 8
+
+/*
+ * Broadcast Assist Unit messaging structures
+ *
+ * Selective Broadcast activations are induced by software action
+ * specifying a particular 8-descriptor "set" via a 6-bit index written
+ * to an MMR.
+ * Thus there are 64 unique 512-byte sets of SB descriptors - one set for
+ * each 6-bit index value. These descriptor sets are mapped in sequence
+ * starting with set 0 located at the address specified in the
+ * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
+ * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
+ *
+ * We will use 31 sets, one for sending BAU messages from each of the 32
+ * cpu's on the node.
+ *
+ * TLB shootdown will use the first of the 8 descriptors of each set.
+ * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
+ */
+
+#define UV_ITEMS_PER_DESCRIPTOR 8
+#define UV_CPUS_PER_ACT_STATUS 32
+#define UV_ACT_STATUS_MASK 0x3
+#define UV_ACT_STATUS_SIZE 2
+#define UV_ACTIVATION_DESCRIPTOR_SIZE 32
+#define UV_DISTRIBUTION_SIZE 256
+#define UV_SW_ACK_NPENDING 8
+#define UV_NET_ENDPOINT_INTD 0x38
+#define UV_DESC_BASE_PNODE_SHIFT 49
+#define UV_PAYLOADQ_PNODE_SHIFT 49
+#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
+#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
+
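+/*
+ * Sketch of the layout above (illustrative, not driver code): sets are
+ * 512 bytes apart and TLB shootdown uses item 0 of each set, so cpu
+ * N's descriptor starts at base + N * 512. "first_bau_desc" is a
+ * hypothetical helper name.
+ */
+struct bau_desc;
+static inline struct bau_desc *first_bau_desc(void *base, int cpu)
+{
+ return (struct bau_desc *)((char *)base + cpu * 512);
+}
+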
+/*
+ * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
+ */
+#define DESC_STATUS_IDLE 0
+#define DESC_STATUS_ACTIVE 1
+#define DESC_STATUS_DESTINATION_TIMEOUT 2
+#define DESC_STATUS_SOURCE_TIMEOUT 3
+
+/*
+ * source side thresholds at which message retries print a warning
+ */
+#define SOURCE_TIMEOUT_LIMIT 20
+#define DESTINATION_TIMEOUT_LIMIT 20
+
+/*
+ * number of entries in the destination side payload queue
+ */
+#define DEST_Q_SIZE 17
+/*
+ * number of destination side software ack resources
+ */
+#define DEST_NUM_RESOURCES 8
+#define MAX_CPUS_PER_NODE 32
+/*
+ * completion statuses for sending a TLB flush message
+ */
+#define FLUSH_RETRY 1
+#define FLUSH_GIVEUP 2
+#define FLUSH_COMPLETE 3
+
+/*
+ * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
+ * If the 'multilevel' flag in the header portion of the descriptor
+ * has been set to 0, then endpoint multi-unicast mode is selected.
+ * The distribution specification (32 bytes) is interpreted as a 256-bit
+ * distribution vector. Adjacent bits correspond to consecutive even numbered
+ * nodeIDs. The result of adding the index of a given bit to the 15-bit
+ * 'base_dest_nodeid' field of the header corresponds to the
+ * destination nodeID associated with that specified bit.
+ */
+struct bau_target_nodemask {
+ unsigned long bits[BITS_TO_LONGS(256)];
+};
+
+/*
+ * mask of cpu's on a node
+ * (during initialization we need to check that unsigned long has
+ * enough bits for max. cpu's per node)
+ */
+struct bau_local_cpumask {
+ unsigned long bits;
+};
+
+/*
+ * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor)
+ * only 12 bytes (96 bits) of the payload area are usable.
+ * An additional 3 bytes (bits 27:4) of the header address are carried
+ * to the next bytes of the destination payload queue.
+ * And an additional 2 bytes of the header Suppl_A field are also
+ * carried to the destination payload queue.
+ * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte)
+ * of the destination payload queue, which is written by the hardware
+ * with the s/w ack resource bit vector.
+ * [ effective message contents (16 bytes (128 bits) maximum), not counting
+ * the s/w ack bit vector ]
+ */
+
+/*
+ * The payload is software-defined for INTD transactions
+ */
+struct bau_msg_payload {
+ unsigned long address; /* signifies a page or all TLB's
+ of the cpu */
+ /* 64 bits */
+ unsigned short sending_cpu; /* filled in by sender */
+ /* 16 bits */
+ unsigned short acknowledge_count;/* filled in by destination */
+ /* 16 bits */
+ unsigned int reserved1:32; /* not usable */
+};
+
+
+/*
+ * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * see table 4.2.3.0.1 in broadcast_assist spec.
+ */
+struct bau_msg_header {
+ int dest_subnodeid:6; /* must be zero */
+ /* bits 5:0 */
+ int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */
+ /* bits 20:6 */
+ int command:8; /* message type */
+ /* bits 28:21 */
+ /* 0x38: SN3net EndPoint Message */
+ int rsvd_1:3; /* must be zero */
+ /* bits 31:29 */
+ /* int will align on 32 bits */
+ int rsvd_2:9; /* must be zero */
+ /* bits 40:32 */
+ /* Suppl_A is 56-41 */
+ int payload_2a:8; /* becomes byte 16 of msg */
+ /* bits 48:41 */ /* not currently using */
+ int payload_2b:8; /* becomes byte 17 of msg */
+ /* bits 56:49 */ /* not currently using */
+ /* Address field (96:57) is never used as an
+ address (these are address bits 42:3) */
+ int rsvd_3:1; /* must be zero */
+ /* bit 57 */
+ /* address bits 27:4 are payload */
+ /* these 24 bits become bytes 12-14 of msg */
+ int replied_to:1; /* sent as 0 by the source to byte 12 */
+ /* bit 58 */
+
+ int payload_1a:5; /* not currently used */
+ /* bits 63:59 */
+ int payload_1b:8; /* not currently used */
+ /* bits 71:64 */
+ int payload_1c:8; /* not currently used */
+ /* bits 79:72 */
+ int payload_1d:2; /* not currently used */
+ /* bits 81:80 */
+
+ int rsvd_4:7; /* must be zero */
+ /* bits 88:82 */
+ int sw_ack_flag:1; /* software acknowledge flag */
+ /* bit 89 */
+ /* INTD transactions at destination are to
+ wait for software acknowledge */
+ int rsvd_5:6; /* must be zero */
+ /* bits 95:90 */
+ int rsvd_6:5; /* must be zero */
+ /* bits 100:96 */
+ int int_both:1; /* if 1, interrupt both sockets on the blade */
+ /* bit 101*/
+ int fairness:3; /* usually zero */
+ /* bits 104:102 */
+ int multilevel:1; /* multi-level multicast format */
+ /* bit 105 */
+ /* 0 for TLB: endpoint multi-unicast messages */
+ int chaining:1; /* next descriptor is part of this activation*/
+ /* bit 106 */
+ int rsvd_7:21; /* must be zero */
+ /* bits 127:107 */
+};
+
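+/*
+ * A minimal sketch (not the driver's init code) of the header fields a
+ * TLB shootdown descriptor needs per the layout above; field choices
+ * other than UV_NET_ENDPOINT_INTD are illustrative assumptions.
+ */
+static inline void bau_header_init(struct bau_msg_header *hd, int base_pnode)
+{
+ *hd = (struct bau_msg_header){ 0 }; /* all rsvd_* fields must be zero */
+ hd->command = UV_NET_ENDPOINT_INTD; /* 0x38: SN3net EndPoint Message */
+ hd->base_dest_nodeid = base_pnode; /* pnode of bit 0 of distribution */
+ hd->sw_ack_flag = 1; /* destination waits for s/w ack */
+}
+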
+/*
+ * The activation descriptor:
+ * The format of the message to send, plus all accompanying control
+ * Should be 64 bytes
+ */
+struct bau_desc {
+ struct bau_target_nodemask distribution;
+ /*
+ * message template, consisting of header and payload:
+ */
+ struct bau_msg_header header;
+ struct bau_msg_payload payload;
+};
+/*
+ * -payload-- ---------header------
+ * bytes 0-11 bits 41-56 bits 58-81
+ * A B (2) C (3)
+ *
+ * A/B/C are moved to:
+ * A C B
+ * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector)
+ * ------------payload queue-----------
+ */
+
+/*
+ * The payload queue on the destination side is an array of these.
+ * With BAU_MISC_CONTROL set for software acknowledge mode, the messages
+ * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
+ * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
+ * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
+ * sw_ack_vector and payload_2)
+ * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
+ * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
+ * operation."
+ */
+struct bau_payload_queue_entry {
+ unsigned long address; /* signifies a page or all TLB's
+ of the cpu */
+ /* 64 bits, bytes 0-7 */
+
+ unsigned short sending_cpu; /* cpu that sent the message */
+ /* 16 bits, bytes 8-9 */
+
+ unsigned short acknowledge_count; /* filled in by destination */
+ /* 16 bits, bytes 10-11 */
+
+ unsigned short replied_to:1; /* sent as 0 by the source */
+ /* 1 bit */
+ unsigned short unused1:7; /* not currently using */
+ /* 7 bits: byte 12) */
+
+ unsigned char unused2[2]; /* not currently using */
+ /* bytes 13-14 */
+
+ unsigned char sw_ack_vector; /* filled in by the hardware */
+ /* byte 15 (bits 127:120) */
+
+ unsigned char unused4[3]; /* not currently using */
+ /* bytes 17-19 */
+
+ int number_of_cpus; /* filled in at destination */
+ /* 32 bits, bytes 20-23 (aligned) */
+
+ unsigned char unused5[8]; /* not using */
+ /* bytes 24-31 */
+};
+
+/*
+ * one for every slot in the destination payload queue
+ */
+struct bau_msg_status {
+ struct bau_local_cpumask seen_by; /* map of cpu's */
+};
+
+/*
+ * one for every slot in the destination software ack resources
+ */
+struct bau_sw_ack_status {
+ struct bau_payload_queue_entry *msg; /* associated message */
+ int watcher; /* cpu monitoring, or -1 */
+};
+
+/*
+ * one on every node and per-cpu; to locate the software tables
+ */
+struct bau_control {
+ struct bau_desc *descriptor_base;
+ struct bau_payload_queue_entry *bau_msg_head;
+ struct bau_payload_queue_entry *va_queue_first;
+ struct bau_payload_queue_entry *va_queue_last;
+ struct bau_msg_status *msg_statuses;
+ int *watching; /* pointer to array */
+};
+
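+/*
+ * Illustrative sketch (not the driver loop): advance bau_msg_head
+ * through the circular payload queue delimited by va_queue_first and
+ * va_queue_last. "bau_next_msg" is a hypothetical helper name.
+ */
+static inline struct bau_payload_queue_entry *
+bau_next_msg(struct bau_control *bcp)
+{
+ struct bau_payload_queue_entry *msg = bcp->bau_msg_head + 1;
+
+ if (msg > bcp->va_queue_last)
+  msg = bcp->va_queue_first; /* wrap around */
+ bcp->bau_msg_head = msg;
+ return msg;
+}
+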
+/*
+ * This structure is allocated per_cpu for UV TLB shootdown statistics.
+ */
+struct ptc_stats {
+ unsigned long ptc_i; /* number of IPI-style flushes */
+ unsigned long requestor; /* number of nodes this cpu sent to */
+ unsigned long requestee; /* times cpu was remotely requested */
+ unsigned long alltlb; /* times all tlb's on this cpu were flushed */
+ unsigned long onetlb; /* times just one tlb on this cpu was flushed */
+ unsigned long s_retry; /* retries on source side timeouts */
+ unsigned long d_retry; /* retries on destination side timeouts */
+ unsigned long sflush; /* cycles spent in uv_flush_tlb_others */
+ unsigned long dflush; /* cycles spent on destination side */
+ unsigned long retriesok; /* successes on retries */
+ unsigned long nomsg; /* interrupts with no message */
+ unsigned long multmsg; /* interrupts with multiple messages */
+ unsigned long ntargeted;/* nodes targeted */
+};
+
+static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp)
+{
+ return constant_test_bit(node, &dstp->bits[0]);
+}
+static inline void bau_node_set(int node, struct bau_target_nodemask *dstp)
+{
+ __set_bit(node, &dstp->bits[0]);
+}
+static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits)
+{
+ bitmap_zero(&dstp->bits[0], nbits);
+}
+
+static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
+{
+ bitmap_zero(&dstp->bits, nbits);
+}
+
+#define cpubit_isset(cpu, bau_local_cpumask) \
+ test_bit((cpu), (bau_local_cpumask).bits)
+
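+/*
+ * Illustrative only: build a distribution vector targeting pnodes 0
+ * and 4 with the helpers above. "bau_target_two_nodes" is a
+ * hypothetical name; real callers compute the pnodes from a cpumask.
+ */
+static inline void bau_target_two_nodes(struct bau_desc *bd)
+{
+ bau_nodes_clear(&bd->distribution, UV_DISTRIBUTION_SIZE);
+ bau_node_set(0, &bd->distribution);
+ bau_node_set(4, &bd->distribution);
+}
+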
+extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
+extern void uv_bau_message_intr1(void);
+extern void uv_bau_timeout_intr1(void);
+
+#endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
new file mode 100644
index 00000000000..c6ad93e315c
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -0,0 +1,354 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV architectural definitions
+ *
+ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_HUB_H
+#define _ASM_X86_UV_UV_HUB_H
+
+#include <linux/numa.h>
+#include <linux/percpu.h>
+#include <asm/types.h>
+#include <asm/percpu.h>
+
+
+/*
+ * Addressing Terminology
+ *
+ * M - The low M bits of a physical address represent the offset
+ * into the blade local memory. RAM memory on a blade is physically
+ * contiguous (although various IO spaces may punch holes in
+ * it).
+ *
+ * N - Number of bits in the node portion of a socket physical
+ * address.
+ *
+ * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
+ * routers always have low bit of 1, C/MBricks have low bit
+ * equal to 0. Most addressing macros that target UV hub chips
+ * right shift the NASID by 1 to exclude the always-zero bit.
+ * NASIDs contain up to 15 bits.
+ *
+ * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
+ * of nasids.
+ *
+ * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
+ * of the nasid for socket usage.
+ *
+ *
+ * NumaLink Global Physical Address Format:
+ * +--------------------------------+---------------------+
+ * |00..000| GNODE | NodeOffset |
+ * +--------------------------------+---------------------+
+ * |<-------53 - M bits --->|<--------M bits ----->
+ *
+ * M - number of node offset bits (35 .. 40)
+ *
+ *
+ * Memory/UV-HUB Processor Socket Address Format:
+ * +----------------+---------------+---------------------+
+ * |00..000000000000| PNODE | NodeOffset |
+ * +----------------+---------------+---------------------+
+ * <--- N bits --->|<--------M bits ----->
+ *
+ * M - number of node offset bits (35 .. 40)
+ * N - number of PNODE bits (0 .. 10)
+ *
+ * Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64).
+ * The actual values are configuration dependent and are set at
+ * boot time. M & N values are set by the hardware/BIOS at boot.
+ *
+ *
+ * APICID format
+ * NOTE!!!!!! This is the current format of the APICID. However, code
+ * should assume that this will change in the future. Use functions
+ * in this file for all APICID bit manipulations and conversion.
+ *
+ * 1111110000000000
+ * 5432109876543210
+ * pppppppppplc0cch
+ * sssssssssss
+ *
+ * p = pnode bits
+ * l = socket number on board
+ * c = core
+ * h = hyperthread
+ * s = bits that are in the SOCKET_ID CSR
+ *
+ * Note: Processor only supports 12 bits in the APICID register. The ACPI
+ * tables hold all 16 bits. Software needs to be aware of this.
+ *
+ * Unless otherwise specified, all references to APICID refer to
+ * the FULL value contained in ACPI tables, not the subset in the
+ * processor APICID register.
+ */
+
+
+/*
+ * Maximum number of bricks in all partitions and in all coherency domains.
+ * This is the total number of bricks accessible in the numalink fabric. It
+ * includes all C & M bricks. Routers are NOT included.
+ *
+ * This value is also the value of the maximum number of non-router NASIDs
+ * in the numalink fabric.
+ *
+ * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused.
+ */
+#define UV_MAX_NUMALINK_BLADES 16384
+
+/*
+ * Maximum number of C/Mbricks within a software SSI (hardware may support
+ * more).
+ */
+#define UV_MAX_SSI_BLADES 256
+
+/*
+ * The largest possible NASID of a C or M brick (+ 2)
+ */
+#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2)
+
+/*
+ * The following defines attributes of the HUB chip. These attributes are
+ * frequently referenced and are kept in the per-cpu data areas of each cpu.
+ * They are kept together in a struct to minimize cache misses.
+ */
+struct uv_hub_info_s {
+ unsigned long global_mmr_base;
+ unsigned long gpa_mask;
+ unsigned long gnode_upper;
+ unsigned long lowmem_remap_top;
+ unsigned long lowmem_remap_base;
+ unsigned short pnode;
+ unsigned short pnode_mask;
+ unsigned short coherency_domain_number;
+ unsigned short numa_blade_id;
+ unsigned char blade_processor_id;
+ unsigned char m_val;
+ unsigned char n_val;
+};
+DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
+#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
+#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
+
+/*
+ * Local & Global MMR space macros.
+ * Note: macros are intended to be used ONLY by inline functions
+ * in this file - not by other kernel code.
+ * n - NASID (full 15-bit global nasid)
+ * g - GNODE (full 15-bit global nasid, right shifted 1)
+ * p - PNODE (local part of nsids, right shifted 1)
+ */
+#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
+#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper)
+
+#define UV_LOCAL_MMR_BASE 0xf4000000UL
+#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
+#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
+#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
+#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
+
+#define UV_GLOBAL_MMR32_PNODE_SHIFT 15
+#define UV_GLOBAL_MMR64_PNODE_SHIFT 26
+
+#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
+
+#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
+ ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+
+#define UV_APIC_PNODE_SHIFT 6
+
+/*
+ * Macros for converting between kernel virtual addresses, socket local physical
+ * addresses, and UV global physical addresses.
+ * Note: use the standard __pa() & __va() macros for converting
+ * between socket virtual and socket physical addresses.
+ */
+
+/* socket phys RAM --> UV global physical address */
+static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
+{
+ if (paddr < uv_hub_info->lowmem_remap_top)
+ paddr += uv_hub_info->lowmem_remap_base;
+ return paddr | uv_hub_info->gnode_upper;
+}
+
+
+/* socket virtual --> UV global physical address */
+static inline unsigned long uv_gpa(void *v)
+{
+ return __pa(v) | uv_hub_info->gnode_upper;
+}
+
+/* socket virtual --> UV global physical address */
+static inline void *uv_vgpa(void *v)
+{
+ return (void *)uv_gpa(v);
+}
+
+/* UV global physical address --> socket virtual */
+static inline void *uv_va(unsigned long gpa)
+{
+ return __va(gpa & uv_hub_info->gpa_mask);
+}
+
+/* pnode, offset --> socket virtual */
+static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
+{
+ return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset);
+}
+
+
+/*
+ * Extract a PNODE from an APICID (full apicid, not processor subset)
+ */
+static inline int uv_apicid_to_pnode(int apicid)
+{
+ return (apicid >> UV_APIC_PNODE_SHIFT);
+}
+
+/*
+ * Access global MMRs using the low memory MMR32 space. This region supports
+ * faster MMR access but not all MMRs are accessible in this space.
+ */
+static inline unsigned long *uv_global_mmr32_address(int pnode,
+ unsigned long offset)
+{
+ return __va(UV_GLOBAL_MMR32_BASE |
+ UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
+}
+
+static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
+ unsigned long val)
+{
+ *uv_global_mmr32_address(pnode, offset) = val;
+}
+
+static inline unsigned long uv_read_global_mmr32(int pnode,
+ unsigned long offset)
+{
+ return *uv_global_mmr32_address(pnode, offset);
+}
+
+/*
+ * Access Global MMR space using the MMR space located at the top of physical
+ * memory.
+ */
+static inline unsigned long *uv_global_mmr64_address(int pnode,
+ unsigned long offset)
+{
+ return __va(UV_GLOBAL_MMR64_BASE |
+ UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
+}
+
+static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
+ unsigned long val)
+{
+ *uv_global_mmr64_address(pnode, offset) = val;
+}
+
+static inline unsigned long uv_read_global_mmr64(int pnode,
+ unsigned long offset)
+{
+ return *uv_global_mmr64_address(pnode, offset);
+}
+
+/*
+ * Access hub local MMRs. Faster than using global space but only local MMRs
+ * are accessible.
+ */
+static inline unsigned long *uv_local_mmr_address(unsigned long offset)
+{
+ return __va(UV_LOCAL_MMR_BASE | offset);
+}
+
+static inline unsigned long uv_read_local_mmr(unsigned long offset)
+{
+ return *uv_local_mmr_address(offset);
+}
+
+static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
+{
+ *uv_local_mmr_address(offset) = val;
+}
+
+/*
+ * Structures and definitions for converting between cpu, node, pnode, and blade
+ * numbers.
+ */
+struct uv_blade_info {
+ unsigned short nr_possible_cpus;
+ unsigned short nr_online_cpus;
+ unsigned short pnode;
+};
+extern struct uv_blade_info *uv_blade_info;
+extern short *uv_node_to_blade;
+extern short *uv_cpu_to_blade;
+extern short uv_possible_blades;
+
+/* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */
+static inline int uv_blade_processor_id(void)
+{
+ return uv_hub_info->blade_processor_id;
+}
+
+/* Blade number of current cpu. Numbered 0 .. <#blades -1> */
+static inline int uv_numa_blade_id(void)
+{
+ return uv_hub_info->numa_blade_id;
+}
+
+/* Convert a cpu number to the UV blade number */
+static inline int uv_cpu_to_blade_id(int cpu)
+{
+ return uv_cpu_to_blade[cpu];
+}
+
+/* Convert linux node number to the UV blade number */
+static inline int uv_node_to_blade_id(int nid)
+{
+ return uv_node_to_blade[nid];
+}
+
+/* Convert a blade id to the PNODE of the blade */
+static inline int uv_blade_to_pnode(int bid)
+{
+ return uv_blade_info[bid].pnode;
+}
+
+/* Determine the number of possible cpus on a blade */
+static inline int uv_blade_nr_possible_cpus(int bid)
+{
+ return uv_blade_info[bid].nr_possible_cpus;
+}
+
+/* Determine the number of online cpus on a blade */
+static inline int uv_blade_nr_online_cpus(int bid)
+{
+ return uv_blade_info[bid].nr_online_cpus;
+}
+
+/* Convert a cpu id to the PNODE of the blade containing the cpu */
+static inline int uv_cpu_to_pnode(int cpu)
+{
+ return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode;
+}
+
+/* Convert a linux node number to the PNODE of the blade */
+static inline int uv_node_to_pnode(int nid)
+{
+ return uv_blade_info[uv_node_to_blade_id(nid)].pnode;
+}
+
+/* Maximum possible number of blades */
+static inline int uv_num_possible_blades(void)
+{
+ return uv_possible_blades;
+}
+
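+/*
+ * Illustrative only: read the same MMR from every possible blade by
+ * combining the helpers above. "uv_read_mmr_all_blades" is a
+ * hypothetical name and "offset" is any valid MMR offset (see
+ * uv_mmrs.h).
+ */
+static inline void uv_read_mmr_all_blades(unsigned long offset,
+       unsigned long *vals)
+{
+ int bid;
+
+ for (bid = 0; bid < uv_num_possible_blades(); bid++)
+  vals[bid] = uv_read_global_mmr64(uv_blade_to_pnode(bid), offset);
+}
+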
+#endif /* _ASM_X86_UV_UV_HUB_H */
+
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
new file mode 100644
index 00000000000..9613c8c0b64
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -0,0 +1,36 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV IRQ definitions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_IRQ_H
+#define _ASM_X86_UV_UV_IRQ_H
+
+/* If a generic version of this structure gets defined, eliminate this one. */
+struct uv_IO_APIC_route_entry {
+ __u64 vector : 8,
+ delivery_mode : 3,
+ dest_mode : 1,
+ delivery_status : 1,
+ polarity : 1,
+ __reserved_1 : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15,
+ dest : 32;
+};
+
+extern struct irq_chip uv_irq_chip;
+
+extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long);
+extern void arch_disable_uv_irq(int, unsigned long);
+
+extern int uv_setup_irq(char *, int, int, unsigned long);
+extern void uv_teardown_irq(unsigned int, int, unsigned long);
+
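+/*
+ * Illustrative call sequence (a sketch): the unnamed parameters above
+ * are assumed to be irq_name, cpu, mmr_blade and mmr_offset; the
+ * helper name and "uv-example" string are hypothetical.
+ */
+static inline int uv_irq_example(int cpu, int blade, unsigned long mmr_offset)
+{
+ int irq = uv_setup_irq("uv-example", cpu, blade, mmr_offset);
+
+ if (irq < 0)
+  return irq; /* negative errno on failure (an assumption) */
+ uv_teardown_irq(irq, blade, mmr_offset);
+ return 0;
+}
+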
+#endif /* _ASM_X86_UV_UV_IRQ_H */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
new file mode 100644
index 00000000000..dd627793a23
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -0,0 +1,1295 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV MMR definitions
+ *
+ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_MMRS_H
+#define _ASM_X86_UV_UV_MMRS_H
+
+#define UV_MMR_ENABLE (1UL << 63)
+
+/* ========================================================================= */
+/* UVH_BAU_DATA_CONFIG */
+/* ========================================================================= */
+#define UVH_BAU_DATA_CONFIG 0x61680UL
+#define UVH_BAU_DATA_CONFIG_32 0x0438
+
+#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
+#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
+#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
+#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
+#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_BAU_DATA_CONFIG_P_SHFT 13
+#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_BAU_DATA_CONFIG_T_SHFT 15
+#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_BAU_DATA_CONFIG_M_SHFT 16
+#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
+#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_bau_data_config_u {
+ unsigned long v;
+ struct uvh_bau_data_config_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long dm : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long status : 1; /* RO */
+ unsigned long p : 1; /* RO */
+ unsigned long rsvd_14 : 1; /* */
+ unsigned long t : 1; /* RO */
+ unsigned long m : 1; /* RW */
+ unsigned long rsvd_17_31: 15; /* */
+ unsigned long apic_id : 32; /* RW */
+ } s;
+};
+
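+/*
+ * Illustrative use of the union pattern used throughout this file
+ * (a sketch): fill the bitfield view, then use the raw 64-bit image
+ * for an MMR write. The vector/apicid values are example inputs only.
+ */
+static inline unsigned long uvh_bau_data_config_image(int vector, int apicid)
+{
+ union uvh_bau_data_config_u cfg;
+
+ cfg.v = 0;
+ cfg.s.vector_ = vector; /* interrupt vector to deliver */
+ cfg.s.apic_id = apicid; /* destination APIC id */
+ return cfg.v; /* raw image, e.g. for uv_write_global_mmr64() */
+}
+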
+/* ========================================================================= */
+/* UVH_EVENT_OCCURRED0 */
+/* ========================================================================= */
+#define UVH_EVENT_OCCURRED0 0x70000UL
+#define UVH_EVENT_OCCURRED0_32 0x005e8
+
+#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
+#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
+#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
+#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
+#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3
+#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
+#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4
+#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
+#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5
+#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
+#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6
+#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
+#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
+#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
+#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
+#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
+#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
+#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
+#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
+#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
+#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
+#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
+#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
+#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
+#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
+#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
+#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
+#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
+#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
+#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
+#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
+#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
+#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
+#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
+#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
+#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
+#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
+#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
+#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
+#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
+#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
+#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
+#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
+#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
+#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
+#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
+#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
+#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
+#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
+#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
+#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43
+#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
+#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
+#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
+#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45
+#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
+#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
+#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
+#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
+#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
+#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
+#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
+#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
+#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
+#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
+#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
+#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51
+#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
+#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52
+#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
+#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53
+#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
+#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54
+#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
+#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55
+#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
+#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
+#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
+union uvh_event_occurred0_u {
+ unsigned long v;
+ struct uvh_event_occurred0_s {
+ unsigned long lb_hcerr : 1; /* RW, W1C */
+ unsigned long gr0_hcerr : 1; /* RW, W1C */
+ unsigned long gr1_hcerr : 1; /* RW, W1C */
+ unsigned long lh_hcerr : 1; /* RW, W1C */
+ unsigned long rh_hcerr : 1; /* RW, W1C */
+ unsigned long xn_hcerr : 1; /* RW, W1C */
+ unsigned long si_hcerr : 1; /* RW, W1C */
+ unsigned long lb_aoerr0 : 1; /* RW, W1C */
+ unsigned long gr0_aoerr0 : 1; /* RW, W1C */
+ unsigned long gr1_aoerr0 : 1; /* RW, W1C */
+ unsigned long lh_aoerr0 : 1; /* RW, W1C */
+ unsigned long rh_aoerr0 : 1; /* RW, W1C */
+ unsigned long xn_aoerr0 : 1; /* RW, W1C */
+ unsigned long si_aoerr0 : 1; /* RW, W1C */
+ unsigned long lb_aoerr1 : 1; /* RW, W1C */
+ unsigned long gr0_aoerr1 : 1; /* RW, W1C */
+ unsigned long gr1_aoerr1 : 1; /* RW, W1C */
+ unsigned long lh_aoerr1 : 1; /* RW, W1C */
+ unsigned long rh_aoerr1 : 1; /* RW, W1C */
+ unsigned long xn_aoerr1 : 1; /* RW, W1C */
+ unsigned long si_aoerr1 : 1; /* RW, W1C */
+ unsigned long rh_vpi_int : 1; /* RW, W1C */
+ unsigned long system_shutdown_int : 1; /* RW, W1C */
+ unsigned long lb_irq_int_0 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_1 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_2 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_3 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_4 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_5 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_6 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_7 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_8 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_9 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_10 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_11 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_12 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_13 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_14 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_15 : 1; /* RW, W1C */
+ unsigned long l1_nmi_int : 1; /* RW, W1C */
+ unsigned long stop_clock : 1; /* RW, W1C */
+ unsigned long asic_to_l1 : 1; /* RW, W1C */
+ unsigned long l1_to_asic : 1; /* RW, W1C */
+ unsigned long ltc_int : 1; /* RW, W1C */
+ unsigned long la_seq_trigger : 1; /* RW, W1C */
+ unsigned long ipi_int : 1; /* RW, W1C */
+ unsigned long extio_int0 : 1; /* RW, W1C */
+ unsigned long extio_int1 : 1; /* RW, W1C */
+ unsigned long extio_int2 : 1; /* RW, W1C */
+ unsigned long extio_int3 : 1; /* RW, W1C */
+ unsigned long profile_int : 1; /* RW, W1C */
+ unsigned long rtc0 : 1; /* RW, W1C */
+ unsigned long rtc1 : 1; /* RW, W1C */
+ unsigned long rtc2 : 1; /* RW, W1C */
+ unsigned long rtc3 : 1; /* RW, W1C */
+ unsigned long bau_data : 1; /* RW, W1C */
+ unsigned long power_management_req : 1; /* RW, W1C */
+ unsigned long rsvd_57_63 : 7; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_EVENT_OCCURRED0_ALIAS */
+/* ========================================================================= */
+#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
+#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
+
+/* ========================================================================= */
+/* UVH_INT_CMPB */
+/* ========================================================================= */
+#define UVH_INT_CMPB 0x22080UL
+
+#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
+#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
+
+union uvh_int_cmpb_u {
+ unsigned long v;
+ struct uvh_int_cmpb_s {
+ unsigned long real_time_cmpb : 56; /* RW */
+ unsigned long rsvd_56_63 : 8; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_INT_CMPC */
+/* ========================================================================= */
+#define UVH_INT_CMPC 0x22100UL
+
+#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
+#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
+
+union uvh_int_cmpc_u {
+ unsigned long v;
+ struct uvh_int_cmpc_s {
+ unsigned long real_time_cmpc : 56; /* RW */
+ unsigned long rsvd_56_63 : 8; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_INT_CMPD */
+/* ========================================================================= */
+#define UVH_INT_CMPD 0x22180UL
+
+#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
+#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
+
+union uvh_int_cmpd_u {
+ unsigned long v;
+ struct uvh_int_cmpd_s {
+ unsigned long real_time_cmpd : 56; /* RW */
+ unsigned long rsvd_56_63 : 8; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_IPI_INT */
+/* ========================================================================= */
+#define UVH_IPI_INT 0x60500UL
+#define UVH_IPI_INT_32 0x0348
+
+#define UVH_IPI_INT_VECTOR_SHFT 0
+#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
+#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
+#define UVH_IPI_INT_DESTMODE_SHFT 11
+#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_IPI_INT_APIC_ID_SHFT 16
+#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
+#define UVH_IPI_INT_SEND_SHFT 63
+#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
+
+union uvh_ipi_int_u {
+ unsigned long v;
+ struct uvh_ipi_int_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long delivery_mode : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long rsvd_12_15 : 4; /* */
+ unsigned long apic_id : 32; /* RW */
+ unsigned long rsvd_48_62 : 15; /* */
+ unsigned long send : 1; /* WP */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
+/* ========================================================================= */
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0
+
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
+
+union uvh_lb_bau_intd_payload_queue_first_u {
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_first_s {
+ unsigned long rsvd_0_3: 4; /* */
+ unsigned long address : 39; /* RW */
+ unsigned long rsvd_43_48: 6; /* */
+ unsigned long node_id : 14; /* RW */
+ unsigned long rsvd_63 : 1; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
+/* ========================================================================= */
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8
+
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
+
+union uvh_lb_bau_intd_payload_queue_last_u {
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_last_s {
+ unsigned long rsvd_0_3: 4; /* */
+ unsigned long address : 39; /* RW */
+ unsigned long rsvd_43_63: 21; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
+/* ========================================================================= */
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0
+
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
+
+union uvh_lb_bau_intd_payload_queue_tail_u {
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_tail_s {
+ unsigned long rsvd_0_3: 4; /* */
+ unsigned long address : 39; /* RW */
+ unsigned long rsvd_43_63: 21; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
+/* ========================================================================= */
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68
+
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
+union uvh_lb_bau_intd_software_acknowledge_u {
+ unsigned long v;
+ struct uvh_lb_bau_intd_software_acknowledge_s {
+ unsigned long pending_0 : 1; /* RW, W1C */
+ unsigned long pending_1 : 1; /* RW, W1C */
+ unsigned long pending_2 : 1; /* RW, W1C */
+ unsigned long pending_3 : 1; /* RW, W1C */
+ unsigned long pending_4 : 1; /* RW, W1C */
+ unsigned long pending_5 : 1; /* RW, W1C */
+ unsigned long pending_6 : 1; /* RW, W1C */
+ unsigned long pending_7 : 1; /* RW, W1C */
+ unsigned long timeout_0 : 1; /* RW, W1C */
+ unsigned long timeout_1 : 1; /* RW, W1C */
+ unsigned long timeout_2 : 1; /* RW, W1C */
+ unsigned long timeout_3 : 1; /* RW, W1C */
+ unsigned long timeout_4 : 1; /* RW, W1C */
+ unsigned long timeout_5 : 1; /* RW, W1C */
+ unsigned long timeout_6 : 1; /* RW, W1C */
+ unsigned long timeout_7 : 1; /* RW, W1C */
+ unsigned long rsvd_16_63: 48; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
+/* ========================================================================= */
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
+/* ========================================================================= */
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8
+
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
+
+union uvh_lb_bau_sb_activation_control_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_control_s {
+ unsigned long index : 6; /* RW */
+ unsigned long rsvd_6_61: 56; /* */
+ unsigned long push : 1; /* WP */
+ unsigned long init : 1; /* WP */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
+/* ========================================================================= */
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0
+
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
+
+union uvh_lb_bau_sb_activation_status_0_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_status_0_s {
+ unsigned long status : 64; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
+/* ========================================================================= */
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8
+
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
+
+union uvh_lb_bau_sb_activation_status_1_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_status_1_s {
+ unsigned long status : 64; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
+/* ========================================================================= */
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0
+
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
+
+union uvh_lb_bau_sb_descriptor_base_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_descriptor_base_s {
+ unsigned long rsvd_0_11 : 12; /* */
+ unsigned long page_address : 31; /* RW */
+ unsigned long rsvd_43_48 : 6; /* */
+ unsigned long node_id : 14; /* RW */
+ unsigned long rsvd_63 : 1; /* */
+ } s;
+};
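+
+/*
+ * Sketch of composing this register, pointing the BAU at a descriptor
+ * page by physical address and home node (illustrative only; the write
+ * helper is an assumed uv_hub.h-style accessor):
+ *
+ *	union uvh_lb_bau_sb_descriptor_base_u db = { .v = 0 };
+ *	db.s.page_address = paddr >> UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT;
+ *	db.s.node_id = node;
+ *	uv_write_local_mmr(UVH_LB_BAU_SB_DESCRIPTOR_BASE, db.v);
+ */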
+
+/* ========================================================================= */
+/* UVH_LB_MCAST_AOERR0_RPT_ENABLE */
+/* ========================================================================= */
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL
+
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL
+
+union uvh_lb_mcast_aoerr0_rpt_enable_u {
+ unsigned long v;
+ struct uvh_lb_mcast_aoerr0_rpt_enable_s {
+ unsigned long mcast_obese_msg : 1; /* RW */
+ unsigned long mcast_data_sb_err : 1; /* RW */
+ unsigned long mcast_nack_buff_parity : 1; /* RW */
+ unsigned long mcast_timeout : 1; /* RW */
+ unsigned long mcast_inactive_reply : 1; /* RW */
+ unsigned long mcast_upgrade_error : 1; /* RW */
+ unsigned long mcast_reg_count_underflow : 1; /* RW */
+ unsigned long mcast_rep_obese_msg : 1; /* RW */
+ unsigned long ucache_req_runt_msg : 1; /* RW */
+ unsigned long ucache_req_obese_msg : 1; /* RW */
+ unsigned long ucache_req_data_sb_err : 1; /* RW */
+ unsigned long ucache_rep_runt_msg : 1; /* RW */
+ unsigned long ucache_rep_obese_msg : 1; /* RW */
+ unsigned long ucache_rep_data_sb_err : 1; /* RW */
+ unsigned long ucache_rep_command_err : 1; /* RW */
+ unsigned long ucache_pend_timeout : 1; /* RW */
+ unsigned long macc_req_runt_msg : 1; /* RW */
+ unsigned long macc_req_obese_msg : 1; /* RW */
+ unsigned long macc_req_data_sb_err : 1; /* RW */
+ unsigned long macc_rep_runt_msg : 1; /* RW */
+ unsigned long macc_rep_obese_msg : 1; /* RW */
+ unsigned long macc_rep_data_sb_err : 1; /* RW */
+ unsigned long macc_amo_timeout : 1; /* RW */
+ unsigned long macc_put_timeout : 1; /* RW */
+ unsigned long macc_spurious_event : 1; /* RW */
+ unsigned long ioh_destination_table_parity : 1; /* RW */
+ unsigned long get_had_error_reply : 1; /* RW */
+ unsigned long get_timeout : 1; /* RW */
+ unsigned long lock_manager_had_error_reply : 1; /* RW */
+ unsigned long put_had_error_reply : 1; /* RW */
+ unsigned long put_timeout : 1; /* RW */
+ unsigned long sb_activation_overrun : 1; /* RW */
+ unsigned long completed_gb_activation_had_error_reply : 1; /* RW */
+ unsigned long completed_gb_activation_timeout : 1; /* RW */
+ unsigned long descriptor_buffer_0_parity : 1; /* RW */
+ unsigned long descriptor_buffer_1_parity : 1; /* RW */
+ unsigned long socket_destination_table_parity : 1; /* RW */
+ unsigned long bau_reply_payload_corruption : 1; /* RW */
+ unsigned long io_port_destination_table_parity : 1; /* RW */
+ unsigned long intd_soft_ack_timeout : 1; /* RW */
+ unsigned long int_rep_obese_msg : 1; /* RW */
+ unsigned long int_rep_command_err : 1; /* RW */
+ unsigned long int_timeout : 1; /* RW */
+ unsigned long rsvd_43_63 : 21; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LOCAL_INT0_CONFIG */
+/* ========================================================================= */
+#define UVH_LOCAL_INT0_CONFIG 0x61000UL
+
+#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8
+#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13
+#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15
+#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16
+#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_local_int0_config_u {
+ unsigned long v;
+ struct uvh_local_int0_config_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long dm : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long status : 1; /* RO */
+ unsigned long p : 1; /* RO */
+ unsigned long rsvd_14 : 1; /* */
+ unsigned long t : 1; /* RO */
+ unsigned long m : 1; /* RW */
+ unsigned long rsvd_17_31: 15; /* */
+ unsigned long apic_id : 32; /* RW */
+ } s;
+};
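+
+/*
+ * Sketch of programming this register to deliver vector 'vec' as a
+ * fixed interrupt (dm == 0) to one APIC id; illustrative only, with an
+ * assumed uv_write_local_mmr() helper:
+ *
+ *	union uvh_local_int0_config_u cfg = { .v = 0 };
+ *	cfg.s.vector_ = vec;
+ *	cfg.s.dm = 0;
+ *	cfg.s.apic_id = apicid;
+ *	uv_write_local_mmr(UVH_LOCAL_INT0_CONFIG, cfg.v);
+ */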
+
+/* ========================================================================= */
+/* UVH_LOCAL_INT0_ENABLE */
+/* ========================================================================= */
+#define UVH_LOCAL_INT0_ENABLE 0x65000UL
+
+#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0
+#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL
+#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1
+#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL
+#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2
+#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL
+#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3
+#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL
+#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4
+#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL
+#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5
+#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL
+#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6
+#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL
+#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21
+#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL
+#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22
+#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL
+#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39
+#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL
+#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40
+#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL
+#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41
+#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL
+#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42
+#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43
+#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44
+#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
+
+union uvh_local_int0_enable_u {
+ unsigned long v;
+ struct uvh_local_int0_enable_s {
+ unsigned long lb_hcerr : 1; /* RW */
+ unsigned long gr0_hcerr : 1; /* RW */
+ unsigned long gr1_hcerr : 1; /* RW */
+ unsigned long lh_hcerr : 1; /* RW */
+ unsigned long rh_hcerr : 1; /* RW */
+ unsigned long xn_hcerr : 1; /* RW */
+ unsigned long si_hcerr : 1; /* RW */
+ unsigned long lb_aoerr0 : 1; /* RW */
+ unsigned long gr0_aoerr0 : 1; /* RW */
+ unsigned long gr1_aoerr0 : 1; /* RW */
+ unsigned long lh_aoerr0 : 1; /* RW */
+ unsigned long rh_aoerr0 : 1; /* RW */
+ unsigned long xn_aoerr0 : 1; /* RW */
+ unsigned long si_aoerr0 : 1; /* RW */
+ unsigned long lb_aoerr1 : 1; /* RW */
+ unsigned long gr0_aoerr1 : 1; /* RW */
+ unsigned long gr1_aoerr1 : 1; /* RW */
+ unsigned long lh_aoerr1 : 1; /* RW */
+ unsigned long rh_aoerr1 : 1; /* RW */
+ unsigned long xn_aoerr1 : 1; /* RW */
+ unsigned long si_aoerr1 : 1; /* RW */
+ unsigned long rh_vpi_int : 1; /* RW */
+ unsigned long system_shutdown_int : 1; /* RW */
+ unsigned long lb_irq_int_0 : 1; /* RW */
+ unsigned long lb_irq_int_1 : 1; /* RW */
+ unsigned long lb_irq_int_2 : 1; /* RW */
+ unsigned long lb_irq_int_3 : 1; /* RW */
+ unsigned long lb_irq_int_4 : 1; /* RW */
+ unsigned long lb_irq_int_5 : 1; /* RW */
+ unsigned long lb_irq_int_6 : 1; /* RW */
+ unsigned long lb_irq_int_7 : 1; /* RW */
+ unsigned long lb_irq_int_8 : 1; /* RW */
+ unsigned long lb_irq_int_9 : 1; /* RW */
+ unsigned long lb_irq_int_10 : 1; /* RW */
+ unsigned long lb_irq_int_11 : 1; /* RW */
+ unsigned long lb_irq_int_12 : 1; /* RW */
+ unsigned long lb_irq_int_13 : 1; /* RW */
+ unsigned long lb_irq_int_14 : 1; /* RW */
+ unsigned long lb_irq_int_15 : 1; /* RW */
+ unsigned long l1_nmi_int : 1; /* RW */
+ unsigned long stop_clock : 1; /* RW */
+ unsigned long asic_to_l1 : 1; /* RW */
+ unsigned long l1_to_asic : 1; /* RW */
+ unsigned long ltc_int : 1; /* RW */
+ unsigned long la_seq_trigger : 1; /* RW */
+ unsigned long rsvd_45_63 : 19; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_NODE_ID */
+/* ========================================================================= */
+#define UVH_NODE_ID 0x0UL
+
+#define UVH_NODE_ID_FORCE1_SHFT 0
+#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UVH_NODE_ID_MANUFACTURER_SHFT 1
+#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UVH_NODE_ID_PART_NUMBER_SHFT 12
+#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UVH_NODE_ID_REVISION_SHFT 28
+#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UVH_NODE_ID_NODE_ID_SHFT 32
+#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48
+#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
+#define UVH_NODE_ID_NI_PORT_SHFT 56
+#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
+
+union uvh_node_id_u {
+ unsigned long v;
+ struct uvh_node_id_s {
+ unsigned long force1 : 1; /* RO */
+ unsigned long manufacturer : 11; /* RO */
+ unsigned long part_number : 16; /* RO */
+ unsigned long revision : 4; /* RO */
+ unsigned long node_id : 15; /* RW */
+ unsigned long rsvd_47 : 1; /* */
+ unsigned long nodes_per_bit : 7; /* RW */
+ unsigned long rsvd_55 : 1; /* */
+ unsigned long ni_port : 4; /* RO */
+ unsigned long rsvd_60_63 : 4; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_NODE_PRESENT_TABLE */
+/* ========================================================================= */
+#define UVH_NODE_PRESENT_TABLE 0x1400UL
+#define UVH_NODE_PRESENT_TABLE_DEPTH 16
+
+#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
+#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
+
+union uvh_node_present_table_u {
+ unsigned long v;
+ struct uvh_node_present_table_s {
+ unsigned long nodes : 64; /* RW */
+ } s;
+};
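+
+/*
+ * The table is UVH_NODE_PRESENT_TABLE_DEPTH consecutive 64-bit MMRs,
+ * one bit per node.  Sketch of testing whether node 'n' is present,
+ * assuming a uv_read_local_mmr()-style helper:
+ *
+ *	unsigned long w = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE +
+ *					    (n / 64) * sizeof(unsigned long));
+ *	int present = (w >> (n % 64)) & 1;
+ */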
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
+ unsigned long rsvd_0_23 : 24; /* */
+ unsigned long dest_base : 22; /* RW */
+ unsigned long rsvd_46_63: 18; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
+ unsigned long rsvd_0_23 : 24; /* */
+ unsigned long dest_base : 22; /* RW */
+ unsigned long rsvd_46_63: 18; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
+ unsigned long rsvd_0_23 : 24; /* */
+ unsigned long dest_base : 22; /* RW */
+ unsigned long rsvd_46_63: 18; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL
+
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_cfg_overlay_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_cfg_overlay_config_mmr_s {
+ unsigned long rsvd_0_25: 26; /* */
+ unsigned long base : 20; /* RW */
+ unsigned long rsvd_46_62: 17; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_gru_overlay_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27: 28; /* */
+ unsigned long base : 18; /* RW */
+ unsigned long rsvd_46_47: 2; /* */
+ unsigned long gr4 : 1; /* RW */
+ unsigned long rsvd_49_51: 3; /* */
+ unsigned long n_gru : 4; /* RW */
+ unsigned long rsvd_56_62: 7; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_mmioh_overlay_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_mmioh_overlay_config_mmr_s {
+ unsigned long rsvd_0_29: 30; /* */
+ unsigned long base : 16; /* RW */
+ unsigned long m_io : 6; /* RW */
+ unsigned long n_io : 4; /* RW */
+ unsigned long rsvd_56_62: 7; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_mmr_overlay_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25: 26; /* */
+ unsigned long base : 20; /* RW */
+ unsigned long dual_hub : 1; /* RW */
+ unsigned long rsvd_47_62: 16; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC */
+/* ========================================================================= */
+#define UVH_RTC 0x340000UL
+
+#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
+#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
+
+union uvh_rtc_u {
+ unsigned long v;
+ struct uvh_rtc_s {
+ unsigned long real_time_clock : 56; /* RW */
+ unsigned long rsvd_56_63 : 8; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC1_INT_CONFIG */
+/* ========================================================================= */
+#define UVH_RTC1_INT_CONFIG 0x615c0UL
+
+#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
+#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
+#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
+#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
+#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_RTC1_INT_CONFIG_P_SHFT 13
+#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_RTC1_INT_CONFIG_T_SHFT 15
+#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_RTC1_INT_CONFIG_M_SHFT 16
+#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
+#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_rtc1_int_config_u {
+ unsigned long v;
+ struct uvh_rtc1_int_config_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long dm : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long status : 1; /* RO */
+ unsigned long p : 1; /* RO */
+ unsigned long rsvd_14 : 1; /* */
+ unsigned long t : 1; /* RO */
+ unsigned long m : 1; /* RW */
+ unsigned long rsvd_17_31: 15; /* */
+ unsigned long apic_id : 32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC2_INT_CONFIG */
+/* ========================================================================= */
+#define UVH_RTC2_INT_CONFIG 0x61600UL
+
+#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
+#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
+#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
+#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
+#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_RTC2_INT_CONFIG_P_SHFT 13
+#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_RTC2_INT_CONFIG_T_SHFT 15
+#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_RTC2_INT_CONFIG_M_SHFT 16
+#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
+#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_rtc2_int_config_u {
+ unsigned long v;
+ struct uvh_rtc2_int_config_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long dm : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long status : 1; /* RO */
+ unsigned long p : 1; /* RO */
+ unsigned long rsvd_14 : 1; /* */
+ unsigned long t : 1; /* RO */
+ unsigned long m : 1; /* RW */
+ unsigned long rsvd_17_31: 15; /* */
+ unsigned long apic_id : 32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC3_INT_CONFIG */
+/* ========================================================================= */
+#define UVH_RTC3_INT_CONFIG 0x61640UL
+
+#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
+#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
+#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
+#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
+#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_RTC3_INT_CONFIG_P_SHFT 13
+#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_RTC3_INT_CONFIG_T_SHFT 15
+#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_RTC3_INT_CONFIG_M_SHFT 16
+#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
+#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_rtc3_int_config_u {
+ unsigned long v;
+ struct uvh_rtc3_int_config_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long dm : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long status : 1; /* RO */
+ unsigned long p : 1; /* RO */
+ unsigned long rsvd_14 : 1; /* */
+ unsigned long t : 1; /* RO */
+ unsigned long m : 1; /* RW */
+ unsigned long rsvd_17_31: 15; /* */
+ unsigned long apic_id : 32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC_INC_RATIO */
+/* ========================================================================= */
+#define UVH_RTC_INC_RATIO 0x350000UL
+
+#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
+#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
+#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
+#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
+
+union uvh_rtc_inc_ratio_u {
+ unsigned long v;
+ struct uvh_rtc_inc_ratio_s {
+ unsigned long fraction : 20; /* RW */
+ unsigned long ratio : 3; /* RW */
+ unsigned long rsvd_23_63: 41; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SI_ADDR_MAP_CONFIG */
+/* ========================================================================= */
+#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
+
+#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0
+#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
+#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
+#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
+
+union uvh_si_addr_map_config_u {
+ unsigned long v;
+ struct uvh_si_addr_map_config_s {
+ unsigned long m_skt : 6; /* RW */
+ unsigned long rsvd_6_7: 2; /* */
+ unsigned long n_skt : 4; /* RW */
+ unsigned long rsvd_12_63: 52; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SI_ALIAS0_OVERLAY_CONFIG */
+/* ========================================================================= */
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
+
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_si_alias0_overlay_config_u {
+ unsigned long v;
+ struct uvh_si_alias0_overlay_config_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SI_ALIAS1_OVERLAY_CONFIG */
+/* ========================================================================= */
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
+
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_si_alias1_overlay_config_u {
+ unsigned long v;
+ struct uvh_si_alias1_overlay_config_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SI_ALIAS2_OVERLAY_CONFIG */
+/* ========================================================================= */
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
+
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_si_alias2_overlay_config_u {
+ unsigned long v;
+ struct uvh_si_alias2_overlay_config_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+
+#endif /* _ASM_X86_UV_UV_MMRS_H */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
new file mode 100644
index 00000000000..9064052b73d
--- /dev/null
+++ b/arch/x86/include/asm/vdso.h
@@ -0,0 +1,47 @@
+#ifndef _ASM_X86_VDSO_H
+#define _ASM_X86_VDSO_H
+
+#ifdef CONFIG_X86_64
+extern const char VDSO64_PRELINK[];
+
+/*
+ * Given a pointer to the vDSO image, find the pointer to VDSO64_name
+ * as that symbol is defined in the vDSO sources or linker script.
+ */
+#define VDSO64_SYMBOL(base, name) \
+({ \
+ extern const char VDSO64_##name[]; \
+ (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
+})
+#endif
+
+#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
+extern const char VDSO32_PRELINK[];
+
+/*
+ * Given a pointer to the vDSO image, find the pointer to VDSO32_name
+ * as that symbol is defined in the vDSO sources or linker script.
+ */
+#define VDSO32_SYMBOL(base, name) \
+({ \
+ extern const char VDSO32_##name[]; \
+ (void *)(VDSO32_##name - VDSO32_PRELINK + (unsigned long)(base)); \
+})
+#endif
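+
+/*
+ * Typical use, for example resolving the sigreturn trampoline inside
+ * the current process's vDSO mapping (as the ia32 signal code does):
+ *
+ *	void *restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
+ */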
+
+/*
+ * These symbols are defined with the addresses in the vsyscall page.
+ * See vsyscall-sigreturn.S.
+ */
+extern void __user __kernel_sigreturn;
+extern void __user __kernel_rt_sigreturn;
+
+/*
+ * These symbols are defined by vdso32.S to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vdso32_int80_start, vdso32_int80_end;
+extern const char vdso32_syscall_start, vdso32_syscall_end;
+extern const char vdso32_sysenter_start, vdso32_sysenter_end;
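+
+/*
+ * e.g. the int80 image occupies (vdso32_int80_end - vdso32_int80_start)
+ * bytes; the vdso32 setup code picks one of the three images and copies
+ * that many bytes into the vDSO page.
+ */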
+
+#endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
new file mode 100644
index 00000000000..c4b9dc2f67c
--- /dev/null
+++ b/arch/x86/include/asm/vga.h
@@ -0,0 +1,20 @@
+/*
+ * Access to VGA videoram
+ *
+ * (c) 1998 Martin Mares <mj@ucw.cz>
+ */
+
+#ifndef _ASM_X86_VGA_H
+#define _ASM_X86_VGA_H
+
+/*
+ * On the PC, we can just recalculate addresses and then
+ * access the videoram directly without any black magic.
+ */
+
+#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x)
+
+#define vga_readb(x) (*(x))
+#define vga_writeb(x, y) (*(y) = (x))
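+
+/*
+ * Example (illustrative): put a character into the first cell of color
+ * text memory once it has been mapped:
+ *
+ *	char *vram = (char *)VGA_MAP_MEM(0xb8000, 0x8000);
+ *	vga_writeb('A', vram);
+ */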
+
+#endif /* _ASM_X86_VGA_H */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
new file mode 100644
index 00000000000..dc27a69e5d2
--- /dev/null
+++ b/arch/x86/include/asm/vgtod.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_VGTOD_H
+#define _ASM_X86_VGTOD_H
+
+#include <asm/vsyscall.h>
+#include <linux/clocksource.h>
+
+struct vsyscall_gtod_data {
+ seqlock_t lock;
+
+ /* open coded 'struct timespec' */
+ time_t wall_time_sec;
+ u32 wall_time_nsec;
+
+ int sysctl_enabled;
+ struct timezone sys_tz;
+ struct { /* extract of a clocksource struct */
+ cycle_t (*vread)(void);
+ cycle_t cycle_last;
+ cycle_t mask;
+ u32 mult;
+ u32 shift;
+ } clock;
+ struct timespec wall_to_monotonic;
+};
+extern struct vsyscall_gtod_data __vsyscall_gtod_data
+__section_vsyscall_gtod_data;
+extern struct vsyscall_gtod_data vsyscall_gtod_data;
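+
+/*
+ * Readers in the vsyscall/vDSO path use the usual seqlock retry loop,
+ * sketched here (not the exact kernel code):
+ *
+ *	unsigned long seq;
+ *	do {
+ *		seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+ *		sec = __vsyscall_gtod_data.wall_time_sec;
+ *		nsec = __vsyscall_gtod_data.wall_time_nsec;
+ *	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+ */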
+
+#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vic.h b/arch/x86/include/asm/vic.h
new file mode 100644
index 00000000000..53100f35361
--- /dev/null
+++ b/arch/x86/include/asm/vic.h
@@ -0,0 +1,61 @@
+/* Copyright (C) 1999,2001
+ *
+ * Author: J.E.J.Bottomley@HansenPartnership.com
+ *
+ * Standard include definitions for the NCR Voyager Interrupt Controller */
+
+/* The eight CPI vectors. To activate a CPI, you write a bit mask
+ * corresponding to the processor set to be interrupted into the
+ * relevant register. That set of CPUs will then be interrupted with
+ * the CPI */
+static const int VIC_CPI_Registers[] =
+ {0xFC00, 0xFC01, 0xFC08, 0xFC09,
+ 0xFC10, 0xFC11, 0xFC18, 0xFC19 };
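+
+/* A sketch of the send side, per the comment above (illustrative):
+ *
+ *	outb((unsigned char)cpu_mask, VIC_CPI_Registers[cpi]);
+ */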
+
+#define VIC_PROC_WHO_AM_I 0xfc29
+# define QUAD_IDENTIFIER 0xC0
+# define EIGHT_SLOT_IDENTIFIER 0xE0
+#define QIC_EXTENDED_PROCESSOR_SELECT 0xFC72
+#define VIC_CPI_BASE_REGISTER 0xFC41
+#define VIC_PROCESSOR_ID 0xFC21
+# define VIC_CPU_MASQUERADE_ENABLE 0x8
+
+#define VIC_CLAIM_REGISTER_0 0xFC38
+#define VIC_CLAIM_REGISTER_1 0xFC39
+#define VIC_REDIRECT_REGISTER_0 0xFC60
+#define VIC_REDIRECT_REGISTER_1 0xFC61
+#define VIC_PRIORITY_REGISTER 0xFC20
+
+#define VIC_PRIMARY_MC_BASE 0xFC48
+#define VIC_SECONDARY_MC_BASE 0xFC49
+
+#define QIC_PROCESSOR_ID 0xFC71
+# define QIC_CPUID_ENABLE 0x08
+
+#define QIC_VIC_CPI_BASE_REGISTER 0xFC79
+#define QIC_CPI_BASE_REGISTER 0xFC7A
+
+#define QIC_MASK_REGISTER0 0xFC80
+/* NOTE: these are masked high, enabled low */
+# define QIC_PERF_TIMER 0x01
+# define QIC_LPE 0x02
+# define QIC_SYS_INT 0x04
+# define QIC_CMN_INT 0x08
+/* at the moment, just enable CMN_INT, disable SYS_INT */
+# define QIC_DEFAULT_MASK0	(~(QIC_CMN_INT /* | QIC_SYS_INT */))
+#define QIC_MASK_REGISTER1 0xFC81
+# define QIC_BOOT_CPI_MASK 0xFE
+/* Enable CPI's 1-6 inclusive */
+# define QIC_CPI_ENABLE 0x81
+
+#define QIC_INTERRUPT_CLEAR0 0xFC8A
+#define QIC_INTERRUPT_CLEAR1 0xFC8B
+
+/* this is where we place the CPI vectors */
+#define VIC_DEFAULT_CPI_BASE 0xC0
+/* this is where we place the QIC CPI vectors */
+#define QIC_DEFAULT_CPI_BASE 0xD0
+
+#define VIC_BOOT_INTERRUPT_MASK 0xfe
+
+extern void smp_vic_timer_interrupt(void);
diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h
new file mode 100644
index 00000000000..166adf61e77
--- /dev/null
+++ b/arch/x86/include/asm/visws/cobalt.h
@@ -0,0 +1,125 @@
+#ifndef _ASM_X86_VISWS_COBALT_H
+#define _ASM_X86_VISWS_COBALT_H
+
+#include <asm/fixmap.h>
+
+/*
+ * Cobalt SGI Visual Workstation system ASIC
+ */
+
+#define CO_CPU_NUM_PHYS 0x1e00
+#define CO_CPU_TAB_PHYS (CO_CPU_NUM_PHYS + 2)
+
+#define CO_CPU_MAX 4
+
+#define CO_CPU_PHYS 0xc2000000
+#define CO_APIC_PHYS 0xc4000000
+
+/* see set_fixmap() and asm/fixmap.h */
+#define CO_CPU_VADDR (fix_to_virt(FIX_CO_CPU))
+#define CO_APIC_VADDR (fix_to_virt(FIX_CO_APIC))
+
+/* Cobalt CPU registers -- relative to CO_CPU_VADDR, use co_cpu_*() */
+#define CO_CPU_REV 0x08
+#define CO_CPU_CTRL 0x10
+#define CO_CPU_STAT 0x20
+#define CO_CPU_TIMEVAL 0x30
+
+/* CO_CPU_CTRL bits */
+#define CO_CTRL_TIMERUN 0x04 /* 0 == disabled */
+#define CO_CTRL_TIMEMASK 0x08 /* 0 == unmasked */
+
+/* CO_CPU_STAT bits */
+#define CO_STAT_TIMEINTR 0x02 /* (r) 1 == int pend, (w) 0 == clear */
+
+/* CO_CPU_TIMEVAL value */
+#define CO_TIME_HZ 100000000 /* Cobalt core rate */
+
+/* Cobalt APIC registers -- relative to CO_APIC_VADDR, use co_apic_*() */
+#define CO_APIC_HI(n) (((n) * 0x10) + 4)
+#define CO_APIC_LO(n) ((n) * 0x10)
+#define CO_APIC_ID 0x0ffc
+
+/* CO_APIC_ID bits */
+#define CO_APIC_ENABLE 0x00000100
+
+/* CO_APIC_LO bits */
+#define CO_APIC_MASK 0x00010000 /* 0 = enabled */
+#define CO_APIC_LEVEL 0x00008000 /* 0 = edge */
+
+/*
+ * Where things are physically wired to Cobalt
+ * #defines with no board _<type>_<rev>_ are common to all (thus far)
+ */
+#define CO_APIC_IDE0 4
+#define CO_APIC_IDE1 2 /* Only on 320 */
+
+#define	CO_APIC_8259		12	/* serial, floppy, parallel */
+
+/* Lithium PCI Bridge A -- "the one with 82557 Ethernet" */
+#define CO_APIC_PCIA_BASE0 0 /* and 1 */ /* slot 0, line 0 */
+#define CO_APIC_PCIA_BASE123 5 /* and 6 */ /* slot 0, line 1 */
+
+#define CO_APIC_PIIX4_USB 7 /* this one is weird */
+
+/* Lithium PCI Bridge B -- "the one with PIIX4" */
+#define CO_APIC_PCIB_BASE0	8 /* and 9-12 */	/* slot 0, line 0 */
+#define CO_APIC_PCIB_BASE123	13 /* and 14-15 */	/* slot 0, line 1 */
+
+#define CO_APIC_VIDOUT0 16
+#define CO_APIC_VIDOUT1 17
+#define CO_APIC_VIDIN0 18
+#define CO_APIC_VIDIN1 19
+
+#define CO_APIC_LI_AUDIO 22
+
+#define CO_APIC_AS 24
+#define CO_APIC_RE 25
+
+#define CO_APIC_CPU 28 /* Timer and Cache interrupt */
+#define CO_APIC_NMI 29
+#define CO_APIC_LAST CO_APIC_NMI
+
+/*
+ * This is how irqs are assigned on the Visual Workstation.
+ * Legacy devices get irqs 1-15 (system clock is 0 and is CO_APIC_CPU).
+ * All other devices (including PCI) go to Cobalt and are irqs 16 and up.
+ */
+#define CO_IRQ_APIC0 16 /* irq of apic entry 0 */
+#define IS_CO_APIC(irq) ((irq) >= CO_IRQ_APIC0)
+#define CO_IRQ(apic) (CO_IRQ_APIC0 + (apic)) /* apic ent to irq */
+#define CO_APIC(irq) ((irq) - CO_IRQ_APIC0) /* irq to apic ent */
+#define CO_IRQ_IDE0 14 /* knowledge of... */
+#define CO_IRQ_IDE1 15 /* ... ide driver defaults! */
+#define CO_IRQ_8259 CO_IRQ(CO_APIC_8259)
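+
+/*
+ * Worked example: Cobalt APIC entry 4 (CO_APIC_IDE0) shows up as
+ * CO_IRQ(CO_APIC_IDE0) == 16 + 4 == irq 20, and CO_APIC(20) maps the
+ * irq back to entry 4.
+ */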
+
+#ifdef CONFIG_X86_VISWS_APIC
+static inline void co_cpu_write(unsigned long reg, unsigned long v)
+{
+ *((volatile unsigned long *)(CO_CPU_VADDR+reg))=v;
+}
+
+static inline unsigned long co_cpu_read(unsigned long reg)
+{
+ return *((volatile unsigned long *)(CO_CPU_VADDR+reg));
+}
+
+static inline void co_apic_write(unsigned long reg, unsigned long v)
+{
+ *((volatile unsigned long *)(CO_APIC_VADDR+reg))=v;
+}
+
+static inline unsigned long co_apic_read(unsigned long reg)
+{
+ return *((volatile unsigned long *)(CO_APIC_VADDR+reg));
+}
+#endif
+
+extern char visws_board_type;
+
+#define VISWS_320 0
+#define VISWS_540 1
+
+extern char visws_board_rev;
+
+#endif /* _ASM_X86_VISWS_COBALT_H */
diff --git a/arch/x86/include/asm/visws/lithium.h b/arch/x86/include/asm/visws/lithium.h
new file mode 100644
index 00000000000..a10d89bc127
--- /dev/null
+++ b/arch/x86/include/asm/visws/lithium.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_X86_VISWS_LITHIUM_H
+#define _ASM_X86_VISWS_LITHIUM_H
+
+#include <asm/fixmap.h>
+
+/*
+ * Lithium is the SGI Visual Workstation I/O ASIC
+ */
+
+#define LI_PCI_A_PHYS 0xfc000000 /* Enet is dev 3 */
+#define LI_PCI_B_PHYS 0xfd000000 /* PIIX4 is here */
+
+/* see set_fixmap() and asm/fixmap.h */
+#define LI_PCIA_VADDR (fix_to_virt(FIX_LI_PCIA))
+#define LI_PCIB_VADDR (fix_to_virt(FIX_LI_PCIB))
+
+/* Not a standard PCI? (not in linux/pci.h) */
+#define LI_PCI_BUSNUM 0x44 /* lo8: primary, hi8: sub */
+#define LI_PCI_INTEN 0x46
+
+/* LI_PCI_INTEN bits */
+#define LI_INTA_0 0x0001
+#define LI_INTA_1 0x0002
+#define LI_INTA_2 0x0004
+#define LI_INTA_3 0x0008
+#define LI_INTA_4 0x0010
+#define LI_INTB 0x0020
+#define LI_INTC 0x0040
+#define LI_INTD 0x0080
+
+/* More special purpose macros... */
+static inline void li_pcia_write16(unsigned long reg, unsigned short v)
+{
+ *((volatile unsigned short *)(LI_PCIA_VADDR+reg))=v;
+}
+
+static inline unsigned short li_pcia_read16(unsigned long reg)
+{
+ return *((volatile unsigned short *)(LI_PCIA_VADDR+reg));
+}
+
+static inline void li_pcib_write16(unsigned long reg, unsigned short v)
+{
+ *((volatile unsigned short *)(LI_PCIB_VADDR+reg))=v;
+}
+
+static inline unsigned short li_pcib_read16(unsigned long reg)
+{
+ return *((volatile unsigned short *)(LI_PCIB_VADDR+reg));
+}
+
+#endif /* _ASM_X86_VISWS_LITHIUM_H */
+
diff --git a/arch/x86/include/asm/visws/piix4.h b/arch/x86/include/asm/visws/piix4.h
new file mode 100644
index 00000000000..d0af4d338e7
--- /dev/null
+++ b/arch/x86/include/asm/visws/piix4.h
@@ -0,0 +1,107 @@
+#ifndef _ASM_X86_VISWS_PIIX4_H
+#define _ASM_X86_VISWS_PIIX4_H
+
+/*
+ * PIIX4 as used on SGI Visual Workstations
+ */
+
+#define PIIX_PM_START 0x0F80
+
+#define SIO_GPIO_START 0x0FC0
+
+#define SIO_PM_START 0x0FC8
+
+#define PMBASE PIIX_PM_START
+#define GPIREG0 (PMBASE+0x30)
+#define GPIREG(x) (GPIREG0+((x)/8))
+#define GPIBIT(x) (1 << ((x)%8))
+
+#define PIIX_GPI_BD_ID1 18
+#define PIIX_GPI_BD_ID2 19
+#define PIIX_GPI_BD_ID3 20
+#define PIIX_GPI_BD_ID4 21
+#define PIIX_GPI_BD_REG GPIREG(PIIX_GPI_BD_ID1)
+#define PIIX_GPI_BD_MASK (GPIBIT(PIIX_GPI_BD_ID1) | \
+ GPIBIT(PIIX_GPI_BD_ID2) | \
+ GPIBIT(PIIX_GPI_BD_ID3) | \
+ GPIBIT(PIIX_GPI_BD_ID4) )
+
+#define PIIX_GPI_BD_SHIFT (PIIX_GPI_BD_ID1 % 8)
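+
+/*
+ * Worked example: board-ID line 18 lives at GPIREG(18) == GPIREG0 + 2
+ * and is tested with GPIBIT(18) == (1 << 2); PIIX_GPI_BD_REG and
+ * PIIX_GPI_BD_MASK above are built from exactly these macros.
+ */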
+
+#define SIO_INDEX 0x2e
+#define SIO_DATA 0x2f
+
+#define SIO_DEV_SEL 0x7
+#define SIO_DEV_ENB 0x30
+#define SIO_DEV_MSB 0x60
+#define SIO_DEV_LSB 0x61
+
+#define SIO_GP_DEV 0x7
+
+#define SIO_GP_BASE SIO_GPIO_START
+#define SIO_GP_MSB (SIO_GP_BASE>>8)
+#define SIO_GP_LSB (SIO_GP_BASE&0xff)
+
+#define SIO_GP_DATA1 (SIO_GP_BASE+0)
+
+#define SIO_PM_DEV 0x8
+
+#define SIO_PM_BASE SIO_PM_START
+#define SIO_PM_MSB (SIO_PM_BASE>>8)
+#define SIO_PM_LSB (SIO_PM_BASE&0xff)
+#define SIO_PM_INDEX (SIO_PM_BASE+0)
+#define SIO_PM_DATA (SIO_PM_BASE+1)
+
+#define SIO_PM_FER2 0x1
+
+#define SIO_PM_GP_EN 0x80
+
+
+
+/*
+ * This is the dev/reg where generating a config cycle will
+ * result in a PCI special cycle.
+ */
+#define SPECIAL_DEV 0xff
+#define SPECIAL_REG 0x00
+
+/*
+ * PIIX4 needs to see a special cycle with the following data
+ * to be convinced the processor has gone into the stop grant
+ * state. PIIX4 insists on seeing this before it will power
+ * down a system.
+ */
+#define PIIX_SPECIAL_STOP 0x00120002
+
+#define PIIX4_RESET_PORT 0xcf9
+#define PIIX4_RESET_VAL 0x6
+
+#define PMSTS_PORT 0xf80 // 2 bytes PM Status
+#define PMEN_PORT 0xf82 // 2 bytes PM Enable
+#define PMCNTRL_PORT 0xf84 // 2 bytes PM Control
+
+#define PM_SUSPEND_ENABLE 0x2000 // start sequence to suspend state
+
+/*
+ * PMSTS and PMEN I/O bit definitions.
+ * (Bits are the same in both registers)
+ */
+#define PM_STS_RSM (1<<15) // Resume Status
+#define PM_STS_PWRBTNOR (1<<11) // Power Button Override
+#define PM_STS_RTC (1<<10) // RTC status
+#define PM_STS_PWRBTN (1<<8) // Power Button Pressed?
+#define PM_STS_GBL (1<<5) // Global Status
+#define PM_STS_BM (1<<4) // Bus Master Status
+#define PM_STS_TMROF (1<<0) // Timer Overflow Status.
+
+/*
+ * Stop clock GPI register
+ */
+#define PIIX_GPIREG0 (0xf80 + 0x30)
+
+/*
+ * Stop clock GPI bit in GPIREG0
+ */
+#define PIIX_GPI_STPCLK 0x4 // STPCLK signal routed back in
+
+#endif /* _ASM_X86_VISWS_PIIX4_H */
diff --git a/arch/x86/include/asm/visws/sgivw.h b/arch/x86/include/asm/visws/sgivw.h
new file mode 100644
index 00000000000..5fbf63e1003
--- /dev/null
+++ b/arch/x86/include/asm/visws/sgivw.h
@@ -0,0 +1,5 @@
+/*
+ * Frame buffer position and size:
+ */
+extern unsigned long sgivwfb_mem_phys;
+extern unsigned long sgivwfb_mem_size;
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
new file mode 100644
index 00000000000..f9303602fbc
--- /dev/null
+++ b/arch/x86/include/asm/vm86.h
@@ -0,0 +1,208 @@
+#ifndef _ASM_X86_VM86_H
+#define _ASM_X86_VM86_H
+
+/*
+ * I'm guessing at the VIF/VIP flag usage, but hope that this is how
+ * the Pentium uses them. Linux will return from vm86 mode when both
+ * VIF and VIP are set.
+ *
+ * On a Pentium, we could probably optimize the virtual flags directly
+ * in the eflags register instead of doing it "by hand" in vflags...
+ *
+ * Linus
+ */
+
+#include <asm/processor-flags.h>
+
+#define BIOSSEG 0x0f000
+
+#define CPU_086 0
+#define CPU_186 1
+#define CPU_286 2
+#define CPU_386 3
+#define CPU_486 4
+#define CPU_586 5
+
+/*
+ * Return values for the 'vm86()' system call
+ */
+#define VM86_TYPE(retval) ((retval) & 0xff)
+#define VM86_ARG(retval) ((retval) >> 8)
+
+#define VM86_SIGNAL 0 /* return due to signal */
+#define VM86_UNKNOWN 1 /* unhandled GP fault
+ - IO-instruction or similar */
+#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */
+#define VM86_STI 3 /* sti/popf/iret instruction enabled
+ virtual interrupts */
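+
+/*
+ * Example of decoding a vm86() result 'ret'; emulate_int() here is a
+ * hypothetical handler in the monitor:
+ *
+ *	if (VM86_TYPE(ret) == VM86_INTx)
+ *		emulate_int(VM86_ARG(ret));	(software interrupt number)
+ */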
+
+/*
+ * Additional return values when invoking new vm86()
+ */
+#define VM86_PICRETURN 4 /* return due to pending PIC request */
+#define VM86_TRAP 6 /* return due to DOS-debugger request */
+
+/*
+ * function codes when invoking new vm86()
+ */
+#define VM86_PLUS_INSTALL_CHECK 0
+#define VM86_ENTER 1
+#define VM86_ENTER_NO_BYPASS 2
+#define VM86_REQUEST_IRQ 3
+#define VM86_FREE_IRQ 4
+#define VM86_GET_IRQ_BITS 5
+#define VM86_GET_AND_RESET_IRQ 6
+
+/*
+ * This is the stack-layout seen by the user space program when we have
+ * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout
+ * is 'kernel_vm86_regs' (see below).
+ */
+
+struct vm86_regs {
+/*
+ * normal regs, with special meaning for the segment descriptors..
+ */
+ long ebx;
+ long ecx;
+ long edx;
+ long esi;
+ long edi;
+ long ebp;
+ long eax;
+ long __null_ds;
+ long __null_es;
+ long __null_fs;
+ long __null_gs;
+ long orig_eax;
+ long eip;
+ unsigned short cs, __csh;
+ long eflags;
+ long esp;
+ unsigned short ss, __ssh;
+/*
+ * these are specific to v86 mode:
+ */
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+};
+
+struct revectored_struct {
+ unsigned long __map[8]; /* 256 bits */
+};
+
+struct vm86_struct {
+ struct vm86_regs regs;
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+};
+
+/*
+ * flags masks
+ */
+#define VM86_SCREEN_BITMAP 0x0001
+
+struct vm86plus_info_struct {
+ unsigned long force_return_for_pic:1;
+ unsigned long vm86dbg_active:1; /* for debugger */
+ unsigned long vm86dbg_TFpendig:1; /* for debugger */
+ unsigned long unused:28;
+ unsigned long is_vm86pus:1; /* for vm86 internal use */
+ unsigned char vm86dbg_intxxtab[32]; /* for debugger */
+};
+struct vm86plus_struct {
+ struct vm86_regs regs;
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+ struct vm86plus_info_struct vm86plus;
+};
+
+#ifdef __KERNEL__
+
+#include <asm/ptrace.h>
+
+/*
+ * This is the (kernel) stack-layout when we have done a "SAVE_ALL" from vm86
+ * mode - the main change is that the old segment descriptors aren't
+ * useful any more and are forced to be zero by the kernel (and the
+ * hardware when a trap occurs), and the real segment descriptors are
+ * at the end of the structure. Look at ptrace.h to see the "normal"
+ * setup. For user space layout see 'struct vm86_regs' above.
+ */
+
+struct kernel_vm86_regs {
+/*
+ * normal regs, with special meaning for the segment descriptors..
+ */
+ struct pt_regs pt;
+/*
+ * these are specific to v86 mode:
+ */
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+};
+
+struct kernel_vm86_struct {
+ struct kernel_vm86_regs regs;
+/*
+ * the below part remains on the kernel stack while we are in VM86 mode.
+ * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
+ * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
+ * 'struct kernel_vm86_regs' with the then actual values.
+ * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
+ * in kernelspace, hence we need not re-read the data from userspace.
+ */
+#define VM86_TSS_ESP0 flags
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+ struct vm86plus_info_struct vm86plus;
+ struct pt_regs *regs32; /* here we save the pointer to the old regs */
+/*
+ * The below is not part of the structure, but the stack layout continues
+ * this way. In front of 'return-eip' may be some data, depending on
+ * compilation, so we don't rely on this and save the pointer to 'oldregs'
+ * in 'regs32' above.
+ * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
+
+ long return-eip; from call to vm86()
+ struct pt_regs oldregs; user space registers as saved by syscall
+ */
+};
+
+#ifdef CONFIG_VM86
+
+void handle_vm86_fault(struct kernel_vm86_regs *, long);
+int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
+struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
+
+struct task_struct;
+void release_vm86_irqs(struct task_struct *);
+
+#else
+
+#define handle_vm86_fault(a, b)
+#define release_vm86_irqs(a)
+
+static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
+{
+ return 0;
+}
+
+#endif /* CONFIG_VM86 */
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_VM86_H */
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
new file mode 100644
index 00000000000..b7c0dea119f
--- /dev/null
+++ b/arch/x86/include/asm/vmi.h
@@ -0,0 +1,263 @@
+/*
+ * VMI interface definition
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Maintained by: Zachary Amsden zach@vmware.com
+ *
+ */
+#include <linux/types.h>
+
+/*
+ *---------------------------------------------------------------------
+ *
+ * VMI Option ROM API
+ *
+ *---------------------------------------------------------------------
+ */
+#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
+
+#define PCI_VENDOR_ID_VMWARE 0x15AD
+#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
+
+/*
+ * We use two version numbers for compatibility, with the major
+ * number signifying interface breakages, and the minor number
+ * interface extensions.
+ */
+#define VMI_API_REV_MAJOR 3
+#define VMI_API_REV_MINOR 0
+
+#define VMI_CALL_CPUID 0
+#define VMI_CALL_WRMSR 1
+#define VMI_CALL_RDMSR 2
+#define VMI_CALL_SetGDT 3
+#define VMI_CALL_SetLDT 4
+#define VMI_CALL_SetIDT 5
+#define VMI_CALL_SetTR 6
+#define VMI_CALL_GetGDT 7
+#define VMI_CALL_GetLDT 8
+#define VMI_CALL_GetIDT 9
+#define VMI_CALL_GetTR 10
+#define VMI_CALL_WriteGDTEntry 11
+#define VMI_CALL_WriteLDTEntry 12
+#define VMI_CALL_WriteIDTEntry 13
+#define VMI_CALL_UpdateKernelStack 14
+#define VMI_CALL_SetCR0 15
+#define VMI_CALL_SetCR2 16
+#define VMI_CALL_SetCR3 17
+#define VMI_CALL_SetCR4 18
+#define VMI_CALL_GetCR0 19
+#define VMI_CALL_GetCR2 20
+#define VMI_CALL_GetCR3 21
+#define VMI_CALL_GetCR4 22
+#define VMI_CALL_WBINVD 23
+#define VMI_CALL_SetDR 24
+#define VMI_CALL_GetDR 25
+#define VMI_CALL_RDPMC 26
+#define VMI_CALL_RDTSC 27
+#define VMI_CALL_CLTS 28
+#define VMI_CALL_EnableInterrupts 29
+#define VMI_CALL_DisableInterrupts 30
+#define VMI_CALL_GetInterruptMask 31
+#define VMI_CALL_SetInterruptMask 32
+#define VMI_CALL_IRET 33
+#define VMI_CALL_SYSEXIT 34
+#define VMI_CALL_Halt 35
+#define VMI_CALL_Reboot 36
+#define VMI_CALL_Shutdown 37
+#define VMI_CALL_SetPxE 38
+#define VMI_CALL_SetPxELong 39
+#define VMI_CALL_UpdatePxE 40
+#define VMI_CALL_UpdatePxELong 41
+#define VMI_CALL_MachineToPhysical 42
+#define VMI_CALL_PhysicalToMachine 43
+#define VMI_CALL_AllocatePage 44
+#define VMI_CALL_ReleasePage 45
+#define VMI_CALL_InvalPage 46
+#define VMI_CALL_FlushTLB 47
+#define VMI_CALL_SetLinearMapping 48
+
+#define VMI_CALL_SetIOPLMask 61
+#define VMI_CALL_SetInitialAPState 62
+#define VMI_CALL_APICWrite 63
+#define VMI_CALL_APICRead 64
+#define VMI_CALL_IODelay 65
+#define VMI_CALL_SetLazyMode 73
+
+/*
+ *---------------------------------------------------------------------
+ *
+ * MMU operation flags
+ *
+ *---------------------------------------------------------------------
+ */
+
+/* Flags used by VMI_{Allocate|Release}Page call */
+#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
+#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
+#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
+
+
+/* Flags shared by Allocate|Release Page and PTE updates */
+#define VMI_PAGE_PT 0x01
+#define VMI_PAGE_PD 0x02
+#define VMI_PAGE_PDP 0x04
+#define VMI_PAGE_PML4 0x08
+
+#define VMI_PAGE_NORMAL 0x00 /* for debugging */
+
+/* Flags used by PTE updates */
+#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
+#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
+#define VMI_PAGE_VA_MASK 0xfffff000
+
+#ifdef CONFIG_X86_PAE
+#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
+#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
+#else
+#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
+#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
+#endif
+
+/* Flags used by VMI_FlushTLB call */
+#define VMI_FLUSH_TLB 0x01
+#define VMI_FLUSH_GLOBAL 0x02
+
+/*
+ *---------------------------------------------------------------------
+ *
+ * VMI relocation definitions for ROM call get_reloc
+ *
+ *---------------------------------------------------------------------
+ */
+
+/* VMI Relocation types */
+#define VMI_RELOCATION_NONE 0
+#define VMI_RELOCATION_CALL_REL 1
+#define VMI_RELOCATION_JUMP_REL 2
+#define VMI_RELOCATION_NOP 3
+
+#ifndef __ASSEMBLY__
+struct vmi_relocation_info {
+ unsigned char *eip;
+ unsigned char type;
+ unsigned char reserved[3];
+};
+#endif
+
+
+/*
+ *---------------------------------------------------------------------
+ *
+ * Generic ROM structures and definitions
+ *
+ *---------------------------------------------------------------------
+ */
+
+#ifndef __ASSEMBLY__
+
+struct vrom_header {
+ u16 rom_signature; /* option ROM signature */
+ u8 rom_length; /* ROM length in 512 byte chunks */
+ u8 rom_entry[4]; /* 16-bit code entry point */
+ u8 rom_pad0; /* 4-byte align pad */
+ u32 vrom_signature; /* VROM identification signature */
+ u8 api_version_min;/* Minor version of API */
+ u8 api_version_maj;/* Major version of API */
+ u8 jump_slots; /* Number of jump slots */
+ u8 reserved1; /* Reserved for expansion */
+ u32 virtual_top; /* Hypervisor virtual address start */
+ u16 reserved2; /* Reserved for expansion */
+ u16 license_offs; /* Offset to License string */
+ u16 pci_header_offs;/* Offset to PCI OPROM header */
+ u16 pnp_header_offs;/* Offset to PnP OPROM header */
+ u32 rom_pad3; /* PnP reserved / VMI reserved */
+ u8 reserved[96]; /* Reserved for headers */
+ char vmi_init[8]; /* VMI_Init jump point */
+ char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
+} __attribute__((packed));
+
+struct pnp_header {
+ char sig[4];
+ char rev;
+ char size;
+ short next;
+ short res;
+ long devID;
+ unsigned short manufacturer_offset;
+ unsigned short product_offset;
+} __attribute__((packed));
+
+struct pci_header {
+ char sig[4];
+ short vendorID;
+ short deviceID;
+ short vpdData;
+ short size;
+ char rev;
+ char class;
+ char subclass;
+ char interface;
+ short chunks;
+ char rom_version_min;
+ char rom_version_maj;
+ char codetype;
+ char lastRom;
+ short reserved;
+} __attribute__((packed));
+
+/* Function prototypes for bootstrapping */
+extern void vmi_init(void);
+extern void vmi_bringup(void);
+extern void vmi_apply_boot_page_allocations(void);
+
+/* State needed to start an application processor in an SMP system. */
+struct vmi_ap_state {
+ u32 cr0;
+ u32 cr2;
+ u32 cr3;
+ u32 cr4;
+
+ u64 efer;
+
+ u32 eip;
+ u32 eflags;
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u32 esp;
+ u32 ebp;
+ u32 esi;
+ u32 edi;
+ u16 cs;
+ u16 ss;
+ u16 ds;
+ u16 es;
+ u16 fs;
+ u16 gs;
+ u16 ldtr;
+
+ u16 gdtr_limit;
+ u32 gdtr_base;
+ u32 idtr_base;
+ u16 idtr_limit;
+};
+
+#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
new file mode 100644
index 00000000000..c6e0bee93e3
--- /dev/null
+++ b/arch/x86/include/asm/vmi_time.h
@@ -0,0 +1,98 @@
+/*
+ * VMI Time wrappers
+ *
+ * Copyright (C) 2006, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to dhecht@vmware.com
+ *
+ */
+
+#ifndef _ASM_X86_VMI_TIME_H
+#define _ASM_X86_VMI_TIME_H
+
+/*
+ * Raw VMI call indices for timer functions
+ */
+#define VMI_CALL_GetCycleFrequency 66
+#define VMI_CALL_GetCycleCounter 67
+#define VMI_CALL_SetAlarm 68
+#define VMI_CALL_CancelAlarm 69
+#define VMI_CALL_GetWallclockTime 70
+#define VMI_CALL_WallclockUpdated 71
+
+/* Cached VMI timer operations */
+extern struct vmi_timer_ops {
+ u64 (*get_cycle_frequency)(void);
+ u64 (*get_cycle_counter)(int);
+ u64 (*get_wallclock)(void);
+ int (*wallclock_updated)(void);
+ void (*set_alarm)(u32 flags, u64 expiry, u64 period);
+ void (*cancel_alarm)(u32 flags);
+} vmi_timer_ops;
+
+/* Prototypes */
+extern void __init vmi_time_init(void);
+extern unsigned long vmi_get_wallclock(void);
+extern int vmi_set_wallclock(unsigned long now);
+extern unsigned long long vmi_sched_clock(void);
+extern unsigned long vmi_tsc_khz(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern void __devinit vmi_time_bsp_init(void);
+extern void __devinit vmi_time_ap_init(void);
+#endif
+
+/*
+ * When run under a hypervisor, a vcpu is always in one of three states:
+ * running, halted, or ready. The vcpu is in the 'running' state if it
+ * is executing. When the vcpu executes the halt interface, the vcpu
+ * enters the 'halted' state and remains halted until there is some work
+ * pending for the vcpu (e.g. an alarm expires, host I/O completes on
+ * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
+ * state (waiting for the hypervisor to reschedule it). Finally, at any
+ * time when the vcpu is in neither the 'running' nor the 'halted'
+ * state, it is in the 'ready' state.
+ *
+ * Real time advances while the vcpu is 'running', 'ready', or
+ * 'halted'. Stolen time is the time in which the vcpu is in the
+ * 'ready' state. Available time is the remaining time -- the vcpu is
+ * either 'running' or 'halted'.
+ *
+ * All three views of time are accessible through the VMI cycle
+ * counters.
+ */
+
+/* The cycle counters. */
+#define VMI_CYCLES_REAL 0
+#define VMI_CYCLES_AVAILABLE 1
+#define VMI_CYCLES_STOLEN 2
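+
+/*
+ * Illustrative sketch (hypothetical caller): once vmi_time_init() has
+ * filled in vmi_timer_ops, stolen time can be sampled and converted to
+ * nanoseconds (ignoring overflow for very large cycle counts) as
+ *
+ *	u64 cycles = vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
+ *	u64 hz = vmi_timer_ops.get_cycle_frequency();
+ *	u64 ns = div64_u64(cycles * NSEC_PER_SEC, hz);
+ */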
+
+/* The alarm interface 'flags' bits */
+#define VMI_ALARM_COUNTERS 2
+
+#define VMI_ALARM_COUNTER_MASK 0x000000ff
+
+#define VMI_ALARM_WIRED_IRQ0 0x00000000
+#define VMI_ALARM_WIRED_LVTT 0x00010000
+
+#define VMI_ALARM_IS_ONESHOT 0x00000000
+#define VMI_ALARM_IS_PERIODIC 0x00000100
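+
+/*
+ * A minimal sketch (hypothetical expiry/period values): arming a
+ * periodic alarm on the real-time counter, delivered via IRQ0.  The
+ * counter number rides in the low byte selected by
+ * VMI_ALARM_COUNTER_MASK:
+ *
+ *	vmi_timer_ops.set_alarm(VMI_ALARM_WIRED_IRQ0 |
+ *				VMI_ALARM_IS_PERIODIC |
+ *				VMI_CYCLES_REAL, expiry, period);
+ */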
+
+#define CONFIG_VMI_ALARM_HZ 100
+
+#endif /* _ASM_X86_VMI_TIME_H */
diff --git a/arch/x86/include/asm/voyager.h b/arch/x86/include/asm/voyager.h
new file mode 100644
index 00000000000..9c811d2e6f9
--- /dev/null
+++ b/arch/x86/include/asm/voyager.h
@@ -0,0 +1,528 @@
+/* Copyright (C) 1999,2001
+ *
+ * Author: J.E.J.Bottomley@HansenPartnership.com
+ *
+ * Standard include definitions for the NCR Voyager system */
+
+#undef VOYAGER_DEBUG
+#undef VOYAGER_CAT_DEBUG
+
+#ifdef VOYAGER_DEBUG
+#define VDEBUG(x) printk x
+#else
+#define VDEBUG(x)
+#endif
+
+/* There are three levels of Voyager machine: 3, 4 and 5. The rule is
+ * that any model below a 3435 is a Level 3, except for the 3360, which
+ * is a Level 4. A 3435 or above is a Level 5. */
+#define VOYAGER_LEVEL5_AND_ABOVE 0x3435
+#define VOYAGER_LEVEL4 0x3360
+
+/* The L4 DINO ASIC */
+#define VOYAGER_DINO 0x43
+
+/* voyager ports in standard I/O space */
+#define VOYAGER_MC_SETUP 0x96
+
+
+#define VOYAGER_CAT_CONFIG_PORT 0x97
+# define VOYAGER_CAT_DESELECT 0xff
+#define VOYAGER_SSPB_RELOCATION_PORT 0x98
+
+/* Valid CAT controller commands */
+/* start instruction register cycle */
+#define VOYAGER_CAT_IRCYC 0x01
+/* start data register cycle */
+#define VOYAGER_CAT_DRCYC 0x02
+/* move to execute state */
+#define VOYAGER_CAT_RUN 0x0F
+/* end operation */
+#define VOYAGER_CAT_END 0x80
+/* hold in idle state */
+#define VOYAGER_CAT_HOLD 0x90
+/* single step an "intest" vector */
+#define VOYAGER_CAT_STEP 0xE0
+/* return cat controller to CLEMSON mode */
+#define VOYAGER_CAT_CLEMSON 0xFF
+
+/* the default cat command header */
+#define VOYAGER_CAT_HEADER 0x7F
+
+/* the range of possible CAT module ids in the system */
+#define VOYAGER_MIN_MODULE 0x10
+#define VOYAGER_MAX_MODULE 0x1f
+
+/* The voyager registers per asic */
+#define VOYAGER_ASIC_ID_REG 0x00
+#define VOYAGER_ASIC_TYPE_REG 0x01
+/* the sub address registers can be made auto incrementing on reads */
+#define VOYAGER_AUTO_INC_REG 0x02
+# define VOYAGER_AUTO_INC 0x04
+# define VOYAGER_NO_AUTO_INC 0xfb
+#define VOYAGER_SUBADDRDATA 0x03
+#define VOYAGER_SCANPATH 0x05
+# define VOYAGER_CONNECT_ASIC 0x01
+# define VOYAGER_DISCONNECT_ASIC 0xfe
+#define VOYAGER_SUBADDRLO 0x06
+#define VOYAGER_SUBADDRHI 0x07
+#define VOYAGER_SUBMODSELECT 0x08
+#define VOYAGER_SUBMODPRESENT 0x09
+
+#define VOYAGER_SUBADDR_LO 0xff
+#define VOYAGER_SUBADDR_HI 0xffff
+
+/* the maximum size of a scan path -- used to form instructions */
+#define VOYAGER_MAX_SCAN_PATH 0x100
+/* the biggest possible register size (in bytes) */
+#define VOYAGER_MAX_REG_SIZE 4
+
+/* Total number of possible modules (including submodules) */
+#define VOYAGER_MAX_MODULES 16
+/* Largest number of asics per module */
+#define VOYAGER_MAX_ASICS_PER_MODULE 7
+
+/* the CAT asic of each module is always the first one */
+#define VOYAGER_CAT_ID 0
+#define VOYAGER_PSI 0x1a
+
+/* voyager instruction operations and registers */
+#define VOYAGER_READ_CONFIG 0x1
+#define VOYAGER_WRITE_CONFIG 0x2
+#define VOYAGER_BYPASS 0xff
+
+typedef struct voyager_asic {
+ __u8 asic_addr; /* ASIC address; Level 4 */
+ __u8 asic_type; /* ASIC type */
+ __u8 asic_id; /* ASIC id */
+ __u8 jtag_id[4]; /* JTAG id */
+ __u8 asic_location; /* Location within scan path; start w/ 0 */
+ __u8 bit_location; /* Location within bit stream; start w/ 0 */
+ __u8 ireg_length; /* Instruction register length */
+ __u16 subaddr; /* Amount of sub address space */
+ struct voyager_asic *next; /* Next asic in linked list */
+} voyager_asic_t;
+
+typedef struct voyager_module {
+ __u8 module_addr; /* Module address */
+ __u8 scan_path_connected; /* Scan path connected */
+ __u16 ee_size; /* Size of the EEPROM */
+ __u16 num_asics; /* Number of Asics */
+ __u16 inst_bits; /* Instruction bits in the scan path */
+ __u16 largest_reg; /* Largest register in the scan path */
+ __u16 smallest_reg; /* Smallest register in the scan path */
+ voyager_asic_t *asic; /* First ASIC in scan path (CAT_I) */
+ struct voyager_module *submodule; /* Submodule pointer */
+ struct voyager_module *next; /* Next module in linked list */
+} voyager_module_t;
+
+typedef struct voyager_eeprom_hdr {
+ __u8 module_id[4];
+ __u8 version_id;
+ __u8 config_id;
+ __u16 boundry_id; /* boundary scan id */
+ __u16 ee_size; /* size of EEPROM */
+ __u8 assembly[11]; /* assembly # */
+ __u8 assembly_rev; /* assembly rev */
+ __u8 tracer[4]; /* tracer number */
+ __u16 assembly_cksum; /* asm checksum */
+ __u16 power_consump; /* pwr requirements */
+ __u16 num_asics; /* number of asics */
+ __u16 bist_time; /* min. bist time */
+ __u16 err_log_offset; /* error log offset */
+ __u16 scan_path_offset;/* scan path offset */
+ __u16 cct_offset;
+ __u16 log_length; /* length of err log */
+ __u16 xsum_end; /* offset to end of
+ checksum */
+ __u8 reserved[4];
+ __u8 sflag; /* starting sentinel */
+ __u8 part_number[13]; /* prom part number */
+ __u8 version[10]; /* version number */
+ __u8 signature[8];
+ __u16 eeprom_chksum;
+ __u32 data_stamp_offset;
+ __u8 eflag; /* ending sentinel */
+} __attribute__((packed)) voyager_eprom_hdr_t;
+
+
+
+#define VOYAGER_EPROM_SIZE_OFFSET \
+ ((__u16)(&(((voyager_eprom_hdr_t *)0)->ee_size)))
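+/* (an open-coded offsetof(voyager_eprom_hdr_t, ee_size)) */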
+#define VOYAGER_XSUM_END_OFFSET 0x2a
+
+/* the following three definitions are for internal table layouts
+ * in the module EPROMs. We really only care about the IDs and
+ * offsets */
+typedef struct voyager_sp_table {
+ __u8 asic_id;
+ __u8 bypass_flag;
+ __u16 asic_data_offset;
+ __u16 config_data_offset;
+} __attribute__((packed)) voyager_sp_table_t;
+
+typedef struct voyager_jtag_table {
+ __u8 icode[4];
+ __u8 runbist[4];
+ __u8 intest[4];
+ __u8 samp_preld[4];
+ __u8 ireg_len;
+} __attribute__((packed)) voyager_jtt_t;
+
+typedef struct voyager_asic_data_table {
+ __u8 jtag_id[4];
+ __u16 length_bsr;
+ __u16 length_bist_reg;
+ __u32 bist_clk;
+ __u16 subaddr_bits;
+ __u16 seed_bits;
+ __u16 sig_bits;
+ __u16 jtag_offset;
+} __attribute__((packed)) voyager_at_t;
+
+/* Voyager Interrupt Controller (VIC) registers */
+
+/* Base to add to Cross Processor Interrupts (CPIs) when triggering
+ * the CPU IRQ line */
+/* register defines for the WCBICs (one per processor) */
+#define VOYAGER_WCBIC0 0x41 /* bus A node P1 processor 0 */
+#define VOYAGER_WCBIC1 0x49 /* bus A node P1 processor 1 */
+#define VOYAGER_WCBIC2 0x51 /* bus A node P2 processor 0 */
+#define VOYAGER_WCBIC3 0x59 /* bus A node P2 processor 1 */
+#define VOYAGER_WCBIC4 0x61 /* bus B node P1 processor 0 */
+#define VOYAGER_WCBIC5 0x69 /* bus B node P1 processor 1 */
+#define VOYAGER_WCBIC6 0x71 /* bus B node P2 processor 0 */
+#define VOYAGER_WCBIC7 0x79 /* bus B node P2 processor 1 */
+
+
+/* top of memory registers */
+#define VOYAGER_WCBIC_TOM_L 0x4
+#define VOYAGER_WCBIC_TOM_H 0x5
+
+/* register defines for the Voyager Memory Control (VMC);
+ * these are present on L4 machines only */
+#define VOYAGER_VMC1 0x81
+#define VOYAGER_VMC2 0x91
+#define VOYAGER_VMC3 0xa1
+#define VOYAGER_VMC4 0xb1
+
+/* VMC Ports */
+#define VOYAGER_VMC_MEMORY_SETUP 0x9
+# define VMC_Interleaving 0x01
+# define VMC_4Way 0x02
+# define VMC_EvenCacheLines 0x04
+# define VMC_HighLine 0x08
+# define VMC_Start0_Enable 0x20
+# define VMC_Start1_Enable 0x40
+# define VMC_Vremap 0x80
+#define VOYAGER_VMC_BANK_DENSITY 0xa
+# define VMC_BANK_EMPTY 0
+# define VMC_BANK_4MB 1
+# define VMC_BANK_16MB 2
+# define VMC_BANK_64MB 3
+# define VMC_BANK0_MASK 0x03
+# define VMC_BANK1_MASK 0x0C
+# define VMC_BANK2_MASK 0x30
+# define VMC_BANK3_MASK 0xC0
+
+/* Magellan Memory Controller (MMC) defines - present on L5 */
+#define VOYAGER_MMC_ASIC_ID 1
+/* the two memory modules corresponding to memory cards in the system */
+#define VOYAGER_MMC_MEMORY0_MODULE 0x14
+#define VOYAGER_MMC_MEMORY1_MODULE 0x15
+/* the Magellan Memory Address (MMA) defines */
+#define VOYAGER_MMA_ASIC_ID 2
+
+/* Submodule number for the Quad Baseboard */
+#define VOYAGER_QUAD_BASEBOARD 1
+
+/* ASIC defines for the Quad Baseboard */
+#define VOYAGER_QUAD_QDATA0 1
+#define VOYAGER_QUAD_QDATA1 2
+#define VOYAGER_QUAD_QABC 3
+
+/* Useful areas in extended CMOS */
+#define VOYAGER_PROCESSOR_PRESENT_MASK 0x88a
+#define VOYAGER_MEMORY_CLICKMAP 0xa23
+#define VOYAGER_DUMP_LOCATION 0xb1a
+
+/* SUS In Control bit - used to tell SUS that we don't need to be
+ * babysat anymore */
+#define VOYAGER_SUS_IN_CONTROL_PORT 0x3ff
+# define VOYAGER_IN_CONTROL_FLAG 0x80
+
+/* Voyager PSI defines */
+#define VOYAGER_PSI_STATUS_REG 0x08
+# define PSI_DC_FAIL 0x01
+# define PSI_MON 0x02
+# define PSI_FAULT 0x04
+# define PSI_ALARM 0x08
+# define PSI_CURRENT 0x10
+# define PSI_DVM 0x20
+# define PSI_PSCFAULT 0x40
+# define PSI_STAT_CHG 0x80
+
+#define VOYAGER_PSI_SUPPLY_REG 0x8000
+ /* read */
+# define PSI_FAIL_DC 0x01
+# define PSI_FAIL_AC 0x02
+# define PSI_MON_INT 0x04
+# define PSI_SWITCH_OFF 0x08
+# define PSI_HX_OFF 0x10
+# define PSI_SECURITY 0x20
+# define PSI_CMOS_BATT_LOW 0x40
+# define PSI_CMOS_BATT_FAIL 0x80
+ /* write */
+# define PSI_CLR_SWITCH_OFF 0x13
+# define PSI_CLR_HX_OFF 0x14
+# define PSI_CLR_CMOS_BATT_FAIL 0x17
+
+#define VOYAGER_PSI_MASK 0x8001
+# define PSI_MASK_MASK 0x10
+
+#define VOYAGER_PSI_AC_FAIL_REG 0x8004
+#define AC_FAIL_STAT_CHANGE 0x80
+
+#define VOYAGER_PSI_GENERAL_REG 0x8007
+ /* read */
+# define PSI_SWITCH_ON 0x01
+# define PSI_SWITCH_ENABLED 0x02
+# define PSI_ALARM_ENABLED 0x08
+# define PSI_SECURE_ENABLED 0x10
+# define PSI_COLD_RESET 0x20
+# define PSI_COLD_START 0x80
+ /* write */
+# define PSI_POWER_DOWN 0x10
+# define PSI_SWITCH_DISABLE 0x01
+# define PSI_SWITCH_ENABLE 0x11
+# define PSI_CLEAR 0x12
+# define PSI_ALARM_DISABLE 0x03
+# define PSI_ALARM_ENABLE 0x13
+# define PSI_CLEAR_COLD_RESET 0x05
+# define PSI_SET_COLD_RESET 0x15
+# define PSI_CLEAR_COLD_START 0x07
+# define PSI_SET_COLD_START 0x17
+
+
+
+struct voyager_bios_info {
+ __u8 len;
+ __u8 major;
+ __u8 minor;
+ __u8 debug;
+ __u8 num_classes;
+ __u8 class_1;
+ __u8 class_2;
+};
+
+/* The following structures and definitions are for the Kernel/SUS
+ * interface; these are needed to find out how SUS initialised any Quad
+ * boards in the system */
+
+#define NUMBER_OF_MC_BUSSES 2
+#define SLOTS_PER_MC_BUS 8
+#define MAX_CPUS 16 /* 16 way CPU system */
+#define MAX_PROCESSOR_BOARDS 4 /* 4 processor slot system */
+#define MAX_CACHE_LEVELS 4 /* # of cache levels supported */
+#define MAX_SHARED_CPUS 4 /* # of CPUs that can share a LARC */
+#define NUMBER_OF_POS_REGS 8
+
+typedef struct {
+ __u8 MC_Slot;
+ __u8 POS_Values[NUMBER_OF_POS_REGS];
+} __attribute__((packed)) MC_SlotInformation_t;
+
+struct QuadDescription {
+ __u8 Type; /* for type 0 (DYADIC or MONADIC) all fields
+ * will be zero except for slot */
+ __u8 StructureVersion;
+ __u32 CPI_BaseAddress;
+ __u32 LARC_BankSize;
+ __u32 LocalMemoryStateBits;
+ __u8 Slot; /* Processor slots 1 - 4 */
+} __attribute__((packed));
+
+struct ProcBoardInfo {
+ __u8 Type;
+ __u8 StructureVersion;
+ __u8 NumberOfBoards;
+ struct QuadDescription QuadData[MAX_PROCESSOR_BOARDS];
+} __attribute__((packed));
+
+struct CacheDescription {
+ __u8 Level;
+ __u32 TotalSize;
+ __u16 LineSize;
+ __u8 Associativity;
+ __u8 CacheType;
+ __u8 WriteType;
+ __u8 Number_CPUs_SharedBy;
+ __u8 Shared_CPUs_Hardware_IDs[MAX_SHARED_CPUS];
+
+} __attribute__((packed));
+
+struct CPU_Description {
+ __u8 CPU_HardwareId;
+ char *FRU_String;
+ __u8 NumberOfCacheLevels;
+ struct CacheDescription CacheLevelData[MAX_CACHE_LEVELS];
+} __attribute__((packed));
+
+struct CPU_Info {
+ __u8 Type;
+ __u8 StructureVersion;
+ __u8 NumberOf_CPUs;
+ struct CPU_Description CPU_Data[MAX_CPUS];
+} __attribute__((packed));
+
+
+/*
+ * This structure will be used by SUS and the OS.
+ * The assumption about this structure is that no blank space is
+ * packed in it by our friend the compiler.
+ */
+typedef struct {
+ __u8 Mailbox_SUS; /* Written to by SUS to give
+ commands/response to the OS */
+ __u8 Mailbox_OS; /* Written to by the OS to give
+ commands/response to SUS */
+ __u8 SUS_MailboxVersion; /* Tells the OS which iteration of the
+ interface SUS supports */
+ __u8 OS_MailboxVersion; /* Tells SUS which iteration of the
+ interface the OS supports */
+ __u32 OS_Flags; /* Flags set by the OS as info for
+ SUS */
+ __u32 SUS_Flags; /* Flags set by SUS as info
+ for the OS */
+ __u32 WatchDogPeriod; /* Watchdog period (in seconds) which
+ the DP uses to see if the OS
+ is dead */
+ __u32 WatchDogCount; /* Updated by the OS on every tick. */
+ __u32 MemoryFor_SUS_ErrorLog; /* Flat 32 bit address which tells SUS
+ where to stuff the SUS error log
+ on a dump */
+ MC_SlotInformation_t MC_SlotInfo[NUMBER_OF_MC_BUSSES*SLOTS_PER_MC_BUS];
+ /* Storage for MCA POS data */
+ /* All new SECOND_PASS_INTERFACE fields added from this point */
+ struct ProcBoardInfo *BoardData;
+ struct CPU_Info *CPU_Data;
+ /* All new fields must be added from this point */
+} Voyager_KernelSUS_Mbox_t;
+
+/* structure for finding the right memory address to send a QIC CPI to */
+struct voyager_qic_cpi {
+ /* Each cache line (32 bytes) can trigger a cpi. The cpi
+ * read/write may occur anywhere in the cache line---pick the
+ * middle to be safe */
+ struct {
+ __u32 pad1[3];
+ __u32 cpi;
+ __u32 pad2[4];
+ } qic_cpi[8];
+};
+
+struct voyager_status {
+ __u32 power_fail:1;
+ __u32 switch_off:1;
+ __u32 request_from_kernel:1;
+};
+
+struct voyager_psi_regs {
+ __u8 cat_id;
+ __u8 cat_dev;
+ __u8 cat_control;
+ __u8 subaddr;
+ __u8 dummy4;
+ __u8 checkbit;
+ __u8 subaddr_low;
+ __u8 subaddr_high;
+ __u8 intstatus;
+ __u8 stat1;
+ __u8 stat3;
+ __u8 fault;
+ __u8 tms;
+ __u8 gen;
+ __u8 sysconf;
+ __u8 dummy15;
+};
+
+struct voyager_psi_subregs {
+ __u8 supply;
+ __u8 mask;
+ __u8 present;
+ __u8 DCfail;
+ __u8 ACfail;
+ __u8 fail;
+ __u8 UPSfail;
+ __u8 genstatus;
+};
+
+struct voyager_psi {
+ struct voyager_psi_regs regs;
+ struct voyager_psi_subregs subregs;
+};
+
+struct voyager_SUS {
+#define VOYAGER_DUMP_BUTTON_NMI 0x1
+#define VOYAGER_SUS_VALID 0x2
+#define VOYAGER_SYSINT_COMPLETE 0x3
+ __u8 SUS_mbox;
+#define VOYAGER_NO_COMMAND 0x0
+#define VOYAGER_IGNORE_DUMP 0x1
+#define VOYAGER_DO_DUMP 0x2
+#define VOYAGER_SYSINT_HANDSHAKE 0x3
+#define VOYAGER_DO_MEM_DUMP 0x4
+#define VOYAGER_SYSINT_WAS_RECOVERED 0x5
+ __u8 kernel_mbox;
+#define VOYAGER_MAILBOX_VERSION 0x10
+ __u8 SUS_version;
+ __u8 kernel_version;
+#define VOYAGER_OS_HAS_SYSINT 0x1
+#define VOYAGER_OS_IN_PROGRESS 0x2
+#define VOYAGER_UPDATING_WDPERIOD 0x4
+ __u32 kernel_flags;
+#define VOYAGER_SUS_BOOTING 0x1
+#define VOYAGER_SUS_IN_PROGRESS 0x2
+ __u32 SUS_flags;
+ __u32 watchdog_period;
+ __u32 watchdog_count;
+ __u32 SUS_errorlog;
+ /* lots of system configuration stuff under here */
+};
+
+/* Variables exported by voyager_smp */
+extern __u32 voyager_extended_vic_processors;
+extern __u32 voyager_allowed_boot_processors;
+extern __u32 voyager_quad_processors;
+extern struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS];
+extern struct voyager_SUS *voyager_SUS;
+
+/* variables exported always */
+extern struct task_struct *voyager_thread;
+extern int voyager_level;
+extern struct voyager_status voyager_status;
+
+/* functions exported by the voyager and voyager_smp modules */
+extern int voyager_cat_readb(__u8 module, __u8 asic, int reg);
+extern void voyager_cat_init(void);
+extern void voyager_detect(struct voyager_bios_info *);
+extern void voyager_trap_init(void);
+extern void voyager_setup_irqs(void);
+extern int voyager_memory_detect(int region, __u32 *addr, __u32 *length);
+extern void voyager_smp_intr_init(void);
+extern __u8 voyager_extended_cmos_read(__u16 cmos_address);
+extern void voyager_smp_dump(void);
+extern void voyager_timer_interrupt(void);
+extern void smp_local_timer_interrupt(void);
+extern void voyager_power_off(void);
+extern void smp_voyager_power_off(void *dummy);
+extern void voyager_restart(void);
+extern void voyager_cat_power_off(void);
+extern void voyager_cat_do_common_interrupt(void);
+extern void voyager_handle_nmi(void);
+/* Commands for the voyager_cat_psi() call below */
+#define VOYAGER_PSI_READ 0
+#define VOYAGER_PSI_WRITE 1
+#define VOYAGER_PSI_SUBREAD 2
+#define VOYAGER_PSI_SUBWRITE 3
+extern void voyager_cat_psi(__u8, __u16, __u8 *);
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
new file mode 100644
index 00000000000..d0983d255fb
--- /dev/null
+++ b/arch/x86/include/asm/vsyscall.h
@@ -0,0 +1,44 @@
+#ifndef _ASM_X86_VSYSCALL_H
+#define _ASM_X86_VSYSCALL_H
+
+enum vsyscall_num {
+ __NR_vgettimeofday,
+ __NR_vtime,
+ __NR_vgetcpu,
+};
+
+#define VSYSCALL_START (-10UL << 20)
+#define VSYSCALL_SIZE 1024
+#define VSYSCALL_END (-2UL << 20)
+#define VSYSCALL_MAPPED_PAGES 1
+#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
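+
+/*
+ * With these values, vsyscall 0 (vgettimeofday) lands at the
+ * well-known address 0xffffffffff600000 on x86-64, and each further
+ * entry follows at a 1024 byte stride.
+ */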
+
+#ifdef __KERNEL__
+#include <linux/seqlock.h>
+
+#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
+#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
+
+/* Definitions for CONFIG_GENERIC_TIME */
+#define __section_vsyscall_gtod_data __attribute__ \
+ ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
+#define __section_vsyscall_clock __attribute__ \
+ ((unused, __section__ (".vsyscall_clock"),aligned(16)))
+#define __vsyscall_fn \
+ __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
+
+#define VGETCPU_RDTSCP 1
+#define VGETCPU_LSL 2
+
+extern int __vgetcpu_mode;
+extern volatile unsigned long __jiffies;
+
+/* kernel space (writeable) */
+extern int vgetcpu_mode;
+extern struct timezone sys_tz;
+
+extern void map_vsyscall(void);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h
new file mode 100644
index 00000000000..f2cba4e79a2
--- /dev/null
+++ b/arch/x86/include/asm/xcr.h
@@ -0,0 +1,49 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2008 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * asm-x86/xcr.h
+ *
+ * Definitions for the eXtended Control Register instructions
+ */
+
+#ifndef _ASM_X86_XCR_H
+#define _ASM_X86_XCR_H
+
+#define XCR_XFEATURE_ENABLED_MASK 0x00000000
+
+#ifdef __KERNEL__
+# ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+static inline u64 xgetbv(u32 index)
+{
+ u32 eax, edx;
+
+ asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
+ : "=a" (eax), "=d" (edx)
+ : "c" (index));
+ return eax + ((u64)edx << 32);
+}
+
+static inline void xsetbv(u32 index, u64 value)
+{
+ u32 eax = value;
+ u32 edx = value >> 32;
+
+ asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
+ : : "a" (eax), "d" (edx), "c" (index));
+}
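+
+/*
+ * Illustrative use (hypothetical snippet): read XCR0, the extended
+ * feature enabled mask, to see which states the OS manages through
+ * XSAVE.  Bit 0 is x87 state and bit 1 is SSE (XMM) state; CR4.OSXSAVE
+ * must already be set or xgetbv() raises #UD.
+ *
+ *	u64 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ *	int sse_managed = (xcr0 & 0x2) != 0;
+ */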
+
+# endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_XCR_H */
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
new file mode 100644
index 00000000000..19144184983
--- /dev/null
+++ b/arch/x86/include/asm/xen/events.h
@@ -0,0 +1,24 @@
+#ifndef _ASM_X86_XEN_EVENTS_H
+#define _ASM_X86_XEN_EVENTS_H
+
+enum ipi_vector {
+ XEN_RESCHEDULE_VECTOR,
+ XEN_CALL_FUNCTION_VECTOR,
+ XEN_CALL_FUNCTION_SINGLE_VECTOR,
+ XEN_SPIN_UNLOCK_VECTOR,
+
+ XEN_NR_IPIS,
+};
+
+static inline int xen_irqs_disabled(struct pt_regs *regs)
+{
+ return raw_irqs_disabled_flags(regs->flags);
+}
+
+static inline void xen_do_IRQ(int irq, struct pt_regs *regs)
+{
+ regs->orig_ax = ~irq; /* negative, so it can't be mistaken for a syscall nr */
+ do_IRQ(regs);
+}
+
+#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/grant_table.h b/arch/x86/include/asm/xen/grant_table.h
new file mode 100644
index 00000000000..fdbbb45767a
--- /dev/null
+++ b/arch/x86/include/asm/xen/grant_table.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_X86_XEN_GRANT_TABLE_H
+#define _ASM_X86_XEN_GRANT_TABLE_H
+
+#define xen_alloc_vm_area(size) alloc_vm_area(size)
+#define xen_free_vm_area(area) free_vm_area(area)
+
+#endif /* _ASM_X86_XEN_GRANT_TABLE_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
new file mode 100644
index 00000000000..3f6000d95fe
--- /dev/null
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -0,0 +1,527 @@
+/******************************************************************************
+ * hypercall.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _ASM_X86_XEN_HYPERCALL_H
+#define _ASM_X86_XEN_HYPERCALL_H
+
+#include <linux/errno.h>
+#include <linux/string.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/physdev.h>
+
+/*
+ * The hypercall asms have to meet several constraints:
+ * - Work on 32- and 64-bit.
+ * The two architectures put their arguments in different sets of
+ * registers.
+ *
+ * - Work around asm syntax quirks
+ * It isn't possible to specify one of the rNN registers in a
+ * constraint, so we use explicit register variables to get the
+ * args into the right place.
+ *
+ * - Mark all registers as potentially clobbered
+ * Even unused parameters can be clobbered by the hypervisor, so we
+ * need to make sure gcc knows it.
+ *
+ * - Avoid compiler bugs.
+ * This is the tricky part. Because x86_32 has such a constrained
+ * register set, gcc versions below 4.3 have trouble generating
+ * code when all the arg registers and memory are trashed by the
+ * asm. There are syntactically simpler ways of achieving the
+ * semantics below, but they cause the compiler to crash.
+ *
+ * The only combination I found which works is:
+ * - assign the __argX variables first
+ * - list all actually used parameters as "+r" (__argX)
+ * - clobber the rest
+ *
+ * The result certainly isn't pretty, and it really shows up cpp's
+ * weakness as a macro language.  Sorry.  (But let's just give thanks
+ * there aren't more than 5 arguments...)
+ */
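+
+/*
+ * As a rough illustration (not a literal expansion): on 32-bit, a call
+ * like _hypercall2(int, xen_version, cmd, arg) assembles to
+ * approximately
+ *
+ *	mov  cmd, %ebx
+ *	mov  arg, %ecx
+ *	call hypercall_page + __HYPERVISOR_xen_version * 32
+ *
+ * with the result returned in %eax and all argument registers treated
+ * as clobbered by the hypervisor.
+ */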
+
+extern struct { char _entry[32]; } hypercall_page[];
+
+#define __HYPERCALL "call hypercall_page+%c[offset]"
+#define __HYPERCALL_ENTRY(x) \
+ [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
+
+#ifdef CONFIG_X86_32
+#define __HYPERCALL_RETREG "eax"
+#define __HYPERCALL_ARG1REG "ebx"
+#define __HYPERCALL_ARG2REG "ecx"
+#define __HYPERCALL_ARG3REG "edx"
+#define __HYPERCALL_ARG4REG "esi"
+#define __HYPERCALL_ARG5REG "edi"
+#else
+#define __HYPERCALL_RETREG "rax"
+#define __HYPERCALL_ARG1REG "rdi"
+#define __HYPERCALL_ARG2REG "rsi"
+#define __HYPERCALL_ARG3REG "rdx"
+#define __HYPERCALL_ARG4REG "r10"
+#define __HYPERCALL_ARG5REG "r8"
+#endif
+
+#define __HYPERCALL_DECLS \
+ register unsigned long __res asm(__HYPERCALL_RETREG); \
+ register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \
+ register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
+ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
+ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
+ register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
+
+#define __HYPERCALL_0PARAM "=r" (__res)
+#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
+#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
+#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
+#define __HYPERCALL_4PARAM __HYPERCALL_3PARAM, "+r" (__arg4)
+#define __HYPERCALL_5PARAM __HYPERCALL_4PARAM, "+r" (__arg5)
+
+#define __HYPERCALL_0ARG()
+#define __HYPERCALL_1ARG(a1) \
+ __HYPERCALL_0ARG() __arg1 = (unsigned long)(a1);
+#define __HYPERCALL_2ARG(a1,a2) \
+ __HYPERCALL_1ARG(a1) __arg2 = (unsigned long)(a2);
+#define __HYPERCALL_3ARG(a1,a2,a3) \
+ __HYPERCALL_2ARG(a1,a2) __arg3 = (unsigned long)(a3);
+#define __HYPERCALL_4ARG(a1,a2,a3,a4) \
+ __HYPERCALL_3ARG(a1,a2,a3) __arg4 = (unsigned long)(a4);
+#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5) \
+ __HYPERCALL_4ARG(a1,a2,a3,a4) __arg5 = (unsigned long)(a5);
+
+#define __HYPERCALL_CLOBBER5 "memory"
+#define __HYPERCALL_CLOBBER4 __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG
+#define __HYPERCALL_CLOBBER3 __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG
+#define __HYPERCALL_CLOBBER2 __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG
+#define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
+#define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
+
+#define _hypercall0(type, name) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_0ARG(); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_0PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER0); \
+ (type)__res; \
+})
+
+#define _hypercall1(type, name, a1) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_1ARG(a1); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_1PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER1); \
+ (type)__res; \
+})
+
+#define _hypercall2(type, name, a1, a2) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_2ARG(a1, a2); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_2PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER2); \
+ (type)__res; \
+})
+
+#define _hypercall3(type, name, a1, a2, a3) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_3ARG(a1, a2, a3); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_3PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER3); \
+ (type)__res; \
+})
+
+#define _hypercall4(type, name, a1, a2, a3, a4) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_4ARG(a1, a2, a3, a4); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_4PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER4); \
+ (type)__res; \
+})
+
+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_5PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER5); \
+ (type)__res; \
+})
+
+static inline int
+HYPERVISOR_set_trap_table(struct trap_info *table)
+{
+ return _hypercall1(int, set_trap_table, table);
+}
+
+static inline int
+HYPERVISOR_mmu_update(struct mmu_update *req, int count,
+ int *success_count, domid_t domid)
+{
+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
+}
+
+static inline int
+HYPERVISOR_mmuext_op(struct mmuext_op *op, int count,
+ int *success_count, domid_t domid)
+{
+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
+}
+
+static inline int
+HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
+{
+ return _hypercall2(int, set_gdt, frame_list, entries);
+}
+
+static inline int
+HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
+{
+ return _hypercall2(int, stack_switch, ss, esp);
+}
+
+#ifdef CONFIG_X86_32
+static inline int
+HYPERVISOR_set_callbacks(unsigned long event_selector,
+ unsigned long event_address,
+ unsigned long failsafe_selector,
+ unsigned long failsafe_address)
+{
+ return _hypercall4(int, set_callbacks,
+ event_selector, event_address,
+ failsafe_selector, failsafe_address);
+}
+#else /* CONFIG_X86_64 */
+static inline int
+HYPERVISOR_set_callbacks(unsigned long event_address,
+ unsigned long failsafe_address,
+ unsigned long syscall_address)
+{
+ return _hypercall3(int, set_callbacks,
+ event_address, failsafe_address,
+ syscall_address);
+}
+#endif /* CONFIG_X86_{32,64} */
+
+static inline int
+HYPERVISOR_callback_op(int cmd, void *arg)
+{
+ return _hypercall2(int, callback_op, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_fpu_taskswitch(int set)
+{
+ return _hypercall1(int, fpu_taskswitch, set);
+}
+
+static inline int
+HYPERVISOR_sched_op(int cmd, void *arg)
+{
+ return _hypercall2(int, sched_op_new, cmd, arg);
+}
+
+static inline long
+HYPERVISOR_set_timer_op(u64 timeout)
+{
+ unsigned long timeout_hi = (unsigned long)(timeout>>32);
+ unsigned long timeout_lo = (unsigned long)timeout;
+ return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
+}
+
+static inline int
+HYPERVISOR_set_debugreg(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static inline unsigned long
+HYPERVISOR_get_debugreg(int reg)
+{
+ return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int
+HYPERVISOR_update_descriptor(u64 ma, u64 desc)
+{
+ return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
+}
+
+static inline int
+HYPERVISOR_memory_op(unsigned int cmd, void *arg)
+{
+ return _hypercall2(int, memory_op, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_multicall(void *call_list, int nr_calls)
+{
+ return _hypercall2(int, multicall, call_list, nr_calls);
+}
+
+static inline int
+HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
+ unsigned long flags)
+{
+ if (sizeof(new_val) == sizeof(long))
+ return _hypercall3(int, update_va_mapping, va,
+ new_val.pte, flags);
+ else
+ return _hypercall4(int, update_va_mapping, va,
+ new_val.pte, new_val.pte >> 32, flags);
+}
+
+static inline int
+HYPERVISOR_event_channel_op(int cmd, void *arg)
+{
+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
+ if (unlikely(rc == -ENOSYS)) {
+ struct evtchn_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, event_channel_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+ return rc;
+}
+
+static inline int
+HYPERVISOR_xen_version(int cmd, void *arg)
+{
+ return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_console_io(int cmd, int count, char *str)
+{
+ return _hypercall3(int, console_io, cmd, count, str);
+}
+
+static inline int
+HYPERVISOR_physdev_op(int cmd, void *arg)
+{
+ int rc = _hypercall2(int, physdev_op, cmd, arg);
+ if (unlikely(rc == -ENOSYS)) {
+ struct physdev_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, physdev_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+ return rc;
+}
+
+static inline int
+HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
+{
+ return _hypercall3(int, grant_table_op, cmd, uop, count);
+}
+
+static inline int
+HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
+ unsigned long flags, domid_t domid)
+{
+ if (sizeof(new_val) == sizeof(long))
+ return _hypercall4(int, update_va_mapping_otherdomain, va,
+ new_val.pte, flags, domid);
+ else
+ return _hypercall5(int, update_va_mapping_otherdomain, va,
+ new_val.pte, new_val.pte >> 32,
+ flags, domid);
+}
+
+static inline int
+HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
+{
+ return _hypercall2(int, vm_assist, cmd, type);
+}
+
+static inline int
+HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
+{
+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
+}
+
+#ifdef CONFIG_X86_64
+static inline int
+HYPERVISOR_set_segment_base(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+#endif
+
+static inline int
+HYPERVISOR_suspend(unsigned long srec)
+{
+ return _hypercall3(int, sched_op, SCHEDOP_shutdown,
+ SHUTDOWN_suspend, srec);
+}
+
+static inline int
+HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
+{
+ return _hypercall2(int, nmi_op, op, arg);
+}
+
+static inline void
+MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
+{
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = set;
+}
+
+static inline void
+MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags)
+{
+ mcl->op = __HYPERVISOR_update_va_mapping;
+ mcl->args[0] = va;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+ mcl->args[3] = flags;
+ }
+}
+
+static inline void
+MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
+ void *uop, unsigned int count)
+{
+ mcl->op = __HYPERVISOR_grant_table_op;
+ mcl->args[0] = cmd;
+ mcl->args[1] = (unsigned long)uop;
+ mcl->args[2] = count;
+}
+
+static inline void
+MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags,
+ domid_t domid)
+{
+ mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
+ mcl->args[0] = va;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+ mcl->args[3] = domid;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+ mcl->args[3] = flags;
+ mcl->args[4] = domid;
+ }
+}
+
+static inline void
+MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
+ struct desc_struct desc)
+{
+ mcl->op = __HYPERVISOR_update_descriptor;
+ if (sizeof(maddr) == sizeof(long)) {
+ mcl->args[0] = maddr;
+ mcl->args[1] = *(unsigned long *)&desc;
+ } else {
+ mcl->args[0] = maddr;
+ mcl->args[1] = maddr >> 32;
+ mcl->args[2] = desc.a;
+ mcl->args[3] = desc.b;
+ }
+}
+
+static inline void
+MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
+{
+ mcl->op = __HYPERVISOR_memory_op;
+ mcl->args[0] = cmd;
+ mcl->args[1] = (unsigned long)arg;
+}
+
+static inline void
+MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
+ int count, int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)req;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+}
+
+static inline void
+MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
+ int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmuext_op;
+ mcl->args[0] = (unsigned long)op;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+}
+
+static inline void
+MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
+{
+ mcl->op = __HYPERVISOR_set_gdt;
+ mcl->args[0] = (unsigned long)frames;
+ mcl->args[1] = entries;
+}
+
+static inline void
+MULTI_stack_switch(struct multicall_entry *mcl,
+ unsigned long ss, unsigned long esp)
+{
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = ss;
+ mcl->args[1] = esp;
+}
+
+#endif /* _ASM_X86_XEN_HYPERCALL_H */
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
new file mode 100644
index 00000000000..a38d25ac87d
--- /dev/null
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -0,0 +1,82 @@
+/******************************************************************************
+ * hypervisor.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _ASM_X86_XEN_HYPERVISOR_H
+#define _ASM_X86_XEN_HYPERVISOR_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+
+#include <asm/ptrace.h>
+#include <asm/page.h>
+#include <asm/desc.h>
+#if defined(__i386__)
+# ifdef CONFIG_X86_PAE
+# include <asm-generic/pgtable-nopud.h>
+# else
+# include <asm-generic/pgtable-nopmd.h>
+# endif
+#endif
+#include <asm/xen/hypercall.h>
+
+/* arch/i386/kernel/setup.c */
+extern struct shared_info *HYPERVISOR_shared_info;
+extern struct start_info *xen_start_info;
+
+/* arch/i386/mach-xen/evtchn.c */
+/* Force a proper event-channel callback from Xen. */
+extern void force_evtchn_callback(void);
+
+/* Turn jiffies into Xen system time. */
+u64 jiffies_to_st(unsigned long jiffies);
+
+
+#define MULTI_UVMFLAGS_INDEX 3
+#define MULTI_UVMDOMID_INDEX 4
+
+enum xen_domain_type {
+ XEN_NATIVE,
+ XEN_PV_DOMAIN,
+ XEN_HVM_DOMAIN,
+};
+
+extern enum xen_domain_type xen_domain_type;
+
+#define xen_domain() (xen_domain_type != XEN_NATIVE)
+#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN)
+#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN)
+#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN)
+
+#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
new file mode 100644
index 00000000000..e8506c1f0c5
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface.h
@@ -0,0 +1,175 @@
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 Xen.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef _ASM_X86_XEN_INTERFACE_H
+#define _ASM_X86_XEN_INTERFACE_H
+
+#ifdef __XEN__
+#define __DEFINE_GUEST_HANDLE(name, type) \
+ typedef struct { type *p; } __guest_handle_ ## name
+#else
+#define __DEFINE_GUEST_HANDLE(name, type) \
+ typedef type * __guest_handle_ ## name
+#endif
+
+#define DEFINE_GUEST_HANDLE_STRUCT(name) \
+ __DEFINE_GUEST_HANDLE(name, struct name)
+#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
+#define GUEST_HANDLE(name) __guest_handle_ ## name
+
+#ifdef __XEN__
+#if defined(__i386__)
+#define set_xen_guest_handle(hnd, val) \
+ do { \
+ if (sizeof(hnd) == 8) \
+ *(uint64_t *)&(hnd) = 0; \
+ (hnd).p = val; \
+ } while (0)
+#elif defined(__x86_64__)
+#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
+#endif
+#else
+#if defined(__i386__)
+#define set_xen_guest_handle(hnd, val) \
+ do { \
+ if (sizeof(hnd) == 8) \
+ *(uint64_t *)&(hnd) = 0; \
+ (hnd) = val; \
+ } while (0)
+#elif defined(__x86_64__)
+#define set_xen_guest_handle(hnd, val) do { (hnd) = val; } while (0)
+#endif
+#endif
+
+#ifndef __ASSEMBLY__
+/* Guest handles for primitive C types. */
+__DEFINE_GUEST_HANDLE(uchar, unsigned char);
+__DEFINE_GUEST_HANDLE(uint, unsigned int);
+__DEFINE_GUEST_HANDLE(ulong, unsigned long);
+DEFINE_GUEST_HANDLE(char);
+DEFINE_GUEST_HANDLE(int);
+DEFINE_GUEST_HANDLE(long);
+DEFINE_GUEST_HANDLE(void);
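+
+/*
+ * Illustrative use (hypothetical variables): a guest handle is just a
+ * typed pointer wrapper, so hypercall arguments are filled in as
+ *
+ *	GUEST_HANDLE(ulong) extent_start;
+ *	unsigned long frame;
+ *	set_xen_guest_handle(extent_start, &frame);
+ */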
+#endif
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#endif
+
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
+/*
+ * SEGMENT DESCRIPTOR TABLES
+ */
+/*
+ * A number of GDT entries are reserved by Xen. These are not situated at the
+ * start of the GDT because some stupid OSes export hard-coded selector values
+ * in their ABI. These hard-coded values are always near the start of the GDT,
+ * so Xen places itself out of the way, at the far end of the GDT.
+ */
+#define FIRST_RESERVED_GDT_PAGE 14
+#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
+#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
+
+/*
+ * Send an array of these to HYPERVISOR_set_trap_table()
+ * The privilege level specifies which modes may enter a trap via a software
+ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
+ * privilege levels as follows:
+ * Level == 0: No one may enter
+ * Level == 1: Kernel may enter
+ * Level == 2: Kernel may enter
+ * Level == 3: Everyone may enter
+ */
+#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
+#define TI_GET_IF(_ti) ((_ti)->flags & 4)
+#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
+#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
+
+#ifndef __ASSEMBLY__
+struct trap_info {
+ uint8_t vector; /* exception vector */
+ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
+ uint16_t cs; /* code selector */
+ unsigned long address; /* code offset */
+};
+DEFINE_GUEST_HANDLE_STRUCT(trap_info);
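+
+/*
+ * A minimal sketch (hypothetical vector and handler): an entry letting
+ * the ring-1 guest kernel, but not userspace, enter via a software
+ * interrupt would be built roughly as
+ *
+ *	struct trap_info ti = {
+ *		.vector  = 0x81,
+ *		.cs      = FLAT_KERNEL_CS,
+ *		.address = (unsigned long)my_handler,
+ *	};
+ *	TI_SET_DPL(&ti, 1);
+ */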
+
+struct arch_shared_info {
+ unsigned long max_pfn; /* max pfn that appears in table */
+ /* Frame containing list of mfns containing list of mfns containing p2m. */
+ unsigned long pfn_to_mfn_frame_list_list;
+ unsigned long nmi_reason;
+};
+#endif /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_X86_32
+#include "interface_32.h"
+#else
+#include "interface_64.h"
+#endif
+
+#ifndef __ASSEMBLY__
+/*
+ * The following is all CPU context. Note that the fpu_ctxt block is filled
+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ */
+struct vcpu_guest_context {
+ /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
+ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_HVM_GUEST (1<<1)
+#define VGCF_IN_KERNEL (1<<2)
+ unsigned long flags; /* VGCF_* flags */
+ struct cpu_user_regs user_regs; /* User-level CPU registers */
+ struct trap_info trap_ctxt[256]; /* Virtual IDT */
+ unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
+ unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
+ unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
+ /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
+ unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
+ unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
+#ifdef __i386__
+ unsigned long event_callback_cs; /* CS:EIP of event callback */
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
+ unsigned long failsafe_callback_eip;
+#else
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_eip;
+ unsigned long syscall_callback_eip;
+#endif
+ unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
+#ifdef __x86_64__
+ /* Segment base addresses. */
+ uint64_t fs_base;
+ uint64_t gs_base_kernel;
+ uint64_t gs_base_user;
+#endif
+};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * Prefix forces emulation of some non-trapping instructions.
+ * Currently only CPUID.
+ */
+#ifdef __ASSEMBLY__
+#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
+#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
+#else
+#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
+#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
+#endif
+
+#endif /* _ASM_X86_XEN_INTERFACE_H */
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
new file mode 100644
index 00000000000..42a7e004ae5
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 32-bit Xen.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef _ASM_X86_XEN_INTERFACE_32_H
+#define _ASM_X86_XEN_INTERFACE_32_H
+
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
+#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
+#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
+#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
+#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
+#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS FLAT_RING3_CS
+#define FLAT_USER_DS FLAT_RING3_DS
+#define FLAT_USER_SS FLAT_RING3_SS
+
+/* And the trap vector is... */
+#define TRAP_INSTR "int $0x82"
+
+/*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The
+ * machine->physical mapping table starts at this address, read-only.
+ */
+#define __HYPERVISOR_VIRT_START 0xF5800000
+
+#ifndef __ASSEMBLY__
+
+struct cpu_user_regs {
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+ uint32_t esi;
+ uint32_t edi;
+ uint32_t ebp;
+ uint32_t eax;
+ uint16_t error_code; /* private */
+ uint16_t entry_vector; /* private */
+ uint32_t eip;
+ uint16_t cs;
+ uint8_t saved_upcall_mask;
+ uint8_t _pad0;
+ uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
+ uint32_t esp;
+ uint16_t ss, _pad1;
+ uint16_t es, _pad2;
+ uint16_t ds, _pad3;
+ uint16_t fs, _pad4;
+ uint16_t gs, _pad5;
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
+};
+
+struct xen_callback {
+ unsigned long cs;
+ unsigned long eip;
+};
+typedef struct xen_callback xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __eip) \
+ ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
+#endif /* !__ASSEMBLY__ */
+
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+#endif /* _ASM_X86_XEN_INTERFACE_32_H */
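
The packing above is a plain 32-bit rotate: rotating the pfn left by 12 bits
produces the cr3 value, and rotating right by 12 recovers the pfn, which lets
a 32-bit register name page directories above 4GB. A self-contained round-trip
check (the frame number is an arbitrary illustrative value):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Local copies of the rotate-packing macros, for illustration. */
    #define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
    #define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))

    int main(void)
    {
        uint32_t pfn = 0x123456;          /* frame at physical 0x123456000, >4GB */
        uint32_t cr3 = xen_pfn_to_cr3(pfn);

        /* The low 20 bits of the pfn land in cr3[31:12]; the high 12 bits
         * are rotated into cr3[11:0], which a page-aligned base leaves free. */
        printf("pfn=%#x cr3=%#x\n", pfn, cr3);
        assert(xen_cr3_to_pfn(cr3) == pfn);
        return 0;
    }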
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
new file mode 100644
index 00000000000..100d2662b97
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -0,0 +1,159 @@
+#ifndef _ASM_X86_XEN_INTERFACE_64_H
+#define _ASM_X86_XEN_INTERFACE_64_H
+
+/*
+ * 64-bit segment selectors
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+
+#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
+#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
+#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_DS64 0x0000 /* NULL selector */
+#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
+
+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
+#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
+#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
+#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
+
+#define FLAT_USER_DS64 FLAT_RING3_DS64
+#define FLAT_USER_DS32 FLAT_RING3_DS32
+#define FLAT_USER_DS FLAT_USER_DS64
+#define FLAT_USER_CS64 FLAT_RING3_CS64
+#define FLAT_USER_CS32 FLAT_RING3_CS32
+#define FLAT_USER_CS FLAT_USER_CS64
+#define FLAT_USER_SS64 FLAT_RING3_SS64
+#define FLAT_USER_SS32 FLAT_RING3_SS32
+#define FLAT_USER_SS FLAT_USER_SS64
+
+#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
+#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
+#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
+#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
+#endif
+
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ * @which == SEGBASE_* ; @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS 0
+#define SEGBASE_GS_USER 1
+#define SEGBASE_GS_KERNEL 2
+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
+
+/*
+ * int HYPERVISOR_iret(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * The saved CS is mapped as follows:
+ * RING0 -> RING3 kernel mode.
+ * RING1 -> RING3 kernel mode.
+ * RING2 -> RING3 kernel mode.
+ * RING3 -> RING3 user mode.
+ * However RING0 indicates that the guest kernel should return to itself
+ * directly with
+ * orb $3,1*8(%rsp)
+ * iretq
+ * If flags contains VGCF_in_syscall:
+ * Restore RAX, RIP, RFLAGS, RSP.
+ * Discard R11, RCX, CS, SS.
+ * Otherwise:
+ * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
+#define _VGCF_in_syscall 8
+#define VGCF_in_syscall (1<<_VGCF_in_syscall)
+#define VGCF_IN_SYSCALL VGCF_in_syscall
+
+#ifndef __ASSEMBLY__
+
+struct iret_context {
+ /* Top of stack (%rsp at point of hypercall). */
+ uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+ /* Bottom of iret stack frame. */
+};
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
+#define __DECL_REG(name) union { \
+ uint64_t r ## name, e ## name; \
+ uint32_t _e ## name; \
+}
+#else
+/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
+#define __DECL_REG(name) uint64_t r ## name
+#endif
+
+struct cpu_user_regs {
+ uint64_t r15;
+ uint64_t r14;
+ uint64_t r13;
+ uint64_t r12;
+ __DECL_REG(bp);
+ __DECL_REG(bx);
+ uint64_t r11;
+ uint64_t r10;
+ uint64_t r9;
+ uint64_t r8;
+ __DECL_REG(ax);
+ __DECL_REG(cx);
+ __DECL_REG(dx);
+ __DECL_REG(si);
+ __DECL_REG(di);
+ uint32_t error_code; /* private */
+ uint32_t entry_vector; /* private */
+ __DECL_REG(ip);
+ uint16_t cs, _pad0[1];
+ uint8_t saved_upcall_mask;
+ uint8_t _pad1[3];
+ __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
+ __DECL_REG(sp);
+ uint16_t ss, _pad2[3];
+ uint16_t es, _pad3[3];
+ uint16_t ds, _pad4[3];
+ uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
+	uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_user. */
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+#undef __DECL_REG
+
+#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
+#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+
+typedef unsigned long xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __rip) \
+ ((unsigned long)(__rip))
+
+#endif /* !__ASSEMBLY__ */
+
+
+#endif /* _ASM_X86_XEN_INTERFACE_64_H */
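
With gcc, __DECL_REG gives each register slot both its 64-bit and 32-bit
spelling over the same storage; note that the e-prefixed name is itself
declared uint64_t, and only the underscore-prefixed name is a true 32-bit
view. A standalone sketch of the aliasing (struct name and values are
invented; the low-half result assumes little-endian, which holds on x86):

    #include <stdint.h>
    #include <stdio.h>

    #define __DECL_REG(name) union { \
            uint64_t r ## name, e ## name; \
            uint32_t _e ## name; \
    }

    struct demo_regs {
            __DECL_REG(ax);
    };

    int main(void)
    {
            struct demo_regs regs = { .rax = 0x1122334455667788ULL };

            /* rax and eax view the full 64 bits; _eax only the low 32. */
            printf("rax=%#llx eax=%#llx _eax=%#x\n",
                   (unsigned long long)regs.rax,
                   (unsigned long long)regs.eax,
                   (unsigned)regs._eax);
            return 0;
    }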
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
new file mode 100644
index 00000000000..bc628998a1b
--- /dev/null
+++ b/arch/x86/include/asm/xen/page.h
@@ -0,0 +1,165 @@
+#ifndef _ASM_X86_XEN_PAGE_H
+#define _ASM_X86_XEN_PAGE_H
+
+#include <linux/pfn.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+#include <xen/features.h>
+
+/* Xen machine address */
+typedef struct xmaddr {
+ phys_addr_t maddr;
+} xmaddr_t;
+
+/* Xen pseudo-physical address */
+typedef struct xpaddr {
+ phys_addr_t paddr;
+} xpaddr_t;
+
+#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
+#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+#define INVALID_P2M_ENTRY (~0UL)
+#define FOREIGN_FRAME_BIT (1UL<<31)
+#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
+
+/* Maximum amount of memory we can handle in a domain in pages */
+#define MAX_DOMAIN_PAGES \
+ ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+
+
+extern unsigned long get_phys_to_machine(unsigned long pfn);
+extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return pfn;
+
+ return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 1;
+
+ return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY;
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return mfn;
+
+#if 0
+ if (unlikely((mfn >> machine_to_phys_order) != 0))
+ return max_mapnr;
+#endif
+
+ pfn = 0;
+ /*
+ * The array access can fail (e.g., device space beyond end of RAM).
+ * In such cases it doesn't matter what we return (we return garbage),
+ * but we must handle the fault without crashing!
+ */
+ __get_user(pfn, &machine_to_phys_mapping[mfn]);
+
+ return pfn;
+}
+
+static inline xmaddr_t phys_to_machine(xpaddr_t phys)
+{
+ unsigned offset = phys.paddr & ~PAGE_MASK;
+ return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
+}
+
+static inline xpaddr_t machine_to_phys(xmaddr_t machine)
+{
+ unsigned offset = machine.maddr & ~PAGE_MASK;
+ return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ * 1. If the MFN is an I/O page then Xen will set the m2p entry
+ * to be outside our maximum possible pseudophys range.
+ * 2. If the MFN belongs to a different domain then we will certainly
+ * not have MFN in our p2m table. Conversely, if the page is ours,
+ * then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ * require. In all the cases we care about, the FOREIGN_FRAME bit is
+ * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+{
+ extern unsigned long max_mapnr;
+ unsigned long pfn = mfn_to_pfn(mfn);
+ if ((pfn < max_mapnr)
+ && !xen_feature(XENFEAT_auto_translated_physmap)
+ && (get_phys_to_machine(pfn) != mfn))
+ return max_mapnr; /* force !pfn_valid() */
+ /* XXX fixme; not true with sparsemem */
+ return pfn;
+}
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
+#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v))))
+#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+
+static inline unsigned long pte_mfn(pte_t pte)
+{
+ return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+ pte_t pte;
+
+ pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
+ (pgprot_val(pgprot) & __supported_pte_mask);
+
+ return pte;
+}
+
+static inline pteval_t pte_val_ma(pte_t pte)
+{
+ return pte.pte;
+}
+
+static inline pte_t __pte_ma(pteval_t x)
+{
+ return (pte_t) { .pte = x };
+}
+
+#define pmd_val_ma(v) ((v).pmd)
+#ifdef __PAGETABLE_PUD_FOLDED
+#define pud_val_ma(v) ((v).pgd.pgd)
+#else
+#define pud_val_ma(v) ((v).pud)
+#endif
+#define __pmd_ma(x) ((pmd_t) { (x) } )
+
+#define pgd_val_ma(x) ((x).pgd)
+
+
+xmaddr_t arbitrary_virt_to_machine(void *address);
+void make_lowmem_page_readonly(void *vaddr);
+void make_lowmem_page_readwrite(void *vaddr);
+
+#endif /* _ASM_X86_XEN_PAGE_H */
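
The ownership test described in the comment block above reduces to checking
that the round trip p2m(m2p(MFN)) == MFN closes. A toy user-space model of
that check, with invented table sizes and contents:

    #include <stdio.h>

    #define NPAGES      8
    #define INVALID_P2M (~0UL)

    /* Our domain's pfn -> mfn table (contents invented). */
    static unsigned long p2m[NPAGES] = { 3, 7, 4, INVALID_P2M, 1, 0, 6, 5 };

    static int mfn_is_ours(unsigned long mfn, const unsigned long *m2p)
    {
        unsigned long pfn = m2p[mfn];  /* the m2p entry may be anything... */

        /* ...so only trust it if the round trip closes. */
        return pfn < NPAGES && p2m[pfn] == mfn;
    }

    int main(void)
    {
        /* Global mfn -> pfn table as the hypervisor might expose it; mfn 2
         * belongs to another domain, so it does not round-trip through p2m. */
        unsigned long m2p[NPAGES] = { 5, 4, 0, 0, 2, 7, 6, 1 };

        printf("mfn 3 ours? %d\n", mfn_is_ours(3, m2p));  /* 1: p2m[0] == 3 */
        printf("mfn 2 ours? %d\n", mfn_is_ours(2, m2p));  /* 0: p2m[0] != 2 */
        return 0;
    }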
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
new file mode 100644
index 00000000000..11b3bb86e17
--- /dev/null
+++ b/arch/x86/include/asm/xor.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "xor_32.h"
+#else
+# include "xor_64.h"
+#endif
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
new file mode 100644
index 00000000000..133b40a0f49
--- /dev/null
+++ b/arch/x86/include/asm/xor_32.h
@@ -0,0 +1,888 @@
+#ifndef _ASM_X86_XOR_32_H
+#define _ASM_X86_XOR_32_H
+
+/*
+ * Optimized RAID-5 checksumming functions for MMX and SSE.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * High-speed RAID5 checksumming functions utilizing MMX instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
+#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
+#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
+#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
+#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
+#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
+
+#include <asm/i387.h>
+
+static void
+xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ ST(i, 0) \
+ XO1(i+1, 1) \
+ ST(i+1, 1) \
+ XO1(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ ST(i, 0) \
+ XO2(i + 1, 1) \
+ ST(i + 1, 1) \
+ XO2(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ ST(i, 0) \
+ XO3(i + 1, 1) \
+ ST(i + 1, 1) \
+ XO3(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " addl $128, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+
+static void
+xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ /* Make sure GCC forgets anything it knows about p4 or p5,
+ such that it won't pass to the asm volatile below a
+ register that is shared with any other variable. That's
+ because we modify p4 and p5 there, but we can't mark them
+ as read/write, otherwise we'd overflow the 10-asm-operands
+ limit of GCC < 3.1. */
+ asm("" : "+r" (p4), "+r" (p5));
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i, 0) \
+ ST(i, 0) \
+ XO4(i + 1, 1) \
+ ST(i + 1, 1) \
+ XO4(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " addl $128, %4 ;\n"
+ " addl $128, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ : "r" (p4), "r" (p5)
+ : "memory");
+
+ /* p4 and p5 were modified, and now the variables are dead.
+ Clobber them just to be sure nobody does something stupid
+ like assuming they have some legal value. */
+ asm("" : "=r" (p4), "=r" (p5));
+
+ kernel_fpu_end();
+}
+
+#undef LD
+#undef XO1
+#undef XO2
+#undef XO3
+#undef XO4
+#undef ST
+#undef BLOCK
+
+static void
+xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+ " .align 32 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ :
+ : "memory" );
+
+ kernel_fpu_end();
+}
+
+static void
+xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor (%4), %%mm0 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " pxor 8(%4), %%mm1 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " pxor 16(%4), %%mm2 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 24(%4), %%mm3 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " pxor 32(%4), %%mm4 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " pxor 40(%4), %%mm5 ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%4), %%mm6 ;\n"
+ " pxor 56(%4), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " addl $64, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ /* Make sure GCC forgets anything it knows about p4 or p5,
+ such that it won't pass to the asm volatile below a
+ register that is shared with any other variable. That's
+ because we modify p4 and p5 there, but we can't mark them
+ as read/write, otherwise we'd overflow the 10-asm-operands
+ limit of GCC < 3.1. */
+ asm("" : "+r" (p4), "+r" (p5));
+
+ asm volatile(
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " pxor (%4), %%mm0 ;\n"
+ " pxor 8(%4), %%mm1 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " pxor (%5), %%mm0 ;\n"
+ " pxor 8(%5), %%mm1 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 16(%4), %%mm2 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " pxor 16(%5), %%mm2 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 24(%4), %%mm3 ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 24(%5), %%mm3 ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%4), %%mm4 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " pxor 32(%5), %%mm4 ;\n"
+ " pxor 40(%4), %%mm5 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " pxor 40(%5), %%mm5 ;\n"
+ " pxor 48(%4), %%mm6 ;\n"
+ " pxor 56(%4), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%5), %%mm6 ;\n"
+ " pxor 56(%5), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " addl $64, %4 ;\n"
+ " addl $64, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ : "r" (p4), "r" (p5)
+ : "memory");
+
+ /* p4 and p5 were modified, and now the variables are dead.
+ Clobber them just to be sure nobody does something stupid
+ like assuming they have some legal value. */
+ asm("" : "=r" (p4), "=r" (p5));
+
+ kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_pII_mmx = {
+ .name = "pII_mmx",
+ .do_2 = xor_pII_mmx_2,
+ .do_3 = xor_pII_mmx_3,
+ .do_4 = xor_pII_mmx_4,
+ .do_5 = xor_pII_mmx_5,
+};
+
+static struct xor_block_template xor_block_p5_mmx = {
+ .name = "p5_mmx",
+ .do_2 = xor_p5_mmx_2,
+ .do_3 = xor_p5_mmx_3,
+ .do_4 = xor_p5_mmx_4,
+ .do_5 = xor_p5_mmx_5,
+};
+
+/*
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+#define XMMS_SAVE \
+do { \
+ preempt_disable(); \
+ cr0 = read_cr0(); \
+ clts(); \
+ asm volatile( \
+ "movups %%xmm0,(%0) ;\n\t" \
+ "movups %%xmm1,0x10(%0) ;\n\t" \
+ "movups %%xmm2,0x20(%0) ;\n\t" \
+ "movups %%xmm3,0x30(%0) ;\n\t" \
+ : \
+ : "r" (xmm_save) \
+ : "memory"); \
+} while (0)
+
+#define XMMS_RESTORE \
+do { \
+ asm volatile( \
+ "sfence ;\n\t" \
+ "movups (%0),%%xmm0 ;\n\t" \
+ "movups 0x10(%0),%%xmm1 ;\n\t" \
+ "movups 0x20(%0),%%xmm2 ;\n\t" \
+ "movups 0x30(%0),%%xmm3 ;\n\t" \
+ : \
+ : "r" (xmm_save) \
+ : "memory"); \
+ write_cr0(cr0); \
+ preempt_enable(); \
+} while (0)
+
+#define ALIGN16 __attribute__((aligned(16)))
+
+#define OFFS(x) "16*("#x")"
+#define PF_OFFS(x) "256+16*("#x")"
+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
+#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
+#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
+
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4] ALIGN16;
+ int cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2)
+ :
+ : "memory");
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4] ALIGN16;
+ int cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i,0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i,0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i,0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i,0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " addl $256, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r"(p2), "+r"(p3)
+ :
+ : "memory" );
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4] ALIGN16;
+ int cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i,0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i,0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO2(i,0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i,0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i,0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " addl $256, %3 ;\n"
+ " addl $256, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
+ :
+ : "memory" );
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4] ALIGN16;
+ int cr0;
+
+ XMMS_SAVE;
+
+ /* Make sure GCC forgets anything it knows about p4 or p5,
+ such that it won't pass to the asm volatile below a
+ register that is shared with any other variable. That's
+ because we modify p4 and p5 there, but we can't mark them
+ as read/write, otherwise we'd overflow the 10-asm-operands
+ limit of GCC < 3.1. */
+ asm("" : "+r" (p4), "+r" (p5));
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i,0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i,0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ XO2(i,0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ PF4(i) \
+ PF4(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO3(i,0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i,0) \
+ XO4(i + 1, 1) \
+ XO4(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i,0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " addl $256, %3 ;\n"
+ " addl $256, %4 ;\n"
+ " addl $256, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ : "r" (p4), "r" (p5)
+ : "memory");
+
+ /* p4 and p5 were modified, and now the variables are dead.
+ Clobber them just to be sure nobody does something stupid
+ like assuming they have some legal value. */
+ asm("" : "=r" (p4), "=r" (p5));
+
+ XMMS_RESTORE;
+}
+
+static struct xor_block_template xor_block_pIII_sse = {
+ .name = "pIII_sse",
+ .do_2 = xor_sse_2,
+ .do_3 = xor_sse_3,
+ .do_4 = xor_sse_4,
+ .do_5 = xor_sse_5,
+};
+
+/* Also try the generic routines. */
+#include <asm-generic/xor.h>
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+do { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
+ if (cpu_has_xmm) \
+ xor_speed(&xor_block_pIII_sse); \
+ if (cpu_has_mmx) { \
+ xor_speed(&xor_block_pII_mmx); \
+ xor_speed(&xor_block_p5_mmx); \
+ } \
+} while (0)
+
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load only into the L1 cache, depending on how
+   the CPU deals with a load to a line that is being prefetched. */
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+
+#endif /* _ASM_X86_XOR_32_H */
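
All of the do_2 routines above compute the same thing - p1 is XORed in place
with p2 - and differ only in the register file used and in how many bytes
each loop iteration covers (128 for pII_mmx, 64 for p5_mmx, 256 for the SSE
version, matching the bytes >> 7 / >> 6 / >> 8 line counts). A portable
reference version, shown here only as a baseline for comparison:

    #include <assert.h>
    #include <stddef.h>

    /* Reference: p1 ^= p2 over `bytes` bytes, word at a time. The kernel
     * routines assume `bytes` is a multiple of their block size. */
    static void
    xor_ref_2(unsigned long bytes, unsigned long *p1, const unsigned long *p2)
    {
        size_t i, words = bytes / sizeof(unsigned long);

        for (i = 0; i < words; i++)
            p1[i] ^= p2[i];
    }

    int main(void)
    {
        unsigned long a[2] = { 0xffUL, 0x0fUL }, b[2] = { 0x0fUL, 0xffUL };

        xor_ref_2(sizeof(a), a, b);
        assert(a[0] == 0xf0UL && a[1] == 0xf0UL);
        return 0;
    }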
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
new file mode 100644
index 00000000000..1549b5e261f
--- /dev/null
+++ b/arch/x86/include/asm/xor_64.h
@@ -0,0 +1,361 @@
+#ifndef _ASM_X86_XOR_64_H
+#define _ASM_X86_XOR_64_H
+
+/*
+ * Optimized RAID-5 checksumming functions for MMX and SSE.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+/*
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+/*
+ * Based on
+ * High-speed RAID5 checksumming functions utilizing SSE instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+/*
+ * x86-64 changes / gcc fixes from Andi Kleen.
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ *
+ * This hasn't been optimized for the Hammer yet, but there are likely
+ * no advantages to be gained from x86-64 here anyway.
+ */
+
+typedef struct {
+ unsigned long a, b;
+} __attribute__((aligned(16))) xmm_store_t;
+
+/* Doesn't use gcc to save the XMM registers, because there is no easy way to
+ tell it to do a clts before the register saving. */
+#define XMMS_SAVE \
+do { \
+ preempt_disable(); \
+ asm volatile( \
+ "movq %%cr0,%0 ;\n\t" \
+ "clts ;\n\t" \
+ "movups %%xmm0,(%1) ;\n\t" \
+ "movups %%xmm1,0x10(%1) ;\n\t" \
+ "movups %%xmm2,0x20(%1) ;\n\t" \
+ "movups %%xmm3,0x30(%1) ;\n\t" \
+ : "=&r" (cr0) \
+ : "r" (xmm_save) \
+ : "memory"); \
+} while (0)
+
+#define XMMS_RESTORE \
+do { \
+ asm volatile( \
+ "sfence ;\n\t" \
+ "movups (%1),%%xmm0 ;\n\t" \
+ "movups 0x10(%1),%%xmm1 ;\n\t" \
+ "movups 0x20(%1),%%xmm2 ;\n\t" \
+ "movups 0x30(%1),%%xmm3 ;\n\t" \
+ "movq %0,%%cr0 ;\n\t" \
+ : \
+ : "r" (cr0), "r" (xmm_save) \
+ : "memory"); \
+ preempt_enable(); \
+} while (0)
+
+#define OFFS(x) "16*("#x")"
+#define PF_OFFS(x) "256+16*("#x")"
+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
+#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
+
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned int lines = bytes >> 8;
+ unsigned long cr0;
+ xmm_store_t xmm_save[4];
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
+ : [inc] "r" (256UL)
+ : "memory");
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned int lines = bytes >> 8;
+ xmm_store_t xmm_save[4];
+ unsigned long cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+ : [inc] "r" (256UL)
+ : "memory");
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned int lines = bytes >> 8;
+ xmm_store_t xmm_save[4];
+ unsigned long cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " addq %[inc], %[p4] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [cnt] "+c" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+ : [inc] "r" (256UL)
+ : "memory" );
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned int lines = bytes >> 8;
+ xmm_store_t xmm_save[4];
+ unsigned long cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ PF4(i) \
+ PF4(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i, 0) \
+ XO4(i + 1, 1) \
+ XO4(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " addq %[inc], %[p4] ;\n"
+ " addq %[inc], %[p5] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [cnt] "+c" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
+ [p5] "+r" (p5)
+ : [inc] "r" (256UL)
+ : "memory");
+
+ XMMS_RESTORE;
+}
+
+static struct xor_block_template xor_block_sse = {
+ .name = "generic_sse",
+ .do_2 = xor_sse_2,
+ .do_3 = xor_sse_3,
+ .do_4 = xor_sse_4,
+ .do_5 = xor_sse_5,
+};
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+do { \
+ xor_speed(&xor_block_sse); \
+} while (0)
+
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load only into the L1 cache, depending on how
+   the CPU deals with a load to a line that is being prefetched. */
+#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+
+#endif /* _ASM_X86_XOR_64_H */
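
The XOR_TRY_TEMPLATES / XOR_SELECT_TEMPLATE pair is consumed by the RAID xor
calibration code at boot, which benchmarks each candidate and keeps the
fastest unless the architecture overrides the choice. A simplified standalone
model of that mechanism (it loosely mirrors crypto/xor.c; the timing scheme
and names here are invented):

    #include <stdio.h>
    #include <time.h>

    struct xor_block_template {
        const char *name;
        void (*do_2)(unsigned long, unsigned long *, unsigned long *);
    };

    #define BENCH_BYTES 4096UL

    static void xor_c_2(unsigned long bytes, unsigned long *p1,
                        unsigned long *p2)
    {
        unsigned long i;

        for (i = 0; i < bytes / sizeof(*p1); i++)
            p1[i] ^= p2[i];
    }

    static struct xor_block_template tmpl_c = { "c_loop", xor_c_2 };

    /* Stand-in for xor_speed(): time many passes over a scratch buffer. */
    static double bench(struct xor_block_template *t,
                        unsigned long *b1, unsigned long *b2)
    {
        clock_t start = clock();
        int i;

        for (i = 0; i < 10000; i++)
            t->do_2(BENCH_BYTES, b1, b2);
        return (double)(clock() - start) / CLOCKS_PER_SEC;
    }

    int main(void)
    {
        static unsigned long b1[BENCH_BYTES / sizeof(unsigned long)];
        static unsigned long b2[BENCH_BYTES / sizeof(unsigned long)];

        printf("%s: %.4fs\n", tmpl_c.name, bench(&tmpl_c, b1, b2));
        return 0;
    }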
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
new file mode 100644
index 00000000000..08e9a1ac07a
--- /dev/null
+++ b/arch/x86/include/asm/xsave.h
@@ -0,0 +1,118 @@
+#ifndef __ASM_X86_XSAVE_H
+#define __ASM_X86_XSAVE_H
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+
+#define XSTATE_FP 0x1
+#define XSTATE_SSE 0x2
+
+#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
+
+#define FXSAVE_SIZE 512
+
+/*
+ * These are the features that the OS can handle currently.
+ */
+#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE)
+
+#ifdef CONFIG_X86_64
+#define REX_PREFIX "0x48, "
+#else
+#define REX_PREFIX
+#endif
+
+extern unsigned int xstate_size;
+extern u64 pcntxt_mask;
+extern struct xsave_struct *init_xstate_buf;
+
+extern void xsave_cntxt_init(void);
+extern void xsave_init(void);
+extern int init_fpu(struct task_struct *child);
+extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
+ void __user *fpstate,
+ struct _fpx_sw_bytes *sw);
+
+static inline int xrstor_checking(struct xsave_struct *fx)
+{
+ int err;
+
+ asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err)
+ : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
+ : "memory");
+
+ return err;
+}
+
+static inline int xsave_user(struct xsave_struct __user *buf)
+{
+ int err;
+ __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ _ASM_ALIGN "\n"
+ _ASM_PTR "1b,3b\n"
+ ".previous"
+ : [err] "=r" (err)
+ : "D" (buf), "a" (-1), "d" (-1), "0" (0)
+ : "memory");
+ if (unlikely(err) && __clear_user(buf, xstate_size))
+ err = -EFAULT;
+ /* No need to clear here because the caller clears USED_MATH */
+ return err;
+}
+
+static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
+{
+ int err;
+ struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ _ASM_ALIGN "\n"
+ _ASM_PTR "1b,3b\n"
+ ".previous"
+ : [err] "=r" (err)
+ : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
+ : "memory"); /* memory required? */
+ return err;
+}
+
+static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+}
+
+static inline void xsave(struct task_struct *tsk)
+{
+	/* Force the compiler to select an addressing mode that doesn't
+	   require extended registers: the hand-assembled opcode below has
+	   a fixed ModRM byte and cannot encode them. */
+ __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
+ : : "D" (&(tsk->thread.xstate->xsave)),
+ "a" (-1), "d"(-1) : "memory");
+}
+#endif
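
The hand-assembled opcodes above decode as 0x0f 0xae with ModRM /4 on (%rdi)
for xsave and /5 for xrstor; the optional 0x48 REX.W prefix selects the
64-bit form, and the "D" constraint pins the buffer pointer in %rdi/%edi to
match the fixed ModRM byte. The lmask/hmask split exists because the
instructions take the requested-feature bitmap in edx:eax. A trivial check of
the split and rejoin, reusing the header's XSTATE_* bits:

    #include <assert.h>
    #include <stdint.h>

    #define XSTATE_FP  0x1
    #define XSTATE_SSE 0x2

    int main(void)
    {
        uint64_t mask = XSTATE_FP | XSTATE_SSE; /* i.e. XCNTXT_MASK */
        uint32_t lmask = (uint32_t)mask;        /* passed in %eax */
        uint32_t hmask = (uint32_t)(mask >> 32);/* passed in %edx */

        assert((((uint64_t)hmask << 32) | lmask) == mask);
        return 0;
    }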
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3db651fc8ec..d7e5a58ee22 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_rtc.o = -pg
-CFLAGS_REMOVE_paravirt.o = -pg
+CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
endif
#
@@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp)
CFLAGS_tsc.o := $(nostackp)
obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
-obj-y += traps_$(BITS).o irq_$(BITS).o
+obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time_$(BITS).o ioport.o ldt.o
obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
@@ -38,7 +38,7 @@ obj-y += tsc.o io_delay.o rtc.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
-obj-y += i387.o
+obj-y += i387.o xsave.o
obj-y += ptrace.o
obj-y += ds.o
obj-$(CONFIG_X86_32) += tls.o
@@ -51,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
-obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_PCI) += early-quirks.o
apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
@@ -61,14 +60,15 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o
obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o
-obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
+obj-$(CONFIG_X86_ES7000) += es7000_32.o
obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
obj-y += vsmp_64.o
obj-$(CONFIG_KPROBES) += kprobes.o
@@ -89,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
-obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
@@ -99,11 +99,18 @@ scx200-y += scx200_32.o
obj-$(CONFIG_OLPC) += olpc.o
+microcode-y := microcode_core.o
+microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
+microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
- obj-y += bios_uv.o
+ obj-y += bios_uv.o uv_irq.o uv_sysfs.o
+ obj-y += genx2apic_cluster.o
+ obj-y += genx2apic_phys.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index bfd10fd211c..8c1f76abae9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
#include <asm/proto.h>
-#include <asm/genapic.h>
#else /* X86 */
@@ -97,8 +96,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#warning ACPI uses CMPXCHG, i486 and later hardware
#endif
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
-
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
@@ -156,6 +153,9 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
}
#ifdef CONFIG_PCI_MMCONFIG
+
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
struct acpi_mcfg_allocation *pci_mmcfg_config;
int pci_mmcfg_config_num;
@@ -253,10 +253,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
return;
}
-#ifdef CONFIG_X86_32
if (boot_cpu_physical_apicid != -1U)
ver = apic_version[boot_cpu_physical_apicid];
-#endif
generic_processor_info(id, ver);
}
@@ -775,11 +773,9 @@ static void __init acpi_register_lapic_address(unsigned long address)
set_fixmap_nocache(FIX_APIC_BASE, address);
if (boot_cpu_physical_apicid == -1U) {
- boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
-#ifdef CONFIG_X86_32
+ boot_cpu_physical_apicid = read_apic_id();
apic_version[boot_cpu_physical_apicid] =
GET_APIC_VERSION(apic_read(APIC_LVR));
-#endif
}
}
@@ -1141,7 +1137,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
return gsi;
}
if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
+ pr_debug("Pin %d-%d already programmed\n",
mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
#ifdef CONFIG_X86_32
return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
@@ -1261,7 +1257,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
count =
acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
- NR_IRQ_VECTORS);
+ nr_irqs);
if (count < 0) {
printk(KERN_ERR PREFIX
"Error parsing interrupt source overrides entry\n");
@@ -1281,7 +1277,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
count =
acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
- NR_IRQ_VECTORS);
+ nr_irqs);
if (count < 0) {
printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
@@ -1351,7 +1347,9 @@ static void __init acpi_process_madt(void)
acpi_ioapic = 1;
smp_found_config = 1;
+#ifdef CONFIG_X86_32
setup_apic_routing();
+#endif
}
}
if (error == -EINVAL) {
@@ -1421,8 +1419,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
{
- pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
- acpi_skip_timer_override = 1;
+ /*
+ * The ati_ixp4x0_rev() early PCI quirk should have set
+ * the acpi_skip_timer_override flag already:
+ */
+ if (!acpi_skip_timer_override) {
+ WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
+ pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+ d->ident);
+ acpi_skip_timer_override = 1;
+ }
return 0;
}
@@ -1593,6 +1599,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
},
},
+ {}
+};
+
+/* second table for DMI checks that should run after early-quirks */
+static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
/*
* HP laptops which use a DSDT reporting as HP/SB400/10000,
* which includes some code which overrides all temperature
@@ -1605,6 +1616,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
*/
{
.callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP nx6115 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
.ident = "HP NX6125 laptop",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1619,6 +1638,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
},
},
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP 6715b laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
+ },
+ },
{}
};
@@ -1705,6 +1732,9 @@ int __init early_acpi_boot_init(void)
int __init acpi_boot_init(void)
{
+	/* these DMI checks are executed after the early quirks have run */
+ dmi_check_system(acpi_dmi_table_late);
+
/*
* If acpi_disabled, bail out
* One exception: acpi=ht continues far enough to enumerate LAPICs
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 426e5d91b63..806b4e9051b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -10,6 +10,7 @@
#include <linux/dmi.h>
#include <linux/cpumask.h>
#include <asm/segment.h>
+#include <asm/desc.h>
#include "realmode/wakeup.h"
#include "sleep.h"
@@ -21,7 +22,7 @@ unsigned long acpi_realmode_flags;
static unsigned long acpi_realmode;
#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
-static char temp_stack[10240];
+static char temp_stack[4096];
#endif
/**
@@ -97,7 +98,9 @@ int acpi_save_state_mem(void)
#else /* CONFIG_64BIT */
header->trampoline_segment = setup_trampoline() >> 4;
#ifdef CONFIG_SMP
- stack_start.sp = temp_stack + 4096;
+ stack_start.sp = temp_stack + sizeof(temp_stack);
+ early_gdt_descr.address =
+ (unsigned long)get_cpu_gdt_table(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65a0c1b4869..a84ac7b570e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -231,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
continue;
if (*ptr > text_end)
continue;
- text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
+ /* turn DS segment override prefix into lock prefix */
+ text_poke(*ptr, ((unsigned char []){0xf0}), 1);
};
}
static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
u8 **ptr;
- char insn[1];
if (noreplace_smp)
return;
- add_nops(insn, 1);
for (ptr = start; ptr < end; ptr++) {
if (*ptr < text)
continue;
if (*ptr > text_end)
continue;
- text_poke(*ptr, insn, 1);
+ /* turn lock prefix into DS segment override prefix */
+ text_poke(*ptr, ((unsigned char []){0x3E}), 1);
};
}
@@ -444,7 +444,7 @@ void __init alternative_instructions(void)
_text, _etext);
/* Only switch to UP mode if we don't immediately boot others */
- if (num_possible_cpus() == 1 || setup_max_cpus <= 1)
+ if (num_present_cpus() == 1 || setup_max_cpus <= 1)
alternatives_smp_switch(0);
}
#endif
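
The rewritten hunks above rely on 0xf0 (the lock prefix) and 0x3e (the DS
segment override) being single-byte prefixes of equal length, so an annotated
instruction can be switched between SMP and UP behaviour by patching one byte
in place. A user-space illustration of the byte-level idea (the instruction
bytes encode a 32-bit "lock incl (%eax)" and are treated purely as data):

    #include <assert.h>
    #include <stdint.h>

    /* Model of alternatives_smp_lock()/unlock(): flip the first byte of
     * the annotated instruction between lock and DS-override prefixes. */
    static void smp_lock(uint8_t *insn)   { insn[0] = 0xf0; }
    static void smp_unlock(uint8_t *insn) { insn[0] = 0x3e; }

    int main(void)
    {
        uint8_t insn[3] = { 0xf0, 0xff, 0x00 }; /* lock incl (%eax) */

        smp_unlock(insn);  /* now a harmless ds-prefixed incl (%eax) */
        assert(insn[0] == 0x3e);
        smp_lock(insn);
        assert(insn[0] == 0xf0);
        return 0;
    }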
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 69b4d060b21..a8fd9ebdc8e 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -33,6 +33,10 @@
static DEFINE_RWLOCK(amd_iommu_devtable_lock);
+/* A list of preallocated protection domains */
+static LIST_HEAD(iommu_pd_list);
+static DEFINE_SPINLOCK(iommu_pd_list_lock);
+
/*
 * general struct to manage commands sent to an IOMMU
*/
@@ -51,6 +55,102 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
/****************************************************************************
*
+ * Interrupt handling functions
+ *
+ ****************************************************************************/
+
+static void iommu_print_event(void *__evt)
+{
+ u32 *event = __evt;
+ int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
+ int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
+ int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
+ int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
+ u64 address = (u64)(((u64)event[3]) << 32) | event[2];
+
+ printk(KERN_ERR "AMD IOMMU: Event logged [");
+
+ switch (type) {
+ case EVENT_TYPE_ILL_DEV:
+ printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
+ "address=0x%016llx flags=0x%04x]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ address, flags);
+ break;
+ case EVENT_TYPE_IO_FAULT:
+ printk("IO_PAGE_FAULT device=%02x:%02x.%x "
+ "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ domid, address, flags);
+ break;
+ case EVENT_TYPE_DEV_TAB_ERR:
+ printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+ "address=0x%016llx flags=0x%04x]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ address, flags);
+ break;
+ case EVENT_TYPE_PAGE_TAB_ERR:
+ printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+ "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ domid, address, flags);
+ break;
+ case EVENT_TYPE_ILL_CMD:
+ printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+ break;
+ case EVENT_TYPE_CMD_HARD_ERR:
+ printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
+ "flags=0x%04x]\n", address, flags);
+ break;
+ case EVENT_TYPE_IOTLB_INV_TO:
+ printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
+ "address=0x%016llx]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ address);
+ break;
+ case EVENT_TYPE_INV_DEV_REQ:
+ printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
+ "address=0x%016llx flags=0x%04x]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ address, flags);
+ break;
+ default:
+ printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
+ }
+}
+
+static void iommu_poll_events(struct amd_iommu *iommu)
+{
+ u32 head, tail;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
+ while (head != tail) {
+ iommu_print_event(iommu->evt_buf + head);
+ head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
+ }
+
+ writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+irqreturn_t amd_iommu_int_handler(int irq, void *data)
+{
+ struct amd_iommu *iommu;
+
+ list_for_each_entry(iommu, &amd_iommu_list, list)
+ iommu_poll_events(iommu);
+
+ return IRQ_HANDLED;
+}
+
+/****************************************************************************
+ *
* IOMMU command queuing functions
*
****************************************************************************/
@@ -101,10 +201,10 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
*/
static int iommu_completion_wait(struct amd_iommu *iommu)
{
- int ret, ready = 0;
+ int ret = 0, ready = 0;
unsigned status = 0;
struct iommu_cmd cmd;
- unsigned long i = 0;
+ unsigned long flags, i = 0;
memset(&cmd, 0, sizeof(cmd));
cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
@@ -112,10 +212,12 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
iommu->need_sync = 0;
- ret = iommu_queue_command(iommu, &cmd);
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ ret = __iommu_queue_command(iommu, &cmd);
if (ret)
- return ret;
+ goto out;
while (!ready && (i < EXIT_LOOP_COUNT)) {
++i;
@@ -130,6 +232,8 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
+out:
+ spin_unlock_irqrestore(&iommu->lock, flags);
return 0;
}
@@ -140,6 +244,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
{
struct iommu_cmd cmd;
+ int ret;
BUG_ON(iommu == NULL);
@@ -147,9 +252,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
cmd.data[0] = devid;
+ ret = iommu_queue_command(iommu, &cmd);
+
iommu->need_sync = 1;
- return iommu_queue_command(iommu, &cmd);
+ return ret;
}
/*
@@ -159,6 +266,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
u64 address, u16 domid, int pde, int s)
{
struct iommu_cmd cmd;
+ int ret;
memset(&cmd, 0, sizeof(cmd));
address &= PAGE_MASK;
@@ -171,9 +279,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+ ret = iommu_queue_command(iommu, &cmd);
+
iommu->need_sync = 1;
- return iommu_queue_command(iommu, &cmd);
+ return ret;
}
/*
@@ -185,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
u64 address, size_t size)
{
int s = 0;
- unsigned pages = iommu_num_pages(address, size);
+ unsigned pages = iommu_num_pages(address, size, PAGE_SIZE);
address &= PAGE_MASK;
@@ -203,6 +313,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
return 0;
}
+/* Flush the whole IO/TLB for a given protection domain */
+static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
+{
+ u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+
+ iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
+}
+
/****************************************************************************
*
 * The functions below are used to create the page table mappings for
@@ -362,11 +480,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
* efficient allocator.
*
****************************************************************************/
-static unsigned long dma_mask_to_pages(unsigned long mask)
-{
- return (mask >> PAGE_SHIFT) +
- (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
-}
/*
* The address allocator core function.
@@ -375,25 +488,31 @@ static unsigned long dma_mask_to_pages(unsigned long mask)
*/
static unsigned long dma_ops_alloc_addresses(struct device *dev,
struct dma_ops_domain *dom,
- unsigned int pages)
+ unsigned int pages,
+ unsigned long align_mask,
+ u64 dma_mask)
{
- unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
+ unsigned long limit;
unsigned long address;
- unsigned long size = dom->aperture_size >> PAGE_SHIFT;
unsigned long boundary_size;
boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
- limit = limit < size ? limit : size;
+ limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
+ dma_mask >> PAGE_SHIFT);
- if (dom->next_bit >= limit)
+ if (dom->next_bit >= limit) {
dom->next_bit = 0;
+ dom->need_flush = true;
+ }
address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
- 0 , boundary_size, 0);
- if (address == -1)
+				0, boundary_size, align_mask);
+ if (address == -1) {
address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
- 0, boundary_size, 0);
+ 0, boundary_size, align_mask);
+ dom->need_flush = true;
+ }
if (likely(address != -1)) {
dom->next_bit = address + pages;
@@ -459,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
if (start_page + pages > last_page)
pages = last_page - start_page;
- set_bit_string(dom->bitmap, start_page, pages);
+ iommu_area_reserve(dom->bitmap, start_page, pages);
}
static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -553,12 +672,16 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
dma_dom->bitmap[0] = 1;
dma_dom->next_bit = 0;
+ dma_dom->need_flush = false;
+ dma_dom->target_dev = 0xffff;
+
	/* Initialize the exclusion range if necessary */
if (iommu->exclusion_start &&
iommu->exclusion_start < dma_dom->aperture_size) {
unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length);
+ iommu->exclusion_length,
+ PAGE_SIZE);
dma_ops_reserve_addresses(dma_dom, startpage, pages);
}
@@ -623,12 +746,13 @@ static void set_device_domain(struct amd_iommu *iommu,
u64 pte_root = virt_to_phys(domain->pt_root);
- pte_root |= (domain->mode & 0x07) << 9;
- pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
+ pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
+ << DEV_ENTRY_MODE_SHIFT;
+ pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- amd_iommu_dev_table[devid].data[0] = pte_root;
- amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
+ amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+ amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
amd_iommu_dev_table[devid].data[2] = domain->id;
amd_iommu_pd_table[devid] = domain;
@@ -646,6 +770,45 @@ static void set_device_domain(struct amd_iommu *iommu,
*****************************************************************************/
/*
+ * This function checks if the driver got a valid device from the caller to
+ * avoid dereferencing invalid pointers.
+ */
+static bool check_device(struct device *dev)
+{
+ if (!dev || !dev->dma_mask)
+ return false;
+
+ return true;
+}
+
+/*
+ * In this function the list of preallocated protection domains is traversed to
+ * find the domain for a specific device
+ */
+static struct dma_ops_domain *find_protection_domain(u16 devid)
+{
+ struct dma_ops_domain *entry, *ret = NULL;
+ unsigned long flags;
+
+ if (list_empty(&iommu_pd_list))
+ return NULL;
+
+ spin_lock_irqsave(&iommu_pd_list_lock, flags);
+
+ list_for_each_entry(entry, &iommu_pd_list, list) {
+ if (entry->target_dev == devid) {
+ ret = entry;
+ list_del(&ret->list);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+ return ret;
+}
+
+/*
* In the dma_ops path we only have the struct device. This function
* finds the corresponding IOMMU, the protection domain and the
* requestor id for a given device.
@@ -661,27 +824,30 @@ static int get_device_resources(struct device *dev,
struct pci_dev *pcidev;
u16 _bdf;
- BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
+ *iommu = NULL;
+ *domain = NULL;
+ *bdf = 0xffff;
+
+ if (dev->bus != &pci_bus_type)
+ return 0;
pcidev = to_pci_dev(dev);
_bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
/* device not translated by any IOMMU in the system? */
- if (_bdf > amd_iommu_last_bdf) {
- *iommu = NULL;
- *domain = NULL;
- *bdf = 0xffff;
+ if (_bdf > amd_iommu_last_bdf)
return 0;
- }
*bdf = amd_iommu_alias_table[_bdf];
*iommu = amd_iommu_rlookup_table[*bdf];
if (*iommu == NULL)
return 0;
- dma_dom = (*iommu)->default_dom;
*domain = domain_for_device(*bdf);
if (*domain == NULL) {
+ dma_dom = find_protection_domain(*bdf);
+ if (!dma_dom)
+ dma_dom = (*iommu)->default_dom;
*domain = &dma_dom->domain;
set_device_domain(*iommu, *domain, *bdf);
printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -760,17 +926,24 @@ static dma_addr_t __map_single(struct device *dev,
struct dma_ops_domain *dma_dom,
phys_addr_t paddr,
size_t size,
- int dir)
+ int dir,
+ bool align,
+ u64 dma_mask)
{
dma_addr_t offset = paddr & ~PAGE_MASK;
dma_addr_t address, start;
unsigned int pages;
+ unsigned long align_mask = 0;
int i;
- pages = iommu_num_pages(paddr, size);
+ pages = iommu_num_pages(paddr, size, PAGE_SIZE);
paddr &= PAGE_MASK;
- address = dma_ops_alloc_addresses(dev, dma_dom, pages);
+ if (align)
+ align_mask = (1UL << get_order(size)) - 1;
+
+ address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
+ dma_mask);
if (unlikely(address == bad_dma_address))
goto out;
@@ -782,6 +955,12 @@ static dma_addr_t __map_single(struct device *dev,
}
address += offset;
+ if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
+ iommu_flush_tlb(iommu, dma_dom->domain.id);
+ dma_dom->need_flush = false;
+ } else if (unlikely(iommu_has_npcache(iommu)))
+ iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
+
out:
return address;
}
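
The new align_mask handling deserves a note: for aligned mappings the mask is (1UL << get_order(size)) - 1, i.e. the allocation is forced onto the boundary of the power-of-two page count that covers size. A hedged userspace model (get_order() is reimplemented here for illustration; the kernel has its own):

	#include <stdio.h>

	#define PAGE_SIZE	4096UL

	/* smallest order with (PAGE_SIZE << order) >= size */
	static unsigned int get_order(unsigned long size)
	{
		unsigned int order = 0;

		while ((PAGE_SIZE << order) < size)
			order++;
		return order;
	}

	int main(void)
	{
		unsigned long size = 20000;	/* hypothetical mapping size */
		unsigned long align_mask = (1UL << get_order(size)) - 1;

		/* 20000 bytes -> 5 pages -> order 3 -> mask 0x7 (8-page alignment) */
		printf("order=%u align_mask=0x%lx\n",
		       get_order(size), align_mask);
		return 0;
	}
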
@@ -802,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu,
if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
return;
- pages = iommu_num_pages(dma_addr, size);
+ pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
dma_addr &= PAGE_MASK;
start = dma_addr;
@@ -812,6 +991,9 @@ static void __unmap_single(struct amd_iommu *iommu,
}
dma_ops_free_addresses(dma_dom, dma_addr, pages);
+
+ if (amd_iommu_unmap_flush)
+ iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
}
/*
@@ -825,6 +1007,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
struct protection_domain *domain;
u16 devid;
dma_addr_t addr;
+ u64 dma_mask;
+
+ if (!check_device(dev))
+ return bad_dma_address;
+
+ dma_mask = *dev->dma_mask;
get_device_resources(dev, &iommu, &domain, &devid);
@@ -833,14 +1021,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
return (dma_addr_t)paddr;
spin_lock_irqsave(&domain->lock, flags);
- addr = __map_single(dev, iommu, domain->priv, paddr, size, dir);
+ addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
+ dma_mask);
if (addr == bad_dma_address)
goto out;
- if (iommu_has_npcache(iommu))
- iommu_flush_pages(iommu, domain->id, addr, size);
-
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
out:
@@ -860,7 +1046,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
struct protection_domain *domain;
u16 devid;
- if (!get_device_resources(dev, &iommu, &domain, &devid))
+ if (!check_device(dev) ||
+ !get_device_resources(dev, &iommu, &domain, &devid))
/* device not handled by any AMD IOMMU */
return;
@@ -868,9 +1055,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
__unmap_single(iommu, domain->priv, dma_addr, size, dir);
- iommu_flush_pages(iommu, domain->id, dma_addr, size);
-
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -909,6 +1094,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
struct scatterlist *s;
phys_addr_t paddr;
int mapped_elems = 0;
+ u64 dma_mask;
+
+ if (!check_device(dev))
+ return 0;
+
+ dma_mask = *dev->dma_mask;
get_device_resources(dev, &iommu, &domain, &devid);
@@ -921,19 +1112,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
paddr = sg_phys(s);
s->dma_address = __map_single(dev, iommu, domain->priv,
- paddr, s->length, dir);
+ paddr, s->length, dir, false,
+ dma_mask);
if (s->dma_address) {
s->dma_length = s->length;
mapped_elems++;
} else
goto unmap;
- if (iommu_has_npcache(iommu))
- iommu_flush_pages(iommu, domain->id, s->dma_address,
- s->dma_length);
}
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
out:
@@ -967,7 +1156,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
u16 devid;
int i;
- if (!get_device_resources(dev, &iommu, &domain, &devid))
+ if (!check_device(dev) ||
+ !get_device_resources(dev, &iommu, &domain, &devid))
return;
spin_lock_irqsave(&domain->lock, flags);
@@ -975,12 +1165,10 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
for_each_sg(sglist, s, nelems, i) {
__unmap_single(iommu, domain->priv, s->dma_address,
s->dma_length, dir);
- iommu_flush_pages(iommu, domain->id, s->dma_address,
- s->dma_length);
s->dma_address = s->dma_length = 0;
}
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -998,25 +1186,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
struct protection_domain *domain;
u16 devid;
phys_addr_t paddr;
+ u64 dma_mask = dev->coherent_dma_mask;
+
+ if (!check_device(dev))
+ return NULL;
+ if (!get_device_resources(dev, &iommu, &domain, &devid))
+ flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+ flag |= __GFP_ZERO;
virt_addr = (void *)__get_free_pages(flag, get_order(size));
if (!virt_addr)
return 0;
- memset(virt_addr, 0, size);
paddr = virt_to_phys(virt_addr);
- get_device_resources(dev, &iommu, &domain, &devid);
-
if (!iommu || !domain) {
*dma_addr = (dma_addr_t)paddr;
return virt_addr;
}
+ if (!dma_mask)
+ dma_mask = *dev->dma_mask;
+
spin_lock_irqsave(&domain->lock, flags);
*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
- size, DMA_BIDIRECTIONAL);
+ size, DMA_BIDIRECTIONAL, true, dma_mask);
if (*dma_addr == bad_dma_address) {
free_pages((unsigned long)virt_addr, get_order(size));
@@ -1024,10 +1220,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
goto out;
}
- if (iommu_has_npcache(iommu))
- iommu_flush_pages(iommu, domain->id, *dma_addr, size);
-
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
out:
@@ -1038,8 +1231,6 @@ out:
/*
* The exported free_coherent function for dma_ops.
- * FIXME: fix the generic x86 DMA layer so that it actually calls that
- * function.
*/
static void free_coherent(struct device *dev, size_t size,
void *virt_addr, dma_addr_t dma_addr)
@@ -1049,6 +1240,9 @@ static void free_coherent(struct device *dev, size_t size,
struct protection_domain *domain;
u16 devid;
+ if (!check_device(dev))
+ return;
+
get_device_resources(dev, &iommu, &domain, &devid);
if (!iommu || !domain)
@@ -1057,9 +1251,8 @@ static void free_coherent(struct device *dev, size_t size,
spin_lock_irqsave(&domain->lock, flags);
__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
- iommu_flush_pages(iommu, domain->id, dma_addr, size);
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1069,6 +1262,30 @@ free_mem:
}
/*
+ * This function is called by the DMA layer to find out if we can handle a
+ * particular device. It is part of the dma_ops.
+ */
+static int amd_iommu_dma_supported(struct device *dev, u64 mask)
+{
+ u16 bdf;
+ struct pci_dev *pcidev;
+
+ /* No device or no PCI device */
+ if (!dev || dev->bus != &pci_bus_type)
+ return 0;
+
+ pcidev = to_pci_dev(dev);
+
+ bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
+
+ /* Out of our scope? */
+ if (bdf > amd_iommu_last_bdf)
+ return 0;
+
+ return 1;
+}
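
calc_devid() used above is the inverse of the PCI_BUS/PCI_SLOT/PCI_FUNC decoding: bus and devfn are packed into the 16-bit BDF that indexes the IOMMU tables. A small sketch, assuming the kernel's definition is the usual (bus << 8) | devfn:

	#include <stdio.h>

	/* assumed to mirror the kernel's calc_devid() */
	static unsigned short calc_devid(unsigned char bus, unsigned char devfn)
	{
		return (bus << 8) | devfn;
	}

	int main(void)
	{
		/* hypothetical device 0a:02.1: devfn = (slot << 3) | func */
		unsigned char bus = 0x0a, devfn = (0x02 << 3) | 0x1;

		printf("bdf=0x%04x\n", calc_devid(bus, devfn));	/* 0x0a11 */
		return 0;
	}
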
+
+/*
* The function for pre-allocating protection domains.
*
 * If the driver core informs the DMA layer that a driver grabs a device
@@ -1097,10 +1314,9 @@ void prealloc_protection_domains(void)
if (!dma_dom)
continue;
init_unity_mappings_for_device(dma_dom, devid);
- set_device_domain(iommu, &dma_dom->domain, devid);
- printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ",
- dma_dom->domain.id);
- print_devid(devid, 1);
+ dma_dom->target_dev = devid;
+
+ list_add_tail(&dma_dom->list, &iommu_pd_list);
}
}
@@ -1111,6 +1327,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
.unmap_single = unmap_single,
.map_sg = map_sg,
.unmap_sg = unmap_sg,
+ .dma_supported = amd_iommu_dma_supported,
};
/*
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index a69cc0f5204..0cdcda35a05 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,6 +22,8 @@
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/sysdev.h>
+#include <linux/interrupt.h>
+#include <linux/msi.h>
#include <asm/pci-direct.h>
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
@@ -30,7 +32,6 @@
/*
* definitions for the ACPI scanning code
*/
-#define PCI_BUS(x) (((x) >> 8) & 0xff)
#define IVRS_HEADER_LENGTH 48
#define ACPI_IVHD_TYPE 0x10
@@ -121,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
int amd_iommu_isolate; /* if 1, device isolation is enabled */
+bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
system */
@@ -210,7 +212,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
/* Programs the physical address of the device table into the IOMMU hardware */
static void __init iommu_set_device_table(struct amd_iommu *iommu)
{
- u32 entry;
+ u64 entry;
BUG_ON(iommu->mmio_base == NULL);
@@ -234,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
{
u32 ctrl;
- ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
ctrl &= ~(1 << bit);
writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
}
@@ -242,13 +244,23 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
/* Function to enable the hardware */
void __init iommu_enable(struct amd_iommu *iommu)
{
- printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
- print_devid(iommu->devid, 0);
- printk(" cap 0x%hx\n", iommu->cap_ptr);
+ printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
+ "at %02x:%02x.%x cap 0x%hx\n",
+ iommu->dev->bus->number,
+ PCI_SLOT(iommu->dev->devfn),
+ PCI_FUNC(iommu->dev->devfn),
+ iommu->cap_ptr);
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
}
+/* Function to enable IOMMU event logging and event interrupts */
+void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+{
+ iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
+ iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+}
+
/*
* mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
* the system has one.
@@ -286,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
****************************************************************************/
/*
+ * This function calculates the length of a given IVHD entry
+ */
+static inline int ivhd_entry_length(u8 *ivhd)
+{
+ return 0x04 << (*ivhd >> 6);
+}
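
ivhd_entry_length() encodes the IVHD convention that the top two bits of the entry's type byte select the entry size: 4, 8, 16 or 32 bytes. A quick standalone check of the same shift on the type byte (the example type values are made up):

	#include <stdio.h>

	static int ivhd_entry_length(unsigned char type)
	{
		return 0x04 << (type >> 6);
	}

	int main(void)
	{
		unsigned char types[] = { 0x02, 0x42, 0x82, 0xc2 };
		int i;

		for (i = 0; i < 4; i++)	/* prints lengths 4, 8, 16, 32 */
			printf("type=0x%02x len=%d\n", types[i],
			       ivhd_entry_length(types[i]));
		return 0;
	}
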
+
+/*
* This function reads the last device id the IOMMU has to handle from the PCI
* capability header for this IOMMU
*/
@@ -329,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
default:
break;
}
- p += 0x04 << (*p >> 6);
+ p += ivhd_entry_length(p);
}
WARN_ON(p != end);
@@ -414,7 +434,32 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
static void __init free_command_buffer(struct amd_iommu *iommu)
{
- free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
+ free_pages((unsigned long)iommu->cmd_buf,
+ get_order(iommu->cmd_buf_size));
+}
+
+/* allocates the memory where the IOMMU will log its events to */
+static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
+{
+ u64 entry;
+ iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(EVT_BUFFER_SIZE));
+
+ if (iommu->evt_buf == NULL)
+ return NULL;
+
+ entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+ memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
+ &entry, sizeof(entry));
+
+ iommu->evt_buf_size = EVT_BUFFER_SIZE;
+
+ return iommu->evt_buf;
+}
+
+static void __init free_event_buffer(struct amd_iommu *iommu)
+{
+ free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
}
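
alloc_event_buffer() programs the hardware by OR'ing a length code into the top bits of the buffer's physical address and writing the combined 64-bit value to the MMIO register. A model of that packing (the EVT_LEN_MASK value here is an assumption for illustration, not read from the header):

	#include <stdio.h>
	#include <stdint.h>

	#define EVT_LEN_MASK	(0x9ULL << 56)	/* assumed length code */

	int main(void)
	{
		uint64_t phys = 0x12345000ULL;	/* hypothetical buffer address */
		uint64_t entry = phys | EVT_LEN_MASK;

		printf("MMIO value: 0x%016llx\n", (unsigned long long)entry);
		return 0;
	}
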
/* sets a specific bit in the device table entry. */
@@ -487,19 +532,21 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
*/
static void __init init_iommu_from_pci(struct amd_iommu *iommu)
{
- int bus = PCI_BUS(iommu->devid);
- int dev = PCI_SLOT(iommu->devid);
- int fn = PCI_FUNC(iommu->devid);
int cap_ptr = iommu->cap_ptr;
- u32 range;
+ u32 range, misc;
- iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
+ pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
+ &iommu->cap);
+ pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
+ &range);
+ pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
+ &misc);
- range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
iommu->first_device = calc_devid(MMIO_GET_BUS(range),
MMIO_GET_FD(range));
iommu->last_device = calc_devid(MMIO_GET_BUS(range),
MMIO_GET_LD(range));
+ iommu->evt_msi_num = MMIO_MSI_NUM(misc);
}
/*
@@ -604,7 +651,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
break;
}
- p += 0x04 << (e->type >> 6);
+ p += ivhd_entry_length(p);
}
}
@@ -622,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
static void __init free_iommu_one(struct amd_iommu *iommu)
{
free_command_buffer(iommu);
+ free_event_buffer(iommu);
iommu_unmap_mmio_space(iommu);
}
@@ -649,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
/*
* Copy data from ACPI table entry to the iommu struct
*/
- iommu->devid = h->devid;
+ iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
+ if (!iommu->dev)
+ return 1;
+
iommu->cap_ptr = h->cap_ptr;
+ iommu->pci_seg = h->pci_seg;
iommu->mmio_phys = h->mmio_phys;
iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
if (!iommu->mmio_base)
@@ -661,11 +713,17 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
if (!iommu->cmd_buf)
return -ENOMEM;
+ iommu->evt_buf = alloc_event_buffer(iommu);
+ if (!iommu->evt_buf)
+ return -ENOMEM;
+
+ iommu->int_enabled = false;
+
init_iommu_from_pci(iommu);
init_iommu_from_acpi(iommu, h);
init_iommu_devices(iommu);
- return 0;
+ return pci_enable_device(iommu->dev);
}
/*
@@ -706,6 +764,95 @@ static int __init init_iommu_all(struct acpi_table_header *table)
/****************************************************************************
*
+ * The following functions initialize the MSI interrupts for all IOMMUs
+ * in the system. It's a bit challenging because there could be multiple
+ * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
+ * pci_dev.
+ *
+ ****************************************************************************/
+
+static int __init iommu_setup_msix(struct amd_iommu *iommu)
+{
+ struct amd_iommu *curr;
+ struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
+ int nvec = 0, i;
+
+ list_for_each_entry(curr, &amd_iommu_list, list) {
+ if (curr->dev == iommu->dev) {
+ entries[nvec].entry = curr->evt_msi_num;
+ entries[nvec].vector = 0;
+ curr->int_enabled = true;
+ nvec++;
+ }
+ }
+
+ if (pci_enable_msix(iommu->dev, entries, nvec)) {
+ pci_disable_msix(iommu->dev);
+ return 1;
+ }
+
+ for (i = 0; i < nvec; ++i) {
+		int r = request_irq(entries[i].vector, amd_iommu_int_handler,
+ IRQF_SAMPLE_RANDOM,
+ "AMD IOMMU",
+ NULL);
+ if (r)
+ goto out_free;
+ }
+
+ return 0;
+
+out_free:
+ for (i -= 1; i >= 0; --i)
+		free_irq(entries[i].vector, NULL);
+
+ pci_disable_msix(iommu->dev);
+
+ return 1;
+}
+
+static int __init iommu_setup_msi(struct amd_iommu *iommu)
+{
+ int r;
+ struct amd_iommu *curr;
+
+ list_for_each_entry(curr, &amd_iommu_list, list) {
+ if (curr->dev == iommu->dev)
+ curr->int_enabled = true;
+ }
+
+
+ if (pci_enable_msi(iommu->dev))
+ return 1;
+
+ r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
+ IRQF_SAMPLE_RANDOM,
+ "AMD IOMMU",
+ NULL);
+
+ if (r) {
+ pci_disable_msi(iommu->dev);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int __init iommu_init_msi(struct amd_iommu *iommu)
+{
+ if (iommu->int_enabled)
+ return 0;
+
+ if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
+ return iommu_setup_msix(iommu);
+ else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+ return iommu_setup_msi(iommu);
+
+ return 1;
+}
+
+/****************************************************************************
+ *
* The next functions belong to the third pass of parsing the ACPI
* table. In this last pass the memory mapping requirements are
 * gathered (like exclusion and unity mapping ranges).
@@ -811,7 +958,6 @@ static void init_device_table(void)
for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
set_dev_entry_bit(devid, DEV_ENTRY_VALID);
set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
- set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT);
}
}
@@ -825,6 +971,8 @@ static void __init enable_iommus(void)
list_for_each_entry(iommu, &amd_iommu_list, list) {
iommu_set_exclusion_range(iommu);
+ iommu_init_msi(iommu);
+ iommu_enable_event_logging(iommu);
iommu_enable(iommu);
}
}
@@ -995,11 +1143,17 @@ int __init amd_iommu_init(void)
else
printk("disabled\n");
+ if (amd_iommu_unmap_flush)
+ printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
+ else
+ printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
+
out:
return ret;
free:
- free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+ free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
+ get_order(MAX_DOMAIN_ID/8));
free_pages((unsigned long)amd_iommu_pd_table,
get_order(rlookup_table_size));
@@ -1057,8 +1211,10 @@ void __init amd_iommu_detect(void)
static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
- if (strcmp(str, "isolate") == 0)
+ if (strncmp(str, "isolate", 7) == 0)
amd_iommu_isolate = 1;
+ if (strncmp(str, "fullflush", 11) == 0)
+ amd_iommu_unmap_flush = true;
}
return 1;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 44e21826db1..9a32b37ee2e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ out:
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_ERR
+ printk(KERN_INFO
"Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_ERR
+ printk(KERN_INFO
"Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_ERR
+ printk(KERN_INFO
"This costs you %d MB of RAM\n",
32 << fallback_aper_order);
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c
index f88bd0d982b..04a7f960bbc 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic.c
@@ -23,11 +23,13 @@
#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
+#include <linux/ioport.h>
#include <linux/cpu.h>
#include <linux/clockchips.h>
#include <linux/acpi_pmtmr.h>
#include <linux/module.h>
#include <linux/dmi.h>
+#include <linux/dmar.h>
#include <asm/atomic.h>
#include <asm/smp.h>
@@ -36,8 +38,14 @@
#include <asm/desc.h>
#include <asm/arch_hooks.h>
#include <asm/hpet.h>
+#include <asm/pgalloc.h>
#include <asm/i8253.h>
#include <asm/nmi.h>
+#include <asm/idle.h>
+#include <asm/proto.h>
+#include <asm/timex.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
#include <mach_apic.h>
#include <mach_apicdef.h>
@@ -50,20 +58,60 @@
# error SPURIOUS_APIC_VECTOR definition error
#endif
-unsigned long mp_lapic_addr;
-
+#ifdef CONFIG_X86_32
/*
* Knob to control our willingness to enable the local APIC.
*
* +1=force-enable
*/
static int force_enable_local_apic;
-int disable_apic;
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+ force_enable_local_apic = 1;
+ return 0;
+}
+early_param("lapic", parse_lapic);
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+
+#endif
+
+#ifdef CONFIG_X86_64
+static int apic_calibrate_pmtmr __initdata;
+static __init int setup_apicpmtimer(char *s)
+{
+ apic_calibrate_pmtmr = 1;
+ notsc_setup(NULL);
+ return 0;
+}
+__setup("apicpmtimer", setup_apicpmtimer);
+#endif
+
+#ifdef CONFIG_X86_64
+#define HAVE_X2APIC
+#endif
+
+#ifdef HAVE_X2APIC
+int x2apic;
+/* x2apic enabled before OS handover */
+int x2apic_preenabled;
+int disable_x2apic;
+static __init int setup_nox2apic(char *str)
+{
+ disable_x2apic = 1;
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ return 0;
+}
+early_param("nox2apic", setup_nox2apic);
+#endif
-/* Local APIC timer verification ok */
-static int local_apic_timer_verify_ok;
+unsigned long mp_lapic_addr;
+int disable_apic;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int local_apic_timer_disabled;
+static int disable_apic_timer __cpuinitdata;
/* Local APIC timer works in C2 */
int local_apic_timer_c2_ok;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -112,9 +160,6 @@ static struct clock_event_device lapic_clockevent = {
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-/* Local APIC was disabled by the BIOS and enabled by the kernel */
-static int enabled_via_apicbase;
-
static unsigned long apic_phys;
/*
@@ -130,7 +175,11 @@ static inline int lapic_get_version(void)
*/
static inline int lapic_is_integrated(void)
{
+#ifdef CONFIG_X86_64
+ return 1;
+#else
return APIC_INTEGRATED(lapic_get_version());
+#endif
}
/*
@@ -145,13 +194,18 @@ static int modern_apic(void)
return lapic_get_version() >= 0x14;
}
-void apic_wait_icr_idle(void)
+/*
+ * Paravirt kernels might also be using the ops below. So we still
+ * use the generic apic_read()/apic_write(), which may point to
+ * different ops in the PARAVIRT case.
+ */
+void xapic_wait_icr_idle(void)
{
while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
cpu_relax();
}
-u32 safe_apic_wait_icr_idle(void)
+u32 safe_xapic_wait_icr_idle(void)
{
u32 send_status;
int timeout;
@@ -167,19 +221,88 @@ u32 safe_apic_wait_icr_idle(void)
return send_status;
}
+void xapic_icr_write(u32 low, u32 id)
+{
+ apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
+ apic_write(APIC_ICR, low);
+}
+
+u64 xapic_icr_read(void)
+{
+ u32 icr1, icr2;
+
+ icr2 = apic_read(APIC_ICR2);
+ icr1 = apic_read(APIC_ICR);
+
+ return icr1 | ((u64)icr2 << 32);
+}
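
xapic_icr_write()/xapic_icr_read() show why the ICR needs two accesses in xAPIC mode: the 64-bit command is split across two 32-bit MMIO registers, with the destination ID in the high half. A sketch of the recombination (the bit values are hypothetical):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t icr_lo = 0x00004500;	/* hypothetical IPI command bits */
		uint32_t icr_hi = 0x01000000;	/* hypothetical destination field */
		uint64_t icr = ((uint64_t)icr_hi << 32) | icr_lo;

		printf("icr=0x%016llx\n", (unsigned long long)icr);
		return 0;
	}
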
+
+static struct apic_ops xapic_ops = {
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = xapic_icr_read,
+ .icr_write = xapic_icr_write,
+ .wait_icr_idle = xapic_wait_icr_idle,
+ .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
+};
+
+struct apic_ops __read_mostly *apic_ops = &xapic_ops;
+EXPORT_SYMBOL_GPL(apic_ops);
+
+#ifdef HAVE_X2APIC
+static void x2apic_wait_icr_idle(void)
+{
+ /* no need to wait for icr idle in x2apic */
+ return;
+}
+
+static u32 safe_x2apic_wait_icr_idle(void)
+{
+ /* no need to wait for icr idle in x2apic */
+ return 0;
+}
+
+void x2apic_icr_write(u32 low, u32 id)
+{
+ wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
+}
+
+u64 x2apic_icr_read(void)
+{
+ unsigned long val;
+
+ rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
+ return val;
+}
+
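The MSR index used above follows the x2APIC convention: each 16-byte-aligned xAPIC MMIO offset maps to one MSR at APIC_BASE_MSR plus the offset divided by 16, which is why a single rdmsrl/wrmsrl replaces the split ICR access. A quick check of the arithmetic:

	#include <stdio.h>

	#define APIC_BASE_MSR	0x800
	#define APIC_ICR	0x300

	int main(void)
	{
		unsigned int msr = APIC_BASE_MSR + (APIC_ICR >> 4);

		printf("ICR MSR index: 0x%x\n", msr);	/* 0x830 */
		return 0;
	}
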
+static struct apic_ops x2apic_ops = {
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .icr_read = x2apic_icr_read,
+ .icr_write = x2apic_icr_write,
+ .wait_icr_idle = x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
+};
+#endif
+
/**
* enable_NMI_through_LVT0 - enable NMI through local vector table 0
*/
void __cpuinit enable_NMI_through_LVT0(void)
{
- unsigned int v = APIC_DM_NMI;
+ unsigned int v;
+
+ /* unmask and set to NMI */
+ v = APIC_DM_NMI;
- /* Level triggered for 82489DX */
+ /* Level triggered for 82489DX (32bit mode) */
if (!lapic_is_integrated())
v |= APIC_LVT_LEVEL_TRIGGER;
+
apic_write(APIC_LVT0, v);
}
+#ifdef CONFIG_X86_32
/**
* get_physical_broadcast - Get number of physical broadcast IDs
*/
@@ -187,15 +310,20 @@ int get_physical_broadcast(void)
{
return modern_apic() ? 0xff : 0xf;
}
+#endif
/**
* lapic_get_maxlvt - get the maximum number of local vector table entries
*/
int lapic_get_maxlvt(void)
{
- unsigned int v = apic_read(APIC_LVR);
+ unsigned int v;
- /* 82489DXs do not report # of LVT entries. */
+ v = apic_read(APIC_LVR);
+ /*
+ * - we always have APIC integrated on 64bit mode
+ * - 82489DXs do not report # of LVT entries
+ */
return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
}
@@ -203,7 +331,7 @@ int lapic_get_maxlvt(void)
* Local APIC timer
*/
-/* Clock divisor is set to 16 */
+/* Clock divisor */
#define APIC_DIVISOR 16
/*
@@ -212,6 +340,9 @@ int lapic_get_maxlvt(void)
* this function twice on the boot CPU, once with a bogus timeout
* value, second time for real. The other (noncalibrating) CPUs
* call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
*/
static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
{
@@ -233,14 +364,48 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
*/
tmp_value = apic_read(APIC_TDCR);
apic_write(APIC_TDCR,
- (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
- APIC_TDR_DIV_16);
+ (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+ APIC_TDR_DIV_16);
if (!oneshot)
apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
}
/*
+ * Setup extended LVT, AMD specific (K8, family 10h)
+ *
+ * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
+ * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ *
+ * If mask=1, the LVT entry does not generate interrupts while mask=0
+ * enables the vector. See also the BKDGs.
+ */
+
+#define APIC_EILVT_LVTOFF_MCE 0
+#define APIC_EILVT_LVTOFF_IBS 1
+
+static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+{
+ unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+ unsigned int v = (mask << 16) | (msg_type << 8) | vector;
+
+ apic_write(reg, v);
+}
+
+u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
+{
+ setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
+ return APIC_EILVT_LVTOFF_MCE;
+}
+
+u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+{
+ setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
+ return APIC_EILVT_LVTOFF_IBS;
+}
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
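
The extended-LVT helpers above compute both the register address and its contents by shifting fields into place: the register lives at APIC_EILVT0 + 16 * offset, and the value packs mask, message type and vector. A standalone sketch (APIC_EILVT0 = 0x500 matches the "APIC500" note above; the field values are hypothetical):

	#include <stdio.h>

	#define APIC_EILVT0	0x500

	int main(void)
	{
		unsigned int lvt_off = 1, vector = 0xf8;
		unsigned int msg_type = 0, mask = 0;
		unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
		unsigned int v = (mask << 16) | (msg_type << 8) | vector;

		printf("reg=0x%lx value=0x%08x\n", reg, v);	/* 0x510, 0xf8 */
		return 0;
	}
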
+
+/*
* Program the next event, relative to now
*/
static int lapic_next_event(unsigned long delta,
@@ -259,8 +424,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
unsigned long flags;
unsigned int v;
- /* Lapic used for broadcast ? */
- if (!local_apic_timer_verify_ok)
+ /* Lapic used as dummy for broadcast ? */
+ if (evt->features & CLOCK_EVT_FEAT_DUMMY)
return;
local_irq_save(flags);
@@ -299,7 +464,7 @@ static void lapic_timer_broadcast(cpumask_t mask)
 * Setup the local APIC timer for this CPU. Copy the initialized values
* of the boot CPU and register the clock event in the framework.
*/
-static void __devinit setup_APIC_timer(void)
+static void __cpuinit setup_APIC_timer(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
@@ -369,14 +534,51 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
}
+static int __init calibrate_by_pmtimer(long deltapm, long *delta)
+{
+ const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
+ const long pm_thresh = pm_100ms / 100;
+ unsigned long mult;
+ u64 res;
+
+#ifndef CONFIG_X86_PM_TIMER
+ return -1;
+#endif
+
+ apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+
+ /* Check, if the PM timer is available */
+ if (!deltapm)
+ return -1;
+
+ mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
+ if (deltapm > (pm_100ms - pm_thresh) &&
+ deltapm < (pm_100ms + pm_thresh)) {
+ apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
+ } else {
+ res = (((u64)deltapm) * mult) >> 22;
+ do_div(res, 1000000);
+ printk(KERN_WARNING "APIC calibration not consistent "
+ "with PM Timer: %ldms instead of 100ms\n",
+ (long)res);
+ /* Correct the lapic counter value */
+ res = (((u64)(*delta)) * pm_100ms);
+ do_div(res, deltapm);
+ printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
+ "%lu (%ld)\n", (unsigned long)res, *delta);
+ *delta = (long)res;
+ }
+
+ return 0;
+}
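
The correction in calibrate_by_pmtimer() is fixed-point arithmetic: the PM timer tick delta is converted to wall time with a 22-bit-shifted multiplier (the same idea as clocksource_hz2mult()), and the lapic delta is then rescaled to a true 100ms interval. A self-contained model with made-up measurements:

	#include <stdio.h>
	#include <stdint.h>

	#define PMTMR_TICKS_PER_SEC	3579545UL	/* ACPI PM timer rate */

	int main(void)
	{
		long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
		long deltapm = 340000;		/* hypothetical PM ticks seen */
		long delta = 2500000;		/* hypothetical lapic delta */

		/* ticks -> nanoseconds via a 22-bit fixed-point multiplier */
		uint64_t mult = (1000000000ULL << 22) / PMTMR_TICKS_PER_SEC;
		uint64_t ms = (((uint64_t)deltapm * mult) >> 22) / 1000000;

		/* rescale the lapic delta to exactly 100ms */
		long corrected = (long)(((uint64_t)delta * pm_100ms) / deltapm);

		printf("%llums measured, delta %ld -> %ld\n",
		       (unsigned long long)ms, delta, corrected);
		return 0;
	}
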
+
static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
- const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
- const long pm_thresh = pm_100ms/100;
void (*real_handler)(struct clock_event_device *dev);
unsigned long deltaj;
- long delta, deltapm;
+ long delta;
int pm_referenced = 0;
local_irq_disable();
@@ -386,10 +588,10 @@ static int __init calibrate_APIC_clock(void)
global_clock_event->event_handler = lapic_cal_handler;
/*
- * Setup the APIC counter to 1e9. There is no way the lapic
+ * Setup the APIC counter to maximum. There is no way the lapic
* can underflow in the 100ms detection time frame
*/
- __setup_APIC_LVTT(1000000000, 0, 0);
+ __setup_APIC_LVTT(0xffffffff, 0, 0);
/* Let the interrupts run */
local_irq_enable();
@@ -406,34 +608,9 @@ static int __init calibrate_APIC_clock(void)
delta = lapic_cal_t1 - lapic_cal_t2;
apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
- /* Check, if the PM timer is available */
- deltapm = lapic_cal_pm2 - lapic_cal_pm1;
- apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
-
- if (deltapm) {
- unsigned long mult;
- u64 res;
-
- mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
-
- if (deltapm > (pm_100ms - pm_thresh) &&
- deltapm < (pm_100ms + pm_thresh)) {
- apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
- } else {
- res = (((u64) deltapm) * mult) >> 22;
- do_div(res, 1000000);
- printk(KERN_WARNING "APIC calibration not consistent "
- "with PM Timer: %ldms instead of 100ms\n",
- (long)res);
- /* Correct the lapic counter value */
- res = (((u64) delta) * pm_100ms);
- do_div(res, deltapm);
- printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
- "%lu (%ld)\n", (unsigned long) res, delta);
- delta = (long) res;
- }
- pm_referenced = 1;
- }
+ /* we trust the PM based calibration if possible */
+ pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
+ &delta);
/* Calculate the scaled math multiplication factor */
lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -473,9 +650,12 @@ static int __init calibrate_APIC_clock(void)
return -1;
}
- local_apic_timer_verify_ok = 1;
+ levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
- /* We trust the pm timer based calibration */
+ /*
+ * PM timer calibration failed or not turned on
+ * so lets try APIC timer based calibration
+ */
if (!pm_referenced) {
apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -507,11 +687,11 @@ static int __init calibrate_APIC_clock(void)
if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
else
- local_apic_timer_verify_ok = 0;
+ levt->features |= CLOCK_EVT_FEAT_DUMMY;
} else
local_irq_enable();
- if (!local_apic_timer_verify_ok) {
+ if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
printk(KERN_WARNING
"APIC timer disabled due to verification failure.\n");
return -1;
@@ -533,7 +713,8 @@ void __init setup_boot_APIC_clock(void)
* timer as a dummy clock event source on SMP systems, so the
* broadcast mechanism is used. On UP systems simply ignore it.
*/
- if (local_apic_timer_disabled) {
+ if (disable_apic_timer) {
+ printk(KERN_INFO "Disabling APIC timer\n");
/* No broadcast on UP ! */
if (num_possible_cpus() > 1) {
lapic_clockevent.mult = 1;
@@ -567,7 +748,7 @@ void __init setup_boot_APIC_clock(void)
setup_APIC_timer();
}
-void __devinit setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
{
setup_APIC_timer();
}
@@ -602,7 +783,11 @@ static void local_apic_timer_interrupt(void)
/*
* the NMI deadlock-detector uses this.
*/
+#ifdef CONFIG_X86_64
+ add_pda(apic_timer_irqs, 1);
+#else
per_cpu(irq_stat, cpu).apic_timer_irqs++;
+#endif
evt->event_handler(evt);
}
@@ -629,6 +814,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
	 * Besides, if we don't, timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
+#ifdef CONFIG_X86_64
+ exit_idle();
+#endif
irq_enter();
local_apic_timer_interrupt();
irq_exit();
@@ -642,35 +830,6 @@ int setup_profiling_timer(unsigned int multiplier)
}
/*
- * Setup extended LVT, AMD specific (K8, family 10h)
- *
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
- */
-
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
-
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
-{
- unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
- unsigned int v = (mask << 16) | (msg_type << 8) | vector;
- apic_write(reg, v);
-}
-
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_MCE;
-}
-
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_IBS;
-}
-
-/*
* Local APIC start and shutdown
*/
@@ -715,7 +874,7 @@ void clear_local_APIC(void)
}
	/* let's not touch this if we didn't frob it */
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
if (maxlvt >= 5) {
v = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -732,10 +891,6 @@ void clear_local_APIC(void)
if (maxlvt >= 4)
apic_write(APIC_LVTPC, APIC_LVT_MASKED);
-#ifdef CONFIG_X86_MCE_P4THERMAL
- if (maxlvt >= 5)
- apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
-#endif
/* Integrated APIC (!82489DX) ? */
if (lapic_is_integrated()) {
if (maxlvt > 3)
@@ -750,7 +905,7 @@ void clear_local_APIC(void)
*/
void disable_local_APIC(void)
{
- unsigned long value;
+ unsigned int value;
clear_local_APIC();
@@ -762,6 +917,7 @@ void disable_local_APIC(void)
value &= ~APIC_SPIV_APIC_ENABLED;
apic_write(APIC_SPIV, value);
+#ifdef CONFIG_X86_32
/*
* When LAPIC was disabled by the BIOS and enabled by the kernel,
* restore the disabled state.
@@ -773,6 +929,7 @@ void disable_local_APIC(void)
l &= ~MSR_IA32_APICBASE_ENABLE;
wrmsr(MSR_IA32_APICBASE, l, h);
}
+#endif
}
/*
@@ -789,11 +946,15 @@ void lapic_shutdown(void)
return;
local_irq_save(flags);
- clear_local_APIC();
- if (enabled_via_apicbase)
+#ifdef CONFIG_X86_32
+ if (!enabled_via_apicbase)
+ clear_local_APIC();
+ else
+#endif
disable_local_APIC();
+
local_irq_restore(flags);
}
@@ -838,6 +999,12 @@ int __init verify_local_APIC(void)
*/
reg0 = apic_read(APIC_ID);
apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
+ apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
+ reg1 = apic_read(APIC_ID);
+ apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
+ apic_write(APIC_ID, reg0);
+ if (reg1 != (reg0 ^ APIC_ID_MASK))
+ return 0;
/*
* The next two are just to see if we have sane values.
@@ -863,14 +1030,15 @@ void __init sync_Arb_IDs(void)
*/
if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return;
+
/*
* Wait for idle.
*/
apic_wait_icr_idle();
apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR,
- APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
+ apic_write(APIC_ICR, APIC_DEST_ALLINC |
+ APIC_INT_LEVELTRIG | APIC_DM_INIT);
}
/*
@@ -878,7 +1046,7 @@ void __init sync_Arb_IDs(void)
*/
void __init init_bsp_APIC(void)
{
- unsigned long value;
+ unsigned int value;
/*
* Don't do the setup now if we have a SMP BIOS as the
@@ -899,11 +1067,13 @@ void __init init_bsp_APIC(void)
value &= ~APIC_VECTOR_MASK;
value |= APIC_SPIV_APIC_ENABLED;
+#ifdef CONFIG_X86_32
/* This bit is reserved on P4/Xeon and should be cleared */
if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
(boot_cpu_data.x86 == 15))
value &= ~APIC_SPIV_FOCUS_DISABLED;
else
+#endif
value |= APIC_SPIV_FOCUS_DISABLED;
value |= SPURIOUS_APIC_VECTOR;
apic_write(APIC_SPIV, value);
@@ -920,39 +1090,43 @@ void __init init_bsp_APIC(void)
static void __cpuinit lapic_setup_esr(void)
{
- unsigned long oldvalue, value, maxlvt;
- if (lapic_is_integrated() && !esr_disable) {
- /* !82489DX */
- maxlvt = lapic_get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- oldvalue = apic_read(APIC_ESR);
+ unsigned int oldvalue, value, maxlvt;
+
+ if (!lapic_is_integrated()) {
+ printk(KERN_INFO "No ESR for 82489DX.\n");
+ return;
+ }
- /* enables sending errors */
- value = ERROR_APIC_VECTOR;
- apic_write(APIC_LVTERR, value);
+ if (esr_disable) {
/*
- * spec says clear errors after enabling vector.
+ * Something untraceable is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
*/
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
- value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08lx after: 0x%08lx\n",
- oldvalue, value);
- } else {
- if (esr_disable)
- /*
- * Something untraceable is creating bad interrupts on
- * secondary quads ... for the moment, just leave the
- * ESR disabled - we can't do anything useful with the
- * errors anyway - mbligh
- */
- printk(KERN_INFO "Leaving ESR disabled.\n");
- else
- printk(KERN_INFO "No ESR for 82489DX.\n");
+ printk(KERN_INFO "Leaving ESR disabled.\n");
+ return;
}
+
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ oldvalue = apic_read(APIC_ESR);
+
+ /* enables sending errors */
+ value = ERROR_APIC_VECTOR;
+ apic_write(APIC_LVTERR, value);
+
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+ value = apic_read(APIC_ESR);
+ if (value != oldvalue)
+ apic_printk(APIC_VERBOSE, "ESR value before enabling "
+ "vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
}
@@ -961,24 +1135,27 @@ static void __cpuinit lapic_setup_esr(void)
*/
void __cpuinit setup_local_APIC(void)
{
- unsigned long value, integrated;
+ unsigned int value;
int i, j;
+#ifdef CONFIG_X86_32
/* Pound the ESR really hard over the head with a big hammer - mbligh */
- if (esr_disable) {
+ if (lapic_is_integrated() && esr_disable) {
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
}
+#endif
- integrated = lapic_is_integrated();
+ preempt_disable();
/*
* Double-check whether this APIC is really registered.
+ * This is meaningless in clustered apic mode, so we skip it.
*/
if (!apic_id_registered())
- WARN_ON_ONCE(1);
+ BUG();
/*
* Intel recommends to set DFR, LDR and TPR before enabling
@@ -1024,6 +1201,7 @@ void __cpuinit setup_local_APIC(void)
*/
value |= APIC_SPIV_APIC_ENABLED;
+#ifdef CONFIG_X86_32
/*
* Some unknown Intel IO/APIC (or APIC) errata is biting us with
* certain networking cards. If high frequency interrupts are
@@ -1044,8 +1222,13 @@ void __cpuinit setup_local_APIC(void)
* See also the comment in end_level_ioapic_irq(). --macro
*/
- /* Enable focus processor (bit==0) */
+ /*
+ * - enable focus processor (bit==0)
+ * - 64bit mode always use processor focus
+ * so no need to set it
+ */
value &= ~APIC_SPIV_FOCUS_DISABLED;
+#endif
/*
* Set spurious IRQ vector
@@ -1082,25 +1265,178 @@ void __cpuinit setup_local_APIC(void)
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
- if (!integrated) /* 82489DX */
+ if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write(APIC_LVT1, value);
+
+ preempt_enable();
}
void __cpuinit end_local_APIC_setup(void)
{
- unsigned long value;
-
lapic_setup_esr();
- /* Disable the local apic timer */
- value = apic_read(APIC_LVTT);
- value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, value);
+
+#ifdef CONFIG_X86_32
+ {
+ unsigned int value;
+ /* Disable the local apic timer */
+ value = apic_read(APIC_LVTT);
+ value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, value);
+ }
+#endif
setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
+#ifdef HAVE_X2APIC
+void check_x2apic(void)
+{
+ int msr, msr2;
+
+ rdmsr(MSR_IA32_APICBASE, msr, msr2);
+
+ if (msr & X2APIC_ENABLE) {
+ printk("x2apic enabled by BIOS, switching to x2apic ops\n");
+ x2apic_preenabled = x2apic = 1;
+ apic_ops = &x2apic_ops;
+ }
+}
+
+void enable_x2apic(void)
+{
+ int msr, msr2;
+
+ rdmsr(MSR_IA32_APICBASE, msr, msr2);
+ if (!(msr & X2APIC_ENABLE)) {
+ printk("Enabling x2apic\n");
+ wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
+ }
+}
+
+void enable_IR_x2apic(void)
+{
+#ifdef CONFIG_INTR_REMAP
+ int ret;
+ unsigned long flags;
+
+ if (!cpu_has_x2apic)
+ return;
+
+ if (!x2apic_preenabled && disable_x2apic) {
+ printk(KERN_INFO
+ "Skipped enabling x2apic and Interrupt-remapping "
+ "because of nox2apic\n");
+ return;
+ }
+
+ if (x2apic_preenabled && disable_x2apic)
+		panic("BIOS already enabled x2apic, can't enforce nox2apic");
+
+ if (!x2apic_preenabled && skip_ioapic_setup) {
+ printk(KERN_INFO
+ "Skipped enabling x2apic and Interrupt-remapping "
+ "because of skipping io-apic setup\n");
+ return;
+ }
+
+ ret = dmar_table_init();
+ if (ret) {
+ printk(KERN_INFO
+ "dmar_table_init() failed with %d:\n", ret);
+
+ if (x2apic_preenabled)
+			panic("x2apic enabled by BIOS, but IR enabling failed");
+ else
+ printk(KERN_INFO
+				"Not enabling x2apic, Intr-remapping\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ mask_8259A();
+
+ ret = save_mask_IO_APIC_setup();
+ if (ret) {
+ printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret);
+ goto end;
+ }
+
+ ret = enable_intr_remapping(1);
+
+ if (ret && x2apic_preenabled) {
+ local_irq_restore(flags);
+		panic("x2apic enabled by BIOS, but IR enabling failed");
+ }
+
+ if (ret)
+ goto end_restore;
+
+ if (!x2apic) {
+ x2apic = 1;
+ apic_ops = &x2apic_ops;
+ enable_x2apic();
+ }
+
+end_restore:
+ if (ret)
+ /*
+ * IR enabling failed
+ */
+ restore_IO_APIC_setup();
+ else
+ reinit_intr_remapped_IO_APIC(x2apic_preenabled);
+
+end:
+ unmask_8259A();
+ local_irq_restore(flags);
+
+ if (!ret) {
+ if (!x2apic_preenabled)
+ printk(KERN_INFO
+ "Enabled x2apic and interrupt-remapping\n");
+ else
+ printk(KERN_INFO
+ "Enabled Interrupt-remapping\n");
+ } else
+ printk(KERN_ERR
+ "Failed to enable Interrupt-remapping and x2apic\n");
+#else
+ if (!cpu_has_x2apic)
+ return;
+
+ if (x2apic_preenabled)
+		panic("x2apic enabled prior to OS handover,"
+ " enable CONFIG_INTR_REMAP");
+
+ printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
+	       "and x2apic\n");
+#endif
+
+ return;
+}
+#endif /* HAVE_X2APIC */
+
+#ifdef CONFIG_X86_64
+/*
+ * Detect and enable local APICs on non-SMP boards.
+ * Original code written by Keir Fraser.
+ * On AMD64 we trust the BIOS - if it says no APIC it is likely
+ * not correctly set up (usually the APIC timer won't work etc.)
+ */
+static int __init detect_init_APIC(void)
+{
+ if (!cpu_has_apic) {
+ printk(KERN_INFO "No local APIC present\n");
+ return -1;
+ }
+
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ boot_cpu_physical_apicid = 0;
+ return 0;
+}
+#else
/*
* Detect and initialize APIC
*/
@@ -1179,12 +1515,46 @@ no_apic:
printk(KERN_INFO "No local APIC present or hardware disabled\n");
return -1;
}
+#endif
+
+#ifdef CONFIG_X86_64
+void __init early_init_lapic_mapping(void)
+{
+ unsigned long phys_addr;
+
+ /*
+	 * If no local APIC can be found then bail out:
+	 * it means there is no MP table and no MADT.
+ */
+ if (!smp_found_config)
+ return;
+
+ phys_addr = mp_lapic_addr;
+
+ set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
+ APIC_BASE, phys_addr);
+
+ /*
+ * Fetch the APIC ID of the BSP in case we have a
+ * default configuration (or the MP table is broken).
+ */
+ boot_cpu_physical_apicid = read_apic_id();
+}
+#endif
/**
* init_apic_mappings - initialize APIC mappings
*/
void __init init_apic_mappings(void)
{
+#ifdef HAVE_X2APIC
+ if (x2apic) {
+ boot_cpu_physical_apicid = read_apic_id();
+ return;
+ }
+#endif
+
/*
* If no local APIC can be found then set up a fake all
* zeroes page to simulate the local APIC and another
@@ -1197,27 +1567,36 @@ void __init init_apic_mappings(void)
apic_phys = mp_lapic_addr;
set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
- apic_phys);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+ APIC_BASE, apic_phys);
/*
* Fetch the APIC ID of the BSP in case we have a
* default configuration (or the MP table is broken).
*/
if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
-
+ boot_cpu_physical_apicid = read_apic_id();
}
/*
* This initializes the IO-APIC and APIC hardware if this is
* a UP kernel.
*/
-
int apic_version[MAX_APICS];
int __init APIC_init_uniprocessor(void)
{
+#ifdef CONFIG_X86_64
+ if (disable_apic) {
+ printk(KERN_INFO "Apic disabled\n");
+ return -1;
+ }
+ if (!cpu_has_apic) {
+ disable_apic = 1;
+ printk(KERN_INFO "Apic disabled by BIOS\n");
+ return -1;
+ }
+#else
if (!smp_found_config && !cpu_has_apic)
return -1;
@@ -1226,39 +1605,68 @@ int __init APIC_init_uniprocessor(void)
*/
if (!cpu_has_apic &&
APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+ printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n",
boot_cpu_physical_apicid);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
return -1;
}
+#endif
- verify_local_APIC();
+#ifdef HAVE_X2APIC
+ enable_IR_x2apic();
+#endif
+#ifdef CONFIG_X86_64
+ setup_apic_routing();
+#endif
+ verify_local_APIC();
connect_bsp_APIC();
+#ifdef CONFIG_X86_64
+ apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
+#else
/*
* Hack: In case of kdump, after a crash, kernel might be booting
* on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
* might be zero if read from MP tables. Get it from LAPIC.
*/
-#ifdef CONFIG_CRASH_DUMP
- boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+# ifdef CONFIG_CRASH_DUMP
+ boot_cpu_physical_apicid = read_apic_id();
+# endif
#endif
physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
-
setup_local_APIC();
+#ifdef CONFIG_X86_64
+ /*
+ * Now enable IO-APICs, actually call clear_IO_APIC
+ * We need clear_IO_APIC before enabling vector on BP
+ */
+ if (!skip_ioapic_setup && nr_ioapics)
+ enable_IO_APIC();
+#endif
+
#ifdef CONFIG_X86_IO_APIC
if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
#endif
localise_nmi_watchdog();
end_local_APIC_setup();
+
#ifdef CONFIG_X86_IO_APIC
- if (smp_found_config)
- if (!skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
+ if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+# ifdef CONFIG_X86_64
+ else
+ nr_ioapics = 0;
+# endif
#endif
+
+#ifdef CONFIG_X86_64
+ setup_boot_APIC_clock();
+ check_nmi_watchdog();
+#else
setup_boot_clock();
+#endif
return 0;
}
@@ -1272,8 +1680,11 @@ int __init APIC_init_uniprocessor(void)
*/
void smp_spurious_interrupt(struct pt_regs *regs)
{
- unsigned long v;
+ u32 v;
+#ifdef CONFIG_X86_64
+ exit_idle();
+#endif
irq_enter();
/*
* Check if this really is a spurious interrupt and ACK it
@@ -1284,10 +1695,14 @@ void smp_spurious_interrupt(struct pt_regs *regs)
if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
ack_APIC_irq();
+#ifdef CONFIG_X86_64
+ add_pda(irq_spurious_count, 1);
+#else
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
"should never happen.\n", smp_processor_id());
__get_cpu_var(irq_stat).irq_spurious_count++;
+#endif
irq_exit();
}
@@ -1296,8 +1711,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
*/
void smp_error_interrupt(struct pt_regs *regs)
{
- unsigned long v, v1;
+ u32 v, v1;
+#ifdef CONFIG_X86_64
+ exit_idle();
+#endif
irq_enter();
/* First tickle the hardware, only then report what went on. -- REW */
v = apic_read(APIC_ESR);
@@ -1316,64 +1734,17 @@ void smp_error_interrupt(struct pt_regs *regs)
6: Received illegal vector
7: Illegal register address
*/
- printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
+ printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
smp_processor_id(), v , v1);
irq_exit();
}
-#ifdef CONFIG_SMP
-void __init smp_intr_init(void)
-{
- /*
- * IRQ0 must be given a fixed assignment and initialized,
- * because it's used before the IO-APIC is set up.
- */
- set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
-
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPI for invalidation */
- alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
-
- /* IPI for generic function call */
- alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
- /* IPI for single call function */
- set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
-}
-#endif
-
-/*
- * Initialize APIC interrupts
- */
-void __init apic_intr_init(void)
-{
-#ifdef CONFIG_SMP
- smp_intr_init();
-#endif
- /* self generated IPI for local APIC timer */
- alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* IPI vectors for APIC spurious and error interrupts */
- alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- /* thermal monitor LVT interrupt */
-#ifdef CONFIG_X86_MCE_P4THERMAL
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-}
-
/**
* connect_bsp_APIC - attach the APIC to the interrupt system
*/
void __init connect_bsp_APIC(void)
{
+#ifdef CONFIG_X86_32
if (pic_mode) {
/*
* Do not trust the local APIC being empty at bootup.
@@ -1388,6 +1759,7 @@ void __init connect_bsp_APIC(void)
outb(0x70, 0x22);
outb(0x01, 0x23);
}
+#endif
enable_apic_mode();
}
@@ -1400,6 +1772,9 @@ void __init connect_bsp_APIC(void)
*/
void disconnect_bsp_APIC(int virt_wire_setup)
{
+ unsigned int value;
+
+#ifdef CONFIG_X86_32
if (pic_mode) {
/*
* Put the board back into PIC mode (has an effect only on
@@ -1411,54 +1786,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
"entering PIC mode.\n");
outb(0x70, 0x22);
outb(0x00, 0x23);
- } else {
- /* Go back to Virtual Wire compatibility mode */
- unsigned long value;
-
- /* For the spurious interrupt use vector F, and enable it */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
- value |= 0xf;
- apic_write(APIC_SPIV, value);
-
- if (!virt_wire_setup) {
- /*
- * For LVT0 make it edge triggered, active high,
- * external and enabled
- */
- value = apic_read(APIC_LVT0);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
- apic_write(APIC_LVT0, value);
- } else {
- /* Disable LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED);
- }
+ return;
+ }
+#endif
+
+ /* Go back to Virtual Wire compatibility mode */
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write(APIC_SPIV, value);
+ if (!virt_wire_setup) {
/*
- * For LVT1 make it edge triggered, active high, nmi and
- * enabled
+ * For LVT0 make it edge triggered, active high,
+ * external and enabled
*/
- value = apic_read(APIC_LVT1);
- value &= ~(
- APIC_MODE_MASK | APIC_SEND_PENDING |
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
- apic_write(APIC_LVT1, value);
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write(APIC_LVT0, value);
+ } else {
+ /* Disable LVT0 */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED);
}
+
+ /*
+ * For LVT1 make it edge triggered, active high,
+ * nmi and enabled
+ */
+ value = apic_read(APIC_LVT1);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write(APIC_LVT1, value);
}
void __cpuinit generic_processor_info(int apicid, int version)
{
int cpu;
cpumask_t tmp_map;
- physid_mask_t phys_cpu;
/*
* Validate version
@@ -1471,9 +1845,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
}
apic_version[apicid] = version;
- phys_cpu = apicid_to_cpu_present(apicid);
- physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
-
if (num_processors >= NR_CPUS) {
printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
" Processor ignored.\n", NR_CPUS);
@@ -1484,17 +1855,19 @@ void __cpuinit generic_processor_info(int apicid, int version)
cpus_complement(tmp_map, cpu_present_map);
cpu = first_cpu(tmp_map);
- if (apicid == boot_cpu_physical_apicid)
+ physid_set(apicid, phys_cpu_present_map);
+ if (apicid == boot_cpu_physical_apicid) {
/*
* x86_bios_cpu_apicid is required to have processors listed
* in same order as logical cpu numbers. Hence the first
* entry is BSP, and so on.
*/
cpu = 0;
-
+ }
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;
+#ifdef CONFIG_X86_32
/*
* Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
* but we need to work other dependencies like SMP_SUSPEND etc
@@ -1514,7 +1887,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
def_to_bigsmp = 1;
}
}
-#ifdef CONFIG_SMP
+#endif
+
+#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
/* are we being called early in kernel startup? */
if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1527,16 +1902,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
}
#endif
+
cpu_set(cpu, cpu_possible_map);
cpu_set(cpu, cpu_present_map);
}
+#ifdef CONFIG_X86_64
+int hard_smp_processor_id(void)
+{
+ return read_apic_id();
+}
+#endif
+
/*
* Power management
*/
#ifdef CONFIG_PM
static struct {
+ /*
+ * 'active' is true if the local APIC was enabled by us and
+ * not the BIOS; this signifies that we are also responsible
+ * for disabling it before entering apm/acpi suspend
+ */
int active;
/* r/w apic fields */
unsigned int apic_id;
@@ -1577,7 +1965,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
if (maxlvt >= 5)
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
@@ -1601,16 +1989,23 @@ static int lapic_resume(struct sys_device *dev)
local_irq_save(flags);
- /*
- * Make sure the APICBASE points to the right address
- *
- * FIXME! This will be wrong if we ever support suspend on
- * SMP! We'll need to do this as part of the CPU restore!
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
+#ifdef HAVE_X2APIC
+ if (x2apic)
+ enable_x2apic();
+ else
+#endif
+ {
+ /*
+ * Make sure the APICBASE points to the right address
+ *
+ * FIXME! This will be wrong if we ever support suspend on
+ * SMP! We'll need to do this as part of the CPU restore!
+ */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1620,7 +2015,7 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
if (maxlvt >= 5)
apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
#endif
@@ -1634,7 +2029,9 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
+
local_irq_restore(flags);
+
return 0;
}
@@ -1654,7 +2051,7 @@ static struct sys_device device_lapic = {
.cls = &lapic_sysclass,
};
-static void __devinit apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
{
apic_pm_state.active = 1;
}
@@ -1680,30 +2077,101 @@ static void apic_pm_activate(void) { }
#endif /* CONFIG_PM */
+#ifdef CONFIG_X86_64
/*
- * APIC command line parameters
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ *
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis. Use available data to take a good guess.
+ * If in doubt, go HPET.
*/
-static int __init parse_lapic(char *arg)
+__cpuinit int apic_is_clustered_box(void)
{
- force_enable_local_apic = 1;
- return 0;
+ int i, clusters, zeros;
+ unsigned id;
+ u16 *bios_cpu_apicid;
+ DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
+
+ /*
+	 * There are no boxes of this kind with AMD CPUs yet. Some AMD
+	 * boxes with quad-core CPUs and 8 sockets have APIC IDs in
+	 * [4, 0x23] or [8, 0x27] and could be mistaken for a vSMP box;
+	 * this still needs checking...
+ */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
+ return 0;
+
+ bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
+ bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
+
+ for (i = 0; i < NR_CPUS; i++) {
+ /* are we being called early in kernel startup? */
+		if (bios_cpu_apicid) {
+			id = bios_cpu_apicid[i];
+		} else if (i < nr_cpu_ids) {
+			if (cpu_present(i))
+				id = per_cpu(x86_bios_cpu_apicid, i);
+			else
+				continue;
+		} else
+			break;
+
+ if (id != BAD_APICID)
+ __set_bit(APIC_CLUSTERID(id), clustermap);
+ }
+
+ /* Problem: Partially populated chassis may not have CPUs in some of
+ * the APIC clusters they have been allocated. Only present CPUs have
+ * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
+ * Since clusters are allocated sequentially, count zeros only if
+ * they are bounded by ones.
+ */
+ clusters = 0;
+ zeros = 0;
+ for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
+ if (test_bit(i, clustermap)) {
+ clusters += 1 + zeros;
+ zeros = 0;
+ } else
+ ++zeros;
+ }
+
+ /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+ * not guaranteed to be synced between boards
+ */
+ if (is_vsmp_box() && clusters > 1)
+ return 1;
+
+ /*
+ * If clusters > 2, then should be multi-chassis.
+ * May have to revisit this when multi-core + hyperthreaded CPUs come
+ * out, but AFAIK this will work even for them.
+ */
+ return (clusters > 2);
}
-early_param("lapic", parse_lapic);
+#endif
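
The zero-counting rule above is easiest to see on a concrete bitmap; a stand-alone sketch (hypothetical, not kernel code) reproducing the loop:

    /* Hypothetical sketch of the cluster-counting rule: gaps bounded
     * by populated clusters are counted, trailing gaps are not. */
    #include <stdio.h>

    static int count_clusters(const int *map, int n)
    {
            int i, clusters = 0, zeros = 0;

            for (i = 0; i < n; i++) {
                    if (map[i]) {
                            clusters += 1 + zeros;  /* bounded zeros count */
                            zeros = 0;
                    } else {
                            ++zeros;                /* trailing zeros do not */
                    }
            }
            return clusters;
    }

    int main(void)
    {
            int map[] = { 1, 0, 0, 1, 0 };

            printf("%d\n", count_clusters(map, 5));  /* prints 4 */
            return 0;
    }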
-static int __init parse_nolapic(char *arg)
+/*
+ * APIC command line parameters
+ */
+static int __init setup_disableapic(char *arg)
{
disable_apic = 1;
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
-early_param("nolapic", parse_nolapic);
+early_param("disableapic", setup_disableapic);
-static int __init parse_disable_lapic_timer(char *arg)
+/* same as disableapic, for compatibility */
+static int __init setup_nolapic(char *arg)
{
- local_apic_timer_disabled = 1;
- return 0;
+ return setup_disableapic(arg);
}
-early_param("nolapic_timer", parse_disable_lapic_timer);
+early_param("nolapic", setup_nolapic);
static int __init parse_lapic_timer_c2_ok(char *arg)
{
@@ -1712,15 +2180,39 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
}
early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
+static int __init parse_disable_apic_timer(char *arg)
+{
+ disable_apic_timer = 1;
+ return 0;
+}
+early_param("noapictimer", parse_disable_apic_timer);
+
+static int __init parse_nolapic_timer(char *arg)
+{
+ disable_apic_timer = 1;
+ return 0;
+}
+early_param("nolapic_timer", parse_nolapic_timer);
+
static int __init apic_set_verbosity(char *arg)
{
- if (!arg)
+ if (!arg) {
+#ifdef CONFIG_X86_64
+ skip_ioapic_setup = 0;
+ return 0;
+#endif
return -EINVAL;
+ }
- if (strcmp(arg, "debug") == 0)
+ if (strcmp("debug", arg) == 0)
apic_verbosity = APIC_DEBUG;
- else if (strcmp(arg, "verbose") == 0)
+ else if (strcmp("verbose", arg) == 0)
apic_verbosity = APIC_VERBOSE;
+ else {
+ printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+ " use apic=verbose or apic=debug\n", arg);
+ return -EINVAL;
+ }
return 0;
}
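
Taken together, the unified parser above accepts the same set of boot options on 32-bit and 64-bit kernels: "disableapic" and its alias "nolapic", "noapictimer" and "nolapic_timer" (both set disable_apic_timer), "lapic_timer_c2_ok", and "apic=verbose" or "apic=debug". An illustrative command line exercising them (example only, not from the patch):

    apic=debug nolapic_timer lapic_timer_c2_ok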
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
deleted file mode 100644
index 446c062e831..00000000000
--- a/arch/x86/kernel/apic_64.c
+++ /dev/null
@@ -1,1390 +0,0 @@
-/*
- * Local APIC handling, local APIC timers
- *
- * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * Fixes
- * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
- * thanks to Eric Gilmore
- * and Rolf G. Tews
- * for testing these extensively.
- * Maciej W. Rozycki : Various updates and fixes.
- * Mikael Pettersson : Power Management for UP-APIC.
- * Pavel Machek and
- * Mikael Pettersson : PM converted to driver model.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/ioport.h>
-#include <linux/clockchips.h>
-#include <linux/acpi_pmtmr.h>
-#include <linux/module.h>
-
-#include <asm/atomic.h>
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/hpet.h>
-#include <asm/pgalloc.h>
-#include <asm/nmi.h>
-#include <asm/idle.h>
-#include <asm/proto.h>
-#include <asm/timex.h>
-#include <asm/apic.h>
-
-#include <mach_ipi.h>
-#include <mach_apic.h>
-
-static int disable_apic_timer __cpuinitdata;
-static int apic_calibrate_pmtmr __initdata;
-int disable_apic;
-
-/* Local APIC timer works in C2 */
-int local_apic_timer_c2_ok;
-EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-
-/*
- * Debug level, exported for io_apic.c
- */
-unsigned int apic_verbosity;
-
-/* Have we found an MP table */
-int smp_found_config;
-
-static struct resource lapic_resource = {
- .name = "Local APIC",
- .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
-};
-
-static unsigned int calibration_result;
-
-static int lapic_next_event(unsigned long delta,
- struct clock_event_device *evt);
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
-static void apic_pm_activate(void);
-
-static struct clock_event_device lapic_clockevent = {
- .name = "lapic",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
- | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
- .shift = 32,
- .set_mode = lapic_timer_setup,
- .set_next_event = lapic_next_event,
- .broadcast = lapic_timer_broadcast,
- .rating = 100,
- .irq = -1,
-};
-static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-
-static unsigned long apic_phys;
-
-unsigned long mp_lapic_addr;
-
-/*
- * Get the LAPIC version
- */
-static inline int lapic_get_version(void)
-{
- return GET_APIC_VERSION(apic_read(APIC_LVR));
-}
-
-/*
- * Check if the APIC is integrated or a separate chip
- */
-static inline int lapic_is_integrated(void)
-{
- return 1;
-}
-
-/*
- * Check whether this is a modern or a first-generation APIC
- */
-static int modern_apic(void)
-{
- /* AMD systems use old APIC versions, so check the CPU */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 >= 0xf)
- return 1;
- return lapic_get_version() >= 0x14;
-}
-
-void apic_wait_icr_idle(void)
-{
- while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
- cpu_relax();
-}
-
-u32 safe_apic_wait_icr_idle(void)
-{
- u32 send_status;
- int timeout;
-
- timeout = 0;
- do {
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- if (!send_status)
- break;
- udelay(100);
- } while (timeout++ < 1000);
-
- return send_status;
-}
-
-/**
- * enable_NMI_through_LVT0 - enable NMI through local vector table 0
- */
-void __cpuinit enable_NMI_through_LVT0(void)
-{
- unsigned int v;
-
- /* unmask and set to NMI */
- v = APIC_DM_NMI;
- apic_write(APIC_LVT0, v);
-}
-
-/**
- * lapic_get_maxlvt - get the maximum number of local vector table entries
- */
-int lapic_get_maxlvt(void)
-{
- unsigned int v, maxlvt;
-
- v = apic_read(APIC_LVR);
- maxlvt = GET_APIC_MAXLVT(v);
- return maxlvt;
-}
-
-/*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
- */
-
-static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
-{
- unsigned int lvtt_value, tmp_value;
-
- lvtt_value = LOCAL_TIMER_VECTOR;
- if (!oneshot)
- lvtt_value |= APIC_LVT_TIMER_PERIODIC;
- if (!irqen)
- lvtt_value |= APIC_LVT_MASKED;
-
- apic_write(APIC_LVTT, lvtt_value);
-
- /*
- * Divide PICLK by 16
- */
- tmp_value = apic_read(APIC_TDCR);
- apic_write(APIC_TDCR, (tmp_value
- & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
- | APIC_TDR_DIV_16);
-
- if (!oneshot)
- apic_write(APIC_TMICT, clocks);
-}
-
-/*
- * Setup extended LVT, AMD specific (K8, family 10h)
- *
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
- */
-
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
-
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
-{
- unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
- unsigned int v = (mask << 16) | (msg_type << 8) | vector;
-
- apic_write(reg, v);
-}
-
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_MCE;
-}
-
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_IBS;
-}
-
-/*
- * Program the next event, relative to now
- */
-static int lapic_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- apic_write(APIC_TMICT, delta);
- return 0;
-}
-
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- unsigned long flags;
- unsigned int v;
-
- /* Lapic used as dummy for broadcast ? */
- if (evt->features & CLOCK_EVT_FEAT_DUMMY)
- return;
-
- local_irq_save(flags);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- case CLOCK_EVT_MODE_ONESHOT:
- __setup_APIC_LVTT(calibration_result,
- mode != CLOCK_EVT_MODE_PERIODIC, 1);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- v = apic_read(APIC_LVTT);
- v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, v);
- break;
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
-
- local_irq_restore(flags);
-}
-
-/*
- * Local APIC timer broadcast function
- */
-static void lapic_timer_broadcast(cpumask_t mask)
-{
-#ifdef CONFIG_SMP
- send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
-#endif
-}
-
-/*
- * Set up the local APIC timer for this CPU. Copy the initialized values
- * of the boot CPU and register the clock event in the framework.
- */
-static void setup_APIC_timer(void)
-{
- struct clock_event_device *levt = &__get_cpu_var(lapic_events);
-
- memcpy(levt, &lapic_clockevent, sizeof(*levt));
- levt->cpumask = cpumask_of_cpu(smp_processor_id());
-
- clockevents_register_device(levt);
-}
-
-/*
- * In this function we calibrate APIC bus clocks to the external
- * timer. Unfortunately we cannot use jiffies and the timer irq
- * to calibrate, since some later bootup code depends on getting
- * the first irq? Ugh.
- *
- * We want to do the calibration only once, since we
- * want the local timer irqs to be synchronous. CPUs connected
- * by the same APIC bus have the very same bus frequency.
- * And we want to have irqs off anyway, no accidental
- * APIC irq that way.
- */
-
-#define TICK_COUNT 100000000
-
-static int __init calibrate_APIC_clock(void)
-{
- unsigned apic, apic_start;
- unsigned long tsc, tsc_start;
- int result;
-
- local_irq_disable();
-
- /*
- * Put whatever arbitrary (but long enough) timeout
- * value into the APIC clock, we just want to get the
- * counter running for calibration.
- *
- * No interrupt enable !
- */
- __setup_APIC_LVTT(250000000, 0, 0);
-
- apic_start = apic_read(APIC_TMCCT);
-#ifdef CONFIG_X86_PM_TIMER
- if (apic_calibrate_pmtmr && pmtmr_ioport) {
- pmtimer_wait(5000); /* 5ms wait */
- apic = apic_read(APIC_TMCCT);
- result = (apic_start - apic) * 1000L / 5;
- } else
-#endif
- {
- rdtscll(tsc_start);
-
- do {
- apic = apic_read(APIC_TMCCT);
- rdtscll(tsc);
- } while ((tsc - tsc_start) < TICK_COUNT &&
- (apic_start - apic) < TICK_COUNT);
-
- result = (apic_start - apic) * 1000L * tsc_khz /
- (tsc - tsc_start);
- }
-
- local_irq_enable();
-
- printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
-
- printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
- result / 1000 / 1000, result / 1000 % 1000);
-
- /* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC,
- lapic_clockevent.shift);
- lapic_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
- lapic_clockevent.min_delta_ns =
- clockevent_delta2ns(0xF, &lapic_clockevent);
-
- calibration_result = result / HZ;
-
- /*
- * Do a sanity check on the APIC calibration result
- */
- if (calibration_result < (1000000 / HZ)) {
- printk(KERN_WARNING
- "APIC frequency too slow, disabling apic timer\n");
- return -1;
- }
-
- return 0;
-}
-
-/*
- * Setup the boot APIC
- *
- * Calibrate and verify the result.
- */
-void __init setup_boot_APIC_clock(void)
-{
- /*
- * The local apic timer can be disabled via the kernel commandline.
- * Register the lapic timer as a dummy clock event source on SMP
- * systems, so the broadcast mechanism is used. On UP systems simply
- * ignore it.
- */
- if (disable_apic_timer) {
- printk(KERN_INFO "Disabling APIC timer\n");
- /* No broadcast on UP ! */
- if (num_possible_cpus() > 1) {
- lapic_clockevent.mult = 1;
- setup_APIC_timer();
- }
- return;
- }
-
- printk(KERN_INFO "Using local APIC timer interrupts.\n");
- if (calibrate_APIC_clock()) {
- /* No broadcast on UP ! */
- if (num_possible_cpus() > 1)
- setup_APIC_timer();
- return;
- }
-
- /*
- * If nmi_watchdog is set to IO_APIC, we need the
- * PIT/HPET going. Otherwise register lapic as a dummy
- * device.
- */
- if (nmi_watchdog != NMI_IO_APIC)
- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
- else
- printk(KERN_WARNING "APIC timer registered as dummy,"
- " due to nmi_watchdog=%d!\n", nmi_watchdog);
-
- setup_APIC_timer();
-}
-
-void __cpuinit setup_secondary_APIC_clock(void)
-{
- setup_APIC_timer();
-}
-
-/*
- * The guts of the apic timer interrupt
- */
-static void local_apic_timer_interrupt(void)
-{
- int cpu = smp_processor_id();
- struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
-
- /*
- * Normally we should not be here until the LAPIC has been initialized,
- * but in some cases like kdump it's possible that a pending LAPIC timer
- * interrupt from the previous kernel's context is delivered in the new
- * kernel the moment interrupts are enabled.
- *
- * Interrupts are enabled early and the LAPIC is set up much later, hence
- * it's possible that when we get here evt->event_handler is NULL.
- * Check for event_handler being NULL and discard the interrupt as
- * spurious.
- */
- if (!evt->event_handler) {
- printk(KERN_WARNING
- "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
- /* Switch it off */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
- return;
- }
-
- /*
- * the NMI deadlock-detector uses this.
- */
- add_pda(apic_timer_irqs, 1);
-
- evt->event_handler(evt);
-}
-
-/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- * interrupt as well. Thus we cannot inline the local irq ... ]
- */
-void smp_apic_timer_interrupt(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- */
- ack_APIC_irq();
- /*
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't, timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- exit_idle();
- irq_enter();
- local_apic_timer_interrupt();
- irq_exit();
- set_irq_regs(old_regs);
-}
-
-int setup_profiling_timer(unsigned int multiplier)
-{
- return -EINVAL;
-}
-
-
-/*
- * Local APIC start and shutdown
- */
-
-/**
- * clear_local_APIC - shutdown the local APIC
- *
- * This is called when a CPU is disabled and before rebooting, so the state of
- * the local APIC has no dangling leftovers. Also used to clean out any BIOS
- * leftovers during boot.
- */
-void clear_local_APIC(void)
-{
- int maxlvt;
- u32 v;
-
- /* APIC hasn't been mapped yet */
- if (!apic_phys)
- return;
-
- maxlvt = lapic_get_maxlvt();
- /*
- * Masking an LVT entry can trigger a local APIC error
- * if the vector is zero. Mask LVTERR first to prevent this.
- */
- if (maxlvt >= 3) {
- v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
- apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
- }
- /*
- * Careful: we have to set masks only first to deassert
- * any level-triggered sources.
- */
- v = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
- v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
- v = apic_read(APIC_LVT1);
- apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
- if (maxlvt >= 4) {
- v = apic_read(APIC_LVTPC);
- apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
- }
-
- /*
- * Clean APIC state for other OSs:
- */
- apic_write(APIC_LVTT, APIC_LVT_MASKED);
- apic_write(APIC_LVT0, APIC_LVT_MASKED);
- apic_write(APIC_LVT1, APIC_LVT_MASKED);
- if (maxlvt >= 3)
- apic_write(APIC_LVTERR, APIC_LVT_MASKED);
- if (maxlvt >= 4)
- apic_write(APIC_LVTPC, APIC_LVT_MASKED);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
-}
-
-/**
- * disable_local_APIC - clear and disable the local APIC
- */
-void disable_local_APIC(void)
-{
- unsigned int value;
-
- clear_local_APIC();
-
- /*
- * Disable APIC (implies clearing of registers
- * for 82489DX!).
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_SPIV_APIC_ENABLED;
- apic_write(APIC_SPIV, value);
-}
-
-void lapic_shutdown(void)
-{
- unsigned long flags;
-
- if (!cpu_has_apic)
- return;
-
- local_irq_save(flags);
-
- disable_local_APIC();
-
- local_irq_restore(flags);
-}
-
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
- unsigned int reg0, reg1;
-
- /*
- * The version register is read-only in a real APIC.
- */
- reg0 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
- apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
- reg1 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
- /*
- * The two version reads above should print the same
- * numbers. If the second one is different, then we
- * poke at a non-APIC.
- */
- if (reg1 != reg0)
- return 0;
-
- /*
- * Check if the version looks reasonable.
- */
- reg1 = GET_APIC_VERSION(reg0);
- if (reg1 == 0x00 || reg1 == 0xff)
- return 0;
- reg1 = lapic_get_maxlvt();
- if (reg1 < 0x02 || reg1 == 0xff)
- return 0;
-
- /*
- * The ID register is read/write in a real APIC.
- */
- reg0 = read_apic_id();
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
- apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
- reg1 = read_apic_id();
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
- apic_write(APIC_ID, reg0);
- if (reg1 != (reg0 ^ APIC_ID_MASK))
- return 0;
-
- /*
- * The next two are just to see if we have sane values.
- * They're only really relevant if we're in Virtual Wire
- * compatibility mode, but most boxes are anymore.
- */
- reg0 = apic_read(APIC_LVT0);
- apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
- reg1 = apic_read(APIC_LVT1);
- apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
- return 1;
-}
-
-/**
- * sync_Arb_IDs - synchronize APIC bus arbitration IDs
- */
-void __init sync_Arb_IDs(void)
-{
- /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
- if (modern_apic())
- return;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
- | APIC_DM_INIT);
-}
-
-/*
- * An initial setup of the virtual wire mode.
- */
-void __init init_bsp_APIC(void)
-{
- unsigned int value;
-
- /*
- * Don't do the setup now if we have an SMP BIOS as the
- * through-I/O-APIC virtual wire mode might be active.
- */
- if (smp_found_config || !cpu_has_apic)
- return;
-
- value = apic_read(APIC_LVR);
-
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
-
- /*
- * Enable APIC.
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
- value |= APIC_SPIV_FOCUS_DISABLED;
- value |= SPURIOUS_APIC_VECTOR;
- apic_write(APIC_SPIV, value);
-
- /*
- * Set up the virtual wire mode.
- */
- apic_write(APIC_LVT0, APIC_DM_EXTINT);
- value = APIC_DM_NMI;
- apic_write(APIC_LVT1, value);
-}
-
-/**
- * setup_local_APIC - setup the local APIC
- */
-void __cpuinit setup_local_APIC(void)
-{
- unsigned int value;
- int i, j;
-
- preempt_disable();
- value = apic_read(APIC_LVR);
-
- BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
-
- /*
- * Double-check whether this APIC is really registered.
- * This is meaningless in clustered apic mode, so we skip it.
- */
- if (!apic_id_registered())
- BUG();
-
- /*
- * Intel recommends setting DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
- init_apic_ldr();
-
- /*
- * Set Task Priority to 'accept all'. We never change this
- * later on.
- */
- value = apic_read(APIC_TASKPRI);
- value &= ~APIC_TPRI_MASK;
- apic_write(APIC_TASKPRI, value);
-
- /*
- * After a crash, we no longer service the interrupts and a pending
- * interrupt from previous kernel might still have ISR bit set.
- *
- * Most probably by now the CPU has serviced that pending interrupt and
- * might not have done the ack_APIC_irq() because it thought the
- * interrupt came from i8259 as ExtInt. The LAPIC did not get an EOI, so
- * it does not clear the ISR bit and the CPU thinks it has already
- * serviced the interrupt. Hence a vector might get locked. It was noticed
- * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
- */
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j))
- ack_APIC_irq();
- }
- }
-
- /*
- * Now that we are all set up, enable the APIC
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- /*
- * Enable APIC
- */
- value |= APIC_SPIV_APIC_ENABLED;
-
- /* We always use processor focus */
-
- /*
- * Set spurious IRQ vector
- */
- value |= SPURIOUS_APIC_VECTOR;
- apic_write(APIC_SPIV, value);
-
- /*
- * Set up LVT0, LVT1:
- *
- * set up through-local-APIC on the BP's LINT0. This is not
- * strictly necessary in pure symmetric-IO mode, but sometimes
- * we delegate interrupts to the 8259A.
- */
- /*
- * TODO: set up through-local-APIC from through-I/O-APIC? --macro
- */
- value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && !value) {
- value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
- smp_processor_id());
- } else {
- value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
- smp_processor_id());
- }
- apic_write(APIC_LVT0, value);
-
- /*
- * only the BP should see the LINT1 NMI signal, obviously.
- */
- if (!smp_processor_id())
- value = APIC_DM_NMI;
- else
- value = APIC_DM_NMI | APIC_LVT_MASKED;
- apic_write(APIC_LVT1, value);
- preempt_enable();
-}
-
-static void __cpuinit lapic_setup_esr(void)
-{
- unsigned maxlvt = lapic_get_maxlvt();
-
- apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
- /*
- * spec says clear errors after enabling vector.
- */
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
-}
-
-void __cpuinit end_local_APIC_setup(void)
-{
- lapic_setup_esr();
- setup_apic_nmi_watchdog(NULL);
- apic_pm_activate();
-}
-
-/*
- * Detect and enable local APICs on non-SMP boards.
- * Original code written by Keir Fraser.
- * On AMD64 we trust the BIOS - if it says no APIC it is likely
- * not correctly set up (usually the APIC timer won't work etc.)
- */
-static int __init detect_init_APIC(void)
-{
- if (!cpu_has_apic) {
- printk(KERN_INFO "No local APIC present\n");
- return -1;
- }
-
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- boot_cpu_physical_apicid = 0;
- return 0;
-}
-
-void __init early_init_lapic_mapping(void)
-{
- unsigned long phys_addr;
-
- /*
- * If no local APIC can be found then bail out:
- * it means there is no mptable and no MADT
- */
- if (!smp_found_config)
- return;
-
- phys_addr = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, phys_addr);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
-}
-
-/**
- * init_apic_mappings - initialize APIC mappings
- */
-void __init init_apic_mappings(void)
-{
- /*
- * If no local APIC can be found then set up a fake all
- * zeroes page to simulate the local APIC and another
- * one for the IO-APIC.
- */
- if (!smp_found_config && detect_init_APIC()) {
- apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
- apic_phys = __pa(apic_phys);
- } else
- apic_phys = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, apic_phys);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
-}
-
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
- */
-int __init APIC_init_uniprocessor(void)
-{
- if (disable_apic) {
- printk(KERN_INFO "Apic disabled\n");
- return -1;
- }
- if (!cpu_has_apic) {
- disable_apic = 1;
- printk(KERN_INFO "Apic disabled by BIOS\n");
- return -1;
- }
-
- verify_local_APIC();
-
- connect_bsp_APIC();
-
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
-
- setup_local_APIC();
-
- /*
- * Now enable IO-APICs, actually call clear_IO_APIC
- * We need clear_IO_APIC before enabling vector on BP
- */
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
-
- if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
- localise_nmi_watchdog();
- end_local_APIC_setup();
-
- if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
- else
- nr_ioapics = 0;
- setup_boot_APIC_clock();
- check_nmi_watchdog();
- return 0;
-}
-
-/*
- * Local APIC interrupts
- */
-
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
- */
-asmlinkage void smp_spurious_interrupt(void)
-{
- unsigned int v;
- exit_idle();
- irq_enter();
- /*
- * Check if this really is a spurious interrupt and ACK it
- * if it is a vectored one. Just in case...
- * Spurious interrupts should not be ACKed.
- */
- v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
- if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
- ack_APIC_irq();
-
- add_pda(irq_spurious_count, 1);
- irq_exit();
-}
-
-/*
- * This interrupt should never happen with our APIC/SMP architecture
- */
-asmlinkage void smp_error_interrupt(void)
-{
- unsigned int v, v1;
-
- exit_idle();
- irq_enter();
- /* First tickle the hardware, only then report what went on. -- REW */
- v = apic_read(APIC_ESR);
- apic_write(APIC_ESR, 0);
- v1 = apic_read(APIC_ESR);
- ack_APIC_irq();
- atomic_inc(&irq_err_count);
-
- /* Here is what the APIC error bits mean:
- 0: Send CS error
- 1: Receive CS error
- 2: Send accept error
- 3: Receive accept error
- 4: Reserved
- 5: Send illegal vector
- 6: Received illegal vector
- 7: Illegal register address
- */
- printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
- smp_processor_id(), v , v1);
- irq_exit();
-}
-
-/**
- * connect_bsp_APIC - attach the APIC to the interrupt system
- */
-void __init connect_bsp_APIC(void)
-{
- enable_apic_mode();
-}
-
-void disconnect_bsp_APIC(int virt_wire_setup)
-{
- /* Go back to Virtual Wire compatibility mode */
- unsigned long value;
-
- /* For the spurious interrupt use vector F, and enable it */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
- value |= 0xf;
- apic_write(APIC_SPIV, value);
-
- if (!virt_wire_setup) {
- /*
- * For LVT0 make it edge triggered, active high,
- * external and enabled
- */
- value = apic_read(APIC_LVT0);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
- apic_write(APIC_LVT0, value);
- } else {
- /* Disable LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED);
- }
-
- /* For LVT1 make it edge triggered, active high, nmi and enabled */
- value = apic_read(APIC_LVT1);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
- apic_write(APIC_LVT1, value);
-}
-
-void __cpuinit generic_processor_info(int apicid, int version)
-{
- int cpu;
- cpumask_t tmp_map;
-
- if (num_processors >= NR_CPUS) {
- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
- " Processor ignored.\n", NR_CPUS);
- return;
- }
-
- num_processors++;
- cpus_complement(tmp_map, cpu_present_map);
- cpu = first_cpu(tmp_map);
-
- physid_set(apicid, phys_cpu_present_map);
- if (apicid == boot_cpu_physical_apicid) {
- /*
- * x86_bios_cpu_apicid is required to have processors listed
- * in same order as logical cpu numbers. Hence the first
- * entry is BSP, and so on.
- */
- cpu = 0;
- }
- if (apicid > max_physical_apicid)
- max_physical_apicid = apicid;
-
- /* are we being called early in kernel startup? */
- if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
- u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
- u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
- cpu_to_apicid[cpu] = apicid;
- bios_cpu_apicid[cpu] = apicid;
- } else {
- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
- }
-
- cpu_set(cpu, cpu_possible_map);
- cpu_set(cpu, cpu_present_map);
-}
-
-/*
- * Power management
- */
-#ifdef CONFIG_PM
-
-static struct {
- /* 'active' is true if the local APIC was enabled by us and
- not the BIOS; this signifies that we are also responsible
- for disabling it before entering apm/acpi suspend */
- int active;
- /* r/w apic fields */
- unsigned int apic_id;
- unsigned int apic_taskpri;
- unsigned int apic_ldr;
- unsigned int apic_dfr;
- unsigned int apic_spiv;
- unsigned int apic_lvtt;
- unsigned int apic_lvtpc;
- unsigned int apic_lvt0;
- unsigned int apic_lvt1;
- unsigned int apic_lvterr;
- unsigned int apic_tmict;
- unsigned int apic_tdcr;
- unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- unsigned long flags;
- int maxlvt;
-
- if (!apic_pm_state.active)
- return 0;
-
- maxlvt = lapic_get_maxlvt();
-
- apic_pm_state.apic_id = read_apic_id();
- apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
- apic_pm_state.apic_ldr = apic_read(APIC_LDR);
- apic_pm_state.apic_dfr = apic_read(APIC_DFR);
- apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
- apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
- if (maxlvt >= 4)
- apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
- apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
- apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
- apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
- apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
- apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_INTEL
- if (maxlvt >= 5)
- apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-#endif
- local_irq_save(flags);
- disable_local_APIC();
- local_irq_restore(flags);
- return 0;
-}
-
-static int lapic_resume(struct sys_device *dev)
-{
- unsigned int l, h;
- unsigned long flags;
- int maxlvt;
-
- if (!apic_pm_state.active)
- return 0;
-
- maxlvt = lapic_get_maxlvt();
-
- local_irq_save(flags);
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
- apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
- apic_write(APIC_ID, apic_pm_state.apic_id);
- apic_write(APIC_DFR, apic_pm_state.apic_dfr);
- apic_write(APIC_LDR, apic_pm_state.apic_ldr);
- apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
- apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
- apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
- apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_INTEL
- if (maxlvt >= 5)
- apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-#endif
- if (maxlvt >= 4)
- apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
- apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
- apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
- apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- local_irq_restore(flags);
- return 0;
-}
-
-static struct sysdev_class lapic_sysclass = {
- .name = "lapic",
- .resume = lapic_resume,
- .suspend = lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
- .id = 0,
- .cls = &lapic_sysclass,
-};
-
-static void __cpuinit apic_pm_activate(void)
-{
- apic_pm_state.active = 1;
-}
-
-static int __init init_lapic_sysfs(void)
-{
- int error;
-
- if (!cpu_has_apic)
- return 0;
- /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-
- error = sysdev_class_register(&lapic_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic);
- return error;
-}
-device_initcall(init_lapic_sysfs);
-
-#else /* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif /* CONFIG_PM */
-
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
-{
- int i, clusters, zeros;
- unsigned id;
- u16 *bios_cpu_apicid;
- DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
- /*
-	 * There are no boxes of this kind with AMD CPUs yet. Some AMD
-	 * boxes with quad-core CPUs and 8 sockets have APIC IDs in
-	 * [4, 0x23] or [8, 0x27] and could be mistaken for a vSMP box;
-	 * this still needs checking...
- */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
- return 0;
-
- bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
- bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
- for (i = 0; i < NR_CPUS; i++) {
- /* are we being called early in kernel startup? */
- if (bios_cpu_apicid) {
- id = bios_cpu_apicid[i];
- }
- else if (i < nr_cpu_ids) {
- if (cpu_present(i))
- id = per_cpu(x86_bios_cpu_apicid, i);
- else
- continue;
- }
- else
- break;
-
- if (id != BAD_APICID)
- __set_bit(APIC_CLUSTERID(id), clustermap);
- }
-
- /* Problem: Partially populated chassis may not have CPUs in some of
- * the APIC clusters they have been allocated. Only present CPUs have
- * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
- * Since clusters are allocated sequentially, count zeros only if
- * they are bounded by ones.
- */
- clusters = 0;
- zeros = 0;
- for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
- if (test_bit(i, clustermap)) {
- clusters += 1 + zeros;
- zeros = 0;
- } else
- ++zeros;
- }
-
- /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
- * not guaranteed to be synced between boards
- */
- if (is_vsmp_box() && clusters > 1)
- return 1;
-
- /*
- * If clusters > 2, then should be multi-chassis.
- * May have to revisit this when multi-core + hyperthreaded CPUs come
- * out, but AFAIK this will work even for them.
- */
- return (clusters > 2);
-}
-
-/*
- * APIC command line parameters
- */
-static int __init apic_set_verbosity(char *str)
-{
- if (str == NULL) {
- skip_ioapic_setup = 0;
- ioapic_force = 1;
- return 0;
- }
- if (strcmp("debug", str) == 0)
- apic_verbosity = APIC_DEBUG;
- else if (strcmp("verbose", str) == 0)
- apic_verbosity = APIC_VERBOSE;
- else {
- printk(KERN_WARNING "APIC Verbosity level %s not recognised"
- " use apic=verbose or apic=debug\n", str);
- return -EINVAL;
- }
-
- return 0;
-}
-early_param("apic", apic_set_verbosity);
-
-static __init int setup_disableapic(char *str)
-{
- disable_apic = 1;
- setup_clear_cpu_cap(X86_FEATURE_APIC);
- return 0;
-}
-early_param("disableapic", setup_disableapic);
-
-/* same as disableapic, for compatibility */
-static __init int setup_nolapic(char *str)
-{
- return setup_disableapic(str);
-}
-early_param("nolapic", setup_nolapic);
-
-static int __init parse_lapic_timer_c2_ok(char *arg)
-{
- local_apic_timer_c2_ok = 1;
- return 0;
-}
-early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
-
-static __init int setup_noapictimer(char *str)
-{
- if (str[0] != ' ' && str[0] != 0)
- return 0;
- disable_apic_timer = 1;
- return 1;
-}
-__setup("noapictimer", setup_noapictimer);
-
-static __init int setup_apicpmtimer(char *s)
-{
- apic_calibrate_pmtmr = 1;
- notsc_setup(NULL);
- return 0;
-}
-__setup("apicpmtimer", setup_apicpmtimer);
-
-static int __init lapic_insert_resource(void)
-{
- if (!apic_phys)
- return -1;
-
- /* Put local APIC into the resource map. */
- lapic_resource.start = apic_phys;
- lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
- insert_resource(&iomem_resource, &lapic_resource);
-
- return 0;
-}
-
-/*
- * needs to be called after e820_reserve_resources(),
- * which uses request_resource()
- */
-late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 9ee24e6bc4b..5145a6e72bb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,12 +228,12 @@
#include <linux/suspend.h>
#include <linux/kthread.h>
#include <linux/jiffies.h>
-#include <linux/smp_lock.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/i8253.h>
+#include <asm/olpc.h>
#include <asm/paravirt.h>
#include <asm/reboot.h>
@@ -2217,7 +2217,7 @@ static int __init apm_init(void)
dmi_check_system(apm_dmi_table);
- if (apm_info.bios.version == 0 || paravirt_enabled()) {
+ if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
printk(KERN_INFO "apm: BIOS not found.\n");
return -ENODEV;
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index aa89387006f..7fcf63d22f8 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -22,7 +22,7 @@
#define __NO_STUBS 1
#undef __SYSCALL
-#undef _ASM_X86_64_UNISTD_H_
+#undef _ASM_X86_UNISTD_64_H
#define __SYSCALL(nr, sym) [nr] = 1,
static char syscalls[] = {
#include <asm/unistd.h>
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index c639bd55391..f0dfe6f17e7 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -1,8 +1,6 @@
/*
* BIOS run time interface routines.
*
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -16,33 +14,128 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson
*/
+#include <linux/efi.h>
+#include <asm/efi.h>
+#include <linux/io.h>
#include <asm/uv/bios.h>
+#include <asm/uv/uv_hub.h>
+
+struct uv_systab uv_systab;
-const char *
-x86_bios_strerror(long status)
+s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
{
- const char *str;
- switch (status) {
- case 0: str = "Call completed without error"; break;
- case -1: str = "Not implemented"; break;
- case -2: str = "Invalid argument"; break;
- case -3: str = "Call completed with error"; break;
- default: str = "Unknown BIOS status code"; break;
- }
- return str;
+ struct uv_systab *tab = &uv_systab;
+
+ if (!tab->function)
+ /*
+ * BIOS does not support UV systab
+ */
+ return BIOS_STATUS_UNIMPLEMENTED;
+
+ return efi_call6((void *)__va(tab->function),
+ (u64)which, a1, a2, a3, a4, a5);
}
-long
-x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
- unsigned long *drift_info)
+s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
{
- struct uv_bios_retval isrv;
+ unsigned long bios_flags;
+ s64 ret;
- BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
- *ticks_per_second = isrv.v0;
- *drift_info = isrv.v1;
- return isrv.status;
+ local_irq_save(bios_flags);
+ ret = uv_bios_call(which, a1, a2, a3, a4, a5);
+ local_irq_restore(bios_flags);
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(x86_bios_freq_base);
+
+s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
+{
+ s64 ret;
+
+ preempt_disable();
+ ret = uv_bios_call(which, a1, a2, a3, a4, a5);
+ preempt_enable();
+
+ return ret;
+}
+
+
+long sn_partition_id;
+EXPORT_SYMBOL_GPL(sn_partition_id);
+long uv_coherency_id;
+EXPORT_SYMBOL_GPL(uv_coherency_id);
+long uv_region_size;
+EXPORT_SYMBOL_GPL(uv_region_size);
+int uv_type;
+
+
+s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
+ long *region)
+{
+ s64 ret;
+ u64 v0, v1;
+ union partition_info_u part;
+
+ ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
+ (u64)(&v0), (u64)(&v1), 0, 0);
+ if (ret != BIOS_STATUS_SUCCESS)
+ return ret;
+
+ part.val = v0;
+ if (uvtype)
+ *uvtype = part.hub_version;
+ if (partid)
+ *partid = part.partition_id;
+ if (coher)
+ *coher = part.coherence_id;
+ if (region)
+ *region = part.region_size;
+ return ret;
+}
+
+
+s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
+{
+ return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
+ (u64)ticks_per_second, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_freq_base);
+
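
For orientation, a caller sketch of the exported helper (hypothetical; BIOS_FREQ_BASE_REALTIME_CLOCK is assumed to be one of the clock_type constants in asm/uv/bios.h):

    /* Hypothetical caller sketch: query a base frequency via the UV
     * BIOS. Assumes kernel context with asm/uv/bios.h included;
     * BIOS_FREQ_BASE_REALTIME_CLOCK is an assumed clock_type value. */
    static int example_get_rtc_freq(u64 *ticks_per_second)
    {
            s64 status;

            status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
                                       ticks_per_second);
            return status == BIOS_STATUS_SUCCESS ? 0 : -EIO;
    }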
+
+#ifdef CONFIG_EFI
+void uv_bios_init(void)
+{
+ struct uv_systab *tab;
+
+ if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
+ (efi.uv_systab == (unsigned long)NULL)) {
+ printk(KERN_CRIT "No EFI UV System Table.\n");
+ uv_systab.function = (unsigned long)NULL;
+ return;
+ }
+
+ tab = (struct uv_systab *)ioremap(efi.uv_systab,
+ sizeof(struct uv_systab));
+ if (strncmp(tab->signature, "UVST", 4) != 0)
+ printk(KERN_ERR "bad signature in UV system table!");
+
+ /*
+ * Copy table to permanent spot for later use.
+ */
+ memcpy(&uv_systab, tab, sizeof(struct uv_systab));
+ iounmap(tab);
+
+ printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision);
+}
+#else /* !CONFIG_EFI */
+
+void uv_bios_init(void) { }
+#endif
+
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
new file mode 100644
index 00000000000..667df55a439
--- /dev/null
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -0,0 +1 @@
+capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ee76eaad300..82ec6075c05 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,22 +3,30 @@
#
obj-y := intel_cacheinfo.o addon_cpuid_features.o
-obj-y += proc.o feature_names.o
-
-obj-$(CONFIG_X86_32) += common.o bugs.o
-obj-$(CONFIG_X86_64) += common_64.o bugs_64.o
-obj-$(CONFIG_X86_32) += amd.o
-obj-$(CONFIG_X86_64) += amd_64.o
-obj-$(CONFIG_X86_32) += cyrix.o
-obj-$(CONFIG_X86_32) += centaur.o
-obj-$(CONFIG_X86_64) += centaur_64.o
-obj-$(CONFIG_X86_32) += transmeta.o
-obj-$(CONFIG_X86_32) += intel.o
-obj-$(CONFIG_X86_64) += intel_64.o
-obj-$(CONFIG_X86_32) += umc.o
+obj-y += proc.o capflags.o powerflags.o common.o
+
+obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
+obj-$(CONFIG_X86_64) += bugs_64.o
+
+obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
+obj-$(CONFIG_CPU_SUP_AMD) += amd.o
+obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
+obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
+obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
+obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
+obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+
+quiet_cmd_mkcapflags = MKCAP $@
+ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
+
+cpufeature = $(src)/../../include/asm/cpufeature.h
+
+targets += capflags.c
+$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
+ $(call if_changed,mkcapflags)
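
For orientation, the generated capflags.c is essentially a string table indexed by feature bit. An abbreviated, illustrative sketch of the kind of output mkcapflags.pl emits (entries derived from the X86_FEATURE_* definitions in cpufeature.h; exact layout assumed):

    /* Illustrative sketch only -- the real capflags.c is generated by
     * mkcapflags.pl from cpufeature.h; entries here are abbreviated. */
    const char * const x86_cap_flags[] = {
            [0] = "fpu",    /* X86_FEATURE_FPU */
            [4] = "tsc",    /* X86_FEATURE_TSC */
            [9] = "apic",   /* X86_FEATURE_APIC */
    };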
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index a6ef672adbb..0d9c993aa93 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,6 +7,8 @@
#include <asm/pat.h>
#include <asm/processor.h>
+#include <mach_apic.h>
+
struct cpuid_bit {
u16 feature;
u8 reg;
@@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
}
}
+/* leaf 0xb SMT level */
+#define SMT_LEVEL 0
+
+/* leaf 0xb sub-leaf types */
+#define INVALID_TYPE 0
+#define SMT_TYPE 1
+#define CORE_TYPE 2
+
+#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
+#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
+#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
+
+/*
+ * Check for the extended topology enumeration cpuid leaf 0xb and, if it
+ * exists, use it to populate initial_apicid and for cpu topology
+ * detection.
+ */
+void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned int eax, ebx, ecx, edx, sub_index;
+ unsigned int ht_mask_width, core_plus_mask_width;
+ unsigned int core_select_mask, core_level_siblings;
+
+ if (c->cpuid_level < 0xb)
+ return;
+
+ cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+
+ /*
+ * check if the cpuid leaf 0xb is actually implemented.
+ */
+ if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
+ return;
+
+ set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
+
+ /*
+ * initial apic id, which also represents 32-bit extended x2apic id.
+ */
+ c->initial_apicid = edx;
+
+ /*
+ * Populate HT related information from sub-leaf level 0.
+ */
+ core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+
+ sub_index = 1;
+ do {
+ cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
+
+ /*
+ * Check for the Core type in the implemented sub leaves.
+ */
+ if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
+ core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+ break;
+ }
+
+ sub_index++;
+ } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+
+ core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
+
+#ifdef CONFIG_X86_32
+ c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
+ & core_select_mask;
+ c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+#else
+ c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
+ c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
+#endif
+ c->x86_max_cores = (core_level_siblings / smp_num_siblings);
+
+
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+ if (c->x86_max_cores > 1)
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
+ return;
+#endif
+}
+
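
As a worked example of the mask arithmetic above: with core_plus_mask_width = 5 and ht_mask_width = 1, core_select_mask = (~(-1 << 5)) >> 1 = 0x1f >> 1 = 0xf, so the core id is the APIC id shifted right by one and masked to four bits. The field extraction itself can be exercised stand-alone (a hypothetical user-space sketch reusing the same macros):

    /* Hypothetical user-space sketch: decode leaf-0xb register fields
     * with the same macros as above, on example register values. */
    #include <stdio.h>

    #define LEAFB_SUBTYPE(ecx)          (((ecx) >> 8) & 0xff)
    #define BITS_SHIFT_NEXT_LEVEL(eax)  ((eax) & 0x1f)
    #define LEVEL_MAX_SIBLINGS(ebx)     ((ebx) & 0xffff)

    int main(void)
    {
            /* Example values as a 2-thread SMT level might report them. */
            unsigned int eax = 0x1, ebx = 0x2, ecx = 0x100;

            printf("subtype=%u shift=%u siblings=%u\n",
                   LEAFB_SUBTYPE(ecx),          /* 1 == SMT_TYPE  */
                   BITS_SHIFT_NEXT_LEVEL(eax),  /* shift right 1  */
                   LEVEL_MAX_SIBLINGS(ebx));    /* 2 siblings     */
            return 0;
    }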
#ifdef CONFIG_X86_PAT
void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
{
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 18514ed2610..8f1e31db2ad 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/mm.h>
+
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/apic.h>
+#ifdef CONFIG_X86_64
+# include <asm/numa_64.h>
+# include <asm/mmconfig.h>
+# include <asm/cacheflush.h>
+#endif
+
#include <mach_apic.h>
+
#include "cpu.h"
+#ifdef CONFIG_X86_32
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
* misexecution of code under Linux. Owners of such processors should
@@ -24,26 +33,273 @@
extern void vide(void);
__asm__(".align 4\nvide: ret");
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
{
- if (cpuid_eax(0x80000000) >= 0x80000007) {
- c->x86_power = cpuid_edx(0x80000007);
- if (c->x86_power & (1<<8))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+/*
+ * General Systems BIOSen alias the cpu frequency registers
+ * of the Elan at 0x000df000. Unfortunately, one of the Linux
+ * drivers subsequently pokes it, and changes the CPU speed.
+ * Workaround: Remove the unneeded alias.
+ */
+#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
+#define CBAR_ENB (0x80000000)
+#define CBAR_KEY (0X000000CB)
+ if (c->x86_model == 9 || c->x86_model == 10) {
+ if (inl (CBAR) & CBAR_ENB)
+ outl (0 | CBAR_KEY, CBAR);
}
-
- /* Set MTRR capability flag if appropriate */
- if (c->x86_model == 13 || c->x86_model == 9 ||
- (c->x86_model == 8 && c->x86_mask >= 8))
- set_cpu_cap(c, X86_FEATURE_K6_MTRR);
}
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+
+static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
{
u32 l, h;
int mbytes = num_physpages >> (20-PAGE_SHIFT);
- int r;
+ if (c->x86_model < 6) {
+ /* Based on AMD doc 20734R - June 2000 */
+ if (c->x86_model == 0) {
+ clear_cpu_cap(c, X86_FEATURE_APIC);
+ set_cpu_cap(c, X86_FEATURE_PGE);
+ }
+ return;
+ }
+
+ if (c->x86_model == 6 && c->x86_mask == 1) {
+ const int K6_BUG_LOOP = 1000000;
+ int n;
+ void (*f_vide)(void);
+ unsigned long d, d2;
+
+ printk(KERN_INFO "AMD K6 stepping B detected - ");
+
+ /*
+ * It looks like AMD fixed the 2.6.2 bug and improved indirect
+ * calls at the same time.
+ */
+
+ n = K6_BUG_LOOP;
+ f_vide = vide;
+ rdtscl(d);
+ while (n--)
+ f_vide();
+ rdtscl(d2);
+ d = d2-d;
+
+ if (d > 20*K6_BUG_LOOP)
+ printk("system stability may be impaired when more than 32 MB are used.\n");
+ else
+ printk("probably OK (after B9730xxxx).\n");
+ printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
+ }
+
+ /* K6 with old style WHCR */
+ if (c->x86_model < 8 ||
+ (c->x86_model == 8 && c->x86_mask < 8)) {
+ /* We can only write allocate on the low 508Mb */
+ if (mbytes > 508)
+ mbytes = 508;
+
+ rdmsr(MSR_K6_WHCR, l, h);
+ if ((l&0x0000FFFF) == 0) {
+ unsigned long flags;
+ l = (1<<0)|((mbytes/4)<<1);
+ local_irq_save(flags);
+ wbinvd();
+ wrmsr(MSR_K6_WHCR, l, h);
+ local_irq_restore(flags);
+ printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
+ mbytes);
+ }
+ return;
+ }
+
+ if ((c->x86_model == 8 && c->x86_mask > 7) ||
+ c->x86_model == 9 || c->x86_model == 13) {
+ /* The more serious chips .. */
+
+ if (mbytes > 4092)
+ mbytes = 4092;
+
+ rdmsr(MSR_K6_WHCR, l, h);
+ if ((l&0xFFFF0000) == 0) {
+ unsigned long flags;
+ l = ((mbytes>>2)<<22)|(1<<16);
+ local_irq_save(flags);
+ wbinvd();
+ wrmsr(MSR_K6_WHCR, l, h);
+ local_irq_restore(flags);
+ printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
+ mbytes);
+ }
+
+ return;
+ }
+
+ if (c->x86_model == 10) {
+ /* AMD Geode LX is model 10 */
+ /* placeholder for any needed mods */
+ return;
+ }
+}
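
The two WHCR layouts above differ only in field placement: the old
style puts the enable bit at bit 0 and the size in 4 MB units at bits
[14:1], the new style puts the size at bits [31:22] and the enable bit
at bit 16. A standalone sketch of the value computation for a 256 MB
machine (the arithmetic is taken from the code above, the sample size
is arbitrary):

	#include <stdio.h>

	int main(void)
	{
		unsigned int mbytes = 256;
		/* old-style WHCR: enable in bit 0, size/4MB in bits [14:1] */
		unsigned int oldw = (1 << 0) | ((mbytes / 4) << 1);
		/* new-style WHCR: enable in bit 16, size/4MB in bits [31:22] */
		unsigned int neww = ((mbytes >> 2) << 22) | (1 << 16);

		printf("old: %#x  new: %#x\n", oldw, neww);	/* 0x81, 0x10010000 */
		return 0;
	}
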
+
+static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
+{
+ u32 l, h;
+
+ /*
+ * Bit 15 of the Athlon-specific MSR 15 needs to be 0
+ * to enable SSE on Palomino/Morgan/Barton CPUs.
+ * If the BIOS didn't enable it already, enable it here.
+ */
+ if (c->x86_model >= 6 && c->x86_model <= 10) {
+ if (!cpu_has(c, X86_FEATURE_XMM)) {
+ printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+ rdmsr(MSR_K7_HWCR, l, h);
+ l &= ~0x00008000;
+ wrmsr(MSR_K7_HWCR, l, h);
+ set_cpu_cap(c, X86_FEATURE_XMM);
+ }
+ }
+
+ /*
+ * AMD has determined that Athlons since model 8 stepping 1
+ * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx,
+ * as per AMD technical note 27212 0.2.
+ */
+ if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
+ rdmsr(MSR_K7_CLK_CTL, l, h);
+ if ((l & 0xfff00000) != 0x20000000) {
+ printk(KERN_INFO "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
+ ((l & 0x000fffff)|0x20000000));
+ wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+ }
+ }
+
+ set_cpu_cap(c, X86_FEATURE_K7);
+}
+#endif
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+static int __cpuinit nearby_node(int apicid)
+{
+ int i, node;
+
+ for (i = apicid - 1; i >= 0; i--) {
+ node = apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+ node = apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
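
nearby_node() above is a plain bidirectional scan of apicid_to_node[]
for the closest populated entry. A self-contained sketch over an
ordinary array, with -1 standing in for NUMA_NO_NODE and index 0 for
the first online node (all names here are illustrative, not kernel
APIs):

	#include <stdio.h>

	#define MAX_APIC 16
	#define NO_NODE  (-1)

	static int node_of[MAX_APIC];

	static int nearby_node(int apicid)
	{
		int i;

		for (i = apicid - 1; i >= 0; i--)
			if (node_of[i] != NO_NODE)
				return node_of[i];
		for (i = apicid + 1; i < MAX_APIC; i++)
			if (node_of[i] != NO_NODE)
				return node_of[i];
		return 0;	/* shouldn't happen; first-node fallback */
	}

	int main(void)
	{
		int i;

		for (i = 0; i < MAX_APIC; i++)
			node_of[i] = NO_NODE;
		node_of[3] = 1;
		node_of[7] = 2;
		printf("%d %d\n", nearby_node(5), nearby_node(1));	/* 1 1 */
		return 0;
	}
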
+
+/*
+ * On an AMD dual-core setup the lower bits of the APIC ID distinguish the cores.
+ * Assumes number of cores is a power of two.
+ */
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_HT
+ unsigned bits;
+
+ bits = c->x86_coreid_bits;
+
+ /* Low order bits define the core id (index of core in socket) */
+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ /* Convert the initial APIC ID into the socket ID */
+ c->phys_proc_id = c->initial_apicid >> bits;
+#endif
+}
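
As a worked example of the split above: with x86_coreid_bits == 2, an
initial APIC ID of 13 (0b1101) gives cpu_core_id = 13 & 3 = 1 and
phys_proc_id = 13 >> 2 = 3, i.e. core 1 in socket 3. A sketch:

	#include <stdio.h>

	static void split_apicid(unsigned int apicid, unsigned int coreid_bits)
	{
		unsigned int core = apicid & ((1u << coreid_bits) - 1);
		unsigned int sock = apicid >> coreid_bits;

		printf("apicid %u -> core %u, socket %u\n", apicid, core, sock);
	}

	int main(void)
	{
		split_apicid(13, 2);	/* core 1, socket 3 */
		return 0;
	}
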
+
+static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
+{
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+ int cpu = smp_processor_id();
+ int node;
+ unsigned apicid = hard_smp_processor_id();
+
+ node = c->phys_proc_id;
+ if (apicid_to_node[apicid] != NUMA_NO_NODE)
+ node = apicid_to_node[apicid];
+ if (!node_online(node)) {
+ /* Two possibilities here:
+ - The CPU is missing memory and no node was created.
+ In that case try picking one from a nearby CPU
+ - The APIC IDs differ from the HyperTransport node IDs
+ which the K8 northbridge parsing fills in.
+ Assume they are all increased by a constant offset,
+ but in the same order as the HT nodeids.
+ If that doesn't result in a usable node fall back to the
+ path for the previous case. */
+
+ int ht_nodeid = c->initial_apicid;
+
+ if (ht_nodeid >= 0 &&
+ apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+
+ printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
+
+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_HT
+ unsigned bits, ecx;
+
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level < 0x80000008)
+ return;
+
+ ecx = cpuid_ecx(0x80000008);
+
+ c->x86_max_cores = (ecx & 0xff) + 1;
+
+ /* CPU telling us the core id bits shift? */
+ bits = (ecx >> 12) & 0xF;
+
+ /* Otherwise recompute */
+ if (bits == 0) {
+ while ((1 << bits) < c->x86_max_cores)
+ bits++;
+ }
+
+ c->x86_coreid_bits = bits;
+#endif
+}
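
CPUID 0x80000008 ECX packs the core count minus one into bits [7:0]
and the APIC-ID core-field width into bits [15:12]; when the width
field reads zero (older CPUs) the code above derives it as
ceil(log2(cores)). The same derivation as a standalone sketch:

	#include <stdio.h>

	static unsigned int coreid_bits(unsigned int ecx)
	{
		unsigned int cores = (ecx & 0xff) + 1;
		unsigned int bits = (ecx >> 12) & 0xf;

		if (bits == 0)		/* width not reported: recompute */
			while ((1u << bits) < cores)
				bits++;
		return bits;
	}

	int main(void)
	{
		printf("%u\n", coreid_bits(0x00000003));	/* 4 cores -> 2 bits */
		return 0;
	}
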
+
+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+ early_init_amd_mc(c);
+
+ /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+ if (c->x86_power & (1<<8))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+#else
+ /* Set MTRR capability flag if appropriate */
+ if (c->x86 == 5)
+ if (c->x86_model == 13 || c->x86_model == 9 ||
+ (c->x86_model == 8 && c->x86_mask >= 8))
+ set_cpu_cap(c, X86_FEATURE_K6_MTRR);
+#endif
+}
+
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+{
#ifdef CONFIG_SMP
unsigned long long value;
@@ -54,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* Errata 63 for SH-B3 steppings
* Errata 122 for all steppings (F+ have it disabled by default)
*/
- if (c->x86 == 15) {
+ if (c->x86 == 0xf) {
rdmsrl(MSR_K7_HWCR, value);
value |= 1 << 6;
wrmsrl(MSR_K7_HWCR, value);
@@ -64,209 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
early_init_amd(c);
/*
- * FIXME: We should handle the K5 here. Set up the write
- * range and also turn on MSR 83 bits 4 and 31 (write alloc,
- * no bus pipeline)
- */
-
- /*
* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
* 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
*/
clear_cpu_cap(c, 0*32+31);
- r = get_model_name(c);
+#ifdef CONFIG_X86_64
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ if (c->x86 == 0xf) {
+ u32 level;
- switch (c->x86) {
- case 4:
- /*
- * General Systems BIOSen alias the cpu frequency registers
- * of the Elan at 0x000df000. Unfortuantly, one of the Linux
- * drivers subsequently pokes it, and changes the CPU speed.
- * Workaround : Remove the unneeded alias.
- */
-#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
-#define CBAR_ENB (0x80000000)
-#define CBAR_KEY (0X000000CB)
- if (c->x86_model == 9 || c->x86_model == 10) {
- if (inl (CBAR) & CBAR_ENB)
- outl (0 | CBAR_KEY, CBAR);
- }
- break;
- case 5:
- if (c->x86_model < 6) {
- /* Based on AMD doc 20734R - June 2000 */
- if (c->x86_model == 0) {
- clear_cpu_cap(c, X86_FEATURE_APIC);
- set_cpu_cap(c, X86_FEATURE_PGE);
- }
- break;
- }
-
- if (c->x86_model == 6 && c->x86_mask == 1) {
- const int K6_BUG_LOOP = 1000000;
- int n;
- void (*f_vide)(void);
- unsigned long d, d2;
-
- printk(KERN_INFO "AMD K6 stepping B detected - ");
-
- /*
- * It looks like AMD fixed the 2.6.2 bug and improved indirect
- * calls at the same time.
- */
-
- n = K6_BUG_LOOP;
- f_vide = vide;
- rdtscl(d);
- while (n--)
- f_vide();
- rdtscl(d2);
- d = d2-d;
-
- if (d > 20*K6_BUG_LOOP)
- printk("system stability may be impaired when more than 32 MB are used.\n");
- else
- printk("probably OK (after B9730xxxx).\n");
- printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
- }
-
- /* K6 with old style WHCR */
- if (c->x86_model < 8 ||
- (c->x86_model == 8 && c->x86_mask < 8)) {
- /* We can only write allocate on the low 508Mb */
- if (mbytes > 508)
- mbytes = 508;
-
- rdmsr(MSR_K6_WHCR, l, h);
- if ((l&0x0000FFFF) == 0) {
- unsigned long flags;
- l = (1<<0)|((mbytes/4)<<1);
- local_irq_save(flags);
- wbinvd();
- wrmsr(MSR_K6_WHCR, l, h);
- local_irq_restore(flags);
- printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
- mbytes);
- }
- break;
- }
-
- if ((c->x86_model == 8 && c->x86_mask > 7) ||
- c->x86_model == 9 || c->x86_model == 13) {
- /* The more serious chips .. */
-
- if (mbytes > 4092)
- mbytes = 4092;
-
- rdmsr(MSR_K6_WHCR, l, h);
- if ((l&0xFFFF0000) == 0) {
- unsigned long flags;
- l = ((mbytes>>2)<<22)|(1<<16);
- local_irq_save(flags);
- wbinvd();
- wrmsr(MSR_K6_WHCR, l, h);
- local_irq_restore(flags);
- printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
- mbytes);
- }
-
- break;
- }
-
- if (c->x86_model == 10) {
- /* AMD Geode LX is model 10 */
- /* placeholder for any needed mods */
- break;
- }
- break;
- case 6: /* An Athlon/Duron */
-
- /*
- * Bit 15 of Athlon specific MSR 15, needs to be 0
- * to enable SSE on Palomino/Morgan/Barton CPU's.
- * If the BIOS didn't enable it already, enable it here.
- */
- if (c->x86_model >= 6 && c->x86_model <= 10) {
- if (!cpu_has(c, X86_FEATURE_XMM)) {
- printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- rdmsr(MSR_K7_HWCR, l, h);
- l &= ~0x00008000;
- wrmsr(MSR_K7_HWCR, l, h);
- set_cpu_cap(c, X86_FEATURE_XMM);
- }
- }
-
- /*
- * It's been determined by AMD that Athlons since model 8 stepping 1
- * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
- * As per AMD technical note 27212 0.2
- */
- if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
- rdmsr(MSR_K7_CLK_CTL, l, h);
- if ((l & 0xfff00000) != 0x20000000) {
- printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
- ((l & 0x000fffff)|0x20000000));
- wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
- }
- }
- break;
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
+ if (c->x86 == 0x10 || c->x86 == 0x11)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+#else
+
+ /*
+ * FIXME: We should handle the K5 here. Set up the write
+ * range and also turn on MSR 83 bits 4 and 31 (write alloc,
+ * no bus pipeline)
+ */
switch (c->x86) {
- case 15:
- /* Use K8 tuning for Fam10h and Fam11h */
- case 0x10:
- case 0x11:
- set_cpu_cap(c, X86_FEATURE_K8);
+ case 4:
+ init_amd_k5(c);
break;
- case 6:
- set_cpu_cap(c, X86_FEATURE_K7);
+ case 5:
+ init_amd_k6(c);
+ break;
+ case 6: /* An Athlon/Duron */
+ init_amd_k7(c);
break;
}
+
+ /* K6s report MCEs but don't actually have all the MSRs */
+ if (c->x86 < 6)
+ clear_cpu_cap(c, X86_FEATURE_MCE);
+#endif
+
+ /* Enable workaround for FXSAVE leak */
if (c->x86 >= 6)
set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
- display_cacheinfo(c);
-
- if (cpuid_eax(0x80000000) >= 0x80000008)
- c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+ if (!c->x86_model_id[0]) {
+ switch (c->x86) {
+ case 0xf:
+ /* Should distinguish models here, but this is only
+ a fallback anyway. */
+ strcpy(c->x86_model_id, "Hammer");
+ break;
+ }
+ }
-#ifdef CONFIG_X86_HT
- /*
- * On a AMD multi core setup the lower bits of the APIC id
- * distinguish the cores.
- */
- if (c->x86_max_cores > 1) {
- int cpu = smp_processor_id();
- unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
+ display_cacheinfo(c);
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- }
- c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
- c->phys_proc_id >>= bits;
- printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
- cpu, c->x86_max_cores, c->cpu_core_id);
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level >= 0x80000008) {
+ amd_detect_cmp(c);
+ srat_detect_node(c);
}
+
+#ifdef CONFIG_X86_32
+ detect_ht(c);
#endif
- if (cpuid_eax(0x80000000) >= 0x80000006) {
- if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000))
+ if (c->extended_cpuid_level >= 0x80000006) {
+ if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
num_cache_leaves = 4;
else
num_cache_leaves = 3;
}
- /* K6s reports MCEs but don't actually have all the MSRs */
- if (c->x86 < 6)
- clear_cpu_cap(c, X86_FEATURE_MCE);
+ if (c->x86 >= 0xf && c->x86 <= 0x11)
+ set_cpu_cap(c, X86_FEATURE_K8);
- if (cpu_has_xmm2)
+ if (cpu_has_xmm2) {
+ /* MFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+ }
+
+#ifdef CONFIG_X86_64
+ if (c->x86 == 0x10) {
+ /* do this for boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
+
+ fam10h_check_enable_mmcfg();
+ }
+
+ if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems to be very
+ * little benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+ if ((tseg>>PMD_SHIFT) <
+ (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
+ ((tseg>>PMD_SHIFT) <
+ (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
+ (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+ }
+#endif
}
+#ifdef CONFIG_X86_32
static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
@@ -279,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
}
return size;
}
+#endif
static struct cpu_dev amd_cpu_dev __cpuinitdata = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
+#ifdef CONFIG_X86_32
.c_models = {
{ .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
{
@@ -295,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
}
},
},
+ .c_size_cache = amd_size_cache,
+#endif
.c_early_init = early_init_amd,
.c_init = init_amd,
- .c_size_cache = amd_size_cache,
+ .c_x86_vendor = X86_VENDOR_AMD,
};
-cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
+cpu_dev_register(amd_cpu_dev);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index d1692b2a41f..00000000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,224 +0,0 @@
-#include <linux/init.h>
-#include <linux/mm.h>
-
-#include <asm/numa_64.h>
-#include <asm/mmconfig.h>
-#include <asm/cacheflush.h>
-
-#include <mach_apic.h>
-
-#include "cpu.h"
-
-int force_mwait __cpuinitdata;
-
-#ifdef CONFIG_NUMA
-static int __cpuinit nearby_node(int apicid)
-{
- int i, node;
-
- for (i = apicid - 1; i >= 0; i--) {
- node = apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
- node = apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned bits;
-#ifdef CONFIG_NUMA
- int cpu = smp_processor_id();
- int node = 0;
- unsigned apicid = hard_smp_processor_id();
-#endif
- bits = c->x86_coreid_bits;
-
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
-
-#ifdef CONFIG_NUMA
- node = c->phys_proc_id;
- if (apicid_to_node[apicid] != NUMA_NO_NODE)
- node = apicid_to_node[apicid];
- if (!node_online(node)) {
- /* Two possibilities here:
- - The CPU is missing memory and no node was created.
- In that case try picking one from a nearby CPU
- - The APIC IDs differ from the HyperTransport node IDs
- which the K8 northbridge parsing fills in.
- Assume they are all increased by a constant offset,
- but in the same order as the HT nodeids.
- If that doesn't result in a usable node fall back to the
- path for the previous case. */
-
- int ht_nodeid = c->initial_apicid;
-
- if (ht_nodeid >= 0 &&
- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
- node = apicid_to_node[ht_nodeid];
- /* Pick a nearby node */
- if (!node_online(node))
- node = nearby_node(apicid);
- }
- numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-#endif
-}
-
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned bits, ecx;
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
-
- ecx = cpuid_ecx(0x80000008);
-
- c->x86_max_cores = (ecx & 0xff) + 1;
-
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
-
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- }
-
- c->x86_coreid_bits = bits;
-
-#endif
-}
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
-{
- early_init_amd_mc(c);
-
- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
- if (c->x86_power & (1<<8))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
- set_cpu_cap(c, X86_FEATURE_SYSCALL32);
-}
-
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-{
- unsigned level;
-
-#ifdef CONFIG_SMP
- unsigned long value;
-
- /*
- * Disable TLB flush filter by setting HWCR.FFDIS on K8
- * bit 6 of msr C001_0015
- *
- * Errata 63 for SH-B3 steppings
- * Errata 122 for all steppings (F+ have it disabled by default)
- */
- if (c->x86 == 0xf) {
- rdmsrl(MSR_K8_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K8_HWCR, value);
- }
-#endif
-
- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
- clear_cpu_cap(c, 0*32+31);
-
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- if (c->x86 == 0xf) {
- level = cpuid_eax(1);
- if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- }
- if (c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
- level = get_model_name(c);
- if (!level) {
- switch (c->x86) {
- case 0xf:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
- }
- }
- display_cacheinfo(c);
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level >= 0x80000008)
- amd_detect_cmp(c);
-
- if (c->extended_cpuid_level >= 0x80000006 &&
- (cpuid_edx(0x80000006) & 0xf000))
- num_cache_leaves = 4;
- else
- num_cache_leaves = 3;
-
- if (c->x86 >= 0xf && c->x86 <= 0x11)
- set_cpu_cap(c, X86_FEATURE_K8);
-
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
- if (c->x86 == 0x10) {
- /* do this for boot cpu */
- if (c == &boot_cpu_data)
- check_enable_amd_mmconf_dmi();
-
- fam10h_check_enable_mmcfg();
- }
-
- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if ((tseg>>PMD_SHIFT) <
- (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
- ((tseg>>PMD_SHIFT) <
- (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
- (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
- }
-}
-
-static struct cpu_dev amd_cpu_dev __cpuinitdata = {
- .c_vendor = "AMD",
- .c_ident = { "AuthenticAMD" },
- .c_early_init = early_init_amd,
- .c_init = init_amd,
-};
-
-cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index a0534c04d38..89bfdd9cacc 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
if (c->x86_model >= 6 && c->x86_model < 9)
set_cpu_cap(c, X86_FEATURE_3DNOW);
- get_model_name(c);
display_cacheinfo(c);
}
@@ -475,6 +474,7 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
.c_early_init = early_init_centaur,
.c_init = init_centaur,
.c_size_cache = centaur_size_cache,
+ .c_x86_vendor = X86_VENDOR_CENTAUR,
};
-cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
+cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 1d181c40e2e..a1625f5a1e7 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
{
+ early_init_centaur(c);
+
if (c->x86 == 0x6 && c->x86_model >= 0xf) {
c->x86_cache_alignment = c->x86_clflush_size * 2;
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
.c_init = init_centaur,
+ .c_x86_vendor = X86_VENDOR_CENTAUR,
};
-cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
+cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
new file mode 100644
index 00000000000..2056ccf572c
--- /dev/null
+++ b/arch/x86/kernel/cpu/cmpxchg.c
@@ -0,0 +1,72 @@
+/*
+ * cmpxchg*() fallbacks for CPUs not supporting these instructions
+ */
+
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#ifndef CONFIG_X86_CMPXCHG
+unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
+{
+ u8 prev;
+ unsigned long flags;
+
+ /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ local_irq_save(flags);
+ prev = *(u8 *)ptr;
+ if (prev == old)
+ *(u8 *)ptr = new;
+ local_irq_restore(flags);
+ return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u8);
+
+unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
+{
+ u16 prev;
+ unsigned long flags;
+
+ /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ local_irq_save(flags);
+ prev = *(u16 *)ptr;
+ if (prev == old)
+ *(u16 *)ptr = new;
+ local_irq_restore(flags);
+ return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u16);
+
+unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
+{
+ u32 prev;
+ unsigned long flags;
+
+ /* Poor man's cmpxchg for 386. Unsuitable for SMP */
+ local_irq_save(flags);
+ prev = *(u32 *)ptr;
+ if (prev == old)
+ *(u32 *)ptr = new;
+ local_irq_restore(flags);
+ return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u32);
+#endif
+
+#ifndef CONFIG_X86_CMPXCHG64
+unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
+{
+ u64 prev;
+ unsigned long flags;
+
+ /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
+ local_irq_save(flags);
+ prev = *(u64 *)ptr;
+ if (prev == old)
+ *(u64 *)ptr = new;
+ local_irq_restore(flags);
+ return prev;
+}
+EXPORT_SYMBOL(cmpxchg_486_u64);
+#endif
+
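
These fallbacks are atomic on UP only: masking local interrupts keeps
the read-compare-write sequence from being preempted on the one CPU
but does nothing against a second processor, hence the "Unsuitable for
SMP" comments. A user-space sketch of the contract, without the
interrupt masking (the helper name is illustrative):

	#include <stdio.h>

	typedef unsigned char u8;

	/* Return the previous value; store the new one only on a match. */
	static u8 up_cmpxchg_u8(volatile void *ptr, u8 old, u8 new)
	{
		u8 prev = *(volatile u8 *)ptr;

		if (prev == old)
			*(volatile u8 *)ptr = new;
		return prev;
	}

	int main(void)
	{
		u8 v = 5;
		u8 prev = up_cmpxchg_u8(&v, 5, 6);

		printf("%u %u\n", prev, v);	/* 5 6: exchange succeeded */
		prev = up_cmpxchg_u8(&v, 5, 7);
		printf("%u %u\n", prev, v);	/* 6 6: expected 5, no store */
		return 0;
	}
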
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e456bd955b..25581dcb280 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,28 +1,62 @@
#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/kgdb.h>
+#include <linux/topology.h>
#include <linux/delay.h>
#include <linux/smp.h>
-#include <linux/module.h>
#include <linux/percpu.h>
-#include <linux/bootmem.h>
-#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/msr.h>
#include <asm/io.h>
+#include <asm/linkage.h>
#include <asm/mmu_context.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
#include <asm/pat.h>
#include <asm/asm.h>
+#include <asm/numa.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
#include <mach_apic.h>
+#include <asm/genapic.h>
#endif
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+
#include "cpu.h"
+static struct cpu_dev *this_cpu __cpuinitdata;
+
+#ifdef CONFIG_X86_64
+/* We need valid kernel segments for data and code in long mode too;
+ * IRET will check the segment types (kkeil 2000/10/28).
+ * Also, sysret mandates a special GDT layout.
+ */
+/* The TLS descriptors are currently at a different place compared to i386.
+ Hopefully nobody expects them at a fixed place (Wine?) */
DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+} };
+#else
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -56,17 +90,157 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
[GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
} };
+#endif
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-
+#ifdef CONFIG_X86_32
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_serial_nr __cpuinitdata = 1;
-struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
+static int __init x86_fxsr_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+ setup_clear_cpu_cap(X86_FEATURE_XMM);
+ return 1;
+}
+__setup("nofxsr", x86_fxsr_setup);
+
+static int __init x86_sep_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SEP);
+ return 1;
+}
+__setup("nosep", x86_sep_setup);
+
+/* Standard macro to see if a specific flag is changeable */
+static inline int flag_is_changeable_p(u32 flag)
+{
+ u32 f1, f2;
+
+ /*
+ * Cyrix and IDT cpus allow disabling of CPUID
+ * so the code below may return different results
+ * when it is executed before and after enabling
+ * the CPUID. Add "volatile" to keep gcc from optimizing
+ * the subsequent calls to this function.
+ */
+ asm volatile ("pushfl\n\t"
+ "pushfl\n\t"
+ "popl %0\n\t"
+ "movl %0,%1\n\t"
+ "xorl %2,%0\n\t"
+ "pushl %0\n\t"
+ "popfl\n\t"
+ "pushfl\n\t"
+ "popl %0\n\t"
+ "popfl\n\t"
+ : "=&r" (f1), "=&r" (f2)
+ : "ir" (flag));
+
+ return ((f1^f2) & flag) != 0;
+}
+
+/* Probe for the CPUID instruction */
+static int __cpuinit have_cpuid_p(void)
+{
+ return flag_is_changeable_p(X86_EFLAGS_ID);
+}
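
The probe above toggles a candidate bit in EFLAGS and checks whether
the change sticks; if the ID bit (0x00200000) is writable, the CPUID
instruction exists. A 32-bit-only user-space sketch of the same probe
(build with gcc -m32; on 64-bit CPUs CPUID is always present, which is
why the stubs in the #else branch below simply return 1):

	/* gcc -m32 probe.c */
	#include <stdio.h>

	static int flag_is_changeable(unsigned int flag)
	{
		unsigned int f1, f2;

		asm volatile("pushfl\n\t"
			     "pushfl\n\t"
			     "popl %0\n\t"
			     "movl %0, %1\n\t"
			     "xorl %2, %0\n\t"
			     "pushl %0\n\t"
			     "popfl\n\t"
			     "pushfl\n\t"
			     "popl %0\n\t"
			     "popfl\n\t"
			     : "=&r" (f1), "=&r" (f2)
			     : "ir" (flag));
		return ((f1 ^ f2) & flag) != 0;
	}

	int main(void)
	{
		printf("CPUID %savailable\n",
		       flag_is_changeable(0x00200000) ? "" : "not ");
		return 0;
	}
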
+
+static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
+ /* Disable processor serial number */
+ unsigned long lo, hi;
+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+ lo |= 0x200000;
+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+ printk(KERN_NOTICE "CPU serial number disabled.\n");
+ clear_cpu_cap(c, X86_FEATURE_PN);
+
+ /* Disabling the serial number may affect the cpuid level */
+ c->cpuid_level = cpuid_eax(0);
+ }
+}
+
+static int __init x86_serial_nr_setup(char *s)
+{
+ disable_x86_serial_nr = 0;
+ return 1;
+}
+__setup("serialnumber", x86_serial_nr_setup);
+#else
+static inline int flag_is_changeable_p(u32 flag)
+{
+ return 1;
+}
+/* Probe for the CPUID instruction */
+static inline int have_cpuid_p(void)
+{
+ return 1;
+}
+static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
+/*
+ * Naming convention should be: <Name> [(<Codename>)]
+ * This table is only used if init_<vendor>() below doesn't set it;
+ * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used.
+ */
+
+/* Look up CPU names by table lookup. */
+static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+{
+ struct cpu_model_info *info;
+
+ if (c->x86_model >= 16)
+ return NULL; /* Range check */
+
+ if (!this_cpu)
+ return NULL;
+
+ info = this_cpu->c_models;
+
+ while (info && info->family) {
+ if (info->family == c->x86)
+ return info->model_names[c->x86_model];
+ info++;
+ }
+ return NULL; /* Not found */
+}
+
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+#ifdef CONFIG_X86_32
+ asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
+#endif
+}
+
+static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
static void __cpuinit default_init(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_64
+ display_cacheinfo(c);
+#else
/* Not much we can do here... */
/* Check if at least it has cpuid */
if (c->cpuid_level == -1) {
@@ -76,28 +250,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
else if (c->x86 == 3)
strcpy(c->x86_model_id, "386");
}
+#endif
}
static struct cpu_dev __cpuinitdata default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
+ .c_x86_vendor = X86_VENDOR_UNKNOWN,
};
-static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
-
-static int __init cachesize_setup(char *str)
-{
- get_option(&str, &cachesize_override);
- return 1;
-}
-__setup("cachesize=", cachesize_setup);
-int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
char *p, *q;
- if (cpuid_eax(0x80000000) < 0x80000004)
- return 0;
+ if (c->extended_cpuid_level < 0x80000004)
+ return;
v = (unsigned int *) c->x86_model_id;
cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -116,30 +284,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
while (q <= &c->x86_model_id[48])
*q++ = '\0'; /* Zero-pad the rest */
}
-
- return 1;
}
-
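
get_model_name() assembles the 48-byte brand string returned 16 bytes
at a time in EAX..EDX of CPUID leaves 0x80000002..0x80000004. A
user-space sketch using GCC/Clang's <cpuid.h>, assuming the leaves are
implemented (which extended_cpuid_level >= 0x80000004 guarantees):

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int v[12];
		unsigned int i;

		for (i = 0; i < 3; i++)
			__cpuid(0x80000002 + i,
				v[4 * i], v[4 * i + 1], v[4 * i + 2], v[4 * i + 3]);
		printf("%.48s\n", (char *)v);
		return 0;
	}
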
void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
{
- unsigned int n, dummy, ecx, edx, l2size;
+ unsigned int n, dummy, ebx, ecx, edx, l2size;
- n = cpuid_eax(0x80000000);
+ n = c->extended_cpuid_level;
if (n >= 0x80000005) {
- cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
- c->x86_cache_size = (ecx>>24)+(edx>>24);
+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+ c->x86_cache_size = (ecx>>24) + (edx>>24);
+#ifdef CONFIG_X86_64
+ /* On K8 the L1 TLB is inclusive, so don't count it */
+ c->x86_tlbsize = 0;
+#endif
}
if (n < 0x80000006) /* Some chips just have a large L1. */
return;
- ecx = cpuid_ecx(0x80000006);
+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
l2size = ecx >> 16;
+#ifdef CONFIG_X86_64
+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+#else
/* do processor-specific cache resizing */
if (this_cpu->c_size_cache)
l2size = this_cpu->c_size_cache(c, l2size);
@@ -150,116 +322,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
if (l2size == 0)
return; /* Again, no L2 cache is possible */
+#endif
c->x86_cache_size = l2size;
printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
- l2size, ecx & 0xFF);
+ l2size, ecx & 0xFF);
}
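
The AMD cache-descriptor layout used above: for leaf 0x80000005, EDX
describes the L1 I-cache and ECX the L1 D-cache (size in KB in the top
byte, line size in the low byte); for leaf 0x80000006, ECX[31:16] is
the L2 size in KB. A sketch decoding both leaves with <cpuid.h>, where
__get_cpuid() returns 0 when the leaf is unsupported:

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (__get_cpuid(0x80000005, &eax, &ebx, &ecx, &edx))
			printf("L1 I: %uK, %uB/line; D: %uK, %uB/line\n",
			       edx >> 24, edx & 0xff, ecx >> 24, ecx & 0xff);
		if (__get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx))
			printf("L2: %uK, %uB/line\n", ecx >> 16, ecx & 0xff);
		return 0;
	}
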
-/*
- * Naming convention should be: <Name> [(<Codename>)]
- * This table only is used unless init_<vendor>() below doesn't set it;
- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
- *
- */
-
-/* Look up CPU names by table lookup. */
-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
{
- struct cpu_model_info *info;
+#ifdef CONFIG_X86_HT
+ u32 eax, ebx, ecx, edx;
+ int index_msb, core_bits;
- if (c->x86_model >= 16)
- return NULL; /* Range check */
+ if (!cpu_has(c, X86_FEATURE_HT))
+ return;
- if (!this_cpu)
- return NULL;
+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ goto out;
- info = this_cpu->c_models;
+ if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
+ return;
- while (info && info->family) {
- if (info->family == c->x86)
- return info->model_names[c->x86_model];
- info++;
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+ smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+ if (smp_num_siblings == 1) {
+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
+ } else if (smp_num_siblings > 1) {
+
+ if (smp_num_siblings > NR_CPUS) {
+ printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
+ smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
+
+ index_msb = get_count_order(smp_num_siblings);
+#ifdef CONFIG_X86_64
+ c->phys_proc_id = phys_pkg_id(index_msb);
+#else
+ c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
+#endif
+
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+ index_msb = get_count_order(smp_num_siblings);
+
+ core_bits = get_count_order(c->x86_max_cores);
+
+#ifdef CONFIG_X86_64
+ c->cpu_core_id = phys_pkg_id(index_msb) &
+ ((1 << core_bits) - 1);
+#else
+ c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
+ ((1 << core_bits) - 1);
+#endif
}
- return NULL; /* Not found */
-}
+out:
+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
+ }
+#endif
+}
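
Worked through for a package with 4 logical CPUs and 2 cores
(x86_max_cores == 2): CPUID.1 EBX[23:16] reports 4 siblings, so the
package shift is get_count_order(4) == 2; after dividing by the core
count there are 2 threads per core, so the HT shift is 1 and the core
field is the bit in between. A sketch of the arithmetic, with
get_count_order() approximated by a loop:

	#include <stdio.h>

	static int count_order(unsigned int n)	/* ceil(log2(n)) */
	{
		int bits = 0;

		while ((1u << bits) < n)
			bits++;
		return bits;
	}

	int main(void)
	{
		unsigned int apicid = 5;	/* 0b101: pkg 1, core 0, thread 1 */
		unsigned int siblings = 4, max_cores = 2;
		int pkg_shift = count_order(siblings);
		int core_bits = count_order(max_cores);
		int ht_shift = count_order(siblings / max_cores);

		printf("phys_proc_id %u cpu_core_id %u\n",
		       apicid >> pkg_shift,
		       (apicid >> ht_shift) & ((1u << core_bits) - 1));
		return 0;
	}
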
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
int i;
static int printed;
for (i = 0; i < X86_VENDOR_NUM; i++) {
- if (cpu_devs[i]) {
- if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
- (cpu_devs[i]->c_ident[1] &&
- !strcmp(v, cpu_devs[i]->c_ident[1]))) {
- c->x86_vendor = i;
- if (!early)
- this_cpu = cpu_devs[i];
- return;
- }
+ if (!cpu_devs[i])
+ break;
+
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+ this_cpu = cpu_devs[i];
+ c->x86_vendor = this_cpu->c_x86_vendor;
+ return;
}
}
+
if (!printed) {
printed++;
- printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+ printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
printk(KERN_ERR "CPU: Your system may be unstable.\n");
}
+
c->x86_vendor = X86_VENDOR_UNKNOWN;
this_cpu = &default_cpu;
}
-
-static int __init x86_fxsr_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
- return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
-
-
-static int __init x86_sep_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_SEP);
- return 1;
-}
-__setup("nosep", x86_sep_setup);
-
-
-/* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
-{
- u32 f1, f2;
-
- asm("pushfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "movl %0,%1\n\t"
- "xorl %2,%0\n\t"
- "pushl %0\n\t"
- "popfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "popfl\n\t"
- : "=&r" (f1), "=&r" (f2)
- : "ir" (flag));
-
- return ((f1^f2) & flag) != 0;
-}
-
-
-/* Probe for the CPUID instruction */
-static int __cpuinit have_cpuid_p(void)
-{
- return flag_is_changeable_p(X86_EFLAGS_ID);
-}
-
-void __init cpu_detect(struct cpuinfo_x86 *c)
+void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
{
/* Get vendor name */
cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -268,50 +430,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
(unsigned int *)&c->x86_vendor_id[4]);
c->x86 = 4;
+ /* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 junk, tfms, cap0, misc;
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
- c->x86 = (tfms >> 8) & 15;
- c->x86_model = (tfms >> 4) & 15;
+ c->x86 = (tfms >> 8) & 0xf;
+ c->x86_model = (tfms >> 4) & 0xf;
+ c->x86_mask = tfms & 0xf;
if (c->x86 == 0xf)
c->x86 += (tfms >> 20) & 0xff;
if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xF) << 4;
- c->x86_mask = tfms & 15;
+ c->x86_model += ((tfms >> 16) & 0xf) << 4;
if (cap0 & (1<<19)) {
- c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ c->x86_cache_alignment = c->x86_clflush_size;
}
}
}
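
Example decode of the signature fields above, using the Nehalem-EP
signature 0x000106a5 purely as sample input: base family 6 with
extended model 1 gives family 6, model 26 (0x1a), stepping 5. As a
standalone check:

	#include <stdio.h>

	int main(void)
	{
		unsigned int tfms = 0x000106a5;	/* sample CPUID.1 EAX value */
		unsigned int family = (tfms >> 8) & 0xf;
		unsigned int model = (tfms >> 4) & 0xf;
		unsigned int stepping = tfms & 0xf;

		if (family == 0xf)		/* extended family */
			family += (tfms >> 20) & 0xff;
		if (family >= 0x6)		/* extended model */
			model += ((tfms >> 16) & 0xf) << 4;
		printf("family %u model %u stepping %u\n",
		       family, model, stepping);	/* 6 26 5 */
		return 0;
	}
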
-static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
+
+static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
{
u32 tfms, xlvl;
- unsigned int ebx;
+ u32 ebx;
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
- if (have_cpuid_p()) {
- /* Intel-defined flags: level 0x00000001 */
- if (c->cpuid_level >= 0x00000001) {
- u32 capability, excap;
- cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
- c->x86_capability[0] = capability;
- c->x86_capability[4] = excap;
- }
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ u32 capability, excap;
+ cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
+ c->x86_capability[0] = capability;
+ c->x86_capability[4] = excap;
+ }
- /* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
- }
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = xlvl;
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
+ c->x86_capability[1] = cpuid_edx(0x80000001);
+ c->x86_capability[6] = cpuid_ecx(0x80000001);
}
+ }
+
+#ifdef CONFIG_X86_64
+ if (c->extended_cpuid_level >= 0x80000008) {
+ u32 eax = cpuid_eax(0x80000008);
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
}
+#endif
+
+ if (c->extended_cpuid_level >= 0x80000007)
+ c->x86_power = cpuid_edx(0x80000007);
}
+static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+ int i;
+
+ /*
+ * First of all, decide if this is a 486 or higher;
+ * it's a 486 if we can modify the AC flag.
+ */
+ if (flag_is_changeable_p(X86_EFLAGS_AC))
+ c->x86 = 4;
+ else
+ c->x86 = 3;
+
+ for (i = 0; i < X86_VENDOR_NUM; i++)
+ if (cpu_devs[i] && cpu_devs[i]->c_identify) {
+ c->x86_vendor_id[0] = 0;
+ cpu_devs[i]->c_identify(c);
+ if (c->x86_vendor_id[0]) {
+ get_cpu_vendor(c);
+ break;
+ }
+ }
+#endif
+}
+
/*
* Do minimum CPU detection early.
* Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -321,25 +520,61 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
* WARNING: this function is only called on the BP. Don't add code here
* that is supposed to run on all CPUs.
*/
-static void __init early_cpu_detect(void)
+static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- c->x86_cache_alignment = 32;
+#ifdef CONFIG_X86_64
+ c->x86_clflush_size = 64;
+#else
c->x86_clflush_size = 32;
+#endif
+ c->x86_cache_alignment = c->x86_clflush_size;
+
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ c->extended_cpuid_level = 0;
+
+ if (!have_cpuid_p())
+ identify_cpu_without_cpuid(c);
+ /* Cyrix could have CPUID enabled via c_identify() */
if (!have_cpuid_p())
return;
cpu_detect(c);
- get_cpu_vendor(c, 1);
+ get_cpu_vendor(c);
+
+ get_cpu_cap(c);
- early_get_cap(c);
+ if (this_cpu->c_early_init)
+ this_cpu->c_early_init(c);
- if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
- cpu_devs[c->x86_vendor]->c_early_init)
- cpu_devs[c->x86_vendor]->c_early_init(c);
+ validate_pat_support(c);
+}
+
+void __init early_cpu_init(void)
+{
+ struct cpu_dev **cdev;
+ int count = 0;
+
+ printk("KERNEL supported cpus:\n");
+ for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
+ struct cpu_dev *cpudev = *cdev;
+ unsigned int j;
+
+ if (count >= X86_VENDOR_NUM)
+ break;
+ cpu_devs[count] = cpudev;
+ count++;
+
+ for (j = 0; j < 2; j++) {
+ if (!cpudev->c_ident[j])
+ continue;
+ printk(" %s %s\n", cpudev->c_vendor,
+ cpudev->c_ident[j]);
+ }
+ }
+
+ early_identify_cpu(&boot_cpu_data);
}
/*
@@ -357,86 +592,41 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
{
- u32 tfms, xlvl;
- unsigned int ebx;
-
- if (have_cpuid_p()) {
- /* Get vendor name */
- cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
- (unsigned int *)&c->x86_vendor_id[0],
- (unsigned int *)&c->x86_vendor_id[8],
- (unsigned int *)&c->x86_vendor_id[4]);
-
- get_cpu_vendor(c, 0);
- /* Initialize the standard set of capabilities */
- /* Note that the vendor-specific code below might override */
- /* Intel-defined flags: level 0x00000001 */
- if (c->cpuid_level >= 0x00000001) {
- u32 capability, excap;
- cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
- c->x86_capability[0] = capability;
- c->x86_capability[4] = excap;
- c->x86 = (tfms >> 8) & 15;
- c->x86_model = (tfms >> 4) & 15;
- if (c->x86 == 0xf)
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xF) << 4;
- c->x86_mask = tfms & 15;
- c->initial_apicid = (ebx >> 24) & 0xFF;
-#ifdef CONFIG_X86_HT
- c->apicid = phys_pkg_id(c->initial_apicid, 0);
- c->phys_proc_id = c->initial_apicid;
-#else
- c->apicid = c->initial_apicid;
-#endif
- if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
- c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
- } else {
- /* Have CPUID level 0 only - unheard of */
- c->x86 = 4;
- }
+ c->extended_cpuid_level = 0;
- /* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
- }
- if (xlvl >= 0x80000004)
- get_model_name(c); /* Default name */
- }
+ if (!have_cpuid_p())
+ identify_cpu_without_cpuid(c);
- init_scattered_cpuid_features(c);
- detect_nopl(c);
- }
-}
+ /* Cyrix could have CPUID enabled via c_identify() */
+ if (!have_cpuid_p())
+ return;
-static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
- /* Disable processor serial number */
- unsigned long lo, hi;
- rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- lo |= 0x200000;
- wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- printk(KERN_NOTICE "CPU serial number disabled.\n");
- clear_cpu_cap(c, X86_FEATURE_PN);
+ cpu_detect(c);
- /* Disabling the serial number may affect the cpuid level */
- c->cpuid_level = cpuid_eax(0);
- }
-}
+ get_cpu_vendor(c);
-static int __init x86_serial_nr_setup(char *s)
-{
- disable_x86_serial_nr = 0;
- return 1;
-}
-__setup("serialnumber", x86_serial_nr_setup);
+ get_cpu_cap(c);
+
+ if (c->cpuid_level >= 0x00000001) {
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_HT
+ c->apicid = phys_pkg_id(c->initial_apicid, 0);
+# else
+ c->apicid = c->initial_apicid;
+# endif
+#endif
+#ifdef CONFIG_X86_HT
+ c->phys_proc_id = c->initial_apicid;
+#endif
+ }
+ get_model_name(c); /* Default name */
+
+ init_scattered_cpuid_features(c);
+ detect_nopl(c);
+}
/*
* This does the hard work of actually picking apart the CPU stuff...
@@ -448,30 +638,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->loops_per_jiffy = loops_per_jiffy;
c->x86_cache_size = -1;
c->x86_vendor = X86_VENDOR_UNKNOWN;
- c->cpuid_level = -1; /* CPUID not detected */
c->x86_model = c->x86_mask = 0; /* So far unknown... */
c->x86_vendor_id[0] = '\0'; /* Unset */
c->x86_model_id[0] = '\0'; /* Unset */
c->x86_max_cores = 1;
+ c->x86_coreid_bits = 0;
+#ifdef CONFIG_X86_64
+ c->x86_clflush_size = 64;
+#else
+ c->cpuid_level = -1; /* CPUID not detected */
c->x86_clflush_size = 32;
+#endif
+ c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
- if (!have_cpuid_p()) {
- /*
- * First of all, decide if this is a 486 or higher
- * It's a 486 if we can modify the AC flag
- */
- if (flag_is_changeable_p(X86_EFLAGS_AC))
- c->x86 = 4;
- else
- c->x86 = 3;
- }
-
generic_identify(c);
if (this_cpu->c_identify)
this_cpu->c_identify(c);
+#ifdef CONFIG_X86_64
+ c->apicid = phys_pkg_id(0);
+#endif
+
/*
* Vendor-specific initialization. In this section we
* canonicalize the feature flags, meaning if there are
@@ -505,6 +694,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86, c->x86_model);
}
+#ifdef CONFIG_X86_64
+ detect_ht(c);
+#endif
+
/*
* On SMP, boot_cpu_data holds the common feature set between
* all CPUs; so make sure that we indicate which features are
@@ -513,7 +706,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
*/
if (c != &boot_cpu_data) {
/* AND the already accumulated flags with these */
- for (i = 0 ; i < NCAPINTS ; i++)
+ for (i = 0; i < NCAPINTS; i++)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
}
@@ -521,72 +714,91 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
for (i = 0; i < NCAPINTS; i++)
c->x86_capability[i] &= ~cleared_cpu_caps[i];
+#ifdef CONFIG_X86_MCE
/* Init Machine Check Exception if available. */
mcheck_init(c);
+#endif
select_idle_routine(c);
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+ numa_add_cpu(smp_processor_id());
+#endif
}
+#ifdef CONFIG_X86_64
+static void vgetcpu_set_mode(void)
+{
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
+ vgetcpu_mode = VGETCPU_RDTSCP;
+ else
+ vgetcpu_mode = VGETCPU_LSL;
+}
+#endif
+
void __init identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
+#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
+#else
+ vgetcpu_set_mode();
+#endif
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
{
BUG_ON(c == &boot_cpu_data);
identify_cpu(c);
+#ifdef CONFIG_X86_32
enable_sep_cpu();
+#endif
mtrr_ap_init();
}
-#ifdef CONFIG_X86_HT
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
- u32 eax, ebx, ecx, edx;
- int index_msb, core_bits;
-
- cpuid(1, &eax, &ebx, &ecx, &edx);
-
- if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
- return;
-
- smp_num_siblings = (ebx & 0xff0000) >> 16;
+struct msr_range {
+ unsigned min;
+ unsigned max;
+};
- if (smp_num_siblings == 1) {
- printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
+static struct msr_range msr_range_array[] __cpuinitdata = {
+ { 0x00000000, 0x00000418},
+ { 0xc0000000, 0xc000040b},
+ { 0xc0010000, 0xc0010142},
+ { 0xc0011000, 0xc001103b},
+};
- if (smp_num_siblings > NR_CPUS) {
- printk(KERN_WARNING "CPU: Unsupported number of the "
- "siblings %d", smp_num_siblings);
- smp_num_siblings = 1;
- return;
+static void __cpuinit print_cpu_msr(void)
+{
+ unsigned index;
+ u64 val;
+ int i;
+ unsigned index_min, index_max;
+
+ for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
+ index_min = msr_range_array[i].min;
+ index_max = msr_range_array[i].max;
+ for (index = index_min; index < index_max; index++) {
+ if (rdmsrl_amd_safe(index, &val))
+ continue;
+ printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
}
+ }
+}
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
-
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
-
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
- index_msb = get_count_order(smp_num_siblings) ;
-
- core_bits = get_count_order(c->x86_max_cores);
+static int show_msr __cpuinitdata;
+static __init int setup_show_msr(char *arg)
+{
+ int num;
- c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
+ get_option(&arg, &num);
- if (c->x86_max_cores > 1)
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- }
+ if (num > 0)
+ show_msr = num;
+ return 1;
}
-#endif
+__setup("show_msr=", setup_show_msr);
static __init int setup_noclflush(char *arg)
{
@@ -604,18 +816,26 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
else if (c->cpuid_level >= 0)
vendor = c->x86_vendor_id;
- if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
- printk("%s ", vendor);
+ if (vendor && !strstr(c->x86_model_id, vendor))
+ printk(KERN_CONT "%s ", vendor);
- if (!c->x86_model_id[0])
- printk("%d86", c->x86);
+ if (c->x86_model_id[0])
+ printk(KERN_CONT "%s", c->x86_model_id);
else
- printk("%s", c->x86_model_id);
+ printk(KERN_CONT "%d86", c->x86);
if (c->x86_mask || c->cpuid_level >= 0)
- printk(" stepping %02x\n", c->x86_mask);
+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
else
- printk("\n");
+ printk(KERN_CONT "\n");
+
+#ifdef CONFIG_SMP
+ if (c->cpu_index < show_msr)
+ print_cpu_msr();
+#else
+ if (show_msr)
+ print_cpu_msr();
+#endif
}
static __init int setup_disablecpuid(char *arg)
@@ -631,19 +851,89 @@ __setup("clearcpuid=", setup_disablecpuid);
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-void __init early_cpu_init(void)
+#ifdef CONFIG_X86_64
+struct x8664_pda **_cpu_pda __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+
+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+
+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+
+void __cpuinit pda_init(int cpu)
{
- struct cpu_vendor_dev *cvdev;
+ struct x8664_pda *pda = cpu_pda(cpu);
+
+ /* Set up data that may be needed in __get_free_pages early */
+ loadsegment(fs, 0);
+ loadsegment(gs, 0);
+ /* Memory clobbers used to order PDA accesses */
+ mb();
+ wrmsrl(MSR_GS_BASE, pda);
+ mb();
+
+ pda->cpunumber = cpu;
+ pda->irqcount = -1;
+ pda->kernelstack = (unsigned long)stack_thread_info() -
+ PDA_STACKOFFSET + THREAD_SIZE;
+ pda->active_mm = &init_mm;
+ pda->mmu_state = 0;
+
+ if (cpu == 0) {
+ /* others are initialized in smpboot.c */
+ pda->pcurrent = &init_task;
+ pda->irqstackptr = boot_cpu_stack;
+ pda->irqstackptr += IRQSTACKSIZE - 64;
+ } else {
+ if (!pda->irqstackptr) {
+ pda->irqstackptr = (char *)
+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+ if (!pda->irqstackptr)
+ panic("cannot allocate irqstack for cpu %d",
+ cpu);
+ pda->irqstackptr += IRQSTACKSIZE - 64;
+ }
+
+ if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+ pda->nodenumber = cpu_to_node(cpu);
+ }
+}
+
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+ DEBUG_STKSZ] __page_aligned_bss;
- for (cvdev = __x86cpuvendor_start ;
- cvdev < __x86cpuvendor_end ;
- cvdev++)
- cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+extern asmlinkage void ignore_sysret(void);
- early_cpu_detect();
- validate_pat_support(&boot_cpu_data);
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+ /*
+ * LSTAR and STAR live in a somewhat strange symbiosis.
+ * They both write to the same internal register. STAR allows
+ * setting CS/DS, but only a 32-bit target; LSTAR sets the 64-bit rip.
+ */
+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
+ wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+
+#ifdef CONFIG_IA32_EMULATION
+ syscall32_cpu_init();
+#endif
+
+ /* Flags to clear on syscall */
+ wrmsrl(MSR_SYSCALL_MASK,
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
}
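
The STAR write above packs the base selector for SYSRET (user) into
bits [63:48] and the SYSCALL kernel CS into bits [47:32]; the low 32
bits are the legacy EIP target and stay zero. A sketch of the packing,
with placeholder selector values:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint16_t user32_cs = 0x23;	/* placeholder selectors */
		uint16_t kernel_cs = 0x10;
		uint64_t star = ((uint64_t)user32_cs << 48) |
				((uint64_t)kernel_cs << 32);

		printf("MSR_STAR = %#018llx\n", (unsigned long long)star);
		return 0;
	}
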
+unsigned long kernel_eflags;
+
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
+#else
+
/* Make sure %fs is initialized properly in idle threads */
struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
{
@@ -651,25 +941,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
regs->fs = __KERNEL_PERCPU;
return regs;
}
-
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
-void switch_to_new_gdt(void)
-{
- struct desc_ptr gdt_descr;
-
- gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
- asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
-}
+#endif
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a
* 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init for 64-bit
*/
+#ifdef CONFIG_X86_64
+void __cpuinit cpu_init(void)
+{
+ int cpu = stack_smp_processor_id();
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+ unsigned long v;
+ char *estacks = NULL;
+ struct task_struct *me;
+ int i;
+
+ /* CPU 0 is initialised in head64.c */
+ if (cpu != 0)
+ pda_init(cpu);
+ else
+ estacks = boot_exception_stacks;
+
+ me = current;
+
+ if (cpu_test_and_set(cpu, cpu_initialized))
+ panic("CPU#%d already initialized!\n", cpu);
+
+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ /*
+ * Initialize the per-CPU GDT with the boot GDT,
+ * and set up the GDT descriptor:
+ */
+
+ switch_to_new_gdt();
+ load_idt((const struct desc_ptr *)&idt_descr);
+
+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
+
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
+
+ check_efer();
+ if (cpu != 0 && x2apic)
+ enable_x2apic();
+
+ /*
+ * set up and load the per-CPU TSS
+ */
+ if (!orig_ist->ist[0]) {
+ static const unsigned int order[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ };
+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ if (cpu) {
+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+ if (!estacks)
+ panic("Cannot allocate exception "
+ "stack %ld %d\n", v, cpu);
+ }
+ estacks += PAGE_SIZE << order[v];
+ orig_ist->ist[v] = t->x86_tss.ist[v] =
+ (unsigned long)estacks;
+ }
+ }
+
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /*
+ * <= is required because the CPU will access up to
+ * 8 bits beyond the end of the IO permission bitmap.
+ */
+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
+ t->io_bitmap[i] = ~0UL;
+
+ atomic_inc(&init_mm.mm_count);
+ me->active_mm = &init_mm;
+ if (me->mm)
+ BUG();
+ enter_lazy_tlb(&init_mm, me);
+
+ load_sp0(t, &current->thread);
+ set_tss_desc(cpu, t);
+ load_TR_desc();
+ load_LDT(&init_mm.context);
+
+#ifdef CONFIG_KGDB
+ /*
+ * If the kgdb is connected no debug regs should be altered. This
+ * is only applicable when KGDB and a KGDB I/O module are built
+ * into the kernel and you are using early debugging with
+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+ */
+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ else {
+#endif
+ /*
+ * Clear all 6 debug registers:
+ */
+
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+ set_debugreg(0UL, 6);
+ set_debugreg(0UL, 7);
+#ifdef CONFIG_KGDB
+ /* If the kgdb is connected no debug regs should be altered. */
+ }
+#endif
+
+ fpu_init();
+
+ raw_local_save_flags(kernel_eflags);
+
+ if (is_uv_system())
+ uv_cpu_init();
+}
+
+#else
+
void __cpuinit cpu_init(void)
{
int cpu = smp_processor_id();
@@ -723,19 +1124,21 @@ void __cpuinit cpu_init(void)
/*
* Force FPU initialization:
*/
- current_thread_info()->status = 0;
+ if (cpu_has_xsave)
+ current_thread_info()->status = TS_XSAVE;
+ else
+ current_thread_info()->status = 0;
clear_used_math();
mxcsr_feature_mask_init();
-}
-#ifdef CONFIG_HOTPLUG_CPU
-void __cpuinit cpu_uninit(void)
-{
- int cpu = raw_smp_processor_id();
- cpu_clear(cpu, cpu_initialized);
+ /*
+ * Boot processor to setup the FP and extended state context info.
+ */
+ if (!smp_processor_id())
+ init_thread_xstate();
- /* lazy TLB state */
- per_cpu(cpu_tlbstate, cpu).state = 0;
- per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
+ xsave_init();
}
+
+
#endif
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index a11f5d4477c..00000000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,712 +0,0 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/bootmem.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
-#include <linux/delay.h>
-#include <linux/smp.h>
-#include <linux/percpu.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/linkage.h>
-#include <asm/mmu_context.h>
-#include <asm/mtrr.h>
-#include <asm/mce.h>
-#include <asm/pat.h>
-#include <asm/asm.h>
-#include <asm/numa.h>
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <mach_apic.h>
-#endif
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/genapic.h>
-
-#include "cpu.h"
-
-/* We need valid kernel segments for data and code in long mode too;
- * IRET will check the segment types (kkeil 2000/10/28).
- * Also, sysret mandates a special GDT layout.
- */
-/* The TLS descriptors are currently at a different place compared to i386.
- Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
- [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
- [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
-
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
-void switch_to_new_gdt(void)
-{
- struct desc_ptr gdt_descr;
-
- gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
-}
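
switch_to_new_gdt() boils down to an lgdt with a pseudo-descriptor: a 16-bit limit (GDT_SIZE - 1) immediately followed by the 64-bit linear base, which is why struct desc_ptr must be packed. A small sketch of that layout (the limit and base values are made up):

    #include <stdio.h>
    #include <stdint.h>

    struct desc_ptr {
        uint16_t size;		/* limit = GDT_SIZE - 1 */
        uint64_t address;	/* linear base of the GDT */
    } __attribute__((packed));

    int main(void)
    {
        struct desc_ptr gdtr = {
            .size    = 16 * 8 - 1,			/* 16 eight-byte entries */
            .address = 0xffff880000001000ull,	/* illustrative base */
        };

        printf("sizeof(desc_ptr)=%zu limit=%#x base=%#llx\n",
               sizeof(gdtr), (unsigned)gdtr.size,
               (unsigned long long)gdtr.address);
        /* in the kernel this would be followed by:
         * asm volatile("lgdt %0" :: "m" (gdtr)); */
        return 0;
    }
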
-
-struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
-{
- display_cacheinfo(c);
-}
-
-static struct cpu_dev __cpuinitdata default_cpu = {
- .c_init = default_init,
- .c_vendor = "Unknown",
-};
-static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
-
-int __cpuinit get_model_name(struct cpuinfo_x86 *c)
-{
- unsigned int *v;
-
- if (c->extended_cpuid_level < 0x80000004)
- return 0;
-
- v = (unsigned int *) c->x86_model_id;
- cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
- cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
- cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
- c->x86_model_id[48] = 0;
- return 1;
-}
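
get_model_name() above simply streams twelve 32-bit CPUID registers into a 48-byte buffer. A standalone sketch using the GCC/clang <cpuid.h> helpers (x86 only; the kernel uses its own cpuid() wrapper instead):

    #include <stdio.h>
    #include <string.h>
    #include <cpuid.h>	/* GCC/clang helper; x86 only */

    int main(void)
    {
        unsigned int v[12];
        char brand[49];

        /* mirrors the extended_cpuid_level < 0x80000004 check above */
        if (__get_cpuid_max(0x80000000, NULL) < 0x80000004)
            return 1;

        /* leaves 0x80000002..4 each return 16 bytes of the brand string */
        for (unsigned int leaf = 0; leaf < 3; leaf++)
            __get_cpuid(0x80000002 + leaf,
                        &v[leaf * 4], &v[leaf * 4 + 1],
                        &v[leaf * 4 + 2], &v[leaf * 4 + 3]);

        memcpy(brand, v, 48);
        brand[48] = 0;	/* same terminator as x86_model_id[48] = 0 above */
        printf("%s\n", brand);
        return 0;
    }
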
-
-
-void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
-{
- unsigned int n, dummy, ebx, ecx, edx;
-
- n = c->extended_cpuid_level;
-
- if (n >= 0x80000005) {
- cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
- "D cache %dK (%d bytes/line)\n",
- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
- c->x86_cache_size = (ecx>>24) + (edx>>24);
- /* On K8 L1 TLB is inclusive, so don't count it */
- c->x86_tlbsize = 0;
- }
-
- if (n >= 0x80000006) {
- cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
- ecx = cpuid_ecx(0x80000006);
- c->x86_cache_size = ecx >> 16;
- c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
-
- printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
- c->x86_cache_size, ecx & 0xFF);
- }
-}
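
For reference, the 0x80000006 decode above is pure bit slicing: ECX[31:16] is the L2 size in KB and ECX[7:0] the line size in bytes. A tiny sketch with a made-up register value:

    #include <stdio.h>

    int main(void)
    {
        unsigned int ecx = 0x02006140;	/* invented: 512K L2, 64-byte lines */

        printf("L2: %uK (%u bytes/line)\n", ecx >> 16, ecx & 0xff);
        return 0;
    }
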
-
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- u32 eax, ebx, ecx, edx;
- int index_msb, core_bits;
-
- cpuid(1, &eax, &ebx, &ecx, &edx);
-
-
- if (!cpu_has(c, X86_FEATURE_HT))
- return;
- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- goto out;
-
- smp_num_siblings = (ebx & 0xff0000) >> 16;
-
- if (smp_num_siblings == 1) {
- printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
-
- if (smp_num_siblings > NR_CPUS) {
- printk(KERN_WARNING "CPU: Unsupported number of "
- "siblings %d", smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
-
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = phys_pkg_id(index_msb);
-
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
- index_msb = get_count_order(smp_num_siblings);
-
- core_bits = get_count_order(c->x86_max_cores);
-
- c->cpu_core_id = phys_pkg_id(index_msb) &
- ((1 << core_bits) - 1);
- }
-out:
- if ((c->x86_max_cores * smp_num_siblings) > 1) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- }
-
-#endif
-}
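
detect_ht() is bit arithmetic on the initial APIC ID: the sibling count from CPUID leaf 1 EBX[23:16] is rounded up to a power of two, and the APIC ID is then split into package/core/thread fields. A standalone sketch of that split (the APIC ID and counts are example numbers, not read from hardware):

    #include <stdio.h>

    /* smallest k with (1 << k) >= n, mirroring get_count_order() */
    static int count_order(unsigned int n)
    {
        int k = 0;
        while ((1u << k) < n)
            k++;
        return k;
    }

    int main(void)
    {
        unsigned int apicid   = 0x5;	/* example: 0b101 */
        unsigned int siblings = 4;	/* logical CPUs/package, leaf 1 EBX[23:16] */
        unsigned int cores    = 2;	/* cores per package (x86_max_cores) */

        int pkg_shift = count_order(siblings);
        int thr_bits  = count_order(siblings / cores);
        int core_bits = count_order(cores);

        printf("phys_proc_id=%u cpu_core_id=%u\n",
               apicid >> pkg_shift,
               (apicid >> thr_bits) & ((1u << core_bits) - 1));
        /* prints phys_proc_id=1 cpu_core_id=0 for this example */
        return 0;
    }
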
-
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
-{
- char *v = c->x86_vendor_id;
- int i;
- static int printed;
-
- for (i = 0; i < X86_VENDOR_NUM; i++) {
- if (cpu_devs[i]) {
- if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
- (cpu_devs[i]->c_ident[1] &&
- !strcmp(v, cpu_devs[i]->c_ident[1]))) {
- c->x86_vendor = i;
- this_cpu = cpu_devs[i];
- return;
- }
- }
- }
- if (!printed) {
- printed++;
- printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
- printk(KERN_ERR "CPU: Your system may be unstable.\n");
- }
- c->x86_vendor = X86_VENDOR_UNKNOWN;
-}
-
-static void __init early_cpu_support_print(void)
-{
- int i,j;
- struct cpu_dev *cpu_devx;
-
- printk("KERNEL supported cpus:\n");
- for (i = 0; i < X86_VENDOR_NUM; i++) {
- cpu_devx = cpu_devs[i];
- if (!cpu_devx)
- continue;
- for (j = 0; j < 2; j++) {
- if (!cpu_devx->c_ident[j])
- continue;
- printk(" %s %s\n", cpu_devx->c_vendor,
- cpu_devx->c_ident[j]);
- }
- }
-}
-
-/*
- * The NOPL instruction is supposed to exist on all CPUs with
- * family >= 6; unfortunately, that's not true in practice because
- * of early VIA chips and (more importantly) broken virtualizers that
- * are not easy to detect. Hence, probe for it based on first
- * principles.
- *
- * Note: no 64-bit chip is known to lack these, but put the code here
- * for consistency with 32 bits, and to make it utterly trivial to
- * diagnose the problem should it ever surface.
- */
-static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
-{
- const u32 nopl_signature = 0x888c53b1; /* Random number */
- u32 has_nopl = nopl_signature;
-
- clear_cpu_cap(c, X86_FEATURE_NOPL);
- if (c->x86 >= 6) {
- asm volatile("\n"
- "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
- "2:\n"
- " .section .fixup,\"ax\"\n"
- "3: xor %0,%0\n"
- " jmp 2b\n"
- " .previous\n"
- _ASM_EXTABLE(1b,3b)
- : "+a" (has_nopl));
-
- if (has_nopl == nopl_signature)
- set_cpu_cap(c, X86_FEATURE_NOPL);
- }
-}
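
detect_nopl() probes by executing the instruction and recovering through the kernel's exception-fixup table (_ASM_EXTABLE). The same first-principles test can be sketched in userspace with a SIGILL handler standing in for the fixup (x86 only, and deliberately not the kernel mechanism):

    #include <stdio.h>
    #include <signal.h>
    #include <setjmp.h>

    static sigjmp_buf env;

    static void on_sigill(int sig) { siglongjmp(env, 1); }

    int main(void)
    {
        signal(SIGILL, on_sigill);
        if (sigsetjmp(env, 1) == 0) {
            /* same opcode bytes as the probe above: nopl %eax */
            asm volatile(".byte 0x0f,0x1f,0xc0");
            puts("NOPL supported");
        } else {
            puts("NOPL faulted (#UD)");
        }
        return 0;
    }
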
-
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
-
-void __init early_cpu_init(void)
-{
- struct cpu_vendor_dev *cvdev;
-
- for (cvdev = __x86cpuvendor_start ;
- cvdev < __x86cpuvendor_end ;
- cvdev++)
- cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
- early_cpu_support_print();
- early_identify_cpu(&boot_cpu_data);
-}
-
-/* Do some early cpuid on the boot CPU to get the parameters that are
-   needed before check_bugs. Everything advanced is in identify_cpu
-   below. */
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
-{
- u32 tfms, xlvl;
-
- c->loops_per_jiffy = loops_per_jiffy;
- c->x86_cache_size = -1;
- c->x86_vendor = X86_VENDOR_UNKNOWN;
- c->x86_model = c->x86_mask = 0; /* So far unknown... */
- c->x86_vendor_id[0] = '\0'; /* Unset */
- c->x86_model_id[0] = '\0'; /* Unset */
- c->x86_clflush_size = 64;
- c->x86_cache_alignment = c->x86_clflush_size;
- c->x86_max_cores = 1;
- c->x86_coreid_bits = 0;
- c->extended_cpuid_level = 0;
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
-
- /* Get vendor name */
- cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
- (unsigned int *)&c->x86_vendor_id[0],
- (unsigned int *)&c->x86_vendor_id[8],
- (unsigned int *)&c->x86_vendor_id[4]);
-
- get_cpu_vendor(c);
-
- /* Initialize the standard set of capabilities */
- /* Note that the vendor-specific code below might override */
-
- /* Intel-defined flags: level 0x00000001 */
- if (c->cpuid_level >= 0x00000001) {
- __u32 misc;
- cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
- &c->x86_capability[0]);
- c->x86 = (tfms >> 8) & 0xf;
- c->x86_model = (tfms >> 4) & 0xf;
- c->x86_mask = tfms & 0xf;
- if (c->x86 == 0xf)
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xF) << 4;
- if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
- c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
- } else {
- /* Have CPUID level 0 only - unheard of */
- c->x86 = 4;
- }
-
- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
-#ifdef CONFIG_SMP
- c->phys_proc_id = c->initial_apicid;
-#endif
- /* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- c->extended_cpuid_level = xlvl;
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
- }
- if (xlvl >= 0x80000004)
- get_model_name(c); /* Default name */
- }
-
- /* Transmeta-defined flags: level 0x80860001 */
- xlvl = cpuid_eax(0x80860000);
- if ((xlvl & 0xffff0000) == 0x80860000) {
- /* Don't set extended_cpuid_level here for now, to avoid confusion. */
- if (xlvl >= 0x80860001)
- c->x86_capability[2] = cpuid_edx(0x80860001);
- }
-
- if (c->extended_cpuid_level >= 0x80000007)
- c->x86_power = cpuid_edx(0x80000007);
-
- if (c->extended_cpuid_level >= 0x80000008) {
- u32 eax = cpuid_eax(0x80000008);
-
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- }
-
- detect_nopl(c);
-
- if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
- cpu_devs[c->x86_vendor]->c_early_init)
- cpu_devs[c->x86_vendor]->c_early_init(c);
-
- validate_pat_support(c);
-}
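
The family/model/stepping decode in early_identify_cpu() is worth a worked example, since the extended-family field only kicks in for family 0xf and the extended-model field for family >= 6. A sketch with one concrete leaf-1 EAX value:

    #include <stdio.h>

    /* decode CPUID leaf 1 EAX exactly as early_identify_cpu() above does */
    static void decode_fms(unsigned int tfms)
    {
        unsigned int family = (tfms >> 8) & 0xf;
        unsigned int model  = (tfms >> 4) & 0xf;
        unsigned int step   = tfms & 0xf;

        if (family == 0xf)			/* extended family */
            family += (tfms >> 20) & 0xff;
        if (family >= 0x6)			/* extended model */
            model += ((tfms >> 16) & 0xf) << 4;

        printf("family 0x%x model 0x%x stepping 0x%x\n", family, model, step);
    }

    int main(void)
    {
        decode_fms(0x6fb);	/* e.g. family 6, model 15, stepping 11 */
        return 0;
    }
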
-
-/*
- * This does the hard work of actually picking apart the CPU stuff...
- */
-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
-{
- int i;
-
- early_identify_cpu(c);
-
- init_scattered_cpuid_features(c);
-
- c->apicid = phys_pkg_id(0);
-
- /*
- * Vendor-specific initialization. In this section we
- * canonicalize the feature flags, meaning if there are
- * features a certain CPU supports which CPUID doesn't
- * tell us, CPUID claiming incorrect flags, or other bugs,
- * we handle them here.
- *
- * At the end of this section, c->x86_capability better
- * indicate the features this CPU genuinely supports!
- */
- if (this_cpu->c_init)
- this_cpu->c_init(c);
-
- detect_ht(c);
-
- /*
- * On SMP, boot_cpu_data holds the common feature set between
- * all CPUs; so make sure that we indicate which features are
- * common between the CPUs. The first time this routine gets
- * executed, c == &boot_cpu_data.
- */
- if (c != &boot_cpu_data) {
- /* AND the already accumulated flags with these */
- for (i = 0; i < NCAPINTS; i++)
- boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
- }
-
- /* Clear all flags overridden by options */
- for (i = 0; i < NCAPINTS; i++)
- c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
-#ifdef CONFIG_X86_MCE
- mcheck_init(c);
-#endif
- select_idle_routine(c);
-
-#ifdef CONFIG_NUMA
- numa_add_cpu(smp_processor_id());
-#endif
-
-}
-
-void __cpuinit identify_boot_cpu(void)
-{
- identify_cpu(&boot_cpu_data);
-}
-
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
-{
- BUG_ON(c == &boot_cpu_data);
- identify_cpu(c);
- mtrr_ap_init();
-}
-
-static __init int setup_noclflush(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
- return 1;
-}
-__setup("noclflush", setup_noclflush);
-
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
-{
- if (c->x86_model_id[0])
- printk(KERN_CONT "%s", c->x86_model_id);
-
- if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
- else
- printk(KERN_CONT "\n");
-}
-
-static __init int setup_disablecpuid(char *arg)
-{
- int bit;
- if (get_option(&arg, &bit) && bit < NCAPINTS*32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
- return 1;
-}
-__setup("clearcpuid=", setup_disablecpuid);
-
-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-
-char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
-
-unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL_GPL(__supported_pte_mask);
-
-static int do_not_nx __cpuinitdata;
-
-/* noexec=on|off
-Control non-executable mappings for 64-bit processes.
-
-on	Enable (default)
-off	Disable
-*/
-static int __init nonx_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- if (!strncmp(str, "on", 2)) {
- __supported_pte_mask |= _PAGE_NX;
- do_not_nx = 0;
- } else if (!strncmp(str, "off", 3)) {
- do_not_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- }
- return 0;
-}
-early_param("noexec", nonx_setup);
-
-int force_personality32;
-
-/* noexec32=on|off
-Control non-executable heap for 32-bit processes.
-To control the stack too, use noexec=off.
-
-on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
-off	PROT_READ implies PROT_EXEC
-*/
-static int __init nonx32_setup(char *str)
-{
- if (!strcmp(str, "on"))
- force_personality32 &= ~READ_IMPLIES_EXEC;
- else if (!strcmp(str, "off"))
- force_personality32 |= READ_IMPLIES_EXEC;
- return 1;
-}
-__setup("noexec32=", nonx32_setup);
-
-void pda_init(int cpu)
-{
- struct x8664_pda *pda = cpu_pda(cpu);
-
- /* Set up data that may be needed in __get_free_pages early */
- loadsegment(fs, 0);
- loadsegment(gs, 0);
- /* Memory clobbers used to order PDA accesses */
- mb();
- wrmsrl(MSR_GS_BASE, pda);
- mb();
-
- pda->cpunumber = cpu;
- pda->irqcount = -1;
- pda->kernelstack = (unsigned long)stack_thread_info() -
- PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;
-
- if (cpu == 0) {
- /* others are initialized in smpboot.c */
- pda->pcurrent = &init_task;
- pda->irqstackptr = boot_cpu_stack;
- pda->irqstackptr += IRQSTACKSIZE - 64;
- } else {
- if (!pda->irqstackptr) {
- pda->irqstackptr = (char *)
- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
- if (!pda->irqstackptr)
- panic("cannot allocate irqstack for cpu %d",
- cpu);
- pda->irqstackptr += IRQSTACKSIZE - 64;
- }
-
- if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
- pda->nodenumber = cpu_to_node(cpu);
- }
-}
-
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ] __page_aligned_bss;
-
-extern asmlinkage void ignore_sysret(void);
-
-/* May not be marked __init: used by software suspend */
-void syscall_init(void)
-{
- /*
- * LSTAR and STAR live in a somewhat strange symbiosis:
- * they both write to the same internal register. STAR allows
- * setting CS/DS, but only a 32-bit target; LSTAR sets the 64-bit rip.
- */
- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
-
-#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init();
-#endif
-
- /* Flags to clear on syscall */
- wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
-}
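
To make the STAR/LSTAR comment concrete: SYSCALL loads CS from STAR[47:32] (SS is that value + 8), SYSRET to 64-bit mode loads CS from STAR[63:48] + 16 (SS is + 8), and LSTAR supplies the 64-bit entry RIP. A sketch of the packing with illustrative selector values (not the real __KERNEL_CS/__USER32_CS):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint16_t kernel_cs = 0x10;	/* illustrative __KERNEL_CS */
        uint16_t user32_cs = 0x23;	/* illustrative __USER32_CS */

        /* same packing as the wrmsrl(MSR_STAR, ...) above */
        uint64_t star = ((uint64_t)user32_cs << 48) |
                        ((uint64_t)kernel_cs << 32);

        printf("STAR = %#018llx\n", (unsigned long long)star);
        printf("  SYSCALL: CS=%#x SS=%#x\n",
               (unsigned)((star >> 32) & 0xffff),
               (unsigned)(((star >> 32) & 0xffff) + 8));
        printf("  SYSRET : CS=%#x SS=%#x\n",
               (unsigned)(((star >> 48) & 0xffff) + 16),	/* 64-bit CS */
               (unsigned)(((star >> 48) & 0xffff) + 8));
        return 0;
    }
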
-
-void __cpuinit check_efer(void)
-{
- unsigned long efer;
-
- rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || do_not_nx)
- __supported_pte_mask &= ~_PAGE_NX;
-}
-
-unsigned long kernel_eflags;
-
-/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
- */
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init.
- */
-void __cpuinit cpu_init(void)
-{
- int cpu = stack_smp_processor_id();
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
- unsigned long v;
- char *estacks = NULL;
- struct task_struct *me;
- int i;
-
- /* CPU 0 is initialised in head64.c */
- if (cpu != 0)
- pda_init(cpu);
- else
- estacks = boot_exception_stacks;
-
- me = current;
-
- if (cpu_test_and_set(cpu, cpu_initialized))
- panic("CPU#%d already initialized!\n", cpu);
-
- printk(KERN_INFO "Initializing CPU#%d\n", cpu);
-
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
- /*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
- */
-
- switch_to_new_gdt();
- load_idt((const struct desc_ptr *)&idt_descr);
-
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
-
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
-
- check_efer();
-
- /*
- * set up and load the per-CPU TSS
- */
- if (!orig_ist->ist[0]) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
- };
- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- if (cpu) {
- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
- if (!estacks)
- panic("Cannot allocate exception "
- "stack %ld %d\n", v, cpu);
- }
- estacks += PAGE_SIZE << order[v];
- orig_ist->ist[v] = t->x86_tss.ist[v] =
- (unsigned long)estacks;
- }
- }
-
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
-
- atomic_inc(&init_mm.mm_count);
- me->active_mm = &init_mm;
- if (me->mm)
- BUG();
- enter_lazy_tlb(&init_mm, me);
-
- load_sp0(t, &current->thread);
- set_tss_desc(cpu, t);
- load_TR_desc();
- load_LDT(&init_mm.context);
-
-#ifdef CONFIG_KGDB
- /*
- * If the kgdb is connected no debug regs should be altered. This
- * is only applicable when KGDB and a KGDB I/O module are built
- * into the kernel and you are using early debugging with
- * kgdbwait. KGDB will control the kernel HW breakpoint registers.
- */
- if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
- arch_kgdb_ops.correct_hw_break();
- else {
-#endif
- /*
- * Clear all 6 debug registers:
- */
-
- set_debugreg(0UL, 0);
- set_debugreg(0UL, 1);
- set_debugreg(0UL, 2);
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
- /* If the kgdb is connected no debug regs should be altered. */
- }
-#endif
-
- fpu_init();
-
- raw_local_save_flags(kernel_eflags);
-
- if (is_uv_system())
- uv_cpu_init();
-}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4d894e8565f..de4094a3921 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -21,23 +21,16 @@ struct cpu_dev {
void (*c_init)(struct cpuinfo_x86 * c);
void (*c_identify)(struct cpuinfo_x86 * c);
unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
+ int c_x86_vendor;
};
-extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
+#define cpu_dev_register(cpu_devX) \
+ static struct cpu_dev *__cpu_dev_##cpu_devX __used \
+ __attribute__((__section__(".x86_cpu_dev.init"))) = \
+ &cpu_devX;
-struct cpu_vendor_dev {
- int vendor;
- struct cpu_dev *cpu_dev;
-};
-
-#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
- static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
- __attribute__((__section__(".x86cpuvendor.init"))) = \
- { cpu_vendor_id, cpu_dev }
-
-extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
+extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
-extern int get_model_name(struct cpuinfo_x86 *c);
extern void display_cacheinfo(struct cpuinfo_x86 *c);
#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index dd097b83583..8e48c5d4467 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
* Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
* no meaning should be associated with absolute values of these MSRs.
*/
-static unsigned int get_measured_perf(unsigned int cpu)
+static unsigned int get_measured_perf(struct cpufreq_policy *policy,
+ unsigned int cpu)
{
union {
struct {
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
#endif
- retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100;
+ retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
put_cpu();
set_cpus_allowed_ptr(current, &saved_mask);
@@ -779,13 +780,20 @@ static int __init acpi_cpufreq_init(void)
{
int ret;
+ if (acpi_disabled)
+ return 0;
+
dprintk("acpi_cpufreq_init\n");
ret = acpi_cpufreq_early_init();
if (ret)
return ret;
- return cpufreq_register_driver(&acpi_cpufreq_driver);
+ ret = cpufreq_register_driver(&acpi_cpufreq_driver);
+ if (ret)
+ free_percpu(acpi_perf_data);
+
+ return ret;
}
static void __exit acpi_cpufreq_exit(void)
@@ -795,8 +803,6 @@ static void __exit acpi_cpufreq_exit(void)
cpufreq_unregister_driver(&acpi_cpufreq_driver);
free_percpu(acpi_perf_data);
-
- return;
}
module_param(acpi_pstate_strict, uint, 0644);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index e4a4bf870e9..fe613c93b36 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
#include <linux/cpufreq.h>
#include <asm/msr.h>
-#include <asm/timex.h>
-#include <asm/io.h>
+#include <linux/timex.h>
+#include <linux/io.h>
#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
u8 clockspeed_reg; /* Clock Speed Register */
local_irq_disable();
- outb_p(0x80,REG_CSCIR);
+ outb_p(0x80, REG_CSCIR);
clockspeed_reg = inb_p(REG_CSCDR);
local_irq_enable();
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
}
/* 33 MHz is not 32 MHz... */
- if ((clockspeed_reg & 0xE0)==0xA0)
+ if ((clockspeed_reg & 0xE0) == 0xA0)
return 33000;
- return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000);
+ return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
}
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
* There is no return value.
*/
-static void elanfreq_set_cpu_state (unsigned int state)
+static void elanfreq_set_cpu_state(unsigned int state)
{
struct cpufreq_freqs freqs;
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
*/
local_irq_disable();
- outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */
- outb_p(0x00,REG_CSCDR);
+ outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
+ outb_p(0x00, REG_CSCDR);
local_irq_enable(); /* wait till internal pipelines and */
udelay(1000); /* buffers have cleaned up */
local_irq_disable();
/* now, set the CPU clock speed register (0x80) */
- outb_p(0x80,REG_CSCIR);
- outb_p(elan_multiplier[state].val80h,REG_CSCDR);
+ outb_p(0x80, REG_CSCIR);
+ outb_p(elan_multiplier[state].val80h, REG_CSCDR);
/* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
- outb_p(0x40,REG_CSCIR);
- outb_p(elan_multiplier[state].val40h,REG_CSCDR);
+ outb_p(0x40, REG_CSCIR);
+ outb_p(elan_multiplier[state].val40h, REG_CSCDR);
udelay(10000);
local_irq_enable();
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
* for the hardware supported by the driver.
*/
-static int elanfreq_verify (struct cpufreq_policy *policy)
+static int elanfreq_verify(struct cpufreq_policy *policy)
{
return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
}
-static int elanfreq_target (struct cpufreq_policy *policy,
+static int elanfreq_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation)
{
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
/* capability check */
if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model!=10))
+ (c->x86 != 4) || (c->x86_model != 10))
return -ENODEV;
/* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
max_freq = elanfreq_get_cpu_frequency(0);
/* table init */
- for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
+ for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
if (elanfreq_table[i].frequency > max_freq)
elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
}
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
if (result)
- return (result);
+ return result;
cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
#endif
-static struct freq_attr* elanfreq_attr[] = {
+static struct freq_attr *elanfreq_attr[] = {
&cpufreq_freq_attr_scaling_available_freqs,
NULL,
};
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
/* Test if we have the right hardware */
if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model!=10)) {
+ (c->x86 != 4) || (c->x86_model != 10)) {
printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
- return -ENODEV;
+ return -ENODEV;
}
return cpufreq_register_driver(&elanfreq_driver);
}
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
}
-module_param (max_freq, int, 0444);
+module_param(max_freq, int, 0444);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 06fcce516d5..b0461856acf 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -1,5 +1,5 @@
/*
- * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk>
+ * (C) 2001-2004 Dave Jones. <davej@redhat.com>
* (C) 2002 Padraig Brady. <padraig@antefacto.com>
*
* Licensed under the terms of the GNU GPL License version 2.
@@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
module_param(revid_errata, int, 0644);
MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index f1685fb91fb..b8e05ee4f73 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
}
if (c->x86 != 0xF) {
- printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n");
+ printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
return 0;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830..c1ac5790c63 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
#include <linux/slab.h>
#include <asm/msr.h>
-#include <asm/timex.h>
-#include <asm/io.h>
+#include <linux/timex.h>
+#include <linux/io.h>
-
-#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
- as it is unused */
+#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
+ as it is unused */
static unsigned int busfreq; /* FSB, in 10 kHz */
static unsigned int max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
msrval = POWERNOW_IOPORT + 0x1;
wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue=inl(POWERNOW_IOPORT + 0x8);
+ invalue = inl(POWERNOW_IOPORT + 0x8);
msrval = POWERNOW_IOPORT + 0x0;
wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
*
* Tries to change the PowerNow! multiplier
*/
-static void powernow_k6_set_state (unsigned int best_i)
+static void powernow_k6_set_state(unsigned int best_i)
{
- unsigned long outvalue=0, invalue=0;
+ unsigned long outvalue = 0, invalue = 0;
unsigned long msrval;
struct cpufreq_freqs freqs;
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
msrval = POWERNOW_IOPORT + 0x1;
wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue=inl(POWERNOW_IOPORT + 0x8);
+ invalue = inl(POWERNOW_IOPORT + 0x8);
invalue = invalue & 0xf;
outvalue = outvalue | invalue;
- outl(outvalue ,(POWERNOW_IOPORT + 0x8));
+ outl(outvalue, (POWERNOW_IOPORT + 0x8));
msrval = POWERNOW_IOPORT + 0x0;
wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
*
* sets a new CPUFreq policy
*/
-static int powernow_k6_target (struct cpufreq_policy *policy,
+static int powernow_k6_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation)
{
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
busfreq = cpu_khz / max_multiplier;
/* table init */
- for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
+ for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
if (clock_ratio[i].index > max_multiplier)
clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
if (result)
- return (result);
+ return result;
cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
{
unsigned int i;
- for (i=0; i<8; i++) {
- if (i==max_multiplier)
+ for (i = 0; i < 8; i++) {
+ if (i == max_multiplier)
powernow_k6_set_state(i);
}
cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
return busfreq * powernow_k6_get_cpu_multiplier();
}
-static struct freq_attr* powernow_k6_attr[] = {
+static struct freq_attr *powernow_k6_attr[] = {
&cpufreq_freq_attr_scaling_available_freqs,
NULL,
};
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
}
if (cpufreq_register_driver(&powernow_k6_driver)) {
- release_region (POWERNOW_IOPORT, 16);
+ release_region(POWERNOW_IOPORT, 16);
return -EINVAL;
}
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
static void __exit powernow_k6_exit(void)
{
cpufreq_unregister_driver(&powernow_k6_driver);
- release_region (POWERNOW_IOPORT, 16);
+ release_region(POWERNOW_IOPORT, 16);
}
-MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
+MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
+MODULE_LICENSE("GPL");
module_init(powernow_k6_init);
module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 0a61159d7b7..7c7d56b4313 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -1,6 +1,6 @@
/*
* AMD K7 Powernow driver.
- * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs.
+ * (C) 2003 Dave Jones on behalf of SuSE Labs.
* (C) 2003-2004 Dave Jones <davej@redhat.com>
*
* Licensed under the terms of the GNU GPL License version 2.
@@ -692,7 +692,7 @@ static void __exit powernow_exit (void)
module_param(acpi_force, int, 0444);
MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 84bb395038d..d3dcd58b87c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -7,7 +7,7 @@
* Support : mark.langsdorf@amd.com
*
* Based on the powernow-k7.c module written by Dave Jones.
- * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs
+ * (C) 2003 Dave Jones on behalf of SuSE Labs
* (C) 2004 Dominik Brodowski <linux@brodo.de>
* (C) 2004 Pavel Machek <pavel@suse.cz>
* Licensed under the terms of the GNU GPL License version 2.
@@ -45,7 +45,6 @@
#endif
#define PFX "powernow-k8: "
-#define BFX PFX "BIOS error: "
#define VERSION "version 2.20.00"
#include "powernow-k8.h"
@@ -536,35 +535,40 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
for (j = 0; j < data->numps; j++) {
if (pst[j].vid > LEAST_VID) {
- printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid);
+ printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
+ j, pst[j].vid);
return -EINVAL;
}
if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */
- printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j);
+ printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
+ " %d\n", j);
return -ENODEV;
}
if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */
- printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j);
+ printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
+ " %d\n", j);
return -ENODEV;
}
if (pst[j].fid > MAX_FID) {
- printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j);
+ printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
+ " %d\n", j);
return -ENODEV;
}
if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
/* Only first fid is allowed to be in "low" range */
- printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid);
+ printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
+ "0x%x\n", j, pst[j].fid);
return -EINVAL;
}
if (pst[j].fid < lastfid)
lastfid = pst[j].fid;
}
if (lastfid & 1) {
- printk(KERN_ERR BFX "lastfid invalid\n");
+ printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
return -EINVAL;
}
if (lastfid > LO_FID_TABLE_TOP)
- printk(KERN_INFO BFX "first fid not from lo freq table\n");
+ printk(KERN_INFO FW_BUG PFX "first fid not from lo freq table\n");
return 0;
}
@@ -672,13 +676,13 @@ static int find_psb_table(struct powernow_k8_data *data)
dprintk("table vers: 0x%x\n", psb->tableversion);
if (psb->tableversion != PSB_VERSION_1_4) {
- printk(KERN_ERR BFX "PSB table is not v1.4\n");
+ printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
return -ENODEV;
}
dprintk("flags: 0x%x\n", psb->flags1);
if (psb->flags1) {
- printk(KERN_ERR BFX "unknown flags\n");
+ printk(KERN_ERR FW_BUG PFX "unknown flags\n");
return -ENODEV;
}
@@ -705,7 +709,7 @@ static int find_psb_table(struct powernow_k8_data *data)
}
}
if (cpst != 1) {
- printk(KERN_ERR BFX "numpst must be 1\n");
+ printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
return -ENODEV;
}
@@ -1130,17 +1134,19 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
"ACPI Processor module before starting this "
"driver.\n");
#else
- printk(KERN_ERR PFX "Your BIOS does not provide ACPI "
- "_PSS objects in a way that Linux understands. "
- "Please report this to the Linux ACPI maintainers"
- " and complain to your BIOS vendor.\n");
+ printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide"
+ " ACPI _PSS objects in a way that Linux "
+ "understands. Please report this to the Linux "
+ "ACPI maintainers and complain to your BIOS "
+ "vendor.\n");
#endif
kfree(data);
return -ENODEV;
}
if (pol->cpu != 0) {
- printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than "
- "CPU0. Complain to your BIOS vendor.\n");
+ printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
+ "CPU other than CPU0. Complain to your BIOS "
+ "vendor.\n");
kfree(data);
return -ENODEV;
}
@@ -1193,7 +1199,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
/* min/max the cpu is capable of */
if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
- printk(KERN_ERR PFX "invalid powernow_table\n");
+ printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
powernow_k8_cpu_exit_acpi(data);
kfree(data->powernow_table);
kfree(data);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 15e13c01cc3..3b5f06423e7 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,7 +26,7 @@
#include <asm/cpufeature.h>
#define PFX "speedstep-centrino: "
-#define MAINTAINER "cpufreq@lists.linux.org.uk"
+#define MAINTAINER "cpufreq@vger.kernel.org"
#define dprintk(msg...) \
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 191f7263c61..04d0376b64b 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -431,7 +431,7 @@ static void __exit speedstep_exit(void)
}
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 898a5a2002e..ffd0f5ed071 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -121,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void)
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* Load/Store Serialize to mem access disable (=reorder it) */
- setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
+ setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
/* set load/store serialize from 1GB to 4GB */
ccr3 |= 0xe0;
setCx86(CX86_CCR3, ccr3);
@@ -132,11 +132,11 @@ static void __cpuinit set_cx86_memwb(void)
printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
/* CCR2 bit 2: unlock NW bit */
- setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
+ setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
/* set 'Not Write-through' */
write_cr0(read_cr0() | X86_CR0_NW);
/* CCR2 bit 2: lock NW bit and set WT1 */
- setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
+ setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
}
/*
@@ -150,14 +150,14 @@ static void __cpuinit geode_configure(void)
local_irq_save(flags);
/* Suspend on halt power saving and enable #SUSP pin */
- setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+ setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* FPU fast, DTE cache, Mem bypass */
- setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
+ setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
set_cx86_memwb();
@@ -291,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
/* Enable cxMMX extensions (GX1 Datasheet 54) */
- setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
+ setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
/*
* GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -301,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
*/
if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
geode_configure();
- get_model_name(c); /* get CPU marketing name */
return;
} else { /* MediaGX */
Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
@@ -314,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
if (dir1 > 7) {
dir0_msn++; /* M II */
/* Enable MMX extensions (App note 108) */
- setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
+ setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
} else {
c->coma_bug = 1; /* 6x86MX, it has the bug. */
}
@@ -429,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
- setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */
+ setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
local_irq_restore(flags);
}
@@ -442,14 +441,16 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
.c_early_init = early_init_cyrix,
.c_init = init_cyrix,
.c_identify = cyrix_identify,
+ .c_x86_vendor = X86_VENDOR_CYRIX,
};
-cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev);
+cpu_dev_register(cyrix_cpu_dev);
static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
+ .c_x86_vendor = X86_VENDOR_NSC,
};
-cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev);
+cpu_dev_register(nsc_cpu_dev);
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
deleted file mode 100644
index c9017799497..00000000000
--- a/arch/x86/kernel/cpu/feature_names.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Strings for the various x86 capability flags.
- *
- * This file must not contain any executable code.
- */
-
-#include <asm/cpufeature.h>
-
-/*
- * These flag bits must match the definitions in <asm/cpufeature.h>.
- * NULL means this bit is undefined or reserved; either way it doesn't
- * have meaning as far as Linux is concerned. Note that it's important
- * to realize there is a difference between this table and CPUID -- if
- * applications want to get the raw CPUID data, they should access
- * /dev/cpu/<cpu_nr>/cpuid instead.
- */
-const char * const x86_cap_flags[NCAPINTS*32] = {
- /* Intel-defined */
- "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
- "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
- "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
- "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
-
- /* AMD-defined */
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
- NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
- "3dnowext", "3dnow",
-
- /* Transmeta-defined */
- "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
- /* Other (Linux-defined) */
- "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
- NULL, NULL, NULL, NULL,
- "constant_tsc", "up", NULL, "arch_perfmon",
- "pebs", "bts", NULL, NULL,
- "rep_good", NULL, NULL, NULL,
- "nopl", NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
- /* Intel-defined (#2) */
- "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
- "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
- NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
- /* VIA/Cyrix/Centaur-defined */
- NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
- "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
- /* AMD-defined (#2) */
- "lahf_lm", "cmp_legacy", "svm", "extapic",
- "cr8_legacy", "abm", "sse4a", "misalignsse",
- "3dnowprefetch", "osvw", "ibs", "sse5",
- "skinit", "wdt", NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
- /* Auxiliary (Linux-defined) */
- "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-};
-
-const char *const x86_power_flags[32] = {
- "ts", /* temperature sensor */
- "fid", /* frequency id control */
- "vid", /* voltage id control */
- "ttp", /* thermal trip */
- "tm",
- "stc",
- "100mhzsteps",
- "hwpstate",
- "", /* tsc invariant mapped to constant_tsc */
- /* nothing */
-};
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b75f2569b8f..cce0b6118d5 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
#include <asm/ds.h>
#include <asm/bugs.h>
+#ifdef CONFIG_X86_64
+#include <asm/topology.h>
+#include <asm/numa_64.h>
+#endif
+
#include "cpu.h"
#ifdef CONFIG_X86_LOCAL_APIC
@@ -23,23 +28,22 @@
#include <mach_apic.h>
#endif
-#ifdef CONFIG_X86_INTEL_USERCOPY
-/*
- * Alignment at which movsl is preferred for bulk memory copies.
- */
-struct movsl_mask movsl_mask __read_mostly;
-#endif
-
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{
- /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
- if (c->x86 == 15 && c->x86_cache_alignment == 64)
- c->x86_cache_alignment = 128;
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#else
+ /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
+ if (c->x86 == 15 && c->x86_cache_alignment == 64)
+ c->x86_cache_alignment = 128;
+#endif
}
+#ifdef CONFIG_X86_32
/*
* Early probe support logic for ppro memory erratum #50
*
@@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
return 0;
}
+#ifdef CONFIG_X86_F00F_BUG
+static void __cpuinit trap_init_f00f_bug(void)
+{
+ __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
-/*
- * P4 Xeon errata 037 workaround.
- * Hardware prefetcher may cause stale data to be loaded into the cache.
- */
-static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
+ /*
+ * Update the IDT descriptor and reload the IDT so that
+ * it uses the read-only mapped virtual address.
+ */
+ idt_descr.address = fix_to_virt(FIX_F00F_IDT);
+ load_idt(&idt_descr);
+}
+#endif
+
+static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
{
unsigned long lo, hi;
+#ifdef CONFIG_X86_F00F_BUG
+ /*
+ * All current models of Pentium and Pentium with MMX technology CPUs
+ * have the F0 0F bug, which lets nonprivileged users lock up the system.
+ * Note that the workaround should only be initialized once...
+ */
+ c->f00f_bug = 0;
+ if (!paravirt_enabled() && c->x86 == 5) {
+ static int f00f_workaround_enabled;
+
+ c->f00f_bug = 1;
+ if (!f00f_workaround_enabled) {
+ trap_init_f00f_bug();
+ printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
+ f00f_workaround_enabled = 1;
+ }
+ }
+#endif
+
+ /*
+ * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
+ * model 3 mask 3
+ */
+ if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
+ clear_cpu_cap(c, X86_FEATURE_SEP);
+
+ /*
+ * P4 Xeon errata 037 workaround.
+ * Hardware prefetcher may cause stale data to be loaded into the cache.
+ */
if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
if ((lo & (1<<9)) == 0) {
@@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
}
}
+
+ /*
+ * See if we have a good local APIC by checking for buggy Pentia,
+ * i.e. all B steppings and the C2 stepping of P54C when using their
+ * integrated APIC (see 11AP erratum in "Pentium Processor
+ * Specification Update").
+ */
+ if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+ (c->x86_mask < 0x6 || c->x86_mask == 0xb))
+ set_cpu_cap(c, X86_FEATURE_11AP);
+
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+ /*
+ * Set up the preferred alignment for movsl bulk memory moves
+ */
+ switch (c->x86) {
+ case 4: /* 486: untested */
+ break;
+ case 5: /* Old Pentia: untested */
+ break;
+ case 6: /* PII/PIII only like movsl with 8-byte alignment */
+ movsl_mask.mask = 7;
+ break;
+ case 15: /* P4 is OK down to 8-byte alignment */
+ movsl_mask.mask = 7;
+ break;
+ }
+#endif
+
+#ifdef CONFIG_X86_NUMAQ
+ numaq_tsc_disable();
+#endif
}
+#else
+static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+{
+}
+#endif
+static void __cpuinit srat_detect_node(void)
+{
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+ unsigned node;
+ int cpu = smp_processor_id();
+ int apicid = hard_smp_processor_id();
+
+ /* Don't do the funky fallback heuristics the AMD version employs,
+ for now. */
+ node = apicid_to_node[apicid];
+ if (node == NUMA_NO_NODE || !node_online(node))
+ node = first_node(node_online_map);
+ numa_set_node(cpu, node);
+
+ printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
/*
* find out the number of processor cores on the die
*/
-static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
{
unsigned int eax, ebx, ecx, edx;
@@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
return 1;
}
-#ifdef CONFIG_X86_F00F_BUG
-static void __cpuinit trap_init_f00f_bug(void)
+static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
{
- __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
-
- /*
- * Update the IDT descriptor and reload the IDT so that
- * it uses the read-only mapped virtual address.
- */
- idt_descr.address = fix_to_virt(FIX_F00F_IDT);
- load_idt(&idt_descr);
+ /* Intel VMX MSR indicated features */
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
+
+ u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+
+ clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ clear_cpu_cap(c, X86_FEATURE_VNMI);
+ clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ clear_cpu_cap(c, X86_FEATURE_EPT);
+ clear_cpu_cap(c, X86_FEATURE_VPID);
+
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+ msr_ctl = vmx_msr_high | vmx_msr_low;
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ vmx_msr_low, vmx_msr_high);
+ msr_ctl2 = vmx_msr_high | vmx_msr_low;
+ if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+ (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+ set_cpu_cap(c, X86_FEATURE_VPID);
+ }
}
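
detect_vmx_virtcap() leans on the VMX capability MSR encoding: the low 32 bits are controls that must be 1, the high 32 bits controls that may be 1, so OR-ing the halves yields every bit that can ever end up set. A sketch of the test (the MSR value is invented, not read from hardware):

    #include <stdio.h>
    #include <stdint.h>

    #define PROC_CTLS_TPR_SHADOW 0x00200000u	/* same bit as the patch above */

    int main(void)
    {
        /* made-up rdmsr(MSR_IA32_VMX_PROCBASED_CTLS) result:
         * low dword = bits that must be 1, high dword = bits that may be 1 */
        uint32_t lo = 0x0401e172;
        uint32_t hi = 0xfff9fffe;

        uint32_t ctl = hi | lo;		/* every bit that can be set */
        printf("TPR shadow %savailable\n",
               (ctl & PROC_CTLS_TPR_SHADOW) ? "" : "not ");
        return 0;
    }
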
-#endif
static void __cpuinit init_intel(struct cpuinfo_x86 *c)
{
unsigned int l2 = 0;
- char *p = NULL;
early_init_intel(c);
-#ifdef CONFIG_X86_F00F_BUG
- /*
- * All current models of Pentium and Pentium with MMX technology CPUs
- * have the F0 0F bug, which lets nonprivileged users lock up the system.
- * Note that the workaround only should be initialized once...
- */
- c->f00f_bug = 0;
- if (!paravirt_enabled() && c->x86 == 5) {
- static int f00f_workaround_enabled;
-
- c->f00f_bug = 1;
- if (!f00f_workaround_enabled) {
- trap_init_f00f_bug();
- printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
- f00f_workaround_enabled = 1;
- }
- }
-#endif
+ intel_workarounds(c);
l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
@@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
- /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
- if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
- clear_cpu_cap(c, X86_FEATURE_SEP);
+ if (cpu_has_xmm2)
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ if (cpu_has_ds) {
+ unsigned int l1;
+ rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+ if (!(l1 & (1<<11)))
+ set_cpu_cap(c, X86_FEATURE_BTS);
+ if (!(l1 & (1<<12)))
+ set_cpu_cap(c, X86_FEATURE_PEBS);
+ ds_init_intel(c);
+ }
+#ifdef CONFIG_X86_64
+ if (c->x86 == 15)
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+#else
/*
* Names for the Pentium II/Celeron processors
* detectable only by also checking the cache size.
* Dixon is NOT a Celeron.
*/
if (c->x86 == 6) {
+ char *p = NULL;
+
switch (c->x86_model) {
case 5:
if (c->x86_mask == 0) {
@@ -178,70 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
p = "Celeron (Coppermine)";
break;
}
- }
-
- if (p)
- strcpy(c->x86_model_id, p);
-
- c->x86_max_cores = num_cpu_cores(c);
-
- detect_ht(c);
- /* Work around errata */
- Intel_errata_workarounds(c);
-
-#ifdef CONFIG_X86_INTEL_USERCOPY
- /*
- * Set up the preferred alignment for movsl bulk memory moves
- */
- switch (c->x86) {
- case 4: /* 486: untested */
- break;
- case 5: /* Old Pentia: untested */
- break;
- case 6: /* PII/PIII only like movsl with 8-byte alignment */
- movsl_mask.mask = 7;
- break;
- case 15: /* P4 is OK down to 8-byte alignment */
- movsl_mask.mask = 7;
- break;
+ if (p)
+ strcpy(c->x86_model_id, p);
}
-#endif
- if (cpu_has_xmm2)
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- if (c->x86 == 15) {
+ if (c->x86 == 15)
set_cpu_cap(c, X86_FEATURE_P4);
- }
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_P3);
- if (cpu_has_ds) {
- unsigned int l1;
- rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
- set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
- set_cpu_cap(c, X86_FEATURE_PEBS);
- }
if (cpu_has_bts)
- ds_init_intel(c);
+ ptrace_bts_init_intel(c);
- /*
- * See if we have a good local APIC by checking for buggy Pentia,
- * i.e. all B steppings and the C2 stepping of P54C when using their
- * integrated APIC (see 11AP erratum in "Pentium Processor
- * Specification Update").
- */
- if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
- (c->x86_mask < 0x6 || c->x86_mask == 0xb))
- set_cpu_cap(c, X86_FEATURE_11AP);
+#endif
-#ifdef CONFIG_X86_NUMAQ
- numaq_tsc_disable();
+ detect_extended_topology(c);
+ if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+ /*
+ * let's use the legacy cpuid vectors 0x1 and 0x4 for topology
+ * detection.
+ */
+ c->x86_max_cores = intel_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
#endif
+ }
+
+ /* Work around errata */
+ srat_detect_node();
+
+ if (cpu_has(c, X86_FEATURE_VMX))
+ detect_vmx_virtcap(c);
}
+#ifdef CONFIG_X86_32
static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/*
@@ -254,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
size = 256;
return size;
}
+#endif
static struct cpu_dev intel_cpu_dev __cpuinitdata = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
+#ifdef CONFIG_X86_32
.c_models = {
{ .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
{
@@ -307,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
}
},
},
+ .c_size_cache = intel_size_cache,
+#endif
.c_early_init = early_init_intel,
.c_init = init_intel,
- .c_size_cache = intel_size_cache,
+ .c_x86_vendor = X86_VENDOR_INTEL,
};
-cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
- u8 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u8 *)ptr;
- if (prev == old)
- *(u8 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
- u16 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u16 *)ptr;
- if (prev == old)
- *(u16 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
- u32 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u32 *)ptr;
- if (prev == old)
- *(u32 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
- u64 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u64 *)ptr;
- if (prev == old)
- *(u64 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
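
The deleted cmpxchg_386_* helpers fake compare-and-swap by disabling interrupts, which is only safe on a uniprocessor 386; everything newer has the real instruction, reachable from C via a compiler builtin, as in this sketch:

    #include <stdio.h>

    int main(void)
    {
        unsigned int v = 5;

        /* compiles to lock cmpxchg on i486 and later; the 386 fallback
         * above has to fake this with local_irq_save/restore instead */
        unsigned int prev = __sync_val_compare_and_swap(&v, 5, 9);

        printf("prev=%u now=%u\n", prev, v);
        return 0;
    }
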
-
-/* arch_initcall(intel_cpu_init); */
+cpu_dev_register(intel_cpu_dev);
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 1019c58d39f..00000000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,95 +0,0 @@
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/topology.h>
-#include <asm/numa_64.h>
-
-#include "cpu.h"
-
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
-{
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
- set_cpu_cap(c, X86_FEATURE_SYSENTER32);
-}
-
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
- unsigned int eax, t;
-
- if (c->cpuid_level < 4)
- return 1;
-
- cpuid_count(4, 0, &eax, &t, &t, &t);
-
- if (eax & 0x1f)
- return ((eax >> 26) + 1);
- else
- return 1;
-}
-
-static void __cpuinit srat_detect_node(void)
-{
-#ifdef CONFIG_NUMA
- unsigned node;
- int cpu = smp_processor_id();
- int apicid = hard_smp_processor_id();
-
- /* Don't do the funky fallback heuristics the AMD version employs
- for now. */
- node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE || !node_online(node))
- node = first_node(node_online_map);
- numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-}
-
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
- init_intel_cacheinfo(c);
- if (c->cpuid_level > 9) {
- unsigned eax = cpuid_eax(10);
- /* Check for version and the number of counters */
- if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
- set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
- }
-
- if (cpu_has_ds) {
- unsigned int l1, l2;
- rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
- set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
- set_cpu_cap(c, X86_FEATURE_PEBS);
- }
-
-
- if (cpu_has_bts)
- ds_init_intel(c);
-
- if (c->x86 == 15)
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- c->x86_max_cores = intel_num_cpu_cores(c);
-
- srat_detect_node();
-}
-
-static struct cpu_dev intel_cpu_dev __cpuinitdata = {
- .c_vendor = "Intel",
- .c_ident = { "GenuineIntel" },
- .c_early_init = early_init_intel,
- .c_init = init_intel,
-};
-cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b0a10b002f..3f46afbb1cf 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,8 +1,8 @@
/*
- * Routines to indentify caches on Intel CPU.
+ *	Routines to identify caches on Intel CPU.
*
- * Changes:
- * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
+ * Changes:
+ * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
* Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
* Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
*/
@@ -13,6 +13,7 @@
#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/pci.h>
#include <asm/processor.h>
#include <asm/smp.h>
@@ -130,9 +131,18 @@ struct _cpuid4_info {
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
+ unsigned long can_disable;
cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
};
+#ifdef CONFIG_PCI
+static struct pci_device_id k8_nb_id[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
+ {}
+};
+#endif
+
unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -182,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = {
static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
-static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
- union _cpuid4_leaf_ebx *ebx,
- union _cpuid4_leaf_ecx *ecx)
+static void __cpuinit
+amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
+ union _cpuid4_leaf_ebx *ebx,
+ union _cpuid4_leaf_ecx *ecx)
{
unsigned dummy;
unsigned line_size, lines_per_tag, assoc, size_in_kb;
@@ -251,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
(ebx->split.ways_of_associativity + 1) - 1;
}
-static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+static void __cpuinit
+amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
+{
+ if (index < 3)
+ return;
+ this_leaf->can_disable = 1;
+}
+
+static int
+__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
{
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned edx;
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
amd_cpuid4(index, &eax, &ebx, &ecx);
- else
- cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
+ if (boot_cpu_data.x86 >= 0x10)
+ amd_check_l3_disable(index, this_leaf);
+ } else {
+ cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
+ }
+
if (eax.split.type == CACHE_TYPE_NULL)
return -EIO; /* better error ? */
this_leaf->eax = eax;
this_leaf->ebx = ebx;
this_leaf->ecx = ecx;
- this_leaf->size = (ecx.split.number_of_sets + 1) *
- (ebx.split.coherency_line_size + 1) *
- (ebx.split.physical_line_partition + 1) *
- (ebx.split.ways_of_associativity + 1);
+ this_leaf->size = (ecx.split.number_of_sets + 1) *
+ (ebx.split.coherency_line_size + 1) *
+ (ebx.split.physical_line_partition + 1) *
+ (ebx.split.ways_of_associativity + 1);
return 0;
}
@@ -453,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
/* pointer to _cpuid4_info array (for each cache leaf) */
static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
-#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
+#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
#ifdef CONFIG_SMP
static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -490,7 +514,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
this_leaf = CPUID4_INFO_IDX(cpu, index);
for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
- sibling_leaf = CPUID4_INFO_IDX(sibling, index);
+ sibling_leaf = CPUID4_INFO_IDX(sibling, index);
cpu_clear(cpu, sibling_leaf->shared_cpu_map);
}
}
@@ -572,7 +596,7 @@ struct _index_kobject {
/* pointer to array of kobjects for cpuX/cache/indexY */
static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
-#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
+#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
#define show_one_plus(file_name, object, val) \
static ssize_t show_##file_name \
@@ -637,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
}
}
+#define to_object(k) container_of(k, struct _index_kobject, kobj)
+#define to_attr(a) container_of(a, struct _cache_attr, attr)
+
+#ifdef CONFIG_PCI
+static struct pci_dev *get_k8_northbridge(int node)
+{
+ struct pci_dev *dev = NULL;
+ int i;
+
+ for (i = 0; i <= node; i++) {
+ do {
+ dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+ if (!dev)
+ break;
+ } while (!pci_match_id(&k8_nb_id[0], dev));
+ if (!dev)
+ break;
+ }
+ return dev;
+}
+#else
+static struct pci_dev *get_k8_northbridge(int node)
+{
+ return NULL;
+}
+#endif
+
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
+{
+ int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+ struct pci_dev *dev = NULL;
+ ssize_t ret = 0;
+ int i;
+
+ if (!this_leaf->can_disable)
+ return sprintf(buf, "Feature not enabled\n");
+
+ dev = get_k8_northbridge(node);
+ if (!dev) {
+ printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
+ return -EINVAL;
+ }
+
+ for (i = 0; i < 2; i++) {
+ unsigned int reg;
+
+ pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
+
+ ret += sprintf(buf, "%sEntry: %d\n", buf, i);
+ ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n",
+ buf,
+ reg & 0x80000000 ? "Disabled" : "Allowed",
+ reg & 0x40000000 ? "Disabled" : "Allowed");
+ ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
+ buf, (reg & 0x30000) >> 16, reg & 0xfff);
+ }
+ return ret;
+}
+
+static ssize_t
+store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
+ size_t count)
+{
+ int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+ struct pci_dev *dev = NULL;
+ unsigned int ret, index, val;
+
+ if (!this_leaf->can_disable)
+ return 0;
+
+ if (strlen(buf) > 15)
+ return -EINVAL;
+
+ ret = sscanf(buf, "%x %x", &index, &val);
+ if (ret != 2)
+ return -EINVAL;
+ if (index > 1)
+ return -EINVAL;
+
+ val |= 0xc0000000;
+ dev = get_k8_northbridge(node);
+ if (!dev) {
+ printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
+ return -EINVAL;
+ }
+
+ pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
+ wbinvd();
+ pci_write_config_dword(dev, 0x1BC + index * 4, val);
+
+ return 1;
+}
+
struct _cache_attr {
struct attribute attr;
ssize_t (*show)(struct _cpuid4_info *, char *);
@@ -657,6 +774,8 @@ define_one_ro(size);
define_one_ro(shared_cpu_map);
define_one_ro(shared_cpu_list);
+static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
+
static struct attribute * default_attrs[] = {
&type.attr,
&level.attr,
@@ -667,12 +786,10 @@ static struct attribute * default_attrs[] = {
&size.attr,
&shared_cpu_map.attr,
&shared_cpu_list.attr,
+ &cache_disable.attr,
NULL
};
-#define to_object(k) container_of(k, struct _index_kobject, kobj)
-#define to_attr(a) container_of(a, struct _cache_attr, attr)
-
static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
{
struct _cache_attr *fattr = to_attr(attr);
@@ -682,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
ret = fattr->show ?
fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
buf) :
- 0;
+ 0;
return ret;
}
static ssize_t store(struct kobject * kobj, struct attribute * attr,
const char * buf, size_t count)
{
- return 0;
+ struct _cache_attr *fattr = to_attr(attr);
+ struct _index_kobject *this_leaf = to_object(kobj);
+ ssize_t ret;
+
+ ret = fattr->store ?
+ fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
+ buf, count) :
+ 0;
+ return ret;
}
static struct sysfs_ops sysfs_ops = {
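
The cache_disable attribute added above parses two hex fields, "<entry> <value>" with entry 0 or 1, forces bits 31:30 on, and writes the result to the K8 northbridge config register at 0x1BC + entry * 4. A hedged user-space sketch follows; the cpu0/index3 sysfs path is an assumption for a machine where the L3 leaf is index 3 and can_disable is set (family 0x10 with a matching northbridge), and the write needs root:

/*
 * Standalone sketch, not from the patch: drive the new cache_disable
 * attribute from user space. Path and values are illustrative only.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/devices/system/cpu/cpu0/cache/index3/cache_disable";
	const char *cmd = "0 7ff\n";	/* entry 0, value 0x7ff (both hex) */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);

	return 0;
}
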
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index f390c9f6635..dd3af6e7b39 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -1,6 +1,6 @@
/*
- * Athlon/Hammer specific Machine Check Exception Reporting
- * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk>
+ * Athlon specific Machine Check Exception Reporting
+ * (C) Copyright 2002 Dave Jones <davej@redhat.com>
*/
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 774d87cfd8c..0ebf3fc6a61 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -1,6 +1,6 @@
/*
* mce.c - x86 Machine Check Exception Reporting
- * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
+ * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com>
*/
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 726a5fcdf34..4b031a4ac85 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -860,7 +860,7 @@ error:
return err;
}
-static void mce_remove_device(unsigned int cpu)
+static __cpuinit void mce_remove_device(unsigned int cpu)
{
int i;
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index cc1fccdd31e..a74af128efc 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -1,7 +1,7 @@
/*
* Non Fatal Machine Check Exception Reporting
*
- * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk>
+ * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
*
* This file contains routines to check for non-fatal MCEs every 15s
*
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
new file mode 100644
index 00000000000..dfea390e160
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+#
+# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
+#
+
+($in, $out) = @ARGV;
+
+open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
+open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
+
+print OUT "#include <asm/cpufeature.h>\n\n";
+print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
+
+while (defined($line = <IN>)) {
+ if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
+ $macro = $1;
+ $feature = $2;
+ $tail = $3;
+ if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
+ $feature = $1;
+ }
+
+ if ($feature ne '') {
+ printf OUT "\t%-32s = \"%s\",\n",
+ "[$macro]", "\L$feature";
+ }
+ }
+}
+print OUT "};\n";
+
+close(IN);
+close(OUT);
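
The script scans cpufeature.h for X86_FEATURE_* defines, prefers a quoted name from a trailing /* "name" ... */ comment, lowercases it, and emits one designated initializer per flag. The shape of the generated table, sketched with local stand-in macros so the fragment compiles on its own (the real output pulls the macros from <asm/cpufeature.h> instead):

/* Illustrative only: NCAPINTS and the two feature macros are stand-ins. */
#define NCAPINTS	9
#define X86_FEATURE_FPU	(0*32 + 0)
#define X86_FEATURE_PSE	(0*32 + 3)

const char * const x86_cap_flags[NCAPINTS*32] = {
	[X86_FEATURE_FPU]		 = "fpu",
	[X86_FEATURE_PSE]		 = "pse",
};
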
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index cb7d3b6a80e..4e8d77f01ee 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -401,12 +401,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
tmp |= ~((1<<(hi - 1)) - 1);
if (tmp != mask_lo) {
- static int once = 1;
-
- if (once) {
- printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
- once = 0;
- }
+ WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
mask_lo = tmp;
}
}
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb371..4c4214690dd 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
}
/* RED-PEN: base can be > 32bit */
len += seq_printf(seq,
- "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
+ "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
i, base, base >> (20 - PAGE_SHIFT), size, factor,
- mtrr_attrib_to_str(type), mtrr_usage_table[i]);
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
}
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index b117d7f8a56..c78c04821ea 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -729,7 +729,7 @@ struct var_mtrr_range_state {
mtrr_type type;
};
-struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
static int __init
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
/* take out UC ranges */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
- if (type != MTRR_TYPE_UNCACHABLE)
+ if (type != MTRR_TYPE_UNCACHABLE &&
+ type != MTRR_TYPE_WRPROT)
continue;
size = range_state[i].size_pfn;
if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
enable_mtrr_cleanup = 1;
return 0;
}
-early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup);
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+ debug_print = 1;
+ return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
struct var_mtrr_state {
unsigned long range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
}
}
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+ char factor;
+ unsigned long base = sizek;
+
+ if (base & ((1<<10) - 1)) {
+ /* not MB alignment */
+ factor = 'K';
+	} else if (base & ((1<<20) - 1)) {
+ factor = 'M';
+ base >>= 10;
+ } else {
+ factor = 'G';
+ base >>= 20;
+ }
+
+ *factorp = factor;
+
+ return base;
+}
+
static unsigned int __init
range_to_mtrr(unsigned int reg, unsigned long range_startk,
unsigned long range_sizek, unsigned char type)
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
align = max_align;
sizek = 1 << align;
- if (debug_print)
+ if (debug_print) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ start_base = to_size_factor(range_startk, &start_factor),
+ size_base = to_size_factor(sizek, &size_factor),
+
printk(KERN_DEBUG "Setting variable MTRR %d, "
- "base: %ldMB, range: %ldMB, type %s\n",
- reg, range_startk >> 10, sizek >> 10,
+ "base: %ld%cB, range: %ld%cB, type %s\n",
+ reg, start_base, start_factor,
+ size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE)?"UC":
((type == MTRR_TYPE_WRBACK)?"WB":"Other")
);
+ }
save_var_mtrr(reg++, range_startk, sizek, type);
range_startk += sizek;
range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
/* try to append some small hole */
range0_basek = state->range_startk;
range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+ /* no increase */
if (range0_sizek == state->range_sizek) {
if (debug_print)
printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
return 0;
}
- range0_sizek -= chunk_sizek;
- if (range0_sizek && sizek) {
- while (range0_basek + range0_sizek > (basek + sizek)) {
- range0_sizek -= chunk_sizek;
- if (!range0_sizek)
- break;
- }
+ /* only cut back, when it is not the last */
+ if (sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ if (range0_sizek >= chunk_sizek)
+ range0_sizek -= chunk_sizek;
+ else
+ range0_sizek = 0;
+
+ if (!range0_sizek)
+ break;
+ }
+ }
+
+second_try:
+ range_basek = range0_basek + range0_sizek;
+
+ /* one hole in the middle */
+ if (range_basek > basek && range_basek <= (basek + sizek))
+ second_sizek = range_basek - basek;
+
+ if (range0_sizek > state->range_sizek) {
+
+ /* one hole in middle or at end */
+ hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+ /* hole size should be less than half of range0 size */
+ if (hole_sizek >= (range0_sizek >> 1) &&
+ range0_sizek >= chunk_sizek) {
+ range0_sizek -= chunk_sizek;
+ second_sizek = 0;
+ hole_sizek = 0;
+
+ goto second_try;
+ }
}
if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
(range0_basek + range0_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
range0_sizek, MTRR_TYPE_WRBACK);
-
- }
-
- range_basek = range0_basek + range0_sizek;
- range_sizek = chunk_sizek;
-
- if (range_basek + range_sizek > basek &&
- range_basek + range_sizek <= (basek + sizek)) {
- /* one hole */
- second_basek = basek;
- second_sizek = range_basek + range_sizek - basek;
}
- /* if last piece, only could one hole near end */
- if ((second_basek || !basek) &&
- range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
- (chunk_sizek >> 1)) {
- /*
- * one hole in middle (second_sizek is 0) or at end
- * (second_sizek is 0 )
- */
- hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
- - second_sizek;
- hole_basek = range_basek + range_sizek - hole_sizek
- - second_sizek;
- } else {
- /* fallback for big hole, or several holes */
+ if (range0_sizek < state->range_sizek) {
+ /* need to handle left over */
range_sizek = state->range_sizek - range0_sizek;
- second_basek = 0;
- second_sizek = 0;
+
+ if (debug_print)
+ printk(KERN_DEBUG "range: %016lx - %016lx\n",
+ range_basek<<10,
+ (range_basek + range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range_basek,
+ range_sizek, MTRR_TYPE_WRBACK);
}
- if (debug_print)
- printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
- (range_basek + range_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
- MTRR_TYPE_WRBACK);
if (hole_sizek) {
+ hole_basek = range_basek - hole_sizek - second_sizek;
if (debug_print)
printk(KERN_DEBUG "hole: %016lx - %016lx\n",
- hole_basek<<10, (hole_basek + hole_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
- MTRR_TYPE_UNCACHABLE);
-
+ hole_basek<<10,
+ (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek,
+ hole_sizek, MTRR_TYPE_UNCACHABLE);
}
return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
};
/*
- * gran_size: 1M, 2M, ..., 2G
- * chunk size: gran_size, ..., 4G
- * so we need (2+13)*6
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
*/
-#define NUM_RESULT 90
+#define NUM_RESULT 136
#define PSHIFT (PAGE_SHIFT - 10)
static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
static int __init mtrr_cleanup(unsigned address_bits)
{
unsigned long extra_remove_base, extra_remove_size;
- unsigned long i, base, size, def, dummy;
+ unsigned long base, size, def, dummy;
mtrr_type type;
int nr_range, nr_range_new;
u64 chunk_size, gran_size;
unsigned long range_sums, range_sums_new;
int index_good;
int num_reg_good;
+ int i;
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
continue;
if (!size)
type = MTRR_NUM_TYPES;
+ if (type == MTRR_TYPE_WRPROT)
+ type = MTRR_TYPE_UNCACHABLE;
num[type]++;
}
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
+ /* print original var MTRRs at first, for debugging: */
+ printk(KERN_DEBUG "original variable MTRRs\n");
+ for (i = 0; i < num_var_ranges; i++) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+ if (!size_base)
+ continue;
+
+ size_base = to_size_factor(size_base, &size_factor),
+ start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+ start_base = to_size_factor(start_base, &start_factor),
+ type = range_state[i].type;
+
+ printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ i, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRPROT) ? "WP" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+ );
+ }
+
memset(range, 0, sizeof(range));
extra_remove_size = 0;
- if (mtrr_tom2) {
- extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ if (mtrr_tom2)
extra_remove_size =
(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
- }
nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
extra_remove_size);
+ /*
+	 * [0, 1M) should always be covered by var mtrr with WB
+	 * and fixed mtrrs should take effect before var mtrr for it
+ */
+ nr_range = add_range_with_merge(range, nr_range, 0,
+ (1ULL<<(20 - PAGE_SHIFT)) - 1);
+ /* sort the ranges */
+ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
range_sums = sum_ranges(range, nr_range);
 	printk(KERN_INFO "total RAM covered: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
int num_reg;
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
- debug_print = 1;
+ debug_print++;
/* convert ranges to var ranges state */
num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
result[i].lose_cover_sizek =
(range_sums - range_sums_new) << PSHIFT;
- printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
- result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n",
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad?"*BAD*":" ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
result[i].num_reg, result[i].bad?"-":"",
- result[i].lose_cover_sizek >> 10);
+ lose_base, lose_factor);
if (!result[i].bad) {
set_var_mtrr_all(address_bits);
return 1;
}
printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
"will find optimal one\n");
- debug_print = 0;
+ debug_print--;
memset(result, 0, sizeof(result[0]));
}
i = 0;
memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
memset(result, 0, sizeof(result));
- for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
- for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+ for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+ char gran_factor;
+ unsigned long gran_base;
+
+ if (debug_print)
+ gran_base = to_size_factor(gran_size >> 10, &gran_factor);
+
+ for (chunk_size = gran_size; chunk_size < (1ULL<<32);
chunk_size <<= 1) {
int num_reg;
- if (debug_print)
- printk(KERN_INFO
- "\ngran_size: %lldM chunk_size_size: %lldM\n",
- gran_size >> 20, chunk_size >> 20);
+ if (debug_print) {
+ char chunk_factor;
+ unsigned long chunk_base;
+
+ chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
+ printk(KERN_INFO "\n");
+ printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ }
if (i >= NUM_RESULT)
continue;
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
/* print out all */
for (i = 0; i < NUM_RESULT; i++) {
- printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
- result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
- result[i].num_reg, result[i].bad?"-":"",
- result[i].lose_cover_sizek >> 10);
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad?"*BAD*":" ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ result[i].num_reg, result[i].bad?"-":"",
+ lose_base, lose_factor);
}
/* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
nr_mtrr_spare_reg = num_var_ranges - 1;
num_reg_good = -1;
for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
- if (!min_loss_pfn[i]) {
+ if (!min_loss_pfn[i])
num_reg_good = i;
- break;
- }
}
index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
}
if (index_good != -1) {
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
i = index_good;
- printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
- result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
- result[i].num_reg,
- result[i].lose_cover_sizek >> 10);
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
+ result[i].num_reg, lose_base, lose_factor);
/* convert ranges to var ranges state */
chunk_size = result[i].chunk_sizek;
chunk_size <<= 10;
gran_size = result[i].gran_sizek;
gran_size <<= 10;
- debug_print = 1;
+ debug_print++;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+ debug_print--;
set_var_mtrr_all(address_bits);
return 1;
}
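
Several hunks above funnel their size reporting through the new to_size_factor() helper, which scales a kilobyte count to the largest of K/M/G that divides it evenly. A standalone check, with the helper lifted from the patch:

/* Standalone sketch: to_size_factor() as added above, driven from user space. */
#include <stdio.h>

static unsigned long to_size_factor(unsigned long sizek, char *factorp)
{
	char factor;
	unsigned long base = sizek;

	if (base & ((1<<10) - 1)) {
		/* not MB-aligned */
		factor = 'K';
	} else if (base & ((1<<20) - 1)) {
		factor = 'M';
		base >>= 10;
	} else {
		factor = 'G';
		base >>= 20;
	}

	*factorp = factor;
	return base;
}

int main(void)
{
	unsigned long sizes[] = { 512, 64 * 1024, 1024 * 1024 };
	unsigned int i;

	for (i = 0; i < 3; i++) {
		char factor;
		unsigned long base = to_size_factor(sizes[i], &factor);

		printf("%lu KB -> %lu%cB\n", sizes[i], base, factor);
	}
	return 0;	/* prints 512K, 64M, 1G */
}
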
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 05cc22dbd4f..9abd48b2267 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -17,6 +17,8 @@
#include <linux/bitops.h>
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <linux/kprobes.h>
+
#include <asm/apic.h>
#include <asm/intel_arch_perfmon.h>
@@ -295,13 +297,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
+ /* initialize the wd struct before enabling */
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= K7_EVNTSEL_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
return 1;
}
@@ -330,7 +338,8 @@ static void single_msr_unreserve(void)
release_perfctr_nmi(wd_ops->perfctr);
}
-static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes
+single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
/* start the cycle over again */
write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
@@ -379,17 +388,23 @@ static int setup_p6_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
+ /* initialize the wd struct before enabling */
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= P6_EVNTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
return 1;
}
-static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
/*
* P6 based Pentium M need to re-unmask
@@ -432,6 +447,27 @@ static const struct wd_ops p6_wd_ops = {
#define P4_CCCR_ENABLE (1 << 12)
#define P4_CCCR_OVF (1 << 31)
+#define P4_CONTROLS 18
+static unsigned int p4_controls[18] = {
+ MSR_P4_BPU_CCCR0,
+ MSR_P4_BPU_CCCR1,
+ MSR_P4_BPU_CCCR2,
+ MSR_P4_BPU_CCCR3,
+ MSR_P4_MS_CCCR0,
+ MSR_P4_MS_CCCR1,
+ MSR_P4_MS_CCCR2,
+ MSR_P4_MS_CCCR3,
+ MSR_P4_FLAME_CCCR0,
+ MSR_P4_FLAME_CCCR1,
+ MSR_P4_FLAME_CCCR2,
+ MSR_P4_FLAME_CCCR3,
+ MSR_P4_IQ_CCCR0,
+ MSR_P4_IQ_CCCR1,
+ MSR_P4_IQ_CCCR2,
+ MSR_P4_IQ_CCCR3,
+ MSR_P4_IQ_CCCR4,
+ MSR_P4_IQ_CCCR5,
+};
/*
* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
* CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,6 +509,26 @@ static int setup_p4_watchdog(unsigned nmi_hz)
evntsel_msr = MSR_P4_CRU_ESCR0;
cccr_msr = MSR_P4_IQ_CCCR0;
cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+
+ /*
+ * If we're on the kdump kernel or other situation, we may
+ * still have other performance counter registers set to
+ * interrupt and they'll keep interrupting forever because
+ * of the P4_CCCR_OVF quirk. So we need to ACK all the
+ * pending interrupts and disable all the registers here,
+ * before reenabling the NMI delivery. Refer to p4_rearm()
+ * about the P4_CCCR_OVF quirk.
+ */
+ if (reset_devices) {
+ unsigned int low, high;
+ int i;
+
+ for (i = 0; i < P4_CONTROLS; i++) {
+ rdmsr(p4_controls[i], low, high);
+ low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
+ wrmsr(p4_controls[i], low, high);
+ }
+ }
} else {
/* logical cpu 1 */
perfctr_msr = MSR_P4_IQ_PERFCTR1;
@@ -499,12 +555,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
wrmsr(cccr_msr, cccr_val, 0);
write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
+
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = cccr_msr;
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ cccr_val |= P4_CCCR_ENABLE;
+ wrmsr(cccr_msr, cccr_val, 0);
return 1;
}
@@ -547,7 +608,7 @@ static void p4_unreserve(void)
release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
}
-static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
unsigned dummy;
/*
@@ -620,13 +681,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
return 1;
}
@@ -722,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz)
return hz;
}
-int lapic_wd_event(unsigned nmi_hz)
+int __kprobes lapic_wd_event(unsigned nmi_hz)
{
struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
u64 ctr;
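
All four watchdog setup paths above get the same reordering: fully initialize the nmi_watchdog_ctlblk, announce readiness via cpu_nmi_set_wd_enabled(), and only then unmask APIC_LVTPC and set the enable bit, so an early NMI can never observe half-initialized state. A rough user-space analogue of that publish-before-enable ordering, with SIGALRM standing in for the NMI:

/*
 * Analogue only, not kernel code: publish the handler's state before
 * unmasking the interrupt source, as the watchdog setup paths now do.
 */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t wd_enabled;
static volatile long perfctr;	/* stand-in for wd->perfctr_msr */

static void handler(int sig)
{
	(void)sig;
	if (!wd_enabled)	/* not announced yet: state may be raw */
		return;
	perfctr = -42;		/* safe: setup finished before enable */
}

int main(void)
{
	signal(SIGALRM, handler);

	perfctr = 0;		/* 1. initialize everything the handler uses */
	wd_enabled = 1;		/* 2. announce that we are set */
	alarm(1);		/* 3. only now unmask the "interrupt" */

	pause();
	printf("perfctr = %ld\n", (long)perfctr);
	return 0;
}
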
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 00000000000..5abbea297e0
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,20 @@
+/*
+ * Strings for the various x86 power flags
+ *
+ * This file must not contain any executable code.
+ */
+
+#include <asm/cpufeature.h>
+
+const char *const x86_power_flags[32] = {
+ "ts", /* temperature sensor */
+ "fid", /* frequency id control */
+ "vid", /* voltage id control */
+ "ttp", /* thermal trip */
+ "tm",
+ "stc",
+ "100mhzsteps",
+ "hwpstate",
+ "", /* tsc invariant mapped to constant_tsc */
+ /* nothing */
+};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index a26c480b949..01b1244ef1c 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,14 +160,16 @@ static void *c_start(struct seq_file *m, loff_t *pos)
{
if (*pos == 0) /* just in case, cpu 0 is not the first */
*pos = first_cpu(cpu_online_map);
- if ((*pos) < nr_cpu_ids && cpu_online(*pos))
+ else
+ *pos = next_cpu_nr(*pos - 1, cpu_online_map);
+ if ((*pos) < nr_cpu_ids)
return &cpu_data(*pos);
return NULL;
}
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
- *pos = next_cpu(*pos, cpu_online_map);
+ (*pos)++;
return c_start(m, pos);
}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index b911a2c61b8..52b3fefbd5a 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
#include <asm/msr.h>
#include "cpu.h"
+static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
+{
+ u32 xlvl;
+
+ /* Transmeta-defined flags: level 0x80860001 */
+ xlvl = cpuid_eax(0x80860000);
+ if ((xlvl & 0xffff0000) == 0x80860000) {
+ if (xlvl >= 0x80860001)
+ c->x86_capability[2] = cpuid_edx(0x80860001);
+ }
+}
+
static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
{
unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
char cpu_info[65];
- get_model_name(c); /* Same as AMD/Cyrix */
+ early_init_transmeta(c);
+
display_cacheinfo(c);
/* Print CMS and CPU revision */
@@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
#endif
}
-static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
-{
- u32 xlvl;
-
- /* Transmeta-defined flags: level 0x80860001 */
- xlvl = cpuid_eax(0x80860000);
- if ((xlvl & 0xffff0000) == 0x80860000) {
- if (xlvl >= 0x80860001)
- c->x86_capability[2] = cpuid_edx(0x80860001);
- }
-}
-
static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
+ .c_early_init = early_init_transmeta,
.c_init = init_transmeta,
- .c_identify = transmeta_identify,
+ .c_x86_vendor = X86_VENDOR_TRANSMETA,
};
-cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev);
+cpu_dev_register(transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index b1fc90989d7..e777f79e096 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = {
}
},
},
+ .c_x86_vendor = X86_VENDOR_UMC,
};
-cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev);
+cpu_dev_register(umc_cpu_dev);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8e9cd6a8ec1..72cefd1e649 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
-#include <linux/smp_lock.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
@@ -148,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu)
{
struct device *dev;
- dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu),
- NULL, "cpu%d", cpu);
+ dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
+ "cpu%d", cpu);
return IS_ERR(dev) ? PTR_ERR(dev) : 0;
}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 72d0c56c1b4..f7cdb3b457a 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,6 +13,9 @@
static void *kdump_buf_page;
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
/**
* copy_oldmem_page - copy one page from "oldmem"
* @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a4..045b36cada6 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,11 @@
#include <linux/errno.h>
#include <linux/crash_dump.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
/**
* copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +27,7 @@
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+ size_t csize, unsigned long offset, int userbuf)
{
void *vaddr;
@@ -33,14 +35,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
return 0;
vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!vaddr)
+ return -ENOMEM;
if (userbuf) {
- if (copy_to_user(buf, (vaddr + offset), csize)) {
+ if (copy_to_user(buf, vaddr + offset, csize)) {
iounmap(vaddr);
return -EFAULT;
}
} else
- memcpy(buf, (vaddr + offset), csize);
+ memcpy(buf, vaddr + offset, csize);
iounmap(vaddr);
return csize;
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index a47798b59f0..b4f14c6c09d 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
- .__cr3 = __pa(swapper_pg_dir)
+ .__cr3 = __pa_nodebug(swapper_pg_dir),
}
};
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48..2b69994fd3a 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
* Debug Store support
*
* This provides a low-level interface to the hardware's Debug Store
- * feature that is used for last branch recording (LBR) and
+ * feature that is used for branch trace store (BTS) and
* precise-event based sampling (PEBS).
*
- * Different architectures use a different DS layout/pointer size.
- * The below functions therefore work on a void*.
+ * It manages:
+ * - per-thread and per-cpu allocation of BTS and PEBS
+ * - buffer memory allocation (optional)
+ * - buffer overflow handling
+ * - buffer access
*
+ * It assumes:
+ * - get_task_struct on all parameter tasks
+ * - current is allowed to trace parameter tasks
*
- * Since there is no user for PEBS, yet, only LBR (or branch
- * trace store, BTS) is supported.
*
- *
- * Copyright (C) 2007 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
+ * Copyright (C) 2007-2008 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
*/
+
+#ifdef CONFIG_X86_DS
+
#include <asm/ds.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+
+/*
+ * The configuration for a particular DS hardware implementation.
+ */
+struct ds_configuration {
+ /* the size of the DS structure in bytes */
+ unsigned char sizeof_ds;
+ /* the size of one pointer-typed field in the DS structure in bytes;
+ this covers the first 8 fields related to buffer management. */
+ unsigned char sizeof_field;
+ /* the size of a BTS/PEBS record in bytes */
+ unsigned char sizeof_rec[2];
+};
+static struct ds_configuration ds_cfg;
/*
@@ -44,378 +67,747 @@
* (interrupt occurs when write pointer passes interrupt pointer)
* - value to which counter is reset following counter overflow
*
- * On later architectures, the last branch recording hardware uses
- * 64bit pointers even in 32bit mode.
- *
- *
- * Branch Trace Store (BTS) records store information about control
- * flow changes. They at least provide the following information:
- * - source linear address
- * - destination linear address
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
*
- * Netburst supported a predicated bit that had been dropped in later
- * architectures. We do not suppor it.
*
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ * - an offset giving the start of the respective region
*
- * In order to abstract from the actual DS and BTS layout, we describe
- * the access to the relevant fields.
- * Thanks to Andi Kleen for proposing this design.
+ * This offset is further used to index various arrays holding
+ * information for BTS and PEBS at the respective index.
*
- * The implementation, however, is not as general as it might seem. In
- * order to stay somewhat simple and efficient, we assume an
- * underlying unsigned type (mostly a pointer type) and we expect the
- * field to be at least as big as that type.
+ * On later 32bit processors, we only access the lower 32bit of the
+ * 64bit pointer fields. The upper halves will be zeroed out.
*/
-/*
- * A special from_ip address to indicate that the BTS record is an
- * info record that needs to be interpreted or skipped.
- */
-#define BTS_ESCAPE_ADDRESS (-1)
+enum ds_field {
+ ds_buffer_base = 0,
+ ds_index,
+ ds_absolute_maximum,
+ ds_interrupt_threshold,
+};
-/*
- * A field access descriptor
- */
-struct access_desc {
- unsigned char offset;
- unsigned char size;
+enum ds_qualifier {
+ ds_bts = 0,
+ ds_pebs
};
+static inline unsigned long ds_get(const unsigned char *base,
+ enum ds_qualifier qual, enum ds_field field)
+{
+ base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ return *(unsigned long *)base;
+}
+
+static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
+ enum ds_field field, unsigned long value)
+{
+ base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ (*(unsigned long *)base) = value;
+}
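
With four buffer-management fields per qualifier, a field's address is base + sizeof_field * (field + 4 * qual); for 8-byte fields the PEBS index, for example, lands at offset 8 * (1 + 4) = 40. A self-contained sketch of the same addressing, using memcpy instead of the patch's unaligned pointer cast:

/* Standalone sketch of the ds_get()/ds_set() addressing scheme above. */
#include <stdio.h>
#include <string.h>

enum ds_field { ds_buffer_base = 0, ds_index, ds_absolute_maximum,
		ds_interrupt_threshold };
enum ds_qualifier { ds_bts = 0, ds_pebs };

#define SIZEOF_FIELD	8	/* assumed 64bit field size */

static unsigned char ds_area[SIZEOF_FIELD * 4 * 2];	/* 4 fields, 2 qualifiers */

static void ds_set(unsigned char *base, enum ds_qualifier qual,
		   enum ds_field field, unsigned long value)
{
	memcpy(base + SIZEOF_FIELD * (field + 4 * qual), &value, sizeof(value));
}

static unsigned long ds_get(const unsigned char *base, enum ds_qualifier qual,
			    enum ds_field field)
{
	unsigned long value;

	memcpy(&value, base + SIZEOF_FIELD * (field + 4 * qual), sizeof(value));
	return value;
}

int main(void)
{
	ds_set(ds_area, ds_pebs, ds_index, 0xdeadbeefUL);
	printf("pebs index at offset %d, value %#lx\n",
	       SIZEOF_FIELD * (ds_index + 4 * ds_pebs),
	       ds_get(ds_area, ds_pebs, ds_index));
	return 0;
}
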
+
+
/*
- * The configuration for a particular DS/BTS hardware implementation.
+ * Locking is done only for allocating BTS or PEBS resources and for
+ * guarding context and buffer memory allocation.
+ *
+ * Most functions require the current task to own the ds context part
+ * they are going to access. All the locking is done when validating
+ * access to the context.
*/
-struct ds_configuration {
- /* the DS configuration */
- unsigned char sizeof_ds;
- struct access_desc bts_buffer_base;
- struct access_desc bts_index;
- struct access_desc bts_absolute_maximum;
- struct access_desc bts_interrupt_threshold;
- /* the BTS configuration */
- unsigned char sizeof_bts;
- struct access_desc from_ip;
- struct access_desc to_ip;
- /* BTS variants used to store additional information like
- timestamps */
- struct access_desc info_type;
- struct access_desc info_data;
- unsigned long debugctl_mask;
-};
+static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
/*
- * The global configuration used by the below accessor functions
+ * Validate that the current task is allowed to access the BTS/PEBS
+ * buffer of the parameter task.
+ *
+ * Returns 0, if access is granted; -Eerrno, otherwise.
*/
-static struct ds_configuration ds_cfg;
+static inline int ds_validate_access(struct ds_context *context,
+ enum ds_qualifier qual)
+{
+ if (!context)
+ return -EPERM;
+
+ if (context->owner[qual] == current)
+ return 0;
+
+ return -EPERM;
+}
+
/*
- * Accessor functions for some DS and BTS fields using the above
- * global ptrace_bts_cfg.
+ * We either support (system-wide) per-cpu or per-thread allocation.
+ * We distinguish the two based on the task_struct pointer, where a
+ * NULL pointer indicates per-cpu allocation for the current cpu.
+ *
+ * Allocations are use-counted. As soon as resources are allocated,
+ * further allocations must be of the same type (per-cpu or
+ * per-thread). We model this by counting allocations (i.e. the number
+ * of tracers of a certain type) for one type negatively:
+ * =0 no tracers
+ * >0 number of per-thread tracers
+ * <0 number of per-cpu tracers
+ *
+ * The below functions to get and put tracers and to check the
+ * allocation type require the ds_lock to be held by the caller.
+ *
+ * Tracers essentially give the number of ds contexts for a certain
+ * type of allocation.
*/
-static inline unsigned long get_bts_buffer_base(char *base)
+static long tracers;
+
+static inline void get_tracer(struct task_struct *task)
{
- return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset);
+ tracers += (task ? 1 : -1);
}
-static inline void set_bts_buffer_base(char *base, unsigned long value)
+
+static inline void put_tracer(struct task_struct *task)
{
- (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value;
+ tracers -= (task ? 1 : -1);
}
-static inline unsigned long get_bts_index(char *base)
+
+static inline int check_tracer(struct task_struct *task)
{
- return *(unsigned long *)(base + ds_cfg.bts_index.offset);
+ return (task ? (tracers >= 0) : (tracers <= 0));
}
-static inline void set_bts_index(char *base, unsigned long value)
+
+
+/*
+ * The DS context is either attached to a thread or to a cpu:
+ * - in the former case, the thread_struct contains a pointer to the
+ * attached context.
+ * - in the latter case, we use a static array of per-cpu context
+ * pointers.
+ *
+ * Contexts are use-counted. They are allocated on first access and
+ * deallocated when the last user puts the context.
+ *
+ * We distinguish between an allocating and a non-allocating get of a
+ * context:
+ * - the allocating get is used for requesting BTS/PEBS resources. It
+ * requires the caller to hold the global ds_lock.
+ * - the non-allocating get is used for all other cases. A
+ * non-existing context indicates an error. It acquires and releases
+ * the ds_lock itself for obtaining the context.
+ *
+ * A context and its DS configuration are allocated and deallocated
+ * together. A context always has a DS configuration of the
+ * appropriate size.
+ */
+static DEFINE_PER_CPU(struct ds_context *, system_context);
+
+#define this_system_context per_cpu(system_context, smp_processor_id())
+
+/*
+ * Returns the pointer to the parameter task's context or to the
+ * system-wide context, if task is NULL.
+ *
+ * Increases the use count of the returned context, if not NULL.
+ */
+static inline struct ds_context *ds_get_context(struct task_struct *task)
{
- (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value;
+ struct ds_context *context;
+
+ spin_lock(&ds_lock);
+
+ context = (task ? task->thread.ds_ctx : this_system_context);
+ if (context)
+ context->count++;
+
+ spin_unlock(&ds_lock);
+
+ return context;
}
-static inline unsigned long get_bts_absolute_maximum(char *base)
+
+/*
+ * Same as ds_get_context, but allocates the context and its DS
+ * structure, if necessary; returns NULL, if out of memory.
+ *
+ * pre: requires ds_lock to be held
+ */
+static inline struct ds_context *ds_alloc_context(struct task_struct *task)
{
- return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset);
+ struct ds_context **p_context =
+ (task ? &task->thread.ds_ctx : &this_system_context);
+ struct ds_context *context = *p_context;
+
+ if (!context) {
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+
+ if (!context)
+ return NULL;
+
+ context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
+ if (!context->ds) {
+ kfree(context);
+ return NULL;
+ }
+
+ *p_context = context;
+
+ context->this = p_context;
+ context->task = task;
+
+ if (task)
+ set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
+
+ if (!task || (task == current))
+ wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
+
+ get_tracer(task);
+ }
+
+ context->count++;
+
+ return context;
}
-static inline void set_bts_absolute_maximum(char *base, unsigned long value)
+
+/*
+ * Decreases the use count of the parameter context, if not NULL.
+ * Deallocates the context, if the use count reaches zero.
+ */
+static inline void ds_put_context(struct ds_context *context)
{
- (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
+ if (!context)
+ return;
+
+ spin_lock(&ds_lock);
+
+ if (--context->count)
+ goto out;
+
+ *(context->this) = NULL;
+
+ if (context->task)
+ clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+
+ if (!context->task || (context->task == current))
+ wrmsrl(MSR_IA32_DS_AREA, 0);
+
+ put_tracer(context->task);
+
+ /* free any leftover buffers from tracers that did not
+ * deallocate them properly. */
+ kfree(context->buffer[ds_bts]);
+ kfree(context->buffer[ds_pebs]);
+ kfree(context->ds);
+ kfree(context);
+ out:
+ spin_unlock(&ds_lock);
}
-static inline unsigned long get_bts_interrupt_threshold(char *base)
+
+
+/*
+ * Handle a buffer overflow
+ *
+ * task: the task whose buffers are overflowing;
+ * NULL for a buffer overflow on the current cpu
+ * context: the ds context
+ * qual: the buffer type
+ */
+static void ds_overflow(struct task_struct *task, struct ds_context *context,
+ enum ds_qualifier qual)
{
- return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset);
+ if (!context)
+ return;
+
+ if (context->callback[qual])
+ (*context->callback[qual])(task);
+
+ /* todo: do some more overflow handling */
}
-static inline void set_bts_interrupt_threshold(char *base, unsigned long value)
+
+
+/*
+ * Allocate a non-pageable buffer of the parameter size.
+ * Checks the memory and the locked memory rlimit.
+ *
+ * Returns the buffer, if successful;
+ * NULL, if out of memory or rlimit exceeded.
+ *
+ * size: the requested buffer size in bytes
+ * pages (out): if not NULL, contains the number of pages reserved
+ */
+static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
{
- (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value;
+ unsigned long rlim, vm, pgsz;
+ void *buffer;
+
+ pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+ vm = current->mm->total_vm + pgsz;
+ if (rlim < vm)
+ return NULL;
+
+ rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+ vm = current->mm->locked_vm + pgsz;
+ if (rlim < vm)
+ return NULL;
+
+ buffer = kzalloc(size, GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ current->mm->total_vm += pgsz;
+ current->mm->locked_vm += pgsz;
+
+ if (pages)
+ *pages = pgsz;
+
+ return buffer;
}
-static inline unsigned long get_from_ip(char *base)
+
+static int ds_request(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
{
- return *(unsigned long *)(base + ds_cfg.from_ip.offset);
+ struct ds_context *context;
+ unsigned long buffer, adj;
+ const unsigned long alignment = (1 << 3);
+ int error = 0;
+
+ if (!ds_cfg.sizeof_ds)
+ return -EOPNOTSUPP;
+
+ /* we require some space to do alignment adjustments below */
+ if (size < (alignment + ds_cfg.sizeof_rec[qual]))
+ return -EINVAL;
+
+ /* buffer overflow notification is not yet implemented */
+ if (ovfl)
+ return -EOPNOTSUPP;
+
+
+ spin_lock(&ds_lock);
+
+ if (!check_tracer(task))
+ return -EPERM;
+
+ error = -ENOMEM;
+ context = ds_alloc_context(task);
+ if (!context)
+ goto out_unlock;
+
+ error = -EALREADY;
+ if (context->owner[qual] == current)
+ goto out_unlock;
+ error = -EPERM;
+ if (context->owner[qual] != NULL)
+ goto out_unlock;
+ context->owner[qual] = current;
+
+ spin_unlock(&ds_lock);
+
+
+ error = -ENOMEM;
+ if (!base) {
+ base = ds_allocate_buffer(size, &context->pages[qual]);
+ if (!base)
+ goto out_release;
+
+ context->buffer[qual] = base;
+ }
+ error = 0;
+
+ context->callback[qual] = ovfl;
+
+ /* adjust the buffer address and size to meet alignment
+ * constraints:
+ * - buffer is double-word aligned
+ * - size is multiple of record size
+ *
+ * We checked the size at the very beginning; we have enough
+ * space to do the adjustment.
+ */
+ buffer = (unsigned long)base;
+
+ adj = ALIGN(buffer, alignment) - buffer;
+ buffer += adj;
+ size -= adj;
+
+ size /= ds_cfg.sizeof_rec[qual];
+ size *= ds_cfg.sizeof_rec[qual];
+
+ ds_set(context->ds, qual, ds_buffer_base, buffer);
+ ds_set(context->ds, qual, ds_index, buffer);
+ ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
+
+ if (ovfl) {
+ /* todo: select a suitable interrupt threshold */
+ } else
+ ds_set(context->ds, qual,
+ ds_interrupt_threshold, buffer + size + 1);
+
+ /* we keep the context until ds_release */
+ return error;
+
+ out_release:
+ context->owner[qual] = NULL;
+ ds_put_context(context);
+ return error;
+
+ out_unlock:
+ spin_unlock(&ds_lock);
+ ds_put_context(context);
+ return error;
}
-static inline void set_from_ip(char *base, unsigned long value)
+
+int ds_request_bts(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl)
{
- (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value;
+ return ds_request(task, base, size, ovfl, ds_bts);
}
-static inline unsigned long get_to_ip(char *base)
+
+int ds_request_pebs(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl)
{
- return *(unsigned long *)(base + ds_cfg.to_ip.offset);
+ return ds_request(task, base, size, ovfl, ds_pebs);
}
-static inline void set_to_ip(char *base, unsigned long value)
+
+static int ds_release(struct task_struct *task, enum ds_qualifier qual)
{
- (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value;
+ struct ds_context *context;
+ int error;
+
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+
+ kfree(context->buffer[qual]);
+ context->buffer[qual] = NULL;
+
+ current->mm->total_vm -= context->pages[qual];
+ current->mm->locked_vm -= context->pages[qual];
+ context->pages[qual] = 0;
+ context->owner[qual] = NULL;
+
+ /*
+ * we put the context twice:
+ * once for the ds_get_context
+ * once for the corresponding ds_request
+ */
+ ds_put_context(context);
+ out:
+ ds_put_context(context);
+ return error;
}
-static inline unsigned char get_info_type(char *base)
+
+int ds_release_bts(struct task_struct *task)
{
- return *(unsigned char *)(base + ds_cfg.info_type.offset);
+ return ds_release(task, ds_bts);
}
-static inline void set_info_type(char *base, unsigned char value)
+
+int ds_release_pebs(struct task_struct *task)
{
- (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value;
+ return ds_release(task, ds_pebs);
}
-static inline unsigned long get_info_data(char *base)
+
+static int ds_get_index(struct task_struct *task, size_t *pos,
+ enum ds_qualifier qual)
{
- return *(unsigned long *)(base + ds_cfg.info_data.offset);
+ struct ds_context *context;
+ unsigned long base, index;
+ int error;
+
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ index = ds_get(context->ds, qual, ds_index);
+
+ error = ((index - base) / ds_cfg.sizeof_rec[qual]);
+ if (pos)
+ *pos = error;
+ out:
+ ds_put_context(context);
+ return error;
}
-static inline void set_info_data(char *base, unsigned long value)
+
+int ds_get_bts_index(struct task_struct *task, size_t *pos)
{
- (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value;
+ return ds_get_index(task, pos, ds_bts);
}
+int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+{
+ return ds_get_index(task, pos, ds_pebs);
+}
-int ds_allocate(void **dsp, size_t bts_size_in_bytes)
+static int ds_get_end(struct task_struct *task, size_t *pos,
+ enum ds_qualifier qual)
{
- size_t bts_size_in_records;
- unsigned long bts;
- void *ds;
+ struct ds_context *context;
+ unsigned long base, end;
+ int error;
+
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
+
+ error = ((end - base) / ds_cfg.sizeof_rec[qual]);
+ if (pos)
+ *pos = error;
+ out:
+ ds_put_context(context);
+ return error;
+}
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
+int ds_get_bts_end(struct task_struct *task, size_t *pos)
+{
+ return ds_get_end(task, pos, ds_bts);
+}
- if (bts_size_in_bytes < 0)
- return -EINVAL;
+int ds_get_pebs_end(struct task_struct *task, size_t *pos)
+{
+ return ds_get_end(task, pos, ds_pebs);
+}
- bts_size_in_records =
- bts_size_in_bytes / ds_cfg.sizeof_bts;
- bts_size_in_bytes =
- bts_size_in_records * ds_cfg.sizeof_bts;
+static int ds_access(struct task_struct *task, size_t index,
+ const void **record, enum ds_qualifier qual)
+{
+ struct ds_context *context;
+ unsigned long base, idx;
+ int error;
- if (bts_size_in_bytes <= 0)
+ if (!record)
return -EINVAL;
- bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL);
-
- if (!bts)
- return -ENOMEM;
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
- ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ idx = base + (index * ds_cfg.sizeof_rec[qual]);
- if (!ds) {
- kfree((void *)bts);
- return -ENOMEM;
- }
-
- set_bts_buffer_base(ds, bts);
- set_bts_index(ds, bts);
- set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
- set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
+ error = -EINVAL;
+ if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
+ goto out;
- *dsp = ds;
- return 0;
+ *record = (const void *)idx;
+ error = ds_cfg.sizeof_rec[qual];
+ out:
+ ds_put_context(context);
+ return error;
}
-int ds_free(void **dsp)
+int ds_access_bts(struct task_struct *task, size_t index, const void **record)
{
- if (*dsp) {
- kfree((void *)get_bts_buffer_base(*dsp));
- kfree(*dsp);
- *dsp = NULL;
- }
- return 0;
+ return ds_access(task, index, record, ds_bts);
}
-int ds_get_bts_size(void *ds)
+int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
{
- int size_in_bytes;
-
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
-
- if (!ds)
- return 0;
-
- size_in_bytes =
- get_bts_absolute_maximum(ds) -
- get_bts_buffer_base(ds);
- return size_in_bytes;
+ return ds_access(task, index, record, ds_pebs);
}
-int ds_get_bts_end(void *ds)
+static int ds_write(struct task_struct *task, const void *record, size_t size,
+ enum ds_qualifier qual, int force)
{
- int size_in_bytes = ds_get_bts_size(ds);
-
- if (size_in_bytes <= 0)
- return size_in_bytes;
+ struct ds_context *context;
+ int error;
- return size_in_bytes / ds_cfg.sizeof_bts;
-}
+ if (!record)
+ return -EINVAL;
-int ds_get_bts_index(void *ds)
-{
- int index_offset_in_bytes;
+ error = -EPERM;
+ context = ds_get_context(task);
+ if (!context)
+ goto out;
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
+ if (!force) {
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+ }
- index_offset_in_bytes =
- get_bts_index(ds) -
- get_bts_buffer_base(ds);
+ error = 0;
+ while (size) {
+ unsigned long base, index, end, write_end, int_th;
+ unsigned long write_size, adj_write_size;
+
+ /*
+		 * Write as much as possible without producing an
+ * overflow interrupt.
+ *
+ * interrupt_threshold must either be
+ * - bigger than absolute_maximum or
+ * - point to a record between buffer_base and absolute_maximum
+ *
+ * index points to a valid record.
+ */
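+		/*
+		 * Worked example (illustrative values only): with
+		 * base == 0x1000, int_th == 0x1200 and end == 0x1240,
+		 * write_end is 0x1200; a write starting at index ==
+		 * 0x11e8 copies at most 0x18 bytes (one 24-byte BTS
+		 * record) before the overflow check below runs.
+		 */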
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ index = ds_get(context->ds, qual, ds_index);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
+ int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
+
+ write_end = min(end, int_th);
+
+		/*
+		 * If we are already beyond the interrupt threshold,
+		 * we fill the entire buffer.
+		 */
+ if (write_end <= index)
+ write_end = end;
+
+ if (write_end <= index)
+ goto out;
+
+ write_size = min((unsigned long) size, write_end - index);
+ memcpy((void *)index, record, write_size);
+
+ record = (const char *)record + write_size;
+ size -= write_size;
+ error += write_size;
+
+ adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
+ adj_write_size *= ds_cfg.sizeof_rec[qual];
+
+ /* zero out trailing bytes */
+ memset((char *)index + write_size, 0,
+ adj_write_size - write_size);
+ index += adj_write_size;
+
+ if (index >= end)
+ index = base;
+ ds_set(context->ds, qual, ds_index, index);
+
+ if (index >= int_th)
+ ds_overflow(task, context, qual);
+ }
- return index_offset_in_bytes / ds_cfg.sizeof_bts;
+ out:
+ ds_put_context(context);
+ return error;
}
-int ds_set_overflow(void *ds, int method)
+int ds_write_bts(struct task_struct *task, const void *record, size_t size)
{
- switch (method) {
- case DS_O_SIGNAL:
- return -EOPNOTSUPP;
- case DS_O_WRAP:
- return 0;
- default:
- return -EINVAL;
- }
+ return ds_write(task, record, size, ds_bts, /* force = */ 0);
}
-int ds_get_overflow(void *ds)
+int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
{
- return DS_O_WRAP;
+ return ds_write(task, record, size, ds_pebs, /* force = */ 0);
}
-int ds_clear(void *ds)
+int ds_unchecked_write_bts(struct task_struct *task,
+ const void *record, size_t size)
{
- int bts_size = ds_get_bts_size(ds);
- unsigned long bts_base;
-
- if (bts_size <= 0)
- return bts_size;
-
- bts_base = get_bts_buffer_base(ds);
- memset((void *)bts_base, 0, bts_size);
-
- set_bts_index(ds, bts_base);
- return 0;
+ return ds_write(task, record, size, ds_bts, /* force = */ 1);
}
-int ds_read_bts(void *ds, int index, struct bts_struct *out)
+int ds_unchecked_write_pebs(struct task_struct *task,
+ const void *record, size_t size)
{
- void *bts;
+ return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+}
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
+static int ds_reset_or_clear(struct task_struct *task,
+ enum ds_qualifier qual, int clear)
+{
+ struct ds_context *context;
+ unsigned long base, end;
+ int error;
- if (index < 0)
- return -EINVAL;
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
- if (index >= ds_get_bts_size(ds))
- return -EINVAL;
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
- bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts));
+ if (clear)
+ memset((void *)base, 0, end - base);
- memset(out, 0, sizeof(*out));
- if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
- out->qualifier = get_info_type(bts);
- out->variant.jiffies = get_info_data(bts);
- } else {
- out->qualifier = BTS_BRANCH;
- out->variant.lbr.from_ip = get_from_ip(bts);
- out->variant.lbr.to_ip = get_to_ip(bts);
- }
+ ds_set(context->ds, qual, ds_index, base);
- return sizeof(*out);;
+ error = 0;
+ out:
+ ds_put_context(context);
+ return error;
}
-int ds_write_bts(void *ds, const struct bts_struct *in)
+int ds_reset_bts(struct task_struct *task)
{
- unsigned long bts;
-
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
-
- if (ds_get_bts_size(ds) <= 0)
- return -ENXIO;
+ return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+}
- bts = get_bts_index(ds);
+int ds_reset_pebs(struct task_struct *task)
+{
+ return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+}
- memset((void *)bts, 0, ds_cfg.sizeof_bts);
- switch (in->qualifier) {
- case BTS_INVALID:
- break;
+int ds_clear_bts(struct task_struct *task)
+{
+ return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+}
- case BTS_BRANCH:
- set_from_ip((void *)bts, in->variant.lbr.from_ip);
- set_to_ip((void *)bts, in->variant.lbr.to_ip);
- break;
+int ds_clear_pebs(struct task_struct *task)
+{
+ return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+}
- case BTS_TASK_ARRIVES:
- case BTS_TASK_DEPARTS:
- set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS);
- set_info_type((void *)bts, in->qualifier);
- set_info_data((void *)bts, in->variant.jiffies);
- break;
+int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+{
+ struct ds_context *context;
+ int error;
- default:
+ if (!value)
return -EINVAL;
- }
- bts = bts + ds_cfg.sizeof_bts;
- if (bts >= get_bts_absolute_maximum(ds))
- bts = get_bts_buffer_base(ds);
- set_bts_index(ds, bts);
+ context = ds_get_context(task);
+ error = ds_validate_access(context, ds_pebs);
+ if (error < 0)
+ goto out;
- return ds_cfg.sizeof_bts;
+ *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
+
+ error = 0;
+ out:
+ ds_put_context(context);
+ return error;
}
-unsigned long ds_debugctl_mask(void)
+int ds_set_pebs_reset(struct task_struct *task, u64 value)
{
- return ds_cfg.debugctl_mask;
-}
+ struct ds_context *context;
+ int error;
-#ifdef __i386__
-static const struct ds_configuration ds_cfg_netburst = {
- .sizeof_ds = 9 * 4,
- .bts_buffer_base = { 0, 4 },
- .bts_index = { 4, 4 },
- .bts_absolute_maximum = { 8, 4 },
- .bts_interrupt_threshold = { 12, 4 },
- .sizeof_bts = 3 * 4,
- .from_ip = { 0, 4 },
- .to_ip = { 4, 4 },
- .info_type = { 4, 1 },
- .info_data = { 8, 4 },
- .debugctl_mask = (1<<2)|(1<<3)
-};
+ context = ds_get_context(task);
+ error = ds_validate_access(context, ds_pebs);
+ if (error < 0)
+ goto out;
-static const struct ds_configuration ds_cfg_pentium_m = {
- .sizeof_ds = 9 * 4,
- .bts_buffer_base = { 0, 4 },
- .bts_index = { 4, 4 },
- .bts_absolute_maximum = { 8, 4 },
- .bts_interrupt_threshold = { 12, 4 },
- .sizeof_bts = 3 * 4,
- .from_ip = { 0, 4 },
- .to_ip = { 4, 4 },
- .info_type = { 4, 1 },
- .info_data = { 8, 4 },
- .debugctl_mask = (1<<6)|(1<<7)
+ *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+
+ error = 0;
+ out:
+ ds_put_context(context);
+ return error;
+}
+
+static const struct ds_configuration ds_cfg_var = {
+ .sizeof_ds = sizeof(long) * 12,
+ .sizeof_field = sizeof(long),
+ .sizeof_rec[ds_bts] = sizeof(long) * 3,
+ .sizeof_rec[ds_pebs] = sizeof(long) * 10
};
-#endif /* _i386_ */
-
-static const struct ds_configuration ds_cfg_core2 = {
- .sizeof_ds = 9 * 8,
- .bts_buffer_base = { 0, 8 },
- .bts_index = { 8, 8 },
- .bts_absolute_maximum = { 16, 8 },
- .bts_interrupt_threshold = { 24, 8 },
- .sizeof_bts = 3 * 8,
- .from_ip = { 0, 8 },
- .to_ip = { 8, 8 },
- .info_type = { 8, 1 },
- .info_data = { 16, 8 },
- .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
+static const struct ds_configuration ds_cfg_64 = {
+ .sizeof_ds = 8 * 12,
+ .sizeof_field = 8,
+ .sizeof_rec[ds_bts] = 8 * 3,
+ .sizeof_rec[ds_pebs] = 8 * 10
};
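+/*
+ * Both layouts describe a twelve-field DS save area with three-field
+ * BTS records and ten-field PEBS records; only the field width
+ * (sizeof(long) vs. a fixed 8 bytes) differs.
+ */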
static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
-#ifdef __i386__
case 0xD:
case 0xE: /* Pentium M */
- ds_configure(&ds_cfg_pentium_m);
+ ds_configure(&ds_cfg_var);
break;
-#endif /* _i386_ */
case 0xF: /* Core2 */
- ds_configure(&ds_cfg_core2);
+ case 0x1C: /* Atom */
+ ds_configure(&ds_cfg_64);
break;
default:
/* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
break;
case 0xF:
switch (c->x86_model) {
-#ifdef __i386__
case 0x0:
case 0x1:
case 0x2: /* Netburst */
- ds_configure(&ds_cfg_netburst);
+ ds_configure(&ds_cfg_var);
break;
-#endif /* _i386_ */
default:
/* sorry, don't know about them */
break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
break;
}
}
+
+void ds_free(struct ds_context *context)
+{
+	/*
+	 * This is called when the task owning the parameter context
+	 * is dying. There should not be any user of that context
+	 * left to disturb us anymore.
+	 */
+ unsigned long leftovers = context->count;
+ while (leftovers--)
+ ds_put_context(context);
+}
+#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
new file mode 100644
index 00000000000..b3614752197
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+
+#include <asm/stacktrace.h>
+
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+
+int panic_on_unrecovered_nmi;
+int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+static unsigned int code_bytes = 64;
+static int die_counter;
+
+void printk_address(unsigned long address, int reliable)
+{
+ printk(" [<%p>] %s%pS\n", (void *) address,
+ reliable ? "" : "? ", (void *) address);
+}
+
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+ void *p, unsigned int size, void *end)
+{
+ void *t = tinfo;
+ if (end) {
+ if (p < end && p >= (end-THREAD_SIZE))
+ return 1;
+ else
+ return 0;
+ }
+ return p > t && p < t + THREAD_SIZE - size;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+
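+/*
+ * When frame pointers are available (bp != 0), an address is
+ * reported as reliable only if it sits in the return-address slot
+ * one word above a linked frame pointer; everything else is printed
+ * with a '?' prefix as a guess.
+ */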
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end)
+{
+ struct stack_frame *frame = (struct stack_frame *)bp;
+
+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+ unsigned long addr;
+
+ addr = *stack;
+ if (__kernel_text_address(addr)) {
+ if ((unsigned long) stack == bp + sizeof(long)) {
+ ops->address(data, addr, 1);
+ frame = frame->next_frame;
+ bp = (unsigned long) frame;
+ } else {
+ ops->address(data, addr, bp == 0);
+ }
+ }
+ stack++;
+ }
+ return bp;
+}
+
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
+{
+ if (!task)
+ task = current;
+
+ if (!stack) {
+ unsigned long dummy;
+ stack = &dummy;
+ if (task && task != current)
+ stack = (unsigned long *)task->thread.sp;
+ }
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!bp) {
+ if (task == current) {
+ /* Grab bp right from our regs */
+ get_bp(bp);
+ } else {
+ /* bp is the last reg pushed by switch_to */
+ bp = *(unsigned long *) task->thread.sp;
+ }
+ }
+#endif
+
+ for (;;) {
+ struct thread_info *context;
+
+ context = (struct thread_info *)
+ ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+ bp = print_context_stack(context, stack, bp, ops, data, NULL);
+
+ stack = (unsigned long *)context->previous_esp;
+ if (!stack)
+ break;
+ if (ops->stack(data, "IRQ") < 0)
+ break;
+ touch_nmi_watchdog();
+ }
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ printk(data);
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+ printk("%s <%s> ", (char *)data, name);
+ return 0;
+}
+
+/*
+ * Print one address/symbol entry per line.
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+ touch_nmi_watchdog();
+ printk(data);
+ printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+ printk("%sCall Trace:\n", log_lvl);
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp)
+{
+ show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+static void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl)
+{
+ unsigned long *stack;
+ int i;
+
+ if (sp == NULL) {
+ if (task)
+ sp = (unsigned long *)task->thread.sp;
+ else
+ sp = (unsigned long *)&sp;
+ }
+
+ stack = sp;
+ for (i = 0; i < kstack_depth_to_print; i++) {
+ if (kstack_end(stack))
+ break;
+ if (i && ((i % STACKSLOTS_PER_LINE) == 0))
+ printk("\n%s", log_lvl);
+ printk(" %08lx", *stack++);
+ touch_nmi_watchdog();
+ }
+ printk("\n");
+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+ show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+ unsigned long bp = 0;
+ unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!bp)
+ get_bp(bp);
+#endif
+
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ show_trace(NULL, NULL, &stack, bp);
+}
+
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+ int i;
+
+ print_modules();
+ __show_regs(regs, 0);
+
+ printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
+ TASK_COMM_LEN, current->comm, task_pid_nr(current),
+ current_thread_info(), current, task_thread_info(current));
+ /*
+ * When in-kernel, we also print out the stack and code at the
+ * time of the fault..
+ */
+ if (!user_mode_vm(regs)) {
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
+ unsigned char c;
+ u8 *ip;
+
+ printk(KERN_EMERG "Stack:\n");
+ show_stack_log_lvl(NULL, regs, &regs->sp,
+ 0, KERN_EMERG);
+
+ printk(KERN_EMERG "Code: ");
+
+ ip = (u8 *)regs->ip - code_prologue;
+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+ /* try starting at IP */
+ ip = (u8 *)regs->ip;
+ code_len = code_len - code_prologue + 1;
+ }
+ for (i = 0; i < code_len; i++, ip++) {
+ if (ip < (u8 *)PAGE_OFFSET ||
+ probe_kernel_address(ip, c)) {
+ printk(" Bad EIP value.");
+ break;
+ }
+ if (ip == (u8 *)regs->ip)
+ printk("<%02x> ", c);
+ else
+ printk("%02x ", c);
+ }
+ }
+ printk("\n");
+}
+
+int is_valid_bugaddr(unsigned long ip)
+{
+ unsigned short ud2;
+
+ if (ip < PAGE_OFFSET)
+ return 0;
+ if (probe_kernel_address((unsigned short *)ip, ud2))
+ return 0;
+
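+	/* The UD2 opcode is 0x0f 0x0b; a little-endian u16 load of it reads 0x0b0f */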
+ return ud2 == 0x0b0f;
+}
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+ unsigned long flags;
+
+ oops_enter();
+
+ if (die_owner != raw_smp_processor_id()) {
+ console_verbose();
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&die_lock);
+ die_owner = smp_processor_id();
+ die_nest_count = 0;
+ bust_spinlocks(1);
+ } else {
+ raw_local_irq_save(flags);
+ }
+ die_nest_count++;
+ return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+ bust_spinlocks(0);
+ die_owner = -1;
+ add_taint(TAINT_DIE);
+ __raw_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
+
+ if (!regs)
+ return;
+
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+ if (panic_on_oops)
+ panic("Fatal exception");
+ oops_exit();
+ do_exit(signr);
+}
+
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+ unsigned short ss;
+ unsigned long sp;
+
+ printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+ printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+ printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk("DEBUG_PAGEALLOC");
+#endif
+ printk("\n");
+ sysfs_printk_last_file();
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ return 1;
+
+ show_registers(regs);
+ /* Executive summary in case the oops scrolled away */
+ sp = (unsigned long) (&regs->sp);
+ savesegment(ss, ss);
+ if (user_mode(regs)) {
+ sp = regs->sp;
+ ss = regs->ss & 0xffff;
+ }
+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+ print_symbol("%s", regs->ip);
+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
+ return 0;
+}
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+ unsigned long flags = oops_begin();
+
+ if (die_nest_count < 3) {
+ report_bug(regs->ip, regs);
+
+ if (__die(str, regs, err))
+ regs = NULL;
+ } else {
+ printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
+ }
+
+ oops_end(flags, regs, SIGSEGV);
+}
+
+static DEFINE_SPINLOCK(nmi_print_lock);
+
+void notrace __kprobes
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
+
+ spin_lock(&nmi_print_lock);
+ /*
+	 * We are in trouble anyway, let's at least try
+ * to get a message out:
+ */
+ bust_spinlocks(1);
+ printk(KERN_EMERG "%s", str);
+ printk(" on CPU%d, ip %08lx, registers:\n",
+ smp_processor_id(), regs->ip);
+ show_registers(regs);
+ if (do_panic)
+ panic("Non maskable interrupt");
+ console_silent();
+ spin_unlock(&nmi_print_lock);
+
+ /*
+	 * If we are in the kernel we are probably nested up pretty bad
+	 * and might as well get out now while we still can:
+ */
+ if (!user_mode_vm(regs)) {
+ current->thread.trap_no = 2;
+ crash_kexec(regs);
+ }
+
+ bust_spinlocks(0);
+ do_exit(SIGSEGV);
+}
+
+static int __init oops_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+ return 0;
+}
+early_param("kstack", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+ code_bytes = simple_strtoul(s, NULL, 0);
+ if (code_bytes > 8192)
+ code_bytes = 8192;
+
+ return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 00000000000..96a5db7da8a
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+
+#include <asm/stacktrace.h>
+
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+
+int panic_on_unrecovered_nmi;
+int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+static unsigned int code_bytes = 64;
+static int die_counter;
+
+void printk_address(unsigned long address, int reliable)
+{
+ printk(" [<%p>] %s%pS\n", (void *) address,
+ reliable ? "" : "? ", (void *) address);
+}
+
+static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
+ unsigned *usedp, char **idp)
+{
+ static char ids[][8] = {
+ [DEBUG_STACK - 1] = "#DB",
+ [NMI_STACK - 1] = "NMI",
+ [DOUBLEFAULT_STACK - 1] = "#DF",
+ [STACKFAULT_STACK - 1] = "#SS",
+ [MCE_STACK - 1] = "#MC",
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+ [N_EXCEPTION_STACKS ...
+ N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
+#endif
+ };
+ unsigned k;
+
+ /*
+ * Iterate over all exception stacks, and figure out whether
+ * 'stack' is in one of them:
+ */
+ for (k = 0; k < N_EXCEPTION_STACKS; k++) {
+ unsigned long end = per_cpu(orig_ist, cpu).ist[k];
+ /*
+ * Is 'stack' above this exception frame's end?
+ * If yes then skip to the next frame.
+ */
+ if (stack >= end)
+ continue;
+ /*
+ * Is 'stack' above this exception frame's start address?
+ * If yes then we found the right frame.
+ */
+ if (stack >= end - EXCEPTION_STKSZ) {
+ /*
+ * Make sure we only iterate through an exception
+ * stack once. If it comes up for the second time
+ * then there's something wrong going on - just
+ * break out and return NULL:
+ */
+ if (*usedp & (1U << k))
+ break;
+ *usedp |= 1U << k;
+ *idp = ids[k];
+ return (unsigned long *)end;
+ }
+ /*
+ * If this is a debug stack, and if it has a larger size than
+ * the usual exception stacks, then 'stack' might still
+ * be within the lower portion of the debug stack:
+ */
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+ if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
+ unsigned j = N_EXCEPTION_STACKS - 1;
+
+ /*
+ * Black magic. A large debug stack is composed of
+ * multiple exception stack entries, which we
+			 * iterate through now. Don't look:
+ */
+ do {
+ ++j;
+ end -= EXCEPTION_STKSZ;
+ ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
+ } while (stack < end - EXCEPTION_STKSZ);
+ if (*usedp & (1U << j))
+ break;
+ *usedp |= 1U << j;
+ *idp = ids[j];
+ return (unsigned long *)end;
+ }
+#endif
+ }
+ return NULL;
+}
+
+/*
+ * x86-64 can have up to three kernel stacks:
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+ void *p, unsigned int size, void *end)
+{
+ void *t = tinfo;
+ if (end) {
+ if (p < end && p >= (end-THREAD_SIZE))
+ return 1;
+ else
+ return 0;
+ }
+ return p > t && p < t + THREAD_SIZE - size;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end)
+{
+ struct stack_frame *frame = (struct stack_frame *)bp;
+
+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+ unsigned long addr;
+
+ addr = *stack;
+ if (__kernel_text_address(addr)) {
+ if ((unsigned long) stack == bp + sizeof(long)) {
+ ops->address(data, addr, 1);
+ frame = frame->next_frame;
+ bp = (unsigned long) frame;
+ } else {
+ ops->address(data, addr, bp == 0);
+ }
+ }
+ stack++;
+ }
+ return bp;
+}
+
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
+{
+ const unsigned cpu = get_cpu();
+ unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+ unsigned used = 0;
+ struct thread_info *tinfo;
+
+ if (!task)
+ task = current;
+
+ if (!stack) {
+ unsigned long dummy;
+ stack = &dummy;
+ if (task && task != current)
+ stack = (unsigned long *)task->thread.sp;
+ }
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!bp) {
+ if (task == current) {
+ /* Grab bp right from our regs */
+ get_bp(bp);
+ } else {
+ /* bp is the last reg pushed by switch_to */
+ bp = *(unsigned long *) task->thread.sp;
+ }
+ }
+#endif
+
+ /*
+ * Print function call entries in all stacks, starting at the
+	 * Print function call entries in all stacks, starting at the
+	 * current stack address. If the stacks consist of nested
+	 * exceptions, unwind them in order:
+ tinfo = task_thread_info(task);
+ for (;;) {
+ char *id;
+ unsigned long *estack_end;
+ estack_end = in_exception_stack(cpu, (unsigned long)stack,
+ &used, &id);
+
+ if (estack_end) {
+ if (ops->stack(data, id) < 0)
+ break;
+
+ bp = print_context_stack(tinfo, stack, bp, ops,
+ data, estack_end);
+ ops->stack(data, "<EOE>");
+ /*
+ * We link to the next stack via the
+ * second-to-last pointer (index -2 to end) in the
+ * exception stack:
+ */
+ stack = (unsigned long *) estack_end[-2];
+ continue;
+ }
+ if (irqstack_end) {
+ unsigned long *irqstack;
+ irqstack = irqstack_end -
+ (IRQSTACKSIZE - 64) / sizeof(*irqstack);
+
+ if (stack >= irqstack && stack < irqstack_end) {
+ if (ops->stack(data, "IRQ") < 0)
+ break;
+ bp = print_context_stack(tinfo, stack, bp,
+ ops, data, irqstack_end);
+ /*
+			 * We link to the next stack (which would
+			 * normally be the process stack) via the
+			 * last pointer (index -1 to end) in the
+			 * IRQ stack:
+ */
+ stack = (unsigned long *) (irqstack_end[-1]);
+ irqstack_end = NULL;
+ ops->stack(data, "EOI");
+ continue;
+ }
+ }
+ break;
+ }
+
+ /*
+ * This handles the process stack:
+ */
+ bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+ put_cpu();
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ printk(data);
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+ printk("%s <%s> ", (char *)data, name);
+ return 0;
+}
+
+/*
+ * Print one address/symbol entry per line.
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+ touch_nmi_watchdog();
+ printk(data);
+ printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+ printk("%sCall Trace:\n", log_lvl);
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp)
+{
+ show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+static void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl)
+{
+ unsigned long *stack;
+ int i;
+ const int cpu = smp_processor_id();
+ unsigned long *irqstack_end =
+ (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ unsigned long *irqstack =
+ (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+
+ /*
+ * debugging aid: "show_stack(NULL, NULL);" prints the
+ * back trace for this cpu.
+ */
+
+ if (sp == NULL) {
+ if (task)
+ sp = (unsigned long *)task->thread.sp;
+ else
+ sp = (unsigned long *)&sp;
+ }
+
+ stack = sp;
+ for (i = 0; i < kstack_depth_to_print; i++) {
+ if (stack >= irqstack && stack <= irqstack_end) {
+ if (stack == irqstack_end) {
+ stack = (unsigned long *) (irqstack_end[-1]);
+ printk(" <EOI> ");
+ }
+ } else {
+ if (((long) stack & (THREAD_SIZE-1)) == 0)
+ break;
+ }
+ if (i && ((i % STACKSLOTS_PER_LINE) == 0))
+ printk("\n%s", log_lvl);
+ printk(" %016lx", *stack++);
+ touch_nmi_watchdog();
+ }
+ printk("\n");
+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+ show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+ unsigned long bp = 0;
+ unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!bp)
+ get_bp(bp);
+#endif
+
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ show_trace(NULL, NULL, &stack, bp);
+}
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+ int i;
+ unsigned long sp;
+ const int cpu = smp_processor_id();
+ struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+
+ sp = regs->sp;
+ printk("CPU %d ", cpu);
+ __show_regs(regs, 1);
+ printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
+ cur->comm, cur->pid, task_thread_info(cur), cur);
+
+ /*
+ * When in-kernel, we also print out the stack and code at the
+ * time of the fault..
+ */
+ if (!user_mode(regs)) {
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
+ unsigned char c;
+ u8 *ip;
+
+ printk(KERN_EMERG "Stack:\n");
+ show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
+ regs->bp, KERN_EMERG);
+
+ printk(KERN_EMERG "Code: ");
+
+ ip = (u8 *)regs->ip - code_prologue;
+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+ /* try starting at IP */
+ ip = (u8 *)regs->ip;
+ code_len = code_len - code_prologue + 1;
+ }
+ for (i = 0; i < code_len; i++, ip++) {
+ if (ip < (u8 *)PAGE_OFFSET ||
+ probe_kernel_address(ip, c)) {
+ printk(" Bad RIP value.");
+ break;
+ }
+ if (ip == (u8 *)regs->ip)
+ printk("<%02x> ", c);
+ else
+ printk("%02x ", c);
+ }
+ }
+ printk("\n");
+}
+
+int is_valid_bugaddr(unsigned long ip)
+{
+ unsigned short ud2;
+
+ if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
+ return 0;
+
+ return ud2 == 0x0b0f;
+}
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+ int cpu;
+ unsigned long flags;
+
+ oops_enter();
+
+ /* racy, but better than risking deadlock. */
+ raw_local_irq_save(flags);
+ cpu = smp_processor_id();
+ if (!__raw_spin_trylock(&die_lock)) {
+ if (cpu == die_owner)
+ /* nested oops. should stop eventually */;
+ else
+ __raw_spin_lock(&die_lock);
+ }
+ die_nest_count++;
+ die_owner = cpu;
+ console_verbose();
+ bust_spinlocks(1);
+ return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+ die_owner = -1;
+ bust_spinlocks(0);
+ die_nest_count--;
+ if (!die_nest_count)
+ /* Nest count reaches zero, release the lock. */
+ __raw_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
+ if (!regs) {
+ oops_exit();
+ return;
+ }
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+ if (panic_on_oops)
+ panic("Fatal exception");
+ oops_exit();
+ do_exit(signr);
+}
+
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+ printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+ printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+ printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk("DEBUG_PAGEALLOC");
+#endif
+ printk("\n");
+ sysfs_printk_last_file();
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ return 1;
+
+ show_registers(regs);
+ add_taint(TAINT_DIE);
+ /* Executive summary in case the oops scrolled away */
+ printk(KERN_ALERT "RIP ");
+ printk_address(regs->ip, 1);
+ printk(" RSP <%016lx>\n", regs->sp);
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
+ return 0;
+}
+
+void die(const char *str, struct pt_regs *regs, long err)
+{
+ unsigned long flags = oops_begin();
+
+ if (!user_mode(regs))
+ report_bug(regs->ip, regs);
+
+ if (__die(str, regs, err))
+ regs = NULL;
+ oops_end(flags, regs, SIGSEGV);
+}
+
+notrace __kprobes void
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+ unsigned long flags;
+
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
+
+ flags = oops_begin();
+ /*
+	 * We are in trouble anyway, let's at least try
+ * to get a message out.
+ */
+ printk(KERN_EMERG "%s", str);
+ printk(" on CPU%d, ip %08lx, registers:\n",
+ smp_processor_id(), regs->ip);
+ show_registers(regs);
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
+ if (do_panic || panic_on_oops)
+ panic("Non maskable interrupt");
+ oops_end(flags, NULL, SIGBUS);
+ nmi_exit();
+ local_irq_enable();
+ do_exit(SIGBUS);
+}
+
+static int __init oops_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+ return 0;
+}
+early_param("kstack", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+ code_bytes = simple_strtoul(s, NULL, 0);
+ if (code_bytes > 8192)
+ code_bytes = 8192;
+
+ return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 66e48aa2dd1..ce97bf3bed1 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -148,6 +148,9 @@ void __init e820_print_map(char *who)
case E820_NVS:
printk(KERN_CONT "(ACPI NVS)\n");
break;
+ case E820_UNUSABLE:
+		printk(KERN_CONT "(unusable)\n");
+ break;
default:
printk(KERN_CONT "type %u\n", e820.map[i].type);
break;
@@ -1260,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type)
case E820_RAM: return "System RAM";
case E820_ACPI: return "ACPI Tables";
case E820_NVS: return "ACPI Non-volatile Storage";
+ case E820_UNUSABLE: return "Unusable memory";
default: return "reserved";
}
}
@@ -1267,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type)
/*
* Mark e820 reserved areas as busy for the resource manager.
*/
+static struct resource __initdata *e820_res;
void __init e820_reserve_resources(void)
{
int i;
@@ -1274,20 +1279,26 @@ void __init e820_reserve_resources(void)
u64 end;
res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+ e820_res = res;
for (i = 0; i < e820.nr_map; i++) {
end = e820.map[i].addr + e820.map[i].size - 1;
-#ifndef CONFIG_RESOURCES_64BIT
- if (end > 0x100000000ULL) {
+ if (end != (resource_size_t)end) {
res++;
continue;
}
-#endif
res->name = e820_type_to_string(e820.map[i].type);
res->start = e820.map[i].addr;
res->end = end;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- insert_resource(&iomem_resource, res);
+
+ /*
+	 * Don't register regions that could conflict with PCI device
+	 * BAR resources; those are inserted later, in
+	 * pcibios_resource_survey().
+ */
+ if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
+ insert_resource(&iomem_resource, res);
res++;
}
@@ -1299,6 +1310,19 @@ void __init e820_reserve_resources(void)
}
}
+void __init e820_reserve_resources_late(void)
+{
+ int i;
+ struct resource *res;
+
+ res = e820_res;
+ for (i = 0; i < e820.nr_map; i++) {
+ if (!res->parent && res->end)
+ reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
+ res++;
+ }
+}
+
char *__init default_machine_specific_memory_setup(void)
{
char *who = "BIOS-e820";
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 4353cf5e6fa..3ce029ffaa5 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -95,6 +95,113 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
+static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
+{
+ u32 d;
+ u8 b;
+
+ b = read_pci_config_byte(num, slot, func, 0xac);
+ b &= ~(1<<5);
+ write_pci_config_byte(num, slot, func, 0xac, b);
+
+ d = read_pci_config(num, slot, func, 0x70);
+ d |= 1<<8;
+ write_pci_config(num, slot, func, 0x70, d);
+
+ d = read_pci_config(num, slot, func, 0x8);
+ d &= 0xff;
+ return d;
+}
+
+static void __init ati_bugs(int num, int slot, int func)
+{
+ u32 d;
+ u8 b;
+
+ if (acpi_use_timer_override)
+ return;
+
+ d = ati_ixp4x0_rev(num, slot, func);
+ if (d < 0x82)
+ acpi_skip_timer_override = 1;
+ else {
+ /* check for IRQ0 interrupt swap */
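+		/* 0xcd6/0xcd7: the SB4x0 PM register index/data port pair */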
+		outb(0x72, 0xcd6);
+		b = inb(0xcd7);
+ if (!(b & 0x2))
+ acpi_skip_timer_override = 1;
+ }
+
+ if (acpi_skip_timer_override) {
+ printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
+ printk(KERN_INFO "Ignoring ACPI timer override.\n");
+		printk(KERN_INFO "If you get timer trouble "
+ "try acpi_use_timer_override\n");
+ }
+}
+
+static u32 __init ati_sbx00_rev(int num, int slot, int func)
+{
+ u32 old, d;
+
+ d = read_pci_config(num, slot, func, 0x70);
+ old = d;
+ d &= ~(1<<8);
+ write_pci_config(num, slot, func, 0x70, d);
+ d = read_pci_config(num, slot, func, 0x8);
+ d &= 0xff;
+ write_pci_config(num, slot, func, 0x70, old);
+
+ return d;
+}
+
+static void __init ati_bugs_contd(int num, int slot, int func)
+{
+ u32 d, rev;
+
+ if (acpi_use_timer_override)
+ return;
+
+ rev = ati_sbx00_rev(num, slot, func);
+ if (rev > 0x13)
+ return;
+
+ /* check for IRQ0 interrupt swap */
+ d = read_pci_config(num, slot, func, 0x64);
+ if (!(d & (1<<14)))
+ acpi_skip_timer_override = 1;
+
+ if (acpi_skip_timer_override) {
+ printk(KERN_INFO "SB600 revision 0x%x\n", rev);
+ printk(KERN_INFO "Ignoring ACPI timer override.\n");
+		printk(KERN_INFO "If you get timer trouble "
+ "try acpi_use_timer_override\n");
+ }
+}
+#else
+static void __init ati_bugs(int num, int slot, int func)
+{
+}
+
+static void __init ati_bugs_contd(int num, int slot, int func)
+{
+}
+#endif
+
+#ifdef CONFIG_DMAR
+static void __init intel_g33_dmar(int num, int slot, int func)
+{
+ struct acpi_table_header *dmar_tbl;
+ acpi_status status;
+
+ status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
+ if (ACPI_SUCCESS(status)) {
+ printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
+ dmar_disabled = 1;
+ }
+}
+#endif
+
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -114,6 +221,14 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
+ { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+ PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
+ { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
+ PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
+#ifdef CONFIG_DMAR
+ { PCI_VENDOR_ID_INTEL, 0x29c0,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
+#endif
{}
};
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index ff9e7350da5..34ad997d383 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -3,11 +3,19 @@
#include <linux/init.h>
#include <linux/string.h>
#include <linux/screen_info.h>
+#include <linux/usb/ch9.h>
+#include <linux/pci_regs.h>
+#include <linux/pci_ids.h>
+#include <linux/errno.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
#include <asm/setup.h>
#include <xen/hvc-console.h>
+#include <asm/pci-direct.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <linux/usb/ehci_def.h>
/* Simple VGA output */
#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
static int early_serial_putc(unsigned char ch)
{
unsigned timeout = 0xffff;
+
while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
cpu_relax();
outb(ch, early_serial_base + TXR);
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s)
if (!strncmp(s, "0x", 2)) {
early_serial_base = simple_strtoul(s, &e, 16);
} else {
- static int bases[] = { 0x3f8, 0x2f8 };
+ static const int __initconst bases[] = { 0x3f8, 0x2f8 };
if (!strncmp(s, "ttyS", 4))
s += 4;
@@ -151,6 +160,721 @@ static struct console early_serial_console = {
.index = -1,
};
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+
+static struct ehci_caps __iomem *ehci_caps;
+static struct ehci_regs __iomem *ehci_regs;
+static struct ehci_dbg_port __iomem *ehci_debug;
+static unsigned int dbgp_endpoint_out;
+
+struct ehci_dev {
+ u32 bus;
+ u32 slot;
+ u32 func;
+};
+
+static struct ehci_dev ehci_dev;
+
+#define USB_DEBUG_DEVNUM 127
+
+#define DBGP_DATA_TOGGLE 0x8800
+
+static inline u32 dbgp_pid_update(u32 x, u32 tok)
+{
+ return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
+}
+
+static inline u32 dbgp_len_update(u32 x, u32 len)
+{
+ return (x & ~0x0f) | (len & 0x0f);
+}
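+
+/*
+ * E.g. (illustrative values): with pids == 0x00c3e1, a call to
+ * dbgp_pid_update(pids, USB_PID_IN) yields 0x004b69 -- the DATA0
+ * send PID (0xc3) toggles to DATA1 (0x4b) and the token byte becomes
+ * USB_PID_IN (0x69).
+ */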
+
+/*
+ * USB Packet IDs (PIDs)
+ */
+
+/* token */
+#define USB_PID_OUT 0xe1
+#define USB_PID_IN 0x69
+#define USB_PID_SOF 0xa5
+#define USB_PID_SETUP 0x2d
+/* handshake */
+#define USB_PID_ACK 0xd2
+#define USB_PID_NAK 0x5a
+#define USB_PID_STALL 0x1e
+#define USB_PID_NYET 0x96
+/* data */
+#define USB_PID_DATA0 0xc3
+#define USB_PID_DATA1 0x4b
+#define USB_PID_DATA2 0x87
+#define USB_PID_MDATA 0x0f
+/* Special */
+#define USB_PID_PREAMBLE 0x3c
+#define USB_PID_ERR 0x3c
+#define USB_PID_SPLIT 0x78
+#define USB_PID_PING 0xb4
+#define USB_PID_UNDEF_0 0xf0
+
+#define USB_PID_DATA_TOGGLE 0x88
+#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
+
+#define PCI_CAP_ID_EHCI_DEBUG 0xa
+
+#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
+#define HUB_SHORT_RESET_TIME 10
+#define HUB_LONG_RESET_TIME 200
+#define HUB_RESET_TIMEOUT 500
+
+#define DBGP_MAX_PACKET 8
+
+static int dbgp_wait_until_complete(void)
+{
+ u32 ctrl;
+ int loop = 0x100000;
+
+ do {
+ ctrl = readl(&ehci_debug->control);
+ /* Stop when the transaction is finished */
+ if (ctrl & DBGP_DONE)
+ break;
+ } while (--loop > 0);
+
+ if (!loop)
+ return -1;
+
+ /*
+ * Now that we have observed the completed transaction,
+ * clear the done bit.
+ */
+ writel(ctrl | DBGP_DONE, &ehci_debug->control);
+ return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
+}
+
+static void dbgp_mdelay(int ms)
+{
+ int i;
+
+ while (ms--) {
+ for (i = 0; i < 1000; i++)
+ outb(0x1, 0x80);
+ }
+}
+
+static void dbgp_breath(void)
+{
+ /* Sleep to give the debug port a chance to breathe */
+}
+
+static int dbgp_wait_until_done(unsigned ctrl)
+{
+ u32 pids, lpid;
+ int ret;
+ int loop = 3;
+
+retry:
+ writel(ctrl | DBGP_GO, &ehci_debug->control);
+ ret = dbgp_wait_until_complete();
+ pids = readl(&ehci_debug->pids);
+ lpid = DBGP_PID_GET(pids);
+
+ if (ret < 0)
+ return ret;
+
+ /*
+	 * If the port is getting full or has dropped data, start
+	 * pacing ourselves; not necessary, but it's friendly.
+ */
+ if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
+ dbgp_breath();
+
+	/* If we get a NAK, reissue the transmission */
+ if (lpid == USB_PID_NAK) {
+ if (--loop > 0)
+ goto retry;
+ }
+
+ return ret;
+}
+
+static void dbgp_set_data(const void *buf, int size)
+{
+ const unsigned char *bytes = buf;
+ u32 lo, hi;
+ int i;
+
+ lo = hi = 0;
+ for (i = 0; i < 4 && i < size; i++)
+ lo |= bytes[i] << (8*i);
+ for (; i < 8 && i < size; i++)
+ hi |= bytes[i] << (8*(i - 4));
+ writel(lo, &ehci_debug->data03);
+ writel(hi, &ehci_debug->data47);
+}
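+
+/*
+ * E.g. dbgp_set_data("AB", 2) leaves 0x00004241 in data03 and 0 in
+ * data47: bytes are packed little-endian, low dword first.
+ */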
+
+static void dbgp_get_data(void *buf, int size)
+{
+ unsigned char *bytes = buf;
+ u32 lo, hi;
+ int i;
+
+ lo = readl(&ehci_debug->data03);
+ hi = readl(&ehci_debug->data47);
+ for (i = 0; i < 4 && i < size; i++)
+ bytes[i] = (lo >> (8*i)) & 0xff;
+ for (; i < 8 && i < size; i++)
+ bytes[i] = (hi >> (8*(i - 4))) & 0xff;
+}
+
+static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
+ const char *bytes, int size)
+{
+ u32 pids, addr, ctrl;
+ int ret;
+
+ if (size > DBGP_MAX_PACKET)
+ return -1;
+
+ addr = DBGP_EPADDR(devnum, endpoint);
+
+ pids = readl(&ehci_debug->pids);
+ pids = dbgp_pid_update(pids, USB_PID_OUT);
+
+ ctrl = readl(&ehci_debug->control);
+ ctrl = dbgp_len_update(ctrl, size);
+ ctrl |= DBGP_OUT;
+ ctrl |= DBGP_GO;
+
+ dbgp_set_data(bytes, size);
+ writel(addr, &ehci_debug->address);
+ writel(pids, &ehci_debug->pids);
+
+ ret = dbgp_wait_until_done(ctrl);
+ if (ret < 0)
+ return ret;
+
+ return ret;
+}
+
+static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+ int size)
+{
+ u32 pids, addr, ctrl;
+ int ret;
+
+ if (size > DBGP_MAX_PACKET)
+ return -1;
+
+ addr = DBGP_EPADDR(devnum, endpoint);
+
+ pids = readl(&ehci_debug->pids);
+ pids = dbgp_pid_update(pids, USB_PID_IN);
+
+ ctrl = readl(&ehci_debug->control);
+ ctrl = dbgp_len_update(ctrl, size);
+ ctrl &= ~DBGP_OUT;
+ ctrl |= DBGP_GO;
+
+ writel(addr, &ehci_debug->address);
+ writel(pids, &ehci_debug->pids);
+ ret = dbgp_wait_until_done(ctrl);
+ if (ret < 0)
+ return ret;
+
+ if (size > ret)
+ size = ret;
+ dbgp_get_data(data, size);
+ return ret;
+}
+
+static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
+ int value, int index, void *data, int size)
+{
+ u32 pids, addr, ctrl;
+ struct usb_ctrlrequest req;
+ int read;
+ int ret;
+
+ read = (requesttype & USB_DIR_IN) != 0;
+ if (size > (read ? DBGP_MAX_PACKET:0))
+ return -1;
+
+ /* Compute the control message */
+ req.bRequestType = requesttype;
+ req.bRequest = request;
+ req.wValue = cpu_to_le16(value);
+ req.wIndex = cpu_to_le16(index);
+ req.wLength = cpu_to_le16(size);
+
+ pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
+ addr = DBGP_EPADDR(devnum, 0);
+
+ ctrl = readl(&ehci_debug->control);
+ ctrl = dbgp_len_update(ctrl, sizeof(req));
+ ctrl |= DBGP_OUT;
+ ctrl |= DBGP_GO;
+
+ /* Send the setup message */
+ dbgp_set_data(&req, sizeof(req));
+ writel(addr, &ehci_debug->address);
+ writel(pids, &ehci_debug->pids);
+ ret = dbgp_wait_until_done(ctrl);
+ if (ret < 0)
+ return ret;
+
+ /* Read the result */
+ return dbgp_bulk_read(devnum, 0, data, size);
+}
+
+
+/* Find a PCI capability */
+static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
+{
+ u8 pos;
+ int bytes;
+
+ if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
+ PCI_STATUS_CAP_LIST))
+ return 0;
+
+ pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
+ for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
+ u8 id;
+
+ pos &= ~3;
+ id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
+ if (id == 0xff)
+ break;
+ if (id == cap)
+ return pos;
+
+ pos = read_pci_config_byte(num, slot, func,
+ pos+PCI_CAP_LIST_NEXT);
+ }
+ return 0;
+}
+
+static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
+{
+ u32 class;
+
+ class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+ if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
+ return 0;
+
+ return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
+}
+
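+/*
+ * Brute-force scan of the PCI space (256 buses x 32 slots x 8
+ * functions) for the ehci_num-th EHCI controller that carries a
+ * debug-port capability.
+ */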
+static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
+{
+ u32 bus, slot, func;
+
+ for (bus = 0; bus < 256; bus++) {
+ for (slot = 0; slot < 32; slot++) {
+ for (func = 0; func < 8; func++) {
+ unsigned cap;
+
+ cap = __find_dbgp(bus, slot, func);
+
+ if (!cap)
+ continue;
+ if (ehci_num-- != 0)
+ continue;
+ *rbus = bus;
+ *rslot = slot;
+ *rfunc = func;
+ return cap;
+ }
+ }
+ }
+ return 0;
+}
+
+static int ehci_reset_port(int port)
+{
+ u32 portsc;
+ u32 delay_time, delay;
+ int loop;
+
+ /* Reset the usb debug port */
+ portsc = readl(&ehci_regs->port_status[port - 1]);
+ portsc &= ~PORT_PE;
+ portsc |= PORT_RESET;
+ writel(portsc, &ehci_regs->port_status[port - 1]);
+
+ delay = HUB_ROOT_RESET_TIME;
+ for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
+ delay_time += delay) {
+ dbgp_mdelay(delay);
+
+ portsc = readl(&ehci_regs->port_status[port - 1]);
+ if (portsc & PORT_RESET) {
+ /* force reset to complete */
+ loop = 2;
+ writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
+ &ehci_regs->port_status[port - 1]);
+ do {
+ portsc = readl(&ehci_regs->port_status[port-1]);
+ } while ((portsc & PORT_RESET) && (--loop > 0));
+ }
+
+ /* Device went away? */
+ if (!(portsc & PORT_CONNECT))
+ return -ENOTCONN;
+
+		/* bomb out completely if something weird happened */
+ if ((portsc & PORT_CSC))
+ return -EINVAL;
+
+ /* If we've finished resetting, then break out of the loop */
+ if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
+ return 0;
+ }
+ return -EBUSY;
+}
+
+static int ehci_wait_for_port(int port)
+{
+ u32 status;
+ int ret, reps;
+
+ for (reps = 0; reps < 3; reps++) {
+ dbgp_mdelay(100);
+ status = readl(&ehci_regs->status);
+ if (status & STS_PCD) {
+ ret = ehci_reset_port(port);
+ if (ret == 0)
+ return 0;
+ }
+ }
+ return -ENOTCONN;
+}
+
+#ifdef DBGP_DEBUG
+# define dbgp_printk early_printk
+#else
+static inline void dbgp_printk(const char *fmt, ...) { }
+#endif
+
+typedef void (*set_debug_port_t)(int port);
+
+static void default_set_debug_port(int port)
+{
+}
+
+static set_debug_port_t set_debug_port = default_set_debug_port;
+
+static void nvidia_set_debug_port(int port)
+{
+ u32 dword;
+ dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
+ 0x74);
+ dword &= ~(0x0f<<12);
+ dword |= ((port & 0x0f)<<12);
+ write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
+ dword);
+ dbgp_printk("set debug port to %d\n", port);
+}
+
+static void __init detect_set_debug_port(void)
+{
+ u32 vendorid;
+
+ vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
+ 0x00);
+
+ if ((vendorid & 0xffff) == 0x10de) {
+ dbgp_printk("using nvidia set_debug_port\n");
+ set_debug_port = nvidia_set_debug_port;
+ }
+}
+
+static int __init ehci_setup(void)
+{
+ struct usb_debug_descriptor dbgp_desc;
+ u32 cmd, ctrl, status, portsc, hcs_params;
+ u32 debug_port, new_debug_port = 0, n_ports;
+ u32 devnum;
+ int ret, i;
+ int loop;
+ int port_map_tried;
+ int playtimes = 3;
+
+try_next_time:
+ port_map_tried = 0;
+
+try_next_port:
+
+ hcs_params = readl(&ehci_caps->hcs_params);
+ debug_port = HCS_DEBUG_PORT(hcs_params);
+ n_ports = HCS_N_PORTS(hcs_params);
+
+ dbgp_printk("debug_port: %d\n", debug_port);
+ dbgp_printk("n_ports: %d\n", n_ports);
+
+ for (i = 1; i <= n_ports; i++) {
+ portsc = readl(&ehci_regs->port_status[i-1]);
+ dbgp_printk("portstatus%d: %08x\n", i, portsc);
+ }
+
+ if (port_map_tried && (new_debug_port != debug_port)) {
+ if (--playtimes) {
+ set_debug_port(new_debug_port);
+ goto try_next_time;
+ }
+ return -1;
+ }
+
+ loop = 10;
+ /* Reset the EHCI controller */
+ cmd = readl(&ehci_regs->command);
+ cmd |= CMD_RESET;
+ writel(cmd, &ehci_regs->command);
+ do {
+ cmd = readl(&ehci_regs->command);
+ } while ((cmd & CMD_RESET) && (--loop > 0));
+
+ if (!loop) {
+ dbgp_printk("can not reset ehci\n");
+ return -1;
+ }
+ dbgp_printk("ehci reset done\n");
+
+ /* Claim ownership, but do not enable yet */
+ ctrl = readl(&ehci_debug->control);
+ ctrl |= DBGP_OWNER;
+ ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
+ writel(ctrl, &ehci_debug->control);
+
+ /* Start the ehci running */
+ cmd = readl(&ehci_regs->command);
+ cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
+ cmd |= CMD_RUN;
+ writel(cmd, &ehci_regs->command);
+
+ /* Ensure everything is routed to the EHCI */
+ writel(FLAG_CF, &ehci_regs->configured_flag);
+
+ /* Wait until the controller is no longer halted */
+ loop = 10;
+ do {
+ status = readl(&ehci_regs->status);
+ } while ((status & STS_HALT) && (--loop > 0));
+
+ if (!loop) {
+		dbgp_printk("ehci can not be started\n");
+ return -1;
+ }
+ dbgp_printk("ehci started\n");
+
+ /* Wait for a device to show up in the debug port */
+ ret = ehci_wait_for_port(debug_port);
+ if (ret < 0) {
+ dbgp_printk("No device found in debug port\n");
+ goto next_debug_port;
+ }
+ dbgp_printk("ehci wait for port done\n");
+
+ /* Enable the debug port */
+ ctrl = readl(&ehci_debug->control);
+ ctrl |= DBGP_CLAIM;
+ writel(ctrl, &ehci_debug->control);
+ ctrl = readl(&ehci_debug->control);
+ if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
+ dbgp_printk("No device in debug port\n");
+ writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
+ goto err;
+ }
+	dbgp_printk("debug port enabled\n");
+
+ /* Completely transfer the debug device to the debug controller */
+ portsc = readl(&ehci_regs->port_status[debug_port - 1]);
+ portsc &= ~PORT_PE;
+ writel(portsc, &ehci_regs->port_status[debug_port - 1]);
+
+ dbgp_mdelay(100);
+
+ /* Find the debug device and make it device number 127 */
+ for (devnum = 0; devnum <= 127; devnum++) {
+ ret = dbgp_control_msg(devnum,
+ USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+ USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
+ &dbgp_desc, sizeof(dbgp_desc));
+ if (ret > 0)
+ break;
+ }
+ if (devnum > 127) {
+ dbgp_printk("Could not find attached debug device\n");
+ goto err;
+ }
+ if (ret < 0) {
+ dbgp_printk("Attached device is not a debug device\n");
+ goto err;
+ }
+ dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
+
+ /* Move the device to 127 if it isn't already there */
+ if (devnum != USB_DEBUG_DEVNUM) {
+ ret = dbgp_control_msg(devnum,
+ USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+ USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
+ if (ret < 0) {
+ dbgp_printk("Could not move attached device to %d\n",
+ USB_DEBUG_DEVNUM);
+ goto err;
+ }
+ devnum = USB_DEBUG_DEVNUM;
+		dbgp_printk("debug device address set to 127\n");
+ }
+
+ /* Enable the debug interface */
+ ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
+ USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+ USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
+ if (ret < 0) {
+		dbgp_printk("Could not enable the debug device\n");
+ goto err;
+ }
+ dbgp_printk("debug interface enabled\n");
+
+	/*
+	 * Perform a small write to get the even/odd data state in sync.
+	 */
+ ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
+ if (ret < 0) {
+ dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
+ goto err;
+ }
+	dbgp_printk("small write done\n");
+
+ return 0;
+err:
+ /* Things didn't work so remove my claim */
+ ctrl = readl(&ehci_debug->control);
+ ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
+ writel(ctrl, &ehci_debug->control);
+ return -1;
+
+next_debug_port:
+ port_map_tried |= (1<<(debug_port - 1));
+ new_debug_port = (debug_port % n_ports) + 1;
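+ /*
+ * Example (hypothetical values): with n_ports == 4 and debug_port == 2,
+ * the rotation above yields ports 3, 4, 1 on successive failures; once
+ * every port is marked in port_map_tried, the whole scan is retried
+ * until playtimes is exhausted.
+ */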
+ if (port_map_tried != ((1<<n_ports) - 1)) {
+ set_debug_port(new_debug_port);
+ goto try_next_port;
+ }
+ if (--playtimes) {
+ set_debug_port(new_debug_port);
+ goto try_next_time;
+ }
+
+ return -1;
+}
+
+static int __init early_dbgp_init(char *s)
+{
+ u32 debug_port, bar, offset;
+ u32 bus, slot, func, cap;
+ void __iomem *ehci_bar;
+ u32 dbgp_num;
+ u32 bar_val;
+ char *e;
+ int ret;
+ u8 byte;
+
+ if (!early_pci_allowed())
+ return -1;
+
+ dbgp_num = 0;
+ if (*s)
+ dbgp_num = simple_strtoul(s, &e, 10);
+ dbgp_printk("dbgp_num: %d\n", dbgp_num);
+
+ cap = find_dbgp(dbgp_num, &bus, &slot, &func);
+ if (!cap)
+ return -1;
+
+ dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
+ func);
+
+ debug_port = read_pci_config(bus, slot, func, cap);
+ bar = (debug_port >> 29) & 0x7;
+ bar = (bar * 4) + 0xc;
+ offset = (debug_port >> 16) & 0xfff;
+ dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
+ if (bar != PCI_BASE_ADDRESS_0) {
+ dbgp_printk("only debug ports on bar 1 handled.\n");
+
+ return -1;
+ }
+
+ bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+ dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
+ if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
+ dbgp_printk("only simple 32bit mmio bars supported\n");
+
+ return -1;
+ }
+
+ /* double check if the mem space is enabled */
+ byte = read_pci_config_byte(bus, slot, func, 0x04);
+ if (!(byte & 0x2)) {
+ byte |= 0x02;
+ write_pci_config_byte(bus, slot, func, 0x04, byte);
+ dbgp_printk("mmio for ehci enabled\n");
+ }
+
+ /*
+ * FIXME I don't have the bar size, so just guess that PAGE_SIZE is more
+ * than enough. 1K is the biggest I have seen.
+ */
+ set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
+ ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
+ ehci_bar += bar_val & ~PAGE_MASK;
+ dbgp_printk("ehci_bar: %p\n", ehci_bar);
+
+ ehci_caps = ehci_bar;
+ ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
+ ehci_debug = ehci_bar + offset;
+ ehci_dev.bus = bus;
+ ehci_dev.slot = slot;
+ ehci_dev.func = func;
+
+ detect_set_debug_port();
+
+ ret = ehci_setup();
+ if (ret < 0) {
+ dbgp_printk("ehci_setup failed\n");
+ ehci_debug = NULL;
+
+ return -1;
+ }
+
+ return 0;
+}
+
+static void early_dbgp_write(struct console *con, const char *str, u32 n)
+{
+ int chunk, ret;
+
+ if (!ehci_debug)
+ return;
+ while (n > 0) {
+ chunk = n;
+ if (chunk > DBGP_MAX_PACKET)
+ chunk = DBGP_MAX_PACKET;
+ ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
+ dbgp_endpoint_out, str, chunk);
+ str += chunk;
+ n -= chunk;
+ }
+}
+
+static struct console early_dbgp_console = {
+ .name = "earlydbg",
+ .write = early_dbgp_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
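+
+/*
+ * Usage sketch (not part of this patch): with CONFIG_EARLY_PRINTK_DBGP
+ * enabled, boot with "earlyprintk=dbgp" -- or "earlyprintk=dbgpN" to
+ * select the Nth debug-capable EHCI controller; setup_early_printk()
+ * below hands the digits after "dbgp" to early_dbgp_init().
+ */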
+#endif
+
/* Console interface to a host file on AMD's SimNow! */
static int simnow_fd;
@@ -165,6 +889,7 @@ enum {
static noinline long simnow(long cmd, long a, long b, long c)
{
long ret;
+
asm volatile("cpuid" :
"=a" (ret) :
"b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
static void __init simnow_init(char *str)
{
char *fn = "klog";
+
if (*str == '=')
fn = ++str;
/* error ignored */
@@ -194,7 +920,7 @@ static struct console simnow_console = {
/* Direct interface for emergencies */
static struct console *early_console = &early_vga_console;
-static int early_console_initialized;
+static int __initdata early_console_initialized;
asmlinkage void early_printk(const char *fmt, ...)
{
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...)
va_end(ap);
}
-static int __initdata keep_early;
static int __init setup_early_printk(char *buf)
{
+ int keep_early;
+
if (!buf)
return 0;
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf)
return 0;
early_console_initialized = 1;
- if (strstr(buf, "keep"))
- keep_early = 1;
+ keep_early = (strstr(buf, "keep") != NULL);
if (!strncmp(buf, "serial", 6)) {
early_serial_init(buf + 6);
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf)
simnow_init(buf + 6);
early_console = &simnow_console;
keep_early = 1;
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+ } else if (!strncmp(buf, "dbgp", 4)) {
+ if (early_dbgp_init(buf+4) < 0)
+ return 0;
+ early_console = &early_dbgp_console;
+ /*
+ * the USB subsystem will reset the EHCI controller later, so
+ * don't keep this early console around
+ */
+ keep_early = 0;
+#endif
#ifdef CONFIG_HVC_XEN
} else if (!strncmp(buf, "xen", 3)) {
early_console = &xenboot_console;
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf)
register_console(early_console);
return 0;
}
+
early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b..1119d247fe1 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -367,6 +367,10 @@ void __init efi_init(void)
efi.smbios = config_tables[i].table;
printk(" SMBIOS=0x%lx ", config_tables[i].table);
} else if (!efi_guidcmp(config_tables[i].guid,
+ UV_SYSTEM_TABLE_GUID)) {
+ efi.uv_systab = config_tables[i].table;
+ printk(" UVsystab=0x%lx ", config_tables[i].table);
+ } else if (!efi_guidcmp(config_tables[i].guid,
HCDP_TABLE_GUID)) {
efi.hcdp = config_tables[i].table;
printk(" HCDP=0x%lx ", config_tables[i].table);
@@ -414,9 +418,11 @@ void __init efi_init(void)
if (memmap.map == NULL)
printk(KERN_ERR "Could not map the EFI memory map!\n");
memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+
if (memmap.desc_size != sizeof(efi_memory_desc_t))
- printk(KERN_WARNING "Kernel-defined memdesc"
- "doesn't match the one from EFI!\n");
+ printk(KERN_WARNING
+ "Kernel-defined memdesc doesn't match the one from EFI!\n");
+
if (add_efi_memmap)
do_add_efi_memmap();
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 109792bc7cf..dd65143941a 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -629,7 +629,7 @@ ENTRY(interrupt)
ENTRY(irq_entries_start)
RING0_INT_FRAME
vector=0
-.rept NR_IRQS
+.rept NR_VECTORS
ALIGN
.if vector
CFI_ADJUST_CFA_OFFSET -4
@@ -730,6 +730,7 @@ error_code:
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
+ TRACE_IRQS_OFF
movl %esp,%eax # pt_regs pointer
call *%edi
jmp ret_from_exception
@@ -760,20 +761,9 @@ ENTRY(device_not_available)
RING0_INT_FRAME
pushl $-1 # mark this as an int
CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- GET_CR0_INTO_EAX
- testl $0x4, %eax # EM (math emulation bit)
- jne device_not_available_emulate
- preempt_stop(CLBR_ANY)
- call math_state_restore
- jmp ret_from_exception
-device_not_available_emulate:
- pushl $0 # temporary storage for ORIG_EIP
+ pushl $do_device_not_available
CFI_ADJUST_CFA_OFFSET 4
- call math_emulate
- addl $4, %esp
- CFI_ADJUST_CFA_OFFSET -4
- jmp ret_from_exception
+ jmp error_code
CFI_ENDPROC
END(device_not_available)
@@ -814,6 +804,7 @@ debug_stack_correct:
pushl $-1 # mark this as an int
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
+ TRACE_IRQS_OFF
xorl %edx,%edx # error code 0
movl %esp,%eax # pt_regs pointer
call do_debug
@@ -858,6 +849,7 @@ nmi_stack_correct:
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
+ TRACE_IRQS_OFF
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_nmi
@@ -898,6 +890,7 @@ nmi_espfix_stack:
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
+ TRACE_IRQS_OFF
FIXUP_ESPFIX_STACK # %eax == %esp
xorl %edx,%edx # zero error code
call do_nmi
@@ -928,6 +921,7 @@ KPROBE_ENTRY(int3)
pushl $-1 # mark this as an int
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
+ TRACE_IRQS_OFF
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_int3
@@ -1159,20 +1153,6 @@ ENDPROC(xen_failsafe_callback)
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- subl $MCOUNT_INSN_SIZE, %eax
-
-.globl mcount_call
-mcount_call:
- call ftrace_stub
-
- popl %edx
- popl %ecx
- popl %eax
-
ret
END(mcount)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 89434d43960..09e7145484c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -64,32 +64,6 @@
#ifdef CONFIG_FTRACE
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
-
- subq $0x38, %rsp
- movq %rax, (%rsp)
- movq %rcx, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 24(%rsp)
- movq %rdi, 32(%rsp)
- movq %r8, 40(%rsp)
- movq %r9, 48(%rsp)
-
- movq 0x38(%rsp), %rdi
- subq $MCOUNT_INSN_SIZE, %rdi
-
-.globl mcount_call
-mcount_call:
- call ftrace_stub
-
- movq 48(%rsp), %r9
- movq 40(%rsp), %r8
- movq 32(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rdx
- movq 8(%rsp), %rcx
- movq (%rsp), %rax
- addq $0x38, %rsp
-
retq
END(mcount)
@@ -275,9 +249,9 @@ ENTRY(native_usergs_sysret64)
ENTRY(ret_from_fork)
CFI_DEFAULT_STACK
push kernel_eflags(%rip)
- CFI_ADJUST_CFA_OFFSET 4
+ CFI_ADJUST_CFA_OFFSET 8
popf # reset kernel eflags
- CFI_ADJUST_CFA_OFFSET -4
+ CFI_ADJUST_CFA_OFFSET -8
call schedule_tail
GET_THREAD_INFO(%rcx)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
@@ -667,6 +641,13 @@ END(stub_rt_sigreturn)
SAVE_ARGS
leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
pushq %rbp
+ /*
+ * Save rbp twice: one to mark the stack frame, as usual, and one to
+ * fill pt_regs properly. This is needed because bx, not bp, comes
+ * right before the last saved register in that structure; if the
+ * base pointer were where bx is today, this would not be needed.
+ */
+ movq %rbp, -8(%rsp)
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rbp, 0
movq %rsp,%rbp
@@ -932,6 +913,9 @@ END(spurious_interrupt)
.if \ist
movq %gs:pda_data_offset, %rbp
.endif
+ .if \irqtrace
+ TRACE_IRQS_OFF
+ .endif
movq %rsp,%rdi
movq ORIG_RAX(%rsp),%rsi
movq $-1,ORIG_RAX(%rsp)
@@ -1058,7 +1042,8 @@ KPROBE_ENTRY(error_entry)
je error_kernelspace
error_swapgs:
SWAPGS
-error_sti:
+error_sti:
+ TRACE_IRQS_OFF
movq %rdi,RDI(%rsp)
CFI_REL_OFFSET rdi,RDI
movq %rsp,%rdi
@@ -1232,7 +1217,7 @@ ENTRY(simd_coprocessor_error)
END(simd_coprocessor_error)
ENTRY(device_not_available)
- zeroentry math_state_restore
+ zeroentry do_device_not_available
END(device_not_available)
/* runs on exception stack */
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/kernel/es7000_32.c
index 50189af14b8..f454c78fcef 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -39,10 +39,94 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/apicdef.h>
-#include "es7000.h"
#include <mach_mpparse.h>
/*
+ * ES7000 chipsets
+ */
+
+#define NON_UNISYS 0
+#define ES7000_CLASSIC 1
+#define ES7000_ZORRO 2
+
+
+#define MIP_REG 1
+#define MIP_PSAI_REG 4
+
+#define MIP_BUSY 1
+#define MIP_SPIN 0xf0000
+#define MIP_VALID 0x0100000000000000ULL
+#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
+
+#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
+
+struct mip_reg_info {
+ unsigned long long mip_info;
+ unsigned long long delivery_info;
+ unsigned long long host_reg;
+ unsigned long long mip_reg;
+};
+
+struct part_info {
+ unsigned char type;
+ unsigned char length;
+ unsigned char part_id;
+ unsigned char apic_mode;
+ unsigned long snum;
+ char ptype[16];
+ char sname[64];
+ char pname[64];
+};
+
+struct psai {
+ unsigned long long entry_type;
+ unsigned long long addr;
+ unsigned long long bep_addr;
+};
+
+struct es7000_mem_info {
+ unsigned char type;
+ unsigned char length;
+ unsigned char resv[6];
+ unsigned long long start;
+ unsigned long long size;
+};
+
+struct es7000_oem_table {
+ unsigned long long hdr;
+ struct mip_reg_info mip;
+ struct part_info pif;
+ struct es7000_mem_info shm;
+ struct psai psai;
+};
+
+#ifdef CONFIG_ACPI
+
+struct oem_table {
+ struct acpi_table_header Header;
+ u32 OEMTableAddr;
+ u32 OEMTableSize;
+};
+
+extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
+extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
+#endif
+
+struct mip_reg {
+ unsigned long long off_0;
+ unsigned long long off_8;
+ unsigned long long off_10;
+ unsigned long long off_18;
+ unsigned long long off_20;
+ unsigned long long off_28;
+ unsigned long long off_30;
+ unsigned long long off_38;
+};
+
+#define MIP_SW_APIC 0x1020b
+#define MIP_FUNC(VALUE) (VALUE & 0xff)
+
+/*
* ES7000 Globals
*/
@@ -72,7 +156,7 @@ es7000_rename_gsi(int ioapic, int gsi)
base += nr_ioapic_registers[i];
}
- if (!ioapic && (gsi < 16))
+ if (!ioapic && (gsi < 16))
gsi += base;
return gsi;
}
@@ -160,21 +244,38 @@ parse_unisys_oem (char *oemptr)
}
#ifdef CONFIG_ACPI
-int __init
-find_unisys_acpi_oem_table(unsigned long *oem_addr)
+static unsigned long oem_addrX;
+static unsigned long oem_size;
+int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
{
struct acpi_table_header *header = NULL;
int i = 0;
- while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) {
+ acpi_size tbl_size;
+
+ while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) {
if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
struct oem_table *t = (struct oem_table *)header;
- *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr,
- t->OEMTableSize);
+
+ oem_addrX = t->OEMTableAddr;
+ oem_size = t->OEMTableSize;
+ early_acpi_os_unmap_memory(header, tbl_size);
+
+ *oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
+ oem_size);
return 0;
}
+ early_acpi_os_unmap_memory(header, tbl_size);
}
return -1;
}
+
+void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
+{
+ if (!oem_addr)
+ return;
+
+ __acpi_unmap_table((char *)oem_addr, oem_size);
+}
#endif
static void
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ab115cd15fd..d073d981a73 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -11,17 +11,18 @@
#include <linux/spinlock.h>
#include <linux/hardirq.h>
+#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/list.h>
-#include <asm/alternative.h>
#include <asm/ftrace.h>
+#include <asm/nops.h>
/* Long is fine, even if it is only 4 bytes ;-) */
-static long *ftrace_nop;
+static unsigned long *ftrace_nop;
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
@@ -60,11 +61,7 @@ notrace int
ftrace_modify_code(unsigned long ip, unsigned char *old_code,
unsigned char *new_code)
{
- unsigned replaced;
- unsigned old = *(unsigned *)old_code; /* 4 bytes */
- unsigned new = *(unsigned *)new_code; /* 4 bytes */
- unsigned char newch = new_code[4];
- int faulted = 0;
+ unsigned char replaced[MCOUNT_INSN_SIZE];
/*
* Note: Due to modules and __init, code can
@@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
* as well as code changing.
*
* No real locking needed, this code is run through
- * kstop_machine.
+ * kstop_machine, or before SMP starts.
*/
- asm volatile (
- "1: lock\n"
- " cmpxchg %3, (%2)\n"
- " jnz 2f\n"
- " movb %b4, 4(%2)\n"
- "2:\n"
- ".section .fixup, \"ax\"\n"
- "3: movl $1, %0\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : "=r"(faulted), "=a"(replaced)
- : "r"(ip), "r"(new), "c"(newch),
- "0"(faulted), "a"(old)
- : "memory");
- sync_core();
+ if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE))
+ return 1;
+
+ if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ return 2;
- if (replaced != old && replaced != new)
- faulted = 2;
+ WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code,
+ MCOUNT_INSN_SIZE));
- return faulted;
+ sync_core();
+
+ return 0;
}
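+
+/*
+ * Sketch of the flow above (illustrative, not from the patch): to nop
+ * out a call site at "ip", the caller passes old_code pointing at the
+ * expected 5-byte call and new_code pointing at the replacement; the
+ * probe read / compare / write sequence fails gracefully (with a
+ * nonzero return) instead of oopsing when "ip" sits in already-freed
+ * __init or module text.
+ */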
notrace int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -112,30 +100,76 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
notrace int ftrace_mcount_set(unsigned long *data)
{
- unsigned long ip = (long)(&mcount_call);
- unsigned long *addr = data;
- unsigned char old[MCOUNT_INSN_SIZE], *new;
-
- /*
- * Replace the mcount stub with a pointer to the
- * ip recorder function.
- */
- memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
- new = ftrace_call_replace(ip, *addr);
- *addr = ftrace_modify_code(ip, old, new);
-
+ /* mcount is initialized as a nop */
+ *data = 0;
return 0;
}
int __init ftrace_dyn_arch_init(void *data)
{
- const unsigned char *const *noptable = find_nop_table();
-
- /* This is running in kstop_machine */
-
- ftrace_mcount_set(data);
+ extern const unsigned char ftrace_test_p6nop[];
+ extern const unsigned char ftrace_test_nop5[];
+ extern const unsigned char ftrace_test_jmp[];
+ int faulted = 0;
- ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
+ /*
+ * There is no single good nop for all x86 archs.
+ * We default to using the P6_NOP5, but first we
+ * test to make sure that this nop actually
+ * works on this CPU. If it faults, we fall back
+ * to a less efficient 5-byte nop. If that also fails,
+ * we then just use a jmp as our nop. This isn't the most
+ * efficient nop, but we cannot use a multi-part nop,
+ * since we would then risk being preempted in the middle
+ * of that nop, and if tracing were enabled at that point,
+ * it might cause a system crash.
+ *
+ * TODO: check the cpuid to determine the best nop.
+ */
+ asm volatile (
+ "jmp ftrace_test_jmp\n"
+ /* This code needs to stay around */
+ ".section .text, \"ax\"\n"
+ "ftrace_test_jmp:"
+ "jmp ftrace_test_p6nop\n"
+ "nop\n"
+ "nop\n"
+ "nop\n" /* 2 byte jmp + 3 bytes */
+ "ftrace_test_p6nop:"
+ P6_NOP5
+ "jmp 1f\n"
+ "ftrace_test_nop5:"
+ ".byte 0x66,0x66,0x66,0x66,0x90\n"
+ "jmp 1f\n"
+ ".previous\n"
+ "1:"
+ ".section .fixup, \"ax\"\n"
+ "2: movl $1, %0\n"
+ " jmp ftrace_test_nop5\n"
+ "3: movl $2, %0\n"
+ " jmp 1b\n"
+ ".previous\n"
+ _ASM_EXTABLE(ftrace_test_p6nop, 2b)
+ _ASM_EXTABLE(ftrace_test_nop5, 3b)
+ : "=r"(faulted) : "0" (faulted));
+
+ switch (faulted) {
+ case 0:
+ pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
+ ftrace_nop = (unsigned long *)ftrace_test_p6nop;
+ break;
+ case 1:
+ pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
+ ftrace_nop = (unsigned long *)ftrace_test_nop5;
+ break;
+ case 2:
+ pr_info("ftrace: converting mcount calls to jmp . + 5\n");
+ ftrace_nop = (unsigned long *)ftrace_test_jmp;
+ break;
+ }
+
+ /* The return code is returned via data */
+ *(unsigned long *)data = 0;
return 0;
}
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index eaff0bbb144..6c9bfc9e1e9 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -16,87 +16,63 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/hardirq.h>
+#include <linux/dmar.h>
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
-
-DEFINE_PER_CPU(int, x2apic_extra_bits);
+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+extern struct genapic apic_x2xpic_uv_x;
+extern struct genapic apic_x2apic_phys;
+extern struct genapic apic_x2apic_cluster;
struct genapic __read_mostly *genapic = &apic_flat;
-static enum uv_system_type uv_system_type;
+static struct genapic *apic_probe[] __initdata = {
+ &apic_x2apic_uv_x,
+ &apic_x2apic_phys,
+ &apic_x2apic_cluster,
+ &apic_physflat,
+ NULL,
+};
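+
+/*
+ * Note on ordering (illustrative): acpi_madt_oem_check() below walks
+ * this array in order, so UV claims the system before the generic
+ * x2apic modes, physflat is the last explicit probe, and apic_flat
+ * stays the default when no entry matches.
+ */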
/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
*/
void __init setup_apic_routing(void)
{
- if (uv_system_type == UV_NON_UNIQUE_APIC)
- genapic = &apic_x2apic_uv_x;
- else
-#ifdef CONFIG_ACPI
- /*
- * Quirk: some x86_64 machines can only use physical APIC mode
- * regardless of how many processors are present (x86_64 ES7000
- * is an example).
- */
- if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
- genapic = &apic_physflat;
- else
-#endif
-
- if (max_physical_apicid < 8)
- genapic = &apic_flat;
- else
- genapic = &apic_physflat;
+ if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
+ if (!intr_remapping_enabled)
+ genapic = &apic_flat;
+ }
- printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+ if (genapic == &apic_flat) {
+ if (max_physical_apicid >= 8)
+ genapic = &apic_physflat;
+ printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+ }
}
/* Same for both flat and physical. */
-void send_IPI_self(int vector)
+void apic_send_IPI_self(int vector)
{
__send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
}
int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- if (!strcmp(oem_id, "SGI")) {
- if (!strcmp(oem_table_id, "UVL"))
- uv_system_type = UV_LEGACY_APIC;
- else if (!strcmp(oem_table_id, "UVX"))
- uv_system_type = UV_X2APIC;
- else if (!strcmp(oem_table_id, "UVH"))
- uv_system_type = UV_NON_UNIQUE_APIC;
+ int i;
+
+ for (i = 0; apic_probe[i]; ++i) {
+ if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
+ genapic = apic_probe[i];
+ printk(KERN_INFO "Setting APIC routing to %s.\n",
+ genapic->name);
+ return 1;
+ }
}
return 0;
}
-
-unsigned int read_apic_id(void)
-{
- unsigned int id;
-
- WARN_ON(preemptible() && num_online_cpus() > 1);
- id = apic_read(APIC_ID);
- if (uv_system_type >= UV_X2APIC)
- id |= __get_cpu_var(x2apic_extra_bits);
- return id;
-}
-
-enum uv_system_type get_uv_system_type(void)
-{
- return uv_system_type;
-}
-
-int is_uv_system(void)
-{
- return uv_system_type != UV_NONE;
-}
-EXPORT_SYMBOL_GPL(is_uv_system);
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 786548a62d3..c0262791bda 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -15,9 +15,20 @@
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/init.h>
+#include <linux/hardirq.h>
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
+#include <mach_apicdef.h>
+
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+
+static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return 1;
+}
static cpumask_t flat_target_cpus(void)
{
@@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector)
__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
}
+static unsigned int get_apic_id(unsigned long x)
+{
+ unsigned int id;
+
+ id = (((x)>>24) & 0xFFu);
+ return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+ unsigned long x;
+
+ x = ((id & 0xFFu)<<24);
+ return x;
+}
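+
+/*
+ * Example (hypothetical value): an APIC_ID register reading of
+ * 0x05000000 gives get_apic_id() == 5, and set_apic_id(5) rebuilds
+ * 0x05000000 -- the same 0xFF << 24 field that apic_id_mask describes.
+ */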
+
+static unsigned int read_xapic_id(void)
+{
+ unsigned int id;
+
+ id = get_apic_id(apic_read(APIC_ID));
+ return id;
+}
+
static int flat_apic_id_registered(void)
{
- return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map);
+ return physid_isset(read_xapic_id(), phys_cpu_present_map);
}
static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
@@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb)
struct genapic apic_flat = {
.name = "flat",
+ .acpi_madt_oem_check = flat_acpi_madt_oem_check,
.int_delivery_mode = dest_LowestPrio,
.int_dest_mode = (APIC_DEST_LOGICAL != 0),
.target_cpus = flat_target_cpus,
@@ -121,8 +157,12 @@ struct genapic apic_flat = {
.send_IPI_all = flat_send_IPI_all,
.send_IPI_allbutself = flat_send_IPI_allbutself,
.send_IPI_mask = flat_send_IPI_mask,
+ .send_IPI_self = apic_send_IPI_self,
.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.phys_pkg_id = phys_pkg_id,
+ .get_apic_id = get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = (0xFFu<<24),
};
/*
@@ -130,6 +170,23 @@ struct genapic apic_flat = {
* We cannot use logical delivery in this case because the mask
* overflows, so use physical mode.
*/
+static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+#ifdef CONFIG_ACPI
+ /*
+ * Quirk: some x86_64 machines can only use physical APIC mode
+ * regardless of how many processors are present (x86_64 ES7000
+ * is an example).
+ */
+ if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+ printk(KERN_DEBUG "system APIC only can use physical flat");
+ return 1;
+ }
+#endif
+
+ return 0;
+}
static cpumask_t physflat_target_cpus(void)
{
@@ -176,6 +233,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
struct genapic apic_physflat = {
.name = "physical flat",
+ .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
.int_delivery_mode = dest_Fixed,
.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
.target_cpus = physflat_target_cpus,
@@ -185,6 +243,10 @@ struct genapic apic_physflat = {
.send_IPI_all = physflat_send_IPI_all,
.send_IPI_allbutself = physflat_send_IPI_allbutself,
.send_IPI_mask = physflat_send_IPI_mask,
+ .send_IPI_self = apic_send_IPI_self,
.cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
.phys_pkg_id = phys_pkg_id,
+ .get_apic_id = get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = (0xFFu<<24),
};
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
new file mode 100644
index 00000000000..f6a2c8eb48a
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -0,0 +1,159 @@
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include <asm/genapic.h>
+
+DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (cpu_has_x2apic)
+ return 1;
+
+ return 0;
+}
+
+/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
+
+static cpumask_t x2apic_target_cpus(void)
+{
+ return cpumask_of_cpu(0);
+}
+
+/*
+ * for now each logical cpu is in its own vector allocation domain.
+ */
+static cpumask_t x2apic_vector_allocation_domain(int cpu)
+{
+ cpumask_t domain = CPU_MASK_NONE;
+ cpu_set(cpu, domain);
+ return domain;
+}
+
+static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
+ unsigned int dest)
+{
+ unsigned long cfg;
+
+ cfg = __prepare_ICR(0, vector, dest);
+
+ /*
+ * send the IPI.
+ */
+ x2apic_icr_write(cfg, apicid);
+}
+
+/*
+ * For now, we send the IPIs one by one in the cpumask.
+ * TBD: based on the cpu mask, we could send the IPIs to a cluster group
+ * at once. We have 16 cpus in a cluster. This would minimize IPI register
+ * writes.
+ */
+static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+{
+ unsigned long flags;
+ unsigned long query_cpu;
+
+ local_irq_save(flags);
+ for_each_cpu_mask(query_cpu, mask) {
+ __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, APIC_DEST_LOGICAL);
+ }
+ local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ x2apic_send_IPI_mask(mask, vector);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+ x2apic_send_IPI_mask(cpu_online_map, vector);
+}
+
+static int x2apic_apic_id_registered(void)
+{
+ return 1;
+}
+
+static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = first_cpu(cpumask);
+ if ((unsigned)cpu < NR_CPUS)
+ return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ else
+ return BAD_APICID;
+}
+
+static unsigned int get_apic_id(unsigned long x)
+{
+ unsigned int id;
+
+ id = x;
+ return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+ unsigned long x;
+
+ x = id;
+ return x;
+}
+
+static unsigned int phys_pkg_id(int index_msb)
+{
+ return current_cpu_data.initial_apicid >> index_msb;
+}
+
+static void x2apic_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+static void init_x2apic_ldr(void)
+{
+ int cpu = smp_processor_id();
+
+ per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+ return;
+}
+
+struct genapic apic_x2apic_cluster = {
+ .name = "cluster x2apic",
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .int_delivery_mode = dest_LowestPrio,
+ .int_dest_mode = (APIC_DEST_LOGICAL != 0),
+ .target_cpus = x2apic_target_cpus,
+ .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .apic_id_registered = x2apic_apic_id_registered,
+ .init_apic_ldr = init_x2apic_ldr,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_self = x2apic_send_IPI_self,
+ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .phys_pkg_id = phys_pkg_id,
+ .get_apic_id = get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = (0xFFFFFFFFu),
+};
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
new file mode 100644
index 00000000000..d042211768b
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -0,0 +1,154 @@
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include <asm/genapic.h>
+
+static int x2apic_phys;
+
+static int set_x2apic_phys_mode(char *arg)
+{
+ x2apic_phys = 1;
+ return 0;
+}
+early_param("x2apic_phys", set_x2apic_phys_mode);
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (cpu_has_x2apic && x2apic_phys)
+ return 1;
+
+ return 0;
+}
+
+/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
+
+static cpumask_t x2apic_target_cpus(void)
+{
+ return cpumask_of_cpu(0);
+}
+
+static cpumask_t x2apic_vector_allocation_domain(int cpu)
+{
+ cpumask_t domain = CPU_MASK_NONE;
+ cpu_set(cpu, domain);
+ return domain;
+}
+
+static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
+ unsigned int dest)
+{
+ unsigned long cfg;
+
+ cfg = __prepare_ICR(0, vector, dest);
+
+ /*
+ * send the IPI.
+ */
+ x2apic_icr_write(cfg, apicid);
+}
+
+static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+{
+ unsigned long flags;
+ unsigned long query_cpu;
+
+ local_irq_save(flags);
+ for_each_cpu_mask(query_cpu, mask) {
+ __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
+ vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ x2apic_send_IPI_mask(mask, vector);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+ x2apic_send_IPI_mask(cpu_online_map, vector);
+}
+
+static int x2apic_apic_id_registered(void)
+{
+ return 1;
+}
+
+static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = first_cpu(cpumask);
+ if ((unsigned)cpu < NR_CPUS)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ else
+ return BAD_APICID;
+}
+
+static unsigned int get_apic_id(unsigned long x)
+{
+ unsigned int id;
+
+ id = x;
+ return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+ unsigned long x;
+
+ x = id;
+ return x;
+}
+
+static unsigned int phys_pkg_id(int index_msb)
+{
+ return current_cpu_data.initial_apicid >> index_msb;
+}
+
+void x2apic_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+void init_x2apic_ldr(void)
+{
+ return;
+}
+
+struct genapic apic_x2apic_phys = {
+ .name = "physical x2apic",
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .int_delivery_mode = dest_Fixed,
+ .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
+ .target_cpus = x2apic_target_cpus,
+ .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .apic_id_registered = x2apic_apic_id_registered,
+ .init_apic_ldr = init_x2apic_ldr,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_self = x2apic_send_IPI_self,
+ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .phys_pkg_id = phys_pkg_id,
+ .get_apic_id = get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = (0xFFFFFFFFu),
+};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index bfa837cb16b..680a06557c5 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -12,12 +12,12 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
-#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/bootmem.h>
#include <linux/module.h>
+#include <linux/hardirq.h>
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
@@ -26,6 +26,36 @@
#include <asm/uv/uv_hub.h>
#include <asm/uv/bios.h>
+DEFINE_PER_CPU(int, x2apic_extra_bits);
+
+static enum uv_system_type uv_system_type;
+
+static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strcmp(oem_id, "SGI")) {
+ if (!strcmp(oem_table_id, "UVL"))
+ uv_system_type = UV_LEGACY_APIC;
+ else if (!strcmp(oem_table_id, "UVX"))
+ uv_system_type = UV_X2APIC;
+ else if (!strcmp(oem_table_id, "UVH")) {
+ uv_system_type = UV_NON_UNIQUE_APIC;
+ return 1;
+ }
+ }
+ return 0;
+}
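+
+/*
+ * Note (illustrative): only the "UVH" table id selects this genapic --
+ * the "UVL" and "UVX" cases record the UV system type but return 0, so
+ * the probe continues on to the other drivers.
+ */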
+
+enum uv_system_type get_uv_system_type(void)
+{
+ return uv_system_type;
+}
+
+int is_uv_system(void)
+{
+ return uv_system_type != UV_NONE;
+}
+EXPORT_SYMBOL_GPL(is_uv_system);
+
DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -84,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector)
unsigned long val, apicid, lapicid;
int pnode;
- apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */
+ apicid = per_cpu(x86_cpu_to_apicid, cpu);
lapicid = apicid & 0x3f; /* ZZZ macro needed */
pnode = uv_apicid_to_pnode(apicid);
val =
@@ -123,6 +153,10 @@ static int uv_apic_id_registered(void)
return 1;
}
+static void uv_init_apic_ldr(void)
+{
+}
+
static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
{
int cpu;
@@ -138,31 +172,59 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
return BAD_APICID;
}
+static unsigned int get_apic_id(unsigned long x)
+{
+ unsigned int id;
+
+ WARN_ON(preemptible() && num_online_cpus() > 1);
+ id = x | __get_cpu_var(x2apic_extra_bits);
+
+ return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+ unsigned long x;
+
+ /* mask out x2apic_extra_bits? */
+ x = id;
+ return x;
+}
+
+static unsigned int uv_read_apic_id(void)
+{
+ return get_apic_id(apic_read(APIC_ID));
+}
+
static unsigned int phys_pkg_id(int index_msb)
{
- return GET_APIC_ID(read_apic_id()) >> index_msb;
+ return uv_read_apic_id() >> index_msb;
}
-#ifdef ZZZ /* Needs x2apic patch */
static void uv_send_IPI_self(int vector)
{
apic_write(APIC_SELF_IPI, vector);
}
-#endif
struct genapic apic_x2apic_uv_x = {
.name = "UV large system",
+ .acpi_madt_oem_check = uv_acpi_madt_oem_check,
.int_delivery_mode = dest_Fixed,
.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
.target_cpus = uv_target_cpus,
- .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */
+ .vector_allocation_domain = uv_vector_allocation_domain,
.apic_id_registered = uv_apic_id_registered,
+ .init_apic_ldr = uv_init_apic_ldr,
.send_IPI_all = uv_send_IPI_all,
.send_IPI_allbutself = uv_send_IPI_allbutself,
.send_IPI_mask = uv_send_IPI_mask,
- /* ZZZ.send_IPI_self = uv_send_IPI_self, */
+ .send_IPI_self = uv_send_IPI_self,
.cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
- .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */
+ .phys_pkg_id = phys_pkg_id,
+ .get_apic_id = get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = (0xFFFFFFFFu),
};
static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -222,12 +284,13 @@ static __init void map_low_mmrs(void)
enum map_type {map_wb, map_uc};
-static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int shift,
+ int max_pnode, enum map_type map_type)
{
unsigned long bytes, paddr;
paddr = base << shift;
- bytes = (1UL << shift);
+ bytes = (1UL << shift) * (max_pnode + 1);
printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
paddr + bytes);
if (map_type == map_uc)
@@ -243,7 +306,7 @@ static __init void map_gru_high(int max_pnode)
gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
if (gru.s.enable)
- map_high("GRU", gru.s.base, shift, map_wb);
+ map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
}
static __init void map_config_high(int max_pnode)
@@ -253,7 +316,7 @@ static __init void map_config_high(int max_pnode)
cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
if (cfg.s.enable)
- map_high("CONFIG", cfg.s.base, shift, map_uc);
+ map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
}
static __init void map_mmr_high(int max_pnode)
@@ -263,7 +326,7 @@ static __init void map_mmr_high(int max_pnode)
mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
if (mmr.s.enable)
- map_high("MMR", mmr.s.base, shift, map_uc);
+ map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
}
static __init void map_mmioh_high(int max_pnode)
@@ -273,17 +336,17 @@ static __init void map_mmioh_high(int max_pnode)
mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
if (mmioh.s.enable)
- map_high("MMIOH", mmioh.s.base, shift, map_uc);
+ map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
}
static __init void uv_rtc_init(void)
{
- long status, ticks_per_sec, drift;
+ long status;
+ u64 ticks_per_sec;
- status =
- x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
- &drift);
- if (status != 0 || ticks_per_sec < 100000) {
+ status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
+ &ticks_per_sec);
+ if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
printk(KERN_WARNING
"unable to determine platform RTC clock frequency, "
"guessing.\n");
@@ -293,7 +356,22 @@ static __init void uv_rtc_init(void)
sn_rtc_cycles_per_second = ticks_per_sec;
}
-static bool uv_system_inited;
+/*
+ * Called on each cpu to initialize the per_cpu UV data area.
+ * ZZZ hotplug not supported yet
+ */
+void __cpuinit uv_cpu_init(void)
+{
+ /* CPU 0 initialization will be done via uv_system_init. */
+ if (!uv_blade_info)
+ return;
+
+ uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
+
+ if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
+ set_x2apic_extra_bits(uv_hub_info->pnode);
+}
+
void __init uv_system_init(void)
{
@@ -349,6 +427,9 @@ void __init uv_system_init(void)
gnode_upper = (((unsigned long)node_id.s.node_id) &
~((1 << n_val) - 1)) << m_val;
+ uv_bios_init();
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
+ &uv_coherency_id, &uv_region_size);
uv_rtc_init();
for_each_present_cpu(cpu) {
@@ -370,7 +451,7 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
- uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
+ uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
max_pnode = max(pnode, max_pnode);
@@ -385,19 +466,6 @@ void __init uv_system_init(void)
map_mmr_high(max_pnode);
map_config_high(max_pnode);
map_mmioh_high(max_pnode);
- uv_system_inited = true;
-}
-
-/*
- * Called on each cpu to initialize the per_cpu UV data area.
- * ZZZ hotplug not supported yet
- */
-void __cpuinit uv_cpu_init(void)
-{
- BUG_ON(!uv_system_inited);
- uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
-
- if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
- set_x2apic_extra_bits(uv_hub_info->pnode);
+ uv_cpu_init();
}
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9..1dcb0f13897 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,6 +35,7 @@ void __init reserve_ebda_region(void)
/* start of EBDA area */
ebda_addr = get_bios_ebda();
+ printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
/* Fixup: bios puts an EBDA in the top 64K segment */
/* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9bfc4d72fb2..d16084f9064 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -108,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
}
load_idt((const struct desc_ptr *)&idt_descr);
- early_printk("Kernel alive\n");
+ if (console_loglevel == 10)
+ early_printk("Kernel alive\n");
x86_64_init_pda();
- early_printk("Kernel really alive\n");
-
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index a7010c3a377..e835b4eea70 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
*
* Note that the stack is not yet set up!
*/
-#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
-#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
-#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
-
default_entry:
#ifdef CONFIG_X86_PAE
@@ -196,9 +192,9 @@ default_entry:
movl $pa(pg0), %edi
movl %edi, pa(init_pg_tables_start)
movl $pa(swapper_pg_pmd), %edx
- movl $PTE_ATTR, %eax
+ movl $PTE_IDENT_ATTR, %eax
10:
- leal PDE_ATTR(%edi),%ecx /* Create PMD entry */
+ leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
movl %ecx,(%edx) /* Store PMD entry */
/* Upper half already zero */
addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
* End condition: we must map up to and including INIT_MAP_BEYOND_END
* bytes beyond the end of our own page tables.
*/
- leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
+ leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
cmpl %ebp,%eax
jb 10b
1:
@@ -224,7 +220,7 @@ default_entry:
movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
#else /* Not PAE */
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
movl $pa(pg0), %edi
movl %edi, pa(init_pg_tables_start)
movl $pa(swapper_pg_dir), %edx
- movl $PTE_ATTR, %eax
+ movl $PTE_IDENT_ATTR, %eax
10:
- leal PDE_ATTR(%edi),%ecx /* Create PDE entry */
+ leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
movl %ecx,(%edx) /* Store identity PDE entry */
movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
* bytes beyond the end of our own page tables; the +0x007 is
* the attribute bits
*/
- leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
+ leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
cmpl %ebp,%eax
jb 10b
movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
movl %eax,pa(swapper_pg_dir+0xffc)
#endif
jmp 3f
@@ -634,19 +630,19 @@ ENTRY(empty_zero_page)
/* Page-aligned for the benefit of paravirt? */
.align PAGE_SIZE_asm
ENTRY(swapper_pg_dir)
- .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
- .long pa(swapper_pg_pmd+PGD_ATTR),0
- .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
- .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
# elif KPMDS == 2
.long 0,0
- .long pa(swapper_pg_pmd+PGD_ATTR),0
- .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
# elif KPMDS == 1
.long 0,0
.long 0,0
- .long pa(swapper_pg_pmd+PGD_ATTR),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
# else
# error "Kernel PMDs should be 1, 2 or 3"
# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index db3280afe88..26cfdc1d7c7 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
movq %rdi, %rax
shrq $PMD_SHIFT, %rax
andq $(PTRS_PER_PMD - 1), %rax
- leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+ leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
leaq level2_spare_pgt(%rip), %rbx
movq %rdx, 0(%rbx, %rax, 8)
ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
/* Since I easily can, map the first 1G.
* Don't set NX because code runs from these pages.
*/
- PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
NEXT_PAGE(level2_kernel_pgt)
/*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 73deaffadd0..77017e834cf 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,29 +1,49 @@
#include <linux/clocksource.h>
#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hpet.h>
#include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/cpu.h>
#include <linux/pm.h>
+#include <linux/io.h>
#include <asm/fixmap.h>
-#include <asm/hpet.h>
#include <asm/i8253.h>
-#include <asm/io.h>
+#include <asm/hpet.h>
-#define HPET_MASK CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT 22
+#define HPET_MASK CLOCKSOURCE_MASK(32)
+#define HPET_SHIFT 22
/* FSEC = 10^-15
NSEC = 10^-9 */
-#define FSEC_PER_NSEC 1000000L
+#define FSEC_PER_NSEC 1000000L
+
+#define HPET_DEV_USED_BIT 2
+#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
+#define HPET_DEV_VALID 0x8
+#define HPET_DEV_FSB_CAP 0x1000
+#define HPET_DEV_PERI_CAP 0x2000
+
+#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
/*
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
-unsigned long hpet_address;
-static void __iomem *hpet_virt_address;
+unsigned long hpet_address;
+unsigned long hpet_num_timers;
+static void __iomem *hpet_virt_address;
+
+struct hpet_dev {
+ struct clock_event_device evt;
+ unsigned int num;
+ int cpu;
+ unsigned int irq;
+ unsigned int flags;
+ char name[10];
+};
unsigned long hpet_readl(unsigned long a)
{
@@ -59,7 +79,7 @@ static inline void hpet_clear_mapping(void)
static int boot_hpet_disable;
int hpet_force_user;
-static int __init hpet_setup(char* str)
+static int __init hpet_setup(char *str)
{
if (str) {
if (!strncmp("disable", str, 7))
@@ -80,7 +100,7 @@ __setup("nohpet", disable_hpet);
static inline int is_hpet_capable(void)
{
- return (!boot_hpet_disable && hpet_address);
+ return !boot_hpet_disable && hpet_address;
}
/*
@@ -102,6 +122,9 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled);
* timer 0 and timer 1 in case of RTC emulation.
*/
#ifdef CONFIG_HPET
+
+static void hpet_reserve_msi_timers(struct hpet_data *hd);
+
static void hpet_reserve_platform_timers(unsigned long id)
{
struct hpet __iomem *hpet = hpet_virt_address;
@@ -111,25 +134,31 @@ static void hpet_reserve_platform_timers(unsigned long id)
nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
- memset(&hd, 0, sizeof (hd));
- hd.hd_phys_address = hpet_address;
- hd.hd_address = hpet;
- hd.hd_nirqs = nrtimers;
- hd.hd_flags = HPET_DATA_PLATFORM;
+ memset(&hd, 0, sizeof(hd));
+ hd.hd_phys_address = hpet_address;
+ hd.hd_address = hpet;
+ hd.hd_nirqs = nrtimers;
hpet_reserve_timer(&hd, 0);
#ifdef CONFIG_HPET_EMULATE_RTC
hpet_reserve_timer(&hd, 1);
#endif
+ /*
+ * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
+ * is wrong for i8259!) not the output IRQ. Many BIOS writers
+ * don't bother configuring *any* comparator interrupts.
+ */
hd.hd_irq[0] = HPET_LEGACY_8254;
hd.hd_irq[1] = HPET_LEGACY_RTC;
for (i = 2; i < nrtimers; timer++, i++) {
- hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >>
- Tn_INT_ROUTE_CNF_SHIFT;
+ hd.hd_irq[i] = (readl(&timer->hpet_config) &
+ Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
}
+ hpet_reserve_msi_timers(&hd);
+
hpet_alloc(&hd);
}
@@ -223,60 +252,70 @@ static void hpet_legacy_clockevent_register(void)
printk(KERN_DEBUG "hpet clockevent registered\n");
}
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int hpet_setup_msi_irq(unsigned int irq);
+
+static void hpet_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt, int timer)
{
unsigned long cfg, cmp, now;
uint64_t delta;
- switch(mode) {
+ switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
- delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
- delta >>= hpet_clockevent.shift;
+ delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
+ delta >>= evt->shift;
now = hpet_readl(HPET_COUNTER);
cmp = now + (unsigned long) delta;
- cfg = hpet_readl(HPET_T0_CFG);
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
HPET_TN_SETVAL | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T0_CFG);
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
/*
* The first write after writing TN_SETVAL to the
* config register sets the counter value, the second
* write sets the period.
*/
- hpet_writel(cmp, HPET_T0_CMP);
+ hpet_writel(cmp, HPET_Tn_CMP(timer));
udelay(1);
- hpet_writel((unsigned long) delta, HPET_T0_CMP);
+ hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
break;
case CLOCK_EVT_MODE_ONESHOT:
- cfg = hpet_readl(HPET_T0_CFG);
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T0_CFG);
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
- cfg = hpet_readl(HPET_T0_CFG);
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_T0_CFG);
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
break;
case CLOCK_EVT_MODE_RESUME:
- hpet_enable_legacy_int();
+ if (timer == 0) {
+ hpet_enable_legacy_int();
+ } else {
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ hpet_setup_msi_irq(hdev->irq);
+ disable_irq(hdev->irq);
+ irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+ enable_irq(hdev->irq);
+ }
break;
}
}
-static int hpet_legacy_next_event(unsigned long delta,
- struct clock_event_device *evt)
+static int hpet_next_event(unsigned long delta,
+ struct clock_event_device *evt, int timer)
{
u32 cnt;
cnt = hpet_readl(HPET_COUNTER);
cnt += (u32) delta;
- hpet_writel(cnt, HPET_T0_CMP);
+ hpet_writel(cnt, HPET_Tn_CMP(timer));
/*
* We need to read back the CMP register to make sure that
@@ -288,6 +327,347 @@ static int hpet_legacy_next_event(unsigned long delta,
return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
}
+static void hpet_legacy_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ hpet_set_mode(mode, evt, 0);
+}
+
+static int hpet_legacy_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ return hpet_next_event(delta, evt, 0);
+}
+
+/*
+ * HPET MSI Support
+ */
+#ifdef CONFIG_PCI_MSI
+
+static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
+static struct hpet_dev *hpet_devs;
+
+void hpet_msi_unmask(unsigned int irq)
+{
+ struct hpet_dev *hdev = get_irq_data(irq);
+ unsigned long cfg;
+
+ /* unmask it */
+ cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+ cfg |= HPET_TN_FSB;
+ hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+void hpet_msi_mask(unsigned int irq)
+{
+ unsigned long cfg;
+ struct hpet_dev *hdev = get_irq_data(irq);
+
+ /* mask it */
+ cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+ cfg &= ~HPET_TN_FSB;
+ hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
+{
+ struct hpet_dev *hdev = get_irq_data(irq);
+
+ hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
+ hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
+}
+
+void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
+{
+ struct hpet_dev *hdev = get_irq_data(irq);
+
+ msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
+ msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
+ msg->address_hi = 0;
+}
+
+static void hpet_msi_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ hpet_set_mode(mode, evt, hdev->num);
+}
+
+static int hpet_msi_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ return hpet_next_event(delta, evt, hdev->num);
+}
+
+static int hpet_setup_msi_irq(unsigned int irq)
+{
+ if (arch_setup_hpet_msi(irq)) {
+ destroy_irq(irq);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int hpet_assign_irq(struct hpet_dev *dev)
+{
+ unsigned int irq;
+
+ irq = create_irq();
+ if (!irq)
+ return -EINVAL;
+
+ set_irq_data(irq, dev);
+
+ if (hpet_setup_msi_irq(irq))
+ return -EINVAL;
+
+ dev->irq = irq;
+ return 0;
+}
+
+static irqreturn_t hpet_interrupt_handler(int irq, void *data)
+{
+ struct hpet_dev *dev = (struct hpet_dev *)data;
+ struct clock_event_device *hevt = &dev->evt;
+
+ if (!hevt->event_handler) {
+ printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
+ dev->num);
+ return IRQ_HANDLED;
+ }
+
+ hevt->event_handler(hevt);
+ return IRQ_HANDLED;
+}
+
+static int hpet_setup_irq(struct hpet_dev *dev)
+{
+ if (request_irq(dev->irq, hpet_interrupt_handler,
+ IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev))
+ return -1;
+
+ disable_irq(dev->irq);
+ irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+ enable_irq(dev->irq);
+
+ printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
+ dev->name, dev->irq);
+
+ return 0;
+}
+
+/* This should be called on the specific @cpu */
+static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
+{
+ struct clock_event_device *evt = &hdev->evt;
+ uint64_t hpet_freq;
+
+ WARN_ON(cpu != smp_processor_id());
+ if (!(hdev->flags & HPET_DEV_VALID))
+ return;
+
+ if (hpet_setup_msi_irq(hdev->irq))
+ return;
+
+ hdev->cpu = cpu;
+ per_cpu(cpu_hpet_dev, cpu) = hdev;
+ evt->name = hdev->name;
+ hpet_setup_irq(hdev);
+ evt->irq = hdev->irq;
+
+ evt->rating = 110;
+ evt->features = CLOCK_EVT_FEAT_ONESHOT;
+ if (hdev->flags & HPET_DEV_PERI_CAP)
+ evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+
+ evt->set_mode = hpet_msi_set_mode;
+ evt->set_next_event = hpet_msi_next_event;
+ evt->shift = 32;
+
+ /*
+ * The period is a femtoseconds value. We need to calculate the
+ * scaled math multiplication factor for nanosecond to hpet tick
+ * conversion.
+ */
+ hpet_freq = 1000000000000000ULL;
+ do_div(hpet_freq, hpet_period);
+ evt->mult = div_sc((unsigned long) hpet_freq,
+ NSEC_PER_SEC, evt->shift);
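+ /*
+ * Worked example (hypothetical hardware): a 14.318 MHz HPET reports
+ * hpet_period ~= 69841279 fs, so hpet_freq ~= 14318179 Hz and
+ * mult ~= (14318179 << 32) / NSEC_PER_SEC ~= 6.15e7; the clockevents
+ * core then converts as ticks = (ns * mult) >> shift.
+ */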
+ /* Calculate the max delta */
+ evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
+ /* 5 usec minimum reprogramming delta. */
+ evt->min_delta_ns = 5000;
+
+ evt->cpumask = cpumask_of_cpu(hdev->cpu);
+ clockevents_register_device(evt);
+}
+
+#ifdef CONFIG_HPET
+/* Reserve at least one timer for userspace (/dev/hpet) */
+#define RESERVE_TIMERS 1
+#else
+#define RESERVE_TIMERS 0
+#endif
+
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+ unsigned int id;
+ unsigned int num_timers;
+ unsigned int num_timers_used = 0;
+ int i;
+
+ id = hpet_readl(HPET_ID);
+
+ num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+ num_timers++; /* Value read out starts from 0 */
+
+ hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
+ if (!hpet_devs)
+ return;
+
+ hpet_num_timers = num_timers;
+
+ for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
+ struct hpet_dev *hdev = &hpet_devs[num_timers_used];
+ unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
+
+ /* Only consider HPET timer with MSI support */
+ if (!(cfg & HPET_TN_FSB_CAP))
+ continue;
+
+ hdev->flags = 0;
+ if (cfg & HPET_TN_PERIODIC_CAP)
+ hdev->flags |= HPET_DEV_PERI_CAP;
+ hdev->num = i;
+
+ sprintf(hdev->name, "hpet%d", i);
+ if (hpet_assign_irq(hdev))
+ continue;
+
+ hdev->flags |= HPET_DEV_FSB_CAP;
+ hdev->flags |= HPET_DEV_VALID;
+ num_timers_used++;
+ if (num_timers_used == num_possible_cpus())
+ break;
+ }
+
+ printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
+ num_timers, num_timers_used);
+}
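+
+/*
+ * Flag recap (from the #defines near the top of this file):
+ * HPET_DEV_VALID (0x8) marks a usable timer, HPET_DEV_FSB_CAP (0x1000)
+ * mirrors the hardware HPET_TN_FSB_CAP bit, and HPET_DEV_USED_BIT (2)
+ * is what hpet_get_unused_timer() below claims via test_and_set_bit().
+ */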
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+ int i;
+
+ if (!hpet_devs)
+ return;
+
+ for (i = 0; i < hpet_num_timers; i++) {
+ struct hpet_dev *hdev = &hpet_devs[i];
+
+ if (!(hdev->flags & HPET_DEV_VALID))
+ continue;
+
+ hd->hd_irq[hdev->num] = hdev->irq;
+ hpet_reserve_timer(hd, hdev->num);
+ }
+}
+#endif
+
+static struct hpet_dev *hpet_get_unused_timer(void)
+{
+ int i;
+
+ if (!hpet_devs)
+ return NULL;
+
+ for (i = 0; i < hpet_num_timers; i++) {
+ struct hpet_dev *hdev = &hpet_devs[i];
+
+ if (!(hdev->flags & HPET_DEV_VALID))
+ continue;
+ if (test_and_set_bit(HPET_DEV_USED_BIT,
+ (unsigned long *)&hdev->flags))
+ continue;
+ return hdev;
+ }
+ return NULL;
+}
+
+struct hpet_work_struct {
+ struct delayed_work work;
+ struct completion complete;
+};
+
+static void hpet_work(struct work_struct *w)
+{
+ struct hpet_dev *hdev;
+ int cpu = smp_processor_id();
+ struct hpet_work_struct *hpet_work;
+
+ hpet_work = container_of(w, struct hpet_work_struct, work.work);
+
+ hdev = hpet_get_unused_timer();
+ if (hdev)
+ init_one_hpet_msi_clockevent(hdev, cpu);
+
+ complete(&hpet_work->complete);
+}
+
+static int hpet_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ unsigned long cpu = (unsigned long)hcpu;
+ struct hpet_work_struct work;
+ struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
+
+ switch (action & 0xf) {
+ case CPU_ONLINE:
+ INIT_DELAYED_WORK(&work.work, hpet_work);
+ init_completion(&work.complete);
+ /* FIXME: add schedule_work_on() */
+ schedule_delayed_work_on(cpu, &work.work, 0);
+ wait_for_completion(&work.complete);
+ break;
+ case CPU_DEAD:
+ if (hdev) {
+ free_irq(hdev->irq, hdev);
+ hdev->flags &= ~HPET_DEV_USED;
+ per_cpu(cpu_hpet_dev, cpu) = NULL;
+ }
+ break;
+ }
+ return NOTIFY_OK;
+}
+#else
+
+static int hpet_setup_msi_irq(unsigned int irq)
+{
+ return 0;
+}
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+ return;
+}
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+ return;
+}
+#endif
+
+static int hpet_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ return NOTIFY_OK;
+}
+
+#endif
+
/*
* Clock source related code
*/
@@ -423,8 +803,10 @@ int __init hpet_enable(void)
if (id & HPET_ID_LEGSUP) {
hpet_legacy_clockevent_register();
+ hpet_msi_capability_lookup(2);
return 1;
}
+ hpet_msi_capability_lookup(0);
return 0;
out_nohpet:
@@ -441,6 +823,8 @@ out_nohpet:
*/
static __init int hpet_late_init(void)
{
+ int cpu;
+
if (boot_hpet_disable)
return -ENODEV;
@@ -456,6 +840,13 @@ static __init int hpet_late_init(void)
hpet_reserve_platform_timers(hpet_readl(HPET_ID));
+ for_each_online_cpu(cpu) {
+ hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
+ }
+
+ /* This notifier should be registered only after the workqueues are ready */
+ hotcpu_notifier(hpet_cpuhp_notify, -20);
+
return 0;
}
fs_initcall(hpet_late_init);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index eb9ddd8efb8..1f20608d4ca 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -21,9 +21,12 @@
# include <asm/sigcontext32.h>
# include <asm/user32.h>
#else
-# define save_i387_ia32 save_i387
-# define restore_i387_ia32 restore_i387
+# define save_i387_xstate_ia32 save_i387_xstate
+# define restore_i387_xstate_ia32 restore_i387_xstate
# define _fpstate_ia32 _fpstate
+# define _xstate_ia32 _xstate
+# define sig_xstate_ia32_size sig_xstate_size
+# define fx_sw_reserved_ia32 fx_sw_reserved
# define user_i387_ia32_struct user_i387_struct
# define user32_fxsr_struct user_fxsr_struct
#endif
@@ -36,6 +39,7 @@
static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
unsigned int xstate_size;
+unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
static struct i387_fxsave_struct fx_scratch __cpuinitdata;
void __cpuinit mxcsr_feature_mask_init(void)
@@ -61,6 +65,11 @@ void __init init_thread_xstate(void)
return;
}
+ if (cpu_has_xsave) {
+ xsave_cntxt_init();
+ return;
+ }
+
if (cpu_has_fxsr)
xstate_size = sizeof(struct i387_fxsave_struct);
#ifdef CONFIG_X86_32
@@ -83,9 +92,19 @@ void __cpuinit fpu_init(void)
write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
+ /*
+ * Boot processor to setup the FP and extended state context info.
+ */
+ if (!smp_processor_id())
+ init_thread_xstate();
+ xsave_init();
+
mxcsr_feature_mask_init();
/* clean state in init */
- current_thread_info()->status = 0;
+ if (cpu_has_xsave)
+ current_thread_info()->status = TS_XSAVE;
+ else
+ current_thread_info()->status = 0;
clear_used_math();
}
#endif /* CONFIG_X86_64 */
@@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
*/
target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+ /*
+ * update the header bits in the xsave header, indicating the
+ * presence of FP and SSE state.
+ */
+ if (cpu_has_xsave)
+ target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+
return ret;
}
@@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
if (!ret)
convert_to_fxsr(target, &env);
+ /*
+ * update the header bit in the xsave header, indicating the
+ * presence of FP.
+ */
+ if (cpu_has_xsave)
+ target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
return ret;
}
@@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
struct task_struct *tsk = current;
struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
- unlazy_fpu(tsk);
fp->status = fp->swd;
if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
return -1;
@@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
struct user_i387_ia32_struct env;
int err = 0;
- unlazy_fpu(tsk);
-
convert_from_fxsr(&env, tsk);
if (__copy_to_user(buf, &env, sizeof(env)))
return -1;
@@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
if (err)
return -1;
- if (__copy_to_user(&buf->_fxsr_env[0], fx,
- sizeof(struct i387_fxsave_struct)))
+ if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
+ return -1;
+ return 1;
+}
+
+static int save_i387_xsave(void __user *buf)
+{
+ struct task_struct *tsk = current;
+ struct _fpstate_ia32 __user *fx = buf;
+ int err = 0;
+
+ /*
+ * For legacy compatibility, we always set the FP/SSE bits in the
+ * bit vector while saving the state to the user context.
+ * This lets us capture any changes (during sigreturn) to the
+ * FP/SSE bits made by legacy applications, which don't touch
+ * xstate_bv in the xsave header.
+ *
+ * xsave-aware applications can change xstate_bv in the xsave
+ * header as well as any contents of the memory layout.
+ * xrestore as part of sigreturn will capture all these changes.
+ */
+ tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+
+ if (save_i387_fxsave(fx) < 0)
+ return -1;
+
+ err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
+ sizeof(struct _fpx_sw_bytes));
+ err |= __put_user(FP_XSTATE_MAGIC2,
+ (__u32 __user *) (buf + sig_xstate_ia32_size
+ - FP_XSTATE_MAGIC2_SIZE));
+ if (err)
return -1;
+
return 1;
}
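
The sw_reserved bytes plus the trailing magic are what let sigreturn distinguish an xsave-format frame from a legacy fxsave one. A rough userspace model of that framing; the frame size and the magic value are assumptions standing in for sig_xstate_ia32_size and FP_XSTATE_MAGIC2:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define FRAME_SIZE     832          /* stand-in for sig_xstate_ia32_size */
    #define XSTATE_MAGIC2  0x46505845u  /* assumed FP_XSTATE_MAGIC2 value */

    int main(void)
    {
        unsigned char frame[FRAME_SIZE];
        uint32_t magic = XSTATE_MAGIC2, check;

        memset(frame, 0, sizeof(frame));
        /* Save side: the magic occupies the last 4 bytes of the frame. */
        memcpy(frame + FRAME_SIZE - sizeof(magic), &magic, sizeof(magic));

        /* Restore side: a matching magic means extended state is present. */
        memcpy(&check, frame + FRAME_SIZE - sizeof(check), sizeof(check));
        printf("extended state %s\n",
               check == XSTATE_MAGIC2 ? "present" : "absent (fx only)");
        return 0;
    }
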
-int save_i387_ia32(struct _fpstate_ia32 __user *buf)
+int save_i387_xstate_ia32(void __user *buf)
{
+ struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
+ struct task_struct *tsk = current;
+
if (!used_math())
return 0;
+
+ if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
+ return -EACCES;
/*
* This will cause a "finit" to be triggered by the next
* attempted FPU operation by the 'current' process.
@@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf)
if (!HAVE_HWFP) {
return fpregs_soft_get(current, NULL,
0, sizeof(struct user_i387_ia32_struct),
- NULL, buf) ? -1 : 1;
+ NULL, fp) ? -1 : 1;
}
+ unlazy_fpu(tsk);
+
+ if (cpu_has_xsave)
+ return save_i387_xsave(fp);
if (cpu_has_fxsr)
- return save_i387_fxsave(buf);
+ return save_i387_fxsave(fp);
else
- return save_i387_fsave(buf);
+ return save_i387_fsave(fp);
}
static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
@@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
sizeof(struct i387_fsave_struct));
}
-static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
+static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
+ unsigned int size)
{
struct task_struct *tsk = current;
struct user_i387_ia32_struct env;
int err;
err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
- sizeof(struct i387_fxsave_struct));
+ size);
/* mxcsr reserved bits must be masked to zero for security reasons */
tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
if (err || __copy_from_user(&env, buf, sizeof(env)))
@@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
return 0;
}
-int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
+static int restore_i387_xsave(void __user *buf)
+{
+ struct _fpx_sw_bytes fx_sw_user;
+ struct _fpstate_ia32 __user *fx_user =
+ ((struct _fpstate_ia32 __user *) buf);
+ struct i387_fxsave_struct __user *fx =
+ (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
+ struct xsave_hdr_struct *xsave_hdr =
+ &current->thread.xstate->xsave.xsave_hdr;
+ u64 mask;
+ int err;
+
+ if (check_for_xstate(fx, buf, &fx_sw_user))
+ goto fx_only;
+
+ mask = fx_sw_user.xstate_bv;
+
+ err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
+
+ xsave_hdr->xstate_bv &= pcntxt_mask;
+ /*
+ * These bits must be zero.
+ */
+ xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+
+ /*
+ * Init the state that is not present in the memory layout
+ * and enabled by the OS.
+ */
+ mask = ~(pcntxt_mask & ~mask);
+ xsave_hdr->xstate_bv &= mask;
+
+ return err;
+fx_only:
+ /*
+ * Couldn't find the extended state information in the memory
+ * layout. Restore the FP/SSE and init the other extended state
+ * enabled by the OS.
+ */
+ xsave_hdr->xstate_bv = XSTATE_FPSSE;
+ return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
+}
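
The bit-vector arithmetic above is easier to follow with concrete masks: pcntxt_mask is the OS-enabled feature set, the user's xstate_bv is what the signal frame carried, and any OS-enabled feature absent from the frame has its bit cleared so XRSTOR reinitializes it. A toy demonstration (the YMM bit is an illustrative extra feature, not something this patch defines):

    #include <stdint.h>
    #include <stdio.h>

    #define XSTATE_FP   0x1ULL
    #define XSTATE_SSE  0x2ULL
    #define XSTATE_YMM  0x4ULL  /* illustrative extended feature */

    int main(void)
    {
        uint64_t pcntxt_mask = XSTATE_FP | XSTATE_SSE | XSTATE_YMM;
        uint64_t user_bv     = XSTATE_FP | XSTATE_SSE;  /* frame lacks YMM */
        uint64_t xstate_bv   = pcntxt_mask;             /* current task state */

        xstate_bv &= pcntxt_mask;                /* never beyond OS-enabled */
        xstate_bv &= ~(pcntxt_mask & ~user_bv);  /* init what the frame lacks */

        printf("xstate_bv = %#llx (YMM cleared -> reinitialized on xrstor)\n",
               (unsigned long long)xstate_bv);
        return 0;
    }
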
+
+int restore_i387_xstate_ia32(void __user *buf)
{
int err;
struct task_struct *tsk = current;
+ struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
if (HAVE_HWFP)
clear_fpu(tsk);
+ if (!buf) {
+ if (used_math()) {
+ clear_fpu(tsk);
+ clear_used_math();
+ }
+
+ return 0;
+ } else
+ if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
+ return -EACCES;
+
if (!used_math()) {
err = init_fpu(tsk);
if (err)
@@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
}
if (HAVE_HWFP) {
- if (cpu_has_fxsr)
- err = restore_i387_fxsave(buf);
+ if (cpu_has_xsave)
+ err = restore_i387_xsave(buf);
+ else if (cpu_has_fxsr)
+ err = restore_i387_fxsave(fp, sizeof(struct
+ i387_fxsave_struct));
else
- err = restore_i387_fsave(buf);
+ err = restore_i387_fsave(fp);
} else {
err = fpregs_soft_set(current, NULL,
0, sizeof(struct user_i387_ia32_struct),
- NULL, buf) != 0;
+ NULL, fp) != 0;
}
set_used_math();
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index dc92b49d920..4b8a53d841f 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void)
device_initcall(i8259A_init_sysfs);
+void mask_8259A(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+void unmask_8259A(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
+
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
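
mask_8259A()/unmask_8259A() simply write each PIC's Interrupt Mask Register: 0xff masks all eight lines, and the cached copies restore the previous per-line state. A hedged userspace model with fake ports (the real code does outb() to ports 0x21 and 0xa1 under i8259A_lock):

    #include <stdint.h>
    #include <stdio.h>

    /* Fake IMRs standing in for ports 0x21 (master) and 0xa1 (slave). */
    static uint8_t master_imr, slave_imr;
    static uint8_t cached_master = 0xb8, cached_slave = 0x8f;  /* sample masks */

    static void mask_all(void)
    {
        master_imr = 0xff;  /* mask all of 8259A-1 */
        slave_imr  = 0xff;  /* mask all of 8259A-2 */
    }

    static void unmask_all(void)
    {
        master_imr = cached_master;  /* restore master IRQ mask */
        slave_imr  = cached_slave;   /* restore slave IRQ mask */
    }

    int main(void)
    {
        mask_all();
        printf("masked:   %02x %02x\n", master_imr, slave_imr);
        unmask_all();
        printf("restored: %02x %02x\n", master_imr, slave_imr);
        return 0;
    }
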
+
void init_8259A(int auto_eoi)
{
unsigned long flags;
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c
index 61a83b70c18..b764d7429c6 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic.c
@@ -27,16 +27,21 @@
#include <linux/sched.h>
#include <linux/pci.h>
#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
#include <linux/acpi.h>
+#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/msi.h>
#include <linux/htirq.h>
-#include <linux/dmar.h>
-#include <linux/jiffies.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h> /* time_after() */
#ifdef CONFIG_ACPI
#include <acpi/acpi_bus.h>
#endif
#include <linux/bootmem.h>
+#include <linux/dmar.h>
+#include <linux/hpet.h>
#include <asm/idle.h>
#include <asm/io.h>
@@ -45,60 +50,28 @@
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/dma.h>
+#include <asm/timer.h>
#include <asm/i8259.h>
#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
+#include <asm/setup.h>
+#include <asm/irq_remapping.h>
+#include <asm/hpet.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_irq.h>
#include <mach_ipi.h>
#include <mach_apic.h>
+#include <mach_apicdef.h>
-struct irq_cfg {
- cpumask_t domain;
- cpumask_t old_domain;
- unsigned move_cleanup_count;
- u8 vector;
- u8 move_in_progress : 1;
-};
-
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
- [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
- [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
- [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
- [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
- [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
- [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
- [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
- [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
- [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
- [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
- [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
- [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
- [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
- [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
- [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
- [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
-};
-
-static int assign_irq_vector(int irq, cpumask_t mask);
+#define __apicdebuginit(type) static type __init
-int first_system_vector = 0xfe;
-
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
-
-#define __apicdebuginit __init
-
-int sis_apic_bug; /* not actually supported, dummy for compile */
-
-static int no_timer_check;
-
-static int disable_timer_pin_1 __initdata;
-
-int timer_through_8259 __initdata;
-
-/* Where if anywhere is the i8259 connect in external int mode */
-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+/*
+ * Is the SiS APIC rmw bug present ?
+ * -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
static DEFINE_SPINLOCK(ioapic_lock);
static DEFINE_SPINLOCK(vector_lock);
@@ -118,11 +91,69 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+int skip_ioapic_setup;
+
+static int __init parse_noapic(char *str)
+{
+ /* disable IO-APIC */
+ disable_ioapic_setup();
+ return 0;
+}
+early_param("noapic", parse_noapic);
+
+struct irq_pin_list;
+struct irq_cfg {
+ unsigned int irq;
+ struct irq_pin_list *irq_2_pin;
+ cpumask_t domain;
+ cpumask_t old_domain;
+ unsigned move_cleanup_count;
+ u8 vector;
+ u8 move_in_progress : 1;
+};
+
+/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+static struct irq_cfg irq_cfgx[NR_IRQS] = {
+ [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
+ [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
+ [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
+ [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
+ [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
+ [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
+ [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
+ [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
+ [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
+ [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
+ [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+ [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+ [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+ [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+ [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+ [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+};
+
+#define for_each_irq_cfg(irq, cfg) \
+ for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+{
+ return irq_cfg(irq);
+}
+
/*
- * Rough estimation of how many shared IRQs there are, can
- * be changed anytime.
+ * A rough estimate of how many shared IRQs there are; it can be
+ * changed anytime.
*/
#define MAX_PLUS_SHARED_IRQS NR_IRQS
#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
@@ -134,9 +165,36 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
* between pins and IRQs.
*/
-static struct irq_pin_list {
- short apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
+struct irq_pin_list {
+ int apic, pin;
+ struct irq_pin_list *next;
+};
+
+static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
+static struct irq_pin_list *irq_2_pin_ptr;
+
+static void __init irq_2_pin_init(void)
+{
+ struct irq_pin_list *pin = irq_2_pin_head;
+ int i;
+
+ for (i = 1; i < PIN_MAP_SIZE; i++)
+ pin[i-1].next = &pin[i];
+
+ irq_2_pin_ptr = &pin[0];
+}
+
+static struct irq_pin_list *get_one_free_irq_2_pin(void)
+{
+ struct irq_pin_list *pin = irq_2_pin_ptr;
+
+ if (!pin)
+ panic("can not get more irq_2_pin\n");
+
+ irq_2_pin_ptr = pin->next;
+ pin->next = NULL;
+ return pin;
+}
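
get_one_free_irq_2_pin() is a classic static free-list allocator: the array is linked together once at init, and allocation is a constant-time pop from the head with no locking or kmalloc in sight. A generic standalone sketch of the same pattern (all names here are mine, not the kernel's):

    #include <stddef.h>
    #include <stdio.h>

    #define POOL_SIZE 8

    struct node {
        int apic, pin;
        struct node *next;
    };

    static struct node pool[POOL_SIZE];
    static struct node *free_head;

    static void pool_init(void)
    {
        int i;

        /* Thread every element onto the free list, as irq_2_pin_init() does. */
        for (i = 1; i < POOL_SIZE; i++)
            pool[i - 1].next = &pool[i];
        free_head = &pool[0];
    }

    static struct node *pool_get(void)
    {
        struct node *n = free_head;

        if (!n)
            return NULL;  /* the kernel code panics here instead */
        free_head = n->next;
        n->next = NULL;
        return n;
    }

    int main(void)
    {
        struct node *a, *b;

        pool_init();
        a = pool_get();
        b = pool_get();
        printf("got elements %td and %td from the pool\n", a - pool, b - pool);
        return 0;
    }
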
struct io_apic {
unsigned int index;
@@ -167,10 +225,15 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
/*
* Re-write a value: to be used for read-modify-write
* cycles where the read already set up the index register.
+ *
+ * Older SiS APICs require us to rewrite the index register
*/
-static inline void io_apic_modify(unsigned int apic, unsigned int value)
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+ if (sis_apic_bug)
+ writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
@@ -178,16 +241,17 @@ static bool io_apic_level_ack_pending(unsigned int irq)
{
struct irq_pin_list *entry;
unsigned long flags;
+ struct irq_cfg *cfg = irq_cfg(irq);
spin_lock_irqsave(&ioapic_lock, flags);
- entry = irq_2_pin + irq;
+ entry = cfg->irq_2_pin;
for (;;) {
unsigned int reg;
int pin;
- pin = entry->pin;
- if (pin == -1)
+ if (!entry)
break;
+ pin = entry->pin;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
if (reg & IO_APIC_REDIR_REMOTE_IRR) {
@@ -196,45 +260,13 @@ static bool io_apic_level_ack_pending(unsigned int irq)
}
if (!entry->next)
break;
- entry = irq_2_pin + entry->next;
+ entry = entry->next;
}
spin_unlock_irqrestore(&ioapic_lock, flags);
return false;
}
-/*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
-static inline void io_apic_sync(unsigned int apic)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
- readl(&io_apic->data);
-}
-
-#define __DO_ACTION(R, ACTION, FINAL) \
- \
-{ \
- int pin; \
- struct irq_pin_list *entry = irq_2_pin + irq; \
- \
- BUG_ON(irq >= NR_IRQS); \
- for (;;) { \
- unsigned int reg; \
- pin = entry->pin; \
- if (pin == -1) \
- break; \
- reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
- reg ACTION; \
- io_apic_modify(entry->apic, reg); \
- FINAL; \
- if (!entry->next) \
- break; \
- entry = irq_2_pin + entry->next; \
- } \
-}
-
union entry_union {
struct { u32 w1, w2; };
struct IO_APIC_route_entry entry;
@@ -294,54 +326,71 @@ static void ioapic_mask_entry(int apic, int pin)
static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
{
int apic, pin;
- struct irq_pin_list *entry = irq_2_pin + irq;
+ struct irq_cfg *cfg;
+ struct irq_pin_list *entry;
- BUG_ON(irq >= NR_IRQS);
+ cfg = irq_cfg(irq);
+ entry = cfg->irq_2_pin;
for (;;) {
unsigned int reg;
+
+ if (!entry)
+ break;
+
apic = entry->apic;
pin = entry->pin;
- if (pin == -1)
- break;
+#ifdef CONFIG_INTR_REMAP
+ /*
+ * With interrupt-remapping, destination information comes
+ * from interrupt-remapping table entry.
+ */
+ if (!irq_remapped(irq))
+ io_apic_write(apic, 0x11 + pin*2, dest);
+#else
io_apic_write(apic, 0x11 + pin*2, dest);
+#endif
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
- io_apic_modify(apic, reg);
+ io_apic_modify(apic, 0x10 + pin*2, reg);
if (!entry->next)
break;
- entry = irq_2_pin + entry->next;
+ entry = entry->next;
}
}
+static int assign_irq_vector(int irq, cpumask_t mask);
+
static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
unsigned long flags;
unsigned int dest;
cpumask_t tmp;
+ struct irq_desc *desc;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
return;
+ cfg = irq_cfg(irq);
if (assign_irq_vector(irq, mask))
return;
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
-
/*
* Only the high 8 bits are valid.
*/
dest = SET_APIC_LOGICAL_ID(dest);
+ desc = irq_to_desc(irq);
spin_lock_irqsave(&ioapic_lock, flags);
__target_IO_APIC_irq(irq, dest, cfg->vector);
- irq_desc[irq].affinity = mask;
+ desc->affinity = mask;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-#endif
+#endif /* CONFIG_SMP */
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
@@ -350,19 +399,30 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
*/
static void add_pin_to_irq(unsigned int irq, int apic, int pin)
{
- static int first_free_entry = NR_IRQS;
- struct irq_pin_list *entry = irq_2_pin + irq;
+ struct irq_cfg *cfg;
+ struct irq_pin_list *entry;
+
+ /* first reference to this irq's cfg: allocate it */
+ cfg = irq_cfg_alloc(irq);
+ entry = cfg->irq_2_pin;
+ if (!entry) {
+ entry = get_one_free_irq_2_pin();
+ cfg->irq_2_pin = entry;
+ entry->apic = apic;
+ entry->pin = pin;
+ return;
+ }
- BUG_ON(irq >= NR_IRQS);
- while (entry->next)
- entry = irq_2_pin + entry->next;
+ while (entry->next) {
+ /* don't add the same apic/pin pair twice */
+ if (entry->apic == apic && entry->pin == pin)
+ return;
- if (entry->pin != -1) {
- entry->next = first_free_entry;
- entry = irq_2_pin + entry->next;
- if (++first_free_entry >= PIN_MAP_SIZE)
- panic("io_apic.c: ran out of irq_2_pin entries!");
+ entry = entry->next;
}
+
+ entry->next = get_one_free_irq_2_pin();
+ entry = entry->next;
entry->apic = apic;
entry->pin = pin;
}
@@ -374,30 +434,86 @@ static void __init replace_pin_at_irq(unsigned int irq,
int oldapic, int oldpin,
int newapic, int newpin)
{
- struct irq_pin_list *entry = irq_2_pin + irq;
+ struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_pin_list *entry = cfg->irq_2_pin;
+ int replaced = 0;
- while (1) {
+ while (entry) {
if (entry->apic == oldapic && entry->pin == oldpin) {
entry->apic = newapic;
entry->pin = newpin;
- }
- if (!entry->next)
+ replaced = 1;
+ /* each apic/pin pair appears at most once, so we can stop */
break;
- entry = irq_2_pin + entry->next;
+ }
+ entry = entry->next;
+ }
+
+ /* the old apic/pin pair was not found; add the new one instead */
+ if (!replaced)
+ add_pin_to_irq(irq, newapic, newpin);
+}
+
+static inline void io_apic_modify_irq(unsigned int irq,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ int pin;
+ struct irq_cfg *cfg;
+ struct irq_pin_list *entry;
+
+ cfg = irq_cfg(irq);
+ for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
+ unsigned int reg;
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin * 2);
+ reg &= mask_and;
+ reg |= mask_or;
+ io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+ if (final)
+ final(entry);
}
}
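
io_apic_modify_irq() folds the old __DO_ACTION() macro into one parameterized read-modify-write walk: clear the mask_and bits, set the mask_or bits, and optionally run a final() hook per entry. A hedged model over an in-memory register file, with the hook standing in for io_apic_sync():

    #include <stdint.h>
    #include <stdio.h>

    #define REDIR_MASKED 0x10000u  /* stand-in for IO_APIC_REDIR_MASKED */

    static uint32_t regs[4] = { 0x30, 0x31, 0x32, 0x33 };

    static void fake_sync(int idx)
    {
        (void)regs[idx];  /* stands in for the dummy readback in io_apic_sync() */
    }

    static void modify_all(uint32_t mask_and, uint32_t mask_or,
                           void (*final)(int idx))
    {
        int i;

        for (i = 0; i < 4; i++) {
            regs[i] = (regs[i] & mask_and) | mask_or;
            if (final)
                final(i);
        }
    }

    int main(void)
    {
        modify_all(~0u, REDIR_MASKED, fake_sync);  /* mask everything */
        modify_all(~REDIR_MASKED, 0, NULL);        /* unmask everything */
        printf("reg0 = %#x\n", regs[0]);
        return 0;
    }
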
+static void __unmask_IO_APIC_irq(unsigned int irq)
+{
+ io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+}
-#define DO_ACTION(name,R,ACTION, FINAL) \
- \
- static void name##_IO_APIC_irq (unsigned int irq) \
- __DO_ACTION(R, ACTION, FINAL)
+#ifdef CONFIG_X86_64
+void io_apic_sync(struct irq_pin_list *entry)
+{
+ /*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ */
+ struct io_apic __iomem *io_apic;
+ io_apic = io_apic_base(entry->apic);
+ readl(&io_apic->data);
+}
-/* mask = 1 */
-DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+static void __mask_IO_APIC_irq(unsigned int irq)
+{
+ io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+}
+#else /* CONFIG_X86_32 */
+static void __mask_IO_APIC_irq(unsigned int irq)
+{
+ io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+}
+
+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+{
+ io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+ IO_APIC_REDIR_MASKED, NULL);
+}
-/* mask = 0 */
-DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+{
+ io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+ IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
+}
+#endif /* CONFIG_X86_32 */
static void mask_IO_APIC_irq (unsigned int irq)
{
@@ -440,24 +556,145 @@ static void clear_IO_APIC (void)
clear_IO_APIC_pin(apic, pin);
}
-int skip_ioapic_setup;
-int ioapic_force;
+#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
+void send_IPI_self(int vector)
+{
+ unsigned int cfg;
-static int __init parse_noapic(char *str)
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write(APIC_ICR, cfg);
+}
+#endif /* !CONFIG_SMP && CONFIG_X86_32*/
+
+#ifdef CONFIG_X86_32
+/*
+ * support for broken MP BIOSes: enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries [MAX_PIRQS];
+static int pirqs_enabled;
+
+static int __init ioapic_pirq_setup(char *str)
{
- disable_ioapic_setup();
+ int i, max;
+ int ints[MAX_PIRQS+1];
+
+ get_options(str, ARRAY_SIZE(ints), ints);
+
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+
+ pirqs_enabled = 1;
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "PIRQ redirection, working around broken MP-BIOS.\n");
+ max = MAX_PIRQS;
+ if (ints[0] < MAX_PIRQS)
+ max = ints[0];
+
+ for (i = 0; i < max; i++) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+ /*
+ * PIRQs are mapped upside down, usually.
+ */
+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+ }
+ return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_INTR_REMAP
+/* I/O APIC RTE contents at the OS boot up */
+static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
+
+/*
+ * Saves and masks all the unmasked IO-APIC RTE's
+ */
+int save_mask_IO_APIC_setup(void)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+ int apic, pin;
+
+ /*
+ * The number of IO-APIC IRQ registers (== #pins):
+ */
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(apic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+ }
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ early_ioapic_entries[apic] =
+ kzalloc(sizeof(struct IO_APIC_route_entry) *
+ nr_ioapic_registers[apic], GFP_KERNEL);
+ if (!early_ioapic_entries[apic])
+ goto nomem;
+ }
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = early_ioapic_entries[apic][pin] =
+ ioapic_read_entry(apic, pin);
+ if (!entry.mask) {
+ entry.mask = 1;
+ ioapic_write_entry(apic, pin, entry);
+ }
+ }
+
return 0;
+
+nomem:
+ while (apic >= 0)
+ kfree(early_ioapic_entries[apic--]);
+ memset(early_ioapic_entries, 0,
+ ARRAY_SIZE(early_ioapic_entries));
+
+ return -ENOMEM;
}
-early_param("noapic", parse_noapic);
-/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
-static int __init disable_timer_pin_setup(char *arg)
+void restore_IO_APIC_setup(void)
{
- disable_timer_pin_1 = 1;
- return 1;
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!early_ioapic_entries[apic])
+ break;
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ ioapic_write_entry(apic, pin,
+ early_ioapic_entries[apic][pin]);
+ kfree(early_ioapic_entries[apic]);
+ early_ioapic_entries[apic] = NULL;
+ }
}
-__setup("disable_timer_pin_1", disable_timer_pin_setup);
+void reinit_intr_remapped_IO_APIC(int intr_remapping)
+{
+ /*
+ * For now, do a plain restore of the previous settings.
+ * TBD: when the OS enables interrupt-remapping, the IO-APIC RTEs
+ * need to be set up to point to interrupt-remapping table entries.
+ * Until then, a plain restore suffices; setup_IO_APIC_irqs() will
+ * do the proper initialization later.
+ */
+ restore_IO_APIC_setup();
+}
+#endif
/*
* Find the IRQ entry number of a certain pin.
@@ -561,22 +798,54 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
best_guess = irq;
}
}
- BUG_ON(best_guess >= NR_IRQS);
return best_guess;
}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+ if (irq < 16) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return 0;
+}
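
The ELCR lookup is two I/O ports with one bit per ISA IRQ, so the addressing deserves a second look. This sketch only computes the port/bit pairs (no real port access); the 0x4d0/0x4d1 ports come straight from the function above:

    #include <stdio.h>

    int main(void)
    {
        unsigned int irq;

        for (irq = 0; irq < 16; irq++) {
            unsigned int port = 0x4d0 + (irq >> 3);  /* 0x4d0: IRQ0-7, 0x4d1: IRQ8-15 */
            unsigned int bit  = irq & 7;

            printf("IRQ%-2u -> port %#x, bit %u\n", irq, port, bit);
        }
        return 0;
    }
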
+
+#endif
+
/* ISA interrupts are always polarity zero edge triggered,
* when listed as conforming in the MP table. */
#define default_ISA_trigger(idx) (0)
#define default_ISA_polarity(idx) (0)
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
+
/* PCI interrupts are always polarity one level triggered,
* when listed as conforming in the MP table. */
#define default_PCI_trigger(idx) (1)
#define default_PCI_polarity(idx) (1)
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx) (1)
+#define default_MCA_polarity(idx) default_ISA_polarity(idx)
+
static int MPBIOS_polarity(int idx)
{
int bus = mp_irqs[idx].mp_srcbus;
@@ -634,6 +903,36 @@ static int MPBIOS_trigger(int idx)
trigger = default_ISA_trigger(idx);
else
trigger = default_PCI_trigger(idx);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_EISA: /* EISA pin */
+ {
+ trigger = default_EISA_trigger(idx);
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_MCA: /* MCA pin */
+ {
+ trigger = default_MCA_trigger(idx);
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ }
+#endif
break;
case 1: /* edge */
{
@@ -671,6 +970,7 @@ static inline int irq_trigger(int idx)
return MPBIOS_trigger(idx);
}
+int (*ioapic_renumber_irq)(int ioapic, int irq);
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
@@ -692,8 +992,32 @@ static int pin_2_irq(int idx, int apic, int pin)
while (i < apic)
irq += nr_ioapic_registers[i++];
irq += pin;
+ /*
+ * For MPS mode, so far only needed by ES7000 platform
+ */
+ if (ioapic_renumber_irq)
+ irq = ioapic_renumber_irq(apic, irq);
}
- BUG_ON(irq >= NR_IRQS);
+
+#ifdef CONFIG_X86_32
+ /*
+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin-16] != -1) {
+ if (!pirq_entries[pin-16]) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "disabling PIRQ%d\n", pin-16);
+ } else {
+ irq = pirq_entries[pin-16];
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "using PIRQ%d -> IRQ %d\n",
+ pin-16, irq);
+ }
+ }
+ }
+#endif
+
return irq;
}
@@ -728,8 +1052,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
int cpu;
struct irq_cfg *cfg;
- BUG_ON((unsigned)irq >= NR_IRQS);
- cfg = &irq_cfg[irq];
+ cfg = irq_cfg(irq);
/* Only try and allocate irqs on cpus that are present */
cpus_and(mask, mask, cpu_online_map);
@@ -764,8 +1087,13 @@ next:
}
if (unlikely(current_vector == vector))
continue;
+#ifdef CONFIG_X86_64
if (vector == IA32_SYSCALL_VECTOR)
goto next;
+#else
+ if (vector == SYSCALL_VECTOR)
+ goto next;
+#endif
for_each_cpu_mask_nr(new_cpu, new_mask)
if (per_cpu(vector_irq, new_cpu)[vector] != -1)
goto next;
@@ -802,8 +1130,7 @@ static void __clear_irq_vector(int irq)
cpumask_t mask;
int cpu, vector;
- BUG_ON((unsigned)irq >= NR_IRQS);
- cfg = &irq_cfg[irq];
+ cfg = irq_cfg(irq);
BUG_ON(!cfg->vector);
vector = cfg->vector;
@@ -820,12 +1147,13 @@ void __setup_vector_irq(int cpu)
/* Initialize vector_irq on a new cpu */
/* This function must be called with vector_lock held */
int irq, vector;
+ struct irq_cfg *cfg;
/* Mark the inuse vectors */
- for (irq = 0; irq < NR_IRQS; ++irq) {
- if (!cpu_isset(cpu, irq_cfg[irq].domain))
+ for_each_irq_cfg(irq, cfg) {
+ if (!cpu_isset(cpu, cfg->domain))
continue;
- vector = irq_cfg[irq].vector;
+ vector = cfg->vector;
per_cpu(vector_irq, cpu)[vector] = irq;
}
/* Mark the free vectors */
@@ -833,36 +1161,154 @@ void __setup_vector_irq(int cpu)
irq = per_cpu(vector_irq, cpu)[vector];
if (irq < 0)
continue;
- if (!cpu_isset(cpu, irq_cfg[irq].domain))
+
+ cfg = irq_cfg(irq);
+ if (!cpu_isset(cpu, cfg->domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
}
static struct irq_chip ioapic_chip;
+#ifdef CONFIG_INTR_REMAP
+static struct irq_chip ir_ioapic_chip;
+#endif
+
+#define IOAPIC_AUTO -1
+#define IOAPIC_EDGE 0
+#define IOAPIC_LEVEL 1
+
+#ifdef CONFIG_X86_32
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ int apic, idx, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
+ return irq_trigger(idx);
+ }
+ }
+ /*
+ * nonexistent IRQs default to edge triggering
+ */
+ return 0;
+}
+#else
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ return 1;
+}
+#endif
static void ioapic_register_intr(int irq, unsigned long trigger)
{
- if (trigger) {
- irq_desc[irq].status |= IRQ_LEVEL;
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
+ desc->status |= IRQ_LEVEL;
+ else
+ desc->status &= ~IRQ_LEVEL;
+
+#ifdef CONFIG_INTR_REMAP
+ if (irq_remapped(irq)) {
+ desc->status |= IRQ_MOVE_PCNTXT;
+ if (trigger)
+ set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
+ handle_fasteoi_irq,
+ "fasteoi");
+ else
+ set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
+ handle_edge_irq, "edge");
+ return;
+ }
+#endif
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_fasteoi_irq, "fasteoi");
- } else {
- irq_desc[irq].status &= ~IRQ_LEVEL;
+ handle_fasteoi_irq,
+ "fasteoi");
+ else
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_edge_irq, "edge");
+}
+
+static int setup_ioapic_entry(int apic, int irq,
+ struct IO_APIC_route_entry *entry,
+ unsigned int destination, int trigger,
+ int polarity, int vector)
+{
+ /*
+ * add it to the IO-APIC irq-routing table:
+ */
+ memset(entry, 0, sizeof(*entry));
+
+#ifdef CONFIG_INTR_REMAP
+ if (intr_remapping_enabled) {
+ struct intel_iommu *iommu = map_ioapic_to_ir(apic);
+ struct irte irte;
+ struct IR_IO_APIC_route_entry *ir_entry =
+ (struct IR_IO_APIC_route_entry *) entry;
+ int index;
+
+ if (!iommu)
+ panic("No mapping iommu for ioapic %d\n", apic);
+
+ index = alloc_irte(iommu, irq, 1);
+ if (index < 0)
+ panic("Failed to allocate IRTE for ioapic %d\n", apic);
+
+ memset(&irte, 0, sizeof(irte));
+
+ irte.present = 1;
+ irte.dst_mode = INT_DEST_MODE;
+ irte.trigger_mode = trigger;
+ irte.dlvry_mode = INT_DELIVERY_MODE;
+ irte.vector = vector;
+ irte.dest_id = IRTE_DEST(destination);
+
+ modify_irte(irq, &irte);
+
+ ir_entry->index2 = (index >> 15) & 0x1;
+ ir_entry->zero = 0;
+ ir_entry->format = 1;
+ ir_entry->index = (index & 0x7fff);
+ } else
+#endif
+ {
+ entry->delivery_mode = INT_DELIVERY_MODE;
+ entry->dest_mode = INT_DEST_MODE;
+ entry->dest = destination;
}
+
+ entry->mask = 0; /* enable IRQ */
+ entry->trigger = trigger;
+ entry->polarity = polarity;
+ entry->vector = vector;
+
+ /* Mask level triggered irqs.
+ * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+ */
+ if (trigger)
+ entry->mask = 1;
+ return 0;
}
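
With interrupt remapping, the 16-bit IRTE handle has to be split across the RTE's index (low 15 bits) and index2 (bit 15) fields, as the assignments above do. A small packing/unpacking check, independent of the real structures:

    #include <stdio.h>

    int main(void)
    {
        unsigned int index = 0x9abc;             /* sample IRTE handle */

        unsigned int lo  = index & 0x7fff;       /* -> ir_entry->index  */
        unsigned int hi  = (index >> 15) & 0x1;  /* -> ir_entry->index2 */
        unsigned int out = (hi << 15) | lo;      /* hardware reassembles */

        printf("in=%#x lo=%#x hi=%u out=%#x\n", index, lo, hi, out);
        return 0;
    }
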
static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
int trigger, int polarity)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
cpumask_t mask;
if (!IO_APIC_IRQ(irq))
return;
+ cfg = irq_cfg(irq);
+
mask = TARGET_CPUS;
if (assign_irq_vector(irq, mask))
return;
@@ -875,24 +1321,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
irq, trigger, polarity);
- /*
- * add it to the IO-APIC irq-routing table:
- */
- memset(&entry,0,sizeof(entry));
-
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.dest_mode = INT_DEST_MODE;
- entry.dest = cpu_mask_to_apicid(mask);
- entry.mask = 0; /* enable IRQ */
- entry.trigger = trigger;
- entry.polarity = polarity;
- entry.vector = cfg->vector;
- /* Mask level triggered irqs.
- * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
- */
- if (trigger)
- entry.mask = 1;
+ if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
+ cpu_mask_to_apicid(mask), trigger, polarity,
+ cfg->vector)) {
+ printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
+ mp_ioapics[apic].mp_apicid, pin);
+ __clear_irq_vector(irq);
+ return;
+ }
ioapic_register_intr(irq, trigger);
if (irq < 16)
@@ -903,37 +1340,49 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
static void __init setup_IO_APIC_irqs(void)
{
- int apic, pin, idx, irq, first_notcon = 1;
+ int apic, pin, idx, irq;
+ int notcon = 0;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-
- idx = find_irq_entry(apic,pin,mp_INT);
- if (idx == -1) {
- if (first_notcon) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
- first_notcon = 0;
- } else
- apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
- continue;
- }
- if (!first_notcon) {
- apic_printk(APIC_VERBOSE, " not connected.\n");
- first_notcon = 1;
- }
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- irq = pin_2_irq(idx, apic, pin);
- add_pin_to_irq(irq, apic, pin);
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if (idx == -1) {
+ if (!notcon) {
+ notcon = 1;
+ apic_printk(APIC_VERBOSE,
+ KERN_DEBUG " %d-%d",
+ mp_ioapics[apic].mp_apicid,
+ pin);
+ } else
+ apic_printk(APIC_VERBOSE, " %d-%d",
+ mp_ioapics[apic].mp_apicid,
+ pin);
+ continue;
+ }
+ if (notcon) {
+ apic_printk(APIC_VERBOSE,
+ " (apicid-pin) not connected\n");
+ notcon = 0;
+ }
- setup_IO_APIC_irq(apic, pin, irq,
- irq_trigger(idx), irq_polarity(idx));
- }
+ irq = pin_2_irq(idx, apic, pin);
+#ifdef CONFIG_X86_32
+ if (multi_timer_check(apic, irq))
+ continue;
+#endif
+ add_pin_to_irq(irq, apic, pin);
+
+ setup_IO_APIC_irq(apic, pin, irq,
+ irq_trigger(idx), irq_polarity(idx));
+ }
}
- if (!first_notcon)
- apic_printk(APIC_VERBOSE, " not connected.\n");
+ if (notcon)
+ apic_printk(APIC_VERBOSE,
+ " (apicid-pin) not connected\n");
}
/*
@@ -944,6 +1393,11 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
{
struct IO_APIC_route_entry entry;
+#ifdef CONFIG_INTR_REMAP
+ if (intr_remapping_enabled)
+ return;
+#endif
+
memset(&entry, 0, sizeof(entry));
/*
@@ -970,13 +1424,17 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
ioapic_write_entry(apic, pin, entry);
}
-void __apicdebuginit print_IO_APIC(void)
+
+__apicdebuginit(void) print_IO_APIC(void)
{
int apic, i;
union IO_APIC_reg_00 reg_00;
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
+ union IO_APIC_reg_03 reg_03;
unsigned long flags;
+ struct irq_cfg *cfg;
+ unsigned int irq;
if (apic_verbosity == APIC_QUIET)
return;
@@ -999,12 +1457,16 @@ void __apicdebuginit print_IO_APIC(void)
reg_01.raw = io_apic_read(apic, 1);
if (reg_01.bits.version >= 0x10)
reg_02.raw = io_apic_read(apic, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(apic, 3);
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
@@ -1012,11 +1474,27 @@ void __apicdebuginit print_IO_APIC(void)
printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
- if (reg_01.bits.version >= 0x10) {
+ /*
+ * Some Intel chipsets with an IO APIC VERSION of 0x1? don't have
+ * reg_02; reading it returns the value of the previously read
+ * register instead, so ignore it if reg_02 == reg_01.
+ */
+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
}
+ /*
+ * Some Intel chipsets with an IO APIC VERSION of 0x2? don't have
+ * reg_02 or reg_03; reading them returns the value of the previously
+ * read register instead, so ignore reg_03 if it equals reg_01 or reg_02.
+ */
+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+ reg_03.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ }
+
printk(KERN_DEBUG ".... IRQ redirection table:\n");
printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
@@ -1045,16 +1523,16 @@ void __apicdebuginit print_IO_APIC(void)
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for (i = 0; i < NR_IRQS; i++) {
- struct irq_pin_list *entry = irq_2_pin + i;
- if (entry->pin < 0)
+ for_each_irq_cfg(irq, cfg) {
+ struct irq_pin_list *entry = cfg->irq_2_pin;
+ if (!entry)
continue;
- printk(KERN_DEBUG "IRQ%d ", i);
+ printk(KERN_DEBUG "IRQ%d ", irq);
for (;;) {
printk("-> %d:%d", entry->apic, entry->pin);
if (!entry->next)
break;
- entry = irq_2_pin + entry->next;
+ entry = entry->next;
}
printk("\n");
}
@@ -1064,9 +1542,7 @@ void __apicdebuginit print_IO_APIC(void)
return;
}
-#if 0
-
-static __apicdebuginit void print_APIC_bitfield (int base)
+__apicdebuginit(void) print_APIC_bitfield(int base)
{
unsigned int v;
int i, j;
@@ -1087,9 +1563,10 @@ static __apicdebuginit void print_APIC_bitfield (int base)
}
}
-void __apicdebuginit print_local_APIC(void * dummy)
+__apicdebuginit(void) print_local_APIC(void *dummy)
{
unsigned int v, ver, maxlvt;
+ u64 icr;
if (apic_verbosity == APIC_QUIET)
return;
@@ -1097,7 +1574,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
ver = GET_APIC_VERSION(v);
@@ -1106,20 +1583,31 @@ void __apicdebuginit print_local_APIC(void * dummy)
v = apic_read(APIC_TASKPRI);
printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
- v = apic_read(APIC_ARBPRI);
- printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
- v & APIC_ARBPRI_MASK);
- v = apic_read(APIC_PROCPRI);
- printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (!APIC_XAPIC(ver)) {
+ v = apic_read(APIC_ARBPRI);
+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+ v & APIC_ARBPRI_MASK);
+ }
+ v = apic_read(APIC_PROCPRI);
+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ }
+
+ /*
+ * Remote read supported only in the 82489DX and local APIC for
+ * Pentium processors.
+ */
+ if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+ v = apic_read(APIC_RRR);
+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+ }
- v = apic_read(APIC_EOI);
- printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
- v = apic_read(APIC_RRR);
- printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
v = apic_read(APIC_LDR);
printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
- v = apic_read(APIC_DFR);
- printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ if (!x2apic_enabled()) {
+ v = apic_read(APIC_DFR);
+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ }
v = apic_read(APIC_SPIV);
printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
@@ -1130,13 +1618,17 @@ void __apicdebuginit print_local_APIC(void * dummy)
printk(KERN_DEBUG "... APIC IRR field:\n");
print_APIC_bitfield(APIC_IRR);
- v = apic_read(APIC_ESR);
- printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
- v = apic_read(APIC_ICR);
- printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
- v = apic_read(APIC_ICR2);
- printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+ v = apic_read(APIC_ESR);
+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ }
+
+ icr = apic_icr_read();
+ printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
v = apic_read(APIC_LVTT);
printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1164,12 +1656,17 @@ void __apicdebuginit print_local_APIC(void * dummy)
printk("\n");
}
-void print_all_local_APICs (void)
+__apicdebuginit(void) print_all_local_APICs(void)
{
- on_each_cpu(print_local_APIC, NULL, 1);
+ int cpu;
+
+ preempt_disable();
+ for_each_online_cpu(cpu)
+ smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ preempt_enable();
}
-void __apicdebuginit print_PIC(void)
+__apicdebuginit(void) print_PIC(void)
{
unsigned int v;
unsigned long flags;
@@ -1201,19 +1698,34 @@ void __apicdebuginit print_PIC(void)
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
-#endif /* 0 */
+__apicdebuginit(int) print_all_ICs(void)
+{
+ print_PIC();
+ print_all_local_APICs();
+ print_IO_APIC();
+
+ return 0;
+}
+
+fs_initcall(print_all_ICs);
+
+
+/* Where, if anywhere, the i8259 is connected in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
void __init enable_IO_APIC(void)
{
union IO_APIC_reg_01 reg_01;
int i8259_apic, i8259_pin;
- int i, apic;
+ int apic;
unsigned long flags;
- for (i = 0; i < PIN_MAP_SIZE; i++) {
- irq_2_pin[i].pin = -1;
- irq_2_pin[i].next = 0;
- }
+#ifdef CONFIG_X86_32
+ int i;
+ if (!pirqs_enabled)
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+#endif
/*
* The number of IO-APIC IRQ registers (== #pins):
@@ -1243,6 +1755,10 @@ void __init enable_IO_APIC(void)
}
found_i8259:
/* Look to see what if the MP table has reported the ExtINT */
+ /* If we could not find the appropriate pin by looking at the ioapic,
+ * the i8259 is probably not connected to the ioapic, but give the
+ * mptable a chance anyway.
+ */
i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
/* Trust the MP table if nothing is setup in the hardware */
@@ -1291,7 +1807,7 @@ void disable_IO_APIC(void)
entry.dest_mode = 0; /* Physical */
entry.delivery_mode = dest_ExtINT; /* ExtInt */
entry.vector = 0;
- entry.dest = GET_APIC_ID(read_apic_id());
+ entry.dest = read_apic_id();
/*
* Add it to the IO-APIC irq-routing table:
@@ -1302,6 +1818,133 @@ void disable_IO_APIC(void)
disconnect_bsp_APIC(ioapic_i8259.pin != -1);
}
+#ifdef CONFIG_X86_32
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
+ */
+
+static void __init setup_ioapic_ids_from_mpc(void)
+{
+ union IO_APIC_reg_00 reg_00;
+ physid_mask_t phys_id_present_map;
+ int apic;
+ int i;
+ unsigned char old_id;
+ unsigned long flags;
+
+ if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
+ return;
+
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return;
+ /*
+ * This is broken; anything with a real cpu count has to
+ * circumvent this idiocy regardless.
+ */
+ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+ /*
+ * Set the IOAPIC ID to the value stored in the MPC table.
+ */
+ for (apic = 0; apic < nr_ioapics; apic++) {
+
+ /* Read the register 0 value */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ old_id = mp_ioapics[apic].mp_apicid;
+
+ if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+ apic, mp_ioapics[apic].mp_apicid);
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ reg_00.bits.ID);
+ mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
+ }
+
+ /*
+ * Sanity check, is the ID really free? Every APIC in a
+ * system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (check_apicid_used(phys_id_present_map,
+ mp_ioapics[apic].mp_apicid)) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+ apic, mp_ioapics[apic].mp_apicid);
+ for (i = 0; i < get_physical_broadcast(); i++)
+ if (!physid_isset(i, phys_id_present_map))
+ break;
+ if (i >= get_physical_broadcast())
+ panic("Max APIC ID exceeded!\n");
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ i);
+ physid_set(i, phys_id_present_map);
+ mp_ioapics[apic].mp_apicid = i;
+ } else {
+ physid_mask_t tmp;
+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
+ apic_printk(APIC_VERBOSE, "Setting %d in the "
+ "phys_id_present_map\n",
+ mp_ioapics[apic].mp_apicid);
+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ }
+
+
+ /*
+ * We need to adjust the IRQ routing table
+ * if the ID changed.
+ */
+ if (old_id != mp_ioapics[apic].mp_apicid)
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mp_dstapic == old_id)
+ mp_irqs[i].mp_dstapic
+ = mp_ioapics[apic].mp_apicid;
+
+ /*
+ * Read the right value from the MPC table and
+ * write it into the ID register.
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "...changing IO-APIC physical APIC ID to %d ...",
+ mp_ioapics[apic].mp_apicid);
+
+ reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0, reg_00.raw);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /*
+ * Sanity check
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
+ printk("could not set ID!\n");
+ else
+ apic_printk(APIC_VERBOSE, " ok.\n");
+ }
+}
+#endif
+
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+
/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
@@ -1315,6 +1958,9 @@ static int __init timer_irq_works(void)
unsigned long t1 = jiffies;
unsigned long flags;
+ if (no_timer_check)
+ return 1;
+
local_save_flags(flags);
local_irq_enable();
/* Let ten ticks pass... */
@@ -1375,9 +2021,11 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
return was_pending;
}
+#ifdef CONFIG_X86_64
static int ioapic_retrigger_irq(unsigned int irq)
{
- struct irq_cfg *cfg = &irq_cfg[irq];
+
+ struct irq_cfg *cfg = irq_cfg(irq);
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
@@ -1386,6 +2034,14 @@ static int ioapic_retrigger_irq(unsigned int irq)
return 1;
}
+#else
+static int ioapic_retrigger_irq(unsigned int irq)
+{
+ send_IPI_self(irq_cfg(irq)->vector);
+
+ return 1;
+}
+#endif
/*
* Level and edge triggered IO-APIC interrupts need different handling,
@@ -1397,11 +2053,159 @@ static int ioapic_retrigger_irq(unsigned int irq)
*/
#ifdef CONFIG_SMP
+
+#ifdef CONFIG_INTR_REMAP
+static void ir_irq_migration(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
+
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For edge triggered, irq migration is a simple atomic update (of
+ * vector and cpu destination) of the IRTE, plus a flush of the
+ * hardware cache.
+ *
+ * For level triggered, we also need to update the io-apic RTE with
+ * the new vector information, along with modifying the IRTE with the
+ * new vector and destination. So irq migration for level triggered
+ * is a little more complex than for edge triggered. The good news is
+ * that we use the same algorithm for level triggered migration as we
+ * have today; the only difference is that we now initiate the irq
+ * migration from process context instead of interrupt context.
+ *
+ * In future, when we do a directed EOI (combined with cpu EOI broadcast
+ * suppression) to the IO-APIC, level triggered irq migration will also be
+ * as simple as edge triggered migration and we can do the irq migration
+ * with a simple atomic update to IO-APIC RTE.
+ */
+static void migrate_ioapic_irq(int irq, cpumask_t mask)
+{
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ cpumask_t tmp, cleanup_mask;
+ struct irte irte;
+ int modify_ioapic_rte;
+ unsigned int dest;
+ unsigned long flags;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ return;
+
+ if (get_irte(irq, &irte))
+ return;
+
+ if (assign_irq_vector(irq, mask))
+ return;
+
+ cfg = irq_cfg(irq);
+ cpus_and(tmp, cfg->domain, mask);
+ dest = cpu_mask_to_apicid(tmp);
+
+ desc = irq_to_desc(irq);
+ modify_ioapic_rte = desc->status & IRQ_LEVEL;
+ if (modify_ioapic_rte) {
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __target_IO_APIC_irq(irq, dest, cfg->vector);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+ irte.vector = cfg->vector;
+ irte.dest_id = IRTE_DEST(dest);
+
+ /*
+ * Modify the IRTE and flush the interrupt entry cache.
+ */
+ modify_irte(irq, &irte);
+
+ if (cfg->move_in_progress) {
+ cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+ cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ cfg->move_in_progress = 0;
+ }
+
+ desc->affinity = mask;
+}
+
+static int migrate_irq_remapped_level(int irq)
+{
+ int ret = -1;
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ mask_IO_APIC_irq(irq);
+
+ if (io_apic_level_ack_pending(irq)) {
+ /*
+ * Interrupt in progress. Migrating the irq now would change the
+ * vector information in the IO-APIC RTE, which would confuse
+ * the EOI broadcast performed by the cpu.
+ * So, delay the irq migration to the next instance.
+ */
+ schedule_delayed_work(&ir_migration_work, 1);
+ goto unmask;
+ }
+
+ /* everything is clear; we have the right of way */
+ migrate_ioapic_irq(irq, desc->pending_mask);
+
+ ret = 0;
+ desc->status &= ~IRQ_MOVE_PENDING;
+ cpus_clear(desc->pending_mask);
+
+unmask:
+ unmask_IO_APIC_irq(irq);
+ return ret;
+}
+
+static void ir_irq_migration(struct work_struct *work)
+{
+ unsigned int irq;
+ struct irq_desc *desc;
+
+ for_each_irq_desc(irq, desc) {
+ if (desc->status & IRQ_MOVE_PENDING) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if (!desc->chip->set_affinity ||
+ !(desc->status & IRQ_MOVE_PENDING)) {
+ desc->status &= ~IRQ_MOVE_PENDING;
+ spin_unlock_irqrestore(&desc->lock, flags);
+ continue;
+ }
+
+ desc->chip->set_affinity(irq, desc->pending_mask);
+ spin_unlock_irqrestore(&desc->lock, flags);
+ }
+ }
+}
+
+/*
+ * Migrates the IRQ destination in the process context.
+ */
+static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc->status & IRQ_LEVEL) {
+ desc->status |= IRQ_MOVE_PENDING;
+ desc->pending_mask = mask;
+ migrate_irq_remapped_level(irq);
+ return;
+ }
+
+ migrate_ioapic_irq(irq, mask);
+}
+#endif
+
asmlinkage void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
ack_APIC_irq();
+#ifdef CONFIG_X86_64
exit_idle();
+#endif
irq_enter();
me = smp_processor_id();
@@ -1410,11 +2214,12 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
struct irq_desc *desc;
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
- if (irq >= NR_IRQS)
+
+ desc = irq_to_desc(irq);
+ if (!desc)
continue;
- desc = irq_desc + irq;
- cfg = irq_cfg + irq;
+ cfg = irq_cfg(irq);
spin_lock(&desc->lock);
if (!cfg->move_cleanup_count)
goto unlock;
@@ -1433,7 +2238,7 @@ unlock:
static void irq_complete_move(unsigned int irq)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg = irq_cfg(irq);
unsigned vector, me;
if (likely(!cfg->move_in_progress))
@@ -1453,6 +2258,17 @@ static void irq_complete_move(unsigned int irq)
#else
static inline void irq_complete_move(unsigned int irq) {}
#endif
+#ifdef CONFIG_INTR_REMAP
+static void ack_x2apic_level(unsigned int irq)
+{
+ ack_x2APIC_irq();
+}
+
+static void ack_x2apic_edge(unsigned int irq)
+{
+ ack_x2APIC_irq();
+}
+#endif
static void ack_apic_edge(unsigned int irq)
{
@@ -1461,19 +2277,50 @@ static void ack_apic_edge(unsigned int irq)
ack_APIC_irq();
}
+atomic_t irq_mis_count;
+
static void ack_apic_level(unsigned int irq)
{
+#ifdef CONFIG_X86_32
+ unsigned long v;
+ int i;
+#endif
int do_unmask_irq = 0;
irq_complete_move(irq);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
- if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+ if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
mask_IO_APIC_irq(irq);
}
#endif
+#ifdef CONFIG_X86_32
+ /*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as an edge-triggered one, but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ */
+ i = irq_cfg(irq)->vector;
+
+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+#endif
+
/*
* We must acknowledge the irq before we move it or the acknowledge will
* not propagate properly.
@@ -1512,24 +2359,51 @@ static void ack_apic_level(unsigned int irq)
move_masked_irq(irq);
unmask_IO_APIC_irq(irq);
}
+
+#ifdef CONFIG_X86_32
+ if (!(v & (1 << (i & 0x1f)))) {
+ atomic_inc(&irq_mis_count);
+ spin_lock(&ioapic_lock);
+ __mask_and_edge_IO_APIC_irq(irq);
+ __unmask_and_level_IO_APIC_irq(irq);
+ spin_unlock(&ioapic_lock);
+ }
+#endif
}
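
The TMR access above packs one trigger-mode bit per vector, 32 vectors per register, with registers spaced 0x10 apart; hence the ((i & ~0x1f) >> 1) offset. A worked sketch of the same arithmetic (the vector value is illustrative):

/* Sketch: locate the TMR bit for a vector, e.g. 0x61 -> reg +0x30, bit 1. */
static inline int example_tmr_bit_set(int vector)
{
	/* (vector / 32) * 0x10 == ((vector & ~0x1f) >> 1) */
	unsigned long v = apic_read(APIC_TMR + ((vector & ~0x1f) >> 1));

	return (v >> (vector & 0x1f)) & 1;	/* 1 => level-triggered in TMR */
}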
static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_apic_edge,
- .eoi = ack_apic_level,
+ .name = "IO-APIC",
+ .startup = startup_ioapic_irq,
+ .mask = mask_IO_APIC_irq,
+ .unmask = unmask_IO_APIC_irq,
+ .ack = ack_apic_edge,
+ .eoi = ack_apic_level,
+#ifdef CONFIG_SMP
+ .set_affinity = set_ioapic_affinity_irq,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+#ifdef CONFIG_INTR_REMAP
+static struct irq_chip ir_ioapic_chip __read_mostly = {
+ .name = "IR-IO-APIC",
+ .startup = startup_ioapic_irq,
+ .mask = mask_IO_APIC_irq,
+ .unmask = unmask_IO_APIC_irq,
+ .ack = ack_x2apic_edge,
+ .eoi = ack_x2apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
+ .set_affinity = set_ir_ioapic_affinity_irq,
#endif
.retrigger = ioapic_retrigger_irq,
};
+#endif
static inline void init_IO_APIC_traps(void)
{
int irq;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
/*
* NOTE! The local APIC isn't very good at handling
@@ -1542,8 +2416,8 @@ static inline void init_IO_APIC_traps(void)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- for (irq = 0; irq < NR_IRQS ; irq++) {
- if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) {
+ for_each_irq_cfg(irq, cfg) {
+ if (IO_APIC_IRQ(irq) && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
@@ -1551,27 +2425,33 @@ static inline void init_IO_APIC_traps(void)
*/
if (irq < 16)
make_8259A_irq(irq);
- else
+ else {
+ desc = irq_to_desc(irq);
/* Strange. Oh, well.. */
- irq_desc[irq].chip = &no_irq_chip;
+ desc->chip = &no_irq_chip;
+ }
}
}
}
-static void unmask_lapic_irq(unsigned int irq)
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+static void mask_lapic_irq(unsigned int irq)
{
unsigned long v;
v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void mask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
static void ack_lapic_irq (unsigned int irq)
@@ -1588,7 +2468,10 @@ static struct irq_chip lapic_chip __read_mostly = {
static void lapic_register_intr(int irq)
{
- irq_desc[irq].status &= ~IRQ_LEVEL;
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+ desc->status &= ~IRQ_LEVEL;
set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
"edge");
}
@@ -1596,19 +2479,19 @@ static void lapic_register_intr(int irq)
static void __init setup_nmi(void)
{
/*
- * Dirty trick to enable the NMI watchdog ...
+ * Dirty trick to enable the NMI watchdog ...
* We put the 8259A master into AEOI mode and
* unmask on all local APICs LVT0 as NMI.
*
* The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
* is from Maciej W. Rozycki - so we do not have to EOI from
* the NMI handler or the timer interrupt.
- */
- printk(KERN_INFO "activating NMI Watchdog ...");
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
enable_NMI_through_LVT0();
- printk(" done.\n");
+ apic_printk(APIC_VERBOSE, " done.\n");
}
/*
@@ -1625,12 +2508,17 @@ static inline void __init unlock_ExtINT_logic(void)
unsigned char save_control, save_freq_select;
pin = find_isa_irq_pin(8, mp_INT);
+ if (pin == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
apic = find_isa_irq_apic(8, mp_INT);
- if (pin == -1)
+ if (apic == -1) {
+ WARN_ON_ONCE(1);
return;
+ }
entry0 = ioapic_read_entry(apic, pin);
-
clear_IO_APIC_pin(apic, pin);
memset(&entry1, 0, sizeof(entry1));
@@ -1665,23 +2553,38 @@ static inline void __init unlock_ExtINT_logic(void)
ioapic_write_entry(apic, pin, entry0);
}
+static int disable_timer_pin_1 __initdata;
+/* Actually the following option is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
+{
+ disable_timer_pin_1 = 1;
+ return 0;
+}
+early_param("disable_timer_pin_1", disable_timer_pin_setup);
+
+int timer_through_8259 __initdata;
+
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
*
- * FIXME: really need to revamp this for modern platforms only.
+ * FIXME: really need to revamp this for all platforms.
*/
static inline void __init check_timer(void)
{
- struct irq_cfg *cfg = irq_cfg + 0;
+ struct irq_cfg *cfg = irq_cfg(0);
int apic1, pin1, apic2, pin2;
unsigned long flags;
+ unsigned int ver;
int no_pin1 = 0;
local_irq_save(flags);
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+
/*
* get/set the timer IRQ vector:
*/
@@ -1690,10 +2593,18 @@ static inline void __init check_timer(void)
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
- * wire has to be disabled in the local APIC.
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
+ * automatically.
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
+#ifdef CONFIG_X86_32
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+#endif
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
@@ -1712,6 +2623,10 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
+#ifdef CONFIG_INTR_REMAP
+ if (intr_remapping_enabled)
+ panic("BIOS bug: timer not connected to IO-APIC");
+#endif
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -1729,7 +2644,7 @@ static inline void __init check_timer(void)
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
}
unmask_IO_APIC_irq(0);
- if (!no_timer_check && timer_irq_works()) {
+ if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
enable_8259A_irq(0);
@@ -1738,6 +2653,10 @@ static inline void __init check_timer(void)
clear_IO_APIC_pin(0, pin1);
goto out;
}
+#ifdef CONFIG_INTR_REMAP
+ if (intr_remapping_enabled)
+ panic("timer doesn't work through Interrupt-remapped IO-APIC");
+#endif
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -1777,6 +2696,9 @@ static inline void __init check_timer(void)
"through the IO-APIC - disabling NMI Watchdog!\n");
nmi_watchdog = NMI_NONE;
}
+#ifdef CONFIG_X86_32
+ timer_ack = 0;
+#endif
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
@@ -1813,13 +2735,6 @@ out:
local_irq_restore(flags);
}
-static int __init notimercheck(char *s)
-{
- no_timer_check = 1;
- return 1;
-}
-__setup("no_timer_check", notimercheck);
-
/*
* Traditionally ISA IRQ2 is the cascade IRQ, and is not available
* to devices. However there may be an I/O APIC pin available for
@@ -1837,27 +2752,49 @@ __setup("no_timer_check", notimercheck);
* the I/O APIC in all cases now. No actual device should request
* it anyway. --macro
*/
-#define PIC_IRQS (1<<2)
+#define PIC_IRQS (1 << PIC_CASCADE_IR)
void __init setup_IO_APIC(void)
{
+#ifdef CONFIG_X86_32
+ enable_IO_APIC();
+#else
/*
* calling enable_IO_APIC() is moved to setup_local_APIC for BP
*/
+#endif
io_apic_irqs = ~PIC_IRQS;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
-
+ /*
+ * Set up IO-APIC IRQ routing.
+ */
+#ifdef CONFIG_X86_32
+ if (!acpi_ioapic)
+ setup_ioapic_ids_from_mpc();
+#endif
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
check_timer();
- if (!acpi_ioapic)
- print_IO_APIC();
}
+/*
+ * Called after all the initialization is done. If we didn't find any
+ * APIC bugs then we can allow the modify fast path
+ */
+
+static int __init io_apic_bug_finalize(void)
+{
+ if (sis_apic_bug == -1)
+ sis_apic_bug = 0;
+ return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
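
sis_apic_bug is tri-state: -1 means not yet known, 0 absent, 1 present; until this late initcall runs, the read-modify-write helpers must assume the worst. A sketch of the fast path it enables, mirroring the io_apic_modify() logic in this file:

/* Sketch: the RMW path gated by sis_apic_bug. */
static inline void example_io_apic_modify(unsigned int apic,
					  unsigned int reg, unsigned int value)
{
	struct io_apic __iomem *io_apic = io_apic_base(apic);

	/*
	 * Buggy SiS chipsets lose the index register on the data write,
	 * so rewrite it whenever the bug is present (1) or unknown (-1).
	 */
	if (sis_apic_bug)
		writel(reg, &io_apic->index);
	writel(value, &io_apic->data);
}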
+
struct sysfs_ioapic_data {
struct sys_device dev;
struct IO_APIC_route_entry entry[0];
@@ -1945,38 +2882,60 @@ device_initcall(ioapic_init_sysfs);
/*
* Dynamic irq allocate and deallocation
*/
-int create_irq(void)
+unsigned int create_irq_nr(unsigned int irq_want)
{
/* Allocate an unused irq */
- int irq;
- int new;
+ unsigned int irq;
+ unsigned int new;
unsigned long flags;
+ struct irq_cfg *cfg_new;
- irq = -ENOSPC;
+ irq_want = nr_irqs - 1;
+
+ irq = 0;
spin_lock_irqsave(&vector_lock, flags);
- for (new = (NR_IRQS - 1); new >= 0; new--) {
+ for (new = irq_want; new > 0; new--) {
if (platform_legacy_irq(new))
continue;
- if (irq_cfg[new].vector != 0)
+ cfg_new = irq_cfg(new);
+ if (cfg_new && cfg_new->vector != 0)
continue;
+ /* check if need to create one */
+ if (!cfg_new)
+ cfg_new = irq_cfg_alloc(new);
if (__assign_irq_vector(new, TARGET_CPUS) == 0)
irq = new;
break;
}
spin_unlock_irqrestore(&vector_lock, flags);
- if (irq >= 0) {
+ if (irq > 0) {
dynamic_irq_init(irq);
}
return irq;
}
+int create_irq(void)
+{
+ int irq;
+
+ irq = create_irq_nr(nr_irqs - 1);
+
+ if (irq == 0)
+ irq = -1;
+
+ return irq;
+}
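
Note the return conventions: create_irq_nr() yields 0 on failure rather than a negative errno, and create_irq() folds that back into -1 for its existing callers. A minimal usage sketch (the error value is chosen for illustration):

static int example_dynamic_irq(void)
{
	unsigned int irq;

	irq = create_irq_nr(nr_irqs - 1);	/* scan downward for a free slot */
	if (irq == 0)
		return -ENOSPC;			/* 0 signals allocation failure */

	/* ... program and use the irq ... */

	destroy_irq(irq);	/* releases the vector (and IRTE, if remapped) */
	return 0;
}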
+
void destroy_irq(unsigned int irq)
{
unsigned long flags;
dynamic_irq_cleanup(irq);
+#ifdef CONFIG_INTR_REMAP
+ free_irte(irq);
+#endif
spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq);
spin_unlock_irqrestore(&vector_lock, flags);
@@ -1988,17 +2947,49 @@ void destroy_irq(unsigned int irq)
#ifdef CONFIG_PCI_MSI
static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
int err;
unsigned dest;
cpumask_t tmp;
tmp = TARGET_CPUS;
err = assign_irq_vector(irq, tmp);
- if (!err) {
- cpus_and(tmp, cfg->domain, tmp);
- dest = cpu_mask_to_apicid(tmp);
+ if (err)
+ return err;
+
+ cfg = irq_cfg(irq);
+ cpus_and(tmp, cfg->domain, tmp);
+ dest = cpu_mask_to_apicid(tmp);
+
+#ifdef CONFIG_INTR_REMAP
+ if (irq_remapped(irq)) {
+ struct irte irte;
+ int ir_index;
+ u16 sub_handle;
+
+ ir_index = map_irq_to_irte_handle(irq, &sub_handle);
+ BUG_ON(ir_index == -1);
+
+ memset(&irte, 0, sizeof(irte));
+ irte.present = 1;
+ irte.dst_mode = INT_DEST_MODE;
+ irte.trigger_mode = 0; /* edge */
+ irte.dlvry_mode = INT_DELIVERY_MODE;
+ irte.vector = cfg->vector;
+ irte.dest_id = IRTE_DEST(dest);
+
+ modify_irte(irq, &irte);
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->data = sub_handle;
+ msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+ MSI_ADDR_IR_SHV |
+ MSI_ADDR_IR_INDEX1(ir_index) |
+ MSI_ADDR_IR_INDEX2(ir_index);
+ } else
+#endif
+ {
msg->address_hi = MSI_ADDR_BASE_HI;
msg->address_lo =
MSI_ADDR_BASE_LO |
@@ -2024,10 +3015,11 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
#ifdef CONFIG_SMP
static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
cpumask_t tmp;
+ struct irq_desc *desc;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
@@ -2036,6 +3028,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
if (assign_irq_vector(irq, mask))
return;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
@@ -2047,8 +3040,61 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg(irq, &msg);
- irq_desc[irq].affinity = mask;
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
+}
+
+#ifdef CONFIG_INTR_REMAP
+/*
+ * Migrate the MSI irq to another cpumask. This migration is
+ * done in the process context using interrupt-remapping hardware.
+ */
+static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct irq_cfg *cfg;
+ unsigned int dest;
+ cpumask_t tmp, cleanup_mask;
+ struct irte irte;
+ struct irq_desc *desc;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ return;
+
+ if (get_irte(irq, &irte))
+ return;
+
+ if (assign_irq_vector(irq, mask))
+ return;
+
+ cfg = irq_cfg(irq);
+ cpus_and(tmp, cfg->domain, mask);
+ dest = cpu_mask_to_apicid(tmp);
+
+ irte.vector = cfg->vector;
+ irte.dest_id = IRTE_DEST(dest);
+
+ /*
+ * atomically update the IRTE with the new destination and vector.
+ */
+ modify_irte(irq, &irte);
+
+ /*
+ * After this point, all the interrupts will start arriving
+ * at the new destination. So it is time to clean up the
+ * previous vector allocation.
+ */
+ if (cfg->move_in_progress) {
+ cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+ cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ cfg->move_in_progress = 0;
+ }
+
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
}
+#endif
#endif /* CONFIG_SMP */
/*
@@ -2066,26 +3112,179 @@ static struct irq_chip msi_chip = {
.retrigger = ioapic_retrigger_irq,
};
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+#ifdef CONFIG_INTR_REMAP
+static struct irq_chip msi_ir_chip = {
+ .name = "IR-PCI-MSI",
+ .unmask = unmask_msi_irq,
+ .mask = mask_msi_irq,
+ .ack = ack_x2apic_edge,
+#ifdef CONFIG_SMP
+ .set_affinity = ir_set_msi_irq_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+/*
+ * Map the PCI dev to the corresponding remapping hardware unit
+ * and allocate 'nvec' consecutive interrupt-remapping table entries
+ * in it.
+ */
+static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
+{
+ struct intel_iommu *iommu;
+ int index;
+
+ iommu = map_dev_to_ir(dev);
+ if (!iommu) {
+ printk(KERN_ERR
+ "Unable to map PCI %s to iommu\n", pci_name(dev));
+ return -ENOENT;
+ }
+
+ index = alloc_irte(iommu, irq, nvec);
+ if (index < 0) {
+ printk(KERN_ERR
+ "Unable to allocate %d IRTE for PCI %s\n", nvec,
+ pci_name(dev));
+ return -ENOSPC;
+ }
+ return index;
+}
+#endif
+
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
{
+ int ret;
struct msi_msg msg;
- int irq, ret;
- irq = create_irq();
- if (irq < 0)
- return irq;
ret = msi_compose_msg(dev, irq, &msg);
+ if (ret < 0)
+ return ret;
+
+ set_irq_msi(irq, desc);
+ write_msi_msg(irq, &msg);
+
+#ifdef CONFIG_INTR_REMAP
+ if (irq_remapped(irq)) {
+ struct irq_desc *desc = irq_to_desc(irq);
+ /*
+ * irq migration in process context
+ */
+ desc->status |= IRQ_MOVE_PCNTXT;
+ set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
+ } else
+#endif
+ set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+
+ return 0;
+}
+
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+ unsigned int irq;
+
+ irq = dev->bus->number;
+ irq <<= 8;
+ irq |= dev->devfn;
+ irq <<= 12;
+
+ return irq;
+}
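
The hint packs bus and devfn into the high bits, leaving 12 low bits of headroom so per-device irq numbers stay clustered and unique. A worked example with illustrative device numbers:

/* Worked example: bus 0x02, devfn 0x08 (slot 1, function 0). */
static void example_irq_hint(void)
{
	unsigned int irq = 0x02;	/* bus number */

	irq <<= 8;			/* 0x0200 */
	irq |= 0x08;			/* 0x0208: devfn in the low byte */
	irq <<= 12;			/* 0x208000 */

	BUG_ON(irq != 0x208000);	/* arch_setup_msi_irq() scans from +0x100 */
}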
+
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+ unsigned int irq;
+ int ret;
+ unsigned int irq_want;
+
+ irq_want = build_irq_for_pci_dev(dev) + 0x100;
+
+ irq = create_irq_nr(irq_want);
+ if (irq == 0)
+ return -1;
+
+#ifdef CONFIG_INTR_REMAP
+ if (!intr_remapping_enabled)
+ goto no_ir;
+
+ ret = msi_alloc_irte(dev, irq, 1);
+ if (ret < 0)
+ goto error;
+no_ir:
+#endif
+ ret = setup_msi_irq(dev, desc, irq);
if (ret < 0) {
destroy_irq(irq);
return ret;
}
+ return 0;
- set_irq_msi(irq, desc);
- write_msi_msg(irq, &msg);
+#ifdef CONFIG_INTR_REMAP
+error:
+ destroy_irq(irq);
+ return ret;
+#endif
+}
+
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ unsigned int irq;
+ int ret, sub_handle;
+ struct msi_desc *desc;
+ unsigned int irq_want;
- set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+#ifdef CONFIG_INTR_REMAP
+ struct intel_iommu *iommu = NULL;
+ int index = 0;
+#endif
+ irq_want = build_irq_for_pci_dev(dev) + 0x100;
+ sub_handle = 0;
+ list_for_each_entry(desc, &dev->msi_list, list) {
+ irq = create_irq_nr(irq_want--);
+ if (irq == 0)
+ return -1;
+#ifdef CONFIG_INTR_REMAP
+ if (!intr_remapping_enabled)
+ goto no_ir;
+
+ if (!sub_handle) {
+ /*
+ * allocate a consecutive block of IRTEs
+ * for 'nvec'
+ */
+ index = msi_alloc_irte(dev, irq, nvec);
+ if (index < 0) {
+ ret = index;
+ goto error;
+ }
+ } else {
+ iommu = map_dev_to_ir(dev);
+ if (!iommu) {
+ ret = -ENOENT;
+ goto error;
+ }
+ /*
+ * set up the mapping between the irq and the IRTE
+ * base index, with the sub_handle pointing to the
+ * appropriate interrupt remap table entry.
+ */
+ set_irte_irq(irq, iommu, index, sub_handle);
+ }
+no_ir:
+#endif
+ ret = setup_msi_irq(dev, desc, irq);
+ if (ret < 0)
+ goto error;
+ sub_handle++;
+ }
return 0;
+
+error:
+ destroy_irq(irq);
+ return ret;
}
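
For multi-vector MSI under remapping, only the first loop pass allocates the IRTE block; later passes attach to it by sub_handle. A sketch of the resulting layout for a hypothetical nvec = 3 request (the index + sub_handle resolution is how the remapping hardware is expected to combine the two):

/*
 * Hypothetical nvec = 3 walk through the loop above:
 *
 *   pass   sub_handle   action
 *   ----   ----------   --------------------------------------
 *     0        0        index = msi_alloc_irte(dev, irq, 3)
 *     1        1        set_irte_irq(irq, iommu, index, 1)
 *     2        2        set_irte_irq(irq, iommu, index, 2)
 *
 * msi_compose_msg() stores sub_handle in msg->data and sets
 * MSI_ADDR_IR_SHV, so each vector resolves to IRTE index + sub_handle.
 */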
void arch_teardown_msi_irq(unsigned int irq)
@@ -2097,10 +3296,11 @@ void arch_teardown_msi_irq(unsigned int irq)
#ifdef CONFIG_SMP
static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
cpumask_t tmp;
+ struct irq_desc *desc;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
@@ -2109,6 +3309,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
if (assign_irq_vector(irq, mask))
return;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
@@ -2120,7 +3321,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
dmar_msi_write(irq, &msg);
- irq_desc[irq].affinity = mask;
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
}
#endif /* CONFIG_SMP */
@@ -2150,6 +3352,69 @@ int arch_setup_dmar_msi(unsigned int irq)
}
#endif
+#ifdef CONFIG_HPET_TIMER
+
+#ifdef CONFIG_SMP
+static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ struct msi_msg msg;
+ unsigned int dest;
+ cpumask_t tmp;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ return;
+
+ if (assign_irq_vector(irq, mask))
+ return;
+
+ cfg = irq_cfg(irq);
+ cpus_and(tmp, cfg->domain, mask);
+ dest = cpu_mask_to_apicid(tmp);
+
+ hpet_msi_read(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ hpet_msi_write(irq, &msg);
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+struct irq_chip hpet_msi_type = {
+ .name = "HPET_MSI",
+ .unmask = hpet_msi_unmask,
+ .mask = hpet_msi_mask,
+ .ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+ .set_affinity = hpet_msi_set_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_hpet_msi(unsigned int irq)
+{
+ int ret;
+ struct msi_msg msg;
+
+ ret = msi_compose_msg(NULL, irq, &msg);
+ if (ret < 0)
+ return ret;
+
+ hpet_msi_write(irq, &msg);
+ set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
+ "edge");
+
+ return 0;
+}
+#endif
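
arch_setup_hpet_msi() composes the message with a NULL pci_dev and installs hpet_msi_type, so a caller only needs an irq and this single call. A hedged caller sketch (the allocation step is an assumption about the HPET driver side):

static int example_hpet_msi_setup(void)
{
	int irq = create_irq();		/* any free dynamically allocated irq */

	if (irq < 0)
		return irq;

	/* writes the MSI message and hooks up hpet_msi_type/handle_edge_irq */
	return arch_setup_hpet_msi(irq);
}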
+
#endif /* CONFIG_PCI_MSI */
/*
* Hypertransport interrupt support
@@ -2174,9 +3439,10 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
unsigned int dest;
cpumask_t tmp;
+ struct irq_desc *desc;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
@@ -2185,11 +3451,13 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
if (assign_irq_vector(irq, mask))
return;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
target_ht_irq(irq, dest, cfg->vector);
- irq_desc[irq].affinity = mask;
+ desc = irq_to_desc(irq);
+ desc->affinity = mask;
}
#endif
@@ -2206,7 +3474,7 @@ static struct irq_chip ht_irq_chip = {
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
int err;
cpumask_t tmp;
@@ -2216,6 +3484,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
struct ht_irq_msg msg;
unsigned dest;
+ cfg = irq_cfg(irq);
cpus_and(tmp, cfg->domain, tmp);
dest = cpu_mask_to_apicid(tmp);
@@ -2238,20 +3507,196 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
set_irq_chip_and_handler_name(irq, &ht_irq_chip,
handle_edge_irq, "edge");
+
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
}
return err;
}
#endif /* CONFIG_HT_IRQ */
+#ifdef CONFIG_X86_64
+/*
+ * Re-target the irq to the specified CPU and enable the specified MMR located
+ * on the specified blade to allow the sending of MSIs to the specified CPU.
+ */
+int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
+ unsigned long mmr_offset)
+{
+ const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+ struct irq_cfg *cfg;
+ int mmr_pnode;
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+ unsigned long flags;
+ int err;
+
+ err = assign_irq_vector(irq, *eligible_cpu);
+ if (err != 0)
+ return err;
+
+ spin_lock_irqsave(&vector_lock, flags);
+ set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
+ irq_name);
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ cfg = irq_cfg(irq);
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
+ entry->vector = cfg->vector;
+ entry->delivery_mode = INT_DELIVERY_MODE;
+ entry->dest_mode = INT_DEST_MODE;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+
+ mmr_pnode = uv_blade_to_pnode(mmr_blade);
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+
+ return irq;
+}
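
The UV code composes the 64-bit MMR value by overlaying struct uv_IO_APIC_route_entry on an unsigned long; the BUG_ON above guards exactly that layout assumption. A condensed sketch of the pattern:

/* Sketch: compose a UV MMR value through the route-entry overlay. */
static unsigned long example_uv_mmr_value(struct irq_cfg *cfg, unsigned int apicid)
{
	unsigned long mmr_value = 0;
	struct uv_IO_APIC_route_entry *entry =
			(struct uv_IO_APIC_route_entry *)&mmr_value;

	BUILD_BUG_ON(sizeof(*entry) != sizeof(mmr_value));	/* overlay must fit */

	entry->vector		= cfg->vector;
	entry->delivery_mode	= INT_DELIVERY_MODE;
	entry->dest_mode	= INT_DEST_MODE;
	entry->dest		= apicid;	/* remaining fields stay 0: unmasked, edge */

	return mmr_value;	/* handed to uv_write_global_mmr64() */
}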
+
+/*
+ * Disable the specified MMR located on the specified blade so that MSIs are
+ * no longer allowed to be sent.
+ */
+void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
+{
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+ int mmr_pnode;
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
+ entry->mask = 1;
+
+ mmr_pnode = uv_blade_to_pnode(mmr_blade);
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+}
+#endif /* CONFIG_X86_64 */
+
+int __init io_apic_get_redir_entries(int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.entries;
+}
+
+int __init probe_nr_irqs(void)
+{
+ int idx;
+ int nr = 0;
+#ifndef CONFIG_XEN
+ int nr_min = 32;
+#else
+ int nr_min = NR_IRQS;
+#endif
+
+ for (idx = 0; idx < nr_ioapics; idx++)
+ nr += io_apic_get_redir_entries(idx) + 1;
+
+ /* double it for hotplug, MSI and NMI headroom */
+ nr <<= 1;
+
+ /* never go below the minimum */
+ if (nr < nr_min)
+ nr = nr_min;
+
+ return nr;
+}
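
probe_nr_irqs() sizes the irq space from the discovered hardware instead of a static NR_IRQS. A worked example for a hypothetical non-Xen box with two 24-pin IO-APICs:

/*
 * Worked example: two IO-APICs, 24 redirection entries each.
 *
 *   io_apic_get_redir_entries() returns the 0-based count (23), so
 *   nr  = (23 + 1) + (23 + 1) = 48
 *   nr <<= 1                  = 96    headroom for hotplug/MSI/NMI
 *   nr < nr_min (32)?  no     -> 96 is returned
 */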
+
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
#ifdef CONFIG_ACPI
-#define IO_APIC_MAX_ID 0xFE
+#ifdef CONFIG_X86_32
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
+{
+ union IO_APIC_reg_00 reg_00;
+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+ physid_mask_t tmp;
+ unsigned long flags;
+ int i = 0;
-int __init io_apic_get_redir_entries (int ioapic)
+ /*
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * supports up to 16 on one shared APIC bus.
+ *
+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+ * advantage of new APIC bus architecture.
+ */
+
+ if (physids_empty(apic_id_map))
+ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ if (apic_id >= get_physical_broadcast()) {
+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ apic_id = reg_00.bits.ID;
+ }
+
+ /*
+ * Every APIC in a system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (check_apicid_used(apic_id_map, apic_id)) {
+
+ for (i = 0; i < get_physical_broadcast(); i++) {
+ if (!check_apicid_used(apic_id_map, i))
+ break;
+ }
+
+ if (i == get_physical_broadcast())
+ panic("Max apic_id exceeded!\n");
+
+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+ "trying %d\n", ioapic, apic_id, i);
+
+ apic_id = i;
+ }
+
+ tmp = apicid_to_cpu_present(apic_id);
+ physids_or(apic_id_map, apic_id_map, tmp);
+
+ if (reg_00.bits.ID != apic_id) {
+ reg_00.bits.ID = apic_id;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* Sanity check */
+ if (reg_00.bits.ID != apic_id) {
+ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+ return -1;
+ }
+ }
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+ return apic_id;
+}
+
+int __init io_apic_get_version(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2260,9 +3705,9 @@ int __init io_apic_get_redir_entries (int ioapic)
reg_01.raw = io_apic_read(ioapic, 1);
spin_unlock_irqrestore(&ioapic_lock, flags);
- return reg_01.bits.entries;
+ return reg_01.bits.version;
}
-
+#endif
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
{
@@ -2314,6 +3759,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
void __init setup_ioapic_dest(void)
{
int pin, ioapic, irq, irq_entry;
+ struct irq_cfg *cfg;
if (skip_ioapic_setup == 1)
return;
@@ -2329,10 +3775,15 @@ void __init setup_ioapic_dest(void)
* when you have too many devices, because at that time only boot
* cpu is online.
*/
- if (!irq_cfg[irq].vector)
+ cfg = irq_cfg(irq);
+ if (!cfg->vector)
setup_IO_APIC_irq(ioapic, pin, irq,
irq_trigger(irq_entry),
irq_polarity(irq_entry));
+#ifdef CONFIG_INTR_REMAP
+ else if (intr_remapping_enabled)
+ set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
+#endif
else
set_ioapic_affinity_irq(irq, TARGET_CPUS);
}
@@ -2383,18 +3834,33 @@ void __init ioapic_init_mappings(void)
struct resource *ioapic_res;
int i;
+ irq_2_pin_init();
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
ioapic_phys = mp_ioapics[i].mp_apicaddr;
+#ifdef CONFIG_X86_32
+ if (!ioapic_phys) {
+ printk(KERN_ERR
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ }
+#endif
} else {
+#ifdef CONFIG_X86_32
+fake_ioapic_page:
+#endif
ioapic_phys = (unsigned long)
alloc_bootmem_pages(PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
set_fixmap_nocache(idx, ioapic_phys);
apic_printk(APIC_VERBOSE,
- "mapped IOAPIC to %016lx (%016lx)\n",
+ "mapped IOAPIC to %08lx (%08lx)\n",
__fix_to_virt(idx), ioapic_phys);
idx++;
@@ -2428,4 +3894,3 @@ static int __init ioapic_insert_resources(void)
/* Insert the IO APIC resources after PCI initialization has occurred to handle
 * IO APICs that are mapped in on a BAR in PCI space. */
late_initcall(ioapic_insert_resources);
-
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
deleted file mode 100644
index 09cddb57bec..00000000000
--- a/arch/x86/kernel/io_apic_32.c
+++ /dev/null
@@ -1,2901 +0,0 @@
-/*
- * Intel IO-APIC support for multi-Pentium hosts.
- *
- * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
- *
- * Many thanks to Stig Venaas for trying out countless experimental
- * patches and reporting/debugging problems patiently!
- *
- * (c) 1999, Multiple IO-APIC support, developed by
- * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
- * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
- * further tested and cleaned up by Zach Brown <zab@redhat.com>
- * and Ingo Molnar <mingo@redhat.com>
- *
- * Fixes
- * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
- * thanks to Eric Gilmore
- * and Rolf G. Tews
- * for testing these extensively
- * Paul Diefenbaugh : Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/bootmem.h>
-#include <linux/mc146818rtc.h>
-#include <linux/compiler.h>
-#include <linux/acpi.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/pci.h>
-#include <linux/msi.h>
-#include <linux/htirq.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/jiffies.h> /* time_after() */
-
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/desc.h>
-#include <asm/timer.h>
-#include <asm/i8259.h>
-#include <asm/nmi.h>
-#include <asm/msidef.h>
-#include <asm/hypertransport.h>
-
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-
-int (*ioapic_renumber_irq)(int ioapic, int irq);
-atomic_t irq_mis_count;
-
-/* Where if anywhere is the i8259 connect in external int mode */
-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
-
-static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
-
-int timer_through_8259 __initdata;
-
-/*
- * Is the SiS APIC rmw bug present ?
- * -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
-
-/* I/O APIC entries */
-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
-int nr_ioapics;
-
-/* MP IRQ source entries */
-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* # of MP IRQ source entries */
-int mp_irq_entries;
-
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-
-static int disable_timer_pin_1 __initdata;
-
-/*
- * Rough estimation of how many shared IRQs there are, can
- * be changed anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
-
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
-
-static struct irq_pin_list {
- int apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
-
-struct io_apic {
- unsigned int index;
- unsigned int unused[3];
- unsigned int data;
-};
-
-static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
-{
- return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
-}
-
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
- writel(reg, &io_apic->index);
- return readl(&io_apic->data);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
- writel(reg, &io_apic->index);
- writel(value, &io_apic->data);
-}
-
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
- volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
- if (sis_apic_bug)
- writel(reg, &io_apic->index);
- writel(value, &io_apic->data);
-}
-
-union entry_union {
- struct { u32 w1, w2; };
- struct IO_APIC_route_entry entry;
-};
-
-static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
-{
- union entry_union eu;
- unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
- eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
- eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- return eu.entry;
-}
-
-/*
- * When we write a new IO APIC routing entry, we need to write the high
- * word first! If the mask bit in the low word is clear, we will enable
- * the interrupt, and we need to make sure the entry is fully populated
- * before that happens.
- */
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
- union entry_union eu;
- eu.entry = e;
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
-}
-
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
- unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
- __ioapic_write_entry(apic, pin, e);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * When we mask an IO APIC routing entry, we need to write the low
- * word first, in order to set the mask bit before we change the
- * high bits!
- */
-static void ioapic_mask_entry(int apic, int pin)
-{
- unsigned long flags;
- union entry_union eu = { .entry.mask = 1 };
-
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
- * shared ISA-space IRQs, so we have to support them. We are super
- * fast in the common case, and fast for shared ISA-space IRQs.
- */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
-{
- static int first_free_entry = NR_IRQS;
- struct irq_pin_list *entry = irq_2_pin + irq;
-
- while (entry->next)
- entry = irq_2_pin + entry->next;
-
- if (entry->pin != -1) {
- entry->next = first_free_entry;
- entry = irq_2_pin + entry->next;
- if (++first_free_entry >= PIN_MAP_SIZE)
- panic("io_apic.c: whoops");
- }
- entry->apic = apic;
- entry->pin = pin;
-}
-
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq(unsigned int irq,
- int oldapic, int oldpin,
- int newapic, int newpin)
-{
- struct irq_pin_list *entry = irq_2_pin + irq;
-
- while (1) {
- if (entry->apic == oldapic && entry->pin == oldpin) {
- entry->apic = newapic;
- entry->pin = newpin;
- }
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
-}
-
-static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
-{
- struct irq_pin_list *entry = irq_2_pin + irq;
- unsigned int pin, reg;
-
- for (;;) {
- pin = entry->pin;
- if (pin == -1)
- break;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
- reg &= ~disable;
- reg |= enable;
- io_apic_modify(entry->apic, 0x10 + pin*2, reg);
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
-}
-
-/* mask = 1 */
-static void __mask_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
-}
-
-/* mask = 0 */
-static void __unmask_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
-}
-
-/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
- IO_APIC_REDIR_LEVEL_TRIGGER);
-}
-
-/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
- IO_APIC_REDIR_MASKED);
-}
-
-static void mask_IO_APIC_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(irq);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void unmask_IO_APIC_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(irq);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
-{
- struct IO_APIC_route_entry entry;
-
- /* Check delivery_mode to be sure we're not clearing an SMI pin */
- entry = ioapic_read_entry(apic, pin);
- if (entry.delivery_mode == dest_SMI)
- return;
-
- /*
- * Disable it in the IO-APIC irq-routing table:
- */
- ioapic_mask_entry(apic, pin);
-}
-
-static void clear_IO_APIC(void)
-{
- int apic, pin;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- clear_IO_APIC_pin(apic, pin);
-}
-
-#ifdef CONFIG_SMP
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
-{
- unsigned long flags;
- int pin;
- struct irq_pin_list *entry = irq_2_pin + irq;
- unsigned int apicid_value;
- cpumask_t tmp;
-
- cpus_and(tmp, cpumask, cpu_online_map);
- if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- cpus_and(cpumask, tmp, CPU_MASK_ALL);
-
- apicid_value = cpu_mask_to_apicid(cpumask);
- /* Prepare to do the io_apic_write */
- apicid_value = apicid_value << 24;
- spin_lock_irqsave(&ioapic_lock, flags);
- for (;;) {
- pin = entry->pin;
- if (pin == -1)
- break;
- io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
- irq_desc[irq].affinity = cpumask;
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#if defined(CONFIG_IRQBALANCE)
-# include <asm/processor.h> /* kernel_thread() */
-# include <linux/kernel_stat.h> /* kstat */
-# include <linux/slab.h> /* kmalloc() */
-# include <linux/timer.h>
-
-#define IRQBALANCE_CHECK_ARCH -999
-#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
-#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
-#define BALANCED_IRQ_MORE_DELTA (HZ/10)
-#define BALANCED_IRQ_LESS_DELTA (HZ)
-
-static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
-static int physical_balance __read_mostly;
-static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
-
-static struct irq_cpu_info {
- unsigned long *last_irq;
- unsigned long *irq_delta;
- unsigned long irq;
-} irq_cpu_data[NR_CPUS];
-
-#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
-
-#define IDLE_ENOUGH(cpu,now) \
- (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
-
-#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
-
-#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
-
-static cpumask_t balance_irq_affinity[NR_IRQS] = {
- [0 ... NR_IRQS-1] = CPU_MASK_ALL
-};
-
-void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
-{
- balance_irq_affinity[irq] = mask;
-}
-
-static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
- unsigned long now, int direction)
-{
- int search_idle = 1;
- int cpu = curr_cpu;
-
- goto inside;
-
- do {
- if (unlikely(cpu == curr_cpu))
- search_idle = 0;
-inside:
- if (direction == 1) {
- cpu++;
- if (cpu >= NR_CPUS)
- cpu = 0;
- } else {
- cpu--;
- if (cpu == -1)
- cpu = NR_CPUS-1;
- }
- } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
- (search_idle && !IDLE_ENOUGH(cpu, now)));
-
- return cpu;
-}
-
-static inline void balance_irq(int cpu, int irq)
-{
- unsigned long now = jiffies;
- cpumask_t allowed_mask;
- unsigned int new_cpu;
-
- if (irqbalance_disabled)
- return;
-
- cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
- new_cpu = move(cpu, allowed_mask, now, 1);
- if (cpu != new_cpu)
- set_pending_irq(irq, cpumask_of_cpu(new_cpu));
-}
-
-static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
-{
- int i, j;
-
- for_each_online_cpu(i) {
- for (j = 0; j < NR_IRQS; j++) {
- if (!irq_desc[j].action)
- continue;
- /* Is it a significant load ? */
- if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
- useful_load_threshold)
- continue;
- balance_irq(i, j);
- }
- }
- balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
- return;
-}
-
-static void do_irq_balance(void)
-{
- int i, j;
- unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
- unsigned long move_this_load = 0;
- int max_loaded = 0, min_loaded = 0;
- int load;
- unsigned long useful_load_threshold = balanced_irq_interval + 10;
- int selected_irq;
- int tmp_loaded, first_attempt = 1;
- unsigned long tmp_cpu_irq;
- unsigned long imbalance = 0;
- cpumask_t allowed_mask, target_cpu_mask, tmp;
-
- for_each_possible_cpu(i) {
- int package_index;
- CPU_IRQ(i) = 0;
- if (!cpu_online(i))
- continue;
- package_index = CPU_TO_PACKAGEINDEX(i);
- for (j = 0; j < NR_IRQS; j++) {
- unsigned long value_now, delta;
- /* Is this an active IRQ or balancing disabled ? */
- if (!irq_desc[j].action || irq_balancing_disabled(j))
- continue;
- if (package_index == i)
- IRQ_DELTA(package_index, j) = 0;
- /* Determine the total count per processor per IRQ */
- value_now = (unsigned long) kstat_cpu(i).irqs[j];
-
- /* Determine the activity per processor per IRQ */
- delta = value_now - LAST_CPU_IRQ(i, j);
-
- /* Update last_cpu_irq[][] for the next time */
- LAST_CPU_IRQ(i, j) = value_now;
-
- /* Ignore IRQs whose rate is less than the clock */
- if (delta < useful_load_threshold)
- continue;
- /* update the load for the processor or package total */
- IRQ_DELTA(package_index, j) += delta;
-
- /* Keep track of the higher numbered sibling as well */
- if (i != package_index)
- CPU_IRQ(i) += delta;
- /*
- * We have sibling A and sibling B in the package
- *
- * cpu_irq[A] = load for cpu A + load for cpu B
- * cpu_irq[B] = load for cpu B
- */
- CPU_IRQ(package_index) += delta;
- }
- }
- /* Find the least loaded processor package */
- for_each_online_cpu(i) {
- if (i != CPU_TO_PACKAGEINDEX(i))
- continue;
- if (min_cpu_irq > CPU_IRQ(i)) {
- min_cpu_irq = CPU_IRQ(i);
- min_loaded = i;
- }
- }
- max_cpu_irq = ULONG_MAX;
-
-tryanothercpu:
- /*
- * Look for heaviest loaded processor.
- * We may come back to get the next heaviest loaded processor.
- * Skip processors with trivial loads.
- */
- tmp_cpu_irq = 0;
- tmp_loaded = -1;
- for_each_online_cpu(i) {
- if (i != CPU_TO_PACKAGEINDEX(i))
- continue;
- if (max_cpu_irq <= CPU_IRQ(i))
- continue;
- if (tmp_cpu_irq < CPU_IRQ(i)) {
- tmp_cpu_irq = CPU_IRQ(i);
- tmp_loaded = i;
- }
- }
-
- if (tmp_loaded == -1) {
- /*
- * In the case of small number of heavy interrupt sources,
- * loading some of the cpus too much. We use Ingo's original
- * approach to rotate them around.
- */
- if (!first_attempt && imbalance >= useful_load_threshold) {
- rotate_irqs_among_cpus(useful_load_threshold);
- return;
- }
- goto not_worth_the_effort;
- }
-
- first_attempt = 0; /* heaviest search */
- max_cpu_irq = tmp_cpu_irq; /* load */
- max_loaded = tmp_loaded; /* processor */
- imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-
- /*
- * if imbalance is less than approx 10% of max load, then
- * observe diminishing returns action. - quit
- */
- if (imbalance < (max_cpu_irq >> 3))
- goto not_worth_the_effort;
-
-tryanotherirq:
- /* if we select an IRQ to move that can't go where we want, then
- * see if there is another one to try.
- */
- move_this_load = 0;
- selected_irq = -1;
- for (j = 0; j < NR_IRQS; j++) {
- /* Is this an active IRQ? */
- if (!irq_desc[j].action)
- continue;
- if (imbalance <= IRQ_DELTA(max_loaded, j))
- continue;
- /* Try to find the IRQ that is closest to the imbalance
- * without going over.
- */
- if (move_this_load < IRQ_DELTA(max_loaded, j)) {
- move_this_load = IRQ_DELTA(max_loaded, j);
- selected_irq = j;
- }
- }
- if (selected_irq == -1)
- goto tryanothercpu;
-
- imbalance = move_this_load;
-
- /* For physical_balance case, we accumulated both load
- * values in the one of the siblings cpu_irq[],
- * to use the same code for physical and logical processors
- * as much as possible.
- *
- * NOTE: the cpu_irq[] array holds the sum of the load for
- * sibling A and sibling B in the slot for the lowest numbered
- * sibling (A), _AND_ the load for sibling B in the slot for
- * the higher numbered sibling.
- *
- * We seek the least loaded sibling by making the comparison
- * (A+B)/2 vs B
- */
- load = CPU_IRQ(min_loaded) >> 1;
- for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
- if (load > CPU_IRQ(j)) {
- /* This won't change cpu_sibling_map[min_loaded] */
- load = CPU_IRQ(j);
- min_loaded = j;
- }
- }
-
- cpus_and(allowed_mask,
- cpu_online_map,
- balance_irq_affinity[selected_irq]);
- target_cpu_mask = cpumask_of_cpu(min_loaded);
- cpus_and(tmp, target_cpu_mask, allowed_mask);
-
- if (!cpus_empty(tmp)) {
- /* mark for change destination */
- set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
-
- /* Since we made a change, come back sooner to
- * check for more variation.
- */
- balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
- return;
- }
- goto tryanotherirq;
-
-not_worth_the_effort:
- /*
- * if we did not find an IRQ to move, then adjust the time interval
- * upward
- */
- balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
- return;
-}
-
-static int balanced_irq(void *unused)
-{
- int i;
- unsigned long prev_balance_time = jiffies;
- long time_remaining = balanced_irq_interval;
-
- /* push everything to CPU 0 to give us a starting point. */
- for (i = 0 ; i < NR_IRQS ; i++) {
- irq_desc[i].pending_mask = cpumask_of_cpu(0);
- set_pending_irq(i, cpumask_of_cpu(0));
- }
-
- set_freezable();
- for ( ; ; ) {
- time_remaining = schedule_timeout_interruptible(time_remaining);
- try_to_freeze();
- if (time_after(jiffies,
- prev_balance_time+balanced_irq_interval)) {
- preempt_disable();
- do_irq_balance();
- prev_balance_time = jiffies;
- time_remaining = balanced_irq_interval;
- preempt_enable();
- }
- }
- return 0;
-}
-
-static int __init balanced_irq_init(void)
-{
- int i;
- struct cpuinfo_x86 *c;
- cpumask_t tmp;
-
- cpus_shift_right(tmp, cpu_online_map, 2);
- c = &boot_cpu_data;
- /* When not overwritten by the command line ask subarchitecture. */
- if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
- irqbalance_disabled = NO_BALANCE_IRQ;
- if (irqbalance_disabled)
- return 0;
-
- /* disable irqbalance completely if there is only one processor online */
- if (num_online_cpus() < 2) {
- irqbalance_disabled = 1;
- return 0;
- }
- /*
- * Enable physical balance only if more than 1 physical processor
- * is present
- */
- if (smp_num_siblings > 1 && !cpus_empty(tmp))
- physical_balance = 1;
-
- for_each_online_cpu(i) {
- irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
- irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
- if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
- printk(KERN_ERR "balanced_irq_init: out of memory");
- goto failed;
- }
- }
-
- printk(KERN_INFO "Starting balanced_irq\n");
- if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
- return 0;
- printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
-failed:
- for_each_possible_cpu(i) {
- kfree(irq_cpu_data[i].irq_delta);
- irq_cpu_data[i].irq_delta = NULL;
- kfree(irq_cpu_data[i].last_irq);
- irq_cpu_data[i].last_irq = NULL;
- }
- return 0;
-}
-
-int __devinit irqbalance_disable(char *str)
-{
- irqbalance_disabled = 1;
- return 1;
-}
-
-__setup("noirqbalance", irqbalance_disable);
-
-late_initcall(balanced_irq_init);
-#endif /* CONFIG_IRQBALANCE */
-#endif /* CONFIG_SMP */
-
-#ifndef CONFIG_SMP
-void send_IPI_self(int vector)
-{
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
- cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-}
-#endif /* !CONFIG_SMP */
-
-
-/*
- * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
- * specific CPU-side IRQs.
- */
-
-#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
-int skip_ioapic_setup;
-
-static int __init ioapic_pirq_setup(char *str)
-{
- int i, max;
- int ints[MAX_PIRQS+1];
-
- get_options(str, ARRAY_SIZE(ints), ints);
-
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- pirqs_enabled = 1;
- apic_printk(APIC_VERBOSE, KERN_INFO
- "PIRQ redirection, working around broken MP-BIOS.\n");
- max = MAX_PIRQS;
- if (ints[0] < MAX_PIRQS)
- max = ints[0];
-
- for (i = 0; i < max; i++) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
- /*
- * PIRQs are mapped upside down, usually.
- */
- pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
- }
- return 1;
-}
-
-__setup("pirq=", ioapic_pirq_setup);
-
-/*
- * Find the IRQ entry number of a certain pin.
- */
-static int find_irq_entry(int apic, int pin, int type)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == type &&
- (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mp_dstirq == pin)
- return i;
-
- return -1;
-}
-
-/*
- * Find the pin to which IRQ[irq] (ISA) is connected
- */
-static int __init find_isa_irq_pin(int irq, int type)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
-
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
-
- return mp_irqs[i].mp_dstirq;
- }
- return -1;
-}
-
-static int __init find_isa_irq_apic(int irq, int type)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
-
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
- break;
- }
- if (i < mp_irq_entries) {
- int apic;
- for (apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
- return apic;
- }
- }
-
- return -1;
-}
-
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
- int apic, i, best_guess = -1;
-
- apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
- "slot:%d, pin:%d.\n", bus, slot, pin);
- if (test_bit(bus, mp_bus_not_pci)) {
- printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
- return -1;
- }
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL)
- break;
-
- if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mp_irqtype &&
- (bus == lbus) &&
- (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
-
- if (!(apic || IO_APIC_IRQ(irq)))
- continue;
-
- if (pin == (mp_irqs[i].mp_srcbusirq & 3))
- return irq;
- /*
- * Use the first all-but-pin matching entry as a
- * best-guess fuzzy result for broken mptables.
- */
- if (best_guess < 0)
- best_guess = irq;
- }
- }
- return best_guess;
-}
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
-/*
- * This function currently is only a helper for the i386 smp boot process where
- * we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be TARGET_CPUS
- */
-#ifdef CONFIG_SMP
-void __init setup_ioapic_dest(void)
-{
- int pin, ioapic, irq, irq_entry;
-
- if (skip_ioapic_setup == 1)
- return;
-
- for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
- irq_entry = find_irq_entry(ioapic, pin, mp_INT);
- if (irq_entry == -1)
- continue;
- irq = pin_2_irq(irq_entry, ioapic, pin);
- set_ioapic_affinity_irq(irq, TARGET_CPUS);
- }
-
- }
-}
-#endif
-
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < 16) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-#endif
-
-/* ISA interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (0)
-#define default_ISA_polarity(idx) (0)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
-#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-
-/* PCI interrupts are always polarity one level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx) (1)
-#define default_PCI_polarity(idx) (1)
-
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) default_ISA_polarity(idx)
-
-static int MPBIOS_polarity(int idx)
-{
- int bus = mp_irqs[idx].mp_srcbus;
- int polarity;
-
- /*
- * Determine IRQ line polarity (high active or low active):
- */
- switch (mp_irqs[idx].mp_irqflag & 3) {
- case 0: /* conforms, ie. bus-type dependent polarity */
- {
- polarity = test_bit(bus, mp_bus_not_pci)?
- default_ISA_polarity(idx):
- default_PCI_polarity(idx);
- break;
- }
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- }
- return polarity;
-}
-
-static int MPBIOS_trigger(int idx)
-{
- int bus = mp_irqs[idx].mp_srcbus;
- int trigger;
-
- /*
- * Determine IRQ trigger mode (edge or level sensitive):
- */
- switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
- case 0: /* conforms, ie. bus-type dependent */
- {
- trigger = test_bit(bus, mp_bus_not_pci)?
- default_ISA_trigger(idx):
- default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
- switch (mp_bus_id_to_type[bus]) {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
-#endif
- break;
- }
- case 1: /* edge */
- {
- trigger = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- case 3: /* level */
- {
- trigger = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 0;
- break;
- }
- }
- return trigger;
-}
-
-static inline int irq_polarity(int idx)
-{
- return MPBIOS_polarity(idx);
-}
-
-static inline int irq_trigger(int idx)
-{
- return MPBIOS_trigger(idx);
-}
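
The MP-table irqflag word packs both settings: bits 1:0 carry the polarity and bits 3:2 the trigger mode, with 0 meaning "conforms to the bus type" in either field. A self-contained sketch of the decoding follows; the helper names are hypothetical, only the bit layout comes from MPBIOS_polarity() and MPBIOS_trigger() above.

#include <stdio.h>

static const char *mp_polarity(unsigned int irqflag)
{
    switch (irqflag & 3) {         /* bits 1:0 */
    case 0:  return "conforms to bus";
    case 1:  return "active high";
    case 3:  return "active low";
    default: return "reserved (broken BIOS)";
    }
}

static const char *mp_trigger(unsigned int irqflag)
{
    switch ((irqflag >> 2) & 3) {  /* bits 3:2 */
    case 0:  return "conforms to bus";
    case 1:  return "edge";
    case 3:  return "level";
    default: return "reserved (broken BIOS)";
    }
}

int main(void)
{
    unsigned int flag = 0x0f;      /* example entry: active low, level */

    printf("polarity: %s, trigger: %s\n", mp_polarity(flag), mp_trigger(flag));
    return 0;
}
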
-
-static int pin_2_irq(int idx, int apic, int pin)
-{
- int irq, i;
- int bus = mp_irqs[idx].mp_srcbus;
-
- /*
- * Debugging check, we are in big trouble if this message pops up!
- */
- if (mp_irqs[idx].mp_dstirq != pin)
- printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
-
- if (test_bit(bus, mp_bus_not_pci))
- irq = mp_irqs[idx].mp_srcbusirq;
- else {
- /*
- * PCI IRQs are mapped in order
- */
- i = irq = 0;
- while (i < apic)
- irq += nr_ioapic_registers[i++];
- irq += pin;
-
- /*
- * For MPS mode, so far only needed by ES7000 platform
- */
- if (ioapic_renumber_irq)
- irq = ioapic_renumber_irq(apic, irq);
- }
-
- /*
- * PCI IRQ command line redirection. Yes, limits are hardcoded.
- */
- if ((pin >= 16) && (pin <= 23)) {
- if (pirq_entries[pin-16] != -1) {
- if (!pirq_entries[pin-16]) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "disabling PIRQ%d\n", pin-16);
- } else {
- irq = pirq_entries[pin-16];
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "using PIRQ%d -> IRQ %d\n",
- pin-16, irq);
- }
- }
- }
- return irq;
-}
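
For PCI sources the mapping is purely cumulative: an IO-APIC's pins are numbered after all the pins of the IO-APICs before it, so the IRQ is the running pin total plus the pin index. A small sketch under an assumed two-APIC layout; the 24/16 pin counts are made up for illustration.

#include <stdio.h>

static int nr_registers[] = { 24, 16 };   /* pins per IO-APIC (assumed) */

static int pci_pin_to_irq(int apic, int pin)
{
    int i, irq = 0;

    for (i = 0; i < apic; i++)
        irq += nr_registers[i];           /* skip earlier APICs' pins */
    return irq + pin;
}

int main(void)
{
    printf("APIC0 pin 5 -> IRQ %d\n", pci_pin_to_irq(0, 5));   /* 5  */
    printf("APIC1 pin 3 -> IRQ %d\n", pci_pin_to_irq(1, 3));   /* 27 */
    return 0;
}
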
-
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
- return irq_trigger(idx);
- }
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-
-/* irq_vector is indexed by the sum of all RTEs in all I/O APICs. */
-static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR, 0 };
-
-static int __assign_irq_vector(int irq)
-{
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
- int vector, offset;
-
- BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
-
- if (irq_vector[irq] > 0)
- return irq_vector[irq];
-
- vector = current_vector;
- offset = current_offset;
-next:
- vector += 8;
- if (vector >= first_system_vector) {
- offset = (offset + 1) % 8;
- vector = FIRST_DEVICE_VECTOR + offset;
- }
- if (vector == current_vector)
- return -ENOSPC;
- if (test_and_set_bit(vector, used_vectors))
- goto next;
-
- current_vector = vector;
- current_offset = offset;
- irq_vector[irq] = vector;
-
- return vector;
-}
-
-static int assign_irq_vector(int irq)
-{
- unsigned long flags;
- int vector;
-
- spin_lock_irqsave(&vector_lock, flags);
- vector = __assign_irq_vector(irq);
- spin_unlock_irqrestore(&vector_lock, flags);
-
- return vector;
-}
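
The allocator walks the device-vector window in strides of 8 so that successive IRQs land in different priority levels (the level is vector >> 4), and each time the walk wraps it rotates the starting offset by one. Here is a user-space model of that walk; the window bounds are assumptions standing in for FIRST_DEVICE_VECTOR and first_system_vector, and a byte array stands in for the used_vectors bitmap.

#include <stdio.h>

#define FIRST_DEVICE_VECTOR 0x31   /* assumed window start */
#define FIRST_SYSTEM_VECTOR 0xef   /* assumed window end   */

static unsigned char used[256];
static int current_vector = FIRST_DEVICE_VECTOR, current_offset;

static int alloc_vector(void)
{
    int vector = current_vector, offset = current_offset;

    for (;;) {
        vector += 8;                          /* spread priority levels */
        if (vector >= FIRST_SYSTEM_VECTOR) {
            offset = (offset + 1) % 8;        /* rotate start on wrap   */
            vector = FIRST_DEVICE_VECTOR + offset;
        }
        if (vector == current_vector)
            return -1;                        /* full circle: no space  */
        if (!used[vector])
            break;
    }
    used[vector] = 1;
    current_vector = vector;
    current_offset = offset;
    return vector;
}

int main(void)
{
    int i;

    for (i = 0; i < 6; i++) {
        int v = alloc_vector();
        printf("vector 0x%02x -> priority level %d\n", v, v >> 4);
    }
    return 0;
}
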
-
-static struct irq_chip ioapic_chip;
-
-#define IOAPIC_AUTO -1
-#define IOAPIC_EDGE 0
-#define IOAPIC_LEVEL 1
-
-static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
-{
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL) {
- irq_desc[irq].status |= IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_fasteoi_irq, "fasteoi");
- } else {
- irq_desc[irq].status &= ~IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_edge_irq, "edge");
- }
- set_intr_gate(vector, interrupt[irq]);
-}
-
-static void __init setup_IO_APIC_irqs(void)
-{
- struct IO_APIC_route_entry entry;
- int apic, pin, idx, irq, first_notcon = 1, vector;
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-
- /*
- * add it to the IO-APIC irq-routing table:
- */
- memset(&entry, 0, sizeof(entry));
-
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest =
- cpu_mask_to_apicid(TARGET_CPUS);
-
- idx = find_irq_entry(apic, pin, mp_INT);
- if (idx == -1) {
- if (first_notcon) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- " IO-APIC (apicid-pin) %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
- first_notcon = 0;
- } else
- apic_printk(APIC_VERBOSE, ", %d-%d",
- mp_ioapics[apic].mp_apicid, pin);
- continue;
- }
-
- if (!first_notcon) {
- apic_printk(APIC_VERBOSE, " not connected.\n");
- first_notcon = 1;
- }
-
- entry.trigger = irq_trigger(idx);
- entry.polarity = irq_polarity(idx);
-
- if (irq_trigger(idx)) {
- entry.trigger = 1;
- entry.mask = 1;
- }
-
- irq = pin_2_irq(idx, apic, pin);
- /*
- * skip adding the timer int on secondary nodes, which causes
- * a small but painful rift in the time-space continuum
- */
- if (multi_timer_check(apic, irq))
- continue;
- else
- add_pin_to_irq(irq, apic, pin);
-
- if (!apic && !IO_APIC_IRQ(irq))
- continue;
-
- if (IO_APIC_IRQ(irq)) {
- vector = assign_irq_vector(irq);
- entry.vector = vector;
- ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-
- if (!apic && (irq < 16))
- disable_8259A_irq(irq);
- }
- ioapic_write_entry(apic, pin, entry);
- }
- }
-
- if (!first_notcon)
- apic_printk(APIC_VERBOSE, " not connected.\n");
-}
-
-/*
- * Set up the timer pin, possibly with the 8259A master behind it.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
- int vector)
-{
- struct IO_APIC_route_entry entry;
-
- memset(&entry, 0, sizeof(entry));
-
- /*
- * We use logical delivery to get the timer IRQ
- * to the first CPU.
- */
- entry.dest_mode = INT_DEST_MODE;
- entry.mask = 1; /* mask IRQ now */
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.polarity = 0;
- entry.trigger = 0;
- entry.vector = vector;
-
- /*
- * The timer IRQ doesn't have to know that behind the
- * scene we may have a 8259A-master in AEOI mode ...
- */
- ioapic_register_intr(0, vector, IOAPIC_EDGE);
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
- ioapic_write_entry(apic, pin, entry);
-}
-
-void __init print_IO_APIC(void)
-{
- int apic, i;
- union IO_APIC_reg_00 reg_00;
- union IO_APIC_reg_01 reg_01;
- union IO_APIC_reg_02 reg_02;
- union IO_APIC_reg_03 reg_03;
- unsigned long flags;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
- for (i = 0; i < nr_ioapics; i++)
- printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
-
- /*
- * We are a bit conservative about what we expect. We have to
- * know about every hardware change ASAP.
- */
- printk(KERN_INFO "testing the IO APIC.......................\n");
-
- for (apic = 0; apic < nr_ioapics; apic++) {
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- reg_01.raw = io_apic_read(apic, 1);
- if (reg_01.bits.version >= 0x10)
- reg_02.raw = io_apic_read(apic, 2);
- if (reg_01.bits.version >= 0x20)
- reg_03.raw = io_apic_read(apic, 3);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
- printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
- printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
- printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
- printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
-
- printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
- printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
-
- printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
-
- /*
- * Some Intel chipsets with an IO APIC VERSION of 0x1? don't have reg_02;
- * reading it returns the value of the previously read register,
- * so ignore it if reg_02 == reg_01.
- */
- if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
- printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
- }
-
- /*
- * Some Intel chipsets with an IO APIC VERSION of 0x2? don't have reg_02
- * or reg_03; reading them returns the value of the previously read
- * register, so ignore reg_03 if it equals reg_02 or reg_01.
- */
- if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
- reg_03.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
- printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
- }
-
- printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
- printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
- " Stat Dest Deli Vect: \n");
-
- for (i = 0; i <= reg_01.bits.entries; i++) {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(apic, i);
-
- printk(KERN_DEBUG " %02x %03X %02X ",
- i,
- entry.dest.logical.logical_dest,
- entry.dest.physical.physical_dest
- );
-
- printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector
- );
- }
- }
- printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for (i = 0; i < NR_IRQS; i++) {
- struct irq_pin_list *entry = irq_2_pin + i;
- if (entry->pin < 0)
- continue;
- printk(KERN_DEBUG "IRQ%d ", i);
- for (;;) {
- printk("-> %d:%d", entry->apic, entry->pin);
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
- printk("\n");
- }
-
- printk(KERN_INFO ".................................... done.\n");
-
- return;
-}
-
-#if 0
-
-static void print_APIC_bitfield(int base)
-{
- unsigned int v;
- int i, j;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
- for (i = 0; i < 8; i++) {
- v = apic_read(base + i*0x10);
- for (j = 0; j < 32; j++) {
- if (v & (1<<j))
- printk("1");
- else
- printk("0");
- }
- printk("\n");
- }
-}
-
-void /*__init*/ print_local_APIC(void *dummy)
-{
- unsigned int v, ver, maxlvt;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
- smp_processor_id(), hard_smp_processor_id());
- v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
- GET_APIC_ID(read_apic_id()));
- v = apic_read(APIC_LVR);
- printk(KERN_INFO "... APIC VERSION: %08x\n", v);
- ver = GET_APIC_VERSION(v);
- maxlvt = lapic_get_maxlvt();
-
- v = apic_read(APIC_TASKPRI);
- printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- v = apic_read(APIC_ARBPRI);
- printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
- v & APIC_ARBPRI_MASK);
- v = apic_read(APIC_PROCPRI);
- printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
- }
-
- v = apic_read(APIC_EOI);
- printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
- v = apic_read(APIC_RRR);
- printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
- v = apic_read(APIC_LDR);
- printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
- v = apic_read(APIC_DFR);
- printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
- v = apic_read(APIC_SPIV);
- printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
-
- printk(KERN_DEBUG "... APIC ISR field:\n");
- print_APIC_bitfield(APIC_ISR);
- printk(KERN_DEBUG "... APIC TMR field:\n");
- print_APIC_bitfield(APIC_TMR);
- printk(KERN_DEBUG "... APIC IRR field:\n");
- print_APIC_bitfield(APIC_IRR);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- v = apic_read(APIC_ESR);
- printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
- }
-
- v = apic_read(APIC_ICR);
- printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
- v = apic_read(APIC_ICR2);
- printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
-
- v = apic_read(APIC_LVTT);
- printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
-
- if (maxlvt > 3) { /* PC is LVT#4. */
- v = apic_read(APIC_LVTPC);
- printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
- }
- v = apic_read(APIC_LVT0);
- printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
- v = apic_read(APIC_LVT1);
- printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
-
- if (maxlvt > 2) { /* ERR is LVT#3. */
- v = apic_read(APIC_LVTERR);
- printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
- }
-
- v = apic_read(APIC_TMICT);
- printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
- v = apic_read(APIC_TMCCT);
- printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
- v = apic_read(APIC_TDCR);
- printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
- printk("\n");
-}
-
-void print_all_local_APICs(void)
-{
- on_each_cpu(print_local_APIC, NULL, 1);
-}
-
-void /*__init*/ print_PIC(void)
-{
- unsigned int v;
- unsigned long flags;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "\nprinting PIC contents\n");
-
- spin_lock_irqsave(&i8259A_lock, flags);
-
- v = inb(0xa1) << 8 | inb(0x21);
- printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
-
- v = inb(0xa0) << 8 | inb(0x20);
- printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
-
- outb(0x0b, 0xa0);
- outb(0x0b, 0x20);
- v = inb(0xa0) << 8 | inb(0x20);
- outb(0x0a, 0xa0);
- outb(0x0a, 0x20);
-
- spin_unlock_irqrestore(&i8259A_lock, flags);
-
- printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
-
- v = inb(0x4d1) << 8 | inb(0x4d0);
- printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
-}
-
-#endif /* 0 */
-
-static void __init enable_IO_APIC(void)
-{
- union IO_APIC_reg_01 reg_01;
- int i8259_apic, i8259_pin;
- int i, apic;
- unsigned long flags;
-
- for (i = 0; i < PIN_MAP_SIZE; i++) {
- irq_2_pin[i].pin = -1;
- irq_2_pin[i].next = 0;
- }
- if (!pirqs_enabled)
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- /*
- * The number of IO-APIC IRQ registers (== #pins):
- */
- for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(apic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- nr_ioapic_registers[apic] = reg_01.bits.entries+1;
- }
- for (apic = 0; apic < nr_ioapics; apic++) {
- int pin;
- /* See if any of the pins is in ExtINT mode */
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- struct IO_APIC_route_entry entry;
- entry = ioapic_read_entry(apic, pin);
-
-
- /* If the interrupt line is enabled and in ExtInt mode
- * I have found the pin where the i8259 is connected.
- */
- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
- ioapic_i8259.apic = apic;
- ioapic_i8259.pin = pin;
- goto found_i8259;
- }
- }
- }
- found_i8259:
- /* Look to see if the MP table has reported the ExtINT */
- /* If we could not find the appropriate pin by looking at the ioapic,
- * the i8259 is probably not connected to the ioapic, but give the
- * mptable a chance anyway.
- */
- i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
- i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
- /* Trust the MP table if nothing is set up in the hardware */
- if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
- printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
- ioapic_i8259.pin = i8259_pin;
- ioapic_i8259.apic = i8259_apic;
- }
- /* Complain if the MP table and the hardware disagree */
- if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
- (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
- {
- printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
- }
-
- /*
- * Do not trust the IO-APIC being empty at bootup
- */
- clear_IO_APIC();
-}
-
-/*
- * Not an __init, needed by the reboot code
- */
-void disable_IO_APIC(void)
-{
- /*
- * Clear the IO-APIC before rebooting:
- */
- clear_IO_APIC();
-
- /*
- * If the i8259 is routed through an IOAPIC,
- * put that IOAPIC in virtual wire mode
- * so legacy interrupts can be delivered.
- */
- if (ioapic_i8259.pin != -1) {
- struct IO_APIC_route_entry entry;
-
- memset(&entry, 0, sizeof(entry));
- entry.mask = 0; /* Enabled */
- entry.trigger = 0; /* Edge */
- entry.irr = 0;
- entry.polarity = 0; /* High */
- entry.delivery_status = 0;
- entry.dest_mode = 0; /* Physical */
- entry.delivery_mode = dest_ExtINT; /* ExtInt */
- entry.vector = 0;
- entry.dest.physical.physical_dest =
- GET_APIC_ID(read_apic_id());
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
- ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
- }
- disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-}
-
-/*
- * function to set the IO-APIC physical IDs based on the
- * values stored in the MPC table.
- *
- * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
- */
-
-static void __init setup_ioapic_ids_from_mpc(void)
-{
- union IO_APIC_reg_00 reg_00;
- physid_mask_t phys_id_present_map;
- int apic;
- int i;
- unsigned char old_id;
- unsigned long flags;
-
-#ifdef CONFIG_X86_NUMAQ
- if (found_numaq)
- return;
-#endif
-
- /*
- * Don't check I/O APIC IDs for xAPIC systems. They have
- * no meaning without the serial APIC bus.
- */
- if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return;
- /*
- * This is broken; anything with a real cpu count has to
- * circumvent this idiocy regardless.
- */
- phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
-
- /*
- * Set the IOAPIC ID to the value stored in the MPC table.
- */
- for (apic = 0; apic < nr_ioapics; apic++) {
-
- /* Read the register 0 value */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic].mp_apicid;
-
- if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mp_apicid);
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- reg_00.bits.ID);
- mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
- }
-
- /*
- * Sanity check, is the ID really free? Every APIC in a
- * system must have a unique ID or we get lots of nice
- * 'stuck on smp_invalidate_needed IPI wait' messages.
- */
- if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mp_apicid)) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mp_apicid);
- for (i = 0; i < get_physical_broadcast(); i++)
- if (!physid_isset(i, phys_id_present_map))
- break;
- if (i >= get_physical_broadcast())
- panic("Max APIC ID exceeded!\n");
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- i);
- physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mp_apicid = i;
- } else {
- physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
- apic_printk(APIC_VERBOSE, "Setting %d in the "
- "phys_id_present_map\n",
- mp_ioapics[apic].mp_apicid);
- physids_or(phys_id_present_map, phys_id_present_map, tmp);
- }
-
-
- /*
- * We need to adjust the IRQ routing table
- * if the ID changed.
- */
- if (old_id != mp_ioapics[apic].mp_apicid)
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_dstapic == old_id)
- mp_irqs[i].mp_dstapic
- = mp_ioapics[apic].mp_apicid;
-
- /*
- * Read the right value from the MPC table and
- * write it into the ID register.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO
- "...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mp_apicid);
-
- reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0, reg_00.raw);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /*
- * Sanity check
- */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
- printk("could not set ID!\n");
- else
- apic_printk(APIC_VERBOSE, " ok.\n");
- }
-}
-
-int no_timer_check __initdata;
-
-static int __init notimercheck(char *s)
-{
- no_timer_check = 1;
- return 1;
-}
-__setup("no_timer_check", notimercheck);
-
-/*
- * There is a nasty bug in some older SMP boards: their mptable lies
- * about the timer IRQ. We do the following to work around the situation:
- *
- * - timer IRQ defaults to IO-APIC IRQ
- * - if this function detects that timer IRQs are defunct, then we fall
- * back to ISA timer IRQs
- */
-static int __init timer_irq_works(void)
-{
- unsigned long t1 = jiffies;
- unsigned long flags;
-
- if (no_timer_check)
- return 1;
-
- local_save_flags(flags);
- local_irq_enable();
- /* Let ten ticks pass... */
- mdelay((10 * 1000) / HZ);
- local_irq_restore(flags);
-
- /*
- * Expect a few ticks at least, to be sure some possible
- * glue logic does not lock up after one or two first
- * ticks in a non-ExtINT mode. Also the local APIC
- * might have cached one ExtINT interrupt. Finally, at
- * least one tick may be lost due to delays.
- */
- if (time_after(jiffies, t1 + 4))
- return 1;
-
- return 0;
-}
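
The test lets roughly ten tick periods elapse with interrupts enabled and then demands that jiffies advanced by more than four, which tolerates a cached ExtINT delivery and a lost tick or two. A sketch of the same check with the delay and tick source stubbed out; HZ and both stubs are illustrative assumptions, not kernel code.

#include <stdio.h>

#define HZ 250                     /* assumed tick rate */

static unsigned long jiffies;      /* stub tick counter */

static void fake_mdelay(unsigned int ms)
{
    jiffies += ms * HZ / 1000;     /* pretend ticks arrive on time */
}

static int fake_timer_irq_works(void)
{
    unsigned long t1 = jiffies;

    fake_mdelay(10 * 1000 / HZ);   /* let ~10 ticks pass */
    return jiffies > t1 + 4;       /* demand at least a handful */
}

int main(void)
{
    printf("timer %s\n", fake_timer_irq_works() ? "works" : "is dead");
    return 0;
}
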
-
-/*
- * In the SMP+IOAPIC case it might happen that there is an unspecified
- * number of pending IRQ events left unhandled. These cases are very rare,
- * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
- * better to do it this way, as then we do not have to be aware of
- * 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge-triggered handling needs to resend any interrupt
- * that was delayed, but this is now handled in the
- * device-independent code.
- */
-
-/*
- * Startup quirk:
- *
- * Starting up an edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need
- * to return 1 to indicate that it was pending.
- *
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
- *
- * (We do this for level-triggered IRQs too - it cannot hurt.)
- */
-static unsigned int startup_ioapic_irq(unsigned int irq)
-{
- int was_pending = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < 16) {
- disable_8259A_irq(irq);
- if (i8259A_irq_pending(irq))
- was_pending = 1;
- }
- __unmask_IO_APIC_irq(irq);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return was_pending;
-}
-
-static void ack_ioapic_irq(unsigned int irq)
-{
- move_native_irq(irq);
- ack_APIC_irq();
-}
-
-static void ack_ioapic_quirk_irq(unsigned int irq)
-{
- unsigned long v;
- int i;
-
- move_native_irq(irq);
-/*
- * It appears there is an erratum which affects at least version 0x11
- * of I/O APIC (that's the 82093AA and cores integrated into various
- * chipsets). Under certain conditions a level-triggered interrupt is
- * erroneously delivered as an edge-triggered one but the respective IRR
- * bit gets set nevertheless. As a result the I/O unit expects an EOI
- * message but it will never arrive and further interrupts are blocked
- * from the source. The exact reason is so far unknown, but the
- * phenomenon was observed when two consecutive interrupt requests
- * from a given source get delivered to the same CPU and the source is
- * temporarily disabled in between.
- *
- * A workaround is to simulate an EOI message manually. We achieve it
- * by setting the trigger mode to edge and then to level when the edge
- * trigger mode gets detected in the TMR of a local APIC for a
- * level-triggered interrupt. We mask the source for the time of the
- * operation to prevent an edge-triggered interrupt escaping meanwhile.
- * The idea is from Manfred Spraul. --macro
- */
- i = irq_vector[irq];
-
- v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
-
- ack_APIC_irq();
-
- if (!(v & (1 << (i & 0x1f)))) {
- atomic_inc(&irq_mis_count);
- spin_lock(&ioapic_lock);
- __mask_and_edge_IO_APIC_irq(irq);
- __unmask_and_level_IO_APIC_irq(irq);
- spin_unlock(&ioapic_lock);
- }
-}
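
The only non-obvious arithmetic in the workaround is the TMR lookup: the 256-bit TMR is exposed as eight 32-bit registers spaced 0x10 apart, so for vector v the register offset is (v / 32) * 0x10, which is exactly what (v & ~0x1f) >> 1 computes, while v & 0x1f selects the bit. A quick standalone check of that identity:

#include <stdio.h>

int main(void)
{
    unsigned int v;

    for (v = 0x31; v <= 0x61; v += 0x10)
        printf("vector 0x%02x -> TMR register offset 0x%02x, bit %2u\n",
               v, (v & ~0x1fu) >> 1, v & 0x1f);
    return 0;
}
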
-
-static int ioapic_retrigger_irq(unsigned int irq)
-{
- send_IPI_self(irq_vector[irq]);
-
- return 1;
-}
-
-static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_ioapic_irq,
- .eoi = ack_ioapic_quirk_irq,
-#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-
-static inline void init_IO_APIC_traps(void)
-{
- int irq;
-
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
- for (irq = 0; irq < NR_IRQS ; irq++) {
- if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
- /*
- * Hmm.. We don't have an entry for this,
- * so default to an old-fashioned 8259
- * interrupt if we can..
- */
- if (irq < 16)
- make_8259A_irq(irq);
- else
- /* Strange. Oh, well.. */
- irq_desc[irq].chip = &no_irq_chip;
- }
- }
-}
-
-/*
- * The local APIC irq-chip implementation:
- */
-
-static void ack_lapic_irq(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-static void mask_lapic_irq(unsigned int irq)
-{
- unsigned long v;
-
- v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
-}
-
-static void unmask_lapic_irq(unsigned int irq)
-{
- unsigned long v;
-
- v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
-}
-
-static struct irq_chip lapic_chip __read_mostly = {
- .name = "local-APIC",
- .mask = mask_lapic_irq,
- .unmask = unmask_lapic_irq,
- .ack = ack_lapic_irq,
-};
-
-static void lapic_register_intr(int irq, int vector)
-{
- irq_desc[irq].status &= ~IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
- "edge");
- set_intr_gate(vector, interrupt[irq]);
-}
-
-static void __init setup_nmi(void)
-{
- /*
- * Dirty trick to enable the NMI watchdog ...
- * We put the 8259A master into AEOI mode and
- * unmask LVT0 of all local APICs as NMI.
- *
- * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
- * is from Maciej W. Rozycki - so we do not have to EOI from
- * the NMI handler or the timer interrupt.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
- enable_NMI_through_LVT0();
-
- apic_printk(APIC_VERBOSE, " done.\n");
-}
-
-/*
- * This looks a bit hackish but it's about the only way of sending
- * a few INTA cycles to 8259As and any associated glue logic. ICR does
- * not support the ExtINT mode, unfortunately. We need to send these
- * cycles as some i82489DX-based boards have glue logic that keeps the
- * 8259A interrupt line asserted until INTA. --macro
- */
-static inline void __init unlock_ExtINT_logic(void)
-{
- int apic, pin, i;
- struct IO_APIC_route_entry entry0, entry1;
- unsigned char save_control, save_freq_select;
-
- pin = find_isa_irq_pin(8, mp_INT);
- if (pin == -1) {
- WARN_ON_ONCE(1);
- return;
- }
- apic = find_isa_irq_apic(8, mp_INT);
- if (apic == -1) {
- WARN_ON_ONCE(1);
- return;
- }
-
- entry0 = ioapic_read_entry(apic, pin);
- clear_IO_APIC_pin(apic, pin);
-
- memset(&entry1, 0, sizeof(entry1));
-
- entry1.dest_mode = 0; /* physical delivery */
- entry1.mask = 0; /* unmask IRQ now */
- entry1.dest.physical.physical_dest = hard_smp_processor_id();
- entry1.delivery_mode = dest_ExtINT;
- entry1.polarity = entry0.polarity;
- entry1.trigger = 0;
- entry1.vector = 0;
-
- ioapic_write_entry(apic, pin, entry1);
-
- save_control = CMOS_READ(RTC_CONTROL);
- save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
- CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
- RTC_FREQ_SELECT);
- CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
-
- i = 100;
- while (i-- > 0) {
- mdelay(10);
- if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
- i -= 10;
- }
-
- CMOS_WRITE(save_control, RTC_CONTROL);
- CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
- clear_IO_APIC_pin(apic, pin);
-
- ioapic_write_entry(apic, pin, entry0);
-}
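
The CMOS writes select periodic-interrupt rate code 0x6 and set RTC_PIE, so the RTC starts raising the interrupts the loop then polls for via RTC_PF. On the MC146818, rate codes 3 through 15 yield 32768 >> (code - 1) Hz, i.e. 1024 Hz for code 0x6; a small sketch of that rate table (the formula is a statement about the RTC chip, not taken from this file):

#include <stdio.h>

int main(void)
{
    unsigned int code;

    /* Valid periodic rate codes on the MC146818 run from 3 to 15;
     * print the low end of the table. */
    for (code = 3; code <= 8; code++)
        printf("rate code 0x%x -> %u Hz\n", code, 32768u >> (code - 1));
    return 0;
}
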
-
-/*
- * This code may look a bit paranoid, but it's supposed to cooperate with
- * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
- * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
- * fanatically on his truly buggy board.
- */
-static inline void __init check_timer(void)
-{
- int apic1, pin1, apic2, pin2;
- int no_pin1 = 0;
- int vector;
- unsigned int ver;
- unsigned long flags;
-
- local_irq_save(flags);
-
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
-
- /*
- * get/set the timer IRQ vector:
- */
- disable_8259A_irq(0);
- vector = assign_irq_vector(0);
- set_intr_gate(vector, interrupt[0]);
-
- /*
- * As IRQ0 is to be enabled in the 8259A, the virtual
- * wire has to be disabled in the local APIC. Also
- * timer interrupts need to be acknowledged manually in
- * the 8259A for the i82489DX when using the NMI
- * watchdog as that APIC treats NMIs as level-triggered.
- * The AEOI mode will finish them in the 8259A
- * automatically.
- */
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
- init_8259A(1);
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
-
- pin1 = find_isa_irq_pin(0, mp_INT);
- apic1 = find_isa_irq_apic(0, mp_INT);
- pin2 = ioapic_i8259.pin;
- apic2 = ioapic_i8259.apic;
-
- apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
- "apic1=%d pin1=%d apic2=%d pin2=%d\n",
- vector, apic1, pin1, apic2, pin2);
-
- /*
- * Some BIOS writers are clueless and report the ExtINTA
- * I/O APIC input from the cascaded 8259A as the timer
- * interrupt input. So just in case, if only one pin
- * was found above, try it both directly and through the
- * 8259A.
- */
- if (pin1 == -1) {
- pin1 = pin2;
- apic1 = apic2;
- no_pin1 = 1;
- } else if (pin2 == -1) {
- pin2 = pin1;
- apic2 = apic1;
- }
-
- if (pin1 != -1) {
- /*
- * Ok, does IRQ0 through the IOAPIC work?
- */
- if (no_pin1) {
- add_pin_to_irq(0, apic1, pin1);
- setup_timer_IRQ0_pin(apic1, pin1, vector);
- }
- unmask_IO_APIC_irq(0);
- if (timer_irq_works()) {
- if (nmi_watchdog == NMI_IO_APIC) {
- setup_nmi();
- enable_8259A_irq(0);
- }
- if (disable_timer_pin_1 > 0)
- clear_IO_APIC_pin(0, pin1);
- goto out;
- }
- clear_IO_APIC_pin(apic1, pin1);
- if (!no_pin1)
- apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
- "8254 timer not connected to IO-APIC\n");
-
- apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
- "(IRQ0) through the 8259A ...\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "..... (found apic %d pin %d) ...\n", apic2, pin2);
- /*
- * legacy devices should be connected to IO APIC #0
- */
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
- setup_timer_IRQ0_pin(apic2, pin2, vector);
- unmask_IO_APIC_irq(0);
- enable_8259A_irq(0);
- if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
- timer_through_8259 = 1;
- if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
- setup_nmi();
- enable_8259A_irq(0);
- }
- goto out;
- }
- /*
- * Cleanup, just in case ...
- */
- disable_8259A_irq(0);
- clear_IO_APIC_pin(apic2, pin2);
- apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
- }
-
- if (nmi_watchdog == NMI_IO_APIC) {
- apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
- "through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = NMI_NONE;
- }
- timer_ack = 0;
-
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as Virtual Wire IRQ...\n");
-
- lapic_register_intr(0, vector);
- apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
- enable_8259A_irq(0);
-
- if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
- goto out;
- }
- disable_8259A_irq(0);
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
- apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
-
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as ExtINT IRQ...\n");
-
- init_8259A(0);
- make_8259A_irq(0);
- apic_write(APIC_LVT0, APIC_DM_EXTINT);
-
- unlock_ExtINT_logic();
-
- if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
- goto out;
- }
- apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
- panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
- "report. Then try booting with the 'noapic' option.\n");
-out:
- local_irq_restore(flags);
-}
-
-/*
- * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
- * to devices. However there may be an I/O APIC pin available for
- * this interrupt regardless. The pin may be left unconnected, but
- * typically it will be reused as an ExtINT cascade interrupt for
- * the master 8259A. In the MPS case such a pin will normally be
- * reported as an ExtINT interrupt in the MP table. With ACPI
- * there is no provision for ExtINT interrupts, and in the absence
- * of an override it would be treated as an ordinary ISA I/O APIC
- * interrupt, that is edge-triggered and unmasked by default. We
- * used to do this, but it caused problems on some systems because
- * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
- * the same ExtINT cascade interrupt to drive the local APIC of the
- * bootstrap processor. Therefore we refrain from routing IRQ2 to
- * the I/O APIC in all cases now. No actual device should request
- * it anyway. --macro
- */
-#define PIC_IRQS (1 << PIC_CASCADE_IR)
-
-void __init setup_IO_APIC(void)
-{
- int i;
-
- /* Reserve all the system vectors. */
- for (i = first_system_vector; i < NR_VECTORS; i++)
- set_bit(i, used_vectors);
-
- enable_IO_APIC();
-
- io_apic_irqs = ~PIC_IRQS;
-
- printk("ENABLING IO-APIC IRQs\n");
-
- /*
- * Set up IO-APIC IRQ routing.
- */
- if (!acpi_ioapic)
- setup_ioapic_ids_from_mpc();
- sync_Arb_IDs();
- setup_IO_APIC_irqs();
- init_IO_APIC_traps();
- check_timer();
- if (!acpi_ioapic)
- print_IO_APIC();
-}
-
-/*
- * Called after all the initialization is done. If we didn't find any
- * APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
-{
- if (sis_apic_bug == -1)
- sis_apic_bug = 0;
- return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
-struct sysfs_ioapic_data {
- struct sys_device dev;
- struct IO_APIC_route_entry entry[0];
-};
-static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
-
-static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- struct IO_APIC_route_entry *entry;
- struct sysfs_ioapic_data *data;
- int i;
-
- data = container_of(dev, struct sysfs_ioapic_data, dev);
- entry = data->entry;
- for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
- entry[i] = ioapic_read_entry(dev->id, i);
-
- return 0;
-}
-
-static int ioapic_resume(struct sys_device *dev)
-{
- struct IO_APIC_route_entry *entry;
- struct sysfs_ioapic_data *data;
- unsigned long flags;
- union IO_APIC_reg_00 reg_00;
- int i;
-
- data = container_of(dev, struct sysfs_ioapic_data, dev);
- entry = data->entry;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
- io_apic_write(dev->id, 0, reg_00.raw);
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
- ioapic_write_entry(dev->id, i, entry[i]);
-
- return 0;
-}
-
-static struct sysdev_class ioapic_sysdev_class = {
- .name = "ioapic",
- .suspend = ioapic_suspend,
- .resume = ioapic_resume,
-};
-
-static int __init ioapic_init_sysfs(void)
-{
- struct sys_device *dev;
- int i, size, error = 0;
-
- error = sysdev_class_register(&ioapic_sysdev_class);
- if (error)
- return error;
-
- for (i = 0; i < nr_ioapics; i++) {
- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
- * sizeof(struct IO_APIC_route_entry);
- mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
- if (!mp_ioapic_data[i]) {
- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- continue;
- }
- dev = &mp_ioapic_data[i]->dev;
- dev->id = i;
- dev->cls = &ioapic_sysdev_class;
- error = sysdev_register(dev);
- if (error) {
- kfree(mp_ioapic_data[i]);
- mp_ioapic_data[i] = NULL;
- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- continue;
- }
- }
-
- return 0;
-}
-
-device_initcall(ioapic_init_sysfs);
-
-/*
- * Dynamic irq allocation and deallocation
- */
-int create_irq(void)
-{
- /* Allocate an unused irq */
- int irq, new, vector = 0;
- unsigned long flags;
-
- irq = -ENOSPC;
- spin_lock_irqsave(&vector_lock, flags);
- for (new = (NR_IRQS - 1); new >= 0; new--) {
- if (platform_legacy_irq(new))
- continue;
- if (irq_vector[new] != 0)
- continue;
- vector = __assign_irq_vector(new);
- if (likely(vector > 0))
- irq = new;
- break;
- }
- spin_unlock_irqrestore(&vector_lock, flags);
-
- if (irq >= 0) {
- set_intr_gate(vector, interrupt[irq]);
- dynamic_irq_init(irq);
- }
- return irq;
-}
-
-void destroy_irq(unsigned int irq)
-{
- unsigned long flags;
-
- dynamic_irq_cleanup(irq);
-
- spin_lock_irqsave(&vector_lock, flags);
- clear_bit(irq_vector[irq], used_vectors);
- irq_vector[irq] = 0;
- spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-/*
- * MSI message composition
- */
-#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
-{
- int vector;
- unsigned dest;
-
- vector = assign_irq_vector(irq);
- if (vector >= 0) {
- dest = cpu_mask_to_apicid(TARGET_CPUS);
-
- msg->address_hi = MSI_ADDR_BASE_HI;
-		msg->address_lo =
-			MSI_ADDR_BASE_LO |
-			((INT_DEST_MODE == 0) ?
-				MSI_ADDR_DEST_MODE_PHYSICAL :
-				MSI_ADDR_DEST_MODE_LOGICAL) |
-			((INT_DELIVERY_MODE != dest_LowestPrio) ?
-				MSI_ADDR_REDIRECTION_CPU :
-				MSI_ADDR_REDIRECTION_LOWPRI) |
-			MSI_ADDR_DEST_ID(dest);
-
-		msg->data =
-			MSI_DATA_TRIGGER_EDGE |
-			MSI_DATA_LEVEL_ASSERT |
-			((INT_DELIVERY_MODE != dest_LowestPrio) ?
-				MSI_DATA_DELIVERY_FIXED :
-				MSI_DATA_DELIVERY_LOWPRI) |
-			MSI_DATA_VECTOR(vector);
- }
- return vector;
-}
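
Numerically the composition targets the fixed 0xFEExxxxx MSI address window, placing the destination ID in address bits 19:12 and the vector in data bits 7:0. The sketch below writes the field encodings out as literals; these masks are assumptions standing in for the kernel's MSI_* macros, following the MSI layout rather than quoting the header.

#include <stdio.h>

#define MSI_ADDR_BASE        0xfee00000u   /* fixed MSI address window */
#define MSI_ADDR_DEST_ID(id) (((id) & 0xffu) << 12)
#define MSI_ADDR_LOGICAL     (1u << 2)     /* destination mode: logical */
#define MSI_DATA_ASSERT      (1u << 14)    /* level: assert             */
#define MSI_DATA_VECTOR(v)   ((v) & 0xffu)

int main(void)
{
    unsigned int dest = 0x01, vector = 0x39;
    unsigned int addr = MSI_ADDR_BASE | MSI_ADDR_LOGICAL | MSI_ADDR_DEST_ID(dest);
    unsigned int data = MSI_DATA_ASSERT | MSI_DATA_VECTOR(vector);

    printf("address_lo = %#010x\n", addr);  /* 0xfee01004 */
    printf("data       = %#010x\n", data);  /* 0x00004039 */
    return 0;
}
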
-
-#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
-{
- struct msi_msg msg;
- unsigned int dest;
- cpumask_t tmp;
- int vector;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- vector = assign_irq_vector(irq);
- if (vector < 0)
- return;
-
- dest = cpu_mask_to_apicid(mask);
-
- read_msi_msg(irq, &msg);
-
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- write_msi_msg(irq, &msg);
- irq_desc[irq].affinity = mask;
-}
-#endif /* CONFIG_SMP */
-
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
- .name = "PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
- .ack = ack_ioapic_irq,
-#ifdef CONFIG_SMP
- .set_affinity = set_msi_irq_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
-{
- struct msi_msg msg;
- int irq, ret;
- irq = create_irq();
- if (irq < 0)
- return irq;
-
- ret = msi_compose_msg(dev, irq, &msg);
- if (ret < 0) {
- destroy_irq(irq);
- return ret;
- }
-
- set_irq_msi(irq, desc);
- write_msi_msg(irq, &msg);
-
- set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
- "edge");
-
- return 0;
-}
-
-void arch_teardown_msi_irq(unsigned int irq)
-{
- destroy_irq(irq);
-}
-
-#endif /* CONFIG_PCI_MSI */
-
-/*
- * Hypertransport interrupt support
- */
-#ifdef CONFIG_HT_IRQ
-
-#ifdef CONFIG_SMP
-
-static void target_ht_irq(unsigned int irq, unsigned int dest)
-{
- struct ht_irq_msg msg;
- fetch_ht_irq_msg(irq, &msg);
-
- msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
- msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
- msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
- msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
- write_ht_irq_msg(irq, &msg);
-}
-
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
-{
- unsigned int dest;
- cpumask_t tmp;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- cpus_and(mask, tmp, CPU_MASK_ALL);
-
- dest = cpu_mask_to_apicid(mask);
-
- target_ht_irq(irq, dest);
- irq_desc[irq].affinity = mask;
-}
-#endif
-
-static struct irq_chip ht_irq_chip = {
- .name = "PCI-HT",
- .mask = mask_ht_irq,
- .unmask = unmask_ht_irq,
- .ack = ack_ioapic_irq,
-#ifdef CONFIG_SMP
- .set_affinity = set_ht_irq_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
-{
- int vector;
-
- vector = assign_irq_vector(irq);
- if (vector >= 0) {
- struct ht_irq_msg msg;
- unsigned dest;
- cpumask_t tmp;
-
- cpus_clear(tmp);
- cpu_set(vector >> 8, tmp);
- dest = cpu_mask_to_apicid(tmp);
-
- msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
-
- msg.address_lo =
- HT_IRQ_LOW_BASE |
- HT_IRQ_LOW_DEST_ID(dest) |
- HT_IRQ_LOW_VECTOR(vector) |
- ((INT_DEST_MODE == 0) ?
- HT_IRQ_LOW_DM_PHYSICAL :
- HT_IRQ_LOW_DM_LOGICAL) |
- HT_IRQ_LOW_RQEOI_EDGE |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
- HT_IRQ_LOW_MT_FIXED :
- HT_IRQ_LOW_MT_ARBITRATED) |
- HT_IRQ_LOW_IRQ_MASKED;
-
- write_ht_irq_msg(irq, &msg);
-
- set_irq_chip_and_handler_name(irq, &ht_irq_chip,
- handle_edge_irq, "edge");
- }
- return vector;
-}
-#endif /* CONFIG_HT_IRQ */
-
-/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
- -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-int __init io_apic_get_unique_id(int ioapic, int apic_id)
-{
- union IO_APIC_reg_00 reg_00;
- static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
- physid_mask_t tmp;
- unsigned long flags;
- int i = 0;
-
- /*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where its predecessors only
- * support up to 16 on one shared APIC bus.
- *
- * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
- * advantage of new APIC bus architecture.
- */
-
- if (physids_empty(apic_id_map))
- apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- if (apic_id >= get_physical_broadcast()) {
- printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
- "%d\n", ioapic, apic_id, reg_00.bits.ID);
- apic_id = reg_00.bits.ID;
- }
-
- /*
- * Every APIC in a system must have a unique ID or we get lots of nice
- * 'stuck on smp_invalidate_needed IPI wait' messages.
- */
- if (check_apicid_used(apic_id_map, apic_id)) {
-
- for (i = 0; i < get_physical_broadcast(); i++) {
- if (!check_apicid_used(apic_id_map, i))
- break;
- }
-
- if (i == get_physical_broadcast())
- panic("Max apic_id exceeded!\n");
-
- printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
- "trying %d\n", ioapic, apic_id, i);
-
- apic_id = i;
- }
-
- tmp = apicid_to_cpu_present(apic_id);
- physids_or(apic_id_map, apic_id_map, tmp);
-
- if (reg_00.bits.ID != apic_id) {
- reg_00.bits.ID = apic_id;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0, reg_00.raw);
- reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /* Sanity check */
- if (reg_00.bits.ID != apic_id) {
- printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
- return -1;
- }
- }
-
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
-
- return apic_id;
-}
-
-
-int __init io_apic_get_version(int ioapic)
-{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return reg_01.bits.version;
-}
-
-
-int __init io_apic_get_redir_entries(int ioapic)
-{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return reg_01.bits.entries;
-}
-
-
-int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
-{
- struct IO_APIC_route_entry entry;
-
- if (!IO_APIC_IRQ(irq)) {
- printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
- ioapic);
- return -EINVAL;
- }
-
- /*
- * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
- * Note that we mask (disable) IRQs now -- these get enabled when the
- * corresponding device driver registers for this IRQ.
- */
-
- memset(&entry, 0, sizeof(entry));
-
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.dest_mode = INT_DEST_MODE;
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
- entry.trigger = edge_level;
- entry.polarity = active_high_low;
- entry.mask = 1;
-
- /*
- * IRQs < 16 are already in the irq_2_pin[] map
- */
- if (irq >= 16)
- add_pin_to_irq(irq, ioapic, pin);
-
- entry.vector = assign_irq_vector(irq);
-
- apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
- "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
- mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
- edge_level, active_high_low);
-
- ioapic_register_intr(irq, entry.vector, edge_level);
-
- if (!ioapic && (irq < 16))
- disable_8259A_irq(irq);
-
- ioapic_write_entry(ioapic, pin, entry);
-
- return 0;
-}
-
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
-{
- int i;
-
- if (skip_ioapic_setup)
- return -1;
-
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == mp_INT &&
- mp_irqs[i].mp_srcbusirq == bus_irq)
- break;
- if (i >= mp_irq_entries)
- return -1;
-
- *trigger = irq_trigger(i);
- *polarity = irq_polarity(i);
- return 0;
-}
-
-#endif /* CONFIG_ACPI */
-
-static int __init parse_disable_timer_pin_1(char *arg)
-{
- disable_timer_pin_1 = 1;
- return 0;
-}
-early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
-
-static int __init parse_enable_timer_pin_1(char *arg)
-{
- disable_timer_pin_1 = -1;
- return 0;
-}
-early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
-
-static int __init parse_noapic(char *arg)
-{
- /* disable IO-APIC */
- disable_ioapic_setup();
- return 0;
-}
-early_param("noapic", parse_noapic);
-
-void __init ioapic_init_mappings(void)
-{
- unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
- int i;
-
- for (i = 0; i < nr_ioapics; i++) {
- if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mp_apicaddr;
- if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
- "disabling IO/APIC support!\n");
- smp_found_config = 0;
- skip_ioapic_setup = 1;
- goto fake_ioapic_page;
- }
- } else {
-fake_ioapic_page:
- ioapic_phys = (unsigned long)
- alloc_bootmem_pages(PAGE_SIZE);
- ioapic_phys = __pa(ioapic_phys);
- }
- set_fixmap_nocache(idx, ioapic_phys);
- printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx), ioapic_phys);
- idx++;
- }
-}
-
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 50e5e4a31c8..19191430274 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/thread_info.h>
#include <linux/syscalls.h>
+#include <asm/syscalls.h>
/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
static void set_bitmap(unsigned long *bitmap, unsigned int base,
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 3f7537b669d..f1c688e46f3 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -20,6 +20,8 @@
#ifdef CONFIG_X86_32
#include <mach_apic.h>
+#include <mach_ipi.h>
+
/*
* the following functions deal with sending IPIs between CPUs.
*
@@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
}
/* must come after the send_IPI functions above for inlining */
-#include <mach_ipi.h>
static int convert_apicid_to_cpu(int apic_id)
{
int i;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
new file mode 100644
index 00000000000..d1d4dc52f64
--- /dev/null
+++ b/arch/x86/kernel/irq.c
@@ -0,0 +1,189 @@
+/*
+ * Common interrupt code for 32 and 64 bit
+ */
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/seq_file.h>
+
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/smp.h>
+
+atomic_t irq_err_count;
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector?'
+ * Each architecture has to answer this itself.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+ printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * Currently unexpected vectors happen only on SMP and APIC.
+ * We _must_ ack these because every local APIC has only N
+ * irq slots per priority level, and a 'hanging, unacked' IRQ
+ * holds up an irq slot - in excessive cases (when multiple
+ * unexpected vectors occur) that might lock up the APIC
+ * completely.
+ * But only ack when the APIC is enabled -AK
+ */
+ if (cpu_has_apic)
+ ack_APIC_irq();
+#endif
+}
+
+#ifdef CONFIG_X86_32
+# define irq_stats(x) (&per_cpu(irq_stat, x))
+#else
+# define irq_stats(x) cpu_pda(x)
+#endif
+/*
+ * /proc/interrupts printing:
+ */
+static int show_other_interrupts(struct seq_file *p)
+{
+ int j;
+
+ seq_printf(p, "NMI: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
+ seq_printf(p, " Non-maskable interrupts\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "LOC: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
+ seq_printf(p, " Local timer interrupts\n");
+#endif
+#ifdef CONFIG_SMP
+ seq_printf(p, "RES: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
+ seq_printf(p, " Rescheduling interrupts\n");
+ seq_printf(p, "CAL: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+ seq_printf(p, " Function call interrupts\n");
+ seq_printf(p, "TLB: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
+ seq_printf(p, " TLB shootdowns\n");
+#endif
+#ifdef CONFIG_X86_MCE
+ seq_printf(p, "TRM: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
+ seq_printf(p, " Thermal event interrupts\n");
+# ifdef CONFIG_X86_64
+ seq_printf(p, "THR: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
+ seq_printf(p, " Threshold APIC interrupts\n");
+# endif
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "SPU: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+ seq_printf(p, " Spurious interrupts\n");
+#endif
+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+ return 0;
+}
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+ unsigned long flags, any_count = 0;
+ int i = *(loff_t *) v, j;
+ struct irqaction *action;
+ struct irq_desc *desc;
+
+ if (i > nr_irqs)
+ return 0;
+
+ if (i == nr_irqs)
+ return show_other_interrupts(p);
+
+ /* print header */
+ if (i == 0) {
+ seq_printf(p, " ");
+ for_each_online_cpu(j)
+ seq_printf(p, "CPU%-8d", j);
+ seq_putc(p, '\n');
+ }
+
+ desc = irq_to_desc(i);
+ spin_lock_irqsave(&desc->lock, flags);
+#ifndef CONFIG_SMP
+ any_count = kstat_irqs(i);
+#else
+ for_each_online_cpu(j)
+ any_count |= kstat_irqs_cpu(i, j);
+#endif
+ action = desc->action;
+ if (!action && !any_count)
+ goto out;
+
+ seq_printf(p, "%3d: ", i);
+#ifndef CONFIG_SMP
+ seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+#endif
+ seq_printf(p, " %8s", desc->chip->name);
+ seq_printf(p, "-%-8s", desc->name);
+
+ if (action) {
+ seq_printf(p, " %s", action->name);
+ while ((action = action->next) != NULL)
+ seq_printf(p, ", %s", action->name);
+ }
+
+ seq_putc(p, '\n');
+out:
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+ u64 sum = irq_stats(cpu)->__nmi_count;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += irq_stats(cpu)->apic_timer_irqs;
+#endif
+#ifdef CONFIG_SMP
+ sum += irq_stats(cpu)->irq_resched_count;
+ sum += irq_stats(cpu)->irq_call_count;
+ sum += irq_stats(cpu)->irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+ sum += irq_stats(cpu)->irq_thermal_count;
+# ifdef CONFIG_X86_64
+ sum += irq_stats(cpu)->irq_threshold_count;
+# endif
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += irq_stats(cpu)->irq_spurious_count;
+#endif
+ return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+ u64 sum = atomic_read(&irq_err_count);
+
+#ifdef CONFIG_X86_IO_APIC
+ sum += atomic_read(&irq_mis_count);
+#endif
+ return sum;
+}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1cf8c1fcc08..a51382672de 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
DEFINE_PER_CPU(struct pt_regs *, irq_regs);
EXPORT_PER_CPU_SYMBOL(irq_regs);
-/*
- * 'what should we do if we get a hw irq event on an illegal vector?'
- * Each architecture has to answer this itself.
- */
-void ack_bad_irq(unsigned int irq)
-{
- printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Currently unexpected vectors happen only on SMP and APIC.
- * We _must_ ack these because every local APIC has only N
- * irq slots per priority level, and a 'hanging, unacked' IRQ
- * holds up an irq slot - in excessive cases (when multiple
- * unexpected vectors occur) that might lock up the APIC
- * completely.
- * But only ack when the APIC is enabled -AK
- */
- if (cpu_has_apic)
- ack_APIC_irq();
-#endif
-}
-
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 1KB free? */
static int check_stack_overflow(void)
@@ -223,20 +200,25 @@ unsigned int do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs;
/* high bit used in ret_from_ code */
- int overflow, irq = ~regs->orig_ax;
- struct irq_desc *desc = irq_desc + irq;
+ int overflow;
+ unsigned vector = ~regs->orig_ax;
+ struct irq_desc *desc;
+ unsigned irq;
- if (unlikely((unsigned)irq >= NR_IRQS)) {
- printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
- __func__, irq);
- BUG();
- }
old_regs = set_irq_regs(regs);
irq_enter();
+ irq = __get_cpu_var(vector_irq)[vector];
overflow = check_stack_overflow();
+ desc = irq_to_desc(irq);
+ if (unlikely(!desc)) {
+ printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
+ __func__, irq, vector, smp_processor_id());
+ BUG();
+ }
+
if (!execute_on_irq_stack(overflow, desc, irq)) {
if (unlikely(overflow))
print_stack_overflow();
@@ -248,146 +230,6 @@ unsigned int do_IRQ(struct pt_regs *regs)
return 1;
}
-/*
- * Interrupt statistics:
- */
-
-atomic_t irq_err_count;
-
-/*
- * /proc/interrupts printing:
- */
-
-int show_interrupts(struct seq_file *p, void *v)
-{
- int i = *(loff_t *) v, j;
- struct irqaction * action;
- unsigned long flags;
-
- if (i == 0) {
- seq_printf(p, " ");
- for_each_online_cpu(j)
- seq_printf(p, "CPU%-8d",j);
- seq_putc(p, '\n');
- }
-
- if (i < NR_IRQS) {
- unsigned any_count = 0;
-
- spin_lock_irqsave(&irq_desc[i].lock, flags);
-#ifndef CONFIG_SMP
- any_count = kstat_irqs(i);
-#else
- for_each_online_cpu(j)
- any_count |= kstat_cpu(j).irqs[i];
-#endif
- action = irq_desc[i].action;
- if (!action && !any_count)
- goto skip;
- seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
-#else
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
-#endif
- seq_printf(p, " %8s", irq_desc[i].chip->name);
- seq_printf(p, "-%-8s", irq_desc[i].name);
-
- if (action) {
- seq_printf(p, " %s", action->name);
- while ((action = action->next) != NULL)
- seq_printf(p, ", %s", action->name);
- }
-
- seq_putc(p, '\n');
-skip:
- spin_unlock_irqrestore(&irq_desc[i].lock, flags);
- } else if (i == NR_IRQS) {
- seq_printf(p, "NMI: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", nmi_count(j));
- seq_printf(p, " Non-maskable interrupts\n");
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).apic_timer_irqs);
- seq_printf(p, " Local timer interrupts\n");
-#endif
-#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_resched_count);
- seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_call_count);
- seq_printf(p, " function call interrupts\n");
- seq_printf(p, "TLB: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_tlb_count);
- seq_printf(p, " TLB shootdowns\n");
-#endif
-#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_thermal_count);
- seq_printf(p, " Thermal event interrupts\n");
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "SPU: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
-#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-#if defined(CONFIG_X86_IO_APIC)
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
-#endif
- }
- return 0;
-}
-
-/*
- * /proc/stat helpers
- */
-u64 arch_irq_stat_cpu(unsigned int cpu)
-{
- u64 sum = nmi_count(cpu);
-
-#ifdef CONFIG_X86_LOCAL_APIC
- sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
-#endif
-#ifdef CONFIG_SMP
- sum += per_cpu(irq_stat, cpu).irq_resched_count;
- sum += per_cpu(irq_stat, cpu).irq_call_count;
- sum += per_cpu(irq_stat, cpu).irq_tlb_count;
-#endif
-#ifdef CONFIG_X86_MCE
- sum += per_cpu(irq_stat, cpu).irq_thermal_count;
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- sum += per_cpu(irq_stat, cpu).irq_spurious_count;
-#endif
- return sum;
-}
-
-u64 arch_irq_stat(void)
-{
- u64 sum = atomic_read(&irq_err_count);
-
-#ifdef CONFIG_X86_IO_APIC
- sum += atomic_read(&irq_mis_count);
-#endif
- return sum;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
#include <mach_apic.h>
@@ -395,20 +237,22 @@ void fixup_irqs(cpumask_t map)
{
unsigned int irq;
static int warned;
+ struct irq_desc *desc;
- for (irq = 0; irq < NR_IRQS; irq++) {
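+	/* iterate only over allocated irq descriptors instead of the full 0..NR_IRQS range */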
+ for_each_irq_desc(irq, desc) {
cpumask_t mask;
+
if (irq == 2)
continue;
- cpus_and(mask, irq_desc[irq].affinity, map);
+ cpus_and(mask, desc->affinity, map);
if (any_online_cpu(mask) == NR_CPUS) {
printk("Breaking affinity for irq %i\n", irq);
mask = map;
}
- if (irq_desc[irq].chip->set_affinity)
- irq_desc[irq].chip->set_affinity(irq, mask);
- else if (irq_desc[irq].action && !(warned++))
+ if (desc->chip->set_affinity)
+ desc->chip->set_affinity(irq, mask);
+ else if (desc->action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d..60eb84eb77a 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,28 +18,6 @@
#include <asm/idle.h>
#include <asm/smp.h>
-atomic_t irq_err_count;
-
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
- printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
- /*
- * Currently unexpected vectors happen only on SMP and APIC.
- * We _must_ ack these because every local APIC has only N
- * irq slots per priority level, and a 'hanging, unacked' IRQ
- * holds up an irq slot - in excessive cases (when multiple
- * unexpected vectors occur) that might lock up the APIC
- * completely.
- * But don't ack when the APIC is disabled. -AK
- */
- if (!disable_apic)
- ack_APIC_irq();
-}
-
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/*
* Probabilistic stack overflow check:
@@ -65,122 +43,6 @@ static inline void stack_overflow_check(struct pt_regs *regs)
#endif
/*
- * Generic, controller-independent functions:
- */
-
-int show_interrupts(struct seq_file *p, void *v)
-{
- int i = *(loff_t *) v, j;
- struct irqaction * action;
- unsigned long flags;
-
- if (i == 0) {
- seq_printf(p, " ");
- for_each_online_cpu(j)
- seq_printf(p, "CPU%-8d",j);
- seq_putc(p, '\n');
- }
-
- if (i < NR_IRQS) {
- unsigned any_count = 0;
-
- spin_lock_irqsave(&irq_desc[i].lock, flags);
-#ifndef CONFIG_SMP
- any_count = kstat_irqs(i);
-#else
- for_each_online_cpu(j)
- any_count |= kstat_cpu(j).irqs[i];
-#endif
- action = irq_desc[i].action;
- if (!action && !any_count)
- goto skip;
- seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
-#else
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
-#endif
- seq_printf(p, " %8s", irq_desc[i].chip->name);
- seq_printf(p, "-%-8s", irq_desc[i].name);
-
- if (action) {
- seq_printf(p, " %s", action->name);
- while ((action = action->next) != NULL)
- seq_printf(p, ", %s", action->name);
- }
- seq_putc(p, '\n');
-skip:
- spin_unlock_irqrestore(&irq_desc[i].lock, flags);
- } else if (i == NR_IRQS) {
- seq_printf(p, "NMI: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
- seq_printf(p, " Non-maskable interrupts\n");
- seq_printf(p, "LOC: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
- seq_printf(p, " Local timer interrupts\n");
-#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
- seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
- seq_printf(p, " function call interrupts\n");
- seq_printf(p, "TLB: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
- seq_printf(p, " TLB shootdowns\n");
-#endif
-#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
- seq_printf(p, " Thermal event interrupts\n");
- seq_printf(p, "THR: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
- seq_printf(p, " Threshold APIC interrupts\n");
-#endif
- seq_printf(p, "SPU: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
- }
- return 0;
-}
-
-/*
- * /proc/stat helpers
- */
-u64 arch_irq_stat_cpu(unsigned int cpu)
-{
- u64 sum = cpu_pda(cpu)->__nmi_count;
-
- sum += cpu_pda(cpu)->apic_timer_irqs;
-#ifdef CONFIG_SMP
- sum += cpu_pda(cpu)->irq_resched_count;
- sum += cpu_pda(cpu)->irq_call_count;
- sum += cpu_pda(cpu)->irq_tlb_count;
-#endif
-#ifdef CONFIG_X86_MCE
- sum += cpu_pda(cpu)->irq_thermal_count;
- sum += cpu_pda(cpu)->irq_threshold_count;
-#endif
- sum += cpu_pda(cpu)->irq_spurious_count;
- return sum;
-}
-
-u64 arch_irq_stat(void)
-{
- return atomic_read(&irq_err_count);
-}
-
-/*
* do_IRQ handles all normal device IRQ's (the special
* SMP cross-CPU interrupts have their own specific
* handlers).
@@ -188,6 +50,7 @@ u64 arch_irq_stat(void)
asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
+ struct irq_desc *desc;
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;
@@ -201,8 +64,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
stack_overflow_check(regs);
#endif
- if (likely(irq < NR_IRQS))
- generic_handle_irq(irq);
+ desc = irq_to_desc(irq);
+ if (likely(desc))
+ generic_handle_irq_desc(irq, desc);
else {
if (!disable_apic)
ack_APIC_irq();
@@ -223,8 +87,9 @@ void fixup_irqs(cpumask_t map)
{
unsigned int irq;
static int warned;
+ struct irq_desc *desc;
- for (irq = 0; irq < NR_IRQS; irq++) {
+ for_each_irq_desc(irq, desc) {
cpumask_t mask;
int break_affinity = 0;
int set_affinity = 1;
@@ -233,32 +98,32 @@ void fixup_irqs(cpumask_t map)
continue;
	/* interrupts are disabled at this point */
- spin_lock(&irq_desc[irq].lock);
+ spin_lock(&desc->lock);
if (!irq_has_action(irq) ||
- cpus_equal(irq_desc[irq].affinity, map)) {
- spin_unlock(&irq_desc[irq].lock);
+ cpus_equal(desc->affinity, map)) {
+ spin_unlock(&desc->lock);
continue;
}
- cpus_and(mask, irq_desc[irq].affinity, map);
+ cpus_and(mask, desc->affinity, map);
if (cpus_empty(mask)) {
break_affinity = 1;
mask = map;
}
- if (irq_desc[irq].chip->mask)
- irq_desc[irq].chip->mask(irq);
+ if (desc->chip->mask)
+ desc->chip->mask(irq);
- if (irq_desc[irq].chip->set_affinity)
- irq_desc[irq].chip->set_affinity(irq, mask);
+ if (desc->chip->set_affinity)
+ desc->chip->set_affinity(irq, mask);
else if (!(warned++))
set_affinity = 0;
- if (irq_desc[irq].chip->unmask)
- irq_desc[irq].chip->unmask(irq);
+ if (desc->chip->unmask)
+ desc->chip->unmask(irq);
- spin_unlock(&irq_desc[irq].lock);
+ spin_unlock(&desc->lock);
if (break_affinity && set_affinity)
printk("Broke affinity for irq %i\n", irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index d66914287ee..845aa9803e8 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -69,11 +69,48 @@ void __init init_ISA_irqs (void)
* 16 old-style INTA-cycle interrupts:
*/
for (i = 0; i < 16; i++) {
+		/* first use of this irq_desc: make sure it is initialized */
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->status = IRQ_DISABLED;
+ desc->action = NULL;
+ desc->depth = 1;
+
set_irq_chip_and_handler_name(i, &i8259A_chip,
handle_level_irq, "XT");
}
}
+/*
+ * IRQ2 is the cascade interrupt to the second interrupt controller
+ */
+static struct irqaction irq2 = {
+ .handler = no_action,
+ .mask = CPU_MASK_NONE,
+ .name = "cascade",
+};
+
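+/*
+ * Per-CPU vector-to-irq translation table: the 16 ISA irqs keep their
+ * fixed vectors, every other vector starts out unassigned (-1).
+ */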
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+ [0 ... IRQ0_VECTOR - 1] = -1,
+ [IRQ0_VECTOR] = 0,
+ [IRQ1_VECTOR] = 1,
+ [IRQ2_VECTOR] = 2,
+ [IRQ3_VECTOR] = 3,
+ [IRQ4_VECTOR] = 4,
+ [IRQ5_VECTOR] = 5,
+ [IRQ6_VECTOR] = 6,
+ [IRQ7_VECTOR] = 7,
+ [IRQ8_VECTOR] = 8,
+ [IRQ9_VECTOR] = 9,
+ [IRQ10_VECTOR] = 10,
+ [IRQ11_VECTOR] = 11,
+ [IRQ12_VECTOR] = 12,
+ [IRQ13_VECTOR] = 13,
+ [IRQ14_VECTOR] = 14,
+ [IRQ15_VECTOR] = 15,
+ [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
/* Overridden in paravirt.c */
void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
@@ -89,15 +126,50 @@ void __init native_init_IRQ(void)
* us. (some of these will be overridden and become
* 'special' SMP interrupts)
*/
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (i >= NR_IRQS)
- break;
+ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* SYSCALL_VECTOR was reserved in trap_init. */
- if (!test_bit(vector, used_vectors))
- set_intr_gate(vector, interrupt[i]);
+ if (i != SYSCALL_VECTOR)
+ set_intr_gate(i, interrupt[i]);
}
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+ /*
+ * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+ * IPI, driven by wakeup.
+ */
+ alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+ /* IPI for invalidation */
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+
+ /* IPI for generic function call */
+ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+ /* IPI for single call function */
+ set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+
+ /* Low priority IPI to cleanup after moving an irq */
+ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /* self generated IPI for local APIC timer */
+ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+ /* IPI vectors for APIC spurious and error interrupts */
+ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+ alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+#endif
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+ /* thermal monitor LVT interrupt */
+ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+
+ if (!acpi_ioapic)
+ setup_irq(2, &irq2);
+
/* setup after call gates are initialised (usually add in
* the architecture specific gates)
*/
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 1f26fd9ec4f..ff023539128 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -135,51 +135,33 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
};
-static void __init init_ISA_irqs (void)
+void __init init_ISA_irqs(void)
{
int i;
init_bsp_APIC();
init_8259A(0);
- for (i = 0; i < NR_IRQS; i++) {
- irq_desc[i].status = IRQ_DISABLED;
- irq_desc[i].action = NULL;
- irq_desc[i].depth = 1;
+ for (i = 0; i < 16; i++) {
+		/* first use of this irq_desc: make sure it is initialized */
+ struct irq_desc *desc = irq_to_desc(i);
- if (i < 16) {
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- set_irq_chip_and_handler_name(i, &i8259A_chip,
+ desc->status = IRQ_DISABLED;
+ desc->action = NULL;
+ desc->depth = 1;
+
+ /*
+ * 16 old-style INTA-cycle interrupts:
+ */
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
handle_level_irq, "XT");
- } else {
- /*
- * 'high' PCI IRQs filled in on demand
- */
- irq_desc[i].chip = &no_irq_chip;
- }
}
}
void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-void __init native_init_IRQ(void)
+static void __init smp_intr_init(void)
{
- int i;
-
- init_ISA_irqs();
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (vector != IA32_SYSCALL_VECTOR)
- set_intr_gate(vector, interrupt[i]);
- }
-
#ifdef CONFIG_SMP
/*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -207,6 +189,12 @@ void __init native_init_IRQ(void)
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
#endif
+}
+
+static void __init apic_intr_init(void)
+{
+ smp_intr_init();
+
alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
@@ -216,6 +204,25 @@ void __init native_init_IRQ(void)
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+}
+
+void __init native_init_IRQ(void)
+{
+ int i;
+
+ init_ISA_irqs();
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (vector != IA32_SYSCALL_VECTOR)
+ set_intr_gate(vector, interrupt[i]);
+ }
+
+ apic_intr_init();
if (!acpi_ioapic)
setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb2133..304d8bad655 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
static u32 *flush_words;
struct pci_device_id k8_nb_ids[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
{}
};
EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b..10435a120d2 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
*/
void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
{
+#ifndef CONFIG_X86_32
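+	/* gdb's x86-64 register layout stores eflags, cs and ss as 32-bit slots, hence the u32 view of the buffer */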
+ u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
gdb_regs[GDB_AX] = regs->ax;
gdb_regs[GDB_BX] = regs->bx;
gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
gdb_regs[GDB_SI] = regs->si;
gdb_regs[GDB_DI] = regs->di;
gdb_regs[GDB_BP] = regs->bp;
- gdb_regs[GDB_PS] = regs->flags;
gdb_regs[GDB_PC] = regs->ip;
#ifdef CONFIG_X86_32
+ gdb_regs[GDB_PS] = regs->flags;
gdb_regs[GDB_DS] = regs->ds;
gdb_regs[GDB_ES] = regs->es;
gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
gdb_regs[GDB_R13] = regs->r13;
gdb_regs[GDB_R14] = regs->r14;
gdb_regs[GDB_R15] = regs->r15;
+ gdb_regs32[GDB_PS] = regs->flags;
+ gdb_regs32[GDB_CS] = regs->cs;
+ gdb_regs32[GDB_SS] = regs->ss;
#endif
gdb_regs[GDB_SP] = regs->sp;
}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
*/
void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
{
+#ifndef CONFIG_X86_32
+ u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
gdb_regs[GDB_AX] = 0;
gdb_regs[GDB_BX] = 0;
gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_FS] = 0xFFFF;
gdb_regs[GDB_GS] = 0xFFFF;
#else
- gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
- gdb_regs[GDB_PC] = 0;
+ gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
+ gdb_regs32[GDB_CS] = __KERNEL_CS;
+ gdb_regs32[GDB_SS] = __KERNEL_DS;
+ gdb_regs[GDB_PC] = p->thread.ip;
gdb_regs[GDB_R8] = 0;
gdb_regs[GDB_R9] = 0;
gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
*/
void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
{
+#ifndef CONFIG_X86_32
+ u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
regs->ax = gdb_regs[GDB_AX];
regs->bx = gdb_regs[GDB_BX];
regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
regs->si = gdb_regs[GDB_SI];
regs->di = gdb_regs[GDB_DI];
regs->bp = gdb_regs[GDB_BP];
- regs->flags = gdb_regs[GDB_PS];
regs->ip = gdb_regs[GDB_PC];
#ifdef CONFIG_X86_32
+ regs->flags = gdb_regs[GDB_PS];
regs->ds = gdb_regs[GDB_DS];
regs->es = gdb_regs[GDB_ES];
regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
regs->r13 = gdb_regs[GDB_R13];
regs->r14 = gdb_regs[GDB_R14];
regs->r15 = gdb_regs[GDB_R15];
+ regs->flags = gdb_regs32[GDB_PS];
+ regs->cs = gdb_regs32[GDB_CS];
+ regs->ss = gdb_regs32[GDB_SS];
#endif
}
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
if (remcomInBuffer[0] == 's') {
linux_regs->flags |= X86_EFLAGS_TF;
kgdb_single_step = 1;
- if (kgdb_contthread) {
- atomic_set(&kgdb_cpu_doing_single_step,
- raw_smp_processor_id());
- }
+ atomic_set(&kgdb_cpu_doing_single_step,
+ raw_smp_processor_id());
}
get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
return NOTIFY_DONE;
case DIE_NMI_IPI:
- if (atomic_read(&kgdb_active) != -1) {
- /* KGDB CPU roundup */
- kgdb_nmicallback(raw_smp_processor_id(), regs);
- was_in_debug_nmi[raw_smp_processor_id()] = 1;
- touch_nmi_watchdog();
- }
+ /* Just ignore, we will handle the roundup on DIE_NMI. */
return NOTIFY_DONE;
case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
case DIE_DEBUG:
if (atomic_read(&kgdb_cpu_doing_single_step) ==
- raw_smp_processor_id() &&
- user_mode(regs))
- return single_step_cont(regs, args);
+ raw_smp_processor_id()) {
+ if (user_mode(regs))
+ return single_step_cont(regs, args);
+ break;
+ } else if (test_thread_flag(TIF_SINGLESTEP))
+		/* This means a user thread is single-stepping
+		 * a system call, which should be ignored
+		 */
+ return NOTIFY_DONE;
/* fall through */
default:
if (user_mode(regs))
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2..478bca986ec 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
}
-static void kvm_release_pt(u32 pfn)
+static void kvm_release_pt(unsigned long pfn)
{
struct kvm_mmu_op_release_pt rpt = {
.header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d02def06ca9..774ac499156 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
return ret;
}
+/*
+ * If we don't preset lpj, the guest may calibrate its delay loop
+ * under heavy load - thus getting a lower lpj - and then execute
+ * its delays while unloaded. That is wrong, because no delay loop
+ * can finish ahead of time. Any heuristic is bound to fail, because
+ * ultimately a large pool of guests can be running and disturbing
+ * each other. So we preset lpj here.
+ */
+static unsigned long kvm_get_tsc_khz(void)
+{
+ return preset_lpj;
+}
+
+static void kvm_get_preset_lpj(void)
+{
+ struct pvclock_vcpu_time_info *src;
+ unsigned long khz;
+ u64 lpj;
+
+ src = &per_cpu(hv_clock, 0);
+ khz = pvclock_tsc_khz(src);
+
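+	/* lpj = loops per jiffy: khz * 1000 loops per second, divided by HZ ticks per second */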
+ lpj = ((u64)khz * 1000);
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+}
+
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_read,
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
pv_time_ops.sched_clock = kvm_clock_read;
+ pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
#ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
+ kvm_get_preset_lpj();
clocksource_register(&kvm_clock);
}
}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b68e21f06f4..eee32b43fee 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
+#include <asm/syscalls.h>
#ifdef CONFIG_SMP
static void flush_ldt(void *current_mm)
@@ -51,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
(mincount - oldsize) * LDT_ENTRY_SIZE);
+ paravirt_alloc_ldt(newldt, mincount);
+
#ifdef CONFIG_X86_64
/* CHECKME: Do we really need this ? */
wmb();
@@ -73,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
#endif
}
if (oldsize) {
+ paravirt_free_ldt(oldldt, oldsize);
if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(oldldt);
else
@@ -84,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
{
int err = alloc_ldt(new, old->size, 0);
+ int i;
if (err < 0)
return err;
- memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
+
+	for (i = 0; i < old->size; i++)
+ write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
return 0;
}
@@ -124,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
if (mm == current->active_mm)
clear_LDT();
#endif
+ paravirt_free_ldt(mm->context.ldt, mm->context.size);
if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(mm->context.ldt);
else
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
deleted file mode 100644
index 652fa5c38eb..00000000000
--- a/arch/x86/kernel/microcode.c
+++ /dev/null
@@ -1,853 +0,0 @@
-/*
- * Intel CPU Microcode Update Driver for Linux
- *
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- * 2006 Shaohua Li <shaohua.li@intel.com>
- *
- * This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
- *
- * http://developer.intel.com/design/pentium4/manuals/253668.htm
- *
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Initial release.
- * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added read() support + cleanups.
- * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added 'device trimming' support. open(O_WRONLY) zeroes
- * and frees the saved copy of applied microcode.
- * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Made to use devfs (/dev/cpu/microcode) + cleanups.
- * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Added misc device support (now uses both devfs and misc).
- * Added MICROCODE_IOCFREE ioctl to clear memory.
- * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Messages for error cases (non Intel & no suitable microcode).
- * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- * Removed ->release(). Removed exclusive open and status bitmap.
- * Added microcode_rwsem to serialize read()/write()/ioctl().
- * Removed global kernel lock usage.
- * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- * Write 0 to 0x8B msr and then cpuid before reading revision,
- * so that it works even if there were no update done by the
- * BIOS. Otherwise, reading from 0x8B gives junk (which happened
- * to be 0 on my machine which is why it worked even when I
- * disabled update by the BIOS)
- * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>
- * Intel Pentium 4 processor support and bugfixes.
- * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- * Bugfix for HT (Hyper-Threading) enabled processors
- * whereby processor resources are shared by all logical processors
- * in a single CPU package.
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>,
- * Serialize updates as required on HT processors due to speculative
- * nature of implementation.
- * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- * Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- * Jun Nakajima <jun.nakajima@intel.com>
- * Support for the microcode updates in the new format.
- * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
- * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- * Fix sigmatch() macro to handle old CPUs with pf == 0.
- * Thanks to Stuart Swales for pointing out this bug.
- */
-
-//#define DEBUG /* pr_debug */
-#include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/smp_lock.h>
-#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/mutex.h>
-#include <linux/cpu.h>
-#include <linux/firmware.h>
-#include <linux/platform_device.h>
-
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-
-MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-#define MICROCODE_VERSION "1.14a"
-
-#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
-#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
-#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
-#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
-#define DWSIZE (sizeof (u32))
-#define get_totalsize(mc) \
- (((microcode_t *)mc)->hdr.totalsize ? \
- ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
-#define get_datasize(mc) \
- (((microcode_t *)mc)->hdr.datasize ? \
- ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
- (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
-static DEFINE_MUTEX(microcode_mutex);
-
-static struct ucode_cpu_info {
- int valid;
- unsigned int sig;
- unsigned int pf;
- unsigned int rev;
- microcode_t *mc;
-} ucode_cpu_info[NR_CPUS];
-
-static void collect_cpu_info(int cpu_num)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu_num);
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- unsigned int val[2];
-
- /* We should bind the task to the CPU */
- BUG_ON(raw_smp_processor_id() != cpu_num);
- uci->pf = uci->rev = 0;
- uci->mc = NULL;
- uci->valid = 1;
-
- if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
- cpu_has(c, X86_FEATURE_IA64)) {
- printk(KERN_ERR "microcode: CPU%d not a capable Intel "
- "processor\n", cpu_num);
- uci->valid = 0;
- return;
- }
-
- uci->sig = cpuid_eax(0x00000001);
-
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- uci->pf = 1 << ((val[1] >> 18) & 7);
- }
-
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- /* see notes above for revision 1.07. Apparent chip bug */
- sync_core();
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
- pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
- uci->sig, uci->pf, uci->rev);
-}
-
-static inline int microcode_update_match(int cpu_num,
- microcode_header_t *mc_header, int sig, int pf)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- if (!sigmatch(sig, uci->sig, pf, uci->pf)
- || mc_header->rev <= uci->rev)
- return 0;
- return 1;
-}
-
-static int microcode_sanity_check(void *mc)
-{
- microcode_header_t *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- struct extended_signature *ext_sig;
- unsigned long total_size, data_size, ext_table_size;
- int sum, orig_sum, ext_sigcount = 0, i;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
- if (data_size + MC_HEADER_SIZE > total_size) {
- printk(KERN_ERR "microcode: error! "
- "Bad data size in microcode data file\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- printk(KERN_ERR "microcode: error! "
- "Unknown microcode update format\n");
- return -EINVAL;
- }
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! "
- "Small exttable size in microcode data file\n");
- return -EINVAL;
- }
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- printk(KERN_ERR "microcode: error! "
- "Bad exttable size in microcode data file\n");
- return -EFAULT;
- }
- ext_sigcount = ext_header->count;
- }
-
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int *ext_tablep = (int *)ext_header;
-
- i = ext_table_size / DWSIZE;
- while (i--)
- ext_table_sum += ext_tablep[i];
- if (ext_table_sum) {
- printk(KERN_WARNING "microcode: aborting, "
- "bad extended signature table checksum\n");
- return -EINVAL;
- }
- }
-
- /* calculate the checksum */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / DWSIZE;
- while (i--)
- orig_sum += ((int *)mc)[i];
- if (orig_sum) {
- printk(KERN_ERR "microcode: aborting, bad checksum\n");
- return -EINVAL;
- }
- if (!ext_table_size)
- return 0;
- /* check extended signature checksum */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
- sum = orig_sum
- - (mc_header->sig + mc_header->pf + mc_header->cksum)
- + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- printk(KERN_ERR "microcode: aborting, bad checksum\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
-/*
- * return 0 - no update found
- * return 1 - found update
- * return < 0 - error
- */
-static int get_maching_microcode(void *mc, int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- microcode_header_t *mc_header = mc;
- struct extended_sigtable *ext_header;
- unsigned long total_size = get_totalsize(mc_header);
- int ext_sigcount, i;
- struct extended_signature *ext_sig;
- void *new_mc;
-
- if (microcode_update_match(cpu, mc_header,
- mc_header->sig, mc_header->pf))
- goto find;
-
- if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
- return 0;
-
- ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
- ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
- for (i = 0; i < ext_sigcount; i++) {
- if (microcode_update_match(cpu, mc_header,
- ext_sig->sig, ext_sig->pf))
- goto find;
- ext_sig++;
- }
- return 0;
-find:
- pr_debug("microcode: CPU%d found a matching microcode update with"
- " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
- new_mc = vmalloc(total_size);
- if (!new_mc) {
- printk(KERN_ERR "microcode: error! Can not allocate memory\n");
- return -ENOMEM;
- }
-
- /* free previous update file */
- vfree(uci->mc);
-
- memcpy(new_mc, mc, total_size);
- uci->mc = new_mc;
- return 1;
-}
-
-static void apply_microcode(int cpu)
-{
- unsigned long flags;
- unsigned int val[2];
- int cpu_num = raw_smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu_num != cpu);
-
- if (uci->mc == NULL)
- return;
-
- /* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
-
- /* write microcode via MSR 0x79 */
- wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) uci->mc->bits,
- (unsigned long) uci->mc->bits >> 16 >> 16);
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
- /* see notes above for revision 1.07. Apparent chip bug */
- sync_core();
-
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- spin_unlock_irqrestore(&microcode_update_lock, flags);
- if (val[1] != uci->mc->hdr.rev) {
- printk(KERN_ERR "microcode: CPU%d update from revision "
- "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
- return;
- }
- printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x, date = %08x \n",
- cpu_num, uci->rev, val[1], uci->mc->hdr.date);
- uci->rev = val[1];
-}
-
-#ifdef CONFIG_MICROCODE_OLD_INTERFACE
-static void __user *user_buffer; /* user area microcode data buffer */
-static unsigned int user_buffer_size; /* it's size */
-
-static long get_next_ucode(void **mc, long offset)
-{
- microcode_header_t mc_header;
- unsigned long total_size;
-
- /* No more data */
- if (offset >= user_buffer_size)
- return 0;
- if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- return -EFAULT;
- }
- total_size = get_totalsize(&mc_header);
- if (offset + total_size > user_buffer_size) {
- printk(KERN_ERR "microcode: error! Bad total size in microcode "
- "data file\n");
- return -EINVAL;
- }
- *mc = vmalloc(total_size);
- if (!*mc)
- return -ENOMEM;
- if (copy_from_user(*mc, user_buffer + offset, total_size)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- vfree(*mc);
- return -EFAULT;
- }
- return offset + total_size;
-}
-
-static int do_microcode_update (void)
-{
- long cursor = 0;
- int error = 0;
- void *new_mc = NULL;
- int cpu;
- cpumask_t old;
-
- old = current->cpus_allowed;
-
- while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
- error = microcode_sanity_check(new_mc);
- if (error)
- goto out;
- /*
- * It's possible the data file has multiple matching ucode,
- * lets keep searching till the latest version
- */
- for_each_online_cpu(cpu) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (!uci->valid)
- continue;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- error = get_maching_microcode(new_mc, cpu);
- if (error < 0)
- goto out;
- if (error == 1)
- apply_microcode(cpu);
- }
- vfree(new_mc);
- }
-out:
- if (cursor > 0)
- vfree(new_mc);
- if (cursor < 0)
- error = cursor;
- set_cpus_allowed_ptr(current, &old);
- return error;
-}
-
-static int microcode_open (struct inode *unused1, struct file *unused2)
-{
- cycle_kernel_lock();
- return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-}
-
-static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
-{
- ssize_t ret;
-
- if ((len >> PAGE_SHIFT) > num_physpages) {
- printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
- return -EINVAL;
- }
-
- get_online_cpus();
- mutex_lock(&microcode_mutex);
-
- user_buffer = (void __user *) buf;
- user_buffer_size = (int) len;
-
- ret = do_microcode_update();
- if (!ret)
- ret = (ssize_t)len;
-
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
-
- return ret;
-}
-
-static const struct file_operations microcode_fops = {
- .owner = THIS_MODULE,
- .write = microcode_write,
- .open = microcode_open,
-};
-
-static struct miscdevice microcode_dev = {
- .minor = MICROCODE_MINOR,
- .name = "microcode",
- .fops = &microcode_fops,
-};
-
-static int __init microcode_dev_init (void)
-{
- int error;
-
- error = misc_register(&microcode_dev);
- if (error) {
- printk(KERN_ERR
- "microcode: can't misc_register on minor=%d\n",
- MICROCODE_MINOR);
- return error;
- }
-
- return 0;
-}
-
-static void microcode_dev_exit (void)
-{
- misc_deregister(&microcode_dev);
-}
-
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
-#else
-#define microcode_dev_init() 0
-#define microcode_dev_exit() do { } while(0)
-#endif
-
-static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
- unsigned long size, long offset)
-{
- microcode_header_t *mc_header;
- unsigned long total_size;
-
- /* No more data */
- if (offset >= size)
- return 0;
- mc_header = (microcode_header_t *)(buf + offset);
- total_size = get_totalsize(mc_header);
-
- if (offset + total_size > size) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- return -EINVAL;
- }
-
- *mc = vmalloc(total_size);
- if (!*mc) {
- printk(KERN_ERR "microcode: error! Can not allocate memory\n");
- return -ENOMEM;
- }
- memcpy(*mc, buf + offset, total_size);
- return offset + total_size;
-}
-
-/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
-
-static int cpu_request_microcode(int cpu)
-{
- char name[30];
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- const struct firmware *firmware;
- const u8 *buf;
- unsigned long size;
- long offset = 0;
- int error;
- void *mc;
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu != raw_smp_processor_id());
- sprintf(name,"intel-ucode/%02x-%02x-%02x",
- c->x86, c->x86_model, c->x86_mask);
- error = request_firmware(&firmware, name, &microcode_pdev->dev);
- if (error) {
- pr_debug("microcode: data file %s load failed\n", name);
- return error;
- }
- buf = firmware->data;
- size = firmware->size;
- while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
- > 0) {
- error = microcode_sanity_check(mc);
- if (error)
- break;
- error = get_maching_microcode(mc, cpu);
- if (error < 0)
- break;
- /*
- * It's possible the data file has multiple matching ucode,
- * lets keep searching till the latest version
- */
- if (error == 1) {
- apply_microcode(cpu);
- error = 0;
- }
- vfree(mc);
- }
- if (offset > 0)
- vfree(mc);
- if (offset < 0)
- error = offset;
- release_firmware(firmware);
-
- return error;
-}
-
-static int apply_microcode_check_cpu(int cpu)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- cpumask_t old;
- unsigned int val[2];
- int err = 0;
-
- /* Check if the microcode is available */
- if (!uci->mc)
- return 0;
-
- old = current->cpus_allowed;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-
- /* Check if the microcode we have in memory matches the CPU */
- if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
- cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
- err = -EINVAL;
-
- if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- if (uci->pf != (1 << ((val[1] >> 18) & 7)))
- err = -EINVAL;
- }
-
- if (!err) {
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- /* see notes above for revision 1.07. Apparent chip bug */
- sync_core();
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- if (uci->rev != val[1])
- err = -EINVAL;
- }
-
- if (!err)
- apply_microcode(cpu);
- else
- printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
- " sig=0x%x, pf=0x%x, rev=0x%x\n",
- cpu, uci->sig, uci->pf, uci->rev);
-
- set_cpus_allowed_ptr(current, &old);
- return err;
-}
-
-static void microcode_init_cpu(int cpu, int resume)
-{
- cpumask_t old;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- old = current->cpus_allowed;
-
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- mutex_lock(&microcode_mutex);
- collect_cpu_info(cpu);
- if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
- cpu_request_microcode(cpu);
- mutex_unlock(&microcode_mutex);
- set_cpus_allowed_ptr(current, &old);
-}
-
-static void microcode_fini_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- mutex_lock(&microcode_mutex);
- uci->valid = 0;
- vfree(uci->mc);
- uci->mc = NULL;
- mutex_unlock(&microcode_mutex);
-}
-
-static ssize_t reload_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
- const char *buf, size_t sz)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
- char *end;
- unsigned long val = simple_strtoul(buf, &end, 0);
- int err = 0;
- int cpu = dev->id;
-
- if (end == buf)
- return -EINVAL;
- if (val == 1) {
- cpumask_t old = current->cpus_allowed;
-
- get_online_cpus();
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-
- mutex_lock(&microcode_mutex);
- if (uci->valid)
- err = cpu_request_microcode(cpu);
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
- set_cpus_allowed_ptr(current, &old);
- }
- if (err)
- return err;
- return sz;
-}
-
-static ssize_t version_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
- return sprintf(buf, "0x%x\n", uci->rev);
-}
-
-static ssize_t pf_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
- return sprintf(buf, "0x%x\n", uci->pf);
-}
-
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
-
-static struct attribute *mc_default_attrs[] = {
- &attr_reload.attr,
- &attr_version.attr,
- &attr_processor_flags.attr,
- NULL
-};
-
-static struct attribute_group mc_attr_group = {
- .attrs = mc_default_attrs,
- .name = "microcode",
-};
-
-static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
-{
- int err, cpu = sys_dev->id;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (!cpu_online(cpu))
- return 0;
-
- pr_debug("microcode: CPU%d added\n", cpu);
- memset(uci, 0, sizeof(*uci));
-
- err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
- if (err)
- return err;
-
- microcode_init_cpu(cpu, resume);
-
- return 0;
-}
-
-static int mc_sysdev_add(struct sys_device *sys_dev)
-{
- return __mc_sysdev_add(sys_dev, 0);
-}
-
-static int mc_sysdev_remove(struct sys_device *sys_dev)
-{
- int cpu = sys_dev->id;
-
- if (!cpu_online(cpu))
- return 0;
-
- pr_debug("microcode: CPU%d removed\n", cpu);
- microcode_fini_cpu(cpu);
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
- return 0;
-}
-
-static int mc_sysdev_resume(struct sys_device *dev)
-{
- int cpu = dev->id;
-
- if (!cpu_online(cpu))
- return 0;
- pr_debug("microcode: CPU%d resumed\n", cpu);
- /* only CPU 0 will apply ucode here */
- apply_microcode(0);
- return 0;
-}
-
-static struct sysdev_driver mc_sysdev_driver = {
- .add = mc_sysdev_add,
- .remove = mc_sysdev_remove,
- .resume = mc_sysdev_resume,
-};
-
-static __cpuinit int
-mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
-
- sys_dev = get_cpu_sysdev(cpu);
- switch (action) {
- case CPU_UP_CANCELED_FROZEN:
- /* The CPU refused to come up during a system resume */
- microcode_fini_cpu(cpu);
- break;
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- mc_sysdev_add(sys_dev);
- break;
- case CPU_ONLINE_FROZEN:
- /* System-wide resume is in progress, try to apply microcode */
- if (apply_microcode_check_cpu(cpu)) {
- /* The application of microcode failed */
- microcode_fini_cpu(cpu);
- __mc_sysdev_add(sys_dev, 1);
- break;
- }
- case CPU_DOWN_FAILED_FROZEN:
- if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
- printk(KERN_ERR "microcode: Failed to create the sysfs "
- "group for CPU%d\n", cpu);
- break;
- case CPU_DOWN_PREPARE:
- mc_sysdev_remove(sys_dev);
- break;
- case CPU_DOWN_PREPARE_FROZEN:
- /* Suspend is in progress, only remove the interface */
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block __refdata mc_cpu_notifier = {
- .notifier_call = mc_cpu_callback,
-};
-
-static int __init microcode_init (void)
-{
- int error;
-
- printk(KERN_INFO
- "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
-
- error = microcode_dev_init();
- if (error)
- return error;
- microcode_pdev = platform_device_register_simple("microcode", -1,
- NULL, 0);
- if (IS_ERR(microcode_pdev)) {
- microcode_dev_exit();
- return PTR_ERR(microcode_pdev);
- }
-
- get_online_cpus();
- error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
- put_online_cpus();
- if (error) {
- microcode_dev_exit();
- platform_device_unregister(microcode_pdev);
- return error;
- }
-
- register_hotcpu_notifier(&mc_cpu_notifier);
- return 0;
-}
-
-static void __exit microcode_exit (void)
-{
- microcode_dev_exit();
-
- unregister_hotcpu_notifier(&mc_cpu_notifier);
-
- get_online_cpus();
- sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
- put_online_cpus();
-
- platform_device_unregister(microcode_pdev);
-}
-
-module_init(microcode_init)
-module_exit(microcode_exit)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
new file mode 100644
index 00000000000..7a1f8eeac2c
--- /dev/null
+++ b/arch/x86/kernel/microcode_amd.c
@@ -0,0 +1,435 @@
+/*
+ * AMD CPU Microcode Update Driver for Linux
+ * Copyright (C) 2008 Advanced Micro Devices Inc.
+ *
+ * Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ * Based on work by:
+ * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *
+ * This driver allows upgrading microcode on AMD
+ * family 0x10 and 0x11 processors.
+ *
+ * Licensed under the terms of the GNU General Public
+ * License version 2. See file COPYING for details.
+*/
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+MODULE_DESCRIPTION("AMD Microcode Update Driver");
+MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
+MODULE_LICENSE("GPL v2");
+
+#define UCODE_MAGIC 0x00414d44
+#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
+#define UCODE_UCODE_TYPE 0x00000001
+
+struct equiv_cpu_entry {
+ unsigned int installed_cpu;
+ unsigned int fixed_errata_mask;
+ unsigned int fixed_errata_compare;
+ unsigned int equiv_cpu;
+};
+
+struct microcode_header_amd {
+ unsigned int data_code;
+ unsigned int patch_id;
+ unsigned char mc_patch_data_id[2];
+ unsigned char mc_patch_data_len;
+ unsigned char init_flag;
+ unsigned int mc_patch_data_checksum;
+ unsigned int nb_dev_id;
+ unsigned int sb_dev_id;
+ unsigned char processor_rev_id[2];
+ unsigned char nb_rev_id;
+ unsigned char sb_rev_id;
+ unsigned char bios_api_rev;
+ unsigned char reserved1[3];
+ unsigned int match_reg[8];
+};
+
+struct microcode_amd {
+ struct microcode_header_amd hdr;
+ unsigned int mpb[0];
+};
+
+#define UCODE_MAX_SIZE (2048)
+#define DEFAULT_UCODE_DATASIZE (896)
+#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define DWSIZE (sizeof(u32))
+/* For now we support a fixed ucode total size only */
+#define get_totalsize(mc) \
+ ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
+ + MC_HEADER_SIZE)
+
+/* serialize access to the physical write */
+static DEFINE_SPINLOCK(microcode_update_lock);
+
+static struct equiv_cpu_entry *equiv_cpu_table;
+
+static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ memset(csig, 0, sizeof(*csig));
+
+ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+ printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
+ cpu);
+ return -1;
+ }
+
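+	/* MSR 0x8b holds the current patch level on AMD, mirroring Intel's MSR_IA32_UCODE_REV */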
+ asm volatile("movl %1, %%ecx; rdmsr"
+ : "=a" (csig->rev)
+ : "i" (0x0000008B) : "ecx");
+
+	printk(KERN_INFO "microcode: collect_cpu_info_amd: patch_id=0x%x\n",
+ csig->rev);
+
+ return 0;
+}
+
+static int get_matching_microcode(int cpu, void *mc, int rev)
+{
+ struct microcode_header_amd *mc_header = mc;
+ struct pci_dev *nb_pci_dev, *sb_pci_dev;
+ unsigned int current_cpu_id;
+ unsigned int equiv_cpu_id = 0x00;
+ unsigned int i = 0;
+
+ BUG_ON(equiv_cpu_table == NULL);
+ current_cpu_id = cpuid_eax(0x00000001);
+
+ while (equiv_cpu_table[i].installed_cpu != 0) {
+ if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
+ equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
+ break;
+ }
+ i++;
+ }
+
+ if (!equiv_cpu_id) {
+ printk(KERN_ERR "microcode: CPU%d cpu_id "
+		       "not found in equivalent cpu table\n", cpu);
+ return 0;
+ }
+
+ if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
+ printk(KERN_ERR
+ "microcode: CPU%d patch does not match "
+		       "(patch is %x, cpu extended is %x)\n",
+ cpu, mc_header->processor_rev_id[0],
+ (equiv_cpu_id & 0xff));
+ return 0;
+ }
+
+ if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
+ printk(KERN_ERR "microcode: CPU%d patch does not match "
+		       "(patch is %x, cpu base id is %x)\n",
+ cpu, mc_header->processor_rev_id[1],
+ ((equiv_cpu_id >> 16) & 0xff));
+
+ return 0;
+ }
+
+ /* ucode may be northbridge specific */
+ if (mc_header->nb_dev_id) {
+ nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
+ (mc_header->nb_dev_id & 0xff),
+ NULL);
+ if ((!nb_pci_dev) ||
+ (mc_header->nb_rev_id != nb_pci_dev->revision)) {
+			printk(KERN_ERR "microcode: CPU%d NB mismatch\n", cpu);
+ pci_dev_put(nb_pci_dev);
+ return 0;
+ }
+ pci_dev_put(nb_pci_dev);
+ }
+
+ /* ucode may be southbridge specific */
+ if (mc_header->sb_dev_id) {
+ sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
+ (mc_header->sb_dev_id & 0xff),
+ NULL);
+ if ((!sb_pci_dev) ||
+ (mc_header->sb_rev_id != sb_pci_dev->revision)) {
+			printk(KERN_ERR "microcode: CPU%d SB mismatch\n", cpu);
+ pci_dev_put(sb_pci_dev);
+ return 0;
+ }
+ pci_dev_put(sb_pci_dev);
+ }
+
+ if (mc_header->patch_id <= rev)
+ return 0;
+
+ return 1;
+}
+
+static void apply_microcode_amd(int cpu)
+{
+ unsigned long flags;
+ unsigned int eax, edx;
+ unsigned int rev;
+ int cpu_num = raw_smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+ struct microcode_amd *mc_amd = uci->mc;
+ unsigned long addr;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu_num != cpu);
+
+ if (mc_amd == NULL)
+ return;
+
+ spin_lock_irqsave(&microcode_update_lock, flags);
+
+ addr = (unsigned long)&mc_amd->hdr.data_code;
+ edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
+ eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
+
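+	/* MSR 0xc0010020 is the AMD patch-loader MSR: writing the 64-bit patch address (edx:eax) triggers the update */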
+ asm volatile("movl %0, %%ecx; wrmsr" :
+ : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
+
+ /* get patch id after patching */
+ asm volatile("movl %1, %%ecx; rdmsr"
+ : "=a" (rev)
+ : "i" (0x0000008B) : "ecx");
+
+ spin_unlock_irqrestore(&microcode_update_lock, flags);
+
+ /* check current patch id and patch's id for match */
+ if (rev != mc_amd->hdr.patch_id) {
+ printk(KERN_ERR "microcode: CPU%d update from revision "
+ "0x%x to 0x%x failed\n", cpu_num,
+ mc_amd->hdr.patch_id, rev);
+ return;
+ }
+
+ printk(KERN_INFO "microcode: CPU%d updated from revision "
+	       "0x%x to 0x%x\n",
+ cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
+
+ uci->cpu_sig.rev = rev;
+}
+
+static void *get_next_ucode(u8 *buf, unsigned int size,
+ int (*get_ucode_data)(void *, const void *, size_t),
+ unsigned int *mc_size)
+{
+ unsigned int total_size;
+#define UCODE_CONTAINER_SECTION_HDR 8
+ u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
+ void *mc;
+
+ if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
+ return NULL;
+
+ if (section_hdr[0] != UCODE_UCODE_TYPE) {
+ printk(KERN_ERR "microcode: error! "
+ "Wrong microcode payload type field\n");
+ return NULL;
+ }
+
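+	/* bytes 4-7 of the section header hold the payload size; only the low 16 bits are used here */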
+ total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
+
+ printk(KERN_INFO "microcode: size %u, total_size %u\n",
+ size, total_size);
+
+ if (total_size > size || total_size > UCODE_MAX_SIZE) {
+ printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+ return NULL;
+ }
+
+ mc = vmalloc(UCODE_MAX_SIZE);
+ if (mc) {
+ memset(mc, 0, UCODE_MAX_SIZE);
+ if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
+ vfree(mc);
+ mc = NULL;
+ } else
+ *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
+ }
+#undef UCODE_CONTAINER_SECTION_HDR
+ return mc;
+}
+
+
+static int install_equiv_cpu_table(u8 *buf,
+ int (*get_ucode_data)(void *, const void *, size_t))
+{
+#define UCODE_CONTAINER_HEADER_SIZE 12
+	u8 container_hdr[UCODE_CONTAINER_HEADER_SIZE];
+ unsigned int *buf_pos = (unsigned int *)container_hdr;
+ unsigned long size;
+
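+	/* the 12-byte container header is three u32s: magic word, section type and section size */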
+	if (get_ucode_data(container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
+ return 0;
+
+ size = buf_pos[2];
+
+ if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
+ printk(KERN_ERR "microcode: error! "
+		       "Wrong microcode equivalent cpu table\n");
+ return 0;
+ }
+
+ equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
+ if (!equiv_cpu_table) {
+ printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
+ return 0;
+ }
+
+ buf += UCODE_CONTAINER_HEADER_SIZE;
+ if (get_ucode_data(equiv_cpu_table, buf, size)) {
+ vfree(equiv_cpu_table);
+ return 0;
+ }
+
+ return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
+#undef UCODE_CONTAINER_HEADER_SIZE
+}
+
+static void free_equiv_cpu_table(void)
+{
+ if (equiv_cpu_table) {
+ vfree(equiv_cpu_table);
+ equiv_cpu_table = NULL;
+ }
+}
+
+static int generic_load_microcode(int cpu, void *data, size_t size,
+ int (*get_ucode_data)(void *, const void *, size_t))
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+ int new_rev = uci->cpu_sig.rev;
+ unsigned int leftover;
+ unsigned long offset;
+
+ offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
+ if (!offset) {
+ printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
+ return -EINVAL;
+ }
+
+ ucode_ptr += offset;
+ leftover = size - offset;
+
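+	/* walk every section, keeping the newest patch that matches this CPU */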
+ while (leftover) {
+ unsigned int uninitialized_var(mc_size);
+ struct microcode_header_amd *mc_header;
+
+ mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
+ if (!mc)
+ break;
+
+ mc_header = (struct microcode_header_amd *)mc;
+ if (get_matching_microcode(cpu, mc, new_rev)) {
+ if (new_mc)
+ vfree(new_mc);
+ new_rev = mc_header->patch_id;
+ new_mc = mc;
+ } else
+ vfree(mc);
+
+ ucode_ptr += mc_size;
+ leftover -= mc_size;
+ }
+
+ if (new_mc) {
+ if (!leftover) {
+ if (uci->mc)
+ vfree(uci->mc);
+ uci->mc = new_mc;
+ pr_debug("microcode: CPU%d found a matching microcode update with"
+ " version 0x%x (current=0x%x)\n",
+ cpu, new_rev, uci->cpu_sig.rev);
+ } else
+ vfree(new_mc);
+ }
+
+ free_equiv_cpu_table();
+
+ return (int)leftover;
+}
+
+static int get_ucode_fw(void *to, const void *from, size_t n)
+{
+ memcpy(to, from, n);
+ return 0;
+}
+
+static int request_microcode_fw(int cpu, struct device *device)
+{
+ const char *fw_name = "amd-ucode/microcode_amd.bin";
+ const struct firmware *firmware;
+ int ret;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu != raw_smp_processor_id());
+
+ ret = request_firmware(&firmware, fw_name, device);
+ if (ret) {
+ printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
+ return ret;
+ }
+
+ ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
+ &get_ucode_fw);
+
+ release_firmware(firmware);
+
+ return ret;
+}
+
+static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+ printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
+ "is not supported\n");
+ return -1;
+}
+
+static void microcode_fini_cpu_amd(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ vfree(uci->mc);
+ uci->mc = NULL;
+}
+
+static struct microcode_ops microcode_amd_ops = {
+ .request_microcode_user = request_microcode_user,
+ .request_microcode_fw = request_microcode_fw,
+ .collect_cpu_info = collect_cpu_info_amd,
+ .apply_microcode = apply_microcode_amd,
+ .microcode_fini_cpu = microcode_fini_cpu_amd,
+};
+
+struct microcode_ops * __init init_amd_microcode(void)
+{
+ return &microcode_amd_ops;
+}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
new file mode 100644
index 00000000000..936d8d55f23
--- /dev/null
+++ b/arch/x86/kernel/microcode_core.c
@@ -0,0 +1,508 @@
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ *
+ * This driver allows updating the microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
+ * Software Developer's Manual
+ * Order Number 253668 or free download from:
+ *
+ * http://developer.intel.com/design/pentium4/manuals/253668.htm
+ *
+ * For more information, go to http://www.urbanmyth.org/microcode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Initial release.
+ * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Added read() support + cleanups.
+ * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Added 'device trimming' support. open(O_WRONLY) zeroes
+ * and frees the saved copy of applied microcode.
+ * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Made to use devfs (/dev/cpu/microcode) + cleanups.
+ * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
+ * Added misc device support (now uses both devfs and misc).
+ * Added MICROCODE_IOCFREE ioctl to clear memory.
+ * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
+ * Messages for error cases (non Intel & no suitable microcode).
+ * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
+ * Removed ->release(). Removed exclusive open and status bitmap.
+ * Added microcode_rwsem to serialize read()/write()/ioctl().
+ * Removed global kernel lock usage.
+ * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
+ * Write 0 to 0x8B msr and then cpuid before reading revision,
+ * so that it works even if there were no update done by the
+ * BIOS. Otherwise, reading from 0x8B gives junk (which happened
+ * to be 0 on my machine which is why it worked even when I
+ * disabled update by the BIOS)
+ * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
+ * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
+ * Tigran Aivazian <tigran@veritas.com>
+ * Intel Pentium 4 processor support and bugfixes.
+ * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
+ * Bugfix for HT (Hyper-Threading) enabled processors
+ * whereby processor resources are shared by all logical processors
+ * in a single CPU package.
+ * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
+ * Tigran Aivazian <tigran@veritas.com>,
+ * Serialize updates as required on HT processors due to
+ * speculative nature of implementation.
+ * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
+ * Fix the panic when writing zero-length microcode chunk.
+ * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ * Jun Nakajima <jun.nakajima@intel.com>
+ * Support for the microcode updates in the new format.
+ * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
+ * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
+ * because we no longer hold a copy of applied microcode
+ * in kernel memory.
+ * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
+ * Fix sigmatch() macro to handle old CPUs with pf == 0.
+ * Thanks to Stuart Swales for pointing out this bug.
+ */
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+#define MICROCODE_VERSION "2.00"
+
+struct microcode_ops *microcode_ops;
+
+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+static DEFINE_MUTEX(microcode_mutex);
+
+struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
+EXPORT_SYMBOL_GPL(ucode_cpu_info);
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static int do_microcode_update(const void __user *buf, size_t size)
+{
+ cpumask_t old;
+ int error = 0;
+ int cpu;
+
+ old = current->cpus_allowed;
+
+ for_each_online_cpu(cpu) {
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (!uci->valid)
+ continue;
+
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+ error = microcode_ops->request_microcode_user(cpu, buf, size);
+ if (error < 0)
+ goto out;
+ if (!error)
+ microcode_ops->apply_microcode(cpu);
+ }
+out:
+ set_cpus_allowed_ptr(current, &old);
+ return error;
+}
+
+static int microcode_open(struct inode *unused1, struct file *unused2)
+{
+ cycle_kernel_lock();
+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+}
+
+static ssize_t microcode_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ ssize_t ret;
+
+ if ((len >> PAGE_SHIFT) > num_physpages) {
+ printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
+ num_physpages);
+ return -EINVAL;
+ }
+
+ get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
+ ret = do_microcode_update(buf, len);
+ if (!ret)
+ ret = (ssize_t)len;
+
+ mutex_unlock(&microcode_mutex);
+ put_online_cpus();
+
+ return ret;
+}
+
+static const struct file_operations microcode_fops = {
+ .owner = THIS_MODULE,
+ .write = microcode_write,
+ .open = microcode_open,
+};
+
+static struct miscdevice microcode_dev = {
+ .minor = MICROCODE_MINOR,
+ .name = "microcode",
+ .fops = &microcode_fops,
+};
+
+static int __init microcode_dev_init(void)
+{
+ int error;
+
+ error = misc_register(&microcode_dev);
+ if (error) {
+ printk(KERN_ERR
+ "microcode: can't misc_register on minor=%d\n",
+ MICROCODE_MINOR);
+ return error;
+ }
+
+ return 0;
+}
+
+static void microcode_dev_exit(void)
+{
+ misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while (0)
+#endif
+
+/* fake device for request_firmware */
+struct platform_device *microcode_pdev;
+
+static ssize_t reload_store(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ const char *buf, size_t sz)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+ char *end;
+ unsigned long val = simple_strtoul(buf, &end, 0);
+ int err = 0;
+ int cpu = dev->id;
+
+ if (end == buf)
+ return -EINVAL;
+ if (val == 1) {
+ cpumask_t old = current->cpus_allowed;
+
+ get_online_cpus();
+ if (cpu_online(cpu)) {
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+ mutex_lock(&microcode_mutex);
+ if (uci->valid) {
+ err = microcode_ops->request_microcode_fw(cpu,
+ &microcode_pdev->dev);
+ if (!err)
+ microcode_ops->apply_microcode(cpu);
+ }
+ mutex_unlock(&microcode_mutex);
+ set_cpus_allowed_ptr(current, &old);
+ }
+ put_online_cpus();
+ }
+ if (err)
+ return err;
+ return sz;
+}
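
Given the attribute group below is named "microcode" and hangs off each
CPU's sysdev, writing "1" to the per-CPU reload file reaches this handler.
A minimal user-space trigger, assuming the standard sysfs mount point
(adjust the CPU number as needed):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* path follows from mc_attr_group ("microcode") + attr_reload */
        int fd = open("/sys/devices/system/cpu/cpu0/microcode/reload",
                      O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, "1", 1) != 1)
            perror("write");
        close(fd);
        return 0;
    }
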
+
+static ssize_t version_show(struct sys_device *dev,
+ struct sysdev_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
+}
+
+static ssize_t pf_show(struct sys_device *dev,
+ struct sysdev_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
+}
+
+static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
+static SYSDEV_ATTR(version, 0400, version_show, NULL);
+static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+
+static struct attribute *mc_default_attrs[] = {
+ &attr_reload.attr,
+ &attr_version.attr,
+ &attr_processor_flags.attr,
+ NULL
+};
+
+static struct attribute_group mc_attr_group = {
+ .attrs = mc_default_attrs,
+ .name = "microcode",
+};
+
+static void microcode_fini_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ mutex_lock(&microcode_mutex);
+ microcode_ops->microcode_fini_cpu(cpu);
+ uci->valid = 0;
+ mutex_unlock(&microcode_mutex);
+}
+
+static void collect_cpu_info(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ memset(uci, 0, sizeof(*uci));
+ if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
+ uci->valid = 1;
+}
+
+static int microcode_resume_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct cpu_signature nsig;
+
+ pr_debug("microcode: CPU%d resumed\n", cpu);
+
+ if (!uci->mc)
+ return 1;
+
+ /*
+ * Let's verify that the 'cached' ucode does belong
+ * to this cpu (a bit of paranoia):
+ */
+ if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
+ microcode_fini_cpu(cpu);
+ return -1;
+ }
+
+ if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
+ microcode_fini_cpu(cpu);
+ /* Should we look for a new ucode here? */
+ return 1;
+ }
+
+ return 0;
+}
+
+void microcode_update_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ int err = 0;
+
+ /*
+ * Check if a system resume is in progress (uci->valid != 0);
+ * otherwise just request the firmware:
+ */
+ if (uci->valid) {
+ err = microcode_resume_cpu(cpu);
+ } else {
+ collect_cpu_info(cpu);
+ if (uci->valid && system_state == SYSTEM_RUNNING)
+ err = microcode_ops->request_microcode_fw(cpu,
+ &microcode_pdev->dev);
+ }
+ if (!err)
+ microcode_ops->apply_microcode(cpu);
+}
+
+static void microcode_init_cpu(int cpu)
+{
+ cpumask_t old = current->cpus_allowed;
+
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+ /* We should bind the task to the CPU */
+ BUG_ON(raw_smp_processor_id() != cpu);
+
+ mutex_lock(&microcode_mutex);
+ microcode_update_cpu(cpu);
+ mutex_unlock(&microcode_mutex);
+
+ set_cpus_allowed_ptr(current, &old);
+}
+
+static int mc_sysdev_add(struct sys_device *sys_dev)
+{
+ int err, cpu = sys_dev->id;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (!cpu_online(cpu))
+ return 0;
+
+ pr_debug("microcode: CPU%d added\n", cpu);
+ memset(uci, 0, sizeof(*uci));
+
+ err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+ if (err)
+ return err;
+
+ microcode_init_cpu(cpu);
+ return 0;
+}
+
+static int mc_sysdev_remove(struct sys_device *sys_dev)
+{
+ int cpu = sys_dev->id;
+
+ if (!cpu_online(cpu))
+ return 0;
+
+ pr_debug("microcode: CPU%d removed\n", cpu);
+ microcode_fini_cpu(cpu);
+ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ return 0;
+}
+
+static int mc_sysdev_resume(struct sys_device *dev)
+{
+ int cpu = dev->id;
+
+ if (!cpu_online(cpu))
+ return 0;
+
+ /* only CPU 0 will apply ucode here */
+ microcode_update_cpu(0);
+ return 0;
+}
+
+static struct sysdev_driver mc_sysdev_driver = {
+ .add = mc_sysdev_add,
+ .remove = mc_sysdev_remove,
+ .resume = mc_sysdev_resume,
+};
+
+static __cpuinit int
+mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct sys_device *sys_dev;
+
+ sys_dev = get_cpu_sysdev(cpu);
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ microcode_init_cpu(cpu);
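+ /* fall through: (re)create the sysfs group for this CPU */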
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ pr_debug("microcode: CPU%d added\n", cpu);
+ if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+ printk(KERN_ERR "microcode: Failed to create the sysfs "
+ "group for CPU%d\n", cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ /* Suspend is in progress, only remove the interface */
+ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ pr_debug("microcode: CPU%d removed\n", cpu);
+ break;
+ case CPU_DEAD:
+ case CPU_UP_CANCELED_FROZEN:
+ /* The CPU refused to come up during a system resume */
+ microcode_fini_cpu(cpu);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __refdata mc_cpu_notifier = {
+ .notifier_call = mc_cpu_callback,
+};
+
+static int __init microcode_init(void)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+ int error;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ microcode_ops = init_intel_microcode();
+ else if (c->x86_vendor == X86_VENDOR_AMD)
+ microcode_ops = init_amd_microcode();
+
+ if (!microcode_ops) {
+ printk(KERN_ERR "microcode: no support for this CPU vendor\n");
+ return -ENODEV;
+ }
+
+ error = microcode_dev_init();
+ if (error)
+ return error;
+ microcode_pdev = platform_device_register_simple("microcode", -1,
+ NULL, 0);
+ if (IS_ERR(microcode_pdev)) {
+ microcode_dev_exit();
+ return PTR_ERR(microcode_pdev);
+ }
+
+ get_online_cpus();
+ error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+ put_online_cpus();
+ if (error) {
+ microcode_dev_exit();
+ platform_device_unregister(microcode_pdev);
+ return error;
+ }
+
+ register_hotcpu_notifier(&mc_cpu_notifier);
+
+ printk(KERN_INFO
+ "Microcode Update Driver: v" MICROCODE_VERSION
+ " <tigran@aivazian.fsnet.co.uk>"
+ " <peter.oruba@amd.com>\n");
+
+ return 0;
+}
+
+static void __exit microcode_exit(void)
+{
+ microcode_dev_exit();
+
+ unregister_hotcpu_notifier(&mc_cpu_notifier);
+
+ get_online_cpus();
+ sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+ put_online_cpus();
+
+ platform_device_unregister(microcode_pdev);
+
+ microcode_ops = NULL;
+
+ printk(KERN_INFO
+ "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+}
+
+module_init(microcode_init);
+module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
new file mode 100644
index 00000000000..622dc4a2178
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel.c
@@ -0,0 +1,480 @@
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ *
+ * This driver allows updating the microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
+ * Software Developer's Manual
+ * Order Number 253668 or free download from:
+ *
+ * http://developer.intel.com/design/pentium4/manuals/253668.htm
+ *
+ * For more information, go to http://www.urbanmyth.org/microcode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Initial release.
+ * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Added read() support + cleanups.
+ * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Added 'device trimming' support. open(O_WRONLY) zeroes
+ * and frees the saved copy of applied microcode.
+ * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ * Made to use devfs (/dev/cpu/microcode) + cleanups.
+ * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
+ * Added misc device support (now uses both devfs and misc).
+ * Added MICROCODE_IOCFREE ioctl to clear memory.
+ * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
+ * Messages for error cases (non Intel & no suitable microcode).
+ * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
+ * Removed ->release(). Removed exclusive open and status bitmap.
+ * Added microcode_rwsem to serialize read()/write()/ioctl().
+ * Removed global kernel lock usage.
+ * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
+ * Write 0 to 0x8B msr and then cpuid before reading revision,
+ * so that it works even if there were no update done by the
+ * BIOS. Otherwise, reading from 0x8B gives junk (which happened
+ * to be 0 on my machine which is why it worked even when I
+ * disabled update by the BIOS)
+ * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
+ * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
+ * Tigran Aivazian <tigran@veritas.com>
+ * Intel Pentium 4 processor support and bugfixes.
+ * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
+ * Bugfix for HT (Hyper-Threading) enabled processors
+ * whereby processor resources are shared by all logical processors
+ * in a single CPU package.
+ * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
+ * Tigran Aivazian <tigran@veritas.com>,
+ * Serialize updates as required on HT processors due to
+ * speculative nature of implementation.
+ * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
+ * Fix the panic when writing zero-length microcode chunk.
+ * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ * Jun Nakajima <jun.nakajima@intel.com>
+ * Support for the microcode updates in the new format.
+ * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
+ * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
+ * because we no longer hold a copy of applied microcode
+ * in kernel memory.
+ * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
+ * Fix sigmatch() macro to handle old CPUs with pf == 0.
+ * Thanks to Stuart Swales for pointing out this bug.
+ */
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+struct microcode_header_intel {
+ unsigned int hdrver;
+ unsigned int rev;
+ unsigned int date;
+ unsigned int sig;
+ unsigned int cksum;
+ unsigned int ldrver;
+ unsigned int pf;
+ unsigned int datasize;
+ unsigned int totalsize;
+ unsigned int reserved[3];
+};
+
+struct microcode_intel {
+ struct microcode_header_intel hdr;
+ unsigned int bits[0];
+};
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
+
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[0];
+};
+
+#define DEFAULT_UCODE_DATASIZE (2000)
+#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
+#define DWSIZE (sizeof(u32))
+#define get_totalsize(mc) \
+ (((struct microcode_intel *)mc)->hdr.totalsize ? \
+ ((struct microcode_intel *)mc)->hdr.totalsize : \
+ DEFAULT_UCODE_TOTALSIZE)
+
+#define get_datasize(mc) \
+ (((struct microcode_intel *)mc)->hdr.datasize ? \
+ ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
+
+#define sigmatch(s1, s2, p1, p2) \
+ (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
+
+#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
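
A header whose datasize or totalsize field is zero describes a classic
fixed-size update: get_datasize() then yields 2000 bytes and get_totalsize()
2000 + MC_HEADER_SIZE. The sigmatch() rule is: equal signatures plus
overlapping platform-flag masks, with pf == 0 on both sides accepted for
old CPUs that report no flags (see the 1.14 changelog entry above). A
runnable user-space demo of the macro, copied verbatim:

    #include <stdio.h>

    #define sigmatch(s1, s2, p1, p2) \
        (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))

    int main(void)
    {
        printf("%d\n", sigmatch(0x6fb, 0x6fb, 0x20, 0x20)); /* 1: masks overlap */
        printf("%d\n", sigmatch(0x6fb, 0x6fb, 0x20, 0x01)); /* 0: disjoint masks */
        printf("%d\n", sigmatch(0x633, 0x633, 0, 0));       /* 1: pf == 0 case */
        return 0;
    }
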
+
+/* serialize access to the physical write to MSR 0x79 */
+static DEFINE_SPINLOCK(microcode_update_lock);
+
+static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu_num);
+ unsigned int val[2];
+
+ memset(csig, 0, sizeof(*csig));
+
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64)) {
+ printk(KERN_ERR "microcode: CPU%d not a capable Intel "
+ "processor\n", cpu_num);
+ return -1;
+ }
+
+ csig->sig = cpuid_eax(0x00000001);
+
+ if ((c->x86_model >= 5) || (c->x86 > 6)) {
+ /* get processor flags from MSR 0x17 */
+ rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig->pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+ /* see notes above for revision 1.07. Apparent chip bug */
+ sync_core();
+ /* get the current revision from MSR 0x8B */
+ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
+ pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
+ csig->sig, csig->pf, csig->rev);
+
+ return 0;
+}
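
The pf computation turns the 3-bit platform id in MSR_IA32_PLATFORM_ID
(bits 52:50 of the MSR, i.e. bits 20:18 of the high dword read into val[1])
into a one-hot mask, so it can later be intersected with an update's pf
field by sigmatch(). A standalone illustration of that arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t msr_high = 1 << 18;               /* platform id = 1 */
        uint32_t pf = 1 << ((msr_high >> 18) & 7); /* one-hot: 0x2 */

        printf("pf=%#x\n", pf);
        return 0;
    }
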
+
+static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
+{
+ return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
+}
+
+static inline int
+update_match_revision(struct microcode_header_intel *mc_header, int rev)
+{
+ return (mc_header->rev <= rev) ? 0 : 1;
+}
+
+static int microcode_sanity_check(void *mc)
+{
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ struct extended_signature *ext_sig;
+ unsigned long total_size, data_size, ext_table_size;
+ int sum, orig_sum, ext_sigcount = 0, i;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ printk(KERN_ERR "microcode: error! "
+ "Bad data size in microcode data file\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ printk(KERN_ERR "microcode: error! "
+ "Unknown microcode update format\n");
+ return -EINVAL;
+ }
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ printk(KERN_ERR "microcode: error! "
+ "Small exttable size in microcode data file\n");
+ return -EINVAL;
+ }
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ printk(KERN_ERR "microcode: error! "
+ "Bad exttable size in microcode data file\n");
+ return -EFAULT;
+ }
+ ext_sigcount = ext_header->count;
+ }
+
+ /* check extended table checksum */
+ if (ext_table_size) {
+ int ext_table_sum = 0;
+ int *ext_tablep = (int *)ext_header;
+
+ i = ext_table_size / DWSIZE;
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+ if (ext_table_sum) {
+ printk(KERN_WARNING "microcode: aborting, "
+ "bad extended signature table checksum\n");
+ return -EINVAL;
+ }
+ }
+
+ /* calculate the checksum */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+ while (i--)
+ orig_sum += ((int *)mc)[i];
+ if (orig_sum) {
+ printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ if (!ext_table_size)
+ return 0;
+ /* check extended signature checksum */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+ sum = orig_sum
+ - (mc_header->sig + mc_header->pf + mc_header->cksum)
+ + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
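
The invariant behind both checksum loops above: the vendor picks the cksum
fields so that the 32-bit words of each covered region sum to zero modulo
2^32. A user-space reimplementation of that check for a raw update image
(the helper name is invented):

    #include <stdint.h>
    #include <stddef.h>

    /* Return nonzero iff the words of [buf, buf + nwords) sum to 0 mod 2^32. */
    static int ucode_sum_ok(const uint32_t *words, size_t nwords)
    {
        uint32_t sum = 0;

        while (nwords--)
            sum += words[nwords];
        return sum == 0;
    }
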
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+static int
+get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
+{
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+
+ if (!update_match_revision(mc_header, rev))
+ return 0;
+
+ if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
+ return 1;
+
+ /* Look for ext. headers: */
+ if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_sigcount; i++) {
+ if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
+ }
+ return 0;
+}
+
+static void apply_microcode(int cpu)
+{
+ unsigned long flags;
+ unsigned int val[2];
+ int cpu_num = raw_smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct microcode_intel *mc_intel = uci->mc;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu_num != cpu);
+
+ if (mc_intel == NULL)
+ return;
+
+ /* serialize access to the physical write to MSR 0x79 */
+ spin_lock_irqsave(&microcode_update_lock, flags);
+
+ /* write microcode via MSR 0x79 */
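+ /* the address's high half is computed as >> 16 >> 16 below so that, on
+ * 32-bit kernels where unsigned long is 32 bits wide, the result is a
+ * well-defined 0 rather than an undefined shift by 32 */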
+ wrmsr(MSR_IA32_UCODE_WRITE,
+ (unsigned long) mc_intel->bits,
+ (unsigned long) mc_intel->bits >> 16 >> 16);
+ wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* see notes above for revision 1.07. Apparent chip bug */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+ spin_unlock_irqrestore(&microcode_update_lock, flags);
+ if (val[1] != mc_intel->hdr.rev) {
+ printk(KERN_ERR "microcode: CPU%d update from revision "
+ "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
+ return;
+ }
+ printk(KERN_INFO "microcode: CPU%d updated from revision "
+ "0x%x to 0x%x, date = %04x-%02x-%02x \n",
+ cpu_num, uci->cpu_sig.rev, val[1],
+ mc_intel->hdr.date & 0xffff,
+ mc_intel->hdr.date >> 24,
+ (mc_intel->hdr.date >> 16) & 0xff);
+ uci->cpu_sig.rev = val[1];
+}
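
The date field printed above is BCD-packed as MMDDYYYY (month in the top
byte, day in the next, year in the low 16 bits), which is why plain %x
conversions produce a readable date. A standalone decoder for illustration:

    #include <stdio.h>
    #include <stdint.h>

    static void print_ucode_date(uint32_t date)
    {
        printf("%04x-%02x-%02x\n",
               date & 0xffff,        /* year  */
               date >> 24,           /* month */
               (date >> 16) & 0xff); /* day   */
    }

    int main(void)
    {
        print_ucode_date(0x04282008); /* prints 2008-04-28 */
        return 0;
    }
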
+
+static int generic_load_microcode(int cpu, void *data, size_t size,
+ int (*get_ucode_data)(void *, const void *, size_t))
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+ int new_rev = uci->cpu_sig.rev;
+ unsigned int leftover = size;
+
+ while (leftover) {
+ struct microcode_header_intel mc_header;
+ unsigned int mc_size;
+
+ if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
+ break;
+
+ mc_size = get_totalsize(&mc_header);
+ if (!mc_size || mc_size > leftover) {
+ printk(KERN_ERR "microcode: error!"
+ "Bad data in microcode data file\n");
+ break;
+ }
+
+ mc = vmalloc(mc_size);
+ if (!mc)
+ break;
+
+ if (get_ucode_data(mc, ucode_ptr, mc_size) ||
+ microcode_sanity_check(mc) < 0) {
+ vfree(mc);
+ break;
+ }
+
+ if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
+ if (new_mc)
+ vfree(new_mc);
+ new_rev = mc_header.rev;
+ new_mc = mc;
+ } else
+ vfree(mc);
+
+ ucode_ptr += mc_size;
+ leftover -= mc_size;
+ }
+
+ if (new_mc) {
+ if (!leftover) {
+ if (uci->mc)
+ vfree(uci->mc);
+ uci->mc = (struct microcode_intel *)new_mc;
+ pr_debug("microcode: CPU%d found a matching microcode update with"
+ " version 0x%x (current=0x%x)\n",
+ cpu, new_rev, uci->cpu_sig.rev);
+ } else
+ vfree(new_mc);
+ }
+
+ return (int)leftover;
+}
+
+static int get_ucode_fw(void *to, const void *from, size_t n)
+{
+ memcpy(to, from, n);
+ return 0;
+}
+
+static int request_microcode_fw(int cpu, struct device *device)
+{
+ char name[30];
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ const struct firmware *firmware;
+ int ret;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu != raw_smp_processor_id());
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_mask);
+ ret = request_firmware(&firmware, name, device);
+ if (ret) {
+ pr_debug("microcode: data file %s load failed\n", name);
+ return ret;
+ }
+
+ ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
+ &get_ucode_fw);
+
+ release_firmware(firmware);
+
+ return ret;
+}
+
+static int get_ucode_user(void *to, const void *from, size_t n)
+{
+ return copy_from_user(to, from, n);
+}
+
+static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu != raw_smp_processor_id());
+
+ return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
+}
+
+static void microcode_fini_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ vfree(uci->mc);
+ uci->mc = NULL;
+}
+
+struct microcode_ops microcode_intel_ops = {
+ .request_microcode_user = request_microcode_user,
+ .request_microcode_fw = request_microcode_fw,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode,
+ .microcode_fini_cpu = microcode_fini_cpu,
+};
+
+struct microcode_ops * __init init_intel_microcode(void)
+{
+ return &microcode_intel_ops;
+}
+
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b3fb430725c..f98f4e1dba0 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -397,7 +397,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
generic_bigsmp_probe();
#endif
+#ifdef CONFIG_X86_32
setup_apic_routing();
+#endif
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 2e2af5d1819..82a7c7ed6d4 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -163,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu)
{
struct device *dev;
- dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu),
- NULL, "msr%d", cpu);
+ dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
+ "msr%d", cpu);
return IS_ERR(dev) ? PTR_ERR(dev) : 0;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index abb78a2cc4a..2c97f07f1c2 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -299,6 +299,15 @@ void acpi_nmi_disable(void)
on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
+/*
+ * This function is called as soon as the LAPIC NMI watchdog driver has everything
+ * in place and it's ready to check if the NMIs belong to the NMI watchdog
+ */
+void cpu_nmi_set_wd_enabled(void)
+{
+ __get_cpu_var(wd_enabled) = 1;
+}
+
void setup_apic_nmi_watchdog(void *unused)
{
if (__get_cpu_var(wd_enabled))
@@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
switch (nmi_watchdog) {
case NMI_LOCAL_APIC:
- /* enable it before to avoid race with handler */
- __get_cpu_var(wd_enabled) = 1;
if (lapic_watchdog_init(nmi_hz) < 0) {
__get_cpu_var(wd_enabled) = 0;
return;
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index eecc8c18f01..4caff39078e 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -229,6 +229,12 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
}
}
+static int __init numaq_setup_ioapic_ids(void)
+{
+ /* NUMAQ sets up the IO-APIC IDs itself, so the generic code can skip it */
+ return 1;
+}
+
static struct x86_quirks numaq_x86_quirks __initdata = {
.arch_pre_time_init = numaq_pre_time_init,
.arch_time_init = NULL,
@@ -243,6 +249,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
.mpc_oem_bus_info = mpc_oem_bus_info,
.mpc_oem_pci_bus = mpc_oem_pci_bus,
.smp_read_mpc_oem = smp_read_mpc_oem,
+ .setup_ioapic_ids = numaq_setup_ioapic_ids,
};
void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e667227480..7a13fac63a1 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
static void __init platform_detect(void)
{
size_t propsize;
- u32 rev;
+ __be32 rev;
if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
&propsize) || propsize != 4) {
printk(KERN_ERR "ofw: getprop call failed!\n");
- rev = 0;
+ rev = cpu_to_be32(0);
}
olpc_platform_info.boardrev = be32_to_cpu(rev);
}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
static void __init platform_detect(void)
{
/* stopgap until OFW support is added to the kernel */
- olpc_platform_info.boardrev = be32_to_cpu(0xc2);
+ olpc_platform_info.boardrev = 0xc2;
}
#endif
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 00000000000..0e9f1982b1d
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
+/*
+ * Split spinlock implementation out into its own file, so it can be
+ * compiled in a FTRACE-compatible way.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <asm/paravirt.h>
+
+static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
+{
+ __raw_spin_lock(lock);
+}
+
+struct pv_lock_ops pv_lock_ops = {
+#ifdef CONFIG_SMP
+ .spin_is_locked = __ticket_spin_is_locked,
+ .spin_is_contended = __ticket_spin_is_contended,
+
+ .spin_lock = __ticket_spin_lock,
+ .spin_lock_flags = default_spin_lock_flags,
+ .spin_trylock = __ticket_spin_trylock,
+ .spin_unlock = __ticket_spin_unlock,
+#endif
+};
+EXPORT_SYMBOL(pv_lock_ops);
+
+void __init paravirt_use_bytelocks(void)
+{
+#ifdef CONFIG_SMP
+ pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
+ pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
+ pv_lock_ops.spin_lock = __byte_spin_lock;
+ pv_lock_ops.spin_trylock = __byte_spin_trylock;
+ pv_lock_ops.spin_unlock = __byte_spin_unlock;
+#endif
+}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 300da17e61c..e4c8fb60887 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
return __get_cpu_var(paravirt_lazy_mode);
}
-void __init paravirt_use_bytelocks(void)
-{
-#ifdef CONFIG_SMP
- pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
- pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
- pv_lock_ops.spin_lock = __byte_spin_lock;
- pv_lock_ops.spin_trylock = __byte_spin_trylock;
- pv_lock_ops.spin_unlock = __byte_spin_unlock;
-#endif
-}
-
struct pv_info pv_info = {
.name = "bare hardware",
.paravirt_enabled = 0,
@@ -330,6 +319,7 @@ struct pv_cpu_ops pv_cpu_ops = {
#endif
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
+ .read_msr_amd = native_read_msr_amd_safe,
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
@@ -348,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = {
.write_ldt_entry = native_write_ldt_entry,
.write_gdt_entry = native_write_gdt_entry,
.write_idt_entry = native_write_idt_entry,
+
+ .alloc_ldt = paravirt_nop,
+ .free_ldt = paravirt_nop,
+
.load_sp0 = native_load_sp0,
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -373,8 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = {
struct pv_apic_ops pv_apic_ops = {
#ifdef CONFIG_X86_LOCAL_APIC
- .apic_write = native_apic_write,
- .apic_read = native_apic_read,
.setup_boot_clock = setup_boot_APIC_clock,
.setup_secondary_clock = setup_secondary_APIC_clock,
.startup_ipi_hook = paravirt_nop,
@@ -461,18 +453,6 @@ struct pv_mmu_ops pv_mmu_ops = {
.set_fixmap = native_set_fixmap,
};
-struct pv_lock_ops pv_lock_ops = {
-#ifdef CONFIG_SMP
- .spin_is_locked = __ticket_spin_is_locked,
- .spin_is_contended = __ticket_spin_is_contended,
-
- .spin_lock = __ticket_spin_lock,
- .spin_trylock = __ticket_spin_trylock,
- .spin_unlock = __ticket_spin_unlock,
-#endif
-};
-EXPORT_SYMBOL(pv_lock_ops);
-
EXPORT_SYMBOL_GPL(pv_time_ops);
EXPORT_SYMBOL (pv_cpu_ops);
EXPORT_SYMBOL (pv_mmu_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781..9fe644f4861 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
start = start_##ops##_##x; \
end = end_##ops##_##x; \
goto patch_site
- switch(type) {
+ switch (type) {
PATCH_SITE(pv_irq_ops, irq_disable);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index dcdac6c826e..e1e731d78f3 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -217,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
#endif /* CONFIG_IOMMU_DEBUG */
-static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
-{
- unsigned int npages;
-
- npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
- npages >>= PAGE_SHIFT;
-
- return npages;
-}
-
static inline int translation_enabled(struct iommu_table *tbl)
{
/* only PHBs with translation enabled have an IOMMU table */
@@ -261,7 +251,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
badbit, tbl, start_addr, npages);
}
- set_bit_string(tbl->it_map, index, npages);
+ iommu_area_reserve(tbl->it_map, index, npages);
spin_unlock_irqrestore(&tbl->it_lock, flags);
}
@@ -408,7 +398,7 @@ static void calgary_unmap_sg(struct device *dev,
if (dmalen == 0)
break;
- npages = num_dma_pages(dma, dmalen);
+ npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
iommu_free(tbl, dma, npages);
}
}
@@ -427,7 +417,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
BUG_ON(!sg_page(s));
vaddr = (unsigned long) sg_virt(s);
- npages = num_dma_pages(vaddr, s->length);
+ npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
entry = iommu_range_alloc(dev, tbl, npages);
if (entry == bad_dma_address) {
@@ -464,7 +454,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
struct iommu_table *tbl = find_iommu_table(dev);
uaddr = (unsigned long)vaddr;
- npages = num_dma_pages(uaddr, size);
+ npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
return iommu_alloc(dev, tbl, vaddr, npages, direction);
}
@@ -475,7 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
struct iommu_table *tbl = find_iommu_table(dev);
unsigned int npages;
- npages = num_dma_pages(dma_handle, size);
+ npages = iommu_num_pages(dma_handle, size, PAGE_SIZE);
iommu_free(tbl, dma_handle, npages);
}
@@ -491,6 +481,8 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
npages = size >> PAGE_SHIFT;
order = get_order(size);
+ flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
/* alloc enough pages (and possibly more) */
ret = (void *)__get_free_pages(flag, order);
if (!ret)
@@ -510,8 +502,22 @@ error:
return ret;
}
+static void calgary_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle)
+{
+ unsigned int npages;
+ struct iommu_table *tbl = find_iommu_table(dev);
+
+ size = PAGE_ALIGN(size);
+ npages = size >> PAGE_SHIFT;
+
+ iommu_free(tbl, dma_handle, npages);
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
static struct dma_mapping_ops calgary_dma_ops = {
.alloc_coherent = calgary_alloc_coherent,
+ .free_coherent = calgary_free_coherent,
.map_single = calgary_map_single,
.unmap_single = calgary_unmap_single,
.map_sg = calgary_map_sg,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 87d4d6964ec..1972266e8ba 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -9,8 +9,6 @@
#include <asm/calgary.h>
#include <asm/amd_iommu.h>
-static int forbid_dac __read_mostly;
-
struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
@@ -41,11 +39,12 @@ EXPORT_SYMBOL(bad_dma_address);
/* Dummy device used for NULL arguments (normally ISA). Better would
be probably a smaller DMA mask, but this is bug-to-bug compatible
to older i386. */
-struct device fallback_dev = {
+struct device x86_dma_fallback_dev = {
.bus_id = "fallback device",
.coherent_dma_mask = DMA_32BIT_MASK,
- .dma_mask = &fallback_dev.coherent_dma_mask,
+ .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
};
+EXPORT_SYMBOL(x86_dma_fallback_dev);
int dma_set_mask(struct device *dev, u64 mask)
{
@@ -82,7 +81,7 @@ void __init dma32_reserve_bootmem(void)
* using 512M as goal
*/
align = 64ULL<<20;
- size = round_up(dma32_bootmem_size, align);
+ size = roundup(dma32_bootmem_size, align);
dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
512ULL<<20);
if (dma32_bootmem_ptr)
@@ -124,15 +123,46 @@ void __init pci_iommu_alloc(void)
pci_swiotlb_init();
}
-unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
+unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
{
unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
return size >> PAGE_SHIFT;
}
-EXPORT_SYMBOL(iommu_num_pages);
+EXPORT_SYMBOL(iommu_nr_pages);
#endif
+void *dma_generic_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_addr, gfp_t flag)
+{
+ unsigned long dma_mask;
+ struct page *page;
+ dma_addr_t addr;
+
+ dma_mask = dma_alloc_coherent_mask(dev, flag);
+
+ flag |= __GFP_ZERO;
+again:
+ page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
+ if (!page)
+ return NULL;
+
+ addr = page_to_phys(page);
+ if (!is_buffer_dma_capable(dma_mask, addr, size)) {
+ __free_pages(page, get_order(size));
+
+ if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
+ flag = (flag & ~GFP_DMA32) | GFP_DMA;
+ goto again;
+ }
+
+ return NULL;
+ }
+
+ *dma_addr = addr;
+ return page_address(page);
+}
+
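
The capability test used above reduces, by the way the driver uses it, to a
range check: the whole buffer must lie at or below the device's DMA mask.
A sketch of those assumed semantics:

    #include <stdint.h>
    #include <stddef.h>

    static int buffer_dma_capable(uint64_t mask, uint64_t addr, size_t size)
    {
        /* the entire [addr, addr + size) range must be addressable */
        return addr + size <= mask;
    }

The retry path then swaps GFP_DMA32 for GFP_DMA exactly once, falling back
to the 16MB zone only when the mask is narrower than 32 bits.
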
/*
* See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
* documentation.
@@ -241,147 +271,6 @@ int dma_supported(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_supported);
-/* Allocate DMA memory on node near device */
-static noinline struct page *
-dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
-{
- int node;
-
- node = dev_to_node(dev);
-
- return alloc_pages_node(node, gfp, order);
-}
-
-/*
- * Allocate memory for a coherent mapping.
- */
-void *
-dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t gfp)
-{
- struct dma_mapping_ops *ops = get_dma_ops(dev);
- void *memory = NULL;
- struct page *page;
- unsigned long dma_mask = 0;
- dma_addr_t bus;
- int noretry = 0;
-
- /* ignore region specifiers */
- gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
- if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
- return memory;
-
- if (!dev) {
- dev = &fallback_dev;
- gfp |= GFP_DMA;
- }
- dma_mask = dev->coherent_dma_mask;
- if (dma_mask == 0)
- dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
-
- /* Device not DMA able */
- if (dev->dma_mask == NULL)
- return NULL;
-
- /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
- if (gfp & __GFP_DMA)
- noretry = 1;
-
-#ifdef CONFIG_X86_64
- /* Why <=? Even when the mask is smaller than 4GB it is often
- larger than 16MB and in this case we have a chance of
- finding fitting memory in the next higher zone first. If
- not retry with true GFP_DMA. -AK */
- if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
- gfp |= GFP_DMA32;
- if (dma_mask < DMA_32BIT_MASK)
- noretry = 1;
- }
-#endif
-
- again:
- page = dma_alloc_pages(dev,
- noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
- if (page == NULL)
- return NULL;
-
- {
- int high, mmu;
- bus = page_to_phys(page);
- memory = page_address(page);
- high = (bus + size) >= dma_mask;
- mmu = high;
- if (force_iommu && !(gfp & GFP_DMA))
- mmu = 1;
- else if (high) {
- free_pages((unsigned long)memory,
- get_order(size));
-
- /* Don't use the 16MB ZONE_DMA unless absolutely
- needed. It's better to use remapping first. */
- if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
- gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
- goto again;
- }
-
- /* Let low level make its own zone decisions */
- gfp &= ~(GFP_DMA32|GFP_DMA);
-
- if (ops->alloc_coherent)
- return ops->alloc_coherent(dev, size,
- dma_handle, gfp);
- return NULL;
- }
-
- memset(memory, 0, size);
- if (!mmu) {
- *dma_handle = bus;
- return memory;
- }
- }
-
- if (ops->alloc_coherent) {
- free_pages((unsigned long)memory, get_order(size));
- gfp &= ~(GFP_DMA|GFP_DMA32);
- return ops->alloc_coherent(dev, size, dma_handle, gfp);
- }
-
- if (ops->map_simple) {
- *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
- size,
- PCI_DMA_BIDIRECTIONAL);
- if (*dma_handle != bad_dma_address)
- return memory;
- }
-
- if (panic_on_overflow)
- panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
- (unsigned long)size);
- free_pages((unsigned long)memory, get_order(size));
- return NULL;
-}
-EXPORT_SYMBOL(dma_alloc_coherent);
-
-/*
- * Unmap coherent memory.
- * The caller must ensure that the device has finished accessing the mapping.
- */
-void dma_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t bus)
-{
- struct dma_mapping_ops *ops = get_dma_ops(dev);
-
- int order = get_order(size);
- WARN_ON(irqs_disabled()); /* for portability */
- if (dma_release_from_coherent(dev, order, vaddr))
- return;
- if (ops->unmap_single)
- ops->unmap_single(dev, bus, size, 0);
- free_pages((unsigned long)vaddr, order);
-}
-EXPORT_SYMBOL(dma_free_coherent);
-
static int __init pci_iommu_init(void)
{
calgary_iommu_init();
@@ -402,17 +291,3 @@ void pci_iommu_shutdown(void)
}
/* Must execute after PCI subsystem */
fs_initcall(pci_iommu_init);
-
-#ifdef CONFIG_PCI
-/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-
-static __devinit void via_no_dac(struct pci_dev *dev)
-{
- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
- printk(KERN_INFO "PCI: VIA PCI bridge detected."
- "Disabling DAC.\n");
- forbid_dac = 1;
- }
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
-#endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 49285f8fd4d..e3f75bbcede 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,8 +27,8 @@
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
#include <linux/sysdev.h>
+#include <linux/io.h>
#include <asm/atomic.h>
-#include <asm/io.h>
#include <asm/mtrr.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
@@ -80,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
AGPEXTERN __u32 *agp_gatt_table;
static unsigned long next_bit; /* protected by iommu_bitmap_lock */
-static int need_flush; /* global flush state. set for each gart wrap */
+static bool need_flush; /* global flush state. set for each gart wrap */
-static unsigned long alloc_iommu(struct device *dev, int size)
+static unsigned long alloc_iommu(struct device *dev, int size,
+ unsigned long align_mask)
{
unsigned long offset, flags;
unsigned long boundary_size;
@@ -90,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
PAGE_SIZE) >> PAGE_SHIFT;
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
- size, base_index, boundary_size, 0);
+ size, base_index, boundary_size, align_mask);
if (offset == -1) {
- need_flush = 1;
+ need_flush = true;
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
- size, base_index, boundary_size, 0);
+ size, base_index, boundary_size,
+ align_mask);
}
if (offset != -1) {
next_bit = offset+size;
if (next_bit >= iommu_pages) {
next_bit = 0;
- need_flush = 1;
+ need_flush = true;
}
}
if (iommu_fullflush)
- need_flush = 1;
+ need_flush = true;
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
return offset;
@@ -134,7 +136,7 @@ static void flush_gart(void)
spin_lock_irqsave(&iommu_bitmap_lock, flags);
if (need_flush) {
k8_flush_garts();
- need_flush = 0;
+ need_flush = false;
}
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
}
@@ -173,7 +175,8 @@ static void dump_leak(void)
iommu_leak_pages);
for (i = 0; i < iommu_leak_pages; i += 2) {
printk(KERN_DEBUG "%lu: ", iommu_pages-i);
- printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0);
+ printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
+ 0);
printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
}
printk(KERN_DEBUG "\n");
@@ -212,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
static inline int
need_iommu(struct device *dev, unsigned long addr, size_t size)
{
- u64 mask = *dev->dma_mask;
- int high = addr + size > mask;
- int mmu = high;
-
- if (force_iommu)
- mmu = 1;
-
- return mmu;
+ return force_iommu ||
+ !is_buffer_dma_capable(*dev->dma_mask, addr, size);
}
static inline int
nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
- u64 mask = *dev->dma_mask;
- int high = addr + size > mask;
- int mmu = high;
-
- return mmu;
+ return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
}
/* Map a single continuous physical area into the IOMMU.
* Caller needs to check if the iommu is needed and flush.
*/
static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
- size_t size, int dir)
+ size_t size, int dir, unsigned long align_mask)
{
- unsigned long npages = iommu_num_pages(phys_mem, size);
- unsigned long iommu_page = alloc_iommu(dev, npages);
+ unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
+ unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
int i;
if (iommu_page == -1) {
@@ -259,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
}
-static dma_addr_t
-gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
-{
- dma_addr_t map = dma_map_area(dev, paddr, size, dir);
-
- flush_gart();
-
- return map;
-}
-
/* Map a single area into the IOMMU */
static dma_addr_t
gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -276,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
unsigned long bus;
if (!dev)
- dev = &fallback_dev;
+ dev = &x86_dma_fallback_dev;
if (!need_iommu(dev, paddr, size))
return paddr;
- bus = gart_map_simple(dev, paddr, size, dir);
+ bus = dma_map_area(dev, paddr, size, dir, 0);
+ flush_gart();
return bus;
}
@@ -301,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
return;
iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
- npages = iommu_num_pages(dma_addr, size);
+ npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
CLEAR_LEAK(iommu_page + i);
@@ -340,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
unsigned long addr = sg_phys(s);
if (nonforced_iommu(dev, addr, s->length)) {
- addr = dma_map_area(dev, addr, s->length, dir);
+ addr = dma_map_area(dev, addr, s->length, dir, 0);
if (addr == bad_dma_address) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir);
@@ -362,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
int nelems, struct scatterlist *sout,
unsigned long pages)
{
- unsigned long iommu_start = alloc_iommu(dev, pages);
+ unsigned long iommu_start = alloc_iommu(dev, pages, 0);
unsigned long iommu_page = iommu_start;
struct scatterlist *s;
int i;
@@ -384,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
}
addr = phys_addr;
- pages = iommu_num_pages(s->offset, s->length);
+ pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
while (pages--) {
iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
SET_LEAK(iommu_page);
@@ -427,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
return 0;
if (!dev)
- dev = &fallback_dev;
+ dev = &x86_dma_fallback_dev;
out = 0;
start = 0;
@@ -467,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
seg_size += s->length;
need = nextneed;
- pages += iommu_num_pages(s->offset, s->length);
+ pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
ps = s;
}
if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
@@ -499,6 +483,46 @@ error:
return 0;
}
+/* allocate and map a coherent mapping */
+static void *
+gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
+ gfp_t flag)
+{
+ dma_addr_t paddr;
+ unsigned long align_mask;
+ struct page *page;
+
+ if (force_iommu && !(flag & GFP_DMA)) {
+ flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+ page = alloc_pages(flag | __GFP_ZERO, get_order(size));
+ if (!page)
+ return NULL;
+
+ align_mask = (1UL << get_order(size)) - 1;
+ paddr = dma_map_area(dev, page_to_phys(page), size,
+ DMA_BIDIRECTIONAL, align_mask);
+
+ flush_gart();
+ if (paddr != bad_dma_address) {
+ *dma_addr = paddr;
+ return page_address(page);
+ }
+ __free_pages(page, get_order(size));
+ } else
+ return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
+
+ return NULL;
+}
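
The align_mask computed above, (1UL << get_order(size)) - 1, is a mask in
units of IOMMU pages: it forces the mapping to be naturally aligned to its
own (order-rounded) size. A worked example for a 16KB request with 4KB
pages:

    #include <stdio.h>

    int main(void)
    {
        unsigned int order = 2;                        /* 16KB = 4 pages */
        unsigned long align_mask = (1UL << order) - 1; /* 0x3 */

        /* iommu_area_alloc() may then only return start indices that
         * are multiples of align_mask + 1 = 4 pages */
        printf("align_mask=%#lx\n", align_mask);
        return 0;
    }
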
+
+/* free a coherent mapping */
+static void
+gart_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr)
+{
+ gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
static int no_agp;
static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -626,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
struct pci_dev *dev;
void *gatt;
int i, error;
- unsigned long start_pfn, end_pfn;
printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
aper_size = aper_base = info->aper_size = 0;
@@ -650,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
info->aper_size = aper_size >> 20;
gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
- gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
+ gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(gatt_size));
if (!gatt)
panic("Cannot allocate GATT table");
if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
panic("Could not set GART PTEs to uncacheable pages");
- memset(gatt, 0, gatt_size);
agp_gatt_table = gatt;
enable_gart_translations();
@@ -665,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
if (!error)
error = sysdev_register(&device_gart);
if (error)
- panic("Could not register gart_sysdev -- would corrupt data on next suspend");
+ panic("Could not register gart_sysdev -- "
+ "would corrupt data on next suspend");
flush_gart();
printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
aper_base, aper_size>>10);
- /* need to map that range */
- end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
- if (end_pfn > max_low_pfn_mapped) {
- start_pfn = (aper_base>>PAGE_SHIFT);
- init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
- }
return 0;
nommu:
@@ -687,20 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
return -1;
}
-extern int agp_amd64_init(void);
-
static struct dma_mapping_ops gart_dma_ops = {
.map_single = gart_map_single,
- .map_simple = gart_map_simple,
.unmap_single = gart_unmap_single,
- .sync_single_for_cpu = NULL,
- .sync_single_for_device = NULL,
- .sync_single_range_for_cpu = NULL,
- .sync_single_range_for_device = NULL,
- .sync_sg_for_cpu = NULL,
- .sync_sg_for_device = NULL,
.map_sg = gart_map_sg,
.unmap_sg = gart_unmap_sg,
+ .alloc_coherent = gart_alloc_coherent,
+ .free_coherent = gart_free_coherent,
};
void gart_iommu_shutdown(void)
@@ -727,7 +738,8 @@ void __init gart_iommu_init(void)
{
struct agp_kern_info info;
unsigned long iommu_start;
- unsigned long aper_size;
+ unsigned long aper_base, aper_size;
+ unsigned long start_pfn, end_pfn;
unsigned long scratch;
long i;
@@ -759,30 +771,35 @@ void __init gart_iommu_init(void)
(no_agp && init_k8_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
printk(KERN_WARNING "More than 4GB of memory "
- "but GART IOMMU not available.\n"
- KERN_WARNING "falling back to iommu=soft.\n");
+ "but GART IOMMU not available.\n");
+ printk(KERN_WARNING "falling back to iommu=soft.\n");
}
return;
}
+ /* need to map that range */
+ aper_size = info.aper_size << 20;
+ aper_base = info.aper_base;
+ end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+ if (end_pfn > max_low_pfn_mapped) {
+ start_pfn = (aper_base>>PAGE_SHIFT);
+ init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ }
+
printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
- aper_size = info.aper_size * 1024 * 1024;
iommu_size = check_iommu_size(info.aper_base, aper_size);
iommu_pages = iommu_size >> PAGE_SHIFT;
- iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL,
+ iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(iommu_pages/8));
if (!iommu_gart_bitmap)
panic("Cannot allocate iommu bitmap\n");
- memset(iommu_gart_bitmap, 0, iommu_pages/8);
#ifdef CONFIG_IOMMU_LEAK
if (leak_trace) {
- iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
+ iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
get_order(iommu_pages*sizeof(void *)));
- if (iommu_leak_tab)
- memset(iommu_leak_tab, 0, iommu_pages * 8);
- else
+ if (!iommu_leak_tab)
printk(KERN_DEBUG
"PCI-DMA: Cannot allocate leak trace area\n");
}
@@ -792,7 +809,7 @@ void __init gart_iommu_init(void)
* Out of IOMMU space handling.
* Reserve some invalid pages at the beginning of the GART.
*/
- set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
+ iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
agp_memory_reserved = iommu_size;
printk(KERN_INFO
@@ -850,7 +867,8 @@ void __init gart_parse_options(char *p)
if (!strncmp(p, "leak", 4)) {
leak_trace = 1;
p += 4;
- if (*p == '=') ++p;
+ if (*p == '=')
+ ++p;
if (isdigit(*p) && get_option(&p, &arg))
iommu_leak_pages = arg;
}
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3f91f71cdc3..c70ab5a5d4c 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
static int
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
{
- if (hwdev && bus + size > *hwdev->dma_mask) {
+ if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
if (*hwdev->dma_mask >= DMA_32BIT_MASK)
printk(KERN_ERR
"nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,7 +72,15 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
return nents;
}
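The check_addr() change in the hunk above leans on is_buffer_dma_capable(); as a rough sketch (the 2.6.28-era linux/dma-mapping.h helper, reproduced from memory), it is just a bounds test against the device's DMA mask:

static inline int is_buffer_dma_capable(u64 mask, dma_addr_t addr, size_t size)
{
	/* the whole buffer must sit below the device's addressable limit */
	return addr + size <= mask;
}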
+static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr)
+{
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
struct dma_mapping_ops nommu_dma_ops = {
+ .alloc_coherent = dma_generic_alloc_coherent,
+ .free_coherent = nommu_free_coherent,
.map_single = nommu_map_single,
.map_sg = nommu_map_sg,
.is_phys = 1,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea27..a311ffcaad1 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
#include <linux/platform_device.h>
-#include <linux/errno.h>
+#include <linux/err.h>
#include <linux/init.h>
static __init int add_pcspkr(void)
{
struct platform_device *pd;
- int ret;
- pd = platform_device_alloc("pcspkr", -1);
- if (!pd)
- return -ENOMEM;
+ pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
- ret = platform_device_add(pd);
- if (ret)
- platform_device_put(pd);
-
- return ret;
+ return IS_ERR(pd) ? PTR_ERR(pd) : 0;
}
device_initcall(add_pcspkr);
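The rewrite works because platform_device_register_simple() reports failure through an ERR_PTR-encoded pointer rather than NULL. A minimal sketch of that encoding, along the lines of linux/err.h:

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;	/* negative errno stored in the top page */
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline long IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}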
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7fc4d5b0a6a..c622772744d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,7 +15,6 @@ unsigned long idle_nomwait;
EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
-static int force_mwait __cpuinitdata;
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
@@ -185,7 +184,8 @@ static void mwait_idle(void)
static void poll_idle(void)
{
local_irq_enable();
- cpu_relax();
+ while (!need_resched())
+ cpu_relax();
}
/*
@@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
return 1;
}
+static cpumask_t c1e_mask = CPU_MASK_NONE;
+static int c1e_detected;
+
+void c1e_remove_cpu(int cpu)
+{
+ cpu_clear(cpu, c1e_mask);
+}
+
/*
* C1E aware idle routine. We check for C1E active in the interrupt
* pending message MSR. If we detect C1E, then we handle it the same
@@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
*/
static void c1e_idle(void)
{
- static cpumask_t c1e_mask = CPU_MASK_NONE;
- static int c1e_detected;
-
if (need_resched())
return;
@@ -265,8 +270,10 @@ static void c1e_idle(void)
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
c1e_detected = 1;
- mark_tsc_unstable("TSC halt in C1E");
- printk(KERN_INFO "System has C1E enabled\n");
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+ printk(KERN_INFO "System has AMD C1E enabled\n");
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
}
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 3b7a1ddcc0b..0a1302fe6d4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
#include <linux/tick.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
+#include <linux/dmi.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -55,6 +56,9 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
#include <asm/kdebug.h>
+#include <asm/idle.h>
+#include <asm/syscalls.h>
+#include <asm/smp.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -72,46 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
return ((unsigned long *)tsk->thread.sp)[3];
}
-#ifdef CONFIG_HOTPLUG_CPU
-#include <asm/nmi.h>
-
-static void cpu_exit_clear(void)
-{
- int cpu = raw_smp_processor_id();
-
- idle_task_exit();
-
- cpu_uninit();
- irq_ctx_exit(cpu);
-
- cpu_clear(cpu, cpu_callout_map);
- cpu_clear(cpu, cpu_callin_map);
-
- numa_remove_cpu(cpu);
-}
-
-/* We don't actually take CPU down, just spin without interrupts. */
-static inline void play_dead(void)
-{
- /* This must be done before dead CPU ack */
- cpu_exit_clear();
- mb();
- /* Ack it */
- __get_cpu_var(cpu_state) = CPU_DEAD;
-
- /*
- * With physical CPU hotplug, we should halt the cpu
- */
- local_irq_disable();
- /* mask all interrupts, flush any and all caches, and halt */
- wbinvd_halt();
-}
-#else
+#ifndef CONFIG_SMP
static inline void play_dead(void)
{
BUG();
}
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif
/*
* The idle thread. There's no useful work to be
@@ -153,12 +123,13 @@ void cpu_idle(void)
}
}
-void __show_registers(struct pt_regs *regs, int all)
+void __show_regs(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
unsigned long d0, d1, d2, d3, d6, d7;
unsigned long sp;
unsigned short ss, gs;
+ const char *board;
if (user_mode_vm(regs)) {
sp = regs->sp;
@@ -171,11 +142,15 @@ void __show_registers(struct pt_regs *regs, int all)
}
printk("\n");
- printk("Pid: %d, comm: %s %s (%s %.*s)\n",
+
+ board = dmi_get_system_info(DMI_PRODUCT_NAME);
+ if (!board)
+ board = "";
+ printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
task_pid_nr(current), current->comm,
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
+ init_utsname()->version, board);
printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
(u16)regs->cs, regs->ip, regs->flags,
@@ -214,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all)
void show_regs(struct pt_regs *regs)
{
- __show_registers(regs, 1);
+ __show_regs(regs, 1);
show_trace(NULL, regs, &regs->sp, regs->bp);
}
@@ -275,6 +250,14 @@ void exit_thread(void)
tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
put_cpu();
}
+#ifdef CONFIG_X86_DS
+ /* Free any DS contexts that have not been properly released. */
+ if (unlikely(current->thread.ds_ctx)) {
+ /* we clear debugctl to make sure DS is not used. */
+ update_debugctlmsr(0);
+ ds_free(current->thread.ds_ctx);
+ }
+#endif /* CONFIG_X86_DS */
}
void flush_thread(void)
@@ -436,6 +419,35 @@ int set_tsc_mode(unsigned int val)
return 0;
}
+#ifdef CONFIG_X86_DS
+static int update_debugctl(struct thread_struct *prev,
+ struct thread_struct *next, unsigned long debugctl)
+{
+ unsigned long ds_prev = 0;
+ unsigned long ds_next = 0;
+
+ if (prev->ds_ctx)
+ ds_prev = (unsigned long)prev->ds_ctx->ds;
+ if (next->ds_ctx)
+ ds_next = (unsigned long)next->ds_ctx->ds;
+
+ if (ds_next != ds_prev) {
+ /* we clear debugctl to make sure DS
+ * is not in use when we change it */
+ debugctl = 0;
+ update_debugctlmsr(0);
+ wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
+ }
+ return debugctl;
+}
+#else
+static int update_debugctl(struct thread_struct *prev,
+ struct thread_struct *next, unsigned long debugctl)
+{
+ return debugctl;
+}
+#endif /* CONFIG_X86_DS */
+
static noinline void
__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
@@ -446,14 +458,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
prev = &prev_p->thread;
next = &next_p->thread;
- debugctl = prev->debugctlmsr;
- if (next->ds_area_msr != prev->ds_area_msr) {
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
- }
+ debugctl = update_debugctl(prev, next, prev->debugctlmsr);
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
@@ -477,13 +482,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
hard_enable_TSC();
}
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 71553b664e2..c958120fb1b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
-#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
-#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
@@ -51,6 +51,7 @@
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
+#include <asm/syscalls.h>
asmlinkage extern void ret_from_fork(void);
@@ -62,6 +63,13 @@ void idle_notifier_register(struct notifier_block *n)
{
atomic_notifier_chain_register(&idle_notifier, n);
}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
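With the unregister path now exported, users can pair the two calls. A hedged usage sketch -- my_idle_cb/my_idle_nb are hypothetical names; IDLE_START/IDLE_END are the events enter_idle()/exit_idle() post through this chain:

static int my_idle_cb(struct notifier_block *nb, unsigned long action,
		      void *unused)
{
	/* action is IDLE_START on enter_idle(), IDLE_END on exit_idle() */
	return NOTIFY_OK;
}

static struct notifier_block my_idle_nb = {
	.notifier_call = my_idle_cb,
};

/* idle_notifier_register(&my_idle_nb); ... idle_notifier_unregister(&my_idle_nb); */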
void enter_idle(void)
{
@@ -85,28 +93,12 @@ void exit_idle(void)
__exit_idle();
}
-#ifdef CONFIG_HOTPLUG_CPU
-DECLARE_PER_CPU(int, cpu_state);
-
-#include <asm/nmi.h>
-/* We halt the CPU with physical CPU hotplug */
-static inline void play_dead(void)
-{
- idle_task_exit();
- mb();
- /* Ack it */
- __get_cpu_var(cpu_state) = CPU_DEAD;
-
- local_irq_disable();
- /* mask all interrupts, flush any and all caches, and halt */
- wbinvd_halt();
-}
-#else
+#ifndef CONFIG_SMP
static inline void play_dead(void)
{
BUG();
}
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif
/*
* The idle thread. There's no useful work to be
@@ -151,7 +143,7 @@ void cpu_idle(void)
}
/* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs * regs)
+void __show_regs(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
unsigned long d0, d1, d2, d3, d6, d7;
@@ -160,60 +152,65 @@ void __show_regs(struct pt_regs * regs)
printk("\n");
print_modules();
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+ printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
printk_address(regs->ip, 1);
- printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
- regs->flags);
- printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
+ regs->sp, regs->flags);
+ printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->ax, regs->bx, regs->cx);
- printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
regs->dx, regs->si, regs->di);
- printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+ printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
regs->bp, regs->r8, regs->r9);
- printk("R10: %016lx R11: %016lx R12: %016lx\n",
- regs->r10, regs->r11, regs->r12);
- printk("R13: %016lx R14: %016lx R15: %016lx\n",
- regs->r13, regs->r14, regs->r15);
-
- asm("movl %%ds,%0" : "=r" (ds));
- asm("movl %%cs,%0" : "=r" (cs));
- asm("movl %%es,%0" : "=r" (es));
+ printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
+ regs->r10, regs->r11, regs->r12);
+ printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
+ regs->r13, regs->r14, regs->r15);
+
+ asm("movl %%ds,%0" : "=r" (ds));
+ asm("movl %%cs,%0" : "=r" (cs));
+ asm("movl %%es,%0" : "=r" (es));
asm("movl %%fs,%0" : "=r" (fsindex));
asm("movl %%gs,%0" : "=r" (gsindex));
rdmsrl(MSR_FS_BASE, fs);
- rdmsrl(MSR_GS_BASE, gs);
- rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ rdmsrl(MSR_GS_BASE, gs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+
+ if (!all)
+ return;
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
cr4 = read_cr4();
- printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
- fs,fsindex,gs,gsindex,shadowgs);
- printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
- printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+ printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ fs, fsindex, gs, gsindex, shadowgs);
+ printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+ es, cr0);
+ printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
+ cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
- printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+ printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
get_debugreg(d3, 3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
- printk("CPU %d:", smp_processor_id());
- __show_regs(regs);
+ printk(KERN_INFO "CPU %d:", smp_processor_id());
+ __show_regs(regs, 1);
show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
@@ -238,6 +235,14 @@ void exit_thread(void)
t->io_bitmap_max = 0;
put_cpu();
}
+#ifdef CONFIG_X86_DS
+ /* Free any DS contexts that have not been properly released. */
+ if (unlikely(t->ds_ctx)) {
+ /* we clear debugctl to make sure DS is not used. */
+ update_debugctlmsr(0);
+ ds_free(t->ds_ctx);
+ }
+#endif /* CONFIG_X86_DS */
}
void flush_thread(void)
@@ -313,10 +318,10 @@ void prepare_to_copy(struct task_struct *tsk)
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
unsigned long unused,
- struct task_struct * p, struct pt_regs * regs)
+ struct task_struct *p, struct pt_regs *regs)
{
int err;
- struct pt_regs * childregs;
+ struct pt_regs *childregs;
struct task_struct *me = current;
childregs = ((struct pt_regs *)
@@ -361,10 +366,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
if (test_thread_flag(TIF_IA32))
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);
- else
-#endif
- err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
- if (err)
+ else
+#endif
+ err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+ if (err)
goto out;
}
err = 0;
@@ -471,13 +476,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
next = &next_p->thread;
debugctl = prev->debugctlmsr;
- if (next->ds_area_msr != prev->ds_area_msr) {
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
+
+#ifdef CONFIG_X86_DS
+ {
+ unsigned long ds_prev = 0, ds_next = 0;
+
+ if (prev->ds_ctx)
+ ds_prev = (unsigned long)prev->ds_ctx->ds;
+ if (next->ds_ctx)
+ ds_next = (unsigned long)next->ds_ctx->ds;
+
+ if (ds_next != ds_prev) {
+ /*
+ * We clear debugctl to make sure DS
+ * is not in use when we change it:
+ */
+ debugctl = 0;
+ update_debugctlmsr(0);
+ wrmsrl(MSR_IA32_DS_AREA, ds_next);
+ }
}
+#endif /* CONFIG_X86_DS */
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
@@ -515,13 +534,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
}
/*
@@ -543,7 +562,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
unsigned fsindex, gsindex;
/* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter>5)
+ if (next_p->fpu_counter > 5)
prefetch(next->xstate);
/*
@@ -551,13 +570,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
load_sp0(tss, next);
- /*
+ /*
* Switch DS and ES.
* This won't pick up thread selector changes, but I guess that is ok.
*/
savesegment(es, prev->es);
if (unlikely(next->es | prev->es))
- loadsegment(es, next->es);
+ loadsegment(es, next->es);
savesegment(ds, prev->ds);
if (unlikely(next->ds | prev->ds))
@@ -583,7 +602,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_leave_lazy_cpu_mode();
- /*
+ /*
* Switch FS and GS.
*
* Segment register != 0 always requires a reload. Also
@@ -592,13 +611,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
- /*
+ /*
* Check if the user used a selector != 0; if yes
* clear 64bit base, since overloaded base is always
* mapped to the Null selector
*/
if (fsindex)
- prev->fs = 0;
+ prev->fs = 0;
}
/* when next process has a 64bit base use it */
if (next->fs)
@@ -608,7 +627,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
if (gsindex)
- prev->gs = 0;
+ prev->gs = 0;
}
if (next->gs)
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
@@ -617,12 +636,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Must be after DS reload */
unlazy_fpu(prev_p);
- /*
+ /*
* Switch the PDA and FPU contexts.
*/
prev->usersp = read_pda(oldrsp);
write_pda(oldrsp, next->usersp);
- write_pda(pcurrent, next_p);
+ write_pda(pcurrent, next_p);
write_pda(kernelstack,
(unsigned long)task_stack_page(next_p) +
@@ -663,7 +682,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
char __user * __user *envp, struct pt_regs *regs)
{
long error;
- char * filename;
+ char *filename;
filename = getname(name);
error = PTR_ERR(filename);
@@ -721,55 +740,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
- u64 fp,ip;
+ u64 fp, ip;
int count = 0;
- if (!p || p == current || p->state==TASK_RUNNING)
- return 0;
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
stack = (unsigned long)task_stack_page(p);
- if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
+ if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
return 0;
fp = *(u64 *)(p->thread.sp);
- do {
+ do {
if (fp < (unsigned long)stack ||
- fp > (unsigned long)stack+THREAD_SIZE)
- return 0;
+ fp >= (unsigned long)stack+THREAD_SIZE)
+ return 0;
ip = *(u64 *)(fp+8);
if (!in_sched_functions(ip))
return ip;
- fp = *(u64 *)fp;
- } while (count++ < 16);
+ fp = *(u64 *)fp;
+ } while (count++ < 16);
return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
-{
- int ret = 0;
+{
+ int ret = 0;
int doit = task == current;
int cpu;
- switch (code) {
+ switch (code) {
case ARCH_SET_GS:
if (addr >= TASK_SIZE_OF(task))
- return -EPERM;
+ return -EPERM;
cpu = get_cpu();
- /* handle small bases via the GDT because that's faster to
+ /* handle small bases via the GDT because that's faster to
switch. */
- if (addr <= 0xffffffff) {
- set_32bit_tls(task, GS_TLS, addr);
- if (doit) {
+ if (addr <= 0xffffffff) {
+ set_32bit_tls(task, GS_TLS, addr);
+ if (doit) {
load_TLS(&task->thread, cpu);
- load_gs_index(GS_TLS_SEL);
+ load_gs_index(GS_TLS_SEL);
}
- task->thread.gsindex = GS_TLS_SEL;
+ task->thread.gsindex = GS_TLS_SEL;
task->thread.gs = 0;
- } else {
+ } else {
task->thread.gsindex = 0;
task->thread.gs = addr;
if (doit) {
load_gs_index(0);
ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
- }
+ }
}
put_cpu();
break;
@@ -823,8 +842,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
base = task->thread.gs;
- }
- else
+ } else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e37dccce85d..0a6d8c12e10 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/regset.h>
+#include <linux/tracehook.h>
#include <linux/user.h>
#include <linux/elf.h>
#include <linux/security.h>
@@ -39,7 +40,9 @@ enum x86_regset {
REGSET_GENERAL,
REGSET_FP,
REGSET_XFP,
+ REGSET_IOPERM64 = REGSET_XFP,
REGSET_TLS,
+ REGSET_IOPERM32,
};
/*
@@ -69,7 +72,7 @@ static inline bool invalid_selector(u16 value)
#define FLAG_MASK FLAG_MASK_32
-static long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
regno >>= 2;
@@ -554,45 +557,138 @@ static int ptrace_set_debugreg(struct task_struct *child,
return 0;
}
-#ifdef X86_BTS
+/*
+ * These access the current or another (stopped) task's io permission
+ * bitmap for debugging or core dump.
+ */
+static int ioperm_active(struct task_struct *target,
+ const struct user_regset *regset)
+{
+ return target->thread.io_bitmap_max / regset->size;
+}
-static int ptrace_bts_get_size(struct task_struct *child)
+static int ioperm_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
{
- if (!child->thread.ds_area_msr)
+ if (!target->thread.io_bitmap_ptr)
return -ENXIO;
- return ds_get_bts_index((void *)child->thread.ds_area_msr);
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ target->thread.io_bitmap_ptr,
+ 0, IO_BITMAP_BYTES);
+}
+
+#ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * The configuration for a particular BTS hardware implementation.
+ */
+struct bts_configuration {
+ /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
+ unsigned char sizeof_bts;
+ /* the size of a field in the BTS record in bytes */
+ unsigned char sizeof_field;
+ /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
+ unsigned long debugctl_mask;
+};
+static struct bts_configuration bts_cfg;
+
+#define BTS_MAX_RECORD_SIZE (8 * 3)
+
+/*
+ * Branch Trace Store (BTS) uses the following format. Different
+ * architectures vary in the size of those fields.
+ * - source linear address
+ * - destination linear address
+ * - flags
+ *
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
+ *
+ * We compute the address of each field based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ *
+ * In order to store additional information in the BTS buffer, we use
+ * a special source address to indicate that the record requires
+ * special interpretation.
+ *
+ * Netburst indicated via a bit in the flags field whether the branch
+ * was predicted; this is ignored.
+ */
+
+enum bts_field {
+ bts_from = 0,
+ bts_to,
+ bts_flags,
+
+ bts_escape = (unsigned long)-1,
+ bts_qual = bts_to,
+ bts_jiffies = bts_flags
+};
+
+static inline unsigned long bts_get(const char *base, enum bts_field field)
+{
+ base += (bts_cfg.sizeof_field * field);
+ return *(unsigned long *)base;
+}
+
+static inline void bts_set(char *base, enum bts_field field, unsigned long val)
+{
+ base += (bts_cfg.sizeof_field * field);
+ (*(unsigned long *)base) = val;
+}
+
+/*
+ * Translate a BTS record from the raw format into the bts_struct format
+ *
+ * out (out): bts_struct interpretation
+ * raw: raw BTS record
+ */
+static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
+{
+ memset(out, 0, sizeof(*out));
+ if (bts_get(raw, bts_from) == bts_escape) {
+ out->qualifier = bts_get(raw, bts_qual);
+ out->variant.jiffies = bts_get(raw, bts_jiffies);
+ } else {
+ out->qualifier = BTS_BRANCH;
+ out->variant.lbr.from_ip = bts_get(raw, bts_from);
+ out->variant.lbr.to_ip = bts_get(raw, bts_to);
+ }
}
-static int ptrace_bts_read_record(struct task_struct *child,
- long index,
+static int ptrace_bts_read_record(struct task_struct *child, size_t index,
struct bts_struct __user *out)
{
struct bts_struct ret;
- int retval;
- int bts_end;
- int bts_index;
-
- if (!child->thread.ds_area_msr)
- return -ENXIO;
+ const void *bts_record;
+ size_t bts_index, bts_end;
+ int error;
- if (index < 0)
- return -EINVAL;
+ error = ds_get_bts_end(child, &bts_end);
+ if (error < 0)
+ return error;
- bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
if (bts_end <= index)
return -EINVAL;
+ error = ds_get_bts_index(child, &bts_index);
+ if (error < 0)
+ return error;
+
/* translate the ptrace bts index into the ds bts index */
- bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr);
- bts_index -= (index + 1);
- if (bts_index < 0)
- bts_index += bts_end;
+ bts_index += bts_end - (index + 1);
+ if (bts_end <= bts_index)
+ bts_index -= bts_end;
+
+ error = ds_access_bts(child, bts_index, &bts_record);
+ if (error < 0)
+ return error;
- retval = ds_read_bts((void *)child->thread.ds_area_msr,
- bts_index, &ret);
- if (retval < 0)
- return retval;
+ ptrace_bts_translate_record(&ret, bts_record);
if (copy_to_user(out, &ret, sizeof(ret)))
return -EFAULT;
@@ -600,101 +696,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
return sizeof(ret);
}
-static int ptrace_bts_clear(struct task_struct *child)
-{
- if (!child->thread.ds_area_msr)
- return -ENXIO;
-
- return ds_clear((void *)child->thread.ds_area_msr);
-}
-
static int ptrace_bts_drain(struct task_struct *child,
long size,
struct bts_struct __user *out)
{
- int end, i;
- void *ds = (void *)child->thread.ds_area_msr;
-
- if (!ds)
- return -ENXIO;
+ struct bts_struct ret;
+ const unsigned char *raw;
+ size_t end, i;
+ int error;
- end = ds_get_bts_index(ds);
- if (end <= 0)
- return end;
+ error = ds_get_bts_index(child, &end);
+ if (error < 0)
+ return error;
if (size < (end * sizeof(struct bts_struct)))
return -EIO;
- for (i = 0; i < end; i++, out++) {
- struct bts_struct ret;
- int retval;
+ error = ds_access_bts(child, 0, (const void **)&raw);
+ if (error < 0)
+ return error;
- retval = ds_read_bts(ds, i, &ret);
- if (retval < 0)
- return retval;
+ for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
+ ptrace_bts_translate_record(&ret, raw);
if (copy_to_user(out, &ret, sizeof(ret)))
return -EFAULT;
}
- ds_clear(ds);
+ error = ds_clear_bts(child);
+ if (error < 0)
+ return error;
return end;
}
+static void ptrace_bts_ovfl(struct task_struct *child)
+{
+ send_sig(child->thread.bts_ovfl_signal, child, 0);
+}
+
static int ptrace_bts_config(struct task_struct *child,
long cfg_size,
const struct ptrace_bts_config __user *ucfg)
{
struct ptrace_bts_config cfg;
- int bts_size, ret = 0;
- void *ds;
+ int error = 0;
+ error = -EOPNOTSUPP;
+ if (!bts_cfg.sizeof_bts)
+ goto errout;
+
+ error = -EIO;
if (cfg_size < sizeof(cfg))
- return -EIO;
+ goto errout;
+ error = -EFAULT;
if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
- return -EFAULT;
+ goto errout;
- if ((int)cfg.size < 0)
- return -EINVAL;
+ error = -EINVAL;
+ if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
+ !(cfg.flags & PTRACE_BTS_O_ALLOC))
+ goto errout;
- bts_size = 0;
- ds = (void *)child->thread.ds_area_msr;
- if (ds) {
- bts_size = ds_get_bts_size(ds);
- if (bts_size < 0)
- return bts_size;
- }
- cfg.size = PAGE_ALIGN(cfg.size);
+ if (cfg.flags & PTRACE_BTS_O_ALLOC) {
+ ds_ovfl_callback_t ovfl = NULL;
+ unsigned int sig = 0;
+
+ /* we ignore the error in case we were not tracing child */
+ (void)ds_release_bts(child);
+
+ if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
+ if (!cfg.signal)
+ goto errout;
+
+ sig = cfg.signal;
+ ovfl = ptrace_bts_ovfl;
+ }
- if (bts_size != cfg.size) {
- ret = ptrace_bts_realloc(child, cfg.size,
- cfg.flags & PTRACE_BTS_O_CUT_SIZE);
- if (ret < 0)
+ error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
+ if (error < 0)
goto errout;
- ds = (void *)child->thread.ds_area_msr;
+ child->thread.bts_ovfl_signal = sig;
}
- if (cfg.flags & PTRACE_BTS_O_SIGNAL)
- ret = ds_set_overflow(ds, DS_O_SIGNAL);
- else
- ret = ds_set_overflow(ds, DS_O_WRAP);
- if (ret < 0)
+ error = -EINVAL;
+ if (!child->thread.ds_ctx && cfg.flags)
goto errout;
if (cfg.flags & PTRACE_BTS_O_TRACE)
- child->thread.debugctlmsr |= ds_debugctl_mask();
+ child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
else
- child->thread.debugctlmsr &= ~ds_debugctl_mask();
+ child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
if (cfg.flags & PTRACE_BTS_O_SCHED)
set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
else
clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- ret = sizeof(cfg);
+ error = sizeof(cfg);
out:
if (child->thread.debugctlmsr)
@@ -702,10 +803,10 @@ out:
else
clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- return ret;
+ return error;
errout:
- child->thread.debugctlmsr &= ~ds_debugctl_mask();
+ child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
goto out;
}
@@ -714,29 +815,40 @@ static int ptrace_bts_status(struct task_struct *child,
long cfg_size,
struct ptrace_bts_config __user *ucfg)
{
- void *ds = (void *)child->thread.ds_area_msr;
struct ptrace_bts_config cfg;
+ size_t end;
+ const void *base, *max;
+ int error;
if (cfg_size < sizeof(cfg))
return -EIO;
- memset(&cfg, 0, sizeof(cfg));
+ error = ds_get_bts_end(child, &end);
+ if (error < 0)
+ return error;
- if (ds) {
- cfg.size = ds_get_bts_size(ds);
+ error = ds_access_bts(child, /* index = */ 0, &base);
+ if (error < 0)
+ return error;
- if (ds_get_overflow(ds) == DS_O_SIGNAL)
- cfg.flags |= PTRACE_BTS_O_SIGNAL;
+ error = ds_access_bts(child, /* index = */ end, &max);
+ if (error < 0)
+ return error;
- if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
- child->thread.debugctlmsr & ds_debugctl_mask())
- cfg.flags |= PTRACE_BTS_O_TRACE;
+ memset(&cfg, 0, sizeof(cfg));
+ cfg.size = (max - base);
+ cfg.signal = child->thread.bts_ovfl_signal;
+ cfg.bts_size = sizeof(struct bts_struct);
- if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
- cfg.flags |= PTRACE_BTS_O_SCHED;
- }
+ if (cfg.signal)
+ cfg.flags |= PTRACE_BTS_O_SIGNAL;
- cfg.bts_size = sizeof(struct bts_struct);
+ if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
+ child->thread.debugctlmsr & bts_cfg.debugctl_mask)
+ cfg.flags |= PTRACE_BTS_O_TRACE;
+
+ if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+ cfg.flags |= PTRACE_BTS_O_SCHED;
if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
return -EFAULT;
@@ -744,89 +856,38 @@ static int ptrace_bts_status(struct task_struct *child,
return sizeof(cfg);
}
-
static int ptrace_bts_write_record(struct task_struct *child,
const struct bts_struct *in)
{
- int retval;
+ unsigned char bts_record[BTS_MAX_RECORD_SIZE];
- if (!child->thread.ds_area_msr)
- return -ENXIO;
+ BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
- retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
- if (retval)
- return retval;
+ memset(bts_record, 0, bts_cfg.sizeof_bts);
+ switch (in->qualifier) {
+ case BTS_INVALID:
+ break;
- return sizeof(*in);
-}
+ case BTS_BRANCH:
+ bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
+ bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
+ break;
-static int ptrace_bts_realloc(struct task_struct *child,
- int size, int reduce_size)
-{
- unsigned long rlim, vm;
- int ret, old_size;
+ case BTS_TASK_ARRIVES:
+ case BTS_TASK_DEPARTS:
+ bts_set(bts_record, bts_from, bts_escape);
+ bts_set(bts_record, bts_qual, in->qualifier);
+ bts_set(bts_record, bts_jiffies, in->variant.jiffies);
+ break;
- if (size < 0)
+ default:
return -EINVAL;
-
- old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
- if (old_size < 0)
- return old_size;
-
- ret = ds_free((void **)&child->thread.ds_area_msr);
- if (ret < 0)
- goto out;
-
- size >>= PAGE_SHIFT;
- old_size >>= PAGE_SHIFT;
-
- current->mm->total_vm -= old_size;
- current->mm->locked_vm -= old_size;
-
- if (size == 0)
- goto out;
-
- rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->total_vm + size;
- if (rlim < vm) {
- ret = -ENOMEM;
-
- if (!reduce_size)
- goto out;
-
- size = rlim - current->mm->total_vm;
- if (size <= 0)
- goto out;
- }
-
- rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->locked_vm + size;
- if (rlim < vm) {
- ret = -ENOMEM;
-
- if (!reduce_size)
- goto out;
-
- size = rlim - current->mm->locked_vm;
- if (size <= 0)
- goto out;
}
- ret = ds_allocate((void **)&child->thread.ds_area_msr,
- size << PAGE_SHIFT);
- if (ret < 0)
- goto out;
-
- current->mm->total_vm += size;
- current->mm->locked_vm += size;
-
-out:
- if (child->thread.ds_area_msr)
- set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
- else
- clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
-
- return ret;
+ /* The writing task will be the switched-to task on a context
+ * switch. It needs to write into the switched-from task's BTS
+ * buffer. */
+ return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
}
void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +900,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
ptrace_bts_write_record(tsk, &rec);
}
-#endif /* X86_BTS */
+
+static const struct bts_configuration bts_cfg_netburst = {
+ .sizeof_bts = sizeof(long) * 3,
+ .sizeof_field = sizeof(long),
+ .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
+};
+
+static const struct bts_configuration bts_cfg_pentium_m = {
+ .sizeof_bts = sizeof(long) * 3,
+ .sizeof_field = sizeof(long),
+ .debugctl_mask = (1<<6)|(1<<7)
+};
+
+static const struct bts_configuration bts_cfg_core2 = {
+ .sizeof_bts = 8 * 3,
+ .sizeof_field = 8,
+ .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
+};
+
+static inline void bts_configure(const struct bts_configuration *cfg)
+{
+ bts_cfg = *cfg;
+}
+
+void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
+{
+ switch (c->x86) {
+ case 0x6:
+ switch (c->x86_model) {
+ case 0xD:
+ case 0xE: /* Pentium M */
+ bts_configure(&bts_cfg_pentium_m);
+ break;
+ case 0xF: /* Core2 */
+ case 0x1C: /* Atom */
+ bts_configure(&bts_cfg_core2);
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+ break;
+ case 0xF:
+ switch (c->x86_model) {
+ case 0x0:
+ case 0x1:
+ case 0x2: /* Netburst */
+ bts_configure(&bts_cfg_netburst);
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+}
+#endif /* CONFIG_X86_PTRACE_BTS */
/*
* Called by kernel/ptrace.c when detaching..
@@ -852,15 +972,15 @@ void ptrace_disable(struct task_struct *child)
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
- if (child->thread.ds_area_msr) {
-#ifdef X86_BTS
- ptrace_bts_realloc(child, 0, 0);
-#endif
- child->thread.debugctlmsr &= ~ds_debugctl_mask();
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- }
+#ifdef CONFIG_X86_PTRACE_BTS
+ (void)ds_release_bts(child);
+
+ child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+ if (!child->thread.debugctlmsr)
+ clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+
+ clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+#endif /* CONFIG_X86_PTRACE_BTS */
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1100,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
/*
* These bits need more cooking - not enabled yet:
*/
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
case PTRACE_BTS_CONFIG:
ret = ptrace_bts_config
(child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1112,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_SIZE:
- ret = ptrace_bts_get_size(child);
+ ret = ds_get_bts_index(child, /* pos = */ NULL);
break;
case PTRACE_BTS_GET:
@@ -1001,14 +1121,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_CLEAR:
- ret = ptrace_bts_clear(child);
+ ret = ds_clear_bts(child);
break;
case PTRACE_BTS_DRAIN:
ret = ptrace_bts_drain
(child, data, (struct bts_struct __user *) addr);
break;
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
default:
ret = ptrace_request(child, request, addr, data);
@@ -1290,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = {
.size = sizeof(long), .align = sizeof(long),
.active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
},
+ [REGSET_IOPERM64] = {
+ .core_note_type = NT_386_IOPERM,
+ .n = IO_BITMAP_LONGS,
+ .size = sizeof(long), .align = sizeof(long),
+ .active = ioperm_active, .get = ioperm_get
+ },
};
static const struct user_regset_view user_x86_64_view = {
@@ -1336,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = {
.active = regset_tls_active,
.get = regset_tls_get, .set = regset_tls_set
},
+ [REGSET_IOPERM32] = {
+ .core_note_type = NT_386_IOPERM,
+ .n = IO_BITMAP_BYTES / sizeof(u32),
+ .size = sizeof(u32), .align = sizeof(u32),
+ .active = ioperm_active, .get = ioperm_get
+ },
};
static const struct user_regset_view user_x86_32_view = {
@@ -1357,7 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
#endif
}
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+ int error_code, int si_code)
{
struct siginfo info;
@@ -1366,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
memset(&info, 0, sizeof(info));
info.si_signo = SIGTRAP;
- info.si_code = TRAP_BRKPT;
+ info.si_code = si_code;
/* User-mode ip? */
info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
@@ -1375,30 +1508,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
force_sig_info(SIGTRAP, &info, tsk);
}
-static void syscall_trace(struct pt_regs *regs)
-{
- if (!(current->ptrace & PT_PTRACED))
- return;
-
-#if 0
- printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
- current->comm,
- regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
- current_thread_info()->flags, current->ptrace);
-#endif
-
- ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
- ? 0x80 : 0));
- /*
- * this isn't the same as continuing with a signal, but it will do
- * for normal use. strace only continues with a signal if the
- * stopping signal is not SIGTRAP. -brl
- */
- if (current->exit_code) {
- send_sig(current->exit_code, current, 1);
- current->exit_code = 0;
- }
-}
#ifdef CONFIG_X86_32
# define IS_IA32 1
@@ -1432,8 +1541,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
- if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
- syscall_trace(regs);
+ if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
+ tracehook_report_syscall_entry(regs))
+ ret = -1L;
if (unlikely(current->audit_context)) {
if (IS_IA32)
@@ -1459,7 +1569,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
if (test_thread_flag(TIF_SYSCALL_TRACE))
- syscall_trace(regs);
+ tracehook_report_syscall_exit(regs, 0);
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
@@ -1475,6 +1585,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
* system call instruction.
*/
if (test_thread_flag(TIF_SINGLESTEP) &&
- (current->ptrace & PT_PTRACED))
- send_sigtrap(current, regs, 0);
+ tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
+ send_sigtrap(current, regs, 0, TRAP_BRKPT);
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a0325..4f9c55f3a7c 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
return dst->version;
}
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+ u64 pv_tsc_khz = 1000000ULL << 32;
+
+ do_div(pv_tsc_khz, src->tsc_to_system_mul);
+ if (src->tsc_shift < 0)
+ pv_tsc_khz <<= -src->tsc_shift;
+ else
+ pv_tsc_khz >>= src->tsc_shift;
+ return pv_tsc_khz;
+}
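A quick editor's derivation of why this inverts the scaling: pvclock converts a TSC delta to nanoseconds as ns = ((delta << tsc_shift) * tsc_to_system_mul) >> 32, so one TSC tick lasts mul * 2^tsc_shift / 2^32 ns and

	tsc_khz = 10^6 * 2^32 / (tsc_to_system_mul * 2^tsc_shift)

which is exactly the (1000000ULL << 32) / mul value computed above, corrected by applying tsc_shift in the opposite direction.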
+
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
struct pvclock_shadow_time shadow;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d1385881810..67465ed8931 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
if (!(word & (1 << 13))) {
dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
"disabling irq balancing and affinity\n");
-#ifdef CONFIG_IRQBALANCE
- irqbalance_disable("");
-#endif
noirqdebug_setup("");
#ifdef CONFIG_PROC_FS
no_irq_affinity = 1;
@@ -354,9 +351,27 @@ static void ati_force_hpet_resume(void)
printk(KERN_DEBUG "Force enabled HPET at resume\n");
}
+static u32 ati_ixp4x0_rev(struct pci_dev *dev)
+{
+ u32 d;
+ u8 b;
+
+ pci_read_config_byte(dev, 0xac, &b);
+ b &= ~(1<<5);
+ pci_write_config_byte(dev, 0xac, b);
+ pci_read_config_dword(dev, 0x70, &d);
+ d |= 1<<8;
+ pci_write_config_dword(dev, 0x70, d);
+ pci_read_config_dword(dev, 0x8, &d);
+ d &= 0xff;
+ dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
+ return d;
+}
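An annotated restatement of that sequence (hedged; the 0xac/0x70 bits are chipset-specific and not verified against ATI documentation):

/*
 * Sketch of ati_ixp4x0_rev(): clear bit 5 of config byte 0xac and set
 * bit 8 of config dword 0x70 -- apparently chipset-specific
 * unlock/enable bits -- then read config dword 0x08, whose low byte
 * is the standard PCI revision ID of the SB4x0 southbridge.
 */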
+
static void ati_force_enable_hpet(struct pci_dev *dev)
{
- u32 uninitialized_var(val);
+ u32 d, val;
+ u8 b;
if (hpet_address || force_hpet_address)
return;
@@ -366,14 +381,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev)
return;
}
+ d = ati_ixp4x0_rev(dev);
+ if (d < 0x82)
+ return;
+
+ /* base address */
pci_write_config_dword(dev, 0x14, 0xfed00000);
pci_read_config_dword(dev, 0x14, &val);
+
+ /* enable interrupt */
+ outb(0x72, 0xcd6); b = inb(0xcd7);
+ b |= 0x1;
+ outb(0x72, 0xcd6); outb(b, 0xcd7);
+ outb(0x72, 0xcd6); b = inb(0xcd7);
+ if (!(b & 0x1))
+ return;
+ pci_read_config_dword(dev, 0x64, &d);
+ d |= (1<<10);
+ pci_write_config_dword(dev, 0x64, d);
+ pci_read_config_dword(dev, 0x64, &d);
+ if (!(d & (1<<10)))
+ return;
+
force_hpet_address = val;
force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
force_hpet_address);
cached_dev = dev;
- return;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
ati_force_enable_hpet);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 724adfc63cb..f4c93f1cfc1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
static const struct desc_ptr no_idt = {};
static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
+/*
+ * Keyboard reset and triple fault may result in INIT, not RESET, which
+ * doesn't work when we're in vmx root mode. Try ACPI first.
+ */
+enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 05191bbc68b..dd6f2b71561 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
cmos_minutes = CMOS_READ(RTC_MINUTES);
if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
- BCD_TO_BIN(cmos_minutes);
+ cmos_minutes = bcd2bin(cmos_minutes);
/*
* since we're only adjusting minutes and seconds,
@@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
if (abs(real_minutes - cmos_minutes) < 30) {
if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
- BIN_TO_BCD(real_seconds);
- BIN_TO_BCD(real_minutes);
+ real_seconds = bin2bcd(real_seconds);
+ real_minutes = bin2bcd(real_minutes);
}
CMOS_WRITE(real_seconds,RTC_SECONDS);
CMOS_WRITE(real_minutes,RTC_MINUTES);
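The old BCD_TO_BIN()/BIN_TO_BCD() macros mutated their argument in place; the replacements are plain functions, roughly as in linux/bcd.h:

unsigned bcd2bin(unsigned char val)
{
	return (val & 0x0f) + (val >> 4) * 10;	/* e.g. bcd2bin(0x59) == 59 */
}

unsigned char bin2bcd(unsigned val)
{
	return ((val / 10) << 4) + val % 10;	/* e.g. bin2bcd(59) == 0x59 */
}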
@@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void)
WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
- BCD_TO_BIN(sec);
- BCD_TO_BIN(min);
- BCD_TO_BIN(hour);
- BCD_TO_BIN(day);
- BCD_TO_BIN(mon);
- BCD_TO_BIN(year);
+ sec = bcd2bin(sec);
+ min = bcd2bin(min);
+ hour = bcd2bin(hour);
+ day = bcd2bin(day);
+ mon = bcd2bin(mon);
+ year = bcd2bin(year);
}
if (century) {
- BCD_TO_BIN(century);
+ century = bcd2bin(century);
year += century * 100;
printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
} else
@@ -223,11 +223,25 @@ static struct platform_device rtc_device = {
static __init int add_rtc_cmos(void)
{
#ifdef CONFIG_PNP
- if (!pnp_platform_devices)
- platform_device_register(&rtc_device);
-#else
+ static const char *ids[] __initconst =
+ { "PNP0b00", "PNP0b01", "PNP0b02", };
+ struct pnp_dev *dev;
+ struct pnp_id *id;
+ int i;
+
+ pnp_for_each_dev(dev) {
+ for (id = dev->id; id; id = id->next) {
+ for (i = 0; i < ARRAY_SIZE(ids); i++) {
+ if (compare_pnp_id(id, ids[i]) != 0)
+ return 0;
+ }
+ }
+ }
+#endif
+
platform_device_register(&rtc_device);
-#endif /* CONFIG_PNP */
+ dev_info(&rtc_device.dev,
+ "registered platform RTC device (no PNP device found)\n");
return 0;
}
device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9838f2539df..0fa6790c1dd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -223,6 +223,9 @@ unsigned long saved_video_mode;
#define RAMDISK_LOAD_FLAG 0x4000
static char __initdata command_line[COMMAND_LINE_SIZE];
+#ifdef CONFIG_CMDLINE_BOOL
+static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
@@ -299,7 +302,7 @@ static void __init relocate_initrd(void)
if (clen > MAX_MAP_CHUNK-slop)
clen = MAX_MAP_CHUNK-slop;
mapaddr = ramdisk_image & PAGE_MASK;
- p = early_ioremap(mapaddr, clen+slop);
+ p = early_memremap(mapaddr, clen+slop);
memcpy(q, p+slop, clen);
early_iounmap(p, clen+slop);
q += clen;
@@ -376,7 +379,7 @@ static void __init parse_setup_data(void)
return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- data = early_ioremap(pa_data, PAGE_SIZE);
+ data = early_memremap(pa_data, PAGE_SIZE);
switch (data->type) {
case SETUP_E820_EXT:
parse_e820_ext(data, pa_data);
@@ -399,7 +402,7 @@ static void __init e820_reserve_setup_data(void)
return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- data = early_ioremap(pa_data, sizeof(*data));
+ data = early_memremap(pa_data, sizeof(*data));
e820_update_range(pa_data, sizeof(*data)+data->len,
E820_RAM, E820_RESERVED_KERN);
found = 1;
@@ -425,7 +428,7 @@ static void __init reserve_early_setup_data(void)
return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- data = early_ioremap(pa_data, sizeof(*data));
+ data = early_memremap(pa_data, sizeof(*data));
sprintf(buf, "setup data %x", data->type);
reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
pa_data = data->next;
@@ -558,7 +561,13 @@ static void __init reserve_standard_io_resources(void)
}
-#ifdef CONFIG_PROC_VMCORE
+/*
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence
+ * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+
+#ifdef CONFIG_CRASH_DUMP
/* elfcorehdr= specifies the location of elf core header
* stored by the crashed kernel. This option will be passed
* by kexec loader to the capture kernel.
@@ -579,6 +588,190 @@ static struct x86_quirks default_x86_quirks __initdata;
struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
/*
+ * Some BIOSes seem to corrupt the low 64k of memory during events
+ * like suspend/resume and unplugging an HDMI cable. Reserve all
+ * remaining free memory in that area and fill it with a distinct
+ * pattern.
+ */
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+#define MAX_SCAN_AREAS 8
+
+static int __read_mostly memory_corruption_check = -1;
+
+static unsigned __read_mostly corruption_check_size = 64*1024;
+static unsigned __read_mostly corruption_check_period = 60; /* seconds */
+
+static struct e820entry scan_areas[MAX_SCAN_AREAS];
+static int num_scan_areas;
+
+static int set_corruption_check(char *arg)
+{
+ char *end;
+
+ memory_corruption_check = simple_strtol(arg, &end, 10);
+
+ return (*end == 0) ? 0 : -EINVAL;
+}
+early_param("memory_corruption_check", set_corruption_check);
+
+static int set_corruption_check_period(char *arg)
+{
+ char *end;
+
+ corruption_check_period = simple_strtoul(arg, &end, 10);
+
+ return (*end == 0) ? 0 : -EINVAL;
+}
+early_param("memory_corruption_check_period", set_corruption_check_period);
+
+static int set_corruption_check_size(char *arg)
+{
+ char *end;
+ unsigned size;
+
+ size = memparse(arg, &end);
+
+ if (*end == '\0')
+ corruption_check_size = size;
+
+ return (size == corruption_check_size) ? 0 : -EINVAL;
+}
+early_param("memory_corruption_check_size", set_corruption_check_size);
+
+static void __init setup_bios_corruption_check(void)
+{
+ u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
+
+ if (memory_corruption_check == -1) {
+ memory_corruption_check =
+#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+ 1
+#else
+ 0
+#endif
+ ;
+ }
+
+ if (corruption_check_size == 0)
+ memory_corruption_check = 0;
+
+ if (!memory_corruption_check)
+ return;
+
+ corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
+
+ while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
+ u64 size;
+ addr = find_e820_area_size(addr, &size, PAGE_SIZE);
+
+ if (addr == 0)
+ break;
+
+ if ((addr + size) > corruption_check_size)
+ size = corruption_check_size - addr;
+
+ if (size == 0)
+ break;
+
+ e820_update_range(addr, size, E820_RAM, E820_RESERVED);
+ scan_areas[num_scan_areas].addr = addr;
+ scan_areas[num_scan_areas].size = size;
+ num_scan_areas++;
+
+ /* Assume we've already mapped this early memory */
+ memset(__va(addr), 0, size);
+
+ addr += size;
+ }
+
+ printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
+ num_scan_areas);
+ update_e820();
+}
+
+static struct timer_list periodic_check_timer;
+
+void check_for_bios_corruption(void)
+{
+ int i;
+ int corruption = 0;
+
+ if (!memory_corruption_check)
+ return;
+
+ for (i = 0; i < num_scan_areas; i++) {
+ unsigned long *addr = __va(scan_areas[i].addr);
+ unsigned long size = scan_areas[i].size;
+
+ for (; size; addr++, size -= sizeof(unsigned long)) {
+ if (!*addr)
+ continue;
+ printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
+ addr, __pa(addr), *addr);
+ corruption = 1;
+ *addr = 0;
+ }
+ }
+
+ WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
+}
+
+static void periodic_check_for_corruption(unsigned long data)
+{
+ check_for_bios_corruption();
+ mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
+}
+
+void start_periodic_check_for_corruption(void)
+{
+ if (!memory_corruption_check || corruption_check_period == 0)
+ return;
+
+ printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
+ corruption_check_period);
+
+ init_timer(&periodic_check_timer);
+ periodic_check_timer.function = &periodic_check_for_corruption;
+ periodic_check_for_corruption(0);
+}
+#endif
+
+static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
+{
+ printk(KERN_NOTICE
+ "%s detected: BIOS may corrupt low RAM, working it around.\n",
+ d->ident);
+
+ e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
+ return 0;
+}
+
+/* List of systems that have known low memory corruption BIOS problems */
+static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
+#ifdef CONFIG_X86_RESERVE_LOW_64K
+ {
+ .callback = dmi_low_memory_corruption,
+ .ident = "AMI BIOS",
+ .matches = {
+ DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+ },
+ },
+ {
+ .callback = dmi_low_memory_corruption,
+ .ident = "Phoenix BIOS",
+ .matches = {
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+ },
+ },
+#endif
+ {}
+};
+
+/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
* for initialization. Note, the efi init code path is determined by the
@@ -665,6 +858,19 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = virt_to_phys(&__bss_start);
bss_resource.end = virt_to_phys(&__bss_stop)-1;
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
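A worked example for the CONFIG_CMDLINE_BOOL block above (hypothetical values): with CONFIG_CMDLINE="console=ttyS0" and a boot loader passing "root=/dev/sda1", the effective command line becomes "console=ttyS0 root=/dev/sda1" -- builtin options first, boot loader options appended. With CONFIG_CMDLINE_OVERRIDE set, the boot loader string is dropped entirely.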
@@ -699,6 +905,10 @@ void __init setup_arch(char **cmdline_p)
finish_e820_parsing();
+ dmi_scan_machine();
+
+ dmi_check_system(bad_bios_dmi_table);
+
#ifdef CONFIG_X86_32
probe_roms();
#endif
@@ -742,6 +952,8 @@ void __init setup_arch(char **cmdline_p)
#else
num_physpages = max_pfn;
+ if (cpu_has_x2apic)
+ check_x2apic();
/* How many end-of-memory variables you have, grandma! */
/* need this before calling reserve_initrd */
@@ -753,6 +965,10 @@ void __init setup_arch(char **cmdline_p)
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
#endif
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+ setup_bios_corruption_check();
+#endif
+
/* max_pfn_mapped is updated here */
max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
max_pfn_mapped = max_low_pfn_mapped;
@@ -781,8 +997,6 @@ void __init setup_arch(char **cmdline_p)
vsmp_init();
#endif
- dmi_scan_machine();
-
io_delay_init();
/*
@@ -790,6 +1004,8 @@ void __init setup_arch(char **cmdline_p)
*/
acpi_boot_table_init();
+ early_acpi_boot_init();
+
#ifdef CONFIG_ACPI_NUMA
/*
* Parse SRAT to discover nodes.
@@ -857,6 +1073,7 @@ void __init setup_arch(char **cmdline_p)
#endif
prefill_possible_map();
+
#ifdef CONFIG_X86_64
init_cpu_to_node();
#endif
@@ -864,6 +1081,9 @@ void __init setup_arch(char **cmdline_p)
init_apic_mappings();
ioapic_init_mappings();
+ /* need to wait until the io_apic is mapped */
+ nr_irqs = probe_nr_irqs();
+
kvm_guest_init();
e820_reserve_resources();
@@ -885,3 +1105,5 @@ void __init setup_arch(char **cmdline_p)
#endif
#endif
}
+
+
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 76e305e064f..ae0c0d3bb77 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -140,35 +140,47 @@ static void __init setup_cpu_pda_map(void)
*/
void __init setup_per_cpu_areas(void)
{
- ssize_t size = PERCPU_ENOUGH_ROOM;
+ ssize_t size, old_size;
char *ptr;
int cpu;
+ unsigned long align = 1;
/* Setup cpu_pda map */
setup_cpu_pda_map();
/* Copy section for each CPU (we discard the original) */
- size = PERCPU_ENOUGH_ROOM;
+ old_size = PERCPU_ENOUGH_ROOM;
+ align = max_t(unsigned long, PAGE_SIZE, align);
+ size = roundup(old_size, align);
printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
size);
for_each_possible_cpu(cpu) {
#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = alloc_bootmem_pages(size);
+ ptr = __alloc_bootmem(size, align,
+ __pa(MAX_DMA_ADDRESS));
#else
int node = early_cpu_to_node(cpu);
if (!node_online(node) || !NODE_DATA(node)) {
- ptr = alloc_bootmem_pages(size);
+ ptr = __alloc_bootmem(size, align,
+ __pa(MAX_DMA_ADDRESS));
printk(KERN_INFO
"cpu %d has no node %d or node-local memory\n",
cpu, node);
+ if (ptr)
+ printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
+ cpu, __pa(ptr));
+ } else {
+ ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
+ __pa(MAX_DMA_ADDRESS));
+ if (ptr)
+ printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
+ cpu, node, __pa(ptr));
}
- else
- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
#endif
per_cpu_offset(cpu) = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-
}
printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
@@ -206,7 +218,7 @@ static void __init setup_node_to_cpumask_map(void)
/* allocate the map */
map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
- pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
+ pr_debug("Node to cpumask map at %p for %d nodes\n",
map, nr_node_ids);
/* node_to_cpumask() will now work */
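The per-cpu hunk above rounds the static per-cpu image up to a page-aligned size before handing it to the bootmem allocator. A small sketch of that arithmetic; the PERCPU_ENOUGH_ROOM value here is an illustrative stand-in, not the real configuration-dependent constant.

#include <stdio.h>

#define PAGE_SIZE 4096UL
/* mirrors the kernel's roundup() macro for this use */
#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	unsigned long old_size = 45000;	/* stand-in for PERCPU_ENOUGH_ROOM */
	unsigned long align = PAGE_SIZE;
	unsigned long size = roundup(old_size, align);

	printf("allocating %lu bytes per cpu (aligned to %lu)\n", size, align);
	return 0;
}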
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 72bbb519d2d..cc673aa55ce 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -3,9 +3,18 @@ struct sigframe {
char __user *pretcode;
int sig;
struct sigcontext sc;
- struct _fpstate fpstate;
+ /*
+	 * fpstate is unused here; the live FP state is moved/allocated
+	 * after retcode[] below. This placement lets the FP state and
+	 * any future state extensions (xsave) stay together.
+	 * At the same time, retaining the unused fpstate keeps the
+	 * offset of extramask[] in the sigframe unchanged, so legacy
+	 * applications that access/modify it keep working.
+ */
+ struct _fpstate fpstate_unused;
unsigned long extramask[_NSIG_WORDS-1];
char retcode[8];
+ /* fp state follows here */
};
struct rt_sigframe {
@@ -15,13 +24,19 @@ struct rt_sigframe {
void __user *puc;
struct siginfo info;
struct ucontext uc;
- struct _fpstate fpstate;
char retcode[8];
+ /* fp state follows here */
};
#else
struct rt_sigframe {
char __user *pretcode;
struct ucontext uc;
struct siginfo info;
+ /* fp state follows here */
};
+
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
+ sigset_t *set, struct pt_regs *regs);
#endif
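The sigframe.h change above preserves the legacy layout while relocating the live FP/xsave area past retcode[]. A toy model of the idea, with simplified stand-in sizes for sigcontext and _fpstate, showing that extramask[] keeps its historical offset.

#include <stdio.h>
#include <stddef.h>

struct old_sigframe {
	void *pretcode;
	int sig;
	char sc[88];			/* stand-in for struct sigcontext */
	char fpstate[112];		/* FP state used to live inline here */
	unsigned long extramask[1];
	char retcode[8];
};

struct new_sigframe {
	void *pretcode;
	int sig;
	char sc[88];
	char fpstate_unused[112];	/* kept only to preserve offsets */
	unsigned long extramask[1];
	char retcode[8];
	/* real fp/xsave state is appended past this point on the stack */
};

int main(void)
{
	printf("extramask offset old=%zu new=%zu\n",
	       offsetof(struct old_sigframe, extramask),
	       offsetof(struct new_sigframe, extramask));
	return 0;
}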
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 6fb5bcdd893..d6dd057d0f2 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/wait.h>
+#include <linux/tracehook.h>
#include <linux/elf.h>
#include <linux/smp.h>
#include <linux/mm.h>
@@ -26,6 +27,8 @@
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/vdso.h>
+#include <asm/syscall.h>
+#include <asm/syscalls.h>
#include "sigframe.h"
@@ -110,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
return do_sigaltstack(uss, uoss, regs->sp);
}
+#define COPY(x) { \
+ err |= __get_user(regs->x, &sc->x); \
+}
+
+#define COPY_SEG(seg) { \
+ unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->seg = tmp; \
+}
+
+#define COPY_SEG_STRICT(seg) { \
+ unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->seg = tmp | 3; \
+}
+
+#define GET_SEG(seg) { \
+ unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ loadsegment(seg, tmp); \
+}
/*
* Do a signal return; undo the signal stack.
@@ -118,28 +142,13 @@ static int
restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
unsigned long *pax)
{
+ void __user *buf;
+ unsigned int tmpflags;
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
-#define COPY(x) err |= __get_user(regs->x, &sc->x)
-
-#define COPY_SEG(seg) \
- { unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- regs->seg = tmp; }
-
-#define COPY_SEG_STRICT(seg) \
- { unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- regs->seg = tmp|3; }
-
-#define GET_SEG(seg) \
- { unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- loadsegment(seg, tmp); }
-
GET_SEG(gs);
COPY_SEG(fs);
COPY_SEG(es);
@@ -149,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
COPY_SEG_STRICT(cs);
COPY_SEG_STRICT(ss);
- {
- unsigned int tmpflags;
-
- err |= __get_user(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) |
- (tmpflags & FIX_EFLAGS);
- regs->orig_ax = -1; /* disable syscall checks */
- }
+ err |= __get_user(tmpflags, &sc->flags);
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->orig_ax = -1; /* disable syscall checks */
- {
- struct _fpstate __user *buf;
-
- err |= __get_user(buf, &sc->fpstate);
- if (buf) {
- if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
- goto badframe;
- err |= restore_i387(buf);
- } else {
- struct task_struct *me = current;
-
- if (used_math()) {
- clear_fpu(me);
- clear_used_math();
- }
- }
- }
+ err |= __get_user(buf, &sc->fpstate);
+ err |= restore_i387_xstate(buf);
err |= __get_user(*pax, &sc->ax);
return err;
-
-badframe:
- return 1;
}
asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
@@ -226,9 +212,8 @@ badframe:
return 0;
}
-asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+static long do_rt_sigreturn(struct pt_regs *regs)
{
- struct pt_regs *regs = (struct pt_regs *)&__unused;
struct rt_sigframe __user *frame;
unsigned long ax;
sigset_t set;
@@ -254,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
return ax;
badframe:
- force_sig(SIGSEGV, current);
+ signal_fault(regs, frame, "rt_sigreturn");
return 0;
}
+asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+{
+ struct pt_regs *regs = (struct pt_regs *)&__unused;
+
+ return do_rt_sigreturn(regs);
+}
+
/*
* Set up a signal frame.
*/
static int
-setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
+setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
struct pt_regs *regs, unsigned long mask)
{
int tmp, err = 0;
@@ -289,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
err |= __put_user(regs->sp, &sc->sp_at_signal);
err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
- tmp = save_i387(fpstate);
+ tmp = save_i387_xstate(fpstate);
if (tmp < 0)
err = 1;
else
@@ -306,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
* Determine which stack to use..
*/
static inline void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
+get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+ void **fpstate)
{
unsigned long sp;
@@ -332,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
sp = (unsigned long) ka->sa.sa_restorer;
}
+ if (used_math()) {
+ sp = sp - sig_xstate_size;
+ *fpstate = (struct _fpstate *) sp;
+ }
+
sp -= frame_size;
/*
* Align the stack pointer according to the i386 ABI,
@@ -343,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
}
static int
-setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
- struct pt_regs *regs)
+__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
+ struct pt_regs *regs)
{
struct sigframe __user *frame;
void __user *restorer;
int err = 0;
- int usig;
+ void __user *fpstate = NULL;
- frame = get_sigframe(ka, regs, sizeof(*frame));
+ frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- goto give_sigsegv;
+ return -EFAULT;
- usig = current_thread_info()->exec_domain
- && current_thread_info()->exec_domain->signal_invmap
- && sig < 32
- ? current_thread_info()->exec_domain->signal_invmap[sig]
- : sig;
+ if (__put_user(sig, &frame->sig))
+ return -EFAULT;
- err = __put_user(usig, &frame->sig);
- if (err)
- goto give_sigsegv;
-
- err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
- if (err)
- goto give_sigsegv;
+ if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
+ return -EFAULT;
if (_NSIG_WORDS > 1) {
- err = __copy_to_user(&frame->extramask, &set->sig[1],
- sizeof(frame->extramask));
- if (err)
- goto give_sigsegv;
+ if (__copy_to_user(&frame->extramask, &set->sig[1],
+ sizeof(frame->extramask)))
+ return -EFAULT;
}
if (current->mm->context.vdso)
@@ -399,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
if (err)
- goto give_sigsegv;
+ return -EFAULT;
/* Set up registers for signal handler */
regs->sp = (unsigned long)frame;
@@ -414,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
regs->cs = __USER_CS;
return 0;
-
-give_sigsegv:
- force_sigsegv(sig, current);
- return -EFAULT;
}
-static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
+static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
void __user *restorer;
int err = 0;
- int usig;
+ void __user *fpstate = NULL;
- frame = get_sigframe(ka, regs, sizeof(*frame));
+ frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- goto give_sigsegv;
+ return -EFAULT;
- usig = current_thread_info()->exec_domain
- && current_thread_info()->exec_domain->signal_invmap
- && sig < 32
- ? current_thread_info()->exec_domain->signal_invmap[sig]
- : sig;
-
- err |= __put_user(usig, &frame->sig);
+ err |= __put_user(sig, &frame->sig);
err |= __put_user(&frame->info, &frame->pinfo);
err |= __put_user(&frame->uc, &frame->puc);
err |= copy_siginfo_to_user(&frame->info, info);
if (err)
- goto give_sigsegv;
+ return -EFAULT;
/* Create the ucontext. */
- err |= __put_user(0, &frame->uc.uc_flags);
+ if (cpu_has_xsave)
+ err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ err |= __put_user(0, &frame->uc.uc_flags);
err |= __put_user(0, &frame->uc.uc_link);
err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
err |= __put_user(sas_ss_flags(regs->sp),
&frame->uc.uc_stack.ss_flags);
err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
if (err)
- goto give_sigsegv;
+ return -EFAULT;
/* Set up to return from userspace. */
restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -477,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
if (err)
- goto give_sigsegv;
+ return -EFAULT;
/* Set up registers for signal handler */
regs->sp = (unsigned long)frame;
regs->ip = (unsigned long)ka->sa.sa_handler;
- regs->ax = (unsigned long)usig;
+ regs->ax = (unsigned long)sig;
regs->dx = (unsigned long)&frame->info;
regs->cx = (unsigned long)&frame->uc;
@@ -492,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->cs = __USER_CS;
return 0;
-
-give_sigsegv:
- force_sigsegv(sig, current);
- return -EFAULT;
}
/*
* OK, we're invoking a handler:
*/
+static int signr_convert(int sig)
+{
+ struct thread_info *info = current_thread_info();
+
+ if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
+ return info->exec_domain->signal_invmap[sig];
+ return sig;
+}
+
+#define is_ia32 1
+#define ia32_setup_frame __setup_frame
+#define ia32_setup_rt_frame __setup_rt_frame
+
+static int
+setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs)
+{
+ int usig = signr_convert(sig);
+ int ret;
+
+ /* Set up the stack frame */
+ if (is_ia32) {
+ if (ka->sa.sa_flags & SA_SIGINFO)
+ ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+ else
+ ret = ia32_setup_frame(usig, ka, set, regs);
+ } else
+ ret = __setup_rt_frame(sig, ka, info, set, regs);
+
+ if (ret) {
+ force_sigsegv(sig, current);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
sigset_t *oldset, struct pt_regs *regs)
@@ -508,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
int ret;
/* Are we from a system call? */
- if ((long)regs->orig_ax >= 0) {
+ if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
- switch (regs->ax) {
+ switch (syscall_get_error(current, regs)) {
case -ERESTART_RESTARTBLOCK:
case -ERESTARTNOHAND:
regs->ax = -EINTR;
@@ -537,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
regs->flags &= ~X86_EFLAGS_TF;
- /* Set up the stack frame */
- if (ka->sa.sa_flags & SA_SIGINFO)
- ret = setup_rt_frame(sig, ka, info, oldset, regs);
- else
- ret = setup_frame(sig, ka, oldset, regs);
+ ret = setup_rt_frame(sig, ka, info, oldset, regs);
if (ret)
return ret;
+#ifdef CONFIG_X86_64
+ /*
+ * This has nothing to do with segment registers,
+ * despite the name. This magic affects uaccess.h
+ * macros' behavior. Reset it to the normal setting.
+ */
+ set_fs(USER_DS);
+#endif
+
/*
* Clear the direction flag as per the ABI for function entry.
*/
@@ -558,8 +578,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* handler too.
*/
regs->flags &= ~X86_EFLAGS_TF;
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -568,9 +586,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
+ tracehook_signal_handler(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
+
return 0;
}
+#define NR_restart_syscall __NR_restart_syscall
/*
* Note that 'init' is a special process: it doesn't get signals it doesn't
* want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs)
}
/* Did we come from a system call? */
- if ((long)regs->orig_ax >= 0) {
+ if (syscall_get_nr(current, regs) >= 0) {
/* Restart the system call - no handlers present */
- switch (regs->ax) {
+ switch (syscall_get_error(current, regs)) {
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
@@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs)
break;
case -ERESTART_RESTARTBLOCK:
- regs->ax = __NR_restart_syscall;
+ regs->ax = NR_restart_syscall;
regs->ip -= 2;
break;
}
@@ -657,9 +679,38 @@ static void do_signal(struct pt_regs *regs)
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+ /* notify userspace of pending MCEs */
+ if (thread_info_flags & _TIF_MCE_NOTIFY)
+ mce_notify_user();
+#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
+ if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ }
+
+#ifdef CONFIG_X86_32
clear_thread_flag(TIF_IRET);
+#endif /* CONFIG_X86_32 */
+}
+
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+{
+ struct task_struct *me = current;
+
+ if (show_unhandled_signals && printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+ me->comm, me->pid, where, frame,
+ regs->ip, regs->sp, regs->orig_ax);
+ print_vma_addr(" in ", regs->ip);
+ printk(KERN_CONT "\n");
+ }
+
+ force_sig(SIGSEGV, me);
}
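handle_signal()/do_signal() above key the restart decision off the syscall error code, now read via syscall_get_error(). A standalone sketch of that decision table; the constants mirror the kernel's errno-extension values, but the fixup is simplified (the real code rewinds regs->ip and restores regs->orig_ax rather than returning a sentinel).

#include <stdio.h>

/* errno extensions used for restart bookkeeping (values as in the kernel) */
#define EINTR			4
#define ERESTARTSYS		512
#define ERESTARTNOINTR		513
#define ERESTARTNOHAND		514
#define ERESTART_RESTARTBLOCK	516

/* simplified: returning 0 means "rewind ip and retry the syscall" */
static long fixup_return(long ax, int sa_restart)
{
	switch (ax) {
	case -ERESTART_RESTARTBLOCK:
	case -ERESTARTNOHAND:
		return -EINTR;		/* handler present: fail with EINTR */
	case -ERESTARTSYS:
		if (!sa_restart)
			return -EINTR;	/* only SA_RESTART handlers restart */
		/* fall through */
	case -ERESTARTNOINTR:
		return 0;		/* restart transparently */
	default:
		return ax;		/* ordinary return value */
	}
}

int main(void)
{
	printf("%ld\n", fixup_return(-ERESTARTSYS, 0));	/* -4 (-EINTR) */
	printf("%ld\n", fixup_return(-ERESTARTSYS, 1));	/* 0 (restart) */
	return 0;
}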
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index ca316b5b742..a5c9627f4db 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,21 @@
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/ptrace.h>
+#include <linux/tracehook.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/compiler.h>
+#include <linux/uaccess.h>
+
#include <asm/processor.h>
#include <asm/ucontext.h>
-#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
#include <asm/mce.h>
+#include <asm/syscall.h>
+#include <asm/syscalls.h>
#include "sigframe.h"
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,11 +45,6 @@
# define FIX_EFLAGS __FIX_EFLAGS
#endif
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs * regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs * regs);
-
asmlinkage long
sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
struct pt_regs *regs)
@@ -53,67 +52,14 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
return do_sigaltstack(uss, uoss, regs->sp);
}
-/*
- * Signal frame handlers.
- */
-
-static inline int save_i387(struct _fpstate __user *buf)
-{
- struct task_struct *tsk = current;
- int err = 0;
-
- BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
- sizeof(tsk->thread.xstate->fxsave));
-
- if ((unsigned long)buf % 16)
- printk("save_i387: bad fpstate %p\n", buf);
-
- if (!used_math())
- return 0;
- clear_used_math(); /* trigger finit */
- if (task_thread_info(tsk)->status & TS_USEDFPU) {
- err = save_i387_checking((struct i387_fxsave_struct __user *)
- buf);
- if (err)
- return err;
- task_thread_info(tsk)->status &= ~TS_USEDFPU;
- stts();
- } else {
- if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
- sizeof(struct i387_fxsave_struct)))
- return -1;
- }
- return 1;
+#define COPY(x) { \
+ err |= __get_user(regs->x, &sc->x); \
}
-/*
- * This restores directly out of user space. Exceptions are handled.
- */
-static inline int restore_i387(struct _fpstate __user *buf)
-{
- struct task_struct *tsk = current;
- int err;
-
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
- }
-
- if (!(task_thread_info(current)->status & TS_USEDFPU)) {
- clts();
- task_thread_info(current)->status |= TS_USEDFPU;
- }
- err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
- if (unlikely(err)) {
- /*
- * Encountered an error while doing the restore from the
- * user buffer, clear the fpu state.
- */
- clear_fpu(tsk);
- clear_used_math();
- }
- return err;
+#define COPY_SEG_STRICT(seg) { \
+ unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->seg = tmp | 3; \
}
/*
@@ -123,13 +69,13 @@ static int
restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
unsigned long *pax)
{
+ void __user *buf;
+ unsigned int tmpflags;
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
-#define COPY(x) err |= __get_user(regs->x, &sc->x)
-
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
COPY(dx); COPY(cx); COPY(ip);
COPY(r8);
@@ -144,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
/* Kernel saves and restores only the CS segment register on signals,
* which is the bare minimum needed to allow mixed 32/64-bit code.
* App's signal handler can save/restore other segments if needed. */
- {
- unsigned cs;
- err |= __get_user(cs, &sc->cs);
- regs->cs = cs | 3; /* Force into user mode */
- }
+ COPY_SEG_STRICT(cs);
- {
- unsigned int tmpflags;
- err |= __get_user(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
- regs->orig_ax = -1; /* disable syscall checks */
- }
+ err |= __get_user(tmpflags, &sc->flags);
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->orig_ax = -1; /* disable syscall checks */
- {
- struct _fpstate __user * buf;
- err |= __get_user(buf, &sc->fpstate);
-
- if (buf) {
- if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
- goto badframe;
- err |= restore_i387(buf);
- } else {
- struct task_struct *me = current;
- if (used_math()) {
- clear_fpu(me);
- clear_used_math();
- }
- }
- }
+ err |= __get_user(buf, &sc->fpstate);
+ err |= restore_i387_xstate(buf);
err |= __get_user(*pax, &sc->ax);
return err;
-
-badframe:
- return 1;
}
-asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+static long do_rt_sigreturn(struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
- sigset_t set;
unsigned long ax;
+ sigset_t set;
frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -198,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
-
+
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
@@ -208,16 +130,22 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
return ax;
badframe:
- signal_fault(regs,frame,"sigreturn");
+ signal_fault(regs, frame, "rt_sigreturn");
return 0;
-}
+}
+
+asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+{
+ return do_rt_sigreturn(regs);
+}
/*
* Set up a signal frame.
*/
static inline int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
+ unsigned long mask, struct task_struct *me)
{
int err = 0;
@@ -269,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
sp = current->sas_ss_sp + current->sas_ss_size;
}
- return (void __user *)round_down(sp - size, 16);
+ return (void __user *)round_down(sp - size, 64);
}
-static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs * regs)
+static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
- struct _fpstate __user *fp = NULL;
+ void __user *fp = NULL;
int err = 0;
struct task_struct *me = current;
if (used_math()) {
- fp = get_stack(ka, regs, sizeof(struct _fpstate));
+ fp = get_stack(ka, regs, sig_xstate_size);
frame = (void __user *)round_down(
(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
- if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
- goto give_sigsegv;
-
- if (save_i387(fp) < 0)
- err |= -1;
+ if (save_i387_xstate(fp) < 0)
+ return -EFAULT;
} else
frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- goto give_sigsegv;
+ return -EFAULT;
- if (ka->sa.sa_flags & SA_SIGINFO) {
- err |= copy_siginfo_to_user(&frame->info, info);
- if (err)
- goto give_sigsegv;
+ if (ka->sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user(&frame->info, info))
+ return -EFAULT;
}
-
+
/* Create the ucontext. */
- err |= __put_user(0, &frame->uc.uc_flags);
+ if (cpu_has_xsave)
+ err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ err |= __put_user(0, &frame->uc.uc_flags);
err |= __put_user(0, &frame->uc.uc_link);
err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
err |= __put_user(sas_ss_flags(regs->sp),
@@ -311,9 +238,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
- if (sizeof(*set) == 16) {
+ if (sizeof(*set) == 16) {
__put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
- __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
+ __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
} else
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
@@ -324,15 +251,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
} else {
/* could use a vstub here */
- goto give_sigsegv;
+ return -EFAULT;
}
if (err)
- goto give_sigsegv;
+ return -EFAULT;
/* Set up registers for signal handler */
regs->di = sig;
- /* In case the signal handler was declared without prototypes */
+ /* In case the signal handler was declared without prototypes */
regs->ax = 0;
/* This also works for non SA_SIGINFO handlers because they expect the
@@ -348,44 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->cs = __USER_CS;
return 0;
-
-give_sigsegv:
- force_sigsegv(sig, current);
- return -EFAULT;
}
/*
- * Return -1L or the syscall number that @regs is executing.
+ * OK, we're invoking a handler
*/
-static long current_syscall(struct pt_regs *regs)
+static int signr_convert(int sig)
{
- /*
- * We always sign-extend a -1 value being set here,
- * so this is always either -1L or a syscall number.
- */
- return regs->orig_ax;
+ return sig;
}
-/*
- * Return a value that is -EFOO if the system call in @regs->orig_ax
- * returned an error. This only works for @regs from @current.
- */
-static long current_syscall_ret(struct pt_regs *regs)
-{
#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
- /*
- * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
- * and will match correctly in comparisons.
- */
- return (int) regs->ax;
+#define is_ia32 test_thread_flag(TIF_IA32)
+#else
+#define is_ia32 0
#endif
- return regs->ax;
-}
-/*
- * OK, we're invoking a handler
- */
+static int
+setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs)
+{
+ int usig = signr_convert(sig);
+ int ret;
+
+ /* Set up the stack frame */
+ if (is_ia32) {
+ if (ka->sa.sa_flags & SA_SIGINFO)
+ ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+ else
+ ret = ia32_setup_frame(usig, ka, set, regs);
+ } else
+ ret = __setup_rt_frame(sig, ka, info, set, regs);
+
+ if (ret) {
+ force_sigsegv(sig, current);
+ return -EFAULT;
+ }
+
+ return ret;
+}
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -394,9 +322,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
int ret;
/* Are we from a system call? */
- if (current_syscall(regs) >= 0) {
+ if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
- switch (current_syscall_ret(regs)) {
+ switch (syscall_get_error(current, regs)) {
case -ERESTART_RESTARTBLOCK:
case -ERESTARTNOHAND:
regs->ax = -EINTR;
@@ -423,50 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
regs->flags &= ~X86_EFLAGS_TF;
-#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32)) {
- if (ka->sa.sa_flags & SA_SIGINFO)
- ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
- else
- ret = ia32_setup_frame(sig, ka, oldset, regs);
- } else
-#endif
ret = setup_rt_frame(sig, ka, info, oldset, regs);
- if (ret == 0) {
- /*
- * This has nothing to do with segment registers,
- * despite the name. This magic affects uaccess.h
- * macros' behavior. Reset it to the normal setting.
- */
- set_fs(USER_DS);
+ if (ret)
+ return ret;
- /*
- * Clear the direction flag as per the ABI for function entry.
- */
- regs->flags &= ~X86_EFLAGS_DF;
+#ifdef CONFIG_X86_64
+ /*
+ * This has nothing to do with segment registers,
+ * despite the name. This magic affects uaccess.h
+ * macros' behavior. Reset it to the normal setting.
+ */
+ set_fs(USER_DS);
+#endif
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~X86_EFLAGS_TF;
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
-
- spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked,sig);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- }
+ /*
+ * Clear the direction flag as per the ABI for function entry.
+ */
+ regs->flags &= ~X86_EFLAGS_DF;
- return ret;
+ /*
+ * Clear TF when entering the signal handler, but
+ * notify any tracer that was single-stepping it.
+ * The tracer may want to single-step inside the
+ * handler too.
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
+
+ spin_lock_irq(&current->sighand->siglock);
+ sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+ if (!(ka->sa.sa_flags & SA_NODEFER))
+ sigaddset(&current->blocked, sig);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ tracehook_signal_handler(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
+
+ return 0;
}
+#define NR_restart_syscall \
+	(test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall)
/*
* Note that 'init' is a special process: it doesn't get signals it doesn't
* want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -496,7 +422,8 @@ static void do_signal(struct pt_regs *regs)
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
- /* Re-enable any watchpoints before delivering the
+ /*
+ * Re-enable any watchpoints before delivering the
* signal to user space. The processor register will
* have been cleared if the watchpoint triggered
* inside the kernel.
@@ -504,7 +431,7 @@ static void do_signal(struct pt_regs *regs)
if (current->thread.debugreg7)
set_debugreg(current->thread.debugreg7, 7);
- /* Whee! Actually deliver the signal. */
+ /* Whee! Actually deliver the signal. */
if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
/*
* A signal was successfully delivered; the saved
@@ -518,19 +445,18 @@ static void do_signal(struct pt_regs *regs)
}
/* Did we come from a system call? */
- if (current_syscall(regs) >= 0) {
+ if (syscall_get_nr(current, regs) >= 0) {
/* Restart the system call - no handlers present */
- switch (current_syscall_ret(regs)) {
+ switch (syscall_get_error(current, regs)) {
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
regs->ax = regs->orig_ax;
regs->ip -= 2;
break;
+
case -ERESTART_RESTARTBLOCK:
- regs->ax = test_thread_flag(TIF_IA32) ?
- __NR_ia32_restart_syscall :
- __NR_restart_syscall;
+ regs->ax = NR_restart_syscall;
regs->ip -= 2;
break;
}
@@ -546,29 +472,45 @@ static void do_signal(struct pt_regs *regs)
}
}
-void do_notify_resume(struct pt_regs *regs, void *unused,
- __u32 thread_info_flags)
+/*
+ * notification of userspace execution resumption
+ * - triggered by the TIF_WORK_MASK flags
+ */
+void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
-#ifdef CONFIG_X86_MCE
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
mce_notify_user();
-#endif /* CONFIG_X86_MCE */
+#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
+
+ if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ }
+
+#ifdef CONFIG_X86_32
+ clear_thread_flag(TIF_IRET);
+#endif /* CONFIG_X86_32 */
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
-{
- struct task_struct *me = current;
+{
+ struct task_struct *me = current;
+
if (show_unhandled_signals && printk_ratelimit()) {
- printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
- me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
+ printk(KERN_INFO
+ "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+ me->comm, me->pid, where, frame,
+ regs->ip, regs->sp, regs->orig_ax);
print_vma_addr(" in ", regs->ip);
- printk("\n");
+ printk(KERN_CONT "\n");
}
- force_sig(SIGSEGV, me);
-}
+ force_sig(SIGSEGV, me);
+}
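get_stack() above now rounds down to 64 bytes so the xsave image is properly aligned, and __setup_rt_frame() then carves the rt_sigframe below it at 16-byte alignment minus the 8-byte return-address slot. A sketch of that stack carving with hypothetical values; sig_xstate_size and the frame size here are stand-ins.

#include <stdio.h>

/* kernel-style round_down for power-of-two alignments */
#define round_down(x, y) ((x) & ~((unsigned long)(y) - 1))

int main(void)
{
	unsigned long sp = 0x7fffffffe123UL;	/* hypothetical user sp */
	unsigned long sig_xstate_size = 832;	/* stand-in value */
	unsigned long frame_size = 440;		/* stand-in rt_sigframe size */
	unsigned long fp, frame;

	fp = round_down(sp - sig_xstate_size, 64);	/* xsave area */
	frame = round_down(fp - frame_size, 16) - 8;	/* frame below it */

	printf("fp=%#lx frame=%#lx\n", fp, frame);
	return 0;
}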
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 361b7a4c640..18f9b19f5f8 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
struct smp_ops smp_ops = {
.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
.smp_prepare_cpus = native_smp_prepare_cpus,
- .cpu_up = native_cpu_up,
.smp_cpus_done = native_smp_cpus_done,
.smp_send_stop = native_smp_send_stop,
.smp_send_reschedule = native_smp_send_reschedule,
+ .cpu_up = native_cpu_up,
+ .cpu_die = native_cpu_die,
+ .cpu_disable = native_cpu_disable,
+ .play_dead = native_play_dead,
+
.send_call_func_ipi = native_send_call_func_ipi,
.send_call_func_single_ipi = native_send_call_func_single_ipi,
};
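The smp.c hunk above turns the CPU-hotplug entry points into smp_ops function pointers, so a paravirt backend can override the native_* implementations. A minimal model of that ops-table pattern; the bodies here are stubs, not the kernel functions.

#include <stdio.h>

struct smp_ops {
	int  (*cpu_up)(unsigned int cpu);
	int  (*cpu_disable)(void);
	void (*cpu_die)(unsigned int cpu);
	void (*play_dead)(void);
};

static int  native_cpu_up(unsigned int cpu) { printf("up %u\n", cpu); return 0; }
static int  native_cpu_disable(void)        { return 0; }
static void native_cpu_die(unsigned int c)  { printf("die %u\n", c); }
static void native_play_dead(void)          { /* halt loop in the kernel */ }

static struct smp_ops smp_ops = {
	.cpu_up      = native_cpu_up,
	.cpu_disable = native_cpu_disable,
	.cpu_die     = native_cpu_die,
	.play_dead   = native_play_dead,
};

int main(void)
{
	smp_ops.cpu_up(1);	/* a hypervisor backend could have swapped this */
	smp_ops.cpu_die(1);
	return 0;
}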
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7985c5b3f91..7b109339731 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
+#include <asm/idle.h>
#include <asm/smp.h>
#include <asm/trampoline.h>
#include <asm/cpu.h>
@@ -88,7 +89,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
#else
-struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata;
#define get_idle_for_cpu(x) (idle_thread_array[(x)])
#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
#endif
@@ -123,13 +124,12 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
static atomic_t init_deasserted;
-static int boot_cpu_logical_apicid;
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;
/* Set if we find a B stepping CPU */
-int __cpuinitdata smp_b_stepping;
+static int __cpuinitdata smp_b_stepping;
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
@@ -165,6 +165,8 @@ static void unmap_cpu_to_node(int cpu)
#endif
#ifdef CONFIG_X86_32
+static int boot_cpu_logical_apicid;
+
u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
{ [0 ... NR_CPUS-1] = BAD_APICID };
@@ -210,7 +212,7 @@ static void __cpuinit smp_callin(void)
/*
* (This works even if the APIC is not enabled.)
*/
- phys_id = GET_APIC_ID(read_apic_id());
+ phys_id = read_apic_id();
cpuid = smp_processor_id();
if (cpu_isset(cpuid, cpu_callin_map)) {
panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
@@ -257,6 +259,7 @@ static void __cpuinit smp_callin(void)
end_local_APIC_setup();
map_cpu_to_logical_apicid();
+ notify_cpu_starting(cpuid);
/*
* Get our bogomips.
*
@@ -279,6 +282,8 @@ static void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
+static int __cpuinitdata unsafe_smp;
+
/*
* Activate a secondary processor.
*/
@@ -331,14 +336,17 @@ static void __cpuinit start_secondary(void *unused)
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
*/
- ipi_call_lock_irq();
+ ipi_call_lock();
lock_vector_lock();
__setup_vector_irq(smp_processor_id());
cpu_set(smp_processor_id(), cpu_online_map);
unlock_vector_lock();
- ipi_call_unlock_irq();
+ ipi_call_unlock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ /* enable local interrupts */
+ local_irq_enable();
+
setup_secondary_clock();
wmb();
@@ -391,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
goto valid_k7;
/* If we get here, not a certified SMP capable AMD system. */
- add_taint(TAINT_UNSAFE_SMP);
+ unsafe_smp = 1;
}
valid_k7:
@@ -408,12 +416,10 @@ static void __cpuinit smp_checks(void)
* Don't taint if we are running SMP kernel on a single non-MP
* approved Athlon
*/
- if (tainted & TAINT_UNSAFE_SMP) {
- if (num_online_cpus())
- printk(KERN_INFO "WARNING: This combination of AMD"
- "processors is not suitable for SMP.\n");
- else
- tainted &= ~TAINT_UNSAFE_SMP;
+ if (unsafe_smp && num_online_cpus() > 1) {
+		printk(KERN_INFO "WARNING: This combination of AMD "
+ "processors is not suitable for SMP.\n");
+ add_taint(TAINT_UNSAFE_SMP);
}
}
@@ -537,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid)
int timeout;
u32 status;
- printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
+ printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
for (i = 0; i < ARRAY_SIZE(regs); i++) {
- printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
+ printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
/*
* Wait for idle.
@@ -550,8 +556,7 @@ static inline void __inquire_remote_apic(int apicid)
printk(KERN_CONT
"a previous APIC delivery may have failed\n");
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
- apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
+ apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
timeout = 0;
do {
@@ -583,11 +588,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
int maxlvt;
/* Target chip */
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
-
/* Boot on the stack */
/* Kick the second */
- apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+ apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -596,10 +599,12 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
- maxlvt = lapic_get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- accept_status = (apic_read(APIC_ESR) & 0xEF);
+ if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ }
pr_debug("NMI sent.\n");
if (send_status)
@@ -640,13 +645,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
/*
* Turn INIT on target chip
*/
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
/*
* Send IPI
*/
- apic_write(APIC_ICR,
- APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
+ phys_apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -656,10 +659,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
pr_debug("Deasserting INIT.\n");
/* Target chip */
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
/* Send IPI */
- apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -702,11 +703,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
*/
/* Target chip */
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
/* Boot on the stack */
/* Kick the second */
- apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
+ apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
+ phys_apicid);
/*
* Give the other CPU some time to accept the IPI.
@@ -874,7 +874,7 @@ do_rest:
start_ip = setup_trampoline();
/* So we see what's up */
- printk(KERN_INFO "Booting processor %d/%d ip %lx\n",
+ printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n",
cpu, apicid, start_ip);
/*
@@ -893,9 +893,11 @@ do_rest:
smpboot_setup_warm_reset_vector(start_ip);
/*
* Be paranoid about clearing APIC errors.
- */
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
+ */
+ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
}
/*
@@ -1175,10 +1177,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
* Setup boot CPU information
*/
smp_store_cpu_info(0); /* Final full version of the data */
+#ifdef CONFIG_X86_32
boot_cpu_logical_apicid = logical_smp_processor_id();
+#endif
current_thread_info()->cpu = 0; /* needed? */
set_cpu_sibling_map(0);
+#ifdef CONFIG_X86_64
+ enable_IR_x2apic();
+ setup_apic_routing();
+#endif
+
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
disable_smp();
@@ -1186,9 +1195,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
}
preempt_disable();
- if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) {
+ if (read_apic_id() != boot_cpu_physical_apicid) {
panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
- GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid);
+ read_apic_id(), boot_cpu_physical_apicid);
/* Or can we switch back to PIC here? */
}
preempt_enable();
@@ -1254,39 +1263,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
check_nmi_watchdog();
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void remove_siblinginfo(int cpu)
-{
- int sibling;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
- cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
- /*/
- * last thread sibling in this cpu core going down
- */
- if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
- cpu_data(sibling).booted_cores--;
- }
-
- for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
- cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- cpus_clear(per_cpu(cpu_core_map, cpu));
- c->phys_proc_id = 0;
- c->cpu_core_id = 0;
- cpu_clear(cpu, cpu_sibling_setup_map);
-}
-
-static int additional_cpus __initdata = -1;
-
-static __init int setup_additional_cpus(char *s)
-{
- return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
-}
-early_param("additional_cpus", setup_additional_cpus);
-
/*
* cpu_possible_map should be static, it cannot change as cpu's
* are onlined, or offlined. The reason is per-cpu data-structures
@@ -1306,24 +1282,13 @@ early_param("additional_cpus", setup_additional_cpus);
*/
__init void prefill_possible_map(void)
{
- int i;
- int possible;
+ int i, possible;
/* no processor from mptable or madt */
if (!num_processors)
num_processors = 1;
-#ifdef CONFIG_HOTPLUG_CPU
- if (additional_cpus == -1) {
- if (disabled_cpus > 0)
- additional_cpus = disabled_cpus;
- else
- additional_cpus = 0;
- }
-#else
- additional_cpus = 0;
-#endif
- possible = num_processors + additional_cpus;
+ possible = num_processors + disabled_cpus;
if (possible > NR_CPUS)
possible = NR_CPUS;
@@ -1336,6 +1301,31 @@ __init void prefill_possible_map(void)
nr_cpu_ids = possible;
}
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void remove_siblinginfo(int cpu)
+{
+ int sibling;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
+ cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
+	/*
+ * last thread sibling in this cpu core going down
+ */
+ if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
+ cpu_data(sibling).booted_cores--;
+ }
+
+ for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
+ cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
+ cpus_clear(per_cpu(cpu_sibling_map, cpu));
+ cpus_clear(per_cpu(cpu_core_map, cpu));
+ c->phys_proc_id = 0;
+ c->cpu_core_id = 0;
+ cpu_clear(cpu, cpu_sibling_setup_map);
+}
+
static void __ref remove_cpu_from_maps(int cpu)
{
cpu_clear(cpu, cpu_online_map);
@@ -1346,25 +1336,9 @@ static void __ref remove_cpu_from_maps(int cpu)
numa_remove_cpu(cpu);
}
-int __cpu_disable(void)
+void cpu_disable_common(void)
{
int cpu = smp_processor_id();
-
- /*
- * Perhaps use cpufreq to drop frequency, but that could go
- * into generic code.
- *
- * We won't take down the boot processor on i386 due to some
- * interrupts only being able to be serviced by the BSP.
- * Especially so if we're not using an IOAPIC -zwane
- */
- if (cpu == 0)
- return -EBUSY;
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- stop_apic_nmi_watchdog(NULL);
- clear_local_APIC();
-
/*
* HACK:
* Allow any queued timer interrupts to get serviced
@@ -1382,10 +1356,32 @@ int __cpu_disable(void)
remove_cpu_from_maps(cpu);
unlock_vector_lock();
fixup_irqs(cpu_online_map);
+}
+
+int native_cpu_disable(void)
+{
+ int cpu = smp_processor_id();
+
+ /*
+ * Perhaps use cpufreq to drop frequency, but that could go
+ * into generic code.
+ *
+ * We won't take down the boot processor on i386 due to some
+ * interrupts only being able to be serviced by the BSP.
+ * Especially so if we're not using an IOAPIC -zwane
+ */
+ if (cpu == 0)
+ return -EBUSY;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ stop_apic_nmi_watchdog(NULL);
+ clear_local_APIC();
+
+ cpu_disable_common();
return 0;
}
-void __cpu_die(unsigned int cpu)
+void native_cpu_die(unsigned int cpu)
{
/* We don't do anything here: idle task is faking death itself. */
unsigned int i;
@@ -1402,15 +1398,45 @@ void __cpu_die(unsigned int cpu)
}
printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
+
+void play_dead_common(void)
+{
+ idle_task_exit();
+ reset_lazy_tlbstate();
+ irq_ctx_exit(raw_smp_processor_id());
+ c1e_remove_cpu(raw_smp_processor_id());
+
+ mb();
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ /*
+ * With physical CPU hotplug, we should halt the cpu
+ */
+ local_irq_disable();
+}
+
+void native_play_dead(void)
+{
+ play_dead_common();
+ wbinvd_halt();
+}
+
#else /* ... !CONFIG_HOTPLUG_CPU */
-int __cpu_disable(void)
+int native_cpu_disable(void)
{
return -ENOSYS;
}
-void __cpu_die(unsigned int cpu)
+void native_cpu_die(unsigned int cpu)
{
/* We said "no" in __cpu_disable */
BUG();
}
+
+void native_play_dead(void)
+{
+ BUG();
+}
+
#endif
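Several hunks above collapse the old ICR2-then-ICR write pair into apic_icr_write(value, destination), giving x2APIC a single hook to replace. A simulation of what the flat-APIC implementation plausibly does, with the registers modeled as plain variables and an illustrative INIT encoding.

#include <stdio.h>
#include <stdint.h>

static uint32_t icr_low, icr_high;	/* modeled APIC_ICR / APIC_ICR2 */

#define SET_APIC_DEST_FIELD(x)	((uint32_t)(x) << 24)

/* one call replaces the old ICR2-write + ICR-write sequence */
static void apic_icr_write(uint32_t low, uint32_t dest)
{
	icr_high = SET_APIC_DEST_FIELD(dest);	/* destination first */
	icr_low = low;				/* writing ICR sends the IPI */
}

int main(void)
{
	/* illustrative INIT IPI encoding: level-triggered, asserted */
	apic_icr_write(0x0000C500u, 1);
	printf("ICR2=%#x ICR=%#x\n", (unsigned)icr_high, (unsigned)icr_low);
	return 0;
}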
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index d67ce5f044b..7b987852e87 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,7 +30,7 @@
#include <linux/init.h>
#include <asm/io.h>
#include <asm/bios_ebda.h>
-#include <asm/mach-summit/mach_mpparse.h>
+#include <asm/summit/mpparse.h>
static struct rio_table_hdr *rio_table_hdr __initdata;
static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 7066cb855a6..1884a8d12bf 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -22,6 +22,8 @@
#include <linux/uaccess.h>
#include <linux/unistd.h>
+#include <asm/syscalls.h>
+
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 3b360ef3381..6bc211accf0 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,15 +13,17 @@
#include <linux/utsname.h>
#include <linux/personality.h>
#include <linux/random.h>
+#include <linux/uaccess.h>
-#include <asm/uaccess.h>
#include <asm/ia32.h>
+#include <asm/syscalls.h>
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long off)
+asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long off)
{
long error;
- struct file * file;
+ struct file *file;
error = -EINVAL;
if (off & ~PAGE_MASK)
@@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
unmapped base down for this case. This can give
conflicts with the heap, but we assume that glibc
malloc knows how to fall back to mmap. Give it 1GB
- of playground for now. -AK */
- *begin = 0x40000000;
- *end = 0x80000000;
+ of playground for now. -AK */
+ *begin = 0x40000000;
+ *end = 0x80000000;
if (current->flags & PF_RANDOMIZE) {
new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
if (new_begin)
@@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
}
} else {
*begin = TASK_UNMAPPED_BASE;
- *end = TASK_SIZE;
+ *end = TASK_SIZE;
}
-}
+}
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
unsigned long start_addr;
unsigned long begin, end;
-
+
if (flags & MAP_FIXED)
return addr;
- find_start_end(flags, &begin, &end);
+ find_start_end(flags, &begin, &end);
if (len > end)
return -ENOMEM;
@@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
}
if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
&& len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
+ mm->cached_hole_size = 0;
mm->free_area_cache = begin;
}
addr = mm->free_area_cache;
- if (addr < begin)
- addr = begin;
+ if (addr < begin)
+ addr = begin;
start_addr = addr;
full_search:
@@ -127,7 +129,7 @@ full_search:
return addr;
}
if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
+ mm->cached_hole_size = vma->vm_start - addr;
addr = vma->vm_end;
}
@@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr-len);
if (!vma || addr <= vma->vm_start)
/* remember the address as a hint for next time */
- return (mm->free_area_cache = addr-len);
+ return mm->free_area_cache = addr-len;
}
if (mm->mmap_base < len)
@@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr);
if (!vma || addr+len <= vma->vm_start)
/* remember the address as a hint for next time */
- return (mm->free_area_cache = addr);
+ return mm->free_area_cache = addr;
/* remember the largest hole we saw so far */
if (addr + mm->cached_hole_size < vma->vm_start)
@@ -224,13 +226,13 @@ bottomup:
}
-asmlinkage long sys_uname(struct new_utsname __user * name)
+asmlinkage long sys_uname(struct new_utsname __user *name)
{
int err;
down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof (*name));
+ err = copy_to_user(name, utsname(), sizeof(*name));
up_read(&uts_sem);
- if (personality(current->personality) == PER_LINUX32)
- err |= copy_to_user(&name->machine, "i686", 5);
+ if (personality(current->personality) == PER_LINUX32)
+ err |= copy_to_user(&name->machine, "i686", 5);
return err ? -EFAULT : 0;
}
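find_start_end() above confines MAP_32BIT (and 32-bit task) mappings to a fixed 1GB window. A standalone sketch; the TASK_UNMAPPED_BASE/TASK_SIZE values are illustrative stand-ins, not the real kernel constants, and randomization is omitted.

#include <stdio.h>

#define MAP_32BIT 0x40
#define TASK_UNMAPPED_BASE 0x00007ffff7ff0000UL	/* illustrative */
#define TASK_SIZE          0x00007fffffffe000UL	/* illustrative */

static void find_start_end(unsigned long flags, int ia32,
			   unsigned long *begin, unsigned long *end)
{
	if (ia32 || (flags & MAP_32BIT)) {
		*begin = 0x40000000;	/* 1GB playground for legacy mmap */
		*end   = 0x80000000;
	} else {
		*begin = TASK_UNMAPPED_BASE;
		*end   = TASK_SIZE;
	}
}

int main(void)
{
	unsigned long b, e;

	find_start_end(MAP_32BIT, 0, &b, &e);
	printf("32-bit window: %#lx-%#lx\n", b, e);
	return 0;
}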
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 170d43c1748..de87d600829 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
#define __NO_STUBS
#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_64_UNISTD_H_
+#undef _ASM_X86_UNISTD_64_H
#include <asm/unistd_64.h>
#undef __SYSCALL
#define __SYSCALL(nr, sym) [nr] = sym,
-#undef _ASM_X86_64_UNISTD_H_
+#undef _ASM_X86_UNISTD_64_H
typedef void (*sys_call_ptr_t)(void);
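syscall_64.c builds its table by including <asm/unistd_64.h> twice under two different __SYSCALL definitions, which is why the header guard must be #undef'd between passes (and why the guard rename above matters). A toy reproduction of the trick; SYSCALL_LIST stands in for the real header.

#include <stdio.h>

#define SYSCALL_LIST \
	__SYSCALL(0, sys_read) \
	__SYSCALL(1, sys_write)

/* pass 1: declare every handler */
#define __SYSCALL(nr, sym) static void sym(void);
SYSCALL_LIST
#undef __SYSCALL

/* pass 2: emit designated-initializer table entries */
#define __SYSCALL(nr, sym) [nr] = sym,
typedef void (*sys_call_ptr_t)(void);
static const sys_call_ptr_t sys_call_table[] = { SYSCALL_LIST };
#undef __SYSCALL

static void sys_read(void)  { puts("read"); }
static void sys_write(void) { puts("write"); }

int main(void)
{
	sys_call_table[1]();	/* dispatch by syscall number */
	return 0;
}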
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index ffe3c664afc..77b400f06ea 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -36,6 +36,7 @@
#include <asm/arch_hooks.h>
#include <asm/hpet.h>
#include <asm/time.h>
+#include <asm/timer.h>
#include "do_timer.h"
@@ -46,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs)
unsigned long pc = instruction_pointer(regs);
#ifdef CONFIG_SMP
- if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) &&
- in_lock_functions(pc)) {
+ if (!user_mode_vm(regs) && in_lock_functions(pc)) {
#ifdef CONFIG_FRAME_POINTER
- return *(unsigned long *)(regs->bp + 4);
+ return *(unsigned long *)(regs->bp + sizeof(long));
#else
unsigned long *sp = (unsigned long *)&regs->sp;
@@ -94,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
do_timer_interrupt_hook();
+#ifdef CONFIG_MCA
if (MCA_bus) {
/* The PS/2 uses level-triggered interrupts. You can't
turn them off, nor would you want to (any attempt to
@@ -107,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
u8 irq_v = inb_p( 0x61 ); /* read the current state */
outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
}
+#endif
return IRQ_HANDLED;
}
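profile_pc() above now reads the caller's return address as *(bp + sizeof(long)), which works on both word sizes when frame pointers are present. A user-space illustration of why; it assumes x86 with -fno-omit-frame-pointer, and the inline asm is illustrative.

#include <stdio.h>

/* must not be inlined, or bp would already be main's frame */
__attribute__((noinline)) static void *caller_pc(void)
{
	unsigned long bp;

#if defined(__x86_64__)
	__asm__("movq %%rbp, %0" : "=r"(bp));
#else
	__asm__("movl %%ebp, %0" : "=r"(bp));
#endif
	/* the saved frame pointer lives at *bp; the return address sits
	 * one machine word above it -- hence bp + sizeof(long) */
	return *(void **)(bp + sizeof(long));
}

int main(void)
{
	printf("return address into main: %p\n", caller_pc());
	return 0;
}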
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index e3d49c553af..cb19d650c21 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/time.h>
+#include <linux/mca.h>
#include <asm/i8253.h>
#include <asm/hpet.h>
@@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs)
/* Assume the lock function has either no stack frame or a copy
of flags from PUSHF
Eflags always has bits 22 and up cleared unlike kernel addresses. */
- if (!user_mode(regs) && in_lock_functions(pc)) {
+ if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
+ return *(unsigned long *)(regs->bp + sizeof(long));
+#else
unsigned long *sp = (unsigned long *)regs->sp;
if (sp[0] >> 22)
return sp[0];
if (sp[1] >> 22)
return sp[1];
+#endif
}
return pc;
}
EXPORT_SYMBOL(profile_pc);
-static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
+irqreturn_t timer_interrupt(int irq, void *dev_id)
{
add_pda(irq0_irqs, 1);
global_clock_event->event_handler(global_clock_event);
+#ifdef CONFIG_MCA
+ if (MCA_bus) {
+ u8 irq_v = inb_p(0x61); /* read the current state */
+ outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
+ }
+#endif
+
return IRQ_HANDLED;
}
@@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void)
}
static struct irqaction irq0 = {
- .handler = timer_event_interrupt,
+ .handler = timer_interrupt,
.flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING,
.mask = CPU_MASK_NONE,
.name = "timer"
@@ -111,16 +123,13 @@ void __init hpet_time_init(void)
if (!hpet_enable())
setup_pit_timer();
+ irq0.mask = cpumask_of_cpu(0);
setup_irq(0, &irq0);
}
void __init time_init(void)
{
tsc_init();
- if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
- vgetcpu_mode = VGETCPU_RDTSCP;
- else
- vgetcpu_mode = VGETCPU_LSL;
late_time_init = choose_time_init();
}
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index fec1ecedc9b..e00534b3353 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
+void reset_lazy_tlbstate(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ per_cpu(cpu_tlbstate, cpu).state = 0;
+ per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
+}
+
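reset_lazy_tlbstate() above reinitializes the departing CPU's lazy-TLB bookkeeping so it parks on init_mm; play_dead_common() in the smpboot.c hunk calls it on the offline path. A minimal model of that per-cpu state, with stand-in structures.

#include <stdio.h>

#define NR_CPUS 4

struct mm_struct { const char *name; };
static struct mm_struct init_mm = { "init_mm" };

struct tlb_state {
	struct mm_struct *active_mm;
	int state;			/* 0 = not lazy */
};
static struct tlb_state cpu_tlbstate[NR_CPUS];

static void reset_lazy_tlbstate(int cpu)
{
	cpu_tlbstate[cpu].state = 0;
	cpu_tlbstate[cpu].active_mm = &init_mm;
}

int main(void)
{
	struct mm_struct user_mm = { "user" };

	cpu_tlbstate[1] = (struct tlb_state){ &user_mm, 1 };
	reset_lazy_tlbstate(1);		/* as on the CPU-offline path */
	printf("cpu1 active_mm=%s state=%d\n",
	       cpu_tlbstate[1].active_mm->name, cpu_tlbstate[1].state);
	return 0;
}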
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 8b8c0d6640f..04431f34fd1 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -6,7 +6,7 @@
* This code is released under the GNU General Public License version 2 or
* later.
*/
-#include <linux/mc146818rtc.h>
+#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index ab6bf375a30..6bb7b8579e7 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -10,6 +10,7 @@
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/proto.h>
+#include <asm/syscalls.h>
#include "tls.h"
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c
index 03df8e45e5a..04d242ab016 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps.c
@@ -7,13 +7,11 @@
*/
/*
- * 'Traps.c' handles hardware traps and faults after we have saved some
- * state in 'asm.s'.
+ * Handle hardware traps and faults.
*/
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/spinlock.h>
-#include <linux/highmem.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/utsname.h>
@@ -32,6 +30,8 @@
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/io.h>
#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -46,21 +46,31 @@
#include <linux/edac.h>
#endif
-#include <asm/arch_hooks.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/atomic.h>
#include <asm/system.h>
#include <asm/unwind.h>
+#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
+
+#include <mach_traps.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#include <asm/pda.h>
+#else
+#include <asm/processor-flags.h>
+#include <asm/arch_hooks.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/io.h>
#include <asm/traps.h>
-#include "mach_traps.h"
+#include "cpu/mcheck/mce.h"
DECLARE_BITMAP(used_vectors, NR_VECTORS);
EXPORT_SYMBOL_GPL(used_vectors);
@@ -77,418 +87,104 @@ char ignore_fpu_irq;
*/
gate_desc idt_table[256]
__attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 24;
-static unsigned int code_bytes = 64;
-static int ignore_nmis;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
-#ifdef CONFIG_KALLSYMS
- unsigned long offset = 0;
- unsigned long symsize;
- const char *symname;
- char *modname;
- char *delim = ":";
- char namebuf[KSYM_NAME_LEN];
- char reliab[4] = "";
-
- symname = kallsyms_lookup(address, &symsize, &offset,
- &modname, namebuf);
- if (!symname) {
- printk(" [<%08lx>]\n", address);
- return;
- }
- if (!reliable)
- strcpy(reliab, "? ");
-
- if (!modname)
- modname = delim = "";
- printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
- address, reliab, delim, modname, delim, symname, offset, symsize);
-#else
- printk(" [<%08lx>]\n", address);
-#endif
-}
-
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size)
-{
- void *t = tinfo;
- return p > t && p <= t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + 4) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, bp == 0);
- }
- }
- stack++;
- }
- return bp;
-}
-
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
-{
- if (!task)
- task = current;
-
- if (!stack) {
- unsigned long dummy;
- stack = &dummy;
- if (task != current)
- stack = (unsigned long *)task->thread.sp;
- }
-
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- asm("movl %%ebp, %0" : "=r" (bp) :);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
#endif
- for (;;) {
- struct thread_info *context;
-
- context = (struct thread_info *)
- ((unsigned long)stack & (~(THREAD_SIZE - 1)));
- bp = print_context_stack(context, stack, bp, ops, data);
- /*
- * Should be after the line below, but somewhere
- * in early boot context comes out corrupted and we
- * can't reference it:
- */
- if (ops->stack(data, "IRQ") < 0)
- break;
- stack = (unsigned long *)context->previous_esp;
- if (!stack)
- break;
- touch_nmi_watchdog();
- }
-}
-EXPORT_SYMBOL(dump_trace);
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
- printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
- return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
- printk("%s [<%08lx>] ", (char *)data, addr);
- if (!reliable)
- printk("? ");
- print_symbol("%s\n", addr);
- touch_nmi_watchdog();
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
-};
+static int ignore_nmis;
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
+static inline void conditional_sti(struct pt_regs *regs)
{
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
- printk("%s =======================\n", log_lvl);
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
}
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
+static inline void preempt_conditional_sti(struct pt_regs *regs)
{
- show_trace_log_lvl(task, regs, stack, bp, "");
+ inc_preempt_count();
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
}
-static void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+static inline void preempt_conditional_cli(struct pt_regs *regs)
{
- unsigned long *stack;
- int i;
-
- if (sp == NULL) {
- if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
-
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- if (kstack_end(stack))
- break;
- if (i && ((i % 8) == 0))
- printk("\n%s ", log_lvl);
- printk("%08lx ", *stack++);
- }
- printk("\n%sCall Trace:\n", log_lvl);
-
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_disable();
+ dec_preempt_count();
}
-void show_stack(struct task_struct *task, unsigned long *sp)
+#ifdef CONFIG_X86_32
+static inline void
+die_if_kernel(const char *str, struct pt_regs *regs, long err)
{
- printk(" ");
- show_stack_log_lvl(task, NULL, sp, 0, "");
+ if (!user_mode_vm(regs))
+ die(str, regs, err);
}
/*
- * The architecture-independent dump_stack generator
+ * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
+ * invalid offset set (the LAZY one) and the faulting thread has
+ * a valid I/O bitmap pointer, we copy the I/O bitmap into the TSS,
+ * set the offset field correctly and return 1.
*/
-void dump_stack(void)
+static int lazy_iobitmap_copy(void)
{
- unsigned long bp = 0;
- unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- asm("movl %%ebp, %0" : "=r" (bp):);
-#endif
-
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
-
- show_trace(current, NULL, &stack, bp);
-}
-
-EXPORT_SYMBOL(dump_stack);
-
-void show_registers(struct pt_regs *regs)
-{
- int i;
+ struct thread_struct *thread;
+ struct tss_struct *tss;
+ int cpu;
- print_modules();
- __show_registers(regs, 0);
+ cpu = get_cpu();
+ tss = &per_cpu(init_tss, cpu);
+ thread = &current->thread;
- printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
- TASK_COMM_LEN, current->comm, task_pid_nr(current),
- current_thread_info(), current, task_thread_info(current));
- /*
- * When in-kernel, we also print out the stack and code at the
- * time of the fault..
- */
- if (!user_mode_vm(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-
- printk("\n" KERN_EMERG "Stack: ");
- show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
-
- printk(KERN_EMERG "Code: ");
-
- ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at EIP */
- ip = (u8 *)regs->ip;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
- printk(" Bad EIP value.");
- break;
- }
- if (ip == (u8 *)regs->ip)
- printk("<%02x> ", c);
- else
- printk("%02x ", c);
+ if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
+ thread->io_bitmap_ptr) {
+ memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
+ thread->io_bitmap_max);
+ /*
+ * If the previously set map was extending to higher ports
+ * than the current one, pad extra space with 0xff (no access).
+ */
+ if (thread->io_bitmap_max < tss->io_bitmap_max) {
+ memset((char *) tss->io_bitmap +
+ thread->io_bitmap_max, 0xff,
+ tss->io_bitmap_max - thread->io_bitmap_max);
}
- }
- printk("\n");
-}
-
-int is_valid_bugaddr(unsigned long ip)
-{
- unsigned short ud2;
-
- if (ip < PAGE_OFFSET)
- return 0;
- if (probe_kernel_address((unsigned short *)ip, ud2))
- return 0;
-
- return ud2 == 0x0b0f;
-}
-
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
- unsigned long flags;
-
- oops_enter();
-
- if (die_owner != raw_smp_processor_id()) {
- console_verbose();
- raw_local_irq_save(flags);
- __raw_spin_lock(&die_lock);
- die_owner = smp_processor_id();
- die_nest_count = 0;
- bust_spinlocks(1);
- } else {
- raw_local_irq_save(flags);
- }
- die_nest_count++;
- return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
- bust_spinlocks(0);
- die_owner = -1;
- add_taint(TAINT_DIE);
- __raw_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
-
- if (!regs)
- return;
-
- if (kexec_should_crash(current))
- crash_kexec(regs);
-
- if (in_interrupt())
- panic("Fatal exception in interrupt");
-
- if (panic_on_oops)
- panic("Fatal exception");
-
- oops_exit();
- do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned short ss;
- unsigned long sp;
+ tss->io_bitmap_max = thread->io_bitmap_max;
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+ tss->io_bitmap_owner = thread;
+ put_cpu();
- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
- printk("\n");
- if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
return 1;
-
- show_registers(regs);
- /* Executive summary in case the oops scrolled away */
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- if (user_mode(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
}
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
- return 0;
-}
-
-/*
- * This is gone through when something in the kernel has done something bad
- * and is about to be terminated:
- */
-void die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned long flags = oops_begin();
-
- if (die_nest_count < 3) {
- report_bug(regs->ip, regs);
-
- if (__die(str, regs, err))
- regs = NULL;
- } else {
- printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
- }
-
- oops_end(flags, regs, SIGSEGV);
-}
+ put_cpu();
-static inline void
-die_if_kernel(const char *str, struct pt_regs *regs, long err)
-{
- if (!user_mode_vm(regs))
- die(str, regs, err);
+ return 0;
}
+#endif
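+
+/*
+ * Common trap dispatcher: user-mode traps are turned into a signal
+ * (with siginfo when the caller supplies it), kernel-mode traps try
+ * the exception fixup table and die() if no fixup exists, and on
+ * 32-bit a trap raised in vm86 mode is handed to the vm86 code first.
+ */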
static void __kprobes
-do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs,
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
long error_code, siginfo_t *info)
{
struct task_struct *tsk = current;
+#ifdef CONFIG_X86_32
if (regs->flags & X86_VM_MASK) {
- if (vm86)
+ /*
+ * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
+ * On nmi (interrupt 2), do_trap should not be called.
+ */
+ if (trapnr < 6)
goto vm86_trap;
goto trap_signal;
}
+#endif
if (!user_mode(regs))
goto kernel_trap;
+#ifdef CONFIG_X86_32
trap_signal:
+#endif
/*
* We want error_code and trap_no set for userspace faults and
* kernelspace faults which result in die(), but not
@@ -501,6 +197,18 @@ trap_signal:
tsk->thread.error_code = error_code;
tsk->thread.trap_no = trapnr;
+#ifdef CONFIG_X86_64
+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid, str,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+#endif
+
if (info)
force_sig_info(signr, info, tsk);
else
@@ -515,120 +223,98 @@ kernel_trap:
}
return;
+#ifdef CONFIG_X86_32
vm86_trap:
if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
error_code, trapnr))
goto trap_signal;
return;
+#endif
}
#define DO_ERROR(trapnr, signr, str, name) \
-void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- trace_hardirqs_fixup(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
-}
-
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
-void do_##name(struct pt_regs *regs, long error_code) \
+dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
- siginfo_t info; \
- if (irq) \
- local_irq_enable(); \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
== NOTIFY_STOP) \
return; \
- do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
+ conditional_sti(regs); \
+ do_trap(trapnr, signr, str, regs, error_code, NULL); \
}
-#define DO_VM86_ERROR(trapnr, signr, str, name) \
-void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
-}
-
-#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-void do_##name(struct pt_regs *regs, long error_code) \
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
siginfo_t info; \
info.si_signo = signr; \
info.si_errno = 0; \
info.si_code = sicode; \
info.si_addr = (void __user *)siaddr; \
- trace_hardirqs_fixup(); \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
== NOTIFY_STOP) \
return; \
- do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
+ conditional_sti(regs); \
+ do_trap(trapnr, signr, str, regs, error_code, &info); \
}
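+
+/*
+ * As an example of the expansion: DO_ERROR(4, SIGSEGV, "overflow",
+ * overflow) below generates a dotraplinkage do_overflow() handler
+ * that runs the DIE_TRAP notifier chain, conditionally re-enables
+ * interrupts and calls
+ * do_trap(4, SIGSEGV, "overflow", regs, error_code, NULL).
+ */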
-DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-#ifndef CONFIG_KPROBES
-DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
-#endif
-DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
-DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_ERROR(4, SIGSEGV, "overflow", overflow)
+DO_ERROR(5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+#ifdef CONFIG_X86_32
DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
-DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+#endif
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+
+#ifdef CONFIG_X86_64
+/* Runs on IST stack */
+dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
+{
+ if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
+ 12, SIGBUS) == NOTIFY_STOP)
+ return;
+ preempt_conditional_sti(regs);
+ do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
+ preempt_conditional_cli(regs);
+}
-void __kprobes
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+{
+ static const char str[] = "double fault";
+ struct task_struct *tsk = current;
+
+ /* Return not checked because double fault cannot be ignored */
+ notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 8;
+
+ /* This is always a kernel trap and never fixable (and thus must
+ never return). */
+ for (;;)
+ die(str, regs, error_code);
+}
+#endif
+
+dotraplinkage void __kprobes
do_general_protection(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk;
- struct thread_struct *thread;
- struct tss_struct *tss;
- int cpu;
-
- cpu = get_cpu();
- tss = &per_cpu(init_tss, cpu);
- thread = &current->thread;
- /*
- * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
- * invalid offset set (the LAZY one) and the faulting thread has
- * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
- * and we set the offset field correctly. Then we let the CPU
- * restart the faulting instruction.
- */
- if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
- thread->io_bitmap_ptr) {
- memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
- thread->io_bitmap_max);
- /*
- * If the previously set map was extending to higher ports
- * than the current one, pad extra space with 0xff (no access).
- */
- if (thread->io_bitmap_max < tss->io_bitmap_max) {
- memset((char *) tss->io_bitmap +
- thread->io_bitmap_max, 0xff,
- tss->io_bitmap_max - thread->io_bitmap_max);
- }
- tss->io_bitmap_max = thread->io_bitmap_max;
- tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
- tss->io_bitmap_owner = thread;
- put_cpu();
+ conditional_sti(regs);
+#ifdef CONFIG_X86_32
+ if (lazy_iobitmap_copy()) {
+ /* restart the faulting instruction */
return;
}
- put_cpu();
if (regs->flags & X86_VM_MASK)
goto gp_in_vm86;
+#endif
tsk = current;
if (!user_mode(regs))
@@ -650,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
force_sig(SIGSEGV, tsk);
return;
+#ifdef CONFIG_X86_32
gp_in_vm86:
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
return;
+#endif
gp_in_kernel:
if (fixup_exception(regs))
@@ -690,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
/* Clear and disable the memory parity error line. */
- clear_mem_error(reason);
+ reason = (reason & 0xf) | 4;
+ outb(reason, 0x61);
}
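+
+/*
+ * Port 0x61 is the NMI status/control port: on a read, bit 7 reports
+ * a memory parity/SERR error and bit 6 an I/O channel check (compare
+ * the reason & 0x80 and reason & 0x40 tests in default_do_nmi()
+ * below). Writing the low nibble back with bit 2 set, as done above,
+ * clears and disables the parity error line; io_check_error() does
+ * the same for the IOCK line with bit 3.
+ */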
static notrace __kprobes void
@@ -716,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
static notrace __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
- if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
+ NOTIFY_STOP)
return;
#ifdef CONFIG_MCA
/*
@@ -739,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
}
-static DEFINE_SPINLOCK(nmi_print_lock);
-
-void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
-
- spin_lock(&nmi_print_lock);
- /*
- * We are in trouble anyway, let's at least try
- * to get a message out:
- */
- bust_spinlocks(1);
- printk(KERN_EMERG "%s", str);
- printk(" on CPU%d, ip %08lx, registers:\n",
- smp_processor_id(), regs->ip);
- show_registers(regs);
- if (do_panic)
- panic("Non maskable interrupt");
- console_silent();
- spin_unlock(&nmi_print_lock);
- bust_spinlocks(0);
-
- /*
- * If we are in kernel we are probably nested up pretty bad
- * and might as well get out now while we still can:
- */
- if (!user_mode_vm(regs)) {
- current->thread.trap_no = 2;
- crash_kexec(regs);
- }
-
- do_exit(SIGSEGV);
-}
-
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
@@ -812,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
mem_parity_error(reason, regs);
if (reason & 0x40)
io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
/*
* Reassert NMI in case it became active meanwhile
* as it's edge-triggered:
*/
reassert_nmi();
+#endif
}
-notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
+dotraplinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
{
- int cpu;
-
nmi_enter();
- cpu = smp_processor_id();
-
- ++nmi_count(cpu);
+#ifdef CONFIG_X86_32
+	{
+		int cpu = smp_processor_id();
+
+		++nmi_count(cpu);
+	}
+#else
+ add_pda(__nmi_count, 1);
+#endif
if (!ignore_nmis)
default_do_nmi(regs);
@@ -847,21 +505,44 @@ void restart_nmi(void)
acpi_nmi_enable();
}
-#ifdef CONFIG_KPROBES
-void __kprobes do_int3(struct pt_regs *regs, long error_code)
+/* May run on IST stack. */
+dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
{
- trace_hardirqs_fixup();
-
+#ifdef CONFIG_KPROBES
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
== NOTIFY_STOP)
return;
- /*
- * This is an interrupt gate, because kprobes wants interrupts
- * disabled. Normal trap handlers don't.
- */
- restore_interrupts(regs);
+#else
+ if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
+ == NOTIFY_STOP)
+ return;
+#endif
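+	/*
+	 * With kprobes enabled the breakpoint may belong to a
+	 * registered probe, which claims it through the DIE_INT3
+	 * notifier chain; otherwise it is reported as an ordinary
+	 * DIE_TRAP.
+	 */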
- do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+ preempt_conditional_sti(regs);
+ do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
+ preempt_conditional_cli(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* Help a handler running on the IST stack to switch back to the user
+   stack for scheduling or signal handling. The actual stack switch is
+   done in entry.S. */
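+/* Three cases are handled: the registers already live on the process
+   stack (nothing to do), the exception came from user space (use the
+   task's pt_regs), or the exception came from the kernel with
+   interrupts enabled (make room below the interrupted kernel stack
+   and copy the register frame there). */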
+asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+{
+ struct pt_regs *regs = eregs;
+ /* Did already sync */
+ if (eregs == (struct pt_regs *)eregs->sp)
+ ;
+ /* Exception from user space */
+ else if (user_mode(eregs))
+ regs = task_pt_regs(current);
+ /* Exception from kernel and interrupts are enabled. Move to
+ kernel process stack. */
+ else if (eregs->flags & X86_EFLAGS_IF)
+ regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
+ if (eregs != regs)
+ *regs = *eregs;
+ return regs;
}
#endif
@@ -886,13 +567,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
* about restoring all the debug state, and ptrace doesn't have to
* find every occurrence of the TF bit that could be saved away even
* by user code)
+ *
+ * May run on IST stack.
*/
-void __kprobes do_debug(struct pt_regs *regs, long error_code)
+dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
- unsigned int condition;
-
- trace_hardirqs_fixup();
+ unsigned long condition;
+ int si_code;
get_debugreg(condition, 6);
@@ -905,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
SIGTRAP) == NOTIFY_STOP)
return;
+
/* It's safe to allow irq's after DR6 has been saved */
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
+ preempt_conditional_sti(regs);
/* Mask out spurious debug traps due to lazy DR7 setting */
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -915,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
goto clear_dr7;
}
+#ifdef CONFIG_X86_32
if (regs->flags & X86_VM_MASK)
goto debug_vm86;
+#endif
/* Save debug status register where ptrace can see it */
tsk->thread.debugreg6 = condition;
@@ -926,17 +610,13 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
* kernel space (but re-enable TF when returning to user mode).
*/
if (condition & DR_STEP) {
- /*
- * We already checked v86 mode above, so we can
- * check for kernel mode by just checking the CPL
- * of CS.
- */
if (!user_mode(regs))
goto clear_TF_reenable;
}
+ si_code = get_si_code(condition);
/* Ok, finally something we can handle */
- send_sigtrap(tsk, regs, error_code);
+ send_sigtrap(tsk, regs, error_code, si_code);
/*
* Disable additional traps. They'll be re-enabled when
@@ -944,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
*/
clear_dr7:
set_debugreg(0, 7);
+ preempt_conditional_cli(regs);
return;
+#ifdef CONFIG_X86_32
debug_vm86:
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
+ preempt_conditional_cli(regs);
return;
+#endif
clear_TF_reenable:
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
+ preempt_conditional_cli(regs);
return;
}
+#ifdef CONFIG_X86_64
+static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
+{
+ if (fixup_exception(regs))
+ return 1;
+
+ notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
+ /* Illegal floating point operation in the kernel */
+ current->thread.trap_no = trapnr;
+ die(str, regs, 0);
+ return 0;
+}
+#endif
+
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
@@ -992,7 +691,9 @@ void math_error(void __user *ip)
swd = get_fpu_swd(task);
switch (swd & ~cwd & 0x3f) {
case 0x000: /* No unmasked exception */
+#ifdef CONFIG_X86_32
return;
+#endif
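+		/*
+		 * Without the 32-bit early return this falls through
+		 * to the "Multiple exceptions" default case, keeping
+		 * the old traps_64.c behaviour on 64-bit.
+		 */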
default: /* Multiple exceptions */
break;
case 0x001: /* Invalid Op */
@@ -1020,9 +721,18 @@ void math_error(void __user *ip)
force_sig_info(SIGFPE, &info, task);
}
-void do_coprocessor_error(struct pt_regs *regs, long error_code)
+dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
{
+ conditional_sti(regs);
+
+#ifdef CONFIG_X86_32
ignore_fpu_irq = 1;
+#else
+ if (!user_mode(regs) &&
+ kernel_math_error(regs, "kernel x87 math error", 16))
+ return;
+#endif
+
math_error((void __user *)regs->ip);
}
@@ -1074,8 +784,12 @@ static void simd_math_error(void __user *ip)
force_sig_info(SIGFPE, &info, task);
}
-void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+dotraplinkage void
+do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
+ conditional_sti(regs);
+
+#ifdef CONFIG_X86_32
if (cpu_has_xmm) {
/* Handle SIMD FPU exceptions on PIII+ processors. */
ignore_fpu_irq = 1;
@@ -1094,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
current->thread.error_code = error_code;
die_if_kernel("cache flush denied", regs, error_code);
force_sig(SIGSEGV, current);
+#else
+ if (!user_mode(regs) &&
+ kernel_math_error(regs, "kernel simd math error", 19))
+ return;
+ simd_math_error((void __user *)regs->ip);
+#endif
}
-void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+dotraplinkage void
+do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
{
+ conditional_sti(regs);
#if 0
/* No need to warn about this any longer. */
printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
#endif
}
+#ifdef CONFIG_X86_32
unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
{
struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
@@ -1122,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
return new_kesp;
}
+#else
+asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+{
+}
+
+asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+{
+}
+#endif
/*
* 'math_state_restore()' saves the current math information in the
@@ -1154,14 +886,24 @@ asmlinkage void math_state_restore(void)
}
clts(); /* Allow maths ops (or we recurse) */
+#ifdef CONFIG_X86_32
restore_fpu(tsk);
+#else
+ /*
+ * Paranoid restore: send a SIGSEGV if we fail to restore the state.
+ */
+ if (unlikely(restore_fpu_checking(tsk))) {
+ stts();
+ force_sig(SIGSEGV, tsk);
+ return;
+ }
+#endif
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
tsk->fpu_counter++;
}
EXPORT_SYMBOL_GPL(math_state_restore);
#ifndef CONFIG_MATH_EMULATION
-
asmlinkage void math_emulate(long arg)
{
printk(KERN_EMERG
@@ -1170,12 +912,46 @@ asmlinkage void math_emulate(long arg)
force_sig(SIGFPE, current);
schedule();
}
-
#endif /* CONFIG_MATH_EMULATION */
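+
+/*
+ * Device-not-available (#NM) is raised on the first FPU instruction
+ * executed while CR0.TS is set. On 32-bit, CR0.EM set means there is
+ * no FPU and the instruction is emulated; otherwise the lazily saved
+ * FPU state is restored. math_state_restore() runs before interrupts
+ * are re-enabled so the FPU state cannot change under us.
+ */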
+dotraplinkage void __kprobes
+do_device_not_available(struct pt_regs *regs, long error)
+{
+#ifdef CONFIG_X86_32
+ if (read_cr0() & X86_CR0_EM) {
+ conditional_sti(regs);
+ math_emulate(0);
+ } else {
+ math_state_restore(); /* interrupts still off */
+ conditional_sti(regs);
+ }
+#else
+ math_state_restore();
+#endif
+}
+
+#ifdef CONFIG_X86_32
+dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
+{
+ siginfo_t info;
+ local_irq_enable();
+
+ info.si_signo = SIGILL;
+ info.si_errno = 0;
+ info.si_code = ILL_BADSTK;
+ info.si_addr = 0;
+ if (notify_die(DIE_TRAP, "iret exception",
+ regs, error_code, 32, SIGILL) == NOTIFY_STOP)
+ return;
+ do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
+}
+#endif
+
void __init trap_init(void)
{
+#ifdef CONFIG_X86_32
int i;
+#endif
#ifdef CONFIG_EISA
void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1185,29 +961,40 @@ void __init trap_init(void)
early_iounmap(p, 4);
#endif
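+
+/*
+ * Gate flavours used below: set_intr_gate() installs a DPL-0
+ * interrupt gate (interrupts are disabled on entry), the
+ * set_system_*() variants use DPL 3 so the vector can be raised from
+ * user space (int3, int4, the syscall vectors), set_system_trap_gate()
+ * additionally leaves interrupts enabled, and the *_ist() variants
+ * switch to a dedicated IST stack on 64-bit (DEBUG_STACK, NMI_STACK,
+ * DOUBLEFAULT_STACK, ...).
+ */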
- set_trap_gate(0, &divide_error);
- set_intr_gate(1, &debug);
- set_intr_gate(2, &nmi);
- set_system_intr_gate(3, &int3); /* int3 can be called from all */
- set_system_gate(4, &overflow); /* int4 can be called from all */
- set_trap_gate(5, &bounds);
- set_trap_gate(6, &invalid_op);
- set_trap_gate(7, &device_not_available);
+ set_intr_gate(0, &divide_error);
+ set_intr_gate_ist(1, &debug, DEBUG_STACK);
+ set_intr_gate_ist(2, &nmi, NMI_STACK);
+ /* int3 can be called from all */
+ set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
+ /* int4 can be called from all */
+ set_system_intr_gate(4, &overflow);
+ set_intr_gate(5, &bounds);
+ set_intr_gate(6, &invalid_op);
+ set_intr_gate(7, &device_not_available);
+#ifdef CONFIG_X86_32
set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
- set_trap_gate(9, &coprocessor_segment_overrun);
- set_trap_gate(10, &invalid_TSS);
- set_trap_gate(11, &segment_not_present);
- set_trap_gate(12, &stack_segment);
- set_trap_gate(13, &general_protection);
+#else
+ set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
+#endif
+ set_intr_gate(9, &coprocessor_segment_overrun);
+ set_intr_gate(10, &invalid_TSS);
+ set_intr_gate(11, &segment_not_present);
+ set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
+ set_intr_gate(13, &general_protection);
set_intr_gate(14, &page_fault);
- set_trap_gate(15, &spurious_interrupt_bug);
- set_trap_gate(16, &coprocessor_error);
- set_trap_gate(17, &alignment_check);
+ set_intr_gate(15, &spurious_interrupt_bug);
+ set_intr_gate(16, &coprocessor_error);
+ set_intr_gate(17, &alignment_check);
#ifdef CONFIG_X86_MCE
- set_trap_gate(18, &machine_check);
+ set_intr_gate_ist(18, &machine_check, MCE_STACK);
#endif
- set_trap_gate(19, &simd_coprocessor_error);
+ set_intr_gate(19, &simd_coprocessor_error);
+#ifdef CONFIG_IA32_EMULATION
+ set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+#endif
+
+#ifdef CONFIG_X86_32
if (cpu_has_fxsr) {
printk(KERN_INFO "Enabling fast FPU save and restore... ");
set_in_cr4(X86_CR4_OSFXSR);
@@ -1220,37 +1007,20 @@ void __init trap_init(void)
printk("done.\n");
}
- set_system_gate(SYSCALL_VECTOR, &system_call);
+ set_system_trap_gate(SYSCALL_VECTOR, &system_call);
/* Reserve all the builtin and the syscall vector: */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
set_bit(i, used_vectors);
set_bit(SYSCALL_VECTOR, used_vectors);
-
- init_thread_xstate();
+#endif
/*
* Should be a barrier for any external CPU state:
*/
cpu_init();
+#ifdef CONFIG_X86_32
trap_init_hook();
+#endif
}
-
-static int __init kstack_setup(char *s)
-{
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-
- return 1;
-}
-__setup("kstack=", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
- code_bytes = simple_strtoul(s, NULL, 0);
- if (code_bytes > 8192)
- code_bytes = 8192;
-
- return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
deleted file mode 100644
index 513caaca711..00000000000
--- a/arch/x86/kernel/traps_64.c
+++ /dev/null
@@ -1,1212 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
- *
- * Pentium III FXSR, SSE support
- * Gareth Hughes <gareth@valinux.com>, May 2000
- */
-
-/*
- * 'Traps.c' handles hardware traps and faults after we have saved some
- * state in 'entry.S'.
- */
-#include <linux/moduleparam.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/spinlock.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/utsname.h>
-#include <linux/kdebug.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/unwind.h>
-#include <linux/delay.h>
-#include <linux/errno.h>
-#include <linux/kexec.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/init.h>
-#include <linux/bug.h>
-#include <linux/nmi.h>
-#include <linux/mm.h>
-
-#if defined(CONFIG_EDAC)
-#include <linux/edac.h>
-#endif
-
-#include <asm/stacktrace.h>
-#include <asm/processor.h>
-#include <asm/debugreg.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/unwind.h>
-#include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/io.h>
-#include <asm/pgalloc.h>
-#include <asm/proto.h>
-#include <asm/pda.h>
-#include <asm/traps.h>
-
-#include <mach_traps.h>
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 12;
-static unsigned int code_bytes = 64;
-static int ignore_nmis;
-static int die_counter;
-
-static inline void conditional_sti(struct pt_regs *regs)
-{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
-}
-
-static inline void preempt_conditional_sti(struct pt_regs *regs)
-{
- inc_preempt_count();
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
-}
-
-static inline void preempt_conditional_cli(struct pt_regs *regs)
-{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_disable();
- /* Make sure to not schedule here because we could be running
- on an exception stack. */
- dec_preempt_count();
-}
-
-void printk_address(unsigned long address, int reliable)
-{
- printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
-}
-
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, char **idp)
-{
- static char ids[][8] = {
- [DEBUG_STACK - 1] = "#DB",
- [NMI_STACK - 1] = "NMI",
- [DOUBLEFAULT_STACK - 1] = "#DF",
- [STACKFAULT_STACK - 1] = "#SS",
- [MCE_STACK - 1] = "#MC",
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
-#endif
- };
- unsigned k;
-
- /*
- * Iterate over all exception stacks, and figure out whether
- * 'stack' is in one of them:
- */
- for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- unsigned long end = per_cpu(orig_ist, cpu).ist[k];
- /*
- * Is 'stack' above this exception frame's end?
- * If yes then skip to the next frame.
- */
- if (stack >= end)
- continue;
- /*
- * Is 'stack' above this exception frame's start address?
- * If yes then we found the right frame.
- */
- if (stack >= end - EXCEPTION_STKSZ) {
- /*
- * Make sure we only iterate through an exception
- * stack once. If it comes up for the second time
- * then there's something wrong going on - just
- * break out and return NULL:
- */
- if (*usedp & (1U << k))
- break;
- *usedp |= 1U << k;
- *idp = ids[k];
- return (unsigned long *)end;
- }
- /*
- * If this is a debug stack, and if it has a larger size than
- * the usual exception stacks, then 'stack' might still
- * be within the lower portion of the debug stack:
- */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
- unsigned j = N_EXCEPTION_STACKS - 1;
-
- /*
- * Black magic. A large debug stack is composed of
- * multiple exception stack entries, which we
- * iterate through now. Don't look:
- */
- do {
- ++j;
- end -= EXCEPTION_STKSZ;
- ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
- } while (stack < end - EXCEPTION_STKSZ);
- if (*usedp & (1U << j))
- break;
- *usedp |= 1U << j;
- *idp = ids[j];
- return (unsigned long *)end;
- }
-#endif
- }
- return NULL;
-}
-
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
-
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size, void *end)
-{
- void *t = tinfo;
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + 8) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, bp == 0);
- }
- }
- stack++;
- }
- return bp;
-}
-
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
-{
- const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
- unsigned used = 0;
- struct thread_info *tinfo;
-
- if (!task)
- task = current;
-
- if (!stack) {
- unsigned long dummy;
- stack = &dummy;
- if (task && task != current)
- stack = (unsigned long *)task->thread.sp;
- }
-
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- asm("movq %%rbp, %0" : "=r" (bp) :);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
-
- /*
- * Print function call entries in all stacks, starting at the
- * current stack address. If the stacks consist of nested
- * exceptions
- */
- tinfo = task_thread_info(task);
- for (;;) {
- char *id;
- unsigned long *estack_end;
- estack_end = in_exception_stack(cpu, (unsigned long)stack,
- &used, &id);
-
- if (estack_end) {
- if (ops->stack(data, id) < 0)
- break;
-
- bp = print_context_stack(tinfo, stack, bp, ops,
- data, estack_end);
- ops->stack(data, "<EOE>");
- /*
- * We link to the next stack via the
- * second-to-last pointer (index -2 to end) in the
- * exception stack:
- */
- stack = (unsigned long *) estack_end[-2];
- continue;
- }
- if (irqstack_end) {
- unsigned long *irqstack;
- irqstack = irqstack_end -
- (IRQSTACKSIZE - 64) / sizeof(*irqstack);
-
- if (stack >= irqstack && stack < irqstack_end) {
- if (ops->stack(data, "IRQ") < 0)
- break;
- bp = print_context_stack(tinfo, stack, bp,
- ops, data, irqstack_end);
- /*
- * We link to the next stack (which would be
- * the process stack normally) the last
- * pointer (index -1 to end) in the IRQ stack:
- */
- stack = (unsigned long *) (irqstack_end[-1]);
- irqstack_end = NULL;
- ops->stack(data, "EOI");
- continue;
- }
- }
- break;
- }
-
- /*
- * This handles the process stack:
- */
- bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
- put_cpu();
-}
-EXPORT_SYMBOL(dump_trace);
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- print_symbol(msg, symbol);
- printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
- printk("%s\n", msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
- printk(" <%s> ", name);
- return 0;
-}
-
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
- touch_nmi_watchdog();
- printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
- printk("\nCall Trace:\n");
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
- printk("\n");
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
-{
- unsigned long *stack;
- int i;
- const int cpu = smp_processor_id();
- unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
- unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
-
- // debugging aid: "show_stack(NULL, NULL);" prints the
- // back trace for this cpu.
-
- if (sp == NULL) {
- if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
-
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- if (stack >= irqstack && stack <= irqstack_end) {
- if (stack == irqstack_end) {
- stack = (unsigned long *) (irqstack_end[-1]);
- printk(" <EOI> ");
- }
- } else {
- if (((long) stack & (THREAD_SIZE-1)) == 0)
- break;
- }
- if (i && ((i % 4) == 0))
- printk("\n");
- printk(" %016lx", *stack++);
- touch_nmi_watchdog();
- }
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
-}
-
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
- show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
- unsigned long bp = 0;
- unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- asm("movq %%rbp, %0" : "=r" (bp):);
-#endif
-
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
-}
-
-EXPORT_SYMBOL(dump_stack);
-
-void show_registers(struct pt_regs *regs)
-{
- int i;
- unsigned long sp;
- const int cpu = smp_processor_id();
- struct task_struct *cur = cpu_pda(cpu)->pcurrent;
-
- sp = regs->sp;
- printk("CPU %d ", cpu);
- __show_regs(regs);
- printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
- cur->comm, cur->pid, task_thread_info(cur), cur);
-
- /*
- * When in-kernel, we also print out the stack and code at the
- * time of the fault..
- */
- if (!user_mode(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-
- printk("Stack: ");
- show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
- regs->bp, "");
- printk("\n");
-
- printk(KERN_EMERG "Code: ");
-
- ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at RIP */
- ip = (u8 *)regs->ip;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
- printk(" Bad RIP value.");
- break;
- }
- if (ip == (u8 *)regs->ip)
- printk("<%02x> ", c);
- else
- printk("%02x ", c);
- }
- }
- printk("\n");
-}
-
-int is_valid_bugaddr(unsigned long ip)
-{
- unsigned short ud2;
-
- if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
- return 0;
-
- return ud2 == 0x0b0f;
-}
-
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
- int cpu;
- unsigned long flags;
-
- oops_enter();
-
- /* racy, but better than risking deadlock. */
- raw_local_irq_save(flags);
- cpu = smp_processor_id();
- if (!__raw_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
- /* nested oops. should stop eventually */;
- else
- __raw_spin_lock(&die_lock);
- }
- die_nest_count++;
- die_owner = cpu;
- console_verbose();
- bust_spinlocks(1);
- return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
- die_owner = -1;
- bust_spinlocks(0);
- die_nest_count--;
- if (!die_nest_count)
- /* Nest count reaches zero, release the lock. */
- __raw_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
- if (!regs) {
- oops_exit();
- return;
- }
- if (panic_on_oops)
- panic("Fatal exception");
- oops_exit();
- do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
- printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
- printk("\n");
- if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
- return 1;
-
- show_registers(regs);
- add_taint(TAINT_DIE);
- /* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP ");
- printk_address(regs->ip, 1);
- printk(" RSP <%016lx>\n", regs->sp);
- if (kexec_should_crash(current))
- crash_kexec(regs);
- return 0;
-}
-
-void die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned long flags = oops_begin();
-
- if (!user_mode(regs))
- report_bug(regs->ip, regs);
-
- if (__die(str, regs, err))
- regs = NULL;
- oops_end(flags, regs, SIGSEGV);
-}
-
-notrace __kprobes void
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
- unsigned long flags;
-
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
-
- flags = oops_begin();
- /*
- * We are in trouble anyway, let's at least try
- * to get a message out.
- */
- printk(KERN_EMERG "%s", str);
- printk(" on CPU%d, ip %08lx, registers:\n",
- smp_processor_id(), regs->ip);
- show_registers(regs);
- if (kexec_should_crash(current))
- crash_kexec(regs);
- if (do_panic || panic_on_oops)
- panic("Non maskable interrupt");
- oops_end(flags, NULL, SIGBUS);
- nmi_exit();
- local_irq_enable();
- do_exit(SIGBUS);
-}
-
-static void __kprobes
-do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
- long error_code, siginfo_t *info)
-{
- struct task_struct *tsk = current;
-
- if (!user_mode(regs))
- goto kernel_trap;
-
- /*
- * We want error_code and trap_no set for userspace faults and
- * kernelspace faults which result in die(), but not
- * kernelspace faults which are fixed up. die() gives the
- * process no chance to handle the signal and notice the
- * kernel fault information, so that won't result in polluting
- * the information about previously queued, but not yet
- * delivered, faults. See also do_general_protection below.
- */
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
-
- if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
- printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid, str,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
-
- if (info)
- force_sig_info(signr, info, tsk);
- else
- force_sig(signr, tsk);
- return;
-
-kernel_trap:
- if (!fixup_exception(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
- die(str, regs, error_code);
- }
- return;
-}
-
-#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
-}
-
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- siginfo_t info; \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- trace_hardirqs_fixup(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
-}
-
-DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR(4, SIGSEGV, "overflow", overflow)
-DO_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-
-/* Runs on IST stack */
-asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
-{
- if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
- 12, SIGBUS) == NOTIFY_STOP)
- return;
- preempt_conditional_sti(regs);
- do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
- preempt_conditional_cli(regs);
-}
-
-asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
-{
- static const char str[] = "double fault";
- struct task_struct *tsk = current;
-
- /* Return not checked because double fault cannot be ignored */
- notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 8;
-
- /* This is always a kernel trap and never fixable (and thus must
- never return). */
- for (;;)
- die(str, regs, error_code);
-}
-
-asmlinkage void __kprobes
-do_general_protection(struct pt_regs *regs, long error_code)
-{
- struct task_struct *tsk;
-
- conditional_sti(regs);
-
- tsk = current;
- if (!user_mode(regs))
- goto gp_in_kernel;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
-
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
-
- force_sig(SIGSEGV, tsk);
- return;
-
-gp_in_kernel:
- if (fixup_exception(regs))
- return;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
- if (notify_die(DIE_GPF, "general protection fault", regs,
- error_code, 13, SIGSEGV) == NOTIFY_STOP)
- return;
- die("general protection fault", regs, error_code);
-}
-
-static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs *regs)
-{
- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
- reason);
- printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
-
-#if defined(CONFIG_EDAC)
- if (edac_handler_set()) {
- edac_atomic_assert_error();
- return;
- }
-#endif
-
- if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
-
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
-
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
-}
-
-static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs *regs)
-{
- printk("NMI: IOCK error (debug interrupt?)\n");
- show_registers(regs);
-
- /* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
- mdelay(2000);
- reason &= ~8;
- outb(reason, 0x61);
-}
-
-static notrace __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{
- if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
- return;
- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
- reason);
- printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
-
- if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
-
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
-}
-
-/* Runs on IST stack. This code must keep interrupts off all the time.
- Nested NMIs are prevented by the CPU. */
-asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
-{
- unsigned char reason = 0;
- int cpu;
-
- cpu = smp_processor_id();
-
- /* Only the BSP gets external NMIs from the system. */
- if (!cpu)
- reason = get_nmi_reason();
-
- if (!(reason & 0xc0)) {
- if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
- /*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
- */
- if (nmi_watchdog_tick(regs, reason))
- return;
- if (!do_nmi_callback(regs, cpu))
- unknown_nmi_error(reason, regs);
-
- return;
- }
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
- return;
-
- /* AK: following checks seem to be broken on modern chipsets. FIXME */
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
- io_check_error(reason, regs);
-}
-
-asmlinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
- nmi_enter();
-
- add_pda(__nmi_count, 1);
-
- if (!ignore_nmis)
- default_do_nmi(regs);
-
- nmi_exit();
-}
-
-void stop_nmi(void)
-{
- acpi_nmi_disable();
- ignore_nmis++;
-}
-
-void restart_nmi(void)
-{
- ignore_nmis--;
- acpi_nmi_enable();
-}
-
-/* runs on IST stack. */
-asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
-{
- trace_hardirqs_fixup();
-
- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
- return;
-
- preempt_conditional_sti(regs);
- do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
- preempt_conditional_cli(regs);
-}
-
-/* Help handler running on IST stack to switch back to user stack
- for scheduling or signal handling. The actual stack switch is done in
- entry.S */
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
-{
- struct pt_regs *regs = eregs;
- /* Did already sync */
- if (eregs == (struct pt_regs *)eregs->sp)
- ;
- /* Exception from user space */
- else if (user_mode(eregs))
- regs = task_pt_regs(current);
- /* Exception from kernel and interrupts are enabled. Move to
- kernel process stack. */
- else if (eregs->flags & X86_EFLAGS_IF)
- regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
- if (eregs != regs)
- *regs = *eregs;
- return regs;
-}
-
-/* runs on IST stack. */
-asmlinkage void __kprobes do_debug(struct pt_regs * regs,
- unsigned long error_code)
-{
- struct task_struct *tsk = current;
- unsigned long condition;
- siginfo_t info;
-
- trace_hardirqs_fixup();
-
- get_debugreg(condition, 6);
-
- /*
- * The processor cleared BTF, so don't mark that we need it set.
- */
- clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
- tsk->thread.debugctlmsr = 0;
-
- if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
- SIGTRAP) == NOTIFY_STOP)
- return;
-
- preempt_conditional_sti(regs);
-
- /* Mask out spurious debug traps due to lazy DR7 setting */
- if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
- if (!tsk->thread.debugreg7)
- goto clear_dr7;
- }
-
- tsk->thread.debugreg6 = condition;
-
- /*
- * Single-stepping through TF: make sure we ignore any events in
- * kernel space (but re-enable TF when returning to user mode).
- */
- if (condition & DR_STEP) {
- if (!user_mode(regs))
- goto clear_TF_reenable;
- }
-
- /* Ok, finally something we can handle */
- tsk->thread.trap_no = 1;
- tsk->thread.error_code = error_code;
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_BRKPT;
- info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
- force_sig_info(SIGTRAP, &info, tsk);
-
-clear_dr7:
- set_debugreg(0, 7);
- preempt_conditional_cli(regs);
- return;
-
-clear_TF_reenable:
- set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
- regs->flags &= ~X86_EFLAGS_TF;
- preempt_conditional_cli(regs);
- return;
-}
-
-static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
-{
- if (fixup_exception(regs))
- return 1;
-
- notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
- /* Illegal floating point operation in the kernel */
- current->thread.trap_no = trapnr;
- die(str, regs, 0);
- return 0;
-}
-
-/*
- * Note that we play around with the 'TS' bit in an attempt to get
- * the correct behaviour even in the presence of the asynchronous
- * IRQ13 behaviour
- */
-asmlinkage void do_coprocessor_error(struct pt_regs *regs)
-{
- void __user *ip = (void __user *)(regs->ip);
- struct task_struct *task;
- siginfo_t info;
- unsigned short cwd, swd;
-
- conditional_sti(regs);
- if (!user_mode(regs) &&
- kernel_math_error(regs, "kernel x87 math error", 16))
- return;
-
- /*
- * Save the info for the exception handler and clear the error.
- */
- task = current;
- save_init_fpu(task);
- task->thread.trap_no = 16;
- task->thread.error_code = 0;
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = __SI_FAULT;
- info.si_addr = ip;
- /*
- * (~cwd & swd) will mask out exceptions that are not set to unmasked
- * status. 0x3f is the exception bits in these regs, 0x200 is the
- * C1 reg you need in case of a stack fault, 0x040 is the stack
- * fault bit. We should only be taking one exception at a time,
- * so if this combination doesn't produce any single exception,
- * then we have a bad program that isn't synchronizing its FPU usage
- * and it will suffer the consequences since we won't be able to
- * fully reproduce the context of the exception
- */
- cwd = get_fpu_cwd(task);
- swd = get_fpu_swd(task);
- switch (swd & ~cwd & 0x3f) {
- case 0x000: /* No unmasked exception */
- default: /* Multiple exceptions */
- break;
- case 0x001: /* Invalid Op */
- /*
- * swd & 0x240 == 0x040: Stack Underflow
- * swd & 0x240 == 0x240: Stack Overflow
- * User must clear the SF bit (0x40) if set
- */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
- }
- force_sig_info(SIGFPE, &info, task);
-}
-
-asmlinkage void bad_intr(void)
-{
- printk("bad interrupt");
-}
-
-asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
-{
- void __user *ip = (void __user *)(regs->ip);
- struct task_struct *task;
- siginfo_t info;
- unsigned short mxcsr;
-
- conditional_sti(regs);
- if (!user_mode(regs) &&
- kernel_math_error(regs, "kernel simd math error", 19))
- return;
-
- /*
- * Save the info for the exception handler and clear the error.
- */
- task = current;
- save_init_fpu(task);
- task->thread.trap_no = 19;
- task->thread.error_code = 0;
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = __SI_FAULT;
- info.si_addr = ip;
- /*
- * The SIMD FPU exceptions are handled a little differently, as there
- * is only a single status/control register. Thus, to determine which
- * unmasked exception was caught we must mask the exception mask bits
- * at 0x1f80, and then use these to mask the exception bits at 0x3f.
- */
- mxcsr = get_fpu_mxcsr(task);
- switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
- }
- force_sig_info(SIGFPE, &info, task);
-}
-
-asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
-{
-}
-
-asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
-{
-}
-
-asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
-{
-}
-
-/*
- * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- */
-asmlinkage void math_state_restore(void)
-{
- struct task_struct *me = current;
-
- if (!used_math()) {
- local_irq_enable();
- /*
- * does a slab alloc which can sleep
- */
- if (init_fpu(me)) {
- /*
- * ran out of memory!
- */
- do_group_exit(SIGKILL);
- return;
- }
- local_irq_disable();
- }
-
- clts(); /* Allow maths ops (or we recurse) */
- /*
- * Paranoid restore. send a SIGSEGV if we fail to restore the state.
- */
- if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
- stts();
- force_sig(SIGSEGV, me);
- return;
- }
- task_thread_info(me)->status |= TS_USEDFPU;
- me->fpu_counter++;
-}
-EXPORT_SYMBOL_GPL(math_state_restore);
-
-void __init trap_init(void)
-{
- set_intr_gate(0, &divide_error);
- set_intr_gate_ist(1, &debug, DEBUG_STACK);
- set_intr_gate_ist(2, &nmi, NMI_STACK);
- set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
- set_system_gate(4, &overflow); /* int4 can be called from all */
- set_intr_gate(5, &bounds);
- set_intr_gate(6, &invalid_op);
- set_intr_gate(7, &device_not_available);
- set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
- set_intr_gate(9, &coprocessor_segment_overrun);
- set_intr_gate(10, &invalid_TSS);
- set_intr_gate(11, &segment_not_present);
- set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
- set_intr_gate(13, &general_protection);
- set_intr_gate(14, &page_fault);
- set_intr_gate(15, &spurious_interrupt_bug);
- set_intr_gate(16, &coprocessor_error);
- set_intr_gate(17, &alignment_check);
-#ifdef CONFIG_X86_MCE
- set_intr_gate_ist(18, &machine_check, MCE_STACK);
-#endif
- set_intr_gate(19, &simd_coprocessor_error);
-
-#ifdef CONFIG_IA32_EMULATION
- set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
-#endif
- /*
- * initialize the per thread extended state:
- */
- init_thread_xstate();
- /*
- * Should be a barrier for any external CPU state:
- */
- cpu_init();
-}
-
-static int __init oops_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- if (!strcmp(s, "panic"))
- panic_on_oops = 1;
- return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
- return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
- code_bytes = simple_strtoul(s, NULL, 0);
- if (code_bytes > 8192)
- code_bytes = 8192;
-
- return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
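All three handlers above follow the kernel's boot-parameter convention: early_param()/__setup() register a callback that receives the text after the '=' sign and returns 0 (or 1 for __setup) on success. A minimal sketch of a hypothetical parameter wired up the same way; the name and variable are illustrative, not part of this patch:

/* Sketch only: hypothetical "foo=<n>" boot parameter, parsed the
 * same way as kstack= above. */
static unsigned long foo_value;

static int __init foo_setup(char *s)
{
	if (!s)
		return -EINVAL;
	foo_value = simple_strtoul(s, NULL, 0);
	return 0;
}
early_param("foo", foo_setup);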
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8f98e9de1b8..161bb850fc4 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
/*
* Read TSC and the reference counters. Take care of SMI disturbance
*/
-static u64 tsc_read_refs(u64 *pm, u64 *hpet)
+static u64 tsc_read_refs(u64 *p, int hpet)
{
u64 t1, t2;
int i;
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
for (i = 0; i < MAX_RETRIES; i++) {
t1 = get_cycles();
if (hpet)
- *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+ *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
else
- *pm = acpi_pm_read_early();
+ *p = acpi_pm_read_early();
t2 = get_cycles();
if ((t2 - t1) < SMI_TRESHOLD)
return t2;
@@ -123,13 +123,59 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
}
/*
+ * Calculate the TSC frequency from HPET reference
+ */
+static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
+{
+ u64 tmp;
+
+ if (hpet2 < hpet1)
+ hpet2 += 0x100000000ULL;
+ hpet2 -= hpet1;
+ tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+ do_div(tmp, 1000000);
+ do_div(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
+
+/*
+ * Calculate the TSC frequency from PMTimer reference
+ */
+static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
+{
+ u64 tmp;
+
+ if (!pm1 && !pm2)
+ return ULONG_MAX;
+
+ if (pm2 < pm1)
+ pm2 += (u64)ACPI_PM_OVRRUN;
+ pm2 -= pm1;
+ tmp = pm2 * 1000000000LL;
+ do_div(tmp, PMTMR_TICKS_PER_SEC);
+ do_div(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
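To see why calc_pmtimer_ref() comes out in kHz: the caller passes deltatsc already scaled by 10^6, and tmp ends up as the elapsed time in nanoseconds, so the quotient is TSC ticks * 10^6 / ns = ticks per millisecond = kHz. A runnable check with made-up sample numbers:

#include <stdio.h>

#define PMTMR_TICKS_PER_SEC 3579545ULL	/* ACPI PM timer, 3.579545 MHz */

int main(void)
{
	/* Assumed sample: ~10 ms worth of PM ticks, 20e6 TSC ticks. */
	unsigned long long pm_delta  = 35795;
	unsigned long long tsc_delta = 20000000;

	unsigned long long ns  = pm_delta * 1000000000ULL / PMTMR_TICKS_PER_SEC;
	unsigned long long khz = tsc_delta * 1000000ULL / ns;

	printf("elapsed %llu ns -> %llu kHz\n", ns, khz);	/* ~2 GHz */
	return 0;
}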
+
+#define CAL_MS 10
+#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_PIT_LOOPS 1000
+
+#define CAL2_MS 50
+#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_PIT_LOOPS 5000
+
+
+/*
* Try to calibrate the TSC against the Programmable
* Interrupt Timer and return the frequency of the TSC
* in kHz.
*
* Return ULONG_MAX on failure to calibrate.
*/
-static unsigned long pit_calibrate_tsc(void)
+static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
{
u64 tsc, t1, t2, delta;
unsigned long tscmin, tscmax;
@@ -144,8 +190,8 @@ static unsigned long pit_calibrate_tsc(void)
* (LSB then MSB) to begin countdown.
*/
outb(0xb0, 0x43);
- outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
- outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
+ outb(latch & 0xff, 0x42);
+ outb(latch >> 8, 0x42);
tsc = t1 = t2 = get_cycles();
@@ -166,31 +212,154 @@ static unsigned long pit_calibrate_tsc(void)
/*
* Sanity checks:
*
- * If we were not able to read the PIT more than 5000
+ * If we were not able to read the PIT more than loopmin
* times, then we have been hit by a massive SMI
*
* If the maximum is 10 times larger than the minimum,
* then we got hit by an SMI as well.
*/
- if (pitcnt < 5000 || tscmax > 10 * tscmin)
+ if (pitcnt < loopmin || tscmax > 10 * tscmin)
return ULONG_MAX;
/* Calculate the PIT value */
delta = t2 - t1;
- do_div(delta, 50);
+ do_div(delta, ms);
return delta;
}
+/*
+ * This reads the current MSB of the PIT counter, and
+ * checks if we are running on sufficiently fast and
+ * non-virtualized hardware.
+ *
+ * Our expectations are:
+ *
+ * - the PIT is running at roughly 1.19MHz
+ *
+ * - each IO is going to take about 1us on real hardware,
+ * but we allow it to be much faster (by a factor of 10) or
+ * _slightly_ slower (ie we allow up to a 2us read+counter
+ * update) - anything else implies an unacceptably slow CPU
+ * or PIT for the fast calibration to work.
+ *
+ * - with 256 PIT ticks to read the value, we have 214us to
+ * see the same MSB (and overhead like doing a single TSC
+ * read per MSB value etc).
+ *
+ * - We're doing 2 reads per loop (LSB, MSB), and we expect
+ * them each to take about a microsecond on real hardware.
+ * So we expect a count value of around 100. But we'll be
+ * generous, and accept anything over 50.
+ *
+ * - if the PIT is stuck, and we see *many* more reads, we
+ * return early (and the next caller of pit_expect_msb()
+ * will then consider it a failure when it doesn't see the
+ * next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequency.
+ */
+static inline int pit_expect_msb(unsigned char val)
+{
+ int count = 0;
+
+ for (count = 0; count < 50000; count++) {
+ /* Ignore LSB */
+ inb(0x42);
+ if (inb(0x42) != val)
+ break;
+ }
+ return count > 50;
+}
+
+/*
+ * How many MSB values do we want to see? We aim for a
+ * 15ms calibration, which assuming a 2us counter read
+ * error should give us roughly 150 ppm precision for
+ * the calibration.
+ */
+#define QUICK_PIT_MS 15
+#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+
+static unsigned long quick_pit_calibrate(void)
+{
+ /* Set the Gate high, disable speaker */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ /*
+ * Counter 2, mode 0 (one-shot), binary count
+ *
+ * NOTE! Mode 2 decrements by two (and then the
+ * output is flipped each time, giving the same
+ * final output frequency as a decrement-by-one),
+ * so mode 0 is much better when looking at the
+ * individual counts.
+ */
+ outb(0xb0, 0x43);
+
+ /* Start at 0xffff */
+ outb(0xff, 0x42);
+ outb(0xff, 0x42);
+
+ if (pit_expect_msb(0xff)) {
+ int i;
+ u64 t1, t2, delta;
+ unsigned char expect = 0xfe;
+
+ t1 = get_cycles();
+ for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
+ if (!pit_expect_msb(expect))
+ goto failed;
+ }
+ t2 = get_cycles();
+
+ /*
+ * Make sure we can rely on the second TSC timestamp:
+ */
+ if (!pit_expect_msb(expect))
+ goto failed;
+
+ /*
+ * Ok, if we get here, then we've seen the
+ * MSB of the PIT decrement QUICK_PIT_ITERATIONS
+ * times, and each MSB had many hits, so we never
+ * had any sudden jumps.
+ *
+ * As a result, we can depend on there not being
+ * any odd delays anywhere, and the TSC reads are
+ * reliable.
+ *
+ * kHz = ticks / time-in-seconds / 1000;
+ * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
+ * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
+ */
+ delta = (t2 - t1)*PIT_TICK_RATE;
+ do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
+ printk("Fast TSC calibration using PIT\n");
+ return delta;
+ }
+failed:
+ return 0;
+}
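The kHz derivation in the comment above can be checked numerically: with PIT_TICK_RATE = 1193182 Hz, QUICK_PIT_ITERATIONS evaluates to 15 * 1193182 / 1000 / 256 = 69, i.e. roughly 14.8 ms of wall time. A runnable check assuming a 2 GHz TSC:

#include <stdio.h>

#define PIT_TICK_RATE		1193182UL
#define QUICK_PIT_MS		15
#define QUICK_PIT_ITERATIONS	(QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)

int main(void)
{
	/* Assumption: the TSC ticks twice per nanosecond (2 GHz). */
	unsigned long long window_ns = QUICK_PIT_ITERATIONS * 256ULL *
				       1000000000ULL / PIT_TICK_RATE;
	unsigned long long tsc_delta = window_ns * 2;
	unsigned long long khz = tsc_delta * PIT_TICK_RATE /
				 (QUICK_PIT_ITERATIONS * 256ULL * 1000);

	printf("%lu iterations, %llu kHz\n",
	       (unsigned long)QUICK_PIT_ITERATIONS, khz);	/* ~2000000 */
	return 0;
}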
/**
* native_calibrate_tsc - calibrate the tsc on boot
*/
unsigned long native_calibrate_tsc(void)
{
- u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
+ u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags;
- int hpet = is_hpet_enabled(), i;
+ unsigned long flags, latch, ms, fast_calibrate;
+ int hpet = is_hpet_enabled(), i, loopmin;
+
+ local_irq_save(flags);
+ fast_calibrate = quick_pit_calibrate();
+ local_irq_restore(flags);
+ if (fast_calibrate)
+ return fast_calibrate;
/*
* Run 5 calibration loops to get the lowest frequency value
@@ -216,7 +385,13 @@ unsigned long native_calibrate_tsc(void)
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
*/
- for (i = 0; i < 5; i++) {
+
+ /* Preset PIT loop values */
+ latch = CAL_LATCH;
+ ms = CAL_MS;
+ loopmin = CAL_PIT_LOOPS;
+
+ for (i = 0; i < 3; i++) {
unsigned long tsc_pit_khz;
/*
@@ -226,16 +401,16 @@ unsigned long native_calibrate_tsc(void)
* read the end value.
*/
local_irq_save(flags);
- tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
- tsc_pit_khz = pit_calibrate_tsc();
- tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
+ tsc1 = tsc_read_refs(&ref1, hpet);
+ tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
+ tsc2 = tsc_read_refs(&ref2, hpet);
local_irq_restore(flags);
/* Pick the lowest PIT TSC calibration so far */
tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
/* hpet or pmtimer available ? */
- if (!hpet && !pm1 && !pm2)
+ if (!hpet && !ref1 && !ref2)
continue;
/* Check, whether the sampling was disturbed by an SMI */
@@ -243,23 +418,41 @@ unsigned long native_calibrate_tsc(void)
continue;
tsc2 = (tsc2 - tsc1) * 1000000LL;
+ if (hpet)
+ tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
+ else
+ tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
- if (hpet) {
- if (hpet2 < hpet1)
- hpet2 += 0x100000000ULL;
- hpet2 -= hpet1;
- tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
- do_div(tsc1, 1000000);
- } else {
- if (pm2 < pm1)
- pm2 += (u64)ACPI_PM_OVRRUN;
- pm2 -= pm1;
- tsc1 = pm2 * 1000000000LL;
- do_div(tsc1, PMTMR_TICKS_PER_SEC);
+ tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+
+ /* Check the reference deviation */
+ delta = ((u64) tsc_pit_min) * 100;
+ do_div(delta, tsc_ref_min);
+
+ /*
+ * If both calibration results are inside a 10% window
+ * then we can be sure that the calibration
+ * succeeded. We break out of the loop right away. We
+ * use the reference value, as it is more precise.
+ */
+ if (delta >= 90 && delta <= 110) {
+ printk(KERN_INFO
+ "TSC: PIT calibration matches %s. %d loops\n",
+ hpet ? "HPET" : "PMTIMER", i + 1);
+ return tsc_ref_min;
}
- do_div(tsc2, tsc1);
- tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+ /*
+ * Check whether PIT failed more than once. This
+ * happens in virtualized environments. We need to
+ * give the virtual PC a slightly longer timeframe for
+ * the HPET/PMTIMER to make the result precise.
+ */
+ if (i == 1 && tsc_pit_min == ULONG_MAX) {
+ latch = CAL2_LATCH;
+ ms = CAL2_MS;
+ loopmin = CAL2_PIT_LOOPS;
+ }
}
/*
@@ -270,7 +463,7 @@ unsigned long native_calibrate_tsc(void)
printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
/* We don't have an alternative source, disable TSC */
- if (!hpet && !pm1 && !pm2) {
+ if (!hpet && !ref1 && !ref2) {
printk("TSC: No reference (HPET/PMTIMER) available\n");
return 0;
}
@@ -278,7 +471,7 @@ unsigned long native_calibrate_tsc(void)
/* The alternative source failed as well, disable TSC */
if (tsc_ref_min == ULONG_MAX) {
printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
- "failed due to SMI disturbance.\n");
+ "failed.\n");
return 0;
}
@@ -290,44 +483,25 @@ unsigned long native_calibrate_tsc(void)
}
/* We don't have an alternative source, use the PIT calibration value */
- if (!hpet && !pm1 && !pm2) {
+ if (!hpet && !ref1 && !ref2) {
printk(KERN_INFO "TSC: Using PIT calibration value\n");
return tsc_pit_min;
}
/* The alternative source failed, use the PIT calibration value */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due "
- "to SMI disturbance. Using PIT calibration\n");
+ printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
+ "Using PIT calibration\n");
return tsc_pit_min;
}
- /* Check the reference deviation */
- delta = ((u64) tsc_pit_min) * 100;
- do_div(delta, tsc_ref_min);
-
- /*
- * If both calibration results are inside a 5% window, the we
- * use the lower frequency of those as it is probably the
- * closest estimate.
- */
- if (delta >= 95 && delta <= 105) {
- printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
- hpet ? "HPET" : "PMTIMER");
- printk(KERN_INFO "TSC: using %s calibration value\n",
- tsc_pit_min <= tsc_ref_min ? "PIT" :
- hpet ? "HPET" : "PMTIMER");
- return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
- }
-
- printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
- hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
-
/*
* The calibration values differ too much. In doubt, we use
* the PIT value as we know that there are PMTIMERs around
- * running at double speed.
+ * running at double speed. At least we let the user know:
*/
+ printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
+ hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
printk(KERN_INFO "TSC: Using PIT calibration value\n");
return tsc_pit_min;
}
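The early-exit test added to the loop computes the PIT/reference ratio in percent and accepts agreement within the 10% window. A tiny standalone rendering of the same check, with assumed sample frequencies:

#include <stdio.h>

int main(void)
{
	/* Assumed results: PIT 1995 MHz, HPET reference 2001 MHz. */
	unsigned long tsc_pit_min = 1995000;	/* kHz */
	unsigned long tsc_ref_min = 2001000;	/* kHz */

	unsigned long long delta = (unsigned long long)tsc_pit_min * 100;
	delta /= tsc_ref_min;

	if (delta >= 90 && delta <= 110)
		printf("match: using reference value %lu kHz\n", tsc_ref_min);
	else
		printf("deviation too large (%llu%%)\n", delta);
	return 0;
}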
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
new file mode 100644
index 00000000000..aeef529917e
--- /dev/null
+++ b/arch/x86/kernel/uv_irq.c
@@ -0,0 +1,79 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV IRQ functions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/irq.h>
+
+#include <asm/apic.h>
+#include <asm/uv/uv_irq.h>
+
+static void uv_noop(unsigned int irq)
+{
+}
+
+static unsigned int uv_noop_ret(unsigned int irq)
+{
+ return 0;
+}
+
+static void uv_ack_apic(unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+struct irq_chip uv_irq_chip = {
+ .name = "UV-CORE",
+ .startup = uv_noop_ret,
+ .shutdown = uv_noop,
+ .enable = uv_noop,
+ .disable = uv_noop,
+ .ack = uv_noop,
+ .mask = uv_noop,
+ .unmask = uv_noop,
+ .eoi = uv_ack_apic,
+ .end = uv_noop,
+};
+
+/*
+ * Set up a mapping of an available irq and vector, and enable the specified
+ * MMR that defines the MSI that is to be sent to the specified CPU when an
+ * interrupt is raised.
+ */
+int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
+ unsigned long mmr_offset)
+{
+ int irq;
+ int ret;
+
+ irq = create_irq();
+ if (irq <= 0)
+ return -EBUSY;
+
+ ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
+ if (ret != irq)
+ destroy_irq(irq);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uv_setup_irq);
+
+/*
+ * Tear down a mapping of an irq and vector, and disable the specified MMR that
+ * defined the MSI that was to be sent to the specified CPU when an interrupt
+ * was raised.
+ *
+ * Set mmr_blade and mmr_offset to what was passed in to uv_setup_irq().
+ */
+void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
+{
+ arch_disable_uv_irq(mmr_blade, mmr_offset);
+ destroy_irq(irq);
+}
+EXPORT_SYMBOL_GPL(uv_teardown_irq);
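A sketch of how a driver might pair the two entry points above; the interrupt name, CPU, blade and MMR offset are placeholders, not values taken from this patch:

/* Sketch only: hypothetical caller of the UV IRQ helpers. */
static int example_bind_uv_irq(void)
{
	int irq;

	irq = uv_setup_irq("uv-example", 0 /* cpu */,
			   0 /* mmr_blade */, 0x1000 /* mmr_offset */);
	if (irq < 0)
		return irq;

	/* ... request_irq(irq, ...) and use the interrupt ... */

	uv_teardown_irq(irq, 0 /* mmr_blade */, 0x1000 /* mmr_offset */);
	return 0;
}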
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
new file mode 100644
index 00000000000..67f9b9dbf80
--- /dev/null
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -0,0 +1,72 @@
+/*
+ * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson
+ */
+
+#include <linux/sysdev.h>
+#include <asm/uv/bios.h>
+
+struct kobject *sgi_uv_kobj;
+
+static ssize_t partition_id_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
+}
+
+static ssize_t coherence_id_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
+}
+
+static struct kobj_attribute partition_id_attr =
+ __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
+
+static struct kobj_attribute coherence_id_attr =
+ __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
+
+
+static int __init sgi_uv_sysfs_init(void)
+{
+	int ret;
+
+ if (!sgi_uv_kobj)
+ sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
+ if (!sgi_uv_kobj) {
+		printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
+ return -EINVAL;
+ }
+
+ ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
+ if (ret) {
+		printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
+ return ret;
+ }
+
+ ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
+ if (ret) {
+		printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+device_initcall(sgi_uv_sysfs_init);
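Once the initcall has run, the two attributes show up as ordinary read-only files. A user-space check (assuming a UV system with this module built in):

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/firmware/sgi_uv/partition_id", "r");

	if (!f) {
		perror("partition_id");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("partition_id: %s", buf);
	fclose(f);
	return 0;
}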
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 594ef47f0a6..0c9667f0752 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
#include <asm/visws/cobalt.h>
#include <asm/visws/piix4.h>
#include <asm/arch_hooks.h>
+#include <asm/io_apic.h>
#include <asm/fixmap.h>
#include <asm/reboot.h>
#include <asm/setup.h>
#include <asm/e820.h>
-#include <asm/smp.h>
#include <asm/io.h>
#include <mach_ipi.h>
#include "mach_apic.h"
-#include <linux/init.h>
-#include <linux/smp.h>
-
#include <linux/kernel_stat.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <asm/apic.h>
#include <asm/i8259.h>
#include <asm/irq_vectors.h>
-#include <asm/visws/cobalt.h>
#include <asm/visws/lithium.h>
-#include <asm/visws/piix4.h>
#include <linux/sched.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/pci.h>
#include <linux/pci_ids.h>
extern int no_broadcast;
-#include <asm/io.h>
#include <asm/apic.h>
-#include <asm/arch_hooks.h>
-#include <asm/visws/cobalt.h>
-#include <asm/visws/lithium.h>
char visws_board_type = -1;
char visws_board_rev = -1;
@@ -498,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq)
static unsigned int startup_cobalt_irq(unsigned int irq)
{
unsigned long flags;
+ struct irq_desc *desc = irq_to_desc(irq);
spin_lock_irqsave(&cobalt_lock, flags);
- if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
- irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
+ if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
+ desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
enable_cobalt_irq(irq);
spin_unlock_irqrestore(&cobalt_lock, flags);
return 0;
@@ -520,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq)
static void end_cobalt_irq(unsigned int irq)
{
unsigned long flags;
+ struct irq_desc *desc = irq_to_desc(irq);
spin_lock_irqsave(&cobalt_lock, flags);
- if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
+ if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
enable_cobalt_irq(irq);
spin_unlock_irqrestore(&cobalt_lock, flags);
}
@@ -640,12 +628,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
spin_unlock_irqrestore(&i8259A_lock, flags);
- desc = irq_desc + realirq;
+ desc = irq_to_desc(realirq);
/*
* handle this 'virtual interrupt' as a Cobalt one now.
*/
- kstat_cpu(smp_processor_id()).irqs[realirq]++;
+ kstat_incr_irqs_this_cpu(realirq, desc);
if (likely(desc->action != NULL))
handle_IRQ_event(realirq, desc->action);
@@ -676,27 +664,29 @@ void init_VISWS_APIC_irqs(void)
int i;
for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
- irq_desc[i].status = IRQ_DISABLED;
- irq_desc[i].action = 0;
- irq_desc[i].depth = 1;
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->status = IRQ_DISABLED;
+ desc->action = 0;
+ desc->depth = 1;
if (i == 0) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
else if (i == CO_IRQ_IDE0) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
else if (i == CO_IRQ_IDE1) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
else if (i == CO_IRQ_8259) {
- irq_desc[i].chip = &piix4_master_irq_type;
+ desc->chip = &piix4_master_irq_type;
}
else if (i < CO_IRQ_APIC0) {
- irq_desc[i].chip = &piix4_virtual_irq_type;
+ desc->chip = &piix4_virtual_irq_type;
}
else if (IS_CO_APIC(i)) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 38f566fa27d..4eeb5cf9720 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -46,6 +46,7 @@
#include <asm/io.h>
#include <asm/tlbflush.h>
#include <asm/irq.h>
+#include <asm/syscalls.h>
/*
* Known problems:
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 6ca515d6db5..8b6c393ab9f 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
const void *desc)
{
u32 *ldt_entry = (u32 *)desc;
- vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
+ vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
}
static void vmi_load_sp0(struct tss_struct *tss,
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
-static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
-static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
{
/*
* This call comes in very early, before mem_map is setup.
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
-static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
{
vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
-static void vmi_release_pte(u32 pfn)
+static void vmi_release_pte(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
-static void vmi_release_pmd(u32 pfn)
+static void vmi_release_pmd(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -905,8 +905,8 @@ static inline int __init activate_vmi(void)
#endif
#ifdef CONFIG_X86_LOCAL_APIC
- para_fill(pv_apic_ops.apic_read, APICRead);
- para_fill(pv_apic_ops.apic_write, APICWrite);
+ para_fill(apic_ops->read, APICRead);
+ para_fill(apic_ops->write, APICWrite);
#endif
/*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 6953859fe28..254ee07f863 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void)
void __init vmi_time_init(void)
{
+ unsigned int cpu;
 /* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic. */
outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
vmi_time_init_clockevent();
setup_irq(0, &vmi_clock_action);
+ for_each_possible_cpu(cpu)
+ per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
}
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index af5bdad8460..a9b8560adbc 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -140,10 +140,10 @@ SECTIONS
*(.con_initcall.init)
__con_initcall_end = .;
}
- .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
- __x86cpuvendor_start = .;
- *(.x86cpuvendor.init)
- __x86cpuvendor_end = .;
+ .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ __x86_cpu_dev_start = .;
+ *(.x86_cpu_dev.init)
+ __x86_cpu_dev_end = .;
}
SECURITY_INIT
. = ALIGN(4);
@@ -180,6 +180,7 @@ SECTIONS
. = ALIGN(PAGE_SIZE);
.data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
__per_cpu_start = .;
+ *(.data.percpu.page_aligned)
*(.data.percpu)
*(.data.percpu.shared_aligned)
__per_cpu_end = .;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 63e5c1a22e8..46e05447405 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -168,12 +168,11 @@ SECTIONS
*(.con_initcall.init)
}
__con_initcall_end = .;
- . = ALIGN(16);
- __x86cpuvendor_start = .;
- .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
- *(.x86cpuvendor.init)
+ __x86_cpu_dev_start = .;
+ .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ *(.x86_cpu_dev.init)
}
- __x86cpuvendor_end = .;
+ __x86_cpu_dev_end = .;
SECURITY_INIT
. = ALIGN(8);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c..7766d36983f 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
}
-static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
switch (type) {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
new file mode 100644
index 00000000000..b13acb75e82
--- /dev/null
+++ b/arch/x86/kernel/xsave.c
@@ -0,0 +1,345 @@
+/*
+ * xsave/xrstor support.
+ *
+ * Author: Suresh Siddha <suresh.b.siddha@intel.com>
+ */
+#include <linux/bootmem.h>
+#include <linux/compat.h>
+#include <asm/i387.h>
+#ifdef CONFIG_IA32_EMULATION
+#include <asm/sigcontext32.h>
+#endif
+#include <asm/xcr.h>
+
+/*
+ * Supported feature mask by the CPU and the kernel.
+ */
+u64 pcntxt_mask;
+
+struct _fpx_sw_bytes fx_sw_reserved;
+#ifdef CONFIG_IA32_EMULATION
+struct _fpx_sw_bytes fx_sw_reserved_ia32;
+#endif
+
+/*
+ * Check for the presence of extended state information in the
+ * user fpstate pointer in the sigcontext.
+ */
+int check_for_xstate(struct i387_fxsave_struct __user *buf,
+ void __user *fpstate,
+ struct _fpx_sw_bytes *fx_sw_user)
+{
+ int min_xstate_size = sizeof(struct i387_fxsave_struct) +
+ sizeof(struct xsave_hdr_struct);
+ unsigned int magic2;
+ int err;
+
+ err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
+ sizeof(struct _fpx_sw_bytes));
+
+ if (err)
+ return err;
+
+ /*
+ * First Magic check failed.
+ */
+ if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
+ return -1;
+
+ /*
+ * Check for error scenarios.
+ */
+ if (fx_sw_user->xstate_size < min_xstate_size ||
+ fx_sw_user->xstate_size > xstate_size ||
+ fx_sw_user->xstate_size > fx_sw_user->extended_size)
+ return -1;
+
+ err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
+ fx_sw_user->extended_size -
+ FP_XSTATE_MAGIC2_SIZE));
+ /*
+ * Check for the presence of second magic word at the end of memory
+ * layout. This detects the case where the user just copied the legacy
+ * fpstate layout without copying the extended state information
+ * in the memory layout.
+ */
+ if (err || magic2 != FP_XSTATE_MAGIC2)
+ return -1;
+
+ return 0;
+}
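The validation above brackets the extended area between two magic words: FP_XSTATE_MAGIC1 sits in the sw_reserved bytes of the legacy fxsave image, FP_XSTATE_MAGIC2 at the very end of the user frame. A schematic of the layout being checked (placement only; the real sizes are whatever the CPU reports):

/* Schematic only, not a real kernel declaration. */
struct user_sig_xstate_frame {
	struct i387_fxsave_struct fxsave;	/* sw_reserved[] carries magic1,
						   extended_size, xstate_size */
	struct xsave_hdr_struct xhdr;		/* xstate_bv: which states follow */
	/* ... extended state records up to xstate_size ... */
	/* __u32 magic2 at offset extended_size - FP_XSTATE_MAGIC2_SIZE */
};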
+
+#ifdef CONFIG_X86_64
+/*
+ * Signal frame handlers.
+ */
+
+int save_i387_xstate(void __user *buf)
+{
+ struct task_struct *tsk = current;
+ int err = 0;
+
+ if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
+ return -EACCES;
+
+ BUG_ON(sig_xstate_size < xstate_size);
+
+ if ((unsigned long)buf % 64)
+ printk("save_i387_xstate: bad fpstate %p\n", buf);
+
+ if (!used_math())
+ return 0;
+ clear_used_math(); /* trigger finit */
+ if (task_thread_info(tsk)->status & TS_USEDFPU) {
+ /*
+ * Start with clearing the user buffer. This will present a
+ * clean context for the bytes not touched by the fxsave/xsave.
+ */
+ err = __clear_user(buf, sig_xstate_size);
+ if (err)
+ return err;
+
+ if (task_thread_info(tsk)->status & TS_XSAVE)
+ err = xsave_user(buf);
+ else
+ err = fxsave_user(buf);
+
+ if (err)
+ return err;
+ task_thread_info(tsk)->status &= ~TS_USEDFPU;
+ stts();
+ } else {
+ if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
+ xstate_size))
+ return -1;
+ }
+
+ if (task_thread_info(tsk)->status & TS_XSAVE) {
+ struct _fpstate __user *fx = buf;
+ struct _xstate __user *x = buf;
+ u64 xstate_bv;
+
+ err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
+ sizeof(struct _fpx_sw_bytes));
+
+ err |= __put_user(FP_XSTATE_MAGIC2,
+ (__u32 __user *) (buf + sig_xstate_size
+ - FP_XSTATE_MAGIC2_SIZE));
+
+ /*
+ * Read the xstate_bv which we copied (directly from the cpu or
+ * from the state in task struct) to the user buffers and
+ * set the FP/SSE bits.
+ */
+ err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
+
+ /*
+ * For legacy compatibility, we always set the FP/SSE bits in the bit
+ * vector while saving the state to the user context. This
+ * enables us to capture any changes (during sigreturn) to
+ * the FP/SSE bits by the legacy applications which don't touch
+ * xstate_bv in the xsave header.
+ *
+ * xsave aware apps can change the xstate_bv in the xsave
+ * header as well as change any contents in the memory layout.
+ * xrestore as part of sigreturn will capture all the changes.
+ */
+ xstate_bv |= XSTATE_FPSSE;
+
+ err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
+
+ if (err)
+ return err;
+ }
+
+ return 1;
+}
+
+/*
+ * Restore the extended state if present. Otherwise, restore the FP/SSE
+ * state.
+ */
+int restore_user_xstate(void __user *buf)
+{
+ struct _fpx_sw_bytes fx_sw_user;
+ u64 mask;
+ int err;
+
+ if (((unsigned long)buf % 64) ||
+ check_for_xstate(buf, buf, &fx_sw_user))
+ goto fx_only;
+
+ mask = fx_sw_user.xstate_bv;
+
+ /*
+ * restore the state passed by the user.
+ */
+ err = xrestore_user(buf, mask);
+ if (err)
+ return err;
+
+ /*
+ * init the state skipped by the user.
+ */
+ mask = pcntxt_mask & ~mask;
+
+ xrstor_state(init_xstate_buf, mask);
+
+ return 0;
+
+fx_only:
+ /*
+ * couldn't find the extended state information in the
+ * memory layout. Restore just the FP/SSE and init all
+ * the other extended state.
+ */
+ xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
+ return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
+}
+
+/*
+ * This restores directly out of user space. Exceptions are handled.
+ */
+int restore_i387_xstate(void __user *buf)
+{
+ struct task_struct *tsk = current;
+ int err = 0;
+
+ if (!buf) {
+ if (used_math())
+ goto clear;
+ return 0;
+ } else
+ if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
+ return -EACCES;
+
+ if (!used_math()) {
+ err = init_fpu(tsk);
+ if (err)
+ return err;
+ }
+
+ if (!(task_thread_info(current)->status & TS_USEDFPU)) {
+ clts();
+ task_thread_info(current)->status |= TS_USEDFPU;
+ }
+ if (task_thread_info(tsk)->status & TS_XSAVE)
+ err = restore_user_xstate(buf);
+ else
+ err = fxrstor_checking((__force struct i387_fxsave_struct *)
+ buf);
+ if (unlikely(err)) {
+ /*
+ * Encountered an error while doing the restore from the
+ * user buffer, clear the fpu state.
+ */
+clear:
+ clear_fpu(tsk);
+ clear_used_math();
+ }
+ return err;
+}
+#endif
+
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed to by the fpstate pointer in the sigcontext.
+ * This will be saved whenever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+static void prepare_fx_sw_frame(void)
+{
+ int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
+ FP_XSTATE_MAGIC2_SIZE;
+
+ sig_xstate_size = sizeof(struct _fpstate) + size_extended;
+
+#ifdef CONFIG_IA32_EMULATION
+ sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
+#endif
+
+ memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
+
+ fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
+ fx_sw_reserved.extended_size = sig_xstate_size;
+ fx_sw_reserved.xstate_bv = pcntxt_mask;
+ fx_sw_reserved.xstate_size = xstate_size;
+#ifdef CONFIG_IA32_EMULATION
+ memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
+ sizeof(struct _fpx_sw_bytes));
+ fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
+#endif
+}
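The arithmetic in prepare_fx_sw_frame() can be sanity-checked with assumed numbers: a 512-byte fxsave image, a 64-byte xsave header and thus a 576-byte xstate_size leave an extended tail of (576 - 512) + 4 bytes of MAGIC2. A runnable check (all sizes assumed, for illustration only):

#include <stdio.h>

int main(void)
{
	/* Assumed sizes, for illustration only. */
	int fxsave_size  = 512;		/* sizeof(struct i387_fxsave_struct) */
	int fpstate_size = 512;		/* sizeof(struct _fpstate) */
	int xstate_size  = 576;		/* fxsave + 64-byte xsave header */
	int magic2_size  = 4;		/* FP_XSTATE_MAGIC2_SIZE */

	int size_extended   = (xstate_size - fxsave_size) + magic2_size;
	int sig_xstate_size = fpstate_size + size_extended;

	printf("extended tail %d, frame %d bytes\n",
	       size_extended, sig_xstate_size);	/* 68, 580 */
	return 0;
}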
+
+/*
+ * Represents init state for the supported extended state.
+ */
+struct xsave_struct *init_xstate_buf;
+
+#ifdef CONFIG_X86_64
+unsigned int sig_xstate_size = sizeof(struct _fpstate);
+#endif
+
+/*
+ * Enable the extended processor state save/restore feature
+ */
+void __cpuinit xsave_init(void)
+{
+ if (!cpu_has_xsave)
+ return;
+
+ set_in_cr4(X86_CR4_OSXSAVE);
+
+ /*
+ * Enable all the features that the HW is capable of
+ * and the Linux kernel is aware of.
+ */
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
+}
+
+/*
+ * setup the xstate image representing the init state
+ */
+static void __init setup_xstate_init(void)
+{
+ init_xstate_buf = alloc_bootmem(xstate_size);
+ init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
+}
+
+/*
+ * Enable and initialize the xsave feature.
+ */
+void __init xsave_cntxt_init(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+ pcntxt_mask = eax + ((u64)edx << 32);
+
+ if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
+ printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
+ pcntxt_mask);
+ BUG();
+ }
+
+ /*
+ * for now the OS knows only about FP/SSE
+ */
+ pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
+ xsave_init();
+
+ /*
+ * Recompute the context size for enabled features
+ */
+ cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+ xstate_size = ebx;
+
+ prepare_fx_sw_frame();
+
+ setup_xstate_init();
+
+ printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
+ "cntxt size 0x%x\n",
+ pcntxt_mask, xstate_size);
+}
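xsave_cntxt_init() builds the feature mask from CPUID leaf 0xD, sub-leaf 0: EAX/EDX give the supported xstate bits, EBX the context size for the currently enabled features. A user-space sketch of the same query, assuming GCC/Clang's cpuid.h helper __get_cpuid_count:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 0xd, sub-leaf 0: supported xstate features and sizes. */
	if (!__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx))
		return 1;

	unsigned long long mask = eax | ((unsigned long long)edx << 32);
	printf("xstate_bv 0x%llx, enabled-size 0x%x bytes\n", mask, ebx);
	return 0;
}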
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d0e940bb6f4..c02343594b4 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,13 @@
#
common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o)
+ coalesced_mmio.o irq_comm.o)
ifeq ($(CONFIG_KVM_TRACE),y)
common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
endif
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872a912..11c6725fb79 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
if (!atomic_inc_and_test(&pt->pending))
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
- if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
- vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ if (vcpu0 && waitqueue_active(&vcpu0->wq))
wake_up_interruptible(&vcpu0->wq);
- }
- pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
- pt->scheduled = ktime_to_ns(pt->timer.expires);
+ hrtimer_add_expires_ns(&pt->timer, pt->period);
+ pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
+ if (pt->period)
+ ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer);
return (pt->period == 0 ? 0 : 1);
}
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
+ if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
return atomic_read(&pit->pit_state.pit_timer.pending);
-
return 0;
}
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+ irq_ack_notifier);
+ spin_lock(&ps->inject_lock);
+ if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+ atomic_inc(&ps->pit_timer.pending);
+ ps->irq_ack = 1;
+ spin_unlock(&ps->inject_lock);
+}
+
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
struct kvm_kpit_state *ps;
@@ -246,7 +257,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
timer = &pit->pit_state.pit_timer.timer;
if (hrtimer_cancel(timer))
- hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
static void destroy_pit_timer(struct kvm_kpit_timer *pt)
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
hrtimer_cancel(&pt->timer);
}
-static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
+ struct kvm_kpit_timer *pt = &ps->pit_timer;
s64 interval;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
pt->period = (is_period == 0) ? 0 : interval;
pt->timer.function = pit_timer_fn;
atomic_set(&pt->pending, 0);
+ ps->irq_ack = 1;
hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- create_pit_timer(&ps->pit_timer, val, 0);
+ create_pit_timer(ps, val, 0);
break;
case 2:
case 3:
- create_pit_timer(&ps->pit_timer, val, 1);
+ create_pit_timer(ps, val, 1);
break;
default:
destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
mutex_unlock(&pit->pit_state.lock);
atomic_set(&pit->pit_state.pit_timer.pending, 0);
- pit->pit_state.inject_pending = 1;
+ pit->pit_state.irq_ack = 1;
}
struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
+ spin_lock_init(&pit->pit_state.inject_lock);
/* Initialize PIO device */
pit->dev.read = pit_ioport_read;
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
pit_state->pit = pit;
hrtimer_init(&pit_state->pit_timer.timer,
CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pit_state->irq_ack_notifier.gsi = 0;
+ pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+ kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
mutex_unlock(&pit->pit_state.lock);
kvm_pit_reset(pit);
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm)
static void __inject_pit_timer_intr(struct kvm *kvm)
{
mutex_lock(&kvm->lock);
- kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
- kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
- kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
- kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+ kvm_set_irq(kvm, 0, 1);
+ kvm_set_irq(kvm, 0, 0);
mutex_unlock(&kvm->lock);
}
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
struct kvm_kpit_state *ps;
if (vcpu && pit) {
+ int inject = 0;
ps = &pit->pit_state;
- /* Try to inject pending interrupts when:
- * 1. Pending exists
- * 2. Last interrupt was accepted or waited for too long time*/
- if (atomic_read(&ps->pit_timer.pending) &&
- (ps->inject_pending ||
- (jiffies - ps->last_injected_time
- >= KVM_MAX_PIT_INTR_INTERVAL))) {
- ps->inject_pending = 0;
- __inject_pit_timer_intr(kvm);
- ps->last_injected_time = jiffies;
- }
- }
-}
-
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
- struct kvm_arch *arch = &vcpu->kvm->arch;
- struct kvm_kpit_state *ps;
-
- if (vcpu && arch->vpit) {
- ps = &arch->vpit->pit_state;
- if (atomic_read(&ps->pit_timer.pending) &&
- (((arch->vpic->pics[0].imr & 1) == 0 &&
- arch->vpic->pics[0].irq_base == vec) ||
- (arch->vioapic->redirtbl[0].fields.vector == vec &&
- arch->vioapic->redirtbl[0].fields.mask != 1))) {
- ps->inject_pending = 1;
- atomic_dec(&ps->pit_timer.pending);
- ps->channels[0].count_load_time = ktime_get();
+ /* Try to inject pending interrupts when
+	 * the last one has been acked.
+ */
+ spin_lock(&ps->inject_lock);
+ if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
+ ps->irq_ack = 0;
+ inject = 1;
}
+ spin_unlock(&ps->inject_lock);
+ if (inject)
+ __inject_pit_timer_intr(kvm);
}
}
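The rework above replaces vector peeking with an ack notifier: injection is gated on irq_ack, which kvm_pit_ack_irq() sets once the guest has acknowledged the previous tick. A sketch of the same registration pattern for a hypothetical device (struct mydev, its GSI and fields are invented for illustration):

/* Sketch only: an ack notifier wired up like the PIT's above. */
struct mydev {
	struct kvm_irq_ack_notifier ack_notifier;
	int can_inject;
};

static void mydev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct mydev *d = container_of(kian, struct mydev, ack_notifier);

	d->can_inject = 1;		/* previous interrupt was acked */
}

static void mydev_init(struct kvm *kvm, struct mydev *d)
{
	d->ack_notifier.gsi = 5;	/* assumed GSI */
	d->ack_notifier.irq_acked = mydev_ack_irq;
	kvm_register_irq_ack_notifier(kvm, &d->ack_notifier);
}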
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index db25c2a6c8c..e436d4983aa 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
int irq;
s64 period; /* unit: ns */
s64 scheduled;
- ktime_t last_update;
atomic_t pending;
};
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
- bool inject_pending; /* if inject pending interrupts */
- unsigned long last_injected_time;
+ spinlock_t inject_lock;
+ unsigned long irq_ack;
+ struct kvm_irq_ack_notifier irq_ack_notifier;
};
struct kvm_pit {
@@ -54,7 +54,6 @@ struct kvm_pit {
#define KVM_PIT_CHANNEL_MASK 0x3
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
struct kvm_pit *kvm_create_pit(struct kvm *kvm);
void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index c31164e8aa4..17e41e165f1 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,6 +30,19 @@
#include <linux/kvm_host.h>
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+ s->isr &= ~(1 << irq);
+ s->isr_ack |= (1 << irq);
+}
+
+void kvm_pic_clear_isr_ack(struct kvm *kvm)
+{
+ struct kvm_pic *s = pic_irqchip(kvm);
+ s->pics[0].isr_ack = 0xff;
+ s->pics[1].isr_ack = 0xff;
+}
+
/*
* set irq level. If an edge is detected, then the IRR is set to 1
*/
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
*/
static inline void pic_intack(struct kvm_kpic_state *s, int irq)
{
+ s->isr |= 1 << irq;
if (s->auto_eoi) {
if (s->rotate_on_auto_eoi)
s->priority_add = (irq + 1) & 7;
- } else
- s->isr |= (1 << irq);
+ pic_clear_isr(s, irq);
+ }
/*
* We don't clear a level sensitive interrupt here
*/
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
s->irr &= ~(1 << irq);
}
-int kvm_pic_read_irq(struct kvm_pic *s)
+int kvm_pic_read_irq(struct kvm *kvm)
{
int irq, irq2, intno;
+ struct kvm_pic *s = pic_irqchip(kvm);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
+ kvm_notify_acked_irq(kvm, irq);
return intno;
}
void kvm_pic_reset(struct kvm_kpic_state *s)
{
+ int irq, irqbase;
+ struct kvm *kvm = s->pics_state->irq_request_opaque;
+ struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
+
+ if (s == &s->pics_state->pics[0])
+ irqbase = 0;
+ else
+ irqbase = 8;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+ if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+ if (s->irr & (1 << irq) || s->isr & (1 << irq))
+ kvm_notify_acked_irq(kvm, irq+irqbase);
+ }
s->last_irr = 0;
s->irr = 0;
s->imr = 0;
s->isr = 0;
+ s->isr_ack = 0xff;
s->priority_add = 0;
s->irq_base = 0;
s->read_reg_select = 0;
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
priority = get_priority(s, s->isr);
if (priority != 8) {
irq = (priority + s->priority_add) & 7;
- s->isr &= ~(1 << irq);
+ pic_clear_isr(s, irq);
if (cmd == 5)
s->priority_add = (irq + 1) & 7;
pic_update_irq(s->pics_state);
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
break;
case 3:
irq = val & 7;
- s->isr &= ~(1 << irq);
+ pic_clear_isr(s, irq);
pic_update_irq(s->pics_state);
break;
case 6:
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
break;
case 7:
irq = val & 7;
- s->isr &= ~(1 << irq);
s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
pic_update_irq(s->pics_state);
break;
default:
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
s->pics_state->pics[0].irr &= ~(1 << 2);
}
s->irr &= ~(1 << ret);
- s->isr &= ~(1 << ret);
+ pic_clear_isr(s, ret);
if (addr1 >> 7 || ret != 2)
pic_update_irq(s->pics_state);
} else {
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
{
struct kvm *kvm = opaque;
struct kvm_vcpu *vcpu = kvm->vcpus[0];
+ struct kvm_pic *s = pic_irqchip(kvm);
+ int irq = pic_get_irq(&s->pics[0]);
- pic_irqchip(kvm)->output = level;
- if (vcpu)
+ s->output = level;
+ if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
+ s->pics[0].isr_ack &= ~(1 << irq);
kvm_vcpu_kick(vcpu);
+ }
}
struct kvm_pic *kvm_create_pic(struct kvm *kvm)
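The new isr_ack byte acts as a per-line edge filter in pic_irq_request(): a line may kick the vcpu only while its bit is set, the kick clears it, and pic_clear_isr() re-arms it when the guest acks. A runnable toy model of the bookkeeping:

#include <stdio.h>

int main(void)
{
	/* isr_ack starts all-set: every line is allowed to kick. */
	unsigned char isr_ack = 0xff;
	int irq = 3;

	if (isr_ack & (1 << irq)) {		/* first assertion: kick */
		isr_ack &= ~(1 << irq);
		printf("irq %d: kick vcpu\n", irq);
	}
	if (!(isr_ack & (1 << irq)))		/* re-assertion before ack */
		printf("irq %d: kick suppressed\n", irq);

	isr_ack |= (1 << irq);			/* pic_clear_isr() on ack */
	printf("irq %d: re-armed\n", irq);
	return 0;
}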
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 76d736b5f66..c019b8edcdb 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
if (kvm_apic_accept_pic_intr(v)) {
s = pic_irqchip(v->kvm);
s->output = 0; /* PIC */
- vector = kvm_pic_read_irq(s);
+ vector = kvm_pic_read_irq(v->kvm);
}
}
return vector;
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
kvm_apic_timer_intr_post(vcpu, vec);
- kvm_pit_timer_intr_post(vcpu, vec);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7ca47cbb48b..f17c8f5bbf3 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -42,6 +42,7 @@ struct kvm_kpic_state {
u8 irr; /* interrupt request register */
u8 imr; /* interrupt mask register */
u8 isr; /* interrupt service register */
+ u8 isr_ack; /* interrupt ack detection */
u8 priority_add; /* highest irq priority */
u8 irq_base;
u8 read_reg_select;
@@ -63,12 +64,13 @@ struct kvm_pic {
void *irq_request_opaque;
int output; /* intr from master PIC */
struct kvm_io_device dev;
+ void (*ack_notifier)(void *opaque, int irq);
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
+int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
+void kvm_pic_clear_isr_ack(struct kvm *kvm);
static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 00000000000..1ff819dce7d
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,32 @@
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, reg);
+
+ return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ vcpu->arch.regs[reg] = val;
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+#endif
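The cache is lazy in both directions: a read calls ->cache_reg() only when the avail bit is clear, and a write sets both the avail and dirty bits so the backend knows to flush the value. A typical call site looks like this sketch (the function name is illustrative):

/* Sketch: advancing the guest RIP through the helpers above. */
static void skip_insn_example(struct kvm_vcpu *vcpu, unsigned long insn_len)
{
	unsigned long rip = kvm_rip_read(vcpu);	/* faults RIP in if needed */

	kvm_rip_write(vcpu, rip + insn_len);	/* marks RIP avail + dirty */
}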
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de69f6..0fc3cab4894 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,6 +32,7 @@
#include <asm/current.h>
#include <asm/apicdef.h>
#include <asm/atomic.h>
+#include "kvm_cache_regs.h"
#include "irq.h"
#define PRId64 "d"
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
- kvm_vcpu_kick(vcpu);
- else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
- }
+ kvm_vcpu_kick(vcpu);
result = (orig_irr == 0);
break;
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
kvm_vcpu_kick(vcpu);
} else {
- printk(KERN_DEBUG
- "Ignoring de-assert INIT to vcpu %d\n",
- vcpu->vcpu_id);
+ apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+ vcpu->vcpu_id);
}
-
break;
case APIC_DM_STARTUP:
- printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
- vcpu->vcpu_id, vector);
+ apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+ vcpu->vcpu_id, vector);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
+ kvm_vcpu_kick(vcpu);
}
break;
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
static void apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
-
+ int trigger_mode;
/*
 * Not every EOI write will have a corresponding ISR bit set;
 * one example is when the kernel checks the timer in setup_IO_APIC
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
apic_update_ppr(apic);
if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+ kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
}
static void apic_send_ipi(struct kvm_lapic *apic)
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_run *run = vcpu->run;
set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
- kvm_x86_ops->cache_regs(vcpu);
- run->tpr_access.rip = vcpu->arch.rip;
+ run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
* Refer SDM 8.4.1
*/
if (len != 4 || alignment) {
- if (printk_ratelimit())
- printk(KERN_ERR "apic write: bad size=%d %lx\n",
- len, (long)address);
+ /* Don't shout loud, $infamous_os would cause only noise. */
+ apic_debug("apic write: bad size=%d %lx\n",
+ len, (long)address);
return;
}
@@ -947,15 +941,12 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
if(!atomic_inc_and_test(&apic->timer.pending))
set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
- if (waitqueue_active(q)) {
- apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ if (waitqueue_active(q))
wake_up_interruptible(q);
- }
+
if (apic_lvtt_period(apic)) {
result = 1;
- apic->timer.dev.expires = ktime_add_ns(
- apic->timer.dev.expires,
- apic->timer.period);
+ hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
}
return result;
}
@@ -1124,7 +1115,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
timer = &apic->timer.dev;
if (hrtimer_cancel(timer))
- hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3da2508eb22..99c239c5c0a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
module_param(dbg, bool, 0644);
#endif
+static int oos_shadow = 1;
+module_param(oos_shadow, bool, 0644);
+
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-struct kvm_pv_mmu_op_buffer {
- void *ptr;
- unsigned len;
- unsigned processed;
- char buf[512] __aligned(sizeof(long));
-};
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more;
};
+struct kvm_shadow_walk {
+ int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
+ u64 addr, u64 *spte, int level);
+};
+
+struct kvm_unsync_walk {
+ int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+};
+
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
{
struct vm_area_struct *vma;
unsigned long addr;
+ int ret = 0;
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
- return 0;
+ return ret;
+ down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, addr);
if (vma && is_vm_hugetlb_page(vma))
- return 1;
+ ret = 1;
+ up_read(&current->mm->mmap_sem);
- return 0;
+ return ret;
}
static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
if (write_protected)
kvm_flush_remote_tlbs(kvm);
-
- account_shadowed(kvm, gfn);
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
BUG();
}
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ mmu_parent_walk_fn fn)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ struct kvm_mmu_page *parent_sp;
+ int i;
+
+ if (!sp->multimapped && sp->parent_pte) {
+ parent_sp = page_header(__pa(sp->parent_pte));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ return;
+ }
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ }
+}
+
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+{
+ unsigned int index;
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ index = spte - sp->spt;
+ __set_bit(index, sp->unsync_child_bitmap);
+ sp->unsync_children = 1;
+}
+
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!sp->parent_pte)
+ return;
+
+ if (!sp->multimapped) {
+ kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+ return;
+ }
+
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+ }
+}
+
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ sp->unsync_children = 1;
+ kvm_mmu_update_parents_unsync(sp);
+ return 1;
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ kvm_mmu_update_parents_unsync(sp);
+}
+
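A sketch (not part of the patch) of the invariant these helpers establish, so that mmu_unsync_walk() below can find unsync leaves starting from any root:

        /*
         *      root sp         unsync_children = 1, bitmap bit set for
         *         |            the middle sp's slot
         *      middle sp       unsync_children = 1, bitmap bit set for
         *         |            the leaf sp's slot
         *      leaf sp         sp->unsync = 1 (set by kvm_unsync_page())
         */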
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
sp->spt[i] = shadow_trap_nonpresent_pte;
}
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ return 1;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
+#define for_each_unsync_children(bitmap, idx) \
+ for (idx = find_first_bit(bitmap, 512); \
+ idx < 512; \
+ idx = find_next_bit(bitmap, 512, idx+1))
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_unsync_walk *walker)
+{
+ int i, ret;
+
+ if (!sp->unsync_children)
+ return 0;
+
+ for_each_unsync_children(sp->unsync_child_bitmap, i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ ret = mmu_unsync_walk(child, walker);
+ if (ret)
+ return ret;
+ __clear_bit(i, sp->unsync_child_bitmap);
+ }
+
+ if (child->unsync) {
+ ret = walker->entry(child, walker);
+ __clear_bit(i, sp->unsync_child_bitmap);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
+ sp->unsync_children = 0;
+
+ return 0;
+}
+
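A hypothetical walker illustrating the kvm_unsync_walk contract assumed above: ->entry returns 0 to keep walking, nonzero to make mmu_unsync_walk() bail out early (mmu_sync_fn() below returns nonzero precisely so its caller can drop mmu_lock and reschedule):

        struct count_walker {
                struct kvm_unsync_walk walker;
                int count;
        };

        static int count_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
        {
                struct count_walker *cw = container_of(walk, struct count_walker,
                                                       walker);

                cw->count++;    /* visited one unsync leaf page */
                return 0;       /* 0 == keep walking */
        }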
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL;
}
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON(!sp->unsync);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ return 1;
+ }
+
+ rmap_write_protect(vcpu->kvm, sp->gfn);
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ return 1;
+ }
+
+ kvm_mmu_flush_tlb(vcpu);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ return 0;
+}
+
+struct sync_walker {
+ struct kvm_vcpu *vcpu;
+ struct kvm_unsync_walk walker;
+};
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+ struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+ walker);
+ struct kvm_vcpu *vcpu = sync_walk->vcpu;
+
+ kvm_sync_page(vcpu, sp);
+ return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ struct sync_walker walker = {
+ .walker = { .entry = mmu_sync_fn, },
+ .vcpu = vcpu,
+ };
+
+ while (mmu_unsync_walk(sp, &walker.walker))
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
unsigned quadrant;
struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node;
+ struct hlist_node *node, *tmp;
role.word = 0;
role.glevels = vcpu->arch.mmu.root_level;
@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn, role.word);
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && sp->role.word == role.word) {
+ hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+ if (sp->gfn == gfn) {
+ if (sp->unsync)
+ if (kvm_sync_page(vcpu, sp))
+ continue;
+
+ if (sp->role.word != role.word)
+ continue;
+
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+ }
pgprintk("%s: found\n", __func__);
return sp;
}
@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
- if (!metaphysical)
+ if (!metaphysical) {
rmap_write_protect(vcpu->kvm, gfn);
+ account_shadowed(vcpu->kvm, gfn);
+ }
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
vcpu->arch.mmu.prefetch_page(vcpu, sp);
else
@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
return sp;
}
+static int walk_shadow(struct kvm_shadow_walk *walker,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ hpa_t shadow_addr;
+ int level;
+ int r;
+ u64 *sptep;
+ unsigned index;
+
+ shadow_addr = vcpu->arch.mmu.root_hpa;
+ level = vcpu->arch.mmu.shadow_root_level;
+ if (level == PT32E_ROOT_LEVEL) {
+ shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ shadow_addr &= PT64_BASE_ADDR_MASK;
+ --level;
+ }
+
+ while (level >= PT_PAGE_TABLE_LEVEL) {
+ index = SHADOW_PT_INDEX(addr, level);
+ sptep = ((u64 *)__va(shadow_addr)) + index;
+ r = walker->entry(walker, vcpu, addr, sptep, level);
+ if (r)
+ return r;
+ shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
+ --level;
+ }
+ return 0;
+}
+
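walk_shadow() turns the shadow page-table descent into an iterator over (sptep, level) pairs. A hypothetical callback (not in the patch) that records where the walk bottoms out:

        struct leaf_walker {
                struct kvm_shadow_walk walker;
                u64 *leaf_sptep;
        };

        static int leaf_entry(struct kvm_shadow_walk *_w, struct kvm_vcpu *vcpu,
                              u64 addr, u64 *sptep, int level)
        {
                struct leaf_walker *w = container_of(_w, struct leaf_walker,
                                                     walker);

                if (level == PT_PAGE_TABLE_LEVEL || !is_shadow_present_pte(*sptep)) {
                        w->leaf_sptep = sptep;
                        return 1;       /* nonzero stops walk_shadow() */
                }
                return 0;               /* keep descending */
        }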
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
rmap_remove(kvm, &pt[i]);
pt[i] = shadow_trap_nonpresent_pte;
}
- kvm_flush_remote_tlbs(kvm);
return;
}
@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
}
pt[i] = shadow_trap_nonpresent_pte;
}
- kvm_flush_remote_tlbs(kvm);
}
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
kvm->vcpus[i]->arch.last_pte_updated = NULL;
}
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *parent_pte;
- ++kvm->stat.mmu_shadow_zapped;
while (sp->multimapped || sp->parent_pte) {
if (!sp->multimapped)
parent_pte = sp->parent_pte;
@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_put_page(sp, parent_pte);
set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
}
+}
+
+struct zap_walker {
+ struct kvm_unsync_walk walker;
+ struct kvm *kvm;
+ int zapped;
+};
+
+static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+ struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
+ walker);
+ kvm_mmu_zap_page(zap_walk->kvm, sp);
+ zap_walk->zapped = 1;
+ return 0;
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct zap_walker walker = {
+ .walker = { .entry = mmu_zap_fn, },
+ .kvm = kvm,
+ .zapped = 0,
+ };
+
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ return 0;
+ mmu_unsync_walk(sp, &walker.walker);
+ return walker.zapped;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ int ret;
+ ++kvm->stat.mmu_shadow_zapped;
+ ret = mmu_zap_unsync_children(kvm, sp);
kvm_mmu_page_unlink_children(kvm, sp);
+ kvm_mmu_unlink_parents(kvm, sp);
+ kvm_flush_remote_tlbs(kvm);
+ if (!sp->role.invalid && !sp->role.metaphysical)
+ unaccount_shadowed(kvm, sp->gfn);
+ if (sp->unsync)
+ kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
- if (!sp->role.metaphysical && !sp->role.invalid)
- unaccount_shadowed(kvm, sp->gfn);
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
} else {
- int invalid = sp->role.invalid;
- list_move(&sp->link, &kvm->arch.active_mmu_pages);
sp->role.invalid = 1;
+ list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
- if (!sp->role.metaphysical && !invalid)
- unaccount_shadowed(kvm, sp->gfn);
}
kvm_mmu_reset_last_pte_updated(kvm);
+ return ret;
}
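Note the new int return: nonzero tells the caller that pages beyond sp may have been zapped (unsync children), so hash-chain walkers must restart instead of trusting a saved next pointer. The pattern, as used in the hunks below (should_zap() stands in for the caller's condition):

        hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
                if (should_zap(sp) && kvm_mmu_zap_page(kvm, sp))
                        n = bucket->first;      /* chain changed: restart */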
/*
@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
if (sp->gfn == gfn && !sp->role.metaphysical) {
pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word);
- kvm_mmu_zap_page(kvm, sp);
r = 1;
+ if (kvm_mmu_zap_page(kvm, sp))
+ n = bucket->first;
}
return r;
}
@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
__set_bit(slot, &sp->slot_bitmap);
}
+static void mmu_convert_notrap(struct kvm_mmu_page *sp)
+{
+ int i;
+ u64 *pt = sp->spt;
+
+ if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
+ return;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (pt[i] == shadow_notrap_nonpresent_pte)
+ set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+ }
+}
+
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
struct page *page;
@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
if (gpa == UNMAPPED_GVA)
return NULL;
- down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
return page;
}
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
- unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, gfn_t gfn,
- pfn_t pfn, bool speculative)
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- u64 spte;
- int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*shadow_pte);
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *s;
+ struct hlist_node *node, *n;
- pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %lx\n",
- __func__, *shadow_pte, pt_access,
- write_fault, user_fault, gfn);
+ index = kvm_page_table_hashfn(sp->gfn);
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ /* don't unsync if pagetable is shadowed with multiple roles */
+ hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+ if (s->gfn != sp->gfn || s->role.metaphysical)
+ continue;
+ if (s->role.word != sp->role.word)
+ return 1;
+ }
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+ ++vcpu->kvm->stat.mmu_unsync;
+ sp->unsync = 1;
+ mmu_convert_notrap(sp);
+ return 0;
+}
- if (is_rmap_pte(*shadow_pte)) {
- /*
- * If we overwrite a PTE page pointer with a 2MB PMD, unlink
- * the parent of the now unreachable PTE.
- */
- if (largepage && !is_large_pte(*shadow_pte)) {
- struct kvm_mmu_page *child;
- u64 pte = *shadow_pte;
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
+{
+ struct kvm_mmu_page *shadow;
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, shadow_pte);
- } else if (pfn != spte_to_pfn(*shadow_pte)) {
- pgprintk("hfn old %lx new %lx\n",
- spte_to_pfn(*shadow_pte), pfn);
- rmap_remove(vcpu->kvm, shadow_pte);
- } else {
- if (largepage)
- was_rmapped = is_large_pte(*shadow_pte);
- else
- was_rmapped = 1;
- }
+ shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+ if (shadow) {
+ if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+ return 1;
+ if (shadow->unsync)
+ return 0;
+ if (can_unsync && oos_shadow)
+ return kvm_unsync_page(vcpu, shadow);
+ return 1;
}
+ return 0;
+}
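Restating the decision mmu_need_write_protect() makes (1 == caller must write-protect):

        /*
         *      gfn not shadowed                -> 0 (spte may stay writable)
         *      shadowed as a non-leaf table    -> 1
         *      already marked unsync           -> 0 (writes already allowed)
         *      can_unsync && oos_shadow        -> kvm_unsync_page() result
         *      otherwise                       -> 1
         */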
+static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+ unsigned pte_access, int user_fault,
+ int write_fault, int dirty, int largepage,
+ gfn_t gfn, pfn_t pfn, bool speculative,
+ bool can_unsync)
+{
+ u64 spte;
+ int ret = 0;
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
*/
spte = shadow_base_present_pte | shadow_dirty_mask;
if (!speculative)
- pte_access |= PT_ACCESSED_MASK;
+ spte |= shadow_accessed_mask;
if (!dirty)
pte_access &= ~ACC_WRITE_MASK;
if (pte_access & ACC_EXEC_MASK)
@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
- struct kvm_mmu_page *shadow;
+
+ if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+ ret = 1;
+ spte = shadow_trap_nonpresent_pte;
+ goto set_pte;
+ }
spte |= PT_WRITABLE_MASK;
- shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow ||
- (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
+ if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn);
+ ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writeble_pte(spte)) {
+ if (is_writeble_pte(spte))
spte &= ~PT_WRITABLE_MASK;
- kvm_x86_ops->tlb_flush(vcpu);
- }
- if (write_fault)
- *ptwrite = 1;
}
}
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
- pgprintk("%s: setting spte %llx\n", __func__, spte);
- pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
- (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
- (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
+set_pte:
set_shadow_pte(shadow_pte, spte);
- if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
- && (spte & PT_PRESENT_MASK))
+ return ret;
+}
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+ unsigned pt_access, unsigned pte_access,
+ int user_fault, int write_fault, int dirty,
+ int *ptwrite, int largepage, gfn_t gfn,
+ pfn_t pfn, bool speculative)
+{
+ int was_rmapped = 0;
+ int was_writeble = is_writeble_pte(*shadow_pte);
+
+ pgprintk("%s: spte %llx access %x write_fault %d"
+ " user_fault %d gfn %lx\n",
+ __func__, *shadow_pte, pt_access,
+ write_fault, user_fault, gfn);
+
+ if (is_rmap_pte(*shadow_pte)) {
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (largepage && !is_large_pte(*shadow_pte)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *shadow_pte;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, shadow_pte);
+ } else if (pfn != spte_to_pfn(*shadow_pte)) {
+ pgprintk("hfn old %lx new %lx\n",
+ spte_to_pfn(*shadow_pte), pfn);
+ rmap_remove(vcpu->kvm, shadow_pte);
+ } else {
+ if (largepage)
+ was_rmapped = is_large_pte(*shadow_pte);
+ else
+ was_rmapped = 1;
+ }
+ }
+ if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
+ dirty, largepage, gfn, pfn, speculative, true)) {
+ if (write_fault)
+ *ptwrite = 1;
+ kvm_x86_ops->tlb_flush(vcpu);
+ }
+
+ pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+ pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+ is_large_pte(*shadow_pte)? "2MB" : "4kB",
+ is_present_pte(*shadow_pte)?"RW":"R", gfn,
+ *shadow_pte, shadow_pte);
+ if (!was_rmapped && is_large_pte(*shadow_pte))
++vcpu->kvm->stat.lpages;
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int largepage, gfn_t gfn, pfn_t pfn,
- int level)
-{
- hpa_t table_addr = vcpu->arch.mmu.root_hpa;
- int pt_write = 0;
-
- for (; ; level--) {
- u32 index = PT64_INDEX(v, level);
- u64 *table;
-
- ASSERT(VALID_PAGE(table_addr));
- table = __va(table_addr);
+struct direct_shadow_walk {
+ struct kvm_shadow_walk walker;
+ pfn_t pfn;
+ int write;
+ int largepage;
+ int pt_write;
+};
- if (level == 1) {
- mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, 0, gfn, pfn, false);
- return pt_write;
- }
+static int direct_map_entry(struct kvm_shadow_walk *_walk,
+ struct kvm_vcpu *vcpu,
+ u64 addr, u64 *sptep, int level)
+{
+ struct direct_shadow_walk *walk =
+ container_of(_walk, struct direct_shadow_walk, walker);
+ struct kvm_mmu_page *sp;
+ gfn_t pseudo_gfn;
+ gfn_t gfn = addr >> PAGE_SHIFT;
+
+ if (level == PT_PAGE_TABLE_LEVEL
+ || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
+ mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
+ 0, walk->write, 1, &walk->pt_write,
+ walk->largepage, gfn, walk->pfn, false);
+ ++vcpu->stat.pf_fixed;
+ return 1;
+ }
- if (largepage && level == 2) {
- mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, 1, gfn, pfn, false);
- return pt_write;
+ if (*sptep == shadow_trap_nonpresent_pte) {
+ pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
+ 1, ACC_ALL, sptep);
+ if (!sp) {
+ pgprintk("nonpaging_map: ENOMEM\n");
+ kvm_release_pfn_clean(walk->pfn);
+ return -ENOMEM;
}
- if (table[index] == shadow_trap_nonpresent_pte) {
- struct kvm_mmu_page *new_table;
- gfn_t pseudo_gfn;
-
- pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
- v, level - 1,
- 1, ACC_ALL, &table[index]);
- if (!new_table) {
- pgprintk("nonpaging_map: ENOMEM\n");
- kvm_release_pfn_clean(pfn);
- return -ENOMEM;
- }
-
- set_shadow_pte(&table[index],
- __pa(new_table->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
- }
- table_addr = table[index] & PT64_BASE_ADDR_MASK;
+ set_shadow_pte(sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask);
}
+ return 0;
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+ int largepage, gfn_t gfn, pfn_t pfn)
+{
+ int r;
+ struct direct_shadow_walk walker = {
+ .walker = { .entry = direct_map_entry, },
+ .pfn = pfn,
+ .largepage = largepage,
+ .write = write,
+ .pt_write = 0,
+ };
+
+ r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
+ if (r < 0)
+ return r;
+ return walker.pt_write;
}
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
pfn_t pfn;
unsigned long mmu_seq;
- down_read(&current->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1;
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- up_read(&current->mm->mmap_sem);
/* mmio */
if (is_error_pfn(pfn)) {
@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
- PT32E_ROOT_LEVEL);
+ r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
}
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ }
+ }
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_roots(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
if (r)
return r;
- down_read(&current->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1;
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- up_read(&current->mm->mmap_sem);
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return 1;
@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
+ largepage, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->prefetch_page = paging64_prefetch_page;
+ context->sync_page = paging64_sync_page;
+ context->invlpg = paging64_invlpg;
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
context->prefetch_page = paging32_prefetch_page;
+ context->sync_page = paging32_sync_page;
+ context->invlpg = paging32_invlpg;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->root_hpa = INVALID_PAGE;
@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
+ mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
- down_read(&current->mm->mmap_sem);
if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
vcpu->arch.update_pte.largepage = 1;
}
vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- up_read(&current->mm->mmap_sem);
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
- if (sp->gfn != gfn || sp->role.metaphysical)
+ if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
continue;
pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (kvm_mmu_zap_page(vcpu->kvm, sp))
+ n = bucket->first;
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -1969,6 +2350,16 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.invlpg(vcpu, gva);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_flush_tlb(vcpu);
+ ++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
void kvm_enable_tdp(void)
{
tdp_enabled = true;
@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
+ spin_lock(&kvm->mmu_lock);
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
u64 *pt;
@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (pt[i] & PT_WRITABLE_MASK)
pt[i] &= ~PT_WRITABLE_MASK;
}
+ kvm_flush_remote_tlbs(kvm);
+ spin_unlock(&kvm->mmu_lock);
}
void kvm_mmu_zap_all(struct kvm *kvm)
@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
spin_lock(&kvm->mmu_lock);
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- kvm_mmu_zap_page(kvm, sp);
+ if (kvm_mmu_zap_page(kvm, sp))
+ node = container_of(kvm->arch.active_mmu_pages.next,
+ struct kvm_mmu_page, link);
spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
@@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
gpa_t addr, unsigned long *ret)
{
int r;
- struct kvm_pv_mmu_op_buffer buffer;
+ struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
- buffer.ptr = buffer.buf;
- buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
- buffer.processed = 0;
+ buffer->ptr = buffer->buf;
+ buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
+ buffer->processed = 0;
- r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+ r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
if (r)
goto out;
- while (buffer.len) {
- r = kvm_pv_mmu_op_one(vcpu, &buffer);
+ while (buffer->len) {
+ r = kvm_pv_mmu_op_one(vcpu, buffer);
if (r < 0)
goto out;
if (r == 0)
@@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
r = 1;
out:
- *ret = buffer.processed;
+ *ret = buffer->processed;
return r;
}
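The 512+ byte buffer no longer lives on the kernel stack; presumably struct kvm_pv_mmu_op_buffer moved into the vcpu in the matching header change (not shown):

        /*
         * Assumed shape of the header-side change: struct kvm_vcpu_arch
         * gains
         *      struct kvm_pv_mmu_op_buffer mmu_op_buffer;
         * which vcpu->arch.mmu_op_buffer above refers to.
         */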
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4a814bff21f..613ec9aa674 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,11 +25,11 @@
#if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
+ #define shadow_walker shadow_walker64
#define FNAME(name) paging##64_##name
#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
#ifdef CONFIG_X86_64
@@ -42,11 +42,11 @@
#elif PTTYPE == 32
#define pt_element_t u32
#define guest_walker guest_walker32
+ #define shadow_walker shadow_walker32
#define FNAME(name) paging##32_##name
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
- #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
@@ -73,6 +73,17 @@ struct guest_walker {
u32 error_code;
};
+struct shadow_walker {
+ struct kvm_shadow_walk walker;
+ struct guest_walker *guest_walker;
+ int user_fault;
+ int write_fault;
+ int largepage;
+ int *ptwrite;
+ pfn_t pfn;
+ u64 *sptep;
+};
+
static gfn_t gpte_to_gfn(pt_element_t gpte)
{
return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
pt_element_t *table;
struct page *page;
- down_read(&current->mm->mmap_sem);
page = gfn_to_page(kvm, table_gfn);
- up_read(&current->mm->mmap_sem);
table = kmap_atomic(page, KM_USER0);
-
ret = CMPXCHG(&table[index], orig_pte, new_pte);
-
kunmap_atomic(table, KM_USER0);
kvm_release_page_dirty(page);
@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
- struct guest_walker *walker,
- int user_fault, int write_fault, int largepage,
- int *ptwrite, pfn_t pfn)
+static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
+ struct kvm_vcpu *vcpu, u64 addr,
+ u64 *sptep, int level)
{
- hpa_t shadow_addr;
- int level;
- u64 *shadow_ent;
- unsigned access = walker->pt_access;
-
- if (!is_present_pte(walker->ptes[walker->level - 1]))
- return NULL;
-
- shadow_addr = vcpu->arch.mmu.root_hpa;
- level = vcpu->arch.mmu.shadow_root_level;
- if (level == PT32E_ROOT_LEVEL) {
- shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
- shadow_addr &= PT64_BASE_ADDR_MASK;
- --level;
+ struct shadow_walker *sw =
+ container_of(_sw, struct shadow_walker, walker);
+ struct guest_walker *gw = sw->guest_walker;
+ unsigned access = gw->pt_access;
+ struct kvm_mmu_page *shadow_page;
+ u64 spte;
+ int metaphysical;
+ gfn_t table_gfn;
+ int r;
+ pt_element_t curr_pte;
+
+ if (level == PT_PAGE_TABLE_LEVEL
+ || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
+ mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
+ sw->user_fault, sw->write_fault,
+ gw->ptes[gw->level-1] & PT_DIRTY_MASK,
+ sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
+ false);
+ sw->sptep = sptep;
+ return 1;
}
- for (; ; level--) {
- u32 index = SHADOW_PT_INDEX(addr, level);
- struct kvm_mmu_page *shadow_page;
- u64 shadow_pte;
- int metaphysical;
- gfn_t table_gfn;
-
- shadow_ent = ((u64 *)__va(shadow_addr)) + index;
- if (level == PT_PAGE_TABLE_LEVEL)
- break;
-
- if (largepage && level == PT_DIRECTORY_LEVEL)
- break;
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+ return 0;
- if (is_shadow_present_pte(*shadow_ent)
- && !is_large_pte(*shadow_ent)) {
- shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
- continue;
- }
+ if (is_large_pte(*sptep)) {
+ set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ rmap_remove(vcpu->kvm, sptep);
+ }
- if (is_large_pte(*shadow_ent))
- rmap_remove(vcpu->kvm, shadow_ent);
-
- if (level - 1 == PT_PAGE_TABLE_LEVEL
- && walker->level == PT_DIRECTORY_LEVEL) {
- metaphysical = 1;
- if (!is_dirty_pte(walker->ptes[level - 1]))
- access &= ~ACC_WRITE_MASK;
- table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
- } else {
- metaphysical = 0;
- table_gfn = walker->table_gfn[level - 2];
- }
- shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
- metaphysical, access,
- shadow_ent);
- if (!metaphysical) {
- int r;
- pt_element_t curr_pte;
- r = kvm_read_guest_atomic(vcpu->kvm,
- walker->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (r || curr_pte != walker->ptes[level - 2]) {
- kvm_release_pfn_clean(pfn);
- return NULL;
- }
+ if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
+ metaphysical = 1;
+ if (!is_dirty_pte(gw->ptes[level - 1]))
+ access &= ~ACC_WRITE_MASK;
+ table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+ } else {
+ metaphysical = 0;
+ table_gfn = gw->table_gfn[level - 2];
+ }
+ shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
+ metaphysical, access, sptep);
+ if (!metaphysical) {
+ r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
+ &curr_pte, sizeof(curr_pte));
+ if (r || curr_pte != gw->ptes[level - 2]) {
+ kvm_release_pfn_clean(sw->pfn);
+ sw->sptep = NULL;
+ return 1;
}
- shadow_addr = __pa(shadow_page->spt);
- shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- set_shadow_pte(shadow_ent, shadow_pte);
}
- mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
- user_fault, write_fault,
- walker->ptes[walker->level-1] & PT_DIRTY_MASK,
- ptwrite, largepage, walker->gfn, pfn, false);
+ spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ *sptep = spte;
+ return 0;
+}
+
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+ struct guest_walker *guest_walker,
+ int user_fault, int write_fault, int largepage,
+ int *ptwrite, pfn_t pfn)
+{
+ struct shadow_walker walker = {
+ .walker = { .entry = FNAME(shadow_walk_entry), },
+ .guest_walker = guest_walker,
+ .user_fault = user_fault,
+ .write_fault = write_fault,
+ .largepage = largepage,
+ .ptwrite = ptwrite,
+ .pfn = pfn,
+ };
+
+ if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
+ return NULL;
+
+ walk_shadow(&walker.walker, vcpu, addr);
- return shadow_ent;
+ return walker.sptep;
}
/*
@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return 0;
}
- down_read(&current->mm->mmap_sem);
if (walker.level == PT_DIRECTORY_LEVEL) {
gfn_t large_gfn;
large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
}
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
- up_read(&current->mm->mmap_sem);
/* mmio */
if (is_error_pfn(pfn)) {
@@ -453,6 +461,31 @@ out_unlock:
return 0;
}
+static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
+ struct kvm_vcpu *vcpu, u64 addr,
+ u64 *sptep, int level)
+{
+ if (level == PT_PAGE_TABLE_LEVEL) {
+ if (is_shadow_present_pte(*sptep))
+ rmap_remove(vcpu->kvm, sptep);
+ set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ return 1;
+ }
+ if (!is_shadow_present_pte(*sptep))
+ return 1;
+ return 0;
+}
+
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct shadow_walker walker = {
+ .walker = { .entry = FNAME(shadow_invlpg_entry), },
+ };
+
+ walk_shadow(&walker.walker, vcpu, gva);
+}
+
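Rough call flow for a guest INVLPG once this series is wired up (a sketch; it relies on the interception changes in svm.c/vmx.c below and on the instruction emulator routing invlpg here):

        /*
         *      #VMEXIT(INVLPG) / EXIT_REASON_INVLPG
         *        -> emulate_instruction()              decodes the invlpg
         *          -> kvm_mmu_invlpg(vcpu, gva)        added in mmu.c above
         *            -> vcpu->arch.mmu.invlpg()        paging##_invlpg here
         *            -> kvm_mmu_flush_tlb(vcpu)
         */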
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
{
struct guest_walker walker;
@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
}
}
+/*
+ * Using the cached information from sp->gfns is safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ * can't change unless all sptes pointing to it are nuked first.
+ * - Alias changes zap the entire shadow cache.
+ */
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int i, offset, nr_present;
+
+ offset = nr_present = 0;
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn = sp->gfns[i];
+
+ if (!is_shadow_present_pte(sp->spt[i]))
+ continue;
+
+ pte_gpa = gfn_to_gpa(sp->gfn);
+ pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+ if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -EINVAL;
+
+ if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
+ !(gpte & PT_ACCESSED_MASK)) {
+ u64 nonpresent;
+
+ rmap_remove(vcpu->kvm, &sp->spt[i]);
+ if (is_present_pte(gpte))
+ nonpresent = shadow_trap_nonpresent_pte;
+ else
+ nonpresent = shadow_notrap_nonpresent_pte;
+ set_shadow_pte(&sp->spt[i], nonpresent);
+ continue;
+ }
+
+ nr_present++;
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+ is_dirty_pte(gpte), 0, gfn,
+ spte_to_pfn(sp->spt[i]), true, false);
+ }
+
+ return !nr_present;
+}
+
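The resync rule above, restated for a guest pte at index i (sketch):

        /*
         *      gfn moved, pte cleared, or accessed bit clear:
         *        drop the spte; a present-but-stale gpte becomes
         *        shadow_trap_nonpresent_pte (fault back into KVM), a
         *        non-present gpte becomes shadow_notrap_nonpresent_pte.
         *      otherwise:
         *        set_spte(..., speculative = true, can_unsync = false),
         *        i.e. re-shadow read-only if needed, never re-unsync here.
         *      If nothing stayed present, the function returns 1 and
         *      kvm_sync_page() zaps the whole page.
         */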
#undef pt_element_t
#undef guest_walker
+#undef shadow_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
-#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_DIR_BASE_ADDR_MASK
#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8233b86c778..9c4ce657d96 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
#include "kvm_svm.h"
#include "irq.h"
#include "mmu.h"
+#include "kvm_cache_regs.h"
#include <linux/module.h>
#include <linux/kernel.h>
@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL");
#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1
-#define DB_VECTOR 1
-#define UD_VECTOR 6
-#define GP_VECTOR 13
-
#define DR7_GD_MASK (1 << 13)
#define DR6_BD_MASK (1 << 13)
@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_DEATURE_SVML (1 << 2)
+#define SVM_FEATURE_SVML (1 << 2)
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
}
- if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
- printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
- __func__,
- svm->vmcb->save.rip,
- svm->next_rip);
+ if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+ printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
+ __func__, kvm_rip_read(vcpu), svm->next_rip);
- vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
+ kvm_rip_write(vcpu, svm->next_rip);
svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
vcpu->arch.interrupt_window_open = 1;
@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
(1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_INVD) |
(1ULL << INTERCEPT_HLT) |
+ (1ULL << INTERCEPT_INVLPG) |
(1ULL << INTERCEPT_INVLPGA) |
(1ULL << INTERCEPT_IOIO_PROT) |
(1ULL << INTERCEPT_MSR_PROT) |
@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
save->dr7 = 0x400;
save->rflags = 2;
save->rip = 0x0000fff0;
+ svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
/*
* cr0 val on cpu init should be 0x60000010, we enable cpu
@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl = 1;
- control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH);
+ control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
+ (1ULL << INTERCEPT_INVLPG));
control->intercept_exceptions &= ~(1 << PF_VECTOR);
control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
INTERCEPT_CR3_MASK);
@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
init_vmcb(svm);
if (vcpu->vcpu_id != 0) {
- svm->vmcb->save.rip = 0;
+ kvm_rip_write(vcpu, 0);
svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
}
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
return 0;
}
@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
rdtscll(vcpu->arch.host_tsc);
}
-static void svm_cache_regs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.rip = svm->vmcb->save.rip;
-}
-
-static void svm_decache_regs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.rip;
-}
-
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
return to_svm(vcpu)->vmcb->save.rflags;
@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
if (npt_enabled)
svm_flush_tlb(&svm->vcpu);
- if (event_injection)
+ if (!npt_enabled && event_injection)
kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- svm->next_rip = svm->vmcb->save.rip + 1;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
skip_emulated_instruction(&svm->vcpu);
return kvm_emulate_halt(&svm->vcpu);
}
static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- svm->next_rip = svm->vmcb->save.rip + 3;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
kvm_emulate_hypercall(&svm->vcpu);
return 1;
@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm,
static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- svm->next_rip = svm->vmcb->save.rip + 2;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
kvm_emulate_cpuid(&svm->vcpu);
return 1;
}
+static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
+ pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
+ return 1;
+}
+
static int emulate_on_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
(u32)(data >> 32), handler);
- svm->vmcb->save.rax = data & 0xffffffff;
+ svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
- svm->next_rip = svm->vmcb->save.rip + 2;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
skip_emulated_instruction(&svm->vcpu);
}
return 1;
@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u64 data = (svm->vmcb->save.rax & -1u)
+ u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
handler);
- svm->next_rip = svm->vmcb->save.rip + 2;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
if (svm_set_msr(&svm->vcpu, ecx, data))
kvm_inject_gp(&svm->vcpu, 0);
else
@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
[SVM_EXIT_HLT] = halt_interception,
- [SVM_EXIT_INVLPG] = emulate_on_interception,
+ [SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invalid_op_interception,
[SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception,
@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
+ ++svm->vcpu.stat.irq_injections;
control = &svm->vmcb->control;
control->int_vector = irq;
control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}
+#ifdef CONFIG_X86_64
+#define R "r"
+#else
+#define R "e"
+#endif
+
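The R macro leans on C's adjacent-string-literal concatenation, for example:

        /*
         *      "push %%"R"bp"
         * preprocesses to "push %%" "r" "bp" under CONFIG_X86_64 and to
         * "push %%" "e" "bp" otherwise, so the assembler sees
         * push %rbp or push %ebp from one asm body.
         */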
static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
u16 gs_selector;
u16 ldt_selector;
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
pre_svm_run(svm);
sync_lapic_to_cr8(vcpu);
@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
local_irq_enable();
asm volatile (
+ "push %%"R"bp; \n\t"
+ "mov %c[rbx](%[svm]), %%"R"bx \n\t"
+ "mov %c[rcx](%[svm]), %%"R"cx \n\t"
+ "mov %c[rdx](%[svm]), %%"R"dx \n\t"
+ "mov %c[rsi](%[svm]), %%"R"si \n\t"
+ "mov %c[rdi](%[svm]), %%"R"di \n\t"
+ "mov %c[rbp](%[svm]), %%"R"bp \n\t"
#ifdef CONFIG_X86_64
- "push %%rbp; \n\t"
-#else
- "push %%ebp; \n\t"
-#endif
-
-#ifdef CONFIG_X86_64
- "mov %c[rbx](%[svm]), %%rbx \n\t"
- "mov %c[rcx](%[svm]), %%rcx \n\t"
- "mov %c[rdx](%[svm]), %%rdx \n\t"
- "mov %c[rsi](%[svm]), %%rsi \n\t"
- "mov %c[rdi](%[svm]), %%rdi \n\t"
- "mov %c[rbp](%[svm]), %%rbp \n\t"
"mov %c[r8](%[svm]), %%r8 \n\t"
"mov %c[r9](%[svm]), %%r9 \n\t"
"mov %c[r10](%[svm]), %%r10 \n\t"
@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %c[r13](%[svm]), %%r13 \n\t"
"mov %c[r14](%[svm]), %%r14 \n\t"
"mov %c[r15](%[svm]), %%r15 \n\t"
-#else
- "mov %c[rbx](%[svm]), %%ebx \n\t"
- "mov %c[rcx](%[svm]), %%ecx \n\t"
- "mov %c[rdx](%[svm]), %%edx \n\t"
- "mov %c[rsi](%[svm]), %%esi \n\t"
- "mov %c[rdi](%[svm]), %%edi \n\t"
- "mov %c[rbp](%[svm]), %%ebp \n\t"
#endif
-#ifdef CONFIG_X86_64
- /* Enter guest mode */
- "push %%rax \n\t"
- "mov %c[vmcb](%[svm]), %%rax \n\t"
- __ex(SVM_VMLOAD) "\n\t"
- __ex(SVM_VMRUN) "\n\t"
- __ex(SVM_VMSAVE) "\n\t"
- "pop %%rax \n\t"
-#else
/* Enter guest mode */
- "push %%eax \n\t"
- "mov %c[vmcb](%[svm]), %%eax \n\t"
+ "push %%"R"ax \n\t"
+ "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
__ex(SVM_VMLOAD) "\n\t"
__ex(SVM_VMRUN) "\n\t"
__ex(SVM_VMSAVE) "\n\t"
- "pop %%eax \n\t"
-#endif
+ "pop %%"R"ax \n\t"
/* Save guest registers, load host registers */
+ "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
+ "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
+ "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
+ "mov %%"R"si, %c[rsi](%[svm]) \n\t"
+ "mov %%"R"di, %c[rdi](%[svm]) \n\t"
+ "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
#ifdef CONFIG_X86_64
- "mov %%rbx, %c[rbx](%[svm]) \n\t"
- "mov %%rcx, %c[rcx](%[svm]) \n\t"
- "mov %%rdx, %c[rdx](%[svm]) \n\t"
- "mov %%rsi, %c[rsi](%[svm]) \n\t"
- "mov %%rdi, %c[rdi](%[svm]) \n\t"
- "mov %%rbp, %c[rbp](%[svm]) \n\t"
"mov %%r8, %c[r8](%[svm]) \n\t"
"mov %%r9, %c[r9](%[svm]) \n\t"
"mov %%r10, %c[r10](%[svm]) \n\t"
@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%r13, %c[r13](%[svm]) \n\t"
"mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t"
-
- "pop %%rbp; \n\t"
-#else
- "mov %%ebx, %c[rbx](%[svm]) \n\t"
- "mov %%ecx, %c[rcx](%[svm]) \n\t"
- "mov %%edx, %c[rdx](%[svm]) \n\t"
- "mov %%esi, %c[rsi](%[svm]) \n\t"
- "mov %%edi, %c[rdi](%[svm]) \n\t"
- "mov %%ebp, %c[rbp](%[svm]) \n\t"
-
- "pop %%ebp; \n\t"
#endif
+ "pop %%"R"bp"
:
: [svm]"a"(svm),
[vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
[r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
#endif
: "cc", "memory"
+ , R"bx", R"cx", R"dx", R"si", R"di"
#ifdef CONFIG_X86_64
- , "rbx", "rcx", "rdx", "rsi", "rdi"
, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#else
- , "ebx", "ecx", "edx" , "esi", "edi"
#endif
);
@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
load_db_regs(svm->host_db_regs);
vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
write_dr6(svm->host_dr6);
write_dr7(svm->host_dr7);
@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
svm->next_rip = 0;
}
+#undef R
+
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_gdt = svm_set_gdt,
.get_dr = svm_get_dr,
.set_dr = svm_set_dr,
- .cache_regs = svm_cache_regs,
- .decache_regs = svm_decache_regs,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7041cc52b56..2643b430d83 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,8 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
+#include "kvm_cache_regs.h"
+#include "x86.h"
#include <asm/io.h>
#include <asm/desc.h>
@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
static int enable_ept = 1;
module_param(enable_ept, bool, 0);
+static int emulate_invalid_guest_state = 0;
+module_param(emulate_invalid_guest_state, bool, 0);
+
struct vmcs {
u32 revision_id;
u32 abort;
@@ -56,6 +61,7 @@ struct vmcs {
struct vcpu_vmx {
struct kvm_vcpu vcpu;
struct list_head local_vcpus_link;
+ unsigned long host_rsp;
int launched;
u8 fail;
u32 idt_vectoring_info;
@@ -83,6 +89,7 @@ struct vcpu_vmx {
} irq;
} rmode;
int vpid;
+ bool emulation_required;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR;
if (vcpu->guest_debug.enabled)
- eb |= 1u << 1;
+ eb |= 1u << DB_VECTOR;
if (vcpu->arch.rmode.active)
eb = ~0;
if (vm_need_ept())
@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
unsigned long rip;
u32 interruptibility;
- rip = vmcs_readl(GUEST_RIP);
+ rip = kvm_rip_read(vcpu);
rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- vmcs_writel(GUEST_RIP, rip);
+ kvm_rip_write(vcpu, rip);
/*
* We emulated an instruction, so temporary interrupt blocking
@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (has_error_code)
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+
+ if (vcpu->arch.rmode.active) {
+ vmx->rmode.irq.pending = true;
+ vmx->rmode.irq.vector = nr;
+ vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+ if (nr == BP_VECTOR)
+ vmx->rmode.irq.rip++;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ nr | INTR_TYPE_SOFT_INTR
+ | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
+ | INTR_INFO_VALID_MASK);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+ return;
+ }
+
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
nr | INTR_TYPE_EXCEPTION
| (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
| INTR_INFO_VALID_MASK);
- if (has_error_code)
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
}
static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ return false;
}
/*
@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
return ret;
}
-/*
- * Sync the rsp and rip registers into the vcpu structure. This allows
- * registers to be accessed by indexing vcpu->arch.regs.
- */
-static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
- vcpu->arch.rip = vmcs_readl(GUEST_RIP);
-}
-
-/*
- * Syncs rsp and rip back into the vmcs. Should be called after possible
- * modification.
- */
-static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
+static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
- vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- vmcs_writel(GUEST_RIP, vcpu->arch.rip);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ switch (reg) {
+ case VCPU_REGS_RSP:
+ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+ break;
+ case VCPU_REGS_RIP:
+ vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+ break;
+ default:
+ break;
+ }
}
static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
static int vmx_get_irq(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 idtv_info_field;
-
- idtv_info_field = vmx->idt_vectoring_info;
- if (idtv_info_field & INTR_INFO_VALID_MASK) {
- if (is_external_interrupt(idtv_info_field))
- return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
- else
- printk(KERN_DEBUG "pending exception: not handled yet\n");
- }
- return -1;
+ if (!vcpu->arch.interrupt.pending)
+ return -1;
+ return vcpu->arch.interrupt.nr;
}
static __init int cpu_has_kvm_support(void)
@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
u64 msr;
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
- == MSR_IA32_FEATURE_CONTROL_LOCKED;
+ return (msr & (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED))
+ == FEATURE_CONTROL_LOCKED;
/* locked but not enabled */
}
@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage)
INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
- if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
- != (MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
+ if ((old & (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED))
+ != (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED))
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
- MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
+ FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED);
write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
asm volatile (ASM_VMX_VMXON_RAX
: : "a"(&phys_addr), "m"(phys_addr)
@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING;
+ CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_INVLPG_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
- /* CR3 accesses don't need to cause VM Exits when EPT enabled */
+ /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
+ * is enabled */
min &= ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
return -EIO;
@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
static void enter_pmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ vmx->emulation_required = 1;
vcpu->arch.rmode.active = 0;
vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
update_exception_bitmap(vcpu);
+ if (emulate_invalid_guest_state)
+ return;
+
fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
static void enter_rmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ vmx->emulation_required = 1;
vcpu->arch.rmode.active = 1;
vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
update_exception_bitmap(vcpu);
+ if (emulate_invalid_guest_state)
+ goto continue_rmode;
+
vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
vmcs_write32(GUEST_SS_LIMIT, 0xffff);
vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+continue_rmode:
kvm_mmu_reset_context(vcpu);
init_rmode(vcpu->kvm);
}
@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
vmcs_writel(GUEST_GDTR_BASE, dt->base);
}
+static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ u32 ar;
+
+ vmx_get_segment(vcpu, &var, seg);
+ ar = vmx_segment_access_rights(&var);
+
+ if (var.base != (var.selector << 4))
+ return false;
+ if (var.limit != 0xffff)
+ return false;
+ if (ar != 0xf3)
+ return false;
+
+ return true;
+}
+
+static bool code_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ unsigned int cs_rpl;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+
+ if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
+ return false;
+ if (!cs.s)
+ return false;
+ if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
+ if (cs.dpl > cs_rpl)
+ return false;
+ } else if (cs.type & AR_TYPE_CODE_MASK) {
+ if (cs.dpl != cs_rpl)
+ return false;
+ }
+ if (!cs.present)
+ return false;
+
+ /*
+ * TODO: Add Reserved field check; this will require a new member in
+ * the kvm_segment_field structure.
+ */
+ return true;
+}
+
+static bool stack_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ss;
+ unsigned int ss_rpl;
+
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+ ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+
+ if (ss.type != 3 && ss.type != 7)
+ return false;
+ if (!ss.s)
+ return false;
+ if (ss.dpl != ss_rpl) /* DPL != RPL */
+ return false;
+ if (!ss.present)
+ return false;
+
+ return true;
+}
+
+static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ unsigned int rpl;
+
+ vmx_get_segment(vcpu, &var, seg);
+ rpl = var.selector & SELECTOR_RPL_MASK;
+
+ if (!var.s)
+ return false;
+ if (!var.present)
+ return false;
+ if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
+ if (var.dpl < rpl) /* DPL < RPL */
+ return false;
+ }
+
+ /*
+ * TODO: Add other members to kvm_segment_field to allow checking
+ * for other access rights flags.
+ */
+ return true;
+}
+
+static bool tr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment tr;
+
+ vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+
+ if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
+ return false;
+ if ((tr.type != 3) && (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
+ return false;
+ if (!tr.present)
+ return false;
+
+ return true;
+}
+
+static bool ldtr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ldtr;
+
+ vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+
+ if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
+ return false;
+ if (ldtr.type != 2)
+ return false;
+ if (!ldtr.present)
+ return false;
+
+ return true;
+}
+
+static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ss;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+
+ return ((cs.selector & SELECTOR_RPL_MASK) ==
+ (ss.selector & SELECTOR_RPL_MASK));
+}
+
+/*
+ * Check whether the guest state is valid for VMX entry. Returns
+ * true if valid, false otherwise.
+ * We assume that the registers are always usable.
+ */
+static bool guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ /* real mode guest state checks */
+ if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ } else {
+ /* protected mode guest state checks */
+ if (!cs_ss_rpl_check(vcpu))
+ return false;
+ if (!code_segment_valid(vcpu))
+ return false;
+ if (!stack_segment_valid(vcpu))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ if (!tr_valid(vcpu))
+ return false;
+ if (!ldtr_valid(vcpu))
+ return false;
+ }
+ /*
+  * TODO:
+  * - Add checks on RIP
+  * - Add checks on RFLAGS
+  */
+
+ return true;
+}
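A simplified sketch of how this predicate is meant to be consumed (the full loop, with MMIO and signal handling, is handle_invalid_guest_state() further down):

while (!guest_state_valid(vcpu)) {
	/* Emulate until the state passes the VMX entry checks. */
	if (emulate_instruction(vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
		break;	/* MMIO or hard emulation failure */
}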
+
static int init_rmode_tss(struct kvm *kvm)
{
gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm)
if (r < 0)
goto out;
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
- r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+ r = kvm_write_guest_page(kvm, fn++, &data,
+ TSS_IOPB_BASE_OFFSET, sizeof(u16));
if (r < 0)
goto out;
r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
@@ -1789,7 +1994,7 @@ static void seg_setup(int seg)
vmcs_write16(sf->selector, 0);
vmcs_writel(sf->base, 0);
vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0x93);
+ vmcs_write32(sf->ar_bytes, 0xf3);
}
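The default access-rights byte changes from 0x93 (DPL 0) to 0xf3 (DPL 3) so that freshly initialized segments satisfy rmode_segment_valid() above. A quick decode of 0xf3 (illustrative):

#include <assert.h>

#define AR_PRESENT		0x80	/* P = 1 */
#define AR_DPL3			0x60	/* DPL = 3 */
#define AR_S_CODE_DATA		0x10	/* S = 1: code/data segment */
#define AR_TYPE_RW_ACCESSED	0x03	/* writable data, accessed */

int main(void)
{
	assert((AR_PRESENT | AR_DPL3 | AR_S_CODE_DATA |
		AR_TYPE_RW_ACCESSED) == 0xf3);
	return 0;
}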
static int alloc_apic_access_page(struct kvm *kvm)
@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
if (r)
goto out;
- down_read(&current->mm->mmap_sem);
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
- up_read(&current->mm->mmap_sem);
out:
up_write(&kvm->slots_lock);
return r;
@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
if (r)
goto out;
- down_read(&current->mm->mmap_sem);
kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
out:
up_write(&kvm->slots_lock);
return r;
@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
}
if (!vm_need_ept())
exec_control |= CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_CR3_LOAD_EXITING;
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_INVLPG_EXITING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
if (cpu_has_secondary_exec_ctrls()) {
@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
u64 msr;
int ret;
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
down_read(&vcpu->kvm->slots_lock);
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
fx_init(&vmx->vcpu);
+ seg_setup(VCPU_SREG_CS);
/*
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
* insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
}
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
seg_setup(VCPU_SREG_DS);
seg_setup(VCPU_SREG_ES);
@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RFLAGS, 0x02);
if (vmx->vcpu.vcpu_id == 0)
- vmcs_writel(GUEST_RIP, 0xfff0);
+ kvm_rip_write(vcpu, 0xfff0);
else
- vmcs_writel(GUEST_RIP, 0);
- vmcs_writel(GUEST_RSP, 0);
+ kvm_rip_write(vcpu, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
vmcs_writel(GUEST_DR7, 0x400);
@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
ret = 0;
+ /* HACK: Don't enable emulation on guest boot/reset */
+ vmx->emulation_required = 0;
+
out:
up_read(&vcpu->kvm->slots_lock);
return ret;
@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
+ ++vcpu->stat.irq_injections;
if (vcpu->arch.rmode.active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = irq;
- vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+ vmx->rmode.irq.rip = kvm_rip_read(vcpu);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
- vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
+ kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
return;
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
- vcpu->arch.nmi_pending = 0;
}
static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
if (!vcpu->arch.irq_pending[word_index])
clear_bit(word_index, &vcpu->arch.irq_summary);
- vmx_inject_irq(vcpu, irq);
+ kvm_queue_interrupt(vcpu, irq);
}
@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
if (vcpu->arch.interrupt_window_open &&
- vcpu->arch.irq_summary &&
- !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
- /*
- * If interrupts enabled, and not blocked by sti or mov ss. Good.
- */
+ vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
kvm_do_inject_irq(vcpu);
+ if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
+ vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
if (!vcpu->arch.interrupt_window_open &&
(vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
int vec, u32 err_code)
{
- if (!vcpu->arch.rmode.active)
- return 0;
-
/*
* An instruction with the address-size override prefix (opcode 0x67)
* causes a #SS fault with error code 0 in VM86 mode.
@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
return 1;
+ /*
+ * Forward all other exceptions that are valid in real mode.
+ * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+ * the required debugging infrastructure rework.
+ */
+ switch (vec) {
+ case DE_VECTOR:
+ case DB_VECTOR:
+ case BP_VECTOR:
+ case OF_VECTOR:
+ case BR_VECTOR:
+ case UD_VECTOR:
+ case DF_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ case MF_VECTOR:
+ kvm_queue_exception(vcpu, vec);
+ return 1;
+ }
return 0;
}
@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
error_code = 0;
- rip = vmcs_readl(GUEST_RIP);
+ rip = kvm_rip_read(vcpu);
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
cr2 = vmcs_readl(EXIT_QUALIFICATION);
KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
(u32)((u64)cr2 >> 32), handler);
- if (vect_info & VECTORING_INFO_VALID_MASK)
+ if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg],
- (u32)((u64)vcpu->arch.regs[reg] >> 32), handler);
+ KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
+ (u32)kvm_register_read(vcpu, reg),
+ (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
+ handler);
switch (cr) {
case 0:
- vcpu_load_rsp_rip(vcpu);
- kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
skip_emulated_instruction(vcpu);
return 1;
case 3:
- vcpu_load_rsp_rip(vcpu);
- kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
skip_emulated_instruction(vcpu);
return 1;
case 4:
- vcpu_load_rsp_rip(vcpu);
- kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
skip_emulated_instruction(vcpu);
return 1;
case 8:
- vcpu_load_rsp_rip(vcpu);
- kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
skip_emulated_instruction(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
return 1;
@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
};
break;
case 2: /* clts */
- vcpu_load_rsp_rip(vcpu);
vmx_fpu_deactivate(vcpu);
vcpu->arch.cr0 &= ~X86_CR0_TS;
vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
case 1: /*mov from cr*/
switch (cr) {
case 3:
- vcpu_load_rsp_rip(vcpu);
- vcpu->arch.regs[reg] = vcpu->arch.cr3;
- vcpu_put_rsp_rip(vcpu);
+ kvm_register_write(vcpu, reg, vcpu->arch.cr3);
KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
- (u32)vcpu->arch.regs[reg],
- (u32)((u64)vcpu->arch.regs[reg] >> 32),
+ (u32)kvm_register_read(vcpu, reg),
+ (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
handler);
skip_emulated_instruction(vcpu);
return 1;
case 8:
- vcpu_load_rsp_rip(vcpu);
- vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
- vcpu_put_rsp_rip(vcpu);
+ kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
- (u32)vcpu->arch.regs[reg], handler);
+ (u32)kvm_register_read(vcpu, reg), handler);
skip_emulated_instruction(vcpu);
return 1;
}
@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
dr = exit_qualification & 7;
reg = (exit_qualification >> 8) & 15;
- vcpu_load_rsp_rip(vcpu);
if (exit_qualification & 16) {
/* mov from dr */
switch (dr) {
@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
default:
val = 0;
}
- vcpu->arch.regs[reg] = val;
+ kvm_register_write(vcpu, reg, val);
KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
} else {
/* mov to dr */
}
- vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
return 1;
}
@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
+static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+
+ kvm_mmu_invlpg(vcpu, exit_qualification);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
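With CPU_BASED_INVLPG_EXITING set (see the exec-control change further down), the exit qualification carries the linear-address operand, which the handler forwards to the MMU before skipping the instruction. For reference, the guest-side instruction being intercepted is just:

/* Guest-side view (illustrative only). */
static inline void guest_invlpg(void *addr)
{
	asm volatile("invlpg (%0)" : : "r" (addr) : "memory");
}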
+
static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
skip_emulated_instruction(vcpu);
@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
+static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
+ struct kvm_run *kvm_run)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int err;
+
+ preempt_enable();
+ local_irq_enable();
+
+ while (!guest_state_valid(vcpu)) {
+ err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+ switch (err) {
+ case EMULATE_DONE:
+ break;
+ case EMULATE_DO_MMIO:
+ kvm_report_emulation_failure(vcpu, "mmio");
+ /* TODO: Handle MMIO */
+ return;
+ default:
+ kvm_report_emulation_failure(vcpu, "emulation failure");
+ return;
+ }
+
+ if (signal_pending(current))
+ break;
+ if (need_resched())
+ schedule();
+ }
+
+ local_irq_disable();
+ preempt_disable();
+
+ /* Guest state should be valid now; no further emulation should be needed. */
+ vmx->emulation_required = 0;
+}
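The emulate_invalid_guest_state flag tested throughout is a module parameter; its declaration is not in this hunk, but presumably follows the usual pattern near the top of vmx.c, roughly:

/* Sketch (assumed, not shown in this diff): off by default. */
static int emulate_invalid_guest_state;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);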
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info;
- KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
- (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
+ KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
+ (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
/* Accesses to CR3 don't cause a VM exit in paging mode, so we need
 * to sync with the guest's real CR3. */
@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
enable_irq_window(vcpu);
}
-static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 idtv_info_field, intr_info_field, exit_intr_info_field;
- int vector;
+ u32 exit_intr_info;
+ u32 idt_vectoring_info;
+ bool unblock_nmi;
+ u8 vector;
+ int type;
+ bool idtv_info_valid;
+ u32 error;
- update_tpr_threshold(vcpu);
-
- intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
- exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
- idtv_info_field = vmx->idt_vectoring_info;
- if (intr_info_field & INTR_INFO_VALID_MASK) {
- if (idtv_info_field & INTR_INFO_VALID_MASK) {
- /* TODO: fault when IDT_Vectoring */
- if (printk_ratelimit())
- printk(KERN_ERR "Fault when IDT_Vectoring\n");
- }
- enable_intr_window(vcpu);
- return;
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ if (cpu_has_virtual_nmis()) {
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 25.7.1.2
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ */
+ if (unblock_nmi && vector != DF_VECTOR)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
}
- if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
- if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
- == INTR_TYPE_EXT_INTR
- && vcpu->arch.rmode.active) {
- u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-
- vmx_inject_irq(vcpu, vect);
- enable_intr_window(vcpu);
- return;
- }
-
- KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
+ idt_vectoring_info = vmx->idt_vectoring_info;
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+ vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+ type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+ if (vmx->vcpu.arch.nmi_injected) {
/*
* SDM 3: 25.7.1.2
* Clear bit "block by NMI" before VM entry if a NMI delivery
* faulted.
*/
- if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
- == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis())
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- ~GUEST_INTR_STATE_NMI);
-
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
- & ~INTR_INFO_RESVD_BITS_MASK);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
-
- if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
- vmcs_read32(IDT_VECTORING_ERROR_CODE));
- enable_intr_window(vcpu);
- return;
+ if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmx->vcpu.arch.nmi_injected = false;
+ }
+ kvm_clear_exception_queue(&vmx->vcpu);
+ if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
+ error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ kvm_queue_exception_e(&vmx->vcpu, vector, error);
+ } else
+ kvm_queue_exception(&vmx->vcpu, vector);
+ vmx->idt_vectoring_info = 0;
}
+ kvm_clear_interrupt_queue(&vmx->vcpu);
+ if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
+ kvm_queue_interrupt(&vmx->vcpu, vector);
+ vmx->idt_vectoring_info = 0;
+ }
+}
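The decode above splits the 32-bit IDT-vectoring field into validity, vector, and event type. A stand-alone illustration of the field layout (mask values per the VMX definitions in vmx.h; the type is deliberately left unshifted so it compares against constants like INTR_TYPE_NMI_INTR = 2 << 8):

#include <stdbool.h>
#include <stdint.h>

#define VECTOR_MASK	0xffu		/* bits 7:0  */
#define TYPE_MASK	(7u << 8)	/* bits 10:8 */
#define VALID_MASK	(1u << 31)	/* bit 31    */

static void decode_idt_vectoring(uint32_t info, bool *valid,
				 uint8_t *vector, uint32_t *type)
{
	*valid  = (info & VALID_MASK) != 0;
	*vector = info & VECTOR_MASK;
	*type   = info & TYPE_MASK;	/* unshifted, as in vmx.c */
}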
+
+static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+{
+ update_tpr_threshold(vcpu);
+
if (cpu_has_virtual_nmis()) {
- /*
- * SDM 3: 25.7.1.2
- * Re-set bit "block by NMI" before VM entry if vmexit caused by
- * a guest IRET fault.
- */
- if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
- (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
- GUEST_INTR_STATE_NMI);
- else if (vcpu->arch.nmi_pending) {
- if (vmx_nmi_enabled(vcpu))
- vmx_inject_nmi(vcpu);
+ if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+ if (vmx_nmi_enabled(vcpu)) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ } else {
+ enable_intr_window(vcpu);
+ return;
+ }
+ }
+ if (vcpu->arch.nmi_injected) {
+ vmx_inject_nmi(vcpu);
enable_intr_window(vcpu);
return;
}
-
}
- if (!kvm_cpu_has_interrupt(vcpu))
- return;
- if (vmx_irq_enabled(vcpu)) {
- vector = kvm_cpu_get_interrupt(vcpu);
- vmx_inject_irq(vcpu, vector);
- kvm_timer_intr_post(vcpu, vector);
- } else
- enable_irq_window(vcpu);
+ if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
+ if (vmx_irq_enabled(vcpu))
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+ else
+ enable_irq_window(vcpu);
+ }
+ if (vcpu->arch.interrupt.pending) {
+ vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+ kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+ }
}
/*
@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
static void fixup_rmode_irq(struct vcpu_vmx *vmx)
{
vmx->rmode.irq.pending = 0;
- if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
+ if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
return;
- vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+ kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
| vmx->rmode.irq.vector;
}
+#ifdef CONFIG_X86_64
+#define R "r"
+#define Q "q"
+#else
+#define R "e"
+#define Q "l"
+#endif
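These macros exploit C string-literal concatenation, so a single asm template expands to either 64-bit or 32-bit register names. A stand-alone demonstration:

#include <stdio.h>

#ifdef __x86_64__
#define R "r"
#else
#define R "e"
#endif

int main(void)
{
	/* Prints "push %%rdx" on x86-64, "push %%edx" on i386. */
	puts("push %%" R "dx");
	return 0;
}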
+
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info;
+ /* Handle invalid guest state instead of attempting a VM entry */
+ if (vmx->emulation_required && emulate_invalid_guest_state) {
+ handle_invalid_guest_state(vcpu, kvm_run);
+ return;
+ }
+
+ if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
/*
* Loading guest fpu may have cleared host cr0.ts
*/
@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
asm(
/* Store host registers */
-#ifdef CONFIG_X86_64
- "push %%rdx; push %%rbp;"
- "push %%rcx \n\t"
-#else
- "push %%edx; push %%ebp;"
- "push %%ecx \n\t"
-#endif
+ "push %%"R"dx; push %%"R"bp;"
+ "push %%"R"cx \n\t"
+ "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+ "je 1f \n\t"
+ "mov %%"R"sp, %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+ "1: \n\t"
/* Check if vmlaunch or vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
+ "mov %c[cr2](%0), %%"R"ax \n\t"
+ "mov %%"R"ax, %%cr2 \n\t"
+ "mov %c[rax](%0), %%"R"ax \n\t"
+ "mov %c[rbx](%0), %%"R"bx \n\t"
+ "mov %c[rdx](%0), %%"R"dx \n\t"
+ "mov %c[rsi](%0), %%"R"si \n\t"
+ "mov %c[rdi](%0), %%"R"di \n\t"
+ "mov %c[rbp](%0), %%"R"bp \n\t"
#ifdef CONFIG_X86_64
- "mov %c[cr2](%0), %%rax \n\t"
- "mov %%rax, %%cr2 \n\t"
- "mov %c[rax](%0), %%rax \n\t"
- "mov %c[rbx](%0), %%rbx \n\t"
- "mov %c[rdx](%0), %%rdx \n\t"
- "mov %c[rsi](%0), %%rsi \n\t"
- "mov %c[rdi](%0), %%rdi \n\t"
- "mov %c[rbp](%0), %%rbp \n\t"
"mov %c[r8](%0), %%r8 \n\t"
"mov %c[r9](%0), %%r9 \n\t"
"mov %c[r10](%0), %%r10 \n\t"
@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %c[r13](%0), %%r13 \n\t"
"mov %c[r14](%0), %%r14 \n\t"
"mov %c[r15](%0), %%r15 \n\t"
- "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
-#else
- "mov %c[cr2](%0), %%eax \n\t"
- "mov %%eax, %%cr2 \n\t"
- "mov %c[rax](%0), %%eax \n\t"
- "mov %c[rbx](%0), %%ebx \n\t"
- "mov %c[rdx](%0), %%edx \n\t"
- "mov %c[rsi](%0), %%esi \n\t"
- "mov %c[rdi](%0), %%edi \n\t"
- "mov %c[rbp](%0), %%ebp \n\t"
- "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
#endif
+ "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+
/* Enter guest mode */
"jne .Llaunched \n\t"
__ex(ASM_VMX_VMLAUNCH) "\n\t"
@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
".Lkvm_vmx_return: "
/* Save guest registers, load host registers, keep flags */
+ "xchg %0, (%%"R"sp) \n\t"
+ "mov %%"R"ax, %c[rax](%0) \n\t"
+ "mov %%"R"bx, %c[rbx](%0) \n\t"
+ "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
+ "mov %%"R"dx, %c[rdx](%0) \n\t"
+ "mov %%"R"si, %c[rsi](%0) \n\t"
+ "mov %%"R"di, %c[rdi](%0) \n\t"
+ "mov %%"R"bp, %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
- "xchg %0, (%%rsp) \n\t"
- "mov %%rax, %c[rax](%0) \n\t"
- "mov %%rbx, %c[rbx](%0) \n\t"
- "pushq (%%rsp); popq %c[rcx](%0) \n\t"
- "mov %%rdx, %c[rdx](%0) \n\t"
- "mov %%rsi, %c[rsi](%0) \n\t"
- "mov %%rdi, %c[rdi](%0) \n\t"
- "mov %%rbp, %c[rbp](%0) \n\t"
"mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t"
"mov %%r10, %c[r10](%0) \n\t"
@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%r13, %c[r13](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
- "mov %%cr2, %%rax \n\t"
- "mov %%rax, %c[cr2](%0) \n\t"
-
- "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
-#else
- "xchg %0, (%%esp) \n\t"
- "mov %%eax, %c[rax](%0) \n\t"
- "mov %%ebx, %c[rbx](%0) \n\t"
- "pushl (%%esp); popl %c[rcx](%0) \n\t"
- "mov %%edx, %c[rdx](%0) \n\t"
- "mov %%esi, %c[rsi](%0) \n\t"
- "mov %%edi, %c[rdi](%0) \n\t"
- "mov %%ebp, %c[rbp](%0) \n\t"
- "mov %%cr2, %%eax \n\t"
- "mov %%eax, %c[cr2](%0) \n\t"
-
- "pop %%ebp; pop %%ebp; pop %%edx \n\t"
#endif
+ "mov %%cr2, %%"R"ax \n\t"
+ "mov %%"R"ax, %c[cr2](%0) \n\t"
+
+ "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
"setbe %c[fail](%0) \n\t"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
+ [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
#endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
: "cc", "memory"
+ , R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
- , "rbx", "rdi", "rsi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#else
- , "ebx", "edi", "rsi"
#endif
);
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
+ vcpu->arch.regs_dirty = 0;
+
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
KVMTRACE_0D(NMI, vcpu, handler);
asm("int $2");
}
+
+ vmx_complete_interrupts(vmx);
}
+#undef R
+#undef Q
+
static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
- .cache_regs = vcpu_load_rsp_rip,
- .decache_regs = vcpu_put_rsp_rip,
+ .cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 23e8373507a..3e010d21fdd 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,24 +331,6 @@ enum vmcs_field {
#define AR_RESERVD_MASK 0xfffe0f00
-#define MSR_IA32_VMX_BASIC 0x480
-#define MSR_IA32_VMX_PINBASED_CTLS 0x481
-#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
-#define MSR_IA32_VMX_EXIT_CTLS 0x483
-#define MSR_IA32_VMX_ENTRY_CTLS 0x484
-#define MSR_IA32_VMX_MISC 0x485
-#define MSR_IA32_VMX_CR0_FIXED0 0x486
-#define MSR_IA32_VMX_CR0_FIXED1 0x487
-#define MSR_IA32_VMX_CR4_FIXED0 0x488
-#define MSR_IA32_VMX_CR4_FIXED1 0x489
-#define MSR_IA32_VMX_VMCS_ENUM 0x48a
-#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
-#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
-
-#define MSR_IA32_FEATURE_CONTROL 0x3a
-#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
-#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
-
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0d682fc6aeb..4f0677d1eae 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4,10 +4,14 @@
* derived from drivers/kvm/kvm_main.c
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
@@ -19,14 +23,18 @@
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
#include <linux/clocksource.h>
+#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
+#include <linux/intel-iommu.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
struct kvm_x86_ops *kvm_x86_ops;
+EXPORT_SYMBOL_GPL(kvm_x86_ops);
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "fpu_reload", VCPU_STAT(fpu_reload) },
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+ { "irq_injections", VCPU_STAT(irq_injections) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+ { "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ NULL }
};
-
unsigned long segment_base(u16 selector)
{
struct descriptor_table gdt;
@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+ kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
return;
}
@@ -564,7 +575,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
- __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
+ __func__, tsc_khz, hv_clock->tsc_shift,
hv_clock->tsc_to_system_mul);
}
@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
__func__, data);
break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!data) {
+ /* We support the non-activated case already */
+ break;
+ } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
+ /*
+  * Values other than LBR and BTF are vendor-specific,
+  * thus reserved, and should raise a #GP.
+  */
+ return 1;
+ }
+ pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
+ break;
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
break;
@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* ...but clean it before doing the actual write */
vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
- down_read(&current->mm->mmap_sem);
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
if (is_error_page(vcpu->arch.time_page)) {
kvm_release_page_clean(vcpu->arch.time_page);
@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MC0_MISC+8:
case MSR_IA32_MC0_MISC+12:
case MSR_IA32_MC0_MISC+16:
+ case MSR_IA32_MC0_MISC+20:
case MSR_IA32_UCODE_REV:
case MSR_IA32_EBL_CR_POWERON:
+ case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
data = 0;
break;
case MSR_MTRRcap:
@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PV_MMU:
r = !tdp_enabled;
break;
+ case KVM_CAP_IOMMU:
+ r = intel_iommu_found();
+ break;
default:
r = 0;
break;
@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
+ struct kvm_lapic_state *lapic = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
- struct kvm_lapic_state lapic;
+ lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
- memset(&lapic, 0, sizeof lapic);
- r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+ r = -ENOMEM;
+ if (!lapic)
+ goto out;
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, &lapic, sizeof lapic))
+ if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
goto out;
r = 0;
break;
}
case KVM_SET_LAPIC: {
- struct kvm_lapic_state lapic;
-
+ lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!lapic)
+ goto out;
r = -EFAULT;
- if (copy_from_user(&lapic, argp, sizeof lapic))
+ if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
if (r)
goto out;
r = 0;
@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
}
out:
+ kfree(lapic);	/* kfree(NULL) is a no-op */
return r;
}
@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -EINVAL;
+ /*
+ * This union makes it completely explicit to gcc-3.x
+ * that these two variables' stack usage should be
+ * combined, not added together.
+ */
+ union {
+ struct kvm_pit_state ps;
+ struct kvm_memory_alias alias;
+ } u;
switch (ioctl) {
case KVM_SET_TSS_ADDR:
@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
- case KVM_SET_MEMORY_ALIAS: {
- struct kvm_memory_alias alias;
-
+ case KVM_SET_MEMORY_ALIAS:
r = -EFAULT;
- if (copy_from_user(&alias, argp, sizeof alias))
+ if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
goto out;
- r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+ r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
if (r)
goto out;
break;
- }
case KVM_CREATE_IRQCHIP:
r = -ENOMEM;
kvm->arch.vpic = kvm_create_pic(kvm);
@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
if (irqchip_in_kernel(kvm)) {
mutex_lock(&kvm->lock);
- if (irq_event.irq < 16)
- kvm_pic_set_irq(pic_irqchip(kvm),
- irq_event.irq,
- irq_event.level);
- kvm_ioapic_set_irq(kvm->arch.vioapic,
- irq_event.irq,
- irq_event.level);
+ kvm_set_irq(kvm, irq_event.irq, irq_event.level);
mutex_unlock(&kvm->lock);
r = 0;
}
@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
+ struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
- r = -EFAULT;
- if (copy_from_user(&chip, argp, sizeof chip))
+ r = -ENOMEM;
+ if (!chip)
goto out;
+ r = -EFAULT;
+ if (copy_from_user(chip, argp, sizeof *chip))
+ goto get_irqchip_out;
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
- goto out;
- r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+ goto get_irqchip_out;
+ r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
- goto out;
+ goto get_irqchip_out;
r = -EFAULT;
- if (copy_to_user(argp, &chip, sizeof chip))
- goto out;
+ if (copy_to_user(argp, chip, sizeof *chip))
+ goto get_irqchip_out;
r = 0;
+ get_irqchip_out:
+ kfree(chip);
+ if (r)
+ goto out;
break;
}
case KVM_SET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
+ struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
- r = -EFAULT;
- if (copy_from_user(&chip, argp, sizeof chip))
+ r = -ENOMEM;
+ if (!chip)
goto out;
+ r = -EFAULT;
+ if (copy_from_user(chip, argp, sizeof *chip))
+ goto set_irqchip_out;
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
- goto out;
- r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+ goto set_irqchip_out;
+ r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
- goto out;
+ goto set_irqchip_out;
r = 0;
+ set_irqchip_out:
+ kfree(chip);
+ if (r)
+ goto out;
break;
}
case KVM_GET_PIT: {
- struct kvm_pit_state ps;
r = -EFAULT;
- if (copy_from_user(&ps, argp, sizeof ps))
+ if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
goto out;
r = -ENXIO;
if (!kvm->arch.vpit)
goto out;
- r = kvm_vm_ioctl_get_pit(kvm, &ps);
+ r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, &ps, sizeof ps))
+ if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
goto out;
r = 0;
break;
}
case KVM_SET_PIT: {
- struct kvm_pit_state ps;
r = -EFAULT;
- if (copy_from_user(&ps, argp, sizeof ps))
+ if (copy_from_user(&u.ps, argp, sizeof u.ps))
goto out;
r = -ENXIO;
if (!kvm->arch.vpit)
goto out;
- r = kvm_vm_ioctl_set_pit(kvm, &ps);
+ r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
if (r)
goto out;
r = 0;
@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
val = *(u64 *)new;
- down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
kaddr = kmap_atomic(page, KM_USER0);
set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
+ kvm_mmu_invlpg(vcpu, address);
return X86EMUL_CONTINUE;
}
@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
u8 opcodes[4];
- unsigned long rip = vcpu->arch.rip;
+ unsigned long rip = kvm_rip_read(vcpu);
unsigned long rip_linear;
if (!printk_ratelimit())
@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = {
.cmpxchg_emulated = emulator_cmpxchg_emulated,
};
+static void cache_all_regs(struct kvm_vcpu *vcpu)
+{
+ kvm_register_read(vcpu, VCPU_REGS_RAX);
+ kvm_register_read(vcpu, VCPU_REGS_RSP);
+ kvm_register_read(vcpu, VCPU_REGS_RIP);
+ vcpu->arch.regs_dirty = ~0;
+}
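cache_all_regs() works because the accessors in kvm_cache_regs.h fault registers in lazily and track dirtiness: reading RAX/RSP/RIP pulls everything the emulator touches out of the VMCS, and setting regs_dirty to ~0 forces a full write-back before the next entry. Roughly (a sketch of the accessor, not verbatim):

static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
					      enum kvm_reg reg)
{
	/* Fetch from the VMCS only on first use after an exit. */
	if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
		kvm_x86_ops->cache_reg(vcpu, reg);
	return vcpu->arch.regs[reg];
}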
+
int emulate_instruction(struct kvm_vcpu *vcpu,
struct kvm_run *run,
unsigned long cr2,
@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
int r;
struct decode_cache *c;
+ kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
- kvm_x86_ops->cache_regs(vcpu);
+ /*
+ * TODO: fix x86_emulate.c to use guest_read/write_register
+ * instead of direct ->regs accesses; this can save hundreds of
+ * cycles on Intel for instructions that don't read/change RSP,
+ * for example.
+ */
+ cache_all_regs(vcpu);
vcpu->mmio_is_write = 0;
vcpu->arch.pio.string = 0;
@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DO_MMIO;
}
- kvm_x86_ops->decache_regs(vcpu);
kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
if (vcpu->mmio_is_write) {
@@ -2225,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu)
struct kvm_pio_request *io = &vcpu->arch.pio;
long delta;
int r;
-
- kvm_x86_ops->cache_regs(vcpu);
+ unsigned long val;
if (!io->string) {
- if (io->in)
- memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
- io->size);
+ if (io->in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(&val, vcpu->arch.pio_data, io->size);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ }
} else {
if (io->in) {
r = pio_copy_data(vcpu);
- if (r) {
- kvm_x86_ops->cache_regs(vcpu);
+ if (r)
return r;
- }
}
delta = 1;
@@ -2248,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu)
* The size of the register should really depend on
* current address size.
*/
- vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
+ val = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ val -= delta;
+ kvm_register_write(vcpu, VCPU_REGS_RCX, val);
}
if (io->down)
delta = -delta;
delta *= io->size;
- if (io->in)
- vcpu->arch.regs[VCPU_REGS_RDI] += delta;
- else
- vcpu->arch.regs[VCPU_REGS_RSI] += delta;
+ if (io->in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ val += delta;
+ kvm_register_write(vcpu, VCPU_REGS_RDI, val);
+ } else {
+ val = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ val += delta;
+ kvm_register_write(vcpu, VCPU_REGS_RSI, val);
+ }
}
- kvm_x86_ops->decache_regs(vcpu);
-
io->count -= io->cur_count;
io->cur_count = 0;
@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned port)
{
struct kvm_io_device *pio_dev;
+ unsigned long val;
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
handler);
- kvm_x86_ops->cache_regs(vcpu);
- memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(vcpu->arch.pio_data, &val, 4);
kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
KVMTRACE_0D(HLT, vcpu, handler);
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
- up_read(&vcpu->kvm->slots_lock);
- kvm_vcpu_block(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
- return -EINTR;
return 1;
} else {
vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
unsigned long nr, a0, a1, a2, a3, ret;
int r = 1;
- kvm_x86_ops->cache_regs(vcpu);
-
- nr = vcpu->arch.regs[VCPU_REGS_RAX];
- a0 = vcpu->arch.regs[VCPU_REGS_RBX];
- a1 = vcpu->arch.regs[VCPU_REGS_RCX];
- a2 = vcpu->arch.regs[VCPU_REGS_RDX];
- a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+ nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
ret = -KVM_ENOSYS;
break;
}
- vcpu->arch.regs[VCPU_REGS_RAX] = ret;
- kvm_x86_ops->decache_regs(vcpu);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
++vcpu->stat.hypercalls;
return r;
}
@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
{
char instruction[3];
int ret = 0;
+ unsigned long rip = kvm_rip_read(vcpu);
/*
@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
*/
kvm_mmu_zap_all(vcpu->kvm);
- kvm_x86_ops->cache_regs(vcpu);
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
+ if (emulator_write_emulated(rip, instruction, 3, vcpu)
!= X86EMUL_CONTINUE)
ret = -EFAULT;
@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
u32 function, index;
struct kvm_cpuid_entry2 *e, *best;
- kvm_x86_ops->cache_regs(vcpu);
- function = vcpu->arch.regs[VCPU_REGS_RAX];
- index = vcpu->arch.regs[VCPU_REGS_RCX];
- vcpu->arch.regs[VCPU_REGS_RAX] = 0;
- vcpu->arch.regs[VCPU_REGS_RBX] = 0;
- vcpu->arch.regs[VCPU_REGS_RCX] = 0;
- vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+ function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
best = NULL;
for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
e = &vcpu->arch.cpuid_entries[i];
@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
best = e;
}
if (best) {
- vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
- vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
- vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
- vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
}
- kvm_x86_ops->decache_regs(vcpu);
kvm_x86_ops->skip_emulated_instruction(vcpu);
KVMTRACE_5D(CPUID, vcpu, function,
- (u32)vcpu->arch.regs[VCPU_REGS_RAX],
- (u32)vcpu->arch.regs[VCPU_REGS_RBX],
- (u32)vcpu->arch.regs[VCPU_REGS_RCX],
- (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
+ (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
+ (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
+ (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
+ (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
if (!apic || !apic->vapic_addr)
return;
- down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
vcpu->arch.apic->vapic_page = page;
}
@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
up_read(&vcpu->kvm->slots_lock);
}
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
- if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
- pr_debug("vcpu %d received sipi with vector # %x\n",
- vcpu->vcpu_id, vcpu->arch.sipi_vector);
- kvm_lapic_reset(vcpu);
- r = kvm_x86_ops->vcpu_reset(vcpu);
- if (r)
- return r;
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- }
-
- down_read(&vcpu->kvm->slots_lock);
- vapic_enter(vcpu);
-
-preempted:
- if (vcpu->guest_debug.enabled)
- kvm_x86_ops->guest_debug_pre(vcpu);
-
-again:
if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
kvm_mmu_unload(vcpu);
@@ -2829,6 +2866,8 @@ again:
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
+ if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+ kvm_mmu_sync_roots(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
@@ -2854,21 +2893,15 @@ again:
local_irq_disable();
- if (vcpu->requests || need_resched()) {
+ if (vcpu->requests || need_resched() || signal_pending(current)) {
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
- if (signal_pending(current)) {
- local_irq_enable();
- preempt_enable();
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- goto out;
- }
+ if (vcpu->guest_debug.enabled)
+ kvm_x86_ops->guest_debug_pre(vcpu);
vcpu->guest_mode = 1;
/*
@@ -2917,8 +2950,8 @@ again:
* Profile KVM exit RIPs:
*/
if (unlikely(prof_on == KVM_PROFILING)) {
- kvm_x86_ops->cache_regs(vcpu);
- profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
+ unsigned long rip = kvm_rip_read(vcpu);
+ profile_hit(KVM_PROFILING, (void *)rip);
}
if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
@@ -2927,26 +2960,63 @@ again:
kvm_lapic_sync_from_vapic(vcpu);
r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+out:
+ return r;
+}
- if (r > 0) {
- if (dm_request_for_irq_injection(vcpu, kvm_run)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.request_irq_exits;
- goto out;
- }
- if (!need_resched())
- goto again;
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
+ pr_debug("vcpu %d received sipi with vector # %x\n",
+ vcpu->vcpu_id, vcpu->arch.sipi_vector);
+ kvm_lapic_reset(vcpu);
+ r = kvm_x86_ops->vcpu_reset(vcpu);
+ if (r)
+ return r;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
-out:
- up_read(&vcpu->kvm->slots_lock);
- if (r > 0) {
- kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- goto preempted;
+ down_read(&vcpu->kvm->slots_lock);
+ vapic_enter(vcpu);
+
+ r = 1;
+ while (r > 0) {
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+ r = vcpu_enter_guest(vcpu, kvm_run);
+ else {
+ up_read(&vcpu->kvm->slots_lock);
+ kvm_vcpu_block(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
+ if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
+ if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
+ r = -EINTR;
+ }
+
+ if (r > 0) {
+ if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.request_irq_exits;
+ }
+ if (signal_pending(current)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ if (need_resched()) {
+ up_read(&vcpu->kvm->slots_lock);
+ kvm_resched(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
+ }
+ }
}
+ up_read(&vcpu->kvm->slots_lock);
post_kvm_run_save(vcpu, kvm_run);
vapic_exit(vcpu);
@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
kvm_vcpu_block(vcpu);
+ clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
r = -EAGAIN;
goto out;
}
@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
#endif
- if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
- kvm_x86_ops->cache_regs(vcpu);
- vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
- kvm_x86_ops->decache_regs(vcpu);
- }
+ if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
+ kvm_register_write(vcpu, VCPU_REGS_RAX,
+ kvm_run->hypercall.ret);
r = __vcpu_run(vcpu, kvm_run);
@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
vcpu_load(vcpu);
- kvm_x86_ops->cache_regs(vcpu);
-
- regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
- regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
- regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
- regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
- regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
- regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
- regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
+ regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
#ifdef CONFIG_X86_64
- regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
- regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
- regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
- regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
- regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
- regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
- regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
- regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
+ regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
+ regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
+ regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
+ regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
+ regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
+ regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
+ regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
+ regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
#endif
- regs->rip = vcpu->arch.rip;
+ regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_x86_ops->get_rflags(vcpu);
/*
@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
vcpu_load(vcpu);
- vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
- vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
- vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
- vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
- vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
- vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
- vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
- vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
#ifdef CONFIG_X86_64
- vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
- vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
- vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
- vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
- vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
- vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
- vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
- vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
+ kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
+ kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
+ kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
+ kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
+ kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
+ kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
+ kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
+ kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
#endif
- vcpu->arch.rip = regs->rip;
+ kvm_rip_write(vcpu, regs->rip);
kvm_x86_ops->set_rflags(vcpu, regs->rflags);
- kvm_x86_ops->decache_regs(vcpu);
vcpu->arch.exception.pending = false;
@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
return 0;
}
+static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+ struct kvm_segment segvar = {
+ .base = selector << 4,
+ .limit = 0xffff,
+ .selector = selector,
+ .type = 3,
+ .present = 1,
+ .dpl = 3,
+ .db = 0,
+ .s = 1,
+ .l = 0,
+ .g = 0,
+ .avl = 0,
+ .unusable = 0,
+ };
+ kvm_x86_ops->set_segment(vcpu, &segvar, seg);
+ return 0;
+}
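A worked example of the real-mode rule encoded above: loading selector 0xb800 yields base 0xb8000 (the classic CGA text buffer), with a 64 KiB limit:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint16_t selector = 0xb800;
	uint32_t base = (uint32_t)selector << 4;	/* base = sel * 16 */
	assert(base == 0xb8000);
	return 0;
}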
+
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
int type_bits, int seg)
{
struct kvm_segment kvm_seg;
+ if (!(vcpu->arch.cr0 & X86_CR0_PE))
+ return kvm_load_realmode_segment(vcpu, selector, seg);
if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
return 1;
kvm_seg.type |= type_bits;
@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
struct tss_segment_32 *tss)
{
tss->cr3 = vcpu->arch.cr3;
- tss->eip = vcpu->arch.rip;
+ tss->eip = kvm_rip_read(vcpu);
tss->eflags = kvm_x86_ops->get_rflags(vcpu);
- tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
- tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
- tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
- tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
- tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
- tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
- tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
-
+ tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
{
kvm_set_cr3(vcpu, tss->cr3);
- vcpu->arch.rip = tss->eip;
+ kvm_rip_write(vcpu, tss->eip);
kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
- vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
- vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
- vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
- vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
- vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
- vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
- vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
- vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
return 1;
@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
static void save_state_to_tss16(struct kvm_vcpu *vcpu,
struct tss_segment_16 *tss)
{
- tss->ip = vcpu->arch.rip;
+ tss->ip = kvm_rip_read(vcpu);
tss->flag = kvm_x86_ops->get_rflags(vcpu);
- tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
- tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
- tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
- tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
- tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
- tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
- tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
- tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
+ tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
static int load_state_from_tss16(struct kvm_vcpu *vcpu,
struct tss_segment_16 *tss)
{
- vcpu->arch.rip = tss->ip;
+ kvm_rip_write(vcpu, tss->ip);
kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
- vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
- vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
- vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
- vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
- vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
- vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
- vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
- vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
return 1;
@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
}
kvm_x86_ops->skip_emulated_instruction(vcpu);
- kvm_x86_ops->cache_regs(vcpu);
if (nseg_desc.type & 8)
ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
tr_seg.type = 11;
kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
out:
- kvm_x86_ops->decache_regs(vcpu);
return ret;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
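/*
 * Why the explicit ->cache_regs()/->decache_regs() calls above can go
 * away: the kvm_register_read()/kvm_register_write() accessors from
 * kvm_cache_regs.h fetch a register from hardware state on first use
 * and mark written registers dirty, so callers no longer batch-sync
 * the whole register file. A minimal sketch, assuming the regs_avail/
 * regs_dirty bitmaps and the ->cache_reg() hook of that header:
 */
#include <linux/kvm_host.h>

static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
					      enum kvm_reg reg)
{
	if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
		kvm_x86_ops->cache_reg(vcpu, reg);	/* lazy fetch */
	return vcpu->arch.regs[reg];
}

static inline void kvm_register_write(struct kvm_vcpu *vcpu,
				      enum kvm_reg reg, unsigned long val)
{
	vcpu->arch.regs[reg] = val;
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}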
@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
pr_debug("Set back pending irq %d\n",
pending_vec);
}
+ kvm_pic_clear_isr_ack(vcpu->kvm);
}
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+ /* Older userspace won't unhalt the vcpu on reset. */
+ if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+ sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+ !(vcpu->arch.cr0 & X86_CR0_PE))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
vcpu_put(vcpu);
return 0;
@@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
return kvm;
}
@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ kvm_iommu_unmap_guest(kvm);
+ kvm_free_all_assigned_devices(kvm);
kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANONYMOUS,
+ MAP_PRIVATE | MAP_ANONYMOUS,
0);
up_write(&current->mm->mmap_sem);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 00000000000..6a4be78a738
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,22 @@
+#ifndef ARCH_X86_KVM_X86_H
+#define ARCH_X86_KVM_X86_H
+
+#include <linux/kvm_host.h>
+
+static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.exception.pending = false;
+}
+
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
+{
+ vcpu->arch.interrupt.pending = true;
+ vcpu->arch.interrupt.nr = vector;
+}
+
+static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.interrupt.pending = false;
+}
+
+#endif
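/*
 * Hedged usage sketch for the three helpers in the new header: an
 * injection path queues a vector, and the exit path clears it once
 * delivered. Only kvm_queue_interrupt()/kvm_clear_interrupt_queue()
 * come from this header; the surrounding function is hypothetical.
 */
static void example_deliver_irq(struct kvm_vcpu *vcpu, u8 vector)
{
	kvm_queue_interrupt(vcpu, vector);
	/* ... vmentry injects vcpu->arch.interrupt.nr into the guest ... */
	kvm_clear_interrupt_queue(vcpu);
}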
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f2f90468f8b..ea051173b0d 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -26,6 +26,7 @@
#define DPRINTF(_f, _a ...) printf(_f , ## _a)
#else
#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
#define DPRINTF(x...) do {} while (0)
#endif
#include <linux/module.h>
@@ -46,25 +47,26 @@
#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
#define DstReg (2<<1) /* Register operand. */
#define DstMem (3<<1) /* Memory operand. */
-#define DstMask (3<<1)
+#define DstAcc (4<<1) /* Destination Accumulator */
+#define DstMask (7<<1)
/* Source operand type. */
-#define SrcNone (0<<3) /* No source operand. */
-#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
-#define SrcReg (1<<3) /* Register operand. */
-#define SrcMem (2<<3) /* Memory operand. */
-#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
-#define SrcImm (5<<3) /* Immediate operand. */
-#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
-#define SrcMask (7<<3)
+#define SrcNone (0<<4) /* No source operand. */
+#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
+#define SrcReg (1<<4) /* Register operand. */
+#define SrcMem (2<<4) /* Memory operand. */
+#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
+#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
+#define SrcImm (5<<4) /* Immediate operand. */
+#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
+#define SrcMask (7<<4)
/* Generic ModRM decode. */
-#define ModRM (1<<6)
+#define ModRM (1<<7)
/* Destination is only written; never read. */
-#define Mov (1<<7)
-#define BitOp (1<<8)
-#define MemAbs (1<<9) /* Memory operand is absolute displacement */
-#define String (1<<10) /* String instruction (rep capable) */
-#define Stack (1<<11) /* Stack instruction (push/pop) */
+#define Mov (1<<8)
+#define BitOp (1<<9)
+#define MemAbs (1<<10) /* Memory operand is absolute displacement */
+#define String (1<<12) /* String instruction (rep capable) */
+#define Stack (1<<13) /* Stack instruction (push/pop) */
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
#define GroupMask 0xff /* Group number stored in bits 0:7 */
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = {
/* 0x20 - 0x27 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- SrcImmByte, SrcImm, 0, 0,
+ DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x28 - 0x2F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = {
/* 0x38 - 0x3F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ 0, 0,
/* 0x40 - 0x47 */
DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
/* 0x48 - 0x4F */
@@ -153,9 +156,16 @@ static u16 opcode_table[256] = {
0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
ByteOp | ImplicitOps | String, ImplicitOps | String,
- /* 0xB0 - 0xBF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xB0 - 0xB7 */
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ /* 0xB8 - 0xBF */
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
/* 0xC0 - 0xC7 */
ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
0, ImplicitOps | Stack, 0, 0,
@@ -169,17 +179,20 @@ static u16 opcode_table[256] = {
/* 0xD8 - 0xDF */
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xE0 - 0xE7 */
- 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xE8 - 0xEF */
ImplicitOps | Stack, SrcImm | ImplicitOps,
ImplicitOps, SrcImmByte | ImplicitOps,
- 0, 0, 0, 0,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
/* 0xF8 - 0xFF */
ImplicitOps, 0, ImplicitOps, ImplicitOps,
- 0, 0, Group | Group4, Group | Group5,
+ ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
};
static u16 twobyte_table[256] = {
@@ -268,15 +281,16 @@ static u16 group_table[] = {
ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
0, 0, 0, 0,
[Group3*8] =
- DstMem | SrcImm | ModRM | SrcImm, 0,
- DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ DstMem | SrcImm | ModRM, 0,
+ DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
0, 0, 0, 0,
[Group4*8] =
ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
0, 0, 0, 0, 0, 0,
[Group5*8] =
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
- SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
+ DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ SrcMem | ModRM | Stack, 0,
+ SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
[Group7*8] =
0, 0, ModRM | SrcMem, ModRM | SrcMem,
SrcNone | ModRM | DstMem | Mov, 0,
@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* Shadow copy of register state. Committed on successful emulation. */
memset(c, 0, sizeof(struct decode_cache));
- c->eip = ctxt->vcpu->arch.rip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
@@ -1048,6 +1062,23 @@ done_prefixes:
}
c->dst.type = OP_MEM;
break;
+ case DstAcc:
+ c->dst.type = OP_REG;
+ c->dst.bytes = c->op_bytes;
+ c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+ switch (c->op_bytes) {
+ case 1:
+ c->dst.val = *(u8 *)c->dst.ptr;
+ break;
+ case 2:
+ c->dst.val = *(u16 *)c->dst.ptr;
+ break;
+ case 4:
+ c->dst.val = *(u32 *)c->dst.ptr;
+ break;
+ }
+ c->dst.orig_val = c->dst.val;
+ break;
}
if (c->rip_relative)
@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
case 1: /* dec */
emulate_1op("dec", c->dst, ctxt->eflags);
break;
+ case 2: /* call near abs */ {
+ long int old_eip;
+ old_eip = c->eip;
+ c->eip = c->src.val;
+ c->src.val = old_eip;
+ emulate_push(ctxt);
+ break;
+ }
case 4: /* jmp abs */
c->eip = c->src.val;
break;
@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
u64 msr_data;
unsigned long saved_eip = 0;
struct decode_cache *c = &ctxt->decode;
+ unsigned int port;
+ int io_dir_in;
int rc = 0;
/* Shadow copy of register state. Committed on successful emulation.
@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
if (c->rep_prefix && (c->d & String)) {
/* All REP prefixes have the same first termination condition */
if (c->regs[VCPU_REGS_RCX] == 0) {
- ctxt->vcpu->arch.rip = c->eip;
+ kvm_rip_write(ctxt->vcpu, c->eip);
goto done;
}
/* The second termination condition only applies for REPE
@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
(c->b == 0xae) || (c->b == 0xaf)) {
if ((c->rep_prefix == REPE_PREFIX) &&
((ctxt->eflags & EFLG_ZF) == 0)) {
- ctxt->vcpu->arch.rip = c->eip;
+ kvm_rip_write(ctxt->vcpu, c->eip);
goto done;
}
if ((c->rep_prefix == REPNE_PREFIX) &&
((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
- ctxt->vcpu->arch.rip = c->eip;
+ kvm_rip_write(ctxt->vcpu, c->eip);
goto done;
}
}
c->regs[VCPU_REGS_RCX]--;
- c->eip = ctxt->vcpu->arch.rip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
}
if (c->src.type == OP_MEM) {
@@ -1351,27 +1392,10 @@ special_insn:
sbb: /* sbb */
emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
break;
- case 0x20 ... 0x23:
+ case 0x20 ... 0x25:
and: /* and */
emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
break;
- case 0x24: /* and al imm8 */
- c->dst.type = OP_REG;
- c->dst.ptr = &c->regs[VCPU_REGS_RAX];
- c->dst.val = *(u8 *)c->dst.ptr;
- c->dst.bytes = 1;
- c->dst.orig_val = c->dst.val;
- goto and;
- case 0x25: /* and ax imm16, or eax imm32 */
- c->dst.type = OP_REG;
- c->dst.bytes = c->op_bytes;
- c->dst.ptr = &c->regs[VCPU_REGS_RAX];
- if (c->op_bytes == 2)
- c->dst.val = *(u16 *)c->dst.ptr;
- else
- c->dst.val = *(u32 *)c->dst.ptr;
- c->dst.orig_val = c->dst.val;
- goto and;
case 0x28 ... 0x2d:
sub: /* sub */
emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
@@ -1659,7 +1683,7 @@ special_insn:
case 0xae ... 0xaf: /* scas */
DPRINTF("Urk! I don't handle SCAS.\n");
goto cannot_emulate;
- case 0xb8: /* mov r, imm */
+ case 0xb0 ... 0xbf: /* mov r, imm */
goto mov;
case 0xc0 ... 0xc1:
emulate_grp2(ctxt);
@@ -1679,6 +1703,16 @@ special_insn:
c->src.val = c->regs[VCPU_REGS_RCX];
emulate_grp2(ctxt);
break;
+ case 0xe4: /* inb */
+ case 0xe5: /* in */
+ port = insn_fetch(u8, 1, c->eip);
+ io_dir_in = 1;
+ goto do_io;
+ case 0xe6: /* outb */
+ case 0xe7: /* out */
+ port = insn_fetch(u8, 1, c->eip);
+ io_dir_in = 0;
+ goto do_io;
case 0xe8: /* call (near) */ {
long int rel;
switch (c->op_bytes) {
@@ -1729,6 +1763,22 @@ special_insn:
jmp_rel(c, c->src.val);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
+ case 0xec: /* in al,dx */
+ case 0xed: /* in (e/r)ax,dx */
+ port = c->regs[VCPU_REGS_RDX];
+ io_dir_in = 1;
+ goto do_io;
+ case 0xee: /* out al,dx */
+ case 0xef: /* out (e/r)ax,dx */
+ port = c->regs[VCPU_REGS_RDX];
+ io_dir_in = 0;
+ do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ port) != 0) {
+ c->eip = saved_eip;
+ goto cannot_emulate;
+ }
+ return 0;
case 0xf4: /* hlt */
ctxt->vcpu->arch.halt_request = 1;
break;
@@ -1754,6 +1804,14 @@ special_insn:
ctxt->eflags |= X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
+ case 0xfc: /* cld */
+ ctxt->eflags &= ~EFLG_DF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xfd: /* std */
+ ctxt->eflags |= EFLG_DF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
case 0xfe ... 0xff: /* Grp4/Grp5 */
rc = emulate_grp45(ctxt, ops);
if (rc != 0)
@@ -1768,7 +1826,7 @@ writeback:
/* Commit shadow register state. */
memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- ctxt->vcpu->arch.rip = c->eip;
+ kvm_rip_write(ctxt->vcpu, c->eip);
done:
if (rc == X86EMUL_UNHANDLEABLE) {
@@ -1793,7 +1851,7 @@ twobyte_insn:
goto done;
/* Let the processor re-execute the fixed hypercall */
- c->eip = ctxt->vcpu->arch.rip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
@@ -1889,7 +1947,7 @@ twobyte_insn:
rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
if (rc) {
kvm_inject_gp(ctxt->vcpu, 0);
- c->eip = ctxt->vcpu->arch.rip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
}
rc = X86EMUL_CONTINUE;
c->dst.type = OP_NONE;
@@ -1899,7 +1957,7 @@ twobyte_insn:
rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
if (rc) {
kvm_inject_gp(ctxt->vcpu, 0);
- c->eip = ctxt->vcpu->arch.rip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
c->regs[VCPU_REGS_RDX] = msr_data >> 32;
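/*
 * Illustration (not part of the patch) of the widened decode bits:
 * with DstMask now three bits and the Src field shifted up, a table
 * entry such as opcode 0x24 ("and $imm8, %al"), d = DstAcc |
 * SrcImmByte, still yields its operand classes by plain masking:
 */
static void example_decode(u16 d)
{
	switch (d & DstMask) {
	case DstAcc:		/* destination is AL/AX/EAX */
		break;
	}
	switch (d & SrcMask) {
	case SrcImmByte:	/* 8-bit sign-extended immediate */
		break;
	}
}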
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d9249a882aa..48ee4f9435f 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -55,6 +55,7 @@
#include <linux/lguest_launcher.h>
#include <linux/virtio_console.h>
#include <linux/pm.h>
+#include <asm/apic.h>
#include <asm/lguest.h>
#include <asm/paravirt.h>
#include <asm/param.h>
@@ -581,7 +582,7 @@ static void __init lguest_init_IRQ(void)
for (i = 0; i < LGUEST_IRQS; i++) {
int vector = FIRST_EXTERNAL_VECTOR + i;
if (vector != SYSCALL_VECTOR) {
- set_intr_gate(vector, interrupt[i]);
+ set_intr_gate(vector, interrupt[vector]);
set_irq_chip_and_handler_name(i, &lguest_irq_controller,
handle_level_irq,
"level");
@@ -783,14 +784,44 @@ static void lguest_wbinvd(void)
* code qualifies for Advanced. It will also never interrupt anything. It
* does, however, allow us to get through the Linux boot code. */
#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(unsigned long reg, u32 v)
+static void lguest_apic_write(u32 reg, u32 v)
{
}
-static u32 lguest_apic_read(unsigned long reg)
+static u32 lguest_apic_read(u32 reg)
{
return 0;
}
+
+static u64 lguest_apic_icr_read(void)
+{
+ return 0;
+}
+
+static void lguest_apic_icr_write(u32 low, u32 id)
+{
+ /* Warn to see if there are any stray references */
+ WARN_ON(1);
+}
+
+static void lguest_apic_wait_icr_idle(void)
+{
+ return;
+}
+
+static u32 lguest_apic_safe_wait_icr_idle(void)
+{
+ return 0;
+}
+
+static struct apic_ops lguest_basic_apic_ops = {
+ .read = lguest_apic_read,
+ .write = lguest_apic_write,
+ .icr_read = lguest_apic_icr_read,
+ .icr_write = lguest_apic_icr_write,
+ .wait_icr_idle = lguest_apic_wait_icr_idle,
+ .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle,
+};
#endif
/* STOP! Until an interrupt comes in. */
@@ -990,8 +1021,7 @@ __init void lguest_init(void)
#ifdef CONFIG_X86_LOCAL_APIC
/* apic read/write intercepts */
- pv_apic_ops.apic_write = lguest_apic_write;
- pv_apic_ops.apic_read = lguest_apic_read;
+ apic_ops = &lguest_basic_apic_ops;
#endif
/* time operations */
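/*
 * A sketch of the indirection this hunk switches lguest to: generic
 * code calls apic_read()/apic_write(), which now dispatch through a
 * global apic_ops pointer rather than pv_apic_ops entries. The
 * wrappers below are assumptions about asm/apic.h of this era, not
 * part of the patch.
 */
extern struct apic_ops *apic_ops;

static inline u32 apic_read(u32 reg)
{
	return apic_ops->read(reg);
}

static inline void apic_write(u32 reg, u32 val)
{
	apic_ops->write(reg, val);
}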
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index aa3fa411942..55e11aa6d66 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -17,9 +17,6 @@ ifeq ($(CONFIG_X86_32),y)
lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
obj-y += io_64.o iomap_copy_64.o
-
- CFLAGS_csum-partial_64.o := -funroll-loops
-
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += thunk_64.o clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
index 01b868ba82f..321cf720dbb 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -16,37 +16,46 @@ static void __rdmsr_on_cpu(void *info)
rdmsr(rv->msr_no, rv->l, rv->h);
}
-static void __rdmsr_safe_on_cpu(void *info)
+static void __wrmsr_on_cpu(void *info)
{
struct msr_info *rv = info;
- rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
+ wrmsr(rv->msr_no, rv->l, rv->h);
}
-static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
- int err = 0;
+ int err;
struct msr_info rv;
rv.msr_no = msr_no;
- if (safe) {
- err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu,
- &rv, 1);
- err = err ? err : rv.err;
- } else {
- err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
- }
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
*l = rv.l;
*h = rv.h;
return err;
}
-static void __wrmsr_on_cpu(void *info)
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ rv.msr_no = msr_no;
+ rv.l = l;
+ rv.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+ return err;
+}
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+static void __rdmsr_safe_on_cpu(void *info)
{
struct msr_info *rv = info;
- wrmsr(rv->msr_no, rv->l, rv->h);
+ rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
}
static void __wrmsr_safe_on_cpu(void *info)
@@ -56,45 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info)
rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
}
-static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
- int err = 0;
+ int err;
struct msr_info rv;
rv.msr_no = msr_no;
- rv.l = l;
- rv.h = h;
- if (safe) {
- err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu,
- &rv, 1);
- err = err ? err : rv.err;
- } else {
- err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
- }
-
- return err;
-}
+ err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+ *l = rv.l;
+ *h = rv.h;
-int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- return _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
+ return err ? err : rv.err;
}
-int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- return _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
-}
-
-/* These "safe" variants are slower and should be used when the target MSR
- may not actually exist. */
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
- return _wrmsr_on_cpu(cpu, msr_no, l, h, 1);
-}
+ int err;
+ struct msr_info rv;
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- return _rdmsr_on_cpu(cpu, msr_no, l, h, 1);
+ rv.msr_no = msr_no;
+ rv.l = l;
+ rv.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
}
EXPORT_SYMBOL(rdmsr_on_cpu);
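/*
 * Usage sketch for the reshuffled helpers above: read a possibly
 * missing MSR on another CPU with the "safe" variant, which also
 * propagates the rdmsr_safe() result. The MSR chosen and the wrapper
 * itself are illustrative only.
 */
static int example_read_ucode_rev(unsigned int cpu, u32 *lo, u32 *hi)
{
	/* non-zero if the cross-CPU call failed or the MSR access faulted */
	return rdmsr_safe_on_cpu(cpu, MSR_IA32_UCODE_REV, lo, hi);
}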
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 94972e7c094..82004d2bf05 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src)
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2)
- :"0" (src), "1" (dest) : "memory");
+ : "0" (src), "1" (dest) : "memory");
return dest;
}
EXPORT_SYMBOL(strcpy);
@@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count)
"stosb\n"
"2:"
: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
- :"0" (src), "1" (dest), "2" (count) : "memory");
+ : "0" (src), "1" (dest), "2" (count) : "memory");
return dest;
}
EXPORT_SYMBOL(strncpy);
@@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src)
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
- : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory");
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
return dest;
}
EXPORT_SYMBOL(strcat);
@@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct)
"2:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"3:"
- :"=a" (res), "=&S" (d0), "=&D" (d1)
- :"1" (cs), "2" (ct)
- :"memory");
+ : "=a" (res), "=&S" (d0), "=&D" (d1)
+ : "1" (cs), "2" (ct)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strcmp);
@@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count)
"3:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"4:"
- :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
- :"1" (cs), "2" (ct), "3" (count)
- :"memory");
+ : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
+ : "1" (cs), "2" (ct), "3" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strncmp);
@@ -152,9 +152,9 @@ char *strchr(const char *s, int c)
"movl $1,%1\n"
"2:\tmovl %1,%0\n\t"
"decl %0"
- :"=a" (res), "=&S" (d0)
- :"1" (s), "0" (c)
- :"memory");
+ : "=a" (res), "=&S" (d0)
+ : "1" (s), "0" (c)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strchr);
@@ -169,9 +169,9 @@ size_t strlen(const char *s)
"scasb\n\t"
"notl %0\n\t"
"decl %0"
- :"=c" (res), "=&D" (d0)
- :"1" (s), "a" (0), "0" (0xffffffffu)
- :"memory");
+ : "=c" (res), "=&D" (d0)
+ : "1" (s), "a" (0), "0" (0xffffffffu)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strlen);
@@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count)
"je 1f\n\t"
"movl $1,%0\n"
"1:\tdecl %0"
- :"=D" (res), "=&c" (d0)
- :"a" (c), "0" (cs), "1" (count)
- :"memory");
+ : "=D" (res), "=&c" (d0)
+ : "a" (c), "0" (cs), "1" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(memchr);
@@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count)
"cmpl $-1,%1\n\t"
"jne 1b\n"
"3:\tsubl %2,%0"
- :"=a" (res), "=&d" (d0)
- :"c" (s), "1" (count)
- :"memory");
+ : "=a" (res), "=&d" (d0)
+ : "c" (s), "1" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strnlen);
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index 42e8a50303f..8e2d55f754b 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -23,9 +23,9 @@ __asm__ __volatile__(
"jne 1b\n\t"
"xorl %%eax,%%eax\n\t"
"2:"
- :"=a" (__res), "=&c" (d0), "=&S" (d1)
- :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
- :"dx", "di");
+ : "=a" (__res), "=&c" (d0), "=&S" (d1)
+ : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
+ : "dx", "di");
return __res;
}
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index fab5faba1d3..4a20b2f9a38 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -14,6 +14,13 @@
#include <asm/uaccess.h>
#include <asm/mmx.h>
+#ifdef CONFIG_X86_INTEL_USERCOPY
+/*
+ * Alignment at which movsl is preferred for bulk memory copies.
+ */
+struct movsl_mask movsl_mask __read_mostly;
+#endif
+
static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
{
#ifdef CONFIG_X86_INTEL_USERCOPY
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 3d317836be9..37b9ae4d44c 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,13 +10,15 @@
#include <asm/e820.h>
#include <asm/setup.h>
+#include <mach_ipi.h>
+
#ifdef CONFIG_HOTPLUG_CPU
#define DEFAULT_SEND_IPI (1)
#else
#define DEFAULT_SEND_IPI (0)
#endif
-int no_broadcast=DEFAULT_SEND_IPI;
+int no_broadcast = DEFAULT_SEND_IPI;
/**
* pre_intr_init_hook - initialisation prior to setting up interrupt vectors
@@ -36,15 +38,6 @@ void __init pre_intr_init_hook(void)
init_ISA_irqs();
}
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-static struct irqaction irq2 = {
- .handler = no_action,
- .mask = CPU_MASK_NONE,
- .name = "cascade",
-};
-
/**
* intr_init_hook - post gate setup interrupt initialisation
*
@@ -60,12 +53,6 @@ void __init intr_init_hook(void)
if (x86_quirks->arch_intr_init())
return;
}
-#ifdef CONFIG_X86_LOCAL_APIC
- apic_intr_init();
-#endif
-
- if (!acpi_ioapic)
- setup_irq(2, &irq2);
}
/**
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
deleted file mode 100644
index 3ef8b43b62f..00000000000
--- a/arch/x86/mach-es7000/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-obj-$(CONFIG_X86_ES7000) := es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h
deleted file mode 100644
index c8d5aa132fa..00000000000
--- a/arch/x86/mach-es7000/es7000.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Written by: Garry Forsgren, Unisys Corporation
- * Natalie Protasevich, Unisys Corporation
- * This file contains the code to configure and interface
- * with Unisys ES7000 series hardware system manager.
- *
- * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Unisys Corporation, Township Line & Union Meeting
- * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
- *
- * http://www.unisys.com
- */
-
-/*
- * ES7000 chipsets
- */
-
-#define NON_UNISYS 0
-#define ES7000_CLASSIC 1
-#define ES7000_ZORRO 2
-
-
-#define MIP_REG 1
-#define MIP_PSAI_REG 4
-
-#define MIP_BUSY 1
-#define MIP_SPIN 0xf0000
-#define MIP_VALID 0x0100000000000000ULL
-#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
-
-#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
-
-struct mip_reg_info {
- unsigned long long mip_info;
- unsigned long long delivery_info;
- unsigned long long host_reg;
- unsigned long long mip_reg;
-};
-
-struct part_info {
- unsigned char type;
- unsigned char length;
- unsigned char part_id;
- unsigned char apic_mode;
- unsigned long snum;
- char ptype[16];
- char sname[64];
- char pname[64];
-};
-
-struct psai {
- unsigned long long entry_type;
- unsigned long long addr;
- unsigned long long bep_addr;
-};
-
-struct es7000_mem_info {
- unsigned char type;
- unsigned char length;
- unsigned char resv[6];
- unsigned long long start;
- unsigned long long size;
-};
-
-struct es7000_oem_table {
- unsigned long long hdr;
- struct mip_reg_info mip;
- struct part_info pif;
- struct es7000_mem_info shm;
- struct psai psai;
-};
-
-#ifdef CONFIG_ACPI
-
-struct oem_table {
- struct acpi_table_header Header;
- u32 OEMTableAddr;
- u32 OEMTableSize;
-};
-
-extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
-#endif
-
-struct mip_reg {
- unsigned long long off_0;
- unsigned long long off_8;
- unsigned long long off_10;
- unsigned long long off_18;
- unsigned long long off_20;
- unsigned long long off_28;
- unsigned long long off_30;
- unsigned long long off_38;
-};
-
-#define MIP_SW_APIC 0x1020b
-#define MIP_FUNC(VALUE) (VALUE & 0xff)
-
-extern int parse_unisys_oem (char *oemptr);
-extern void setup_unisys(void);
-extern int es7000_start_cpu(int cpu, unsigned long eip);
-extern void es7000_sw_apic(void);
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
index 0dbd7803a1d..6730f4e7c74 100644
--- a/arch/x86/mach-generic/Makefile
+++ b/arch/x86/mach-generic/Makefile
@@ -9,4 +9,3 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o
obj-$(CONFIG_X86_SUMMIT) += summit.o
obj-$(CONFIG_X86_BIGSMP) += bigsmp.o
obj-$(CONFIG_X86_ES7000) += es7000.o
-obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 59d77171455..3c3b471ea49 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -5,18 +5,17 @@
#define APIC_DEFINITION 1
#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <asm/smp.h>
#include <asm/mpspec.h>
#include <asm/genapic.h>
#include <asm/fixmap.h>
#include <asm/apicdef.h>
#include <linux/kernel.h>
-#include <linux/smp.h>
#include <linux/init.h>
#include <linux/dmi.h>
-#include <asm/mach-bigsmp/mach_apic.h>
-#include <asm/mach-bigsmp/mach_apicdef.h>
-#include <asm/mach-bigsmp/mach_ipi.h>
+#include <asm/bigsmp/apicdef.h>
+#include <linux/smp.h>
+#include <asm/bigsmp/apic.h>
+#include <asm/bigsmp/ipi.h>
#include <asm/mach-default/mach_mpparse.h>
static int dmi_bigsmp; /* can be set by dmi scanners */
@@ -42,6 +41,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
{ }
};
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ return cpumask_of_cpu(cpu);
+}
static int probe_bigsmp(void)
{
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 4742626f08c..28459cab3dd 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -4,20 +4,19 @@
#define APIC_DEFINITION 1
#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <asm/smp.h>
#include <asm/mpspec.h>
#include <asm/genapic.h>
#include <asm/fixmap.h>
#include <asm/apicdef.h>
#include <linux/kernel.h>
#include <linux/string.h>
-#include <linux/smp.h>
#include <linux/init.h>
-#include <asm/mach-es7000/mach_apicdef.h>
-#include <asm/mach-es7000/mach_apic.h>
-#include <asm/mach-es7000/mach_ipi.h>
-#include <asm/mach-es7000/mach_mpparse.h>
-#include <asm/mach-es7000/mach_wakecpu.h>
+#include <asm/es7000/apicdef.h>
+#include <linux/smp.h>
+#include <asm/es7000/apic.h>
+#include <asm/es7000/ipi.h>
+#include <asm/es7000/mpparse.h>
+#include <asm/es7000/wakecpu.h>
static int probe_es7000(void)
{
@@ -48,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem,
/* Hook from generic ACPI tables.c */
static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- unsigned long oem_addr;
+ unsigned long oem_addr = 0;
+ int check_dsdt;
+ int ret = 0;
+
+ /* check the dsdt first, to avoid clearing the fixmap for oem_addr */
+ check_dsdt = es7000_check_dsdt();
+
if (!find_unisys_acpi_oem_table(&oem_addr)) {
- if (es7000_check_dsdt())
- return parse_unisys_oem((char *)oem_addr);
+ if (check_dsdt)
+ ret = parse_unisys_oem((char *)oem_addr);
else {
setup_unisys();
- return 1;
+ ret = 1;
}
+ /*
+ * we need to unmap it
+ */
+ unmap_unisys_acpi_oem_table(oem_addr);
}
- return 0;
+ return ret;
}
#else
static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -66,4 +75,18 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
}
#endif
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+
struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 8091e68764c..71a309b122e 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -4,7 +4,6 @@
#define APIC_DEFINITION 1
#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <linux/smp.h>
#include <asm/mpspec.h>
#include <asm/genapic.h>
#include <asm/fixmap.h>
@@ -12,11 +11,12 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/init.h>
-#include <asm/mach-numaq/mach_apic.h>
-#include <asm/mach-numaq/mach_apicdef.h>
-#include <asm/mach-numaq/mach_ipi.h>
-#include <asm/mach-numaq/mach_mpparse.h>
-#include <asm/mach-numaq/mach_wakecpu.h>
+#include <asm/numaq/apicdef.h>
+#include <linux/smp.h>
+#include <asm/numaq/apic.h>
+#include <asm/numaq/ipi.h>
+#include <asm/numaq/mpparse.h>
+#include <asm/numaq/wakecpu.h>
#include <asm/numaq.h>
static int mps_oem_check(struct mp_config_table *mpc, char *oem,
@@ -38,4 +38,18 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+
struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index a97ea0f35b1..6272b5e69da 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -4,19 +4,18 @@
#define APIC_DEFINITION 1
#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <asm/smp.h>
#include <asm/mpspec.h>
#include <asm/genapic.h>
#include <asm/fixmap.h>
#include <asm/apicdef.h>
#include <linux/kernel.h>
#include <linux/string.h>
-#include <linux/smp.h>
#include <linux/init.h>
-#include <asm/mach-summit/mach_apic.h>
-#include <asm/mach-summit/mach_apicdef.h>
-#include <asm/mach-summit/mach_ipi.h>
-#include <asm/mach-summit/mach_mpparse.h>
+#include <asm/summit/apicdef.h>
+#include <linux/smp.h>
+#include <asm/summit/apic.h>
+#include <asm/summit/ipi.h>
+#include <asm/summit/mpparse.h>
static int probe_summit(void)
{
@@ -24,4 +23,18 @@ static int probe_summit(void)
return 0;
}
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+
struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index ee0fba09215..0f6e8a6523a 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -448,6 +448,8 @@ static void __init start_secondary(void *unused)
VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
+ notify_cpu_starting(cpuid);
+
/* enable interrupts */
local_irq_enable();
@@ -1481,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq)
* the interrupt off to another CPU */
static void before_handle_vic_irq(unsigned int irq)
{
- irq_desc_t *desc = irq_desc + irq;
+ irq_desc_t *desc = irq_to_desc(irq);
__u8 cpu = smp_processor_id();
_raw_spin_lock(&vic_irq_lock);
@@ -1516,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq)
/* Finish the VIC interrupt: basically mask */
static void after_handle_vic_irq(unsigned int irq)
{
- irq_desc_t *desc = irq_desc + irq;
+ irq_desc_t *desc = irq_to_desc(irq);
_raw_spin_lock(&vic_irq_lock);
{
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index dfb932dcf13..59f89b434b4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -13,12 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
mmiotrace-y := pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
-ifeq ($(CONFIG_X86_32),y)
-obj-$(CONFIG_NUMA) += discontig_32.o
-else
-obj-$(CONFIG_NUMA) += numa_64.o
+obj-$(CONFIG_NUMA) += numa_$(BITS).o
obj-$(CONFIG_K8_NUMA) += k8topology_64.o
-endif
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a20d1fa64b4..e7277cbcfb4 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
- prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
- cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
+ prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+ cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
if (!st->level) {
/* First entry */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 455f3fe67b4..31e8730fa24 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -35,6 +35,7 @@
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
+#include <asm/traps.h>
/*
* Page fault error code bits
@@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
return 0;
}
-void do_invalid_op(struct pt_regs *, unsigned long);
-
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
@@ -593,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
unsigned long flags;
#endif
- /*
- * We can fault from pretty much anywhere, with unknown IRQ state.
- */
- trace_hardirqs_fixup();
-
tsk = current;
mm = tsk->mm;
prefetchw(&mm->mmap_sem);
@@ -646,24 +640,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
}
-#ifdef CONFIG_X86_32
- /* It's safe to allow irq's after cr2 has been saved and the vmalloc
- fault has been handled. */
- if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
- local_irq_enable();
-
/*
- * If we're in an interrupt, have no user context or are running in an
- * atomic region then we must not take the fault.
+ * It's safe to allow irq's after cr2 has been saved and the
+ * vmalloc fault has been handled.
+ *
+ * User-mode registers count as a user access even for any
+ * potential system fault or CPU buglet.
*/
- if (in_atomic() || !mm)
- goto bad_area_nosemaphore;
-#else /* CONFIG_X86_64 */
- if (likely(regs->flags & X86_EFLAGS_IF))
+ if (user_mode_vm(regs)) {
+ local_irq_enable();
+ error_code |= PF_USER;
+ } else if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
+#ifdef CONFIG_X86_64
if (unlikely(error_code & PF_RSVD))
pgtable_bad(address, regs, error_code);
+#endif
/*
* If we're in an interrupt, have no user context or are running in an
@@ -672,15 +665,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(in_atomic() || !mm))
goto bad_area_nosemaphore;
- /*
- * User-mode registers count as a user access even for any
- * potential system fault or CPU buglet.
- */
- if (user_mode_vm(regs))
- error_code |= PF_USER;
again:
-#endif
- /* When running in the kernel we expect faults to occur only to
+ /*
+ * When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
* erroneous fault occurring in a code path which already holds mmap_sem
@@ -743,9 +730,6 @@ good_area:
goto bad_area;
}
-#ifdef CONFIG_X86_32
-survive:
-#endif
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
@@ -880,12 +864,11 @@ out_of_memory:
up_read(&mm->mmap_sem);
if (is_global_init(tsk)) {
yield();
-#ifdef CONFIG_X86_32
- down_read(&mm->mmap_sem);
- goto survive;
-#else
+ /*
+ * Re-lookup the vma - in theory the vma tree might
+ * have changed:
+ */
goto again;
-#endif
}
printk("VM: killing process %s\n", tsk->comm);
@@ -915,15 +898,15 @@ LIST_HEAD(pgd_list);
void vmalloc_sync_all(void)
{
-#ifdef CONFIG_X86_32
- unsigned long start = VMALLOC_START & PGDIR_MASK;
unsigned long address;
+#ifdef CONFIG_X86_32
if (SHARED_KERNEL_PMD)
return;
- BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
- for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+ for (address = VMALLOC_START & PMD_MASK;
+ address >= TASK_SIZE && address < FIXADDR_TOP;
+ address += PMD_SIZE) {
unsigned long flags;
struct page *page;
@@ -936,10 +919,8 @@ void vmalloc_sync_all(void)
spin_unlock_irqrestore(&pgd_lock, flags);
}
#else /* CONFIG_X86_64 */
- unsigned long start = VMALLOC_START & PGDIR_MASK;
- unsigned long address;
-
- for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+ for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+ address += PGDIR_SIZE) {
const pgd_t *pgd_ref = pgd_offset_k(address);
unsigned long flags;
struct page *page;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 007bb06c750..4ba373c5b8c 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -82,7 +82,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
pte_t pte = gup_get_pte(ptep);
struct page *page;
- if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
}
@@ -116,10 +116,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
- if ((pte_val(pte) & mask) != mask)
+ if ((pte_flags(pte) & mask) != mask)
return 0;
/* hugepages are never "special" */
- VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+ VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
refs = 0;
@@ -173,10 +173,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
- if ((pte_val(pte) & mask) != mask)
+ if ((pte_flags(pte) & mask) != mask)
return 0;
/* hugepages are never "special" */
- VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+ VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
refs = 0;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 165c871ba9a..bcc079c282d 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
return (void*) vaddr;
}
+EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
struct page *kmap_atomic_to_page(void *ptr)
{
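/*
 * Usage sketch for the newly exported kmap_atomic_pfn(): map a bare
 * pfn (e.g. GTT-backed memory with no struct page, as in the i915 GEM
 * case the export comment mentions) for a short atomic copy. The
 * helper and the KM_USER0 slot choice are illustrative, not from the
 * patch.
 */
static void example_copy_from_pfn(unsigned long pfn, void *dst, size_t len)
{
	void *src = kmap_atomic_pfn(pfn, KM_USER0);

	memcpy(dst, src, len);		/* preemption is off while mapped */
	kunmap_atomic(src, KM_USER0);
}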
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 60ec1d08ff2..8396868e82c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
#include <linux/cpumask.h>
#include <asm/asm.h>
+#include <asm/bios_ebda.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -47,6 +48,7 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
+#include <asm/smp.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@@ -194,11 +196,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
- unsigned pages_2m = 0, pages_4k = 0;
+ unsigned pages_2m, pages_4k;
+ int mapping_iter;
+
+ /*
+ * The first iteration will set up the identity mapping using large
+ * or small pages based on use_pse, with the other attributes the
+ * same as those set by the early code in head_32.S.
+ *
+ * The second iteration will set up the appropriate attributes
+ * (NX, GLOBAL, ...) as desired for the kernel identity mapping.
+ *
+ * This two-pass mechanism conforms to the TLB app note, which says:
+ *
+ * "Software should not write to a paging-structure entry in a way
+ * that would change, for any linear address, both the page size
+ * and either the page frame or attributes."
+ */
+ mapping_iter = 1;
if (!cpu_has_pse)
use_pse = 0;
+repeat:
+ pages_2m = pages_4k = 0;
pfn = start_pfn;
pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pgd = pgd_base + pgd_idx;
@@ -224,6 +245,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
if (use_pse) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute + _PAGE_PSE.
+ */
+ pgprot_t init_prot =
+ __pgprot(PTE_IDENT_ATTR |
+ _PAGE_PSE);
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
@@ -233,7 +261,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
prot = PAGE_KERNEL_LARGE_EXEC;
pages_2m++;
- set_pmd(pmd, pfn_pmd(pfn, prot));
+ if (mapping_iter == 1)
+ set_pmd(pmd, pfn_pmd(pfn, init_prot));
+ else
+ set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
continue;
@@ -245,17 +276,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
pgprot_t prot = PAGE_KERNEL;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute.
+ */
+ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
if (is_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
pages_4k++;
- set_pte(pte, pfn_pte(pfn, prot));
+ if (mapping_iter == 1)
+ set_pte(pte, pfn_pte(pfn, init_prot));
+ else
+ set_pte(pte, pfn_pte(pfn, prot));
}
}
}
- update_page_count(PG_LEVEL_2M, pages_2m);
- update_page_count(PG_LEVEL_4K, pages_4k);
+ if (mapping_iter == 1) {
+ /*
+ * update direct mapping page count only in the first
+ * iteration.
+ */
+ update_page_count(PG_LEVEL_2M, pages_2m);
+ update_page_count(PG_LEVEL_4K, pages_4k);
+
+ /*
+ * Do a local flush of the TLB, including global mappings, which
+ * drops the previous mappings from both the small and large page TLBs.
+ */
+ __flush_tlb_all();
+
+ /*
+ * Second iteration will set the actual desired PTE attributes.
+ */
+ mapping_iter = 2;
+ goto repeat;
+ }
}
/*
@@ -501,7 +558,7 @@ void zap_low_mappings(void)
int nx_enabled;
-pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
#ifdef CONFIG_X86_PAE
@@ -718,7 +775,7 @@ void __init setup_bootmem_allocator(void)
after_init_bootmem = 1;
}
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse)
{
unsigned long puds, pmds, ptes, tables, start;
@@ -728,7 +785,7 @@ static void __init find_early_table_space(unsigned long end)
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
- if (cpu_has_pse) {
+ if (use_pse) {
unsigned long extra;
extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
@@ -768,12 +825,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
pgd_t *pgd_base = swapper_pg_dir;
unsigned long start_pfn, end_pfn;
unsigned long big_page_start;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ int use_pse = 0;
+#else
+ int use_pse = cpu_has_pse;
+#endif
/*
* Find space for the kernel direct mapping tables.
*/
if (!after_init_bootmem)
- find_early_table_space(end);
+ find_early_table_space(end, use_pse);
#ifdef CONFIG_X86_PAE
set_nx();
@@ -819,7 +886,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
if (start_pfn < end_pfn)
kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
- cpu_has_pse);
+ use_pse);
/* tail is not big page alignment ? */
start_pfn = end_pfn;
@@ -903,6 +970,8 @@ void __init mem_init(void)
int codesize, reservedpages, datasize, initsize;
int tmp;
+ start_periodic_check_for_corruption();
+
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
@@ -982,7 +1051,6 @@ void __init mem_init(void)
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
- cpa_init();
save_pg_dir();
zap_low_mappings();
}
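/*
 * Worked example (illustrative numbers) for the use_pse argument that
 * find_early_table_space() now takes: mapping 896MB of lowmem. With
 * PSE only the sub-2MB tail needs 4K PTEs:
 *
 *	extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);  // < 2MB
 *	ptes  = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;    // up to 512
 *
 * while without PSE (the CONFIG_DEBUG_PAGEALLOC case) every 4K page
 * needs a PTE: 896MB / 4KB = 229376 entries, i.e. roughly 0.9MB of
 * early page tables (about 1.75MB with PAE's 8-byte PTEs).
 */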
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d3746efb060..b8e461d4941 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -31,6 +31,7 @@
#include <linux/nmi.h>
#include <asm/processor.h>
+#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -88,6 +89,62 @@ early_param("gbpages", parse_direct_gbpages_on);
int after_bootmem;
+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata;
+
+/*
+ * noexec=on|off
+ * Control non-executable mappings for 64-bit processes.
+ *
+ * on Enable (default)
+ * off Disable
+ */
+static int __init nonx_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ do_not_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ do_not_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+}
+early_param("noexec", nonx_setup);
+
+void __cpuinit check_efer(void)
+{
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || do_not_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control the non-executable heap for 32-bit processes.
+ * To also control the stack, use noexec=off.
+ *
+ * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off PROT_READ implies PROT_EXEC
+ */
+static int __init nonx32_setup(char *str)
+{
+ if (!strcmp(str, "on"))
+ force_personality32 &= ~READ_IMPLIES_EXEC;
+ else if (!strcmp(str, "off"))
+ force_personality32 |= READ_IMPLIES_EXEC;
+ return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -139,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
}
pte = pte_offset_kernel(pmd, vaddr);
- if (!pte_none(*pte) && pte_val(new_pte) &&
- pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
- pte_ERROR(*pte);
set_pte(pte, new_pte);
/*
@@ -225,7 +279,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
void __init cleanup_highmap(void)
{
unsigned long vaddr = __START_KERNEL_map;
- unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
+ unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;
pmd_t *last_pmd = pmd + PTRS_PER_PMD;
@@ -256,7 +310,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
if (pfn >= table_top)
panic("alloc_low_page: ran out of memory");
- adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
+ adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
memset(adr, 0, PAGE_SIZE);
*phys = pfn * PAGE_SIZE;
return adr;
@@ -271,7 +325,8 @@ static __ref void unmap_low_page(void *adr)
}
static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+ pgprot_t prot)
{
unsigned pages = 0;
unsigned long last_map_addr = end;
@@ -289,36 +344,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
break;
}
+ /*
+ * We will re-use the existing mapping.
+ * Xen for example has some special requirements, like mapping
+ * pagetable pages as RO. So assume someone who pre-setup
+ * these mappings are more intelligent.
+ */
if (pte_val(*pte))
continue;
if (0)
printk(" pte=%p addr=%lx pte=%016lx\n",
pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
- set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
- last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
pages++;
+ set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
+ last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
}
+
update_page_count(PG_LEVEL_4K, pages);
return last_map_addr;
}
static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
+ pgprot_t prot)
{
pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
- return phys_pte_init(pte, address, end);
+ return phys_pte_init(pte, address, end, prot);
}
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, pgprot_t prot)
{
unsigned long pages = 0;
unsigned long last_map_addr = end;
- unsigned long start = address;
int i = pmd_index(address);
@@ -326,6 +388,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
unsigned long pte_phys;
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
+ pgprot_t new_prot = prot;
if (address >= end) {
if (!after_bootmem) {
@@ -339,27 +402,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
last_map_addr = phys_pte_update(pmd, address,
- end);
+ end, prot);
spin_unlock(&init_mm.page_table_lock);
+ continue;
}
- /* Count entries we're using from level2_ident_pgt */
- if (start == 0)
- pages++;
- continue;
+ /*
+ * If we are ok with PG_LEVEL_2M mapping, then we will
+ * use the existing mapping.
+ *
+ * Otherwise, we will split the large page mapping but
+ * use the same existing protection bits, except for the
+ * large-page (PSE) bit, so that we don't violate Intel's
+ * TLB Application note (317080), which says that, while
+ * changing the page sizes, new and old translations should
+ * not differ with respect to page frame and attributes.
+ */
+ if (page_size_mask & (1 << PG_LEVEL_2M))
+ continue;
+ new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
}
if (page_size_mask & (1<<PG_LEVEL_2M)) {
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
- pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ pfn_pte(address >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = (address & PMD_MASK) + PMD_SIZE;
continue;
}
pte = alloc_low_page(&pte_phys);
- last_map_addr = phys_pte_init(pte, address, end);
+ last_map_addr = phys_pte_init(pte, address, end, new_prot);
unmap_low_page(pte);
spin_lock(&init_mm.page_table_lock);
@@ -372,12 +448,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, pgprot_t prot)
{
pmd_t *pmd = pmd_offset(pud, 0);
unsigned long last_map_addr;
- last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
+ last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
__flush_tlb_all();
return last_map_addr;
}
@@ -394,6 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
unsigned long pmd_phys;
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
+ pgprot_t prot = PAGE_KERNEL;
if (addr >= end)
break;
@@ -405,10 +482,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
}
if (pud_val(*pud)) {
- if (!pud_large(*pud))
+ if (!pud_large(*pud)) {
last_map_addr = phys_pmd_update(pud, addr, end,
- page_size_mask);
- continue;
+ page_size_mask, prot);
+ continue;
+ }
+ /*
+ * If we are ok with PG_LEVEL_1G mapping, then we will
+ * use the existing mapping.
+ *
+ * Otherwise, we will split the gbpage mapping but use
+ * the same existing protection bits, except for the
+ * large-page (PSE) bit, so that we don't violate Intel's
+ * TLB Application note (317080), which says that, while
+ * changing the page sizes, new and old translations should
+ * not differ with respect to page frame and attributes.
+ */
+ if (page_size_mask & (1 << PG_LEVEL_1G))
+ continue;
+ prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
}
if (page_size_mask & (1<<PG_LEVEL_1G)) {
@@ -422,7 +515,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
}
pmd = alloc_low_page(&pmd_phys);
- last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+ last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+ prot);
unmap_low_page(pmd);
spin_lock(&init_mm.page_table_lock);
@@ -430,6 +524,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
+
update_page_count(PG_LEVEL_1G, pages);
return last_map_addr;
@@ -446,27 +541,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
return phys_pud_init(pud, addr, end, page_size_mask);
}
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse,
+ int use_gbpages)
{
unsigned long puds, pmds, ptes, tables, start;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
- if (direct_gbpages) {
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+ if (use_gbpages) {
unsigned long extra;
extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
} else
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
- if (cpu_has_pse) {
+ if (use_pse) {
unsigned long extra;
extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
} else
ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
+ tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
/*
* RED-PEN putting page tables only on node 0 could
@@ -528,6 +624,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
pgd_populate(&init_mm, pgd, __va(pud_phys));
spin_unlock(&init_mm.page_table_lock);
}
+ __flush_tlb_all();
return last_map_addr;
}
@@ -571,6 +668,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
struct map_range mr[NR_RANGE_MR];
int nr_range, i;
+ int use_pse, use_gbpages;
printk(KERN_INFO "init_memory_mapping\n");
@@ -584,9 +682,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
if (!after_bootmem)
init_gbpages();
- if (direct_gbpages)
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, the identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small ones in interrupt context, etc.
+ */
+ use_pse = use_gbpages = 0;
+#else
+ use_pse = cpu_has_pse;
+ use_gbpages = direct_gbpages;
+#endif
+
+ if (use_gbpages)
page_size_mask |= 1 << PG_LEVEL_1G;
- if (cpu_has_pse)
+ if (use_pse)
page_size_mask |= 1 << PG_LEVEL_2M;
memset(mr, 0, sizeof(mr));
@@ -636,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
old_start = mr[i].start;
memmove(&mr[i], &mr[i+1],
(nr_range - 1 - i) * sizeof (struct map_range));
- mr[i].start = old_start;
+ mr[i--].start = old_start;
nr_range--;
}
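The post-decrement is the subtle part of this hunk: after entry i+1 is folded into entry i, slot i must be examined again on the next pass, because the neighbour shifted down into i+1 may be mergeable too. The surrounding loop, paraphrased as a sketch:

/* Sketch: coalesce adjacent ranges with identical page_size_mask. */
for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
	unsigned long old_start;

	if (mr[i].end != mr[i + 1].start ||
	    mr[i].page_size_mask != mr[i + 1].page_size_mask)
		continue;
	old_start = mr[i].start;
	memmove(&mr[i], &mr[i + 1],
		(nr_range - 1 - i) * sizeof(struct map_range));
	mr[i--].start = old_start;	/* re-check this slot next iteration */
	nr_range--;
}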
@@ -647,7 +757,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
if (!after_bootmem)
- find_early_table_space(end);
+ find_early_table_space(end, use_pse, use_gbpages);
for (i = 0; i < nr_range; i++)
last_map_addr = kernel_physical_mapping_init(
@@ -769,6 +879,8 @@ void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
+ start_periodic_check_for_corruption();
+
pci_iommu_alloc();
/* clear_bss() already clear the empty_zero_page */
@@ -806,8 +918,6 @@ void __init mem_init(void)
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10);
-
- cpa_init();
}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index d4b6e6a29ae..ae71e11eb3e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,18 +24,47 @@
#ifdef CONFIG_X86_64
+static inline int phys_addr_valid(unsigned long addr)
+{
+ return addr < (1UL << boot_cpu_data.x86_phys_bits);
+}
+
unsigned long __phys_addr(unsigned long x)
{
- if (x >= __START_KERNEL_map)
- return x - __START_KERNEL_map + phys_base;
- return x - PAGE_OFFSET;
+ if (x >= __START_KERNEL_map) {
+ x -= __START_KERNEL_map;
+ VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
+ x += phys_base;
+ } else {
+ VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+ x -= PAGE_OFFSET;
+ VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
+ !phys_addr_valid(x));
+ }
+ return x;
}
EXPORT_SYMBOL(__phys_addr);
-static inline int phys_addr_valid(unsigned long addr)
+bool __virt_addr_valid(unsigned long x)
{
- return addr < (1UL << boot_cpu_data.x86_phys_bits);
+ if (x >= __START_KERNEL_map) {
+ x -= __START_KERNEL_map;
+ if (x >= KERNEL_IMAGE_SIZE)
+ return false;
+ x += phys_base;
+ } else {
+ if (x < PAGE_OFFSET)
+ return false;
+ x -= PAGE_OFFSET;
+ if (system_state == SYSTEM_BOOTING ?
+ x > MAXMEM : !phys_addr_valid(x)) {
+ return false;
+ }
+ }
+
+ return pfn_valid(x >> PAGE_SHIFT);
}
+EXPORT_SYMBOL(__virt_addr_valid);
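A hedged usage sketch: since the VIRTUAL_BUG_ON() checks in __phys_addr() compile away without CONFIG_DEBUG_VIRTUAL, a caller that cannot trust its input can test the address explicitly before converting (the helper name is hypothetical):

static unsigned long safe_virt_to_phys(unsigned long vaddr)
{
	if (!__virt_addr_valid(vaddr))
		return 0;	/* caller treats 0 as "no backing page" */
	return __pa(vaddr);
}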
#else
@@ -44,6 +73,28 @@ static inline int phys_addr_valid(unsigned long addr)
return 1;
}
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+ /* VMALLOC_* aren't constants; they are not available at boot time */
+ VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+ VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
+ is_vmalloc_addr((void *) x));
+ return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+ if (x < PAGE_OFFSET)
+ return false;
+ if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+ return false;
+ return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
#endif
int page_is_ram(unsigned long pagenr)
@@ -83,6 +134,25 @@ int page_is_ram(unsigned long pagenr)
return 0;
}
+int pagerange_is_ram(unsigned long start, unsigned long end)
+{
+ int ram_page = 0, not_ram_page = 0;
+ unsigned long page_nr;
+
+ for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
+ ++page_nr) {
+ if (page_is_ram(page_nr))
+ ram_page = 1;
+ else
+ not_ram_page = 1;
+
+ if (ram_page == not_ram_page)
+ return -1;
+ }
+
+ return ram_page;
+}
+
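The tri-state return value (1 = all RAM, 0 = no RAM, -1 = mixed) is what the PAT changes later in this patch key off. A minimal caller sketch; handle_ram_range() is a hypothetical stand-in:

int ret = pagerange_is_ram(start, end);

if (ret < 0)
	return -EINVAL;			/* mixed RAM/non-RAM: refuse */
if (ret == 1)
	return handle_ram_range(start, end);
/* ret == 0: pure iomem, fall through to the memtype list */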
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
@@ -150,6 +220,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
return (__force void __iomem *)phys_to_virt(phys_addr);
/*
+ * Check whether the request spans more than any single BAR in the
+ * iomem resource tree.
+ */
+ WARN_ON(iomem_map_sanity_check(phys_addr, size));
+
+ /*
* Don't allow anybody to remap normal RAM that we're using..
*/
for (pfn = phys_addr >> PAGE_SHIFT;
@@ -204,16 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
switch (prot_val) {
case _PAGE_CACHE_UC:
default:
- prot = PAGE_KERNEL_NOCACHE;
+ prot = PAGE_KERNEL_IO_NOCACHE;
break;
case _PAGE_CACHE_UC_MINUS:
- prot = PAGE_KERNEL_UC_MINUS;
+ prot = PAGE_KERNEL_IO_UC_MINUS;
break;
case _PAGE_CACHE_WC:
- prot = PAGE_KERNEL_WC;
+ prot = PAGE_KERNEL_IO_WC;
break;
case _PAGE_CACHE_WB:
- prot = PAGE_KERNEL;
+ prot = PAGE_KERNEL_IO;
break;
}
@@ -421,7 +497,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
return;
}
-int __initdata early_ioremap_debug;
+static int __initdata early_ioremap_debug;
static int __init early_ioremap_debug_setup(char *str)
{
@@ -530,12 +606,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
}
static inline void __init early_set_fixmap(enum fixed_addresses idx,
- unsigned long phys)
+ unsigned long phys, pgprot_t prot)
{
if (after_paging_init)
- set_fixmap(idx, phys);
+ __set_fixmap(idx, phys, prot);
else
- __early_set_fixmap(idx, phys, PAGE_KERNEL);
+ __early_set_fixmap(idx, phys, prot);
}
static inline void __init early_clear_fixmap(enum fixed_addresses idx)
@@ -546,16 +622,22 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
__early_set_fixmap(idx, 0, __pgprot(0));
}
-
-int __initdata early_ioremap_nested;
-
+static void *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
static int __init check_early_ioremap_leak(void)
{
- if (!early_ioremap_nested)
+ int count = 0;
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (prev_map[i])
+ count++;
+
+ if (!count)
return 0;
WARN(1, KERN_WARNING
"Debug warning: early ioremap leak of %d areas detected.\n",
- early_ioremap_nested);
+ count);
printk(KERN_WARNING
"please boot with early_ioremap_debug and report the dmesg.\n");
@@ -563,18 +645,33 @@ static int __init check_early_ioremap_leak(void)
}
late_initcall(check_early_ioremap_leak);
-void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
unsigned long offset, last_addr;
- unsigned int nrpages, nesting;
+ unsigned int nrpages;
enum fixed_addresses idx0, idx;
+ int i, slot;
WARN_ON(system_state != SYSTEM_BOOTING);
- nesting = early_ioremap_nested;
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (!prev_map[i]) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (slot < 0) {
+ printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n",
+ phys_addr, size);
+ WARN_ON(1);
+ return NULL;
+ }
+
if (early_ioremap_debug) {
printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
- phys_addr, size, nesting);
+ phys_addr, size, slot);
dump_stack();
}
@@ -585,17 +682,13 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
return NULL;
}
- if (nesting >= FIX_BTMAPS_NESTING) {
- WARN_ON(1);
- return NULL;
- }
- early_ioremap_nested++;
+ prev_size[slot] = size;
/*
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr) - phys_addr;
+ size = PAGE_ALIGN(last_addr + 1) - phys_addr;
/*
* Mappings have to fit in the FIX_BTMAP area.
@@ -609,10 +702,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
/*
* Ok, go for it..
*/
- idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+ idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
idx = idx0;
while (nrpages > 0) {
- early_set_fixmap(idx, phys_addr);
+ early_set_fixmap(idx, phys_addr, prot);
phys_addr += PAGE_SIZE;
--idx;
--nrpages;
@@ -620,7 +713,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
if (early_ioremap_debug)
printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
- return (void *) (offset + fix_to_virt(idx0));
+ prev_map[slot] = (void *) (offset + fix_to_virt(idx0));
+ return prev_map[slot];
+}
+
+/* Remap an IO device */
+void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+{
+ return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
+}
+
+/* Remap memory */
+void __init *early_memremap(unsigned long phys_addr, unsigned long size)
+{
+ return __early_ioremap(phys_addr, size, PAGE_KERNEL);
}
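The slot scheme replaces the old nesting counter: each live early mapping owns one of FIX_BTMAPS_SLOTS fixed windows, found by linear search, so mappings may now be torn down in any order. Stripped of the fixmap details, the bookkeeping reduces to this sketch (SLOTS and the function names are illustrative):

#define SLOTS 4				/* stand-in for FIX_BTMAPS_SLOTS */
static void *slot_addr[SLOTS];
static unsigned long slot_size[SLOTS];

static int slot_alloc(void *addr, unsigned long size)
{
	int i;

	for (i = 0; i < SLOTS; i++)
		if (!slot_addr[i]) {
			slot_addr[i] = addr;
			slot_size[i] = size;
			return i;
		}
	return -1;			/* all slots busy */
}

static int slot_free(void *addr, unsigned long size)
{
	int i;

	for (i = 0; i < SLOTS; i++)
		if (slot_addr[i] == addr && slot_size[i] == size) {
			slot_addr[i] = NULL;
			return i;
		}
	return -1;			/* unknown mapping or size mismatch */
}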
void __init early_iounmap(void *addr, unsigned long size)
@@ -629,15 +735,33 @@ void __init early_iounmap(void *addr, unsigned long size)
unsigned long offset;
unsigned int nrpages;
enum fixed_addresses idx;
- int nesting;
+ int i, slot;
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (prev_map[i] == addr) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (slot < 0) {
+ printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
+ addr, size);
+ WARN_ON(1);
+ return;
+ }
- nesting = --early_ioremap_nested;
- if (WARN_ON(nesting < 0))
+ if (prev_size[slot] != size) {
+ printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+ addr, size, slot, prev_size[slot]);
+ WARN_ON(1);
return;
+ }
if (early_ioremap_debug) {
printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
- size, nesting);
+ size, slot);
dump_stack();
}
@@ -649,12 +773,13 @@ void __init early_iounmap(void *addr, unsigned long size)
offset = virt_addr & ~PAGE_MASK;
nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
- idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
while (nrpages > 0) {
early_clear_fixmap(idx);
--idx;
--nrpages;
}
+ prev_map[slot] = 0;
}
void __this_fixmap_does_not_exist(void)
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 672e17f8262..9cab18b0b85 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -61,9 +61,9 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
last_bad += incr;
} else {
if (start_bad) {
- printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
+ printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
val, start_bad, last_bad + incr);
- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ reserve_early(start_bad, last_bad + incr, "BAD RAM");
}
start_bad = last_bad = start_phys_aligned;
}
@@ -72,9 +72,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
if (start_bad) {
printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
val, start_bad, last_bad + incr);
- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ reserve_early(start_bad, last_bad + incr, "BAD RAM");
}
-
}
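The change matters because reserve_early() takes an exclusive end address, not a length. Worked example with assumed values:

/* Assume start_bad = 0x1000, last_bad = 0x1ff8, incr = 8.
 * Old call: reserve_early(0x1000, 0xff8, ...)  -- end < start, bogus range.
 * New call: reserve_early(0x1000, 0x2000, ...) -- covers every bad word,
 * including the one at last_bad itself.
 */
reserve_early(start_bad, last_bad + incr, "BAD RAM");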
/* default is disabled */
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 635b50e8558..2c4baa88f2c 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -56,13 +56,6 @@ struct remap_trace {
static DEFINE_PER_CPU(struct trap_reason, pf_reason);
static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
-#if 0 /* XXX: no way gather this info anymore */
-/* Access to this is not per-cpu. */
-static DEFINE_PER_CPU(atomic_t, dropped);
-#endif
-
-static struct dentry *marker_file;
-
static DEFINE_MUTEX(mmiotrace_mutex);
static DEFINE_SPINLOCK(trace_lock);
static atomic_t mmiotrace_enabled;
@@ -75,7 +68,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
* and trace_lock.
* - Routines depending on is_enabled() must take trace_lock.
* - trace_list users must hold trace_lock.
- * - is_enabled() guarantees that mmio_trace_record is allowed.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
* - pre/post callbacks assume the effect of is_enabled() being true.
*/
@@ -97,44 +90,6 @@ static bool is_enabled(void)
return atomic_read(&mmiotrace_enabled);
}
-#if 0 /* XXX: needs rewrite */
-/*
- * Write callback for the debugfs entry:
- * Read a marker and write it to the mmio trace log
- */
-static ssize_t write_marker(struct file *file, const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- char *event = NULL;
- struct mm_io_header *headp;
- ssize_t len = (count > 65535) ? 65535 : count;
-
- event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
- if (!event)
- return -ENOMEM;
-
- headp = (struct mm_io_header *)event;
- headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
- headp->data_len = len;
-
- if (copy_from_user(event + sizeof(*headp), buffer, len)) {
- kfree(event);
- return -EFAULT;
- }
-
- spin_lock_irq(&trace_lock);
-#if 0 /* XXX: convert this to use tracing */
- if (is_enabled())
- relay_write(chan, event, sizeof(*headp) + len);
- else
-#endif
- len = -EINVAL;
- spin_unlock_irq(&trace_lock);
- kfree(event);
- return len;
-}
-#endif
-
static void print_pte(unsigned long address)
{
unsigned int level;
@@ -307,8 +262,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
map.map_id = trace->id;
spin_lock_irq(&trace_lock);
- if (!is_enabled())
+ if (!is_enabled()) {
+ kfree(trace);
goto not_enabled;
+ }
mmio_trace_mapping(&map);
list_add_tail(&trace->list, &trace_list);
@@ -377,6 +334,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr)
iounmap_trace_core(addr);
}
+int mmiotrace_printk(const char *fmt, ...)
+{
+ int ret = 0;
+ va_list args;
+ unsigned long flags;
+
+ va_start(args, fmt);
+
+ spin_lock_irqsave(&trace_lock, flags);
+ if (is_enabled())
+ ret = mmio_trace_printk(fmt, args);
+ spin_unlock_irqrestore(&trace_lock, flags);
+
+ va_end(args);
+ return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
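mmiotrace_printk() takes printk()-style arguments; the testmmiotrace changes at the end of this patch use it to drop phase markers into the trace, e.g. (p being the ioremap'ed test window):

mmiotrace_printk("ioremap returned %p.\n", p);	/* marker lands in the trace */
do_write_test(p);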
static void clear_trace_list(void)
{
struct remap_trace *trace;
@@ -462,26 +436,12 @@ static void leave_uniprocessor(void)
}
#endif
-#if 0 /* XXX: out of order */
-static struct file_operations fops_marker = {
- .owner = THIS_MODULE,
- .write = write_marker
-};
-#endif
-
void enable_mmiotrace(void)
{
mutex_lock(&mmiotrace_mutex);
if (is_enabled())
goto out;
-#if 0 /* XXX: tracing does not support text entries */
- marker_file = debugfs_create_file("marker", 0660, dir, NULL,
- &fops_marker);
- if (!marker_file)
- pr_err(NAME "marker file creation failed.\n");
-#endif
-
if (nommiotrace)
pr_info(NAME "MMIO tracing disabled.\n");
enter_uniprocessor();
@@ -506,11 +466,6 @@ void disable_mmiotrace(void)
clear_trace_list(); /* guarantees: no more kmmio callbacks */
leave_uniprocessor();
- if (marker_file) {
- debugfs_remove(marker_file);
- marker_file = NULL;
- }
-
pr_info(NAME "disabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c
index 62fa440678d..847c164725f 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn,
get_memcfg_numa();
- kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
+ kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
do {
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a4dd793d600..cebcbf152d4 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -79,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
return 0;
addr = 0x8000;
- nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
+ nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
nodemap_size, L1_CACHE_BYTES);
if (nodemap_addr == -1UL) {
@@ -176,10 +176,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
unsigned long bootmap_start, nodedata_phys;
void *bootmap;
- const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+ const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
int nid;
- start = round_up(start, ZONE_ALIGN);
+ start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
start, end);
@@ -210,9 +210,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
nid = phys_to_nid(nodedata_phys);
if (nid == nodeid)
- bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+ bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
else
- bootmap_start = round_up(start, PAGE_SIZE);
+ bootmap_start = roundup(start, PAGE_SIZE);
/*
* SMP_CACHE_BYTES could be enough, but init_bootmem_node like
* to use that to align to PAGE_SIZE
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d4aa503caaa..e1d10690921 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -32,7 +32,7 @@ enum {
GPS = (1<<30)
};
-#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1)
+#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
static int pte_testbit(pte_t pte)
{
@@ -118,6 +118,7 @@ static int pageattr_test(void)
unsigned int level;
int i, k;
int err;
+ unsigned long test_addr;
if (print)
printk(KERN_INFO "CPA self-test:\n");
@@ -172,7 +173,8 @@ static int pageattr_test(void)
continue;
}
- err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT);
+ test_addr = addr[i];
+ err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA %d failed %d\n", i, err);
failed++;
@@ -204,7 +206,8 @@ static int pageattr_test(void)
failed++;
continue;
}
- err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT);
+ test_addr = addr[i];
+ err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA reverting failed: %d\n", err);
failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 43e2f8483e4..f1dc1b75d16 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,27 @@
* The current flushing context - we pass it instead of 5 arguments:
*/
struct cpa_data {
- unsigned long vaddr;
+ unsigned long *vaddr;
pgprot_t mask_set;
pgprot_t mask_clr;
int numpages;
- int flushtlb;
+ int flags;
unsigned long pfn;
unsigned force_split : 1;
+ int curpage;
};
+/*
+ * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
+ * using cpa_lock, so that no other CPU with stale large-TLB entries can
+ * change a page attribute in parallel while some other CPU is splitting a
+ * large page entry and changing the attribute.
+ */
+static DEFINE_SPINLOCK(cpa_lock);
+
+#define CPA_FLUSHTLB 1
+#define CPA_ARRAY 2
+
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -53,23 +65,22 @@ static void split_page_count(int level)
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
-int arch_report_meminfo(char *page)
+void arch_report_meminfo(struct seq_file *m)
{
- int n = sprintf(page, "DirectMap4k: %8lu kB\n",
+ seq_printf(m, "DirectMap4k: %8lu kB\n",
direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
- n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
+ seq_printf(m, "DirectMap2M: %8lu kB\n",
direct_pages_count[PG_LEVEL_2M] << 11);
#else
- n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
+ seq_printf(m, "DirectMap4M: %8lu kB\n",
direct_pages_count[PG_LEVEL_2M] << 12);
#endif
#ifdef CONFIG_X86_64
if (direct_gbpages)
- n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
+ seq_printf(m, "DirectMap1G: %8lu kB\n",
direct_pages_count[PG_LEVEL_1G] << 20);
#endif
- return n;
}
#else
static inline void split_page_count(int level) { }
@@ -84,7 +95,7 @@ static inline unsigned long highmap_start_pfn(void)
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -190,6 +201,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
}
}
+static void cpa_flush_array(unsigned long *start, int numpages, int cache)
+{
+ unsigned int i, level;
+ unsigned long *addr;
+
+ BUG_ON(irqs_disabled());
+
+ on_each_cpu(__cpa_flush_range, NULL, 1);
+
+ if (!cache)
+ return;
+
+ /* 4M threshold */
+ if (numpages >= 1024) {
+ if (boot_cpu_data.x86_model >= 4)
+ wbinvd();
+ return;
+ }
+ /*
+ * We only need to flush on one CPU;
+ * clflush is a MESI-coherent instruction that
+ * will cause all other CPUs to flush the same
+ * cache lines:
+ */
+ for (i = 0, addr = start; i < numpages; i++, addr++) {
+ pte_t *pte = lookup_address(*addr, &level);
+
+ /*
+ * Only flush present addresses:
+ */
+ if (pte && (pte_val(*pte) & _PAGE_PRESENT))
+ clflush_cache_range((void *) *addr, PAGE_SIZE);
+ }
+}
+
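The 1024-page threshold is 4 MiB of 4 KiB pages; the assumption (hedged, not spelled out in the patch) is that beyond this point, per-line clflush costs more than one full wbinvd:

/* 1024 pages * 4096 bytes = 4 MiB; at 64-byte lines that is 65536
 * cache lines to clflush, so one wbinvd wins instead (the CPU
 * model check from the patch is omitted in this sketch). */
if (numpages >= 1024) {
	wbinvd();
	return;
}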
/*
* Certain areas of memory on x86 require very specific protection flags,
* for example the BIOS area or kernel text. Callers don't always get this
@@ -398,7 +444,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
*/
new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
__set_pmd_pte(kpte, address, new_pte);
- cpa->flushtlb = 1;
+ cpa->flags |= CPA_FLUSHTLB;
do_split = 0;
}
@@ -408,84 +454,6 @@ out_unlock:
return do_split;
}
-static LIST_HEAD(page_pool);
-static unsigned long pool_size, pool_pages, pool_low;
-static unsigned long pool_used, pool_failed;
-
-static void cpa_fill_pool(struct page **ret)
-{
- gfp_t gfp = GFP_KERNEL;
- unsigned long flags;
- struct page *p;
-
- /*
- * Avoid recursion (on debug-pagealloc) and also signal
- * our priority to get to these pagetables:
- */
- if (current->flags & PF_MEMALLOC)
- return;
- current->flags |= PF_MEMALLOC;
-
- /*
- * Allocate atomically from atomic contexts:
- */
- if (in_atomic() || irqs_disabled() || debug_pagealloc)
- gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
-
- while (pool_pages < pool_size || (ret && !*ret)) {
- p = alloc_pages(gfp, 0);
- if (!p) {
- pool_failed++;
- break;
- }
- /*
- * If the call site needs a page right now, provide it:
- */
- if (ret && !*ret) {
- *ret = p;
- continue;
- }
- spin_lock_irqsave(&pgd_lock, flags);
- list_add(&p->lru, &page_pool);
- pool_pages++;
- spin_unlock_irqrestore(&pgd_lock, flags);
- }
-
- current->flags &= ~PF_MEMALLOC;
-}
-
-#define SHIFT_MB (20 - PAGE_SHIFT)
-#define ROUND_MB_GB ((1 << 10) - 1)
-#define SHIFT_MB_GB 10
-#define POOL_PAGES_PER_GB 16
-
-void __init cpa_init(void)
-{
- struct sysinfo si;
- unsigned long gb;
-
- si_meminfo(&si);
- /*
- * Calculate the number of pool pages:
- *
- * Convert totalram (nr of pages) to MiB and round to the next
- * GiB. Shift MiB to Gib and multiply the result by
- * POOL_PAGES_PER_GB:
- */
- if (debug_pagealloc) {
- gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
- pool_size = POOL_PAGES_PER_GB * gb;
- } else {
- pool_size = 1;
- }
- pool_low = pool_size;
-
- cpa_fill_pool(NULL);
- printk(KERN_DEBUG
- "CPA: page pool initialized %lu of %lu pages preallocated\n",
- pool_pages, pool_size);
-}
-
static int split_large_page(pte_t *kpte, unsigned long address)
{
unsigned long flags, pfn, pfninc = 1;
@@ -494,28 +462,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
pgprot_t ref_prot;
struct page *base;
- /*
- * Get a page from the pool. The pool list is protected by the
- * pgd_lock, which we have to take anyway for the split
- * operation:
- */
- spin_lock_irqsave(&pgd_lock, flags);
- if (list_empty(&page_pool)) {
- spin_unlock_irqrestore(&pgd_lock, flags);
- base = NULL;
- cpa_fill_pool(&base);
- if (!base)
- return -ENOMEM;
- spin_lock_irqsave(&pgd_lock, flags);
- } else {
- base = list_first_entry(&page_pool, struct page, lru);
- list_del(&base->lru);
- pool_pages--;
-
- if (pool_pages < pool_low)
- pool_low = pool_pages;
- }
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
+ base = alloc_pages(GFP_KERNEL, 0);
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
+ if (!base)
+ return -ENOMEM;
+ spin_lock_irqsave(&pgd_lock, flags);
/*
* Check for races, another CPU might have split this page
* up for us already:
@@ -572,11 +527,8 @@ out_unlock:
* If we dropped out via the lookup_address check under
* pgd_lock then stick the page back into the pool:
*/
- if (base) {
- list_add(&base->lru, &page_pool);
- pool_pages++;
- } else
- pool_used++;
+ if (base)
+ __free_page(base);
spin_unlock_irqrestore(&pgd_lock, flags);
return 0;
@@ -584,11 +536,16 @@ out_unlock:
static int __change_page_attr(struct cpa_data *cpa, int primary)
{
- unsigned long address = cpa->vaddr;
+ unsigned long address;
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ if (cpa->flags & CPA_ARRAY)
+ address = cpa->vaddr[cpa->curpage];
+ else
+ address = *cpa->vaddr;
+
repeat:
kpte = lookup_address(address, &level);
if (!kpte)
@@ -600,7 +557,7 @@ repeat:
return 0;
WARN(1, KERN_WARNING "CPA: called for zero pte. "
"vaddr = %lx cpa->vaddr = %lx\n", address,
- cpa->vaddr);
+ *cpa->vaddr);
return -EINVAL;
}
@@ -626,7 +583,7 @@ repeat:
*/
if (pte_val(old_pte) != pte_val(new_pte)) {
set_pte_atomic(kpte, new_pte);
- cpa->flushtlb = 1;
+ cpa->flags |= CPA_FLUSHTLB;
}
cpa->numpages = 1;
return 0;
@@ -650,7 +607,25 @@ repeat:
*/
err = split_large_page(kpte, address);
if (!err) {
- cpa->flushtlb = 1;
+ /*
+ * Do a global TLB flush after splitting the large page
+ * and before we do the actual change of page attribute in the PTE.
+ *
+ * Without this, we violate the TLB application note, which says
+ * "The TLBs may contain both ordinary and large-page
+ * translations for a 4-KByte range of linear addresses. This
+ * may occur if software modifies the paging structures so that
+ * the page size used for the address range changes. If the two
+ * translations differ with respect to page frame or attributes
+ * (e.g., permissions), processor behavior is undefined and may
+ * be implementation-specific."
+ *
+ * We do this global TLB flush inside cpa_lock, so that no
+ * other CPU with stale TLB entries can change, in parallel, the
+ * attributes of a page that falls into the just-split large
+ * page entry.
+ */
+ flush_tlb_all();
goto repeat;
}
@@ -663,6 +638,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
int ret = 0;
+ unsigned long temp_cpa_vaddr, vaddr;
if (cpa->pfn >= max_pfn_mapped)
return 0;
@@ -675,16 +651,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (!(within(cpa->vaddr, PAGE_OFFSET,
+ if (cpa->flags & CPA_ARRAY)
+ vaddr = cpa->vaddr[cpa->curpage];
+ else
+ vaddr = *cpa->vaddr;
+
+ if (!(within(vaddr, PAGE_OFFSET,
PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
#ifdef CONFIG_X86_64
- || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
+ || within(vaddr, PAGE_OFFSET + (1UL<<32),
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
#endif
)) {
alias_cpa = *cpa;
- alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
+ temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.flags &= ~CPA_ARRAY;
+
ret = __change_page_attr_set_clr(&alias_cpa, 0);
}
@@ -696,7 +680,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the high
* mapping already:
*/
- if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
+ if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
return 0;
/*
@@ -707,8 +691,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
return 0;
alias_cpa = *cpa;
- alias_cpa.vaddr =
- (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
+ temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.flags &= ~CPA_ARRAY;
/*
* The high mapping range is imprecise, so ignore the return value.
@@ -728,8 +713,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
* preservation check.
*/
cpa->numpages = numpages;
+ /* for array changes, we can't use large page */
+ if (cpa->flags & CPA_ARRAY)
+ cpa->numpages = 1;
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
ret = __change_page_attr(cpa, checkalias);
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
if (ret)
return ret;
@@ -746,7 +738,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
*/
BUG_ON(cpa->numpages > numpages);
numpages -= cpa->numpages;
- cpa->vaddr += cpa->numpages * PAGE_SIZE;
+ if (cpa->flags & CPA_ARRAY)
+ cpa->curpage++;
+ else
+ *cpa->vaddr += cpa->numpages * PAGE_SIZE;
+
}
return 0;
}
@@ -757,9 +753,9 @@ static inline int cache_attr(pgprot_t attr)
(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
}
-static int change_page_attr_set_clr(unsigned long addr, int numpages,
+static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
- int force_split)
+ int force_split, int array)
{
struct cpa_data cpa;
int ret, cache, checkalias;
@@ -774,21 +770,40 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
return 0;
/* Ensure we are PAGE_SIZE aligned */
- if (addr & ~PAGE_MASK) {
- addr &= PAGE_MASK;
- /*
- * People should not be passing in unaligned addresses:
- */
- WARN_ON_ONCE(1);
+ if (!array) {
+ if (*addr & ~PAGE_MASK) {
+ *addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
+ } else {
+ int i;
+ for (i = 0; i < numpages; i++) {
+ if (addr[i] & ~PAGE_MASK) {
+ addr[i] &= PAGE_MASK;
+ WARN_ON_ONCE(1);
+ }
+ }
}
+ /* Must avoid aliasing mappings in the highmem code */
+ kmap_flush_unused();
+
+ vm_unmap_aliases();
+
cpa.vaddr = addr;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
- cpa.flushtlb = 0;
+ cpa.flags = 0;
+ cpa.curpage = 0;
cpa.force_split = force_split;
+ if (array)
+ cpa.flags |= CPA_ARRAY;
+
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -797,7 +812,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
/*
* Check whether we really changed something:
*/
- if (!cpa.flushtlb)
+ if (!(cpa.flags & CPA_FLUSHTLB))
goto out;
/*
@@ -812,27 +827,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
* error case we fall back to cpa_flush_all (which uses
* wbindv):
*/
- if (!ret && cpu_has_clflush)
- cpa_flush_range(addr, numpages, cache);
- else
+ if (!ret && cpu_has_clflush) {
+ if (cpa.flags & CPA_ARRAY)
+ cpa_flush_array(addr, numpages, cache);
+ else
+ cpa_flush_range(*addr, numpages, cache);
+ } else
cpa_flush_all(cache);
out:
- cpa_fill_pool(NULL);
-
return ret;
}
-static inline int change_page_attr_set(unsigned long addr, int numpages,
- pgprot_t mask)
+static inline int change_page_attr_set(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
{
- return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
+ array);
}
-static inline int change_page_attr_clear(unsigned long addr, int numpages,
- pgprot_t mask)
+static inline int change_page_attr_clear(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
{
- return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
+ array);
}
int _set_memory_uc(unsigned long addr, int numpages)
@@ -840,8 +858,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
- return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_CACHE_UC_MINUS));
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_CACHE_UC_MINUS), 0);
}
int set_memory_uc(unsigned long addr, int numpages)
@@ -857,10 +875,48 @@ int set_memory_uc(unsigned long addr, int numpages)
}
EXPORT_SYMBOL(set_memory_uc);
+int set_memory_array_uc(unsigned long *addr, int addrinarray)
+{
+ unsigned long start;
+ unsigned long end;
+ int i;
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ */
+ for (i = 0; i < addrinarray; i++) {
+ start = __pa(addr[i]);
+ for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
+ if (end != __pa(addr[i + 1]))
+ break;
+ i++;
+ }
+ if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
+ goto out;
+ }
+
+ return change_page_attr_set(addr, addrinarray,
+ __pgprot(_PAGE_CACHE_UC_MINUS), 1);
+out:
+ for (i = 0; i < addrinarray; i++) {
+ unsigned long tmp = __pa(addr[i]);
+
+ if (tmp == start)
+ break;
+ for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
+ if (end != __pa(addr[i + 1]))
+ break;
+ i++;
+ }
+ free_memtype(tmp, end);
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL(set_memory_array_uc);
+
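The inner loop coalesces physically contiguous array entries so that reserve_memtype() sees one span per run instead of one call per page. In isolation, for a run starting at index i (a fragment-level sketch):

start = __pa(addr[i]);
end = start + PAGE_SIZE;
while (i < addrinarray - 1 && __pa(addr[i + 1]) == end) {
	end += PAGE_SIZE;		/* next entry continues the run */
	i++;
}
/* one reservation covers the whole contiguous run */
err = reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL);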
int _set_memory_wc(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_CACHE_WC));
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_CACHE_WC), 0);
}
int set_memory_wc(unsigned long addr, int numpages)
@@ -878,8 +934,8 @@ EXPORT_SYMBOL(set_memory_wc);
int _set_memory_wb(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages,
- __pgprot(_PAGE_CACHE_MASK));
+ return change_page_attr_clear(&addr, numpages,
+ __pgprot(_PAGE_CACHE_MASK), 0);
}
int set_memory_wb(unsigned long addr, int numpages)
@@ -890,37 +946,59 @@ int set_memory_wb(unsigned long addr, int numpages)
}
EXPORT_SYMBOL(set_memory_wb);
+int set_memory_array_wb(unsigned long *addr, int addrinarray)
+{
+ int i;
+
+ for (i = 0; i < addrinarray; i++) {
+ unsigned long start = __pa(addr[i]);
+ unsigned long end;
+
+ for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
+ if (end != __pa(addr[i + 1]))
+ break;
+ i++;
+ }
+ free_memtype(start, end);
+ }
+ return change_page_attr_clear(addr, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK), 1);
+}
+EXPORT_SYMBOL(set_memory_array_wb);
+
int set_memory_x(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_x);
int set_memory_nx(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_nx);
int set_memory_ro(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
+EXPORT_SYMBOL_GPL(set_memory_ro);
int set_memory_rw(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
+EXPORT_SYMBOL_GPL(set_memory_rw);
int set_memory_np(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
int set_memory_4k(unsigned long addr, int numpages)
{
- return change_page_attr_set_clr(addr, numpages, __pgprot(0),
- __pgprot(0), 1);
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ __pgprot(0), 1, 0);
}
int set_pages_uc(struct page *page, int numpages)
@@ -973,22 +1051,38 @@ int set_pages_rw(struct page *page, int numpages)
static int __set_pages_p(struct page *page, int numpages)
{
- struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+ unsigned long tempaddr = (unsigned long) page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
- .mask_clr = __pgprot(0)};
+ .mask_clr = __pgprot(0),
+ .flags = 0};
- return __change_page_attr_set_clr(&cpa, 1);
+ /*
+ * No alias checking needed for setting the present flag. Otherwise,
+ * we may need to break up large pages for 64-bit kernel text
+ * mappings (which adds complexity, especially if we want to do
+ * this from atomic context). Let's keep it simple!
+ */
+ return __change_page_attr_set_clr(&cpa, 0);
}
static int __set_pages_np(struct page *page, int numpages)
{
- struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+ unsigned long tempaddr = (unsigned long) page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .flags = 0};
- return __change_page_attr_set_clr(&cpa, 1);
+ /*
+ * No alias checking needed for clearing the present flag. Otherwise,
+ * we may need to break up large pages for 64-bit kernel text
+ * mappings (which adds complexity, especially if we want to do
+ * this from atomic context). Let's keep it simple!
+ */
+ return __change_page_attr_set_clr(&cpa, 0);
}
void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1008,11 +1102,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
/*
* The return value is ignored as the calls cannot fail.
- * Large pages are kept enabled at boot time, and are
- * split up quickly with DEBUG_PAGEALLOC. If a splitup
- * fails here (due to temporary memory shortage) no damage
- * is done because we just keep the largepage intact up
- * to the next attempt when it will likely be split up:
+ * Large pages for identity mappings are not used at boot time,
+ * and hence there are no memory allocations during a large-page
+ * split.
*/
if (enable)
__set_pages_p(page, numpages);
@@ -1024,53 +1115,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
* but that can deadlock->flush only current cpu:
*/
__flush_tlb_all();
-
- /*
- * Try to refill the page pool here. We can do this only after
- * the tlb flush.
- */
- cpa_fill_pool(NULL);
}
-#ifdef CONFIG_DEBUG_FS
-static int dpa_show(struct seq_file *m, void *v)
-{
- seq_puts(m, "DEBUG_PAGEALLOC\n");
- seq_printf(m, "pool_size : %lu\n", pool_size);
- seq_printf(m, "pool_pages : %lu\n", pool_pages);
- seq_printf(m, "pool_low : %lu\n", pool_low);
- seq_printf(m, "pool_used : %lu\n", pool_used);
- seq_printf(m, "pool_failed : %lu\n", pool_failed);
-
- return 0;
-}
-
-static int dpa_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, dpa_show, NULL);
-}
-
-static const struct file_operations dpa_fops = {
- .open = dpa_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int __init debug_pagealloc_proc_init(void)
-{
- struct dentry *de;
-
- de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
- &dpa_fops);
- if (!de)
- return -ENOMEM;
-
- return 0;
-}
-__initcall(debug_pagealloc_proc_init);
-#endif
-
#ifdef CONFIG_HIBERNATION
bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2a50e0fa64a..738fd0f2495 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,24 +7,24 @@
* Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
*/
-#include <linux/mm.h>
+#include <linux/seq_file.h>
+#include <linux/bootmem.h>
+#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
+#include <linux/mm.h>
#include <linux/fs.h>
-#include <linux/bootmem.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <asm/msr.h>
-#include <asm/tlbflush.h>
+#include <asm/cacheflush.h>
#include <asm/processor.h>
-#include <asm/page.h>
+#include <asm/tlbflush.h>
#include <asm/pgtable.h>
-#include <asm/pat.h>
-#include <asm/e820.h>
-#include <asm/cacheflush.h>
#include <asm/fcntl.h>
+#include <asm/e820.h>
#include <asm/mtrr.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
#include <asm/io.h>
#ifdef CONFIG_X86_PAT
@@ -46,6 +46,7 @@ early_param("nopat", nopat);
static int debug_enable;
+
static int __init pat_debug_setup(char *str)
{
debug_enable = 1;
@@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags)
*/
struct memtype {
- u64 start;
- u64 end;
- unsigned long type;
- struct list_head nd;
+ u64 start;
+ u64 end;
+ unsigned long type;
+ struct list_head nd;
};
static LIST_HEAD(memtype_list);
-static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
/*
* Does intersection of PAT memory type and MTRR memory type and returns
@@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
return req_type;
}
-static int chk_conflict(struct memtype *new, struct memtype *entry,
- unsigned long *type)
+static int
+chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
{
if (new->type != entry->type) {
if (type) {
@@ -211,6 +212,66 @@ static struct memtype *cached_entry;
static u64 cached_start;
/*
+ * For RAM pages, mark the pages as non-WB memory type using
+ * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
+ * set_memory_wc() on a RAM page at a time before marking it as WB again.
+ * This is ok, because only one driver will own the page and
+ * will be doing the set_memory_*() calls on it.
+ *
+ * For now, we use PageNonWB to track that the RAM page is being mapped
+ * as non-WB. In the future, we will have to use one more flag
+ * (or some other mechanism in struct page) to distinguish between
+ * UC and WC mappings.
+ */
+static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
+ unsigned long *new_type)
+{
+ struct page *page;
+ u64 pfn, end_pfn;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ page = pfn_to_page(pfn);
+ if (page_mapped(page) || PageNonWB(page))
+ goto out;
+
+ SetPageNonWB(page);
+ }
+ return 0;
+
+out:
+ end_pfn = pfn;
+ for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
+ page = pfn_to_page(pfn);
+ ClearPageNonWB(page);
+ }
+
+ return -EINVAL;
+}
+
+static int free_ram_pages_type(u64 start, u64 end)
+{
+ struct page *page;
+ u64 pfn, end_pfn;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ page = pfn_to_page(pfn);
+ if (page_mapped(page) || !PageNonWB(page))
+ goto out;
+
+ ClearPageNonWB(page);
+ }
+ return 0;
+
+out:
+ end_pfn = pfn;
+ for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
+ page = pfn_to_page(pfn);
+ SetPageNonWB(page);
+ }
+ return -EINVAL;
+}
+
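Both helpers share the same mark-then-rewind shape: walk forward flipping the flag, and on the first conflict undo every page already touched, so the operation is all-or-nothing. The skeleton, as a sketch:

static int mark_all_or_none(u64 start_pfn, u64 end_pfn)
{
	u64 pfn;

	for (pfn = start_pfn; pfn < end_pfn; ++pfn) {
		struct page *page = pfn_to_page(pfn);

		if (page_mapped(page) || PageNonWB(page))
			goto rollback;
		SetPageNonWB(page);
	}
	return 0;

rollback:
	while (pfn-- > start_pfn)	/* the failing page was never set */
		ClearPageNonWB(pfn_to_page(pfn));
	return -EINVAL;
}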
+/*
* req_type typically has one of the:
* - _PAGE_CACHE_WB
* - _PAGE_CACHE_WC
@@ -226,14 +287,15 @@ static u64 cached_start;
* it will return a negative return value.
*/
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
- unsigned long *new_type)
+ unsigned long *new_type)
{
struct memtype *new, *entry;
unsigned long actual_type;
struct list_head *where;
+ int is_range_ram;
int err = 0;
- BUG_ON(start >= end); /* end is exclusive */
+ BUG_ON(start >= end); /* end is exclusive */
if (!pat_enabled) {
/* This is identical to page table setting without PAT */
@@ -266,17 +328,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
actual_type = _PAGE_CACHE_WB;
else
actual_type = _PAGE_CACHE_UC_MINUS;
- } else
+ } else {
actual_type = pat_x_mtrr_type(start, end,
req_type & _PAGE_CACHE_MASK);
+ }
+
+ is_range_ram = pagerange_is_ram(start, end);
+ if (is_range_ram == 1)
+ return reserve_ram_pages_type(start, end, req_type, new_type);
+ else if (is_range_ram < 0)
+ return -EINVAL;
new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
if (!new)
return -ENOMEM;
- new->start = start;
- new->end = end;
- new->type = actual_type;
+ new->start = start;
+ new->end = end;
+ new->type = actual_type;
if (new_type)
*new_type = actual_type;
@@ -335,6 +404,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
start, end, cattr_name(new->type), cattr_name(req_type));
kfree(new);
spin_unlock(&memtype_lock);
+
return err;
}
@@ -358,6 +428,7 @@ int free_memtype(u64 start, u64 end)
{
struct memtype *entry;
int err = -EINVAL;
+ int is_range_ram;
if (!pat_enabled)
return 0;
@@ -366,6 +437,12 @@ int free_memtype(u64 start, u64 end)
if (is_ISA_range(start, end - 1))
return 0;
+ is_range_ram = pagerange_is_ram(start, end);
+ if (is_range_ram == 1)
+ return free_ram_pages_type(start, end);
+ else if (is_range_ram < 0)
+ return -EINVAL;
+
spin_lock(&memtype_lock);
list_for_each_entry(entry, &memtype_list, nd) {
if (entry->start == start && entry->end == end) {
@@ -386,6 +463,7 @@ int free_memtype(u64 start, u64 end)
}
dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+
return err;
}
@@ -492,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
{
+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
u64 addr = (u64)pfn << PAGE_SHIFT;
unsigned long flags;
- unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
reserve_memtype(addr, addr + size, want_flags, &flags);
if (flags != want_flags) {
@@ -514,7 +592,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
free_memtype(addr, addr + size);
}
-#if defined(CONFIG_DEBUG_FS)
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
/* get Nth element of the linked list */
static struct memtype *memtype_get_idx(loff_t pos)
@@ -537,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
}
spin_unlock(&memtype_lock);
kfree(print_entry);
+
return NULL;
}
@@ -567,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
print_entry->start, print_entry->end);
kfree(print_entry);
+
return 0;
}
@@ -598,4 +678,4 @@ static int __init pat_memtype_list_init(void)
late_initcall(pat_memtype_list_init);
-#endif /* CONFIG_DEBUG_FS */
+#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index efa1911e20c..df3d5c861cd 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 };
static unsigned int mw64[] = { 0x89, 0x8B };
#endif /* not __i386__ */
-static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
- int *rexr)
+struct prefix_bits {
+ unsigned shorted:1;
+ unsigned enlarged:1;
+ unsigned rexr:1;
+ unsigned rex:1;
+};
+
+static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
{
int i;
unsigned char *p = addr;
- *shorted = 0;
- *enlarged = 0;
- *rexr = 0;
+ prf->shorted = 0;
+ prf->enlarged = 0;
+ prf->rexr = 0;
+ prf->rex = 0;
restart:
for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
if (*p == prefix_codes[i]) {
if (*p == 0x66)
- *shorted = 1;
+ prf->shorted = 1;
#ifdef __amd64__
if ((*p & 0xf8) == 0x48)
- *enlarged = 1;
+ prf->enlarged = 1;
if ((*p & 0xf4) == 0x44)
- *rexr = 1;
+ prf->rexr = 1;
+ if ((*p & 0xf0) == 0x40)
+ prf->rex = 1;
#endif
p++;
goto restart;
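The mask tests encode the x86-64 prefix layout; for a candidate prefix byte b (a hedged summary, not new kernel code):

int is_rex = (b & 0xf0) == 0x40;  /* 0x40..0x4f: any REX prefix       */
int rex_w  = (b & 0xf8) == 0x48;  /* REX.W (bit 3): 64-bit operand    */
int rex_r  = (b & 0xf4) == 0x44;  /* REX.R (bit 2): extends ModRM.reg */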
@@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
- int shorted, enlarged, rexr;
+ struct prefix_bits prf;
int i;
enum reason_type rv = OTHERS;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
@@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(rw8); i++)
@@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
for (i = 0; i < ARRAY_SIZE(rw32); i++)
if (rw32[i] == opcode)
- return (shorted ? 2 : (enlarged ? 8 : 4));
+ return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
return 0;
@@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(mw8); i++)
@@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
for (i = 0; i < ARRAY_SIZE(mw32); i++)
if (mw32[i] == opcode)
- return shorted ? 2 : 4;
+ return prf.shorted ? 2 : 4;
for (i = 0; i < ARRAY_SIZE(mw64); i++)
if (mw64[i] == opcode)
- return shorted ? 2 : (enlarged ? 8 : 4);
+ return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
return 0;
@@ -238,7 +249,7 @@ enum {
#endif
};
-static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
+static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
{
unsigned char *rv = NULL;
@@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
case arg_DL:
rv = (unsigned char *)&regs->dx;
break;
- case arg_AH:
- rv = 1 + (unsigned char *)&regs->ax;
- break;
- case arg_BH:
- rv = 1 + (unsigned char *)&regs->bx;
- break;
- case arg_CH:
- rv = 1 + (unsigned char *)&regs->cx;
- break;
- case arg_DH:
- rv = 1 + (unsigned char *)&regs->dx;
- break;
#ifdef __amd64__
case arg_R8:
rv = (unsigned char *)&regs->r8;
@@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
break;
#endif
default:
- printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
break;
}
+
+ if (rv)
+ return rv;
+
+ if (rex) {
+ /*
+ * If a REX prefix exists, access the low bytes of SI etc.
+ * instead of AH etc.
+ */
+ switch (no) {
+ case arg_SI:
+ rv = (unsigned char *)&regs->si;
+ break;
+ case arg_DI:
+ rv = (unsigned char *)&regs->di;
+ break;
+ case arg_BP:
+ rv = (unsigned char *)&regs->bp;
+ break;
+ case arg_SP:
+ rv = (unsigned char *)&regs->sp;
+ break;
+ default:
+ break;
+ }
+ } else {
+ switch (no) {
+ case arg_AH:
+ rv = 1 + (unsigned char *)&regs->ax;
+ break;
+ case arg_BH:
+ rv = 1 + (unsigned char *)&regs->bx;
+ break;
+ case arg_CH:
+ rv = 1 + (unsigned char *)&regs->cx;
+ break;
+ case arg_DH:
+ rv = 1 + (unsigned char *)&regs->dx;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!rv)
+ printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+
return rv;
}
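The rex argument matters because a REX prefix changes what byte-register numbers 4-7 mean: without REX they are AH/CH/DH/BH, with any REX prefix they become SPL/BPL/SIL/DIL, the low bytes of sp/bp/si/di. A hedged check for one case, assuming the register number follows the ModRM encoding:

/* byte-register number 4:
 *   no REX prefix  -> AH  (high byte of ax)
 *   any REX prefix -> SPL (low byte of sp)
 */
if (no == 4)
	return rex ? (unsigned char *)&regs->sp
		   : 1 + (unsigned char *)&regs->ax;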
@@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
unsigned char mod_rm;
int reg;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
unsigned long rv;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
if (reg_rop[i] == opcode) {
@@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
do_work:
mod_rm = *p;
- reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
+ reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
switch (get_ins_reg_width(ins_addr)) {
case 1:
- return *get_reg_w8(reg, regs);
+ return *get_reg_w8(reg, prf.rex, regs);
case 2:
return *(unsigned short *)get_reg_w32(reg, regs);
@@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
unsigned char mod_rm;
unsigned char mod;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
unsigned long rv;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
if (imm_wop[i] == opcode) {
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d50302774fe..86f2ffc43c3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
-static void pgd_ctor(void *p)
+static void pgd_ctor(pgd_t *pgd)
{
- pgd_t *pgd = p;
-
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
@@ -87,7 +85,7 @@ static void pgd_ctor(void *p)
pgd_list_add(pgd);
}
-static void pgd_dtor(void *pgd)
+static void pgd_dtor(pgd_t *pgd)
{
unsigned long flags; /* can be called from interrupt context */
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cab0abbd1eb..0951db9ee51 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -123,7 +123,8 @@ static int __init parse_vmalloc(char *arg)
if (!arg)
return -EINVAL;
- __VMALLOC_RESERVE = memparse(arg, &arg);
+ /* Add VMALLOC_OFFSET to the parsed value due to the vm area guard hole */
+ __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
return 0;
}
early_param("vmalloc", parse_vmalloc);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 1b4763e26ea..51c0a2fc14f 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
return;
}
- if (is_uv_system())
+ if (get_uv_system_type() >= UV_X2APIC)
apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
else
apic_id = pa->apic_id;
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index d877c5b423e..ab50a8d7402 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -3,6 +3,7 @@
*/
#include <linux/module.h>
#include <linux/io.h>
+#include <linux/mmiotrace.h>
#define MODULE_NAME "testmmiotrace"
@@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
static void do_write_test(void __iomem *p)
{
unsigned int i;
+ mmiotrace_printk("Write test.\n");
for (i = 0; i < 256; i++)
iowrite8(i, p + i);
for (i = 1024; i < (5 * 1024); i += 2)
@@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p)
static void do_read_test(void __iomem *p)
{
unsigned int i;
+ mmiotrace_printk("Read test.\n");
for (i = 0; i < 256; i++)
ioread8(p + i);
for (i = 1024; i < (5 * 1024); i += 2)
@@ -39,6 +42,7 @@ static void do_test(void)
pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
return;
}
+ mmiotrace_printk("ioremap returned %p.\n", p);
do_write_test(p);
do_read_test(p);
iounmap(p);
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
index 30f3eb36666..446902b2a6b 100644
--- a/arch/x86/oprofile/Makefile
+++ b/arch/x86/oprofile/Makefile
@@ -7,6 +7,6 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
timer_int.o )
oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
-oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \
+oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
op_model_ppro.o op_model_p4.o
oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index e2095cba409..04df67f8a7b 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -52,8 +52,7 @@ struct frame_head {
unsigned long ret;
} __attribute__((packed));
-static struct frame_head *
-dump_user_backtrace(struct frame_head * head)
+static struct frame_head *dump_user_backtrace(struct frame_head *head)
{
struct frame_head bufhead[2];
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 0227694f7da..022cd41ea9b 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -1,10 +1,11 @@
/**
* @file nmi_int.c
*
- * @remark Copyright 2002 OProfile authors
+ * @remark Copyright 2002-2008 OProfile authors
* @remark Read the file COPYING
*
* @author John Levon <levon@movementarian.org>
+ * @author Robert Richter <robert.richter@amd.com>
*/
#include <linux/init.h>
@@ -27,85 +28,9 @@ static struct op_x86_model_spec const *model;
static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
-static int nmi_start(void);
-static void nmi_stop(void);
-static void nmi_cpu_start(void *dummy);
-static void nmi_cpu_stop(void *dummy);
-
/* 0 == registered but off, 1 == registered and on */
static int nmi_enabled = 0;
-#ifdef CONFIG_SMP
-static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
- void *data)
-{
- int cpu = (unsigned long)data;
- switch (action) {
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
- break;
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block oprofile_cpu_nb = {
- .notifier_call = oprofile_cpu_notifier
-};
-#endif
-
-#ifdef CONFIG_PM
-
-static int nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
- /* Only one CPU left, just stop that one */
- if (nmi_enabled == 1)
- nmi_cpu_stop(NULL);
- return 0;
-}
-
-static int nmi_resume(struct sys_device *dev)
-{
- if (nmi_enabled == 1)
- nmi_cpu_start(NULL);
- return 0;
-}
-
-static struct sysdev_class oprofile_sysclass = {
- .name = "oprofile",
- .resume = nmi_resume,
- .suspend = nmi_suspend,
-};
-
-static struct sys_device device_oprofile = {
- .id = 0,
- .cls = &oprofile_sysclass,
-};
-
-static int __init init_sysfs(void)
-{
- int error;
-
- error = sysdev_class_register(&oprofile_sysclass);
- if (!error)
- error = sysdev_register(&device_oprofile);
- return error;
-}
-
-static void exit_sysfs(void)
-{
- sysdev_unregister(&device_oprofile);
- sysdev_class_unregister(&oprofile_sysclass);
-}
-
-#else
-#define init_sysfs() do { } while (0)
-#define exit_sysfs() do { } while (0)
-#endif /* CONFIG_PM */
-
static int profile_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data)
{
@@ -295,10 +220,12 @@ static void nmi_cpu_shutdown(void *dummy)
static void nmi_shutdown(void)
{
- struct op_msrs *msrs = &get_cpu_var(cpu_msrs);
+ struct op_msrs *msrs;
+
nmi_enabled = 0;
on_each_cpu(nmi_cpu_shutdown, NULL, 1);
unregister_die_notifier(&profile_exceptions_nb);
+ msrs = &get_cpu_var(cpu_msrs);
model->shutdown(msrs);
free_msrs();
put_cpu_var(cpu_msrs);
@@ -358,6 +285,77 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
return 0;
}
+#ifdef CONFIG_SMP
+static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
+ void *data)
+{
+ int cpu = (unsigned long)data;
+ switch (action) {
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
+ break;
+ case CPU_DOWN_PREPARE:
+ smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block oprofile_cpu_nb = {
+ .notifier_call = oprofile_cpu_notifier
+};
+#endif
+
+#ifdef CONFIG_PM
+
+static int nmi_suspend(struct sys_device *dev, pm_message_t state)
+{
+ /* Only one CPU left, just stop that one */
+ if (nmi_enabled == 1)
+ nmi_cpu_stop(NULL);
+ return 0;
+}
+
+static int nmi_resume(struct sys_device *dev)
+{
+ if (nmi_enabled == 1)
+ nmi_cpu_start(NULL);
+ return 0;
+}
+
+static struct sysdev_class oprofile_sysclass = {
+ .name = "oprofile",
+ .resume = nmi_resume,
+ .suspend = nmi_suspend,
+};
+
+static struct sys_device device_oprofile = {
+ .id = 0,
+ .cls = &oprofile_sysclass,
+};
+
+static int __init init_sysfs(void)
+{
+ int error;
+
+ error = sysdev_class_register(&oprofile_sysclass);
+ if (!error)
+ error = sysdev_register(&device_oprofile);
+ return error;
+}
+
+static void exit_sysfs(void)
+{
+ sysdev_unregister(&device_oprofile);
+ sysdev_class_unregister(&oprofile_sysclass);
+}
+
+#else
+#define init_sysfs() do { } while (0)
+#define exit_sysfs() do { } while (0)
+#endif /* CONFIG_PM */
+
static int p4force;
module_param(p4force, int, 0);
@@ -417,9 +415,6 @@ static int __init ppro_init(char **cpu_type)
case 15: case 23:
*cpu_type = "i386/core_2";
break;
- case 26:
- *cpu_type = "i386/core_2";
- break;
default:
/* Unknown */
return 0;
@@ -429,6 +424,16 @@ static int __init ppro_init(char **cpu_type)
return 1;
}
+static int __init arch_perfmon_init(char **cpu_type)
+{
+ if (!cpu_has_arch_perfmon)
+ return 0;
+ *cpu_type = "i386/arch_perfmon";
+ model = &op_arch_perfmon_spec;
+ arch_perfmon_setup_counters();
+ return 1;
+}
+
/* in order to get sysfs right */
static int using_nmi;
@@ -436,7 +441,8 @@ int __init op_nmi_init(struct oprofile_operations *ops)
{
__u8 vendor = boot_cpu_data.x86_vendor;
__u8 family = boot_cpu_data.x86;
- char *cpu_type;
+ char *cpu_type = NULL;
+ int ret = 0;
if (!cpu_has_apic)
return -ENODEV;
@@ -449,19 +455,23 @@ int __init op_nmi_init(struct oprofile_operations *ops)
default:
return -ENODEV;
case 6:
- model = &op_athlon_spec;
+ model = &op_amd_spec;
cpu_type = "i386/athlon";
break;
case 0xf:
- model = &op_athlon_spec;
+ model = &op_amd_spec;
/* Actually it could be i386/hammer too, but give
user space a consistent name. */
cpu_type = "x86-64/hammer";
break;
case 0x10:
- model = &op_athlon_spec;
+ model = &op_amd_spec;
cpu_type = "x86-64/family10";
break;
+ case 0x11:
+ model = &op_amd_spec;
+ cpu_type = "x86-64/family11h";
+ break;
}
break;
@@ -469,36 +479,44 @@ int __init op_nmi_init(struct oprofile_operations *ops)
switch (family) {
/* Pentium IV */
case 0xf:
- if (!p4_init(&cpu_type))
- return -ENODEV;
+ p4_init(&cpu_type);
break;
/* A P6-class processor */
case 6:
- if (!ppro_init(&cpu_type))
- return -ENODEV;
+ ppro_init(&cpu_type);
break;
default:
- return -ENODEV;
+ break;
}
+
+ if (!cpu_type && !arch_perfmon_init(&cpu_type))
+ return -ENODEV;
break;
default:
return -ENODEV;
}
- init_sysfs();
#ifdef CONFIG_SMP
register_cpu_notifier(&oprofile_cpu_nb);
#endif
- using_nmi = 1;
+ /* default values, can be overridden by the model */
ops->create_files = nmi_create_files;
ops->setup = nmi_setup;
ops->shutdown = nmi_shutdown;
ops->start = nmi_start;
ops->stop = nmi_stop;
ops->cpu_type = cpu_type;
+
+ if (model->init)
+ ret = model->init(ops);
+ if (ret)
+ return ret;
+
+ init_sysfs();
+ using_nmi = 1;
printk(KERN_INFO "oprofile: using NMI interrupt.\n");
return 0;
}
@@ -511,4 +529,6 @@ void op_nmi_exit(void)
unregister_cpu_notifier(&oprofile_cpu_nb);
#endif
}
+ if (model->exit)
+ model->exit();
}
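With the new init/exit hooks, a model can override the default oprofile_operations installed in op_nmi_init() above; the AMD model uses this to hook IBS file creation. A minimal sketch of a model wiring the hooks up (the op_dummy_* names are hypothetical):

static int op_dummy_init(struct oprofile_operations *ops)
{
	/* runs after the defaults are set, so any override here wins */
	return 0;	/* a non-zero return makes op_nmi_init() fail */
}

static void op_dummy_exit(void)
{
	/* undo whatever init set up */
}

static struct op_x86_model_spec const op_dummy_spec = {
	.init	= op_dummy_init,
	.exit	= op_dummy_exit,
	/* counter callbacks as in op_amd_spec et al. */
};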
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index 2880b15c467..91b6a116165 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -6,22 +6,22 @@
*
* @author John Levon
*/
-
+
#ifndef OP_COUNTER_H
#define OP_COUNTER_H
-
+
#define OP_MAX_COUNTER 8
-
+
/* Per-perfctr configuration as set via
* oprofilefs.
*/
struct op_counter_config {
- unsigned long count;
- unsigned long enabled;
- unsigned long event;
- unsigned long kernel;
- unsigned long user;
- unsigned long unit_mask;
+ unsigned long count;
+ unsigned long enabled;
+ unsigned long event;
+ unsigned long kernel;
+ unsigned long user;
+ unsigned long unit_mask;
};
extern struct op_counter_config counter_config[];
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
new file mode 100644
index 00000000000..509513760a6
--- /dev/null
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -0,0 +1,546 @@
+/*
+ * @file op_model_amd.c
+ * athlon / K7 / K8 / Family 10h model-specific MSR operations
+ *
+ * @remark Copyright 2002-2008 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon
+ * @author Philippe Elie
+ * @author Graydon Hoare
+ * @author Robert Richter <robert.richter@amd.com>
+ * @author Barry Kasindorf
+ */
+
+#include <linux/oprofile.h>
+#include <linux/device.h>
+#include <linux/pci.h>
+
+#include <asm/ptrace.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+
+#include "op_x86_model.h"
+#include "op_counter.h"
+
+#define NUM_COUNTERS 4
+#define NUM_CONTROLS 4
+
+#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
+#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
+#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
+#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
+
+#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
+#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
+#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
+#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
+#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
+#define CTRL_CLEAR_LO(x) (x &= (1<<21))
+#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
+#define CTRL_SET_ENABLE(val) (val |= 1<<20)
+#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
+#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
+#define CTRL_SET_UM(val, m) (val |= (m << 8))
+#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
+#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
+#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
+#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
+
+static unsigned long reset_value[NUM_COUNTERS];
+
+#ifdef CONFIG_OPROFILE_IBS
+
+/* IbsFetchCtl bits/masks */
+#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */
+#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */
+#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */
+
+/* IbsOpCtl bits */
+#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */
+#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */
+
+/* Codes used in cpu_buffer.c */
+/* This produces duplicate code and needs to be fixed */
+#define IBS_FETCH_BEGIN 3
+#define IBS_OP_BEGIN 4
+
+/* The function interface needs to be fixed, something like an add-data
+   call. It should then be added to linux/oprofile.h. */
+extern void
+oprofile_add_ibs_sample(struct pt_regs *const regs,
+ unsigned int *const ibs_sample, int ibs_code);
+
+struct ibs_fetch_sample {
+ /* MSRC001_1031 IBS Fetch Linear Address Register */
+ unsigned int ibs_fetch_lin_addr_low;
+ unsigned int ibs_fetch_lin_addr_high;
+ /* MSRC001_1030 IBS Fetch Control Register */
+ unsigned int ibs_fetch_ctl_low;
+ unsigned int ibs_fetch_ctl_high;
+ /* MSRC001_1032 IBS Fetch Physical Address Register */
+ unsigned int ibs_fetch_phys_addr_low;
+ unsigned int ibs_fetch_phys_addr_high;
+};
+
+struct ibs_op_sample {
+ /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */
+ unsigned int ibs_op_rip_low;
+ unsigned int ibs_op_rip_high;
+ /* MSRC001_1035 IBS Op Data Register */
+ unsigned int ibs_op_data1_low;
+ unsigned int ibs_op_data1_high;
+ /* MSRC001_1036 IBS Op Data 2 Register */
+ unsigned int ibs_op_data2_low;
+ unsigned int ibs_op_data2_high;
+ /* MSRC001_1037 IBS Op Data 3 Register */
+ unsigned int ibs_op_data3_low;
+ unsigned int ibs_op_data3_high;
+ /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */
+ unsigned int ibs_dc_linear_low;
+ unsigned int ibs_dc_linear_high;
+ /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */
+ unsigned int ibs_dc_phys_low;
+ unsigned int ibs_dc_phys_high;
+};
+
+/*
+ * uninitialize the APIC for the IBS interrupts if needed on AMD Family10h+
+ */
+static void clear_ibs_nmi(void);
+
+static int ibs_allowed; /* AMD Family10h and later */
+
+struct op_ibs_config {
+ unsigned long op_enabled;
+ unsigned long fetch_enabled;
+ unsigned long max_cnt_fetch;
+ unsigned long max_cnt_op;
+ unsigned long rand_en;
+ unsigned long dispatched_ops;
+};
+
+static struct op_ibs_config ibs_config;
+
+#endif
+
+/* functions for op_amd_spec */
+
+static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
+{
+ int i;
+
+ for (i = 0; i < NUM_COUNTERS; i++) {
+ if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
+ msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+ else
+ msrs->counters[i].addr = 0;
+ }
+
+ for (i = 0; i < NUM_CONTROLS; i++) {
+ if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
+ msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+ else
+ msrs->controls[i].addr = 0;
+ }
+}
+
+
+static void op_amd_setup_ctrs(struct op_msrs const * const msrs)
+{
+ unsigned int low, high;
+ int i;
+
+ /* clear all counters */
+ for (i = 0 ; i < NUM_CONTROLS; ++i) {
+ if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
+ continue;
+ CTRL_READ(low, high, msrs, i);
+ CTRL_CLEAR_LO(low);
+ CTRL_CLEAR_HI(high);
+ CTRL_WRITE(low, high, msrs, i);
+ }
+
+ /* avoid a false detection of ctr overflows in NMI handler */
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ if (unlikely(!CTR_IS_RESERVED(msrs, i)))
+ continue;
+ CTR_WRITE(1, msrs, i);
+ }
+
+ /* enable active counters */
+ for (i = 0; i < NUM_COUNTERS; ++i) {
+ if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
+ reset_value[i] = counter_config[i].count;
+
+ CTR_WRITE(counter_config[i].count, msrs, i);
+
+ CTRL_READ(low, high, msrs, i);
+ CTRL_CLEAR_LO(low);
+ CTRL_CLEAR_HI(high);
+ CTRL_SET_ENABLE(low);
+ CTRL_SET_USR(low, counter_config[i].user);
+ CTRL_SET_KERN(low, counter_config[i].kernel);
+ CTRL_SET_UM(low, counter_config[i].unit_mask);
+ CTRL_SET_EVENT_LOW(low, counter_config[i].event);
+ CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
+ CTRL_SET_HOST_ONLY(high, 0);
+ CTRL_SET_GUEST_ONLY(high, 0);
+
+ CTRL_WRITE(low, high, msrs, i);
+ } else {
+ reset_value[i] = 0;
+ }
+ }
+}
+
+#ifdef CONFIG_OPROFILE_IBS
+
+static inline int
+op_amd_handle_ibs(struct pt_regs * const regs,
+ struct op_msrs const * const msrs)
+{
+ unsigned int low, high;
+ struct ibs_fetch_sample ibs_fetch;
+ struct ibs_op_sample ibs_op;
+
+ if (!ibs_allowed)
+ return 1;
+
+ if (ibs_config.fetch_enabled) {
+ rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
+ if (high & IBS_FETCH_HIGH_VALID_BIT) {
+ ibs_fetch.ibs_fetch_ctl_high = high;
+ ibs_fetch.ibs_fetch_ctl_low = low;
+ rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high);
+ ibs_fetch.ibs_fetch_lin_addr_high = high;
+ ibs_fetch.ibs_fetch_lin_addr_low = low;
+ rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high);
+ ibs_fetch.ibs_fetch_phys_addr_high = high;
+ ibs_fetch.ibs_fetch_phys_addr_low = low;
+
+ oprofile_add_ibs_sample(regs,
+ (unsigned int *)&ibs_fetch,
+ IBS_FETCH_BEGIN);
+
+ /* re-enable the IRQ */
+ rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
+ high &= ~IBS_FETCH_HIGH_VALID_BIT;
+ high |= IBS_FETCH_HIGH_ENABLE;
+ low &= IBS_FETCH_LOW_MAX_CNT_MASK;
+ wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
+ }
+ }
+
+ if (ibs_config.op_enabled) {
+ rdmsr(MSR_AMD64_IBSOPCTL, low, high);
+ if (low & IBS_OP_LOW_VALID_BIT) {
+ rdmsr(MSR_AMD64_IBSOPRIP, low, high);
+ ibs_op.ibs_op_rip_low = low;
+ ibs_op.ibs_op_rip_high = high;
+ rdmsr(MSR_AMD64_IBSOPDATA, low, high);
+ ibs_op.ibs_op_data1_low = low;
+ ibs_op.ibs_op_data1_high = high;
+ rdmsr(MSR_AMD64_IBSOPDATA2, low, high);
+ ibs_op.ibs_op_data2_low = low;
+ ibs_op.ibs_op_data2_high = high;
+ rdmsr(MSR_AMD64_IBSOPDATA3, low, high);
+ ibs_op.ibs_op_data3_low = low;
+ ibs_op.ibs_op_data3_high = high;
+ rdmsr(MSR_AMD64_IBSDCLINAD, low, high);
+ ibs_op.ibs_dc_linear_low = low;
+ ibs_op.ibs_dc_linear_high = high;
+ rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high);
+ ibs_op.ibs_dc_phys_low = low;
+ ibs_op.ibs_dc_phys_high = high;
+
+ /* re-enable the IRQ */
+ oprofile_add_ibs_sample(regs,
+ (unsigned int *)&ibs_op,
+ IBS_OP_BEGIN);
+ rdmsr(MSR_AMD64_IBSOPCTL, low, high);
+ high = 0;
+ low &= ~IBS_OP_LOW_VALID_BIT;
+ low |= IBS_OP_LOW_ENABLE;
+ wrmsr(MSR_AMD64_IBSOPCTL, low, high);
+ }
+ }
+
+ return 1;
+}
+
+#endif
+
+static int op_amd_check_ctrs(struct pt_regs * const regs,
+ struct op_msrs const * const msrs)
+{
+ unsigned int low, high;
+ int i;
+
+ for (i = 0 ; i < NUM_COUNTERS; ++i) {
+ if (!reset_value[i])
+ continue;
+ CTR_READ(low, high, msrs, i);
+ if (CTR_OVERFLOWED(low)) {
+ oprofile_add_sample(regs, i);
+ CTR_WRITE(reset_value[i], msrs, i);
+ }
+ }
+
+#ifdef CONFIG_OPROFILE_IBS
+ op_amd_handle_ibs(regs, msrs);
+#endif
+
+ /* See op_model_ppro.c */
+ return 1;
+}
+
+static void op_amd_start(struct op_msrs const * const msrs)
+{
+ unsigned int low, high;
+ int i;
+ for (i = 0 ; i < NUM_COUNTERS ; ++i) {
+ if (reset_value[i]) {
+ CTRL_READ(low, high, msrs, i);
+ CTRL_SET_ACTIVE(low);
+ CTRL_WRITE(low, high, msrs, i);
+ }
+ }
+
+#ifdef CONFIG_OPROFILE_IBS
+ if (ibs_allowed && ibs_config.fetch_enabled) {
+ low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
+ high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */
+ + IBS_FETCH_HIGH_ENABLE;
+ wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
+ }
+
+ if (ibs_allowed && ibs_config.op_enabled) {
+ low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF)
+ + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */
+ + IBS_OP_LOW_ENABLE;
+ high = 0;
+ wrmsr(MSR_AMD64_IBSOPCTL, low, high);
+ }
+#endif
+}
+
+
+static void op_amd_stop(struct op_msrs const * const msrs)
+{
+ unsigned int low, high;
+ int i;
+
+ /* Subtle: stop on all counters to avoid race with
+ * setting our pm callback */
+ for (i = 0 ; i < NUM_COUNTERS ; ++i) {
+ if (!reset_value[i])
+ continue;
+ CTRL_READ(low, high, msrs, i);
+ CTRL_SET_INACTIVE(low);
+ CTRL_WRITE(low, high, msrs, i);
+ }
+
+#ifdef CONFIG_OPROFILE_IBS
+ if (ibs_allowed && ibs_config.fetch_enabled) {
+ low = 0; /* clear max count and enable */
+ high = 0;
+ wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
+ }
+
+ if (ibs_allowed && ibs_config.op_enabled) {
+ low = 0; /* clear max count and enable */
+ high = 0;
+ wrmsr(MSR_AMD64_IBSOPCTL, low, high);
+ }
+#endif
+}
+
+static void op_amd_shutdown(struct op_msrs const * const msrs)
+{
+ int i;
+
+ for (i = 0 ; i < NUM_COUNTERS ; ++i) {
+ if (CTR_IS_RESERVED(msrs, i))
+ release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+ }
+ for (i = 0 ; i < NUM_CONTROLS ; ++i) {
+ if (CTRL_IS_RESERVED(msrs, i))
+ release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+ }
+}
+
+#ifndef CONFIG_OPROFILE_IBS
+
+/* no IBS support */
+
+static int op_amd_init(struct oprofile_operations *ops)
+{
+ return 0;
+}
+
+static void op_amd_exit(void) {}
+
+#else
+
+static u8 ibs_eilvt_off;
+
+static inline void apic_init_ibs_nmi_per_cpu(void *arg)
+{
+ ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
+}
+
+static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
+{
+ setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
+}
+
+static int pfm_amd64_setup_eilvt(void)
+{
+#define IBSCTL_LVTOFFSETVAL (1 << 8)
+#define IBSCTL 0x1cc
+ struct pci_dev *cpu_cfg;
+ int nodes;
+ u32 value = 0;
+
+ /* per CPU setup */
+ on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
+
+ nodes = 0;
+ cpu_cfg = NULL;
+ do {
+ cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+ PCI_DEVICE_ID_AMD_10H_NB_MISC,
+ cpu_cfg);
+ if (!cpu_cfg)
+ break;
+ ++nodes;
+ pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+ | IBSCTL_LVTOFFSETVAL);
+ pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+ if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
+ printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
+ "IBSCTL = 0x%08x", value);
+ return 1;
+ }
+ } while (1);
+
+ if (!nodes) {
+ printk(KERN_DEBUG "No CPU node configured for IBS");
+ return 1;
+ }
+
+#ifdef CONFIG_NUMA
+ /* Sanity check */
+ /* Works only on 64-bit with a proper NUMA implementation. */
+ if (nodes != num_possible_nodes()) {
+ printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, "
+ "found: %d, expected %d",
+ nodes, num_possible_nodes());
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/*
+ * initialize the APIC for the IBS interrupts
+ * if available (AMD Family10h rev B0 and later)
+ */
+static void setup_ibs(void)
+{
+ ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
+
+ if (!ibs_allowed)
+ return;
+
+ if (pfm_amd64_setup_eilvt()) {
+ ibs_allowed = 0;
+ return;
+ }
+
+ printk(KERN_INFO "oprofile: AMD IBS detected\n");
+}
+
+
+/*
+ * uninitialize the APIC for the IBS interrupts if needed on AMD Family10h
+ * rev B0 and later */
+static void clear_ibs_nmi(void)
+{
+ if (ibs_allowed)
+ on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+}
+
+static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
+
+static int setup_ibs_files(struct super_block *sb, struct dentry *root)
+{
+ struct dentry *dir;
+ int ret = 0;
+
+ /* architecture specific files */
+ if (create_arch_files)
+ ret = create_arch_files(sb, root);
+
+ if (ret)
+ return ret;
+
+ if (!ibs_allowed)
+ return ret;
+
+ /* model specific files */
+
+ /* setup some reasonable defaults */
+ ibs_config.max_cnt_fetch = 250000;
+ ibs_config.fetch_enabled = 0;
+ ibs_config.max_cnt_op = 250000;
+ ibs_config.op_enabled = 0;
+ ibs_config.dispatched_ops = 1;
+
+ dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
+ oprofilefs_create_ulong(sb, dir, "enable",
+ &ibs_config.fetch_enabled);
+ oprofilefs_create_ulong(sb, dir, "max_count",
+ &ibs_config.max_cnt_fetch);
+ oprofilefs_create_ulong(sb, dir, "rand_enable",
+ &ibs_config.rand_en);
+
+ dir = oprofilefs_mkdir(sb, root, "ibs_op");
+ oprofilefs_create_ulong(sb, dir, "enable",
+ &ibs_config.op_enabled);
+ oprofilefs_create_ulong(sb, dir, "max_count",
+ &ibs_config.max_cnt_op);
+ oprofilefs_create_ulong(sb, dir, "dispatched_ops",
+ &ibs_config.dispatched_ops);
+
+ return 0;
+}
+
+static int op_amd_init(struct oprofile_operations *ops)
+{
+ setup_ibs();
+ create_arch_files = ops->create_files;
+ ops->create_files = setup_ibs_files;
+ return 0;
+}
+
+static void op_amd_exit(void)
+{
+ clear_ibs_nmi();
+}
+
+#endif
+
+struct op_x86_model_spec const op_amd_spec = {
+ .init = op_amd_init,
+ .exit = op_amd_exit,
+ .num_counters = NUM_COUNTERS,
+ .num_controls = NUM_CONTROLS,
+ .fill_in_addresses = &op_amd_fill_in_addresses,
+ .setup_ctrs = &op_amd_setup_ctrs,
+ .check_ctrs = &op_amd_check_ctrs,
+ .start = &op_amd_start,
+ .stop = &op_amd_stop,
+ .shutdown = &op_amd_shutdown
+};
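One subtlety in op_amd_start() above: the IbsFetchCtl MaxCnt field counts in units of 16 fetches, hence the right shift by four before masking the value into the low dword. A hedged sketch of that encoding, mirroring the IBS_FETCH_* constants defined in this file:

/* Encode the IbsFetchCtl low/high dwords from the user-visible config. */
static inline unsigned int ibs_fetch_low(unsigned long max_cnt)
{
	return (max_cnt >> 4) & 0xFFFF;		/* MaxCnt, in units of 16 */
}

static inline unsigned int ibs_fetch_high(unsigned long rand_en)
{
	return ((rand_en & 0x1) << 25)		/* IbsRandEn (bit 57) */
	     | (1UL << 16);			/* IbsFetchEn (bit 48) */
}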
diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
deleted file mode 100644
index 3d534879a9d..00000000000
--- a/arch/x86/oprofile/op_model_athlon.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * @file op_model_athlon.h
- * athlon / K7 / K8 / Family 10h model-specific MSR operations
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- * @author Philippe Elie
- * @author Graydon Hoare
- */
-
-#include <linux/oprofile.h>
-#include <asm/ptrace.h>
-#include <asm/msr.h>
-#include <asm/nmi.h>
-
-#include "op_x86_model.h"
-#include "op_counter.h"
-
-#define NUM_COUNTERS 4
-#define NUM_CONTROLS 4
-
-#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
-#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
-#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
-
-#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
-#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
-#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
-#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
-#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
-#define CTRL_CLEAR_LO(x) (x &= (1<<21))
-#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
-#define CTRL_SET_ENABLE(val) (val |= 1<<20)
-#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
-#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
-#define CTRL_SET_UM(val, m) (val |= (m << 8))
-#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
-#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
-#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
-#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
-
-static unsigned long reset_value[NUM_COUNTERS];
-
-static void athlon_fill_in_addresses(struct op_msrs * const msrs)
-{
- int i;
-
- for (i = 0; i < NUM_COUNTERS; i++) {
- if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
- msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
- else
- msrs->counters[i].addr = 0;
- }
-
- for (i = 0; i < NUM_CONTROLS; i++) {
- if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
- msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
- else
- msrs->controls[i].addr = 0;
- }
-}
-
-
-static void athlon_setup_ctrs(struct op_msrs const * const msrs)
-{
- unsigned int low, high;
- int i;
-
- /* clear all counters */
- for (i = 0 ; i < NUM_CONTROLS; ++i) {
- if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
- continue;
- CTRL_READ(low, high, msrs, i);
- CTRL_CLEAR_LO(low);
- CTRL_CLEAR_HI(high);
- CTRL_WRITE(low, high, msrs, i);
- }
-
- /* avoid a false detection of ctr overflows in NMI handler */
- for (i = 0; i < NUM_COUNTERS; ++i) {
- if (unlikely(!CTR_IS_RESERVED(msrs, i)))
- continue;
- CTR_WRITE(1, msrs, i);
- }
-
- /* enable active counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
- if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
- reset_value[i] = counter_config[i].count;
-
- CTR_WRITE(counter_config[i].count, msrs, i);
-
- CTRL_READ(low, high, msrs, i);
- CTRL_CLEAR_LO(low);
- CTRL_CLEAR_HI(high);
- CTRL_SET_ENABLE(low);
- CTRL_SET_USR(low, counter_config[i].user);
- CTRL_SET_KERN(low, counter_config[i].kernel);
- CTRL_SET_UM(low, counter_config[i].unit_mask);
- CTRL_SET_EVENT_LOW(low, counter_config[i].event);
- CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
- CTRL_SET_HOST_ONLY(high, 0);
- CTRL_SET_GUEST_ONLY(high, 0);
-
- CTRL_WRITE(low, high, msrs, i);
- } else {
- reset_value[i] = 0;
- }
- }
-}
-
-
-static int athlon_check_ctrs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- unsigned int low, high;
- int i;
-
- for (i = 0 ; i < NUM_COUNTERS; ++i) {
- if (!reset_value[i])
- continue;
- CTR_READ(low, high, msrs, i);
- if (CTR_OVERFLOWED(low)) {
- oprofile_add_sample(regs, i);
- CTR_WRITE(reset_value[i], msrs, i);
- }
- }
-
- /* See op_model_ppro.c */
- return 1;
-}
-
-
-static void athlon_start(struct op_msrs const * const msrs)
-{
- unsigned int low, high;
- int i;
- for (i = 0 ; i < NUM_COUNTERS ; ++i) {
- if (reset_value[i]) {
- CTRL_READ(low, high, msrs, i);
- CTRL_SET_ACTIVE(low);
- CTRL_WRITE(low, high, msrs, i);
- }
- }
-}
-
-
-static void athlon_stop(struct op_msrs const * const msrs)
-{
- unsigned int low, high;
- int i;
-
- /* Subtle: stop on all counters to avoid race with
- * setting our pm callback */
- for (i = 0 ; i < NUM_COUNTERS ; ++i) {
- if (!reset_value[i])
- continue;
- CTRL_READ(low, high, msrs, i);
- CTRL_SET_INACTIVE(low);
- CTRL_WRITE(low, high, msrs, i);
- }
-}
-
-static void athlon_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0 ; i < NUM_COUNTERS ; ++i) {
- if (CTR_IS_RESERVED(msrs, i))
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- }
- for (i = 0 ; i < NUM_CONTROLS ; ++i) {
- if (CTRL_IS_RESERVED(msrs, i))
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-}
-
-struct op_x86_model_spec const op_athlon_spec = {
- .num_counters = NUM_COUNTERS,
- .num_controls = NUM_CONTROLS,
- .fill_in_addresses = &athlon_fill_in_addresses,
- .setup_ctrs = &athlon_setup_ctrs,
- .check_ctrs = &athlon_check_ctrs,
- .start = &athlon_start,
- .stop = &athlon_stop,
- .shutdown = &athlon_shutdown
-};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 56b4757a1f4..4c4a51c90bc 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -10,11 +10,12 @@
#include <linux/oprofile.h>
#include <linux/smp.h>
+#include <linux/ptrace.h>
+#include <linux/nmi.h>
#include <asm/msr.h>
-#include <asm/ptrace.h>
#include <asm/fixmap.h>
#include <asm/apic.h>
-#include <asm/nmi.h>
+
#include "op_x86_model.h"
#include "op_counter.h"
@@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT;
static inline void setup_num_counters(void)
{
#ifdef CONFIG_SMP
- if (smp_num_siblings == 2){
+ if (smp_num_siblings == 2) {
num_counters = NUM_COUNTERS_HT2;
num_controls = NUM_CONTROLS_HT2;
}
@@ -86,7 +87,7 @@ struct p4_event_binding {
#define CTR_FLAME_2 (1 << 6)
#define CTR_IQ_5 (1 << 7)
-static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
+static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
{ CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
{ CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
{ CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
@@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
{ CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
};
-#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT
+#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
/* p4 event codes in libop/op_event.h are indices into this table. */
static struct p4_event_binding p4_events[NUM_EVENTS] = {
-
+
{ /* BRANCH_RETIRED */
- 0x05, 0x06,
+ 0x05, 0x06,
{ {CTR_IQ_4, MSR_P4_CRU_ESCR2},
{CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
-
+
{ /* MISPRED_BRANCH_RETIRED */
- 0x04, 0x03,
+ 0x04, 0x03,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR0},
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
-
+
{ /* TC_DELIVER_MODE */
0x01, 0x01,
- { { CTR_MS_0, MSR_P4_TC_ESCR0},
+ { { CTR_MS_0, MSR_P4_TC_ESCR0},
{ CTR_MS_2, MSR_P4_TC_ESCR1} }
},
-
+
{ /* BPU_FETCH_REQUEST */
- 0x00, 0x03,
+ 0x00, 0x03,
{ { CTR_BPU_0, MSR_P4_BPU_ESCR0},
{ CTR_BPU_2, MSR_P4_BPU_ESCR1} }
},
@@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* LOAD_PORT_REPLAY */
- 0x02, 0x04,
+ 0x02, 0x04,
{ { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
{ CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
},
@@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* BSQ_CACHE_REFERENCE */
- 0x07, 0x0c,
+ 0x07, 0x0c,
{ { CTR_BPU_0, MSR_P4_BSU_ESCR0},
{ CTR_BPU_2, MSR_P4_BSU_ESCR1} }
},
{ /* IOQ_ALLOCATION */
- 0x06, 0x03,
+ 0x06, 0x03,
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ 0, 0 } }
},
{ /* IOQ_ACTIVE_ENTRIES */
- 0x06, 0x1a,
+ 0x06, 0x1a,
{ { CTR_BPU_2, MSR_P4_FSB_ESCR1},
{ 0, 0 } }
},
{ /* FSB_DATA_ACTIVITY */
- 0x06, 0x17,
+ 0x06, 0x17,
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ CTR_BPU_2, MSR_P4_FSB_ESCR1} }
},
{ /* BSQ_ALLOCATION */
- 0x07, 0x05,
+ 0x07, 0x05,
{ { CTR_BPU_0, MSR_P4_BSU_ESCR0},
{ 0, 0 } }
},
{ /* BSQ_ACTIVE_ENTRIES */
0x07, 0x06,
- { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
+ { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
{ 0, 0 } }
},
{ /* X87_ASSIST */
- 0x05, 0x03,
+ 0x05, 0x03,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR2},
{ CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
@@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* PACKED_SP_UOP */
- 0x01, 0x08,
+ 0x01, 0x08,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* PACKED_DP_UOP */
- 0x01, 0x0c,
+ 0x01, 0x0c,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
{ /* SCALAR_SP_UOP */
- 0x01, 0x0a,
+ 0x01, 0x0a,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
@@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* 64BIT_MMX_UOP */
- 0x01, 0x02,
+ 0x01, 0x02,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* 128BIT_MMX_UOP */
- 0x01, 0x1a,
+ 0x01, 0x1a,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
{ /* X87_FP_UOP */
- 0x01, 0x04,
+ 0x01, 0x04,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* X87_SIMD_MOVES_UOP */
- 0x01, 0x2e,
+ 0x01, 0x2e,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* MACHINE_CLEAR */
- 0x05, 0x02,
+ 0x05, 0x02,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR2},
{ CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
@@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ CTR_BPU_2, MSR_P4_FSB_ESCR1} }
},
-
+
{ /* TC_MS_XFER */
- 0x00, 0x05,
+ 0x00, 0x05,
{ { CTR_MS_0, MSR_P4_MS_ESCR0},
{ CTR_MS_2, MSR_P4_MS_ESCR1} }
},
@@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* INSTR_RETIRED */
- 0x04, 0x02,
+ 0x04, 0x02,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR0},
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
@@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
- { /* UOP_TYPE */
- 0x02, 0x02,
+ { /* UOP_TYPE */
+ 0x02, 0x02,
{ { CTR_IQ_4, MSR_P4_RAT_ESCR0},
{ CTR_IQ_5, MSR_P4_RAT_ESCR1} }
},
{ /* RETIRED_MISPRED_BRANCH_TYPE */
- 0x02, 0x05,
+ 0x02, 0x05,
{ { CTR_MS_0, MSR_P4_TBPU_ESCR0},
{ CTR_MS_2, MSR_P4_TBPU_ESCR1} }
},
@@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
-#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
-#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
+#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
+#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
#define CCCR_RESERVED_BITS 0x38030FFF
#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
-#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
-#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
+#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
+#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
-#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
-#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0)
-#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0)
+#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
+#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
+#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
+#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
@@ -380,7 +381,7 @@ static unsigned int get_stagger(void)
#ifdef CONFIG_SMP
int cpu = smp_processor_id();
return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu)));
-#endif
+#endif
return 0;
}
@@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT];
static void p4_fill_in_addresses(struct op_msrs * const msrs)
{
- unsigned int i;
+ unsigned int i;
unsigned int addr, cccraddr, stag;
setup_num_counters();
stag = get_stagger();
/* initialize some registers */
- for (i = 0; i < num_counters; ++i) {
+ for (i = 0; i < num_counters; ++i)
msrs->counters[i].addr = 0;
- }
- for (i = 0; i < num_controls; ++i) {
+ for (i = 0; i < num_controls; ++i)
msrs->controls[i].addr = 0;
- }
-
+
/* the counter & cccr registers we pay attention to */
for (i = 0; i < num_counters; ++i) {
addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
- if (reserve_perfctr_nmi(addr)){
+ if (reserve_perfctr_nmi(addr)) {
msrs->counters[i].addr = addr;
msrs->controls[i].addr = cccraddr;
}
@@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
-
+
for (addr = MSR_P4_MS_ESCR0 + stag;
- addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
+ addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
-
+
for (addr = MSR_P4_IX_ESCR0 + stag;
- addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
+ addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
/* there are 2 remaining non-contiguously located ESCRs */
- if (num_counters == NUM_COUNTERS_NON_HT) {
+ if (num_counters == NUM_COUNTERS_NON_HT) {
/* standard non-HT CPUs handle both remaining ESCRs */
if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
@@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
unsigned int stag;
stag = get_stagger();
-
+
/* convert from counter *number* to counter *bit* */
counter_bit = 1 << VIRT_CTR(stag, ctr);
-
+
/* find our event binding structure. */
if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
- printk(KERN_ERR
- "oprofile: P4 event code 0x%lx out of range\n",
+ printk(KERN_ERR
+ "oprofile: P4 event code 0x%lx out of range\n",
counter_config[ctr].event);
return;
}
-
+
ev = &(p4_events[counter_config[ctr].event - 1]);
-
+
for (i = 0; i < maxbind; i++) {
if (ev->bindings[i].virt_counter & counter_bit) {
@@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
}
ESCR_SET_EVENT_SELECT(escr, ev->event_select);
- ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
+ ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
ESCR_WRITE(escr, high, ev, i);
-
+
/* modify CCCR */
CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
CCCR_CLEAR(cccr);
CCCR_SET_REQUIRED_BITS(cccr);
CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
- if (stag == 0) {
+ if (stag == 0)
CCCR_SET_PMI_OVF_0(cccr);
- } else {
+ else
CCCR_SET_PMI_OVF_1(cccr);
- }
CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
return;
}
}
- printk(KERN_ERR
+ printk(KERN_ERR
"oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
counter_config[ctr].event, stag, ctr);
}
@@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
stag = get_stagger();
rdmsr(MSR_IA32_MISC_ENABLE, low, high);
- if (! MISC_PMC_ENABLED_P(low)) {
+ if (!MISC_PMC_ENABLED_P(low)) {
printk(KERN_ERR "oprofile: P4 PMC not available\n");
return;
}
/* clear the cccrs we will use */
for (i = 0 ; i < num_counters ; i++) {
- if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
+ if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
continue;
rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
CCCR_CLEAR(low);
@@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
/* clear all escrs (including those outside our concern) */
for (i = num_counters; i < num_controls; i++) {
- if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
+ if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
continue;
wrmsr(msrs->controls[i].addr, 0, 0);
}
/* setup all counters */
for (i = 0 ; i < num_counters ; ++i) {
- if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) {
+ if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) {
reset_value[i] = counter_config[i].count;
pmc_setup_one_p4_counter(i);
CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
@@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs,
stag = get_stagger();
for (i = 0; i < num_counters; ++i) {
-
- if (!reset_value[i])
+
+ if (!reset_value[i])
continue;
- /*
+ /*
* there is some eccentricity in the hardware which
* requires that we perform 2 extra corrections:
*
@@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs,
*
* - write the counter back twice to ensure it gets
* updated properly.
- *
+ *
* the former seems to be related to extra NMIs happening
* during the current NMI; the latter is reported as errata
* N15 in intel doc 249199-029, pentium 4 specification
* update, though their suggested work-around does not
* appear to solve the problem.
*/
-
+
real = VIRT_CTR(stag, i);
CCCR_READ(low, high, real);
- CTR_READ(ctr, high, real);
+ CTR_READ(ctr, high, real);
if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
oprofile_add_sample(regs, i);
- CTR_WRITE(reset_value[i], real);
+ CTR_WRITE(reset_value[i], real);
CCCR_CLEAR_OVF(low);
CCCR_WRITE(low, high, real);
- CTR_WRITE(reset_value[i], real);
+ CTR_WRITE(reset_value[i], real);
}
}
@@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs)
int i;
for (i = 0 ; i < num_counters ; ++i) {
- if (CTR_IS_RESERVED(msrs,i))
+ if (CTR_IS_RESERVED(msrs, i))
release_perfctr_nmi(msrs->counters[i].addr);
}
- /* some of the control registers are specially reserved in
+ /*
+ * some of the control registers are specially reserved in
* conjunction with the counter registers (hence the starting offset).
* This saves a few bits.
*/
for (i = num_counters ; i < num_controls ; ++i) {
- if (CTRL_IS_RESERVED(msrs,i))
+ if (CTRL_IS_RESERVED(msrs, i))
release_evntsel_nmi(msrs->controls[i].addr);
}
}
@@ -699,24 +698,24 @@ static void p4_shutdown(struct op_msrs const * const msrs)
#ifdef CONFIG_SMP
struct op_x86_model_spec const op_p4_ht2_spec = {
- .num_counters = NUM_COUNTERS_HT2,
- .num_controls = NUM_CONTROLS_HT2,
- .fill_in_addresses = &p4_fill_in_addresses,
- .setup_ctrs = &p4_setup_ctrs,
- .check_ctrs = &p4_check_ctrs,
- .start = &p4_start,
- .stop = &p4_stop,
- .shutdown = &p4_shutdown
+ .num_counters = NUM_COUNTERS_HT2,
+ .num_controls = NUM_CONTROLS_HT2,
+ .fill_in_addresses = &p4_fill_in_addresses,
+ .setup_ctrs = &p4_setup_ctrs,
+ .check_ctrs = &p4_check_ctrs,
+ .start = &p4_start,
+ .stop = &p4_stop,
+ .shutdown = &p4_shutdown
};
#endif
struct op_x86_model_spec const op_p4_spec = {
- .num_counters = NUM_COUNTERS_NON_HT,
- .num_controls = NUM_CONTROLS_NON_HT,
- .fill_in_addresses = &p4_fill_in_addresses,
- .setup_ctrs = &p4_setup_ctrs,
- .check_ctrs = &p4_check_ctrs,
- .start = &p4_start,
- .stop = &p4_stop,
- .shutdown = &p4_shutdown
+ .num_counters = NUM_COUNTERS_NON_HT,
+ .num_controls = NUM_CONTROLS_NON_HT,
+ .fill_in_addresses = &p4_fill_in_addresses,
+ .setup_ctrs = &p4_setup_ctrs,
+ .check_ctrs = &p4_check_ctrs,
+ .start = &p4_start,
+ .stop = &p4_stop,
+ .shutdown = &p4_shutdown
};
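Most of the churn above is checkpatch-style cleanup, including the consistent "} while (0)" spelling of the MSR accessor macros. That wrapper is what keeps a multi-statement macro behaving as a single statement; a standalone illustration:

#include <stdio.h>

#define LOG_TWICE(msg) do { puts(msg); puts(msg); } while (0)

int main(void)
{
	if (1)
		LOG_TWICE("hello");	/* expands to exactly one statement */
	else
		puts("never reached");
	return 0;
}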
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index eff431f6c57..0620d6d45f7 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -1,32 +1,34 @@
/*
* @file op_model_ppro.c
- * pentium pro / P6 model-specific MSR operations
+ * Family 6 perfmon and architectural perfmon MSR operations
*
* @remark Copyright 2002 OProfile authors
+ * @remark Copyright 2008 Intel Corporation
* @remark Read the file COPYING
*
* @author John Levon
* @author Philippe Elie
* @author Graydon Hoare
+ * @author Andi Kleen
*/
#include <linux/oprofile.h>
+#include <linux/slab.h>
#include <asm/ptrace.h>
#include <asm/msr.h>
#include <asm/apic.h>
#include <asm/nmi.h>
+#include <asm/intel_arch_perfmon.h>
#include "op_x86_model.h"
#include "op_counter.h"
-#define NUM_COUNTERS 2
-#define NUM_CONTROLS 2
+static int num_counters = 2;
+static int counter_width = 32;
#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
-#define CTR_32BIT_WRITE(l, msrs, c) \
- do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0); } while (0)
-#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
+#define CTR_OVERFLOWED(n) (!((n) & (1U<<(counter_width-1))))
#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
@@ -40,20 +42,20 @@
#define CTRL_SET_UM(val, m) (val |= (m << 8))
#define CTRL_SET_EVENT(val, e) (val |= e)
-static unsigned long reset_value[NUM_COUNTERS];
+static u64 *reset_value;
static void ppro_fill_in_addresses(struct op_msrs * const msrs)
{
int i;
- for (i = 0; i < NUM_COUNTERS; i++) {
+ for (i = 0; i < num_counters; i++) {
if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
else
msrs->counters[i].addr = 0;
}
- for (i = 0; i < NUM_CONTROLS; i++) {
+ for (i = 0; i < num_counters; i++) {
if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
else
@@ -67,8 +69,22 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
unsigned int low, high;
int i;
+ if (!reset_value) {
+ reset_value = kmalloc(sizeof(unsigned) * num_counters,
+ GFP_ATOMIC);
+ if (!reset_value)
+ return;
+ }
+
+ if (cpu_has_arch_perfmon) {
+ union cpuid10_eax eax;
+ eax.full = cpuid_eax(0xa);
+ if (counter_width < eax.split.bit_width)
+ counter_width = eax.split.bit_width;
+ }
+
/* clear all counters */
- for (i = 0 ; i < NUM_CONTROLS; ++i) {
+ for (i = 0 ; i < num_counters; ++i) {
if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
continue;
CTRL_READ(low, high, msrs, i);
@@ -77,18 +93,18 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
}
/* avoid a false detection of ctr overflows in NMI handler */
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (unlikely(!CTR_IS_RESERVED(msrs, i)))
continue;
- CTR_32BIT_WRITE(1, msrs, i);
+ wrmsrl(msrs->counters[i].addr, -1LL);
}
/* enable active counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
reset_value[i] = counter_config[i].count;
- CTR_32BIT_WRITE(counter_config[i].count, msrs, i);
+ wrmsrl(msrs->counters[i].addr, -reset_value[i]);
CTRL_READ(low, high, msrs, i);
CTRL_CLEAR(low);
@@ -111,13 +127,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
unsigned int low, high;
int i;
- for (i = 0 ; i < NUM_COUNTERS; ++i) {
+ for (i = 0 ; i < num_counters; ++i) {
if (!reset_value[i])
continue;
CTR_READ(low, high, msrs, i);
if (CTR_OVERFLOWED(low)) {
oprofile_add_sample(regs, i);
- CTR_32BIT_WRITE(reset_value[i], msrs, i);
+ wrmsrl(msrs->counters[i].addr, -reset_value[i]);
}
}
@@ -141,7 +157,7 @@ static void ppro_start(struct op_msrs const * const msrs)
unsigned int low, high;
int i;
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (reset_value[i]) {
CTRL_READ(low, high, msrs, i);
CTRL_SET_ACTIVE(low);
@@ -156,7 +172,7 @@ static void ppro_stop(struct op_msrs const * const msrs)
unsigned int low, high;
int i;
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (!reset_value[i])
continue;
CTRL_READ(low, high, msrs, i);
@@ -169,24 +185,70 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
{
int i;
- for (i = 0 ; i < NUM_COUNTERS ; ++i) {
+ for (i = 0 ; i < num_counters ; ++i) {
if (CTR_IS_RESERVED(msrs, i))
release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
}
- for (i = 0 ; i < NUM_CONTROLS ; ++i) {
+ for (i = 0 ; i < num_counters ; ++i) {
if (CTRL_IS_RESERVED(msrs, i))
release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
}
+ if (reset_value) {
+ kfree(reset_value);
+ reset_value = NULL;
+ }
}
-struct op_x86_model_spec const op_ppro_spec = {
- .num_counters = NUM_COUNTERS,
- .num_controls = NUM_CONTROLS,
- .fill_in_addresses = &ppro_fill_in_addresses,
- .setup_ctrs = &ppro_setup_ctrs,
- .check_ctrs = &ppro_check_ctrs,
- .start = &ppro_start,
- .stop = &ppro_stop,
- .shutdown = &ppro_shutdown
+struct op_x86_model_spec op_ppro_spec = {
+ .num_counters = 2, /* can be overridden */
+ .num_controls = 2, /* ditto */
+ .fill_in_addresses = &ppro_fill_in_addresses,
+ .setup_ctrs = &ppro_setup_ctrs,
+ .check_ctrs = &ppro_check_ctrs,
+ .start = &ppro_start,
+ .stop = &ppro_stop,
+ .shutdown = &ppro_shutdown
+};
+
+/*
+ * Architectural performance monitoring.
+ *
+ * Newer Intel CPUs (Core1+) have support for architectural
+ * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details.
+ * The advantage of this is that it can be done without knowing about
+ * the specific CPU.
+ */
+
+void arch_perfmon_setup_counters(void)
+{
+ union cpuid10_eax eax;
+
+ eax.full = cpuid_eax(0xa);
+
+ /* Workaround for BIOS bugs on family 6 model 15. Taken from perfmon2 */
+ if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
+ current_cpu_data.x86_model == 15) {
+ eax.split.version_id = 2;
+ eax.split.num_counters = 2;
+ eax.split.bit_width = 40;
+ }
+
+ num_counters = eax.split.num_counters;
+
+ op_arch_perfmon_spec.num_counters = num_counters;
+ op_arch_perfmon_spec.num_controls = num_counters;
+ op_ppro_spec.num_counters = num_counters;
+ op_ppro_spec.num_controls = num_counters;
+}
+
+struct op_x86_model_spec op_arch_perfmon_spec = {
+ /* num_counters/num_controls filled in at runtime */
+ .fill_in_addresses = &ppro_fill_in_addresses,
+ /* user space does the cpuid check for available events */
+ .setup_ctrs = &ppro_setup_ctrs,
+ .check_ctrs = &ppro_check_ctrs,
+ .start = &ppro_start,
+ .stop = &ppro_stop,
+ .shutdown = &ppro_shutdown
};
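arch_perfmon_setup_counters() above sizes everything from CPUID leaf 0xA. For orientation, a sketch of the EAX layout it decodes through union cpuid10_eax, per the Intel SDM:

/* CPUID.0xA EAX layout as decoded by union cpuid10_eax (sketch). */
struct cpuid10_eax_sketch {
	unsigned int version_id:8;	/* architectural perfmon version */
	unsigned int num_counters:8;	/* GP counters per logical CPU */
	unsigned int bit_width:8;	/* counter width in bits */
	unsigned int mask_length:8;	/* length of the EBX event mask */
};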
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 45b605fa71d..825e79064d6 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -22,8 +22,8 @@ struct op_msr {
};
struct op_msrs {
- struct op_msr * counters;
- struct op_msr * controls;
+ struct op_msr *counters;
+ struct op_msr *controls;
};
struct pt_regs;
@@ -32,8 +32,10 @@ struct pt_regs;
* various x86 CPU models' perfctr support.
*/
struct op_x86_model_spec {
- unsigned int const num_counters;
- unsigned int const num_controls;
+ int (*init)(struct oprofile_operations *ops);
+ void (*exit)(void);
+ unsigned int num_counters;
+ unsigned int num_controls;
void (*fill_in_addresses)(struct op_msrs * const msrs);
void (*setup_ctrs)(struct op_msrs const * const msrs);
int (*check_ctrs)(struct pt_regs * const regs,
@@ -43,9 +45,12 @@ struct op_x86_model_spec {
void (*shutdown)(struct op_msrs const * const msrs);
};
-extern struct op_x86_model_spec const op_ppro_spec;
+extern struct op_x86_model_spec op_ppro_spec;
extern struct op_x86_model_spec const op_p4_spec;
extern struct op_x86_model_spec const op_p4_ht2_spec;
-extern struct op_x86_model_spec const op_athlon_spec;
+extern struct op_x86_model_spec const op_amd_spec;
+extern struct op_x86_model_spec op_arch_perfmon_spec;
+
+extern void arch_perfmon_setup_counters(void);
#endif /* OP_X86_MODEL_H */
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 19af06927fb..1d88d2b3977 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -250,10 +250,5 @@ int __init pci_acpi_init(void)
acpi_pci_irq_enable(dev);
}
-#ifdef CONFIG_X86_IO_APIC
- if (acpi_ioapic)
- print_IO_APIC();
-#endif
-
return 0;
}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 6a0fca78c36..22e057665e5 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -580,7 +580,7 @@ static int __cpuinit amd_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
int cpu = (long)hcpu;
- switch(action) {
+ switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0);
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 4bdaa590375..3c27a809393 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -511,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size);
+
+/*
+ * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from
+ * confusing the PCI engine:
+ */
+static void sb600_disable_hpet_bar(struct pci_dev *dev)
+{
+ u8 val;
+
+ /*
+ * The SB600 and SB700 both share the same device
+ * ID, but the PM register 0x55 does something different
+ * for the SB700, so make sure we are dealing with the
+ * SB600 before touching the bit:
+ */
+
+ pci_read_config_byte(dev, 0x08, &val);
+
+ if (val < 0x2F) {
+ outb(0x55, 0xCD6);
+ val = inb(0xCD7);
+
+ /* Set bit 7 in PM register 0x55 */
+ outb(0x55, 0xCD6);
+ outb(val | 0x80, 0xCD7);
+ }
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
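The fixup above uses the SBx00 PM index/data port pair: the register number goes to port 0xCD6 and its value is accessed through 0xCD7. A hedged sketch of that read-modify-write pattern (helper names are illustrative):

static u8 sbx00_pm_read(u8 reg)
{
	outb(reg, 0xCD6);		/* select the PM register */
	return inb(0xCD7);		/* read its value */
}

static void sbx00_pm_setbits(u8 reg, u8 bits)
{
	u8 val = sbx00_pm_read(reg);

	outb(reg, 0xCD6);
	outb(val | bits, 0xCD7);	/* e.g. bit 7 of register 0x55 above */
}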
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 8791fc55e71..844df0cbbd3 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -33,6 +33,7 @@
#include <linux/bootmem.h>
#include <asm/pat.h>
+#include <asm/e820.h>
#include "pci.h"
@@ -227,6 +228,8 @@ void __init pcibios_resource_survey(void)
pcibios_allocate_bus_resources(&pci_root_buses);
pcibios_allocate_resources(0);
pcibios_allocate_resources(1);
+
+ e820_reserve_resources_late();
}
/**
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 8e077185e18..bf69dbe08bf 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -493,7 +493,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
if (pirq <= 4)
irq = read_config_nybble(router, 0x56, pirq - 1);
dev_info(&dev->dev,
- "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
+ "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n",
dev->vendor, dev->device, pirq, irq);
return irq;
}
@@ -501,7 +501,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
{
dev_info(&dev->dev,
- "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
+ "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n",
dev->vendor, dev->device, pirq, irq);
if (pirq <= 4)
write_config_nybble(router, 0x56, pirq - 1, irq);
@@ -590,13 +590,20 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
case PCI_DEVICE_ID_INTEL_ICH10_1:
case PCI_DEVICE_ID_INTEL_ICH10_2:
case PCI_DEVICE_ID_INTEL_ICH10_3:
- case PCI_DEVICE_ID_INTEL_PCH_0:
- case PCI_DEVICE_ID_INTEL_PCH_1:
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
return 1;
}
+
+ if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) &&
+ (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) {
+ r->name = "PIIX/ICH";
+ r->get = pirq_piix_get;
+ r->set = pirq_piix_set;
+ return 1;
+ }
+
return 0;
}
@@ -823,7 +830,7 @@ static void __init pirq_find_router(struct irq_router *r)
r->get = NULL;
r->set = NULL;
- DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
+ DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n",
rt->rtr_vendor, rt->rtr_device);
pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
@@ -843,7 +850,7 @@ static void __init pirq_find_router(struct irq_router *r)
h->probe(r, pirq_router_dev, pirq_router_dev->device))
break;
}
- dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
+ dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
pirq_router.name,
pirq_router_dev->vendor, pirq_router_dev->device);
@@ -1043,35 +1050,44 @@ static void __init pcibios_fixup_irqs(void)
if (io_apic_assign_pci_irqs) {
int irq;
- if (pin) {
- /*
- * interrupt pins are numbered starting
- * from 1
- */
- pin--;
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
- PCI_SLOT(dev->devfn), pin);
- /*
- * Busses behind bridges are typically not listed in the MP-table.
- * In this case we have to look up the IRQ based on the parent bus,
- * parent slot, and pin number. The SMP code detects such bridged
- * busses itself so we should get into this branch reliably.
- */
- if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
- struct pci_dev *bridge = dev->bus->self;
-
- pin = (pin + PCI_SLOT(dev->devfn)) % 4;
- irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
- PCI_SLOT(bridge->devfn), pin);
- if (irq >= 0)
- dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
- pci_name(bridge),
- 'A' + pin, irq);
- }
- if (irq >= 0) {
- dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
- dev->irq = irq;
- }
+ if (!pin)
+ continue;
+
+ /*
+ * interrupt pins are numbered starting from 1
+ */
+ pin--;
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+ PCI_SLOT(dev->devfn), pin);
+ /*
+ * Busses behind bridges are typically not listed in the
+ * MP-table. In this case we have to look up the IRQ
+ * based on the parent bus, parent slot, and pin number.
+ * The SMP code detects such bridged busses itself so we
+ * should get into this branch reliably.
+ */
+ if (irq < 0 && dev->bus->parent) {
+ /* go back to the bridge */
+ struct pci_dev *bridge = dev->bus->self;
+ int bus;
+
+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ bus = bridge->bus->number;
+ irq = IO_APIC_get_PCI_irq_vector(bus,
+ PCI_SLOT(bridge->devfn), pin);
+ if (irq >= 0)
+ dev_warn(&dev->dev,
+ "using bridge %s INT %c to "
+ "get IRQ %d\n",
+ pci_name(bridge),
+ 'A' + pin, irq);
+ }
+ if (irq >= 0) {
+ dev_info(&dev->dev,
+ "PCI->APIC IRQ transform: INT %c "
+ "-> IRQ %d\n",
+ 'A' + pin, irq);
+ dev->irq = irq;
}
}
#endif
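The restructured branch keeps the standard INTx swizzle for devices behind bridges: the 0-based pin is rotated by the device's slot number before the lookup is retried on the parent bus. As a sketch:

/* Standard PCI-to-PCI bridge interrupt swizzle (pin is 0-based here). */
static int bridge_swizzle(int pin, unsigned int devfn)
{
	return (pin + PCI_SLOT(devfn)) % 4;
}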
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index d9635764ce3..654a2234f8f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -209,7 +209,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
return name != NULL;
}
-static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
+static void __init pci_mmcfg_insert_resources(void)
{
#define PCI_MMCFG_RESOURCE_NAME_LEN 19
int i;
@@ -233,7 +233,7 @@ static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
cfg->pci_segment);
res->start = cfg->address;
res->end = res->start + (num_buses << 20) - 1;
- res->flags = IORESOURCE_MEM | resource_flags;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
names += PCI_MMCFG_RESOURCE_NAME_LEN;
}
@@ -434,11 +434,9 @@ static void __init __pci_mmcfg_init(int early)
(pci_mmcfg_config[0].address == 0))
return;
- if (pci_mmcfg_arch_init()) {
- if (known_bridge)
- pci_mmcfg_insert_resources(IORESOURCE_BUSY);
+ if (pci_mmcfg_arch_init())
pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
- } else {
+ else {
/*
* Signal not to attempt to insert mmcfg resources because
* the architecture mmcfg setup could not initialize.
@@ -475,7 +473,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
* marked so it won't cause request errors when __request_region is
* called.
*/
- pci_mmcfg_insert_resources(0);
+ pci_mmcfg_insert_resources();
return 0;
}
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index d3e083dea72..274d06082f4 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -11,6 +11,7 @@
#include <linux/suspend.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
+#include <asm/xcr.h>
static struct saved_context saved_context;
@@ -126,6 +127,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
if (boot_cpu_has(X86_FEATURE_SEP))
enable_sep_cpu();
+ /*
+ * Restore XCR0 for xsave-capable CPUs.
+ */
+ if (cpu_has_xsave)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
+
fix_processor_context();
do_fpu_end();
mtrr_ap_init();
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index 66bdfb591fd..e3b6cf70d62 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -14,6 +14,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/mtrr.h>
+#include <asm/xcr.h>
static void fix_processor_context(void);
@@ -122,6 +123,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
wrmsrl(MSR_GS_BASE, ctxt->gs_base);
wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+ /*
+ * Restore XCR0 for xsave-capable CPUs.
+ */
+ if (cpu_has_xsave)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
+
fix_processor_context();
do_fpu_end();
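
Both hunks above rely on the xsetbv() helper from the newly included
<asm/xcr.h>. A sketch of what that helper is expected to look like (the exact
header contents are assumed, not shown in this patch): XSETBV takes the
extended control register index in %ecx and the 64-bit value in %edx:%eax.

#include <stdint.h>

#define XCR_XFEATURE_ENABLED_MASK 0	/* XCR0 */

/* XSETBV is encoded as 0f 01 d1; older assemblers need the raw bytes.
 * Privileged instruction, so this only runs at CPL0. */
static inline void xsetbv(uint32_t index, uint64_t value)
{
	uint32_t eax = (uint32_t)value;
	uint32_t edx = (uint32_t)(value >> 32);

	asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
		     : : "a" (eax), "d" (edx), "c" (index));
}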
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index 4fc7e872c85..d1e9b53f9d3 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -1,5 +1,3 @@
-.text
-
/*
* This may not use any stack, nor any variable that is not "NoSave":
*
@@ -12,17 +10,18 @@
#include <asm/segment.h>
#include <asm/page.h>
#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
- .text
+.text
ENTRY(swsusp_arch_suspend)
-
movl %esp, saved_context_esp
movl %ebx, saved_context_ebx
movl %ebp, saved_context_ebp
movl %esi, saved_context_esi
movl %edi, saved_context_edi
- pushfl ; popl saved_context_eflags
+ pushfl
+ popl saved_context_eflags
call swsusp_save
ret
@@ -59,7 +58,7 @@ done:
movl mmu_cr4_features, %ecx
jecxz 1f # cr4 Pentium and higher, skip if zero
movl %ecx, %edx
- andl $~(1<<7), %edx; # PGE
+ andl $~(X86_CR4_PGE), %edx
movl %edx, %cr4; # turn off PGE
1:
movl %cr3, %eax; # flush TLB
@@ -74,7 +73,8 @@ done:
movl saved_context_esi, %esi
movl saved_context_edi, %edi
- pushl saved_context_eflags ; popfl
+ pushl saved_context_eflags
+ popfl
xorl %eax, %eax
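
The X86_CR4_PGE cleanup above replaces the magic (1<<7) with the named
constant; clearing that bit invalidates global TLB entries, and the CR3
rewrite that follows flushes the rest. A hedged C rendering of the same
full-flush sequence (illustration only; the function name is invented and
the code must run at CPL0):

#define X86_CR4_PGE 0x00000080	/* bit 7: page global enable */

/* Toggling CR4.PGE drops global TLB entries; the CR3 write-back
 * then flushes the non-global ones. */
static inline void flush_tlb_all_sketch(void)
{
	unsigned long cr3, cr4;

	asm volatile("mov %%cr4, %0" : "=r" (cr4));
	if (cr4 & X86_CR4_PGE) {
		asm volatile("mov %0, %%cr4" : : "r" (cr4 & ~X86_CR4_PGE));
		asm volatile("mov %0, %%cr4" : : "r" (cr4));
	}
	asm volatile("mov %%cr3, %0" : "=r" (cr3));
	asm volatile("mov %0, %%cr3" : : "r" (cr3));
}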
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 3815e425f47..87b9ab16642 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -26,5 +26,13 @@ config XEN_MAX_DOMAIN_MEMORY
config XEN_SAVE_RESTORE
bool
- depends on PM
- default y
\ No newline at end of file
+ depends on XEN && PM
+ default y
+
+config XEN_DEBUG_FS
+ bool "Enable Xen debug and tuning parameters in debugfs"
+ depends on XEN && DEBUG_FS
+ default n
+ help
+ Enable statistics output and various tuning options in debugfs.
+ Enabling this option may incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 59c1e539aed..313947940a1 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,12 @@
-obj-y := enlighten.o setup.o multicalls.o mmu.o \
+ifdef CONFIG_FTRACE
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_spinlock.o = -pg
+CFLAGS_REMOVE_time.o = -pg
+CFLAGS_REMOVE_irq.o = -pg
+endif
+
+obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm_$(BITS).o grant-table.o suspend.o
-obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smp.o spinlock.o
+obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
\ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 00000000000..b53225d2cac
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,123 @@
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+
+#include "debugfs.h"
+
+static struct dentry *d_xen_debug;
+
+struct dentry * __init xen_init_debugfs(void)
+{
+ if (!d_xen_debug) {
+ d_xen_debug = debugfs_create_dir("xen", NULL);
+
+ if (!d_xen_debug)
+ pr_warning("Could not create 'xen' debugfs directory\n");
+ }
+
+ return d_xen_debug;
+}
+
+struct array_data
+{
+ void *array;
+ unsigned elements;
+};
+
+static int u32_array_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return nonseekable_open(inode, file);
+}
+
+static size_t format_array(char *buf, size_t bufsize, const char *fmt,
+ u32 *array, unsigned array_size)
+{
+ size_t ret = 0;
+ unsigned i;
+
+ for(i = 0; i < array_size; i++) {
+ size_t len;
+
+ len = snprintf(buf, bufsize, fmt, array[i]);
+ len++; /* ' ' or '\n' */
+ ret += len;
+
+ if (buf) {
+ buf += len;
+ bufsize -= len;
+ buf[-1] = (i == array_size-1) ? '\n' : ' ';
+ }
+ }
+
+ ret++; /* \0 */
+ if (buf)
+ *buf = '\0';
+
+ return ret;
+}
+
+static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
+{
+ size_t len = format_array(NULL, 0, fmt, array, array_size);
+ char *ret;
+
+ ret = kmalloc(len, GFP_KERNEL);
+ if (ret == NULL)
+ return NULL;
+
+ format_array(ret, len, fmt, array, array_size);
+ return ret;
+}
+
+static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct array_data *data = inode->i_private;
+ size_t size;
+
+ if (*ppos == 0) {
+ if (file->private_data) {
+ kfree(file->private_data);
+ file->private_data = NULL;
+ }
+
+ file->private_data = format_array_alloc("%u", data->array, data->elements);
+ }
+
+ size = 0;
+ if (file->private_data)
+ size = strlen(file->private_data);
+
+ return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
+}
+
+static int xen_array_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+
+ return 0;
+}
+
+static struct file_operations u32_array_fops = {
+ .owner = THIS_MODULE,
+ .open = u32_array_open,
+ .release= xen_array_release,
+ .read = u32_array_read,
+};
+
+struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+ struct dentry *parent,
+ u32 *array, unsigned elements)
+{
+ struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+ if (data == NULL)
+ return NULL;
+
+ data->array = array;
+ data->elements = elements;
+
+ return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
+}
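
format_array() above uses the common two-pass sizing idiom: the first call
passes a NULL buffer purely to measure, and format_array_alloc() then
allocates exactly that much and formats again. A standalone sketch of the
pattern (format_u32s is an invented name, not the kernel code; snprintf(NULL,
0, ...) returning the would-be length is standard C99):

#include <stdio.h>
#include <stdlib.h>

static char *format_u32s(const unsigned *array, unsigned n)
{
	size_t len = 1;			/* trailing '\0' */
	char *buf, *p;
	unsigned i;

	/* Pass 1: measure only. */
	for (i = 0; i < n; i++)
		len += snprintf(NULL, 0, "%u", array[i]) + 1; /* + separator */

	buf = malloc(len);
	if (!buf)
		return NULL;

	/* Pass 2: fill the exact-sized allocation. */
	p = buf;
	for (i = 0; i < n; i++)
		p += sprintf(p, "%u%c", array[i], i == n - 1 ? '\n' : ' ');
	*p = '\0';
	return buf;
}

int main(void)
{
	unsigned v[] = { 1, 22, 333 };
	char *s = format_u32s(v, 3);

	if (s)
		fputs(s, stdout);	/* prints "1 22 333" */
	free(s);
	return 0;
}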
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 00000000000..e2813208483
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,10 @@
+#ifndef _XEN_DEBUGFS_H
+#define _XEN_DEBUGFS_H
+
+struct dentry * __init xen_init_debugfs(void);
+
+struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+ struct dentry *parent,
+ u32 *array, unsigned elements);
+
+#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a4e201b47f6..b61534c7a4c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,12 +30,12 @@
#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
-#include <xen/interface/sched.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvc-console.h>
#include <asm/paravirt.h>
+#include <asm/apic.h>
#include <asm/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -57,6 +57,9 @@ EXPORT_SYMBOL_GPL(hypercall_page);
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+enum xen_domain_type xen_domain_type = XEN_NATIVE;
+EXPORT_SYMBOL_GPL(xen_domain_type);
+
/*
* Identity map, in addition to plain kernel map. This needs to be
* large enough to allocate page table pages to allocate the rest.
@@ -110,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
*
* 0: not available, 1: available
*/
-static int have_vcpu_info_placement = 1;
+static int have_vcpu_info_placement =
+#ifdef CONFIG_X86_32
+ 1
+#else
+ 0
+#endif
+ ;
+
static void xen_vcpu_setup(int cpu)
{
@@ -226,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg)
return HYPERVISOR_get_debugreg(reg);
}
-static unsigned long xen_save_fl(void)
+static void xen_leave_lazy(void)
{
- struct vcpu_info *vcpu;
- unsigned long flags;
-
- vcpu = x86_read_percpu(xen_vcpu);
-
- /* flag has opposite sense of mask */
- flags = !vcpu->evtchn_upcall_mask;
-
- /* convert to IF type flag
- -0 -> 0x00000000
- -1 -> 0xffffffff
- */
- return (-flags) & X86_EFLAGS_IF;
+ paravirt_leave_lazy(paravirt_get_lazy_mode());
+ xen_mc_flush();
}
-static void xen_restore_fl(unsigned long flags)
+static unsigned long xen_store_tr(void)
{
- struct vcpu_info *vcpu;
-
- /* convert from IF type flag */
- flags = !(flags & X86_EFLAGS_IF);
-
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
- vcpu->evtchn_upcall_mask = flags;
- preempt_enable_no_resched();
-
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
-
- if (flags == 0) {
- preempt_check_resched();
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- force_evtchn_callback();
- }
+ return 0;
}
-static void xen_irq_disable(void)
+/*
+ * Set the page permissions for a particular virtual address. If the
+ * address is a vmalloc mapping (or other non-linear mapping), then
+ * find the linear mapping of the page and also set its protections to
+ * match.
+ */
+static void set_aliased_prot(void *v, pgprot_t prot)
{
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
- preempt_enable_no_resched();
-}
+ int level;
+ pte_t *ptep;
+ pte_t pte;
+ unsigned long pfn;
+ struct page *page;
-static void xen_irq_enable(void)
-{
- struct vcpu_info *vcpu;
+ ptep = lookup_address((unsigned long)v, &level);
+ BUG_ON(ptep == NULL);
- /* We don't need to worry about being preempted here, since
- either a) interrupts are disabled, so no preemption, or b)
- the caller is confused and is trying to re-enable interrupts
- on an indeterminate processor. */
+ pfn = pte_pfn(*ptep);
+ page = pfn_to_page(pfn);
- vcpu = x86_read_percpu(xen_vcpu);
- vcpu->evtchn_upcall_mask = 0;
+ pte = pfn_pte(pfn, prot);
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
+ if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
+ BUG();
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- force_evtchn_callback();
-}
+ if (!PageHighMem(page)) {
+ void *av = __va(PFN_PHYS(pfn));
-static void xen_safe_halt(void)
-{
- /* Blocking includes an implicit local_irq_enable(). */
- if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
- BUG();
+ if (av != v)
+ if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
+ BUG();
+ } else
+ kmap_flush_unused();
}
-static void xen_halt(void)
+static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
- if (irqs_disabled())
- HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
- else
- xen_safe_halt();
-}
+ const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+ int i;
-static void xen_leave_lazy(void)
-{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
- xen_mc_flush();
+ for(i = 0; i < entries; i += entries_per_page)
+ set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}
-static unsigned long xen_store_tr(void)
+static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
{
- return 0;
+ const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+ int i;
+
+ for(i = 0; i < entries; i += entries_per_page)
+ set_aliased_prot(ldt + i, PAGE_KERNEL);
}
static void xen_set_ldt(const void *addr, unsigned entries)
@@ -425,8 +400,7 @@ static void xen_load_gs_index(unsigned int idx)
static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
const void *ptr)
{
- unsigned long lp = (unsigned long)&dt[entrynum];
- xmaddr_t mach_lp = virt_to_machine(lp);
+ xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
u64 entry = *(u64 *)ptr;
preempt_disable();
@@ -559,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
}
static void xen_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+ struct thread_struct *thread)
{
struct multicall_space mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -580,16 +554,47 @@ static void xen_io_delay(void)
}
#ifdef CONFIG_X86_LOCAL_APIC
-static u32 xen_apic_read(unsigned long reg)
+static u32 xen_apic_read(u32 reg)
{
return 0;
}
-static void xen_apic_write(unsigned long reg, u32 val)
+static void xen_apic_write(u32 reg, u32 val)
{
/* Warn to see if there are any stray references */
WARN_ON(1);
}
+
+static u64 xen_apic_icr_read(void)
+{
+ return 0;
+}
+
+static void xen_apic_icr_write(u32 low, u32 id)
+{
+ /* Warn to see if there are any stray references */
+ WARN_ON(1);
+}
+
+static void xen_apic_wait_icr_idle(void)
+{
+ return;
+}
+
+static u32 xen_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+
+static struct apic_ops xen_basic_apic_ops = {
+ .read = xen_apic_read,
+ .write = xen_apic_write,
+ .icr_read = xen_apic_icr_read,
+ .icr_write = xen_apic_icr_write,
+ .wait_icr_idle = xen_apic_wait_icr_idle,
+ .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
+};
+
#endif
static void xen_flush_tlb(void)
@@ -803,6 +808,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
ret = -EFAULT;
break;
#endif
+
+ case MSR_STAR:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ case MSR_SYSCALL_MASK:
+ case MSR_IA32_SYSENTER_CS:
+ case MSR_IA32_SYSENTER_ESP:
+ case MSR_IA32_SYSENTER_EIP:
+ /* Fast syscall setup is all done in hypercalls, so
+ these are all ignored. Stub them out here to stop
+ Xen console noise. */
+ break;
+
default:
ret = native_write_msr_safe(msr, low, high);
}
@@ -812,7 +830,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
+static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
BUG_ON(mem_map); /* should only be used early */
@@ -822,7 +840,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
/* Early release_pte assumes that all pts are pinned, since there's
only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(u32 pfn)
+static void xen_release_pte_init(unsigned long pfn)
{
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
@@ -838,7 +856,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
/* This needs to make sure the new pte page is pinned iff it's being
attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
+static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
{
struct page *page = pfn_to_page(pfn);
@@ -846,22 +864,23 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
SetPagePinned(page);
if (!PageHighMem(page)) {
- make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
- if (level == PT_PTE)
+ make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+ if (level == PT_PTE && USE_SPLIT_PTLOCKS)
pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
} else
/* make sure there are no stray mappings of
this page */
kmap_flush_unused();
+ vm_unmap_aliases();
}
}
-static void xen_alloc_pte(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PTE);
}
-static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PMD);
}
@@ -909,13 +928,13 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
}
/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(u32 pfn, unsigned level)
+static void xen_release_ptpage(unsigned long pfn, unsigned level)
{
struct page *page = pfn_to_page(pfn);
if (PagePinned(page)) {
if (!PageHighMem(page)) {
- if (level == PT_PTE)
+ if (level == PT_PTE && USE_SPLIT_PTLOCKS)
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
@@ -923,23 +942,23 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
}
}
-static void xen_release_pte(u32 pfn)
+static void xen_release_pte(unsigned long pfn)
{
xen_release_ptpage(pfn, PT_PTE);
}
-static void xen_release_pmd(u32 pfn)
+static void xen_release_pmd(unsigned long pfn)
{
xen_release_ptpage(pfn, PT_PMD);
}
#if PAGETABLE_LEVELS == 4
-static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
}
-static void xen_release_pud(u32 pfn)
+static void xen_release_pud(unsigned long pfn)
{
xen_release_ptpage(pfn, PT_PUD);
}
@@ -962,6 +981,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
+#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
/* If there's an existing pte, then don't allow _PAGE_RW to be set */
@@ -980,6 +1000,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
xen_set_pte(ptep, pte);
}
+#endif
static __init void xen_pagetable_setup_start(pgd_t *base)
{
@@ -1046,7 +1067,6 @@ void xen_setup_vcpu_info_placement(void)
/* xen_vcpu_setup managed to place the vcpu_info within the
percpu area for all cpus, so make use of it */
-#ifdef CONFIG_X86_32
if (have_vcpu_info_placement) {
printk(KERN_INFO "Xen: using vcpu_info placement\n");
@@ -1056,7 +1076,6 @@ void xen_setup_vcpu_info_placement(void)
pv_irq_ops.irq_enable = xen_irq_enable_direct;
pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
}
-#endif
}
static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -1077,12 +1096,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
goto patch_site
switch (type) {
-#ifdef CONFIG_X86_32
SITE(pv_irq_ops, irq_enable);
SITE(pv_irq_ops, irq_disable);
SITE(pv_irq_ops, save_fl);
SITE(pv_irq_ops, restore_fl);
-#endif /* CONFIG_X86_32 */
#undef SITE
patch_site:
@@ -1220,6 +1237,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.load_gs_index = xen_load_gs_index,
#endif
+ .alloc_ldt = xen_alloc_ldt,
+ .free_ldt = xen_free_ldt,
+
.store_gdt = native_store_gdt,
.store_idt = native_store_idt,
.store_tr = xen_store_tr,
@@ -1241,40 +1261,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
},
};
-static void __init __xen_init_IRQ(void)
-{
-#ifdef CONFIG_X86_64
- int i;
-
- /* Create identity vector->irq map */
- for(i = 0; i < NR_VECTORS; i++) {
- int cpu;
-
- for_each_possible_cpu(cpu)
- per_cpu(vector_irq, cpu)[i] = i;
- }
-#endif /* CONFIG_X86_64 */
-
- xen_init_IRQ();
-}
-
-static const struct pv_irq_ops xen_irq_ops __initdata = {
- .init_IRQ = __xen_init_IRQ,
- .save_fl = xen_save_fl,
- .restore_fl = xen_restore_fl,
- .irq_disable = xen_irq_disable,
- .irq_enable = xen_irq_enable,
- .safe_halt = xen_safe_halt,
- .halt = xen_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = xen_adjust_exception_frame,
-#endif
-};
-
static const struct pv_apic_ops xen_apic_ops __initdata = {
#ifdef CONFIG_X86_LOCAL_APIC
- .apic_write = xen_apic_write,
- .apic_read = xen_apic_read,
.setup_boot_clock = paravirt_nop,
.setup_secondary_clock = paravirt_nop,
.startup_ipi_hook = paravirt_nop,
@@ -1413,7 +1401,7 @@ static void __init xen_reserve_top(void)
if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
top = pp.virt_start;
- reserve_top_address(-top + 2 * PAGE_SIZE);
+ reserve_top_address(-top);
#endif /* CONFIG_X86_32 */
}
@@ -1447,48 +1435,11 @@ static void *m2v(phys_addr_t maddr)
return __ka(m2p(maddr));
}
-#ifdef CONFIG_X86_64
-static void walk(pgd_t *pgd, unsigned long addr)
-{
- unsigned l4idx = pgd_index(addr);
- unsigned l3idx = pud_index(addr);
- unsigned l2idx = pmd_index(addr);
- unsigned l1idx = pte_index(addr);
- pgd_t l4;
- pud_t l3;
- pmd_t l2;
- pte_t l1;
-
- xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
- pgd, addr, l4idx, l3idx, l2idx, l1idx);
-
- l4 = pgd[l4idx];
- xen_raw_printk(" l4: %016lx\n", l4.pgd);
- xen_raw_printk(" %016lx\n", pgd_val(l4));
-
- l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
- xen_raw_printk(" l3: %016lx\n", l3.pud);
- xen_raw_printk(" %016lx\n", pud_val(l3));
-
- l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
- xen_raw_printk(" l2: %016lx\n", l2.pmd);
- xen_raw_printk(" %016lx\n", pmd_val(l2));
-
- l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
- xen_raw_printk(" l1: %016lx\n", l1.pte);
- xen_raw_printk(" %016lx\n", pte_val(l1));
-}
-#endif
-
static void set_page_prot(void *addr, pgprot_t prot)
{
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
pte_t pte = pfn_pte(pfn, prot);
- xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
- addr, pfn, get_phys_to_machine(pfn),
- pgprot_val(prot), pte.pte);
-
if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
BUG();
}
@@ -1664,6 +1615,8 @@ asmlinkage void __init xen_start_kernel(void)
if (!xen_start_info)
return;
+ xen_domain_type = XEN_PV_DOMAIN;
+
BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
xen_setup_features();
@@ -1673,10 +1626,18 @@ asmlinkage void __init xen_start_kernel(void)
pv_init_ops = xen_init_ops;
pv_time_ops = xen_time_ops;
pv_cpu_ops = xen_cpu_ops;
- pv_irq_ops = xen_irq_ops;
pv_apic_ops = xen_apic_ops;
pv_mmu_ops = xen_mmu_ops;
+ xen_init_irq_ops();
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * set up the basic apic ops.
+ */
+ apic_ops = &xen_basic_apic_ops;
+#endif
+
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1700,7 +1661,7 @@ asmlinkage void __init xen_start_kernel(void)
/* Prevent unwanted bits from being set in PTEs. */
__supported_pte_mask &= ~_PAGE_GLOBAL;
- if (!is_initial_xendomain())
+ if (!xen_initial_domain())
__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
/* Don't do the full vcpu_info placement stuff until we have a
@@ -1735,7 +1696,7 @@ asmlinkage void __init xen_start_kernel(void)
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
- if (!is_initial_xendomain()) {
+ if (!xen_initial_domain()) {
add_preferred_console("xenboot", 0, NULL);
add_preferred_console("tty", 0, NULL);
add_preferred_console("hvc", 0, NULL);
@@ -1743,15 +1704,6 @@ asmlinkage void __init xen_start_kernel(void)
xen_raw_console_write("about to get started...\n");
-#if 0
- xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
- &boot_params, __pa_symbol(&boot_params),
- __va(__pa_symbol(&boot_params)));
-
- walk(pgd, &boot_params);
- walk(pgd, __va(__pa(&boot_params)));
-#endif
-
/* Start the world */
#ifdef CONFIG_X86_32
i386_start_kernel();
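
The xen_basic_apic_ops table added above is a null-object implementation:
under a PV guest the hypervisor owns the local APIC, so reads return 0 and
writes only warn. A compact standalone sketch of the same pattern (the struct
shape is pared down from the hunk above; the null_* names are invented):

#include <stdio.h>
#include <stdint.h>

struct apic_ops_sketch {
	uint32_t (*read)(uint32_t reg);
	void (*write)(uint32_t reg, uint32_t val);
};

static uint32_t null_apic_read(uint32_t reg) { (void)reg; return 0; }

static void null_apic_write(uint32_t reg, uint32_t val)
{
	(void)val;
	fprintf(stderr, "stray APIC write, reg %#x\n", reg);
}

/* One pointer swap retargets every APIC access in the system. */
static const struct apic_ops_sketch null_apic_ops = {
	.read	= null_apic_read,
	.write	= null_apic_write,
};

int main(void)
{
	const struct apic_ops_sketch *ops = &null_apic_ops;

	printf("read -> %u\n", ops->read(0x20));	/* always 0 */
	ops->write(0x300, 0);
	return 0;
}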
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 00000000000..bb042608c60
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,141 @@
+#include <linux/hardirq.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/vcpu.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void xen_force_evtchn_callback(void)
+{
+ (void)HYPERVISOR_xen_version(0, NULL);
+}
+
+static void __init __xen_init_IRQ(void)
+{
+ int i;
+
+ /* Create identity vector->irq map */
+ for(i = 0; i < NR_VECTORS; i++) {
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(vector_irq, cpu)[i] = i;
+ }
+
+ xen_init_IRQ();
+}
+
+static unsigned long xen_save_fl(void)
+{
+ struct vcpu_info *vcpu;
+ unsigned long flags;
+
+ vcpu = x86_read_percpu(xen_vcpu);
+
+ /* flag has opposite sense of mask */
+ flags = !vcpu->evtchn_upcall_mask;
+
+ /* convert to IF type flag
+ -0 -> 0x00000000
+ -1 -> 0xffffffff
+ */
+ return (-flags) & X86_EFLAGS_IF;
+}
+
+static void xen_restore_fl(unsigned long flags)
+{
+ struct vcpu_info *vcpu;
+
+ /* convert from IF type flag */
+ flags = !(flags & X86_EFLAGS_IF);
+
+ /* There's a one instruction preempt window here. We need to
+ make sure we don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ vcpu = x86_read_percpu(xen_vcpu);
+ vcpu->evtchn_upcall_mask = flags;
+ preempt_enable_no_resched();
+
+ /* Doesn't matter if we get preempted here, because any
+ pending event will get dealt with anyway. */
+
+ if (flags == 0) {
+ preempt_check_resched();
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ xen_force_evtchn_callback();
+ }
+}
+
+static void xen_irq_disable(void)
+{
+ /* There's a one instruction preempt window here. We need to
+ make sure we don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+ preempt_enable_no_resched();
+}
+
+static void xen_irq_enable(void)
+{
+ struct vcpu_info *vcpu;
+
+ /* We don't need to worry about being preempted here, since
+ either a) interrupts are disabled, so no preemption, or b)
+ the caller is confused and is trying to re-enable interrupts
+ on an indeterminate processor. */
+
+ vcpu = x86_read_percpu(xen_vcpu);
+ vcpu->evtchn_upcall_mask = 0;
+
+ /* Doesn't matter if we get preempted here, because any
+ pending event will get dealt with anyway. */
+
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ xen_force_evtchn_callback();
+}
+
+static void xen_safe_halt(void)
+{
+ /* Blocking includes an implicit local_irq_enable(). */
+ if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
+ BUG();
+}
+
+static void xen_halt(void)
+{
+ if (irqs_disabled())
+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+ else
+ xen_safe_halt();
+}
+
+static const struct pv_irq_ops xen_irq_ops __initdata = {
+ .init_IRQ = __xen_init_IRQ,
+ .save_fl = xen_save_fl,
+ .restore_fl = xen_restore_fl,
+ .irq_disable = xen_irq_disable,
+ .irq_enable = xen_irq_enable,
+ .safe_halt = xen_safe_halt,
+ .halt = xen_halt,
+#ifdef CONFIG_X86_64
+ .adjust_exception_frame = xen_adjust_exception_frame,
+#endif
+};
+
+void __init xen_init_irq_ops(void)
+{
+ pv_irq_ops = xen_irq_ops;
+}
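
The save_fl/restore_fl pair above hinges on evtchn_upcall_mask having the
opposite sense of EFLAGS.IF: mask == 1 means events blocked. A worked
standalone example of the (-flags) & X86_EFLAGS_IF conversion (mask_to_eflags
is an invented name for the sketch):

#include <assert.h>

#define X86_EFLAGS_IF 0x00000200	/* interrupt enable flag */

/* !mask yields 0 or 1; negating yields 0 or ~0UL; the AND then leaves
 * either nothing or exactly the IF bit. */
static unsigned long mask_to_eflags(unsigned char mask)
{
	unsigned long flags = !mask;

	return (-flags) & X86_EFLAGS_IF;
}

int main(void)
{
	assert(mask_to_eflags(1) == 0);			/* blocked: IF clear */
	assert(mask_to_eflags(0) == X86_EFLAGS_IF);	/* open: IF set */
	return 0;
}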
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da69..d4d52f5a1cf 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,6 +40,7 @@
*/
#include <linux/sched.h>
#include <linux/highmem.h>
+#include <linux/debugfs.h>
#include <linux/bug.h>
#include <asm/pgtable.h>
@@ -57,6 +58,61 @@
#include "multicalls.h"
#include "mmu.h"
+#include "debugfs.h"
+
+#define MMU_UPDATE_HISTO 30
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct {
+ u32 pgd_update;
+ u32 pgd_update_pinned;
+ u32 pgd_update_batched;
+
+ u32 pud_update;
+ u32 pud_update_pinned;
+ u32 pud_update_batched;
+
+ u32 pmd_update;
+ u32 pmd_update_pinned;
+ u32 pmd_update_batched;
+
+ u32 pte_update;
+ u32 pte_update_pinned;
+ u32 pte_update_batched;
+
+ u32 mmu_update;
+ u32 mmu_update_extended;
+ u32 mmu_update_histo[MMU_UPDATE_HISTO];
+
+ u32 prot_commit;
+ u32 prot_commit_batched;
+
+ u32 set_pte_at;
+ u32 set_pte_at_batched;
+ u32 set_pte_at_pinned;
+ u32 set_pte_at_current;
+ u32 set_pte_at_kernel;
+} mmu_stats;
+
+static u8 zero_stats;
+
+static inline void check_zero(void)
+{
+ if (unlikely(zero_stats)) {
+ memset(&mmu_stats, 0, sizeof(mmu_stats));
+ zero_stats = 0;
+ }
+}
+
+#define ADD_STATS(elem, val) \
+ do { check_zero(); mmu_stats.elem += (val); } while(0)
+
+#else /* !CONFIG_XEN_DEBUG_FS */
+
+#define ADD_STATS(elem, val) do { (void)(val); } while(0)
+
+#endif /* CONFIG_XEN_DEBUG_FS */
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
}
-static bool page_pinned(void *ptr)
+static bool xen_page_pinned(void *ptr)
{
struct page *page = virt_to_page(ptr);
return PagePinned(page);
}
-static void extend_mmu_update(const struct mmu_update *update)
+static void xen_extend_mmu_update(const struct mmu_update *update)
{
struct multicall_space mcs;
struct mmu_update *u;
mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
- if (mcs.mc != NULL)
+ if (mcs.mc != NULL) {
+ ADD_STATS(mmu_update_extended, 1);
+ ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
+
mcs.mc->args[1]++;
- else {
+
+ if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
+ ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
+ else
+ ADD_STATS(mmu_update_histo[0], 1);
+ } else {
+ ADD_STATS(mmu_update, 1);
mcs = __xen_mc_entry(sizeof(*u));
MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+ ADD_STATS(mmu_update_histo[1], 1);
}
u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
/* ptr may be ioremapped for 64-bit pagetable setup */
u.ptr = arbitrary_virt_to_machine(ptr).maddr;
u.val = pmd_val_ma(val);
- extend_mmu_update(&u);
+ xen_extend_mmu_update(&u);
+
+ ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
xen_mc_issue(PARAVIRT_LAZY_MMU);
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
+ ADD_STATS(pmd_update, 1);
+
/* If page is not pinned, we can just update the entry
directly */
- if (!page_pinned(ptr)) {
+ if (!xen_page_pinned(ptr)) {
*ptr = val;
return;
}
+ ADD_STATS(pmd_update_pinned, 1);
+
xen_set_pmd_hyper(ptr, val);
}
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
if (mm == &init_mm)
preempt_disable();
+ ADD_STATS(set_pte_at, 1);
+// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
+ ADD_STATS(set_pte_at_current, mm == current->mm);
+ ADD_STATS(set_pte_at_kernel, mm == &init_mm);
+
if (mm == current->mm || mm == &init_mm) {
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
struct multicall_space mcs;
mcs = xen_mc_entry(0);
MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
+ ADD_STATS(set_pte_at_batched, 1);
xen_mc_issue(PARAVIRT_LAZY_MMU);
goto out;
} else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
u.val = pte_val_ma(pte);
- extend_mmu_update(&u);
+ xen_extend_mmu_update(&u);
+
+ ADD_STATS(prot_commit, 1);
+ ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
/* ptr may be ioremapped for 64-bit pagetable setup */
u.ptr = arbitrary_virt_to_machine(ptr).maddr;
u.val = pud_val_ma(val);
- extend_mmu_update(&u);
+ xen_extend_mmu_update(&u);
+
+ ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
xen_mc_issue(PARAVIRT_LAZY_MMU);
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
void xen_set_pud(pud_t *ptr, pud_t val)
{
+ ADD_STATS(pud_update, 1);
+
/* If page is not pinned, we can just update the entry
directly */
- if (!page_pinned(ptr)) {
+ if (!xen_page_pinned(ptr)) {
*ptr = val;
return;
}
+ ADD_STATS(pud_update_pinned, 1);
+
xen_set_pud_hyper(ptr, val);
}
void xen_set_pte(pte_t *ptep, pte_t pte)
{
+ ADD_STATS(pte_update, 1);
+// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
+ ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
+
#ifdef CONFIG_X86_PAE
ptep->pte_high = pte.pte_high;
smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
u.ptr = virt_to_machine(ptr).maddr;
u.val = pgd_val_ma(val);
- extend_mmu_update(&u);
+ xen_extend_mmu_update(&u);
}
/*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
pgd_t *user_ptr = xen_get_user_pgd(ptr);
+ ADD_STATS(pgd_update, 1);
+
/* If page is not pinned, we can just update the entry
directly */
- if (!page_pinned(ptr)) {
+ if (!xen_page_pinned(ptr)) {
*ptr = val;
if (user_ptr) {
- WARN_ON(page_pinned(user_ptr));
+ WARN_ON(xen_page_pinned(user_ptr));
*user_ptr = val;
}
return;
}
+ ADD_STATS(pgd_update_pinned, 1);
+ ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
+
/* If it's pinned, then we can at least batch the kernel and
user updates together. */
xen_mc_batch();
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
* For 64-bit, we must skip the Xen hole in the middle of the address
* space, just after the big x86-64 virtual hole.
*/
-static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
- unsigned long limit)
+static int xen_pgd_walk(struct mm_struct *mm,
+ int (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
{
+ pgd_t *pgd = mm->pgd;
int flush = 0;
unsigned hole_low, hole_high;
unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
pmdidx_limit = 0;
#endif
- flush |= (*func)(virt_to_page(pgd), PT_PGD);
-
for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
pud_t *pud;
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
pud = pud_offset(&pgd[pgdidx], 0);
if (PTRS_PER_PUD > 1) /* not folded */
- flush |= (*func)(virt_to_page(pud), PT_PUD);
+ flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
pmd_t *pmd;
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
pmd = pmd_offset(&pud[pudidx], 0);
if (PTRS_PER_PMD > 1) /* not folded */
- flush |= (*func)(virt_to_page(pmd), PT_PMD);
+ flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
struct page *pte;
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
continue;
pte = pmd_page(pmd[pmdidx]);
- flush |= (*func)(pte, PT_PTE);
+ flush |= (*func)(mm, pte, PT_PTE);
}
}
}
+
out:
+ /* Do the top level last, so that the callbacks can use it as
+ a cue to do final things like tlb flushes. */
+ flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
return flush;
}
-static spinlock_t *lock_pte(struct page *page)
+/* If we're using split pte locks, then take the page's lock and
+ return a pointer to it. Otherwise return NULL. */
+static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
spinlock_t *ptl = NULL;
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if USE_SPLIT_PTLOCKS
ptl = __pte_lockptr(page);
- spin_lock(ptl);
+ spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif
return ptl;
}
-static void do_unlock(void *v)
+static void xen_pte_unlock(void *v)
{
spinlock_t *ptl = v;
spin_unlock(ptl);
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}
-static int pin_page(struct page *page, enum pt_level level)
+static int xen_pin_page(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
{
unsigned pgfl = TestSetPagePinned(page);
int flush;
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
flush = 0;
+ /*
+ * We need to hold the pagetable lock between the time
+ * we make the pagetable RO and when we actually pin
+ * it. If we don't, then other users may come in and
+ * attempt to update the pagetable by writing it,
+ * which will fail because the memory is RO but not
+ * pinned, so Xen won't do the trap'n'emulate.
+ *
+ * If we're using split pte locks, we can't hold the
+ * entire pagetable's worth of locks during the
+ * traverse, because we may wrap the preempt count (8
+ * bits). The solution is to mark RO and pin each PTE
+ * page while holding the lock. This means the number
+ * of locks we end up holding is never more than a
+ * batch size (~32 entries, at present).
+ *
+ * If we're not using split pte locks, we needn't pin
+ * the PTE pages independently, because we're
+ * protected by the overall pagetable lock.
+ */
ptl = NULL;
if (level == PT_PTE)
- ptl = lock_pte(page);
+ ptl = xen_pte_lock(page, mm);
MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
pfn_pte(pfn, PAGE_KERNEL_RO),
level == PT_PGD ? UVMF_TLB_FLUSH : 0);
- if (level == PT_PTE)
+ if (ptl) {
xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
- if (ptl) {
/* Queue a deferred unlock for when this batch
is completed. */
- xen_mc_callback(do_unlock, ptl);
+ xen_mc_callback(xen_pte_unlock, ptl);
}
}
@@ -715,14 +838,15 @@ static int pin_page(struct page *page, enum pt_level level)
/* This is called just after a mm has been created, but it has not
been used yet. We need to make sure that its pagetable is all
read-only, and can be pinned. */
-void xen_pgd_pin(pgd_t *pgd)
+static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
xen_mc_batch();
- if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
+ if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
/* re-enable interrupts for kmap_flush_unused */
xen_mc_issue(0);
kmap_flush_unused();
+ vm_unmap_aliases();
xen_mc_batch();
}
@@ -733,25 +857,35 @@ void xen_pgd_pin(pgd_t *pgd)
xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
if (user_pgd) {
- pin_page(virt_to_page(user_pgd), PT_PGD);
+ xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
}
}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
/* Need to make sure unshared kernel PMD is pinnable */
- pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+ xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
+ PT_PMD);
#endif
xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */
xen_mc_issue(0);
}
+static void xen_pgd_pin(struct mm_struct *mm)
+{
+ __xen_pgd_pin(mm, mm->pgd);
+}
+
/*
* On save, we need to pin all pagetables to make sure they get their
* mfns turned into pfns. Search the list for any unpinned pgds and pin
* them (unpinned pgds are not currently in use, probably because the
* process is under construction or destruction).
+ *
+ * Expected to be called in stop_machine() ("equivalent to taking
+ * every spinlock in the system"), so the locking doesn't really
+ * matter all that much.
*/
void xen_mm_pin_all(void)
{
@@ -762,7 +896,7 @@ void xen_mm_pin_all(void)
list_for_each_entry(page, &pgd_list, lru) {
if (!PagePinned(page)) {
- xen_pgd_pin((pgd_t *)page_address(page));
+ __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
SetPageSavePinned(page);
}
}
@@ -775,7 +909,8 @@ void xen_mm_pin_all(void)
* that's before we have page structures to store the bits. So do all
* the book-keeping now.
*/
-static __init int mark_pinned(struct page *page, enum pt_level level)
+static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
{
SetPagePinned(page);
return 0;
@@ -783,10 +918,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
void __init xen_mark_init_mm_pinned(void)
{
- pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+ xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
-static int unpin_page(struct page *page, enum pt_level level)
+static int xen_unpin_page(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
{
unsigned pgfl = TestClearPagePinned(page);
@@ -796,10 +932,18 @@ static int unpin_page(struct page *page, enum pt_level level)
spinlock_t *ptl = NULL;
struct multicall_space mcs;
+ /*
+ * Do the converse to pin_page. If we're using split
+ * pte locks, we must be holding the lock while
+ * the pte page is unpinned but still RO to prevent
+ * concurrent updates from seeing it in this
+ * partially-pinned state.
+ */
if (level == PT_PTE) {
- ptl = lock_pte(page);
+ ptl = xen_pte_lock(page, mm);
- xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+ if (ptl)
+ xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
}
mcs = __xen_mc_entry(0);
@@ -810,7 +954,7 @@ static int unpin_page(struct page *page, enum pt_level level)
if (ptl) {
/* unlock when batch completed */
- xen_mc_callback(do_unlock, ptl);
+ xen_mc_callback(xen_pte_unlock, ptl);
}
}
@@ -818,7 +962,7 @@ static int unpin_page(struct page *page, enum pt_level level)
}
/* Release a pagetable's pages back as normal RW */
-static void xen_pgd_unpin(pgd_t *pgd)
+static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
xen_mc_batch();
@@ -830,21 +974,27 @@ static void xen_pgd_unpin(pgd_t *pgd)
if (user_pgd) {
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
- unpin_page(virt_to_page(user_pgd), PT_PGD);
+ xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
}
}
#endif
#ifdef CONFIG_X86_PAE
/* Need to make sure unshared kernel PMD is unpinned */
- pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+ xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
+ PT_PMD);
#endif
- pgd_walk(pgd, unpin_page, USER_LIMIT);
+ xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
xen_mc_issue(0);
}
+static void xen_pgd_unpin(struct mm_struct *mm)
+{
+ __xen_pgd_unpin(mm, mm->pgd);
+}
+
/*
* On resume, undo any pinning done at save, so that the rest of the
* kernel doesn't see any unexpected pinned pagetables.
@@ -859,7 +1009,7 @@ void xen_mm_unpin_all(void)
list_for_each_entry(page, &pgd_list, lru) {
if (PageSavePinned(page)) {
BUG_ON(!PagePinned(page));
- xen_pgd_unpin((pgd_t *)page_address(page));
+ __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
ClearPageSavePinned(page);
}
}
@@ -870,14 +1020,14 @@ void xen_mm_unpin_all(void)
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
spin_lock(&next->page_table_lock);
- xen_pgd_pin(next->pgd);
+ xen_pgd_pin(next);
spin_unlock(&next->page_table_lock);
}
void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
- xen_pgd_pin(mm->pgd);
+ xen_pgd_pin(mm);
spin_unlock(&mm->page_table_lock);
}
@@ -907,7 +1057,7 @@ static void drop_other_mm_ref(void *info)
}
}
-static void drop_mm_ref(struct mm_struct *mm)
+static void xen_drop_mm_ref(struct mm_struct *mm)
{
cpumask_t mask;
unsigned cpu;
@@ -937,7 +1087,7 @@ static void drop_mm_ref(struct mm_struct *mm)
smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
-static void drop_mm_ref(struct mm_struct *mm)
+static void xen_drop_mm_ref(struct mm_struct *mm)
{
if (current->active_mm == mm)
load_cr3(swapper_pg_dir);
@@ -961,14 +1111,77 @@ static void drop_mm_ref(struct mm_struct *mm)
void xen_exit_mmap(struct mm_struct *mm)
{
get_cpu(); /* make sure we don't move around */
- drop_mm_ref(mm);
+ xen_drop_mm_ref(mm);
put_cpu();
spin_lock(&mm->page_table_lock);
/* pgd may not be pinned in the error exit path of execve */
- if (page_pinned(mm->pgd))
- xen_pgd_unpin(mm->pgd);
+ if (xen_page_pinned(mm->pgd))
+ xen_pgd_unpin(mm);
spin_unlock(&mm->page_table_lock);
}
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct dentry *d_mmu_debug;
+
+static int __init xen_mmu_debugfs(void)
+{
+ struct dentry *d_xen = xen_init_debugfs();
+
+ if (d_xen == NULL)
+ return -ENOMEM;
+
+ d_mmu_debug = debugfs_create_dir("mmu", d_xen);
+
+ debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
+
+ debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
+ debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
+ &mmu_stats.pgd_update_pinned);
+ debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
+ &mmu_stats.pgd_update_batched);
+
+ debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
+ debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
+ &mmu_stats.pud_update_pinned);
+ debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
+ &mmu_stats.pud_update_batched);
+
+ debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
+ debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
+ &mmu_stats.pmd_update_pinned);
+ debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
+ &mmu_stats.pmd_update_batched);
+
+ debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
+// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
+// &mmu_stats.pte_update_pinned);
+ debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
+ &mmu_stats.pte_update_batched);
+
+ debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
+ debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
+ &mmu_stats.mmu_update_extended);
+ xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
+ mmu_stats.mmu_update_histo, 20);
+
+ debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
+ debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
+ &mmu_stats.set_pte_at_batched);
+ debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
+ &mmu_stats.set_pte_at_current);
+ debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
+ &mmu_stats.set_pte_at_kernel);
+
+ debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
+ debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
+ &mmu_stats.prot_commit_batched);
+
+ return 0;
+}
+fs_initcall(xen_mmu_debugfs);
+
+#endif /* CONFIG_XEN_DEBUG_FS */
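
The zero_stats/check_zero pairing above lets userspace reset every counter by
writing 1 to a single debugfs u8: the reset is deferred to the next stat
update, so the write side needs no locking. A minimal userspace sketch of the
pattern (field names shortened from the struct above):

#include <stdio.h>
#include <string.h>

static struct { unsigned pgd_update; unsigned pte_update; } stats;
static unsigned char zero_stats;	/* set to 1 to request a reset */

static void check_zero(void)
{
	if (zero_stats) {
		memset(&stats, 0, sizeof(stats));
		zero_stats = 0;
	}
}

#define ADD_STATS(elem, val) \
	do { check_zero(); stats.elem += (val); } while (0)

int main(void)
{
	ADD_STATS(pgd_update, 1);
	zero_stats = 1;			/* request a reset */
	ADD_STATS(pte_update, 1);	/* reset happens here, then count */
	printf("pgd=%u pte=%u\n", stats.pgd_update, stats.pte_update); /* 0 1 */
	return 0;
}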
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 0f59bd03f9e..98d71659da5 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
void xen_exit_mmap(struct mm_struct *mm);
-void xen_pgd_pin(pgd_t *pgd);
-//void xen_pgd_unpin(pgd_t *pgd);
-
pteval_t xen_pte_val(pte_t);
pmdval_t xen_pmd_val(pmd_t);
pgdval_t xen_pgd_val(pgd_t);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 9efd1c6c977..8ea8a0d0b0d 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,16 +21,20 @@
*/
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/debugfs.h>
#include <asm/xen/hypercall.h>
#include "multicalls.h"
+#include "debugfs.h"
+
+#define MC_BATCH 32
#define MC_DEBUG 1
-#define MC_BATCH 32
#define MC_ARGS (MC_BATCH * 16)
+
struct mc_buffer {
struct multicall_entry entries[MC_BATCH];
#if MC_DEBUG
@@ -47,6 +51,76 @@ struct mc_buffer {
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
+/* flush reasons 0- slots, 1- args, 2- callbacks */
+enum flush_reasons
+{
+ FL_SLOTS,
+ FL_ARGS,
+ FL_CALLBACKS,
+
+ FL_N_REASONS
+};
+
+#ifdef CONFIG_XEN_DEBUG_FS
+#define NHYPERCALLS 40 /* not really */
+
+static struct {
+ unsigned histo[MC_BATCH+1];
+
+ unsigned issued;
+ unsigned arg_total;
+ unsigned hypercalls;
+ unsigned histo_hypercalls[NHYPERCALLS];
+
+ unsigned flush[FL_N_REASONS];
+} mc_stats;
+
+static u8 zero_stats;
+
+static inline void check_zero(void)
+{
+ if (unlikely(zero_stats)) {
+ memset(&mc_stats, 0, sizeof(mc_stats));
+ zero_stats = 0;
+ }
+}
+
+static void mc_add_stats(const struct mc_buffer *mc)
+{
+ int i;
+
+ check_zero();
+
+ mc_stats.issued++;
+ mc_stats.hypercalls += mc->mcidx;
+ mc_stats.arg_total += mc->argidx;
+
+ mc_stats.histo[mc->mcidx]++;
+ for(i = 0; i < mc->mcidx; i++) {
+ unsigned op = mc->entries[i].op;
+ if (op < NHYPERCALLS)
+ mc_stats.histo_hypercalls[op]++;
+ }
+}
+
+static void mc_stats_flush(enum flush_reasons idx)
+{
+ check_zero();
+
+ mc_stats.flush[idx]++;
+}
+
+#else /* !CONFIG_XEN_DEBUG_FS */
+
+static inline void mc_add_stats(const struct mc_buffer *mc)
+{
+}
+
+static inline void mc_stats_flush(enum flush_reasons idx)
+{
+}
+#endif /* CONFIG_XEN_DEBUG_FS */
+
void xen_mc_flush(void)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
something in the middle */
local_irq_save(flags);
+ mc_add_stats(b);
+
if (b->mcidx) {
#if MC_DEBUG
memcpy(b->debug, b->entries,
@@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args)
if (b->mcidx == MC_BATCH ||
(argidx + args) > MC_ARGS) {
+ mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
xen_mc_flush();
argidx = roundup(b->argidx, sizeof(u64));
}
@@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct callback *cb;
- if (b->cbidx == MC_BATCH)
+ if (b->cbidx == MC_BATCH) {
+ mc_stats_flush(FL_CALLBACKS);
xen_mc_flush();
+ }
cb = &b->callbacks[b->cbidx++];
cb->fn = fn;
cb->data = data;
}
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct dentry *d_mc_debug;
+
+static int __init xen_mc_debugfs(void)
+{
+ struct dentry *d_xen = xen_init_debugfs();
+
+ if (d_xen == NULL)
+ return -ENOMEM;
+
+ d_mc_debug = debugfs_create_dir("multicalls", d_xen);
+
+ debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
+
+ debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
+ debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
+ debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
+
+ xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
+ mc_stats.histo, MC_BATCH);
+ xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
+ mc_stats.histo_hypercalls, NHYPERCALLS);
+ xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
+ mc_stats.flush, FL_N_REASONS);
+
+ return 0;
+}
+fs_initcall(xen_mc_debugfs);
+
+#endif /* CONFIG_XEN_DEBUG_FS */
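
The flush_reasons counters above record why a batch was submitted early:
entry slots full, shared argument space full, or callback slots full. A
standalone sketch mirroring the check in __xen_mc_entry() (needs_flush is an
invented name; the constants are taken from the hunk above):

#include <stdio.h>

#define MC_BATCH 32
#define MC_ARGS  (MC_BATCH * 16)

enum flush_reasons { FL_SLOTS, FL_ARGS, FL_CALLBACKS, FL_N_REASONS };

static unsigned flush_count[FL_N_REASONS];

/* Flush when either the entry slots or the argument buffer would overflow. */
static int needs_flush(unsigned mcidx, unsigned argidx, unsigned args,
		       enum flush_reasons *why)
{
	if (mcidx == MC_BATCH) {
		*why = FL_SLOTS;
		return 1;
	}
	if (argidx + args > MC_ARGS) {
		*why = FL_ARGS;
		return 1;
	}
	return 0;
}

int main(void)
{
	enum flush_reasons why;

	if (needs_flush(MC_BATCH, 0, 16, &why))
		flush_count[why]++;
	printf("slot flushes: %u\n", flush_count[FL_SLOTS]);	/* 1 */
	return 0;
}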
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d8faf79a0a1..d77da613b1d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,11 +11,8 @@
* useful topology information for the kernel to make use of. As a
* result, all CPUs are treated as if they're single-core and
* single-threaded.
- *
- * This does not handle HOTPLUG_CPU yet.
*/
#include <linux/sched.h>
-#include <linux/kernel_stat.h>
#include <linux/err.h>
#include <linux/smp.h>
@@ -36,8 +33,6 @@
#include "xen-ops.h"
#include "mmu.h"
-static void __cpuinit xen_init_lock_cpu(int cpu);
-
cpumask_t xen_cpu_initialized_map;
static DEFINE_PER_CPU(int, resched_irq);
@@ -64,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
-static __cpuinit void cpu_bringup_and_idle(void)
+static __cpuinit void cpu_bringup(void)
{
int cpu = smp_processor_id();
cpu_init();
+ touch_softlockup_watchdog();
preempt_disable();
xen_enable_sysenter();
@@ -89,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void)
local_irq_enable();
wmb(); /* make sure everything is out */
+}
+
+static __cpuinit void cpu_bringup_and_idle(void)
+{
+ cpu_bringup();
cpu_idle();
}
@@ -212,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
cpu_set(cpu, cpu_present_map);
}
-
- //init_xenbus_allowed_cpumask();
}
static __cpuinit int
@@ -281,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
struct task_struct *idle = idle_task(cpu);
int rc;
-#if 0
- rc = cpu_up_check(cpu);
- if (rc)
- return rc;
-#endif
-
#ifdef CONFIG_X86_64
/* Allocate node local memory for AP pdas */
WARN_ON(cpu == 0);
@@ -339,6 +332,60 @@ static void xen_smp_cpus_done(unsigned int max_cpus)
{
}
+#ifdef CONFIG_HOTPLUG_CPU
+static int xen_cpu_disable(void)
+{
+ unsigned int cpu = smp_processor_id();
+ if (cpu == 0)
+ return -EBUSY;
+
+ cpu_disable_common();
+
+ load_cr3(swapper_pg_dir);
+ return 0;
+}
+
+static void xen_cpu_die(unsigned int cpu)
+{
+ while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(HZ/10);
+ }
+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
+
+ if (num_online_cpus() == 1)
+ alternatives_smp_switch(0);
+}
+
+static void xen_play_dead(void)
+{
+ play_dead_common();
+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+ cpu_bringup();
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static int xen_cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+static void xen_cpu_die(unsigned int cpu)
+{
+ BUG();
+}
+
+static void xen_play_dead(void)
+{
+ BUG();
+}
+
+#endif
static void stop_self(void *v)
{
int cpu = smp_processor_id();
@@ -419,176 +466,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
-struct xen_spinlock {
- unsigned char lock; /* 0 -> free; 1 -> locked */
- unsigned short spinners; /* count of waiting cpus */
-};
-
-static int xen_spin_is_locked(struct raw_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- return xl->lock != 0;
-}
-
-static int xen_spin_is_contended(struct raw_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- /* Not strictly true; this is only the count of contended
- lock-takers entering the slow path. */
- return xl->spinners != 0;
-}
-
-static int xen_spin_trylock(struct raw_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- u8 old = 1;
-
- asm("xchgb %b0,%1"
- : "+q" (old), "+m" (xl->lock) : : "memory");
-
- return old == 0;
-}
-
-static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
-
-static inline void spinning_lock(struct xen_spinlock *xl)
-{
- __get_cpu_var(lock_spinners) = xl;
- wmb(); /* set lock of interest before count */
- asm(LOCK_PREFIX " incw %0"
- : "+m" (xl->spinners) : : "memory");
-}
-
-static inline void unspinning_lock(struct xen_spinlock *xl)
-{
- asm(LOCK_PREFIX " decw %0"
- : "+m" (xl->spinners) : : "memory");
- wmb(); /* decrement count before clearing lock */
- __get_cpu_var(lock_spinners) = NULL;
-}
-
-static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- int irq = __get_cpu_var(lock_kicker_irq);
- int ret;
-
- /* If kicker interrupts not initialized yet, just spin */
- if (irq == -1)
- return 0;
-
- /* announce we're spinning */
- spinning_lock(xl);
-
- /* clear pending */
- xen_clear_irq_pending(irq);
-
- /* check again make sure it didn't become free while
- we weren't looking */
- ret = xen_spin_trylock(lock);
- if (ret)
- goto out;
-
- /* block until irq becomes pending */
- xen_poll_irq(irq);
- kstat_this_cpu.irqs[irq]++;
-
-out:
- unspinning_lock(xl);
- return ret;
-}
-
-static void xen_spin_lock(struct raw_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- int timeout;
- u8 oldval;
-
- do {
- timeout = 1 << 10;
-
- asm("1: xchgb %1,%0\n"
- " testb %1,%1\n"
- " jz 3f\n"
- "2: rep;nop\n"
- " cmpb $0,%0\n"
- " je 1b\n"
- " dec %2\n"
- " jnz 2b\n"
- "3:\n"
- : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
- : "1" (1)
- : "memory");
-
- } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
-}
-
-static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- /* XXX should mix up next cpu selection */
- if (per_cpu(lock_spinners, cpu) == xl) {
- xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
- break;
- }
- }
-}
-
-static void xen_spin_unlock(struct raw_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- smp_wmb(); /* make sure no writes get moved after unlock */
- xl->lock = 0; /* release lock */
-
- /* make sure unlock happens before kick */
- barrier();
-
- if (unlikely(xl->spinners))
- xen_spin_unlock_slow(xl);
-}
-
-static __cpuinit void xen_init_lock_cpu(int cpu)
-{
- int irq;
- const char *name;
-
- name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
- irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
- cpu,
- xen_reschedule_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
- name,
- NULL);
-
- if (irq >= 0) {
- disable_irq(irq); /* make sure it's never delivered */
- per_cpu(lock_kicker_irq, cpu) = irq;
- }
-
- printk("cpu %d spinlock event irq %d\n", cpu, irq);
-}
-
-static void __init xen_init_spinlocks(void)
-{
- pv_lock_ops.spin_is_locked = xen_spin_is_locked;
- pv_lock_ops.spin_is_contended = xen_spin_is_contended;
- pv_lock_ops.spin_lock = xen_spin_lock;
- pv_lock_ops.spin_trylock = xen_spin_trylock;
- pv_lock_ops.spin_unlock = xen_spin_unlock;
-}
-
static const struct smp_ops xen_smp_ops __initdata = {
.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_smp_prepare_cpus,
- .cpu_up = xen_cpu_up,
.smp_cpus_done = xen_smp_cpus_done,
+ .cpu_up = xen_cpu_up,
+ .cpu_die = xen_cpu_die,
+ .cpu_disable = xen_cpu_disable,
+ .play_dead = xen_play_dead,
+
.smp_send_stop = xen_smp_send_stop,
.smp_send_reschedule = xen_smp_send_reschedule,
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 00000000000..5601506f2dd
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
+/*
+ * Split spinlock implementation out into its own file, so it can be
+ * compiled in a FTRACE-compatible way.
+ */
+#include <linux/kernel_stat.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/log2.h>
+
+#include <asm/paravirt.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "debugfs.h"
+
+#ifdef CONFIG_XEN_DEBUG_FS
+static struct xen_spinlock_stats
+{
+ u64 taken;
+ u32 taken_slow;
+ u32 taken_slow_nested;
+ u32 taken_slow_pickup;
+ u32 taken_slow_spurious;
+ u32 taken_slow_irqenable;
+
+ u64 released;
+ u32 released_slow;
+ u32 released_slow_kicked;
+
+#define HISTO_BUCKETS 30
+ u32 histo_spin_total[HISTO_BUCKETS+1];
+ u32 histo_spin_spinning[HISTO_BUCKETS+1];
+ u32 histo_spin_blocked[HISTO_BUCKETS+1];
+
+ u64 time_total;
+ u64 time_spinning;
+ u64 time_blocked;
+} spinlock_stats;
+
+static u8 zero_stats;
+
+static unsigned lock_timeout = 1 << 10;
+#define TIMEOUT lock_timeout
+
+static inline void check_zero(void)
+{
+ if (unlikely(zero_stats)) {
+ memset(&spinlock_stats, 0, sizeof(spinlock_stats));
+ zero_stats = 0;
+ }
+}
+
+#define ADD_STATS(elem, val) \
+ do { check_zero(); spinlock_stats.elem += (val); } while (0)
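
/*
 * A minimal standalone sketch of the lazy-reset pattern used by
 * zero_stats/ADD_STATS above (illustration, not from the patch):
 * writing the debugfs knob only raises a flag, and the next counter
 * update performs the actual clear, so a reader never races a memset.
 */
static unsigned char reset_requested;	/* stands in for zero_stats */
static unsigned long long event_count;	/* stands in for one stats field */

static void count_event_sketch(void)
{
	if (reset_requested) {		/* clear lazily, on next update */
		event_count = 0;
		reset_requested = 0;
	}
	event_count++;
}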
+
+static inline u64 spin_time_start(void)
+{
+ return xen_clocksource_read();
+}
+
+static void __spin_time_accum(u64 delta, u32 *array)
+{
+ unsigned index = ilog2(delta);
+
+ check_zero();
+
+ if (index < HISTO_BUCKETS)
+ array[index]++;
+ else
+ array[HISTO_BUCKETS]++;
+}
+
+static inline void spin_time_accum_spinning(u64 start)
+{
+ u32 delta = xen_clocksource_read() - start;
+
+ __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
+ spinlock_stats.time_spinning += delta;
+}
+
+static inline void spin_time_accum_total(u64 start)
+{
+ u32 delta = xen_clocksource_read() - start;
+
+ __spin_time_accum(delta, spinlock_stats.histo_spin_total);
+ spinlock_stats.time_total += delta;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+ u32 delta = xen_clocksource_read() - start;
+
+ __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
+ spinlock_stats.time_blocked += delta;
+}
+#else /* !CONFIG_XEN_DEBUG_FS */
+#define TIMEOUT (1 << 10)
+#define ADD_STATS(elem, val) do { (void)(val); } while (0)
+
+static inline u64 spin_time_start(void)
+{
+ return 0;
+}
+
+static inline void spin_time_accum_total(u64 start)
+{
+}
+static inline void spin_time_accum_spinning(u64 start)
+{
+}
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif /* CONFIG_XEN_DEBUG_FS */
+
+struct xen_spinlock {
+ unsigned char lock; /* 0 -> free; 1 -> locked */
+ unsigned short spinners; /* count of waiting cpus */
+};
+
+static int xen_spin_is_locked(struct raw_spinlock *lock)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+ return xl->lock != 0;
+}
+
+static int xen_spin_is_contended(struct raw_spinlock *lock)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+ /* Not strictly true; this is only the count of contended
+ lock-takers entering the slow path. */
+ return xl->spinners != 0;
+}
+
+static int xen_spin_trylock(struct raw_spinlock *lock)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+ u8 old = 1;
+
+ asm("xchgb %b0,%1"
+ : "+q" (old), "+m" (xl->lock) : : "memory");
+
+ return old == 0;
+}
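
/*
 * The trylock above is a single atomic xchgb: swap 1 into the lock
 * byte and succeed only if the old value was 0. A hedged sketch of
 * the same operation using the GCC __sync builtin instead of inline
 * asm (illustration, not from the patch):
 */
static int trylock_sketch(volatile unsigned char *lock)
{
	/* atomically store 1 and fetch the previous value, like xchgb */
	return __sync_lock_test_and_set(lock, 1) == 0;
}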
+
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
+
+/*
+ * Mark a cpu as interested in a lock. Returns the CPU's previous
+ * lock of interest, in case we got preempted by an interrupt.
+ */
+static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
+{
+ struct xen_spinlock *prev;
+
+ prev = __get_cpu_var(lock_spinners);
+ __get_cpu_var(lock_spinners) = xl;
+
+ wmb(); /* set lock of interest before count */
+
+ asm(LOCK_PREFIX " incw %0"
+ : "+m" (xl->spinners) : : "memory");
+
+ return prev;
+}
+
+/*
+ * Mark a cpu as no longer interested in a lock. Restores previous
+ * lock of interest (NULL for none).
+ */
+static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
+{
+ asm(LOCK_PREFIX " decw %0"
+ : "+m" (xl->spinners) : : "memory");
+ wmb(); /* decrement count before restoring lock */
+ __get_cpu_var(lock_spinners) = prev;
+}
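
/*
 * spinning_lock()/unspinning_lock() form a one-deep save/restore
 * stack: if an interrupt arrives while this cpu is already recorded
 * as spinning and its handler contends another lock, the inner lock
 * temporarily replaces the outer one and is popped on exit. A sketch
 * of that nesting pattern, with a plain pointer standing in for the
 * per-cpu lock_spinners slot (illustration, not from the patch):
 */
struct xen_spinlock;				/* as defined above */
static struct xen_spinlock *this_cpu_spinner;	/* per-cpu slot stand-in */

static void nesting_sketch(struct xen_spinlock *inner)
{
	struct xen_spinlock *outer = this_cpu_spinner;	/* may be NULL */

	this_cpu_spinner = inner;	/* announce the inner lock */
	/* ... poll/block on the inner lock here ... */
	this_cpu_spinner = outer;	/* pop back to the outer record */
}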
+
+static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+ struct xen_spinlock *prev;
+ int irq = __get_cpu_var(lock_kicker_irq);
+ int ret;
+ unsigned long flags;
+ u64 start;
+
+ /* If kicker interrupts not initialized yet, just spin */
+ if (irq == -1)
+ return 0;
+
+ start = spin_time_start();
+
+ /* announce we're spinning */
+ prev = spinning_lock(xl);
+
+ flags = __raw_local_save_flags();
+ if (irq_enable) {
+ ADD_STATS(taken_slow_irqenable, 1);
+ raw_local_irq_enable();
+ }
+
+ ADD_STATS(taken_slow, 1);
+ ADD_STATS(taken_slow_nested, prev != NULL);
+
+ do {
+ /* clear pending */
+ xen_clear_irq_pending(irq);
+
+ /* check again to make sure it didn't become free while
+ we weren't looking */
+ ret = xen_spin_trylock(lock);
+ if (ret) {
+ ADD_STATS(taken_slow_pickup, 1);
+
+ /*
+ * If we interrupted another spinlock while it
+ * was blocking, make sure it doesn't block
+ * without rechecking the lock.
+ */
+ if (prev != NULL)
+ xen_set_irq_pending(irq);
+ goto out;
+ }
+
+ /*
+ * Block until irq becomes pending. If we're
+ * interrupted at this point (after the trylock but
+ * before entering the block), then the nested lock
+ * handler guarantees that the irq will be left
+ * pending if there's any chance the lock became free;
+ * xen_poll_irq() returns immediately if the irq is
+ * pending.
+ */
+ xen_poll_irq(irq);
+ ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
+ } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
+
+ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
+
+out:
+ raw_local_irq_restore(flags);
+ unspinning_lock(xl, prev);
+ spin_time_accum_blocked(start);
+
+ return ret;
+}
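
/*
 * The slow path above is the classic "arm the wakeup, recheck, then
 * block" sequence, with one Xen-specific twist: a wakeup with no
 * pending event is treated as spurious and retried. A hedged sketch
 * of the same control flow against hypothetical helpers standing in
 * for the xen_*_irq_pending/xen_poll_irq calls (not from the patch):
 */
extern void event_clear(int irq);	/* like xen_clear_irq_pending() */
extern void event_wait(int irq);	/* like xen_poll_irq() */
extern int  event_test(int irq);	/* like xen_test_irq_pending() */
extern int  try_acquire(void *lock);	/* like xen_spin_trylock() */

static int slow_wait_sketch(void *lock, int irq)
{
	do {
		event_clear(irq);	/* arm the wakeup */
		if (try_acquire(lock))	/* recheck after arming */
			return 1;	/* picked the lock up directly */
		event_wait(irq);	/* block until kicked */
	} while (!event_test(irq));	/* no event: spurious, retry */
	return 0;			/* genuine kick: caller respins */
}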
+
+static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+ unsigned timeout;
+ u8 oldval;
+ u64 start_spin;
+
+ ADD_STATS(taken, 1);
+
+ start_spin = spin_time_start();
+
+ do {
+ u64 start_spin_fast = spin_time_start();
+
+ timeout = TIMEOUT;
+
+ asm("1: xchgb %1,%0\n"
+ " testb %1,%1\n"
+ " jz 3f\n"
+ "2: rep;nop\n"
+ " cmpb $0,%0\n"
+ " je 1b\n"
+ " dec %2\n"
+ " jnz 2b\n"
+ "3:\n"
+ : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
+ : "1" (1)
+ : "memory");
+
+ spin_time_accum_spinning(start_spin_fast);
+
+ } while (unlikely(oldval != 0 &&
+ (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
+
+ spin_time_accum_total(start_spin);
+}
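
/*
 * A C-level sketch of the fast path above, assuming GCC builtins in
 * place of the inline asm (the kernel keeps this in asm so the lock
 * byte is only ever touched by a single xchgb; not from the patch):
 */
static int spin_with_timeout_sketch(volatile unsigned char *lock,
				    unsigned timeout)
{
	while (__sync_lock_test_and_set(lock, 1) != 0) {
		do {
			if (--timeout == 0)
				return 0;	/* give up: slow path */
			__builtin_ia32_pause();	/* rep;nop */
		} while (*lock != 0);		/* spin until it looks free */
	}
	return 1;				/* acquired */
}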
+
+static void xen_spin_lock(struct raw_spinlock *lock)
+{
+ __xen_spin_lock(lock, false);
+}
+
+static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
+{
+ __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
+}
+
+static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+{
+ int cpu;
+
+ ADD_STATS(released_slow, 1);
+
+ for_each_online_cpu(cpu) {
+ /* XXX should mix up next cpu selection */
+ if (per_cpu(lock_spinners, cpu) == xl) {
+ ADD_STATS(released_slow_kicked, 1);
+ xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+ break;
+ }
+ }
+}
+
+static void xen_spin_unlock(struct raw_spinlock *lock)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+ ADD_STATS(released, 1);
+
+ smp_wmb(); /* make sure no writes get moved after unlock */
+ xl->lock = 0; /* release lock */
+
+ /* make sure unlock happens before kick */
+ barrier();
+
+ if (unlikely(xl->spinners))
+ xen_spin_unlock_slow(xl);
+}
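
/*
 * Unlock is a release store on the lock byte followed by a check of
 * the spinner count, ordered so a kick cannot be missed. A hedged
 * sketch using the __sync builtins (a full fence here is stronger
 * than the smp_wmb()/barrier() pair above), with a hypothetical
 * kick_one_spinner() standing in for xen_spin_unlock_slow() (not
 * from the patch):
 */
extern void kick_one_spinner(void);	/* stand-in for the IPI kick */

static void unlock_sketch(volatile unsigned char *lock,
			  volatile unsigned short *spinners)
{
	__sync_lock_release(lock);	/* lock byte -> 0, release order */
	__sync_synchronize();		/* store before spinner-count read */
	if (*spinners)			/* anyone parked in the slow path? */
		kick_one_spinner();
}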
+
+static irqreturn_t dummy_handler(int irq, void *dev_id)
+{
+ BUG();
+ return IRQ_HANDLED;
+}
+
+void __cpuinit xen_init_lock_cpu(int cpu)
+{
+ int irq;
+ const char *name;
+
+ name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+ irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
+ cpu,
+ dummy_handler,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ name,
+ NULL);
+
+ if (irq >= 0) {
+ disable_irq(irq); /* make sure it's never delivered */
+ per_cpu(lock_kicker_irq, cpu) = irq;
+ }
+
+ printk(KERN_DEBUG "cpu %d spinlock event irq %d\n", cpu, irq);
+}
+
+void xen_uninit_lock_cpu(int cpu)
+{
+ unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
+}
+
+void __init xen_init_spinlocks(void)
+{
+ pv_lock_ops.spin_is_locked = xen_spin_is_locked;
+ pv_lock_ops.spin_is_contended = xen_spin_is_contended;
+ pv_lock_ops.spin_lock = xen_spin_lock;
+ pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
+ pv_lock_ops.spin_trylock = xen_spin_trylock;
+ pv_lock_ops.spin_unlock = xen_spin_unlock;
+}
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct dentry *d_spin_debug;
+
+static int __init xen_spinlock_debugfs(void)
+{
+ struct dentry *d_xen = xen_init_debugfs();
+
+ if (d_xen == NULL)
+ return -ENOMEM;
+
+ d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
+
+ debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
+
+ debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
+
+ debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
+ debugfs_create_u32("taken_slow", 0444, d_spin_debug,
+ &spinlock_stats.taken_slow);
+ debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
+ &spinlock_stats.taken_slow_nested);
+ debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
+ &spinlock_stats.taken_slow_pickup);
+ debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
+ &spinlock_stats.taken_slow_spurious);
+ debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
+ &spinlock_stats.taken_slow_irqenable);
+
+ debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
+ debugfs_create_u32("released_slow", 0444, d_spin_debug,
+ &spinlock_stats.released_slow);
+ debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
+ &spinlock_stats.released_slow_kicked);
+
+ debugfs_create_u64("time_spinning", 0444, d_spin_debug,
+ &spinlock_stats.time_spinning);
+ debugfs_create_u64("time_blocked", 0444, d_spin_debug,
+ &spinlock_stats.time_blocked);
+ debugfs_create_u64("time_total", 0444, d_spin_debug,
+ &spinlock_stats.time_total);
+
+ xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
+ spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
+ xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
+ spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
+ xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
+ spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
+
+ return 0;
+}
+fs_initcall(xen_spinlock_debugfs);
+
+#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 685b77470fc..c9f7cda48ed 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
-static cycle_t xen_clocksource_read(void);
-
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
@@ -200,20 +198,13 @@ unsigned long long xen_sched_clock(void)
/* Get the TSC speed from Xen */
unsigned long xen_tsc_khz(void)
{
- u64 xen_khz = 1000000ULL << 32;
- const struct pvclock_vcpu_time_info *info =
+ struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
- do_div(xen_khz, info->tsc_to_system_mul);
- if (info->tsc_shift < 0)
- xen_khz <<= -info->tsc_shift;
- else
- xen_khz >>= info->tsc_shift;
-
- return xen_khz;
+ return pvclock_tsc_khz(info);
}
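
/*
 * pvclock_tsc_khz() folds in the computation the removed lines above
 * did by hand: invert the pvclock mul/shift scale factor to recover
 * the TSC frequency in kHz. A hedged standalone sketch of that math
 * (not from the patch; do_div() is the kernel's 64/32 division
 * helper, which divides its first argument in place):
 */
static unsigned long tsc_khz_sketch(u32 tsc_to_system_mul, s8 tsc_shift)
{
	u64 khz = 1000000ULL << 32;

	do_div(khz, tsc_to_system_mul);	/* undo the multiplier */
	if (tsc_shift < 0)		/* undo the pvclock shift */
		khz <<= -tsc_shift;
	else
		khz >>= tsc_shift;

	return khz;
}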
-static cycle_t xen_clocksource_read(void)
+cycle_t xen_clocksource_read(void)
{
struct pvclock_vcpu_time_info *src;
cycle_t ret;
@@ -452,6 +443,14 @@ void xen_setup_timer(int cpu)
setup_runstate_info(cpu);
}
+void xen_teardown_timer(int cpu)
+{
+ struct clock_event_device *evt;
+ BUG_ON(cpu == 0);
+ evt = &per_cpu(xen_clock_events, cpu);
+ unbind_from_irqhandler(evt->irq, NULL);
+}
+
void xen_setup_cpu_clockevents(void)
{
BUG_ON(preemptible());
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41d..42786f59d9c 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
push %eax
push %ecx
push %edx
- call force_evtchn_callback
+ call xen_force_evtchn_callback
pop %edx
pop %ecx
pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 7f58304fafb..05794c566e8 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -26,8 +26,15 @@
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
-#if 0
-#include <asm/percpu.h>
+#if 1
+/*
+ x86-64 does not yet support direct access to percpu variables
+ via a segment override, so we just need to make sure this code
+ never gets used
+ */
+#define BUG ud2a
+#define PER_CPU_VAR(var, off) 0xdeadbeef
+#endif
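
/*
 * BUG here expands to ud2a, so if one of these not-yet-supported
 * direct percpu paths is ever reached it traps immediately instead
 * of dereferencing the bogus 0xdeadbeef address. A C-level analogue
 * of the same fail-loudly guard (illustration, not from the patch):
 */
static void percpu_direct_stub(void)
{
	__builtin_trap();	/* emits ud2 on x86, like the BUG macro */
}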
/*
Enable events. This clears the event mask and tests the pending
@@ -35,6 +42,8 @@
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
+ BUG
+
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
@@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct)
non-zero.
*/
ENTRY(xen_irq_disable_direct)
+ BUG
+
movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
ENDPATCH(xen_irq_disable_direct)
ret
@@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct)
Xen and x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
+ BUG
+
testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
setz %ah
addb %ah,%ah
@@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct)
if so.
*/
ENTRY(xen_restore_fl_direct)
+ BUG
+
testb $X86_EFLAGS_IF>>8, %ah
setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
/* Preempt here doesn't matter because that will deal with
@@ -122,7 +137,7 @@ check_events:
push %r9
push %r10
push %r11
- call force_evtchn_callback
+ call xen_force_evtchn_callback
pop %r11
pop %r10
pop %r9
@@ -133,7 +148,6 @@ check_events:
pop %rcx
pop %rax
ret
-#endif
ENTRY(xen_adjust_exception_frame)
mov 8+0(%rsp),%rcx
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index dd3c23152a2..d7422dc2a55 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
#define XEN_OPS_H
#include <linux/init.h>
+#include <linux/clocksource.h>
#include <linux/irqreturn.h>
#include <xen/xen-ops.h>
@@ -31,7 +32,10 @@ void xen_vcpu_restore(void);
void __init xen_build_dynamic_phys_to_machine(void);
+void xen_init_irq_ops(void);
void xen_setup_timer(int cpu);
+void xen_teardown_timer(int cpu);
+cycle_t xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
unsigned long xen_tsc_khz(void);
void __init xen_time_init(void);
@@ -50,6 +54,10 @@ void __init xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP
void xen_smp_init(void);
+void __init xen_init_spinlocks(void);
+__cpuinit void xen_init_lock_cpu(int cpu);
+void xen_uninit_lock_cpu(int cpu);
+
extern cpumask_t xen_cpu_initialized_map;
#else
static inline void xen_smp_init(void) {}