Diffstat (limited to 'arch/i386/kernel')
-rw-r--r-- arch/i386/kernel/Makefile | 6
-rw-r--r-- arch/i386/kernel/acpi/boot.c | 30
-rw-r--r-- arch/i386/kernel/apic.c | 1627
-rw-r--r-- arch/i386/kernel/apm.c | 72
-rw-r--r-- arch/i386/kernel/asm-offsets.c | 2
-rw-r--r-- arch/i386/kernel/cpu/common.c | 14
-rw-r--r-- arch/i386/kernel/cpu/cyrix.c | 52
-rw-r--r-- arch/i386/kernel/cpu/mcheck/mce.c | 1
-rw-r--r-- arch/i386/kernel/cpu/mcheck/mce.h | 2
-rw-r--r-- arch/i386/kernel/cpu/mtrr/if.c | 32
-rw-r--r-- arch/i386/kernel/cpu/mtrr/main.c | 6
-rw-r--r-- arch/i386/kernel/cpu/mtrr/mtrr.h | 2
-rw-r--r-- arch/i386/kernel/cpu/proc.c | 14
-rw-r--r-- arch/i386/kernel/cpu/transmeta.c | 5
-rw-r--r-- arch/i386/kernel/cpuid.c | 9
-rw-r--r-- arch/i386/kernel/e820.c | 18
-rw-r--r-- arch/i386/kernel/entry.S | 78
-rw-r--r-- arch/i386/kernel/head.S | 40
-rw-r--r-- arch/i386/kernel/hpet.c | 498
-rw-r--r-- arch/i386/kernel/i8253.c | 96
-rw-r--r-- arch/i386/kernel/i8259.c | 7
-rw-r--r-- arch/i386/kernel/io_apic.c | 24
-rw-r--r-- arch/i386/kernel/irq.c | 22
-rw-r--r-- arch/i386/kernel/kprobes.c | 6
-rw-r--r-- arch/i386/kernel/microcode.c | 4
-rw-r--r-- arch/i386/kernel/msr.c | 15
-rw-r--r-- arch/i386/kernel/nmi.c | 107
-rw-r--r-- arch/i386/kernel/paravirt.c | 116
-rw-r--r-- arch/i386/kernel/pcspeaker.c | 20
-rw-r--r-- arch/i386/kernel/process.c | 49
-rw-r--r-- arch/i386/kernel/ptrace.c | 16
-rw-r--r-- arch/i386/kernel/setup.c | 39
-rw-r--r-- arch/i386/kernel/signal.c | 16
-rw-r--r-- arch/i386/kernel/smp.c | 5
-rw-r--r-- arch/i386/kernel/smpboot.c | 203
-rw-r--r-- arch/i386/kernel/sysenter.c | 2
-rw-r--r-- arch/i386/kernel/time.c | 138
-rw-r--r-- arch/i386/kernel/time_hpet.c | 497
-rw-r--r-- arch/i386/kernel/topology.c | 2
-rw-r--r-- arch/i386/kernel/traps.c | 27
-rw-r--r-- arch/i386/kernel/tsc.c | 195
-rw-r--r-- arch/i386/kernel/tsc_sync.c | 1
-rw-r--r-- arch/i386/kernel/vm86.c | 33
-rw-r--r-- arch/i386/kernel/vmi.c | 949
-rw-r--r-- arch/i386/kernel/vmitime.c | 499
-rw-r--r-- arch/i386/kernel/vmlinux.lds.S | 9
46 files changed, 3520 insertions, 2085 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1e8988e558c..4ae3dcf1d2f 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -18,7 +18,7 @@ obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_APM) += apm.o
-obj-$(CONFIG_X86_SMP) += smp.o smpboot.o
+obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
@@ -32,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_MODULES) += module.o
obj-y += sysenter.o vsyscall.o
obj-$(CONFIG_ACPI_SRAT) += srat.o
-obj-$(CONFIG_HPET_TIMER) += time_hpet.o
obj-$(CONFIG_EFI) += efi.o efi_stub.o
obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
obj-$(CONFIG_VM86) += vm86.o
@@ -40,8 +39,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
-# Make sure this is linked after any other paravirt_ops structs: see head.S
+obj-$(CONFIG_VMI) += vmi.o vmitime.o
obj-$(CONFIG_PARAVIRT) += paravirt.o
+obj-y += pcspeaker.o
EXTRA_AFLAGS := -traditional
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index e94aff6888c..e5eb97a910e 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -25,6 +25,7 @@
#include <linux/init.h>
#include <linux/acpi.h>
+#include <linux/acpi_pmtmr.h>
#include <linux/efi.h>
#include <linux/cpumask.h>
#include <linux/module.h>
@@ -615,6 +616,7 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table)
}
#ifdef CONFIG_HPET_TIMER
+#include <asm/hpet.h>
static int __init acpi_parse_hpet(struct acpi_table_header *table)
{
@@ -645,24 +647,11 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
hpet_res->end = (1 * 1024) - 1;
}
-#ifdef CONFIG_X86_64
- vxtime.hpet_address = hpet_tbl->address.address;
-
+ hpet_address = hpet_tbl->address.address;
printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
- hpet_tbl->id, vxtime.hpet_address);
-
- res_start = vxtime.hpet_address;
-#else /* X86 */
- {
- extern unsigned long hpet_address;
+ hpet_tbl->id, hpet_address);
- hpet_address = hpet_tbl->address.address;
- printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
- hpet_tbl->id, hpet_address);
-
- res_start = hpet_address;
- }
-#endif /* X86 */
+ res_start = hpet_address;
if (hpet_res) {
hpet_res->start = res_start;
@@ -676,10 +665,6 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
#define acpi_parse_hpet NULL
#endif
-#ifdef CONFIG_X86_PM_TIMER
-extern u32 pmtmr_ioport;
-#endif
-
static int __init acpi_parse_fadt(struct acpi_table_header *table)
{
@@ -865,10 +850,9 @@ static inline int acpi_parse_madt_ioapic_entries(void)
static void __init acpi_process_madt(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
- int count, error;
+ int error;
- count = acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt);
- if (count >= 1) {
+ if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
/*
* Parse MADT LAPIC entries
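The MADT hunk above drops the intermediate count variable because acpi_table_parse() returns 0 on success and non-zero on failure, so the result can be tested directly with '!'. A minimal sketch of that calling convention, assuming the 2.6.21-era signature (the example_* names are hypothetical, for illustration only):

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/acpi.h>

static int __init example_parse_madt(struct acpi_table_header *table)
{
	/* examine *table here; returning 0 tells the core the parse succeeded */
	return 0;
}

static void __init example_probe_madt(void)
{
	/* acpi_table_parse() returns 0 when the table was found and handled */
	if (!acpi_table_parse(ACPI_SIG_MADT, example_parse_madt))
		printk(KERN_INFO "MADT found and parsed\n");
	else
		printk(KERN_INFO "no MADT present\n");
}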
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 776d9be26af..7a2c9cbdb51 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -25,6 +25,8 @@
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
+#include <linux/clockchips.h>
+#include <linux/acpi_pmtmr.h>
#include <linux/module.h>
#include <asm/atomic.h>
@@ -44,128 +46,548 @@
#include "io_ports.h"
/*
- * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
- * IPIs in place of local APIC timers
+ * Sanity check
*/
-static cpumask_t timer_bcast_ipi;
+#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
+# error SPURIOUS_APIC_VECTOR definition error
+#endif
/*
* Knob to control our willingness to enable the local APIC.
+ *
+ * -1=force-disable, +1=force-enable
*/
-static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
-
-static inline void lapic_disable(void)
-{
- enable_local_apic = -1;
- clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-}
+static int enable_local_apic __initdata = 0;
-static inline void lapic_enable(void)
-{
- enable_local_apic = 1;
-}
+/* Local APIC timer verification ok */
+static int local_apic_timer_verify_ok;
/*
- * Debug level
+ * Debug level, exported for io_apic.c
*/
int apic_verbosity;
+static unsigned int calibration_result;
+static int lapic_next_event(unsigned long delta,
+ struct clock_event_device *evt);
+static void lapic_timer_setup(enum clock_event_mode mode,
+ struct clock_event_device *evt);
+static void lapic_timer_broadcast(cpumask_t mask);
static void apic_pm_activate(void);
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device lapic_clockevent = {
+ .name = "lapic",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
+ | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
+ .shift = 32,
+ .set_mode = lapic_timer_setup,
+ .set_next_event = lapic_next_event,
+ .broadcast = lapic_timer_broadcast,
+ .rating = 100,
+ .irq = -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+
+/*
+ * Get the LAPIC version
+ */
+static inline int lapic_get_version(void)
+{
+ return GET_APIC_VERSION(apic_read(APIC_LVR));
+}
+
+/*
+ * Check if the APIC is integrated or a separate chip
+ */
+static inline int lapic_is_integrated(void)
+{
+ return APIC_INTEGRATED(lapic_get_version());
+}
+
+/*
+ * Check whether this is a modern or a first-generation APIC
+ */
static int modern_apic(void)
{
- unsigned int lvr, version;
/* AMD systems use old APIC versions, so check the CPU */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 >= 0xf)
+ boot_cpu_data.x86 >= 0xf)
return 1;
- lvr = apic_read(APIC_LVR);
- version = GET_APIC_VERSION(lvr);
- return version >= 0x14;
+ return lapic_get_version() >= 0x14;
}
+/**
+ * enable_NMI_through_LVT0 - enable NMI through local vector table 0
+ */
+void enable_NMI_through_LVT0 (void * dummy)
+{
+ unsigned int v = APIC_DM_NMI;
+
+ /* Level triggered for 82489DX */
+ if (!lapic_is_integrated())
+ v |= APIC_LVT_LEVEL_TRIGGER;
+ apic_write_around(APIC_LVT0, v);
+}
+
+/**
+ * get_physical_broadcast - Get number of physical broadcast IDs
+ */
+int get_physical_broadcast(void)
+{
+ return modern_apic() ? 0xff : 0xf;
+}
+
+/**
+ * lapic_get_maxlvt - get the maximum number of local vector table entries
+ */
+int lapic_get_maxlvt(void)
+{
+ unsigned int v = apic_read(APIC_LVR);
+
+ /* 82489DXs do not report # of LVT entries. */
+ return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
+}
+
+/*
+ * Local APIC timer
+ */
+
+/* Clock divisor is set to 16 */
+#define APIC_DIVISOR 16
+
/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clock. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
*/
-void ack_bad_irq(unsigned int irq)
+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
{
- printk("unexpected IRQ trap at vector %02x\n", irq);
+ unsigned int lvtt_value, tmp_value;
+
+ lvtt_value = LOCAL_TIMER_VECTOR;
+ if (!oneshot)
+ lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+ if (!lapic_is_integrated())
+ lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+
+ if (!irqen)
+ lvtt_value |= APIC_LVT_MASKED;
+
+ apic_write_around(APIC_LVTT, lvtt_value);
+
/*
- * Currently unexpected vectors happen only on SMP and APIC.
- * We _must_ ack these because every local APIC has only N
- * irq slots per priority level, and a 'hanging, unacked' IRQ
- * holds up an irq slot - in excessive cases (when multiple
- * unexpected vectors occur) that might lock up the APIC
- * completely.
- * But only ack when the APIC is enabled -AK
+ * Divide PICLK by 16
*/
- if (cpu_has_apic)
- ack_APIC_irq();
+ tmp_value = apic_read(APIC_TDCR);
+ apic_write_around(APIC_TDCR, (tmp_value
+ & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
+ | APIC_TDR_DIV_16);
+
+ if (!oneshot)
+ apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
}
-void __init apic_intr_init(void)
+/*
+ * Program the next event, relative to now
+ */
+static int lapic_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ apic_write_around(APIC_TMICT, delta);
+ return 0;
+}
+
+/*
+ * Setup the lapic timer in periodic or oneshot mode
+ */
+static void lapic_timer_setup(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ unsigned long flags;
+ unsigned int v;
+
+ /* Lapic used for broadcast ? */
+ if (!local_apic_timer_verify_ok)
+ return;
+
+ local_irq_save(flags);
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ case CLOCK_EVT_MODE_ONESHOT:
+ __setup_APIC_LVTT(calibration_result,
+ mode != CLOCK_EVT_MODE_PERIODIC, 1);
+ break;
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ v = apic_read(APIC_LVTT);
+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write_around(APIC_LVTT, v);
+ break;
+ }
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Local APIC timer broadcast function
+ */
+static void lapic_timer_broadcast(cpumask_t mask)
{
#ifdef CONFIG_SMP
- smp_intr_init();
+ send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
#endif
- /* self generated IPI for local APIC timer */
- set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+}
- /* IPI vectors for APIC spurious and error interrupts */
- set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+/*
+ * Set up the local APIC timer for this CPU. Copy the initialized values
+ * from the boot CPU and register the clock event device in the framework.
+ */
+static void __devinit setup_APIC_timer(void)
+{
+ struct clock_event_device *levt = &__get_cpu_var(lapic_events);
- /* thermal monitor LVT interrupt */
-#ifdef CONFIG_X86_MCE_P4THERMAL
- set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
+ memcpy(levt, &lapic_clockevent, sizeof(*levt));
+ levt->cpumask = cpumask_of_cpu(smp_processor_id());
+
+ clockevents_register_device(levt);
}
-/* Using APIC to generate smp_local_timer_interrupt? */
-int using_apic_timer __read_mostly = 0;
+/*
+ * In this function we calibrate the APIC bus clocks to the external timer.
+ *
+ * We want to do the calibration only once, since we want the local timer
+ * irqs to be in sync. CPUs connected by the same APIC bus have the very
+ * same bus frequency.
+ *
+ * This was previously done by reading the PIT/HPET and waiting for a
+ * wraparound to find out that a tick has elapsed. I have a box where the
+ * PIT readout is broken, so it never gets out of the wait loop again. This
+ * was also reported by others.
+ *
+ * Monitoring the jiffies value is inaccurate and the clockevents
+ * infrastructure allows us to do a simple substitution of the interrupt
+ * handler.
+ *
+ * The calibration routine also uses the pm_timer when possible, as the PIT
+ * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
+ * back to normal later in the boot process).
+ */
+
+#define LAPIC_CAL_LOOPS (HZ/10)
-static int enabled_via_apicbase;
+static __initdata volatile int lapic_cal_loops = -1;
+static __initdata long lapic_cal_t1, lapic_cal_t2;
+static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
+static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
-void enable_NMI_through_LVT0 (void * dummy)
+/*
+ * Temporary interrupt handler.
+ */
+static void __init lapic_cal_handler(struct clock_event_device *dev)
{
- unsigned int v, ver;
+ unsigned long long tsc = 0;
+ long tapic = apic_read(APIC_TMCCT);
+ unsigned long pm = acpi_pm_read_early();
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
- v = APIC_DM_NMI; /* unmask and set to NMI */
- if (!APIC_INTEGRATED(ver)) /* 82489DX */
- v |= APIC_LVT_LEVEL_TRIGGER;
- apic_write_around(APIC_LVT0, v);
+ if (cpu_has_tsc)
+ rdtscll(tsc);
+
+ switch (lapic_cal_loops++) {
+ case 0:
+ lapic_cal_t1 = tapic;
+ lapic_cal_tsc1 = tsc;
+ lapic_cal_pm1 = pm;
+ lapic_cal_j1 = jiffies;
+ break;
+
+ case LAPIC_CAL_LOOPS:
+ lapic_cal_t2 = tapic;
+ lapic_cal_tsc2 = tsc;
+ if (pm < lapic_cal_pm1)
+ pm += ACPI_PM_OVRRUN;
+ lapic_cal_pm2 = pm;
+ lapic_cal_j2 = jiffies;
+ break;
+ }
}
-int get_physical_broadcast(void)
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
{
- if (modern_apic())
- return 0xff;
- else
- return 0xf;
+ struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+ const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
+ const long pm_thresh = pm_100ms/100;
+ void (*real_handler)(struct clock_event_device *dev);
+ unsigned long deltaj;
+ long delta, deltapm;
+
+ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+ "calibrating APIC timer ...\n");
+
+ local_irq_disable();
+
+ /* Replace the global interrupt handler */
+ real_handler = global_clock_event->event_handler;
+ global_clock_event->event_handler = lapic_cal_handler;
+
+ /*
+ * Setup the APIC counter to 1e9. There is no way the lapic
+ * can underflow in the 100ms detection time frame
+ */
+ __setup_APIC_LVTT(1000000000, 0, 0);
+
+ /* Let the interrupts run */
+ local_irq_enable();
+
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS);
+
+ local_irq_disable();
+
+ /* Restore the real event handler */
+ global_clock_event->event_handler = real_handler;
+
+ /* Build delta t1-t2 as apic timer counts down */
+ delta = lapic_cal_t1 - lapic_cal_t2;
+ apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+
+ /* Check, if the PM timer is available */
+ deltapm = lapic_cal_pm2 - lapic_cal_pm1;
+ apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+
+ if (deltapm) {
+ unsigned long mult;
+ u64 res;
+
+ mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
+ if (deltapm > (pm_100ms - pm_thresh) &&
+ deltapm < (pm_100ms + pm_thresh)) {
+ apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
+ } else {
+ res = (((u64) deltapm) * mult) >> 22;
+ do_div(res, 1000000);
+ printk(KERN_WARNING "APIC calibration not consistent "
+ "with PM Timer: %ldms instead of 100ms\n",
+ (long)res);
+ /* Correct the lapic counter value */
+ res = (((u64) delta ) * pm_100ms);
+ do_div(res, deltapm);
+ printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
+ "%lu (%ld)\n", (unsigned long) res, delta);
+ delta = (long) res;
+ }
+ }
+
+ /* Calculate the scaled math multiplication factor */
+ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
+ lapic_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &lapic_clockevent);
+
+ calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+
+ apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
+ apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
+ apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
+ calibration_result);
+
+ if (cpu_has_tsc) {
+ delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
+ apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
+ "%ld.%04ld MHz.\n",
+ (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+ (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ }
+
+ apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
+ "%u.%04u MHz.\n",
+ calibration_result / (1000000 / HZ),
+ calibration_result % (1000000 / HZ));
+
+
+ apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+
+ /*
+ * Setup the apic timer manually
+ */
+ local_apic_timer_verify_ok = 1;
+ levt->event_handler = lapic_cal_handler;
+ lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+ lapic_cal_loops = -1;
+
+ /* Let the interrupts run */
+ local_irq_enable();
+
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS);
+
+ local_irq_disable();
+
+ /* Stop the lapic timer */
+ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+
+ local_irq_enable();
+
+ /* Jiffies delta */
+ deltaj = lapic_cal_j2 - lapic_cal_j1;
+ apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+
+ /* Check, if the PM timer is available */
+ deltapm = lapic_cal_pm2 - lapic_cal_pm1;
+ apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+
+ local_apic_timer_verify_ok = 0;
+
+ if (deltapm) {
+ if (deltapm > (pm_100ms - pm_thresh) &&
+ deltapm < (pm_100ms + pm_thresh)) {
+ apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
+ /* Check, if the jiffies result is consistent */
+ if (deltaj < LAPIC_CAL_LOOPS-2 ||
+ deltaj > LAPIC_CAL_LOOPS+2) {
+ /*
+ * Not sure what we can do about this one.
+ * When high resolution timers are active
+ * and the lapic timer does not stop in C3
+ * we are fine. Otherwise more trouble might
+ * be waiting. -- tglx
+ */
+ printk(KERN_WARNING "Global event device %s "
+ "has wrong frequency "
+ "(%lu ticks instead of %d)\n",
+ global_clock_event->name, deltaj,
+ LAPIC_CAL_LOOPS);
+ }
+ local_apic_timer_verify_ok = 1;
+ }
+ } else {
+ /* Check, if the jiffies result is consistent */
+ if (deltaj >= LAPIC_CAL_LOOPS-2 &&
+ deltaj <= LAPIC_CAL_LOOPS+2) {
+ apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+ local_apic_timer_verify_ok = 1;
+ }
+ }
+
+ if (!local_apic_timer_verify_ok) {
+ printk(KERN_WARNING
+ "APIC timer disabled due to verification failure.\n");
+ /* No broadcast on UP ! */
+ if (num_possible_cpus() == 1)
+ return;
+ } else
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+
+ /* Setup the lapic or request the broadcast */
+ setup_APIC_timer();
+}
+
+void __devinit setup_secondary_APIC_clock(void)
+{
+ setup_APIC_timer();
}
-int get_maxlvt(void)
+/*
+ * The guts of the apic timer interrupt
+ */
+static void local_apic_timer_interrupt(void)
{
- unsigned int v, ver, maxlvt;
+ int cpu = smp_processor_id();
+ struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
- v = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(v);
- /* 82489DXs do not report # of LVT entries. */
- maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2;
- return maxlvt;
+ /*
+ * Normally we should not be here until the LAPIC has been initialized, but
+ * in some cases, like kdump, it is possible that a pending LAPIC timer
+ * interrupt from the previous kernel's context is delivered in the new
+ * kernel the moment interrupts are enabled.
+ *
+ * Interrupts are enabled early and the LAPIC is set up much later, hence
+ * it is possible that when we get here evt->event_handler is NULL.
+ * Check for event_handler being NULL and discard the interrupt as
+ * spurious.
+ */
+ if (!evt->event_handler) {
+ printk(KERN_WARNING
+ "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+ /* Switch it off */
+ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+ return;
+ }
+
+ per_cpu(irq_stat, cpu).apic_timer_irqs++;
+
+ evt->event_handler(evt);
}
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ * interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+
+void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /*
+ * NOTE! We'd better ACK the irq immediately,
+ * because timer handling can be slow.
+ */
+ ack_APIC_irq();
+ /*
+ * update_process_times() expects us to have done irq_enter().
+ * Besides, if we don't, timer interrupts ignore the global
+ * interrupt lock, which is the WrongThing (tm) to do.
+ */
+ irq_enter();
+ local_apic_timer_interrupt();
+ irq_exit();
+
+ set_irq_regs(old_regs);
+}
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+ return -EINVAL;
+}
+
+/*
+ * Local APIC start and shutdown
+ */
+
+/**
+ * clear_local_APIC - shutdown the local APIC
+ *
+ * This is called when a CPU is disabled and before rebooting, so the state of
+ * the local APIC has no dangling leftovers. Also used to clean out any BIOS
+ * leftovers during boot.
+ */
void clear_local_APIC(void)
{
- int maxlvt;
+ int maxlvt = lapic_get_maxlvt();
unsigned long v;
- maxlvt = get_maxlvt();
-
/*
* Masking an LVT entry can trigger a local APIC error
* if the vector is zero. Mask LVTERR first to prevent this.
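Before the smaller hunks, the arithmetic in setup_boot_APIC_clock() above is worth spelling out: the scaled multiplier is delta shifted left by 32 and divided by the calibration interval in nanoseconds (what div_sc() computes), and the periodic reload value is the measured delta scaled back up by the clock divisor. A standalone C sketch of that math, using the same constants as the hunk above; the delta figure is an illustrative example, not a measurement:

#include <stdio.h>
#include <stdint.h>

#define HZ		250
#define APIC_DIVISOR	16			/* clock divisor, as above */
#define LAPIC_CAL_LOOPS	(HZ / 10)		/* 100ms worth of ticks */
#define TICK_NSEC	(1000000000UL / HZ)

int main(void)
{
	/* APIC timer ticks counted down during the 100ms window (example) */
	long delta = 1037500;

	/* div_sc() equivalent: mult so that ns * mult >> 32 == device ticks */
	uint64_t mult = ((uint64_t)delta << 32) /
			((uint64_t)TICK_NSEC * LAPIC_CAL_LOOPS);

	/*
	 * Bus clocks per tick; __setup_APIC_LVTT() divides this by
	 * APIC_DIVISOR again before programming APIC_TMICT.
	 */
	unsigned int calibration_result =
			(delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;

	printf("mult=%llu reload=%u bus clock ~%u.%04u MHz\n",
	       (unsigned long long)mult, calibration_result,
	       calibration_result / (1000000 / HZ),
	       calibration_result % (1000000 / HZ));
	return 0;
}

With the numbers above this prints a host bus clock of about 166 MHz, matching the apic_printk() output format in the hunk.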
@@ -189,7 +611,7 @@ void clear_local_APIC(void)
apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
}
-/* lets not touch this if we didn't frob it */
+ /* let's not touch this if we didn't frob it */
#ifdef CONFIG_X86_MCE_P4THERMAL
if (maxlvt >= 5) {
v = apic_read(APIC_LVTTHMR);
@@ -211,85 +633,18 @@ void clear_local_APIC(void)
if (maxlvt >= 5)
apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
#endif
- v = GET_APIC_VERSION(apic_read(APIC_LVR));
- if (APIC_INTEGRATED(v)) { /* !82489DX */
- if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */
+ /* Integrated APIC (!82489DX) ? */
+ if (lapic_is_integrated()) {
+ if (maxlvt > 3)
+ /* Clear ESR due to Pentium errata 3AP and 11AP */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
}
-void __init connect_bsp_APIC(void)
-{
- if (pic_mode) {
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
- /*
- * PIC mode, enable APIC mode in the IMCR, i.e.
- * connect BSP's local APIC to INT and NMI lines.
- */
- apic_printk(APIC_VERBOSE, "leaving PIC mode, "
- "enabling APIC mode.\n");
- outb(0x70, 0x22);
- outb(0x01, 0x23);
- }
- enable_apic_mode();
-}
-
-void disconnect_bsp_APIC(int virt_wire_setup)
-{
- if (pic_mode) {
- /*
- * Put the board back into PIC mode (has an effect
- * only on certain older boards). Note that APIC
- * interrupts, including IPIs, won't work beyond
- * this point! The only exception are INIT IPIs.
- */
- apic_printk(APIC_VERBOSE, "disabling APIC mode, "
- "entering PIC mode.\n");
- outb(0x70, 0x22);
- outb(0x00, 0x23);
- }
- else {
- /* Go back to Virtual Wire compatibility mode */
- unsigned long value;
-
- /* For the spurious interrupt use vector F, and enable it */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
- value |= 0xf;
- apic_write_around(APIC_SPIV, value);
-
- if (!virt_wire_setup) {
- /* For LVT0 make it edge triggered, active high, external and enabled */
- value = apic_read(APIC_LVT0);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
- apic_write_around(APIC_LVT0, value);
- }
- else {
- /* Disable LVT0 */
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
- }
-
- /* For LVT1 make it edge triggered, active high, nmi and enabled */
- value = apic_read(APIC_LVT1);
- value &= ~(
- APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
- apic_write_around(APIC_LVT1, value);
- }
-}
-
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
void disable_local_APIC(void)
{
unsigned long value;
@@ -304,8 +659,13 @@ void disable_local_APIC(void)
value &= ~APIC_SPIV_APIC_ENABLED;
apic_write_around(APIC_SPIV, value);
+ /*
+ * When LAPIC was disabled by the BIOS and enabled by the kernel,
+ * restore the disabled state.
+ */
if (enabled_via_apicbase) {
unsigned int l, h;
+
rdmsr(MSR_IA32_APICBASE, l, h);
l &= ~MSR_IA32_APICBASE_ENABLE;
wrmsr(MSR_IA32_APICBASE, l, h);
@@ -313,6 +673,28 @@ void disable_local_APIC(void)
}
/*
+ * If Linux enabled the LAPIC against the BIOS default, disable it again before
+ * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
+ * not power off. Additionally, clear all LVT entries before disable_local_APIC
+ * for the case where Linux didn't enable the LAPIC.
+ */
+void lapic_shutdown(void)
+{
+ unsigned long flags;
+
+ if (!cpu_has_apic)
+ return;
+
+ local_irq_save(flags);
+ clear_local_APIC();
+
+ if (enabled_via_apicbase)
+ disable_local_APIC();
+
+ local_irq_restore(flags);
+}
+
+/*
* This is to verify that we're looking at a real local APIC.
* Check these against your board if the CPUs aren't getting
* started for no apparent reason.
@@ -344,7 +726,7 @@ int __init verify_local_APIC(void)
reg1 = GET_APIC_VERSION(reg0);
if (reg1 == 0x00 || reg1 == 0xff)
return 0;
- reg1 = get_maxlvt();
+ reg1 = lapic_get_maxlvt();
if (reg1 < 0x02 || reg1 == 0xff)
return 0;
@@ -367,10 +749,15 @@ int __init verify_local_APIC(void)
return 1;
}
+/**
+ * sync_Arb_IDs - synchronize APIC bus arbitration IDs
+ */
void __init sync_Arb_IDs(void)
{
- /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1
- And not needed on AMD */
+ /*
+ * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
+ * needed on AMD.
+ */
if (modern_apic())
return;
/*
@@ -383,14 +770,12 @@ void __init sync_Arb_IDs(void)
| APIC_DM_INIT);
}
-extern void __error_in_apic_c (void);
-
/*
* An initial setup of the virtual wire mode.
*/
void __init init_bsp_APIC(void)
{
- unsigned long value, ver;
+ unsigned long value;
/*
* Don't do the setup now if we have a SMP BIOS as the
@@ -399,9 +784,6 @@ void __init init_bsp_APIC(void)
if (smp_found_config || !cpu_has_apic)
return;
- value = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(value);
-
/*
* Do not trust the local APIC being empty at bootup.
*/
@@ -413,9 +795,10 @@ void __init init_bsp_APIC(void)
value = apic_read(APIC_SPIV);
value &= ~APIC_VECTOR_MASK;
value |= APIC_SPIV_APIC_ENABLED;
-
+
/* This bit is reserved on P4/Xeon and should be cleared */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15))
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86 == 15))
value &= ~APIC_SPIV_FOCUS_DISABLED;
else
value |= APIC_SPIV_FOCUS_DISABLED;
@@ -427,14 +810,17 @@ void __init init_bsp_APIC(void)
*/
apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
value = APIC_DM_NMI;
- if (!APIC_INTEGRATED(ver)) /* 82489DX */
+ if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write_around(APIC_LVT1, value);
}
+/**
+ * setup_local_APIC - setup the local APIC
+ */
void __devinit setup_local_APIC(void)
{
- unsigned long oldvalue, value, ver, maxlvt;
+ unsigned long oldvalue, value, maxlvt, integrated;
int i, j;
/* Pound the ESR really hard over the head with a big hammer - mbligh */
@@ -445,11 +831,7 @@ void __devinit setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
- value = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(value);
-
- if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
- __error_in_apic_c();
+ integrated = lapic_is_integrated();
/*
* Double-check whether this APIC is really registered.
@@ -520,13 +902,10 @@ void __devinit setup_local_APIC(void)
* like LRU than MRU (the short-term load is more even across CPUs).
* See also the comment in end_level_ioapic_irq(). --macro
*/
-#if 1
+
/* Enable focus processor (bit==0) */
value &= ~APIC_SPIV_FOCUS_DISABLED;
-#else
- /* Disable focus processor (bit==1) */
- value |= APIC_SPIV_FOCUS_DISABLED;
-#endif
+
/*
* Set spurious IRQ vector
*/
@@ -562,17 +941,18 @@ void __devinit setup_local_APIC(void)
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
- if (!APIC_INTEGRATED(ver)) /* 82489DX */
+ if (!integrated) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write_around(APIC_LVT1, value);
- if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */
- maxlvt = get_maxlvt();
+ if (integrated && !esr_disable) { /* !82489DX */
+ maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
oldvalue = apic_read(APIC_ESR);
- value = ERROR_APIC_VECTOR; // enables sending errors
+ /* enables sending errors */
+ value = ERROR_APIC_VECTOR;
apic_write_around(APIC_LVTERR, value);
/*
* spec says clear errors after enabling vector.
@@ -585,207 +965,30 @@ void __devinit setup_local_APIC(void)
"vector: 0x%08lx after: 0x%08lx\n",
oldvalue, value);
} else {
- if (esr_disable)
- /*
- * Something untraceble is creating bad interrupts on
+ if (esr_disable)
+ /*
+ * Something untraceable is creating bad interrupts on
* secondary quads ... for the moment, just leave the
* ESR disabled - we can't do anything useful with the
* errors anyway - mbligh
*/
- printk("Leaving ESR disabled.\n");
- else
- printk("No ESR for 82489DX.\n");
+ printk(KERN_INFO "Leaving ESR disabled.\n");
+ else
+ printk(KERN_INFO "No ESR for 82489DX.\n");
}
+ /* Disable the local apic timer */
+ value = apic_read(APIC_LVTT);
+ value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write_around(APIC_LVTT, value);
+
setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
/*
- * If Linux enabled the LAPIC against the BIOS default
- * disable it down before re-entering the BIOS on shutdown.
- * Otherwise the BIOS may get confused and not power-off.
- * Additionally clear all LVT entries before disable_local_APIC
- * for the case where Linux didn't enable the LAPIC.
+ * Detect and initialize APIC
*/
-void lapic_shutdown(void)
-{
- unsigned long flags;
-
- if (!cpu_has_apic)
- return;
-
- local_irq_save(flags);
- clear_local_APIC();
-
- if (enabled_via_apicbase)
- disable_local_APIC();
-
- local_irq_restore(flags);
-}
-
-#ifdef CONFIG_PM
-
-static struct {
- int active;
- /* r/w apic fields */
- unsigned int apic_id;
- unsigned int apic_taskpri;
- unsigned int apic_ldr;
- unsigned int apic_dfr;
- unsigned int apic_spiv;
- unsigned int apic_lvtt;
- unsigned int apic_lvtpc;
- unsigned int apic_lvt0;
- unsigned int apic_lvt1;
- unsigned int apic_lvterr;
- unsigned int apic_tmict;
- unsigned int apic_tdcr;
- unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- unsigned long flags;
- int maxlvt;
-
- if (!apic_pm_state.active)
- return 0;
-
- maxlvt = get_maxlvt();
-
- apic_pm_state.apic_id = apic_read(APIC_ID);
- apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
- apic_pm_state.apic_ldr = apic_read(APIC_LDR);
- apic_pm_state.apic_dfr = apic_read(APIC_DFR);
- apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
- apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
- if (maxlvt >= 4)
- apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
- apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
- apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
- apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
- apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
- apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_P4THERMAL
- if (maxlvt >= 5)
- apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-#endif
-
- local_irq_save(flags);
- disable_local_APIC();
- local_irq_restore(flags);
- return 0;
-}
-
-static int lapic_resume(struct sys_device *dev)
-{
- unsigned int l, h;
- unsigned long flags;
- int maxlvt;
-
- if (!apic_pm_state.active)
- return 0;
-
- maxlvt = get_maxlvt();
-
- local_irq_save(flags);
-
- /*
- * Make sure the APICBASE points to the right address
- *
- * FIXME! This will be wrong if we ever support suspend on
- * SMP! We'll need to do this as part of the CPU restore!
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
-
- apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
- apic_write(APIC_ID, apic_pm_state.apic_id);
- apic_write(APIC_DFR, apic_pm_state.apic_dfr);
- apic_write(APIC_LDR, apic_pm_state.apic_ldr);
- apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
- apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
- apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
- apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_P4THERMAL
- if (maxlvt >= 5)
- apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-#endif
- if (maxlvt >= 4)
- apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
- apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
- apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
- apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- local_irq_restore(flags);
- return 0;
-}
-
-/*
- * This device has no shutdown method - fully functioning local APICs
- * are needed on every CPU up until machine_halt/restart/poweroff.
- */
-
-static struct sysdev_class lapic_sysclass = {
- set_kset_name("lapic"),
- .resume = lapic_resume,
- .suspend = lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
- .id = 0,
- .cls = &lapic_sysclass,
-};
-
-static void __devinit apic_pm_activate(void)
-{
- apic_pm_state.active = 1;
-}
-
-static int __init init_lapic_sysfs(void)
-{
- int error;
-
- if (!cpu_has_apic)
- return 0;
- /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-
- error = sysdev_class_register(&lapic_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic);
- return error;
-}
-device_initcall(init_lapic_sysfs);
-
-#else /* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif /* CONFIG_PM */
-
-/*
- * Detect and enable local APICs on non-SMP boards.
- * Original code written by Keir Fraser.
- */
-
-static int __init apic_set_verbosity(char *str)
-{
- if (strcmp("debug", str) == 0)
- apic_verbosity = APIC_DEBUG;
- else if (strcmp("verbose", str) == 0)
- apic_verbosity = APIC_VERBOSE;
- return 1;
-}
-
-__setup("apic=", apic_set_verbosity);
-
static int __init detect_init_APIC (void)
{
u32 h, l, features;
@@ -797,7 +1000,7 @@ static int __init detect_init_APIC (void)
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
- (boot_cpu_data.x86 == 15))
+ (boot_cpu_data.x86 == 15))
break;
goto no_apic;
case X86_VENDOR_INTEL:
@@ -811,23 +1014,23 @@ static int __init detect_init_APIC (void)
if (!cpu_has_apic) {
/*
- * Over-ride BIOS and try to enable the local
- * APIC only if "lapic" specified.
+ * Over-ride BIOS and try to enable the local APIC only if
+ * "lapic" specified.
*/
if (enable_local_apic <= 0) {
- printk("Local APIC disabled by BIOS -- "
+ printk(KERN_INFO "Local APIC disabled by BIOS -- "
"you can enable it with \"lapic\"\n");
return -1;
}
/*
- * Some BIOSes disable the local APIC in the
- * APIC_BASE MSR. This can only be done in
- * software for Intel P6 or later and AMD K7
- * (Model > 1) or later.
+ * Some BIOSes disable the local APIC in the APIC_BASE
+ * MSR. This can only be done in software for Intel P6 or later
+ * and AMD K7 (Model > 1) or later.
*/
rdmsr(MSR_IA32_APICBASE, l, h);
if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- printk("Local APIC disabled by BIOS -- reenabling.\n");
+ printk(KERN_INFO
+ "Local APIC disabled by BIOS -- reenabling.\n");
l &= ~MSR_IA32_APICBASE_BASE;
l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
wrmsr(MSR_IA32_APICBASE, l, h);
@@ -840,7 +1043,7 @@ static int __init detect_init_APIC (void)
*/
features = cpuid_edx(1);
if (!(features & (1 << X86_FEATURE_APIC))) {
- printk("Could not enable APIC!\n");
+ printk(KERN_WARNING "Could not enable APIC!\n");
return -1;
}
set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
@@ -854,17 +1057,20 @@ static int __init detect_init_APIC (void)
if (nmi_watchdog != NMI_NONE)
nmi_watchdog = NMI_LOCAL_APIC;
- printk("Found and enabled local APIC!\n");
+ printk(KERN_INFO "Found and enabled local APIC!\n");
apic_pm_activate();
return 0;
no_apic:
- printk("No local APIC present or hardware disabled\n");
+ printk(KERN_INFO "No local APIC present or hardware disabled\n");
return -1;
}
+/**
+ * init_apic_mappings - initialize APIC mappings
+ */
void __init init_apic_mappings(void)
{
unsigned long apic_phys;
@@ -924,384 +1130,92 @@ fake_ioapic_page:
}
/*
- * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts
- * per second. We assume that the caller has already set up the local
- * APIC.
- *
- * The APIC timer is not exactly sync with the external timer chip, it
- * closely follows bus clocks.
- */
-
-/*
- * The timer chip is already set up at HZ interrupts per second here,
- * but we do not accept timer interrupts yet. We only allow the BP
- * to calibrate.
- */
-static unsigned int __devinit get_8254_timer_count(void)
-{
- unsigned long flags;
-
- unsigned int count;
-
- spin_lock_irqsave(&i8253_lock, flags);
-
- outb_p(0x00, PIT_MODE);
- count = inb_p(PIT_CH0);
- count |= inb_p(PIT_CH0) << 8;
-
- spin_unlock_irqrestore(&i8253_lock, flags);
-
- return count;
-}
-
-/* next tick in 8254 can be caught by catching timer wraparound */
-static void __devinit wait_8254_wraparound(void)
-{
- unsigned int curr_count, prev_count;
-
- curr_count = get_8254_timer_count();
- do {
- prev_count = curr_count;
- curr_count = get_8254_timer_count();
-
- /* workaround for broken Mercury/Neptune */
- if (prev_count >= curr_count + 0x100)
- curr_count = get_8254_timer_count();
-
- } while (prev_count >= curr_count);
-}
-
-/*
- * Default initialization for 8254 timers. If we use other timers like HPET,
- * we override this later
- */
-void (*wait_timer_tick)(void) __devinitdata = wait_8254_wraparound;
-
-/*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
*/
-
-#define APIC_DIVISOR 16
-
-static void __setup_APIC_LVTT(unsigned int clocks)
+int __init APIC_init_uniprocessor (void)
{
- unsigned int lvtt_value, tmp_value, ver;
- int cpu = smp_processor_id();
-
- ver = GET_APIC_VERSION(apic_read(APIC_LVR));
- lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
- if (!APIC_INTEGRATED(ver))
- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
-
- if (cpu_isset(cpu, timer_bcast_ipi))
- lvtt_value |= APIC_LVT_MASKED;
+ if (enable_local_apic < 0)
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
- apic_write_around(APIC_LVTT, lvtt_value);
+ if (!smp_found_config && !cpu_has_apic)
+ return -1;
/*
- * Divide PICLK by 16
+ * Complain if the BIOS pretends there is one.
*/
- tmp_value = apic_read(APIC_TDCR);
- apic_write_around(APIC_TDCR, (tmp_value
- & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
- | APIC_TDR_DIV_16);
-
- apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
-}
+ if (!cpu_has_apic &&
+ APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+ boot_cpu_physical_apicid);
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+ return -1;
+ }
-static void __devinit setup_APIC_timer(unsigned int clocks)
-{
- unsigned long flags;
+ verify_local_APIC();
- local_irq_save(flags);
+ connect_bsp_APIC();
/*
- * Wait for IRQ0's slice:
+ * Hack: In case of kdump, after a crash, the kernel might be booting
+ * on a cpu with a non-zero lapic id. But boot_cpu_physical_apicid
+ * might be zero if read from MP tables. Get it from LAPIC.
*/
- wait_timer_tick();
+#ifdef CONFIG_CRASH_DUMP
+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+#endif
+ phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
- __setup_APIC_LVTT(clocks);
+ setup_local_APIC();
- local_irq_restore(flags);
+#ifdef CONFIG_X86_IO_APIC
+ if (smp_found_config)
+ if (!skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+#endif
+ setup_boot_clock();
+
+ return 0;
}
/*
- * In this function we calibrate APIC bus clocks to the external
- * timer. Unfortunately we cannot use jiffies and the timer irq
- * to calibrate, since some later bootup code depends on getting
- * the first irq? Ugh.
- *
- * We want to do the calibration only once since we
- * want to have local timer irqs syncron. CPUs connected
- * by the same APIC bus have the very same bus frequency.
- * And we want to have irqs off anyways, no accidental
- * APIC irq that way.
+ * APIC command line parameters
*/
-
-static int __init calibrate_APIC_clock(void)
-{
- unsigned long long t1 = 0, t2 = 0;
- long tt1, tt2;
- long result;
- int i;
- const int LOOPS = HZ/10;
-
- apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n");
-
- /*
- * Put whatever arbitrary (but long enough) timeout
- * value into the APIC clock, we just want to get the
- * counter running for calibration.
- */
- __setup_APIC_LVTT(1000000000);
-
- /*
- * The timer chip counts down to zero. Let's wait
- * for a wraparound to start exact measurement:
- * (the current tick might have been already half done)
- */
-
- wait_timer_tick();
-
- /*
- * We wrapped around just now. Let's start:
- */
- if (cpu_has_tsc)
- rdtscll(t1);
- tt1 = apic_read(APIC_TMCCT);
-
- /*
- * Let's wait LOOPS wraprounds:
- */
- for (i = 0; i < LOOPS; i++)
- wait_timer_tick();
-
- tt2 = apic_read(APIC_TMCCT);
- if (cpu_has_tsc)
- rdtscll(t2);
-
- /*
- * The APIC bus clock counter is 32 bits only, it
- * might have overflown, but note that we use signed
- * longs, thus no extra care needed.
- *
- * underflown to be exact, as the timer counts down ;)
- */
-
- result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
-
- if (cpu_has_tsc)
- apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
- "%ld.%04ld MHz.\n",
- ((long)(t2-t1)/LOOPS)/(1000000/HZ),
- ((long)(t2-t1)/LOOPS)%(1000000/HZ));
-
- apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
- "%ld.%04ld MHz.\n",
- result/(1000000/HZ),
- result%(1000000/HZ));
-
- return result;
-}
-
-static unsigned int calibration_result;
-
-void __init setup_boot_APIC_clock(void)
-{
- unsigned long flags;
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
- using_apic_timer = 1;
-
- local_irq_save(flags);
-
- calibration_result = calibrate_APIC_clock();
- /*
- * Now set up the timer for real.
- */
- setup_APIC_timer(calibration_result);
-
- local_irq_restore(flags);
-}
-
-void __devinit setup_secondary_APIC_clock(void)
-{
- setup_APIC_timer(calibration_result);
-}
-
-void disable_APIC_timer(void)
-{
- if (using_apic_timer) {
- unsigned long v;
-
- v = apic_read(APIC_LVTT);
- /*
- * When an illegal vector value (0-15) is written to an LVT
- * entry and delivery mode is Fixed, the APIC may signal an
- * illegal vector error, with out regard to whether the mask
- * bit is set or whether an interrupt is actually seen on input.
- *
- * Boot sequence might call this function when the LVTT has
- * '0' vector value. So make sure vector field is set to
- * valid value.
- */
- v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write_around(APIC_LVTT, v);
- }
-}
-
-void enable_APIC_timer(void)
+static int __init parse_lapic(char *arg)
{
- int cpu = smp_processor_id();
-
- if (using_apic_timer &&
- !cpu_isset(cpu, timer_bcast_ipi)) {
- unsigned long v;
-
- v = apic_read(APIC_LVTT);
- apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
- }
+ enable_local_apic = 1;
+ return 0;
}
+early_param("lapic", parse_lapic);
-void switch_APIC_timer_to_ipi(void *cpumask)
+static int __init parse_nolapic(char *arg)
{
- cpumask_t mask = *(cpumask_t *)cpumask;
- int cpu = smp_processor_id();
-
- if (cpu_isset(cpu, mask) &&
- !cpu_isset(cpu, timer_bcast_ipi)) {
- disable_APIC_timer();
- cpu_set(cpu, timer_bcast_ipi);
- }
+ enable_local_apic = -1;
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+ return 0;
}
-EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
+early_param("nolapic", parse_nolapic);
-void switch_ipi_to_APIC_timer(void *cpumask)
+static int __init apic_set_verbosity(char *str)
{
- cpumask_t mask = *(cpumask_t *)cpumask;
- int cpu = smp_processor_id();
-
- if (cpu_isset(cpu, mask) &&
- cpu_isset(cpu, timer_bcast_ipi)) {
- cpu_clear(cpu, timer_bcast_ipi);
- enable_APIC_timer();
- }
+ if (strcmp("debug", str) == 0)
+ apic_verbosity = APIC_DEBUG;
+ else if (strcmp("verbose", str) == 0)
+ apic_verbosity = APIC_VERBOSE;
+ return 1;
}
-EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
-
-#undef APIC_DIVISOR
-/*
- * Local timer interrupt handler. It does both profiling and
- * process statistics/rescheduling.
- *
- * We do profiling in every local tick, statistics/rescheduling
- * happen only every 'profiling multiplier' ticks. The default
- * multiplier is 1 and it can be changed by writing the new multiplier
- * value into /proc/profile.
- */
-
-inline void smp_local_timer_interrupt(void)
-{
- profile_tick(CPU_PROFILING);
-#ifdef CONFIG_SMP
- update_process_times(user_mode_vm(get_irq_regs()));
-#endif
+__setup("apic=", apic_set_verbosity);
- /*
- * We take the 'long' return path, and there every subsystem
- * grabs the apropriate locks (kernel lock/ irq lock).
- *
- * we might want to decouple profiling from the 'long path',
- * and do the profiling totally in assembly.
- *
- * Currently this isn't too much of an issue (performance wise),
- * we can take more than 100K local irqs per second on a 100 MHz P5.
- */
-}
/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- * interrupt as well. Thus we cannot inline the local irq ... ]
+ * Local APIC interrupts
*/
-fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
- int cpu = smp_processor_id();
-
- /*
- * the NMI deadlock-detector uses this.
- */
- per_cpu(irq_stat, cpu).apic_timer_irqs++;
-
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- */
- ack_APIC_irq();
- /*
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- irq_enter();
- smp_local_timer_interrupt();
- irq_exit();
- set_irq_regs(old_regs);
-}
-
-#ifndef CONFIG_SMP
-static void up_apic_timer_interrupt_call(void)
-{
- int cpu = smp_processor_id();
-
- /*
- * the NMI deadlock-detector uses this.
- */
- per_cpu(irq_stat, cpu).apic_timer_irqs++;
-
- smp_local_timer_interrupt();
-}
-#endif
-
-void smp_send_timer_broadcast_ipi(void)
-{
- cpumask_t mask;
-
- cpus_and(mask, cpu_online_map, timer_bcast_ipi);
- if (!cpus_empty(mask)) {
-#ifdef CONFIG_SMP
- send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
-#else
- /*
- * We can directly call the apic timer interrupt handler
- * in UP case. Minus all irq related functions
- */
- up_apic_timer_interrupt_call();
-#endif
- }
-}
-
-int setup_profiling_timer(unsigned int multiplier)
-{
- return -EINVAL;
-}
-
/*
* This interrupt should _never_ happen with our APIC/SMP architecture
*/
-fastcall void smp_spurious_interrupt(struct pt_regs *regs)
+void smp_spurious_interrupt(struct pt_regs *regs)
{
unsigned long v;
@@ -1316,16 +1230,15 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs)
ack_APIC_irq();
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
- printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n",
- smp_processor_id());
+ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
+ "should never happen.\n", smp_processor_id());
irq_exit();
}
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
-
-fastcall void smp_error_interrupt(struct pt_regs *regs)
+void smp_error_interrupt(struct pt_regs *regs)
{
unsigned long v, v1;
@@ -1348,69 +1261,261 @@ fastcall void smp_error_interrupt(struct pt_regs *regs)
7: Illegal register address
*/
printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
- smp_processor_id(), v , v1);
+ smp_processor_id(), v , v1);
irq_exit();
}
/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
+ * Initialize APIC interrupts
*/
-int __init APIC_init_uniprocessor (void)
+void __init apic_intr_init(void)
{
- if (enable_local_apic < 0)
- clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+#ifdef CONFIG_SMP
+ smp_intr_init();
+#endif
+ /* self generated IPI for local APIC timer */
+ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
- if (!smp_found_config && !cpu_has_apic)
- return -1;
+ /* IPI vectors for APIC spurious and error interrupts */
+ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+ set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
- /*
- * Complain if the BIOS pretends there is one.
- */
- if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
- clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
- return -1;
+ /* thermal monitor LVT interrupt */
+#ifdef CONFIG_X86_MCE_P4THERMAL
+ set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+}
+
+/**
+ * connect_bsp_APIC - attach the APIC to the interrupt system
+ */
+void __init connect_bsp_APIC(void)
+{
+ if (pic_mode) {
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+ /*
+ * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
+ * local APIC to INT and NMI lines.
+ */
+ apic_printk(APIC_VERBOSE, "leaving PIC mode, "
+ "enabling APIC mode.\n");
+ outb(0x70, 0x22);
+ outb(0x01, 0x23);
}
+ enable_apic_mode();
+}
- verify_local_APIC();
+/**
+ * disconnect_bsp_APIC - detach the APIC from the interrupt system
+ * @virt_wire_setup: indicates, whether virtual wire mode is selected
+ *
+ * Virtual wire mode is necessary to deliver legacy interrupts even when the
+ * APIC is disabled.
+ */
+void disconnect_bsp_APIC(int virt_wire_setup)
+{
+ if (pic_mode) {
+ /*
+ * Put the board back into PIC mode (has an effect only on
+ * certain older boards). Note that APIC interrupts, including
+ * IPIs, won't work beyond this point! The only exception are
+ * INIT IPIs.
+ */
+ apic_printk(APIC_VERBOSE, "disabling APIC mode, "
+ "entering PIC mode.\n");
+ outb(0x70, 0x22);
+ outb(0x00, 0x23);
+ } else {
+ /* Go back to Virtual Wire compatibility mode */
+ unsigned long value;
- connect_bsp_APIC();
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write_around(APIC_SPIV, value);
- /*
- * Hack: In case of kdump, after a crash, kernel might be booting
- * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
- * might be zero if read from MP tables. Get it from LAPIC.
- */
-#ifdef CONFIG_CRASH_DUMP
- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-#endif
- phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+ if (!virt_wire_setup) {
+ /*
+ * For LVT0 make it edge triggered, active high,
+ * external and enabled
+ */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write_around(APIC_LVT0, value);
+ } else {
+ /* Disable LVT0 */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+ }
- setup_local_APIC();
+ /*
+ * For LVT1 make it edge triggered, active high, nmi and
+ * enabled
+ */
+ value = apic_read(APIC_LVT1);
+ value &= ~(
+ APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write_around(APIC_LVT1, value);
+ }
+}
-#ifdef CONFIG_X86_IO_APIC
- if (smp_found_config)
- if (!skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
+/*
+ * Power management
+ */
+#ifdef CONFIG_PM
+
+static struct {
+ int active;
+ /* r/w apic fields */
+ unsigned int apic_id;
+ unsigned int apic_taskpri;
+ unsigned int apic_ldr;
+ unsigned int apic_dfr;
+ unsigned int apic_spiv;
+ unsigned int apic_lvtt;
+ unsigned int apic_lvtpc;
+ unsigned int apic_lvt0;
+ unsigned int apic_lvt1;
+ unsigned int apic_lvterr;
+ unsigned int apic_tmict;
+ unsigned int apic_tdcr;
+ unsigned int apic_thmr;
+} apic_pm_state;
+
+static int lapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+ unsigned long flags;
+ int maxlvt;
+
+ if (!apic_pm_state.active)
+ return 0;
+
+ maxlvt = lapic_get_maxlvt();
+
+ apic_pm_state.apic_id = apic_read(APIC_ID);
+ apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+ apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+ apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+ apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+ apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+ if (maxlvt >= 4)
+ apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+ apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+ apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+ apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+ apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+ apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+ if (maxlvt >= 5)
+ apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
- setup_boot_APIC_clock();
+ local_irq_save(flags);
+ disable_local_APIC();
+ local_irq_restore(flags);
return 0;
}
-static int __init parse_lapic(char *arg)
+static int lapic_resume(struct sys_device *dev)
{
- lapic_enable();
+ unsigned int l, h;
+ unsigned long flags;
+ int maxlvt;
+
+ if (!apic_pm_state.active)
+ return 0;
+
+ maxlvt = lapic_get_maxlvt();
+
+ local_irq_save(flags);
+
+ /*
+ * Make sure the APICBASE points to the right address
+ *
+ * FIXME! This will be wrong if we ever support suspend on
+ * SMP! We'll need to do this as part of the CPU restore!
+ */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+
+ apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+ apic_write(APIC_ID, apic_pm_state.apic_id);
+ apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+ apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+ apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+ apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+ apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+ apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+ if (maxlvt >= 5)
+ apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+ if (maxlvt >= 4)
+ apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+ apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+ apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+ apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ local_irq_restore(flags);
return 0;
}
-early_param("lapic", parse_lapic);
-static int __init parse_nolapic(char *arg)
+/*
+ * This device has no shutdown method - fully functioning local APICs
+ * are needed on every CPU up until machine_halt/restart/poweroff.
+ */
+
+static struct sysdev_class lapic_sysclass = {
+ set_kset_name("lapic"),
+ .resume = lapic_resume,
+ .suspend = lapic_suspend,
+};
+
+static struct sys_device device_lapic = {
+ .id = 0,
+ .cls = &lapic_sysclass,
+};
+
+static void __devinit apic_pm_activate(void)
{
- lapic_disable();
- return 0;
+ apic_pm_state.active = 1;
}
-early_param("nolapic", parse_nolapic);
+static int __init init_lapic_sysfs(void)
+{
+ int error;
+
+ if (!cpu_has_apic)
+ return 0;
+ /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+
+ error = sysdev_class_register(&lapic_sysclass);
+ if (!error)
+ error = sysdev_register(&device_lapic);
+ return error;
+}
+device_initcall(init_lapic_sysfs);
+
+#else /* CONFIG_PM */
+
+static void apic_pm_activate(void) { }
+
+#endif /* CONFIG_PM */
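The heavy lifting in lapic_resume() is the IA32_APICBASE fixup: the old base
bits are cleared and the APIC re-enabled at mp_lapic_addr before any of the
saved registers are written back. A minimal standalone restatement of that
step (reenable_lapic_at is a hypothetical name, not part of the patch):

#include <asm/apicdef.h>
#include <asm/msr.h>

/* Sketch: the low dword of MSR_IA32_APICBASE carries the global enable
 * bit (bit 11) and the physical base address (bits 12-31). */
static void reenable_lapic_at(unsigned long phys_base)
{
	unsigned int l, h;

	rdmsr(MSR_IA32_APICBASE, l, h);
	l &= ~MSR_IA32_APICBASE_BASE;		/* clear the old base */
	l |= MSR_IA32_APICBASE_ENABLE | phys_base;
	wrmsr(MSR_IA32_APICBASE, l, h);
}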
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 19901692754..064bbf2861f 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -211,6 +211,7 @@
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <linux/miscdevice.h>
#include <linux/apm_bios.h>
#include <linux/init.h>
@@ -235,7 +236,6 @@
#include "io_ports.h"
-extern unsigned long get_cmos_time(void);
extern void machine_real_restart(unsigned char *, int);
#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
@@ -1175,28 +1175,6 @@ out:
spin_unlock(&user_list_lock);
}
-static void set_time(void)
-{
- struct timespec ts;
- if (got_clock_diff) { /* Must know time zone in order to set clock */
- ts.tv_sec = get_cmos_time() + clock_cmos_diff;
- ts.tv_nsec = 0;
- do_settimeofday(&ts);
- }
-}
-
-static void get_time_diff(void)
-{
-#ifndef CONFIG_APM_RTC_IS_GMT
- /*
- * Estimate time zone so that set_time can update the clock
- */
- clock_cmos_diff = -get_cmos_time();
- clock_cmos_diff += get_seconds();
- got_clock_diff = 1;
-#endif
-}
-
static void reinit_timer(void)
{
#ifdef INIT_TIMER_AFTER_SUSPEND
@@ -1236,19 +1214,6 @@ static int suspend(int vetoable)
local_irq_disable();
device_power_down(PMSG_SUSPEND);
- /* serialize with the timer interrupt */
- write_seqlock(&xtime_lock);
-
- /* protect against access to timer chip registers */
- spin_lock(&i8253_lock);
-
- get_time_diff();
- /*
- * Irq spinlock must be dropped around set_system_power_state.
- * We'll undo any timer changes due to interrupts below.
- */
- spin_unlock(&i8253_lock);
- write_sequnlock(&xtime_lock);
local_irq_enable();
save_processor_state();
@@ -1257,7 +1222,6 @@ static int suspend(int vetoable)
restore_processor_state();
local_irq_disable();
- set_time();
reinit_timer();
if (err == APM_NO_ERROR)
@@ -1287,11 +1251,6 @@ static void standby(void)
local_irq_disable();
device_power_down(PMSG_SUSPEND);
- /* serialize with the timer interrupt */
- write_seqlock(&xtime_lock);
- /* If needed, notify drivers here */
- get_time_diff();
- write_sequnlock(&xtime_lock);
local_irq_enable();
err = set_system_power_state(APM_STATE_STANDBY);
@@ -1385,7 +1344,6 @@ static void check_events(void)
ignore_bounce = 1;
if ((event != APM_NORMAL_RESUME)
|| (ignore_normal_resume == 0)) {
- set_time();
device_resume();
pm_send_all(PM_RESUME, (void *)0);
queue_event(event, NULL);
@@ -1401,7 +1359,6 @@ static void check_events(void)
break;
case APM_UPDATE_TIME:
- set_time();
break;
case APM_CRITICAL_SUSPEND:
@@ -1636,9 +1593,8 @@ static int do_open(struct inode * inode, struct file * filp)
return 0;
}
-static int apm_get_info(char *buf, char **start, off_t fpos, int length)
+static int proc_apm_show(struct seq_file *m, void *v)
{
- char * p;
unsigned short bx;
unsigned short cx;
unsigned short dx;
@@ -1650,8 +1606,6 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
int time_units = -1;
char *units = "?";
- p = buf;
-
if ((num_online_cpus() == 1) &&
!(error = apm_get_power_status(&bx, &cx, &dx))) {
ac_line_status = (bx >> 8) & 0xff;
@@ -1705,7 +1659,7 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
-1: Unknown
8) min = minutes; sec = seconds */
- p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
+ seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
driver_version,
(apm_info.bios.version >> 8) & 0xff,
apm_info.bios.version & 0xff,
@@ -1716,10 +1670,22 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
percentage,
time_units,
units);
+ return 0;
+}
- return p - buf;
+static int proc_apm_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_apm_show, NULL);
}
+static const struct file_operations apm_file_ops = {
+ .owner = THIS_MODULE,
+ .open = proc_apm_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static int apm(void *unused)
{
unsigned short bx;
@@ -1894,7 +1860,7 @@ static int __init apm_setup(char *str)
__setup("apm=", apm_setup);
#endif
-static struct file_operations apm_bios_fops = {
+static const struct file_operations apm_bios_fops = {
.owner = THIS_MODULE,
.read = do_read,
.poll = do_poll,
@@ -2341,9 +2307,9 @@ static int __init apm_init(void)
set_base(gdt[APM_DS >> 3],
__va((unsigned long)apm_info.bios.dseg << 4));
- apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info);
+ apm_proc = create_proc_entry("apm", 0, NULL);
if (apm_proc)
- apm_proc->owner = THIS_MODULE;
+ apm_proc->proc_fops = &apm_file_ops;
kapmd_task = kthread_create(apm, NULL, "kapmd");
if (IS_ERR(kapmd_task)) {
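The apm.c hunks above are a textbook migration from the old
create_proc_info_entry() callback to the seq_file single_open() pattern. For
reference, a self-contained sketch of that pattern with hypothetical foo_*
names (not the patch's code):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* single_open() buffers one full show() pass; seq_read/seq_lseek/
 * single_release handle the rest of the file semantics. */
static int foo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "example: %d\n", 42);
	return 0;
}

static int foo_open(struct inode *inode, struct file *file)
{
	return single_open(file, foo_show, NULL);
}

static const struct file_operations foo_fops = {
	.owner   = THIS_MODULE,
	.open    = foo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init foo_proc_init(void)
{
	struct proc_dir_entry *p = create_proc_entry("foo", 0, NULL);

	if (p)
		p->proc_fops = &foo_fops;
	return 0;
}
module_init(foo_proc_init);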
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 1b2f3cd3327..c37535163bf 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -72,7 +72,7 @@ void foo(void)
OFFSET(PT_EAX, pt_regs, eax);
OFFSET(PT_DS, pt_regs, xds);
OFFSET(PT_ES, pt_regs, xes);
- OFFSET(PT_GS, pt_regs, xgs);
+ OFFSET(PT_FS, pt_regs, xfs);
OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
OFFSET(PT_EIP, pt_regs, eip);
OFFSET(PT_CS, pt_regs, xcs);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 8a8bbdaaf38..dcbbd0a8bfc 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -605,7 +605,7 @@ void __init early_cpu_init(void)
struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
- regs->xgs = __KERNEL_PDA;
+ regs->xfs = __KERNEL_PDA;
return regs;
}
@@ -662,12 +662,12 @@ struct i386_pda boot_pda = {
.pcurrent = &init_task,
};
-static inline void set_kernel_gs(void)
+static inline void set_kernel_fs(void)
{
- /* Set %gs for this CPU's PDA. Memory clobber is to create a
+ /* Set %fs for this CPU's PDA. Memory clobber is to create a
barrier with respect to any PDA operations, so the compiler
doesn't move any before here. */
- asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
}
/* Initialize the CPU's GDT and PDA. The boot CPU does this for
@@ -718,7 +718,7 @@ void __cpuinit cpu_set_gdt(int cpu)
the boot CPU, this will transition from the boot gdt+pda to
the real ones). */
load_gdt(cpu_gdt_descr);
- set_kernel_gs();
+ set_kernel_fs();
}
/* Common CPU init for both boot and secondary CPUs */
@@ -764,8 +764,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
- /* Clear %fs. */
- asm volatile ("mov %0, %%fs" : : "r" (0));
+ /* Clear %gs. */
+ asm volatile ("mov %0, %%gs" : : "r" (0));
/* Clear all 6 debug registers: */
set_debugreg(0, 0);
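After this swap the PDA is reached through %fs rather than %gs, so per-CPU
fields become segment-relative loads. A hedged illustration of such an access
(pda_cpu_number is hypothetical; the kernel's real accessors are the pda_*
macros in asm/pda.h):

#include <linux/stddef.h>
#include <asm/pda.h>

/* Hypothetical: read the cpu_number field of this CPU's PDA through the
 * %fs segment base that set_kernel_fs() just installed. */
static inline int pda_cpu_number(void)
{
	int cpu;

	asm("movl %%fs:%c1, %0"
	    : "=r" (cpu)
	    : "i" (offsetof(struct i386_pda, cpu_number)));
	return cpu;
}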
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index c0c3b59de32..de27bd07bc9 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -6,6 +6,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/timer.h>
+#include <asm/pci-direct.h>
#include "cpu.h"
@@ -161,19 +162,19 @@ static void __cpuinit set_cx86_inc(void)
static void __cpuinit geode_configure(void)
{
unsigned long flags;
- u8 ccr3, ccr4;
+ u8 ccr3;
local_irq_save(flags);
/* Suspend on halt power saving and enable #SUSP pin */
setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
ccr3 = getCx86(CX86_CCR3);
- setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* Enable */
-
- ccr4 = getCx86(CX86_CCR4);
- ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
- setCx86(CX86_CCR3, ccr3);
+
+ /* FPU fast, DTE cache, Mem bypass */
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
+ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
set_cx86_memwb();
set_cx86_reorder();
@@ -183,14 +184,6 @@ static void __cpuinit geode_configure(void)
}
-#ifdef CONFIG_PCI
-static struct pci_device_id __cpuinitdata cyrix_55x0[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) },
- { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) },
- { },
-};
-#endif
-
static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
@@ -258,6 +251,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
#ifdef CONFIG_PCI
+ {
+ u32 vendor, device;
/* It isn't really a PCI quirk directly, but the cure is the
same. The MediaGX has deep magic SMM stuff that handles the
SB emulation. It throws away the fifo on disable_dma() which
@@ -273,22 +268,34 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
isa_dma_bridge_buggy = 2;
+		/* We do this before the PCI layer is running. However, we
+		   are safe here because we know the bridge must be a Cyrix
+		   companion and must be present */
+ vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
+ device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
/*
* The 5510/5520 companion chips have a funky PIT.
*/
- if (pci_dev_present(cyrix_55x0))
+ if (vendor == PCI_VENDOR_ID_CYRIX &&
+ (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
pit_latch_buggy = 1;
+ }
#endif
c->x86_cache_size=16;	/* Yep 16K integrated cache, that's it */
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
/* Enable cxMMX extensions (GX1 Datasheet 54) */
- setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
- /* GXlv/GXm/GX1 */
- if((dir1 >= 0x50 && dir1 <= 0x54) || dir1 >= 0x63)
+ /*
+ * GXm : 0x30 ... 0x5f GXm datasheet 51
+ * GXlv: 0x6x GXlv datasheet 54
+ * ? : 0x7x
+ * GX1 : 0x8x GX1 datasheet 56
+ */
+		if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
geode_configure();
get_model_name(c); /* get CPU marketing name */
return;
@@ -415,15 +422,14 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
if (dir0 == 5 || dir0 == 3)
{
- unsigned char ccr3, ccr4;
+ unsigned char ccr3;
unsigned long flags;
printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
- setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
- ccr4 = getCx86(CX86_CCR4);
- setCx86(CX86_CCR4, ccr4 | 0x80); /* enable cpuid */
- setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */
+ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
local_irq_restore(flags);
}
}
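read_pci_config_16() from asm/pci-direct.h performs a type-1 configuration
cycle by hand (address written to port 0xCF8, data read from 0xCFC), which is
why it works before the PCI core is initialized. A hedged restatement of the
probe above as a standalone helper:

#include <linux/pci_ids.h>
#include <asm/pci-direct.h>

/* Sketch: detect a 5510/5520 companion with early config-space reads,
 * using the same (bus, dev, fn, reg) arguments as the patch. */
static int __init have_cyrix_55x0(void)
{
	u32 vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
	u32 device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);

	return vendor == PCI_VENDOR_ID_CYRIX &&
	       (device == PCI_DEVICE_ID_CYRIX_5510 ||
		device == PCI_DEVICE_ID_CYRIX_5520);
}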
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index d555bec0db9..4f10c62d180 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -12,6 +12,7 @@
#include <asm/processor.h>
#include <asm/system.h>
+#include <asm/mce.h>
#include "mce.h"
diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h
index 84fd4cf7d0f..81fb6e2d35f 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.h
+++ b/arch/i386/kernel/cpu/mcheck/mce.h
@@ -1,4 +1,5 @@
#include <linux/init.h>
+#include <asm/mce.h>
void amd_mcheck_init(struct cpuinfo_x86 *c);
void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
@@ -9,6 +10,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c);
/* Call the installed machine check handler for this CPU setup. */
extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
-extern int mce_disabled;
extern int nr_mce_banks;
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index 5ae1705eafa..c7d8f175674 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -211,6 +211,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
default:
return -ENOTTY;
case MTRRIOC_ADD_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_ADD_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err =
@@ -218,21 +221,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
file, 0);
break;
case MTRRIOC_SET_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_SET_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, 0);
break;
case MTRRIOC_DEL_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_DEL_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_KILL_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_GET_ENTRY:
+#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
@@ -249,6 +264,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
break;
case MTRRIOC_ADD_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_ADD_PAGE_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err =
@@ -256,21 +274,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
file, 1);
break;
case MTRRIOC_SET_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_SET_PAGE_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0);
break;
case MTRRIOC_DEL_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_DEL_PAGE_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_KILL_PAGE_ENTRY:
+#endif
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_GET_PAGE_ENTRY:
+#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
@@ -339,7 +369,7 @@ static int mtrr_open(struct inode *inode, struct file *file)
return single_open(file, mtrr_seq_show, NULL);
}
-static struct file_operations mtrr_fops = {
+static const struct file_operations mtrr_fops = {
.owner = THIS_MODULE,
.open = mtrr_open,
.read = seq_read,
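The MTRRIOC32_* values differ from the native numbers because an ioctl
command encodes the size of its argument, and a 32-bit task's struct
mtrr_sentry has a different layout. A hedged sketch of how such compat
numbers are typically derived (illustrative only; the real definitions
accompany this patch series):

#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#include <linux/ioctl.h>

/* In 32-bit userspace 'base' is a 32-bit long, so _IOW() sees a
 * different argument size and yields a distinct command number. */
struct mtrr_sentry32 {
	compat_ulong_t base;	/* Base address */
	compat_uint_t size;	/* Size of region */
	compat_uint_t type;	/* Type of region */
};

#define MTRR_IOCTL_BASE	'M'

#define MTRRIOC32_ADD_ENTRY \
	_IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
#define MTRRIOC32_SET_ENTRY \
	_IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32)
#define MTRRIOC32_DEL_ENTRY \
	_IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32)
#endif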
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 16bb7ea8714..0acfb6a5a22 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -50,7 +50,7 @@ u32 num_var_ranges = 0;
unsigned int *usage_table;
static DEFINE_MUTEX(mtrr_mutex);
-u32 size_or_mask, size_and_mask;
+u64 size_or_mask, size_and_mask;
static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
@@ -662,8 +662,8 @@ void __init mtrr_bp_init(void)
boot_cpu_data.x86_mask == 0x4))
phys_addr = 36;
- size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1);
- size_and_mask = ~size_or_mask & 0xfff00000;
+ size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
+ size_and_mask = ~size_or_mask & 0xfffff00000ULL;
} else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
boot_cpu_data.x86 == 6) {
/* VIA C* family have Intel style MTRRs, but
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index d61ea9db6cf..289dfe6030e 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -84,7 +84,7 @@ void get_mtrr_state(void);
extern void set_mtrr_ops(struct mtrr_ops * ops);
-extern u32 size_or_mask, size_and_mask;
+extern u64 size_or_mask, size_and_mask;
extern struct mtrr_ops * mtrr_if;
#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
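Widening the masks to u64 matters as soon as a CPU reports more than 32
physical address bits, since the old u32 arithmetic silently truncated the
shift. A small standalone worked example for the 36-bit case handled in
mtrr_bp_init():

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int phys_addr = 36, page_shift = 12;	/* e.g. a 36-bit P4 */
	uint64_t size_or_mask  = ~((1ULL << (phys_addr - page_shift)) - 1);
	uint64_t size_and_mask = ~size_or_mask & 0xfffff00000ULL;

	/* prints ffffffffff000000 f00000 */
	printf("%llx %llx\n",
	       (unsigned long long)size_or_mask,
	       (unsigned long long)size_and_mask);
	return 0;
}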
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 6624d8583c4..47e3ebbfb28 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -29,7 +29,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
- NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
+ NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow",
/* Transmeta-defined */
"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -47,7 +47,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
/* Intel-defined (#2) */
"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
- NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* VIA/Cyrix/Centaur-defined */
@@ -57,8 +57,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* AMD-defined (#2) */
- "lahf_lm", "cmp_legacy", "svm", NULL, "cr8legacy", NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm",
+ "sse4a", "misalignsse",
+ "3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
};
@@ -69,8 +70,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
"ttp", /* thermal trip */
"tm",
"stc",
+ "100mhzsteps",
+ "hwpstate",
NULL,
- /* nothing */ /* constant_tsc - moved to flags */
+ NULL, /* constant_tsc - moved to flags */
+ /* nothing */
};
struct cpuinfo_x86 *c = v;
int i, n = c - cpu_data;
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 4056fb7d2cd..5678d46863c 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -9,7 +9,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
{
unsigned int cap_mask, uk, max, dummy;
unsigned int cms_rev1, cms_rev2;
- unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev;
+ unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
char cpu_info[65];
get_model_name(c); /* Same as AMD/Cyrix */
@@ -72,6 +72,9 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
wrmsr(0x80860004, ~0, uk);
c->x86_capability[0] = cpuid_edx(0x00000001);
wrmsr(0x80860004, cap_mask, uk);
+
+ /* All Transmeta CPUs have a constant TSC */
+ set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
/* If we can run i686 user-space code, call us an i686 */
#define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV)
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 51130b39cd2..eeae0d99233 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -48,7 +48,6 @@ static struct class *cpuid_class;
#ifdef CONFIG_SMP
struct cpuid_command {
- int cpu;
u32 reg;
u32 *data;
};
@@ -57,8 +56,7 @@ static void cpuid_smp_cpuid(void *cmd_block)
{
struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
- if (cmd->cpu == smp_processor_id())
- cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
+ cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
&cmd->data[3]);
}
@@ -70,11 +68,10 @@ static inline void do_cpuid(int cpu, u32 reg, u32 * data)
if (cpu == smp_processor_id()) {
cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
} else {
- cmd.cpu = cpu;
cmd.reg = reg;
cmd.data = data;
- smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1);
+ smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
}
preempt_enable();
}
@@ -148,7 +145,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
/*
* File operations we support
*/
-static struct file_operations cpuid_fops = {
+static const struct file_operations cpuid_fops = {
.owner = THIS_MODULE,
.llseek = cpuid_seek,
.read = cpuid_read,
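The cpuid.c change swaps a broadcast IPI for a targeted one: previously every
CPU entered the handler and all but the addressee returned after an
smp_processor_id() check. A hedged sketch of the resulting idiom (cpuid_on is
a hypothetical name):

#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/processor.h>

struct cpuid_cmd {
	u32 reg;
	u32 *data;
};

static void remote_cpuid(void *info)
{
	struct cpuid_cmd *c = info;

	cpuid(c->reg, &c->data[0], &c->data[1], &c->data[2], &c->data[3]);
}

/* Hypothetical wrapper: execute CPUID on one specific CPU and wait. */
static void cpuid_on(int cpu, u32 reg, u32 *data)
{
	struct cpuid_cmd c = { .reg = reg, .data = data };

	preempt_disable();
	if (cpu == smp_processor_id())
		cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
	else
		smp_call_function_single(cpu, remote_cpuid, &c,
					 1 /* nonatomic */, 1 /* wait */);
	preempt_enable();
}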
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index f391abcf7da..70f39560846 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -14,6 +14,7 @@
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/e820.h>
+#include <asm/setup.h>
#ifdef CONFIG_EFI
int efi_enabled = 0;
@@ -156,21 +157,22 @@ static struct resource standard_io_resources[] = { {
.flags = IORESOURCE_BUSY | IORESOURCE_IO
} };
-static int romsignature(const unsigned char *x)
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
{
unsigned short sig;
- int ret = 0;
- if (probe_kernel_address((const unsigned short *)x, sig) == 0)
- ret = (sig == 0xaa55);
- return ret;
+
+ return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
+ sig == ROMSIGNATURE;
}
static int __init romchecksum(unsigned char *rom, unsigned long length)
{
- unsigned char *p, sum = 0;
+ unsigned char sum;
- for (p = rom; p < rom + length; p++)
- sum += *p;
+ for (sum = 0; length; length--)
+ sum += *rom++;
return sum == 0;
}
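For context, a hedged sketch of how these two helpers combine when scanning
the legacy adapter-ROM area; byte 2 of the ROM header holds the image length
in 512-byte units (rom_valid is illustrative, not the patch's code):

/* Sketch: accept a candidate only if the 0xaa55 signature is present
 * and the whole image checksums to zero. */
static int __init rom_valid(unsigned char *rom)
{
	unsigned long length;

	if (!romsignature(rom))
		return 0;
	length = rom[2] * 512;		/* header byte 2: size in 512b units */
	return length && romchecksum(rom, length);
}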
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5e47683fc63..18bddcb8e9e 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,7 +30,7 @@
* 18(%esp) - %eax
* 1C(%esp) - %ds
* 20(%esp) - %es
- * 24(%esp) - %gs
+ * 24(%esp) - %fs
* 28(%esp) - orig_eax
* 2C(%esp) - %eip
* 30(%esp) - %cs
@@ -99,9 +99,9 @@ VM_MASK = 0x00020000
#define SAVE_ALL \
cld; \
- pushl %gs; \
+ pushl %fs; \
CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET gs, 0;*/\
+ /*CFI_REL_OFFSET fs, 0;*/\
pushl %es; \
CFI_ADJUST_CFA_OFFSET 4;\
/*CFI_REL_OFFSET es, 0;*/\
@@ -133,7 +133,7 @@ VM_MASK = 0x00020000
movl %edx, %ds; \
movl %edx, %es; \
movl $(__KERNEL_PDA), %edx; \
- movl %edx, %gs
+ movl %edx, %fs
#define RESTORE_INT_REGS \
popl %ebx; \
@@ -166,9 +166,9 @@ VM_MASK = 0x00020000
2: popl %es; \
CFI_ADJUST_CFA_OFFSET -4;\
/*CFI_RESTORE es;*/\
-3: popl %gs; \
+3: popl %fs; \
CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE gs;*/\
+ /*CFI_RESTORE fs;*/\
.pushsection .fixup,"ax"; \
4: movl $0,(%esp); \
jmp 1b; \
@@ -227,6 +227,7 @@ ENTRY(ret_from_fork)
CFI_ADJUST_CFA_OFFSET -4
jmp syscall_exit
CFI_ENDPROC
+END(ret_from_fork)
/*
* Return to user mode is not as complex as all this looks,
@@ -258,6 +259,7 @@ ENTRY(resume_userspace)
# int/exception return?
jne work_pending
jmp restore_all
+END(ret_from_exception)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
@@ -272,6 +274,7 @@ need_resched:
jz restore_all
call preempt_schedule_irq
jmp need_resched
+END(resume_kernel)
#endif
CFI_ENDPROC
@@ -349,16 +352,17 @@ sysenter_past_esp:
movl PT_OLDESP(%esp), %ecx
xorl %ebp,%ebp
TRACE_IRQS_ON
-1: mov PT_GS(%esp), %gs
+1: mov PT_FS(%esp), %fs
ENABLE_INTERRUPTS_SYSEXIT
CFI_ENDPROC
.pushsection .fixup,"ax"
-2: movl $0,PT_GS(%esp)
+2: movl $0,PT_FS(%esp)
jmp 1b
.section __ex_table,"a"
.align 4
.long 1b,2b
.popsection
+ENDPROC(sysenter_entry)
# system call handler stub
ENTRY(system_call)
@@ -459,6 +463,7 @@ ldt_ss:
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
CFI_ENDPROC
+ENDPROC(system_call)
# perform work that needs to be done immediately before resumption
ALIGN
@@ -504,6 +509,7 @@ work_notifysig_v86:
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace_sig
+END(work_pending)
# perform syscall exit tracing
ALIGN
@@ -519,6 +525,7 @@ syscall_trace_entry:
cmpl $(nr_syscalls), %eax
jnae syscall_call
jmp syscall_exit
+END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
@@ -532,6 +539,7 @@ syscall_exit_work:
movl $1, %edx
call do_syscall_trace
jmp resume_userspace
+END(syscall_exit_work)
CFI_ENDPROC
RING0_INT_FRAME # can't unwind into user space anyway
@@ -542,15 +550,17 @@ syscall_fault:
GET_THREAD_INFO(%ebp)
movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
+END(syscall_fault)
syscall_badsys:
movl $-ENOSYS,PT_EAX(%esp)
jmp resume_userspace
+END(syscall_badsys)
CFI_ENDPROC
#define FIXUP_ESPFIX_STACK \
/* since we are on the wrong stack, we can't make it C code :( */ \
- movl %gs:PDA_cpu, %ebx; \
+ movl %fs:PDA_cpu, %ebx; \
PER_CPU(cpu_gdt_descr, %ebx); \
movl GDS_address(%ebx), %ebx; \
GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
@@ -581,9 +591,9 @@ syscall_badsys:
ENTRY(interrupt)
.text
-vector=0
ENTRY(irq_entries_start)
RING0_INT_FRAME
+vector=0
.rept NR_IRQS
ALIGN
.if vector
@@ -592,11 +602,16 @@ ENTRY(irq_entries_start)
1: pushl $~(vector)
CFI_ADJUST_CFA_OFFSET 4
jmp common_interrupt
-.data
+ .previous
.long 1b
-.text
+ .text
vector=vector+1
.endr
+END(irq_entries_start)
+
+.previous
+END(interrupt)
+.previous
/*
* the CPU automatically disables interrupts when executing an IRQ vector,
@@ -609,6 +624,7 @@ common_interrupt:
movl %esp,%eax
call do_IRQ
jmp ret_from_intr
+ENDPROC(common_interrupt)
CFI_ENDPROC
#define BUILD_INTERRUPT(name, nr) \
@@ -621,18 +637,24 @@ ENTRY(name) \
movl %esp,%eax; \
call smp_/**/name; \
jmp ret_from_intr; \
- CFI_ENDPROC
+ CFI_ENDPROC; \
+ENDPROC(name)
/* The include is where all of the SMP etc. interrupts come from */
#include "entry_arch.h"
+/* This alternate entry is needed because we hijack the apic LVTT */
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
+BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
+#endif
+
KPROBE_ENTRY(page_fault)
RING0_EC_FRAME
pushl $do_page_fault
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
- /* the function address is in %gs's slot on the stack */
+ /* the function address is in %fs's slot on the stack */
pushl %es
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET es, 0*/
@@ -661,20 +683,20 @@ error_code:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebx, 0
cld
- pushl %gs
+ pushl %fs
CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET gs, 0*/
+ /*CFI_REL_OFFSET fs, 0*/
movl $(__KERNEL_PDA), %ecx
- movl %ecx, %gs
+ movl %ecx, %fs
UNWIND_ESPFIX_STACK
popl %ecx
CFI_ADJUST_CFA_OFFSET -4
/*CFI_REGISTER es, ecx*/
- movl PT_GS(%esp), %edi # get the function address
+ movl PT_FS(%esp), %edi # get the function address
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_GS(%esp)
- /*CFI_REL_OFFSET gs, ES*/
+ mov %ecx, PT_FS(%esp)
+ /*CFI_REL_OFFSET fs, ES*/
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
@@ -692,6 +714,7 @@ ENTRY(coprocessor_error)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(coprocessor_error)
ENTRY(simd_coprocessor_error)
RING0_INT_FRAME
@@ -701,6 +724,7 @@ ENTRY(simd_coprocessor_error)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(simd_coprocessor_error)
ENTRY(device_not_available)
RING0_INT_FRAME
@@ -721,6 +745,7 @@ device_not_available_emulate:
CFI_ADJUST_CFA_OFFSET -4
jmp ret_from_exception
CFI_ENDPROC
+END(device_not_available)
/*
* Debug traps and NMI can happen at the one SYSENTER instruction
@@ -864,10 +889,12 @@ ENTRY(native_iret)
.align 4
.long 1b,iret_exc
.previous
+END(native_iret)
ENTRY(native_irq_enable_sysexit)
sti
sysexit
+END(native_irq_enable_sysexit)
#endif
KPROBE_ENTRY(int3)
@@ -890,6 +917,7 @@ ENTRY(overflow)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(overflow)
ENTRY(bounds)
RING0_INT_FRAME
@@ -899,6 +927,7 @@ ENTRY(bounds)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(bounds)
ENTRY(invalid_op)
RING0_INT_FRAME
@@ -908,6 +937,7 @@ ENTRY(invalid_op)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(invalid_op)
ENTRY(coprocessor_segment_overrun)
RING0_INT_FRAME
@@ -917,6 +947,7 @@ ENTRY(coprocessor_segment_overrun)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(coprocessor_segment_overrun)
ENTRY(invalid_TSS)
RING0_EC_FRAME
@@ -924,6 +955,7 @@ ENTRY(invalid_TSS)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(invalid_TSS)
ENTRY(segment_not_present)
RING0_EC_FRAME
@@ -931,6 +963,7 @@ ENTRY(segment_not_present)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(segment_not_present)
ENTRY(stack_segment)
RING0_EC_FRAME
@@ -938,6 +971,7 @@ ENTRY(stack_segment)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(stack_segment)
KPROBE_ENTRY(general_protection)
RING0_EC_FRAME
@@ -953,6 +987,7 @@ ENTRY(alignment_check)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(alignment_check)
ENTRY(divide_error)
RING0_INT_FRAME
@@ -962,6 +997,7 @@ ENTRY(divide_error)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(divide_error)
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
@@ -972,6 +1008,7 @@ ENTRY(machine_check)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(machine_check)
#endif
ENTRY(spurious_interrupt_bug)
@@ -982,6 +1019,7 @@ ENTRY(spurious_interrupt_bug)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
+END(spurious_interrupt_bug)
ENTRY(kernel_thread_helper)
pushl $0 # fake return address for unwinder
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index edef5084ce1..3fa7f9389af 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -53,6 +53,7 @@
* any particular GDT layout, because we load our own as soon as we
* can.
*/
+.section .text.head,"ax",@progbits
ENTRY(startup_32)
#ifdef CONFIG_PARAVIRT
@@ -103,7 +104,7 @@ ENTRY(startup_32)
movzwl OLD_CL_OFFSET,%esi
addl $(OLD_CL_BASE_ADDR),%esi
2:
- movl $(saved_command_line - __PAGE_OFFSET),%edi
+ movl $(boot_command_line - __PAGE_OFFSET),%edi
movl $(COMMAND_LINE_SIZE/4),%ecx
rep
movsl
@@ -141,16 +142,25 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
jb 10b
movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
-#ifdef CONFIG_SMP
xorl %ebx,%ebx /* This is the boot CPU (BSP) */
jmp 3f
-
/*
* Non-boot CPU entry point; entered from trampoline.S
* We can't lgdt here, because lgdt itself uses a data segment, but
* we know the trampoline has already loaded the boot_gdt_table GDT
* for us.
+ *
+ * If CPU hotplug is not supported then this code can go in the init
+ * section, which will be freed later
*/
+
+#ifdef CONFIG_HOTPLUG_CPU
+.section .text,"ax",@progbits
+#else
+.section .init.text,"ax",@progbits
+#endif
+
+#ifdef CONFIG_SMP
ENTRY(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
@@ -208,8 +218,8 @@ ENTRY(startup_32_smp)
xorl %ebx,%ebx
incl %ebx
-3:
#endif /* CONFIG_SMP */
+3:
/*
* Enable paging
@@ -309,7 +319,7 @@ is386: movl $2,%ecx # set MP
call check_x87
call setup_pda
- lgdt cpu_gdt_descr
+ lgdt early_gdt_descr
lidt idt_descr
ljmp $(__KERNEL_CS),$1f
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
@@ -319,12 +329,12 @@ is386: movl $2,%ecx # set MP
movl %eax,%ds
movl %eax,%es
- xorl %eax,%eax # Clear FS and LDT
- movl %eax,%fs
+ xorl %eax,%eax # Clear GS and LDT
+ movl %eax,%gs
lldt %ax
movl $(__KERNEL_PDA),%eax
- mov %eax,%gs
+ mov %eax,%fs
cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
@@ -360,12 +370,12 @@ check_x87:
* cpu_gdt_table and boot_pda; for secondary CPUs, these will be
* that CPU's GDT and PDA.
*/
-setup_pda:
+ENTRY(setup_pda)
/* get the PDA pointer */
movl start_pda, %eax
/* slot the PDA address into the GDT */
- mov cpu_gdt_descr+2, %ecx
+ mov early_gdt_descr+2, %ecx
mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
shr $16, %eax
mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
@@ -492,6 +502,7 @@ ignore_int:
#endif
iret
+.section .text
#ifdef CONFIG_PARAVIRT
startup_paravirt:
cld
@@ -502,10 +513,11 @@ startup_paravirt:
pushl %ecx
pushl %eax
- /* paravirt.o is last in link, and that probe fn never returns */
pushl $__start_paravirtprobe
1:
movl 0(%esp), %eax
+ cmpl $__stop_paravirtprobe, %eax
+ je unhandled_paravirt
pushl (%eax)
movl 8(%esp), %eax
call *(%esp)
@@ -517,6 +529,10 @@ startup_paravirt:
addl $4, (%esp)
jmp 1b
+
+unhandled_paravirt:
+ /* Nothing wanted us: we're screwed. */
+ ud2
#endif
/*
@@ -581,7 +597,7 @@ idt_descr:
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
-ENTRY(cpu_gdt_descr)
+ENTRY(early_gdt_descr)
.word GDT_ENTRIES*8-1
.long cpu_gdt_table
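The byte-poking in setup_pda above scatters the PDA base across the
non-contiguous base fields of an x86 segment descriptor. A hedged C
restatement of the layout being written (struct and helper are illustrative
only):

#include <linux/types.h>

/* Illustrative view of one 8-byte GDT descriptor: the 32-bit base is
 * split across three fields, matching the three byte stores above. */
struct gdt_desc_bytes {
	u16 limit0;		/* limit 15..0 */
	u16 base0;		/* base 15..0   -> mov %ax, ...+2 */
	u8  base1;		/* base 23..16  -> mov %al, ...+4 */
	u8  type;
	u8  limit1_flags;
	u8  base2;		/* base 31..24  -> mov %ah, ...+7 */
} __attribute__((packed));

static inline void set_desc_base(struct gdt_desc_bytes *d, u32 base)
{
	d->base0 = base & 0xffff;
	d->base1 = (base >> 16) & 0xff;
	d->base2 = (base >> 24) & 0xff;
}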
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 0b29d41322a..e1006b7acc9 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -1,4 +1,5 @@
#include <linux/clocksource.h>
+#include <linux/clockchips.h>
#include <linux/errno.h>
#include <linux/hpet.h>
#include <linux/init.h>
@@ -6,17 +7,278 @@
#include <asm/hpet.h>
#include <asm/io.h>
+extern struct clock_event_device *global_clock_event;
+
#define HPET_MASK CLOCKSOURCE_MASK(32)
#define HPET_SHIFT 22
/* FSEC = 10^-15 NSEC = 10^-9 */
#define FSEC_PER_NSEC 1000000
-static void __iomem *hpet_ptr;
+/*
+ * HPET address is set in acpi/boot.c when an ACPI entry exists
+ */
+unsigned long hpet_address;
+static void __iomem *hpet_virt_address;
+
+static inline unsigned long hpet_readl(unsigned long a)
+{
+ return readl(hpet_virt_address + a);
+}
+
+static inline void hpet_writel(unsigned long d, unsigned long a)
+{
+ writel(d, hpet_virt_address + a);
+}
+
+/*
+ * HPET command line enable / disable
+ */
+static int boot_hpet_disable;
+
+static int __init hpet_setup(char *str)
+{
+ if (str) {
+ if (!strncmp("disable", str, 7))
+ boot_hpet_disable = 1;
+ }
+ return 1;
+}
+__setup("hpet=", hpet_setup);
+
+static inline int is_hpet_capable(void)
+{
+ return (!boot_hpet_disable && hpet_address);
+}
+
+/*
+ * HPET timer interrupt enable / disable
+ */
+static int hpet_legacy_int_enabled;
+
+/**
+ * is_hpet_enabled - check whether the hpet timer interrupt is enabled
+ */
+int is_hpet_enabled(void)
+{
+ return is_hpet_capable() && hpet_legacy_int_enabled;
+}
+
+/*
+ * When the hpet driver (/dev/hpet) is enabled, we need to reserve
+ * timer 0 and timer 1 in case of RTC emulation.
+ */
+#ifdef CONFIG_HPET
+static void hpet_reserve_platform_timers(unsigned long id)
+{
+ struct hpet __iomem *hpet = hpet_virt_address;
+ struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
+ unsigned int nrtimers, i;
+ struct hpet_data hd;
+
+ nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+
+ memset(&hd, 0, sizeof (hd));
+ hd.hd_phys_address = hpet_address;
+ hd.hd_address = hpet_virt_address;
+ hd.hd_nirqs = nrtimers;
+ hd.hd_flags = HPET_DATA_PLATFORM;
+ hpet_reserve_timer(&hd, 0);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+ hpet_reserve_timer(&hd, 1);
+#endif
+
+ hd.hd_irq[0] = HPET_LEGACY_8254;
+ hd.hd_irq[1] = HPET_LEGACY_RTC;
+
+ for (i = 2; i < nrtimers; timer++, i++)
+ hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
+ Tn_INT_ROUTE_CNF_SHIFT;
+
+	hpet_alloc(&hd);
+}
+#else
+static void hpet_reserve_platform_timers(unsigned long id) { }
+#endif
+
+/*
+ * Common hpet info
+ */
+static unsigned long hpet_period;
+
+static void hpet_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt);
+static int hpet_next_event(unsigned long delta,
+ struct clock_event_device *evt);
+
+/*
+ * The hpet clock event device
+ */
+static struct clock_event_device hpet_clockevent = {
+ .name = "hpet",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+ .set_mode = hpet_set_mode,
+ .set_next_event = hpet_next_event,
+ .shift = 32,
+ .irq = 0,
+};
+
+static void hpet_start_counter(void)
+{
+ unsigned long cfg = hpet_readl(HPET_CFG);
+
+ cfg &= ~HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
+ hpet_writel(0, HPET_COUNTER);
+ hpet_writel(0, HPET_COUNTER + 4);
+ cfg |= HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
+}
+
+static void hpet_enable_int(void)
+{
+ unsigned long cfg = hpet_readl(HPET_CFG);
+
+ cfg |= HPET_CFG_LEGACY;
+ hpet_writel(cfg, HPET_CFG);
+ hpet_legacy_int_enabled = 1;
+}
+
+static void hpet_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ unsigned long cfg, cmp, now;
+ uint64_t delta;
+
+ switch(mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
+ delta >>= hpet_clockevent.shift;
+ now = hpet_readl(HPET_COUNTER);
+ cmp = now + (unsigned long) delta;
+ cfg = hpet_readl(HPET_T0_CFG);
+ cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
+ HPET_TN_SETVAL | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_T0_CFG);
+ /*
+ * The first write after writing TN_SETVAL to the
+ * config register sets the counter value, the second
+ * write sets the period.
+ */
+ hpet_writel(cmp, HPET_T0_CMP);
+ udelay(1);
+ hpet_writel((unsigned long) delta, HPET_T0_CMP);
+ break;
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ cfg = hpet_readl(HPET_T0_CFG);
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_T0_CFG);
+ break;
+
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ cfg = hpet_readl(HPET_T0_CFG);
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T0_CFG);
+ break;
+ }
+}
+
+static int hpet_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ unsigned long cnt;
+
+ cnt = hpet_readl(HPET_COUNTER);
+ cnt += delta;
+ hpet_writel(cnt, HPET_T0_CMP);
+
+	return ((long)(hpet_readl(HPET_COUNTER) - cnt) > 0);
+}
+
+/*
+ * Try to setup the HPET timer
+ */
+int __init hpet_enable(void)
+{
+ unsigned long id;
+ uint64_t hpet_freq;
+
+ if (!is_hpet_capable())
+ return 0;
+
+ hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+
+ /*
+ * Read the period and check for a sane value:
+ */
+ hpet_period = hpet_readl(HPET_PERIOD);
+ if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
+ goto out_nohpet;
+
+ /*
+	 * The period is given in femtoseconds. We need to calculate the
+ * scaled math multiplication factor for nanosecond to hpet tick
+ * conversion.
+ */
+ hpet_freq = 1000000000000000ULL;
+ do_div(hpet_freq, hpet_period);
+ hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
+ NSEC_PER_SEC, 32);
+ /* Calculate the min / max delta */
+ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
+ &hpet_clockevent);
+ hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
+ &hpet_clockevent);
+
+ /*
+ * Read the HPET ID register to retrieve the IRQ routing
+ * information and the number of channels
+ */
+ id = hpet_readl(HPET_ID);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+ /*
+ * The legacy routing mode needs at least two channels, tick timer
+ * and the rtc emulation channel.
+ */
+ if (!(id & HPET_ID_NUMBER))
+ goto out_nohpet;
+#endif
+
+ /* Start the counter */
+ hpet_start_counter();
+
+ if (id & HPET_ID_LEGSUP) {
+ hpet_enable_int();
+ hpet_reserve_platform_timers(id);
+ /*
+ * Start hpet with the boot cpu mask and make it
+ * global after the IO_APIC has been initialized.
+ */
+		hpet_clockevent.cpumask = cpumask_of_cpu(0);
+ clockevents_register_device(&hpet_clockevent);
+ global_clock_event = &hpet_clockevent;
+ return 1;
+ }
+ return 0;
+out_nohpet:
+ iounmap(hpet_virt_address);
+ hpet_virt_address = NULL;
+ return 0;
+}
+
+/*
+ * Clock source related code
+ */
static cycle_t read_hpet(void)
{
- return (cycle_t)readl(hpet_ptr);
+ return (cycle_t)hpet_readl(HPET_COUNTER);
}
static struct clocksource clocksource_hpet = {
@@ -24,28 +286,17 @@ static struct clocksource clocksource_hpet = {
.rating = 250,
.read = read_hpet,
.mask = HPET_MASK,
- .mult = 0, /* set below */
.shift = HPET_SHIFT,
- .is_continuous = 1,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static int __init init_hpet_clocksource(void)
{
- unsigned long hpet_period;
- void __iomem* hpet_base;
u64 tmp;
- int err;
- if (!is_hpet_enabled())
+ if (!hpet_virt_address)
return -ENODEV;
- /* calculate the hpet address: */
- hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
- hpet_ptr = hpet_base + HPET_COUNTER;
-
- /* calculate the frequency: */
- hpet_period = readl(hpet_base + HPET_PERIOD);
-
/*
* hpet period is in femto seconds per cycle
* so we need to convert this to ns/cyc units
@@ -61,11 +312,218 @@ static int __init init_hpet_clocksource(void)
do_div(tmp, FSEC_PER_NSEC);
clocksource_hpet.mult = (u32)tmp;
- err = clocksource_register(&clocksource_hpet);
- if (err)
- iounmap(hpet_base);
-
- return err;
+ return clocksource_register(&clocksource_hpet);
}
module_init(init_hpet_clocksource);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+
+/* HPET in LegacyReplacement mode eats up the RTC interrupt line. When
+ * HPET is enabled, we support RTC interrupt functionality in software.
+ * RTC has 3 kinds of interrupts:
+ * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
+ * is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
+ * (1) and (2) above are implemented using polling at a frequency of
+ * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
+ * overhead. (DEFAULT_RTC_INT_FREQ)
+ * For (3), we use interrupts at 64Hz or user specified periodic
+ * frequency, whichever is higher.
+ */
+#include <linux/mc146818rtc.h>
+#include <linux/rtc.h>
+
+#define DEFAULT_RTC_INT_FREQ 64
+#define DEFAULT_RTC_SHIFT 6
+#define RTC_NUM_INTS 1
+
+static unsigned long hpet_rtc_flags;
+static unsigned long hpet_prev_update_sec;
+static struct rtc_time hpet_alarm_time;
+static unsigned long hpet_pie_count;
+static unsigned long hpet_t1_cmp;
+static unsigned long hpet_default_delta;
+static unsigned long hpet_pie_delta;
+static unsigned long hpet_pie_limit;
+
+/*
+ * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
+ * is not supported by all HPET implementations for timer 1.
+ *
+ * hpet_rtc_timer_init() is called when the rtc is initialized.
+ */
+int hpet_rtc_timer_init(void)
+{
+ unsigned long cfg, cnt, delta, flags;
+
+ if (!is_hpet_enabled())
+ return 0;
+
+ if (!hpet_default_delta) {
+ uint64_t clc;
+
+ clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+ clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
+ hpet_default_delta = (unsigned long) clc;
+ }
+
+ if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+ delta = hpet_default_delta;
+ else
+ delta = hpet_pie_delta;
+
+ local_irq_save(flags);
+
+ cnt = delta + hpet_readl(HPET_COUNTER);
+ hpet_writel(cnt, HPET_T1_CMP);
+ hpet_t1_cmp = cnt;
+
+ cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_T1_CFG);
+
+ local_irq_restore(flags);
+
+ return 1;
+}
+
+/*
+ * The functions below are called from the rtc driver.
+ * Return 0 if HPET is not being used.
+ * Otherwise do the necessary changes and return 1.
+ */
+int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
+{
+ if (!is_hpet_enabled())
+ return 0;
+
+ hpet_rtc_flags &= ~bit_mask;
+ return 1;
+}
+
+int hpet_set_rtc_irq_bit(unsigned long bit_mask)
+{
+ unsigned long oldbits = hpet_rtc_flags;
+
+ if (!is_hpet_enabled())
+ return 0;
+
+ hpet_rtc_flags |= bit_mask;
+
+ if (!oldbits)
+ hpet_rtc_timer_init();
+
+ return 1;
+}
+
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+ unsigned char sec)
+{
+ if (!is_hpet_enabled())
+ return 0;
+
+ hpet_alarm_time.tm_hour = hrs;
+ hpet_alarm_time.tm_min = min;
+ hpet_alarm_time.tm_sec = sec;
+
+ return 1;
+}
+
+int hpet_set_periodic_freq(unsigned long freq)
+{
+ uint64_t clc;
+
+ if (!is_hpet_enabled())
+ return 0;
+
+ if (freq <= DEFAULT_RTC_INT_FREQ)
+ hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
+ else {
+ clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+ do_div(clc, freq);
+ clc >>= hpet_clockevent.shift;
+ hpet_pie_delta = (unsigned long) clc;
+ }
+ return 1;
+}
+
+int hpet_rtc_dropped_irq(void)
+{
+ return is_hpet_enabled();
+}
+
+static void hpet_rtc_timer_reinit(void)
+{
+ unsigned long cfg, delta;
+ int lost_ints = -1;
+
+ if (unlikely(!hpet_rtc_flags)) {
+ cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T1_CFG);
+ return;
+ }
+
+ if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+ delta = hpet_default_delta;
+ else
+ delta = hpet_pie_delta;
+
+ /*
+ * Increment the comparator value until we are ahead of the
+ * current count.
+ */
+ do {
+ hpet_t1_cmp += delta;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+ lost_ints++;
+ } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
+
+ if (lost_ints) {
+ if (hpet_rtc_flags & RTC_PIE)
+ hpet_pie_count += lost_ints;
+ if (printk_ratelimit())
+ printk(KERN_WARNING "rtc: lost %d interrupts\n",
+ lost_ints);
+ }
+}
+
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
+{
+ struct rtc_time curr_time;
+ unsigned long rtc_int_flag = 0;
+
+ hpet_rtc_timer_reinit();
+
+ if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
+ rtc_get_rtc_time(&curr_time);
+
+ if (hpet_rtc_flags & RTC_UIE &&
+ curr_time.tm_sec != hpet_prev_update_sec) {
+ rtc_int_flag = RTC_UF;
+ hpet_prev_update_sec = curr_time.tm_sec;
+ }
+
+ if (hpet_rtc_flags & RTC_PIE &&
+ ++hpet_pie_count >= hpet_pie_limit) {
+ rtc_int_flag |= RTC_PF;
+ hpet_pie_count = 0;
+ }
+
+	if (hpet_rtc_flags & RTC_AIE &&
+ (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
+ (curr_time.tm_min == hpet_alarm_time.tm_min) &&
+ (curr_time.tm_hour == hpet_alarm_time.tm_hour))
+ rtc_int_flag |= RTC_AF;
+
+ if (rtc_int_flag) {
+ rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
+ rtc_interrupt(rtc_int_flag, dev_id);
+ }
+ return IRQ_HANDLED;
+}
+#endif
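To make the scaled math in hpet_enable() concrete, a standalone worked
example, assuming the common 14.31818 MHz HPET: the femtosecond period gives
the frequency, div_sc() turns ns-to-tick conversion into a multiply-and-shift,
and the periodic delta lands near freq/HZ as expected:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t period_fs = 69841279ULL;		 /* HPET_PERIOD */
	uint64_t freq = 1000000000000000ULL / period_fs; /* ~14318179 Hz */
	uint64_t mult = (freq << 32) / 1000000000ULL;	 /* div_sc(.., 32) */
	uint64_t tick_ns = 4000000ULL;			 /* NSEC_PER_SEC/HZ, HZ=250 */
	uint64_t delta = (tick_ns * mult) >> 32;	 /* ticks per period */

	/* delta comes out ~57272, i.e. freq / 250 */
	printf("freq=%llu mult=%llu delta=%llu\n",
	       (unsigned long long)freq,
	       (unsigned long long)mult,
	       (unsigned long long)delta);
	return 0;
}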
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
index 9a0060b92e3..a6bc7bb3883 100644
--- a/arch/i386/kernel/i8253.c
+++ b/arch/i386/kernel/i8253.c
@@ -2,7 +2,7 @@
* i8253.c 8253/PIT functions
*
*/
-#include <linux/clocksource.h>
+#include <linux/clockchips.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/sysdev.h>
@@ -19,17 +19,97 @@
DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);
-void setup_pit_timer(void)
+/*
+ * HPET replaces the PIT when enabled, so we need to know which of
+ * the two timers is in use.
+ */
+struct clock_event_device *global_clock_event;
+
+/*
+ * Initialize the PIT timer.
+ *
+ * This is also called after resume to bring the PIT into operation again.
+ */
+static void init_pit_timer(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+
+ switch(mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ /* binary, mode 2, LSB/MSB, ch 0 */
+ outb_p(0x34, PIT_MODE);
+ udelay(10);
+ outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
+ udelay(10);
+ outb(LATCH >> 8 , PIT_CH0); /* MSB */
+ break;
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ case CLOCK_EVT_MODE_UNUSED:
+ /* One shot setup */
+ outb_p(0x38, PIT_MODE);
+ udelay(10);
+ break;
+ }
+ spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+/*
+ * Program the next event in oneshot mode
+ *
+ * Delta is given in PIT ticks
+ */
+static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
{
unsigned long flags;
spin_lock_irqsave(&i8253_lock, flags);
- outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
- udelay(10);
- outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
- udelay(10);
- outb(LATCH >> 8 , PIT_CH0); /* MSB */
+ outb_p(delta & 0xff , PIT_CH0); /* LSB */
+ outb(delta >> 8 , PIT_CH0); /* MSB */
spin_unlock_irqrestore(&i8253_lock, flags);
+
+ return 0;
+}
+
+/*
+ * On UP the PIT can serve all of the possible timer functions. On SMP systems
+ * it can be solely used for the global tick.
+ *
+ * The profiling and update capabilities are switched off once the local apic is
+ * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
+ * !using_apic_timer decisions in do_timer_interrupt_hook()
+ */
+struct clock_event_device pit_clockevent = {
+ .name = "pit",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+ .set_mode = init_pit_timer,
+ .set_next_event = pit_next_event,
+ .shift = 32,
+ .irq = 0,
+};
+
+/*
+ * Initialize the conversion factor and the min/max deltas of the clock event
+ * structure and register the clock event source with the framework.
+ */
+void __init setup_pit_timer(void)
+{
+ /*
+ * Start pit with the boot cpu mask and make it global after the
+ * IO_APIC has been initialized.
+ */
+ pit_clockevent.cpumask = cpumask_of_cpu(0);
+ pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
+ pit_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFF, &pit_clockevent);
+ pit_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &pit_clockevent);
+ clockevents_register_device(&pit_clockevent);
+ global_clock_event = &pit_clockevent;
}
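The arithmetic behind this registration, worked through under the assumptions
CLOCK_TICK_RATE = 1193182 and HZ = 250 (standalone sketch): the 16-bit counter
caps the one-shot range near 27 ms, and the periodic reload is the familiar
LATCH value.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t pit_hz = 1193182, hz = 250;
	uint64_t mult = (pit_hz << 32) / 1000000000ULL;
	/* clockevent_delta2ns() is roughly (delta << shift) / mult */
	uint64_t max_ns = (0x7FFFULL << 32) / mult;
	uint64_t latch = (pit_hz + hz / 2) / hz;	/* periodic reload */

	/* ~27.46 ms one-shot range, 4773 tick periodic reload */
	printf("mult=%llu max_ns=%llu latch=%llu\n",
	       (unsigned long long)mult,
	       (unsigned long long)max_ns,
	       (unsigned long long)latch);
	return 0;
}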
/*
@@ -46,7 +126,7 @@ static cycle_t pit_read(void)
static u32 old_jifs;
spin_lock_irqsave(&i8253_lock, flags);
- /*
+ /*
* Although our caller may have the read side of xtime_lock,
* this is now a seqlock, and we are cheating in this routine
* by having side effects on state that we cannot undo if
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index c8d45821c78..03abfdb1a6e 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -41,6 +41,7 @@ static void mask_and_ack_8259A(unsigned int);
static struct irq_chip i8259A_chip = {
.name = "XT-PIC",
.mask = disable_8259A_irq,
+ .disable = disable_8259A_irq,
.unmask = enable_8259A_irq,
.mask_ack = mask_and_ack_8259A,
};
@@ -410,12 +411,6 @@ void __init native_init_IRQ(void)
intr_init_hook();
/*
- * Set the clock to HZ Hz, we already have a valid
- * vector now:
- */
- setup_pit_timer();
-
- /*
* External FPU? Set up irq13 if so, for
* original braindamaged IBM FERR coupling.
*/
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index ba8d302a0b7..6fec4dab70b 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -343,7 +343,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
break;
entry = irq_2_pin + entry->next;
}
- set_native_irq_info(irq, cpumask);
+ irq_desc[irq].affinity = cpumask;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -482,8 +482,8 @@ static void do_irq_balance(void)
package_index = CPU_TO_PACKAGEINDEX(i);
for (j = 0; j < NR_IRQS; j++) {
unsigned long value_now, delta;
- /* Is this an active IRQ? */
- if (!irq_desc[j].action)
+ /* Is this an active IRQ or balancing disabled ? */
+ if (!irq_desc[j].action || irq_balancing_disabled(j))
continue;
if ( package_index == i )
IRQ_DELTA(package_index,j) = 0;
@@ -1281,11 +1281,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
trigger == IOAPIC_LEVEL)
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_fasteoi_irq, "fasteoi");
- else {
- irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
+ else
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_edge_irq, "edge");
- }
set_intr_gate(vector, interrupt[irq]);
}
@@ -1356,7 +1354,7 @@ static void __init setup_IO_APIC_irqs(void)
}
spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(apic, pin, entry);
- set_native_irq_info(irq, TARGET_CPUS);
+ irq_desc[irq].affinity = TARGET_CPUS;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
}
@@ -1588,7 +1586,7 @@ void /*__init*/ print_local_APIC(void * dummy)
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
ver = GET_APIC_VERSION(v);
- maxlvt = get_maxlvt();
+ maxlvt = lapic_get_maxlvt();
v = apic_read(APIC_TASKPRI);
printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
@@ -1920,7 +1918,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
static void __init setup_ioapic_ids_from_mpc(void) { }
#endif
-static int no_timer_check __initdata;
+int no_timer_check __initdata;
static int __init notimercheck(char *s)
{
@@ -2310,7 +2308,7 @@ static inline void __init check_timer(void)
disable_8259A_irq(0);
set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
- "fasteio");
+ "fasteoi");
apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
enable_8259A_irq(0);
@@ -2587,7 +2585,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg(irq, &msg);
- set_native_irq_info(irq, mask);
+ irq_desc[irq].affinity = mask;
}
#endif /* CONFIG_SMP */
@@ -2671,7 +2669,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
dest = cpu_mask_to_apicid(mask);
target_ht_irq(irq, dest);
- set_native_irq_info(irq, mask);
+ irq_desc[irq].affinity = mask;
}
#endif
@@ -2877,7 +2875,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(ioapic, pin, entry);
- set_native_irq_info(irq, TARGET_CPUS);
+ irq_desc[irq].affinity = TARGET_CPUS;
spin_unlock_irqrestore(&ioapic_lock, flags);
return 0;
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 3201d421090..8db8d514c9c 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -10,7 +10,6 @@
* io_apic.c.)
*/
-#include <asm/uaccess.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/interrupt.h>
@@ -19,19 +18,34 @@
#include <linux/cpu.h>
#include <linux/delay.h>
+#include <asm/apic.h>
+#include <asm/uaccess.h>
+
DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
EXPORT_PER_CPU_SYMBOL(irq_stat);
-#ifndef CONFIG_X86_LOCAL_APIC
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
*/
void ack_bad_irq(unsigned int irq)
{
- printk("unexpected IRQ trap at vector %02x\n", irq);
-}
+ printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * Currently unexpected vectors happen only on SMP and APIC.
+ * We _must_ ack these because every local APIC has only N
+ * irq slots per priority level, and a 'hanging, unacked' IRQ
+ * holds up an irq slot - in excessive cases (when multiple
+ * unexpected vectors occur) that might lock up the APIC
+ * completely.
+ * But only ack when the APIC is enabled -AK
+ */
+ if (cpu_has_apic)
+ ack_APIC_irq();
#endif
+}
#ifdef CONFIG_4KSTACKS
/*
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index af1d5334499..b545bc746fc 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -363,7 +363,7 @@ no_kprobe:
" pushf\n"
/* skip cs, eip, orig_eax */
" subl $12, %esp\n"
- " pushl %gs\n"
+ " pushl %fs\n"
" pushl %ds\n"
" pushl %es\n"
" pushl %eax\n"
@@ -387,7 +387,7 @@ no_kprobe:
" popl %edi\n"
" popl %ebp\n"
" popl %eax\n"
- /* skip eip, orig_eax, es, ds, gs */
+ /* skip eip, orig_eax, es, ds, fs */
" addl $20, %esp\n"
" popf\n"
" ret\n");
@@ -408,7 +408,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
spin_lock_irqsave(&kretprobe_lock, flags);
head = kretprobe_inst_table_head(current);
/* fixup registers */
- regs->xcs = __KERNEL_CS;
+ regs->xcs = __KERNEL_CS | get_kernel_rpl();
regs->eip = trampoline_address;
regs->orig_eax = 0xffffffff;
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index c8fa13721bc..b8f16633a6e 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -384,7 +384,7 @@ static int do_microcode_update (void)
{
long cursor = 0;
int error = 0;
- void *new_mc;
+ void *new_mc = NULL;
int cpu;
cpumask_t old;
@@ -451,7 +451,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
return ret;
}
-static struct file_operations microcode_fops = {
+static const struct file_operations microcode_fops = {
.owner = THIS_MODULE,
.write = microcode_write,
.open = microcode_open,
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 4a472a17d1c..bcaa6e9b619 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -68,7 +68,6 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
#ifdef CONFIG_SMP
struct msr_command {
- int cpu;
int err;
u32 reg;
u32 data[2];
@@ -78,16 +77,14 @@ static void msr_smp_wrmsr(void *cmd_block)
{
struct msr_command *cmd = (struct msr_command *)cmd_block;
- if (cmd->cpu == smp_processor_id())
- cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
+ cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
}
static void msr_smp_rdmsr(void *cmd_block)
{
struct msr_command *cmd = (struct msr_command *)cmd_block;
- if (cmd->cpu == smp_processor_id())
- cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
+ cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
}
static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
@@ -99,12 +96,11 @@ static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
if (cpu == smp_processor_id()) {
ret = wrmsr_eio(reg, eax, edx);
} else {
- cmd.cpu = cpu;
cmd.reg = reg;
cmd.data[0] = eax;
cmd.data[1] = edx;
- smp_call_function(msr_smp_wrmsr, &cmd, 1, 1);
+ smp_call_function_single(cpu, msr_smp_wrmsr, &cmd, 1, 1);
ret = cmd.err;
}
preempt_enable();
@@ -120,10 +116,9 @@ static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
if (cpu == smp_processor_id()) {
ret = rdmsr_eio(reg, eax, edx);
} else {
- cmd.cpu = cpu;
cmd.reg = reg;
- smp_call_function(msr_smp_rdmsr, &cmd, 1, 1);
+ smp_call_function_single(cpu, msr_smp_rdmsr, &cmd, 1, 1);
*eax = cmd.data[0];
*edx = cmd.data[1];
@@ -230,7 +225,7 @@ static int msr_open(struct inode *inode, struct file *file)
/*
* File operations we support
*/
-static struct file_operations msr_fops = {
+static const struct file_operations msr_fops = {
.owner = THIS_MODULE,
.llseek = msr_seek,
.read = msr_read,
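The msr.c conversion is a textbook use of smp_call_function_single(): rather than broadcasting an IPI to every CPU and having each handler compare cmd->cpu against smp_processor_id(), the caller targets exactly one CPU, so both the cpu field and the per-handler filtering disappear. A usage sketch against the signature of this era, (cpu, func, info, nonatomic, wait):

	struct msr_command cmd = {
		.reg = reg,
	};

	/*
	 * wait == 1 is essential here: cmd lives on this stack frame,
	 * so the remote handler must have finished writing cmd.err and
	 * cmd.data[] before we return and the frame is reused.
	 */
	smp_call_function_single(cpu, msr_smp_rdmsr, &cmd, 1, 1);
	*eax = cmd.data[0];
	*edx = cmd.data[1];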
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 1a6f8bb8881..821df34d2b3 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -23,6 +23,7 @@
#include <linux/dmi.h>
#include <linux/kprobes.h>
#include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
#include <asm/smp.h>
#include <asm/nmi.h>
@@ -185,7 +186,8 @@ static __cpuinit inline int nmi_known_cpu(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
- return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+ return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
+ || (boot_cpu_data.x86 == 16));
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
return 1;
@@ -216,6 +218,28 @@ static __init void nmi_cpu_busy(void *data)
}
#endif
+static unsigned int adjust_for_32bit_ctr(unsigned int hz)
+{
+ u64 counter_val;
+ unsigned int retval = hz;
+
+ /*
+ * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
+ * are writable, with higher bits sign extending from bit 31.
+	 * So we can only program the counter with 31-bit values, and
+	 * the 32nd bit must be 1 so that bits 33 and up sign-extend to 1.
+	 * Find the appropriate nmi_hz.
+ */
+ counter_val = (u64)cpu_khz * 1000;
+ do_div(counter_val, retval);
+ if (counter_val > 0x7fffffffULL) {
+ u64 count = (u64)cpu_khz * 1000;
+ do_div(count, 0x7fffffffUL);
+ retval = count + 1;
+ }
+ return retval;
+}
+
static int __init check_nmi_watchdog(void)
{
unsigned int *prev_nmi_count;
@@ -281,18 +305,10 @@ static int __init check_nmi_watchdog(void)
struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
nmi_hz = 1;
- /*
- * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
- * are writable, with higher bits sign extending from bit 31.
- * So, we can only program the counter with 31 bit values and
- * 32nd bit should be 1, for 33.. to be 1.
- * Find the appropriate nmi_hz
- */
- if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
- ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
- u64 count = (u64)cpu_khz * 1000;
- do_div(count, 0x7fffffffUL);
- nmi_hz = count + 1;
+
+ if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
+ wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
+ nmi_hz = adjust_for_32bit_ctr(nmi_hz);
}
}
@@ -369,6 +385,34 @@ void enable_timer_nmi_watchdog(void)
}
}
+static void __acpi_nmi_disable(void *__unused)
+{
+ apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+ on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_enable(void *__unused)
+{
+ apic_write_around(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+ on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
+
#ifdef CONFIG_PM
static int nmi_pm_active; /* nmi_active before suspend */
@@ -442,6 +486,17 @@ static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
wrmsrl(perfctr_msr, 0 - count);
}
+static void write_watchdog_counter32(unsigned int perfctr_msr,
+ const char *descr)
+{
+ u64 count = (u64)cpu_khz * 1000;
+
+ do_div(count, nmi_hz);
+	if (descr)
+ Dprintk("setting %s to -0x%08Lx\n", descr, count);
+ wrmsr(perfctr_msr, (u32)(-count), 0);
+}
+
/* Note that these events don't tick when the CPU idles. This means
the frequency varies with CPU load. */
@@ -531,7 +586,8 @@ static int setup_p6_watchdog(void)
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
- write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
+ nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+ write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= P6_EVNTSEL0_ENABLE;
wrmsr(evntsel_msr, evntsel, 0);
@@ -704,7 +760,8 @@ static int setup_intel_arch_watchdog(void)
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
- write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
+ nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+ write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
wrmsr(evntsel_msr, evntsel, 0);
@@ -762,7 +819,8 @@ void setup_apic_nmi_watchdog (void *unused)
if (nmi_watchdog == NMI_LOCAL_APIC) {
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
+ if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
+ boot_cpu_data.x86 != 16)
return;
if (!setup_k7_watchdog())
return;
@@ -916,9 +974,13 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
cpu_clear(cpu, backtrace_mask);
}
- sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
+ /*
+ * Take the local apic timer and PIT/HPET into account. We don't
+	 * know which one is active when highres/dyntick is on.
+ */
+ sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0);
- /* if the apic timer isn't firing, this cpu isn't doing much */
+	/* if none of the timers is firing, this cpu isn't doing much */
if (!touched && last_irq_sums[cpu] == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
@@ -956,6 +1018,8 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
dummy &= ~P4_CCCR_OVF;
wrmsrl(wd->cccr_msr, dummy);
apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /* start the cycle over again */
+ write_watchdog_counter(wd->perfctr_msr, NULL);
}
else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
@@ -964,9 +1028,12 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
* other P6 variant.
* ArchPerfom/Core Duo also needs this */
apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /* P6/ARCH_PERFMON has 32 bit counter write */
+ write_watchdog_counter32(wd->perfctr_msr, NULL);
+ } else {
+ /* start the cycle over again */
+ write_watchdog_counter(wd->perfctr_msr, NULL);
}
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL);
rc = 1;
} else if (nmi_watchdog == NMI_IO_APIC) {
/* don't know how to accurately check for this.
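To make the 31-bit constraint concrete, work adjust_for_32bit_ctr() through by hand. The watchdog programs the performance counter to -(cpu_khz * 1000 / nmi_hz) and waits for the overflow NMI, so the per-tick count must fit in 31 bits:

	/*
	 * Worked example (assumed clock speeds; 0x7fffffff == 2147483647):
	 *   2 GHz part: counter_val = 2000000000  -> fits, nmi_hz stays 1
	 *   3 GHz part: counter_val = 3000000000  -> too big, so
	 *               nmi_hz = 3000000000 / 2147483647 + 1 = 2
	 *               and the new per-tick count 1500000000 fits.
	 */

The same helper now serves both the setup paths and check_nmi_watchdog(), which is why the open-coded copy of this logic could be deleted from the latter.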
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index e55fd05da0f..c156ecfa387 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -92,7 +92,7 @@ static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
return insn_len;
}
-static fastcall unsigned long native_get_debugreg(int regno)
+static unsigned long native_get_debugreg(int regno)
{
unsigned long val = 0; /* Damn you, gcc! */
@@ -115,7 +115,7 @@ static fastcall unsigned long native_get_debugreg(int regno)
return val;
}
-static fastcall void native_set_debugreg(int regno, unsigned long value)
+static void native_set_debugreg(int regno, unsigned long value)
{
switch (regno) {
case 0:
@@ -146,55 +146,55 @@ void init_IRQ(void)
paravirt_ops.init_IRQ();
}
-static fastcall void native_clts(void)
+static void native_clts(void)
{
asm volatile ("clts");
}
-static fastcall unsigned long native_read_cr0(void)
+static unsigned long native_read_cr0(void)
{
unsigned long val;
asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
return val;
}
-static fastcall void native_write_cr0(unsigned long val)
+static void native_write_cr0(unsigned long val)
{
asm volatile("movl %0,%%cr0": :"r" (val));
}
-static fastcall unsigned long native_read_cr2(void)
+static unsigned long native_read_cr2(void)
{
unsigned long val;
asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
return val;
}
-static fastcall void native_write_cr2(unsigned long val)
+static void native_write_cr2(unsigned long val)
{
asm volatile("movl %0,%%cr2": :"r" (val));
}
-static fastcall unsigned long native_read_cr3(void)
+static unsigned long native_read_cr3(void)
{
unsigned long val;
asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
return val;
}
-static fastcall void native_write_cr3(unsigned long val)
+static void native_write_cr3(unsigned long val)
{
asm volatile("movl %0,%%cr3": :"r" (val));
}
-static fastcall unsigned long native_read_cr4(void)
+static unsigned long native_read_cr4(void)
{
unsigned long val;
asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
return val;
}
-static fastcall unsigned long native_read_cr4_safe(void)
+static unsigned long native_read_cr4_safe(void)
{
unsigned long val;
/* This could fault if %cr4 does not exist */
@@ -207,51 +207,51 @@ static fastcall unsigned long native_read_cr4_safe(void)
return val;
}
-static fastcall void native_write_cr4(unsigned long val)
+static void native_write_cr4(unsigned long val)
{
asm volatile("movl %0,%%cr4": :"r" (val));
}
-static fastcall unsigned long native_save_fl(void)
+static unsigned long native_save_fl(void)
{
unsigned long f;
asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
return f;
}
-static fastcall void native_restore_fl(unsigned long f)
+static void native_restore_fl(unsigned long f)
{
asm volatile("pushl %0 ; popfl": /* no output */
:"g" (f)
:"memory", "cc");
}
-static fastcall void native_irq_disable(void)
+static void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
}
-static fastcall void native_irq_enable(void)
+static void native_irq_enable(void)
{
asm volatile("sti": : :"memory");
}
-static fastcall void native_safe_halt(void)
+static void native_safe_halt(void)
{
asm volatile("sti; hlt": : :"memory");
}
-static fastcall void native_halt(void)
+static void native_halt(void)
{
asm volatile("hlt": : :"memory");
}
-static fastcall void native_wbinvd(void)
+static void native_wbinvd(void)
{
asm volatile("wbinvd": : :"memory");
}
-static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
+static unsigned long long native_read_msr(unsigned int msr, int *err)
{
unsigned long long val;
@@ -270,7 +270,7 @@ static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
return val;
}
-static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
+static int native_write_msr(unsigned int msr, unsigned long long val)
{
int err;
asm volatile("2: wrmsr ; xorl %0,%0\n"
@@ -288,53 +288,53 @@ static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
return err;
}
-static fastcall unsigned long long native_read_tsc(void)
+static unsigned long long native_read_tsc(void)
{
unsigned long long val;
asm volatile("rdtsc" : "=A" (val));
return val;
}
-static fastcall unsigned long long native_read_pmc(void)
+static unsigned long long native_read_pmc(void)
{
unsigned long long val;
asm volatile("rdpmc" : "=A" (val));
return val;
}
-static fastcall void native_load_tr_desc(void)
+static void native_load_tr_desc(void)
{
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
}
-static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
+static void native_load_gdt(const struct Xgt_desc_struct *dtr)
{
asm volatile("lgdt %0"::"m" (*dtr));
}
-static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
+static void native_load_idt(const struct Xgt_desc_struct *dtr)
{
asm volatile("lidt %0"::"m" (*dtr));
}
-static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
+static void native_store_gdt(struct Xgt_desc_struct *dtr)
{
asm ("sgdt %0":"=m" (*dtr));
}
-static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
+static void native_store_idt(struct Xgt_desc_struct *dtr)
{
asm ("sidt %0":"=m" (*dtr));
}
-static fastcall unsigned long native_store_tr(void)
+static unsigned long native_store_tr(void)
{
unsigned long tr;
asm ("str %0":"=r" (tr));
return tr;
}
-static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
+static void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
C(0); C(1); C(2);
@@ -348,22 +348,22 @@ static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32
lp[1] = entry_high;
}
-static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
{
native_write_dt_entry(dt, entrynum, low, high);
}
-static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
+static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
{
native_write_dt_entry(dt, entrynum, low, high);
}
-static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
+static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
{
native_write_dt_entry(dt, entrynum, low, high);
}
-static fastcall void native_load_esp0(struct tss_struct *tss,
+static void native_load_esp0(struct tss_struct *tss,
struct thread_struct *thread)
{
tss->esp0 = thread->esp0;
@@ -375,12 +375,12 @@ static fastcall void native_load_esp0(struct tss_struct *tss,
}
}
-static fastcall void native_io_delay(void)
+static void native_io_delay(void)
{
asm volatile("outb %al,$0x80");
}
-static fastcall void native_flush_tlb(void)
+static void native_flush_tlb(void)
{
__native_flush_tlb();
}
@@ -389,49 +389,49 @@ static fastcall void native_flush_tlb(void)
* Global pages have to be flushed a bit differently. Not a real
* performance problem because this does not happen often.
*/
-static fastcall void native_flush_tlb_global(void)
+static void native_flush_tlb_global(void)
{
__native_flush_tlb_global();
}
-static fastcall void native_flush_tlb_single(u32 addr)
+static void native_flush_tlb_single(u32 addr)
{
__native_flush_tlb_single(addr);
}
#ifndef CONFIG_X86_PAE
-static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
+static void native_set_pte(pte_t *ptep, pte_t pteval)
{
*ptep = pteval;
}
-static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
{
*ptep = pteval;
}
-static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
*pmdp = pmdval;
}
#else /* CONFIG_X86_PAE */
-static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
+static void native_set_pte(pte_t *ptep, pte_t pte)
{
ptep->pte_high = pte.pte_high;
smp_wmb();
ptep->pte_low = pte.pte_low;
}
-static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
{
ptep->pte_high = pte.pte_high;
smp_wmb();
ptep->pte_low = pte.pte_low;
}
-static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
ptep->pte_low = 0;
smp_wmb();
@@ -440,29 +440,29 @@ static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long
ptep->pte_low = pte.pte_low;
}
-static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
+static void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
{
set_64bit((unsigned long long *)ptep,pte_val(pteval));
}
-static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
}
-static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
+static void native_set_pud(pud_t *pudp, pud_t pudval)
{
*pudp = pudval;
}
-static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
ptep->pte_low = 0;
smp_wmb();
ptep->pte_high = 0;
}
-static fastcall void native_pmd_clear(pmd_t *pmd)
+static void native_pmd_clear(pmd_t *pmd)
{
u32 *tmp = (u32 *)pmd;
*tmp = 0;
@@ -472,8 +472,8 @@ static fastcall void native_pmd_clear(pmd_t *pmd)
#endif /* CONFIG_X86_PAE */
/* These are in entry.S */
-extern fastcall void native_iret(void);
-extern fastcall void native_irq_enable_sysexit(void);
+extern void native_iret(void);
+extern void native_irq_enable_sysexit(void);
static int __init print_banner(void)
{
@@ -482,9 +482,6 @@ static int __init print_banner(void)
}
core_initcall(print_banner);
-/* We simply declare start_kernel to be the paravirt probe of last resort. */
-paravirt_probe(start_kernel);
-
struct paravirt_ops paravirt_ops = {
.name = "bare hardware",
.paravirt_enabled = 0,
@@ -544,12 +541,21 @@ struct paravirt_ops paravirt_ops = {
.apic_write = native_apic_write,
.apic_write_atomic = native_apic_write_atomic,
.apic_read = native_apic_read,
+ .setup_boot_clock = setup_boot_APIC_clock,
+ .setup_secondary_clock = setup_secondary_APIC_clock,
#endif
+ .set_lazy_mode = (void *)native_nop,
.flush_tlb_user = native_flush_tlb,
.flush_tlb_kernel = native_flush_tlb_global,
.flush_tlb_single = native_flush_tlb_single,
+ .alloc_pt = (void *)native_nop,
+ .alloc_pd = (void *)native_nop,
+ .alloc_pd_clone = (void *)native_nop,
+ .release_pt = (void *)native_nop,
+ .release_pd = (void *)native_nop,
+
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
.set_pmd = native_set_pmd,
@@ -565,6 +571,8 @@ struct paravirt_ops paravirt_ops = {
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
+
+ .startup_ipi_hook = (void *)native_nop,
};
/*
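Dropping fastcall from every native_* helper keeps the entire ops table on a single calling convention, which is essential because callers reach these functions through struct paravirt_ops rather than by name. A hedged sketch of the dispatch pattern (field names as in the struct above; the binary-patching machinery that later replaces hot indirect calls with inline native instructions is omitted):

	/*
	 * How a paravirt_ops kernel reads %cr3: an indirect call through
	 * the ops table. The "bare hardware" instance above supplies the
	 * native_* helpers; a hypervisor backend such as vmi.c overwrites
	 * selected slots at boot, before this is ever called.
	 */
	static inline unsigned long read_cr3(void)
	{
		return paravirt_ops.read_cr3();
	}

The (void *)native_nop casts fill the new hooks (lazy mode, pagetable allocation, the startup IPI) with a do-nothing default, so bare hardware pays only an indirect call to an empty function.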
diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c
new file mode 100644
index 00000000000..bc1f2d3ea27
--- /dev/null
+++ b/arch/i386/kernel/pcspeaker.c
@@ -0,0 +1,20 @@
+#include <linux/platform_device.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+static __init int add_pcspkr(void)
+{
+ struct platform_device *pd;
+ int ret;
+
+ pd = platform_device_alloc("pcspkr", -1);
+ if (!pd)
+ return -ENOMEM;
+
+ ret = platform_device_add(pd);
+ if (ret)
+ platform_device_put(pd);
+
+ return ret;
+}
+device_initcall(add_pcspkr);
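This new file is the minimal platform-device registration idiom, moved out of setup.c (the identical code is deleted there below): allocate a device by name, add it, and drop the reference only on failure, since after a successful platform_device_add() the driver core owns the object. The consumer is a platform_driver whose name string the platform bus matches against "pcspkr". A hedged skeleton of that driver side (the real driver lives in the input subsystem; everything here is illustrative):

	static int pcspkr_probe(struct platform_device *dev)
	{
		/* claim the 0x61 speaker port, register an input device, ... */
		return 0;
	}

	static struct platform_driver pcspkr_platform_driver = {
		.driver	= {
			.name	= "pcspkr",	/* must match the device name */
			.owner	= THIS_MODULE,
		},
		.probe	= pcspkr_probe,
	};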
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index c641056233a..393a67d5d94 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -38,6 +38,7 @@
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/personality.h>
+#include <linux/tick.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -173,6 +174,7 @@ void cpu_idle(void)
/* endless idle loop with no priority at all */
while (1) {
+ tick_nohz_stop_sched_tick();
while (!need_resched()) {
void (*idle)(void);
@@ -191,6 +193,7 @@ void cpu_idle(void)
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
idle();
}
+ tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
schedule();
preempt_disable();
@@ -308,8 +311,8 @@ void show_regs(struct pt_regs * regs)
regs->eax,regs->ebx,regs->ecx,regs->edx);
printk("ESI: %08lx EDI: %08lx EBP: %08lx",
regs->esi, regs->edi, regs->ebp);
- printk(" DS: %04x ES: %04x GS: %04x\n",
- 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
+ printk(" DS: %04x ES: %04x FS: %04x\n",
+ 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
cr0 = read_cr0();
cr2 = read_cr2();
@@ -340,7 +343,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
regs.xds = __USER_DS;
regs.xes = __USER_DS;
- regs.xgs = __KERNEL_PDA;
+ regs.xfs = __KERNEL_PDA;
regs.orig_eax = -1;
regs.eip = (unsigned long) kernel_thread_helper;
regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -425,7 +428,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
p->thread.eip = (unsigned long) ret_from_fork;
- savesegment(fs,p->thread.fs);
+ savesegment(gs,p->thread.gs);
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -501,8 +504,8 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
dump->regs.eax = regs->eax;
dump->regs.ds = regs->xds;
dump->regs.es = regs->xes;
- savesegment(fs,dump->regs.fs);
- dump->regs.gs = regs->xgs;
+ dump->regs.fs = regs->xfs;
+ savesegment(gs,dump->regs.gs);
dump->regs.orig_eax = regs->orig_eax;
dump->regs.eip = regs->eip;
dump->regs.cs = regs->xcs;
@@ -653,7 +656,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
load_esp0(tss, next);
/*
- * Save away %fs. No need to save %gs, as it was saved on the
+ * Save away %gs. No need to save %fs, as it was saved on the
* stack on entry. No need to save %es and %ds, as those are
* always kernel segments while inside the kernel. Doing this
* before setting the new TLS descriptors avoids the situation
@@ -662,7 +665,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
- savesegment(fs, prev->fs);
+ savesegment(gs, prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
@@ -670,14 +673,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
load_TLS(next, cpu);
/*
- * Restore %fs if needed.
- *
- * Glibc normally makes %fs be zero.
+ * Restore IOPL if needed. In normal use, the flags restore
+ * in the switch assembly will handle this. But if the kernel
+ * is running virtualized at a non-zero CPL, the popf will
+ * not restore flags, so it must be done in a separate step.
*/
- if (unlikely(prev->fs | next->fs))
- loadsegment(fs, next->fs);
-
- write_pda(pcurrent, next_p);
+ if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
+ set_iopl_mask(next->iopl);
/*
* Now maybe handle debug registers and/or IO bitmaps
@@ -688,6 +690,15 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
disable_tsc(prev_p, next_p);
+ /*
+ * Leave lazy mode, flushing any hypercalls made here.
+ * This must be done before restoring TLS segments so
+ * the GDT and LDT are properly updated, and must be
+ * done before math_state_restore, so the TS bit is up
+ * to date.
+ */
+ arch_leave_lazy_cpu_mode();
+
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
* chances of needing FPU soon are obviously high now
@@ -695,6 +706,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
if (next_p->fpu_counter > 5)
math_state_restore();
+ /*
+ * Restore %gs if needed (which is common)
+ */
+ if (prev->gs | next->gs)
+ loadsegment(gs, next->gs);
+
+ write_pda(pcurrent, next_p);
+
return prev_p;
}
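The context-switch ordering here is deliberate: with the PDA now living in %fs (saved and restored by the entry code), only the user's %gs needs hand-switching, and it is done last, after arch_leave_lazy_cpu_mode() has flushed any batched hypercalls that update the GDT. The `prev->gs | next->gs` test skips the comparatively slow segment reload in the common case where both values are zero. A hedged sketch of the two helpers involved (the real loadsegment() additionally carries an exception-table fixup so a bad user selector cannot kill the kernel):

	#define savesegment(seg, value) \
		asm("mov %%" #seg ",%0" : "=rm" (value))

	#define loadsegment(seg, value) \
		asm volatile("mov %0,%%" #seg : : "rm" (value))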
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index af8aabe8580..4a8f8a25972 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -89,14 +89,14 @@ static int putreg(struct task_struct *child,
unsigned long regno, unsigned long value)
{
switch (regno >> 2) {
- case FS:
+ case GS:
if (value && (value & 3) != 3)
return -EIO;
- child->thread.fs = value;
+ child->thread.gs = value;
return 0;
case DS:
case ES:
- case GS:
+ case FS:
if (value && (value & 3) != 3)
return -EIO;
value &= 0xffff;
@@ -112,7 +112,7 @@ static int putreg(struct task_struct *child,
value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
break;
}
- if (regno > ES*4)
+ if (regno > FS*4)
regno -= 1*4;
put_stack_long(child, regno, value);
return 0;
@@ -124,18 +124,18 @@ static unsigned long getreg(struct task_struct *child,
unsigned long retval = ~0UL;
switch (regno >> 2) {
- case FS:
- retval = child->thread.fs;
+ case GS:
+ retval = child->thread.gs;
break;
case DS:
case ES:
- case GS:
+ case FS:
case SS:
case CS:
retval = 0xffff;
/* fall through */
default:
- if (regno > ES*4)
+ if (regno > FS*4)
regno -= 1*4;
retval &= get_stack_long(child, regno);
}
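putreg()/getreg() translate offsets in the user-visible struct user_regs_struct into slots of the in-kernel pt_regs frame. Since pt_regs now stores %fs but not %gs (user %gs lives in thread.gs instead), every user-visible register above FS sits one word lower in the frame, which is why the adjustment threshold moves from ES to FS:

	/*
	 * Worked example, assuming the classic i386 register index order
	 * ..., DS=7, ES=8, FS=9, GS=10, ORIG_EAX=11, EIP=12, ...:
	 * a PTRACE_PEEKUSR of EIP arrives as regno == EIP*4 == 48; since
	 * 48 > FS*4 == 36 it is shifted down one slot to 44, compensating
	 * for GS having no slot in the in-kernel frame.
	 */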
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 4b31ad70c1a..122623dcc6e 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -33,7 +33,6 @@
#include <linux/initrd.h>
#include <linux/bootmem.h>
#include <linux/seq_file.h>
-#include <linux/platform_device.h>
#include <linux/console.h>
#include <linux/mca.h>
#include <linux/root_dev.h>
@@ -60,6 +59,7 @@
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/io.h>
+#include <asm/vmi.h>
#include <setup_arch.h>
#include <bios_ebda.h>
@@ -132,7 +132,7 @@ unsigned long saved_videomode;
#define RAMDISK_PROMPT_FLAG 0x8000
#define RAMDISK_LOAD_FLAG 0x4000
-static char command_line[COMMAND_LINE_SIZE];
+static char __initdata command_line[COMMAND_LINE_SIZE];
unsigned char __initdata boot_params[PARAM_SIZE];
@@ -576,11 +576,19 @@ void __init setup_arch(char **cmdline_p)
print_memory_map("user");
}
- strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
max_low_pfn = setup_memory();
+#ifdef CONFIG_VMI
+ /*
+ * Must be after max_low_pfn is determined, and before kernel
+ * pagetables are setup.
+ */
+ vmi_init();
+#endif
+
/*
* NOTE: before this point _nobody_ is allowed to allocate
* any memory using the bootmem allocator. Although the
@@ -651,28 +659,3 @@ void __init setup_arch(char **cmdline_p)
#endif
tsc_init();
}
-
-static __init int add_pcspkr(void)
-{
- struct platform_device *pd;
- int ret;
-
- pd = platform_device_alloc("pcspkr", -1);
- if (!pd)
- return -ENOMEM;
-
- ret = platform_device_add(pd);
- if (ret)
- platform_device_put(pd);
-
- return ret;
-}
-device_initcall(add_pcspkr);
-
-/*
- * Local Variables:
- * mode:c
- * c-file-style:"k&r"
- * c-basic-offset:8
- * End:
- */
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 65d7620eaa0..4f99e870c98 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -21,6 +21,7 @@
#include <linux/suspend.h>
#include <linux/ptrace.h>
#include <linux/elf.h>
+#include <linux/binfmts.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
#include <asm/uaccess.h>
@@ -128,8 +129,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
- COPY_SEG(gs);
- GET_SEG(fs);
+ GET_SEG(gs);
+ COPY_SEG(fs);
COPY_SEG(es);
COPY_SEG(ds);
COPY(edi);
@@ -244,9 +245,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
{
int tmp, err = 0;
- err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
- savesegment(fs, tmp);
- err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
+ err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
+ savesegment(gs, tmp);
+ err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
@@ -349,7 +350,10 @@ static int setup_frame(int sig, struct k_sigaction *ka,
goto give_sigsegv;
}
- restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
+ if (current->binfmt->hasvdso)
+ restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
+ else
+ restorer = (void *)&frame->retcode;
if (ka->sa.sa_flags & SA_RESTORER)
restorer = ka->sa.sa_restorer;
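The new hasvdso test covers binary formats that do not map the vDSO: for them the signal trampoline falls back to frame->retcode, a few instruction bytes that setup_frame() itself writes onto the user stack further down in this function. Historically those bytes encode, in effect:

	/*
	 * Classic i386 stack-based sigreturn trampoline: a sketch of the
	 * bytes poked into frame->retcode, shown as the assembly they
	 * decode to.
	 *	popl %eax
	 *	movl $__NR_sigreturn, %eax
	 *	int  $0x80
	 */

SA_RESTORER still overrides both paths, as before.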
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 5285aff8367..0e8977871b1 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -374,8 +374,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
/*
* i'm not happy about this global shared spinlock in the
* MM hot path, but we'll see how contended it is.
- * Temporarily this turns IRQs off, so that lockups are
- * detected by the NMI watchdog.
+ * AK: x86-64 has a faster method that could be ported.
*/
spin_lock(&tlbstate_lock);
@@ -400,7 +399,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
while (!cpus_empty(flush_cpumask))
/* nothing. lockup detection does not belong here */
- mb();
+ cpu_relax();
flush_mm = NULL;
flush_va = 0;
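Swapping mb() for cpu_relax() in the flush wait loop is the idiomatic spin-wait: a full memory barrier on every iteration is both unnecessary (the loop only re-reads flush_cpumask) and hostile to the memory pipeline, whereas cpu_relax() is the PAUSE hint, which throttles the spin and frees execution resources for a hyperthreaded sibling. On i386 it reduces to:

	#define cpu_relax()	rep_nop()

	static inline void rep_nop(void)
	{
		/* "rep; nop" decodes as PAUSE on P4 and later and as a
		 * plain NOP on older parts, so it is safe everywhere. */
		__asm__ __volatile__("rep; nop" : : : "memory");
	}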
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 8c6c8c52b95..48bfcaa13ec 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -63,6 +63,7 @@
#include <mach_apic.h>
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>
+#include <asm/vmi.h>
/* Set if we find a B stepping CPU */
static int __devinitdata smp_b_stepping;
@@ -93,12 +94,6 @@ cpumask_t cpu_possible_map;
EXPORT_SYMBOL(cpu_possible_map);
static cpumask_t smp_commenced_mask;
-/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
- * is no way to resync one AP against BP. TBD: for prescott and above, we
- * should use IA64's algorithm
- */
-static int __devinitdata tsc_sync_disabled;
-
/* Per CPU bogomips and other parameters */
struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
EXPORT_SYMBOL(cpu_data);
@@ -215,151 +210,6 @@ valid_k7:
;
}
-/*
- * TSC synchronization.
- *
- * We first check whether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
- */
-
-static struct {
- atomic_t start_flag;
- atomic_t count_start;
- atomic_t count_stop;
- unsigned long long values[NR_CPUS];
-} tsc __cpuinitdata = {
- .start_flag = ATOMIC_INIT(0),
- .count_start = ATOMIC_INIT(0),
- .count_stop = ATOMIC_INIT(0),
-};
-
-#define NR_LOOPS 5
-
-static void __init synchronize_tsc_bp(void)
-{
- int i;
- unsigned long long t0;
- unsigned long long sum, avg;
- long long delta;
- unsigned int one_usec;
- int buggy = 0;
-
- printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
-
- /* convert from kcyc/sec to cyc/usec */
- one_usec = cpu_khz / 1000;
-
- atomic_set(&tsc.start_flag, 1);
- wmb();
-
- /*
- * We loop a few times to get a primed instruction cache,
- * then the last pass is more or less synchronized and
- * the BP and APs set their cycle counters to zero all at
- * once. This reduces the chance of having random offsets
- * between the processors, and guarantees that the maximum
- * delay between the cycle counters is never bigger than
- * the latency of information-passing (cachelines) between
- * two CPUs.
- */
- for (i = 0; i < NR_LOOPS; i++) {
- /*
- * all APs synchronize but they loop on '== num_cpus'
- */
- while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
- cpu_relax();
- atomic_set(&tsc.count_stop, 0);
- wmb();
- /*
- * this lets the APs save their current TSC:
- */
- atomic_inc(&tsc.count_start);
-
- rdtscll(tsc.values[smp_processor_id()]);
- /*
- * We clear the TSC in the last loop:
- */
- if (i == NR_LOOPS-1)
- write_tsc(0, 0);
-
- /*
- * Wait for all APs to leave the synchronization point:
- */
- while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
- cpu_relax();
- atomic_set(&tsc.count_start, 0);
- wmb();
- atomic_inc(&tsc.count_stop);
- }
-
- sum = 0;
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_isset(i, cpu_callout_map)) {
- t0 = tsc.values[i];
- sum += t0;
- }
- }
- avg = sum;
- do_div(avg, num_booting_cpus());
-
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_isset(i, cpu_callout_map))
- continue;
- delta = tsc.values[i] - avg;
- if (delta < 0)
- delta = -delta;
- /*
- * We report bigger than 2 microseconds clock differences.
- */
- if (delta > 2*one_usec) {
- long long realdelta;
-
- if (!buggy) {
- buggy = 1;
- printk("\n");
- }
- realdelta = delta;
- do_div(realdelta, one_usec);
- if (tsc.values[i] < avg)
- realdelta = -realdelta;
-
- if (realdelta)
- printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
- "skew, fixed it up.\n", i, realdelta);
- }
- }
- if (!buggy)
- printk("passed.\n");
-}
-
-static void __cpuinit synchronize_tsc_ap(void)
-{
- int i;
-
- /*
- * Not every cpu is online at the time
- * this gets called, so we first wait for the BP to
- * finish SMP initialization:
- */
- while (!atomic_read(&tsc.start_flag))
- cpu_relax();
-
- for (i = 0; i < NR_LOOPS; i++) {
- atomic_inc(&tsc.count_start);
- while (atomic_read(&tsc.count_start) != num_booting_cpus())
- cpu_relax();
-
- rdtscll(tsc.values[smp_processor_id()]);
- if (i == NR_LOOPS-1)
- write_tsc(0, 0);
-
- atomic_inc(&tsc.count_stop);
- while (atomic_read(&tsc.count_stop) != num_booting_cpus())
- cpu_relax();
- }
-}
-#undef NR_LOOPS
-
extern void calibrate_delay(void);
static atomic_t init_deasserted;
@@ -437,20 +287,12 @@ static void __cpuinit smp_callin(void)
/*
* Save our processor parameters
*/
- smp_store_cpu_info(cpuid);
-
- disable_APIC_timer();
+ smp_store_cpu_info(cpuid);
/*
* Allow the master to continue.
*/
cpu_set(cpuid, cpu_callin_map);
-
- /*
- * Synchronize the TSC with the BP
- */
- if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
- synchronize_tsc_ap();
}
static int cpucount;
@@ -545,18 +387,25 @@ static void __cpuinit start_secondary(void *unused)
* booting is too fragile that we want to limit the
* things done here to the most necessary things.
*/
+#ifdef CONFIG_VMI
+ vmi_bringup();
+#endif
secondary_cpu_init();
preempt_disable();
smp_callin();
while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
rep_nop();
- setup_secondary_APIC_clock();
+ /*
+ * Check TSC synchronization with the BP:
+ */
+ check_tsc_sync_target();
+
+ setup_secondary_clock();
if (nmi_watchdog == NMI_IO_APIC) {
disable_8259A_irq(0);
enable_NMI_through_LVT0(NULL);
enable_8259A_irq(0);
}
- enable_APIC_timer();
/*
* low-memory mappings have been cleared, flush them from
* the local TLBs too.
@@ -619,7 +468,6 @@ extern struct {
unsigned short ss;
} stack_start;
extern struct i386_pda *start_pda;
-extern struct Xgt_desc_struct cpu_gdt_descr;
#ifdef CONFIG_NUMA
@@ -749,7 +597,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
/*
* Due to the Pentium erratum 3AP.
*/
- maxlvt = get_maxlvt();
+ maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) {
apic_read_around(APIC_SPIV);
apic_write(APIC_ESR, 0);
@@ -835,11 +683,18 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
num_starts = 0;
/*
+ * Paravirt / VMI wants a startup IPI hook here to set up the
+ * target processor state.
+ */
+ startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
+ (unsigned long) stack_start.esp);
+
+ /*
* Run STARTUP IPI loop.
*/
Dprintk("#startup loops: %d.\n", num_starts);
- maxlvt = get_maxlvt();
+ maxlvt = lapic_get_maxlvt();
for (j = 1; j <= num_starts; j++) {
Dprintk("Sending STARTUP #%d.\n",j);
@@ -1115,8 +970,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
info.cpu = cpu;
INIT_WORK(&info.task, do_warm_boot_cpu);
- tsc_sync_disabled = 1;
-
/* init low mem mapping */
clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
@@ -1124,7 +977,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
schedule_work(&info.task);
wait_for_completion(&done);
- tsc_sync_disabled = 0;
zap_low_mappings();
ret = 0;
exit:
@@ -1320,13 +1172,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
smpboot_setup_io_apic();
- setup_boot_APIC_clock();
-
- /*
- * Synchronize the TSC with the AP
- */
- if (cpu_has_tsc && cpucount && cpu_khz)
- synchronize_tsc_bp();
+ setup_boot_clock();
}
/* These are wrappers to interface to the new boot process. Someone
@@ -1461,9 +1307,16 @@ int __cpuinit __cpu_up(unsigned int cpu)
}
local_irq_enable();
+
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* Unleash the CPU! */
cpu_set(cpu, smp_commenced_mask);
+
+ /*
+ * Check TSC synchronization with the AP:
+ */
+ check_tsc_sync_source(cpu);
+
while (!cpu_isset(cpu, cpu_online_map))
cpu_relax();
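The old scheme wrote the TSC to zero on all CPUs and hoped for the best; the replacement in tsc_sync.c (added to the Makefile at the top of this patch) only detects drift and flags the TSC as unusable for timekeeping. It is a two-CPU rendezvous: __cpu_up() runs check_tsc_sync_source(cpu) on the boot CPU while the new AP runs check_tsc_sync_target(), and both sides repeatedly timestamp under a shared lock, looking for time moving backwards. A heavily simplified, hedged sketch of the warp test (the real code adds the rendezvous counters, locking, and a bounded loop):

	static u64 last_tsc;

	static void check_tsc_warp_once(void)
	{
		u64 now = get_cycles();		/* rdtsc */

		/*
		 * If the previously recorded timestamp, taken on the other
		 * CPU, is ahead of ours, time went backwards across CPUs:
		 * the TSCs are not synchronized.
		 */
		if (now < last_tsc)
			mark_tsc_unstable();
		last_tsc = now;
	}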
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index bc882a2b1db..13ca54a85a1 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -78,7 +78,7 @@ int __init sysenter_setup(void)
syscall_pages[0] = virt_to_page(syscall_page);
#ifdef CONFIG_COMPAT_VDSO
- __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY);
+ __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC);
printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
#endif
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index c505b16c099..a5350059557 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -131,15 +131,13 @@ unsigned long profile_pc(struct pt_regs *regs)
unsigned long pc = instruction_pointer(regs);
#ifdef CONFIG_SMP
- if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+ if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
+ in_lock_functions(pc)) {
#ifdef CONFIG_FRAME_POINTER
return *(unsigned long *)(regs->ebp + 4);
#else
- unsigned long *sp;
- if ((regs->xcs & 3) == 0)
- sp = (unsigned long *)&regs->esp;
- else
- sp = (unsigned long *)regs->esp;
+ unsigned long *sp = (unsigned long *)&regs->esp;
+
/* Return address is either directly at stack pointer
or above a saved eflags. Eflags has bits 22-31 zero,
kernel addresses don't. */
@@ -161,15 +159,6 @@ EXPORT_SYMBOL(profile_pc);
*/
irqreturn_t timer_interrupt(int irq, void *dev_id)
{
- /*
- * Here we are in the timer irq handler. We just have irqs locally
- * disabled but we don't know if the timer_bh is running on the other
- * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
- * the irq version of write_lock because as just said we have irq
- * locally disabled. -arca
- */
- write_seqlock(&xtime_lock);
-
#ifdef CONFIG_X86_IO_APIC
if (timer_ack) {
/*
@@ -188,7 +177,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
do_timer_interrupt_hook();
-
if (MCA_bus) {
/* The PS/2 uses level-triggered interrupts. You can't
turn them off, nor would you want to (any attempt to
@@ -203,18 +191,11 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
}
- write_sequnlock(&xtime_lock);
-
-#ifdef CONFIG_X86_LOCAL_APIC
- if (using_apic_timer)
- smp_send_timer_broadcast_ipi();
-#endif
-
return IRQ_HANDLED;
}
/* not static: needed by APM */
-unsigned long get_cmos_time(void)
+unsigned long read_persistent_clock(void)
{
unsigned long retval;
unsigned long flags;
@@ -227,11 +208,11 @@ unsigned long get_cmos_time(void)
return retval;
}
-EXPORT_SYMBOL(get_cmos_time);
static void sync_cmos_clock(unsigned long dummy);
static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
+int no_sync_cmos_clock;
static void sync_cmos_clock(unsigned long dummy)
{
@@ -275,117 +256,20 @@ static void sync_cmos_clock(unsigned long dummy)
void notify_arch_cmos_timer(void)
{
- mod_timer(&sync_cmos_timer, jiffies + 1);
-}
-
-static long clock_cmos_diff;
-static unsigned long sleep_start;
-
-static int timer_suspend(struct sys_device *dev, pm_message_t state)
-{
- /*
- * Estimate time zone so that set_time can update the clock
- */
- unsigned long ctime = get_cmos_time();
-
- clock_cmos_diff = -ctime;
- clock_cmos_diff += get_seconds();
- sleep_start = ctime;
- return 0;
-}
-
-static int timer_resume(struct sys_device *dev)
-{
- unsigned long flags;
- unsigned long sec;
- unsigned long ctime = get_cmos_time();
- long sleep_length = (ctime - sleep_start) * HZ;
- struct timespec ts;
-
- if (sleep_length < 0) {
- printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
- /* The time after the resume must not be earlier than the time
- * before the suspend or some nasty things will happen
- */
- sleep_length = 0;
- ctime = sleep_start;
- }
-#ifdef CONFIG_HPET_TIMER
- if (is_hpet_enabled())
- hpet_reenable();
-#endif
- setup_pit_timer();
-
- sec = ctime + clock_cmos_diff;
- ts.tv_sec = sec;
- ts.tv_nsec = 0;
- do_settimeofday(&ts);
- write_seqlock_irqsave(&xtime_lock, flags);
- jiffies_64 += sleep_length;
- write_sequnlock_irqrestore(&xtime_lock, flags);
- touch_softlockup_watchdog();
- return 0;
-}
-
-static struct sysdev_class timer_sysclass = {
- .resume = timer_resume,
- .suspend = timer_suspend,
- set_kset_name("timer"),
-};
-
-
-/* XXX this driverfs stuff should probably go elsewhere later -john */
-static struct sys_device device_timer = {
- .id = 0,
- .cls = &timer_sysclass,
-};
-
-static int time_init_device(void)
-{
- int error = sysdev_class_register(&timer_sysclass);
- if (!error)
- error = sysdev_register(&device_timer);
- return error;
+ if (!no_sync_cmos_clock)
+ mod_timer(&sync_cmos_timer, jiffies + 1);
}
-device_initcall(time_init_device);
-
-#ifdef CONFIG_HPET_TIMER
extern void (*late_time_init)(void);
/* Duplicate of time_init() below, with hpet_enable part added */
static void __init hpet_time_init(void)
{
- struct timespec ts;
- ts.tv_sec = get_cmos_time();
- ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-
- do_settimeofday(&ts);
-
- if ((hpet_enable() >= 0) && hpet_use_timer) {
- printk("Using HPET for base-timer\n");
- }
-
+ if (!hpet_enable())
+ setup_pit_timer();
do_time_init();
}
-#endif
void __init time_init(void)
{
- struct timespec ts;
-#ifdef CONFIG_HPET_TIMER
- if (is_hpet_capable()) {
- /*
- * HPET initialization needs to do memory-mapped io. So, let
- * us do a late initialization after mem_init().
- */
- late_time_init = hpet_time_init;
- return;
- }
-#endif
- ts.tv_sec = get_cmos_time();
- ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-
- do_settimeofday(&ts);
-
- do_time_init();
+ late_time_init = hpet_time_init;
}
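Renaming get_cmos_time() to read_persistent_clock() plugs the CMOS clock into the generic timekeeping core: instead of the architecture calling do_settimeofday() itself at boot and resume, the core invokes this hook, which is why the sysdev suspend/resume glue and both hand-rolled settimeofday sequences could be deleted above. Roughly, the generic consumer looks like this (a simplification of the kernel/time code of this era, shown for orientation only):

	/* Sketch: the timekeeping core seeds the wall clock from the
	 * arch-supplied persistent clock at boot, and reads it again
	 * around suspend/resume to account for time slept. */
	void __init timekeeping_init(void)
	{
		xtime.tv_sec = read_persistent_clock();
		xtime.tv_nsec = 0;
		/* clocksource selection, wall_to_monotonic setup, ... */
	}

time_init() itself now unconditionally defers to late_time_init, because hpet_enable() needs ioremap(), which is not available until after mem_init().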
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
deleted file mode 100644
index 1e4702dfcd0..00000000000
--- a/arch/i386/kernel/time_hpet.c
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- * linux/arch/i386/kernel/time_hpet.c
- * This code largely copied from arch/x86_64/kernel/time.c
- * See that file for credits.
- *
- * 2003-06-30 Venkatesh Pallipadi - Additional changes for HPET support
- */
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/param.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/timer.h>
-#include <asm/fixmap.h>
-#include <asm/apic.h>
-
-#include <linux/timex.h>
-
-#include <asm/hpet.h>
-#include <linux/hpet.h>
-
-static unsigned long hpet_period; /* fsecs / HPET clock */
-unsigned long hpet_tick; /* hpet clks count per tick */
-unsigned long hpet_address; /* hpet memory map physical address */
-int hpet_use_timer;
-
-static int use_hpet; /* can be used for runtime check of hpet */
-static int boot_hpet_disable; /* boottime override for HPET timer */
-static void __iomem * hpet_virt_address; /* hpet kernel virtual address */
-
-#define FSEC_TO_USEC (1000000000UL)
-
-int hpet_readl(unsigned long a)
-{
- return readl(hpet_virt_address + a);
-}
-
-static void hpet_writel(unsigned long d, unsigned long a)
-{
- writel(d, hpet_virt_address + a);
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * HPET counters dont wrap around on every tick. They just change the
- * comparator value and continue. Next tick can be caught by checking
- * for a change in the comparator value. Used in apic.c.
- */
-static void __devinit wait_hpet_tick(void)
-{
- unsigned int start_cmp_val, end_cmp_val;
-
- start_cmp_val = hpet_readl(HPET_T0_CMP);
- do {
- end_cmp_val = hpet_readl(HPET_T0_CMP);
- } while (start_cmp_val == end_cmp_val);
-}
-#endif
-
-static int hpet_timer_stop_set_go(unsigned long tick)
-{
- unsigned int cfg;
-
- /*
- * Stop the timers and reset the main counter.
- */
- cfg = hpet_readl(HPET_CFG);
- cfg &= ~HPET_CFG_ENABLE;
- hpet_writel(cfg, HPET_CFG);
- hpet_writel(0, HPET_COUNTER);
- hpet_writel(0, HPET_COUNTER + 4);
-
- if (hpet_use_timer) {
- /*
- * Set up timer 0, as periodic with first interrupt to happen at
- * hpet_tick, and period also hpet_tick.
- */
- cfg = hpet_readl(HPET_T0_CFG);
- cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
- HPET_TN_SETVAL | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T0_CFG);
-
- /*
- * The first write after writing TN_SETVAL to the config register sets
- * the counter value, the second write sets the threshold.
- */
- hpet_writel(tick, HPET_T0_CMP);
- hpet_writel(tick, HPET_T0_CMP);
- }
- /*
- * Go!
- */
- cfg = hpet_readl(HPET_CFG);
- if (hpet_use_timer)
- cfg |= HPET_CFG_LEGACY;
- cfg |= HPET_CFG_ENABLE;
- hpet_writel(cfg, HPET_CFG);
-
- return 0;
-}
-
-/*
- * Check whether HPET was found by ACPI boot parse. If yes setup HPET
- * counter 0 for kernel base timer.
- */
-int __init hpet_enable(void)
-{
- unsigned int id;
- unsigned long tick_fsec_low, tick_fsec_high; /* tick in femto sec */
- unsigned long hpet_tick_rem;
-
- if (boot_hpet_disable)
- return -1;
-
- if (!hpet_address) {
- return -1;
- }
- hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
- /*
- * Read the period, compute tick and quotient.
- */
- id = hpet_readl(HPET_ID);
-
- /*
- * We are checking for value '1' or more in number field if
- * CONFIG_HPET_EMULATE_RTC is set because we will need an
- * additional timer for RTC emulation.
- * However, we can do with one timer otherwise using the
- * the single HPET timer for system time.
- */
-#ifdef CONFIG_HPET_EMULATE_RTC
- if (!(id & HPET_ID_NUMBER)) {
- iounmap(hpet_virt_address);
- hpet_virt_address = NULL;
- return -1;
- }
-#endif
-
-
- hpet_period = hpet_readl(HPET_PERIOD);
- if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) {
- iounmap(hpet_virt_address);
- hpet_virt_address = NULL;
- return -1;
- }
-
- /*
- * 64 bit math
- * First changing tick into fsec
- * Then 64 bit div to find number of hpet clk per tick
- */
- ASM_MUL64_REG(tick_fsec_low, tick_fsec_high,
- KERNEL_TICK_USEC, FSEC_TO_USEC);
- ASM_DIV64_REG(hpet_tick, hpet_tick_rem,
- hpet_period, tick_fsec_low, tick_fsec_high);
-
- if (hpet_tick_rem > (hpet_period >> 1))
- hpet_tick++; /* rounding the result */
-
- hpet_use_timer = id & HPET_ID_LEGSUP;
-
- if (hpet_timer_stop_set_go(hpet_tick)) {
- iounmap(hpet_virt_address);
- hpet_virt_address = NULL;
- return -1;
- }
-
- use_hpet = 1;
-
-#ifdef CONFIG_HPET
- {
- struct hpet_data hd;
- unsigned int ntimer;
-
- memset(&hd, 0, sizeof (hd));
-
- ntimer = hpet_readl(HPET_ID);
- ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
- ntimer++;
-
- /*
- * Register with driver.
- * Timer0 and Timer1 is used by platform.
- */
- hd.hd_phys_address = hpet_address;
- hd.hd_address = hpet_virt_address;
- hd.hd_nirqs = ntimer;
- hd.hd_flags = HPET_DATA_PLATFORM;
- hpet_reserve_timer(&hd, 0);
-#ifdef CONFIG_HPET_EMULATE_RTC
- hpet_reserve_timer(&hd, 1);
-#endif
- hd.hd_irq[0] = HPET_LEGACY_8254;
- hd.hd_irq[1] = HPET_LEGACY_RTC;
- if (ntimer > 2) {
- struct hpet __iomem *hpet;
- struct hpet_timer __iomem *timer;
- int i;
-
- hpet = hpet_virt_address;
-
- for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer;
- timer++, i++)
- hd.hd_irq[i] = (timer->hpet_config &
- Tn_INT_ROUTE_CNF_MASK) >>
- Tn_INT_ROUTE_CNF_SHIFT;
-
- }
-
- hpet_alloc(&hd);
- }
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- if (hpet_use_timer)
- wait_timer_tick = wait_hpet_tick;
-#endif
- return 0;
-}
-
-int hpet_reenable(void)
-{
- return hpet_timer_stop_set_go(hpet_tick);
-}
-
-int is_hpet_enabled(void)
-{
- return use_hpet;
-}
-
-int is_hpet_capable(void)
-{
- if (!boot_hpet_disable && hpet_address)
- return 1;
- return 0;
-}
-
-static int __init hpet_setup(char* str)
-{
- if (str) {
- if (!strncmp("disable", str, 7))
- boot_hpet_disable = 1;
- }
- return 1;
-}
-
-__setup("hpet=", hpet_setup);
-
-#ifdef CONFIG_HPET_EMULATE_RTC
-/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
- * is enabled, we support RTC interrupt functionality in software.
- * RTC has 3 kinds of interrupts:
- * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
- * is updated
- * 2) Alarm Interrupt - generate an interrupt at a specific time of day
- * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
- * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
- * (1) and (2) above are implemented using polling at a frequency of
- * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
- * overhead. (DEFAULT_RTC_INT_FREQ)
- * For (3), we use interrupts at 64Hz or user specified periodic
- * frequency, whichever is higher.
- */
-#include <linux/mc146818rtc.h>
-#include <linux/rtc.h>
-
-#define DEFAULT_RTC_INT_FREQ 64
-#define RTC_NUM_INTS 1
-
-static unsigned long UIE_on;
-static unsigned long prev_update_sec;
-
-static unsigned long AIE_on;
-static struct rtc_time alarm_time;
-
-static unsigned long PIE_on;
-static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
-static unsigned long PIE_count;
-
-static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
-static unsigned int hpet_t1_cmp; /* cached comparator register */
-
-/*
- * Timer 1 for RTC, we do not use periodic interrupt feature,
- * even if HPET supports periodic interrupts on Timer 1.
- * The reason being, to set up a periodic interrupt in HPET, we need to
- * stop the main counter. And if we do that everytime someone diables/enables
- * RTC, we will have adverse effect on main kernel timer running on Timer 0.
- * So, for the time being, simulate the periodic interrupt in software.
- *
- * hpet_rtc_timer_init() is called for the first time and during subsequent
- * interuppts reinit happens through hpet_rtc_timer_reinit().
- */
-int hpet_rtc_timer_init(void)
-{
- unsigned int cfg, cnt;
- unsigned long flags;
-
- if (!is_hpet_enabled())
- return 0;
- /*
- * Set the counter 1 and enable the interrupts.
- */
- if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
- hpet_rtc_int_freq = PIE_freq;
- else
- hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
-
- local_irq_save(flags);
-
- cnt = hpet_readl(HPET_COUNTER);
- cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
- hpet_writel(cnt, HPET_T1_CMP);
- hpet_t1_cmp = cnt;
-
- cfg = hpet_readl(HPET_T1_CFG);
- cfg &= ~HPET_TN_PERIODIC;
- cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T1_CFG);
-
- local_irq_restore(flags);
-
- return 1;
-}
-
-static void hpet_rtc_timer_reinit(void)
-{
- unsigned int cfg, cnt, ticks_per_int, lost_ints;
-
- if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
- cfg = hpet_readl(HPET_T1_CFG);
- cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_T1_CFG);
- return;
- }
-
- if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
- hpet_rtc_int_freq = PIE_freq;
- else
- hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
-
- /* It is more accurate to use the comparator value than current count.*/
- ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
- hpet_t1_cmp += ticks_per_int;
- hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
-
- /*
- * If the interrupt handler was delayed too long, the write above tries
- * to schedule the next interrupt in the past and the hardware would
- * not interrupt until the counter had wrapped around.
- * So we have to check that the comparator wasn't set to a past time.
- */
- cnt = hpet_readl(HPET_COUNTER);
- if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
- lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
- /* Make sure that, even with the time needed to execute
- * this code, the next scheduled interrupt has been moved
- * back to the future: */
- lost_ints++;
-
- hpet_t1_cmp += lost_ints * ticks_per_int;
- hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
-
- if (PIE_on)
- PIE_count += lost_ints;
-
- printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
- hpet_rtc_int_freq);
- }
-}
-
-/*
- * The functions below are called from rtc driver.
- * Return 0 if HPET is not being used.
- * Otherwise do the necessary changes and return 1.
- */
-int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
-{
- if (!is_hpet_enabled())
- return 0;
-
- if (bit_mask & RTC_UIE)
- UIE_on = 0;
- if (bit_mask & RTC_PIE)
- PIE_on = 0;
- if (bit_mask & RTC_AIE)
- AIE_on = 0;
-
- return 1;
-}
-
-int hpet_set_rtc_irq_bit(unsigned long bit_mask)
-{
- int timer_init_reqd = 0;
-
- if (!is_hpet_enabled())
- return 0;
-
- if (!(PIE_on | AIE_on | UIE_on))
- timer_init_reqd = 1;
-
- if (bit_mask & RTC_UIE) {
- UIE_on = 1;
- }
- if (bit_mask & RTC_PIE) {
- PIE_on = 1;
- PIE_count = 0;
- }
- if (bit_mask & RTC_AIE) {
- AIE_on = 1;
- }
-
- if (timer_init_reqd)
- hpet_rtc_timer_init();
-
- return 1;
-}
-
-int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
-{
- if (!is_hpet_enabled())
- return 0;
-
- alarm_time.tm_hour = hrs;
- alarm_time.tm_min = min;
- alarm_time.tm_sec = sec;
-
- return 1;
-}
-
-int hpet_set_periodic_freq(unsigned long freq)
-{
- if (!is_hpet_enabled())
- return 0;
-
- PIE_freq = freq;
- PIE_count = 0;
-
- return 1;
-}
-
-int hpet_rtc_dropped_irq(void)
-{
- if (!is_hpet_enabled())
- return 0;
-
- return 1;
-}
-
-irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
-{
- struct rtc_time curr_time;
- unsigned long rtc_int_flag = 0;
- int call_rtc_interrupt = 0;
-
- hpet_rtc_timer_reinit();
-
- if (UIE_on | AIE_on) {
- rtc_get_rtc_time(&curr_time);
- }
- if (UIE_on) {
- if (curr_time.tm_sec != prev_update_sec) {
- /* Set update int info, call real rtc int routine */
- call_rtc_interrupt = 1;
- rtc_int_flag = RTC_UF;
- prev_update_sec = curr_time.tm_sec;
- }
- }
- if (PIE_on) {
- PIE_count++;
- if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
- /* Set periodic int info, call real rtc int routine */
- call_rtc_interrupt = 1;
- rtc_int_flag |= RTC_PF;
- PIE_count = 0;
- }
- }
- if (AIE_on) {
- if ((curr_time.tm_sec == alarm_time.tm_sec) &&
- (curr_time.tm_min == alarm_time.tm_min) &&
- (curr_time.tm_hour == alarm_time.tm_hour)) {
- /* Set alarm int info, call real rtc int routine */
- call_rtc_interrupt = 1;
- rtc_int_flag |= RTC_AF;
- }
- }
- if (call_rtc_interrupt) {
- rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
- rtc_interrupt(rtc_int_flag, dev_id);
- }
- return IRQ_HANDLED;
-}
-#endif
-
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 79cf608e14c..45782356a61 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -1,5 +1,5 @@
/*
- * arch/i386/kernel/topology.c - Populate driverfs with topology information
+ * arch/i386/kernel/topology.c - Populate sysfs with topology information
*
* Written by: Matthew Dobson, IBM Corporation
* Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 0efad8aeb41..af0d3f70a81 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -94,6 +94,7 @@ asmlinkage void spurious_interrupt_bug(void);
asmlinkage void machine_check(void);
int kstack_depth_to_print = 24;
+static unsigned int code_bytes = 64;
ATOMIC_NOTIFIER_HEAD(i386die_chain);
int register_die_notifier(struct notifier_block *nb)
@@ -291,10 +292,11 @@ void show_registers(struct pt_regs *regs)
int i;
int in_kernel = 1;
unsigned long esp;
- unsigned short ss;
+ unsigned short ss, gs;
esp = (unsigned long) (&regs->esp);
savesegment(ss, ss);
+ savesegment(gs, gs);
if (user_mode_vm(regs)) {
in_kernel = 0;
esp = regs->esp;
@@ -313,8 +315,8 @@ void show_registers(struct pt_regs *regs)
regs->eax, regs->ebx, regs->ecx, regs->edx);
printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
regs->esi, regs->edi, regs->ebp, esp);
- printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n",
- regs->xds & 0xffff, regs->xes & 0xffff, ss);
+ printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
+ regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
TASK_COMM_LEN, current->comm, current->pid,
current_thread_info(), current, current->thread_info);
@@ -324,7 +326,8 @@ void show_registers(struct pt_regs *regs)
*/
if (in_kernel) {
u8 *eip;
- int code_bytes = 64;
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
unsigned char c;
printk("\n" KERN_EMERG "Stack: ");
@@ -332,14 +335,14 @@ void show_registers(struct pt_regs *regs)
printk(KERN_EMERG "Code: ");
- eip = (u8 *)regs->eip - 43;
+ eip = (u8 *)regs->eip - code_prologue;
if (eip < (u8 *)PAGE_OFFSET ||
probe_kernel_address(eip, c)) {
/* try starting at EIP */
eip = (u8 *)regs->eip;
- code_bytes = 32;
+ code_len = code_len - code_prologue + 1;
}
- for (i = 0; i < code_bytes; i++, eip++) {
+ for (i = 0; i < code_len; i++, eip++) {
if (eip < (u8 *)PAGE_OFFSET ||
probe_kernel_address(eip, c)) {
printk(" Bad EIP value.");
@@ -1191,3 +1194,13 @@ static int __init kstack_setup(char *s)
return 1;
}
__setup("kstack=", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+ code_bytes = simple_strtoul(s, NULL, 0);
+ if (code_bytes > 8192)
+ code_bytes = 8192;
+
+ return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
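The odd-looking 43/64 scale factor preserves the old behaviour as the default: the dump previously started at a hard-coded 43 bytes before the faulting EIP out of 64 dumped in total, and code_prologue keeps that ratio for any user-chosen size. Concretely:

	/*
	 * Worked example for the boot default:
	 *   code_bytes    = 64
	 *   code_prologue = 64 * 43 / 64 = 43	bytes dumped before EIP
	 *   trailing      = 64 - 43      = 21	bytes from EIP onwards
	 * and for "code_bytes=128" on the command line:
	 *   code_prologue = 128 * 43 / 64 = 86, trailing = 42.
	 */

When the prologue start is unmapped, the dump restarts at EIP and shows the remaining code_len - code_prologue + 1 bytes (22 with the defaults), so the tail of the window is preserved.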
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 2cfc7b09b92..3082a418635 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -23,6 +23,7 @@
* an extra value to store the TSC freq
*/
unsigned int tsc_khz;
+unsigned long long (*custom_sched_clock)(void);
int tsc_disable;
@@ -59,12 +60,6 @@ static inline int check_tsc_unstable(void)
return tsc_unstable;
}
-void mark_tsc_unstable(void)
-{
- tsc_unstable = 1;
-}
-EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-
/* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
* basic equation:
@@ -107,14 +102,14 @@ unsigned long long sched_clock(void)
{
unsigned long long this_offset;
+ if (unlikely(custom_sched_clock))
+ return (*custom_sched_clock)();
+
/*
- * in the NUMA case we dont use the TSC as they are not
- * synchronized across all CPUs.
+ * Fall back to jiffies if there's no TSC available:
*/
-#ifndef CONFIG_NUMA
- if (!cpu_khz || check_tsc_unstable())
-#endif
- /* no locking but a rare wrong value is not a big deal */
+ if (unlikely(tsc_disable))
+ /* No locking but a rare wrong value is not a big deal: */
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
/* read the Time Stamp Counter: */
@@ -194,13 +189,13 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
void __init tsc_init(void)
{
if (!cpu_has_tsc || tsc_disable)
- return;
+ goto out_no_tsc;
cpu_khz = calculate_cpu_khz();
tsc_khz = cpu_khz;
if (!cpu_khz)
- return;
+ goto out_no_tsc;
printk("Detected %lu.%03lu MHz processor.\n",
(unsigned long)cpu_khz / 1000,
@@ -208,37 +203,18 @@ void __init tsc_init(void)
set_cyc2ns_scale(cpu_khz);
use_tsc_delay();
-}
+ return;
-#ifdef CONFIG_CPU_FREQ
-
-static unsigned int cpufreq_delayed_issched = 0;
-static unsigned int cpufreq_init = 0;
-static struct work_struct cpufreq_delayed_get_work;
-
-static void handle_cpufreq_delayed_get(struct work_struct *work)
-{
- unsigned int cpu;
-
- for_each_online_cpu(cpu)
- cpufreq_get(cpu);
-
- cpufreq_delayed_issched = 0;
+out_no_tsc:
+ /*
+ * Set the tsc_disable flag if there's no TSC support; this
+ * gives the kernel a fast flag to check whether it should
+ * be using the TSC.
+ */
+ tsc_disable = 1;
}
-/*
- * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries
- * to verify the CPU frequency the timing core thinks the CPU is running
- * at is still correct.
- */
-static inline void cpufreq_delayed_get(void)
-{
- if (cpufreq_init && !cpufreq_delayed_issched) {
- cpufreq_delayed_issched = 1;
- printk(KERN_DEBUG "Checking if CPU frequency changed.\n");
- schedule_work(&cpufreq_delayed_get_work);
- }
-}
+#ifdef CONFIG_CPU_FREQ
/*
* if the CPU frequency is scaled, TSC-based delays will need a different
@@ -303,17 +279,9 @@ static struct notifier_block time_cpufreq_notifier_block = {
static int __init cpufreq_tsc(void)
{
- int ret;
-
- INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
- ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
- if (!ret)
- cpufreq_init = 1;
-
- return ret;
+ return cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
}
-
core_initcall(cpufreq_tsc);
#endif
@@ -321,7 +289,6 @@ core_initcall(cpufreq_tsc);
/* clock source code */
static unsigned long current_tsc_khz = 0;
-static int tsc_update_callback(void);
static cycle_t read_tsc(void)
{
@@ -339,37 +306,28 @@ static struct clocksource clocksource_tsc = {
.mask = CLOCKSOURCE_MASK(64),
.mult = 0, /* to be set */
.shift = 22,
- .update_callback = tsc_update_callback,
- .is_continuous = 1,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+ CLOCK_SOURCE_MUST_VERIFY,
};
-static int tsc_update_callback(void)
+void mark_tsc_unstable(void)
{
- int change = 0;
-
- /* check to see if we should switch to the safe clocksource: */
- if (clocksource_tsc.rating != 0 && check_tsc_unstable()) {
- clocksource_tsc.rating = 0;
- clocksource_reselect();
- change = 1;
- }
-
- /* only update if tsc_khz has changed: */
- if (current_tsc_khz != tsc_khz) {
- current_tsc_khz = tsc_khz;
- clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
- clocksource_tsc.shift);
- change = 1;
+ if (!tsc_unstable) {
+ tsc_unstable = 1;
+ /* Can be called before registration */
+ if (clocksource_tsc.mult)
+ clocksource_change_rating(&clocksource_tsc, 0);
+ else
+ clocksource_tsc.rating = 0;
}
-
- return change;
}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
{
printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
d->ident);
- mark_tsc_unstable();
+ tsc_unstable = 1;
return 0;
}
@@ -386,65 +344,44 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
{}
};
-#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */
-static struct timer_list verify_tsc_freq_timer;
-
-/* XXX - Probably should add locking */
-static void verify_tsc_freq(unsigned long unused)
-{
- static u64 last_tsc;
- static unsigned long last_jiffies;
-
- u64 now_tsc, interval_tsc;
- unsigned long now_jiffies, interval_jiffies;
-
-
- if (check_tsc_unstable())
- return;
-
- rdtscll(now_tsc);
- now_jiffies = jiffies;
-
- if (!last_jiffies) {
- goto out;
- }
-
- interval_jiffies = now_jiffies - last_jiffies;
- interval_tsc = now_tsc - last_tsc;
- interval_tsc *= HZ;
- do_div(interval_tsc, cpu_khz*1000);
-
- if (interval_tsc < (interval_jiffies * 3 / 4)) {
- printk("TSC appears to be running slowly. "
- "Marking it as unstable\n");
- mark_tsc_unstable();
- return;
- }
-
-out:
- last_tsc = now_tsc;
- last_jiffies = now_jiffies;
- /* set us up to go off on the next interval: */
- mod_timer(&verify_tsc_freq_timer,
- jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL));
-}
-
/*
* Make an educated guess if the TSC is trustworthy and synchronized
* over all CPUs.
*/
-static __init int unsynchronized_tsc(void)
+__cpuinit int unsynchronized_tsc(void)
{
+ if (!cpu_has_tsc || tsc_unstable)
+ return 1;
/*
* Intel systems are normally all synchronized.
* Exceptions must mark TSC as unstable:
*/
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- return 0;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ /* assume multi socket systems are not synchronized: */
+ if (num_possible_cpus() > 1)
+ tsc_unstable = 1;
+ }
+ return tsc_unstable;
+}
+
+/*
+ * Geode_LX - the OLPC CPU possibly has a very reliable TSC
+ */
+#ifdef CONFIG_MGEODE_LX
+/* RTSC counts during suspend */
+#define RTSC_SUSP 0x100
+
+static void __init check_geode_tsc_reliable(void)
+{
+ unsigned long val;
- /* assume multi socket systems are not synchronized: */
- return num_possible_cpus() > 1;
+ rdmsrl(MSR_GEODE_BUSCONT_CONF0, val);
+ if ((val & RTSC_SUSP))
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}
+#else
+static inline void check_geode_tsc_reliable(void) { }
+#endif
static int __init init_tsc_clocksource(void)
{
@@ -453,20 +390,16 @@ static int __init init_tsc_clocksource(void)
/* check blacklist */
dmi_check_system(bad_tsc_dmi_table);
- if (unsynchronized_tsc()) /* mark unstable if unsynced */
- mark_tsc_unstable();
+ unsynchronized_tsc();
+ check_geode_tsc_reliable();
current_tsc_khz = tsc_khz;
clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
clocksource_tsc.shift);
/* lower the rating if we already know its unstable: */
- if (check_tsc_unstable())
+ if (check_tsc_unstable()) {
clocksource_tsc.rating = 0;
-
- init_timer(&verify_tsc_freq_timer);
- verify_tsc_freq_timer.function = verify_tsc_freq;
- verify_tsc_freq_timer.expires =
- jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL);
- add_timer(&verify_tsc_freq_timer);
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
+ }
return clocksource_register(&clocksource_tsc);
}
diff --git a/arch/i386/kernel/tsc_sync.c b/arch/i386/kernel/tsc_sync.c
new file mode 100644
index 00000000000..12424629af8
--- /dev/null
+++ b/arch/i386/kernel/tsc_sync.c
@@ -0,0 +1 @@
+#include "../../x86_64/kernel/tsc_sync.c"
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index be2f96e67f7..d1b8f2b7aea 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -96,12 +96,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
{
int ret = 0;
- /* kernel_vm86_regs is missing xfs, so copy everything up to
- (but not including) xgs, and then rest after xgs. */
- ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
- ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
+ /* kernel_vm86_regs is missing xgs, so copy everything up to
+ (but not including) orig_eax, and then the rest, from orig_eax on. */
+ ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+ ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
sizeof(struct kernel_vm86_regs) -
- offsetof(struct kernel_vm86_regs, pt.xgs));
+ offsetof(struct kernel_vm86_regs, pt.orig_eax));
return ret;
}
@@ -113,12 +113,13 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
{
int ret = 0;
- ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
- ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
+ /* copy eax-xfs inclusive */
+ ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
+ /* copy orig_eax-__gsh+extra */
+ ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
sizeof(struct kernel_vm86_regs) -
- offsetof(struct kernel_vm86_regs, pt.xgs) +
+ offsetof(struct kernel_vm86_regs, pt.orig_eax) +
extra);
-
return ret;
}
@@ -157,8 +158,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
ret = KVM86->regs32;
- loadsegment(fs, current->thread.saved_fs);
- ret->xgs = current->thread.saved_gs;
+ ret->xfs = current->thread.saved_fs;
+ loadsegment(gs, current->thread.saved_gs);
return ret;
}
@@ -285,9 +286,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
*/
info->regs.pt.xds = 0;
info->regs.pt.xes = 0;
- info->regs.pt.xgs = 0;
+ info->regs.pt.xfs = 0;
-/* we are clearing fs later just before "jmp resume_userspace",
+/* we are clearing gs later just before "jmp resume_userspace",
* because it is not saved/restored.
*/
@@ -321,8 +322,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
*/
info->regs32->eax = 0;
tsk->thread.saved_esp0 = tsk->thread.esp0;
- savesegment(fs, tsk->thread.saved_fs);
- tsk->thread.saved_gs = info->regs32->xgs;
+ tsk->thread.saved_fs = info->regs32->xfs;
+ savesegment(gs, tsk->thread.saved_gs);
tss = &per_cpu(init_tss, get_cpu());
tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -342,7 +343,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
__asm__ __volatile__(
"movl %0,%%esp\n\t"
"movl %1,%%ebp\n\t"
- "mov %2, %%fs\n\t"
+ "mov %2, %%gs\n\t"
"jmp resume_userspace"
: /* no outputs */
:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
new file mode 100644
index 00000000000..bb5a7abf949
--- /dev/null
+++ b/arch/i386/kernel/vmi.c
@@ -0,0 +1,949 @@
+/*
+ * VMI specific paravirt-ops implementation
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to zach@vmware.com
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/license.h>
+#include <linux/cpu.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <asm/vmi.h>
+#include <asm/io.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/timer.h>
+#include <asm/vmi_time.h>
+
+/* Convenient for calling VMI functions indirectly in the ROM */
+typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
+typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
+
+#define call_vrom_func(rom,func) \
+ (((VROMFUNC *)(rom->func))())
+
+#define call_vrom_long_func(rom,func,arg) \
+ (((VROMLONGFUNC *)(rom->func)) (arg))
+
+static struct vrom_header *vmi_rom;
+static int license_gplok;
+static int disable_nodelay;
+static int disable_pge;
+static int disable_pse;
+static int disable_sep;
+static int disable_tsc;
+static int disable_mtrr;
+
+/* Cached VMI operations */
+struct {
+ void (*cpuid)(void /* non-c */);
+ void (*_set_ldt)(u32 selector);
+ void (*set_tr)(u32 selector);
+ void (*set_kernel_stack)(u32 selector, u32 esp0);
+ void (*allocate_page)(u32, u32, u32, u32, u32);
+ void (*release_page)(u32, u32);
+ void (*set_pte)(pte_t, pte_t *, unsigned);
+ void (*update_pte)(pte_t *, unsigned);
+ void (*set_linear_mapping)(int, u32, u32, u32);
+ void (*flush_tlb)(int);
+ void (*set_initial_ap_state)(int, int);
+ void (*halt)(void);
+} vmi_ops;
+
+/* XXX move this to alternative.h */
+extern struct paravirt_patch __start_parainstructions[],
+ __stop_parainstructions[];
+
+/*
+ * VMI patching routines.
+ */
+#define MNEM_CALL 0xe8
+#define MNEM_JMP 0xe9
+#define MNEM_RET 0xc3
+
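+/*
+ * Out-of-line stub for sites too short to patch a save_flags plus an
+ * irq_disable call inline: call GetInterruptMask, call
+ * DisableInterrupts, return. The two call offsets (bytes 0 and 5)
+ * are filled in by activate_vmi().
+ */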
+static char irq_save_disable_callout[] = {
+ MNEM_CALL, 0, 0, 0, 0,
+ MNEM_CALL, 0, 0, 0, 0,
+ MNEM_RET
+};
+#define IRQ_PATCH_INT_MASK 0
+#define IRQ_PATCH_DISABLE 5
+
+static inline void patch_offset(unsigned char *eip, unsigned char *dest)
+{
+ *(unsigned long *)(eip+1) = dest-eip-5;
+}
+
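+/*
+ * Ask the ROM how a given VMI call should be bound and rewrite the
+ * instruction bytes in place: a direct CALL or JMP into the ROM, a
+ * length of 0 so the whole site is nopped out, or the native code
+ * left untouched.
+ */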
+static unsigned patch_internal(int call, unsigned len, void *insns)
+{
+ u64 reloc;
+ struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
+ switch(rel->type) {
+ case VMI_RELOCATION_CALL_REL:
+ BUG_ON(len < 5);
+ *(char *)insns = MNEM_CALL;
+ patch_offset(insns, rel->eip);
+ return 5;
+
+ case VMI_RELOCATION_JUMP_REL:
+ BUG_ON(len < 5);
+ *(char *)insns = MNEM_JMP;
+ patch_offset(insns, rel->eip);
+ return 5;
+
+ case VMI_RELOCATION_NOP:
+ /* obliterate the whole thing */
+ return 0;
+
+ case VMI_RELOCATION_NONE:
+ /* leave native code in place */
+ break;
+
+ default:
+ BUG();
+ }
+ return len;
+}
+
+/*
+ * Apply patch if appropriate, return length of new instruction
+ * sequence. The callee does nop padding for us.
+ */
+static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+ switch (type) {
+ case PARAVIRT_IRQ_DISABLE:
+ return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
+ case PARAVIRT_IRQ_ENABLE:
+ return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
+ case PARAVIRT_RESTORE_FLAGS:
+ return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
+ case PARAVIRT_SAVE_FLAGS:
+ return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
+ case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE:
+ if (len >= 10) {
+ patch_internal(VMI_CALL_GetInterruptMask, len, insns);
+ patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
+ return 10;
+ } else {
+ /*
+ * You bastards didn't leave enough room to
+ * patch save_flags_irq_disable inline. Patch
+ * to a helper
+ */
+ BUG_ON(len < 5);
+ *(char *)insns = MNEM_CALL;
+ patch_offset(insns, irq_save_disable_callout);
+ return 5;
+ }
+ case PARAVIRT_INTERRUPT_RETURN:
+ return patch_internal(VMI_CALL_IRET, len, insns);
+ case PARAVIRT_STI_SYSEXIT:
+ return patch_internal(VMI_CALL_SYSEXIT, len, insns);
+ default:
+ break;
+ }
+ return len;
+}
+
+/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
+static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ int override = 0;
+ if (*eax == 1)
+ override = 1;
+ asm volatile ("call *%6"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
+ if (override) {
+ if (disable_pse)
+ *edx &= ~X86_FEATURE_PSE;
+ if (disable_pge)
+ *edx &= ~X86_FEATURE_PGE;
+ if (disable_sep)
+ *edx &= ~X86_FEATURE_SEP;
+ if (disable_tsc)
+ *edx &= ~X86_FEATURE_TSC;
+ if (disable_mtrr)
+ *edx &= ~X86_FEATURE_MTRR;
+ }
+}
+
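+/* Skip the write_gdt_entry() when the TLS descriptor is unchanged. */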
+static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
+{
+ if (gdt[nr].a != new->a || gdt[nr].b != new->b)
+ write_gdt_entry(gdt, nr, new->a, new->b);
+}
+
+static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
+ vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
+ vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
+}
+
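+/*
+ * Pack an LDT descriptor into this CPU's GDT and hand the hypervisor
+ * the matching selector (or 0 when the table is empty).
+ */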
+static void vmi_set_ldt(const void *addr, unsigned entries)
+{
+ unsigned cpu = smp_processor_id();
+ u32 low, high;
+
+ pack_descriptor(&low, &high, (unsigned long)addr,
+ entries * sizeof(struct desc_struct) - 1,
+ DESCTYPE_LDT, 0);
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
+ vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
+}
+
+static void vmi_set_tr(void)
+{
+ vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
+}
+
+static void vmi_load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ tss->esp0 = thread->esp0;
+
+ /* This can only happen when SEP is enabled, no need to test "SEP"arately */
+ if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+ tss->ss1 = thread->sysenter_cs;
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+ }
+ vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0);
+}
+
+static void vmi_flush_tlb_user(void)
+{
+ vmi_ops.flush_tlb(VMI_FLUSH_TLB);
+}
+
+static void vmi_flush_tlb_kernel(void)
+{
+ vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
+}
+
+/* Stub to do nothing at all; used for delays and unimplemented calls */
+static void vmi_nop(void)
+{
+}
+
+/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
+#ifdef CONFIG_NO_IDLE_HZ
+static fastcall void vmi_safe_halt(void)
+{
+ int idle = vmi_stop_hz_timer();
+ vmi_ops.halt();
+ if (idle) {
+ local_irq_disable();
+ vmi_account_time_restart_hz_timer();
+ local_irq_enable();
+ }
+}
+#endif
+
+#ifdef CONFIG_DEBUG_PAGE_TYPE
+
+#ifdef CONFIG_X86_PAE
+#define MAX_BOOT_PTS (2048+4+1)
+#else
+#define MAX_BOOT_PTS (1024+1)
+#endif
+
+/*
+ * During boot, mem_map is not yet available in paging_init, so stash
+ * all the boot page allocations here.
+ */
+static struct {
+ u32 pfn;
+ int type;
+} boot_page_allocations[MAX_BOOT_PTS];
+static int num_boot_page_allocations;
+static int boot_allocations_applied;
+
+void vmi_apply_boot_page_allocations(void)
+{
+ int i;
+ BUG_ON(!mem_map);
+ for (i = 0; i < num_boot_page_allocations; i++) {
+ struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
+ page->type = boot_page_allocations[i].type &
+ ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+ }
+ boot_allocations_applied = 1;
+}
+
+static void record_page_type(u32 pfn, int type)
+{
+ BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
+ boot_page_allocations[num_boot_page_allocations].pfn = pfn;
+ boot_page_allocations[num_boot_page_allocations].type = type;
+ num_boot_page_allocations++;
+}
+
+static void check_zeroed_page(u32 pfn, int type, struct page *page)
+{
+ u32 *ptr;
+ int i;
+ int limit = PAGE_SIZE / sizeof(int);
+
+ if (page_address(page))
+ ptr = (u32 *)page_address(page);
+ else
+ ptr = (u32 *)__va(pfn << PAGE_SHIFT);
+ /*
+ * When cloning the root in non-PAE mode, only the userspace
+ * pdes need to be zeroed.
+ */
+ if (type & VMI_PAGE_CLONE)
+ limit = USER_PTRS_PER_PGD;
+ for (i = 0; i < limit; i++)
+ BUG_ON(ptr[i]);
+}
+
+/*
+ * We stash the page type into struct page so we can verify the page
+ * types are used properly.
+ */
+static void vmi_set_page_type(u32 pfn, int type)
+{
+ /* PAE can have multiple roots per page - don't track */
+ if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
+ return;
+
+ if (boot_allocations_applied) {
+ struct page *page = pfn_to_page(pfn);
+ if (type != VMI_PAGE_NORMAL)
+ BUG_ON(page->type);
+ else
+ BUG_ON(page->type == VMI_PAGE_NORMAL);
+ page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+ if (type & VMI_PAGE_ZEROED)
+ check_zeroed_page(pfn, type, page);
+ } else {
+ record_page_type(pfn, type);
+ }
+}
+
+static void vmi_check_page_type(u32 pfn, int type)
+{
+ /* PAE can have multiple roots per page - skip checks */
+ if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
+ return;
+
+ type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+ if (boot_allocations_applied) {
+ struct page *page = pfn_to_page(pfn);
+ BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
+ BUG_ON(type == VMI_PAGE_NORMAL && page->type);
+ BUG_ON((type & page->type) == 0);
+ }
+}
+#else
+#define vmi_set_page_type(p,t) do { } while (0)
+#define vmi_check_page_type(p,t) do { } while (0)
+#endif
+
+static void vmi_allocate_pt(u32 pfn)
+{
+ vmi_set_page_type(pfn, VMI_PAGE_L1);
+ vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
+}
+
+static void vmi_allocate_pd(u32 pfn)
+{
+ /*
+ * This call comes in very early, before mem_map is setup.
+ * It is called only for swapper_pg_dir, which already has
+ * data on it.
+ */
+ vmi_set_page_type(pfn, VMI_PAGE_L2);
+ vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
+}
+
+static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+{
+ vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
+ vmi_check_page_type(clonepfn, VMI_PAGE_L2);
+ vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
+}
+
+static void vmi_release_pt(u32 pfn)
+{
+ vmi_ops.release_page(pfn, VMI_PAGE_L1);
+ vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+static void vmi_release_pd(u32 pfn)
+{
+ vmi_ops.release_page(pfn, VMI_PAGE_L2);
+ vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+/*
+ * Helper macros for MMU update flags. We can defer updates until a flush
+ * or page invalidation only if the update is to the current address space
+ * (otherwise, there is no flush). We must check against init_mm, since
+ * this could be a kernel update, which usually passes init_mm, although
+ * sometimes this check can be skipped if we know the particular function
+ * is only called on user mode PTEs. We could change the kernel to pass
+ * current->active_mm here, but in particular, I was unsure if changing
+ * mm/highmem.c to do this would still be correct on other architectures.
+ */
+#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
+ (!mustbeuser && (mm) == &init_mm))
+#define vmi_flags_addr(mm, addr, level, user) \
+ ((level) | (is_current_as(mm, user) ? \
+ (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
+#define vmi_flags_addr_defer(mm, addr, level, user) \
+ ((level) | (is_current_as(mm, user) ? \
+ (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
+
+static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+ vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+ vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_set_pte(pte_t *ptep, pte_t pte)
+{
+ /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
+ vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
+}
+
+static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+{
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+ vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+#ifdef CONFIG_X86_PAE
+ const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
+ vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
+#else
+ const pte_t pte = { pmdval.pud.pgd.pgd };
+ vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
+#endif
+ vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
+}
+
+#ifdef CONFIG_X86_PAE
+
+static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+ /*
+ * XXX This is called from set_pmd_pte, but at both PT
+ * and PD layers so the VMI_PAGE_PT flag is wrong. But
+ * it is only called for large page mapping changes,
+ * the Xen backend, doesn't support large pages, and the
+ * ESX backend doesn't depend on the flag.
+ */
+ set_64bit((unsigned long long *)ptep,pte_val(pteval));
+ vmi_ops.update_pte(ptep, VMI_PAGE_PT);
+}
+
+static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+ vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
+}
+
+static void vmi_set_pud(pud_t *pudp, pud_t pudval)
+{
+ /* Um, eww */
+ const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
+ vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
+ vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
+}
+
+static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ const pte_t pte = { 0 };
+ vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+ vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+void vmi_pmd_clear(pmd_t *pmd)
+{
+ const pte_t pte = { 0 };
+ vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
+ vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
+}
+#endif
+
+#ifdef CONFIG_SMP
+struct vmi_ap_state ap;
+extern void setup_pda(void);
+
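+/*
+ * Instead of the real-mode trampoline, an AP is started by handing
+ * the hypervisor a complete initial register state (GDT/IDT, segment
+ * registers, control registers and entry point) through
+ * VMI_CALL_SetInitialAPState.
+ */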
+static void __init /* XXX cpu hotplug */
+vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
+ unsigned long start_esp)
+{
+ /* Default everything to zero. This is fine for most GPRs. */
+ memset(&ap, 0, sizeof(struct vmi_ap_state));
+
+ ap.gdtr_limit = GDT_SIZE - 1;
+ ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
+
+ ap.idtr_limit = IDT_ENTRIES * 8 - 1;
+ ap.idtr_base = (unsigned long) idt_table;
+
+ ap.ldtr = 0;
+
+ ap.cs = __KERNEL_CS;
+ ap.eip = (unsigned long) start_eip;
+ ap.ss = __KERNEL_DS;
+ ap.esp = (unsigned long) start_esp;
+
+ ap.ds = __USER_DS;
+ ap.es = __USER_DS;
+ ap.fs = __KERNEL_PDA;
+ ap.gs = 0;
+
+ ap.eflags = 0;
+
+ setup_pda();
+
+#ifdef CONFIG_X86_PAE
+ /* efer should match BSP efer. */
+ if (cpu_has_nx) {
+ unsigned l, h;
+ rdmsr(MSR_EFER, l, h);
+ ap.efer = (unsigned long long) h << 32 | l;
+ }
+#endif
+
+ ap.cr3 = __pa(swapper_pg_dir);
+ /* Protected mode, paging, AM, WP, NE, MP. */
+ ap.cr0 = 0x80050023;
+ ap.cr4 = mmu_cr4_features;
+ vmi_ops.set_initial_ap_state(__pa(&ap), phys_apicid);
+}
+#endif
+
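+/*
+ * Validate a candidate option ROM: check the 0xaa55 ROM signature and
+ * the VMI signature, reject incompatible API versions, sanity-check
+ * the PCI header, and record whether the ROM license is
+ * GPL-compatible (a non-GPL license taints the kernel and disables
+ * inlining).
+ */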
+static inline int __init check_vmi_rom(struct vrom_header *rom)
+{
+ struct pci_header *pci;
+ struct pnp_header *pnp;
+ const char *manufacturer = "UNKNOWN";
+ const char *product = "UNKNOWN";
+ const char *license = "unspecified";
+
+ if (rom->rom_signature != 0xaa55)
+ return 0;
+ if (rom->vrom_signature != VMI_SIGNATURE)
+ return 0;
+ if (rom->api_version_maj != VMI_API_REV_MAJOR ||
+ rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
+ printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
+ rom->api_version_maj,
+ rom->api_version_min);
+ return 0;
+ }
+
+ /*
+ * Relying on the VMI_SIGNATURE field is not 100% safe, so check
+ * the PCI header and device type to make sure this is really a
+ * VMI device.
+ */
+ if (!rom->pci_header_offs) {
+ printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
+ return 0;
+ }
+
+ pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
+ if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
+ pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
+ /* Allow it to run... anyway, but warn */
+ printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
+ }
+
+ if (rom->pnp_header_offs) {
+ pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
+ if (pnp->manufacturer_offset)
+ manufacturer = (const char *)rom+pnp->manufacturer_offset;
+ if (pnp->product_offset)
+ product = (const char *)rom+pnp->product_offset;
+ }
+
+ if (rom->license_offs)
+ license = (char *)rom+rom->license_offs;
+
+ printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
+ manufacturer, product,
+ rom->api_version_maj, rom->api_version_min,
+ pci->rom_version_maj, pci->rom_version_min);
+
+ license_gplok = license_is_gpl_compatible(license);
+ if (!license_gplok) {
+ printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... "
+ "inlining disabled\n",
+ license);
+ add_taint(TAINT_PROPRIETARY_MODULE);
+ }
+ return 1;
+}
+
+/*
+ * Probe for the VMI option ROM
+ */
+static inline int __init probe_vmi_rom(void)
+{
+ unsigned long base;
+
+ /* VMI ROM is in option ROM area, check signature */
+ for (base = 0xC0000; base < 0xE0000; base += 2048) {
+ struct vrom_header *romstart;
+ romstart = (struct vrom_header *)isa_bus_to_virt(base);
+ if (check_vmi_rom(romstart)) {
+ vmi_rom = romstart;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * VMI setup common to all processors
+ */
+void vmi_bringup(void)
+{
+ /* We must establish the lowmem mapping for MMU ops to work */
+ if (vmi_rom)
+ vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0);
+}
+
+/*
+ * Return a pointer to the VMI function or a NOP stub
+ */
+static void *vmi_get_function(int vmicall)
+{
+ u64 reloc;
+ const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
+ BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
+ if (rel->type == VMI_RELOCATION_CALL_REL)
+ return (void *)rel->eip;
+ else
+ return (void *)vmi_nop;
+}
+
+/*
+ * Helper macro for making the VMI paravirt-ops fill code readable.
+ * For unimplemented operations, fall back to default.
+ */
+#define para_fill(opname, vmicall) \
+do { \
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, \
+ VMI_CALL_##vmicall); \
+ if (rel->type != VMI_RELOCATION_NONE) { \
+ BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); \
+ paravirt_ops.opname = (void *)rel->eip; \
+ } \
+} while (0)
+
+/*
+ * Activate the VMI interface and switch into paravirtualized mode
+ */
+static inline int __init activate_vmi(void)
+{
+ short kernel_cs;
+ u64 reloc;
+ const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+
+ if (call_vrom_func(vmi_rom, vmi_init) != 0) {
+ printk(KERN_ERR "VMI ROM failed to initialize!");
+ return 0;
+ }
+ savesegment(cs, kernel_cs);
+
+ paravirt_ops.paravirt_enabled = 1;
+ paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
+
+ paravirt_ops.patch = vmi_patch;
+ paravirt_ops.name = "vmi";
+
+ /*
+ * Many of these operations are ABI compatible with VMI.
+ * This means we can fill in the paravirt-ops with direct
+ * pointers into the VMI ROM. If the calling convention for
+ * these operations changes, this code needs to be updated.
+ *
+ * Exceptions
+ * CPUID paravirt-op uses pointers, not the native ISA
+ * halt has no VMI equivalent; all VMI halts are "safe"
+ * no MSR support yet - just trap and emulate. VMI uses the
+ * same ABI as the native ISA, but Linux wants exceptions
+ * from bogus MSR read / write handled
+ * rdpmc is not yet used in Linux
+ */
+
+ /* CPUID is special, so very special */
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CPUID);
+ if (rel->type != VMI_RELOCATION_NONE) {
+ BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+ vmi_ops.cpuid = (void *)rel->eip;
+ paravirt_ops.cpuid = vmi_cpuid;
+ }
+
+ para_fill(clts, CLTS);
+ para_fill(get_debugreg, GetDR);
+ para_fill(set_debugreg, SetDR);
+ para_fill(read_cr0, GetCR0);
+ para_fill(read_cr2, GetCR2);
+ para_fill(read_cr3, GetCR3);
+ para_fill(read_cr4, GetCR4);
+ para_fill(write_cr0, SetCR0);
+ para_fill(write_cr2, SetCR2);
+ para_fill(write_cr3, SetCR3);
+ para_fill(write_cr4, SetCR4);
+ para_fill(save_fl, GetInterruptMask);
+ para_fill(restore_fl, SetInterruptMask);
+ para_fill(irq_disable, DisableInterrupts);
+ para_fill(irq_enable, EnableInterrupts);
+ /* irq_save_disable !!! sheer pain */
+ patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
+ (char *)paravirt_ops.save_fl);
+ patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
+ (char *)paravirt_ops.irq_disable);
+#ifndef CONFIG_NO_IDLE_HZ
+ para_fill(safe_halt, Halt);
+#else
+ vmi_ops.halt = vmi_get_function(VMI_CALL_Halt);
+ paravirt_ops.safe_halt = vmi_safe_halt;
+#endif
+ para_fill(wbinvd, WBINVD);
+ /* paravirt_ops.read_msr = vmi_rdmsr */
+ /* paravirt_ops.write_msr = vmi_wrmsr */
+ para_fill(read_tsc, RDTSC);
+ /* paravirt_ops.rdpmc = vmi_rdpmc */
+
+ /* TR interface doesn't pass TR value */
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetTR);
+ if (rel->type != VMI_RELOCATION_NONE) {
+ BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+ vmi_ops.set_tr = (void *)rel->eip;
+ paravirt_ops.load_tr_desc = vmi_set_tr;
+ }
+
+ /* LDT is special, too */
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetLDT);
+ if (rel->type != VMI_RELOCATION_NONE) {
+ BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+ vmi_ops._set_ldt = (void *)rel->eip;
+ paravirt_ops.set_ldt = vmi_set_ldt;
+ }
+
+ para_fill(load_gdt, SetGDT);
+ para_fill(load_idt, SetIDT);
+ para_fill(store_gdt, GetGDT);
+ para_fill(store_idt, GetIDT);
+ para_fill(store_tr, GetTR);
+ paravirt_ops.load_tls = vmi_load_tls;
+ para_fill(write_ldt_entry, WriteLDTEntry);
+ para_fill(write_gdt_entry, WriteGDTEntry);
+ para_fill(write_idt_entry, WriteIDTEntry);
+ reloc = call_vrom_long_func(vmi_rom, get_reloc,
+ VMI_CALL_UpdateKernelStack);
+ if (rel->type != VMI_RELOCATION_NONE) {
+ BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+ vmi_ops.set_kernel_stack = (void *)rel->eip;
+ paravirt_ops.load_esp0 = vmi_load_esp0;
+ }
+
+ para_fill(set_iopl_mask, SetIOPLMask);
+ paravirt_ops.io_delay = (void *)vmi_nop;
+ if (!disable_nodelay) {
+ paravirt_ops.const_udelay = (void *)vmi_nop;
+ }
+
+ para_fill(set_lazy_mode, SetLazyMode);
+
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_FlushTLB);
+ if (rel->type != VMI_RELOCATION_NONE) {
+ vmi_ops.flush_tlb = (void *)rel->eip;
+ paravirt_ops.flush_tlb_user = vmi_flush_tlb_user;
+ paravirt_ops.flush_tlb_kernel = vmi_flush_tlb_kernel;
+ }
+ para_fill(flush_tlb_single, InvalPage);
+
+ /*
+ * Until a standard flag format can be agreed on, we need to
+ * implement these as wrappers in Linux. Get the VMI ROM
+ * function pointers for the two backend calls.
+ */
+#ifdef CONFIG_X86_PAE
+ vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
+ vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
+#else
+ vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
+ vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
+#endif
+ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
+ vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
+ vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
+
+ paravirt_ops.alloc_pt = vmi_allocate_pt;
+ paravirt_ops.alloc_pd = vmi_allocate_pd;
+ paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+ paravirt_ops.release_pt = vmi_release_pt;
+ paravirt_ops.release_pd = vmi_release_pd;
+ paravirt_ops.set_pte = vmi_set_pte;
+ paravirt_ops.set_pte_at = vmi_set_pte_at;
+ paravirt_ops.set_pmd = vmi_set_pmd;
+ paravirt_ops.pte_update = vmi_update_pte;
+ paravirt_ops.pte_update_defer = vmi_update_pte_defer;
+#ifdef CONFIG_X86_PAE
+ paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
+ paravirt_ops.set_pte_present = vmi_set_pte_present;
+ paravirt_ops.set_pud = vmi_set_pud;
+ paravirt_ops.pte_clear = vmi_pte_clear;
+ paravirt_ops.pmd_clear = vmi_pmd_clear;
+#endif
+ /*
+ * These MUST always be patched. Don't support indirect jumps
+ * through these operations, as the VMI interface may use either
+ * a jump or a call to get to these operations, depending on
+ * the backend. They are performance critical anyway, so requiring
+ * a patch is not a big problem.
+ */
+ paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
+ paravirt_ops.iret = (void *)0xbadbab0;
+
+#ifdef CONFIG_SMP
+ paravirt_ops.startup_ipi_hook = vmi_startup_ipi_hook;
+ vmi_ops.set_initial_ap_state = vmi_get_function(VMI_CALL_SetInitialAPState);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ paravirt_ops.apic_read = vmi_get_function(VMI_CALL_APICRead);
+ paravirt_ops.apic_write = vmi_get_function(VMI_CALL_APICWrite);
+ paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite);
+#endif
+
+ /*
+ * Check for VMI timer functionality by probing for a cycle frequency method
+ */
+ reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
+ if (rel->type != VMI_RELOCATION_NONE) {
+ vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
+ vmi_timer_ops.get_cycle_counter =
+ vmi_get_function(VMI_CALL_GetCycleCounter);
+ vmi_timer_ops.get_wallclock =
+ vmi_get_function(VMI_CALL_GetWallclockTime);
+ vmi_timer_ops.wallclock_updated =
+ vmi_get_function(VMI_CALL_WallclockUpdated);
+ vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
+ vmi_timer_ops.cancel_alarm =
+ vmi_get_function(VMI_CALL_CancelAlarm);
+ paravirt_ops.time_init = vmi_time_init;
+ paravirt_ops.get_wallclock = vmi_get_wallclock;
+ paravirt_ops.set_wallclock = vmi_set_wallclock;
+#ifdef CONFIG_X86_LOCAL_APIC
+ paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
+ paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
+#endif
+ custom_sched_clock = vmi_sched_clock;
+ }
+
+ /*
+ * Alternative instruction rewriting doesn't happen soon enough
+ * to convert VMI_IRET to a call instead of a jump; so we have
+ * to do this before IRQs get reenabled. Fortunately, it is
+ * idempotent.
+ */
+ apply_paravirt(__start_parainstructions, __stop_parainstructions);
+
+ vmi_bringup();
+
+ return 1;
+}
+
+#undef para_fill
+
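+/*
+ * Main entry point: locate and validate the VMI ROM, reserve its
+ * virtual address range at the top of memory, then activate the
+ * paravirt-ops with interrupts disabled.
+ */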
+void __init vmi_init(void)
+{
+ unsigned long flags;
+
+ if (!vmi_rom)
+ probe_vmi_rom();
+ else
+ check_vmi_rom(vmi_rom);
+
+ /* In case probing for or validating the ROM failed, bail */
+ if (!vmi_rom)
+ return;
+
+ reserve_top_address(-vmi_rom->virtual_top);
+
+ local_irq_save(flags);
+ activate_vmi();
+#ifdef CONFIG_SMP
+ no_timer_check = 1;
+#endif
+ local_irq_restore(flags & X86_EFLAGS_IF);
+}
+
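+/*
+ * "vmi=" early parameter: disable delay elision or mask individual
+ * CPU features (PGE, PSE, SEP, TSC, MTRR) from both the boot CPU
+ * capability bits and the CPUID results reported to the kernel.
+ */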
+static int __init parse_vmi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "disable_nodelay"))
+ disable_nodelay = 1;
+ else if (!strcmp(arg, "disable_pge")) {
+ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ disable_pge = 1;
+ } else if (!strcmp(arg, "disable_pse")) {
+ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+ disable_pse = 1;
+ } else if (!strcmp(arg, "disable_sep")) {
+ clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
+ disable_sep = 1;
+ } else if (!strcmp(arg, "disable_tsc")) {
+ clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+ disable_tsc = 1;
+ } else if (!strcmp(arg, "disable_mtrr")) {
+ clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
+ disable_mtrr = 1;
+ }
+ return 0;
+}
+
+early_param("vmi", parse_vmi);
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
new file mode 100644
index 00000000000..76d2adcae5a
--- /dev/null
+++ b/arch/i386/kernel/vmitime.c
@@ -0,0 +1,499 @@
+/*
+ * VMI paravirtual timer support routines.
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to dhecht@vmware.com
+ *
+ */
+
+/*
+ * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
+ * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
+ * See comments there for proper credits.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/rcupdate.h>
+#include <linux/clocksource.h>
+
+#include <asm/timer.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/div64.h>
+#include <asm/timer.h>
+#include <asm/desc.h>
+
+#include <asm/vmi.h>
+#include <asm/vmi_time.h>
+
+#include <mach_timer.h>
+#include <io_ports.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
+#else
+#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
+#endif
+
+/* Cached VMI operations */
+struct vmi_timer_ops vmi_timer_ops;
+
+#ifdef CONFIG_NO_IDLE_HZ
+
+/* /proc/sys/kernel/hz_timer state. */
+int sysctl_hz_timer;
+
+/* Some stats */
+static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
+static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
+static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
+
+#endif /* CONFIG_NO_IDLE_HZ */
+
+/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */
+static int alarm_hz = CONFIG_VMI_ALARM_HZ;
+
+/* Cache of the value get_cycle_frequency / HZ. */
+static signed long long cycles_per_jiffy;
+
+/* Cache of the value get_cycle_frequency / alarm_hz. */
+static signed long long cycles_per_alarm;
+
+/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
+ * Protected by xtime_lock. */
+static unsigned long long real_cycles_accounted_system;
+
+/* The number of cycles accounted for by update_process_times(), per cpu. */
+static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
+
+/* The number of stolen cycles accounted, per cpu. */
+static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
+
+/* Clock source. */
+static cycle_t read_real_cycles(void)
+{
+ return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+}
+
+static cycle_t read_available_cycles(void)
+{
+ return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+}
+
+#if 0
+static cycle_t read_stolen_cycles(void)
+{
+ return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
+}
+#endif /* 0 */
+
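+/*
+ * Clocksource backed by the hypervisor's real cycle counter; the
+ * high rating (450) makes it the preferred clocksource whenever VMI
+ * timer support is present.
+ */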
+static struct clocksource clocksource_vmi = {
+ .name = "vmi-timer",
+ .rating = 450,
+ .read = read_real_cycles,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 0, /* to be set */
+ .shift = 22,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+
+/* Timer interrupt handler. */
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
+
+static struct irqaction vmi_timer_irq = {
+ vmi_timer_interrupt,
+ SA_INTERRUPT,
+ CPU_MASK_NONE,
+ "VMI-alarm",
+ NULL,
+ NULL
+};
+
+/* Alarm rate */
+static int __init vmi_timer_alarm_rate_setup(char* str)
+{
+ int alarm_rate;
+ if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
+ alarm_hz = alarm_rate;
+ printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
+ }
+ return 1;
+}
+__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
+
+
+/* Initialization */
+static void vmi_get_wallclock_ts(struct timespec *ts)
+{
+ unsigned long long wallclock;
+ wallclock = vmi_timer_ops.get_wallclock(); /* nsec units */
+ ts->tv_nsec = do_div(wallclock, 1000000000);
+ ts->tv_sec = wallclock;
+}
+
+static void update_xtime_from_wallclock(void)
+{
+ struct timespec ts;
+ vmi_get_wallclock_ts(&ts);
+ do_settimeofday(&ts);
+}
+
+unsigned long vmi_get_wallclock(void)
+{
+ struct timespec ts;
+ vmi_get_wallclock_ts(&ts);
+ return ts.tv_sec;
+}
+
+int vmi_set_wallclock(unsigned long now)
+{
+ return -1;
+}
+
+unsigned long long vmi_sched_clock(void)
+{
+ return read_available_cycles();
+}
+
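+/*
+ * One-time timer setup: take over IRQ0, derive cycles per
+ * jiffy/alarm/msec from the hypervisor's cycle frequency, register
+ * the clocksource, silence the PIT and arm the first periodic alarm.
+ */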
+void __init vmi_time_init(void)
+{
+ unsigned long long cycles_per_sec, cycles_per_msec;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ setup_irq(0, &vmi_timer_irq);
+#ifdef CONFIG_X86_LOCAL_APIC
+ set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
+#endif
+
+ no_sync_cmos_clock = 1;
+
+ vmi_get_wallclock_ts(&xtime);
+ set_normalized_timespec(&wall_to_monotonic,
+ -xtime.tv_sec, -xtime.tv_nsec);
+
+ real_cycles_accounted_system = read_real_cycles();
+ update_xtime_from_wallclock();
+ per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
+
+ cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
+
+ cycles_per_jiffy = cycles_per_sec;
+ (void)do_div(cycles_per_jiffy, HZ);
+ cycles_per_alarm = cycles_per_sec;
+ (void)do_div(cycles_per_alarm, alarm_hz);
+ cycles_per_msec = cycles_per_sec;
+ (void)do_div(cycles_per_msec, 1000);
+ cpu_khz = cycles_per_msec;
+
+ printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;"
+ "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
+ cycles_per_alarm);
+
+ clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
+ clocksource_vmi.shift);
+ if (clocksource_register(&clocksource_vmi))
+ printk(KERN_WARNING "Error registering VMITIME clocksource.");
+
+ /* Disable PIT. */
+ outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
+
+ /* Schedule the alarm. Do this in phase with process_times_cycles_accounted_cpu
+ * to reduce the latency of calling update_process_times. */
+ vmi_timer_ops.set_alarm(
+ VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+ per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
+ cycles_per_alarm);
+
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+void __init vmi_timer_setup_boot_alarm(void)
+{
+ local_irq_disable();
+
+ /* Route the interrupt to the correct vector. */
+ apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
+
+ /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
+ vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
+ vmi_timer_ops.set_alarm(
+ VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+ per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
+ cycles_per_alarm);
+ local_irq_enable();
+}
+
+/* Initialize the time accounting variables for an AP on an SMP system.
+ * Also, set the local alarm for the AP. */
+void __init vmi_timer_setup_secondary_alarm(void)
+{
+ int cpu = smp_processor_id();
+
+ /* Route the interrupt to the correct vector. */
+ apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
+
+ per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
+
+ vmi_timer_ops.set_alarm(
+ VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+ per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
+ cycles_per_alarm);
+}
+
+#endif
+
+/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
+static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
+{
+ long long cycles_not_accounted;
+
+ write_seqlock(&xtime_lock);
+
+ cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
+ while (cycles_not_accounted >= cycles_per_jiffy) {
+ /* system-wide jiffies and wallclock. */
+ do_timer(1);
+
+ cycles_not_accounted -= cycles_per_jiffy;
+ real_cycles_accounted_system += cycles_per_jiffy;
+ }
+
+ if (vmi_timer_ops.wallclock_updated())
+ update_xtime_from_wallclock();
+
+ write_sequnlock(&xtime_lock);
+}
+
+/* Update per-cpu process times. */
+static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
+ unsigned long long cur_process_times_cycles)
+{
+ long long cycles_not_accounted;
+ cycles_not_accounted = cur_process_times_cycles -
+ per_cpu(process_times_cycles_accounted_cpu, cpu);
+
+ while (cycles_not_accounted >= cycles_per_jiffy) {
+ /* Account time to the current process. This includes
+ * calling into the scheduler to decrement the timeslice
+ * and possibly reschedule. */
+ update_process_times(user_mode(regs));
+ /* XXX handle /proc/profile multiplier. */
+ profile_tick(CPU_PROFILING);
+
+ cycles_not_accounted -= cycles_per_jiffy;
+ per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+ }
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+/* Update per-cpu idle times. Used when a no-hz halt is ended. */
+static void vmi_account_no_hz_idle_cycles(int cpu,
+ unsigned long long cur_process_times_cycles)
+{
+ long long cycles_not_accounted;
+ unsigned long no_idle_hz_jiffies = 0;
+
+ cycles_not_accounted = cur_process_times_cycles -
+ per_cpu(process_times_cycles_accounted_cpu, cpu);
+
+ while (cycles_not_accounted >= cycles_per_jiffy) {
+ no_idle_hz_jiffies++;
+ cycles_not_accounted -= cycles_per_jiffy;
+ per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+ }
+ /* Account time to the idle process. */
+ account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
+}
+#endif
+
+/* Update per-cpu stolen time. */
+static void vmi_account_stolen_cycles(int cpu,
+ unsigned long long cur_real_cycles,
+ unsigned long long cur_avail_cycles)
+{
+ long long stolen_cycles_not_accounted;
+ unsigned long stolen_jiffies = 0;
+
+ if (cur_real_cycles < cur_avail_cycles)
+ return;
+
+ stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
+ per_cpu(stolen_cycles_accounted_cpu, cpu);
+
+ while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
+ stolen_jiffies++;
+ stolen_cycles_not_accounted -= cycles_per_jiffy;
+ per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+ }
+ /* HACK: pass NULL to force time onto cpustat->steal. */
+ account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
+}
+
+/* Body of either IRQ0 interrupt handler (UP no local-APIC) or
+ * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */
+static void vmi_local_timer_interrupt(int cpu)
+{
+ unsigned long long cur_real_cycles, cur_process_times_cycles;
+
+ cur_real_cycles = read_real_cycles();
+ cur_process_times_cycles = read_available_cycles();
+ /* Update system wide (real) time state (xtime, jiffies). */
+ vmi_account_real_cycles(cur_real_cycles);
+ /* Update per-cpu process times. */
+ vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles);
+ /* Update time stolen from this cpu by the hypervisor. */
+ vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+
+/* Must be called only from idle loop, with interrupts disabled. */
+int vmi_stop_hz_timer(void)
+{
+ /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
+
+ unsigned long seq, next;
+ unsigned long long real_cycles_expiry;
+ int cpu = smp_processor_id();
+ int idle;
+
+ BUG_ON(!irqs_disabled());
+ if (sysctl_hz_timer != 0)
+ return 0;
+
+ cpu_set(cpu, nohz_cpu_mask);
+ smp_mb();
+ if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
+ (next = next_timer_interrupt(), time_before_eq(next, jiffies))) {
+ cpu_clear(cpu, nohz_cpu_mask);
+ next = jiffies;
+ idle = 0;
+ } else
+ idle = 1;
+
+ /* Convert jiffies to the real cycle counter. */
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ real_cycles_expiry = real_cycles_accounted_system +
+ (long)(next - jiffies) * cycles_per_jiffy;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ /* This cpu is going idle. Disable the periodic alarm. */
+ if (idle) {
+ vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
+ per_cpu(idle_start_jiffies, cpu) = jiffies;
+ }
+
+ /* Set the real time alarm to expire at the next event. */
+ vmi_timer_ops.set_alarm(
+ VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
+ real_cycles_expiry, 0);
+
+ return idle;
+}
+
+static void vmi_reenable_hz_timer(int cpu)
+{
+ /* For /proc/vmi/info idle_hz stat. */
+ per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
+ per_cpu(vmi_idle_no_hz_irqs, cpu)++;
+
+ /* Don't bother explicitly cancelling the one-shot alarm -- at
+ * worst we will receive a spurious timer interrupt. */
+ vmi_timer_ops.set_alarm(
+ VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+ per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
+ cycles_per_alarm);
+ /* Indicate this cpu is no longer nohz idle. */
+ cpu_clear(cpu, nohz_cpu_mask);
+}
+
+/* Called from interrupt handlers when (local) HZ timer is disabled. */
+void vmi_account_time_restart_hz_timer(void)
+{
+ unsigned long long cur_real_cycles, cur_process_times_cycles;
+ int cpu = smp_processor_id();
+
+ BUG_ON(!irqs_disabled());
+ /* Account the time during which the HZ timer was disabled. */
+ cur_real_cycles = read_real_cycles();
+ cur_process_times_cycles = read_available_cycles();
+ /* Update system wide (real) time state (xtime, jiffies). */
+ vmi_account_real_cycles(cur_real_cycles);
+ /* Update per-cpu idle times. */
+ vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles);
+ /* Update time stolen from this cpu by the hypervisor. */
+ vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
+ /* Reenable the hz timer. */
+ vmi_reenable_hz_timer(cpu);
+}
+
+#endif /* CONFIG_NO_IDLE_HZ */
+
+/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
+ * Handler for IRQ0. Not used on SMP or with X86_LOCAL_APIC once the
+ * APIC is set up and vmi_timer_setup_boot_alarm() has been called. */
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
+{
+ vmi_local_timer_interrupt(smp_processor_id());
+ return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector.
+ * Also used in UP when CONFIG_X86_LOCAL_APIC.
+ * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
+void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ int cpu = smp_processor_id();
+
+ /*
+ * the NMI deadlock-detector uses this.
+ */
+ per_cpu(irq_stat,cpu).apic_timer_irqs++;
+
+ /*
+ * NOTE! We'd better ACK the irq immediately,
+ * because timer handling can be slow.
+ */
+ ack_APIC_irq();
+
+ /*
+ * update_process_times() expects us to have done irq_enter().
+ * Besides, if we don't, timer interrupts ignore the global
+ * interrupt lock, which is the WrongThing (tm) to do.
+ */
+ irq_enter();
+ vmi_local_timer_interrupt(cpu);
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+
+#endif /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index a53c8b1854b..ca51610955d 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -37,9 +37,14 @@ SECTIONS
{
. = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
phys_startup_32 = startup_32 - LOAD_OFFSET;
+
+ .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+ _text = .; /* Text and read-only data */
+ *(.text.head)
+ } :text = 0x9090
+
/* read-only */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
- _text = .; /* Text and read-only data */
*(.text)
SCHED_TEXT
LOCK_TEXT
@@ -181,12 +186,14 @@ SECTIONS
from .altinstructions and .eh_frame */
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
+#if defined(CONFIG_BLK_DEV_INITRD)
. = ALIGN(4096);
.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
__initramfs_start = .;
*(.init.ramfs)
__initramfs_end = .;
}
+#endif
. = ALIGN(L1_CACHE_BYTES);
.data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
__per_cpu_start = .;