From cb3c8b9003f15efa4a750a32d2d602d40cc45d5a Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Wed, 19 Mar 2008 14:25:59 -0300 Subject: x86: integrate do_boot_cpu This is a very large patch, because it depends on a lot of auxiliary static functions. But they all have been modified to the point that they're sufficiently close now. So they're just merged in smpboot.c Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 588 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 588 insertions(+) (limited to 'arch/x86/kernel/smpboot.c') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5bff87e9989..69c17965f48 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -4,14 +4,42 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +#include +#include +#include +#include +#include #include +#include +#include + +/* Store all idle threads, this can be reused instead of creating +* a new thread. Also avoids complicated thread destroy functionality +* for idle threads. +*/ +#ifdef CONFIG_HOTPLUG_CPU +/* + * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is + * removed after init for !CONFIG_HOTPLUG_CPU. + */ +static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); +#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) +#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) +#else +struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; +#define get_idle_for_cpu(x) (idle_thread_array[(x)]) +#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) +#endif /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -41,6 +69,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +static atomic_t init_deasserted; + /* ready for x86_64, no harm for x86, since it will overwrite after alloc */ unsigned char *trampoline_base = __va(SMP_TRAMPOLINE_BASE); @@ -110,6 +140,96 @@ void unmap_cpu_to_logical_apicid(int cpu) #define map_cpu_to_logical_apicid() do {} while (0) #endif +/* + * Report back to the Boot Processor. + * Running on AP. + */ +void __cpuinit smp_callin(void) +{ + int cpuid, phys_id; + unsigned long timeout; + + /* + * If waken up by an INIT in an 82489DX configuration + * we may get here before an INIT-deassert IPI reaches + * our local APIC. We have to wait for the IPI or we'll + * lock up on an APIC access. + */ + wait_for_init_deassert(&init_deasserted); + + /* + * (This works even if the APIC is not enabled.) + */ + phys_id = GET_APIC_ID(apic_read(APIC_ID)); + cpuid = smp_processor_id(); + if (cpu_isset(cpuid, cpu_callin_map)) { + panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, + phys_id, cpuid); + } + Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); + + /* + * STARTUP IPIs are fragile beasts as they might sometimes + * trigger some glue motherboard logic. Complete APIC bus + * silence for 1 second, this overestimates the time the + * boot CPU is spending to send the up to 2 STARTUP IPIs + * by a factor of two. This should be enough. + */ + + /* + * Waiting 2s total for startup (udelay is not yet working) + */ + timeout = jiffies + 2*HZ; + while (time_before(jiffies, timeout)) { + /* + * Has the boot CPU finished it's STARTUP sequence? + */ + if (cpu_isset(cpuid, cpu_callout_map)) + break; + cpu_relax(); + } + + if (!time_before(jiffies, timeout)) { + panic("%s: CPU%d started up but did not get a callout!\n", + __func__, cpuid); + } + + /* + * the boot CPU has finished the init stage and is spinning + * on callin_map until we finish. We are free to set up this + * CPU, first the APIC. (this is probably redundant on most + * boards) + */ + + Dprintk("CALLIN, before setup_local_APIC().\n"); + smp_callin_clear_local_apic(); + setup_local_APIC(); + end_local_APIC_setup(); + map_cpu_to_logical_apicid(); + + /* + * Get our bogomips. + * + * Need to enable IRQs because it can take longer and then + * the NMI watchdog might kill us. + */ + local_irq_enable(); + calibrate_delay(); + local_irq_disable(); + Dprintk("Stack at about %p\n", &cpuid); + + /* + * Save our processor parameters + */ + smp_store_cpu_info(cpuid); + + /* + * Allow the master to continue. + */ + cpu_set(cpuid, cpu_callin_map); +} + + static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 @@ -327,6 +447,474 @@ void impress_friends(void) Dprintk("Before bogocount - setting activated=1.\n"); } +static inline void __inquire_remote_apic(int apicid) +{ + unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; + char *names[] = { "ID", "VERSION", "SPIV" }; + int timeout; + u32 status; + + printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); + + /* + * Wait for idle. + */ + status = safe_apic_wait_icr_idle(); + if (status) + printk(KERN_CONT + "a previous APIC delivery may have failed\n"); + + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + + timeout = 0; + do { + udelay(100); + status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; + } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); + + switch (status) { + case APIC_ICR_RR_VALID: + status = apic_read(APIC_RRR); + printk(KERN_CONT "%08x\n", status); + break; + default: + printk(KERN_CONT "failed\n"); + } + } +} + +#ifdef WAKE_SECONDARY_VIA_NMI +/* + * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal + * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this + * won't ... remember to clear down the APIC, etc later. + */ +static int __devinit +wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) +{ + unsigned long send_status, accept_status = 0; + int maxlvt; + + /* Target chip */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); + + /* Boot on the stack */ + /* Kick the second */ + apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); + + Dprintk("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + /* + * Due to the Pentium erratum 3AP. + */ + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) { + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + } + accept_status = (apic_read(APIC_ESR) & 0xEF); + Dprintk("NMI sent.\n"); + + if (send_status) + printk(KERN_ERR "APIC never delivered???\n"); + if (accept_status) + printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + + return (send_status | accept_status); +} +#endif /* WAKE_SECONDARY_VIA_NMI */ + +extern void start_secondary(void *unused); +#ifdef WAKE_SECONDARY_VIA_INIT +static int __devinit +wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) +{ + unsigned long send_status, accept_status = 0; + int maxlvt, num_starts, j; + + /* + * Be paranoid about clearing APIC errors. + */ + if (APIC_INTEGRATED(apic_version[phys_apicid])) { + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } + + Dprintk("Asserting INIT.\n"); + + /* + * Turn INIT on target chip + */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + + /* + * Send IPI + */ + apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT + | APIC_DM_INIT); + + Dprintk("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + mdelay(10); + + Dprintk("Deasserting INIT.\n"); + + /* Target chip */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + + /* Send IPI */ + apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + + Dprintk("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + mb(); + atomic_set(&init_deasserted, 1); + + /* + * Should we send STARTUP IPIs ? + * + * Determine this based on the APIC version. + * If we don't have an integrated APIC, don't send the STARTUP IPIs. + */ + if (APIC_INTEGRATED(apic_version[phys_apicid])) + num_starts = 2; + else + num_starts = 0; + + /* + * Paravirt / VMI wants a startup IPI hook here to set up the + * target processor state. + */ + startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, +#ifdef CONFIG_X86_64 + (unsigned long)init_rsp); +#else + (unsigned long)stack_start.sp); +#endif + + /* + * Run STARTUP IPI loop. + */ + Dprintk("#startup loops: %d.\n", num_starts); + + maxlvt = lapic_get_maxlvt(); + + for (j = 1; j <= num_starts; j++) { + Dprintk("Sending STARTUP #%d.\n", j); + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + Dprintk("After apic_write.\n"); + + /* + * STARTUP IPI + */ + + /* Target chip */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + + /* Boot on the stack */ + /* Kick the second */ + apic_write_around(APIC_ICR, APIC_DM_STARTUP + | (start_eip >> 12)); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(300); + + Dprintk("Startup point 1.\n"); + + Dprintk("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + /* + * Due to the Pentium erratum 3AP. + */ + if (maxlvt > 3) { + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + } + accept_status = (apic_read(APIC_ESR) & 0xEF); + if (send_status || accept_status) + break; + } + Dprintk("After Startup.\n"); + + if (send_status) + printk(KERN_ERR "APIC never delivered???\n"); + if (accept_status) + printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + + return (send_status | accept_status); +} +#endif /* WAKE_SECONDARY_VIA_INIT */ + +struct create_idle { + struct work_struct work; + struct task_struct *idle; + struct completion done; + int cpu; +}; + +static void __cpuinit do_fork_idle(struct work_struct *work) +{ + struct create_idle *c_idle = + container_of(work, struct create_idle, work); + + c_idle->idle = fork_idle(c_idle->cpu); + complete(&c_idle->done); +} + +static int __cpuinit do_boot_cpu(int apicid, int cpu) +/* + * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad + * (ie clustered apic addressing mode), this is a LOGICAL apic ID. + * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu. + */ +{ + unsigned long boot_error = 0; + int timeout; + unsigned long start_ip; + unsigned short nmi_high = 0, nmi_low = 0; + struct create_idle c_idle = { + .cpu = cpu, + .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), + }; + INIT_WORK(&c_idle.work, do_fork_idle); +#ifdef CONFIG_X86_64 + /* allocate memory for gdts of secondary cpus. Hotplug is considered */ + if (!cpu_gdt_descr[cpu].address && + !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { + printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu); + return -1; + } + + /* Allocate node local memory for AP pdas */ + if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { + struct x8664_pda *newpda, *pda; + int node = cpu_to_node(cpu); + pda = cpu_pda(cpu); + newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, + node); + if (newpda) { + memcpy(newpda, pda, sizeof(struct x8664_pda)); + cpu_pda(cpu) = newpda; + } else + printk(KERN_ERR + "Could not allocate node local PDA for CPU %d on node %d\n", + cpu, node); + } +#endif + + alternatives_smp_switch(1); + + c_idle.idle = get_idle_for_cpu(cpu); + + /* + * We can't use kernel_thread since we must avoid to + * reschedule the child. + */ + if (c_idle.idle) { + c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); + init_idle(c_idle.idle, cpu); + goto do_rest; + } + + if (!keventd_up() || current_is_keventd()) + c_idle.work.func(&c_idle.work); + else { + schedule_work(&c_idle.work); + wait_for_completion(&c_idle.done); + } + + if (IS_ERR(c_idle.idle)) { + printk("failed fork for CPU %d\n", cpu); + return PTR_ERR(c_idle.idle); + } + + set_idle_for_cpu(cpu, c_idle.idle); +do_rest: +#ifdef CONFIG_X86_32 + per_cpu(current_task, cpu) = c_idle.idle; + init_gdt(cpu); + early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + c_idle.idle->thread.ip = (unsigned long) start_secondary; + /* Stack for startup_32 can be just as for start_secondary onwards */ + stack_start.sp = (void *) c_idle.idle->thread.sp; + irq_ctx_init(cpu); +#else + cpu_pda(cpu)->pcurrent = c_idle.idle; + init_rsp = c_idle.idle->thread.sp; + load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread); + initial_code = (unsigned long)start_secondary; + clear_tsk_thread_flag(c_idle.idle, TIF_FORK); +#endif + + /* start_ip had better be page-aligned! */ + start_ip = setup_trampoline(); + + /* So we see what's up */ + printk(KERN_INFO "Booting processor %d/%d ip %lx\n", + cpu, apicid, start_ip); + + /* + * This grunge runs the startup process for + * the targeted processor. + */ + + atomic_set(&init_deasserted, 0); + + Dprintk("Setting warm reset code and vector.\n"); + + store_NMI_vector(&nmi_high, &nmi_low); + + smpboot_setup_warm_reset_vector(start_ip); + /* + * Be paranoid about clearing APIC errors. + */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + + + /* + * Starting actual IPI sequence... + */ + boot_error = wakeup_secondary_cpu(apicid, start_ip); + + if (!boot_error) { + /* + * allow APs to start initializing. + */ + Dprintk("Before Callout %d.\n", cpu); + cpu_set(cpu, cpu_callout_map); + Dprintk("After Callout %d.\n", cpu); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_isset(cpu, cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + + if (cpu_isset(cpu, cpu_callin_map)) { + /* number CPUs logically, starting from 1 (BSP is 0) */ + Dprintk("OK.\n"); + printk(KERN_INFO "CPU%d: ", cpu); + print_cpu_info(&cpu_data(cpu)); + Dprintk("CPU has booted.\n"); + } else { + boot_error = 1; + if (*((volatile unsigned char *)trampoline_base) + == 0xA5) + /* trampoline started but...? */ + printk(KERN_ERR "Stuck ??\n"); + else + /* trampoline code not run */ + printk(KERN_ERR "Not responding.\n"); + inquire_remote_apic(apicid); + } + } + + if (boot_error) { + /* Try to put things back the way they were before ... */ + unmap_cpu_to_logical_apicid(cpu); +#ifdef CONFIG_X86_64 + clear_node_cpumask(cpu); /* was set by numa_add_cpu */ +#endif + cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ + cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ + cpu_clear(cpu, cpu_possible_map); + cpu_clear(cpu, cpu_present_map); + per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; + } + + /* mark "stuck" area as not stuck */ + *((volatile unsigned long *)trampoline_base) = 0; + + return boot_error; +} + +int __cpuinit native_cpu_up(unsigned int cpu) +{ + int apicid = cpu_present_to_apicid(cpu); + unsigned long flags; + int err; + + WARN_ON(irqs_disabled()); + + Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); + + if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || + !physid_isset(apicid, phys_cpu_present_map)) { + printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); + return -EINVAL; + } + + /* + * Already booted CPU? + */ + if (cpu_isset(cpu, cpu_callin_map)) { + Dprintk("do_boot_cpu %d Already started\n", cpu); + return -ENOSYS; + } + + /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + +#ifdef CONFIG_X86_32 + /* init low mem mapping */ + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); + flush_tlb_all(); +#endif + + err = do_boot_cpu(apicid, cpu); + if (err < 0) { + Dprintk("do_boot_cpu failed %d\n", err); + return err; + } + + /* + * Check TSC synchronization with the AP (keep irqs disabled + * while doing so): + */ + local_irq_save(flags); + check_tsc_sync_source(cpu); + local_irq_restore(flags); + + while (!cpu_isset(cpu, cpu_online_map)) { + cpu_relax(); + touch_nmi_watchdog(); + } + + return 0; +} + #ifdef CONFIG_HOTPLUG_CPU void remove_siblinginfo(int cpu) { -- cgit v1.2.3