From 89b831ef8bf5cfbb357dbc0a2e07700d7f20eec5 Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Sat, 5 Nov 2005 17:25:53 +0100 Subject: [PATCH] x86_64: Support for AMD specific MCE Threshold. MC4_MISC - DRAM Errors Threshold Register realized under AMD K8 Rev F. This register is used to count correctable and uncorrectable ECC errors that occur during DRAM read operations. The user may interface through sysfs files in order to change the threshold configuration. bank%d/error_count - reads current error count, write to clear. bank%d/interrupt_enable - set/clear interrupt enable. bank%d/threshold_limit - read/write the threshold limit. APIC vector 0xF9 in hw_irq.h. 5 software defined bank ids in mce.h. new apic.c function to setup threshold apic lvt. defaults to interrupt off, count enabled, and threshold limit max. sysfs interface created on /sys/devices/system/threshold. AK: added some ifdefs to make it compile on UP Signed-off-by: Jacob Shin Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/Makefile | 1 + arch/x86_64/kernel/apic.c | 10 + arch/x86_64/kernel/entry.S | 3 + arch/x86_64/kernel/i8259.c | 2 + arch/x86_64/kernel/mce.c | 3 + arch/x86_64/kernel/mce_amd.c | 538 +++++++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/traps.c | 4 + 7 files changed, 561 insertions(+) create mode 100644 arch/x86_64/kernel/mce_amd.c (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 14328cab5d3..fe4cbd1c4b2 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -11,6 +11,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o +obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_X86_MSR) += msr.o diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index b6e7715d877..18691ce4c75 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -833,6 +833,16 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } +#ifdef CONFIG_X86_MCE_AMD +void setup_threshold_lvt(unsigned long lvt_off) +{ + unsigned int v = 0; + unsigned long reg = (lvt_off << 4) + 0x500; + v |= THRESHOLD_APIC_VECTOR; + apic_write(reg, v); +} +#endif /* CONFIG_X86_MCE_AMD */ + #undef APIC_DIVISOR /* diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 7937971d185..9ff42041bb6 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -612,6 +612,9 @@ retint_kernel: ENTRY(thermal_interrupt) apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt +ENTRY(threshold_interrupt) + apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt + #ifdef CONFIG_SMP ENTRY(reschedule_interrupt) apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index c6c9791d77c..5de30035e54 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -492,6 +492,7 @@ void invalidate_interrupt5(void); void invalidate_interrupt6(void); void invalidate_interrupt7(void); void thermal_interrupt(void); +void threshold_interrupt(void); void i8254_timer_resume(void); static void setup_timer_hardware(void) @@ -580,6 +581,7 @@ void __init init_IRQ(void) set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); #endif set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #ifdef 
CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 69541db5ff2..cf8a76f0f47 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -356,6 +356,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) case X86_VENDOR_INTEL: mce_intel_feature_init(c); break; + case X86_VENDOR_AMD: + mce_amd_feature_init(c); + break; default: break; } diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c new file mode 100644 index 00000000000..1f76175ace0 --- /dev/null +++ b/arch/x86_64/kernel/mce_amd.c @@ -0,0 +1,538 @@ +/* + * (c) 2005 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Written by Jacob Shin - AMD, Inc. + * + * Support : jacob.shin@amd.com + * + * MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F. + * MC4_MISC0 exists per physical processor. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PFX "mce_threshold: " +#define VERSION "version 1.00.9" +#define NR_BANKS 5 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 +#define MASK_ERR_COUNT_HI 0x00000FFF +#define MASK_OVERFLOW 0x0001000000000000L + +struct threshold_bank { + unsigned int cpu; + u8 bank; + u8 interrupt_enable; + u16 threshold_limit; + struct kobject kobj; +}; + +static struct threshold_bank threshold_defaults = { + .interrupt_enable = 0, + .threshold_limit = THRESHOLD_MAX, +}; + +#ifdef CONFIG_SMP +static unsigned char shared_bank[NR_BANKS] = { + 0, 0, 0, 0, 1 +}; +#endif + +static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ + +/* + * CPU Initialization + */ + +/* must be called with correct cpu affinity */ +static void threshold_restart_bank(struct threshold_bank *b, + int reset, u16 old_limit) +{ + u32 mci_misc_hi, mci_misc_lo; + + rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); + + if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + reset = 1; /* limit cannot be lower than err count */ + + if (reset) { /* reset err count and overflow bit */ + mci_misc_hi = + (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + (THRESHOLD_MAX - b->threshold_limit); + } else if (old_limit) { /* change limit w/o reset */ + int new_count = (mci_misc_hi & THRESHOLD_MAX) + + (old_limit - b->threshold_limit); + mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + (new_count & THRESHOLD_MAX); + } + + b->interrupt_enable ? 
+ (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : + (mci_misc_hi &= ~MASK_INT_TYPE_HI); + + mci_misc_hi |= MASK_COUNT_EN_HI; + wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); +} + +void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) +{ + int bank; + u32 mci_misc_lo, mci_misc_hi; + unsigned int cpu = smp_processor_id(); + + for (bank = 0; bank < NR_BANKS; ++bank) { + rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi); + + /* !valid, !counter present, bios locked */ + if (!(mci_misc_hi & MASK_VALID_HI) || + !(mci_misc_hi & MASK_VALID_HI >> 1) || + (mci_misc_hi & MASK_VALID_HI >> 2)) + continue; + + per_cpu(bank_map, cpu) |= (1 << bank); + +#ifdef CONFIG_SMP + if (shared_bank[bank] && cpu_core_id[cpu]) + continue; +#endif + + setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20); + threshold_defaults.cpu = cpu; + threshold_defaults.bank = bank; + threshold_restart_bank(&threshold_defaults, 0, 0); + } +} + +/* + * APIC Interrupt Handler + */ + +/* + * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. + * the interrupt goes off when error_count reaches threshold_limit. + * the handler will simply log mcelog w/ software defined bank number. + */ +asmlinkage void mce_threshold_interrupt(void) +{ + int bank; + struct mce m; + + ack_APIC_irq(); + irq_enter(); + + memset(&m, 0, sizeof(m)); + rdtscll(m.tsc); + m.cpu = smp_processor_id(); + + /* assume first bank caused it */ + for (bank = 0; bank < NR_BANKS; ++bank) { + m.bank = MCE_THRESHOLD_BASE + bank; + rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc); + + if (m.misc & MASK_OVERFLOW) { + mce_log(&m); + goto out; + } + } + out: + irq_exit(); +} + +/* + * Sysfs Interface + */ + +static struct sysdev_class threshold_sysclass = { + set_kset_name("threshold"), +}; + +static DEFINE_PER_CPU(struct sys_device, device_threshold); + +struct threshold_attr { + struct attribute attr; + ssize_t(*show) (struct threshold_bank *, char *); + ssize_t(*store) (struct threshold_bank *, const char *, size_t count); +}; + +static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); + +static cpumask_t affinity_set(unsigned int cpu) +{ + cpumask_t oldmask = current->cpus_allowed; + cpumask_t newmask = CPU_MASK_NONE; + cpu_set(cpu, newmask); + set_cpus_allowed(current, newmask); + return oldmask; +} + +static void affinity_restore(cpumask_t oldmask) +{ + set_cpus_allowed(current, oldmask); +} + +#define SHOW_FIELDS(name) \ + static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \ + { \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ + } +SHOW_FIELDS(interrupt_enable) +SHOW_FIELDS(threshold_limit) + +static ssize_t store_interrupt_enable(struct threshold_bank *b, + const char *buf, size_t count) +{ + char *end; + cpumask_t oldmask; + unsigned long new = simple_strtoul(buf, &end, 0); + if (end == buf) + return -EINVAL; + b->interrupt_enable = !!new; + + oldmask = affinity_set(b->cpu); + threshold_restart_bank(b, 0, 0); + affinity_restore(oldmask); + + return end - buf; +} + +static ssize_t store_threshold_limit(struct threshold_bank *b, + const char *buf, size_t count) +{ + char *end; + cpumask_t oldmask; + u16 old; + unsigned long new = simple_strtoul(buf, &end, 0); + if (end == buf) + return -EINVAL; + if (new > THRESHOLD_MAX) + new = THRESHOLD_MAX; + if (new < 1) + new = 1; + old = b->threshold_limit; + b->threshold_limit = new; + + oldmask = affinity_set(b->cpu); + threshold_restart_bank(b, 0, old); + affinity_restore(oldmask); + + return end - buf; +} + 
+static ssize_t show_error_count(struct threshold_bank *b, char *buf) +{ + u32 high, low; + cpumask_t oldmask; + oldmask = affinity_set(b->cpu); + rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */ + affinity_restore(oldmask); + return sprintf(buf, "%x\n", + (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); +} + +static ssize_t store_error_count(struct threshold_bank *b, + const char *buf, size_t count) +{ + cpumask_t oldmask; + oldmask = affinity_set(b->cpu); + threshold_restart_bank(b, 1, 0); + affinity_restore(oldmask); + return 1; +} + +#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +}; + +#define ATTR_FIELDS(name) \ + static struct threshold_attr name = \ + THRESHOLD_ATTR(name, 0644, show_## name, store_## name) + +ATTR_FIELDS(interrupt_enable); +ATTR_FIELDS(threshold_limit); +ATTR_FIELDS(error_count); + +static struct attribute *default_attrs[] = { + &interrupt_enable.attr, + &threshold_limit.attr, + &error_count.attr, + NULL +}; + +#define to_bank(k) container_of(k,struct threshold_bank,kobj) +#define to_attr(a) container_of(a,struct threshold_attr,attr) + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct threshold_bank *b = to_bank(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + ret = a->show ? a->show(b, buf) : -EIO; + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct threshold_bank *b = to_bank(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + ret = a->store ? a->store(b, buf, count) : -EIO; + return ret; +} + +static struct sysfs_ops threshold_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type threshold_ktype = { + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, +}; + +/* symlinks sibling shared banks to first core. first core owns dir/files. 
*/ +static __cpuinit int threshold_create_bank(unsigned int cpu, int bank) +{ + int err = 0; + struct threshold_bank *b = 0; + +#ifdef CONFIG_SMP + if (cpu_core_id[cpu] && shared_bank[bank]) { /* symlink */ + char name[16]; + unsigned lcpu = first_cpu(cpu_core_map[cpu]); + if (cpu_core_id[lcpu]) + goto out; /* first core not up yet */ + + b = per_cpu(threshold_banks, lcpu)[bank]; + if (!b) + goto out; + sprintf(name, "bank%i", bank); + err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj, + &b->kobj, name); + if (err) + goto out; + per_cpu(threshold_banks, cpu)[bank] = b; + goto out; + } +#endif + + b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL); + if (!b) { + err = -ENOMEM; + goto out; + } + memset(b, 0, sizeof(struct threshold_bank)); + + b->cpu = cpu; + b->bank = bank; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; + kobject_set_name(&b->kobj, "bank%i", bank); + b->kobj.parent = &per_cpu(device_threshold, cpu).kobj; + b->kobj.ktype = &threshold_ktype; + + err = kobject_register(&b->kobj); + if (err) { + kfree(b); + goto out; + } + per_cpu(threshold_banks, cpu)[bank] = b; + out: + return err; +} + +/* create dir/files for all valid threshold banks */ +static __cpuinit int threshold_create_device(unsigned int cpu) +{ + int bank; + int err = 0; + + per_cpu(device_threshold, cpu).id = cpu; + per_cpu(device_threshold, cpu).cls = &threshold_sysclass; + err = sysdev_register(&per_cpu(device_threshold, cpu)); + if (err) + goto out; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + goto out; + } + out: + return err; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * let's be hotplug friendly. + * in case of multiple core processors, the first core always takes ownership + * of shared sysfs dir/files, and rest of the cores will be symlinked to it. + */ + +/* cpu hotplug call removes all symlinks before first core dies */ +static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank) +{ + struct threshold_bank *b; + char name[16]; + + b = per_cpu(threshold_banks, cpu)[bank]; + if (!b) + return; + if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) { + sprintf(name, "bank%i", bank); + sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name); + per_cpu(threshold_banks, cpu)[bank] = 0; + } else { + kobject_unregister(&b->kobj); + kfree(per_cpu(threshold_banks, cpu)[bank]); + } +} + +static __cpuinit void threshold_remove_device(unsigned int cpu) +{ + int bank; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) + continue; + threshold_remove_bank(cpu, bank); + } + sysdev_unregister(&per_cpu(device_threshold, cpu)); +} + +/* link all existing siblings when first core comes up */ +static __cpuinit int threshold_create_symlinks(unsigned int cpu) +{ + int bank, err = 0; + unsigned int lcpu = 0; + + if (cpu_core_id[cpu]) + return 0; + for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { + if (lcpu == cpu) + continue; + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) + continue; + if (!shared_bank[bank]) + continue; + err = threshold_create_bank(lcpu, bank); + } + } + return err; +} + +/* remove all symlinks before first core dies. 
*/ +static __cpuinit void threshold_remove_symlinks(unsigned int cpu) +{ + int bank; + unsigned int lcpu = 0; + if (cpu_core_id[cpu]) + return; + for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { + if (lcpu == cpu) + continue; + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) + continue; + if (!shared_bank[bank]) + continue; + threshold_remove_bank(lcpu, bank); + } + } +} +#else /* !CONFIG_HOTPLUG_CPU */ +static __cpuinit void threshold_create_symlinks(unsigned int cpu) +{ +} +static __cpuinit void threshold_remove_symlinks(unsigned int cpu) +{ +} +static void threshold_remove_device(unsigned int cpu) +{ +} +#endif + +/* get notified when a cpu comes on/off */ +static __cpuinit int threshold_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + /* cpu was unsigned int to begin with */ + unsigned int cpu = (unsigned long)hcpu; + + if (cpu >= NR_CPUS) + goto out; + + switch (action) { + case CPU_ONLINE: + threshold_create_device(cpu); + threshold_create_symlinks(cpu); + break; + case CPU_DOWN_PREPARE: + threshold_remove_symlinks(cpu); + break; + case CPU_DOWN_FAILED: + threshold_create_symlinks(cpu); + break; + case CPU_DEAD: + threshold_remove_device(cpu); + break; + default: + break; + } + out: + return NOTIFY_OK; +} + +static struct notifier_block threshold_cpu_notifier = { + .notifier_call = threshold_cpu_callback, +}; + +static __init int threshold_init_device(void) +{ + int err; + int lcpu = 0; + + err = sysdev_class_register(&threshold_sysclass); + if (err) + goto out; + + /* to hit CPUs online before the notifier is up */ + for_each_online_cpu(lcpu) { + err = threshold_create_device(lcpu); + if (err) + goto out; + } + register_cpu_notifier(&threshold_cpu_notifier); + + out: + return err; +} + +device_initcall(threshold_init_device); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index b5e09e6b553..4a836384dd0 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -888,6 +888,10 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task -- cgit v1.2.3 From 6004e1b7effcbb385a6b7c790e4b8008682cf679 Mon Sep 17 00:00:00 2001 From: James Cleverdon Date: Sat, 5 Nov 2005 17:25:53 +0100 Subject: [PATCH] i386/x86-64: Share interrupt vectors when there is a large number of interrupt sources Here's a patch that builds on Natalie Protasevich's IRQ compression patch and tries to work for MPS boots as well as ACPI. It is meant for a 4-node IBM x460 NUMA box, which was dying because it had interrupt pins with GSI numbers > NR_IRQS and thus overflowed irq_desc. The problem is that this system has 270 GSIs (which are 1:1 mapped with I/O APIC RTEs) and an 8-node box would have 540. This is much bigger than NR_IRQS (224 for both i386 and x86_64). Also, there aren't enough vectors to go around. There are about 190 usable vectors, not counting the reserved ones and the unused vectors at 0x20 to 0x2F. So, my patch attempts to compress the GSI range and share vectors by sharing IRQs. 
Cc: "Protasevich, Natalie" Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 80 ++++++++++++++++++++++++++++++++++++++++---- arch/x86_64/kernel/mpparse.c | 2 +- 2 files changed, 74 insertions(+), 8 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index c8eee20cd51..97154ab058b 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -57,7 +57,7 @@ int nr_ioapic_registers[MAX_IO_APICS]; * Rough estimation of how many shared IRQs there are, can * be changed anytime. */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) /* @@ -85,6 +85,7 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; int pin; \ struct irq_pin_list *entry = irq_2_pin + irq; \ \ + BUG_ON(irq >= NR_IRQS); \ for (;;) { \ unsigned int reg; \ pin = entry->pin; \ @@ -127,6 +128,8 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) } #endif +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF }; + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -137,6 +140,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) static int first_free_entry = NR_IRQS; struct irq_pin_list *entry = irq_2_pin + irq; + BUG_ON(irq >= NR_IRQS); while (entry->next) entry = irq_2_pin + entry->next; @@ -144,7 +148,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) entry->next = first_free_entry; entry = irq_2_pin + entry->next; if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: whoops"); + panic("io_apic.c: ran out of irq_2_pin entries!"); } entry->apic = apic; entry->pin = pin; @@ -420,6 +424,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) best_guess = irq; } } + BUG_ON(best_guess >= NR_IRQS); return best_guess; } @@ -610,6 +615,64 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } +static int next_irq = 16; + +/* + * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number + * from ACPI, which can reach 800 in large boxen. + * + * Compact the sparse GSI space into a sequential IRQ series and reuse + * vectors if possible. + */ +int gsi_irq_sharing(int gsi) +{ + int i, tries, vector; + + BUG_ON(gsi >= NR_IRQ_VECTORS); + + if (platform_legacy_irq(gsi)) + return gsi; + + if (gsi_2_irq[gsi] != 0xFF) + return (int)gsi_2_irq[gsi]; + + tries = NR_IRQS; + try_again: + vector = assign_irq_vector(gsi); + + /* + * Sharing vectors means sharing IRQs, so scan irq_vectors for previous + * use of vector and if found, return that IRQ. However, we never want + * to share legacy IRQs, which usually have a different trigger mode + * than PCI. 
+ */ + for (i = 0; i < NR_IRQS; i++) + if (IO_APIC_VECTOR(i) == vector) + break; + if (platform_legacy_irq(i)) { + if (--tries >= 0) { + IO_APIC_VECTOR(i) = 0; + goto try_again; + } + panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi); + } + if (i < NR_IRQS) { + gsi_2_irq[gsi] = i; + printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n", + gsi, vector, i); + return i; + } + + i = next_irq++; + BUG_ON(i >= NR_IRQS); + gsi_2_irq[gsi] = i; + IO_APIC_VECTOR(i) = vector; + printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n", + gsi, vector, i); + return i; +} + static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -639,6 +702,7 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; + irq = gsi_irq_sharing(irq); break; } default: @@ -648,6 +712,7 @@ static int pin_2_irq(int idx, int apic, int pin) break; } } + BUG_ON(irq >= NR_IRQS); /* * PCI IRQ command line redirection. Yes, limits are hardcoded. @@ -663,6 +728,7 @@ static int pin_2_irq(int idx, int apic, int pin) } } } + BUG_ON(irq >= NR_IRQS); return irq; } @@ -690,8 +756,8 @@ int assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - BUG_ON(irq >= NR_IRQ_VECTORS); - if (IO_APIC_VECTOR(irq) > 0) + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) return IO_APIC_VECTOR(irq); next: current_vector += 8; @@ -699,9 +765,8 @@ next: goto next; if (current_vector >= FIRST_SYSTEM_VECTOR) { - offset++; - if (!(offset%8)) - return -ENOSPC; + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; current_vector = FIRST_DEVICE_VECTOR + offset; } @@ -1917,6 +1982,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a entry.polarity = active_high_low; entry.mask = 1; /* Disabled (masked) */ + irq = gsi_irq_sharing(irq); /* * IRQs < 16 are already in the irq_2_pin[] map */ diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index f16d38d09da..8f6958e7945 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -218,7 +218,7 @@ static void __init MP_intsrc_info (struct mpc_config_intsrc *m) m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); - if (++mp_irq_entries == MAX_IRQ_SOURCES) + if (++mp_irq_entries >= MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); } -- cgit v1.2.3 From 47492d3667ec519172ab978bd8231b8c7152fa9d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:53 +0100 Subject: [PATCH] x86_64: Use the DMA32 zone for dma_alloc_coherent()/pci_alloc_consistent Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/pci-gart.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 88be97c9698..2e28e855ec3 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -220,6 +220,12 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, uses the normal dma_mask for alloc_coherent. */ dma_mask &= *dev->dma_mask; + /* Why <=? Even when the mask is smaller than 4GB it is often larger + than 16MB and in this case we have a chance of finding fitting memory + in the next higher zone first. If not retry with true GFP_DMA. 
-AK */ + if (dma_mask <= 0xffffffff) + gfp |= GFP_DMA32; + again: memory = dma_alloc_pages(dev, gfp, get_order(size)); if (memory == NULL) @@ -245,7 +251,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, } if (!(gfp & GFP_DMA)) { - gfp |= GFP_DMA; + gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } return NULL; -- cgit v1.2.3 From e9b59d834faf0305cb6214a358f65f72c57b9e99 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sat, 5 Nov 2005 17:25:53 +0100 Subject: [PATCH] x86_64: Remove duplicate __cpuinit define Remove duplicate __cpuinit in smp.c. Already defined in init.h which is already included. Signed-off-by: Ashok Raj Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smp.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 9db9dda161b..493a0d1f127 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -28,8 +28,6 @@ #include #include -#define __cpuinit __init - /* * Smarter SMP flushing macros. * c/o Linus Torvalds. -- cgit v1.2.3 From f5f786d0455c359c554b8f74783f887c0a2c9fac Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Sat, 5 Nov 2005 17:25:53 +0100 Subject: [PATCH] x86-64/i386: Fix CPU model for family 6 According to cpuid instruction in IA32 SDM-Vol2, when computing cpu model, we need to consider extended model ID for family 0x6 also. AK: Also added fixes/simplifcation from Petr Vandrovec Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index da0bc3e7bdf..f27731ac95c 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -1060,10 +1060,10 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86 = (tfms >> 8) & 0xf; c->x86_model = (tfms >> 4) & 0xf; c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) { + if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - } if (c->x86_capability[0] & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; } else { -- cgit v1.2.3 From 50895c5d76e15d8af480eff1aaab5770cabbc2c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:53 +0100 Subject: [PATCH] x86_64: Fix gcc 4 warning in aperture.c Fix arch/x86_64/kernel/aperture.c: In function #iommu_hole_init#: arch/x86_64/kernel/aperture.c:199: warning: #aper_order# may be used uninitialized in this function Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/aperture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 962ad4823b6..c7f4fdd20f0 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -196,7 +196,7 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) void __init iommu_hole_init(void) { int fix, num; - u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0; + u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; u64 aper_base, last_aper_base = 0; int valid_agp = 0; -- cgit v1.2.3 From 69d81fcde7797342417591ba7affb372b9c86eae Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:53 +0100 Subject: [PATCH] x86_64: Speed up numa_node_id by putting it directly into the PDA Not go 
from the CPU number to an mapping array. Mode number is often used now in fast paths. This also adds a generic numa_node_id to all the topology includes Suggested by Eric Dumazet Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index f27731ac95c..99cfa751949 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -823,7 +823,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) if (!node_online(node)) node = nearby_node(apicid); } - cpu_to_node[cpu] = node; + numa_set_node(cpu, node); printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", cpu, c->x86_num_cores, node, cpu_core_id[cpu]); @@ -975,7 +975,7 @@ static void srat_detect_node(void) node = apicid_to_node[hard_smp_processor_id()]; if (node == NUMA_NO_NODE) node = 0; - cpu_to_node[cpu] = node; + numa_set_node(cpu, node); if (acpi_numa > 0) printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node); -- cgit v1.2.3 From f6c2e3330d3fdd5474bc3756da46fca889a30e33 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Sat, 5 Nov 2005 17:25:53 +0100 Subject: [PATCH] x86_64: Unmap NULL during early bootup We should zap the low mappings, as soon as possible, so that we can catch kernel bugs more effectively. Previously early boot had NULL mapped and didn't trap on NULL references. This patch introduces boot_level4_pgt, which will always have low identity addresses mapped. Druing boot, all the processors will use this as their level4 pgt. On BP, we will switch to init_level4_pgt as soon as we enter C code and zap the low mappings as soon as we are done with the usage of identity low mapped addresses. On AP's we will zap the low mappings as soon as we jump to C code. Signed-off-by: Suresh Siddha Signed-off-by: Ashok Raj Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/head.S | 37 +++++++++++++++++++++++-------------- arch/x86_64/kernel/head64.c | 8 ++++++++ arch/x86_64/kernel/mpparse.c | 2 +- arch/x86_64/kernel/setup.c | 2 ++ arch/x86_64/kernel/setup64.c | 2 +- arch/x86_64/kernel/smpboot.c | 3 --- 6 files changed, 35 insertions(+), 19 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index b92e5f45ed4..15290968e49 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -70,7 +71,7 @@ startup_32: movl %eax, %cr4 /* Setup early boot stage 4 level pagetables */ - movl $(init_level4_pgt - __START_KERNEL_map), %eax + movl $(boot_level4_pgt - __START_KERNEL_map), %eax movl %eax, %cr3 /* Setup EFER (Extended Feature Enable Register) */ @@ -113,7 +114,7 @@ startup_64: movq %rax, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax + movq $(boot_level4_pgt - __START_KERNEL_map), %rax movq %rax, %cr3 /* Check if nx is implemented */ @@ -240,20 +241,10 @@ ljumpvector: ENTRY(stext) ENTRY(_stext) - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. 
(always using - * 2Mbyte large pages provided by PAE mode) - */ .org 0x1000 ENTRY(init_level4_pgt) - .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ - .fill 255,8,0 - .quad 0x000000000000a007 + __PHYSICAL_START - .fill 254,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ + /* This gets initialized in x86_64_start_kernel */ + .fill 512,8,0 .org 0x2000 ENTRY(level3_ident_pgt) @@ -350,6 +341,24 @@ ENTRY(wakeup_level4_pgt) .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ #endif +#ifndef CONFIG_HOTPLUG_CPU + __INITDATA +#endif + /* + * This default setting generates an ident mapping at address 0x100000 + * and a mapping for the kernel that precisely maps virtual address + * 0xffffffff80000000 to physical address 0x000000. (always using + * 2Mbyte large pages provided by PAE mode) + */ + .align PAGE_SIZE +ENTRY(boot_level4_pgt) + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ + .fill 255,8,0 + .quad 0x000000000000a007 + __PHYSICAL_START + .fill 254,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ + .data .align 16 diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index cf6ab147a2a..e0f12d49db4 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -19,6 +19,7 @@ #include #include #include +#include /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ @@ -86,6 +87,13 @@ void __init x86_64_start_kernel(char * real_mode_data) set_intr_gate(i, early_idt_handler); asm volatile("lidt %0" :: "m" (idt_descr)); clear_bss(); + + /* + * switch to init_level4_pgt from boot_level4_pgt + */ + memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t)); + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); + pda_init(0); copy_bootdata(real_mode_data); #ifdef CONFIG_SMP diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 8f6958e7945..4a581d1cefb 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -549,7 +549,7 @@ void __init get_smp_config (void) * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) { + if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 99cfa751949..f76f3743f5d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -571,6 +571,8 @@ void __init setup_arch(char **cmdline_p) init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); + zap_low_mappings(0); + #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). 
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 79190891fbc..06dc354375c 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -141,7 +141,6 @@ void pda_init(int cpu) panic("cannot allocate irqstack for cpu %d", cpu); } - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); pda->irqstackptr += IRQSTACKSIZE-64; } @@ -197,6 +196,7 @@ void __cpuinit cpu_init (void) /* CPU 0 is initialised in head64.c */ if (cpu != 0) { pda_init(cpu); + zap_low_mappings(cpu); } else estacks = boot_exception_stacks; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 658a81b33f3..3393fc08823 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1063,9 +1063,6 @@ int __cpuinit __cpu_up(unsigned int cpu) */ void __init smp_cpus_done(unsigned int max_cpus) { -#ifndef CONFIG_HOTPLUG_CPU - zap_low_mappings(); -#endif smp_cleanup_boot(); #ifdef CONFIG_X86_IO_APIC -- cgit v1.2.3 From 2bc0414ee04fd8bb798760801f5d7476dff44241 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:53 +0100 Subject: [PATCH] x86_64: Only use asm/sections.h to declare section symbols Adding __initdata_* to asm-generic/sections.h Replaces a lot of open coded externs in arch/x86_64/* I had to change __bss_end to __bss_stop to match the other architectures. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/e820.c | 3 +-- arch/x86_64/kernel/head64.c | 6 ++---- arch/x86_64/kernel/setup.c | 2 +- arch/x86_64/kernel/vmlinux.lds.S | 2 +- 4 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index ab3f87aaff7..17579a1a174 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -23,8 +23,7 @@ #include #include #include - -extern char _end[]; +#include /* * PFN of last memory page. diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index e0f12d49db4..b675c5add01 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -20,14 +20,14 @@ #include #include #include +#include /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) { - extern char __bss_start[], __bss_end[]; memset(__bss_start, 0, - (unsigned long) __bss_end - (unsigned long) __bss_start); + (unsigned long) __bss_stop - (unsigned long) __bss_start); } #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ @@ -76,8 +76,6 @@ static void __init setup_boot_cpu_data(void) boot_cpu_data.x86_mask = eax & 0xf; } -extern char _end[]; - void __init x86_64_start_kernel(char * real_mode_data) { char *s; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index f76f3743f5d..2ad3556dda5 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -60,6 +60,7 @@ #include #include #include +#include /* * Machine setup.. @@ -103,7 +104,6 @@ struct edid_info edid_info; struct e820map e820; extern int root_mountflags; -extern char _text, _etext, _edata, _end; char command_line[COMMAND_LINE_SIZE]; diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 6dd642cad2e..58b19215b4b 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -50,7 +50,7 @@ SECTIONS *(.bss.page_aligned) *(.bss) } - __bss_end = .; + __bss_stop = .; . = ALIGN(PAGE_SIZE); . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); -- cgit v1.2.3 From 420f8f68c9c5148dddf946bebdbc7eacde2172cb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: New heuristics to find out hotpluggable CPUs. With a NR_CPUS==128 kernel with CPU hotplug enabled we would waste 4MB on per CPU data of all possible CPUs. The reason was that HOTPLUG always set up possible map to NR_CPUS cpus and then we need to allocate that much (each per CPU data is roughly ~32k now) The underlying problem is that ACPI didn't tell us how many hotplug CPUs the platform supports. So the old code just assumed all, which would lead to this memory wastage. This implements some new heuristics: - If the BIOS specified disabled CPUs in the ACPI/mptables assume they can be enabled later (this is bending the ACPI specification a bit, but seems like a obvious extension) - The user can overwrite it with a new additionals_cpus=NUM option - Otherwise use half of the available CPUs or 2, whatever is more. Cc: ashok.raj@intel.com Cc: len.brown@intel.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mpparse.c | 8 ++++++-- arch/x86_64/kernel/smpboot.c | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 42 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 4a581d1cefb..1d61f10a92c 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -65,7 +65,9 @@ unsigned long mp_lapic_addr = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_id = -1U; /* Internal processor count */ -static unsigned int num_processors = 0; +unsigned int num_processors __initdata = 0; + +unsigned disabled_cpus __initdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; @@ -109,8 +111,10 @@ static void __init MP_processor_info (struct mpc_config_processor *m) int ver, cpu; static int found_bsp=0; - if (!(m->mpc_cpuflag & CPU_ENABLED)) + if (!(m->mpc_cpuflag & CPU_ENABLED)) { + disabled_cpus++; return; + } printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", m->mpc_apicid, diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 3393fc08823..f74319a8065 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -880,6 +880,9 @@ static __init void disable_smp(void) } #ifdef CONFIG_HOTPLUG_CPU + +int additional_cpus __initdata = -1; + /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -888,14 +891,38 @@ static __init void disable_smp(void) * cpu_present_map on the other hand can change dynamically. * In case when cpu_hotplug is not compiled, then we resort to current * behaviour, which is cpu_possible == cpu_present. - * If cpu-hotplug is supported, then we need to preallocate for all - * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. * - Ashok Raj + * + * Three ways to find out the number of additional hotplug CPUs: + * - If the BIOS specified disabled CPUs in ACPI/mptables use that. + * - otherwise use half of the available CPUs or 2, whatever is more. + * - The user can overwrite it with additional_cpus=NUM + * We do this because additional CPUs waste a lot of memory. 
+ * -AK */ __init void prefill_possible_map(void) { int i; - for (i = 0; i < NR_CPUS; i++) + int possible; + + if (additional_cpus == -1) { + if (disabled_cpus > 0) { + additional_cpus = disabled_cpus; + } else { + additional_cpus = num_processors / 2; + if (additional_cpus == 0) + additional_cpus = 2; + } + } + possible = num_processors + additional_cpus; + if (possible > NR_CPUS) + possible = NR_CPUS; + + printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + possible, + max_t(int, possible - num_processors, 0)); + + for (i = 0; i < possible; i++) cpu_set(i, cpu_possible_map); } #endif @@ -1151,6 +1178,12 @@ void __cpu_die(unsigned int cpu) printk(KERN_ERR "CPU %u didn't die...\n", cpu); } +static __init int setup_additional_cpus(char *s) +{ + return get_option(&s, &additional_cpus); +} +__setup("additional_cpus=", setup_additional_cpus); + #else /* ... !CONFIG_HOTPLUG_CPU */ int __cpu_disable(void) -- cgit v1.2.3 From e4e5d324b9c5586f408a72d1534474c449dd5212 Mon Sep 17 00:00:00 2001 From: Bryan Ford Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Save/restore CS in 64bit signal handlers and force __USER_CS for CS This allows to run 64bit signal handlers in 64bit processes that run small code snippets in compat mode. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index d642fbf3da2..5876df116c9 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -110,6 +110,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned COPY(r14); COPY(r15); + /* Kernel saves and restores only the CS segment register on signals, + * which is the bare minimum needed to allow mixed 32/64-bit code. + * App's signal handler can save/restore other segments if needed. */ + { + unsigned cs; + err |= __get_user(cs, &sc->cs); + regs->cs = cs | 3; /* Force into user mode */ + } + { unsigned int tmpflags; err |= __get_user(tmpflags, &sc->eflags); @@ -187,6 +196,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo { int err = 0; + err |= __put_user(regs->cs, &sc->cs); err |= __put_user(0, &sc->gs); err |= __put_user(0, &sc->fs); @@ -318,7 +328,14 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->rsp = (unsigned long)frame; + /* Set up the CS register to run signal handlers in 64-bit mode, + even if the handler happens to be interrupting 32-bit code. */ + regs->cs = __USER_CS; + + /* This, by contrast, has nothing to do with segment registers - + see include/asm-x86_64/uaccess.h for details. */ set_fs(USER_DS); + regs->eflags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); -- cgit v1.2.3 From af9c142de94ecf724a18700273bbba390873e072 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Force correct address space size for MTRR on some 64bit Intel Xeons They report 40bit, but only have 36bits of physical address space. This caused problems with setting up the correct masks for MTRR. CPUID workaround for steppings 0F33h(supporting x86) and 0F34h(supporting x86 and EM64T). 
Detail info can be found at: http://download.intel.com/design/Xeon/specupdt/30240216.pdf http://download.intel.com/design/Pentium4/specupdt/30235221.pdf Signed-off-by: Shaohua Li Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 2ad3556dda5..476ee034fca 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -995,6 +995,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) unsigned eax = cpuid_eax(0x80000008); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + /* CPUID workaround for Intel 0F34 CPU */ + if (c->x86_vendor == X86_VENDOR_INTEL && + c->x86 == 0xF && c->x86_model == 0x3 && + c->x86_mask == 0x4) + c->x86_phys_bits = 36; } if (c->x86 == 15) -- cgit v1.2.3 From ea0be473a1f0ee89024a24d8ea4b05fbf6efcee3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Allow modular build of ia32 aout loader Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/x8664_ksyms.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index fd99ddd009b..4a54221e10b 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -203,3 +203,6 @@ EXPORT_SYMBOL(flush_tlb_page); #endif EXPORT_SYMBOL(cpu_khz); + +EXPORT_SYMBOL(load_gs_index); + -- cgit v1.2.3 From a88cde13bae3fffd6ecc812bdd02c91eafb6073e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Formatting fixes for arch/x86_64/kernel/process.c No functional changes. 
Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/process.c | 47 +++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 20 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index b5a89c0bdf5..bc44c92ca19 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -148,7 +148,8 @@ void cpu_idle_wait(void) do { ssleep(1); for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + if (cpu_isset(cpu, map) && + !per_cpu(cpu_idle_state, cpu)) cpu_clear(cpu, map); } cpus_and(map, map, cpu_online_map); @@ -278,7 +279,8 @@ void __show_regs(struct pt_regs * regs) system_utsname.version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); + printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->rax, regs->rbx, regs->rcx); printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", @@ -430,15 +432,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, struct pt_regs * childregs; struct task_struct *me = current; - childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; - + childregs = ((struct pt_regs *) + (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; *childregs = *regs; childregs->rax = 0; childregs->rsp = rsp; - if (rsp == ~0UL) { + if (rsp == ~0UL) childregs->rsp = (unsigned long)childregs; - } p->thread.rsp = (unsigned long) childregs; p->thread.rsp0 = (unsigned long) (childregs+1); @@ -460,7 +461,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, p->thread.io_bitmap_max = 0; return -ENOMEM; } - memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES); } /* @@ -497,7 +499,8 @@ out: * - fold all the options into a flag word and test it with a single test. 
* - could test fs/gs bitsliced */ -struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -568,7 +571,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct * prev->userrsp = read_pda(oldrsp); write_pda(oldrsp, next->userrsp); write_pda(pcurrent, next_p); - write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); + write_pda(kernelstack, + (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); /* * Now maybe reload the debug registers @@ -649,7 +653,9 @@ asmlinkage long sys_fork(struct pt_regs *regs) return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); } -asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +asmlinkage long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) { if (!newsp) newsp = regs->rsp; @@ -685,7 +691,8 @@ unsigned long get_wchan(struct task_struct *p) return 0; fp = *(u64 *)(p->thread.rsp); do { - if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) + if (fp < (unsigned long)stack || + fp > (unsigned long)stack+THREAD_SIZE) return 0; rip = *(u64 *)(fp+8); if (!in_sched_functions(rip)) @@ -720,8 +727,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) task->thread.gsindex = 0; task->thread.gs = addr; if (doit) { - load_gs_index(0); - ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); + load_gs_index(0); + ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); } } put_cpu(); @@ -738,7 +745,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) set_32bit_tls(task, FS_TLS, addr); if (doit) { load_TLS(&task->thread, cpu); - asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL)); + asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); } task->thread.fsindex = FS_TLS_SEL; task->thread.fs = 0; @@ -748,8 +755,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (doit) { /* set the selector to 0 to not confuse __switch_to */ - asm volatile("movl %0,%%fs" :: "r" (0)); - ret = checking_wrmsrl(MSR_FS_BASE, addr); + asm volatile("movl %0,%%fs" :: "r" (0)); + ret = checking_wrmsrl(MSR_FS_BASE, addr); } } put_cpu(); @@ -758,9 +765,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) unsigned long base; if (task->thread.fsindex == FS_TLS_SEL) base = read_32bit_tls(task, FS_TLS); - else if (doit) { + else if (doit) rdmsrl(MSR_FS_BASE, base); - } else + else base = task->thread.fs; ret = put_user(base, (unsigned long __user *)addr); break; @@ -769,9 +776,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) unsigned long base; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); - else if (doit) { + else if (doit) rdmsrl(MSR_KERNEL_GS_BASE, base); - } else + else base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); break; -- cgit v1.2.3 From 3506229ff9968e5dbc862a50285fbea0e8821b58 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Don't enable interrupt unconditionally in reboot path When they were disabled before (e.g. 
after a panic) it's better to keep them off, otherwise followon panics can happen from timer interrupt handlers etc. Drawback is that pageup in the console won't work anymore though. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/reboot.c | 5 +++-- arch/x86_64/kernel/smp.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 47f95687905..97804bc2c0a 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -77,6 +77,7 @@ static inline void kb_wait(void) void machine_shutdown(void) { + unsigned long flags; /* Stop the cpus and apics */ #ifdef CONFIG_SMP int reboot_cpu_id; @@ -98,7 +99,7 @@ void machine_shutdown(void) smp_send_stop(); #endif - local_irq_disable(); + local_irq_save(flags); #ifndef CONFIG_SMP disable_local_APIC(); @@ -106,7 +107,7 @@ void machine_shutdown(void) disable_IO_APIC(); - local_irq_enable(); + local_irq_restore(flags); } void machine_emergency_restart(void) diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 493a0d1f127..cfc3d9dccbd 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -450,13 +450,14 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, void smp_stop_cpu(void) { + unsigned long flags; /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); + local_irq_save(flags); disable_local_APIC(); - local_irq_enable(); + local_irq_restore(flags); } static void smp_really_stop_cpu(void *dummy) -- cgit v1.2.3 From 94605eff572b727aaad9b4b29bc358b919096503 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86-64/i386: Intel HT, Multi core detection fixes Fields obtained through cpuid vector 0x1(ebx[16:23]) and vector 0x4(eax[14:25], eax[26:31]) indicate the maximum values and might not always be the same as what is available and what OS sees. So make sure "siblings" and "cpu cores" values in /proc/cpuinfo reflect the values as seen by OS instead of what cpuid instruction says. This will also fix the buggy BIOS cases (for example where cpuid on a single core cpu says there are "2" siblings, even when HT is disabled in the BIOS. 
http://bugzilla.kernel.org/show_bug.cgi?id=4359) Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 69 +++++++++++++++++++------------------------- arch/x86_64/kernel/smpboot.c | 69 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 84 insertions(+), 54 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 476ee034fca..40c77f6fe4b 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -795,7 +795,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) #endif bits = 0; - while ((1 << bits) < c->x86_num_cores) + while ((1 << bits) < c->x86_max_cores) bits++; /* Low order bits define the core id (index of core in socket) */ @@ -828,7 +828,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) numa_set_node(cpu, node); printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", - cpu, c->x86_num_cores, node, cpu_core_id[cpu]); + cpu, c->x86_max_cores, node, cpu_core_id[cpu]); #endif #endif } @@ -877,9 +877,9 @@ static int __init init_amd(struct cpuinfo_x86 *c) display_cacheinfo(c); if (c->extended_cpuid_level >= 0x80000008) { - c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - if (c->x86_num_cores & (c->x86_num_cores - 1)) - c->x86_num_cores = 1; + c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + if (c->x86_max_cores & (c->x86_max_cores - 1)) + c->x86_max_cores = 1; amd_detect_cmp(c); } @@ -891,54 +891,44 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; - int index_msb, tmp; + int index_msb, core_bits; int cpu = smp_processor_id(); - + + cpuid(1, &eax, &ebx, &ecx, &edx); + + c->apicid = phys_pkg_id(0); + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; - cpuid(1, &eax, &ebx, &ecx, &edx); smp_num_siblings = (ebx & 0xff0000) >> 16; - + if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - index_msb = 31; - /* - * At this point we only support two siblings per - * processor package. 
- */ + } else if (smp_num_siblings > 1 ) { + if (smp_num_siblings > NR_CPUS) { printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); smp_num_siblings = 1; return; } - tmp = smp_num_siblings; - while ((tmp & 0x80000000 ) == 0) { - tmp <<=1 ; - index_msb--; - } - if (smp_num_siblings & (smp_num_siblings - 1)) - index_msb++; + + index_msb = get_count_order(smp_num_siblings); phys_proc_id[cpu] = phys_pkg_id(index_msb); - + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", phys_proc_id[cpu]); - smp_num_siblings = smp_num_siblings / c->x86_num_cores; + smp_num_siblings = smp_num_siblings / c->x86_max_cores; - tmp = smp_num_siblings; - index_msb = 31; - while ((tmp & 0x80000000) == 0) { - tmp <<=1 ; - index_msb--; - } - if (smp_num_siblings & (smp_num_siblings - 1)) - index_msb++; + index_msb = get_count_order(smp_num_siblings) ; + + core_bits = get_count_order(c->x86_max_cores); - cpu_core_id[cpu] = phys_pkg_id(index_msb); + cpu_core_id[cpu] = phys_pkg_id(index_msb) & + ((1 << core_bits) - 1); - if (c->x86_num_cores > 1) + if (c->x86_max_cores > 1) printk(KERN_INFO "CPU: Processor Core ID: %d\n", cpu_core_id[cpu]); } @@ -1006,7 +996,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size * 2; if (c->x86 >= 15) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - c->x86_num_cores = intel_num_cpu_cores(c); + c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); } @@ -1044,7 +1034,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_clflush_size = 64; c->x86_cache_alignment = c->x86_clflush_size; - c->x86_num_cores = 1; + c->x86_max_cores = 1; c->extended_cpuid_level = 0; memset(&c->x86_capability, 0, sizeof c->x86_capability); @@ -1278,13 +1268,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); #ifdef CONFIG_SMP - if (smp_num_siblings * c->x86_num_cores > 1) { + if (smp_num_siblings * c->x86_max_cores > 1) { int cpu = c - cpu_data; seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]); - seq_printf(m, "siblings\t: %d\n", - c->x86_num_cores * smp_num_siblings); + seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]); - seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } #endif diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index f74319a8065..2b9ddba61b3 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -64,6 +64,7 @@ int smp_num_siblings = 1; /* Package ID of each logical CPU */ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +/* core ID of each logical CPU */ u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(phys_proc_id); EXPORT_SYMBOL(cpu_core_id); @@ -89,7 +90,10 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; /* Set when the idlers are all forked */ int smp_threads_ready; +/* representing HT siblings of each logical CPU */ cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; + +/* representing HT and core siblings of each logical CPU */ cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); @@ -436,30 +440,59 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +/* representing cpus for which sibling maps can be computed */ +static cpumask_t cpu_sibling_setup_map; + static inline void set_cpu_sibling_map(int cpu) { int i; + struct cpuinfo_x86 *c = cpu_data; + + cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for_each_cpu(i) { - if (cpu_core_id[cpu] == cpu_core_id[i]) { + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (phys_proc_id[cpu] == phys_proc_id[i] && + cpu_core_id[cpu] == cpu_core_id[i]) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } - if (current_cpu_data.x86_num_cores > 1) { - for_each_cpu(i) { - if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); - } - } - } else { + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; + c[cpu].booted_cores = 1; + return; + } + + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + /* + * Does this new cpu bringup a new core? + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) { + /* + * for each core in package, increment + * the booted_cores for this new cpu + */ + if (first_cpu(cpu_sibling_map[i]) == i) + c[cpu].booted_cores++; + /* + * increment the core count for all + * the other cpus in this package + */ + if (i != cpu) + c[i].booted_cores++; + } else if (i != cpu && !c[cpu].booted_cores) + c[cpu].booted_cores = c[i].booted_cores; + } } } @@ -993,6 +1026,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? 
*/ + set_cpu_sibling_map(0); if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1036,8 +1070,6 @@ void __init smp_prepare_boot_cpu(void) int me = smp_processor_id(); cpu_set(me, cpu_online_map); cpu_set(me, cpu_callout_map); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); per_cpu(cpu_state, me) = CPU_ONLINE; } @@ -1106,15 +1138,24 @@ void __init smp_cpus_done(unsigned int max_cpus) static void remove_siblinginfo(int cpu) { int sibling; + struct cpuinfo_x86 *c = cpu_data; + for_each_cpu_mask(sibling, cpu_core_map[cpu]) { + cpu_clear(cpu, cpu_core_map[sibling]); + /* + * last thread sibling in this cpu core going down + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) + c[sibling].booted_cores--; + } + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) cpu_clear(cpu, cpu_sibling_map[sibling]); - for_each_cpu_mask(sibling, cpu_core_map[cpu]) - cpu_clear(cpu, cpu_core_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); phys_proc_id[cpu] = BAD_APICID; cpu_core_id[cpu] = BAD_APICID; + cpu_clear(cpu, cpu_sibling_setup_map); } void remove_cpu_from_maps(void) -- cgit v1.2.3 From a6f5deb2be4c82f24fefadcbf7e448f540c05ae6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Reduce number of retries for reset through keyboard controller The old code could retry for 10 seconds in the worst case. Only try for one second now. Suggested by Yinghai Lu Cc: Yinghai.Lu@amd.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/reboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 97804bc2c0a..75235ed2b31 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -121,7 +121,7 @@ void machine_emergency_restart(void) /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { case BOOT_KBD: - for (i=0; i<100; i++) { + for (i=0; i<10; i++) { kb_wait(); udelay(50); outb(0xfe,0x64); /* pulse reset low */ -- cgit v1.2.3 From efbbdce94f6ea54cf06d9a06e4c95f6874ad64a8 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Use common sys_time64 Keeping this function does not make sense because it's a (buggy) copy of sys_time. The only difference is that now.tv_sec (which is a time_t, i.e. a 64-bit long) is copied (and truncated) into an int (32-bit). The prototype is the same (they both take a long __user *), so let's drop this and redirect it to sys_time (and make sure it exists by defining __ARCH_WANT_SYS_TIME). The only disadvantage is that the sys_stime definition is also compiled (may be fixed if needed by adding a separate __ARCH_WANT_SYS_STIME macro, and defining it for all arches that define __ARCH_WANT_SYS_TIME except x86_64). Acked-by: Andi Kleen Signed-off-by: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/sys_x86_64.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c index cc7821c6885..6449ea8fe75 100644 --- a/arch/x86_64/kernel/sys_x86_64.c +++ b/arch/x86_64/kernel/sys_x86_64.c @@ -154,17 +154,3 @@ asmlinkage long sys_uname(struct new_utsname __user * name) err |= copy_to_user(&name->machine, "i686", 5); return err ?
-EFAULT : 0; } - -asmlinkage long sys_time64(long __user * tloc) -{ - struct timeval now; - int i; - - do_gettimeofday(&now); - i = now.tv_sec; - if (tloc) { - if (put_user(i,tloc)) - i = -EFAULT; - } - return i; -} -- cgit v1.2.3 From e583538f077d5f70191670b47a046ba436ec3428 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Log machine checks from boot on Intel systems The logging for boot errors was turned off because it was broken on some AMD systems. But give Intel EM64T systems a chance because they are supposed to be correct there. The advantage is that there is a chance to actually log uncorrected machine checks after the reset. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index cf8a76f0f47..183dc610542 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -37,7 +37,7 @@ static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; static unsigned long console_logged; static int notify_user; static int rip_msr; -static int mce_bootlog; +static int mce_bootlog = 1; /* * Lockless MCE logging infrastructure. @@ -347,7 +347,11 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ clear_bit(10, &bank[4]); + /* Lots of broken BIOS around that don't clear them + by default and leave crap in there. Don't log. */ + mce_bootlog = 0; } + } static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) @@ -498,16 +502,16 @@ static int __init mcheck_disable(char *str) /* mce=off disables machine check. Note you can reenable it later using sysfs. mce=TOLERANCELEVEL (number, see above) - mce=bootlog Log MCEs from before booting. Disabled by default to work - around buggy BIOS that leave bogus MCEs. */ + mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + mce=nobootlog Don't log MCEs from before booting. */ static int __init mcheck_enable(char *str) { if (*str == '=') str++; if (!strcmp(str, "off")) mce_dont_init = 1; - else if (!strcmp(str, "bootlog")) - mce_bootlog = 1; + else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) + mce_bootlog = str[0] == 'b'; else if (isdigit(str[0])) get_option(&str, &tolerant); else -- cgit v1.2.3 From 9e43e1b7c7c9872da032442d8e4bb112a02d16f4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Remove CONFIG_CHECKING and add command line option for pagefault tracing CONFIG_CHECKING covered some debugging code used in the early times of the port. But it wasn't even SMP safe for quite some time and the bugs it checked for seem to be gone. This patch removes all the code to verify GS at kernel entry. There haven't been any new bugs in this area for a long time. Previously it also covered the sysctl for the page fault tracing. That didn't make much sense because that code was unconditionally compiled in. I made that a boot option now because it is typically only useful at boot. 
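As a side note on the boot-option conversion described above: a debug knob like the page fault tracing switch is typically wired up with the kernel's __setup() mechanism. The following is only a minimal sketch under that assumption; the real hunk does not appear in this diffstat (it lies outside the arch/x86_64/kernel subset shown here), and the names page_fault_trace and enable_pagefaulttrace are illustrative, not quoted from the patch.

#include <linux/init.h>

static int page_fault_trace;	/* checked by the page fault path when deciding to print traces */

/* Handle "pagefaulttrace" on the kernel command line and switch tracing on. */
static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;	/* option consumed, do not pass it on to init */
}
__setup("pagefaulttrace", enable_pagefaulttrace);

Booting with "pagefaulttrace" on the command line would then enable the tracing printks from the start, which fits the point made above that the facility is mainly useful during early boot.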
Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 4a836384dd0..bf337f49318 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -428,19 +428,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, { conditional_sti(regs); -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda, - regs->rip); - } - } -#endif - if (user_mode(regs)) { struct task_struct *tsk = current; @@ -513,20 +500,6 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, { conditional_sti(regs); -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - oops_in_progress++; - printk("general protection handler: wrong gs %lx expected %p\n", gs, pda); - oops_in_progress--; - } - } -#endif - if (user_mode(regs)) { struct task_struct *tsk = current; @@ -665,19 +638,6 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, struct task_struct *tsk = current; siginfo_t info; -#ifdef CONFIG_CHECKING - { - /* RED-PEN interaction with debugger - could destroy gs */ - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("debug handler: wrong gs %lx expected %p\n", gs, pda); - } - } -#endif - get_debugreg(condition, 6); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, -- cgit v1.2.3 From 8893166ff8694f36655009aa9bf8e7f2e1c9339f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Increase the maximum number of local APICs to the maximum This is needed for large multinode IBM systems which have a sparse APIC space in clustered mode, fully covering the available 8 bits. Previous kernels would limit the local APIC number to 127, which caused them to reject some of the CPUs at boot. I increased the maximum and shrunk the apic_version array a bit to make up for that (the version is only 8 bits, so a full int is not needed to store it). Cc: Chris McDermott Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mpparse.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 1d61f10a92c..1105250bf02 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -42,7 +42,7 @@ int acpi_found_madt; * Various Linux-internal data structures created from the * MP-table. */ -int apic_version [MAX_APICS]; +unsigned char apic_version [MAX_APICS]; unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ...
MAX_MP_BUSSES-1] = -1 }; @@ -108,7 +108,8 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver, cpu; + int cpu; + unsigned char ver; static int found_bsp=0; if (!(m->mpc_cpuflag & CPU_ENABLED)) { @@ -133,12 +134,14 @@ static void __init MP_processor_info (struct mpc_config_processor *m) } cpu = num_processors++; - - if (m->mpc_apicid > MAX_APICS) { + +#if MAX_APICS < 255 + if ((int)m->mpc_apicid > MAX_APICS) { printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", m->mpc_apicid, MAX_APICS); return; } +#endif ver = m->mpc_apicver; physid_set(m->mpc_apicid, phys_cpu_present_map); -- cgit v1.2.3 From d3ee871e63d0a0c70413dc0aa5534b8d6cd6ec37 Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Sat, 5 Nov 2005 17:25:54 +0100 Subject: [PATCH] x86_64: Fix sparse mem Fix up booting with sparse mem enabled. Otherwise it would just cause an early PANIC at boot. Signed-off-by: Bob Picco Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 40c77f6fe4b..750e01dcbdf 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -412,7 +412,6 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { unsigned long bootmap_size, bootmap; - memory_present(0, start_pfn, end_pfn); bootmap_size = bootmem_bootmap_pages(end_pfn)<