aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/discontig_32.c285
-rw-r--r--arch/x86/mm/dump_pagetables.c2
-rw-r--r--arch/x86/mm/fault.c97
-rw-r--r--arch/x86/mm/init_32.c523
-rw-r--r--arch/x86/mm/init_64.c552
-rw-r--r--arch/x86/mm/ioremap.c49
-rw-r--r--arch/x86/mm/k8topology_64.c21
-rw-r--r--arch/x86/mm/numa_64.c93
-rw-r--r--arch/x86/mm/pageattr-test.c21
-rw-r--r--arch/x86/mm/pageattr.c62
-rw-r--r--arch/x86/mm/pat.c378
-rw-r--r--arch/x86/mm/pgtable.c190
-rw-r--r--arch/x86/mm/pgtable_32.c56
-rw-r--r--arch/x86/mm/srat_32.c280
-rw-r--r--arch/x86/mm/srat_64.c21
16 files changed, 1733 insertions, 900 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 07dab503c9e..9873716e9f7 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -18,5 +18,6 @@ obj-$(CONFIG_NUMA) += discontig_32.o
else
obj-$(CONFIG_NUMA) += numa_64.o
obj-$(CONFIG_K8_NUMA) += k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA) += srat_64.o
endif
+obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
+
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf98368..5dfef9fa061 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,6 +38,7 @@
#include <asm/setup.h>
#include <asm/mmzone.h>
#include <asm/bios_ebda.h>
+#include <asm/proto.h>
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
@@ -59,14 +60,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
/*
* 4) physnode_map - the mapping between a pfn and owning node
* physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 256Mb break (each element of the array will
- * represent 256Mb of memory and will be marked by the node id. so,
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id. so,
* if the first gig is on node 0, and the second gig is on node 1
* physnode_map will contain:
*
- * physnode_map[0-3] = 0;
- * physnode_map[4-7] = 1;
- * physnode_map[8- ] = -1;
+ * physnode_map[0-15] = 0;
+ * physnode_map[16-31] = 1;
+ * physnode_map[32- ] = -1;
*/
s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
EXPORT_SYMBOL(physnode_map);
@@ -75,15 +76,15 @@ void memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
- printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
+ printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
- printk("%ld ", pfn);
+ printk(KERN_CONT "%lx ", pfn);
}
- printk("\n");
+ printk(KERN_CONT "\n");
}
unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
#endif
extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
extern unsigned long highend_pfn, highstart_pfn;
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -117,13 +117,13 @@ static unsigned long kva_pages;
*/
int __init get_memcfg_numa_flat(void)
{
- printk("NUMA - single node, flat memory mode\n");
+ printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
- /* Run the memory configuration and find the top of memory. */
- propagate_e820_map();
node_start_pfn[0] = 0;
node_end_pfn[0] = max_pfn;
+ e820_register_active_regions(0, 0, max_pfn);
memory_present(0, 0, max_pfn);
+ node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
/* Indicate there is one node available. */
nodes_clear(node_online_map);
@@ -156,24 +156,32 @@ static void __init propagate_e820_map_node(int nid)
*/
static void __init allocate_pgdat(int nid)
{
- if (nid && node_has_online_mem(nid))
+ char buf[16];
+
+ if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
else {
- NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
- min_low_pfn += PFN_UP(sizeof(pg_data_t));
+ unsigned long pgdat_phys;
+ pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+ max_pfn_mapped<<PAGE_SHIFT,
+ sizeof(pg_data_t),
+ PAGE_SIZE);
+ NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
+ memset(buf, 0, sizeof(buf));
+ sprintf(buf, "NODE_DATA %d", nid);
+ reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
}
+ printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
+ nid, (unsigned long)NODE_DATA(nid));
}
-#ifdef CONFIG_DISCONTIGMEM
/*
- * In the discontig memory model, a portion of the kernel virtual area (KVA)
- * is reserved and portions of nodes are mapped using it. This is to allow
- * node-local memory to be allocated for structures that would normally require
- * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
- * should be prepared to allocate from the bootmem allocator instead. This KVA
- * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
- * layout of memory that are broken if alloc_remap() succeeds for some of the
- * map and fails for others
+ * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
+ * virtual address space (KVA) is reserved and portions of nodes are mapped
+ * using it. This is to allow node-local memory to be allocated for
+ * structures that would normally require ZONE_NORMAL. The memory is
+ * allocated with alloc_remap() and callers should be prepared to allocate
+ * from the bootmem allocator instead.
*/
static unsigned long node_remap_start_pfn[MAX_NUMNODES];
static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -195,15 +203,19 @@ void *alloc_remap(int nid, unsigned long size)
return allocation;
}
-void __init remap_numa_kva(void)
+static void __init remap_numa_kva(void)
{
void *vaddr;
unsigned long pfn;
int node;
for_each_online_node(node) {
+ printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+ printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
+ (unsigned long)vaddr,
+ node_remap_start_pfn[node] + pfn);
set_pmd_pfn((ulong) vaddr,
node_remap_start_pfn[node] + pfn,
PAGE_KERNEL_LARGE);
@@ -215,17 +227,21 @@ static unsigned long calculate_numa_remap_pages(void)
{
int nid;
unsigned long size, reserve_pages = 0;
- unsigned long pfn;
for_each_online_node(nid) {
- unsigned old_end_pfn = node_end_pfn[nid];
+ u64 node_kva_target;
+ u64 node_kva_final;
/*
* The acpi/srat node info can show hot-add memroy zones
* where memory could be added but not currently present.
*/
+ printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
+ nid, node_start_pfn[nid], node_end_pfn[nid]);
if (node_start_pfn[nid] > max_pfn)
continue;
+ if (!node_end_pfn[nid])
+ continue;
if (node_end_pfn[nid] > max_pfn)
node_end_pfn[nid] = max_pfn;
@@ -237,41 +253,48 @@ static unsigned long calculate_numa_remap_pages(void)
/* now the roundup is correct, convert to PAGE_SIZE pages */
size = size * PTRS_PER_PTE;
- /*
- * Validate the region we are allocating only contains valid
- * pages.
- */
- for (pfn = node_end_pfn[nid] - size;
- pfn < node_end_pfn[nid]; pfn++)
- if (!page_is_ram(pfn))
- break;
-
- if (pfn != node_end_pfn[nid])
- size = 0;
+ node_kva_target = round_down(node_end_pfn[nid] - size,
+ PTRS_PER_PTE);
+ node_kva_target <<= PAGE_SHIFT;
+ do {
+ node_kva_final = find_e820_area(node_kva_target,
+ ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
+ ((u64)size)<<PAGE_SHIFT,
+ LARGE_PAGE_BYTES);
+ node_kva_target -= LARGE_PAGE_BYTES;
+ } while (node_kva_final == -1ULL &&
+ (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+
+ if (node_kva_final == -1ULL)
+ panic("Can not get kva ram\n");
- printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
- size, nid);
node_remap_size[nid] = size;
node_remap_offset[nid] = reserve_pages;
reserve_pages += size;
- printk("Shrinking node %d from %ld pages to %ld pages\n",
- nid, node_end_pfn[nid], node_end_pfn[nid] - size);
-
- if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
- /*
- * Align node_end_pfn[] and node_remap_start_pfn[] to
- * pmd boundary. remap_numa_kva will barf otherwise.
- */
- printk("Shrinking node %d further by %ld pages for proper alignment\n",
- nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
- size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
- }
+ printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
+ " node %d at %llx\n",
+ size, nid, node_kva_final>>PAGE_SHIFT);
+
+ /*
+ * prevent kva address below max_low_pfn want it on system
+ * with less memory later.
+ * layout will be: KVA address , KVA RAM
+ *
+ * we are supposed to only record the one less then max_low_pfn
+ * but we could have some hole in high memory, and it will only
+ * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
+ * to use it as free.
+ * So reserve_early here, hope we don't run out of that array
+ */
+ reserve_early(node_kva_final,
+ node_kva_final+(((u64)size)<<PAGE_SHIFT),
+ "KVA RAM");
- node_end_pfn[nid] -= size;
- node_remap_start_pfn[nid] = node_end_pfn[nid];
- shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+ node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
+ remove_active_range(nid, node_remap_start_pfn[nid],
+ node_remap_start_pfn[nid] + size);
}
- printk("Reserving total of %ld pages for numa KVA remap\n",
+ printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
reserve_pages);
return reserve_pages;
}
@@ -285,37 +308,16 @@ static void init_remap_allocator(int nid)
node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
ALIGN(sizeof(pg_data_t), PAGE_SIZE);
- printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+ printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
(ulong) node_remap_start_vaddr[nid],
- (ulong) pfn_to_kaddr(highstart_pfn
- + node_remap_offset[nid] + node_remap_size[nid]));
-}
-#else
-void *alloc_remap(int nid, unsigned long size)
-{
- return NULL;
-}
-
-static unsigned long calculate_numa_remap_pages(void)
-{
- return 0;
-}
-
-static void init_remap_allocator(int nid)
-{
-}
-
-void __init remap_numa_kva(void)
-{
+ (ulong) node_remap_end_vaddr[nid]);
}
-#endif /* CONFIG_DISCONTIGMEM */
-extern void setup_bootmem_allocator(void);
-unsigned long __init setup_memory(void)
+void __init initmem_init(unsigned long start_pfn,
+ unsigned long end_pfn)
{
int nid;
- unsigned long system_start_pfn, system_max_low_pfn;
- unsigned long wasted_pages;
+ long kva_target_pfn;
/*
* When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,109 +326,77 @@ unsigned long __init setup_memory(void)
* this space and use it to adjust the boundary between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
- get_memcfg_numa();
- kva_pages = calculate_numa_remap_pages();
+ get_memcfg_numa();
- /* partially used pages are not usable - thus round upwards */
- system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
+ kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
- kva_start_pfn = find_max_low_pfn() - kva_pages;
+ kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+ do {
+ kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+ max_low_pfn<<PAGE_SHIFT,
+ kva_pages<<PAGE_SHIFT,
+ PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+ kva_target_pfn -= PTRS_PER_PTE;
+ } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Numa kva area is below the initrd */
- if (initrd_start)
- kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
- - kva_pages;
-#endif
+ if (kva_start_pfn == -1UL)
+ panic("Can not get kva space\n");
- /*
- * We waste pages past at the end of the KVA for no good reason other
- * than how it is located. This is bad.
- */
- wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
- kva_start_pfn -= wasted_pages;
- kva_pages += wasted_pages;
-
- system_max_low_pfn = max_low_pfn = find_max_low_pfn();
- printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
+ printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
kva_start_pfn, max_low_pfn);
- printk("max_pfn = %ld\n", max_pfn);
+ printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
+
+ /* avoid clash with initrd */
+ reserve_early(kva_start_pfn<<PAGE_SHIFT,
+ (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+ "KVA PG");
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
- if (max_pfn > system_max_low_pfn)
- highstart_pfn = system_max_low_pfn;
+ if (max_pfn > max_low_pfn)
+ highstart_pfn = max_low_pfn;
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
- num_physpages = system_max_low_pfn;
- high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
+ num_physpages = max_low_pfn;
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
- pages_to_mb(system_max_low_pfn));
- printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
- min_low_pfn, max_low_pfn, highstart_pfn);
+ pages_to_mb(max_low_pfn));
+ printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
+ max_low_pfn, highstart_pfn);
- printk("Low memory ends at vaddr %08lx\n",
+ printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
(ulong) pfn_to_kaddr(max_low_pfn));
for_each_online_node(nid) {
init_remap_allocator(nid);
allocate_pgdat(nid);
}
- printk("High memory starts at vaddr %08lx\n",
+ remap_numa_kva();
+
+ printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
for_each_online_node(nid)
propagate_e820_map_node(nid);
- memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
+ for_each_online_node(nid)
+ memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+
NODE_DATA(0)->bdata = &node0_bdata;
setup_bootmem_allocator();
- return max_low_pfn;
-}
-
-void __init numa_kva_reserve(void)
-{
- if (kva_pages)
- reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
- BOOTMEM_DEFAULT);
}
-void __init zone_sizes_init(void)
-{
- int nid;
- unsigned long max_zone_pfns[MAX_NR_ZONES];
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] =
- virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
- max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-#endif
-
- /* If SRAT has not registered memory, register it now */
- if (find_max_pfn_with_active_regions() == 0) {
- for_each_online_node(nid) {
- if (node_has_online_mem(nid))
- add_active_range(nid, node_start_pfn[nid],
- node_end_pfn[nid]);
- }
- }
-
- free_area_init_nodes(max_zone_pfns);
- return;
-}
-
-void __init set_highmem_pages_init(int bad_ppro)
+void __init set_highmem_pages_init(void)
{
#ifdef CONFIG_HIGHMEM
struct zone *zone;
- struct page *page;
+ int nid;
for_each_zone(zone) {
- unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+ unsigned long zone_start_pfn, zone_end_pfn;
if (!is_highmem(zone))
continue;
@@ -434,16 +404,12 @@ void __init set_highmem_pages_init(int bad_ppro)
zone_start_pfn = zone->zone_start_pfn;
zone_end_pfn = zone_start_pfn + zone->spanned_pages;
- printk("Initializing %s for node %d (%08lx:%08lx)\n",
- zone->name, zone_to_nid(zone),
- zone_start_pfn, zone_end_pfn);
+ nid = zone_to_nid(zone);
+ printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+ zone->name, nid, zone_start_pfn, zone_end_pfn);
- for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
- if (!pfn_valid(node_pfn))
- continue;
- page = pfn_to_page(node_pfn);
- add_one_highpage_init(page, node_pfn, bad_ppro);
- }
+ add_highpages_with_active_regions(nid, zone_start_pfn,
+ zone_end_pfn);
}
totalram_pages += totalhigh_pages;
#endif
@@ -476,3 +442,4 @@ int memory_add_physaddr_to_nid(u64 addr)
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
+
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2c24bea92c6..0bb0caed897 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = {
{ 0, "User Space" },
#ifdef CONFIG_X86_64
{ 0x8000000000000000UL, "Kernel Space" },
- { 0xffff810000000000UL, "Low Kernel Mapping" },
+ { PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
{ __START_KERNEL_map, "High Kernel Mapping" },
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0a778e3c43e..455f3fe67b4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -66,11 +66,7 @@ static inline int notify_page_fault(struct pt_regs *regs)
int ret = 0;
/* kprobe_running() needs smp_processor_id() */
-#ifdef CONFIG_X86_32
if (!user_mode_vm(regs)) {
-#else
- if (!user_mode(regs)) {
-#endif
preempt_disable();
if (kprobe_running() && kprobe_fault_handler(regs, 14))
ret = 1;
@@ -407,11 +403,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(KERN_CONT "NULL pointer dereference");
else
printk(KERN_CONT "paging request");
-#ifdef CONFIG_X86_32
- printk(KERN_CONT " at %08lx\n", address);
-#else
- printk(KERN_CONT " at %016lx\n", address);
-#endif
+ printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP:");
printk_address(regs->ip, 1);
dump_pagetable(address);
@@ -813,14 +805,10 @@ bad_area_nosemaphore:
if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
printk_ratelimit()) {
printk(
-#ifdef CONFIG_X86_32
- "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
-#else
- "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
-#endif
+ "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address, regs->ip,
- regs->sp, error_code);
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *) regs->ip, (void *) regs->sp, error_code);
print_vma_addr(" in ", regs->ip);
printk("\n");
}
@@ -928,14 +916,7 @@ LIST_HEAD(pgd_list);
void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
- /*
- * Note that races in the updates of insync and start aren't
- * problematic: insync can only get set bits added, and updates to
- * start are only improving performance (without affecting correctness
- * if undone).
- */
- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
- static unsigned long start = TASK_SIZE;
+ unsigned long start = VMALLOC_START & PGDIR_MASK;
unsigned long address;
if (SHARED_KERNEL_PMD)
@@ -943,56 +924,38 @@ void vmalloc_sync_all(void)
BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
- if (!test_bit(pgd_index(address), insync)) {
- unsigned long flags;
- struct page *page;
-
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- if (!vmalloc_sync_one(page_address(page),
- address))
- break;
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- if (!page)
- set_bit(pgd_index(address), insync);
+ unsigned long flags;
+ struct page *page;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (!vmalloc_sync_one(page_address(page),
+ address))
+ break;
}
- if (address == start && test_bit(pgd_index(address), insync))
- start = address + PGDIR_SIZE;
+ spin_unlock_irqrestore(&pgd_lock, flags);
}
#else /* CONFIG_X86_64 */
- /*
- * Note that races in the updates of insync and start aren't
- * problematic: insync can only get set bits added, and updates to
- * start are only improving performance (without affecting correctness
- * if undone).
- */
- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
- static unsigned long start = VMALLOC_START & PGDIR_MASK;
+ unsigned long start = VMALLOC_START & PGDIR_MASK;
unsigned long address;
for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
- if (!test_bit(pgd_index(address), insync)) {
- const pgd_t *pgd_ref = pgd_offset_k(address);
- unsigned long flags;
- struct page *page;
-
- if (pgd_none(*pgd_ref))
- continue;
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- pgd_t *pgd;
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- set_bit(pgd_index(address), insync);
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ unsigned long flags;
+ struct page *page;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
}
- if (address == start)
- start = address + PGDIR_SIZE;
+ spin_unlock_irqrestore(&pgd_lock, flags);
}
#endif
}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f96eca21ad8..9689a5138e6 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
unsigned int __VMALLOC_RESERVE = 128 << 20;
+unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -57,6 +58,27 @@ unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
+
+static unsigned long __initdata table_start;
+static unsigned long __meminitdata table_end;
+static unsigned long __meminitdata table_top;
+
+static int __initdata after_init_bootmem;
+
+static __init void *alloc_low_page(unsigned long *phys)
+{
+ unsigned long pfn = table_end++;
+ void *adr;
+
+ if (pfn >= table_top)
+ panic("alloc_low_page: ran out of memory");
+
+ adr = __va(pfn * PAGE_SIZE);
+ memset(adr, 0, PAGE_SIZE);
+ *phys = pfn * PAGE_SIZE;
+ return adr;
+}
+
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
@@ -68,9 +90,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
+ unsigned long phys;
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-
+ if (after_init_bootmem)
+ pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ else
+ pmd_table = (pmd_t *)alloc_low_page(&phys);
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
@@ -92,12 +117,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = NULL;
+ if (after_init_bootmem) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
+ page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
- if (!page_table) {
- page_table =
+ if (!page_table)
+ page_table =
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ } else {
+ unsigned long phys;
+ page_table = (pte_t *)alloc_low_page(&phys);
}
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -155,38 +184,44 @@ static inline int is_kernel_text(unsigned long addr)
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ int use_pse)
{
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
+ unsigned pages_2m = 0, pages_4k = 0;
- pgd_idx = pgd_index(PAGE_OFFSET);
- pgd = pgd_base + pgd_idx;
- pfn = 0;
+ if (!cpu_has_pse)
+ use_pse = 0;
+ pfn = start_pfn;
+ pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pgd = pgd_base + pgd_idx;
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
pmd = one_md_table_init(pgd);
- if (pfn >= max_low_pfn)
- continue;
- for (pmd_idx = 0;
- pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+ if (pfn >= end_pfn)
+ continue;
+#ifdef CONFIG_X86_PAE
+ pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pmd += pmd_idx;
+#else
+ pmd_idx = 0;
+#endif
+ for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
pmd++, pmd_idx++) {
unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
/*
* Map with big pages if possible, otherwise
* create normal page tables:
- *
- * Don't use a large page for the first 2/4MB of memory
- * because there are often fixed size MTRRs in there
- * and overlapping MTRRs into large pages can cause
- * slowdowns.
*/
- if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
+ if (use_pse) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
@@ -197,34 +232,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
is_kernel_text(addr2))
prot = PAGE_KERNEL_LARGE_EXEC;
+ pages_2m++;
set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
- max_pfn_mapped = pfn;
continue;
}
pte = one_page_table_init(pmd);
- for (pte_ofs = 0;
- pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+ pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pte += pte_ofs;
+ for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
pgprot_t prot = PAGE_KERNEL;
if (is_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
+ pages_4k++;
set_pte(pte, pfn_pte(pfn, prot));
}
- max_pfn_mapped = pfn;
}
}
-}
-
-static inline int page_kills_ppro(unsigned long pagenr)
-{
- if (pagenr >= 0x70000 && pagenr <= 0x7003F)
- return 1;
- return 0;
+ update_page_count(PG_LEVEL_2M, pages_2m);
+ update_page_count(PG_LEVEL_4K, pages_4k);
}
/*
@@ -287,29 +318,62 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
pkmap_page_table = pte;
}
-void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+static void __init add_one_highpage_init(struct page *page, int pfn)
{
- if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
- } else
- SetPageReserved(page);
+ ClearPageReserved(page);
+ init_page_count(page);
+ __free_page(page);
+ totalhigh_pages++;
}
-#ifndef CONFIG_NUMA
-static void __init set_highmem_pages_init(int bad_ppro)
+struct add_highpages_data {
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+static int __init add_highpages_work_fn(unsigned long start_pfn,
+ unsigned long end_pfn, void *datax)
{
- int pfn;
+ int node_pfn;
+ struct page *page;
+ unsigned long final_start_pfn, final_end_pfn;
+ struct add_highpages_data *data;
- for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
- /*
- * Holes under sparsemem might not have no mem_map[]:
- */
- if (pfn_valid(pfn))
- add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+ data = (struct add_highpages_data *)datax;
+
+ final_start_pfn = max(start_pfn, data->start_pfn);
+ final_end_pfn = min(end_pfn, data->end_pfn);
+ if (final_start_pfn >= final_end_pfn)
+ return 0;
+
+ for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
+ node_pfn++) {
+ if (!pfn_valid(node_pfn))
+ continue;
+ page = pfn_to_page(node_pfn);
+ add_one_highpage_init(page, node_pfn);
}
+
+ return 0;
+
+}
+
+void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct add_highpages_data data;
+
+ data.start_pfn = start_pfn;
+ data.end_pfn = end_pfn;
+
+ work_with_active_regions(nid, add_highpages_work_fn, &data);
+}
+
+#ifndef CONFIG_NUMA
+static void __init set_highmem_pages_init(void)
+{
+ add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
+
totalram_pages += totalhigh_pages;
}
#endif /* !CONFIG_NUMA */
@@ -317,14 +381,9 @@ static void __init set_highmem_pages_init(int bad_ppro)
#else
# define kmap_init() do { } while (0)
# define permanent_kmaps_init(pgd_base) do { } while (0)
-# define set_highmem_pages_init(bad_ppro) do { } while (0)
+# define set_highmem_pages_init() do { } while (0)
#endif /* CONFIG_HIGHMEM */
-pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
-EXPORT_SYMBOL(__PAGE_KERNEL);
-
-pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
-
void __init native_pagetable_setup_start(pgd_t *base)
{
unsigned long pfn, va;
@@ -380,27 +439,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
-static void __init pagetable_init(void)
+static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
{
- pgd_t *pgd_base = swapper_pg_dir;
unsigned long vaddr, end;
- paravirt_pagetable_setup_start(pgd_base);
-
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __PAGE_KERNEL |= _PAGE_GLOBAL;
- __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
- }
-
- kernel_physical_mapping_init(pgd_base);
- remap_numa_kva();
-
/*
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
@@ -410,6 +452,13 @@ static void __init pagetable_init(void)
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
early_ioremap_reset();
+}
+
+static void __init pagetable_init(void)
+{
+ pgd_t *pgd_base = swapper_pg_dir;
+
+ paravirt_pagetable_setup_start(pgd_base);
permanent_kmaps_init(pgd_base);
@@ -456,7 +505,7 @@ void zap_low_mappings(void)
int nx_enabled;
-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
#ifdef CONFIG_X86_PAE
@@ -509,27 +558,318 @@ static void __init set_nx(void)
}
#endif
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
/*
- * paging_init() sets up the page tables - note that the first 8MB are
- * already mapped by head.S.
- *
- * This routines also unmaps the page at virtual kernel address 0, so
- * that we can trap those pesky NULL-reference errors in the kernel.
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
*/
-void __init paging_init(void)
+static int __init parse_highmem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ return 0;
+}
+early_param("highmem", parse_highmem);
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
{
+ /* it could update max_pfn */
+
+ /* max_low_pfn is 0, we already have early_res support */
+
+ max_low_pfn = max_pfn;
+ if (max_low_pfn > MAXMEM_PFN) {
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ printk(KERN_WARNING "only %luMB highmem pages "
+ "available, ignoring highmem size of %uMB.\n",
+ pages_to_mb(max_pfn - MAXMEM_PFN),
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn = MAXMEM_PFN;
+#ifndef CONFIG_HIGHMEM
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n",
+ MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING
+ "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING "Warning only 4GB will be used."
+ "Use a HIGHMEM64G enabled kernel.\n");
+ }
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+ } else {
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+ printk(KERN_ERR "highmem size specified (%uMB) is "
+ "bigger than pages available (%luMB)!.\n",
+ pages_to_mb(highmem_pages),
+ pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn - highmem_pages <
+ 64*1024*1024/PAGE_SIZE){
+ printk(KERN_ERR "highmem size %uMB results in "
+ "smaller than 64MB lowmem, ignoring it.\n"
+ , pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem"
+ " kernel!\n");
+#endif
+ }
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init initmem_init(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+#ifdef CONFIG_HIGHMEM
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > max_low_pfn)
+ highstart_pfn = max_low_pfn;
+ memory_present(0, 0, highend_pfn);
+ e820_register_active_regions(0, 0, highend_pfn);
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
+ num_physpages = highend_pfn;
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ memory_present(0, 0, max_low_pfn);
+ e820_register_active_regions(0, 0, max_low_pfn);
+ num_physpages = max_low_pfn;
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+ max_mapnr = num_physpages;
+#endif
+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ pages_to_mb(max_low_pfn));
+
+ setup_bootmem_allocator();
+}
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+static void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ max_zone_pfns[ZONE_DMA] =
+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+
+void __init setup_bootmem_allocator(void)
+{
+ int i;
+ unsigned long bootmap_size, bootmap;
+ /*
+ * Initialize the boot-time allocator (with low memory only):
+ */
+ bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+ max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+ PAGE_SIZE);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
+
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
+ min_low_pfn, max_low_pfn);
+ printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+ printk(KERN_INFO " low ram: %08lx - %08lx\n",
+ min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " bootmap %08lx - %08lx\n",
+ bootmap, bootmap + bootmap_size);
+ for_each_online_node(i)
+ free_bootmem_with_active_regions(i, max_low_pfn);
+ early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
+
+ after_init_bootmem = 1;
+}
+
+static void __init find_early_table_space(unsigned long end)
+{
+ unsigned long puds, pmds, ptes, tables, start;
+
+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = PAGE_ALIGN(puds * sizeof(pud_t));
+
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+ tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
+
+ if (cpu_has_pse) {
+ unsigned long extra;
+
+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+ extra += PMD_SIZE;
+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ tables += PAGE_ALIGN(ptes * sizeof(pte_t));
+
+ /* for fixmap */
+ tables += PAGE_SIZE * 2;
+
+ /*
+ * RED-PEN putting page tables only on node 0 could
+ * cause a hotspot and fill up ZONE_DMA. The page tables
+ * need roughly 0.5KB per GB.
+ */
+ start = 0x7000;
+ table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+ tables, PAGE_SIZE);
+ if (table_start == -1UL)
+ panic("Cannot find space for the kernel page tables");
+
+ table_start >>= PAGE_SHIFT;
+ table_end = table_start;
+ table_top = table_start + (tables>>PAGE_SHIFT);
+
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, table_start << PAGE_SHIFT,
+ (table_start << PAGE_SHIFT) + tables);
+}
+
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ pgd_t *pgd_base = swapper_pg_dir;
+ unsigned long start_pfn, end_pfn;
+ unsigned long big_page_start;
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ */
+ if (!after_init_bootmem)
+ find_early_table_space(end);
+
#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
printk(KERN_INFO "NX (Execute Disable) protection: active\n");
#endif
- pagetable_init();
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ big_page_start = PMD_SIZE;
+
+ if (start < big_page_start) {
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
+ } else {
+ /* head is not big page alignment ? */
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ }
+ if (start_pfn < end_pfn)
+ kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
+
+ /* big page range */
+ start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < (big_page_start >> PAGE_SHIFT))
+ start_pfn = big_page_start >> PAGE_SHIFT;
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn)
+ kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
+ cpu_has_pse);
+
+ /* tail is not big page alignment ? */
+ start_pfn = end_pfn;
+ if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
+ end_pfn = end >> PAGE_SHIFT;
+ if (start_pfn < end_pfn)
+ kernel_physical_mapping_init(pgd_base, start_pfn,
+ end_pfn, 0);
+ }
+
+ early_ioremap_page_table_range_init(pgd_base);
load_cr3(swapper_pg_dir);
__flush_tlb_all();
+ if (!after_init_bootmem)
+ reserve_early(table_start << PAGE_SHIFT,
+ table_end << PAGE_SHIFT, "PGTABLE");
+
+ return end >> PAGE_SHIFT;
+}
+
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routines also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
+ pagetable_init();
+
+ __flush_tlb_all();
+
kmap_init();
+
+ /*
+ * NOTE: at this point the bootmem allocator is fully available.
+ */
+ sparse_init();
+ zone_sizes_init();
+
+ paravirt_post_allocator_init();
}
/*
@@ -564,24 +904,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
void __init mem_init(void)
{
int codesize, reservedpages, datasize, initsize;
- int tmp, bad_ppro;
+ int tmp;
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
- bad_ppro = ppro_with_ram_bug();
-
-#ifdef CONFIG_HIGHMEM
- /* check that fixmap and pkmap do not overlap */
- if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
- printk(KERN_ERR
- "fixmap and kmap areas overlap - this will crash\n");
- printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
- PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
- FIXADDR_START);
- BUG();
- }
-#endif
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
@@ -593,7 +920,7 @@ void __init mem_init(void)
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
- set_highmem_pages_init(bad_ppro);
+ set_highmem_pages_init();
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -614,7 +941,6 @@ void __init mem_init(void)
(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
);
-#if 1 /* double-sanity-check paranoia */
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
@@ -655,7 +981,6 @@ void __init mem_init(void)
#endif
BUG_ON(VMALLOC_START > VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
-#endif /* double-sanity-check paranoia */
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
@@ -788,3 +1113,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
free_init_pages("initrd memory", start, end);
}
#endif
+
+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ int flags)
+{
+ return reserve_bootmem(phys, len, flags);
+}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 17c0a6138a5..27de2435e00 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -18,6 +18,7 @@
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
+#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
@@ -47,6 +48,14 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
static unsigned long dma_reserve __initdata;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -135,26 +144,17 @@ static __init void *spp_getpage(void)
return ptr;
}
-static __init void
-set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
+void
+set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
- pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte, new_pte;
-
- pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
+ pte_t *pte;
- pgd = pgd_offset_k(vaddr);
- if (pgd_none(*pgd)) {
- printk(KERN_ERR
- "PGD FIXMAP MISSING, it should be setup in head.S!\n");
- return;
- }
- pud = pud_offset(pgd, vaddr);
+ pud = pud_page + pud_index(vaddr);
if (pud_none(*pud)) {
pmd = (pmd_t *) spp_getpage();
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+ pud_populate(&init_mm, pud, pmd);
if (pmd != pmd_offset(pud, 0)) {
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
pmd, pmd_offset(pud, 0));
@@ -164,13 +164,12 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
pmd = pmd_offset(pud, vaddr);
if (pmd_none(*pmd)) {
pte = (pte_t *) spp_getpage();
- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+ pmd_populate_kernel(&init_mm, pmd, pte);
if (pte != pte_offset_kernel(pmd, 0)) {
printk(KERN_ERR "PAGETABLE BUG #02!\n");
return;
}
}
- new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
pte = pte_offset_kernel(pmd, vaddr);
if (!pte_none(*pte) && pte_val(new_pte) &&
@@ -185,6 +184,64 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
__flush_tlb_one(vaddr);
}
+void
+set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+ pgd_t *pgd;
+ pud_t *pud_page;
+
+ pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
+
+ pgd = pgd_offset_k(vaddr);
+ if (pgd_none(*pgd)) {
+ printk(KERN_ERR
+ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
+ return;
+ }
+ pud_page = (pud_t*)pgd_page_vaddr(*pgd);
+ set_pte_vaddr_pud(pud_page, vaddr, pteval);
+}
+
+/*
+ * Create large page table mappings for a range of physical addresses.
+ */
+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
+ pgprot_t prot)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
+ for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
+ pgd = pgd_offset_k((unsigned long)__va(phys));
+ if (pgd_none(*pgd)) {
+ pud = (pud_t *) spp_getpage();
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+ _PAGE_USER));
+ }
+ pud = pud_offset(pgd, (unsigned long)__va(phys));
+ if (pud_none(*pud)) {
+ pmd = (pmd_t *) spp_getpage();
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+ _PAGE_USER));
+ }
+ pmd = pmd_offset(pud, phys);
+ BUG_ON(!pmd_none(*pmd));
+ set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
+ }
+}
+
+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
+{
+ __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
+}
+
+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
+{
+ __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
+}
+
/*
* The head.S code sets up the kernel high mapping:
*
@@ -213,20 +270,9 @@ void __init cleanup_highmap(void)
}
}
-/* NOTE: this is meant to be run only at boot */
-void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
-{
- unsigned long address = __fix_to_virt(idx);
-
- if (idx >= __end_of_fixed_addresses) {
- printk(KERN_ERR "Invalid __set_fixmap\n");
- return;
- }
- set_pte_phys(address, phys, prot);
-}
-
static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
+static unsigned long __meminitdata table_top;
static __meminit void *alloc_low_page(unsigned long *phys)
{
@@ -240,7 +286,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
return adr;
}
- if (pfn >= end_pfn)
+ if (pfn >= table_top)
panic("alloc_low_page: ran out of memory");
adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -257,65 +303,61 @@ static __meminit void unmap_low_page(void *adr)
early_iounmap(adr, PAGE_SIZE);
}
-/* Must run before zap_low_mappings */
-__meminit void *early_ioremap(unsigned long addr, unsigned long size)
+static unsigned long __meminit
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
{
- pmd_t *pmd, *last_pmd;
- unsigned long vaddr;
- int i, pmds;
+ unsigned pages = 0;
+ unsigned long last_map_addr = end;
+ int i;
+
+ pte_t *pte = pte_page + pte_index(addr);
- pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
- vaddr = __START_KERNEL_map;
- pmd = level2_kernel_pgt;
- last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
+ for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
- for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
- for (i = 0; i < pmds; i++) {
- if (pmd_present(pmd[i]))
- goto continue_outer_loop;
+ if (addr >= end) {
+ if (!after_bootmem) {
+ for(; i < PTRS_PER_PTE; i++, pte++)
+ set_pte(pte, __pte(0));
+ }
+ break;
}
- vaddr += addr & ~PMD_MASK;
- addr &= PMD_MASK;
- for (i = 0; i < pmds; i++, addr += PMD_SIZE)
- set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- __flush_tlb_all();
+ if (pte_val(*pte))
+ continue;
- return (void *)vaddr;
-continue_outer_loop:
- ;
+ if (0)
+ printk(" pte=%p addr=%lx pte=%016lx\n",
+ pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
+ set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
+ last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
+ pages++;
}
- printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
+ update_page_count(PG_LEVEL_4K, pages);
- return NULL;
+ return last_map_addr;
}
-/*
- * To avoid virtual aliases later:
- */
-__meminit void early_iounmap(void *addr, unsigned long size)
+static unsigned long __meminit
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
{
- unsigned long vaddr;
- pmd_t *pmd;
- int i, pmds;
-
- vaddr = (unsigned long)addr;
- pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
- pmd = level2_kernel_pgt + pmd_index(vaddr);
+ pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
- for (i = 0; i < pmds; i++)
- pmd_clear(pmd + i);
-
- __flush_tlb_all();
+ return phys_pte_init(pte, address, end);
}
static unsigned long __meminit
-phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
+ unsigned long page_size_mask)
{
+ unsigned long pages = 0;
+ unsigned long last_map_addr = end;
+
int i = pmd_index(address);
for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
+ unsigned long pte_phys;
pmd_t *pmd = pmd_page + pmd_index(address);
+ pte_t *pte;
if (address >= end) {
if (!after_bootmem) {
@@ -325,31 +367,50 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
break;
}
- if (pmd_val(*pmd))
+ if (pmd_val(*pmd)) {
+ if (!pmd_large(*pmd))
+ last_map_addr = phys_pte_update(pmd, address,
+ end);
+ continue;
+ }
+
+ if (page_size_mask & (1<<PG_LEVEL_2M)) {
+ pages++;
+ set_pte((pte_t *)pmd,
+ pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ last_map_addr = (address & PMD_MASK) + PMD_SIZE;
continue;
+ }
- set_pte((pte_t *)pmd,
- pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ pte = alloc_low_page(&pte_phys);
+ last_map_addr = phys_pte_init(pte, address, end);
+ unmap_low_page(pte);
+
+ pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
}
- return address;
+ update_page_count(PG_LEVEL_2M, pages);
+ return last_map_addr;
}
static unsigned long __meminit
-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
+ unsigned long page_size_mask)
{
pmd_t *pmd = pmd_offset(pud, 0);
unsigned long last_map_addr;
spin_lock(&init_mm.page_table_lock);
- last_map_addr = phys_pmd_init(pmd, address, end);
+ last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
spin_unlock(&init_mm.page_table_lock);
__flush_tlb_all();
return last_map_addr;
}
static unsigned long __meminit
-phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
+ unsigned long page_size_mask)
{
+ unsigned long pages = 0;
unsigned long last_map_addr = end;
int i = pud_index(addr);
@@ -369,11 +430,13 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
if (pud_val(*pud)) {
if (!pud_large(*pud))
- last_map_addr = phys_pmd_update(pud, addr, end);
+ last_map_addr = phys_pmd_update(pud, addr, end,
+ page_size_mask);
continue;
}
- if (direct_gbpages) {
+ if (page_size_mask & (1<<PG_LEVEL_1G)) {
+ pages++;
set_pte((pte_t *)pud,
pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
@@ -383,27 +446,50 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
pmd = alloc_low_page(&pmd_phys);
spin_lock(&init_mm.page_table_lock);
- set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
- last_map_addr = phys_pmd_init(pmd, addr, end);
+ last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+ unmap_low_page(pmd);
+ pud_populate(&init_mm, pud, __va(pmd_phys));
spin_unlock(&init_mm.page_table_lock);
- unmap_low_page(pmd);
}
__flush_tlb_all();
+ update_page_count(PG_LEVEL_1G, pages);
- return last_map_addr >> PAGE_SHIFT;
+ return last_map_addr;
+}
+
+static unsigned long __meminit
+phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
+ unsigned long page_size_mask)
+{
+ pud_t *pud;
+
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
+
+ return phys_pud_init(pud, addr, end, page_size_mask);
}
static void __init find_early_table_space(unsigned long end)
{
- unsigned long puds, pmds, tables, start;
+ unsigned long puds, pmds, ptes, tables, start;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
- if (!direct_gbpages) {
+ if (direct_gbpages) {
+ unsigned long extra;
+ extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+ pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+ } else
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
- }
+ tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+ if (cpu_has_pse) {
+ unsigned long extra;
+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
/*
* RED-PEN putting page tables only on node 0 could
@@ -417,10 +503,10 @@ static void __init find_early_table_space(unsigned long end)
table_start >>= PAGE_SHIFT;
table_end = table_start;
+ table_top = table_start + (tables >> PAGE_SHIFT);
- early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT,
- (table_start << PAGE_SHIFT) + tables);
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
}
static void __init init_gbpages(void)
@@ -431,7 +517,7 @@ static void __init init_gbpages(void)
direct_gbpages = 0;
}
-#ifdef CONFIG_MEMTEST_BOOTPARAM
+#ifdef CONFIG_MEMTEST
static void __init memtest(unsigned long start_phys, unsigned long size,
unsigned pattern)
@@ -493,7 +579,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
}
-static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
+/* default is disabled */
+static int memtest_pattern __initdata;
static int __init parse_memtest(char *arg)
{
@@ -542,15 +629,85 @@ static void __init early_memtest(unsigned long start, unsigned long end)
}
#endif
+static unsigned long __init kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
+{
+
+ unsigned long next, last_map_addr = end;
+
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
+
+ for (; start < end; start = next) {
+ pgd_t *pgd = pgd_offset_k(start);
+ unsigned long pud_phys;
+ pud_t *pud;
+
+ next = start + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pgd_val(*pgd)) {
+ last_map_addr = phys_pud_update(pgd, __pa(start),
+ __pa(end), page_size_mask);
+ continue;
+ }
+
+ if (after_bootmem)
+ pud = pud_offset(pgd, start & PGDIR_MASK);
+ else
+ pud = alloc_low_page(&pud_phys);
+
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+ page_size_mask);
+ unmap_low_page(pud);
+ pgd_populate(&init_mm, pgd_offset_k(start),
+ __va(pud_phys));
+ }
+
+ return last_map_addr;
+}
+
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+#define NR_RANGE_MR 5
+
+static int save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
+
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+ panic("run out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
+ }
+
+ return nr_range;
+}
+
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*/
-unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
{
- unsigned long next, last_map_addr = end;
- unsigned long start_phys = start, end_phys = end;
+ unsigned long last_map_addr = 0;
+ unsigned long page_size_mask = 0;
+ unsigned long start_pfn, end_pfn;
+
+ struct map_range mr[NR_RANGE_MR];
+ int nr_range, i;
printk(KERN_INFO "init_memory_mapping\n");
@@ -561,48 +718,115 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
* memory mapped. Unfortunately this is done currently before the
* nodes are discovered.
*/
- if (!after_bootmem) {
+ if (!after_bootmem)
init_gbpages();
- find_early_table_space(end);
+
+ if (direct_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (cpu_has_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = 0;
+
+ /* head if not big page alignment ?*/
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* big page (2M) range*/
+ start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
+ end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+
+ /* big page (1G) range */
+ start_pfn = end_pfn;
+ end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+
+ /* tail is not big page (1G) alignment */
+ start_pfn = end_pfn;
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+
+ /* tail is not big page (2M) alignment */
+ start_pfn = end_pfn;
+ end_pfn = end>>PAGE_SHIFT;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* try to merge same page size and continuous */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof (struct map_range));
+ mr[i].start = old_start;
+ nr_range--;
}
- start = (unsigned long)__va(start);
- end = (unsigned long)__va(end);
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+ mr[i].start, mr[i].end,
+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
- for (; start < end; start = next) {
- pgd_t *pgd = pgd_offset_k(start);
- unsigned long pud_phys;
- pud_t *pud;
-
- if (after_bootmem)
- pud = pud_offset(pgd, start & PGDIR_MASK);
- else
- pud = alloc_low_page(&pud_phys);
+ if (!after_bootmem)
+ find_early_table_space(end);
- next = start + PGDIR_SIZE;
- if (next > end)
- next = end;
- last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
- if (!after_bootmem)
- set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
- unmap_low_page(pud);
- }
+ for (i = 0; i < nr_range; i++)
+ last_map_addr = kernel_physical_mapping_init(
+ mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
if (!after_bootmem)
mmu_cr4_features = read_cr4();
__flush_tlb_all();
- if (!after_bootmem)
+ if (!after_bootmem && table_end > table_start)
reserve_early(table_start << PAGE_SHIFT,
table_end << PAGE_SHIFT, "PGTABLE");
+ printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
+ last_map_addr, end);
+
if (!after_bootmem)
- early_memtest(start_phys, end_phys);
+ early_memtest(start, end);
- return last_map_addr;
+ return last_map_addr >> PAGE_SHIFT;
}
#ifndef CONFIG_NUMA
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long bootmap_size, bootmap;
+
+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
+ PAGE_SIZE);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
+ 0, end_pfn);
+ e820_register_active_regions(0, start_pfn, end_pfn);
+ free_bootmem_with_active_regions(0, end_pfn);
+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
+ reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
+}
+
void __init paging_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -610,9 +834,9 @@ void __init paging_init(void)
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
- max_zone_pfns[ZONE_NORMAL] = end_pfn;
+ max_zone_pfns[ZONE_NORMAL] = max_pfn;
- memory_present(0, 0, end_pfn);
+ memory_present(0, 0, max_pfn);
sparse_init();
free_area_init_nodes(max_zone_pfns);
}
@@ -694,8 +918,8 @@ void __init mem_init(void)
#else
totalram_pages = free_all_bootmem();
#endif
- reservedpages = end_pfn - totalram_pages -
- absent_pages_in_range(0, end_pfn);
+ reservedpages = max_pfn - totalram_pages -
+ absent_pages_in_range(0, max_pfn);
after_bootmem = 1;
codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -714,7 +938,7 @@ void __init mem_init(void)
printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
"%ldk reserved, %ldk data, %ldk init)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
- end_pfn << (PAGE_SHIFT-10),
+ max_pfn << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
@@ -805,24 +1029,26 @@ void free_initrd_mem(unsigned long start, unsigned long end)
}
#endif
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ int flags)
{
#ifdef CONFIG_NUMA
int nid, next_nid;
+ int ret;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
- if (pfn >= end_pfn) {
+ if (pfn >= max_pfn) {
/*
* This can happen with kdump kernels when accessing
* firmware tables:
*/
if (pfn < max_pfn_mapped)
- return;
+ return -EFAULT;
- printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
+ printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
phys, len);
- return;
+ return -EFAULT;
}
/* Should check here against the e820 map to avoid double free */
@@ -830,9 +1056,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
nid = phys_to_nid(phys);
next_nid = phys_to_nid(phys + len - 1);
if (nid == next_nid)
- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem(phys, len, flags);
+
+ if (ret != 0)
+ return ret;
+
#else
reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif
@@ -841,6 +1071,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
dma_reserve += len / PAGE_SIZE;
set_dma_reserve(dma_reserve);
}
+
+ return 0;
}
int kern_addr_valid(unsigned long addr)
@@ -945,7 +1177,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
pmd_t *pmd;
for (; addr < end; addr = next) {
- next = pmd_addr_end(addr, end);
+ void *p = NULL;
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
@@ -955,33 +1187,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
if (!pud)
return -ENOMEM;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
- pte_t entry;
- void *p;
+ if (!cpu_has_pse) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ pmd = vmemmap_pmd_populate(pud, addr, node);
+
+ if (!pmd)
+ return -ENOMEM;
+
+ p = vmemmap_pte_populate(pmd, addr, node);
- p = vmemmap_alloc_block(PMD_SIZE, node);
if (!p)
return -ENOMEM;
- entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, __pmd(pte_val(entry)));
-
- /* check to see if we have contiguous blocks */
- if (p_end != p || node_start != node) {
- if (p_start)
- printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
- addr_start, addr_end-1, p_start, p_end-1, node_start);
- addr_start = addr;
- node_start = node;
- p_start = p;
- }
- addr_end = addr + PMD_SIZE;
- p_end = p + PMD_SIZE;
+ addr_end = addr + PAGE_SIZE;
+ p_end = p + PAGE_SIZE;
} else {
- vmemmap_verify((pte_t *)pmd, node, addr, next);
+ next = pmd_addr_end(addr, end);
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ pte_t entry;
+
+ p = vmemmap_alloc_block(PMD_SIZE, node);
+ if (!p)
+ return -ENOMEM;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, __pmd(pte_val(entry)));
+
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
+
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
+ } else
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
}
+
}
return 0;
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index e92aa461f4d..24c1d3c3018 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -146,7 +146,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
- if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+ if (is_ISA_range(phys_addr, last_addr))
return (__force void __iomem *)phys_to_virt(phys_addr);
/*
@@ -268,7 +268,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
{
/*
* Ideally, this should be:
- * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
+ * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
*
* Till we fix all X drivers to use ioremap_wc(), we will use
* UC MINUS.
@@ -292,7 +292,7 @@ EXPORT_SYMBOL(ioremap_nocache);
*/
void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
{
- if (pat_wc_enabled)
+ if (pat_enabled)
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
__builtin_return_address(0));
else
@@ -307,6 +307,29 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
}
EXPORT_SYMBOL(ioremap_cache);
+static void __iomem *ioremap_default(resource_size_t phys_addr,
+ unsigned long size)
+{
+ unsigned long flags;
+ void *ret;
+ int err;
+
+ /*
+ * - WB for WB-able memory and no other conflicting mappings
+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
+ * - Inherit from confliting mappings otherwise
+ */
+ err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
+ if (err < 0)
+ return NULL;
+
+ ret = (void *) __ioremap_caller(phys_addr, size, flags,
+ __builtin_return_address(0));
+
+ free_memtype(phys_addr, phys_addr + size);
+ return (void __iomem *)ret;
+}
+
/**
* iounmap - Free a IO remapping
* @addr: virtual address from ioremap_*
@@ -325,8 +348,8 @@ void iounmap(volatile void __iomem *addr)
* vm_area and by simply returning an address into the kernel mapping
* of ISA space. So handle that here.
*/
- if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
- addr < phys_to_virt(ISA_END_ADDRESS))
+ if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
+ (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
return;
addr = (volatile void __iomem *)
@@ -341,7 +364,7 @@ void iounmap(volatile void __iomem *addr)
cpa takes care of the direct mappings. */
read_lock(&vmlist_lock);
for (p = vmlist; p; p = p->next) {
- if (p->addr == addr)
+ if (p->addr == (void __force *)addr)
break;
}
read_unlock(&vmlist_lock);
@@ -355,7 +378,7 @@ void iounmap(volatile void __iomem *addr)
free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
/* Finally remove it */
- o = remove_vm_area((void *)addr);
+ o = remove_vm_area((void __force *)addr);
BUG_ON(p != o || o == NULL);
kfree(p);
}
@@ -374,7 +397,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
if (page_is_ram(start >> PAGE_SHIFT))
return __va(phys);
- addr = (void *)ioremap(start, PAGE_SIZE);
+ addr = (void __force *)ioremap_default(start, PAGE_SIZE);
if (addr)
addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
@@ -390,8 +413,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
return;
}
-#ifdef CONFIG_X86_32
-
int __initdata early_ioremap_debug;
static int __init early_ioremap_debug_setup(char *str)
@@ -403,8 +424,7 @@ static int __init early_ioremap_debug_setup(char *str)
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
-static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
- __section(.bss.page_aligned);
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
@@ -493,10 +513,11 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
return;
}
pte = early_ioremap_pte(addr);
+
if (pgprot_val(flags))
set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
- pte_clear(NULL, addr, pte);
+ pte_clear(&init_mm, addr, pte);
__flush_tlb_one(addr);
}
@@ -634,5 +655,3 @@ void __this_fixmap_does_not_exist(void)
{
WARN_ON(1);
}
-
-#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 1f476e47784..41f1b5c00a1 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,6 +22,7 @@
#include <asm/numa.h>
#include <asm/mpspec.h>
#include <asm/apic.h>
+#include <asm/k8.h>
static __init int find_northbridge(void)
{
@@ -56,34 +57,33 @@ static __init void early_get_boot_cpu_id(void)
/*
* Find possible boot-time SMP configuration:
*/
+#ifdef CONFIG_X86_MPPARSE
early_find_smp_config();
+#endif
#ifdef CONFIG_ACPI
/*
* Read APIC information from ACPI tables.
*/
early_acpi_boot_init();
#endif
+#ifdef CONFIG_X86_MPPARSE
/*
* get boot-time SMP configuration:
*/
if (smp_found_config)
early_get_smp_config();
+#endif
early_init_lapic_mapping();
}
int __init k8_scan_nodes(unsigned long start, unsigned long end)
{
+ unsigned numnodes, cores, bits, apicid_base;
unsigned long prevbase;
struct bootnode nodes[8];
- int nodeid, i, nb;
unsigned char nodeids[8];
- int found = 0;
- u32 reg;
- unsigned numnodes;
- unsigned cores;
- unsigned bits;
- int j;
- unsigned apicid_base;
+ int i, j, nb, found = 0;
+ u32 nodeid, reg;
if (!early_pci_allowed())
return -1;
@@ -105,7 +105,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
prevbase = 0;
for (i = 0; i < 8; i++) {
unsigned long base, limit;
- u32 nodeid;
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -144,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
limit |= (1<<24)-1;
limit++;
- if (limit > end_pfn << PAGE_SHIFT)
- limit = end_pfn << PAGE_SHIFT;
+ if (limit > max_pfn << PAGE_SHIFT)
+ limit = max_pfn << PAGE_SHIFT;
if (limit <= base)
continue;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5..b432d578177 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -27,30 +27,17 @@
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
-bootmem_data_t plat_node_bdata[MAX_NUMNODES];
+static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
struct memnode memnode;
-#ifdef CONFIG_SMP
-int x86_cpu_to_node_map_init[NR_CPUS] = {
- [0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
-void *x86_cpu_to_node_map_early_ptr;
-EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
-#endif
-DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-
s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
-cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_to_cpumask_map);
-
int numa_off __initdata;
-unsigned long __initdata nodemap_addr;
-unsigned long __initdata nodemap_size;
+static unsigned long __initdata nodemap_addr;
+static unsigned long __initdata nodemap_size;
/*
* Given a shift value, try to populate memnodemap[]
@@ -99,7 +86,7 @@ static int __init allocate_cachealigned_memnodemap(void)
addr = 0x8000;
nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
- nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT,
+ nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
nodemap_size, L1_CACHE_BYTES);
if (nodemap_addr == -1UL) {
printk(KERN_ERR
@@ -192,7 +179,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
void __init setup_node_bootmem(int nodeid, unsigned long start,
unsigned long end)
{
- unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size;
+ unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
unsigned long bootmap_start, nodedata_phys;
void *bootmap;
const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
@@ -204,7 +191,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
start, end);
start_pfn = start >> PAGE_SHIFT;
- end_pfn = end >> PAGE_SHIFT;
+ last_pfn = end >> PAGE_SHIFT;
node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
SMP_CACHE_BYTES);
@@ -217,7 +204,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
- NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+ NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
/*
* Find a place for the bootmem map
@@ -226,14 +213,14 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
* early_node_mem will get that with find_e820_area instead
* of alloc_bootmem, that could clash with reserved range
*/
- bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
+ bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
nid = phys_to_nid(nodedata_phys);
if (nid == nodeid)
bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
else
bootmap_start = round_up(start, PAGE_SIZE);
/*
- * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like
+ * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
* to use that to align to PAGE_SIZE
*/
bootmap = early_node_mem(nodeid, bootmap_start, end,
@@ -248,7 +235,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
bootmap_start >> PAGE_SHIFT,
- start_pfn, end_pfn);
+ start_pfn, last_pfn);
printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +296,7 @@ void __init numa_init_array(void)
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
-char *cmdline __initdata;
+static char *cmdline __initdata;
/*
* Setups up nid to range from addr to addr + size. If the end
@@ -413,15 +400,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
}
/*
- * Sets up the system RAM area from start_pfn to end_pfn according to the
+ * Sets up the system RAM area from start_pfn to last_pfn according to the
* numa=fake command-line option.
*/
static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
{
u64 size, addr = start_pfn << PAGE_SHIFT;
- u64 max_addr = end_pfn << PAGE_SHIFT;
+ u64 max_addr = last_pfn << PAGE_SHIFT;
int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +514,7 @@ out:
}
#endif /* CONFIG_NUMA_EMU */
-void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
{
int i;
@@ -535,7 +522,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU
- if (cmdline && !numa_emulation(start_pfn, end_pfn))
+ if (cmdline && !numa_emulation(start_pfn, last_pfn))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
@@ -543,7 +530,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
#ifdef CONFIG_ACPI_NUMA
if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
- end_pfn << PAGE_SHIFT))
+ last_pfn << PAGE_SHIFT))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
@@ -551,7 +538,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
#ifdef CONFIG_K8_NUMA
if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
- end_pfn<<PAGE_SHIFT))
+ last_pfn<<PAGE_SHIFT))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
@@ -561,7 +548,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
start_pfn << PAGE_SHIFT,
- end_pfn << PAGE_SHIFT);
+ last_pfn << PAGE_SHIFT);
/* setup dummy node covering all memory */
memnode_shift = 63;
memnodemap = memnode.embedded_map;
@@ -570,29 +557,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
node_set(0, node_possible_map);
for (i = 0; i < NR_CPUS; i++)
numa_set_node(i, 0);
- /* cpumask_of_cpu() may not be available during early startup */
- memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
- cpu_set(0, node_to_cpumask_map[0]);
- e820_register_active_regions(0, start_pfn, end_pfn);
- setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
-}
-
-__cpuinit void numa_add_cpu(int cpu)
-{
- set_bit(cpu,
- (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-
-void __cpuinit numa_set_node(int cpu, int node)
-{
- int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
-
- if(cpu_to_node_map)
- cpu_to_node_map[cpu] = node;
- else if(per_cpu_offset(cpu))
- per_cpu(x86_cpu_to_node_map, cpu) = node;
- else
- Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
+ e820_register_active_regions(0, start_pfn, last_pfn);
+ setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
}
unsigned long __init numa_free_all_bootmem(void)
@@ -613,7 +579,7 @@ void __init paging_init(void)
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
- max_zone_pfns[ZONE_NORMAL] = end_pfn;
+ max_zone_pfns[ZONE_NORMAL] = max_pfn;
sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
@@ -641,6 +607,7 @@ static __init int numa_setup(char *opt)
}
early_param("numa", numa_setup);
+#ifdef CONFIG_NUMA
/*
* Setup early cpu_to_node.
*
@@ -652,14 +619,19 @@ early_param("numa", numa_setup);
* is already initialized in a round robin manner at numa_init_array,
* prior to this call, and this initialization is good enough
* for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
*/
void __init init_cpu_to_node(void)
{
- int i;
+ int cpu;
+ u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
- for (i = 0; i < NR_CPUS; i++) {
+ BUG_ON(cpu_to_apicid == NULL);
+
+ for_each_possible_cpu(cpu) {
int node;
- u16 apicid = x86_cpu_to_apicid_init[i];
+ u16 apicid = cpu_to_apicid[cpu];
if (apicid == BAD_APICID)
continue;
@@ -668,8 +640,9 @@ void __init init_cpu_to_node(void)
continue;
if (!node_online(node))
continue;
- numa_set_node(i, node);
+ numa_set_node(cpu, node);
}
}
+#endif
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 75f1b109aae..0dcd42eb94e 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,8 +1,8 @@
/*
* self test for change_page_attr.
*
- * Clears the global bit on random pages in the direct mapping, then reverts
- * and compares page tables forwards and afterwards.
+ * Clears the a test pte bit on random pages in the direct mapping,
+ * then reverts and compares page tables forwards and afterwards.
*/
#include <linux/bootmem.h>
#include <linux/kthread.h>
@@ -32,6 +32,13 @@ enum {
GPS = (1<<30)
};
+#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1)
+
+static int pte_testbit(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_UNUSED1;
+}
+
struct split_state {
long lpg, gpg, spg, exec;
long min_exec, max_exec;
@@ -165,15 +172,14 @@ static int pageattr_test(void)
continue;
}
- err = change_page_attr_clear(addr[i], len[i],
- __pgprot(_PAGE_GLOBAL));
+ err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT);
if (err < 0) {
printk(KERN_ERR "CPA %d failed %d\n", i, err);
failed++;
}
pte = lookup_address(addr[i], &level);
- if (!pte || pte_global(*pte) || pte_huge(*pte)) {
+ if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
pte ? (u64)pte_val(*pte) : 0ULL);
failed++;
@@ -198,14 +204,13 @@ static int pageattr_test(void)
failed++;
continue;
}
- err = change_page_attr_set(addr[i], len[i],
- __pgprot(_PAGE_GLOBAL));
+ err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT);
if (err < 0) {
printk(KERN_ERR "CPA reverting failed: %d\n", err);
failed++;
}
pte = lookup_address(addr[i], &level);
- if (!pte || !pte_global(*pte)) {
+ if (!pte || pte_testbit(*pte)) {
printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 57970f2935c..47f4e2e4a09 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -34,6 +34,41 @@ struct cpa_data {
unsigned force_split : 1;
};
+#ifdef CONFIG_PROC_FS
+static unsigned long direct_pages_count[PG_LEVEL_NUM];
+
+void update_page_count(int level, unsigned long pages)
+{
+ unsigned long flags;
+
+ /* Protect against CPA */
+ spin_lock_irqsave(&pgd_lock, flags);
+ direct_pages_count[level] += pages;
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+static void split_page_count(int level)
+{
+ direct_pages_count[level]--;
+ direct_pages_count[level - 1] += PTRS_PER_PTE;
+}
+
+int arch_report_meminfo(char *page)
+{
+ int n = sprintf(page, "DirectMap4k: %8lu\n"
+ "DirectMap2M: %8lu\n",
+ direct_pages_count[PG_LEVEL_4K],
+ direct_pages_count[PG_LEVEL_2M]);
+#ifdef CONFIG_X86_64
+ n += sprintf(page + n, "DirectMap1G: %8lu\n",
+ direct_pages_count[PG_LEVEL_1G]);
+#endif
+ return n;
+}
+#else
+static inline void split_page_count(int level) { }
+#endif
+
#ifdef CONFIG_X86_64
static inline unsigned long highmap_start_pfn(void)
@@ -501,6 +536,16 @@ static int split_large_page(pte_t *kpte, unsigned long address)
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
+ if (address >= (unsigned long)__va(0) &&
+ address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+ split_page_count(level);
+
+#ifdef CONFIG_X86_64
+ if (address >= (unsigned long)__va(1UL<<32) &&
+ address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
+ split_page_count(level);
+#endif
+
/*
* Install the new, split up pagetable. Important details here:
*
@@ -614,15 +659,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
struct cpa_data alias_cpa;
int ret = 0;
- if (cpa->pfn > max_pfn_mapped)
+ if (cpa->pfn >= max_pfn_mapped)
return 0;
+#ifdef CONFIG_X86_64
+ if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
+ return 0;
+#endif
/*
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (!within(cpa->vaddr, PAGE_OFFSET,
- PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+ if (!(within(cpa->vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
+#ifdef CONFIG_X86_64
+ || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
+#endif
+ )) {
alias_cpa = *cpa;
alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
@@ -806,7 +860,7 @@ int _set_memory_wc(unsigned long addr, int numpages)
int set_memory_wc(unsigned long addr, int numpages)
{
- if (!pat_wc_enabled)
+ if (!pat_enabled)
return set_memory_uc(addr, numpages);
if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 06b7a1c90fb..d4585077977 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -26,11 +26,11 @@
#include <asm/io.h>
#ifdef CONFIG_X86_PAT
-int __read_mostly pat_wc_enabled = 1;
+int __read_mostly pat_enabled = 1;
void __cpuinit pat_disable(char *reason)
{
- pat_wc_enabled = 0;
+ pat_enabled = 0;
printk(KERN_INFO "%s\n", reason);
}
@@ -42,6 +42,19 @@ static int __init nopat(char *str)
early_param("nopat", nopat);
#endif
+
+static int debug_enable;
+static int __init pat_debug_setup(char *str)
+{
+ debug_enable = 1;
+ return 0;
+}
+__setup("debugpat", pat_debug_setup);
+
+#define dprintk(fmt, arg...) \
+ do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+
+
static u64 __read_mostly boot_pat_state;
enum {
@@ -53,24 +66,25 @@ enum {
PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
};
-#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
+#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
void pat_init(void)
{
u64 pat;
- if (!pat_wc_enabled)
+ if (!pat_enabled)
return;
/* Paranoia check. */
- if (!cpu_has_pat) {
- printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
+ if (!cpu_has_pat && boot_pat_state) {
/*
- * Panic if this happens on the secondary CPU, and we
+ * If this happens we are on a secondary CPU, but
* switched to PAT on the boot CPU. We have no way to
* undo PAT.
- */
- BUG_ON(boot_pat_state);
+ */
+ printk(KERN_ERR "PAT enabled, "
+ "but not supported by secondary CPU\n");
+ BUG();
}
/* Set PWT to Write-Combining. All other bits stay the same */
@@ -86,8 +100,8 @@ void pat_init(void)
* 011 UC _PAGE_CACHE_UC
* PAT bit unused
*/
- pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
- PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
/* Boot CPU check */
if (!boot_pat_state)
@@ -103,11 +117,11 @@ void pat_init(void)
static char *cattr_name(unsigned long flags)
{
switch (flags & _PAGE_CACHE_MASK) {
- case _PAGE_CACHE_UC: return "uncached";
- case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
- case _PAGE_CACHE_WB: return "write-back";
- case _PAGE_CACHE_WC: return "write-combining";
- default: return "broken";
+ case _PAGE_CACHE_UC: return "uncached";
+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_WB: return "write-back";
+ case _PAGE_CACHE_WC: return "write-combining";
+ default: return "broken";
}
}
@@ -145,47 +159,50 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
* The intersection is based on "Effective Memory Type" tables in IA-32
* SDM vol 3a
*/
-static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
- unsigned long *ret_prot)
+static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
{
- unsigned long pat_type;
- u8 mtrr_type;
-
- pat_type = prot & _PAGE_CACHE_MASK;
- prot &= (~_PAGE_CACHE_MASK);
-
- /*
- * We return the PAT request directly for types where PAT takes
- * precedence with respect to MTRR and for UC_MINUS.
- * Consistency checks with other PAT requests is done later
- * while going through memtype list.
- */
- if (pat_type == _PAGE_CACHE_WC) {
- *ret_prot = prot | _PAGE_CACHE_WC;
- return 0;
- } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
- *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
- return 0;
- } else if (pat_type == _PAGE_CACHE_UC) {
- *ret_prot = prot | _PAGE_CACHE_UC;
- return 0;
- }
-
/*
* Look for MTRR hint to get the effective type in case where PAT
* request is for WB.
*/
- mtrr_type = mtrr_type_lookup(start, end);
+ if (req_type == _PAGE_CACHE_WB) {
+ u8 mtrr_type;
+
+ mtrr_type = mtrr_type_lookup(start, end);
+ if (mtrr_type == MTRR_TYPE_UNCACHABLE)
+ return _PAGE_CACHE_UC;
+ if (mtrr_type == MTRR_TYPE_WRCOMB)
+ return _PAGE_CACHE_WC;
+ }
- if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
- *ret_prot = prot | _PAGE_CACHE_UC;
- } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
- *ret_prot = prot | _PAGE_CACHE_WC;
- } else {
- *ret_prot = prot | _PAGE_CACHE_WB;
+ return req_type;
+}
+
+static int chk_conflict(struct memtype *new, struct memtype *entry,
+ unsigned long *type)
+{
+ if (new->type != entry->type) {
+ if (type) {
+ new->type = entry->type;
+ *type = entry->type;
+ } else
+ goto conflict;
}
+ /* check overlaps with more than one entry in the list */
+ list_for_each_entry_continue(entry, &memtype_list, nd) {
+ if (new->end <= entry->start)
+ break;
+ else if (new->type != entry->type)
+ goto conflict;
+ }
return 0;
+
+ conflict:
+ printk(KERN_INFO "%s:%d conflicting memory types "
+ "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
+ new->end, cattr_name(new->type), cattr_name(entry->type));
+ return -EBUSY;
}
/*
@@ -198,37 +215,36 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
* req_type will have a special case value '-1', when requester want to inherit
* the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
*
- * If ret_type is NULL, function will return an error if it cannot reserve the
- * region with req_type. If ret_type is non-null, function will return
- * available type in ret_type in case of no error. In case of any error
+ * If new_type is NULL, function will return an error if it cannot reserve the
+ * region with req_type. If new_type is non-NULL, function will return
+ * available type in new_type in case of no error. In case of any error
* it will return a negative return value.
*/
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
- unsigned long *ret_type)
+ unsigned long *new_type)
{
- struct memtype *new_entry = NULL;
- struct memtype *parse;
+ struct memtype *new, *entry;
unsigned long actual_type;
+ struct list_head *where;
int err = 0;
- /* Only track when pat_wc_enabled */
- if (!pat_wc_enabled) {
+ BUG_ON(start >= end); /* end is exclusive */
+
+ if (!pat_enabled) {
/* This is identical to page table setting without PAT */
- if (ret_type) {
- if (req_type == -1) {
- *ret_type = _PAGE_CACHE_WB;
- } else {
- *ret_type = req_type;
- }
+ if (new_type) {
+ if (req_type == -1)
+ *new_type = _PAGE_CACHE_WB;
+ else
+ *new_type = req_type & _PAGE_CACHE_MASK;
}
return 0;
}
/* Low ISA region is always mapped WB in page table. No need to track */
- if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
- if (ret_type)
- *ret_type = _PAGE_CACHE_WB;
-
+ if (is_ISA_range(start, end - 1)) {
+ if (new_type)
+ *new_type = _PAGE_CACHE_WB;
return 0;
}
@@ -241,206 +257,92 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
*/
u8 mtrr_type = mtrr_type_lookup(start, end);
- if (mtrr_type == MTRR_TYPE_WRBACK) {
- req_type = _PAGE_CACHE_WB;
+ if (mtrr_type == MTRR_TYPE_WRBACK)
actual_type = _PAGE_CACHE_WB;
- } else {
- req_type = _PAGE_CACHE_UC_MINUS;
+ else
actual_type = _PAGE_CACHE_UC_MINUS;
- }
- } else {
- req_type &= _PAGE_CACHE_MASK;
- err = pat_x_mtrr_type(start, end, req_type, &actual_type);
- }
-
- if (err) {
- if (ret_type)
- *ret_type = actual_type;
-
- return -EINVAL;
- }
+ } else
+ actual_type = pat_x_mtrr_type(start, end,
+ req_type & _PAGE_CACHE_MASK);
- new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
- if (!new_entry)
+ new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!new)
return -ENOMEM;
- new_entry->start = start;
- new_entry->end = end;
- new_entry->type = actual_type;
+ new->start = start;
+ new->end = end;
+ new->type = actual_type;
- if (ret_type)
- *ret_type = actual_type;
+ if (new_type)
+ *new_type = actual_type;
spin_lock(&memtype_lock);
/* Search for existing mapping that overlaps the current range */
- list_for_each_entry(parse, &memtype_list, nd) {
- struct memtype *saved_ptr;
-
- if (parse->start >= end) {
- pr_debug("New Entry\n");
- list_add(&new_entry->nd, parse->nd.prev);
- new_entry = NULL;
+ where = NULL;
+ list_for_each_entry(entry, &memtype_list, nd) {
+ if (end <= entry->start) {
+ where = entry->nd.prev;
break;
- }
-
- if (start <= parse->start && end >= parse->start) {
- if (actual_type != parse->type && ret_type) {
- actual_type = parse->type;
- *ret_type = actual_type;
- new_entry->type = actual_type;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
+ } else if (start <= entry->start) { /* end > entry->start */
+ err = chk_conflict(new, entry, new_type);
+ if (!err) {
+ dprintk("Overlap at 0x%Lx-0x%Lx\n",
+ entry->start, entry->end);
+ where = entry->nd.prev;
}
-
- saved_ptr = parse;
- /*
- * Check to see whether the request overlaps more
- * than one entry in the list
- */
- list_for_each_entry_continue(parse, &memtype_list, nd) {
- if (end <= parse->start) {
- break;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
- }
- }
-
- if (err) {
- break;
- }
-
- pr_debug("Overlap at 0x%Lx-0x%Lx\n",
- saved_ptr->start, saved_ptr->end);
- /* No conflict. Go ahead and add this new entry */
- list_add(&new_entry->nd, saved_ptr->nd.prev);
- new_entry = NULL;
break;
- }
-
- if (start < parse->end) {
- if (actual_type != parse->type && ret_type) {
- actual_type = parse->type;
- *ret_type = actual_type;
- new_entry->type = actual_type;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
- }
-
- saved_ptr = parse;
- /*
- * Check to see whether the request overlaps more
- * than one entry in the list
- */
- list_for_each_entry_continue(parse, &memtype_list, nd) {
- if (end <= parse->start) {
- break;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
- }
- }
-
- if (err) {
- break;
+ } else if (start < entry->end) { /* start > entry->start */
+ err = chk_conflict(new, entry, new_type);
+ if (!err) {
+ dprintk("Overlap at 0x%Lx-0x%Lx\n",
+ entry->start, entry->end);
+ where = &entry->nd;
}
-
- pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
- saved_ptr->start, saved_ptr->end);
- /* No conflict. Go ahead and add this new entry */
- list_add(&new_entry->nd, &saved_ptr->nd);
- new_entry = NULL;
break;
}
}
if (err) {
- printk(KERN_INFO
- "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
- start, end, cattr_name(new_entry->type),
- cattr_name(req_type));
- kfree(new_entry);
+ printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
+ "track %s, req %s\n",
+ start, end, cattr_name(new->type), cattr_name(req_type));
+ kfree(new);
spin_unlock(&memtype_lock);
return err;
}
- if (new_entry) {
- /* No conflict. Not yet added to the list. Add to the tail */
- list_add_tail(&new_entry->nd, &memtype_list);
- pr_debug("New Entry\n");
- }
-
- if (ret_type) {
- pr_debug(
- "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
- start, end, cattr_name(actual_type),
- cattr_name(req_type), cattr_name(*ret_type));
- } else {
- pr_debug(
- "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
- start, end, cattr_name(actual_type),
- cattr_name(req_type));
- }
+ if (where)
+ list_add(&new->nd, where);
+ else
+ list_add_tail(&new->nd, &memtype_list);
spin_unlock(&memtype_lock);
+
+ dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
+ start, end, cattr_name(new->type), cattr_name(req_type),
+ new_type ? cattr_name(*new_type) : "-");
+
return err;
}
int free_memtype(u64 start, u64 end)
{
- struct memtype *ml;
+ struct memtype *entry;
int err = -EINVAL;
- /* Only track when pat_wc_enabled */
- if (!pat_wc_enabled) {
+ if (!pat_enabled)
return 0;
- }
/* Low ISA region is always mapped WB. No need to track */
- if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
+ if (is_ISA_range(start, end - 1))
return 0;
- }
spin_lock(&memtype_lock);
- list_for_each_entry(ml, &memtype_list, nd) {
- if (ml->start == start && ml->end == end) {
- list_del(&ml->nd);
- kfree(ml);
+ list_for_each_entry(entry, &memtype_list, nd) {
+ if (entry->start == start && entry->end == end) {
+ list_del(&entry->nd);
+ kfree(entry);
err = 0;
break;
}
@@ -452,7 +354,7 @@ int free_memtype(u64 start, u64 end)
current->comm, current->pid, start, end);
}
- pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+ dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
return err;
}
@@ -521,12 +423,12 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
* caching for the high addresses through the KEN pin, but
* we maintain the tradition of paranoia in this code.
*/
- if (!pat_wc_enabled &&
- ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
- test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
- test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
- test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
- (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+ if (!pat_enabled &&
+ !(boot_cpu_has(X86_FEATURE_MTRR) ||
+ boot_cpu_has(X86_FEATURE_K6_MTRR) ||
+ boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
+ boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
flags = _PAGE_CACHE_UC;
}
#endif
@@ -547,8 +449,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
if (retval < 0)
return 0;
- if (pfn <= max_pfn_mapped &&
- ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
+ if (((pfn < max_low_pfn_mapped) ||
+ (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
+ ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
free_memtype(offset, offset + size);
printk(KERN_INFO
"%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -586,4 +489,3 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
free_memtype(addr, addr + size);
}
-
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f69..557b2abceef 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
+#include <asm/fixmap.h>
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
@@ -65,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd)
static void pgd_ctor(void *p)
{
pgd_t *pgd = p;
- unsigned long flags;
-
- /* Clear usermode parts of PGD */
- memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-
- spin_lock_irqsave(&pgd_lock, flags);
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -90,8 +85,6 @@ static void pgd_ctor(void *p)
/* list required to sync kernel mapping updates */
if (!SHARED_KERNEL_PMD)
pgd_list_add(pgd);
-
- spin_unlock_irqrestore(&pgd_lock, flags);
}
static void pgd_dtor(void *pgd)
@@ -119,6 +112,72 @@ static void pgd_dtor(void *pgd)
#ifdef CONFIG_X86_PAE
/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+ /* Note: almost everything apart from _PAGE_PRESENT is
+ reserved at the pmd (PDPT) level. */
+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ */
+ if (mm == current->active_mm)
+ write_cr3(read_cr3());
+}
+#else /* !CONFIG_X86_PAE */
+
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+#define PREALLOCATED_PMDS 0
+
+#endif /* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[])
+{
+ int i;
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++)
+ if (pmds[i])
+ free_page((unsigned long)pmds[i]);
+}
+
+static int preallocate_pmds(pmd_t *pmds[])
+{
+ int i;
+ bool failed = false;
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
+ pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+ if (pmd == NULL)
+ failed = true;
+ pmds[i] = pmd;
+ }
+
+ if (failed) {
+ free_pmds(pmds);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
* Mop up any pmd pages which may still be attached to the pgd.
* Normally they will be freed by munmap/exit_mmap, but any pmd we
* preallocate which never got a corresponding vma will need to be
@@ -128,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
int i;
- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
pgd_t pgd = pgdp[i];
if (pgd_val(pgd) != 0) {
@@ -142,32 +201,17 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
}
}
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update. Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
pud_t *pud;
unsigned long addr;
int i;
pud = pud_offset(pgd, 0);
- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
- i++, pud++, addr += PUD_SIZE) {
- pmd_t *pmd = pmd_alloc_one(mm, addr);
- if (!pmd) {
- pgd_mop_up_pmds(mm, pgd);
- return 0;
- }
+ for (addr = i = 0; i < PREALLOCATED_PMDS;
+ i++, pud++, addr += PUD_SIZE) {
+ pmd_t *pmd = pmds[i];
if (i >= KERNEL_PGD_BOUNDARY)
memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +219,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
pud_populate(mm, pud, pmd);
}
-
- return 1;
}
-void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+pgd_t *pgd_alloc(struct mm_struct *mm)
{
- paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+ pgd_t *pgd;
+ pmd_t *pmds[PREALLOCATED_PMDS];
+ unsigned long flags;
- /* Note: almost everything apart from _PAGE_PRESENT is
- reserved at the pmd (PDPT) level. */
- set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+ pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- /*
- * According to Intel App note "TLBs, Paging-Structure Caches,
- * and Their Invalidation", April 2007, document 317080-001,
- * section 8.1: in PAE mode we explicitly have to flush the
- * TLB via cr3 if the top-level pgd is changed...
- */
- if (mm == current->active_mm)
- write_cr3(read_cr3());
-}
-#else /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- return 1;
-}
+ if (pgd == NULL)
+ goto out;
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
-{
-}
-#endif /* CONFIG_X86_PAE */
+ mm->pgd = pgd;
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (preallocate_pmds(pmds) != 0)
+ goto out_free_pgd;
- /* so that alloc_pmd can use it */
- mm->pgd = pgd;
- if (pgd)
- pgd_ctor(pgd);
+ if (paravirt_pgd_alloc(mm) != 0)
+ goto out_free_pmds;
- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
- pgd_dtor(pgd);
- free_page((unsigned long)pgd);
- pgd = NULL;
- }
+ /*
+ * Make sure that pre-populating the pmds is atomic with
+ * respect to anything walking the pgd_list, so that they
+ * never see a partially populated pgd.
+ */
+ spin_lock_irqsave(&pgd_lock, flags);
+
+ pgd_ctor(pgd);
+ pgd_prepopulate_pmd(mm, pgd, pmds);
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
return pgd;
+
+out_free_pmds:
+ free_pmds(pmds);
+out_free_pgd:
+ free_page((unsigned long)pgd);
+out:
+ return NULL;
}
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
pgd_mop_up_pmds(mm, pgd);
pgd_dtor(pgd);
+ paravirt_pgd_free(mm, pgd);
free_page((unsigned long)pgd);
}
@@ -255,7 +292,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
if (pte_young(*ptep))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
- &ptep->pte);
+ (unsigned long *) &ptep->pte);
if (ret)
pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +311,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
return young;
}
+
+int fixmaps_set;
+
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
+{
+ unsigned long address = __fix_to_virt(idx);
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+ set_pte_vaddr(address, pte);
+ fixmaps_set++;
+}
+
+void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+{
+ __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
+}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 369cf065b6a..b4becbf8c57 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -71,7 +71,7 @@ void show_mem(void)
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
*/
-static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
pud_t *pud;
@@ -94,8 +94,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
return;
}
pte = pte_offset_kernel(pmd, vaddr);
- if (pgprot_val(flags))
- set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags));
+ if (pte_val(pteval))
+ set_pte_present(&init_mm, vaddr, pte, pteval);
else
pte_clear(&init_mm, vaddr, pte);
@@ -141,22 +141,9 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
__flush_tlb_one(vaddr);
}
-static int fixmaps;
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);
-void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
-{
- unsigned long address = __fix_to_virt(idx);
-
- if (idx >= __end_of_fixed_addresses) {
- BUG();
- return;
- }
- set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
- fixmaps++;
-}
-
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
@@ -164,11 +151,44 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
* Can be used to relocate the fixmap area and poke a hole in the top
* of kernel address space to make room for a hypervisor.
*/
-void reserve_top_address(unsigned long reserve)
+void __init reserve_top_address(unsigned long reserve)
{
- BUG_ON(fixmaps > 0);
+ BUG_ON(fixmaps_set > 0);
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
(int)-reserve);
__FIXADDR_TOP = -reserve - PAGE_SIZE;
__VMALLOC_RESERVE += reserve;
}
+
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ __VMALLOC_RESERVE = memparse(arg, &arg);
+ return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+ unsigned long address;
+
+ if (!arg)
+ return -EINVAL;
+
+ address = memparse(arg, &arg);
+ reserve_top_address(address);
+ return 0;
+}
+early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
new file mode 100644
index 00000000000..f41d67f8f83
--- /dev/null
+++ b/arch/x86/mm/srat_32.c
@@ -0,0 +1,280 @@
+/*
+ * Some of the code in this file has been gleaned from the 64 bit
+ * discontigmem support code base.
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to Pat Gaughen <gone@us.ibm.com>
+ */
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/acpi.h>
+#include <linux/nodemask.h>
+#include <asm/srat.h>
+#include <asm/topology.h>
+#include <asm/smp.h>
+#include <asm/e820.h>
+
+/*
+ * proximity macros and definitions
+ */
+#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
+#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
+#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
+#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
+/* bitmap length; _PXM is at most 255 */
+#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
+static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
+
+#define MAX_CHUNKS_PER_NODE 3
+#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
+struct node_memory_chunk_s {
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+ u8 pxm; // proximity domain of node
+ u8 nid; // which cnode contains this chunk?
+ u8 bank; // which mem bank on this node
+};
+static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
+
+static int __initdata num_memory_chunks; /* total number of memory chunks */
+static u8 __initdata apicid_to_pxm[MAX_APICID];
+
+int numa_off __initdata;
+int acpi_numa __initdata;
+
+static __init void bad_srat(void)
+{
+ printk(KERN_ERR "SRAT: SRAT not used.\n");
+ acpi_numa = -1;
+ num_memory_chunks = 0;
+}
+
+static __init inline int srat_disabled(void)
+{
+ return numa_off || acpi_numa < 0;
+}
+
+/* Identify CPU proximity domains */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
+{
+ if (srat_disabled())
+ return;
+ if (cpu_affinity->header.length !=
+ sizeof(struct acpi_srat_cpu_affinity)) {
+ bad_srat();
+ return;
+ }
+
+ if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+ return; /* empty entry */
+
+ /* mark this node as "seen" in node bitmap */
+ BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
+
+ apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
+
+ printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
+ cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
+}
+
+/*
+ * Identify memory proximity domains and hot-remove capabilities.
+ * Fill node memory chunk list structure.
+ */
+void __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
+{
+ unsigned long long paddr, size;
+ unsigned long start_pfn, end_pfn;
+ u8 pxm;
+ struct node_memory_chunk_s *p, *q, *pend;
+
+ if (srat_disabled())
+ return;
+ if (memory_affinity->header.length !=
+ sizeof(struct acpi_srat_mem_affinity)) {
+ bad_srat();
+ return;
+ }
+
+ if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
+ return; /* empty entry */
+
+ pxm = memory_affinity->proximity_domain & 0xff;
+
+ /* mark this node as "seen" in node bitmap */
+ BMAP_SET(pxm_bitmap, pxm);
+
+ /* calculate info for memory chunk structure */
+ paddr = memory_affinity->base_address;
+ size = memory_affinity->length;
+
+ start_pfn = paddr >> PAGE_SHIFT;
+ end_pfn = (paddr + size) >> PAGE_SHIFT;
+
+
+ if (num_memory_chunks >= MAXCHUNKS) {
+ printk(KERN_WARNING "Too many mem chunks in SRAT."
+ " Ignoring %lld MBytes at %llx\n",
+ size/(1024*1024), paddr);
+ return;
+ }
+
+ /* Insertion sort based on base address */
+ pend = &node_memory_chunk[num_memory_chunks];
+ for (p = &node_memory_chunk[0]; p < pend; p++) {
+ if (start_pfn < p->start_pfn)
+ break;
+ }
+ if (p < pend) {
+ for (q = pend; q >= p; q--)
+ *(q + 1) = *q;
+ }
+ p->start_pfn = start_pfn;
+ p->end_pfn = end_pfn;
+ p->pxm = pxm;
+
+ num_memory_chunks++;
+
+ printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)"
+ " in proximity domain %02x %s\n",
+ start_pfn, end_pfn,
+ memory_affinity->memory_type,
+ pxm,
+ ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
+ "enabled and removable" : "enabled" ) );
+}
+
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+}
+
+void acpi_numa_arch_fixup(void)
+{
+}
+/*
+ * The SRAT table always lists ascending addresses, so can always
+ * assume that the first "start" address that you see is the real
+ * start of the node, and that the current "end" address is after
+ * the previous one.
+ */
+static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
+{
+ /*
+ * Only add present memory as told by the e820.
+ * There is no guarantee from the SRAT that the memory it
+ * enumerates is present at boot time because it represents
+ * *possible* memory hotplug areas the same as normal RAM.
+ */
+ if (memory_chunk->start_pfn >= max_pfn) {
+ printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
+ memory_chunk->start_pfn, memory_chunk->end_pfn);
+ return;
+ }
+ if (memory_chunk->nid != nid)
+ return;
+
+ if (!node_has_online_mem(nid))
+ node_start_pfn[nid] = memory_chunk->start_pfn;
+
+ if (node_start_pfn[nid] > memory_chunk->start_pfn)
+ node_start_pfn[nid] = memory_chunk->start_pfn;
+
+ if (node_end_pfn[nid] < memory_chunk->end_pfn)
+ node_end_pfn[nid] = memory_chunk->end_pfn;
+}
+
+int __init get_memcfg_from_srat(void)
+{
+ int i, j, nid;
+
+
+ if (srat_disabled())
+ goto out_fail;
+
+ if (num_memory_chunks == 0) {
+ printk(KERN_WARNING
+ "could not finy any ACPI SRAT memory areas.\n");
+ goto out_fail;
+ }
+
+ /* Calculate total number of nodes in system from PXM bitmap and create
+ * a set of sequential node IDs starting at zero. (ACPI doesn't seem
+ * to specify the range of _PXM values.)
+ */
+ /*
+ * MCD - we no longer HAVE to number nodes sequentially. PXM domain
+ * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
+ * 32, so we will continue numbering them in this manner until MAX_NUMNODES
+ * approaches MAX_PXM_DOMAINS for i386.
+ */
+ nodes_clear(node_online_map);
+ for (i = 0; i < MAX_PXM_DOMAINS; i++) {
+ if (BMAP_TEST(pxm_bitmap, i)) {
+ int nid = acpi_map_pxm_to_node(i);
+ node_set_online(nid);
+ }
+ }
+ BUG_ON(num_online_nodes() == 0);
+
+ /* set cnode id in memory chunk structure */
+ for (i = 0; i < num_memory_chunks; i++)
+ node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
+
+ printk(KERN_DEBUG "pxm bitmap: ");
+ for (i = 0; i < sizeof(pxm_bitmap); i++) {
+ printk(KERN_CONT "%02x ", pxm_bitmap[i]);
+ }
+ printk(KERN_CONT "\n");
+ printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
+ num_online_nodes());
+ printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
+ num_memory_chunks);
+
+ for (i = 0; i < MAX_APICID; i++)
+ apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
+
+ for (j = 0; j < num_memory_chunks; j++){
+ struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
+ printk(KERN_DEBUG
+ "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
+ j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
+ node_read_chunk(chunk->nid, chunk);
+ e820_register_active_regions(chunk->nid, chunk->start_pfn,
+ min(chunk->end_pfn, max_pfn));
+ }
+
+ for_each_online_node(nid) {
+ unsigned long start = node_start_pfn[nid];
+ unsigned long end = min(node_end_pfn[nid], max_pfn);
+
+ memory_present(nid, start, end);
+ node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
+ }
+ return 1;
+out_fail:
+ printk(KERN_ERR "failed to get NUMA memory information from SRAT"
+ " table\n");
+ return 0;
+}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 99649dccad2..1b4763e26ea 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -100,7 +100,19 @@ static __init inline int srat_disabled(void)
/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
- acpi_slit = slit;
+ unsigned length;
+ unsigned long phys;
+
+ length = slit->header.length;
+ phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
+ PAGE_SIZE);
+
+ if (phys == -1L)
+ panic(" Can not save slit!\n");
+
+ acpi_slit = __va(phys);
+ memcpy(acpi_slit, slit, length);
+ reserve_early(phys, phys + length, "ACPI SLIT");
}
/* Callback for Proximity Domain -> LAPIC mapping */
@@ -299,7 +311,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
pxmram = 0;
}
- e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
+ e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
if ((long)(e820ram - pxmram) >= 1*1024*1024) {
printk(KERN_ERR
@@ -376,7 +388,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
if (node == NUMA_NO_NODE)
continue;
if (!node_isset(node, node_possible_map))
- numa_set_node(i, NUMA_NO_NODE);
+ numa_clear_node(i);
}
numa_init_array();
return 0;
@@ -495,6 +507,7 @@ int __node_distance(int a, int b)
EXPORT_SYMBOL(__node_distance);
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
int memory_add_physaddr_to_nid(u64 start)
{
int i, ret = 0;
@@ -506,4 +519,4 @@ int memory_add_physaddr_to_nid(u64 start)
return ret;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-
+#endif