From 77b52b4c5c66175553ee59eb43f74366f1e54bde Mon Sep 17 00:00:00 2001
From: Venki Pallipadi
Date: Mon, 5 May 2008 19:09:10 -0700
Subject: x86: add "debugpat" boot option

Enable debug messages with the boot option "debugpat".

Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 arch/x86/mm/pat.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 60adbe22efa..1a82f4db8a4 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -42,6 +42,19 @@ static int nopat(char *str)
 early_param("nopat", nopat);
 #endif
 
+
+static int debug_enable;
+static int __init pat_debug_setup(char *str)
+{
+	debug_enable = 1;
+	return 0;
+}
+__setup("debugpat", pat_debug_setup);
+
+#define dprintk(fmt, arg...) \
+	do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+
+
 static u64 __read_mostly boot_pat_state;
 
 enum {
@@ -279,7 +292,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 		struct memtype *saved_ptr;
 
 		if (parse->start >= end) {
-			pr_debug("New Entry\n");
+			dprintk("New Entry\n");
 			list_add(&new_entry->nd, parse->nd.prev);
 			new_entry = NULL;
 			break;
@@ -329,7 +342,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 				break;
 			}
 
-			pr_debug("Overlap at 0x%Lx-0x%Lx\n",
+			dprintk("Overlap at 0x%Lx-0x%Lx\n",
 				saved_ptr->start, saved_ptr->end);
 			/* No conflict. Go ahead and add this new entry */
 			list_add(&new_entry->nd, saved_ptr->nd.prev);
@@ -381,7 +394,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 				break;
 			}
 
-			pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
+			dprintk("Overlap at 0x%Lx-0x%Lx\n",
 				saved_ptr->start, saved_ptr->end);
 			/* No conflict. Go ahead and add this new entry */
 			list_add(&new_entry->nd, &saved_ptr->nd);
@@ -403,16 +416,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	if (new_entry) {
 		/* No conflict. Not yet added to the list. Add to the tail */
 		list_add_tail(&new_entry->nd, &memtype_list);
-		pr_debug("New Entry\n");
+		dprintk("New Entry\n");
 	}
 
 	if (ret_type) {
-		pr_debug(
+		dprintk(
 	"reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
 			start, end, cattr_name(actual_type),
 			cattr_name(req_type), cattr_name(*ret_type));
 	} else {
-		pr_debug(
+		dprintk(
 	"reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
 			start, end, cattr_name(actual_type),
 			cattr_name(req_type));
@@ -453,7 +466,7 @@ int free_memtype(u64 start, u64 end)
 			current->comm, current->pid, start, end);
 	}
 
-	pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
 	return err;
 }
-- cgit v1.2.3

From 0327318445d55808991a63137cfb698a90ab6adf Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Fri, 18 Apr 2008 17:49:15 -0700
Subject: x86_64: simplify the memtest parameter setting

Use CONFIG_MEMTEST only.  If it is set, memtest defaults to 0
(disabled); pass memtest=4 on the command line to test more patterns.
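For illustration, a minimal sketch of how a memtest-style boot option is
wired up with early_param() (the names mirror parse_memtest and
memtest_pattern from the diff below; the exact body in mainline may
differ):

	/* sketch, not the literal patch */
	static int memtest_pattern __initdata;	/* 0 = disabled by default */

	static int __init parse_memtest(char *arg)
	{
		if (arg)
			/* "memtest=4" => try 4 test patterns */
			memtest_pattern = simple_strtoul(arg, NULL, 0);
		return 0;
	}
	early_param("memtest", parse_memtest);

With that in place, booting with "memtest=4" runs four patterns over
free memory, while omitting the option leaves the test disabled.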
Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 arch/x86/mm/init_64.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 32ba13b0f81..c6d81316db6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -431,7 +431,7 @@ static void __init init_gbpages(void)
 		direct_gbpages = 0;
 }
 
-#ifdef CONFIG_MEMTEST_BOOTPARAM
+#ifdef CONFIG_MEMTEST
 
 static void __init memtest(unsigned long start_phys, unsigned long size,
 				 unsigned pattern)
@@ -493,7 +493,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
 
 }
 
-static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
+/* default is disabled */
+static int memtest_pattern __initdata;
 
 static int __init parse_memtest(char *arg)
 {
-- cgit v1.2.3

From b9ada4281c48076bdb252813be24ddb57e17cade Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Tue, 20 May 2008 11:01:08 +0100
Subject: x86: reinstate numa remap for SPARSEMEM on x86 NUMA systems

Recent kernels have been panicking trying to allocate memory early in
boot, in __alloc_pages:

  BUG: unable to handle kernel paging request at 00001568
  IP: [] __alloc_pages+0x33/0x2cc
  *pdpt = 00000000013a5001 *pde = 0000000000000000
  Oops: 0000 [#1] SMP
  Modules linked in:
  Pid: 1, comm: swapper Not tainted (2.6.25 #78)
  EIP: 0060:[] EFLAGS: 00010246 CPU: 0
  EIP is at __alloc_pages+0x33/0x2cc
  EAX: 00001564 EBX: 000412d0 ECX: 00001564 EDX: 000005c3
  ESI: f78012a0 EDI: 00000001 EBP: 00001564 ESP: f7871e50
   DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
  Process swapper (pid: 1, ti=f7870000 task=f786f670 task.ti=f7870000)
  Stack: 00000000 f786f670 00000010 00000000 0000b700 000412d0 f78012a0 00000001
         00000000 c105b64d 00000000 000412d0 f78012a0 f7803120 00000000 c105c1c5
         00000010 f7803144 000412d0 00000001 f7803130 f7803120 f78012a0 00000001
  Call Trace:
   [] kmem_getpages+0x94/0x129
   [] cache_grow+0x8f/0x123
   [] ____cache_alloc_node+0xb9/0xe4
   [] kmem_cache_alloc_node+0x92/0xd2
   [] build_sched_domains+0x536/0x70d
   [] do_flush_tlb_all+0x0/0x3f
   [] do_flush_tlb_all+0x0/0x3f
   [] interleave_nodes+0x23/0x5a
   [] alternate_node_alloc+0x43/0x5b
   [] arch_init_sched_domains+0x46/0x51
   [] kernel_init+0x0/0x82
   [] sched_init_smp+0x10/0xbb
   [] kernel_init+0x43/0x82
   [] kernel_thread_helper+0x7/0x10

Debugging this showed that the NODE_DATA() pointers for nodes other
than node 0 were all NULL.  Tracing this back showed that they were
being initialised to each node's remap space.  However, under SPARSEMEM
remap is disabled, which leads to the pgdats being placed incorrectly
at kernel virtual address 0 — and to the panic when attempting to
allocate memory from these nodes.

Numa remap was disabled in the commit below.  This occurred while
fixing problems triggered when attempting to boot x86_32 NUMA SPARSEMEM
kernels on non-numa hardware.

    x86: make NUMA work on 32-bit
    commit 1b000a5dbeb2f34bc03d45ebdf3f6d24a60c3aed

The real problem is believed to be related to other alignment issues in
the regions blocked out from the bootmem allocator for small memory
systems, and has been fixed separately.  Therefore re-enable remap for
SPARSEMEM, which fixes the pgdat allocation issues.  Testing confirms
that SPARSEMEM NUMA kernels will boot correctly with this part of the
change reverted.
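As context for the alloc_remap() contract mentioned above, callers are
expected to fall back to bootmem when the remap pool cannot satisfy
them — a sketch (illustrative helper, not the literal mainline code):

	static void * __init node_local_alloc(int nid, unsigned long size)
	{
		void *ptr = alloc_remap(nid, size);	/* node-local KVA remap */

		if (!ptr)	/* remap disabled or exhausted: use bootmem */
			ptr = alloc_bootmem_node(NODE_DATA(nid), size);
		return ptr;
	}

This is why remap can safely be enabled for SPARSEMEM: a failed
alloc_remap() degrades to bootmem instead of corrupting the layout.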
Signed-off-by: Andy Whitcroft
Acked-by: Mel Gorman
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 34 ++++++----------------------------
 1 file changed, 6 insertions(+), 28 deletions(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf98368..026201f143a 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -164,16 +164,13 @@ static void __init allocate_pgdat(int nid)
 	}
 }
 
-#ifdef CONFIG_DISCONTIGMEM
 /*
- * In the discontig memory model, a portion of the kernel virtual area (KVA)
- * is reserved and portions of nodes are mapped using it. This is to allow
- * node-local memory to be allocated for structures that would normally require
- * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
- * should be prepared to allocate from the bootmem allocator instead. This KVA
- * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
- * layout of memory that are broken if alloc_remap() succeeds for some of the
- * map and fails for others
+ * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
+ * virtual address space (KVA) is reserved and portions of nodes are mapped
+ * using it. This is to allow node-local memory to be allocated for
+ * structures that would normally require ZONE_NORMAL. The memory is
+ * allocated with alloc_remap() and callers should be prepared to allocate
+ * from the bootmem allocator instead.
 */
 static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -290,25 +287,6 @@ static void init_remap_allocator(int nid)
 		(ulong) pfn_to_kaddr(highstart_pfn
 			+ node_remap_offset[nid] + node_remap_size[nid]));
 }
-#else
-void *alloc_remap(int nid, unsigned long size)
-{
-	return NULL;
-}
-
-static unsigned long calculate_numa_remap_pages(void)
-{
-	return 0;
-}
-
-static void init_remap_allocator(int nid)
-{
-}
-
-void __init remap_numa_kva(void)
-{
-}
-#endif /* CONFIG_DISCONTIGMEM */
 
 extern void setup_bootmem_allocator(void);
 unsigned long __init setup_memory(void)
-- cgit v1.2.3

From 84d6bd0e272e6eace1e7e4c501f32bddfb665bb2 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Tue, 20 May 2008 11:01:19 +0100
Subject: x86: cope with no remap space being allocated for a numa node

When allocating the pgdats for numa nodes on x86_32, we attempt to
place them in the numa remap space for that node.  However, should the
node not have any remap space allocated (for instance due to non-ram
pages at the remap location in the node), we will incorrectly place the
pgdat at zero.  Check that we have remap available, falling back to
node 0 memory where we do not.
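A sketch of the failure mode being fixed: with no remap space,
node_remap_start_vaddr[nid] is 0, so without the extra check the pgdat
lands at virtual address 0 and the first NODE_DATA(nid) field access
faults at a small offset (such as the 00001568 seen in the oops above).
The guarded placement, in outline (this consolidates the one-line diff
below; comments are editorial):

	if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid])
		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
	else	/* no remap space for this node: fall back to low memory */
		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));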
Signed-off-by: Andy Whitcroft
Acked-by: Mel Gorman
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 026201f143a..435c343c14b 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -156,7 +156,7 @@ static void __init propagate_e820_map_node(int nid)
  */
 static void __init allocate_pgdat(int nid)
 {
-	if (nid && node_has_online_mem(nid))
+	if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid])
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
 		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-- cgit v1.2.3

From 46dd98a5c0b907f94742579ab605be1933a37abd Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 14 May 2008 16:10:39 -0700
Subject: arch/x86/mm/pat.c: use boot_cpu_has()

arch/x86/mm/pat.c: In function 'phys_mem_access_prot_allowed':
arch/x86/mm/pat.c:526: warning: passing argument 2 of 'constant_test_bit' from incompatible pointer type
arch/x86/mm/pat.c:526: warning: passing argument 2 of 'variable_test_bit' from incompatible pointer type
arch/x86/mm/pat.c:527: warning: passing argument 2 of 'constant_test_bit' from incompatible pointer type
arch/x86/mm/pat.c:527: warning: passing argument 2 of 'variable_test_bit' from incompatible pointer type
arch/x86/mm/pat.c:528: warning: passing argument 2 of 'constant_test_bit' from incompatible pointer type
arch/x86/mm/pat.c:528: warning: passing argument 2 of 'variable_test_bit' from incompatible pointer type
arch/x86/mm/pat.c:529: warning: passing argument 2 of 'constant_test_bit' from incompatible pointer type
arch/x86/mm/pat.c:529: warning: passing argument 2 of 'variable_test_bit' from incompatible pointer type

Don't open-code test_bit() on a __u32.

Cc: Andrea Arcangeli
Signed-off-by: Andrew Morton
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/pat.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index a1cbea0b79b..e83b770676d 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -536,10 +536,10 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	 * we maintain the tradition of paranoia in this code.
 	 */
 	if (!pat_wc_enabled &&
-	    ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
-		test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
-		test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
-		test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
+	    ! ( boot_cpu_has(X86_FEATURE_MTRR) ||
+		boot_cpu_has(X86_FEATURE_K6_MTRR) ||
+		boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
+		boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
 	   (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
 		flags = _PAGE_CACHE_UC;
 	}
-- cgit v1.2.3

From 0eafe234a2571f51ef9f7e8baddb58c828750771 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 12 May 2008 15:43:35 +0200
Subject: x86: k8topology add missing header

k8_scan_nodes() is global and needs a prototype.  Add the header file
which contains it.
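For reference, the kind of declaration the added header is expected to
carry (the header path below is an assumption based on the commit text;
only the prototype itself matters here):

	/* include/asm-x86/k8.h (assumed location) */
	extern int k8_scan_nodes(unsigned long start, unsigned long end);

Including it in k8topology_64.c lets the compiler check the definition
against the prototype instead of warning about a missing declaration.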
Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/k8topology_64.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 1f476e47784..48471199887 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <asm/k8.h>
 
 static __init int find_northbridge(void)
 {
-- cgit v1.2.3

From d34c08958fa36c7a8c3f8d9c0ebe6ec1ab744a68 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 12 May 2008 15:43:35 +0200
Subject: x86: k8topology fix shadow variable

sparse mutters:
arch/x86/mm/k8topology_64.c:108:7: warning: symbol 'nodeid' shadows an earlier one

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/k8topology_64.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 48471199887..36c6c4a9135 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -76,10 +76,10 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 {
 	unsigned long prevbase;
 	struct bootnode nodes[8];
-	int nodeid, i, nb;
+	int i, nb;
 	unsigned char nodeids[8];
 	int found = 0;
-	u32 reg;
+	u32 nodeid, reg;
 	unsigned numnodes;
 	unsigned cores;
 	unsigned bits;
@@ -106,7 +106,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	prevbase = 0;
 	for (i = 0; i < 8; i++) {
 		unsigned long base, limit;
-		u32 nodeid;
 
 		base = read_pci_config(0, nb, 1, 0x40 + i*8);
 		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
-- cgit v1.2.3

From 55d4f22abc9f011c5db982f83377cdd38bb9a2a3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 12 May 2008 15:43:35 +0200
Subject: x86: k8topology cleanup variable declarations

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/k8topology_64.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 36c6c4a9135..0ea66b532c3 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -74,17 +74,12 @@ static __init void early_get_boot_cpu_id(void)
 
 int __init k8_scan_nodes(unsigned long start, unsigned long end)
 {
+	unsigned numnodes, cores, bits, apicid_base;
 	unsigned long prevbase;
 	struct bootnode nodes[8];
-	int i, nb;
 	unsigned char nodeids[8];
-	int found = 0;
+	int i, j, nb, found = 0;
 	u32 nodeid, reg;
-	unsigned numnodes;
-	unsigned cores;
-	unsigned bits;
-	int j;
-	unsigned apicid_base;
 
 	if (!early_pci_allowed())
 		return -1;
-- cgit v1.2.3

From 11034d55975be6d8b955a6eb11e87fc67ec086c2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 12 May 2008 15:43:36 +0200
Subject: x86: init64.c include initrd.h

free_initrd_mem() needs a prototype, which is in linux/initrd.h.

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/init_64.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 32ba13b0f81..18123096cdb 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <linux/initrd.h>
 #include
 #include
 #include
-- cgit v1.2.3

From 6a1673ae2201931ce34bf7ba9f90928c864d8334 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 12 May 2008 15:43:38 +0200
Subject: x86: make memory_add_physaddr_to_nid depend on MEMORY_HOTPLUG

memory_add_physaddr_to_nid() is only used in the
CONFIG_MEMORY_HOTPLUG_SPARSE || CONFIG_ACPI_HOTPLUG_MEMORY case.

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/srat_64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 3890234e5b2..f6d0584cbf3 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -522,6 +522,7 @@ int __node_distance(int a, int b)
 
 EXPORT_SYMBOL(__node_distance);
 
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
 int memory_add_physaddr_to_nid(u64 start)
 {
 	int i, ret = 0;
@@ -533,4 +534,4 @@ int memory_add_physaddr_to_nid(u64 start)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-
+#endif
-- cgit v1.2.3

From 4e50e62ce52f39b5f45e82f68b18254458b429bb Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Mon, 12 May 2008 15:43:38 +0200
Subject: x86: eliminate duplicate consistency checks in init_32.c

Signed-off-by: Jan Beulich
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 arch/x86/mm/init_32.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b..4834c0fcb6b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -571,17 +571,6 @@ void __init mem_init(void)
 #endif
 	bad_ppro = ppro_with_ram_bug();
 
-#ifdef CONFIG_HIGHMEM
-	/* check that fixmap and pkmap do not overlap */
-	if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
-		printk(KERN_ERR
-			"fixmap and kmap areas overlap - this will crash\n");
-		printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
-			PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
-			FIXADDR_START);
-		BUG();
-	}
-#endif
 	/* this will put all low memory onto the freelists */
 	totalram_pages += free_all_bootmem();
 
@@ -614,7 +603,6 @@ void __init mem_init(void)
 		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
 	       );
 
-#if 1 /* double-sanity-check paranoia */
 	printk(KERN_INFO "virtual kernel memory layout:\n"
 		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #ifdef CONFIG_HIGHMEM
@@ -655,7 +643,6 @@ void __init mem_init(void)
 #endif
 	BUG_ON(VMALLOC_START  > VMALLOC_END);
 	BUG_ON((unsigned long)high_memory > VMALLOC_START);
-#endif /* double-sanity-check paranoia */
 
 	if (boot_cpu_data.wp_works_ok < 0)
 		test_wp_bit();
-- cgit v1.2.3

From 48e239572271cf79412d90753a721242a765f7d9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sat, 24 May 2008 17:24:34 +0200
Subject: x86: fixup the fallout of the bitops changes

Signed-off-by: Thomas Gleixner
---
 arch/x86/mm/pgtable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f69..ee1d6d39edd 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -255,7 +255,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
 
 	if (pte_young(*ptep))
 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
-					 &ptep->pte);
+					 (unsigned long *) &ptep->pte);
 
 	if (ret)
 		pte_update(vma->vm_mm, addr, ptep);
-- cgit v1.2.3

From e9197bf0114661195bee35e7795cfc42164d9b2c Mon Sep 17 00:00:00 2001
From: Paul Jackson
Date: Wed, 14 May 2008 08:15:10 -0700
Subject: x86 boot: remove some unused extern function declarations

Remove three extern declarations for routines that don't exist.
Fix a typo in a comment.
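Returning to the memory_add_physaddr_to_nid() hunk above: the lines
elided by the diff context amount to a scan of the hot-add ranges — a
sketch based on the visible variables (the loop internals are an
assumption, not quoted from the file):

	int memory_add_physaddr_to_nid(u64 start)
	{
		int i, ret = 0;

		/* pick the node whose hot-add area contains this address */
		for_each_node(i)
			if (nodes_add[i].start <= start &&
			    nodes_add[i].end > start)
				ret = i;

		return ret;
	}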
Signed-off-by: Paul Jackson
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/numa_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5..afb07ffb931 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -233,7 +233,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 	else
 		bootmap_start = round_up(start, PAGE_SIZE);
 	/*
-	 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like
+	 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
 	 * to use that to align to PAGE_SIZE
 	 */
 	bootmap = early_node_mem(nodeid, bootmap_start, end,
-- cgit v1.2.3

From 163872950dc856fd23849c27f60049feaac49ae6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 29 May 2008 12:57:22 -0700
Subject: x86: extend e820 early_res support 32bit -fix #4

reserve_early pgdata for 32bit numa

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf98368..47749727907 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -159,8 +159,13 @@ static void __init allocate_pgdat(int nid)
 	if (nid && node_has_online_mem(nid))
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
-		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-		min_low_pfn += PFN_UP(sizeof(pg_data_t));
+		unsigned long pgdat_phys;
+		pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 max_low_pfn<<PAGE_SHIFT, sizeof(pg_data_t),
+				 PAGE_SIZE);
+		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
+		reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
+			      "NODE_DATA");
 	}
 }
-- cgit v1.2.3

From a5481280b29b6a3db912ec100498bd31eaa6d2db Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 29 May 2008 12:58:37 -0700
Subject: x86: extend e820 early_res support 32bit -fix #5

reserve early numa kva, so it will not clash with new RAMDISK

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 47749727907..55fdbab6b01 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -357,6 +357,11 @@ unsigned long __init setup_memory(void)
 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
 		kva_start_pfn, max_low_pfn);
 	printk("max_pfn = %ld\n", max_pfn);
+
+	/* avoid clash with initrd */
+	reserve_early(kva_start_pfn<<PAGE_SHIFT,
+		      (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+		      "KVA PG");
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
 	if (max_pfn > system_max_low_pfn)
@@ -392,13 +397,6 @@ unsigned long __init setup_memory(void)
 	return max_low_pfn;
 }
 
-void __init numa_kva_reserve(void)
-{
-	if (kva_pages)
-		reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
-				BOOTMEM_DEFAULT);
-}
-
 void __init zone_sizes_init(void)
 {
 	int nid;
-- cgit v1.2.3

From 2772f54bf37b033263abe4a2314c40a308a1a5cd Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 3 Jun 2008 10:09:45 +0200
Subject: x86: add acpi_numa_slit_init() dummy implementation on 32-bit

Allow CONFIG_ACPI_NUMA builds to succeed on 32-bit.
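Why an empty stub is enough: with CONFIG_ACPI_NUMA set, the generic
ACPI code calls the arch hook unconditionally, roughly like this
(an illustrative sketch of the drivers/acpi/numa.c side, not quoted
from it):

	static int __init acpi_parse_slit(struct acpi_table_header *table)
	{
		struct acpi_table_slit *slit =
			(struct acpi_table_slit *)table;

		acpi_numa_slit_init(slit);	/* arch hook: must exist to link */
		return 0;
	}

On 64-bit the hook fills the node-distance table; on 32-bit there is no
consumer yet, so an empty definition simply satisfies the linker.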
---
 arch/x86/mm/discontig_32.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 435c343c14b..d28987a2f2b 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -454,3 +454,12 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
 #endif
+
+#ifdef CONFIG_ACPI_NUMA
+/*
+ * Dummy on 32-bit, for now:
+ */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+}
+#endif
-- cgit v1.2.3

From b28852d6703e4b72ce363c5168ea8d3fb28b9c57 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 3 Jun 2008 10:11:07 +0200
Subject: x86: add dummy acpi_numa_processor_affinity_init() implementation on 32-bit

Allow CONFIG_ACPI_NUMA builds to succeed.

---
 arch/x86/mm/discontig_32.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index d28987a2f2b..bb8544a7cb9 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -462,4 +462,9 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
 }
+
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
+{
+}
 #endif
-- cgit v1.2.3

From cf3d0cb8a10a4fd19439bb34995d9ad28854739a Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 3 Jun 2008 11:36:56 +0200
Subject: x86: 32-bit numa, build fix on Summit

It's possible to have:

 CONFIG_ACPI_SRAT=y
 CONFIG_HAVE_ARCH_PARSE_SRAT=y

in which case acpi.h defines the acpi_numa_slit_init() and
acpi_numa_processor_affinity_init() methods as a macro.

---
 arch/x86/mm/discontig_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index bb8544a7cb9..0d28b4bff81 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -455,7 +455,7 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
-#ifdef CONFIG_ACPI_NUMA
+#if defined(CONFIG_ACPI_NUMA) && !defined(CONFIG_HAVE_ARCH_PARSE_SRAT)
 /*
  * Dummy on 32-bit, for now:
  */
-- cgit v1.2.3

From 471b3c1b011f807d16f1e19d1d4ecf703f1e7d1a Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 3 Jun 2008 12:39:43 +0200
Subject: x86, numaq 32-bit: build fix

Fix:

 drivers/built-in.o: In function `acpi_numa_init':
 : undefined reference to `acpi_numa_arch_fixup'

which can happen with ACPI && NUMAQ.

---
 arch/x86/mm/discontig_32.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 0d28b4bff81..8b4eac0ca07 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -467,4 +467,8 @@ void __init
 acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 {
 }
+
+void __init acpi_numa_arch_fixup(void)
+{
+}
 #endif
-- cgit v1.2.3

From ba924c81dd5a7a7fb5ded025af7fdd3b61f8ca67 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sat, 31 May 2008 22:51:51 -0700
Subject: x86, numa, 32-bit: increase max_elements to 1024

So every element will represent 64M instead of 256M.

AMD Opteron can have HW memory-hole remapping, and so can have
[0, 8g + 64M) on node 0.  Reduce the element size to 64M to keep that
range on node 0.

Later we need to use find_e820_area() to allocate memory_node_map like
on 64-bit.  But we need to move memory_present out of
populate_mem_map...
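For scale: with 64M per element, 1024 elements cover 64G of physical
address space, and the pfn-to-node lookup served by physnode_map (see
the diff below) stays a single array index — a sketch of the helper as
used on 32-bit (constants shown for 4K pages; treat the exact macro
spellings as assumptions):

	#define PAGES_PER_ELEMENT	16384	/* 64MB / 4KB per page */

	static inline int pfn_to_nid(unsigned long pfn)
	{
		return physnode_map[pfn / PAGES_PER_ELEMENT];
	}

So node 0's [0, 8g + 64M) range simply marks elements 0..128 with
node id 0.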
Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 55fdbab6b01..922af20d1d2 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -59,14 +59,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
 /*
  * 4) physnode_map     - the mapping between a pfn and owning node
  * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 256Mb break (each element of the array will
- * represent 256Mb of memory and will be marked by the node id.  so,
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id.  so,
  * if the first gig is on node 0, and the second gig is on node 1
  * physnode_map will contain:
  *
- *     physnode_map[0-3] = 0;
- *     physnode_map[4-7] = 1;
- *     physnode_map[8- ] = -1;
+ *     physnode_map[0-15] = 0;
+ *     physnode_map[16-31] = 1;
+ *     physnode_map[32- ] = -1;
 */
 s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
@@ -81,9 +81,9 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 	printk(KERN_DEBUG "  ");
 	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
 		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-		printk("%ld ", pfn);
+		printk(KERN_CONT "%ld ", pfn);
 	}
-	printk("\n");
+	printk(KERN_CONT "\n");
 }
 
 unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
-- cgit v1.2.3

From b66cd7207387b9b428aaf1988e21dd263c6a4928 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sat, 31 May 2008 22:53:47 -0700
Subject: x86: set node_remap_size[0] in fallback path

... otherwise alloc_remap() will not get node_mem_map from the kva
area, and alloc_node_mem_map() has to alloc_bootmem_node() to get
mem_map — which would use two low-address copies.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 922af20d1d2..98b099eeab3 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -124,6 +124,7 @@ int __init get_memcfg_numa_flat(void)
 	node_start_pfn[0] = 0;
 	node_end_pfn[0] = max_pfn;
 	memory_present(0, 0, max_pfn);
+	node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
 	/* Indicate there is one node available. */
 	nodes_clear(node_online_map);
-- cgit v1.2.3

From 0596152388e234efebce464355186ad9e16c8cb6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sat, 31 May 2008 22:52:47 -0700
Subject: x86, 32-bit: change propagate_e820_map() back to find_max_pfn()

We don't need to call memory_present() that early.  NUMA and sparse
will call memory_present() later, and it might even fail: it would be
called for the full range, and for sparse it would call
alloc_bootmem... before we set up bootmem.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 98b099eeab3..ebbbba33815 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -120,7 +120,7 @@ int __init get_memcfg_numa_flat(void)
 {
 	printk("NUMA - single node, flat memory mode\n");
 
 	/* Run the memory configuration and find the top of memory.
 	 */
-	propagate_e820_map();
+	find_max_pfn();
 	node_start_pfn[0] = 0;
 	node_end_pfn[0] = max_pfn;
 	memory_present(0, 0, max_pfn);
-- cgit v1.2.3

From e8c27ac9191ab9e6506ae5cbe70d87ac50f8e960 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sun, 1 Jun 2008 13:15:22 -0700
Subject: x86, numa, 32-bit: print out debug info on all kvas

also fix the print out of node_remap_end_vaddr

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index ebbbba33815..3150ad38567 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -168,6 +168,8 @@ static void __init allocate_pgdat(int nid)
 		reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
 			      "NODE_DATA");
 	}
+	printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
+		nid, (unsigned long)NODE_DATA(nid));
 }
 
 #ifdef CONFIG_DISCONTIGMEM
@@ -208,8 +210,12 @@ void __init remap_numa_kva(void)
 	int node;
 
 	for_each_online_node(node) {
+		printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
 		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
 			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
-- cgit v1.2.3

From: Yinghai Lu
Date: Sun, 1 Jun 2008 21:06:31 -0700
Subject: x86, numa, 32-bit: avoid clash between ramdisk and kva

use find_e820_area to get address space...

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 3150ad38567..73a983489c6 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,6 +38,7 @@
 #include
 #include
 #include
+#include
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -326,7 +327,6 @@ unsigned long __init setup_memory(void)
 {
 	int nid;
 	unsigned long system_start_pfn, system_max_low_pfn;
-	unsigned long wasted_pages;
 
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -337,29 +337,18 @@ unsigned long __init setup_memory(void)
 	 */
 	get_memcfg_numa();
 
-	kva_pages = calculate_numa_remap_pages();
+	kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	kva_start_pfn = find_max_low_pfn() - kva_pages;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Numa kva area is below the initrd */
-	if (initrd_start)
-		kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
-			- kva_pages;
-#endif
-
-	/*
-	 * We waste pages past at the end of the KVA for no good reason other
-	 * than how it is located. This is bad.
-	 */
-	wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
-	kva_start_pfn -= wasted_pages;
-	kva_pages += wasted_pages;
-	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
+	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
+	kva_start_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+	kva_start_pfn = find_e820_area(kva_start_pfn<<PAGE_SHIFT,
+				max_low_pfn<<PAGE_SHIFT,
+				kva_pages<<PAGE_SHIFT,
+				PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
 
 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
 		kva_start_pfn, max_low_pfn);
 	printk("max_pfn = %ld\n", max_pfn);
-- cgit v1.2.3

From 6af61a7614a306fe882a0c2b4ddc63b65aa66efc Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sun, 1 Jun 2008 23:53:50 -0700
Subject: x86: clean up max_pfn_mapped usage - 32-bit

On 32-bit, after the initial page table is done in head_32.S, we get an
initial max_pfn_mapped, and then kernel_physical_mapping_init will give
us the final one.
We need to use that to make sure find_e820_area() will get valid
addresses for the boot_map and for NODE_DATA(0) on numa32.

XEN PV and lguest may need to assign max_pfn_mapped too.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 73a983489c6..914a81ee785 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -163,7 +163,8 @@ static void __init allocate_pgdat(int nid)
 	else {
 		unsigned long pgdat_phys;
 		pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
-				 max_low_pfn<<PAGE_SHIFT, sizeof(pg_data_t),
+				 max_pfn_mapped<<PAGE_SHIFT,
+				 sizeof(pg_data_t),
 				 PAGE_SIZE);
 		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
 		reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
-- cgit v1.2.3

From 84b56fa46b36c2df508e7d421feab514fad30f81 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Tue, 3 Jun 2008 19:32:30 -0700
Subject: x86, numa, 32-bit: make sure we get kva space

When a 1/3 user/kernel split is used and less memory is installed, or
if we have a big hole below 4g, max_low_pfn is still using 3g-128m.
Try to go down from max_low_pfn until we get it; otherwise we will
panic.

We need to make the 32-bit code use register_e820_active_regions ...
later.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914a81ee785..7ced26ab9ae 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -328,6 +328,7 @@ unsigned long __init setup_memory(void)
 {
 	int nid;
 	unsigned long system_start_pfn, system_max_low_pfn;
+	long kva_target_pfn;
 
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -344,11 +345,17 @@ unsigned long __init setup_memory(void)
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
 	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
-	kva_start_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
-	kva_start_pfn = find_e820_area(kva_start_pfn<<PAGE_SHIFT,
-				max_low_pfn<<PAGE_SHIFT,
-				kva_pages<<PAGE_SHIFT,
-				PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+
+	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+	do {
+		kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+					max_low_pfn<<PAGE_SHIFT,
+					kva_pages<<PAGE_SHIFT,
+					PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+		kva_target_pfn -= PTRS_PER_PTE;
+	} while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
+	if (kva_start_pfn == -1UL)
+		panic("Can not get kva space\n");
 
 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
 		kva_start_pfn, max_low_pfn);
-- cgit v1.2.3

From 7b2a0a6c4866cac146dcb0433e6984eb19a81335 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Tue, 3 Jun 2008 19:35:04 -0700
Subject: x86: make 32-bit use e820_register_active_regions()

This way 32-bit is more similar to 64-bit, and it makes for smarter
e820 and numa handling.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 7ced26ab9ae..a89ccf3d4c1 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -120,10 +120,9 @@ int __init get_memcfg_numa_flat(void)
 {
 	printk("NUMA - single node, flat memory mode\n");
 
-	/* Run the memory configuration and find the top of memory.
-	 */
-	find_max_pfn();
 	node_start_pfn[0] = 0;
 	node_end_pfn[0] = max_pfn;
+	e820_register_active_regions(0, 0, max_pfn);
 	memory_present(0, 0, max_pfn);
 	node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
@@ -337,6 +336,11 @@ unsigned long __init setup_memory(void)
 	 * this space and use it to adjust the boundary between ZONE_NORMAL
 	 * and ZONE_HIGHMEM.
 	 */
+
+	/* call find_max_low_pfn at first, it could update max_pfn */
+	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
+
+	remove_all_active_ranges();
 	get_memcfg_numa();
 
 	kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
@@ -344,7 +348,6 @@ unsigned long __init setup_memory(void)
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
 	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
 	do {
 		kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
-- cgit v1.2.3

From: Huang Ying
Date: Mon, 2 Jun 2008 14:26:18 +0800
Subject: x86: reserve highmem pages via reserve_early

This patch makes early reserved highmem pages become reserved pages.
This can be used for highmem pages allocated by the bootloader, such as
the EFI memory map, the linked list of setup_data, etc.

Signed-off-by: Huang Ying
Cc: andi@firstfloor.org
Cc: mingo@redhat.com
Signed-off-by: Thomas Gleixner
---
 arch/x86/mm/init_32.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b..0e7bb5e8167 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -289,7 +289,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 
 void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
 {
-	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
+	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) &&
+	    !page_is_reserved_early(pfn)) {
 		ClearPageReserved(page);
 		init_page_count(page);
 		__free_page(page);
-- cgit v1.2.3

From cc1a9d86ce989083703c4bdc11b75a87e1cc404a Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sun, 8 Jun 2008 19:39:16 -0700
Subject: mm, x86: shrink_active_range() should check all

Now we are using register_e820_active_regions() instead of
add_active_range() directly, so end_pfn could differ between the value
in early_node_map and node_end_pfn.  We therefore need to make
shrink_active_range() smarter.

shrink_active_range() is a generic MM function in mm/page_alloc.c, but
it is only used on 32-bit x86.  Should we move it back to some file in
arch/x86?

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index a89ccf3d4c1..489605bab85 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -282,7 +282,7 @@ static unsigned long calculate_numa_remap_pages(void)
 
 		node_end_pfn[nid] -= size;
 		node_remap_start_pfn[nid] = node_end_pfn[nid];
-		shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+		shrink_active_range(nid, node_end_pfn[nid]);
 	}
 	printk("Reserving total of %ld pages for numa KVA remap\n",
 			reserve_pages);
-- cgit v1.2.3

From 9043f007963f4039befa3c31f47173f74a0b1c70 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Fri, 6 Jun 2008 18:53:33 -0700
Subject: x86, numa, 32-bit: use find_e820_area() to find KVA RAM on node

Don't assume we can use RAM near the end of every node.
Especially on systems that have little memory, the kva address and the
kva RAM could both be below max_low_pfn.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 59 ++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 26 deletions(-)

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 489605bab85..accc7c6c57f 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -228,17 +228,21 @@ static unsigned long calculate_numa_remap_pages(void)
 {
 	int nid;
 	unsigned long size, reserve_pages = 0;
-	unsigned long pfn;
 
 	for_each_online_node(nid) {
-		unsigned old_end_pfn = node_end_pfn[nid];
+		u64 node_end_target;
+		u64 node_end_final;
 
 		/*
 		 * The acpi/srat node info can show hot-add memroy zones
 		 * where memory could be added but not currently present.
 		 */
+		printk("node %d pfn: [%lx - %lx]\n",
+			nid, node_start_pfn[nid], node_end_pfn[nid]);
 		if (node_start_pfn[nid] > max_pfn)
 			continue;
+		if (!node_end_pfn[nid])
+			continue;
 		if (node_end_pfn[nid] > max_pfn)
 			node_end_pfn[nid] = max_pfn;
@@ -250,37 +254,40 @@ static unsigned long calculate_numa_remap_pages(void)
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
 		size = size * PTRS_PER_PTE;
 
-		/*
-		 * Validate the region we are allocating only contains valid
-		 * pages.
-		 */
-		for (pfn = node_end_pfn[nid] - size;
-		     pfn < node_end_pfn[nid]; pfn++)
-			if (!page_is_ram(pfn))
-				break;
-
-		if (pfn != node_end_pfn[nid])
-			size = 0;
+		node_end_target = round_down(node_end_pfn[nid] - size,
+						 PTRS_PER_PTE);
+		node_end_target <<= PAGE_SHIFT;
+		do {
+			node_end_final = find_e820_area(node_end_target,
+					((u64)node_end_pfn[nid])<<PAGE_SHIFT,
+						((u64)size)<<PAGE_SHIFT,
+						LARGE_PAGE_BYTES);
+			node_end_target -= LARGE_PAGE_BYTES;
+		} while (node_end_final == -1ULL &&
+			 (node_end_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+
+		if (node_end_final == -1ULL)
+			panic("Can not get kva ram\n");
 
 		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
 				size, nid);
 		node_remap_size[nid] = size;
 		node_remap_offset[nid] = reserve_pages;
 		reserve_pages += size;
-		printk("Shrinking node %d from %ld pages to %ld pages\n",
-			nid, node_end_pfn[nid], node_end_pfn[nid] - size);
-
-		if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
-			/*
-			 * Align node_end_pfn[] and node_remap_start_pfn[] to
-			 * pmd boundary. remap_numa_kva will barf otherwise.
-			 */
-			printk("Shrinking node %d further by %ld pages for proper alignment\n",
-				nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
-			size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
-		}
+		printk("Shrinking node %d from %ld pages to %lld pages\n",
+			nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT);
+
+		/*
+		 * prevent kva address below max_low_pfn want it on system
+		 * with less memory later.
+		 * layout will be: KVA address , KVA RAM
+		 */
+		if ((node_end_final>>PAGE_SHIFT) < max_low_pfn)
+			reserve_early(node_end_final,
+				node_end_final+(((u64)size)<<PAGE_SHIFT),
+				"KVA RAM");
+
+		node_end_pfn[nid] = node_end_final>>PAGE_SHIFT;
 		node_remap_start_pfn[nid] = node_end_pfn[nid];
 		shrink_active_range(nid, node_end_pfn[nid]);
 	}
-- cgit v1.2.3

From c26421d01986e1521043c8feb47256833df3bf31 Mon Sep 17 00:00:00 2001
From: Venki Pallipadi
Date: Thu, 29 May 2008 12:01:44 -0700
Subject: x86: fix Xorg crash with xf86MapVidMem error

Clarify the usage of mtrr_lookup() in PAT code, and make PAT code
resilient to mtrr lookup problems.

Specifically, pat_x_mtrr_type() is restructured to highlight under what
conditions we look for an mtrr hint.  pat_x_mtrr_type() uses a default
type when there are any errors in mtrr lookup (still maintaining the
pat consistency).
And reserve_memtype() highlights its usage of mtrr_lookup for a request
type of '-1', and also defaults in a sane way on any mtrr lookup
failure.

pat.c looks at the mtrr type of a range to get a hint on what mapping
type to request when the user/API:

(1) hasn't specified any type (/dev/mem mapping) and we do not want to
take a performance hit by always mapping UC_MINUS.  This will be the
case for /dev/mem mappings used to map the BIOS area or an ACPI region,
which are WB'able.  In this case, as long as the MTRR is not WB, PAT
will request UC_MINUS for such mappings.

(2) requests a WB mapping while in reality the MTRR may have UC or WC.
In this case, PAT can map as WB (without checking MTRR) and the
effective type will still be UC or WC.  But a subsequent request to map
the same region as UC or WC may fail, as the region will get tracked as
WB in the PAT list.  Looking at the MTRR hint helps us to track based
on the effective type rather than what the user requested.  Again, here
mtrr_lookup is only used as a hint and we fall back to a WB mapping (as
requested by the user) by default.

In both cases, after using the mtrr hint, we still go through the
memtype list to make sure there are no inconsistencies among multiple
users.

Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Suresh Siddha
Tested-by: Rufus & Azrael
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/pat.c | 49 ++++++++++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e83b770676d..320d644a810 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -164,32 +164,33 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
 	unsigned long pat_type;
 	u8 mtrr_type;
 
-	mtrr_type = mtrr_type_lookup(start, end);
-	if (mtrr_type == 0xFF) {		/* MTRR not enabled */
-		*ret_prot = prot;
-		return 0;
-	}
-	if (mtrr_type == 0xFE) {		/* MTRR match error */
-		*ret_prot = _PAGE_CACHE_UC;
-		return -1;
-	}
-	if (mtrr_type != MTRR_TYPE_UNCACHABLE &&
-	    mtrr_type != MTRR_TYPE_WRBACK &&
-	    mtrr_type != MTRR_TYPE_WRCOMB) {	/* MTRR type unhandled */
-		*ret_prot = _PAGE_CACHE_UC;
-		return -1;
-	}
-
 	pat_type = prot & _PAGE_CACHE_MASK;
 	prot &= (~_PAGE_CACHE_MASK);
 
-	/* Currently doing intersection by hand. Optimize it later. */
+	/*
+	 * We return the PAT request directly for types where PAT takes
+	 * precedence with respect to MTRR and for UC_MINUS.
+	 * Consistency checks with other PAT requests is done later
+	 * while going through memtype list.
+	 */
 	if (pat_type == _PAGE_CACHE_WC) {
 		*ret_prot = prot | _PAGE_CACHE_WC;
+		return 0;
 	} else if (pat_type == _PAGE_CACHE_UC_MINUS) {
 		*ret_prot = prot | _PAGE_CACHE_UC_MINUS;
-	} else if (pat_type == _PAGE_CACHE_UC ||
-		   mtrr_type == MTRR_TYPE_UNCACHABLE) {
+		return 0;
+	} else if (pat_type == _PAGE_CACHE_UC) {
+		*ret_prot = prot | _PAGE_CACHE_UC;
+		return 0;
+	}
+
+	/*
+	 * Look for MTRR hint to get the effective type in case where PAT
+	 * request is for WB.
+	 */
+	mtrr_type = mtrr_type_lookup(start, end);
+
+	if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
 		*ret_prot = prot | _PAGE_CACHE_UC;
 	} else if (mtrr_type == MTRR_TYPE_WRCOMB) {
 		*ret_prot = prot | _PAGE_CACHE_WC;
@@ -246,14 +247,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	if (req_type == -1) {
 		/*
-		 * Special case where caller wants to inherit from mtrr or
-		 * existing pat mapping, defaulting to UC_MINUS in case of
-		 * no match.
+		 * Call mtrr_lookup to get the type hint. This is an
+		 * optimization for /dev/mem mmap'ers into WB memory (BIOS
Use WB request for WB memory and use + * UC_MINUS otherwise. */ u8 mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == 0xFE) { /* MTRR match error */ - err = -1; - } if (mtrr_type == MTRR_TYPE_WRBACK) { req_type = _PAGE_CACHE_WB; -- cgit v1.2.3 From 97cfab6ac4ddfda0d722393bbf46cc40bc332107 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 10 Jun 2008 16:05:18 +0200 Subject: x86: PAT: fix ambiguous paranoia check in pat_init() Starting with commit 8d4a4300854f3971502e81dacd930704cb88f606 (x86: cleanup PAT cpu validation) the PAT CPU feature flag is not cleared anymore. Now the error message "PAT enabled, but CPU feature cleared" in pat_init() is misleading. Furthermore the current code does not check for existence of the PAT CPU feature flag if a CPU is whitelisted in validate_pat_support. This patch clears pat_wc_enabled if boot CPU has no PAT feature flag and adapts the paranoia check. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 320d644a810..65105b1195a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -76,14 +76,15 @@ void pat_init(void) return; /* Paranoia check. */ - if (!cpu_has_pat) { - printk(KERN_ERR "PAT enabled, but CPU feature cleared\n"); + if (!cpu_has_pat && boot_pat_state) { /* - * Panic if this happens on the secondary CPU, and we + * If this happens we are on a secondary CPU, but * switched to PAT on the boot CPU. We have no way to * undo PAT. - */ - BUG_ON(boot_pat_state); + */ + printk(KERN_ERR "PAT enabled, " + "but not supported by secondary CPU\n"); + BUG(); } /* Set PWT to Write-Combining. All other bits stay the same */ -- cgit v1.2.3 From cd7a4e936d345ab4cb49d68192d90bd4e4c58458 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 10 Jun 2008 16:05:39 +0200 Subject: x86: PAT: fixed checkpatch errors (and whitespaces) x86: PAT: fixed checkpatch errors (and whitespaces) Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 65105b1195a..a8b69bb2697 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -66,7 +66,7 @@ enum { PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ }; -#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) void pat_init(void) { @@ -100,8 +100,8 @@ void pat_init(void) * 011 UC _PAGE_CACHE_UC * PAT bit unused */ - pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | - PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); /* Boot CPU check */ if (!boot_pat_state) @@ -117,11 +117,11 @@ void pat_init(void) static char *cattr_name(unsigned long flags) { switch (flags & _PAGE_CACHE_MASK) { - case _PAGE_CACHE_UC: return "uncached"; - case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; - case _PAGE_CACHE_WB: return "write-back"; - case _PAGE_CACHE_WC: return "write-combining"; - default: return "broken"; + case _PAGE_CACHE_UC: return "uncached"; + case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_WB: return "write-back"; + case _PAGE_CACHE_WC: return "write-combining"; + default: return "broken"; } } @@ -536,11 +536,11 @@ int 
@@ -536,10 +536,10 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	 * we maintain the tradition of paranoia in this code.
 	 */
 	if (!pat_wc_enabled &&
-	    ! ( boot_cpu_has(X86_FEATURE_MTRR) ||
-		boot_cpu_has(X86_FEATURE_K6_MTRR) ||
-		boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
-		boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
-	   (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
+	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
+	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
+	      boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
+	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
 		flags = _PAGE_CACHE_UC;
 	}
 #endif
@@ -562,7 +562,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 		return 0;
 
 	if (pfn <= max_pfn_mapped &&
-            ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
+	    ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
 		free_memtype(offset, offset + size);
 		printk(KERN_INFO
 		"%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -600,4 +600,3 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
 
 	free_memtype(addr, addr + size);
 }
-
-- cgit v1.2.3

From 499f8f84b8324ba27d756e03f373fa16eeed9ccc Mon Sep 17 00:00:00 2001
From: Andreas Herrmann
Date: Tue, 10 Jun 2008 16:06:21 +0200
Subject: x86: rename pat_wc_enabled to pat_enabled

BTW, what does pat_wc_enabled stand for?  Does it mean
"write-combining"?

Currently it is used to globally switch PAT support on or off.  Thus I
renamed it to pat_enabled.  I think this increases readability (and
hope that I didn't miss something).

Signed-off-by: Andreas Herrmann
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/ioremap.c  |  4 ++--
 arch/x86/mm/pageattr.c |  2 +-
 arch/x86/mm/pat.c      | 16 ++++++++--------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 71bb3159031..ddeafed1171 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -261,7 +261,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
 {
 	/*
 	 * Ideally, this should be:
-	 *	pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
+	 *	pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
 	 *
 	 * Till we fix all X drivers to use ioremap_wc(), we will use
 	 * UC MINUS.
@@ -285,7 +285,7 @@ EXPORT_SYMBOL(ioremap_nocache);
 */
 void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
 {
-	if (pat_wc_enabled)
+	if (pat_enabled)
 		return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
 					__builtin_return_address(0));
 	else
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 60bcb5b6a37..6916fe4bf0c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -805,7 +805,7 @@ int _set_memory_wc(unsigned long addr, int numpages)
 
 int set_memory_wc(unsigned long addr, int numpages)
 {
-	if (!pat_wc_enabled)
+	if (!pat_enabled)
 		return set_memory_uc(addr, numpages);
 
 	if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index a8b69bb2697..4beccea0897 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -26,11 +26,11 @@
 #include
 
 #ifdef CONFIG_X86_PAT
-int __read_mostly pat_wc_enabled = 1;
+int __read_mostly pat_enabled = 1;
 
 void __cpuinit pat_disable(char *reason)
 {
-	pat_wc_enabled = 0;
+	pat_enabled = 0;
 	printk(KERN_INFO "%s\n", reason);
 }
 
@@ -72,7 +72,7 @@ void pat_init(void)
 {
 	u64 pat;
 
-	if (!pat_wc_enabled)
+	if (!pat_enabled)
 		return;
 	/* Paranoia check. */
@@ -225,8 +225,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	unsigned long actual_type;
 	int err = 0;
 
-	/* Only track when pat_wc_enabled */
-	if (!pat_wc_enabled) {
+	/* Only track when pat_enabled */
+	if (!pat_enabled) {
 		/* This is identical to page table setting without PAT */
 		if (ret_type) {
 			if (req_type == -1) {
@@ -440,8 +440,8 @@ int free_memtype(u64 start, u64 end)
 	struct memtype *ml;
 	int err = -EINVAL;
 
-	/* Only track when pat_wc_enabled */
-	if (!pat_wc_enabled) {
+	/* Only track when pat_enabled */
+	if (!pat_enabled) {
 		return 0;
 	}
 
@@ -535,7 +535,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	 * caching for the high addresses through the KEN pin, but
 	 * we maintain the tradition of paranoia in this code.
 	 */
-	if (!pat_wc_enabled &&
+	if (!pat_enabled &&
 	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
 	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
 	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
 	      boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
 	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
 		flags = _PAGE_CACHE_UC;
 	}
-- cgit v1.2.3

From 6cf514fce18589ea1e0521c5f2d7c2bb280fefc7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Mon, 16 Jun 2008 18:42:43 +0100
Subject: x86: PAT: make pat_x_mtrr_type() more readable

Clean up over-complications in pat_x_mtrr_type().  And if
reserve_memtype() ignores stray req_type bits when pat_enabled, it's
better to mask them off when it is not, also.

Signed-off-by: Hugh Dickins
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/pat.c | 47 ++++++++++++-----------------------------------
 1 file changed, 12 insertions(+), 35 deletions(-)

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 7c21572bbdd..ac3a2b11eb3 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -159,47 +159,31 @@ static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype list */
 * The intersection is based on "Effective Memory Type" tables in IA-32
 * SDM vol 3a
 */
-static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
-			   unsigned long *ret_prot)
+static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
 {
-	unsigned long pat_type;
 	u8 mtrr_type;
 
-	pat_type = prot & _PAGE_CACHE_MASK;
-	prot &= (~_PAGE_CACHE_MASK);
-
 	/*
 	 * We return the PAT request directly for types where PAT takes
 	 * precedence with respect to MTRR and for UC_MINUS.
 	 * Consistency checks with other PAT requests is done later
 	 * while going through memtype list.
 	 */
-	if (pat_type == _PAGE_CACHE_WC) {
-		*ret_prot = prot | _PAGE_CACHE_WC;
-		return 0;
-	} else if (pat_type == _PAGE_CACHE_UC_MINUS) {
-		*ret_prot = prot | _PAGE_CACHE_UC_MINUS;
-		return 0;
-	} else if (pat_type == _PAGE_CACHE_UC) {
-		*ret_prot = prot | _PAGE_CACHE_UC;
-		return 0;
-	}
+	if (req_type == _PAGE_CACHE_WC ||
+	    req_type == _PAGE_CACHE_UC_MINUS ||
+	    req_type == _PAGE_CACHE_UC)
+		return req_type;
 
 	/*
 	 * Look for MTRR hint to get the effective type in case where PAT
 	 * request is for WB.
 	 */
 	mtrr_type = mtrr_type_lookup(start, end);
-
-	if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
-		*ret_prot = prot | _PAGE_CACHE_UC;
-	} else if (mtrr_type == MTRR_TYPE_WRCOMB) {
-		*ret_prot = prot | _PAGE_CACHE_WC;
-	} else {
-		*ret_prot = prot | _PAGE_CACHE_WB;
-	}
-
-	return 0;
+	if (mtrr_type == MTRR_TYPE_UNCACHABLE)
+		return _PAGE_CACHE_UC;
+	if (mtrr_type == MTRR_TYPE_WRCOMB)
+		return _PAGE_CACHE_WC;
+	return _PAGE_CACHE_WB;
 }
 
 /*
@@ -232,7 +216,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 		if (req_type == -1) {
 			*ret_type = _PAGE_CACHE_WB;
 		} else {
-			*ret_type = req_type;
+			*ret_type = req_type & _PAGE_CACHE_MASK;
 		}
 	}
 	return 0;
@@ -264,14 +248,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 		}
 	} else {
 		req_type &= _PAGE_CACHE_MASK;
-		err = pat_x_mtrr_type(start, end, req_type, &actual_type);
-	}
-
-	if (err) {
-		if (ret_type)
-			*ret_type = actual_type;
-
-		return -EINVAL;
+		actual_type = pat_x_mtrr_type(start, end, req_type);
 	}
 
 	new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
-- cgit v1.2.3

From dd0c7c4903c29da9aa3bf33deecf064d190a0d81 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann
Date: Wed, 18 Jun 2008 15:38:57 +0200
Subject: x86: shrink pat_x_mtrr_type to its essentials

Signed-off-by: Andreas Herrmann
Cc: Venkatesh Pallipadi
Cc: Suresh B Siddha
Cc: Hugh Dickins
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/pat.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index ac3a2b11eb3..227df3ca9bf 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -161,29 +161,21 @@ static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype list */
 */
 static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
 {
-	u8 mtrr_type;
-
-	/*
-	 * We return the PAT request directly for types where PAT takes
-	 * precedence with respect to MTRR and for UC_MINUS.
-	 * Consistency checks with other PAT requests is done later
-	 * while going through memtype list.
-	 */
-	if (req_type == _PAGE_CACHE_WC ||
-	    req_type == _PAGE_CACHE_UC_MINUS ||
-	    req_type == _PAGE_CACHE_UC)
-		return req_type;
-
 	/*
 	 * Look for MTRR hint to get the effective type in case where PAT
 	 * request is for WB.
 	 */
-	mtrr_type = mtrr_type_lookup(start, end);
-	if (mtrr_type == MTRR_TYPE_UNCACHABLE)
-		return _PAGE_CACHE_UC;
-	if (mtrr_type == MTRR_TYPE_WRCOMB)
-		return _PAGE_CACHE_WC;
-	return _PAGE_CACHE_WB;
+	if (req_type == _PAGE_CACHE_WB) {
+		u8 mtrr_type;
+
+		mtrr_type = mtrr_type_lookup(start, end);
+		if (mtrr_type == MTRR_TYPE_UNCACHABLE)
+			return _PAGE_CACHE_UC;
+		if (mtrr_type == MTRR_TYPE_WRCOMB)
+			return _PAGE_CACHE_WC;
+	}
+
+	return req_type;
 }
 
 /*
-- cgit v1.2.3

From 7c7e6e07e2a7c0d2d96389f4f0540e44a80ecdaa Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Tue, 17 Jun 2008 11:41:54 -0700
Subject: x86: unify __set_fixmap

In both cases, I went with the 32-bit behaviour.
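For orientation, the usual consumer side of __set_fixmap() — a sketch
of the fixmap idiom (set_fixmap_nocache()/fix_to_virt() exist in
fixmap.h; the surrounding usage here is illustrative):

	void __iomem *map_fixed(enum fixed_addresses idx, unsigned long phys)
	{
		set_fixmap_nocache(idx, phys);	/* routed to __set_fixmap() */
		/* fix_to_virt() folds to a compile-time-constant address */
		return (void __iomem *)fix_to_virt(idx);
	}

Unifying the 32-bit and 64-bit implementations means both paths now
bump the shared fixmaps_set counter, as the diffs below show.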
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 12 ------------ arch/x86/mm/pgtable.c | 14 ++++++++++++++ arch/x86/mm/pgtable_32.c | 14 +------------- 3 files changed, 15 insertions(+), 25 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 156e6d7b0e3..e5f53194985 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -213,18 +213,6 @@ void __init cleanup_highmap(void) } } -/* NOTE: this is meant to be run only at boot */ -void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - printk(KERN_ERR "Invalid __set_fixmap\n"); - return; - } - set_pte_phys(address, phys, prot); -} - static unsigned long __initdata table_start; static unsigned long __meminitdata table_end; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 50159764f69..3ebebe480b5 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -274,3 +274,17 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, return young; } + +int fixmaps_set; + +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + fixmaps_set++; +} diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 369cf065b6a..3f97c3c8728 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -145,18 +145,6 @@ static int fixmaps; unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); -void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - BUG(); - return; - } - set_pte_pfn(address, phys >> PAGE_SHIFT, flags); - fixmaps++; -} - /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve @@ -166,7 +154,7 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) */ void reserve_top_address(unsigned long reserve) { - BUG_ON(fixmaps > 0); + BUG_ON(fixmaps_set > 0); printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", (int)-reserve); __FIXADDR_TOP = -reserve - PAGE_SIZE; -- cgit v1.2.3 From d494a96125c99f1e37b1f831b29b42c9b712ee05 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jun 2008 11:41:59 -0700 Subject: x86: implement set_pte_vaddr Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 9 ++++----- arch/x86/mm/pgtable.c | 2 +- arch/x86/mm/pgtable_32.c | 6 +++--- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e5f53194985..74fae833512 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -135,15 +135,15 @@ static __init void *spp_getpage(void) return ptr; } -static void -set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) +void +set_pte_vaddr(unsigned long vaddr, pte_t new_pte) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte, new_pte; + pte_t *pte; - pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys); + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(new_pte)); pgd = pgd_offset_k(vaddr); if (pgd_none(*pgd)) { @@ -170,7 +170,6 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t 
prot) return; } } - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); pte = pte_offset_kernel(pmd, vaddr); if (!pte_none(*pte) && pte_val(new_pte) && diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3ebebe480b5..7498124e30f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -285,6 +285,6 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) BUG(); return; } - set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + set_pte_vaddr(address, pfn_pte(phys >> PAGE_SHIFT, flags)); fixmaps_set++; } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 3f97c3c8728..0662f345212 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -71,7 +71,7 @@ void show_mem(void) * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. */ -static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud; @@ -94,8 +94,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) return; } pte = pte_offset_kernel(pmd, vaddr); - if (pgprot_val(flags)) - set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); + if (pte_val(pteval)) + set_pte_present(&init_mm, vaddr, pte, pteval); else pte_clear(&init_mm, vaddr, pte); -- cgit v1.2.3 From aeaaa59c7e15dcfaaf57ce069ef81683067d575d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jun 2008 11:42:01 -0700 Subject: x86/paravirt/xen: add set_fixmap pv_mmu_ops Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Juan Quintela Signed-off-by: Eduardo Habkost Signed-off-by: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7498124e30f..e9fb66361fc 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -277,7 +277,7 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, int fixmaps_set; -void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); @@ -285,6 +285,11 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) BUG(); return; } - set_pte_vaddr(address, pfn_pte(phys >> PAGE_SHIFT, flags)); + set_pte_vaddr(address, pte); fixmaps_set++; } + +void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +{ + __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); +} -- cgit v1.2.3 From a1d5a8691f1b6c92491747bea3b778b184fa5837 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Jun 2008 15:34:46 +0200 Subject: x86: unify __set_fixmap, fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix build failure: arch/x86/mm/pgtable.c:280: warning: ‘enum fixed_addresses’ declared inside parameter list arch/x86/mm/pgtable.c:280: warning: its scope is only this definition or declaration, which is probably not what you want arch/x86/mm/pgtable.c:280: error: parameter 1 (‘idx’) has incomplete type Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e9fb66361fc..892fd1892b8 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -2,6 +2,7 @@ #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> +#include <asm/fixmap.h> 
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { -- cgit v1.2.3 From bcc643dc287cb732e96a1685ac130c3ae8b1c960 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 20 Jun 2008 21:58:46 +0200 Subject: x86: introduce macro to check whether an address range is in the ISA range Signed-off-by: Andreas Herrmann Cc: Venkatesh Pallipadi Cc: Suresh B Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 +- arch/x86/mm/pat.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7452eb31ed1..6d9960dd6f3 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -142,7 +142,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't remap the low PCI/ISA area, it's always mapped.. */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + if (is_ISA_range(phys_addr, last_addr)) return (__force void __iomem *)phys_to_virt(phys_addr); /* diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 227df3ca9bf..5bbc22efe4e 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -215,7 +215,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { + if (is_ISA_range(start, end - 1)) { if (ret_type) *ret_type = _PAGE_CACHE_WB; @@ -415,9 +415,8 @@ int free_memtype(u64 start, u64 end) } /* Low ISA region is always mapped WB. No need to track */ - if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { + if (is_ISA_range(start, end - 1)) return 0; - } spin_lock(&memtype_lock); list_for_each_entry(ml, &memtype_list, nd) { -- cgit v1.2.3 From ac97991ec9e0a05c8860f4a04f8477227b1c03f2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 20 Jun 2008 22:01:49 +0200 Subject: x86: pat.c choose more crisp variable names - parse/ml => entry (within list_for_each and friends) - new_entry => new - ret_type => new_type (to avoid confusion with req_type) (... to make it more readable...) Signed-off-by: Andreas Herrmann Cc: Venkatesh Pallipadi Cc: Suresh B Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 127 ++++++++++++++++++++++++++---------------------------- 1 file changed, 62 insertions(+), 65 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 5bbc22efe4e..a6507bfb12a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -188,37 +188,34 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) * req_type will have a special case value '-1', when requester want to inherit * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. * - * If ret_type is NULL, function will return an error if it cannot reserve the - * region with req_type. If ret_type is non-null, function will return - * available type in ret_type in case of no error. In case of any error + * If new_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If new_type is non-NULL, function will return + * available type in new_type in case of no error. In case of any error * it will return a negative return value. 
*/ int reserve_memtype(u64 start, u64 end, unsigned long req_type, - unsigned long *ret_type) + unsigned long *new_type) { - struct memtype *new_entry = NULL; - struct memtype *parse; + struct memtype *new, *entry; unsigned long actual_type; int err = 0; /* Only track when pat_enabled */ if (!pat_enabled) { /* This is identical to page table setting without PAT */ - if (ret_type) { - if (req_type == -1) { - *ret_type = _PAGE_CACHE_WB; - } else { - *ret_type = req_type & _PAGE_CACHE_MASK; - } + if (new_type) { + if (req_type == -1) + *new_type = _PAGE_CACHE_WB; + else + *new_type = req_type & _PAGE_CACHE_MASK; } return 0; } /* Low ISA region is always mapped WB in page table. No need to track */ if (is_ISA_range(start, end - 1)) { - if (ret_type) - *ret_type = _PAGE_CACHE_WB; - + if (new_type) + *new_type = _PAGE_CACHE_WB; return 0; } @@ -243,65 +240,65 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, actual_type = pat_x_mtrr_type(start, end, req_type); } - new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); - if (!new_entry) + new = kmalloc(sizeof(struct memtype), GFP_KERNEL); + if (!new) return -ENOMEM; - new_entry->start = start; - new_entry->end = end; - new_entry->type = actual_type; + new->start = start; + new->end = end; + new->type = actual_type; - if (ret_type) - *ret_type = actual_type; + if (new_type) + *new_type = actual_type; spin_lock(&memtype_lock); /* Search for existing mapping that overlaps the current range */ - list_for_each_entry(parse, &memtype_list, nd) { + list_for_each_entry(entry, &memtype_list, nd) { struct memtype *saved_ptr; - if (parse->start >= end) { + if (entry->start >= end) { dprintk("New Entry\n"); - list_add(&new_entry->nd, parse->nd.prev); - new_entry = NULL; + list_add(&new->nd, entry->nd.prev); + new = NULL; break; } - if (start <= parse->start && end >= parse->start) { - if (actual_type != parse->type && ret_type) { - actual_type = parse->type; - *ret_type = actual_type; - new_entry->type = actual_type; + if (start <= entry->start && end >= entry->start) { + if (actual_type != entry->type && new_type) { + actual_type = entry->type; + *new_type = actual_type; + new->type = actual_type; } - if (actual_type != parse->type) { + if (actual_type != entry->type) { printk( KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", current->comm, current->pid, start, end, cattr_name(actual_type), - cattr_name(parse->type)); + cattr_name(entry->type)); err = -EBUSY; break; } - saved_ptr = parse; + saved_ptr = entry; /* * Check to see whether the request overlaps more * than one entry in the list */ - list_for_each_entry_continue(parse, &memtype_list, nd) { - if (end <= parse->start) { + list_for_each_entry_continue(entry, &memtype_list, nd) { + if (end <= entry->start) { break; } - if (actual_type != parse->type) { + if (actual_type != entry->type) { printk( KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", current->comm, current->pid, start, end, cattr_name(actual_type), - cattr_name(parse->type)); + cattr_name(entry->type)); err = -EBUSY; break; } @@ -314,46 +311,46 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); /* No conflict. 
Go ahead and add this new entry */ - list_add(&new_entry->nd, saved_ptr->nd.prev); - new_entry = NULL; + list_add(&new->nd, saved_ptr->nd.prev); + new = NULL; break; } - if (start < parse->end) { - if (actual_type != parse->type && ret_type) { - actual_type = parse->type; - *ret_type = actual_type; - new_entry->type = actual_type; + if (start < entry->end) { + if (actual_type != entry->type && new_type) { + actual_type = entry->type; + *new_type = actual_type; + new->type = actual_type; } - if (actual_type != parse->type) { + if (actual_type != entry->type) { printk( KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", current->comm, current->pid, start, end, cattr_name(actual_type), - cattr_name(parse->type)); + cattr_name(entry->type)); err = -EBUSY; break; } - saved_ptr = parse; + saved_ptr = entry; /* * Check to see whether the request overlaps more * than one entry in the list */ - list_for_each_entry_continue(parse, &memtype_list, nd) { - if (end <= parse->start) { + list_for_each_entry_continue(entry, &memtype_list, nd) { + if (end <= entry->start) { break; } - if (actual_type != parse->type) { + if (actual_type != entry->type) { printk( KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", current->comm, current->pid, start, end, cattr_name(actual_type), - cattr_name(parse->type)); + cattr_name(entry->type)); err = -EBUSY; break; } @@ -366,8 +363,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ - list_add(&new_entry->nd, &saved_ptr->nd); - new_entry = NULL; + list_add(&new->nd, &saved_ptr->nd); + new = NULL; break; } } @@ -375,24 +372,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (err) { printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", - start, end, cattr_name(new_entry->type), + start, end, cattr_name(new->type), cattr_name(req_type)); - kfree(new_entry); + kfree(new); spin_unlock(&memtype_lock); return err; } - if (new_entry) { + if (new) { /* No conflict. Not yet added to the list. Add to the tail */ - list_add_tail(&new_entry->nd, &memtype_list); + list_add_tail(&new->nd, &memtype_list); dprintk("New Entry\n"); } - if (ret_type) { + if (new_type) { dprintk( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", start, end, cattr_name(actual_type), - cattr_name(req_type), cattr_name(*ret_type)); + cattr_name(req_type), cattr_name(*new_type)); } else { dprintk( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", @@ -406,7 +403,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, int free_memtype(u64 start, u64 end) { - struct memtype *ml; + struct memtype *entry; int err = -EINVAL; /* Only track when pat_enabled */ @@ -419,10 +416,10 @@ int free_memtype(u64 start, u64 end) return 0; spin_lock(&memtype_lock); - list_for_each_entry(ml, &memtype_list, nd) { - if (ml->start == start && ml->end == end) { - list_del(&ml->nd); - kfree(ml); + list_for_each_entry(entry, &memtype_list, nd) { + if (entry->start == start && entry->end == end) { + list_del(&entry->nd); + kfree(entry); err = 0; break; } -- cgit v1.2.3 From 69e26be9b1d0c83d3581475095ce2a1ccc578215 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 20 Jun 2008 22:03:06 +0200 Subject: x86: pat.c more trivial changes - add BUG statement to catch invalid start and end parameters - No need to track the actual type in both req_type and actual_type -- keep req_type unchanged. 
- removed (IMHO) superfluous comments Signed-off-by: Andreas Herrmann Cc: Venkatesh Pallipadi Cc: Suresh B Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index a6507bfb12a..c996a364120 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -200,7 +200,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long actual_type; int err = 0; - /* Only track when pat_enabled */ + BUG_ON(start >= end); /* end is exclusive */ + if (!pat_enabled) { /* This is identical to page table setting without PAT */ if (new_type) { @@ -228,17 +229,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, */ u8 mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == MTRR_TYPE_WRBACK) { - req_type = _PAGE_CACHE_WB; + if (mtrr_type == MTRR_TYPE_WRBACK) actual_type = _PAGE_CACHE_WB; - } else { - req_type = _PAGE_CACHE_UC_MINUS; + else actual_type = _PAGE_CACHE_UC_MINUS; - } - } else { - req_type &= _PAGE_CACHE_MASK; - actual_type = pat_x_mtrr_type(start, end, req_type); - } + } else + actual_type = pat_x_mtrr_type(start, end, + req_type & _PAGE_CACHE_MASK); new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -406,10 +403,8 @@ int free_memtype(u64 start, u64 end) struct memtype *entry; int err = -EINVAL; - /* Only track when pat_enabled */ - if (!pat_enabled) { + if (!pat_enabled) return 0; - } /* Low ISA region is always mapped WB. No need to track */ if (is_ISA_range(start, end - 1)) -- cgit v1.2.3 From 3e9c83b309fd7cbf1d9b801d0d5877c040e30420 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 20 Jun 2008 22:04:02 +0200 Subject: x86: pat.c consolidate error/debug messages in reserve_memtype ... and move last debug message out of locked section. 
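The resulting shape, as a rough sketch (illustrative only, not the verbatim pat.c code): only the list search and insertion stay under memtype_lock, and the summary message is emitted after the unlock:

	spin_lock(&memtype_lock);
	/* ... walk memtype_list, detect conflicts, insert the new entry ... */
	spin_unlock(&memtype_lock);

	/* reporting outside the critical section keeps lock hold times short */
	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
		start, end, cattr_name(new->type), cattr_name(req_type));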
Signed-off-by: Andreas Herrmann Cc: Venkatesh Pallipadi Cc: Suresh B Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 51 +++++++++++---------------------------------------- 1 file changed, 11 insertions(+), 40 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index c996a364120..1118288f8fe 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -269,12 +269,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (actual_type != entry->type) { - printk( - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, - start, end, - cattr_name(actual_type), - cattr_name(entry->type)); err = -EBUSY; break; } @@ -290,12 +284,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (actual_type != entry->type) { - printk( - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, - start, end, - cattr_name(actual_type), - cattr_name(entry->type)); err = -EBUSY; break; } @@ -321,12 +309,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (actual_type != entry->type) { - printk( - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, - start, end, - cattr_name(actual_type), - cattr_name(entry->type)); err = -EBUSY; break; } @@ -342,12 +324,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (actual_type != entry->type) { - printk( - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, - start, end, - cattr_name(actual_type), - cattr_name(entry->type)); err = -EBUSY; break; } @@ -367,10 +343,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (err) { - printk(KERN_INFO - "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", - start, end, cattr_name(new->type), - cattr_name(req_type)); + printk(KERN_INFO "%s:%d conflicting memory types " + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start, + end, cattr_name(new->type), cattr_name(entry->type)); + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " + "track %s, req %s\n", + start, end, cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); return err; @@ -382,19 +360,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("New Entry\n"); } - if (new_type) { - dprintk( - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", - start, end, cattr_name(actual_type), - cattr_name(req_type), cattr_name(*new_type)); - } else { - dprintk( - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", - start, end, cattr_name(actual_type), - cattr_name(req_type)); - } - spin_unlock(&memtype_lock); + + dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", + start, end, cattr_name(new->type), cattr_name(req_type), + new_type ? 
cattr_name(*new_type) : "-"); + return err; } -- cgit v1.2.3 From f6887264deba4cd991f0ca006918dcff4c939021 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 20 Jun 2008 22:05:37 +0200 Subject: x86: pat.c consolidate list_add handling in reserve_memtype Signed-off-by: Andreas Herrmann Cc: Venkatesh Pallipadi Cc: Suresh B Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1118288f8fe..49dcd9652ec 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -198,6 +198,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, { struct memtype *new, *entry; unsigned long actual_type; + struct list_head *where; int err = 0; BUG_ON(start >= end); /* end is exclusive */ @@ -251,13 +252,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); /* Search for existing mapping that overlaps the current range */ + where = NULL; list_for_each_entry(entry, &memtype_list, nd) { struct memtype *saved_ptr; if (entry->start >= end) { - dprintk("New Entry\n"); - list_add(&new->nd, entry->nd.prev); - new = NULL; + where = entry->nd.prev; break; } @@ -295,9 +295,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); - /* No conflict. Go ahead and add this new entry */ - list_add(&new->nd, saved_ptr->nd.prev); - new = NULL; + where = saved_ptr->nd.prev; break; } @@ -335,9 +333,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); - /* No conflict. Go ahead and add this new entry */ - list_add(&new->nd, &saved_ptr->nd); - new = NULL; + where = &saved_ptr->nd; break; } } @@ -354,11 +350,10 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return err; } - if (new) { - /* No conflict. Not yet added to the list. Add to the tail */ + if (where) + list_add(&new->nd, where); + else list_add_tail(&new->nd, &memtype_list); - dprintk("New Entry\n"); - } spin_unlock(&memtype_lock); -- cgit v1.2.3 From 64fe44c38bbdfab4fe052029058ce5fe9804de68 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 20 Jun 2008 22:07:09 +0200 Subject: x86: pat.c introduce function to check for conflicts with existing memtypes ... to strip down loop body in reserve_memtype. 
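The new helper leans on list_for_each_entry_continue(), which resumes iteration at the node after the passed-in cursor instead of restarting from the list head. In condensed sketch form (assuming entry already points at the first overlapping element, as it does at the call sites below):

	/* 'entry' is the first overlap; make sure the rest agree, too */
	list_for_each_entry_continue(entry, &memtype_list, nd) {
		if (new->end <= entry->start)
			break;			/* left the overlap window */
		if (new->type != entry->type)
			return -EBUSY;		/* attribute conflict */
	}
	return 0;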
Signed-off-by: Andreas Herrmann Cc: Venkatesh Pallipadi Cc: Suresh B Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 96 +++++++++++++++++++------------------------------------ 1 file changed, 33 insertions(+), 63 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 49dcd9652ec..281ac6489c0 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -178,6 +178,33 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } +static int chk_conflict(struct memtype *new, struct memtype *entry, + unsigned long *type) +{ + if (new->type != entry->type) { + if (type) { + new->type = entry->type; + *type = entry->type; + } else + goto conflict; + } + + /* check overlaps with more than one entry in the list */ + list_for_each_entry_continue(entry, &memtype_list, nd) { + if (new->end <= entry->start) + break; + else if (new->type != entry->type) + goto conflict; + } + return 0; + + conflict: + printk(KERN_INFO "%s:%d conflicting memory types " + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start, + new->end, cattr_name(new->type), cattr_name(entry->type)); + return -EBUSY; +} + /* * req_type typically has one of the: * - _PAGE_CACHE_WB @@ -254,94 +281,37 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, /* Search for existing mapping that overlaps the current range */ where = NULL; list_for_each_entry(entry, &memtype_list, nd) { - struct memtype *saved_ptr; - if (entry->start >= end) { where = entry->nd.prev; break; } if (start <= entry->start && end >= entry->start) { - if (actual_type != entry->type && new_type) { - actual_type = entry->type; - *new_type = actual_type; - new->type = actual_type; - } - - if (actual_type != entry->type) { - err = -EBUSY; - break; - } - - saved_ptr = entry; - /* - * Check to see whether the request overlaps more - * than one entry in the list - */ - list_for_each_entry_continue(entry, &memtype_list, nd) { - if (end <= entry->start) { - break; - } - - if (actual_type != entry->type) { - err = -EBUSY; - break; - } - } - + err = chk_conflict(new, entry, new_type); if (err) { break; } dprintk("Overlap at 0x%Lx-0x%Lx\n", - saved_ptr->start, saved_ptr->end); - where = saved_ptr->nd.prev; + entry->start, entry->end); + where = entry->nd.prev; break; } if (start < entry->end) { - if (actual_type != entry->type && new_type) { - actual_type = entry->type; - *new_type = actual_type; - new->type = actual_type; - } - - if (actual_type != entry->type) { - err = -EBUSY; - break; - } - - saved_ptr = entry; - /* - * Check to see whether the request overlaps more - * than one entry in the list - */ - list_for_each_entry_continue(entry, &memtype_list, nd) { - if (end <= entry->start) { - break; - } - - if (actual_type != entry->type) { - err = -EBUSY; - break; - } - } - + err = chk_conflict(new, entry, new_type); if (err) { break; } dprintk("Overlap at 0x%Lx-0x%Lx\n", - saved_ptr->start, saved_ptr->end); - where = &saved_ptr->nd; + entry->start, entry->end); + where = &entry->nd; break; } } if (err) { - printk(KERN_INFO "%s:%d conflicting memory types " - "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start, - end, cattr_name(new->type), cattr_name(entry->type)); printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " "track %s, req %s\n", start, end, cattr_name(new->type), cattr_name(req_type)); -- cgit v1.2.3 From 33af9039cbf629041da2bfa0cf451208391a1ec3 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 20 Jun 2008 22:08:37 +0200 Subject: x86: pat.c 
final cleanup of loop body in reserve_memtype Signed-off-by: Andreas Herrmann Cc: Venkatesh Pallipadi Cc: Suresh B Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 281ac6489c0..a885a1019b8 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -281,32 +281,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, /* Search for existing mapping that overlaps the current range */ where = NULL; list_for_each_entry(entry, &memtype_list, nd) { - if (entry->start >= end) { + if (end <= entry->start) { where = entry->nd.prev; break; - } - - if (start <= entry->start && end >= entry->start) { + } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); - if (err) { - break; + if (!err) { + dprintk("Overlap at 0x%Lx-0x%Lx\n", + entry->start, entry->end); + where = entry->nd.prev; } - - dprintk("Overlap at 0x%Lx-0x%Lx\n", - entry->start, entry->end); - where = entry->nd.prev; break; - } - - if (start < entry->end) { + } else if (start < entry->end) { /* start > entry->start */ err = chk_conflict(new, entry, new_type); - if (err) { - break; + if (!err) { + dprintk("Overlap at 0x%Lx-0x%Lx\n", + entry->start, entry->end); + where = &entry->nd; } - - dprintk("Overlap at 0x%Lx-0x%Lx\n", - entry->start, entry->end); - where = &entry->nd; break; } } -- cgit v1.2.3 From f294a8ce211bed7bfaca19bef21376a86200c421 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 1 Jul 2008 15:38:13 +0200 Subject: x86: small unifications of address printing 'man 3 printf' tells me that %p should be printed as if by %#x, but this is not true for the kernel, which does not use the '0x' prefix for the %p conversion specifier. A small cast to (void *) is also prettier than #ifdef/#else/#endif. Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8bcb6f40ccb..0eb70d1dd1f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -396,11 +396,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); -#ifdef CONFIG_X86_32 - printk(KERN_CONT " at %08lx\n", address); -#else - printk(KERN_CONT " at %016lx\n", address); -#endif + printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip, 1); dump_pagetable(address); @@ -800,14 +796,10 @@ bad_area_nosemaphore: if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { printk( -#ifdef CONFIG_X86_32 - "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx", -#else - "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx", -#endif + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", task_pid_nr(tsk) > 1 ? 
KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, regs->ip, - regs->sp, error_code); + tsk->comm, task_pid_nr(tsk), address, + (void *) regs->ip, (void *) regs->sp, error_code); print_vma_addr(" in ", regs->ip); printk("\n"); } -- cgit v1.2.3 From 95c60b08c6af6db2165837139da10f593462d51c Mon Sep 17 00:00:00 2001 From: Gustavo Fernando Padovan Date: Wed, 25 Jun 2008 04:03:19 -0300 Subject: x86: remove unnecessary #ifdef CONFIG_X86_32...#else Remove the #ifdef conditional because this comparison is already done in user_mode_vm(). Signed-off-by: Gustavo F. Padovan Cc: akpm@osdl.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8bcb6f40ccb..1e64795714c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -55,11 +55,7 @@ static inline int notify_page_fault(struct pt_regs *regs) int ret = 0; /* kprobe_running() needs smp_processor_id() */ -#ifdef CONFIG_X86_32 if (!user_mode_vm(regs)) { -#else - if (!user_mode(regs)) { -#endif preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; -- cgit v1.2.3 From ce0c0e50f94e8c55b00a722e8c6e8d6c802be211 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 2 May 2008 11:46:49 +0200 Subject: x86, generic: CPA add statistics about state of direct mapping v4 Add information about the mapping state of the direct mapping to /proc/meminfo. I chose /proc/meminfo because that is where all the other memory statistics are too and it is a generally useful metric even outside debugging situations. A lot of split kernel pages means the kernel will run slower. This way we can see how many large pages are really used for it and how many are split. Useful for general insight into the kernel. v2: Add hotplug locking to 64bit to plug a very obscure theoretical race. 32bit doesn't need it because it doesn't support hotadd for lowmem. 
Fix some typos v3: Rename dpages_cnt Add CONFIG ifdef for count update as requested by tglx Expand description v4: Fix stupid bugs added in v3 Move update_page_count to pageattr.c Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 5 +++++ arch/x86/mm/init_64.c | 7 +++++++ arch/x86/mm/pageattr.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ec30d10154b..0269ac230bf 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -162,6 +162,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) pgd_t *pgd; pmd_t *pmd; pte_t *pte; + unsigned pages_2m = 0, pages_4k = 0; pgd_idx = pgd_index(PAGE_OFFSET); pgd = pgd_base + pgd_idx; @@ -197,6 +198,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) is_kernel_text(addr2)) prot = PAGE_KERNEL_LARGE_EXEC; + pages_2m++; set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; @@ -213,11 +215,14 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) if (is_kernel_text(addr)) prot = PAGE_KERNEL_EXEC; + pages_4k++; set_pte(pte, pfn_pte(pfn, prot)); } max_pfn_mapped = pfn; } } + update_page_count(PG_LEVEL_2M, pages_2m); + update_page_count(PG_LEVEL_4K, pages_4k); } static inline int page_kills_ppro(unsigned long pagenr) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 819dad973b1..5e438385905 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -312,6 +312,8 @@ __meminit void early_iounmap(void *addr, unsigned long size) static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { + unsigned long pages = 0; + int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { @@ -328,9 +330,11 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) if (pmd_val(*pmd)) continue; + pages++; set_pte((pte_t *)pmd, pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); } + update_page_count(PG_LEVEL_2M, pages); return address; } @@ -350,6 +354,7 @@ phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { + unsigned long pages = 0; unsigned long last_map_addr = end; int i = pud_index(addr); @@ -374,6 +379,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) } if (direct_gbpages) { + pages++; set_pte((pte_t *)pud, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); last_map_addr = (addr & PUD_MASK) + PUD_SIZE; @@ -390,6 +396,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) unmap_low_page(pmd); } __flush_tlb_all(); + update_page_count(PG_LEVEL_1G, pages); return last_map_addr >> PAGE_SHIFT; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b6a37..668205bca15 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -34,6 +34,19 @@ struct cpa_data { unsigned force_split : 1; }; +static unsigned long direct_pages_count[PG_LEVEL_NUM]; + +void __meminit update_page_count(int level, unsigned long pages) +{ +#ifdef CONFIG_PROC_FS + unsigned long flags; + /* Protect against CPA */ + spin_lock_irqsave(&pgd_lock, flags); + direct_pages_count[level] += pages; + spin_unlock_irqrestore(&pgd_lock, flags); +#endif +} + #ifdef CONFIG_X86_64 static inline unsigned long highmap_start_pfn(void) @@ -500,6 +513,12 @@ static int split_large_page(pte_t *kpte, unsigned 
long address) for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + if (address >= (unsigned long)__va(0) && + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) { + direct_pages_count[level]--; + direct_pages_count[level - 1] += PTRS_PER_PTE; + } + /* * Install the new, split up pagetable. Important details here: * @@ -1029,6 +1048,22 @@ bool kernel_page_present(struct page *page) #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifdef CONFIG_PROC_FS +int arch_report_meminfo(char *page) +{ + int n; + n = sprintf(page, "DirectMap4k: %8lu\n" + "DirectMap2M: %8lu\n", + direct_pages_count[PG_LEVEL_4K], + direct_pages_count[PG_LEVEL_2M]); +#ifdef CONFIG_X86_64 + n += sprintf(page + n, "DirectMap1G: %8lu\n", + direct_pages_count[PG_LEVEL_1G]); +#endif + return n; +} +#endif + /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. -- cgit v1.2.3 From 65280e613fada41704f35709b6c8952ca4b8750c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 May 2008 16:35:21 +0200 Subject: x86: janitor CPA statistics patch 1) Remove __meminit from update_pages_count. It is used inside split_pages() 2) Make the code depend on PROC_FS. Doing statistics for nothing is useless and not adding useless code is nice to the Linux tiny folks. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 668205bca15..0a3f5e047f8 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -34,18 +34,40 @@ struct cpa_data { unsigned force_split : 1; }; +#ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; -void __meminit update_page_count(int level, unsigned long pages) +void update_page_count(int level, unsigned long pages) { -#ifdef CONFIG_PROC_FS unsigned long flags; + /* Protect against CPA */ spin_lock_irqsave(&pgd_lock, flags); direct_pages_count[level] += pages; spin_unlock_irqrestore(&pgd_lock, flags); +} + +static void split_page_count(int level) +{ + direct_pages_count[level]--; + direct_pages_count[level - 1] += PTRS_PER_PTE; +} + +int arch_report_meminfo(char *page) +{ + int n = sprintf(page, "DirectMap4k: %8lu\n" + "DirectMap2M: %8lu\n", + direct_pages_count[PG_LEVEL_4K], + direct_pages_count[PG_LEVEL_2M]); +#ifdef CONFIG_X86_64 + n += sprintf(page + n, "DirectMap1G: %8lu\n", + direct_pages_count[PG_LEVEL_1G]); #endif + return n; } +#else +static inline void split_page_count(int level) { } +#endif #ifdef CONFIG_X86_64 @@ -514,10 +536,8 @@ static int split_large_page(pte_t *kpte, unsigned long address) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); if (address >= (unsigned long)__va(0) && - address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) { - direct_pages_count[level]--; - direct_pages_count[level - 1] += PTRS_PER_PTE; - } + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) + split_page_count(level); /* * Install the new, split up pagetable. 
Important details here: @@ -1048,22 +1068,6 @@ bool kernel_page_present(struct page *page) #endif /* CONFIG_DEBUG_PAGEALLOC */ -#ifdef CONFIG_PROC_FS -int arch_report_meminfo(char *page) -{ - int n; - n = sprintf(page, "DirectMap4k: %8lu\n" - "DirectMap2M: %8lu\n", - direct_pages_count[PG_LEVEL_4K], - direct_pages_count[PG_LEVEL_2M]); -#ifdef CONFIG_X86_64 - n += sprintf(page + n, "DirectMap1G: %8lu\n", - direct_pages_count[PG_LEVEL_1G]); -#endif - return n; -} -#endif - /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. -- cgit v1.2.3 From 6e92a5a6151f4a467b8c1bde9123c0a9d1a63339 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:35 +0200 Subject: x86: add sparse annotations to ioremap arch/x86/mm/ioremap.c:308:11: error: incompatible types in comparison expression (different address spaces) Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 2b2bb3f9b68..01d76426971 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -318,8 +318,8 @@ void iounmap(volatile void __iomem *addr) * vm_area and by simply returning an address into the kernel mapping * of ISA space. So handle that here. */ - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) + if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) return; addr = (volatile void __iomem *) @@ -332,7 +332,7 @@ void iounmap(volatile void __iomem *addr) cpa takes care of the direct mappings. */ read_lock(&vmlist_lock); for (p = vmlist; p; p = p->next) { - if (p->addr == addr) + if (p->addr == (void __force *)addr) break; } read_unlock(&vmlist_lock); @@ -346,7 +346,7 @@ void iounmap(volatile void __iomem *addr) free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); /* Finally remove it */ - o = remove_vm_area((void *)addr); + o = remove_vm_area((void __force *)addr); BUG_ON(p != o || o == NULL); kfree(p); } @@ -365,7 +365,7 @@ void *xlate_dev_mem_ptr(unsigned long phys) if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - addr = (void *)ioremap(start, PAGE_SIZE); + addr = (void __force *)ioremap(start, PAGE_SIZE); if (addr) addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); -- cgit v1.2.3 From 684eb0163a98bc329193b4aa4535cdd705a5dd58 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86_64: use PAGE_OFFSET in dump_pagetables Use PAGE_OFFSET macro instead of using 0xffff810000000000UL directly. 
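For reference, the two spellings were equivalent at the time: the x86_64 headers of this era defined the macro along these lines (sketch; the constant later moved to 0xffff880000000000 in newer kernels):

	#define __PAGE_OFFSET	_AC(0xffff810000000000, UL)
	#define PAGE_OFFSET	((unsigned long)__PAGE_OFFSET)

so the patch improves readability and maintainability only; the values in the address_markers table are unchanged.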
Signed-off-by: Jiri Slaby Cc: Arjan van de Ven Cc: Andi Kleen Cc: hpa@zytor.com Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/dump_pagetables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 2c24bea92c6..0bb0caed897 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = { { 0, "User Space" }, #ifdef CONFIG_X86_64 { 0x8000000000000000UL, "Kernel Space" }, - { 0xffff810000000000UL, "Low Kernel Mapping" }, + { PAGE_OFFSET, "Low Kernel Mapping" }, { VMALLOC_START, "vmalloc() Area" }, { VMEMMAP_START, "Vmemmap" }, { __START_KERNEL_map, "High Kernel Mapping" }, -- cgit v1.2.3 From 8b2ef1d7285740953a2c4ef7faf15fdfc5e2f358 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Sun, 8 Jun 2008 15:46:30 +0200 Subject: x86: add flags parameter to reserve_bootmem_generic() This patch adds a 'flags' parameter to reserve_bootmem_generic() like it already has been added in reserve_bootmem() with commit 72a7fe3967dbf86cb34e24fbf1d957fe24d2f246. It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively change the behaviour. Since the change is x86-specific, I don't think it's necessary to add a new API for migration. There are only 4 users of that function. The change is necessary for the next patch, using reserve_bootmem_generic() for crashkernel reservation. Signed-off-by: Bernhard Walle Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 819dad973b1..bf7bf1de6c2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -799,12 +799,13 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags) { #ifdef CONFIG_NUMA int nid, next_nid; #endif unsigned long pfn = phys >> PAGE_SHIFT; + int ret; if (pfn >= end_pfn) { /* @@ -812,11 +813,11 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) * firmware tables: */ if (pfn < max_pfn_mapped) - return; + return -EFAULT; printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", phys, len); - return; + return -EFAULT; } /* Should check here against the e820 map to avoid double free */ @@ -824,9 +825,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) nid = phys_to_nid(phys); next_nid = phys_to_nid(phys + len - 1); if (nid == next_nid) - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + ret = reserve_bootmem(phys, len, flags); + + if (ret != 0) + return ret; + #else reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif @@ -835,6 +840,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) dma_reserve += len / PAGE_SIZE; set_dma_reserve(dma_reserve); } + + return 0; } int kern_addr_valid(unsigned long addr) -- cgit v1.2.3 From d2dbf343329dc777d77488743465f7be4245971d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 02:00:56 -0700 Subject: x86: clean up reserve_bootmem_generic() and port it to 32-bit 1. add reserve_bootmem_generic for 32bit 2. change len to unsigned long 3. 
make early_res_to_bootmem use it Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 6 ++++++ arch/x86/mm/init_64.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e7bb5e8167..abadb1da70d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -785,3 +785,9 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_init_pages("initrd memory", start, end); } #endif + +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ + return reserve_bootmem(phys, len, flags); +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bf7bf1de6c2..b8c2c1ef7ad 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -799,7 +799,8 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags) +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) { #ifdef CONFIG_NUMA int nid, next_nid; -- cgit v1.2.3 From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 19:08:52 -0700 Subject: x86: replace shrink_active_range() with remove_active_range() in case we have kva before ramdisk on a node, we still need to use those ranges. v2: reserve_early kva ram area, in case there are holes in highmem, to avoid those areas being treated as free high pages. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index accc7c6c57f..c3f119e99e0 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -230,8 +230,8 @@ static unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_end_target; - u64 node_end_final; + u64 node_kva_target; + u64 node_kva_final; /* * The acpi/srat node info can show hot-add memroy zones @@ -254,42 +254,45 @@ static unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_end_target = round_down(node_end_pfn[nid] - size, + node_kva_target = round_down(node_end_pfn[nid] - size, PTRS_PER_PTE); - node_end_target <<= PAGE_SHIFT; + node_kva_target <<= PAGE_SHIFT; do { - node_end_final = find_e820_area(node_end_target, + node_kva_final = find_e820_area(node_kva_target, ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); - if (node_end_final == -1ULL) + if (node_kva_final == -1ULL) panic("Can not get kva ram\n"); - printk("Reserving %ld pages of KVA for lmem_map of node %d\n", - size, nid); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Shrinking node %d from %ld pages to %lld pages\n", - nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT); + printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva_final>>PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system * with less memory later. 
* layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then max_low_pfn + * but we could have some hole in high memory, and it will only + * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide + * to use it as free. + * So reserve_early here, hope we don't run out of that array */ - if ((node_end_final>>PAGE_SHIFT) < max_low_pfn) - reserve_early(node_end_final, - node_end_final+(((u64)size)<>PAGE_SHIFT; - node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, node_end_pfn[nid]); + reserve_early(node_kva_final, + node_kva_final+(((u64)size)<>PAGE_SHIFT; + remove_active_range(nid, node_remap_start_pfn[nid], + node_remap_start_pfn[nid] + size); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); -- cgit v1.2.3 From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() in the big loop with add_one_highpage also remove page_is_reserved_early(), it is not needed anymore. v2: fix the build of other platforms Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 19 ++++++-------- arch/x86/mm/init_32.c | 62 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 23 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c3f119e99e0..7c4d0255f8d 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, #endif extern unsigned long find_max_low_pfn(void); -extern void add_one_highpage_init(struct page *, int, int); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro) { #ifdef CONFIG_HIGHMEM struct zone *zone; - struct page *page; + int nid; for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; if (!is_highmem(zone)) continue; @@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro) zone_start_pfn = zone->zone_start_pfn; zone_end_pfn = zone_start_pfn + zone->spanned_pages; + nid = zone_to_nid(zone); printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone_to_nid(zone), - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); - } + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn, bad_ppro); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index abadb1da70d..ba07a489230 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -287,10 +287,10 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init +add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) && - !page_is_reserved_early(pfn)) { + if (!(bad_ppro && page_kills_ppro(pfn))) { 
ClearPageReserved(page); init_page_count(page); __free_page(page); @@ -299,18 +299,58 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } +struct add_highpages_data { + unsigned long start_pfn; + unsigned long end_pfn; + int bad_ppro; +}; + +static void __init add_highpages_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + int node_pfn; + struct page *page; + unsigned long final_start_pfn, final_end_pfn; + struct add_highpages_data *data; + int bad_ppro; + + data = (struct add_highpages_data *)datax; + bad_ppro = data->bad_ppro; + + final_start_pfn = max(start_pfn, data->start_pfn); + final_end_pfn = min(end_pfn, data->end_pfn); + if (final_start_pfn >= final_end_pfn) + return; + + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page, node_pfn, bad_ppro); + } + +} + +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, + int bad_ppro) +{ + struct add_highpages_data data; + + data.start_pfn = start_pfn; + data.end_pfn = end_pfn; + data.bad_ppro = bad_ppro; + + work_with_active_regions(nid, add_highpages_work_fn, &data); +} + #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { - int pfn; + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, + bad_ppro); - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { - /* - * Holes under sparsemem might not have no mem_map[]: - */ - if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); - } totalram_pages += totalhigh_pages; } #endif /* !CONFIG_NUMA */ -- cgit v1.2.3 From cc9f7a0ccf000d4db5fbdc7b0ae48eefea102f69 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 16:11:08 -0700 Subject: x86: kill bad_ppro so don't punish all other cpus without that problem when init highmem Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 4 ++-- arch/x86/mm/init_32.c | 43 ++++++++++++------------------------------- 2 files changed, 14 insertions(+), 33 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 7c4d0255f8d..6216e43b6e9 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -427,7 +427,7 @@ void __init zone_sizes_init(void) return; } -void __init set_highmem_pages_init(int bad_ppro) +void __init set_highmem_pages_init(void) { #ifdef CONFIG_HIGHMEM struct zone *zone; @@ -447,7 +447,7 @@ void __init set_highmem_pages_init(int bad_ppro) zone->name, nid, zone_start_pfn, zone_end_pfn); add_highpages_with_active_regions(nid, zone_start_pfn, - zone_end_pfn, bad_ppro); + zone_end_pfn); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ba07a489230..fb5694d788b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -220,13 +220,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) } } -static inline int page_kills_ppro(unsigned long pagenr) -{ - if (pagenr >= 0x70000 && pagenr <= 0x7003F) - return 1; - return 0; -} - /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. 
@@ -287,22 +280,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __init -add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init add_one_highpage_init(struct page *page, int pfn) { - if (!(bad_ppro && page_kills_ppro(pfn))) { - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalhigh_pages++; - } else - SetPageReserved(page); + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + totalhigh_pages++; } struct add_highpages_data { unsigned long start_pfn; unsigned long end_pfn; - int bad_ppro; }; static void __init add_highpages_work_fn(unsigned long start_pfn, @@ -312,10 +300,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, struct page *page; unsigned long final_start_pfn, final_end_pfn; struct add_highpages_data *data; - int bad_ppro; data = (struct add_highpages_data *)datax; - bad_ppro = data->bad_ppro; final_start_pfn = max(start_pfn, data->start_pfn); final_end_pfn = min(end_pfn, data->end_pfn); @@ -327,29 +313,26 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); + add_one_highpage_init(page, node_pfn); } } void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn, - int bad_ppro) + unsigned long end_pfn) { struct add_highpages_data data; data.start_pfn = start_pfn; data.end_pfn = end_pfn; - data.bad_ppro = bad_ppro; work_with_active_regions(nid, add_highpages_work_fn, &data); } #ifndef CONFIG_NUMA -static void __init set_highmem_pages_init(int bad_ppro) +static void __init set_highmem_pages_init(void) { - add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, - bad_ppro); + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); totalram_pages += totalhigh_pages; } @@ -358,7 +341,7 @@ static void __init set_highmem_pages_init(int bad_ppro) #else # define kmap_init() do { } while (0) # define permanent_kmaps_init(pgd_base) do { } while (0) -# define set_highmem_pages_init(bad_ppro) do { } while (0) +# define set_highmem_pages_init() do { } while (0) #endif /* CONFIG_HIGHMEM */ pteval_t __PAGE_KERNEL = _PAGE_KERNEL; @@ -605,13 +588,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc; void __init mem_init(void) { int codesize, reservedpages, datasize, initsize; - int tmp, bad_ppro; + int tmp; #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif - bad_ppro = ppro_with_ram_bug(); - #ifdef CONFIG_HIGHMEM /* check that fixmap and pkmap do not overlap */ if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { @@ -634,7 +615,7 @@ void __init mem_init(void) if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; - set_highmem_pages_init(bad_ppro); + set_highmem_pages_init(); codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; -- cgit v1.2.3 From 064d25f12014ae1d97c2882f9ab874995321f2b2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 19:58:28 -0700 Subject: x86: merge setup_memory_map with e820 ... 
and kill e820_32/64.c and e820_32/64.h Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b8c2c1ef7ad..5266f3141d6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -47,6 +47,18 @@ #include #include +/* + * PFN of last memory page. + */ +unsigned long end_pfn; + +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long max_pfn_mapped; + static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -- cgit v1.2.3 From a4caa18efe468acb3522e30763de57a67b3e438b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 19 Jun 2008 12:15:01 -0700 Subject: x86: fix compiling when CONFIG_X86_MPPARSE is not set Signed-off-by: Yinghai Lu Cc: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/mm/k8topology_64.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 1f476e47784..f75aa2ae54a 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -56,18 +56,22 @@ static __init void early_get_boot_cpu_id(void) /* * Find possible boot-time SMP configuration: */ +#ifdef CONFIG_X86_MPPARSE early_find_smp_config(); +#endif #ifdef CONFIG_ACPI /* * Read APIC information from ACPI tables. */ early_acpi_boot_init(); #endif +#ifdef CONFIG_X86_MPPARSE /* * get boot-time SMP configuration: */ if (smp_found_config) early_get_smp_config(); +#endif early_init_lapic_mapping(); } -- cgit v1.2.3 From 23ca4bba3e20c6c3cb11c1bb0ab4770b724d39ac Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: x86: cleanup early per cpu variables/accesses v4 * Introduce a new PER_CPU macro called "EARLY_PER_CPU". This is used by some per_cpu variables that are initialized and accessed before there are per_cpu areas allocated. ["Early" in respect to per_cpu variables is "earlier than the per_cpu areas have been setup".] This patchset adds these new macros: DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) EXPORT_EARLY_PER_CPU_SYMBOL(_name) DECLARE_EARLY_PER_CPU(_type, _name) early_per_cpu_ptr(_name) early_per_cpu_map(_name, _idx) early_per_cpu(_name, _cpu) The DEFINE macro defines the per_cpu variable as well as the early map and pointer. It also initializes the per_cpu variable and map elements to "_initvalue". The early_* macros provide access to the initial map (usually setup during system init) and the early pointer. This pointer is initialized to point to the early map but is then NULL'ed when the actual per_cpu areas are setup. After that the per_cpu variable is the correct access to the variable. The early_per_cpu() macro is not very efficient but does show how to access the variable if you have a function that can be called both "early" and "late". It tests the early ptr to be NULL, and if not then it's still valid. Otherwise, the per_cpu variable is used instead: #define early_per_cpu(_name, _cpu) \ (early_per_cpu_ptr(_name) ? \ early_per_cpu_ptr(_name)[_cpu] : \ per_cpu(_name, _cpu)) A better method is to actually check the pointer manually. 
In the case below, numa_set_node can be called both "early" and "late": void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); if (cpu_to_node_map) cpu_to_node_map[cpu] = node; else per_cpu(x86_cpu_to_node_map, cpu) = node; } * Add a flag "arch_provides_topology_pointers" that indicates pointers to topology cpumask_t maps are available. Otherwise, use the function returning the cpumask_t value. This is useful if cpumask_t set size is very large to avoid copying data on to/off of the stack. * The coverage of CONFIG_DEBUG_PER_CPU_MAPS has been increased while the non-debug case has been optimized a bit. * Remove an unreferenced compiler warning in drivers/base/topology.c * Clean up #ifdef in setup.c For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/numa_64.c | 43 +++++++++++-------------------------------- arch/x86/mm/srat_64.c | 2 +- 2 files changed, 12 insertions(+), 33 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c5066d519e5..970f86775c4 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -31,16 +31,6 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; -#ifdef CONFIG_SMP -int x86_cpu_to_node_map_init[NR_CPUS] = { - [0 ... NR_CPUS-1] = NUMA_NO_NODE -}; -void *x86_cpu_to_node_map_early_ptr; -EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); -#endif -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); - s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; @@ -577,24 +567,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } -__cpuinit void numa_add_cpu(int cpu) -{ - set_bit(cpu, - (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; - - if(cpu_to_node_map) - cpu_to_node_map[cpu] = node; - else if(per_cpu_offset(cpu)) - per_cpu(x86_cpu_to_node_map, cpu) = node; - else - Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); -} - unsigned long __init numa_free_all_bootmem(void) { unsigned long pages = 0; @@ -641,6 +613,7 @@ static __init int numa_setup(char *opt) } early_param("numa", numa_setup); +#ifdef CONFIG_NUMA /* * Setup early cpu_to_node. * @@ -652,14 +625,19 @@ early_param("numa", numa_setup); * is already initialized in a round robin manner at numa_init_array, * prior to this call, and this initialization is good enough * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. 
*/ void __init init_cpu_to_node(void) { - int i; + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - for (i = 0; i < NR_CPUS; i++) { + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { int node; - u16 apicid = x86_cpu_to_apicid_init[i]; + u16 apicid = cpu_to_apicid[cpu]; if (apicid == BAD_APICID) continue; @@ -668,8 +646,9 @@ void __init init_cpu_to_node(void) continue; if (!node_online(node)) continue; - numa_set_node(i, node); + numa_set_node(cpu, node); } } +#endif diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 99649dccad2..012220e31c9 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -376,7 +376,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (node == NUMA_NO_NODE) continue; if (!node_isset(node, node_possible_map)) - numa_set_node(i, NUMA_NO_NODE); + numa_clear_node(i); } numa_init_array(); return 0; -- cgit v1.2.3 From 9f248bde9d47cc177011198c9a15fb339b9f3215 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: x86: remove the static 256k node_to_cpumask_map * Consolidate node_to_cpumask operations and remove the 256k byte node_to_cpumask_map. This is done by allocating the node_to_cpumask_map array after the number of possible nodes (nr_node_ids) is known. * Debug printouts when CONFIG_DEBUG_PER_CPU_MAPS is active have been increased. It now shows faults when calling node_to_cpumask() and node_to_cpumask_ptr(). For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/numa_64.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 970f86775c4..14c7ab417ec 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -35,9 +35,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_to_cpumask_map); - int numa_off __initdata; unsigned long __initdata nodemap_addr; unsigned long __initdata nodemap_size; @@ -560,9 +557,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) node_set(0, node_possible_map); for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); - /* cpumask_of_cpu() may not be available during early startup */ - memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); - cpu_set(0, node_to_cpumask_map[0]); e820_register_active_regions(0, start_pfn, end_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } -- cgit v1.2.3 From 864fc31ea59798905a37cd896a3e093915a3b366 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: numa_64.c make local variables static plat_node_bdata, cmdline, nodemap_addr, nodemap_size are local to numa_64.c. 
Make them static Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 14c7ab417ec..824344f1742 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -27,7 +27,7 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -bootmem_data_t plat_node_bdata[MAX_NUMNODES]; +static bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; @@ -36,8 +36,8 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { }; int numa_off __initdata; -unsigned long __initdata nodemap_addr; -unsigned long __initdata nodemap_size; +static unsigned long __initdata nodemap_addr; +static unsigned long __initdata nodemap_size; /* * Given a shift value, try to populate memnodemap[] @@ -296,7 +296,7 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -char *cmdline __initdata; +static char *cmdline __initdata; /* * Setups up nid to range from addr to addr + size. If the end -- cgit v1.2.3 From 886533a3e370a6d5c4e46819d1e14bd2f20dbb3a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: numa_64.c fix shadowed variable sparse mutters: arch/x86/mm/numa_64.c:195:27: warning: symbol 'end_pfn' shadows an earlier one Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 824344f1742..a1f3778b468 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -179,7 +179,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start, void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { - unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; + unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; unsigned long bootmap_start, nodedata_phys; void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); @@ -191,7 +191,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, start, end); start_pfn = start >> PAGE_SHIFT; - end_pfn = end >> PAGE_SHIFT; + last_pfn = end >> PAGE_SHIFT; node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, SMP_CACHE_BYTES); @@ -204,7 +204,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; + NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; /* * Find a place for the bootmem map @@ -213,7 +213,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, * early_node_mem will get that with find_e820_area instead * of alloc_bootmem, that could clash with reserved range */ - bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); nid = phys_to_nid(nodedata_phys); if (nid == nodeid) bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); @@ -235,7 +235,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, bootmap_size = init_bootmem_node(NODE_DATA(nodeid), bootmap_start >> PAGE_SHIFT, - start_pfn, end_pfn); + start_pfn, last_pfn); printk(KERN_INFO " bootmap [%016lx 
- %016lx] pages %lx\n", bootmap_start, bootmap_start + bootmap_size - 1, @@ -400,15 +400,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, } /* - * Sets up the system RAM area from start_pfn to end_pfn according to the + * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. */ static struct bootnode nodes[MAX_NUMNODES] __initdata; -static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn) { u64 size, addr = start_pfn << PAGE_SHIFT; - u64 max_addr = end_pfn << PAGE_SHIFT; + u64 max_addr = last_pfn << PAGE_SHIFT; int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; memset(&nodes, 0, sizeof(nodes)); @@ -514,7 +514,7 @@ out: } #endif /* CONFIG_NUMA_EMU */ -void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) +void __init numa_initmem_init(unsigned long start_pfn, unsigned long last_pfn) { int i; @@ -522,7 +522,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, end_pfn)) + if (cmdline && !numa_emulation(start_pfn, last_pfn)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -530,7 +530,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) #ifdef CONFIG_ACPI_NUMA if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT)) + last_pfn << PAGE_SHIFT)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -538,7 +538,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) #ifdef CONFIG_K8_NUMA if (!numa_off && !k8_scan_nodes(start_pfn< Date: Sun, 8 Jun 2008 15:46:30 +0200 Subject: x86: add flags parameter to reserve_bootmem_generic() This patch adds a 'flags' parameter to reserve_bootmem_generic() like it already has been added in reserve_bootmem() with commit 72a7fe3967dbf86cb34e24fbf1d957fe24d2f246. It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively change the behaviour. Since the change is x86-specific, I don't think it's necessary to add a new API for migration. There are only 4 users of that function. The change is necessary for the next patch, using reserve_bootmem_generic() for crashkernel reservation. 
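As a usage sketch of the new interface (the caller below is hypothetical; only the signature change itself comes from this patch), a migrated call site passes an explicit bootmem flag and can now react to failure:

static int __init reserve_fw_region(unsigned long base, unsigned long size)
{
        int ret;

        /* BOOTMEM_DEFAULT keeps the old behaviour of all existing callers */
        ret = reserve_bootmem_generic(base, size, BOOTMEM_DEFAULT);
        if (ret)
                printk(KERN_WARNING
                       "cannot reserve %lu bytes at 0x%lx (error %d)\n",
                       size, base, ret);
        return ret;
}

The crashkernel patch mentioned above can then pass a stricter flag (e.g. BOOTMEM_EXCLUSIVE) through the same parameter instead of needing a separate entry point.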
Signed-off-by: Bernhard Walle Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 819dad973b1..bf7bf1de6c2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -799,12 +799,13 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags) { #ifdef CONFIG_NUMA int nid, next_nid; #endif unsigned long pfn = phys >> PAGE_SHIFT; + int ret; if (pfn >= end_pfn) { /* @@ -812,11 +813,11 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) * firmware tables: */ if (pfn < max_pfn_mapped) - return; + return -EFAULT; printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", phys, len); - return; + return -EFAULT; } /* Should check here against the e820 map to avoid double free */ @@ -824,9 +825,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) nid = phys_to_nid(phys); next_nid = phys_to_nid(phys + len - 1); if (nid == next_nid) - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + ret = reserve_bootmem(phys, len, flags); + + if (ret != 0) + return ret; + #else reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif @@ -835,6 +840,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) dma_reserve += len / PAGE_SIZE; set_dma_reserve(dma_reserve); } + + return 0; } int kern_addr_valid(unsigned long addr) -- cgit v1.2.3 From a7bf0bd5e6af7fe69342dabf2a3b721f0163469a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 May 2008 15:02:14 +0100 Subject: build: add __page_aligned_data and __page_aligned_bss Making a variable page-aligned by using __attribute__((section(".data.page_aligned"))) is fragile because if sizeof(variable) is not also a multiple of page size, it leaves variables in the remainder of the section unaligned. This patch introduces two new qualifiers, __page_aligned_data and __page_aligned_bss to set the section *and* the alignment of variables. This makes page-aligned variables more robust because the linker will make sure they're aligned properly. Unfortunately it requires *all* page-aligned data to use these macros... Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 416ea415f5c..0561fde56a5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -394,8 +394,7 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] - __section(.bss.page_aligned); +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { -- cgit v1.2.3 From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 20:10:55 -0700 Subject: RFC x86: try to remove arch_get_ram_range want to remove arch_get_ram_range, and use early_node_map instead. 
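The functional part of the hunk below is the void -> int conversion of the per-range worker: work_with_active_regions() walks a node's early_node_map entries and, by this convention, treats a non-zero return from the callback as a request to stop the walk early (an assumption consistent with the change; the helper below is illustrative, not part of the patch):

/* Example callback in the new int-returning shape. */
static int __init count_active_pages_fn(unsigned long start_pfn,
                                        unsigned long end_pfn, void *data)
{
        unsigned long *total = data;

        *total += end_pfn - start_pfn;
        return 0;       /* zero: keep iterating over the remaining ranges */
}

A caller would then invoke it as work_with_active_regions(nid, count_active_pages_fn, &total).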
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 65d55056b6e..a0484adbf59 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -298,7 +298,7 @@ struct add_highpages_data { unsigned long end_pfn; }; -static void __init add_highpages_work_fn(unsigned long start_pfn, +static int __init add_highpages_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax) { int node_pfn; @@ -311,7 +311,7 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, final_start_pfn = max(start_pfn, data->start_pfn); final_end_pfn = min(end_pfn, data->end_pfn); if (final_start_pfn >= final_end_pfn) - return; + return 0; for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; node_pfn++) { @@ -321,6 +321,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, add_one_highpage_init(page, node_pfn); } + return 0; + } void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, -- cgit v1.2.3 From 1f75d7e32ed47b2ab8570771a2ce8c707a7225a2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 22 Jun 2008 02:44:49 -0700 Subject: x86: introduce initmem_init for 64 bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 16 ++++++++++++++++ arch/x86/mm/numa_64.c | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 97c2bc741e9..99a091ee5a6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -611,6 +611,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon } #ifndef CONFIG_NUMA +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long bootmap_size, bootmap; + + bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, end_pfn); + e820_register_active_regions(0, start_pfn, end_pfn); + free_bootmem_with_active_regions(0, end_pfn); + early_res_to_bootmem(0, end_pfn< Date: Sun, 22 Jun 2008 02:45:39 -0700 Subject: x86: introduce initmem_init for 32 bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 4 +-- arch/x86/mm/init_32.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index a2f73ba42b8..3e75be46c4f 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -309,8 +309,8 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -extern void setup_bootmem_allocator(void); -unsigned long __init setup_memory(void) +unsigned long __init initmem_init(unsigned long start_pfn, + unsigned long end_pfn) { int nid; unsigned long system_start_pfn, system_max_low_pfn; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a0484adbf59..9bc8607d798 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -540,6 +540,96 @@ static void __init set_nx(void) } #endif +#ifndef CONFIG_NEED_MULTIPLE_NODES +extern unsigned long find_max_low_pfn(void); +unsigned long __init initmem_init(unsigned long start_pfn, + unsigned long end_pfn) +{ + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + min_low_pfn = PFN_UP(init_pg_tables_end); + + max_low_pfn = find_max_low_pfn(); + +#ifdef CONFIG_HIGHMEM + highstart_pfn = 
highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; + memory_present(0, 0, highend_pfn); + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + memory_present(0, 0, max_low_pfn); + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif +#ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); + + return max_low_pfn; +} + +void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + remove_all_active_ranges(); +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; + e820_register_active_regions(0, 0, highend_pfn); +#else + e820_register_active_regions(0, 0, max_low_pfn); +#endif + + free_area_init_nodes(max_zone_pfns); +} +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ + +extern void reserve_initrd(void); + +void __init setup_bootmem_allocator(void) +{ + int i; + unsigned long bootmap_size, bootmap; + /* + * Initialize the boot-time allocator (with low memory only): + */ + bootmap_size = bootmem_bootmap_pages(max_low_pfn)<> PAGE_SHIFT, max_low_pfn); + printk(KERN_INFO " mapped low ram: 0 - %08lx\n", + max_pfn_mapped< Date: Sun, 22 Jun 2008 02:46:58 -0700 Subject: x86: introduce reserve_initrd Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9bc8607d798..98080782ee4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -597,8 +597,6 @@ void __init zone_sizes_init(void) } #endif /* !CONFIG_NEED_MULTIPLE_NODES */ -extern void reserve_initrd(void); - void __init setup_bootmem_allocator(void) { int i; @@ -613,9 +611,9 @@ void __init setup_bootmem_allocator(void) if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); -#ifdef CONFIG_BLK_DEV_INITRD + reserve_initrd(); -#endif + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped< Date: Mon, 23 Jun 2008 21:00:45 +0200 Subject: x86: move find_max_low_pfn to init_32.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 98080782ee4..d1017336f1b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -540,8 +540,95 @@ static void __init set_nx(void) } #endif +/* user-defined highmem size */ +static unsigned int highmem_pages = -1; + +/* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. 
+ */ +static int __init parse_highmem(char *arg) +{ + if (!arg) + return -EINVAL; + + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; + return 0; +} +early_param("highmem", parse_highmem); + +/* + * Determine low and high memory ranges: + */ +unsigned long __init find_max_low_pfn(void) +{ + unsigned long max_low_pfn; + + max_low_pfn = max_pfn; + if (max_low_pfn > MAXMEM_PFN) { + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk(KERN_WARNING "only %luMB highmem pages " + "available, ignoring highmem size of %uMB.\n", + pages_to_mb(max_pfn - MAXMEM_PFN), + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn = MAXMEM_PFN; +#ifndef CONFIG_HIGHMEM + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", + MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING + "Use a HIGHMEM64G enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; +#else /* !CONFIG_HIGHMEM */ +#ifndef CONFIG_HIGHMEM64G + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING "Warning only 4GB will be used." + "Use a HIGHMEM64G enabled kernel.\n"); + } +#endif /* !CONFIG_HIGHMEM64G */ +#endif /* !CONFIG_HIGHMEM */ + } else { + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR "highmem size specified (%uMB) is " + "bigger than pages available (%luMB)!.\n", + pages_to_mb(highmem_pages), + pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn - highmem_pages < + 64*1024*1024/PAGE_SIZE){ + printk(KERN_ERR "highmem size %uMB results in " + "smaller than 64MB lowmem, ignoring it.\n" + , pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem" + " kernel!\n"); +#endif + } + return max_low_pfn; +} + #ifndef CONFIG_NEED_MULTIPLE_NODES -extern unsigned long find_max_low_pfn(void); unsigned long __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { -- cgit v1.2.3 From bef1568d9714f1162086c32583ba7984a7ca8e3e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 22 Jun 2008 17:40:10 -0700 Subject: x86: move reservetop and vmalloc parsing to pgtable_32.c also change reserve_top_address to __init attibute Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable_32.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0662f345212..828907d001e 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -152,7 +152,7 @@ EXPORT_SYMBOL(__FIXADDR_TOP); * Can be used to relocate the fixmap area and poke a hole in the top * of kernel address space to make room for a hypervisor. */ -void reserve_top_address(unsigned long reserve) +void __init reserve_top_address(unsigned long reserve) { BUG_ON(fixmaps_set > 0); printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", @@ -160,3 +160,36 @@ void reserve_top_address(unsigned long reserve) __FIXADDR_TOP = -reserve - PAGE_SIZE; __VMALLOC_RESERVE += reserve; } + +/* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. 
This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + __VMALLOC_RESERVE = memparse(arg, &arg); + return 0; +} +early_param("vmalloc", parse_vmalloc); + +/* + * reservetop=size reserves a hole at the top of the kernel address space which + * a hypervisor can load into later. Needed for dynamically loaded hypervisors, + * so relocating the fixmap can be done before paging initialization. + */ +static int __init parse_reservetop(char *arg) +{ + unsigned long address; + + if (!arg) + return -EINVAL; + + address = memparse(arg, &arg); + reserve_top_address(address); + return 0; +} +early_param("reservetop", parse_reservetop); -- cgit v1.2.3 From 2ec65f8b89ea003c27ff7723525a2ee335a2b393 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 23 Jun 2008 03:05:30 -0700 Subject: x86: clean up using max_low_pfn on 32-bit so that max_low_pfn is not changed after it is set. so we can move that early and out of initmem_init. could call find_low_pfn_range just after max_pfn is set. also could move reserve_initrd out of setup_bootmem_allocator so 32bit is more like 64bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 22 +++++++--------------- arch/x86/mm/init_32.c | 25 +++++++++---------------- 2 files changed, 16 insertions(+), 31 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 3e75be46c4f..1dfff700264 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -309,11 +309,10 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -unsigned long __init initmem_init(unsigned long start_pfn, +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int nid; - unsigned long system_start_pfn, system_max_low_pfn; long kva_target_pfn; /* @@ -324,17 +323,11 @@ unsigned long __init initmem_init(unsigned long start_pfn, * and ZONE_HIGHMEM. 
*/ - /* call find_max_low_pfn at first, it could update max_pfn */ - system_max_low_pfn = max_low_pfn = find_max_low_pfn(); - remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); - /* partially used pages are not usable - thus round upwards */ - system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); do { kva_start_pfn = find_e820_area(kva_target_pfn< system_max_low_pfn) - highstart_pfn = system_max_low_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - num_physpages = system_max_low_pfn; - high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(system_max_low_pfn)); - printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", + pages_to_mb(max_low_pfn)); + printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", min_low_pfn, max_low_pfn, highstart_pfn); printk("Low memory ends at vaddr %08lx\n", @@ -387,7 +380,6 @@ unsigned long __init initmem_init(unsigned long start_pfn, memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); NODE_DATA(0)->bdata = &node0_bdata; setup_bootmem_allocator(); - return max_low_pfn; } void __init zone_sizes_init(void) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d1017336f1b..27b82931294 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -561,9 +561,15 @@ early_param("highmem", parse_highmem); /* * Determine low and high memory ranges: */ -unsigned long __init find_max_low_pfn(void) +void __init find_low_pfn_range(void) { - unsigned long max_low_pfn; + /* it could update max_pfn */ + + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + min_low_pfn = PFN_UP(init_pg_tables_end); max_low_pfn = max_pfn; if (max_low_pfn > MAXMEM_PFN) { @@ -625,21 +631,12 @@ unsigned long __init find_max_low_pfn(void) " kernel!\n"); #endif } - return max_low_pfn; } #ifndef CONFIG_NEED_MULTIPLE_NODES -unsigned long __init initmem_init(unsigned long start_pfn, +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - min_low_pfn = PFN_UP(init_pg_tables_end); - - max_low_pfn = find_max_low_pfn(); - #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) @@ -661,8 +658,6 @@ unsigned long __init initmem_init(unsigned long start_pfn, pages_to_mb(max_low_pfn)); setup_bootmem_allocator(); - - return max_low_pfn; } void __init zone_sizes_init(void) @@ -699,8 +694,6 @@ void __init setup_bootmem_allocator(void) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); - reserve_initrd(); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped< Date: Mon, 23 Jun 2008 03:06:14 -0700 Subject: x86: clean up min_low_pfn for 32bit we already had early_res support, so don't need to track min_low_pfn. keep it to 0 always. also use init_bootmem_node instead of init_bootmem, so don't touch min_low_pfn. 
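Sketch of why the switch is safe, based on the bootmem interfaces as they stood at the time (the side-effect summary is an informed reading of the allocator, not quoted from the patch):

/*
 * init_bootmem(start, pages)
 *      operates on NODE_DATA(0) and, as a side effect, also writes the
 *      globals min_low_pfn and max_low_pfn.
 *
 * init_bootmem_node(pgdat, freepfn, startpfn, endpfn)
 *      does the same per-node setup but leaves the globals alone, so
 *      min_low_pfn can stay 0 while early_res covers the low ranges.
 */
bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
                                 min_low_pfn, max_low_pfn);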
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 10 ++++------ arch/x86/mm/init_64.c | 4 +++- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 27b82931294..9bb35cf8bc4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -565,11 +565,7 @@ void __init find_low_pfn_range(void) { /* it could update max_pfn */ - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - min_low_pfn = PFN_UP(init_pg_tables_end); + /* max_low_pfn is 0, we already have early_res support */ max_low_pfn = max_pfn; if (max_low_pfn > MAXMEM_PFN) { @@ -694,7 +690,9 @@ void __init setup_bootmem_allocator(void) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); + /* don't touch min_low_pfn */ + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, + min_low_pfn, max_low_pfn); printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<> PAGE_SHIFT, end_pfn); + /* don't touch min_low_pfn */ + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, + 0, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); early_res_to_bootmem(0, end_pfn< Date: Mon, 23 Jun 2008 14:02:36 -0700 Subject: x86: fix compile warning in init_64.c len is long and ret is only for NUMA Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 955dbc8abf6..54a98407be3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -830,9 +830,9 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, { #ifdef CONFIG_NUMA int nid, next_nid; + int ret; #endif unsigned long pfn = phys >> PAGE_SHIFT; - int ret; if (pfn >= end_pfn) { /* @@ -842,7 +842,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, if (pfn < max_pfn_mapped) return -EFAULT; - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n", phys, len); return -EFAULT; } -- cgit v1.2.3 From c09434571d4b1d8abf530ba4ce28cb868b45f2e5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 23 Jun 2008 16:41:30 -0700 Subject: x86: numa32 pfn print out using hex instead Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 1dfff700264..f5ae31935ca 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -76,13 +76,13 @@ void memory_present(int nid, unsigned long start, unsigned long end) { unsigned long pfn; - printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", + printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", nid, start, end); printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); printk(KERN_DEBUG " "); for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { physnode_map[pfn / PAGES_PER_ELEMENT] = nid; - printk(KERN_CONT "%ld ", pfn); + printk(KERN_CONT "%lx ", pfn); } printk(KERN_CONT "\n"); } @@ -117,7 +117,7 @@ static unsigned long kva_pages; */ int __init 
get_memcfg_numa_flat(void) { - printk("NUMA - single node, flat memory mode\n"); + printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; @@ -233,7 +233,7 @@ static unsigned long calculate_numa_remap_pages(void) * The acpi/srat node info can show hot-add memroy zones * where memory could be added but not currently present. */ - printk("node %d pfn: [%lx - %lx]\n", + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", nid, node_start_pfn[nid], node_end_pfn[nid]); if (node_start_pfn[nid] > max_pfn) continue; @@ -268,7 +268,8 @@ static unsigned long calculate_numa_remap_pages(void) node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" + " node %d at %llx\n", size, nid, node_kva_final>>PAGE_SHIFT); /* @@ -290,7 +291,7 @@ static unsigned long calculate_numa_remap_pages(void) remove_active_range(nid, node_remap_start_pfn[nid], node_remap_start_pfn[nid] + size); } - printk("Reserving total of %ld pages for numa KVA remap\n", + printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", reserve_pages); return reserve_pages; } @@ -304,7 +305,7 @@ static void init_remap_allocator(int nid) node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + ALIGN(sizeof(pg_data_t), PAGE_SIZE); - printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, + printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, (ulong) node_remap_start_vaddr[nid], (ulong) node_remap_end_vaddr[nid]); } @@ -340,9 +341,9 @@ void __init initmem_init(unsigned long start_pfn, if (kva_start_pfn == -1UL) panic("Can not get kva space\n"); - printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", + printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", kva_start_pfn, max_low_pfn); - printk("max_pfn = %ld\n", max_pfn); + printk(KERN_INFO "max_pfn = %lx\n", max_pfn); /* avoid clash with initrd */ reserve_early(kva_start_pfn<spanned_pages; nid = zone_to_nid(zone); - printk("Initializing %s for node %d (%08lx:%08lx)\n", + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", zone->name, nid, zone_start_pfn, zone_end_pfn); add_highpages_with_active_regions(nid, zone_start_pfn, -- cgit v1.2.3 From 11cd0bc140b5d66566c9eb49c1058737888cd75c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 23 Jun 2008 19:51:10 -0700 Subject: x86: move some func calling from setup_arch to paging_init those function depend on paging setup pgtable, so they could access the ram in bootmem region but just get mapped. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9bb35cf8bc4..20ca29591ab 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -705,6 +705,23 @@ void __init setup_bootmem_allocator(void) } +/* + * The node 0 pgdat is initialized before all of these because + * it's needed for bootmem. node>0 pgdats have their virtual + * space allocated before the pagetables are in place to access + * them, so they can't be cleared then. + * + * This should all compile down to nothing when NUMA is off. 
+ */ +static void __init remapped_pgdat_init(void) +{ + int nid; + + for_each_online_node(nid) { + if (nid != 0) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + } +} /* * paging_init() sets up the page tables - note that the first 8MB are @@ -727,6 +744,18 @@ void __init paging_init(void) __flush_tlb_all(); kmap_init(); + + /* + * NOTE: at this point the bootmem allocator is fully available. + */ + + post_reserve_initrd(); + + remapped_pgdat_init(); + sparse_init(); + zone_sizes_init(); + + paravirt_post_allocator_init(); } /* -- cgit v1.2.3 From bb23e403e5162765dabe3dc78646724753d6359b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Jun 2008 00:19:02 -0400 Subject: x86, 64-bit: use p??_populate() to attach pages to pagetable Use the _populate() functions to attach new pages to a pagetable, to make sure the right paravirt_ops calls get called. Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 54a98407be3..bfea7605eaa 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -167,7 +167,7 @@ set_pte_vaddr(unsigned long vaddr, pte_t new_pte) pud = pud_offset(pgd, vaddr); if (pud_none(*pud)) { pmd = (pmd_t *) spp_getpage(); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); + pud_populate(&init_mm, pud, pmd); if (pmd != pmd_offset(pud, 0)) { printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0)); @@ -177,7 +177,7 @@ set_pte_vaddr(unsigned long vaddr, pte_t new_pte) pmd = pmd_offset(pud, vaddr); if (pmd_none(*pmd)) { pte = (pte_t *) spp_getpage(); - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); + pmd_populate_kernel(&init_mm, pmd, pte); if (pte != pte_offset_kernel(pmd, 0)) { printk(KERN_ERR "PAGETABLE BUG #02!\n"); return; @@ -389,7 +389,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) pmd = alloc_low_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); - set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); + pud_populate(&init_mm, pud, __va(pmd_phys)); last_map_addr = phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); @@ -592,7 +592,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon next = end; last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + pgd_populate(&init_mm, pgd_offset_k(start), + __va(pud_phys)); unmap_low_page(pud); } -- cgit v1.2.3 From 4583ed514ea9ac844a6eb02d33120beaedf6837f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Jun 2008 00:19:03 -0400 Subject: x86, 64-bit: unify early_ioremap The 32-bit early_ioremap will work equally well for 64-bit, so just use it. 
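For context, the usual pattern served by the now-shared implementation looks like this (illustrative only; the struct, the address and the probe function are made up for the example):

struct fw_header {
        u32 signature;
        u32 length;
};

static void __init probe_fw_table(unsigned long phys)
{
        struct fw_header *hdr;

        /* temporary fixmap-backed mapping, usable before paging is final */
        hdr = early_ioremap(phys, sizeof(*hdr));
        if (!hdr)
                return;

        printk(KERN_INFO "fw table at 0x%lx, length %u\n", phys, hdr->length);

        /* tear the mapping down again to avoid stale virtual aliases */
        early_iounmap(hdr, sizeof(*hdr));
}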
Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 52 --------------------------------------------------- arch/x86/mm/ioremap.c | 5 +---- 2 files changed, 1 insertion(+), 56 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bfea7605eaa..9b6049133a8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -257,58 +257,6 @@ static __meminit void unmap_low_page(void *adr) early_iounmap(adr, PAGE_SIZE); } -/* Must run before zap_low_mappings */ -__meminit void *early_ioremap(unsigned long addr, unsigned long size) -{ - pmd_t *pmd, *last_pmd; - unsigned long vaddr; - int i, pmds; - - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; - vaddr = __START_KERNEL_map; - pmd = level2_kernel_pgt; - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; - - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { - for (i = 0; i < pmds; i++) { - if (pmd_present(pmd[i])) - goto continue_outer_loop; - } - vaddr += addr & ~PMD_MASK; - addr &= PMD_MASK; - - for (i = 0; i < pmds; i++, addr += PMD_SIZE) - set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - __flush_tlb_all(); - - return (void *)vaddr; -continue_outer_loop: - ; - } - printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); - - return NULL; -} - -/* - * To avoid virtual aliases later: - */ -__meminit void early_iounmap(void *addr, unsigned long size) -{ - unsigned long vaddr; - pmd_t *pmd; - int i, pmds; - - vaddr = (unsigned long)addr; - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; - pmd = level2_kernel_pgt + pmd_index(vaddr); - - for (i = 0; i < pmds; i++) - pmd_clear(pmd + i); - - __flush_tlb_all(); -} - static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0561fde56a5..092b3d72498 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -381,8 +381,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) return; } -#ifdef CONFIG_X86_32 - int __initdata early_ioremap_debug; static int __init early_ioremap_debug_setup(char *str) @@ -483,6 +481,7 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, return; } pte = early_ioremap_pte(addr); + if (pgprot_val(flags)) set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else @@ -624,5 +623,3 @@ void __this_fixmap_does_not_exist(void) { WARN_ON(1); } - -#endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From 4e29684c40f2a332ba4d05f6482d5807725d5624 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Jun 2008 12:18:14 -0700 Subject: x86: introduce init_memory_mapping for 32bit #1 ... so can we use mem below max_low_pfn earlier. this allows us to move several functions more early instead of waiting to after paging_init. That includes moving relocate_initrd() earlier in the bootup, and kva related early setup done in initmem_init. 
(in followup patches) Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 141 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 112 insertions(+), 29 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 20ca29591ab..619058e6bff 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -57,6 +57,27 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); + +static unsigned long __initdata table_start; +static unsigned long __meminitdata table_end; +static unsigned long __meminitdata table_top; + +static int __initdata after_init_bootmem; + +static __init void *alloc_low_page(unsigned long *phys) +{ + unsigned long pfn = table_end++; + void *adr; + + if (pfn >= table_top) + panic("alloc_low_page: ran out of memory"); + + adr = __va(pfn * PAGE_SIZE); + memset(adr, 0, PAGE_SIZE); + *phys = pfn * PAGE_SIZE; + return adr; +} + /* * Creates a middle page table and puts a pointer to it in the * given global directory entry. This only returns the gd entry @@ -68,9 +89,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_t *pmd_table; #ifdef CONFIG_X86_PAE + unsigned long phys; if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - + if (after_init_bootmem) + pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + else + pmd_table = (pmd_t *)alloc_low_page(&phys); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -92,12 +116,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = NULL; + if (after_init_bootmem) { #ifdef CONFIG_DEBUG_PAGEALLOC - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif - if (!page_table) { - page_table = + if (!page_table) + page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + } else { + unsigned long phys; + page_table = (pte_t *)alloc_low_page(&phys); } paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); @@ -155,7 +183,9 @@ static inline int is_kernel_text(unsigned long addr) * of max_low_pfn pages, by creating page tables starting from address * PAGE_OFFSET: */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base) +static void __init kernel_physical_mapping_init(pgd_t *pgd_base, + unsigned long start, + unsigned long end) { int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; @@ -163,18 +193,19 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) pmd_t *pmd; pte_t *pte; unsigned pages_2m = 0, pages_4k = 0; + unsigned limit_pfn = end >> PAGE_SHIFT; pgd_idx = pgd_index(PAGE_OFFSET); pgd = pgd_base + pgd_idx; - pfn = 0; + pfn = start >> PAGE_SHIFT; for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { pmd = one_md_table_init(pgd); - if (pfn >= max_low_pfn) + if (pfn >= limit_pfn) continue; for (pmd_idx = 0; - pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; + pmd_idx < PTRS_PER_PMD && pfn < limit_pfn; pmd++, pmd_idx++) { unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; @@ -418,20 +449,7 @@ static void __init pagetable_init(void) paravirt_pagetable_setup_start(pgd_base); - /* Enable PSE if available */ - if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); - - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __PAGE_KERNEL |= _PAGE_GLOBAL; - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; - } - - 
kernel_physical_mapping_init(pgd_base); remap_numa_kva(); - /* * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): @@ -703,6 +721,7 @@ void __init setup_bootmem_allocator(void) free_bootmem_with_active_regions(i, max_low_pfn); early_res_to_bootmem(0, max_low_pfn<> PUD_SHIFT; + tables = PAGE_ALIGN(puds * sizeof(pud_t)); + + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); + + /* + * RED-PEN putting page tables only on node 0 could + * cause a hotspot and fill up ZONE_DMA. The page tables + * need roughly 0.5KB per GB. + */ + start = 0x7000; + table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; + table_end = table_start; + table_top = table_start + (tables>>PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, table_start << PAGE_SHIFT, + (table_start << PAGE_SHIFT) + tables); +} + +unsigned long __init_refok init_memory_mapping(unsigned long start, + unsigned long end) +{ + pgd_t *pgd_base = swapper_pg_dir; + + /* + * Find space for the kernel direct mapping tables. + */ + if (!after_init_bootmem) + find_early_table_space(end); + +#ifdef CONFIG_X86_PAE + set_nx(); + if (nx_enabled) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __PAGE_KERNEL |= _PAGE_GLOBAL; + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; + } + + kernel_physical_mapping_init(pgd_base, start, end); + + load_cr3(swapper_pg_dir); + + __flush_tlb_all(); + + if (!after_init_bootmem) + reserve_early(table_start << PAGE_SHIFT, + table_end << PAGE_SHIFT, "PGTABLE"); + + return end >> PAGE_SHIFT; +} + /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. @@ -732,15 +822,8 @@ static void __init remapped_pgdat_init(void) */ void __init paging_init(void) { -#ifdef CONFIG_X86_PAE - set_nx(); - if (nx_enabled) - printk(KERN_INFO "NX (Execute Disable) protection: active\n"); -#endif pagetable_init(); - load_cr3(swapper_pg_dir); - __flush_tlb_all(); kmap_init(); -- cgit v1.2.3 From cfb0e53b05402f1ce65053677409a819c1798d34 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Jun 2008 12:18:58 -0700 Subject: x86: introduce init_memory_mapping for 32bit #2 moving relocate_initrd early Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 619058e6bff..ecccb055b08 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -831,9 +831,6 @@ void __init paging_init(void) /* * NOTE: at this point the bootmem allocator is fully available. 
*/ - - post_reserve_initrd(); - remapped_pgdat_init(); sparse_init(); zone_sizes_init(); -- cgit v1.2.3 From 3a58a2a6c879b2e47daafd6e641661c50ac9da5a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Jun 2008 12:19:41 -0700 Subject: x86: introduce init_memory_mapping for 32bit #3 move kva related early backto initmem_init for numa32 Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 8 ++++++-- arch/x86/mm/init_32.c | 20 -------------------- 2 files changed, 6 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index f5ae31935ca..42484d446d0 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -200,7 +200,7 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -void __init remap_numa_kva(void) +static void __init remap_numa_kva(void) { void *vaddr; unsigned long pfn; @@ -373,12 +373,16 @@ void __init initmem_init(unsigned long start_pfn, allocate_pgdat(nid); } + remap_numa_kva(); + printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); for_each_online_node(nid) propagate_e820_map_node(nid); - memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); + for_each_online_node(nid) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + NODE_DATA(0)->bdata = &node0_bdata; setup_bootmem_allocator(); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ecccb055b08..c059c460e32 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -449,7 +449,6 @@ static void __init pagetable_init(void) paravirt_pagetable_setup_start(pgd_base); - remap_numa_kva(); /* * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): @@ -724,24 +723,6 @@ void __init setup_bootmem_allocator(void) after_init_bootmem = 1; } -/* - * The node 0 pgdat is initialized before all of these because - * it's needed for bootmem. node>0 pgdats have their virtual - * space allocated before the pagetables are in place to access - * them, so they can't be cleared then. - * - * This should all compile down to nothing when NUMA is off. - */ -static void __init remapped_pgdat_init(void) -{ - int nid; - - for_each_online_node(nid) { - if (nid != 0) - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - } -} - static void __init find_early_table_space(unsigned long end) { unsigned long puds, pmds, tables, start; @@ -831,7 +812,6 @@ void __init paging_init(void) /* * NOTE: at this point the bootmem allocator is fully available. 
*/ - remapped_pgdat_init(); sparse_init(); zone_sizes_init(); -- cgit v1.2.3 From 1a0db38e5fa50eeb885194b1f4e494d4de55b918 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Jun 2008 14:56:20 -0700 Subject: x86: get max_pfn_mapped in init_memory_mapping so don't shift that in the loop Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9b6049133a8..359e3afaaa6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -346,7 +346,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) __flush_tlb_all(); update_page_count(PG_LEVEL_1G, pages); - return last_map_addr >> PAGE_SHIFT; + return last_map_addr; } static void __init find_early_table_space(unsigned long end) @@ -556,7 +556,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon if (!after_bootmem) early_memtest(start_phys, end_phys); - return last_map_addr; + return last_map_addr >> PAGE_SHIFT; } #ifndef CONFIG_NUMA -- cgit v1.2.3 From d86623a0d55a14e7295e8ca1bd258e0c7f8dcb31 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Jun 2008 14:57:29 -0700 Subject: x86: add table_top check for alloc_low_page in 64 bit that range is from find_e820_area, so don't try to use end_pfn to see if out of boundary...use table_top instead to avoid possible strange result while cross the boundary... also change early_printk to printk, because init_memory_mapping is after early param parsing, and console=uart8250 already working at that time. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 359e3afaaa6..6eced2f1073 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -227,6 +227,7 @@ void __init cleanup_highmap(void) static unsigned long __initdata table_start; static unsigned long __meminitdata table_end; +static unsigned long __meminitdata table_top; static __meminit void *alloc_low_page(unsigned long *phys) { @@ -240,7 +241,7 @@ static __meminit void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= end_pfn) + if (pfn >= table_top) panic("alloc_low_page: ran out of memory"); adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); @@ -372,10 +373,10 @@ static void __init find_early_table_space(unsigned long end) table_start >>= PAGE_SHIFT; table_end = table_start; + table_top = table_start + (tables >> PAGE_SHIFT); - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, - (table_start << PAGE_SHIFT) + tables); + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); } static void __init init_gbpages(void) -- cgit v1.2.3 From c987d12f8455b19b3b057d63bac3de161bd809fc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Jun 2008 22:14:09 -0700 Subject: x86: remove end_pfn in 64bit and use max_pfn directly. 
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 17 ++++++----------- arch/x86/mm/k8topology_64.c | 4 ++-- arch/x86/mm/numa_64.c | 4 ++-- arch/x86/mm/srat_64.c | 2 +- 4 files changed, 11 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6eced2f1073..d5d4b04d48a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -48,11 +48,6 @@ #include #include -/* - * PFN of last memory page. - */ -unsigned long end_pfn; - /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. * The direct mapping extends to max_pfn_mapped, so that we can directly access @@ -586,9 +581,9 @@ void __init paging_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = end_pfn; + max_zone_pfns[ZONE_NORMAL] = max_pfn; - memory_present(0, 0, end_pfn); + memory_present(0, 0, max_pfn); sparse_init(); free_area_init_nodes(max_zone_pfns); } @@ -670,8 +665,8 @@ void __init mem_init(void) #else totalram_pages = free_all_bootmem(); #endif - reservedpages = end_pfn - totalram_pages - - absent_pages_in_range(0, end_pfn); + reservedpages = max_pfn - totalram_pages - + absent_pages_in_range(0, max_pfn); after_bootmem = 1; codesize = (unsigned long) &_etext - (unsigned long) &_text; @@ -690,7 +685,7 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " "%ldk reserved, %ldk data, %ldk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - end_pfn << (PAGE_SHIFT-10), + max_pfn << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), datasize >> 10, @@ -784,7 +779,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, #endif unsigned long pfn = phys >> PAGE_SHIFT; - if (pfn >= end_pfn) { + if (pfn >= max_pfn) { /* * This can happen with kdump kernels when accessing * firmware tables: diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 317573ec925..41f1b5c00a1 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -143,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) limit |= (1<<24)-1; limit++; - if (limit > end_pfn << PAGE_SHIFT) - limit = end_pfn << PAGE_SHIFT; + if (limit > max_pfn << PAGE_SHIFT) + limit = max_pfn << PAGE_SHIFT; if (limit <= base) continue; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 316e5f961ef..b432d578177 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -86,7 +86,7 @@ static int __init allocate_cachealigned_memnodemap(void) addr = 0x8000; nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); - nodemap_addr = find_e820_area(addr, end_pfn<= 1*1024*1024) { printk(KERN_ERR -- cgit v1.2.3 From 67350a5c4514c280665cdb45439d32a008a264ba Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Jun 2008 00:19:11 -0400 Subject: x86: simplify vmalloc_sync_all vmalloc_sync_all() is only called from register_die_notifier and alloc_vm_area. Neither is on any performance-critical paths, so vmalloc_sync_all() itself is not on any hot paths. Given that the optimisations in vmalloc_sync_all add a fair amount of code and complexity, and are fairly hard to evaluate for correctness, it's better to just remove them to simplify the code rather than worry about its absolute performance. 
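For readers skimming the diff: what remains is an unconditional walk over every pgd slot covering the vmalloc range. A minimal userspace sketch of that loop shape (the constants and addresses are illustrative stand-ins, not the kernel's real values):

#include <stdio.h>

#define PGDIR_SHIFT 22UL /* assumption: 4MB pgd entries, as on non-PAE x86-32 */
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))

int main(void)
{
	unsigned long vmalloc_start = 0xf7800123UL; /* hypothetical, deliberately unaligned */
	unsigned long vmalloc_end = 0xff000000UL;   /* hypothetical */
	unsigned long addr;

	/* Round down once, then visit every slot on every call:
	 * no insync bitmap, no remembered start address. */
	for (addr = vmalloc_start & PGDIR_MASK; addr <= vmalloc_end; addr += PGDIR_SIZE)
		printf("sync pgd slot covering %#lx\n", addr);
	return 0;
}

The cost is a few dozen extra iterations per call, which does not matter on the two slow paths that reach this function.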
Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 77 ++++++++++++++++++----------------------------------- 1 file changed, 26 insertions(+), 51 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 578b7681955..d0f5fce77d9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -903,14 +903,7 @@ LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { #ifdef CONFIG_X86_32 - /* - * Note that races in the updates of insync and start aren't - * problematic: insync can only get set bits added, and updates to - * start are only improving performance (without affecting correctness - * if undone). - */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = TASK_SIZE; + unsigned long start = VMALLOC_START & PGDIR_MASK; unsigned long address; if (SHARED_KERNEL_PMD) @@ -918,56 +911,38 @@ void vmalloc_sync_all(void) BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), - address)) - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - if (!page) - set_bit(pgd_index(address), insync); + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), + address)) + break; } - if (address == start && test_bit(pgd_index(address), insync)) - start = address + PGDIR_SIZE; + spin_unlock_irqrestore(&pgd_lock, flags); } #else /* CONFIG_X86_64 */ - /* - * Note that races in the updates of insync and start aren't - * problematic: insync can only get set bits added, and updates to - * start are only improving performance (without affecting correctness - * if undone). 
- */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = VMALLOC_START & PGDIR_MASK; + unsigned long start = VMALLOC_START & PGDIR_MASK; unsigned long address; for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - set_bit(pgd_index(address), insync); + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } - if (address == start) - start = address + PGDIR_SIZE; + spin_unlock_irqrestore(&pgd_lock, flags); } #endif } -- cgit v1.2.3 From eba0045ff87bab465d3c80c289f3bf709c1800f5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Jun 2008 00:19:12 -0400 Subject: x86/paravirt: add a pgd_alloc/free hooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add hooks which are called at pgd_alloc/free time. The pgd_alloc hook may return an error code, which if non-zero, causes the pgd allocation to be failed. The hooks may be used to allocate/free auxillary per-pgd information. also fix: > * Ingo Molnar wrote: > > include/asm/pgalloc.h: In function ‘paravirt_pgd_free': > include/asm/pgalloc.h:14: error: parameter name omitted > arch/x86/kernel/entry_64.S: In file included from > arch/x86/kernel/traps_64.c:51:include/asm/pgalloc.h: In function ‘paravirt_pgd_free': > include/asm/pgalloc.h:14: error: parameter name omitted Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 45b99ac3948..418c4432fb3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -215,13 +215,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm) /* so that alloc_pmd can use it */ mm->pgd = pgd; - if (pgd) + if (pgd) { pgd_ctor(pgd); - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { - pgd_dtor(pgd); - free_page((unsigned long)pgd); - pgd = NULL; + if (paravirt_pgd_alloc(mm) != 0 || + !pgd_prepopulate_pmd(mm, pgd)) { + pgd_dtor(pgd); + free_page((unsigned long)pgd); + pgd = NULL; + } } return pgd; @@ -231,6 +233,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); + paravirt_pgd_free(mm, pgd); free_page((unsigned long)pgd); } -- cgit v1.2.3 From d8d5900ef8afc562088f8470feeaf17c4747790f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Jun 2008 00:19:13 -0400 Subject: x86: preallocate and prepopulate separately Jan Beulich points out that vmalloc_sync_all() assumes that the kernel's pmd is always expected to be present in the pgd. 
The current pgd construction code will add the pgd to the pgd_list before its pmds have been pre-populated, thereby making it visible to vmalloc_sync_all(). However, because pgd_prepopulate_pmd also does the allocation, it may block and cannot be done under a spinlock. The solution is to preallocate the pmds outside the spinlock, then populate them while holding the pgd_list lock. This patch also pulls the pmd preallocation and mop-up functions out to be common, assuming that the compiler will generate no code for them when PREALLOCATED_PMDS is 0. Also, there's no need for pgd_ctor to clear the pgd again, since it's allocated as a zeroed page. Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar Cc: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 169 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 101 insertions(+), 68 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 418c4432fb3..557b2abceef 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -66,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd) static void pgd_ctor(void *p) { pgd_t *pgd = p; - unsigned long flags; - - /* Clear usermode parts of PGD */ - memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t)); - - spin_lock_irqsave(&pgd_lock, flags); /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the @@ -91,8 +85,6 @@ static void pgd_ctor(void *p) /* list required to sync kernel mapping updates */ if (!SHARED_KERNEL_PMD) pgd_list_add(pgd); - - spin_unlock_irqrestore(&pgd_lock, flags); } static void pgd_dtor(void *pgd) @@ -119,6 +111,72 @@ static void pgd_dtor(void *pgd) */ #ifdef CONFIG_X86_PAE +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + if (mm == current->active_mm) + write_cr3(read_cr3()); +} +#else /* !CONFIG_X86_PAE */ + +/* No need to prepopulate any pagetable entries in non-PAE modes.
*/ +#define PREALLOCATED_PMDS 0 + +#endif /* CONFIG_X86_PAE */ + +static void free_pmds(pmd_t *pmds[]) +{ + int i; + + for(i = 0; i < PREALLOCATED_PMDS; i++) + if (pmds[i]) + free_page((unsigned long)pmds[i]); +} + +static int preallocate_pmds(pmd_t *pmds[]) +{ + int i; + bool failed = false; + + for(i = 0; i < PREALLOCATED_PMDS; i++) { + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + if (pmd == NULL) + failed = true; + pmds[i] = pmd; + } + + if (failed) { + free_pmds(pmds); + return -ENOMEM; + } + + return 0; +} + /* * Mop up any pmd pages which may still be attached to the pgd. * Normally they will be freed by munmap/exit_mmap, but any pmd we @@ -129,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + for(i = 0; i < PREALLOCATED_PMDS; i++) { pgd_t pgd = pgdp[i]; if (pgd_val(pgd) != 0) { @@ -143,32 +201,17 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) } } -/* - * In PAE mode, we need to do a cr3 reload (=tlb flush) when - * updating the top-level pagetable entries to guarantee the - * processor notices the update. Since this is expensive, and - * all 4 top-level entries are used almost immediately in a - * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. - */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { pud_t *pud; unsigned long addr; int i; pud = pud_offset(pgd, 0); - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; - i++, pud++, addr += PUD_SIZE) { - pmd_t *pmd = pmd_alloc_one(mm, addr); - if (!pmd) { - pgd_mop_up_pmds(mm, pgd); - return 0; - } + for (addr = i = 0; i < PREALLOCATED_PMDS; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), @@ -176,57 +219,47 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) pud_populate(mm, pud, pmd); } - - return 1; } -void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +pgd_t *pgd_alloc(struct mm_struct *mm) { - paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + pgd_t *pgd; + pmd_t *pmds[PREALLOCATED_PMDS]; + unsigned long flags; - /* Note: almost everything apart from _PAGE_PRESENT is - reserved at the pmd (PDPT) level. */ - set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + + if (pgd == NULL) + goto out; + + mm->pgd = pgd; + + if (preallocate_pmds(pmds) != 0) + goto out_free_pgd; + + if (paravirt_pgd_alloc(mm) != 0) + goto out_free_pmds; /* - * According to Intel App note "TLBs, Paging-Structure Caches, - * and Their Invalidation", April 2007, document 317080-001, - * section 8.1: in PAE mode we explicitly have to flush the - * TLB via cr3 if the top-level pgd is changed... + * Make sure that pre-populating the pmds is atomic with + * respect to anything walking the pgd_list, so that they + * never see a partially populated pgd. */ - if (mm == current->active_mm) - write_cr3(read_cr3()); -} -#else /* !CONFIG_X86_PAE */ -/* No need to prepopulate any pagetable entries in non-PAE modes. 
*/ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - return 1; -} - -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) -{ -} -#endif /* CONFIG_X86_PAE */ + spin_lock_irqsave(&pgd_lock, flags); -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + pgd_ctor(pgd); + pgd_prepopulate_pmd(mm, pgd, pmds); - /* so that alloc_pmd can use it */ - mm->pgd = pgd; - if (pgd) { - pgd_ctor(pgd); - - if (paravirt_pgd_alloc(mm) != 0 || - !pgd_prepopulate_pmd(mm, pgd)) { - pgd_dtor(pgd); - free_page((unsigned long)pgd); - pgd = NULL; - } - } + spin_unlock_irqrestore(&pgd_lock, flags); return pgd; + +out_free_pmds: + free_pmds(pmds); +out_free_pgd: + free_page((unsigned long)pgd); +out: + return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) -- cgit v1.2.3 From 4f9c11dd49fb73e1ec088b27ed6539681a445988 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Jun 2008 00:19:19 -0400 Subject: x86, 64-bit: adjust mapping of physical pagetables to work with Xen This makes a few changes to the construction of the initial pagetables to work better with paravirt_ops/Xen. The main areas are: 1. Support non-PSE mapping of memory, since Xen doesn't currently allow 2M pages to be mapped in guests. 2. Make sure that the ioremap aliases of all pages are dropped before attaching the new page to the pagetable. This avoids having writable aliases of pagetable pages. 3. Preserve existing pagetable entries, rather than overwriting. It's possible that a fair amount of pagetable has already been constructed, so reuse what's already in place rather than ignoring and overwriting it. The algorithm relies on the invariant that any page which is part of the kernel pagetable is itself mapped in the linear memory area. This way, it can avoid using ioremap on a pagetable page. The invariant holds because it maps memory from low to high addresses, and also allocates memory from low to high. Each allocated page can map at least 2M of address space, so the mapped area will always progress much faster than the allocated area. It relies on the early boot code mapping enough pages to get started.
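The invariant is worth checking with arithmetic. Here is a toy simulation (illustrative constants, not kernel code) of why the allocation cursor can never catch up with the mapping frontier:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PMD_SIZE (2UL << 20)

int main(void)
{
	unsigned long mapped_end = 8UL << 20; /* assume early boot mapped 8M */
	unsigned long alloc_next = 4UL << 20; /* allocator cursor, inside the mapped area */
	unsigned long target = 1UL << 30;     /* map 1G for the example */

	while (mapped_end < target) {
		/* the pagetable page about to be used must itself be mapped */
		assert(alloc_next + PAGE_SIZE <= mapped_end);
		alloc_next += PAGE_SIZE; /* consume one 4k pagetable page... */
		mapped_end += PMD_SIZE;  /* ...which maps at least 2M more */
	}
	printf("mapped up to %#lx using tables below %#lx\n", mapped_end, alloc_next);
	return 0;
}

Each loop turn spends 4k of already-mapped memory to extend the mapping by at least 2M, so the gap only grows.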
Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 94 ++++++++++++++++++++++++++++++++++++++++++++------- arch/x86/mm/ioremap.c | 2 +- 2 files changed, 83 insertions(+), 13 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d5d4b04d48a..363751dc3fb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -253,6 +253,43 @@ static __meminit void unmap_low_page(void *adr) early_iounmap(adr, PAGE_SIZE); } +static void __meminit +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) +{ + unsigned pages = 0; + int i; + pte_t *pte = pte_page + pte_index(addr); + + for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { + + if (addr >= end) { + if (!after_bootmem) { + for(; i < PTRS_PER_PTE; i++, pte++) + set_pte(pte, __pte(0)); + } + break; + } + + if (pte_val(*pte)) + continue; + + if (0) + printk(" pte=%p addr=%lx pte=%016lx\n", + pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); + pages++; + } + update_page_count(PG_LEVEL_4K, pages); +} + +static void __meminit +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) +{ + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); + + phys_pte_init(pte, address, end); +} + static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { @@ -261,7 +298,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { + unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); + pte_t *pte; if (address >= end) { if (!after_bootmem) { @@ -271,12 +310,23 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) break; } - if (pmd_val(*pmd)) + if (pmd_val(*pmd)) { + phys_pte_update(pmd, address, end); + continue; + } + + if (cpu_has_pse) { + pages++; + set_pte((pte_t *)pmd, + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); continue; + } - pages++; - set_pte((pte_t *)pmd, - pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pte = alloc_low_page(&pte_phys); + phys_pte_init(pte, address, end); + unmap_low_page(pte); + + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); } update_page_count(PG_LEVEL_2M, pages); return address; @@ -333,11 +383,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) pmd = alloc_low_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, __va(pmd_phys)); last_map_addr = phys_pmd_init(pmd, addr, end); + unmap_low_page(pmd); + pud_populate(&init_mm, pud, __va(pmd_phys)); spin_unlock(&init_mm.page_table_lock); - unmap_low_page(pmd); } __flush_tlb_all(); update_page_count(PG_LEVEL_1G, pages); @@ -345,16 +395,30 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) return last_map_addr; } +static unsigned long __meminit +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + pud_t *pud; + + pud = (pud_t *)pgd_page_vaddr(*pgd); + + return phys_pud_init(pud, addr, end); +} + static void __init find_early_table_space(unsigned long end) { - unsigned long puds, pmds, tables, start; + unsigned long puds, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); if (!direct_gbpages) { - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + unsigned long pmds = (end + 
PMD_SIZE - 1) >> PMD_SHIFT; tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); } + if (!cpu_has_pse) { + unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); + } /* * RED-PEN putting page tables only on node 0 could @@ -526,19 +590,25 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon unsigned long pud_phys; pud_t *pud; + next = start + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_val(*pgd)) { + last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end)); + continue; + } + if (after_bootmem) pud = pud_offset(pgd, start & PGDIR_MASK); else pud = alloc_low_page(&pud_phys); - next = start + PGDIR_SIZE; - if (next > end) - next = end; last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); + unmap_low_page(pud); if (!after_bootmem) pgd_populate(&init_mm, pgd_offset_k(start), __va(pud_phys)); - unmap_low_page(pud); } if (!after_bootmem) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 092b3d72498..45e546c4ba7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -485,7 +485,7 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, if (pgprot_val(flags)) set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else - pte_clear(NULL, addr, pte); + pte_clear(&init_mm, addr, pte); __flush_tlb_one(addr); } -- cgit v1.2.3 From 7c934d3990aa4d785feddcef700f4c2c4aba2251 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Jun 2008 00:19:20 -0400 Subject: x86, 64-bit: create small vmemmap mappings if PSE not available If PSE is not available, then fall back to 4k page mappings for the vmemmap area. Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 62 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 363751dc3fb..e62cbdd4e2d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -988,7 +988,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) pmd_t *pmd; for (; addr < end; addr = next) { - next = pmd_addr_end(addr, end); + void *p = NULL; pgd = vmemmap_pgd_populate(addr, node); if (!pgd) @@ -998,33 +998,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) if (!pud) return -ENOMEM; - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - pte_t entry; - void *p; + if (!cpu_has_pse) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = vmemmap_pmd_populate(pud, addr, node); + + if (!pmd) + return -ENOMEM; + + p = vmemmap_pte_populate(pmd, addr, node); - p = vmemmap_alloc_block(PMD_SIZE, node); if (!p) return -ENOMEM; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, - PAGE_KERNEL_LARGE); - set_pmd(pmd, __pmd(pte_val(entry))); - - /* check to see if we have contiguous blocks */ - if (p_end != p || node_start != node) { - if (p_start) - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", - addr_start, addr_end-1, p_start, p_end-1, node_start); - addr_start = addr; - node_start = node; - p_start = p; - } - addr_end = addr + PMD_SIZE; - p_end = p + PMD_SIZE; + addr_end = addr + PAGE_SIZE; + p_end = p + PAGE_SIZE; } else { - vmemmap_verify((pte_t *)pmd, node, addr, next); + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte_t entry; + + p = vmemmap_alloc_block(PMD_SIZE, node); + if (!p) + return -ENOMEM; + + 
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd(pte_val(entry))); + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; + + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + } else + vmemmap_verify((pte_t *)pmd, node, addr, next); } + } return 0; } -- cgit v1.2.3 From 0814e0bace537b7024b09187346b99401e6281be Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 25 Jun 2008 00:19:22 -0400 Subject: x86, 64-bit: split set_pte_vaddr() We will need to set a pte on l3_user_pgt. Extract set_pte_vaddr_pud() from set_pte_vaddr(), that will accept the l3 page table as parameter. This change should be a no-op for existing code. Signed-off-by: Eduardo Habkost Signed-off-by: Mark McLoughlin Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e62cbdd4e2d..5c3305e0507 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -144,22 +144,13 @@ static __init void *spp_getpage(void) } void -set_pte_vaddr(unsigned long vaddr, pte_t new_pte) +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) { - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(new_pte)); - - pgd = pgd_offset_k(vaddr); - if (pgd_none(*pgd)) { - printk(KERN_ERR - "PGD FIXMAP MISSING, it should be setup in head.S!\n"); - return; - } - pud = pud_offset(pgd, vaddr); + pud = pud_page + pud_index(vaddr); if (pud_none(*pud)) { pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); @@ -192,6 +183,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } +void +set_pte_vaddr(unsigned long vaddr, pte_t pteval) +{ + pgd_t *pgd; + pud_t *pud_page; + + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); + return; + } + pud_page = (pud_t*)pgd_page_vaddr(*pgd); + set_pte_vaddr_pud(pud_page, vaddr, pteval); +} + /* * The head.S code sets up the kernel high mapping: * -- cgit v1.2.3 From 8207c2570af6f819b61be9ef3fb298d0a8c0e18c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 24 Jun 2008 17:32:48 -0400 Subject: x86: fix pte allocation in "x86: introduce init_memory_mapping for 32bit" The patch "x86: introduce init_memory_mapping for 32bit" does not allocate enough space for PTEs if the CPU does not implement PSE. 
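The missing reservation is pure arithmetic: with PSE each pmd entry maps 2M directly, but without it every 4k page also needs a pte slot backed by pagetable memory. A rough standalone version of the accounting (the 8-byte entry sizes are an assumption, PAE-style; all values illustrative):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PMD_SIZE (2UL << 20)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long end = 512UL << 20; /* hypothetical: map 512M */
	unsigned long pmds = (end + PMD_SIZE - 1) / PMD_SIZE;
	unsigned long tables = PAGE_ALIGN(pmds * 8); /* assumed 8-byte pmd entries */
	int cpu_has_pse = 0; /* the case the fix covers */

	if (!cpu_has_pse) {
		unsigned long ptes = (end + PAGE_SIZE - 1) / PAGE_SIZE;
		tables += PAGE_ALIGN(ptes * 8); /* assumed 8-byte pte entries */
	}
	printf("early pagetable reservation: %lu KiB\n", tables >> 10);
	return 0;
}

Without the PSE branch the reservation is a few pages; with it, roughly 0.2% of the mapped range, which is what the patch below accounts for.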
Signed-off-by: Jeremy Fitzhardinge Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c059c460e32..156000de6e6 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -733,6 +733,11 @@ static void __init find_early_table_space(unsigned long end) pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); + if (!cpu_has_pse) { + int ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + tables += PAGE_ALIGN(ptes * sizeof(pte_t)); + } + /* * RED-PEN putting page tables only on node 0 could * cause a hotspot and fill up ZONE_DMA. The page tables -- cgit v1.2.3 From e7b3789524eecc96213dd69d6686efd429235051 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Jun 2008 21:51:28 -0700 Subject: x86: move fixmap page table range setup early Do that in init_memory_mapping(); also remove one init_ohci1394_dma_on_all_controllers() call. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 156000de6e6..b9cf7f70530 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -442,13 +442,10 @@ void __init native_pagetable_setup_done(pgd_t *base) * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init pagetable_init(void) +static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) { - pgd_t *pgd_base = swapper_pg_dir; unsigned long vaddr, end; - paravirt_pagetable_setup_start(pgd_base); - /* * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): @@ -458,6 +455,13 @@ static void __init pagetable_init(void) end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; page_table_range_init(vaddr, end, pgd_base); early_ioremap_reset(); +} + +static void __init pagetable_init(void) +{ + pgd_t *pgd_base = swapper_pg_dir; + + paravirt_pagetable_setup_start(pgd_base); permanent_kmaps_init(pgd_base); @@ -788,6 +792,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, kernel_physical_mapping_init(pgd_base, start, end); + early_ioremap_page_table_range_init(pgd_base); + load_cr3(swapper_pg_dir); __flush_tlb_all(); @@ -799,6 +805,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, return end >> PAGE_SHIFT; } + /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. -- cgit v1.2.3 From 22b45144f67dbaf0705992dc1462de2813fb83a1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Jun 2008 12:02:49 -0700 Subject: x86/paravirt: groundwork for 64-bit Xen support, fix #2 Ingo Molnar wrote: > that fixed the build but now we've got a boot crash with this config: > > time.c: Detected 2010.304 MHz processor. > spurious 8259A interrupt: IRQ7. > BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 > IP: [<0000000000000000>] > PGD 0 > Thread overran stack, or stack corrupted > Oops: 0010 [1] SMP > CPU 0 > I don't know if this will fix this bug, but it's definitely a bugfix. It was trashing random pages by overwriting them with pagetables... Don't trash a large pmd's data when mapping physical memory. This is a bugfix for "x86_64: adjust mapping of physical pagetables to work with Xen".
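The bugfix reduces to one guard: an existing pmd entry may be a 2M leaf rather than a pointer to a pte page, and only the latter may be descended into. A standalone sketch with mocked-up types (the leaf-bit position is an assumption mirroring x86's _PAGE_PSE):

#include <stdbool.h>
#include <stdio.h>

struct pmd { unsigned long val; };
#define PMD_LEAF (1UL << 7) /* assumed PSE/leaf bit position */

static bool pmd_present(struct pmd p) { return p.val != 0; }
static bool pmd_large(struct pmd p) { return (p.val & PMD_LEAF) != 0; }

static void visit(struct pmd p)
{
	if (pmd_present(p)) {
		if (!pmd_large(p))
			printf("descend into pte table at %#lx\n", p.val & ~0xfffUL);
		else
			printf("leave 2M leaf %#lx alone\n", p.val);
		return; /* never overwrite an existing entry */
	}
	printf("empty slot: allocate and populate\n");
}

int main(void)
{
	visit((struct pmd){ 0 });                     /* empty */
	visit((struct pmd){ 0x200000UL | PMD_LEAF }); /* 2M leaf mapping */
	visit((struct pmd){ 0x345000UL });            /* pointer to a pte page */
	return 0;
}

Before the fix, the leaf case fell through into the descend path and reinterpreted mapped data as a pte page.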
Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Cc: Vegard Nossum Cc: Nick Piggin Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5c3305e0507..b10b7f17ea5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -320,7 +320,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) } if (pmd_val(*pmd)) { - phys_pte_update(pmd, address, end); + if (!pmd_large(*pmd)) + phys_pte_update(pmd, address, end); continue; } -- cgit v1.2.3 From 7482b0e962e128c5b574aa29761f97164189ef14 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 28 Jun 2008 03:30:39 -0700 Subject: x86: fix init_memory_mapping over boundary v3 some ram-end boundary only has page alignment, instead of 2M alignment. v2: make init_memory_mapping more solid: start could be any value other than 0 v3: fix NON PAE by handling left over in kernel_physical_mapping Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b9cf7f70530..90ca67be965 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -195,7 +195,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, unsigned pages_2m = 0, pages_4k = 0; unsigned limit_pfn = end >> PAGE_SHIFT; - pgd_idx = pgd_index(PAGE_OFFSET); + pgd_idx = pgd_index(start + PAGE_OFFSET); pgd = pgd_base + pgd_idx; pfn = start >> PAGE_SHIFT; @@ -218,7 +218,8 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, * and overlapping MTRRs into large pages can cause * slowdowns. 
*/ - if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { + if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0) && + (pfn + PTRS_PER_PTE) <= limit_pfn) { unsigned int addr2; pgprot_t prot = PAGE_KERNEL_LARGE; @@ -233,13 +234,12 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; - max_pfn_mapped = pfn; continue; } pte = one_page_table_init(pmd); for (pte_ofs = 0; - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte_ofs < PTRS_PER_PTE && pfn < limit_pfn; pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { pgprot_t prot = PAGE_KERNEL; @@ -249,7 +249,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, pages_4k++; set_pte(pte, pfn_pte(pfn, prot)); } - max_pfn_mapped = pfn; } } update_page_count(PG_LEVEL_2M, pages_2m); @@ -729,7 +728,7 @@ void __init setup_bootmem_allocator(void) static void __init find_early_table_space(unsigned long end) { - unsigned long puds, pmds, tables, start; + unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = PAGE_ALIGN(puds * sizeof(pud_t)); @@ -737,10 +736,15 @@ static void __init find_early_table_space(unsigned long end) pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); - if (!cpu_has_pse) { - int ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += PAGE_ALIGN(ptes * sizeof(pte_t)); - } + if (cpu_has_pse) { + unsigned long extra; + extra = end - ((end>>21) << 21); + extra += (2UL<<20); + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + + tables += PAGE_ALIGN(ptes * sizeof(pte_t)); /* * RED-PEN putting page tables only on node 0 could -- cgit v1.2.3 From a04ad82d0bff4bb564f290eb50982e02458592d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 29 Jun 2008 00:39:06 -0700 Subject: x86: fix init_memory_mapping over boundary, v4 use PMD_SHIFT to calculate boundary also adjust size for pre-allocated table size Signed-off-by: Yinghai Lu Cc: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 89 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 90ca67be965..aa5e37c9f4b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -184,8 +184,9 @@ static inline int is_kernel_text(unsigned long addr) * PAGE_OFFSET: */ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, - unsigned long start, - unsigned long end) + unsigned long start_pfn, + unsigned long end_pfn, + int use_pse) { int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; @@ -193,33 +194,33 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, pmd_t *pmd; pte_t *pte; unsigned pages_2m = 0, pages_4k = 0; - unsigned limit_pfn = end >> PAGE_SHIFT; - pgd_idx = pgd_index(start + PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = start >> PAGE_SHIFT; + if (!cpu_has_pse) + use_pse = 0; + pfn = start_pfn; + pgd_idx = pgd_index((pfn<= limit_pfn) - continue; - for (pmd_idx = 0; - pmd_idx < PTRS_PER_PMD && pfn < limit_pfn; + if (pfn >= end_pfn) + continue; +#ifdef CONFIG_X86_PAE + pmd_idx = pmd_index((pfn<>21) << 21); - extra += (2UL<<20); + + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); + extra += PMD_SIZE; ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; tables += PAGE_ALIGN(ptes * sizeof(pte_t)); + /* for fixmap */ + tables += PAGE_SIZE * 2; + /* * RED-PEN 
putting page tables only on node 0 could * cause a hotspot and fill up ZONE_DMA. The page tables @@ -770,6 +776,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { pgd_t *pgd_base = swapper_pg_dir; + unsigned long start_pfn, end_pfn; + unsigned long big_page_start; /* * Find space for the kernel direct mapping tables. @@ -794,7 +802,44 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; } - kernel_physical_mapping_init(pgd_base, start, end); + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + big_page_start = PMD_SIZE; + + if (start < big_page_start) { + start_pfn = start >> PAGE_SHIFT; + end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT); + } else { + /* head is not big page alignment ? */ + start_pfn = start >> PAGE_SHIFT; + end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + } + if (start_pfn < end_pfn) + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0); + + /* big page range */ + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < (big_page_start >> PAGE_SHIFT)) + start_pfn = big_page_start >> PAGE_SHIFT; + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, + cpu_has_pse); + + /* tail is not big page alignment ? */ + start_pfn = end_pfn; + if (start_pfn > (big_page_start>>PAGE_SHIFT)) { + end_pfn = end >> PAGE_SHIFT; + if (start_pfn < end_pfn) + kernel_physical_mapping_init(pgd_base, start_pfn, + end_pfn, 0); + } early_ioremap_page_table_range_init(pgd_base); -- cgit v1.2.3 From 996cf4438f9ed711c98a3cb5ab88f842a4102427 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 30 Jun 2008 18:34:58 -0700 Subject: x86: don't reallocate pgt for node0 kva ram is already mapped right away, so there is no need to allocate it again for low ram; this avoids wasting one copy of the pgdat. Also add the node id to the early_res name in case we get it from find_e820_area.
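A sketch of the resulting naming scheme (snprintf is used here for brevity; the 16-byte buffer matches the patch, everything else is illustrative):

#include <stdio.h>

int main(void)
{
	char buf[16]; /* same size the patch uses */
	int nid;

	for (nid = 0; nid < 4; nid++) { /* pretend four online nodes */
		snprintf(buf, sizeof(buf), "NODE_DATA %d", nid);
		printf("reserve_early(..., \"%s\")\n", buf);
	}
	return 0;
}

With the node id embedded, a reservation is attributable to its node when the early_res table is printed.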
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 42484d446d0..e2f0dd13a45 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -156,17 +156,20 @@ static void __init propagate_e820_map_node(int nid) */ static void __init allocate_pgdat(int nid) { - if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid]) + char buf[16]; + + if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { unsigned long pgdat_phys; pgdat_phys = find_e820_area(min_low_pfn<>PAGE_SHIFT)); - reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), - "NODE_DATA"); + memset(buf, 0, sizeof(buf)); + sprintf(buf, "NODE_DATA %d", nid); + reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); } printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", nid, (unsigned long)NODE_DATA(nid)); -- cgit v1.2.3 From cb95a13a8ace8612ecab042a838e5aab2ec14ef0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 2 Jul 2008 00:31:02 -0700 Subject: x86: merge zones_sizes_init for numa and non numa on 32-bit move out e820_register_active_regions from non numa zones_sizes_init() and remove numa version zones_sizes_init(). and let 32 bit call remove_all_active_ranges() in setup_arch() directly like 64-bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 16 ---------------- arch/x86/mm/init_32.c | 10 ++++------ 2 files changed, 4 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index e2f0dd13a45..fa389d20383 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -327,7 +327,6 @@ void __init initmem_init(unsigned long start_pfn, * and ZONE_HIGHMEM. 
*/ - remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); @@ -390,21 +389,6 @@ void __init initmem_init(unsigned long start_pfn, setup_bootmem_allocator(); } -void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; -#endif - - free_area_init_nodes(max_zone_pfns); - return; -} - void __init set_highmem_pages_init(void) { #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index aa5e37c9f4b..8efe872b961 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -660,12 +660,14 @@ void __init initmem_init(unsigned long start_pfn, if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; memory_present(0, 0, highend_pfn); + e820_register_active_regions(0, 0, highend_pfn); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else memory_present(0, 0, max_low_pfn); + e820_register_active_regions(0, 0, max_low_pfn); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif @@ -677,25 +679,21 @@ void __init initmem_init(unsigned long start_pfn, setup_bootmem_allocator(); } +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ -void __init zone_sizes_init(void) +static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - remove_all_active_ranges(); #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; - e820_register_active_regions(0, 0, highend_pfn); -#else - e820_register_active_regions(0, 0, max_low_pfn); #endif free_area_init_nodes(max_zone_pfns); } -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ void __init setup_bootmem_allocator(void) { -- cgit v1.2.3 From 574977a2edde0148ea365008dceb0c2594d10b11 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 1 Jul 2008 16:46:33 -0700 Subject: x86_64/setup: unconditionally populate the pgd When allocating a new pud, unconditionally populate the pgd (why did we bother to create a new pud if we weren't going to populate it?). This will only happen if the pgd slot was empty, since any existing pud will be reused. 
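The reasoning generalizes to any lookup-or-create table walk: a fresh lower-level table is only ever allocated for an empty slot, so linking it back in needs no condition. A mocked-up single-level illustration (not kernel code):

#include <stdio.h>
#include <stdlib.h>

#define SLOTS 4
static void *pgd[SLOTS]; /* stand-in for the top-level table */

static void *get_table(int slot)
{
	if (pgd[slot])
		return pgd[slot]; /* reuse the existing entry */
	void *t = calloc(1, 4096); /* fresh, zeroed lower-level table */
	pgd[slot] = t; /* populate unconditionally: the slot was empty */
	return t;
}

int main(void)
{
	void *a = get_table(2);
	void *b = get_table(2); /* second call must reuse, not reallocate */
	printf("%s\n", a == b ? "reused" : "bug: reallocated");
	return 0;
}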
Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b10b7f17ea5..77d129d62c9 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -616,9 +616,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); unmap_low_page(pud); - if (!after_bootmem) - pgd_populate(&init_mm, pgd_offset_k(start), - __va(pud_phys)); + pgd_populate(&init_mm, pgd_offset_k(start), + __va(pud_phys)); } if (!after_bootmem) -- cgit v1.2.3 From ef5e94af16c0c82452e1ea5d387e1203dd2198d6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 1 Jul 2008 16:46:36 -0700 Subject: x86_32: remove __PAGE_KERNEL(_EXEC) From: Jeremy Fitzhardinge Older x86-32 processors do not support global mappings (PGE), so the flag must only be used if the processor supports it. The _PAGE_KERNEL* flags always have _PAGE_GLOBAL set, since logically we always want it set. This is OK even on processors which do not support PGE, since all _PAGE flags are masked with __supported_pte_mask before being turned into a real in-pagetable pte. On 32-bit systems, __supported_pte_mask is initialized to not contain _PAGE_GLOBAL, and it is then added if the CPU is found to support it. The x86-32 code used to use __PAGE_KERNEL/__PAGE_KERNEL_EXEC for this purpose, but they're now redundant and can be removed. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8efe872b961..b5a0fd5f4c5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -383,11 +383,6 @@ static void __init set_highmem_pages_init(void) # define set_highmem_pages_init() do { } while (0) #endif /* CONFIG_HIGHMEM */ -pteval_t __PAGE_KERNEL = _PAGE_KERNEL; -EXPORT_SYMBOL(__PAGE_KERNEL); - -pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; - void __init native_pagetable_setup_start(pgd_t *base) { unsigned long pfn, va; @@ -509,7 +504,7 @@ void zap_low_mappings(void) int nx_enabled; -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); EXPORT_SYMBOL_GPL(__supported_pte_mask); #ifdef CONFIG_X86_PAE @@ -796,8 +791,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* Enable PGE if available */ if (cpu_has_pge) { set_in_cr4(X86_CR4_PGE); - __PAGE_KERNEL |= _PAGE_GLOBAL; - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; + __supported_pte_mask |= _PAGE_GLOBAL; } /* -- cgit v1.2.3 From 5a654ba7a88d90483d0f0586297ea9075d755fc8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 1 Jul 2008 16:46:37 -0700 Subject: x86/cpa: use an undefined PTE bit for testing CPA Rather than using _PAGE_GLOBAL - which not all CPUs support - to test CPA, use one of the reserved-for-software-use PTE flags instead. This allows CPA testing to work on CPUs which don't support PGE.
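The trick in miniature: pick a bit the MMU is documented to ignore and use it purely as a marker that can be set, tested and cleared on any CPU. A userspace sketch (bit 9 is an assumption standing in for one of x86's software-available bits, such as _PAGE_UNUSED1):

#include <stdio.h>

#define PAGE_TESTBIT (1UL << 9) /* assumed software-use bit */

int main(void)
{
	unsigned long pte = 0x1000UL | 0x1UL; /* hypothetical frame + present bit */

	pte |= PAGE_TESTBIT; /* what change_page_attr_set() does in the test */
	printf("testbit set: %d\n", !!(pte & PAGE_TESTBIT));

	pte &= ~PAGE_TESTBIT; /* what change_page_attr_clear() does on revert */
	printf("testbit after revert: %d\n", !!(pte & PAGE_TESTBIT));
	return 0;
}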
Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 75f1b109aae..0dcd42eb94e 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -1,8 +1,8 @@ /* * self test for change_page_attr. * - * Clears the global bit on random pages in the direct mapping, then reverts - * and compares page tables forwards and afterwards. + * Clears the a test pte bit on random pages in the direct mapping, + * then reverts and compares page tables forwards and afterwards. */ #include #include @@ -32,6 +32,13 @@ enum { GPS = (1<<30) }; +#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) + +static int pte_testbit(pte_t pte) +{ + return pte_flags(pte) & _PAGE_UNUSED1; +} + struct split_state { long lpg, gpg, spg, exec; long min_exec, max_exec; @@ -165,15 +172,14 @@ static int pageattr_test(void) continue; } - err = change_page_attr_clear(addr[i], len[i], - __pgprot(_PAGE_GLOBAL)); + err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++; } pte = lookup_address(addr[i], &level); - if (!pte || pte_global(*pte) || pte_huge(*pte)) { + if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) { printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], pte ? (u64)pte_val(*pte) : 0ULL); failed++; @@ -198,14 +204,13 @@ static int pageattr_test(void) failed++; continue; } - err = change_page_attr_set(addr[i], len[i], - __pgprot(_PAGE_GLOBAL)); + err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; } pte = lookup_address(addr[i], &level); - if (!pte || !pte_global(*pte)) { + if (!pte || pte_testbit(*pte)) { printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", addr[i], pte ? (u64)pte_val(*pte) : 0ULL); failed++; -- cgit v1.2.3 From 698839fe04ae11f228074ad45614343c3921a6c6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 6 Jul 2008 11:42:11 -0700 Subject: x86: remove have_arch_parse_srat -v2 we already have the same srat handling interface for 32bit. 
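The deleted stubs were link-time placeholders: with a real 32-bit SRAT parser now built under the same config symbol, keeping the empty versions around would mean duplicate definitions. A toy illustration of the pattern (names are illustrative, not the kernel's):

#include <stdio.h>

#define HAVE_REAL_PARSER 1 /* stand-in for the relevant CONFIG_ option */

#if HAVE_REAL_PARSER
static void parse_srat(void) { printf("real SRAT parsing\n"); }
#else
static void parse_srat(void) { /* dummy: nothing to parse */ }
#endif

int main(void)
{
	parse_srat();
	return 0;
}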
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index fa389d20383..5dfef9fa061 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -443,20 +443,3 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#if defined(CONFIG_ACPI_NUMA) && !defined(CONFIG_HAVE_ARCH_PARSE_SRAT) -/* - * Dummy on 32-bit, for now: - */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) -{ -} - -void __init acpi_numa_arch_fixup(void) -{ -} -#endif -- cgit v1.2.3 From 6247943d8ab699b57653afd453a4940cca70ef8a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 6 Jul 2008 11:42:11 -0700 Subject: x86: remove acpi_srat config v2 use ACPI_NUMA directly and move srat_32.c to mm/ Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 3 +- arch/x86/mm/srat_32.c | 280 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 arch/x86/mm/srat_32.c (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index b7b3e4c7cfc..c107641cd39 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -13,5 +13,6 @@ obj-$(CONFIG_NUMA) += discontig_32.o else obj-$(CONFIG_NUMA) += numa_64.o obj-$(CONFIG_K8_NUMA) += k8topology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_64.o endif +obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o + diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c new file mode 100644 index 00000000000..f41d67f8f83 --- /dev/null +++ b/arch/x86/mm/srat_32.c @@ -0,0 +1,280 @@ +/* + * Some of the code in this file has been gleaned from the 64 bit + * discontigmem support code base. + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * Send feedback to Pat Gaughen + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * proximity macros and definitions + */ +#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ +#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ +#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) +#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) +/* bitmap length; _PXM is at most 255 */ +#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) +static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ + +#define MAX_CHUNKS_PER_NODE 3 +#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) +struct node_memory_chunk_s { + unsigned long start_pfn; + unsigned long end_pfn; + u8 pxm; // proximity domain of node + u8 nid; // which cnode contains this chunk? + u8 bank; // which mem bank on this node +}; +static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; + +static int __initdata num_memory_chunks; /* total number of memory chunks */ +static u8 __initdata apicid_to_pxm[MAX_APICID]; + +int numa_off __initdata; +int acpi_numa __initdata; + +static __init void bad_srat(void) +{ + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; + num_memory_chunks = 0; +} + +static __init inline int srat_disabled(void) +{ + return numa_off || acpi_numa < 0; +} + +/* Identify CPU proximity domains */ +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) +{ + if (srat_disabled()) + return; + if (cpu_affinity->header.length != + sizeof(struct acpi_srat_cpu_affinity)) { + bad_srat(); + return; + } + + if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; /* empty entry */ + + /* mark this node as "seen" in node bitmap */ + BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); + + apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; + + printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", + cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); +} + +/* + * Identify memory proximity domains and hot-remove capabilities. + * Fill node memory chunk list structure. + */ +void __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) +{ + unsigned long long paddr, size; + unsigned long start_pfn, end_pfn; + u8 pxm; + struct node_memory_chunk_s *p, *q, *pend; + + if (srat_disabled()) + return; + if (memory_affinity->header.length != + sizeof(struct acpi_srat_mem_affinity)) { + bad_srat(); + return; + } + + if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) + return; /* empty entry */ + + pxm = memory_affinity->proximity_domain & 0xff; + + /* mark this node as "seen" in node bitmap */ + BMAP_SET(pxm_bitmap, pxm); + + /* calculate info for memory chunk structure */ + paddr = memory_affinity->base_address; + size = memory_affinity->length; + + start_pfn = paddr >> PAGE_SHIFT; + end_pfn = (paddr + size) >> PAGE_SHIFT; + + + if (num_memory_chunks >= MAXCHUNKS) { + printk(KERN_WARNING "Too many mem chunks in SRAT." 
+ " Ignoring %lld MBytes at %llx\n", + size/(1024*1024), paddr); + return; + } + + /* Insertion sort based on base address */ + pend = &node_memory_chunk[num_memory_chunks]; + for (p = &node_memory_chunk[0]; p < pend; p++) { + if (start_pfn < p->start_pfn) + break; + } + if (p < pend) { + for (q = pend; q >= p; q--) + *(q + 1) = *q; + } + p->start_pfn = start_pfn; + p->end_pfn = end_pfn; + p->pxm = pxm; + + num_memory_chunks++; + + printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)" + " in proximity domain %02x %s\n", + start_pfn, end_pfn, + memory_affinity->memory_type, + pxm, + ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? + "enabled and removable" : "enabled" ) ); +} + +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ +} + +void acpi_numa_arch_fixup(void) +{ +} +/* + * The SRAT table always lists ascending addresses, so can always + * assume that the first "start" address that you see is the real + * start of the node, and that the current "end" address is after + * the previous one. + */ +static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) +{ + /* + * Only add present memory as told by the e820. + * There is no guarantee from the SRAT that the memory it + * enumerates is present at boot time because it represents + * *possible* memory hotplug areas the same as normal RAM. + */ + if (memory_chunk->start_pfn >= max_pfn) { + printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", + memory_chunk->start_pfn, memory_chunk->end_pfn); + return; + } + if (memory_chunk->nid != nid) + return; + + if (!node_has_online_mem(nid)) + node_start_pfn[nid] = memory_chunk->start_pfn; + + if (node_start_pfn[nid] > memory_chunk->start_pfn) + node_start_pfn[nid] = memory_chunk->start_pfn; + + if (node_end_pfn[nid] < memory_chunk->end_pfn) + node_end_pfn[nid] = memory_chunk->end_pfn; +} + +int __init get_memcfg_from_srat(void) +{ + int i, j, nid; + + + if (srat_disabled()) + goto out_fail; + + if (num_memory_chunks == 0) { + printk(KERN_WARNING + "could not finy any ACPI SRAT memory areas.\n"); + goto out_fail; + } + + /* Calculate total number of nodes in system from PXM bitmap and create + * a set of sequential node IDs starting at zero. (ACPI doesn't seem + * to specify the range of _PXM values.) + */ + /* + * MCD - we no longer HAVE to number nodes sequentially. PXM domain + * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically + * 32, so we will continue numbering them in this manner until MAX_NUMNODES + * approaches MAX_PXM_DOMAINS for i386. 
+ */ + nodes_clear(node_online_map); + for (i = 0; i < MAX_PXM_DOMAINS; i++) { + if (BMAP_TEST(pxm_bitmap, i)) { + int nid = acpi_map_pxm_to_node(i); + node_set_online(nid); + } + } + BUG_ON(num_online_nodes() == 0); + + /* set cnode id in memory chunk structure */ + for (i = 0; i < num_memory_chunks; i++) + node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); + + printk(KERN_DEBUG "pxm bitmap: "); + for (i = 0; i < sizeof(pxm_bitmap); i++) { + printk(KERN_CONT "%02x ", pxm_bitmap[i]); + } + printk(KERN_CONT "\n"); + printk(KERN_DEBUG "Number of logical nodes in system = %d\n", + num_online_nodes()); + printk(KERN_DEBUG "Number of memory chunks in system = %d\n", + num_memory_chunks); + + for (i = 0; i < MAX_APICID; i++) + apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); + + for (j = 0; j < num_memory_chunks; j++){ + struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; + printk(KERN_DEBUG + "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", + j, chunk->nid, chunk->start_pfn, chunk->end_pfn); + node_read_chunk(chunk->nid, chunk); + e820_register_active_regions(chunk->nid, chunk->start_pfn, + min(chunk->end_pfn, max_pfn)); + } + + for_each_online_node(nid) { + unsigned long start = node_start_pfn[nid]; + unsigned long end = min(node_end_pfn[nid], max_pfn); + + memory_present(nid, start, end); + node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); + } + return 1; +out_fail: + printk(KERN_ERR "failed to get NUMA memory information from SRAT" + " table\n"); + return 0; +} -- cgit v1.2.3 From 3a9e189d69479736a0d0901c87ad08c9e328b389 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 1 Jul 2008 14:45:32 -0500 Subject: x86: map UV chipset space - pagetable Add boot-time function for creating additional 2MB page table entries for mapping chipset specific cached/uncached ranges. Signed-off-by: Jack Steiner Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 77d129d62c9..57d5eff754c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -201,6 +201,46 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } +/* + * Create large page table mappings for a range of physical addresses. 
+ */
+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
+						pgprot_t prot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
+	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
+		pgd = pgd_offset_k((unsigned long)__va(phys));
+		if (pgd_none(*pgd)) {
+			pud = (pud_t *) spp_getpage();
+			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+						_PAGE_USER));
+		}
+		pud = pud_offset(pgd, (unsigned long)__va(phys));
+		if (pud_none(*pud)) {
+			pmd = (pmd_t *) spp_getpage();
+			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+						_PAGE_USER));
+		}
+		pmd = pmd_offset(pud, phys);
+		BUG_ON(!pmd_none(*pmd));
+		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
+	}
+}
+
+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
+{
+	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
+}
+
+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
+{
+	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
+}
+
 /*
  * The head.S code sets up the kernel high mapping:
  *
-- cgit v1.2.3

From 5ed4273af8469ca4723d4bf1bcd3abe2cc792e9f Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Fri, 4 Jul 2008 12:36:21 +0200
Subject: arch/x86/mm/pgtable_32.c: remove unused variable `fixmaps'

arch/x86/mm/pgtable_32.c:144: warning: 'fixmaps' defined but not used

Signed-off-by: Andrew Morton
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/pgtable_32.c | 1 -
 1 file changed, 1 deletion(-)
(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 828907d001e..b4becbf8c57 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -141,7 +141,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
 	__flush_tlb_one(vaddr);
 }
 
-static int fixmaps;
 unsigned long __FIXADDR_TOP = 0xfffff000;
 EXPORT_SYMBOL(__FIXADDR_TOP);
-- cgit v1.2.3

From b50efd2a55fc1344654875369d458bb6838bd37a Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Tue, 8 Jul 2008 01:41:05 -0700
Subject: x86: introduce page_size_mask for 64bit

prepare for overmapped patch

also printout last_map_addr together with end

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/init_64.c | 98 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 35 deletions(-)
(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 57d5eff754c..7227a0a3f3a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -340,7 +340,8 @@ phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
 }
 
 static unsigned long __meminit
-phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
+	      unsigned long page_size_mask)
 {
 	unsigned long pages = 0;
@@ -365,7 +366,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 			continue;
 		}
 
-		if (cpu_has_pse) {
+		if (page_size_mask & (1<<PG_LEVEL_2M)) {
 			pages++;
 			set_pte((pte_t *)pmd,
 				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
@@ -383,20 +384,22 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 }
 
 static unsigned long __meminit
-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
+		unsigned long page_size_mask)
 {
 	pmd_t *pmd = pmd_offset(pud, 0);
 	unsigned long last_map_addr;
 
 	spin_lock(&init_mm.page_table_lock);
-	last_map_addr = phys_pmd_init(pmd, address, end);
+	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
 	spin_unlock(&init_mm.page_table_lock);
 	__flush_tlb_all();
 
 	return last_map_addr;
 }
 
 static unsigned long __meminit
-phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
+			 unsigned long page_size_mask)
 {
 	unsigned long pages = 0;
 	unsigned long last_map_addr = end;
@@ -418,11 +421,12 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
 
 		if (pud_val(*pud)) {
 			if (!pud_large(*pud))
-				last_map_addr = phys_pmd_update(pud, addr, end);
+				last_map_addr = phys_pmd_update(pud, addr, end,
+							 page_size_mask);
 			continue;
 		}
 
-		if (direct_gbpages) {
+		if (page_size_mask & (1<<PG_LEVEL_1G)) {
 			pages++;
 			set_pte((pte_t *)pud,
 				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
@@ -433,7 +437,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
 		pmd = alloc_low_page(&pmd_phys);
 
 		spin_lock(&init_mm.page_table_lock);
-		last_map_addr = phys_pmd_init(pmd, addr, end);
+		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
 		unmap_low_page(pmd);
 		pud_populate(&init_mm, pud, __va(pmd_phys));
 		spin_unlock(&init_mm.page_table_lock);
@@ -446,13 +450,14 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
 }
 
 static unsigned long __meminit
-phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end)
+phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
+		unsigned long page_size_mask)
 {
 	pud_t *pud;
 
 	pud = (pud_t *)pgd_page_vaddr(*pgd);
 
-	return phys_pud_init(pud, addr, end);
+	return phys_pud_init(pud, addr, end, page_size_mask);
 }
 
 static void __init find_early_table_space(unsigned long end)
@@ -608,29 +613,12 @@ static void __init early_memtest(unsigned long start, unsigned long end)
 }
 #endif
 
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+static unsigned long __init kernel_physical_mapping_init(unsigned long start,
+						unsigned long end,
+						unsigned long page_size_mask)
 {
-	unsigned long next, last_map_addr = end;
-	unsigned long start_phys = start, end_phys = end;
-
-	printk(KERN_INFO "init_memory_mapping\n");
-
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 *
-	 * Later we should allocate these tables in the local node of the
-	 * memory mapped. Unfortunately this is done currently before the
-	 * nodes are discovered.
-	 */
-	if (!after_bootmem) {
-		init_gbpages();
-		find_early_table_space(end);
-	}
+	unsigned long next, last_map_addr = end;
 
 	start = (unsigned long)__va(start);
 	end = (unsigned long)__va(end);
@@ -645,7 +633,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
 		next = end;
 
 		if (pgd_val(*pgd)) {
-			last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end));
+			last_map_addr = phys_pud_update(pgd, __pa(start),
+						 __pa(end), page_size_mask);
 			continue;
 		}
 
@@ -654,22 +643,61 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
 		else
 			pud = alloc_low_page(&pud_phys);
 
-		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
+		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+						 page_size_mask);
 		unmap_low_page(pud);
 		pgd_populate(&init_mm, pgd_offset_k(start), __va(pud_phys));
 	}
 
+	return last_map_addr;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory.
To access them they are temporarily mapped. + */ +unsigned long __init_refok init_memory_mapping(unsigned long start, + unsigned long end) +{ + unsigned long last_map_addr; + unsigned long page_size_mask = 0; + + printk(KERN_INFO "init_memory_mapping\n"); + + /* + * Find space for the kernel direct mapping tables. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. + */ + if (!after_bootmem) { + init_gbpages(); + find_early_table_space(end); + } + + if (direct_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; + if (cpu_has_pse) + page_size_mask |= 1 << PG_LEVEL_2M; + + last_map_addr = kernel_physical_mapping_init(start, end, + page_size_mask); + if (!after_bootmem) mmu_cr4_features = read_cr4(); __flush_tlb_all(); - if (!after_bootmem) + if (!after_bootmem && table_end > table_start) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); + printk(KERN_INFO "last_map_addr: %lx end: %lx\n", + last_map_addr, end); + if (!after_bootmem) - early_memtest(start_phys, end_phys); + early_memtest(start, end); return last_map_addr >> PAGE_SHIFT; } -- cgit v1.2.3 From 49c980df552499e5e8595b52448f612fdab0484a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 3 Jul 2008 12:29:34 -0700 Subject: x86: fix vmemmap printout check Signed-off-by: Yinghai Lu Cc: "Nick Piggin" Cc: "Mark McLoughlin" Cc: xen-devel Cc: "Eduardo Habkost" Cc: "Vegard Nossum" Cc: "Stephen Tweedie" Cc: "Jeremy Fitzhardinge" Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7227a0a3f3a..f7d80313ede 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1104,9 +1104,6 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); - addr_end = addr + PMD_SIZE; - p_end = p + PMD_SIZE; - /* check to see if we have contiguous blocks */ if (p_end != p || node_start != node) { if (p_start) @@ -1116,6 +1113,9 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) node_start = node; p_start = p; } + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; } else vmemmap_verify((pte_t *)pmd, node, addr, next); } -- cgit v1.2.3 From c2e6d65bcea2672788f9bb58ce7606c41388387b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 8 Jul 2008 01:43:27 -0700 Subject: x86: not overmap more than the end of RAM in init_memory_mapping - 64bit handle head and tail that are not aligned to big pages (2MB/1GB boundary). 
with this patch, on systems that support gbpages, this changes:

  last_map_addr: 1080000000 end: 1078000000

to:

  last_map_addr: 1078000000 end: 1078000000

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/init_64.c | 77 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 65 insertions(+), 12 deletions(-)
(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f7d80313ede..51f69b39b75 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -462,18 +462,25 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
 
 static void __init find_early_table_space(unsigned long end)
 {
-	unsigned long puds, tables, start;
+	unsigned long puds, pmds, ptes, tables, start;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
 	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
-	if (!direct_gbpages) {
-		unsigned long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-		tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
-	}
-	if (!cpu_has_pse) {
-		unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
-	}
+	if (direct_gbpages) {
+		unsigned long extra;
+		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+	} else
+		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+	tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+	if (cpu_has_pse) {
+		unsigned long extra;
+		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	} else
+		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
 
 	/*
 	 * RED-PEN putting page tables only on node 0 could
@@ -660,8 +667,9 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
 unsigned long __init_refok init_memory_mapping(unsigned long start,
 					       unsigned long end)
 {
-	unsigned long last_map_addr;
+	unsigned long last_map_addr = end;
 	unsigned long page_size_mask = 0;
+	unsigned long start_pfn, end_pfn;
 
 	printk(KERN_INFO "init_memory_mapping\n");
 
@@ -682,8 +690,53 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	if (cpu_has_pse)
 		page_size_mask |= 1 << PG_LEVEL_2M;
 
-	last_map_addr = kernel_physical_mapping_init(start, end,
-						 page_size_mask);
+	/* head if not big page alignment? */
+	start_pfn = start >> PAGE_SHIFT;
+	end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
+			<< (PMD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn)
+		last_map_addr = kernel_physical_mapping_init(
+				start_pfn<<PAGE_SHIFT,
+				end_pfn<<PAGE_SHIFT, 0);
+
+	/* big page (2M) range */
+	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+			 << (PMD_SHIFT - PAGE_SHIFT);
+	end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
+			 << (PUD_SHIFT - PAGE_SHIFT);
+	if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
+		end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
+	if (start_pfn < end_pfn)
+		last_map_addr = kernel_physical_mapping_init(
+				start_pfn<<PAGE_SHIFT,
+				end_pfn<<PAGE_SHIFT,
+				page_size_mask & (1<<PG_LEVEL_2M));
+
+	/* big page (1G) range */
+	start_pfn = end_pfn;
+	end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn)
+		last_map_addr = kernel_physical_mapping_init(
+				start_pfn<<PAGE_SHIFT,
+				end_pfn<<PAGE_SHIFT,
+				page_size_mask &
+				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+
+	/* tail is not big page (1G) alignment */
+	start_pfn = end_pfn;
+	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn)
+		last_map_addr = kernel_physical_mapping_init(
+				start_pfn<<PAGE_SHIFT,
+				end_pfn<<PAGE_SHIFT,
+				page_size_mask & (1<<PG_LEVEL_2M));
+
+	/* tail is not big page (2M) alignment */
+	start_pfn = end_pfn;
+	end_pfn = end>>PAGE_SHIFT;
+	if (start_pfn < end_pfn)
+		last_map_addr = kernel_physical_mapping_init(
+				start_pfn<<PAGE_SHIFT,
+				end_pfn<<PAGE_SHIFT, 0);
 
 	if (!after_bootmem)
 		mmu_cr4_features = read_cr4();
-- cgit v1.2.3

From: Yinghai Lu
Date: Wed, 9 Jul 2008 20:15:02 -0700
Subject: x86: overmapped fix when 4K pages on tail, 64-bit

fix phys_pmd_init to make sure not to return bigger value than end.

also print out range split:1G/2M/4K in init_memory_mapping().
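(Editor's illustration, not part of the patch: the same boundary arithmetic can be exercised as a stand-alone user-space program. The PAGE_SHIFT/PMD_SHIFT/PUD_SHIFT values below assume x86-64 4K/2M/1G page sizes, and the sample start/end values mirror the example above.)

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21	/* 2M pages */
#define PUD_SHIFT	30	/* 1G pages */
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PUD_SIZE	(1UL << PUD_SHIFT)

static void show(const char *what, unsigned long s_pfn, unsigned long e_pfn)
{
	/* like the kernel, skip a piece entirely when it is empty */
	if (s_pfn < e_pfn)
		printf("%s: %010lx - %010lx\n", what,
		       s_pfn << PAGE_SHIFT, e_pfn << PAGE_SHIFT);
}

int main(void)
{
	unsigned long start = 0x0UL, end = 0x1078000000UL;
	unsigned long start_pfn, end_pfn;

	/* 4K head up to the first 2M boundary */
	start_pfn = start >> PAGE_SHIFT;
	end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	show("4K head", start_pfn, end_pfn);

	/* 2M range up to the first 1G boundary, clamped to end */
	start_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	end_pfn = ((start + (PUD_SIZE - 1)) >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	if (end_pfn > ((end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT)))
		end_pfn = ((end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT));
	show("2M range", start_pfn, end_pfn);

	/* 1G range covering whole gigabyte pages */
	start_pfn = end_pfn;
	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	show("1G range", start_pfn, end_pfn);

	/* 2M tail below the last 1G boundary */
	start_pfn = end_pfn;
	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	show("2M tail", start_pfn, end_pfn);

	/* 4K tail down to end */
	start_pfn = end_pfn;
	end_pfn = end >> PAGE_SHIFT;
	show("4K tail", start_pfn, end_pfn);

	return 0;
}

For the sample values this prints only the 1G range, the 2M tail and nothing past 0x1078000000 - the mapping now stops exactly at end instead of overshooting to 0x1080000000.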
Signed-off-by: Yinghai Lu
Cc: Suresh Siddha
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/init_64.c | 106 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 72 insertions(+), 34 deletions(-)
(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 51f69b39b75..48548ef7ddf 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -302,11 +302,13 @@ static __meminit void unmap_low_page(void *adr)
 	early_iounmap(adr, PAGE_SIZE);
 }
 
-static void __meminit
+static unsigned long __meminit
 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
 {
 	unsigned pages = 0;
+	unsigned long last_map_addr = end;
 	int i;
+
 	pte_t *pte = pte_page + pte_index(addr);
 
 	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
@@ -326,17 +328,20 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
 			printk(" pte=%p addr=%lx pte=%016lx\n",
 			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
 		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
+		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
 		pages++;
 	}
 	update_page_count(PG_LEVEL_4K, pages);
+
+	return last_map_addr;
 }
 
-static void __meminit
+static unsigned long __meminit
 phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
 {
 	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
 
-	phys_pte_init(pte, address, end);
+	return phys_pte_init(pte, address, end);
 }
 
 static unsigned long __meminit
@@ -344,6 +349,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 	      unsigned long page_size_mask)
 {
 	unsigned long pages = 0;
+	unsigned long last_map_addr = end;
 
 	int i = pmd_index(address);
 
@@ -362,7 +368,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 
 		if (pmd_val(*pmd)) {
 			if (!pmd_large(*pmd))
-				phys_pte_update(pmd, address, end);
+				last_map_addr = phys_pte_update(pmd, address,
+								 end);
 			continue;
 		}
 
@@ -370,17 +377,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 			pages++;
 			set_pte((pte_t *)pmd,
 				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
 			continue;
 		}
 
 		pte = alloc_low_page(&pte_phys);
-		phys_pte_init(pte, address, end);
+		last_map_addr = phys_pte_init(pte, address, end);
 		unmap_low_page(pte);
 
 		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
 	}
 	update_page_count(PG_LEVEL_2M, pages);
-	return address;
+	return last_map_addr;
 }
 
 static unsigned long __meminit
@@ -659,6 +667,32 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
 	return last_map_addr;
 }
+
+struct map_range {
+	unsigned long start;
+	unsigned long end;
+	unsigned page_size_mask;
+};
+
+#define NR_RANGE_MR 5
+
+static int save_mr(struct map_range *mr, int nr_range,
+		   unsigned long start_pfn, unsigned long end_pfn,
+		   unsigned long page_size_mask)
+{
+
+	if (start_pfn < end_pfn) {
+		if (nr_range >= NR_RANGE_MR)
+			panic("run out of range for init_memory_mapping\n");
+		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+		mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+		mr[nr_range].page_size_mask = page_size_mask;
+		nr_range++;
+	}
+
+	return nr_range;
+}
 
 	/* head if not big page alignment? */
 	start_pfn = start >> PAGE_SHIFT;
 	end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
 			<< (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn)
-		last_map_addr = kernel_physical_mapping_init(
-				start_pfn<<PAGE_SHIFT,
-				end_pfn<<PAGE_SHIFT, 0);
+	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
 
 	/* big page (2M) range */
 	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
 			 << (PMD_SHIFT - PAGE_SHIFT);
 	end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
 			 << (PUD_SHIFT - PAGE_SHIFT);
 	if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
 		end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
-	if (start_pfn < end_pfn)
-		last_map_addr = kernel_physical_mapping_init(
-				start_pfn<<PAGE_SHIFT,
-				end_pfn<<PAGE_SHIFT,
-				page_size_mask & (1<<PG_LEVEL_2M));
+	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+			page_size_mask & (1<<PG_LEVEL_2M));
 
 	/* big page (1G) range */
 	start_pfn = end_pfn;
 	end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn)
-		last_map_addr = kernel_physical_mapping_init(
-				start_pfn<<PAGE_SHIFT,
-				end_pfn<<PAGE_SHIFT,
-				page_size_mask &
-				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+			page_size_mask & ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
 
 	/* tail is not big page (1G) alignment */
 	start_pfn = end_pfn;
 	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn)
-		last_map_addr = kernel_physical_mapping_init(
-				start_pfn<<PAGE_SHIFT,
-				end_pfn<<PAGE_SHIFT,
-				page_size_mask & (1<<PG_LEVEL_2M));
+	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+			page_size_mask & (1<<PG_LEVEL_2M));
 
 	/* tail is not big page (2M) alignment */
 	start_pfn = end_pfn;
 	end_pfn = end>>PAGE_SHIFT;
-	if (start_pfn < end_pfn)
+	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+				mr[i].start, mr[i].end,
+			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+	for (i = 0; i < nr_range; i++)
 		last_map_addr = kernel_physical_mapping_init(
-				start_pfn<<PAGE_SHIFT,
-				end_pfn<<PAGE_SHIFT, 0);
+					mr[i].start, mr[i].end,
+					mr[i].page_size_mask);
 
 	if (!after_bootmem)
 		mmu_cr4_features = read_cr4();
-- cgit v1.2.3

From: Yinghai Lu
Date: Thu, 10 Jul 2008 20:36:37 -0700
Subject: x86: reserve SLIT

save the SLIT, in case we are using fixmap to read it, and that fixmap
could be cleared by others.

Signed-off-by: Yinghai Lu
Cc: Suresh Siddha
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/srat_64.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)
(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 0fd67b81a8b..1b4763e26ea 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -100,7 +100,19 @@ static __init inline int srat_disabled(void)
 /* Callback for SLIT parsing */
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
-	acpi_slit = slit;
+	unsigned length;
+	unsigned long phys;
+
+	length = slit->header.length;
+	phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
+		 PAGE_SIZE);
+
+	if (phys == -1L)
+		panic(" Can not save slit!\n");
+
+	acpi_slit = __va(phys);
+	memcpy(acpi_slit, slit, length);
+	reserve_early(phys, phys + length, "ACPI SLIT");
 }
 
 /* Callback for Proximity Domain -> LAPIC mapping */
-- cgit v1.2.3

From f361a450bf1ad14e2b003217dbf3958638631265 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 10 Jul 2008 20:38:26 -0700
Subject: x86: introduce max_low_pfn_mapped for 64-bit

when more than 4g memory is installed, don't map the big hole below 4g.

Signed-off-by: Yinghai Lu
Cc: Suresh Siddha
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/init_32.c  |  1 +
 arch/x86/mm/init_64.c  |  1 +
 arch/x86/mm/pageattr.c | 19 +++++++++++++++++--
 arch/x86/mm/pat.c      |  3 ++-
 4 files changed, 21 insertions(+), 3 deletions(-)
(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index b5a0fd5f4c5..029e8cffca9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
+unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 48548ef7ddf..122bcef222f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -53,6 +53,7 @@
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
+unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
 
 static unsigned long dma_reserve __initdata;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index afd40054d15..0389cb8f6b1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -536,8 +536,14 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
 
 	if (address >= (unsigned long)__va(0) &&
+	    address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+		split_page_count(level);
+
+#ifdef CONFIG_X86_64
+	if (address >= (unsigned long)__va(1UL<<32) &&
 	    address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
 		split_page_count(level);
+#endif
 
 	/*
 	 * Install the new, split up pagetable.
Important details here:
@@ -655,12 +661,21 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	if (cpa->pfn > max_pfn_mapped)
 		return 0;
 
+#ifdef CONFIG_X86_64
+	if (cpa->pfn > max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
+		return 0;
+#endif
 	/*
 	 * No need to redo, when the primary call touched the direct
 	 * mapping already:
 	 */
-	if (!within(cpa->vaddr, PAGE_OFFSET,
-		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+	if (!(within(cpa->vaddr, PAGE_OFFSET,
+		    PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
+#ifdef CONFIG_X86_64
+		|| within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
+		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
+#endif
+	)) {
 		alias_cpa = *cpa;
 		alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
 
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index a885a1019b8..749766c3c5c 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -449,7 +449,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	if (retval < 0)
 		return 0;
 
-	if (pfn <= max_pfn_mapped &&
+	if (((pfn <= max_low_pfn_mapped) ||
+	     (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn <= max_pfn_mapped)) &&
 	    ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
 		free_memtype(offset, offset + size);
 		printk(KERN_INFO
-- cgit v1.2.3

From 965194c15dc9e4f3bc44432b39c441c86af7f11d Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sat, 12 Jul 2008 14:31:28 -0700
Subject: x86: max_low_pfn_mapped fix, #2

tighten the boundary checks around max_low_pfn_mapped - don't overmap
nor undermap into holes.

also print out tseg for AMD cpus, for diagnostic purposes.
(this is an SMM area, and we split up any big mappings around that area)

Signed-off-by: Yinghai Lu
Cc: Suresh Siddha
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/pageattr.c | 4 ++--
 arch/x86/mm/pat.c      | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 0389cb8f6b1..fb6f2ab40dd 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -658,11 +658,11 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	struct cpa_data alias_cpa;
 	int ret = 0;
 
-	if (cpa->pfn > max_pfn_mapped)
+	if (cpa->pfn >= max_pfn_mapped)
 		return 0;
 
 #ifdef CONFIG_X86_64
-	if (cpa->pfn > max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
+	if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
 		return 0;
 #endif
 	/*
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 749766c3c5c..d4585077977 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -449,8 +449,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	if (retval < 0)
 		return 0;
 
-	if (((pfn <= max_low_pfn_mapped) ||
-	     (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn <= max_pfn_mapped)) &&
+	if (((pfn < max_low_pfn_mapped) ||
+	     (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
 	    ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
 		free_memtype(offset, offset + size);
 		printk(KERN_INFO
-- cgit v1.2.3

From 9958e810f8ac92f8a447035ee6555420ba27b847 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sat, 12 Jul 2008 14:32:45 -0700
Subject: x86: max_low_pfn_mapped fix, #3

optimization: try to merge the range with same page size in
init_memory_mapping, to get the best possible linear mappings set up.

thus when GBpages is not there, we could do 2M pages.
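(Editor's illustration, not part of the commit: the loop below is the same merge algorithm run in user space on a hypothetical range list; the mask values are arbitrary tags standing in for the real PG_LEVEL_* bits.)

#include <stdio.h>
#include <string.h>

struct map_range {
	unsigned long start;
	unsigned long end;
	unsigned page_size_mask;
};

int main(void)
{
	/* hypothetical split result: a 4K head, two adjacent 2M pieces, a 4K tail */
	struct map_range mr[5] = {
		{ 0x0000000, 0x0200000, 0 },	/* 4K */
		{ 0x0200000, 0x8000000, 1 },	/* 2M */
		{ 0x8000000, 0xa000000, 1 },	/* 2M */
		{ 0xa000000, 0xa010000, 0 },	/* 4K */
	};
	int i, nr_range = 4;

	/* try to merge same page size and continuous, as in the patch */
	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
		unsigned long old_start;
		if (mr[i].end != mr[i+1].start ||
		    mr[i].page_size_mask != mr[i+1].page_size_mask)
			continue;
		/* move it */
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i+1],
			(nr_range - 1 - i) * sizeof(struct map_range));
		mr[i].start = old_start;
		nr_range--;
	}

	/* prints three ranges: the two 2M pieces have been joined */
	for (i = 0; i < nr_range; i++)
		printf(" %010lx - %010lx mask %u\n",
		       mr[i].start, mr[i].end, mr[i].page_size_mask);
	return 0;
}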
Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 122bcef222f..a25cc6fa220 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -763,6 +763,20 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, end_pfn = end>>PAGE_SHIFT; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof (struct map_range)); + mr[i].start = old_start; + nr_range--; + } + for (i = 0; i < nr_range; i++) printk(KERN_DEBUG " %010lx - %010lx page %s\n", mr[i].start, mr[i].end, -- cgit v1.2.3
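(Closing illustration by the editor, not from any commit above: the low/high direct-mapping windows that the max_low_pfn_mapped patches establish can be captured as a small predicate. The pfn limits below are hypothetical sample values - 2GB of low RAM and 6GB total, with a hole below 4GB.)

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12

/* hypothetical values: low RAM mapped up to 2GB, high RAM mapped up to 6GB */
static unsigned long max_low_pfn_mapped = 0x80000000UL >> PAGE_SHIFT;
static unsigned long max_pfn_mapped = 0x180000000UL >> PAGE_SHIFT;

/* same shape as the pat.c check: below 4G only up to max_low_pfn_mapped,
 * at or above 4G up to max_pfn_mapped */
static bool pfn_is_direct_mapped(unsigned long pfn)
{
	return (pfn < max_low_pfn_mapped) ||
	       (pfn >= (1UL << (32 - PAGE_SHIFT)) && pfn < max_pfn_mapped);
}

int main(void)
{
	printf("1G: %d\n", pfn_is_direct_mapped(0x40000000UL >> PAGE_SHIFT));  /* 1: low RAM */
	printf("3G: %d\n", pfn_is_direct_mapped(0xc0000000UL >> PAGE_SHIFT));  /* 0: hole below 4G */
	printf("5G: %d\n", pfn_is_direct_mapped(0x140000000UL >> PAGE_SHIFT)); /* 1: high RAM */
	return 0;
}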