From 3eefae994d9224fb7771a3ddb683868363c23510 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Mon, 12 May 2008 21:21:04 +0200
Subject: ftrace: limit trace entries

Currently there is no protection from the root user using up all of
memory for trace buffers. If the root user allocates too many entries,
the OOM killer might start killing off all tasks.

This patch adds an algorithm to check the following condition:

  pages_requested > (freeable_memory + current_trace_buffer_pages) / 4

If the above condition is met, the allocation fails. This prevents more
than 1/4th of freeable memory from being used by trace buffers.

To determine the freeable_memory, I made determine_dirtyable_memory in
mm/page-writeback.c global.

Special thanks go to Peter Zijlstra for suggesting the above
calculation.

Signed-off-by: Steven Rostedt
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 mm/page-writeback.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6adbef3..b38f700825f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -126,8 +126,6 @@ static void background_writeout(unsigned long _min_pages);
 static struct prop_descriptor vm_completions;
 static struct prop_descriptor vm_dirties;
 
-static unsigned long determine_dirtyable_memory(void);
-
 /*
  * couple the period to the dirty_ratio:
  *
@@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 #endif
 }
 
-static unsigned long determine_dirtyable_memory(void)
+/**
+ * determine_dirtyable_memory - amount of memory that may be used
+ *
+ * Returns the number of pages that can currently be freed and used
+ * by the kernel for direct mappings.
+ */
+unsigned long determine_dirtyable_memory(void)
 {
        unsigned long x;
-- cgit v1.2.3
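A hedged sketch of the check described in the entry above (the helper
name and call site are invented for illustration; only
determine_dirtyable_memory() comes from the patch itself):

        /*
         * Illustrative only: refuse a trace-buffer allocation that would
         * push the buffers past 1/4 of currently freeable memory.
         */
        static int trace_buffer_alloc_allowed(unsigned long pages_requested,
                                              unsigned long current_buffer_pages)
        {
                unsigned long freeable = determine_dirtyable_memory();

                if (pages_requested > (freeable + current_buffer_pages) / 4)
                        return 0;       /* would exceed the 1/4 limit: fail */
                return 1;               /* within the limit: allow */
        }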
From e8c27ac9191ab9e6506ae5cbe70d87ac50f8e960 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sun, 1 Jun 2008 13:15:22 -0700
Subject: x86, numa, 32-bit: print out debug info on all kvas

Also fix the printout of node_remap_end_vaddr.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 mm/page_alloc.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63835579323..502223c3c2c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3486,6 +3486,11 @@ void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat,
        calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
        alloc_node_mem_map(pgdat);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+       printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
+               nid, (unsigned long)pgdat,
+               (unsigned long)pgdat->node_mem_map);
+#endif
 
        free_area_init_core(pgdat, zones_size, zholes_size);
 }
-- cgit v1.2.3

From cc1a9d86ce989083703c4bdc11b75a87e1cc404a Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sun, 8 Jun 2008 19:39:16 -0700
Subject: mm, x86: shrink_active_range() should check all

Now that we use register_e820_active_regions() instead of calling
add_active_range() directly, the end_pfn recorded in early_node_map can
differ from node_end_pfn. So we need to make shrink_active_range()
smarter.

shrink_active_range() is a generic MM function in mm/page_alloc.c but
it is only used on 32-bit x86. Should we move it back to some file in
arch/x86?

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 502223c3c2c..21540868407 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3579,25 +3579,49 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 /**
  * shrink_active_range - Shrink an existing registered range of PFNs
  * @nid: The node id the range is on that should be shrunk
- * @old_end_pfn: The old end PFN of the range
  * @new_end_pfn: The new PFN of the range
  *
  * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- * The map is kept at the end physical page range that has already been
- * registered with add_active_range(). This function allows an arch to shrink
- * an existing registered range.
+ * The map is kept near the end physical page range that has already been
+ * registered. This function allows an arch to shrink an existing registered
+ * range.
  */
-void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
-                               unsigned long new_end_pfn)
+void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn)
 {
-       int i;
+       int i, j;
+       int removed = 0;
 
        /* Find the old active region end and shrink */
-       for_each_active_range_index_in_nid(i, nid)
-               if (early_node_map[i].end_pfn == old_end_pfn) {
+       for_each_active_range_index_in_nid(i, nid) {
+               if (early_node_map[i].start_pfn >= new_end_pfn) {
+                       /* clear it */
+                       early_node_map[i].end_pfn = 0;
+                       removed = 1;
+                       continue;
+               }
+               if (early_node_map[i].end_pfn > new_end_pfn) {
                        early_node_map[i].end_pfn = new_end_pfn;
-                       break;
+                       continue;
                }
+       }
+
+       if (!removed)
+               return;
+
+       /* remove the blank ones */
+       for (i = nr_nodemap_entries - 1; i > 0; i--) {
+               if (early_node_map[i].nid != nid)
+                       continue;
+               if (early_node_map[i].end_pfn)
+                       continue;
+               /* we found it, get rid of it */
+               for (j = i; j < nr_nodemap_entries - 1; j++)
+                       memcpy(&early_node_map[j], &early_node_map[j+1],
+                               sizeof(early_node_map[j]));
+               j = nr_nodemap_entries - 1;
+               memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
+               nr_nodemap_entries--;
+       }
 }
 
 /**
-- cgit v1.2.3

From 1ea0704e0da65b2b46f9142ff1391163aac24060 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Mon, 16 Jun 2008 04:30:00 -0700
Subject: mm: add a ptep_modify_prot transaction abstraction

This patch adds an API for doing read-modify-write updates to a pte's
protection bits which may race against hardware updates to the pte.
After reading the pte, the hardware may asynchronously set the accessed
or dirty bits on a pte, which would be lost when writing back the
modified pte value.

The existing technique to handle this race is to use
ptep_get_and_clear() to atomically fetch the old pte value and clear it
in memory. This has the effect of marking the pte as non-present, which
will prevent the hardware from updating its state. When the new value
is written back, the pte will be present again, and the hardware can
resume updating the access/dirty flags.

When running in a virtualized environment, pagetable updates are
relatively expensive, since they generally involve some trap into the
hypervisor. To mitigate the cost of these updates, we tend to batch
them. However, because of the atomic nature of ptep_get_and_clear(), it
is inherently non-batchable.
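For illustration, a hedged sketch (simplified, not the exact kernel
code) of the non-batchable sequence just described:

        pte_t ptent;

        /* Wipe the pte: it is now non-present, so the hardware
         * cannot asynchronously set accessed/dirty bits on it. */
        ptent = ptep_get_and_clear(mm, addr, pte);

        /* Modify the protection bits at leisure... */
        ptent = pte_modify(ptent, newprot);

        /* ...then write the result back; the pte is present again
         * and hardware updates can resume. */
        set_pte_at(mm, addr, pte, ptent);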
This new interface allows batching by giving the underlying
implementation enough information to open a transaction between the
read and write phases:

ptep_modify_prot_start() returns the current pte value, and puts the
pte entry into a state where either the hardware will not update the
pte, or, if it does, the updates will be preserved on commit.

ptep_modify_prot_commit() writes back the updated pte, and makes sure
that any hardware updates made since ptep_modify_prot_start() are
preserved.

ptep_modify_prot_start() and _commit() must be exactly paired, and used
while holding the appropriate pte lock. They do not protect against
other software updates of the pte in any way.

The current implementations of ptep_modify_prot_start and _commit are
functionally unchanged from before: _start() uses ptep_get_and_clear()
to fetch the pte and zero the entry, preventing any hardware updates.
_commit() simply writes the new pte value back, knowing that the
hardware has not updated the pte in the meantime.

The only current user of this interface is mprotect().

Signed-off-by: Jeremy Fitzhardinge
Acked-by: Linus Torvalds
Acked-by: Hugh Dickins
Signed-off-by: Ingo Molnar
---
 mm/mprotect.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/mprotect.c b/mm/mprotect.c
index a5bf31c2737..acfe7c8d72f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -47,19 +47,17 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
                if (pte_present(oldpte)) {
                        pte_t ptent;
 
-                       /* Avoid an SMP race with hardware updated dirty/clean
-                        * bits by wiping the pte and then setting the new pte
-                        * into place.
-                        */
-                       ptent = ptep_get_and_clear(mm, addr, pte);
+                       ptent = ptep_modify_prot_start(mm, addr, pte);
                        ptent = pte_modify(ptent, newprot);
+
                        /*
                         * Avoid taking write faults for pages we know to be
                         * dirty.
                         */
                        if (dirty_accountable && pte_dirty(ptent))
                                ptent = pte_mkwrite(ptent);
-                       set_pte_at(mm, addr, pte, ptent);
+
+                       ptep_modify_prot_commit(mm, addr, pte, ptent);
 #ifdef CONFIG_MIGRATION
                } else if (!pte_file(oldpte)) {
                        swp_entry_t entry = pte_to_swp_entry(oldpte);
-- cgit v1.2.3

From 15c8b6c1aaaf1c4edd67e2f02e4d8e1bd1a51c0d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 9 May 2008 09:39:44 +0200
Subject: on_each_cpu(): kill unused 'retry' parameter

It's not even passed on to smp_call_function() anymore, since that was
removed. So kill it.

Acked-by: Jeremy Fitzhardinge
Reviewed-by: Paul E. McKenney
Signed-off-by: Jens Axboe
---
 mm/page_alloc.c | 2 +-
 mm/slab.c       | 4 ++--
 mm/slub.c       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2f552955a02..53242344a77 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -918,7 +918,7 @@ void drain_local_pages(void *arg)
  */
 void drain_all_pages(void)
 {
-       on_each_cpu(drain_local_pages, NULL, 0, 1);
+       on_each_cpu(drain_local_pages, NULL, 1);
 }
 
 #ifdef CONFIG_HIBERNATION
diff --git a/mm/slab.c b/mm/slab.c
index 046607f05f3..0772abb412b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2454,7 +2454,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
        struct kmem_list3 *l3;
        int node;
 
-       on_each_cpu(do_drain, cachep, 1, 1);
+       on_each_cpu(do_drain, cachep, 1);
        check_irq_on();
        for_each_online_node(node) {
                l3 = cachep->nodelists[node];
@@ -3939,7 +3939,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
        }
        new->cachep = cachep;
 
-       on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
+       on_each_cpu(do_ccupdate_local, (void *)new, 1);
        check_irq_on();
 
        cachep->batchcount = batchcount;
diff --git a/mm/slub.c b/mm/slub.c
index 0987d1cd943..44715eb70c0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1497,7 +1497,7 @@ static void flush_cpu_slab(void *d)
 static void flush_all(struct kmem_cache *s)
 {
 #ifdef CONFIG_SMP
-       on_each_cpu(flush_cpu_slab, s, 1, 1);
+       on_each_cpu(flush_cpu_slab, s, 1);
 #else
        unsigned long flags;
-- cgit v1.2.3

From 38510754a50192a072210e24fdc4ae65592182f0 Mon Sep 17 00:00:00 2001
From: Haavard Skinnemoen
Date: Mon, 14 Jan 2008 23:35:32 +0100
Subject: avr32: Use a quicklist for PTE allocation as well

Using a quicklist to allocate PTEs might be slightly faster than using
the page allocator directly since we might avoid zeroing the page after
each allocation.

Signed-off-by: Haavard Skinnemoen
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa819d628c..60b26195278 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -199,7 +199,7 @@ config BOUNCE
 config NR_QUICK
        int
        depends on QUICKLIST
-       default "2" if SUPERH
+       default "2" if SUPERH || AVR32
        default "1"
 
 config VIRT_TO_BUS
-- cgit v1.2.3

From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Fri, 13 Jun 2008 19:08:52 -0700
Subject: x86: replace shrink_active_range() with remove_active_range()

In case we have kva before the ramdisk on a node, we still need to use
those ranges.

v2: reserve_early the kva RAM area, in case there are holes in highmem,
so that those areas are not treated as free high pages.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 mm/page_alloc.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eee5ba7509c..d80e1868e57 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3552,30 +3552,47 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 }
 
 /**
- * shrink_active_range - Shrink an existing registered range of PFNs
+ * remove_active_range - Shrink an existing registered range of PFNs
  * @nid: The node id the range is on that should be shrunk
- * @new_end_pfn: The new PFN of the range
+ * @start_pfn: The start PFN of the range to remove
+ * @end_pfn: The end PFN of the range to remove
  *
  * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
  * The map is kept near the end physical page range that has already been
This function allows an arch to shrink an existing registered * range. */ -void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) { int i, j; int removed = 0; + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + /* Find the old active region end and shrink */ for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= new_end_pfn) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { /* clear it */ + early_node_map[i].start_pfn = 0; early_node_map[i].end_pfn = 0; removed = 1; continue; } - if (early_node_map[i].end_pfn > new_end_pfn) { - early_node_map[i].end_pfn = new_end_pfn; + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; continue; } } -- cgit v1.2.3 From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() in the big loop with add_one_highpage also remove page_is_reserved_early(), it is not needed anymore. v2: fix the build of other platforms Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d80e1868e57..41c6e3aa059 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid, } } +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, + data); +} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.2.3 From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 20:10:55 -0700 Subject: RFC x86: try to remove arch_get_ram_range want to remove arch_get_ram_range, and use early_node_map instead. 
From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Mon, 16 Jun 2008 20:10:55 -0700
Subject: RFC x86: try to remove arch_get_ram_range

We want to remove arch_get_ram_range and use early_node_map instead.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 mm/page_alloc.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 41c6e3aa059..e25b6b24f84 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2932,10 +2932,14 @@ void __init free_bootmem_with_active_regions(int nid,
 void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
 {
        int i;
+       int ret;
 
-       for_each_active_range_index_in_nid(i, nid)
-               work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn,
-                       data);
+       for_each_active_range_index_in_nid(i, nid) {
+               ret = work_fn(early_node_map[i].start_pfn,
+                             early_node_map[i].end_pfn, data);
+               if (ret)
+                       break;
+       }
 }
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
-- cgit v1.2.3

From e2fc252e0ce695b4c4abe27bb073c35bd0d73252 Mon Sep 17 00:00:00 2001
From: Paul Jackson
Date: Sun, 22 Jun 2008 07:22:12 -0700
Subject: x86 boot: show pfn addresses in hex not decimal in some kernel info printks

Page frame numbers (the portion of physical addresses above the low
order page offsets) are displayed in several kernel debug and info
prints in decimal, not hex. Decimal addresses are unreadable. Use hex.

Signed-off-by: Paul Jackson
Cc: "Yinghai Lu"
Cc: "Jack Steiner"
Cc: "Mike Travis"
Cc: "Huang, Ying"
Cc: "Andi Kleen"
Cc: "Andrew Morton"
Cc: Paul Jackson
Signed-off-by: Ingo Molnar
---
 mm/page_alloc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e25b6b24f84..c09c2c8d2c6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 {
        int i;
 
-       printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
+       printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) "
                          "%d entries of %d used\n",
                          nid, start_pfn, end_pfn,
                          nr_nodemap_entries, MAX_ACTIVE_REGIONS);
@@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (i == ZONE_MOVABLE)
                        continue;
-               printk(" %-8s %8lu -> %8lu\n",
+               printk(" %-8s 0x%8lx -> 0x%8lx\n",
                                zone_names[i],
                                arch_zone_lowest_possible_pfn[i],
                                arch_zone_highest_possible_pfn[i]);
@@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
        /* Print out the early_node_map[] */
        printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
        for (i = 0; i < nr_nodemap_entries; i++)
-               printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
+               printk(" %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid,
                                               early_node_map[i].start_pfn,
                                               early_node_map[i].end_pfn);
-- cgit v1.2.3

From 2bc0d2615a15a93d344abbe8cb1b9056122bce9d Mon Sep 17 00:00:00 2001
From: Paul Jackson
Date: Sun, 22 Jun 2008 07:22:17 -0700
Subject: x86 boot: more consistently use type int for node ids

Everywhere I look, node ids are of type 'int', except in this one case,
which has 'unsigned long'. Change this one to 'int' as well.

There is nothing special about the way this variable 'nid' is used in
this routine to justify using an unusual type here.
Signed-off-by: Paul Jackson
Cc: "Yinghai Lu"
Cc: "Jack Steiner"
Cc: "Mike Travis"
Cc: "Huang, Ying"
Cc: "Andi Kleen"
Cc: "Andrew Morton"
Cc: Paul Jackson
Signed-off-by: Ingo Molnar
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c09c2c8d2c6..b604c64a033 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3669,7 +3669,7 @@ static void __init sort_node_map(void)
 }
 
 /* Find the lowest pfn for a node */
-unsigned long __init find_min_pfn_for_node(unsigned long nid)
+unsigned long __init find_min_pfn_for_node(int nid)
 {
        int i;
        unsigned long min_pfn = ULONG_MAX;
@@ -3680,7 +3680,7 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid)
 
        if (min_pfn == ULONG_MAX) {
                printk(KERN_WARNING
-                       "Could not find start_pfn for node %lu\n", nid);
+                       "Could not find start_pfn for node %d\n", nid);
                return 0;
        }
-- cgit v1.2.3

From 5dab8ec139be215fbaba216fb4aea914d0f4dac5 Mon Sep 17 00:00:00 2001
From: Paul Jackson
Date: Wed, 25 Jun 2008 05:44:40 -0700
Subject: mm, generic, x86 boot: more tweaks to hex prints of some pfn addresses

Fix some problems with (and applies on top of) a previous patch:

  x86 boot: show pfn addresses in hex not decimal in some kernel info printks

Primarily change the "0x%8lx" format, which displays a right-aligned,
space-filled hex number (with spaces between the "0x" prefix and the
number), into the "%0#10lx" format, which zero fills instead of space
fills, and which uses the printf flag '#' to request the "0x" prefix
instead of hard coding it.

Also replace some other "0x%lx" formats with "%#lx", making use of the
'#' printf flag again.

Signed-off-by: Paul Jackson
Cc: "Yinghai Lu"
Cc: "Jack Steiner"
Cc: "Mike Travis"
Cc: "Huang, Ying"
Cc: "Andi Kleen"
Cc: "Andrew Morton"
Cc: Paul Jackson
Signed-off-by: Ingo Molnar
---
 mm/page_alloc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b604c64a033..f024b9b3a2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 {
        int i;
 
-       printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) "
+       printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) "
                          "%d entries of %d used\n",
                          nid, start_pfn, end_pfn,
                          nr_nodemap_entries, MAX_ACTIVE_REGIONS);
@@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (i == ZONE_MOVABLE)
                        continue;
-               printk(" %-8s 0x%8lx -> 0x%8lx\n",
+               printk(" %-8s %0#10lx -> %0#10lx\n",
                                zone_names[i],
                                arch_zone_lowest_possible_pfn[i],
                                arch_zone_highest_possible_pfn[i]);
@@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
        /* Print out the early_node_map[] */
        printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
        for (i = 0; i < nr_nodemap_entries; i++)
-               printk(" %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid,
+               printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
                                               early_node_map[i].start_pfn,
                                               early_node_map[i].end_pfn);
-- cgit v1.2.3

From bdb21928512a860a60e6a24a849dc5b63cbaf96a Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Thu, 10 Jul 2008 22:21:58 +0200
Subject: slub: Fix use-after-preempt of per-CPU data structure

Vegard Nossum reported a crash in kmem_cache_alloc():

  BUG: unable to handle kernel paging request at da87d000
  IP: [] kmem_cache_alloc+0xc7/0xe0
  *pde = 28180163 *pte = 1a87d160
  Oops: 0002 [#1] PREEMPT SMP
  DEBUG_PAGEALLOC
  Pid: 3850, comm: grep Not tainted (2.6.26-rc9-00059-gb190333 #5)
  EIP: 0060:[] EFLAGS: 00210203 CPU: 0
  EIP is at kmem_cache_alloc+0xc7/0xe0
  EAX: 00000000 EBX: da87c100 ECX: 1adad71a EDX: 6b6b6b6b
  ESI: 00200282 EDI: da87d000 EBP: f60bfe74 ESP: f60bfe54
  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068

and analyzed it:

"The register %ecx looks innocent but is very important here. The
disassembly:

  mov    %edx,%ecx
  shr    $0x2,%ecx
  rep stos %eax,%es:(%edi)   <-- the fault

So %ecx has been loaded from %edx... which is 0x6b6b6b6b/POISON_FREE.
(0x6b6b6b6b >> 2 == 0x1adadada.)

%ecx is the counter for the memset, from here:

  memset(object, 0, c->objsize);

i.e. %ecx was loaded from c->objsize, so "c" must have been freed.
Where did "c" come from? Uh-oh...

  c = get_cpu_slab(s, smp_processor_id());

This looks like it has very much to do with CPU hotplug/unplug. Is
there a race between SLUB/hotplug since the CPU slab is used after it
has been freed?"

Good analysis. Yeah, it's possible that a caller of kmem_cache_alloc()
-> slab_alloc() can be migrated to another CPU right after
local_irq_restore() and before memset(). The initial CPU can go offline
in the meantime (or a migration is a consequence of the CPU going
offline), so its 'kmem_cache_cpu' structure gets freed
(slab_cpuup_callback). At some point the caller continues on another
CPU, holding an obsolete pointer...

Signed-off-by: Dmitry Adamushko
Reported-by: Vegard Nossum
Acked-by: Ingo Molnar
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds
---
 mm/slub.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 1a427c0ae83..315c392253c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1628,9 +1628,11 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
        void **object;
        struct kmem_cache_cpu *c;
        unsigned long flags;
+       unsigned int objsize;
 
        local_irq_save(flags);
        c = get_cpu_slab(s, smp_processor_id());
+       objsize = c->objsize;
        if (unlikely(!c->freelist || !node_match(c, node)))
 
                object = __slab_alloc(s, gfpflags, node, addr, c);
 
@@ -1643,7 +1645,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
        local_irq_restore(flags);
 
        if (unlikely((gfpflags & __GFP_ZERO) && object))
-               memset(object, 0, c->objsize);
+               memset(object, 0, objsize);
 
        return object;
 }
-- cgit v1.2.3

From f4c0a0fdfae708f7aa438c27a380ed4071294e11 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Fri, 11 Jul 2008 19:27:31 -0400
Subject: vfs: export filemap_fdatawrite_range()

Make the filemap_fdatawrite_range() function public, so that it can
later be used in the ordered mode rewrite by JBD/JBD2.
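A hedged usage sketch (illustrative fragment, not part of the patch; a
filesystem could then flush a dirty byte range of an inode like this):

        /* Illustrative only: queue WB_SYNC_ALL writeback for the byte
         * range [start, end] of an inode's mapping. */
        err = filemap_fdatawrite_range(inode->i_mapping, start, end);
        if (err)
                goto out_err;   /* propagate the writeback error */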
Signed-off-by: Jan Kara
---
 mm/filemap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6a7d34874..65d9d9e2b75 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_fdatawrite);
 
-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                                loff_t end)
 {
        return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 }
+EXPORT_SYMBOL(filemap_fdatawrite_range);
 
 /**
  * filemap_flush - mostly a non-blocking flush
-- cgit v1.2.3

From 06d6cf6959d22037fcec598f4f954db5db3d7356 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Fri, 11 Jul 2008 19:27:31 -0400
Subject: mm: Add range_cont mode for writeback

Filesystems like ext4 need to start a new transaction in writepages for
block allocation. This happens with delayed allocation, and there is a
limit to how many credits we can request from the journal layer. So we
call write_cache_pages multiple times with wbc->nr_to_write set to the
maximum possible value limited by the max journal credits available.

Add a new mode to writeback that enables us to handle this behaviour.
In the new mode we update wbc->range_start to point to the new offset
to be written. The next call to write_cache_pages will start writeout
from the specified range_start offset. In the new mode we also limit
writing to the specified wbc->range_end.

Signed-off-by: Aneesh Kumar K.V
Signed-off-by: Mingming Cao
Acked-by: Jan Kara
Signed-off-by: "Theodore Ts'o"
---
 mm/page-writeback.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 789b6adbef3..ded57d52806 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -956,6 +956,9 @@ retry:
        }
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                mapping->writeback_index = index;
+
+       if (wbc->range_cont)
+               wbc->range_start = index << PAGE_CACHE_SHIFT;
        return ret;
 }
 EXPORT_SYMBOL(write_cache_pages);
-- cgit v1.2.3

From 421c175c4d609864350df495b34d3e99f9fb1bdd Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Mon, 14 Jul 2008 09:59:18 +0200
Subject: [S390] Add support for memory hot-add.

Cc: Gerald Schaefer
Signed-off-by: Heiko Carstens
Signed-off-by: Martin Schwidefsky
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa819d628c..4242743b981 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -129,7 +129,7 @@ config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
        depends on SPARSEMEM || X86_64_ACPI_NUMA
        depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG
-       depends on (IA64 || X86 || PPC64 || SUPERH)
+       depends on (IA64 || X86 || PPC64 || SUPERH || S390)
 
 comment "Memory hotplug is currently incompatible with Software Suspend"
        depends on SPARSEMEM && HOTPLUG && HIBERNATION
-- cgit v1.2.3

From 7daf705f362e349983e92037a198b8821db198af Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 14 Jul 2008 12:12:53 -0700
Subject: Start using the new '%pS' infrastructure to print symbols

This simplifies the code significantly, and was the whole point of the
exercise.
Signed-off-by: Linus Torvalds
---
 mm/slub.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 315c392253c..5f6e2c4a2ba 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -431,9 +431,8 @@ static void print_track(const char *s, struct track *t)
        if (!t->addr)
                return;
 
-       printk(KERN_ERR "INFO: %s in ", s);
-       __print_symbol("%s", (unsigned long)t->addr);
-       printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
+       printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+               s, t->addr, jiffies - t->when, t->cpu, t->pid);
 }
 
 static void print_tracking(struct kmem_cache *s, void *object)
-- cgit v1.2.3

From 0937502af7c9b648ed4e884ccb7f504b01a005a1 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Wed, 28 May 2008 10:32:22 -0700
Subject: slub: Add check for kfree() of non-slab objects.

We can detect kfree()s on non-slab objects by checking for
PageCompound(). This works in the same way as for ksize. It helped me
catch an invalid kfree().

Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 5f6e2c4a2ba..b3f2e713cdf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2766,6 +2766,7 @@ void kfree(const void *x)
 
        page = virt_to_head_page(x);
        if (unlikely(!PageSlab(page))) {
+               BUG_ON(!PageCompound(page));
                put_page(page);
                return;
        }
-- cgit v1.2.3

From 88e4ccf294ca62c2da998012a83533ce150c8dce Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Mon, 23 Jun 2008 02:58:37 +0400
Subject: slub: current is always valid

Acked-by: Christoph Lameter
Signed-off-by: Alexey Dobriyan
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index b3f2e713cdf..488400d1070 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -411,7 +411,7 @@ static void set_track(struct kmem_cache *s, void *object,
        if (addr) {
                p->addr = addr;
                p->cpu = smp_processor_id();
-               p->pid = current ? current->pid : -1;
+               p->pid = current->pid;
                p->when = jiffies;
        } else
                memset(p, 0, sizeof(struct track));
-- cgit v1.2.3

From e79aec291da55aa322ddb5d8f3bb04cdf69470d5 Mon Sep 17 00:00:00 2001
From: Rabin Vincent
Date: Fri, 4 Jul 2008 00:40:32 +0530
Subject: slab: rename slab_destroy_objs

With the removal of destructors, slab_destroy_objs no longer actually
destroys any objects, making the kernel doc incorrect and the function
name misleading. In keeping with the other debug functions, rename it
to slab_destroy_debugcheck and drop the kernel doc.

Signed-off-by: Rabin Vincent
Signed-off-by: Pekka Enberg
---
 mm/slab.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 046607f05f3..b4aa4c88250 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1901,15 +1901,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 #endif
 
 #if DEBUG
-/**
- * slab_destroy_objs - destroy a slab and its objects
- * @cachep: cache pointer being destroyed
- * @slabp: slab pointer being destroyed
- *
- * Call the registered destructor for each object in a slab that is being
- * destroyed.
- */
-static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
 {
        int i;
        for (i = 0; i < cachep->num; i++) {
@@ -1938,7 +1930,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
        }
 }
 #else
-static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
 {
 }
 #endif
@@ -1956,7 +1948,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
 {
        void *addr = slabp->s_mem - slabp->colouroff;
 
-       slab_destroy_objs(cachep, slabp);
+       slab_destroy_debugcheck(cachep, slabp);
        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
                struct slab_rcu *slab_rcu;
-- cgit v1.2.3