From 454ed842d55740160334efc9ad56cfef54ed37bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 09:30:25 +0200 Subject: lockdep: annotate mm_take_all_locks() The nesting is correct due to holding mmap_sem, use the new annotation to annotate this. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- mm/mmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 245c3d69067..5d09d08a412 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2273,14 +2273,14 @@ int install_special_mapping(struct mm_struct *mm, static DEFINE_MUTEX(mm_all_locks_mutex); -static void vm_lock_anon_vma(struct anon_vma *anon_vma) +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - spin_lock(&anon_vma->lock); + spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); /* * We can safely modify head.next after taking the * anon_vma->lock. If some other vma in this mm shares @@ -2296,7 +2296,7 @@ static void vm_lock_anon_vma(struct anon_vma *anon_vma) } } -static void vm_lock_mapping(struct address_space *mapping) +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* @@ -2310,7 +2310,7 @@ static void vm_lock_mapping(struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - spin_lock(&mapping->i_mmap_lock); + spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); } } @@ -2359,9 +2359,9 @@ int mm_take_all_locks(struct mm_struct *mm) if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) - vm_lock_anon_vma(vma->anon_vma); + vm_lock_anon_vma(mm, vma->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) - vm_lock_mapping(vma->vm_file->f_mapping); + vm_lock_mapping(mm, vma->vm_file->f_mapping); } ret = 0; -- cgit v1.2.3 From 7cd5a02f54f4c9d16cf7fdffa2122bc73bb09b43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 09:30:25 +0200 Subject: mm: fix mm_take_all_locks() locking order Lockdep spotted: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.27-rc1 #270 ------------------------------------------------------- qemu-kvm/2033 is trying to acquire lock: (&inode->i_data.i_mmap_lock){----}, at: [] mm_take_all_locks+0xc2/0xea but task is already holding lock: (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (&anon_vma->lock){----}: [] __lock_acquire+0x11be/0x14d2 [] lock_acquire+0x5e/0x7a [] _spin_lock+0x3b/0x47 [] vma_adjust+0x200/0x444 [] split_vma+0x12f/0x146 [] mprotect_fixup+0x13c/0x536 [] sys_mprotect+0x1a9/0x21e [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff -> #0 (&inode->i_data.i_mmap_lock){----}: [] __lock_acquire+0xedb/0x14d2 [] lock_release_non_nested+0x1c2/0x219 [] lock_release+0x127/0x14a [] _spin_unlock+0x1e/0x50 [] mm_drop_all_locks+0x7f/0xb0 [] do_mmu_notifier_register+0xe2/0x112 [] mmu_notifier_register+0xe/0x10 [] kvm_dev_ioctl+0x11e/0x287 [kvm] [] vfs_ioctl+0x2a/0x78 [] do_vfs_ioctl+0x257/0x274 [] sys_ioctl+0x55/0x78 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff other info that might help us debug this: 5 locks held by qemu-kvm/2033: #0: (&mm->mmap_sem){----}, at: [] do_mmu_notifier_register+0x55/0x112 #1: (mm_all_locks_mutex){--..}, at: [] mm_take_all_locks+0x34/0xea #2: (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea #3: (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea #4: (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea stack backtrace: Pid: 2033, comm: qemu-kvm Not tainted 2.6.27-rc1 #270 Call Trace: [] print_circular_bug_tail+0xb8/0xc3 [] __lock_acquire+0xedb/0x14d2 [] ? add_lock_to_list+0x7e/0xad [] ? mm_take_all_locks+0x70/0xea [] ? mm_take_all_locks+0x70/0xea [] lock_release_non_nested+0x1c2/0x219 [] ? mm_take_all_locks+0xc2/0xea [] ? mm_take_all_locks+0xc2/0xea [] ? trace_hardirqs_on_caller+0x4d/0x115 [] ? mm_drop_all_locks+0x7f/0xb0 [] lock_release+0x127/0x14a [] _spin_unlock+0x1e/0x50 [] mm_drop_all_locks+0x7f/0xb0 [] do_mmu_notifier_register+0xe2/0x112 [] mmu_notifier_register+0xe/0x10 [] kvm_dev_ioctl+0x11e/0x287 [kvm] [] ? file_has_perm+0x83/0x8e [] vfs_ioctl+0x2a/0x78 [] do_vfs_ioctl+0x257/0x274 [] sys_ioctl+0x55/0x78 [] system_call_fastpath+0x16/0x1b Which the locking hierarchy in mm/rmap.c confirms as valid. Fix this by first taking all the mapping->i_mmap_lock instances and then take all anon_vma->lock instances. Signed-off-by: Peter Zijlstra Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- mm/mmap.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 5d09d08a412..32a287b631d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2358,11 +2358,17 @@ int mm_take_all_locks(struct mm_struct *mm) for (vma = mm->mmap; vma; vma = vma->vm_next) { if (signal_pending(current)) goto out_unlock; - if (vma->anon_vma) - vm_lock_anon_vma(mm, vma->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_lock_mapping(mm, vma->vm_file->f_mapping); } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + vm_lock_anon_vma(mm, vma->anon_vma); + } + ret = 0; out_unlock: -- cgit v1.2.3 From 912985dce45ef18fcdd9f5439fef054e0e22302a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 12 Aug 2008 17:52:52 -0500 Subject: mm: Make generic weak get_user_pages_fast and EXPORT_GPL it Out of line get_user_pages_fast fallback implementation, make it a weak symbol, get rid of CONFIG_HAVE_GET_USER_PAGES_FAST. Export the symbol to modules so lguest can use it. 
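As an illustration of the exported interface (this sketch is not part of the patch; the function name, buffer address, page count and error handling are assumptions), a module such as lguest could pin a user buffer through the generic helper roughly like this:

/*
 * Hypothetical module-side caller: pin 'nr' user pages starting at
 * 'uaddr' for writing, or undo a partial pin and report failure.
 */
#include <linux/mm.h>
#include <linux/errno.h>

static int pin_user_buffer(unsigned long uaddr, int nr, struct page **pages)
{
	int i, got;

	got = get_user_pages_fast(uaddr, nr, 1, pages);	/* write = 1 */
	if (got == nr)
		return 0;
	if (got > 0) {
		/* partial pin: drop what we got before failing */
		for (i = 0; i < got; i++)
			put_page(pages[i]);
	}
	return got < 0 ? got : -EFAULT;
}

The generic weak implementation added below simply takes mmap_sem and calls get_user_pages(); an architecture that provides a lockless page-table walk overrides the symbol with its own fast version.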
Signed-off-by: Nick Piggin Signed-off-by: Rusty Russell --- mm/Kconfig | 3 --- mm/util.c | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 446c6588c75..0bd9c2dbb2a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -77,9 +77,6 @@ config FLAT_NODE_MEM_MAP def_bool y depends on !SPARSEMEM -config HAVE_GET_USER_PAGES_FAST - bool - # # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's # to represent different areas of memory. This variable allows diff --git a/mm/util.c b/mm/util.c index 9341ca77bd8..cb00b748ce4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -171,3 +171,18 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->unmap_area = arch_unmap_area; } #endif + +int __attribute__((weak)) get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + struct mm_struct *mm = current->mm; + int ret; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, nr_pages, + write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(get_user_pages_fast); -- cgit v1.2.3 From caff3a2c333e11a794308bd9a875a09b94fee24a Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 12 Aug 2008 15:08:38 -0700 Subject: hugetlb: call arch_prepare_hugepage() for surplus pages The s390 software large page emulation implements shared page tables by using page->index of the first tail page from a compound large page to store page table information. This is set up in arch_prepare_hugepage(), which is called from alloc_fresh_huge_page_node(). A similar call to arch_prepare_hugepage() is missing for surplus large pages that are allocated in alloc_buddy_huge_page(), which breaks the software emulation mode for (surplus) large pages on s390. This patch adds the missing call to arch_prepare_hugepage(). It will have no effect on other architectures where arch_prepare_hugepage() is a nop. Also, use the correct order in the error path in alloc_fresh_huge_page_node(). Acked-by: Martin Schwidefsky Signed-off-by: Gerald Schaefer Acked-by: Nick Piggin Acked-by: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 757ca983fd9..92155db888b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); return NULL; } prep_new_huge_page(h, page, nid); @@ -665,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + if (page && arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } + spin_lock(&hugetlb_lock); if (page) { /* -- cgit v1.2.3 From 74768ed833344bb0f82b97cee46320a3d7f09ecd Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 12 Aug 2008 15:08:39 -0700 Subject: page allocator: use no-panic variant of alloc_bootmem() in alloc_large_system_hash() .. since a failed allocation is being (initially) handled gracefully, and panic()-ed upon failure explicitly in the function if retries with smaller sizes failed. 
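To make the failure handling concrete, here is a hedged sketch (an illustrative caller invented for this write-up, not code from the patch) of the pattern described above: try the no-panic variant, shrink the request on failure, and panic explicitly only once retries are exhausted, as alloc_large_system_hash() does.

#include <linux/bootmem.h>
#include <linux/kernel.h>
#include <linux/mm.h>

static void * __init alloc_shrinking_table(unsigned long size)
{
	void *table = NULL;

	do {
		/* returns NULL instead of panicking on failure */
		table = alloc_bootmem_nopanic(size);
		if (!table)
			size >>= 1;	/* retry with half the size */
	} while (!table && size >= PAGE_SIZE);

	if (!table)
		panic("alloc_shrinking_table: out of bootmem\n");

	return table;
}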
Signed-off-by: Jan Beulich Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 401d104d2bb..af982f7cdb2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4437,7 +4437,7 @@ void *__init alloc_large_system_hash(const char *tablename, do { size = bucketsize << log2qty; if (flags & HASH_EARLY) - table = alloc_bootmem(size); + table = alloc_bootmem_nopanic(size); else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { -- cgit v1.2.3 From 9623e078c1f4692a91531af2f639ec8aff8f0472 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 12 Aug 2008 15:08:41 -0700 Subject: memcg: fix oops in mem_cgroup_shrink_usage Got an oops in mem_cgroup_shrink_usage() when testing loop over tmpfs: yes, of course, loop0 has no mm: other entry points check but this didn't. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7056c3bdb47..0f1f7a7374b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -796,6 +796,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) if (mem_cgroup_subsys.disabled) return 0; + if (!mm) + return 0; rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); -- cgit v1.2.3 From 57303d80175e10056bf51206f9961d586f02f967 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 12 Aug 2008 15:08:47 -0700 Subject: hugetlbfs: allocate structures for reservation tracking outside of spinlocks In the normal case, hugetlbfs reserves hugepages at map time so that the pages exist for future faults. A struct file_region is used to track when reservations have been consumed and where. These file_regions are allocated as necessary with kmalloc() which can sleep with the mm->page_table_lock held. This is wrong and triggers may-sleep warning when PREEMPT is enabled. Updates to the underlying file_region are done in two phases. The first phase prepares the region for the change, allocating any necessary memory, without actually making the change. The second phase actually commits the change. This patch makes use of this by checking the reservations before the page_table_lock is taken; triggering any necessary allocations. This may then be safely repeated within the locks without any allocations being required. Credit to Mel Gorman for diagnosing this failure and initial versions of the patch. Signed-off-by: Andy Whitcroft Tested-by: Gerald Schaefer Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92155db888b..4c97c174e2e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1942,6 +1942,15 @@ retry: lock_page(page); } + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. 
+ */ + if (write_access && !(vma->vm_flags & VM_SHARED)) + vma_needs_reservation(h, vma, address); + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -1978,6 +1987,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); @@ -2000,19 +2010,35 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; + /* + * If we are going to COW the mapping later, we examine the pending + * reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. For private mappings, we also lookup the pagecache + * page now as it is used to determine if a reservation has been + * consumed. + */ + if (write_access && !pte_write(entry)) { + vma_needs_reservation(h, vma, address); + + if (!(vma->vm_flags & VM_SHARED)) + pagecache_page = hugetlbfs_pagecache_page(h, + vma, address); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) { - struct page *page; - page = hugetlbfs_pagecache_page(h, vma, address); - ret = hugetlb_cow(mm, vma, address, ptep, entry, page); - if (page) { - unlock_page(page); - put_page(page); - } - } + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry, + pagecache_page); spin_unlock(&mm->page_table_lock); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + mutex_unlock(&hugetlb_instantiation_mutex); return ret; -- cgit v1.2.3 From 2b26736c88db85c038e04c2306d0745553e69602 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 12 Aug 2008 15:08:49 -0700 Subject: allocate structures for reservation tracking in hugetlbfs outside of spinlocks v2 [Andrew this should replace the previous version which did not check the returns from the region prepare for errors. This has been tested by us and Gerald and it looks good. Bah, while reviewing the locking based on your previous email I spotted that we need to check the return from the vma_needs_reservation call for allocation errors. Here is an updated patch to correct this. This passes testing here.] Signed-off-by: Andy Whitcroft Tested-by: Gerald Schaefer Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c97c174e2e..67a71191136 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1949,7 +1949,10 @@ retry: * the spinlock. 
*/ if (write_access && !(vma->vm_flags & VM_SHARED)) - vma_needs_reservation(h, vma, address); + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); @@ -1976,6 +1979,7 @@ out: backout: spin_unlock(&mm->page_table_lock); +backout_unlocked: unlock_page(page); put_page(page); goto out; @@ -2004,8 +2008,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); - mutex_unlock(&hugetlb_instantiation_mutex); - return ret; + goto out_unlock; } ret = 0; @@ -2019,7 +2022,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * consumed. */ if (write_access && !pte_write(entry)) { - vma_needs_reservation(h, vma, address); + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto out_unlock; + } if (!(vma->vm_flags & VM_SHARED)) pagecache_page = hugetlbfs_pagecache_page(h, @@ -2039,6 +2045,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(pagecache_page); } +out_unlock: mutex_unlock(&hugetlb_instantiation_mutex); return ret; -- cgit v1.2.3 From d6bf73e4340f52159c1d9f13836b62e20fcd12d3 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Tue, 12 Aug 2008 15:08:52 -0700 Subject: do_migrate_pages(): remove unused variable Signed-off-by: MinChan Kim Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e550bec2058..83369058ec1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) { - LIST_HEAD(pagelist); int busy = 0; int err = 0; nodemask_t tmp; -- cgit v1.2.3 From fc1efbdb7a1175759b099d74b67921396e5e8e3d Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Tue, 12 Aug 2008 15:09:00 -0700 Subject: mm/sparse.c: removed duplicated include Signed-off-by: Huang Weiyi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 5d9dbbb9d39..39db301b920 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -12,7 +12,6 @@ #include #include #include -#include "internal.h" /* * Permanent SPARSEMEM data: -- cgit v1.2.3 From 5cd9c58fbe9ec92b45b27e131719af4f2bd9eb40 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 14 Aug 2008 11:37:28 +0100 Subject: security: Fix setting of PF_SUPERPRIV by __capable() Fix the setting of PF_SUPERPRIV by __capable() as it could corrupt the flags the target process if that is not the current process and it is trying to change its own flags in a different way at the same time. __capable() is using neither atomic ops nor locking to protect t->flags. This patch removes __capable() and introduces has_capability() that doesn't set PF_SUPERPRIV on the process being queried. This patch further splits security_ptrace() in two: (1) security_ptrace_may_access(). This passes judgement on whether one process may access another only (PTRACE_MODE_ATTACH for ptrace() and PTRACE_MODE_READ for /proc), and takes a pointer to the child process. current is the parent. (2) security_ptrace_traceme(). 
This passes judgement on PTRACE_TRACEME only, and takes only a pointer to the parent process. current is the child. In Smack and commoncap, this uses has_capability() to determine whether the parent will be permitted to use PTRACE_ATTACH if normal checks fail. This does not set PF_SUPERPRIV. Two of the instances of __capable() actually only act on current, and so have been changed to calls to capable(). Of the places that were using __capable(): (1) The OOM killer calls __capable() thrice when weighing the killability of a process. All of these now use has_capability(). (2) cap_ptrace() and smack_ptrace() were using __capable() to check to see whether the parent was allowed to trace any process. As mentioned above, these have been split. For PTRACE_ATTACH and /proc, capable() is now used, and for PTRACE_TRACEME, has_capability() is used. (3) cap_safe_nice() only ever saw current, so now uses capable(). (4) smack_setprocattr() rejected accesses to tasks other than current just after calling __capable(), so the order of these two tests have been switched and capable() is used instead. (5) In smack_file_send_sigiotask(), we need to allow privileged processes to receive SIGIO on files they're manipulating. (6) In smack_task_wait(), we let a process wait for a privileged process, whether or not the process doing the waiting is privileged. I've tested this with the LTP SELinux and syscalls testscripts. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Casey Schaufler Acked-by: Andrew G. Morgan Acked-by: Al Viro Signed-off-by: James Morris --- mm/oom_kill.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8a5467ee626..64e5b4bcd96 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,6 +26,7 @@ #include #include #include +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Superuser processes are usually more important, so we make it * less likely that we kill those. */ - if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) + if (has_capability(p, CAP_SYS_ADMIN) || + has_capability(p, CAP_SYS_RESOURCE)) points /= 4; /* @@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * tend to only have this flag set on applications they think * of as important. */ - if (__capable(p, CAP_SYS_RAWIO)) + if (has_capability(p, CAP_SYS_RAWIO)) points /= 4; /* -- cgit v1.2.3 From 627240aaa92a4dc00d25584910b5f205e963747b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 15 Aug 2008 00:40:17 -0700 Subject: bootmem allocator: alloc_bootmem_core(): page-align the end offset This is the minimal sequence that jams the allocator: void *p, *q, *r; p = alloc_bootmem(PAGE_SIZE); q = alloc_bootmem(64); free_bootmem(p, PAGE_SIZE); p = alloc_bootmem(PAGE_SIZE); r = alloc_bootmem(64); after this sequence (assuming that the allocator was empty or page-aligned before), pointer "q" will be equal to pointer "r". What's hapenning inside the allocator: p = alloc_bootmem(PAGE_SIZE); in allocator: last_end_off == PAGE_SIZE, bitmap contains bits 10000... q = alloc_bootmem(64); in allocator: last_end_off == PAGE_SIZE + 64, bitmap contains 11000... free_bootmem(p, PAGE_SIZE); in allocator: last_end_off == PAGE_SIZE + 64, bitmap contains 01000... p = alloc_bootmem(PAGE_SIZE); in allocator: last_end_off == PAGE_SIZE, bitmap contains 11000... 
r = alloc_bootmem(64); and now: it finds bit "2" as the place to allocate (sidx); it hits the condition if (bdata->last_end_off && PFN_DOWN(bdata->last_end_off) + 1 == sidx) start_off = ALIGN(bdata->last_end_off, align); you can see that the condition is true, so it assigns start_off = ALIGN(bdata->last_end_off, align); (that is PAGE_SIZE) and allocates over the already allocated block. With the patch it tries to continue at the end of the previous allocation only if the previous allocation ended in the middle of a page. Signed-off-by: Mikulas Patocka Acked-by: Johannes Weiner Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 4af15d0340a..e023c68b025 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -473,7 +473,7 @@ find_block: goto find_block; } - if (bdata->last_end_off && + if (bdata->last_end_off & (PAGE_SIZE - 1) && PFN_DOWN(bdata->last_end_off) + 1 == sidx) start_off = ALIGN(bdata->last_end_off, align); else -- cgit v1.2.3
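For readers who want to see the arithmetic, the following stand-alone model (a user-space illustration assumed for this write-up, not kernel code) reproduces the state just before r = alloc_bootmem(64) in the sequence above and shows why masking last_end_off with PAGE_SIZE - 1 matters:

#include <stdio.h>

#define PAGE_SHIFT  12
#define PAGE_SIZE   (1UL << PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/* Where would the 64-byte allocation 'r' start? */
static unsigned long place_r(unsigned long last_end_off, unsigned long sidx, int fixed)
{
	int merge;

	if (fixed)	/* post-patch test: continue only if the last allocation ended mid-page */
		merge = (last_end_off & (PAGE_SIZE - 1)) && PFN_DOWN(last_end_off) + 1 == sidx;
	else		/* pre-patch test: continues even from a page-aligned end */
		merge = last_end_off && PFN_DOWN(last_end_off) + 1 == sidx;

	return merge ? ALIGN(last_end_off, 64) : sidx << PAGE_SHIFT;
}

int main(void)
{
	/*
	 * After p, q, free(p), p: q sits at offset PAGE_SIZE, the re-allocated
	 * p ended exactly at PAGE_SIZE, and the bitmap scan picks bit 2 (sidx).
	 */
	unsigned long last_end_off = PAGE_SIZE, sidx = 2, q_off = PAGE_SIZE;

	printf("old condition places r at %lu, q is at %lu -> overlap\n",
	       place_r(last_end_off, sidx, 0), q_off);
	printf("fixed condition places r at %lu\n",
	       place_r(last_end_off, sidx, 1));
	return 0;
}

Run as plain C, the old condition yields offset 4096 (q's address) while the fixed one yields 8192, matching the overlap the changelog describes.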