From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Feb 2009 14:02:27 +0000 Subject: Do not account for the address space used by hugetlbfs using VM_ACCOUNT When overcommit is disabled, the core VM accounts for pages used by anonymous shared, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE. Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. It avoids overcommiting on both shared and private mappings using reservation counters that are checked and updated during mmap(). This ensures (within limits) that hugepages exist in the future when faults occurs or it is too easy to applications to be SIGKILLed. As hugetlbfs makes its own reservations of a different unit to the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request no reserves be made for hugetlbfs at the risk of getting killed later. With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs, can trigger an OOM storm when hugepage pools are too small lockups and corrupted counters otherwise are used. This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set. Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/fremap.c | 2 +- mm/hugetlb.c | 39 +++++++++++++++++++++++++-------------- mm/mmap.c | 38 ++++++++++++++++++++++---------------- mm/mprotect.c | 5 +++-- 4 files changed, 51 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 736ba7f3306..b6ec85abbb3 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; get_file(file); addr = mmap_region(file, start, size, - flags, vma->vm_flags, pgoff, 1); + flags, vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 618e9830408..20746420954 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int acctflag) { - long ret, chg; + long ret = 0, chg; struct hstate *h = hstate_inode(inode); - if (vma && vma->vm_flags & VM_NORESERVE) - return 0; - /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need @@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - if (!resv_map) - return -ENOMEM; - + else chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - if (chg < 0) return chg; if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; + + /* + * Only apply hugepage reservation if asked. We still have to + * take the filesystem quota because it is an upper limit + * defined for the mount and not necessarily memory as a whole + */ + if (acctflag & VM_NORESERVE) { + reset_vma_resv_huge_pages(vma); + return 0; + } + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); @@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode, } if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); + else { + struct resv_map *resv_map = resv_map_alloc(); + + if (!resv_map) + return -ENOMEM; + + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 214b6a258ee..eb1270bebe6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, struct inode *inode; unsigned int vm_flags; int error; - int accountable = 1; unsigned long reqprot = prot; /* @@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EPERM; vm_flags &= ~VM_MAYEXEC; } - if (is_file_hugepages(file)) - accountable = 0; if (!file->f_op || !file->f_op->mmap) return -ENODEV; @@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (error) return error; - return mmap_region(file, addr, len, flags, vm_flags, pgoff, - accountable); + return mmap_region(file, addr, len, flags, vm_flags, pgoff); } EXPORT_SYMBOL(do_mmap_pgoff); @@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma) /* * We account for memory if it's a private writeable mapping, - * and VM_NORESERVE wasn't set. + * not hugepages and VM_NORESERVE wasn't set. */ -static inline int accountable_mapping(unsigned int vm_flags) +static inline int accountable_mapping(struct file *file, unsigned int vm_flags) { + /* + * hugetlb has its own accounting separate from the core VM + * VM_HUGETLB may not be set yet so we cannot check for that flag. + */ + if (file && is_file_hugepages(file)) + return 0; + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff, - int accountable) + unsigned int vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1128,18 +1130,22 @@ munmap_back: /* * Set 'VM_NORESERVE' if we should not account for the - * memory use of this mapping. We only honor MAP_NORESERVE - * if we're allowed to overcommit memory. + * memory use of this mapping. */ - if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) - vm_flags |= VM_NORESERVE; - if (!accountable) - vm_flags |= VM_NORESERVE; + if ((flags & MAP_NORESERVE)) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; + } /* * Private writable mapping: check memory availability */ - if (accountable_mapping(vm_flags)) { + if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; if (security_vm_enough_memory(charged)) return -ENOMEM; diff --git a/mm/mprotect.c b/mm/mprotect.c index abe2694e13f..258197b76fb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. + * make it unwritable again. hugetlb mapping were accounted for + * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) -- cgit v1.2.3 From 9f339e7028e2855717af3193c938f9960ad13b38 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Wed, 11 Feb 2009 15:10:27 +0100 Subject: x86, ptrace, mm: fix double-free on race Ptrace_detach() races with __ptrace_unlink() if the traced task is reaped while detaching. This might cause a double-free of the BTS buffer. Change the ptrace_detach() path to only do the memory accounting in ptrace_bts_detach() and leave the buffer free to ptrace_bts_untrace() which will be called from __ptrace_unlink(). The fix follows a proposal from Oleg Nesterov. Reported-by: Oleg Nesterov Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- mm/mlock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 028ec482fdd..2b57f7e6039 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -657,7 +657,7 @@ void *alloc_locked_buffer(size_t size) return buffer; } -void free_locked_buffer(void *buffer, size_t size) +void release_locked_buffer(void *buffer, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; @@ -667,6 +667,11 @@ void free_locked_buffer(void *buffer, size_t size) current->mm->locked_vm -= pgsz; up_write(¤t->mm->mmap_sem); +} + +void free_locked_buffer(void *buffer, size_t size) +{ + release_locked_buffer(buffer, size); kfree(buffer); } -- cgit v1.2.3 From 17c9d12e126cb0de8d535dc1908c4819d712bc68 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 11 Feb 2009 16:34:16 +0000 Subject: Do not account for hugetlbfs quota at mmap() time if mapping [SHM|MAP]_NORESERVE Commit 5a6fe125950676015f5108fb71b2a67441755003 brought hugetlbfs more in line with the core VM by obeying VM_NORESERVE and not reserving hugepages for both shared and private mappings when [SHM|MAP]_NORESERVE are specified. However, it is still taking filesystem quota unconditionally. At fault time, if there are no reserves and attempt is made to allocate the page and account for filesystem quota. If either fail, the fault fails. The impact is that quota is getting accounted for twice. This patch partially reverts 5a6fe125950676015f5108fb71b2a67441755003. To help prevent this mistake happening again, it improves the documentation of hugetlb_reserve_pages() Reported-by: Andy Whitcroft Signed-off-by: Mel Gorman Acked-by: Andy Whitcroft Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 20746420954..107da3d809a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2272,9 +2272,17 @@ int hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, int acctflag) { - long ret = 0, chg; + long ret, chg; struct hstate *h = hstate_inode(inode); + /* + * Only apply hugepage reservation if asked. At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page + * and filesystem quota without using reserves + */ + if (acctflag & VM_NORESERVE) + return 0; + /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need @@ -2283,42 +2291,47 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else + else { + struct resv_map *resv_map = resv_map_alloc(); + if (!resv_map) + return -ENOMEM; + chg = to - from; + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + if (chg < 0) return chg; + /* There must be enough filesystem quota for the mapping */ if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; /* - * Only apply hugepage reservation if asked. We still have to - * take the filesystem quota because it is an upper limit - * defined for the mount and not necessarily memory as a whole + * Check enough hugepages are available for the reservation. + * Hand back the quota if there are not */ - if (acctflag & VM_NORESERVE) { - reset_vma_resv_huge_pages(vma); - return 0; - } - ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; } + + /* + * Account for the reservations made. Shared mappings record regions + * that have reservations as they are shared by multiple VMAs. + * When the last VMA disappears, the region map says how much + * the reservation was and the page cache tells how much of + * the reservation was consumed. Private mappings are per-VMA and + * only the consumed reservations are tracked. When the VMA + * disappears, the original reservation is the VMA size and the + * consumed reservations are stored in the map. Hence, nothing + * else has to be done for private mappings here + */ if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - - if (!resv_map) - return -ENOMEM; - - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - return 0; } -- cgit v1.2.3 From 1001c9fb8721ab395e21f571ed2aaa523cdd1e29 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 11 Feb 2009 13:04:18 -0800 Subject: migration: migrate_vmas should check "vma" migrate_vmas() should check "vma" not "vma->vm_next" for for-loop condition. Signed-off-by: Daisuke Nishimura Cc: Christoph Lameter Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 2bb4e1d6352..a9eff3f092f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1129,7 +1129,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, struct vm_area_struct *vma; int err = 0; - for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { + for (vma = mm->mmap; vma && !err; vma = vma->vm_next) { if (vma->vm_ops && vma->vm_ops->migrate) { err = vma->vm_ops->migrate(vma, to, from, flags); if (err) -- cgit v1.2.3 From fc3501d411d34823fb9be248a95a0c44f945866f Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 11 Feb 2009 13:04:23 -0800 Subject: mm: fix dirty_bytes/dirty_background_bytes sysctls on 64bit arches We need to pass an unsigned long as the minimum, because it gets casted to an unsigned long in the sysctl handler. If we pass an int, we'll access four more bytes on 64bit arches, resulting in a random minimum value. [rientjes@google.com: fix type of `old_bytes'] Signed-off-by: Sven Wegener Cc: Peter Zijlstra Cc: Dave Chinner Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dc32dae01e5..c17005e7397 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -209,7 +209,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int old_bytes = vm_dirty_bytes; + unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); -- cgit v1.2.3 From 508b9f8efdad123b202b228f71f59feba51e4fb5 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Wed, 11 Feb 2009 13:04:27 -0800 Subject: mm: fix mlocked page counter mismatch When I tested following program, I found that the mlocked counter is strange. It cannot free some mlocked pages. It is because try_to_unmap_file() doesn't check real page mappings in vmas. That is because the goal of an address_space for a file is to find all processes into which the file's specific interval is mapped. It is related to the file's interval, not to pages. Even if the page isn't really mapped by the vma, it returns SWAP_MLOCK since the vma has VM_LOCKED, then calls try_to_mlock_page. After this the mlocked counter is increased again. COWed anon page in a file-backed vma could be a such case. This patch resolves it. -- my test program -- int main() { mlockall(MCL_CURRENT); return 0; } -- before -- root@barrios-target-linux:~# cat /proc/meminfo | egrep 'Mlo|Unev' Unevictable: 0 kB Mlocked: 0 kB -- after -- root@barrios-target-linux:~# cat /proc/meminfo | egrep 'Mlo|Unev' Unevictable: 8 kB Mlocked: 8 kB Signed-off-by: MinChan Kim Acked-by: Lee Schermerhorn Acked-by: KOSAKI Motohiro Tested-by: Lee Schermerhorn Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index ac4af8cffbf..16521664010 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1072,7 +1072,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (MLOCK_PAGES && unlikely(unlock)) { - if (!(vma->vm_flags & VM_LOCKED)) + if (!((vma->vm_flags & VM_LOCKED) && + page_mapped_in_vma(page, vma))) continue; /* must visit all vmas */ ret = SWAP_MLOCK; } else { -- cgit v1.2.3 From 2e9c23724328ae4e56c42a35a717a956d7d3001d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 11 Feb 2009 13:04:29 -0800 Subject: memcg: use __GFP_NOWARN in page cgroup allocation page_cgroup's page allocation at init/memory hotplug uses kmalloc() and vmalloc(). If kmalloc() failes, vmalloc() is used. This is because vmalloc() is very limited resource on 32bit systems. We want to use kmalloc() first. But in this kind of call, __GFP_NOWARN should be specified. Reported-by: Heiko Carstens Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 7006a11350c..ceecfbb143f 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -114,7 +114,8 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; if (slab_is_available()) { - base = kmalloc_node(table_size, GFP_KERNEL, nid); + base = kmalloc_node(table_size, + GFP_KERNEL | __GFP_NOWARN, nid); if (!base) base = vmalloc_node(table_size, nid); } else { -- cgit v1.2.3 From 89e1219004b3657cc014521663eeef0744f1c99d Mon Sep 17 00:00:00 2001 From: Federico Cuello Date: Wed, 11 Feb 2009 13:04:39 -0800 Subject: writeback: fix break condition Commit dcf6a79dda5cc2a2bec183e50d829030c0972aaa ("write-back: fix nr_to_write counter") fixed nr_to_write counter, but didn't set the break condition properly. If nr_to_write == 0 after being decremented it will loop one more time before setting done = 1 and breaking the loop. [akpm@linux-foundation.org: coding-style fixes] Cc: Artem Bityutskiy Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c17005e7397..6106a5c7ed4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1051,20 +1051,23 @@ continue_unlock: } } - if (nr_to_write > 0) + if (nr_to_write > 0) { nr_to_write--; - else if (wbc->sync_mode == WB_SYNC_NONE) { - /* - * We stop writing back only if we are not - * doing integrity sync. In case of integrity - * sync we have to keep going because someone - * may be concurrently dirtying pages, and we - * might have synced a lot of newly appeared - * dirty pages, but have not synced all of the - * old dirty pages. - */ - done = 1; - break; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. + */ + done = 1; + break; + } } if (wbc->nonblocking && bdi_write_congested(bdi)) { -- cgit v1.2.3 From 9480c53e9b2aa13a06283ffb96bb8f1873ac4e9a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 11 Feb 2009 13:04:41 -0800 Subject: mm: rearrange exit_mmap() to unlock before arch_exit_mmap Christophe Saout reported [in precursor to: http://marc.info/?l=linux-kernel&m=123209902707347&w=4]: > Note that I also some a different issue with CONFIG_UNEVICTABLE_LRU. > Seems like Xen tears down current->mm early on process termination, so > that __get_user_pages in exit_mmap causes nasty messages when the > process had any mlocked pages. (in fact, it somehow manages to get into > the swapping code and produces a null pointer dereference trying to get > a swap token) Jeremy explained: Yes. In the normal case under Xen, an in-use pagetable is "pinned", meaning that it is RO to the kernel, and all updates must go via hypercall (or writes are trapped and emulated, which is much the same thing). An unpinned pagetable is not currently in use by any process, and can be directly accessed as normal RW pages. As an optimisation at process exit time, we unpin the pagetable as early as possible (switching the process to init_mm), so that all the normal pagetable teardown can happen with direct memory accesses. This happens in exit_mmap() -> arch_exit_mmap(). The munlocking happens a few lines below. The obvious thing to do would be to move arch_exit_mmap() to below the munlock code, but I think we'd want to call it even if mm->mmap is NULL, just to be on the safe side. Thus, this patch: exit_mmap() needs to unlock any locked vmas before calling arch_exit_mmap, as the latter may switch the current mm to init_mm, which would cause the former to fail. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Lee Schermerhorn Cc: Christophe Saout Cc: Keir Fraser Cc: Christophe Saout Cc: Alex Williamson Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index eb1270bebe6..00ced3ee49a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2084,12 +2084,8 @@ void exit_mmap(struct mm_struct *mm) unsigned long end; /* mm's last user has gone, and its about to be pulled down */ - arch_exit_mmap(mm); mmu_notifier_release(mm); - if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ - return; - if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -2098,7 +2094,13 @@ void exit_mmap(struct mm_struct *mm) vma = vma->vm_next; } } + + arch_exit_mmap(mm); + vma = mm->mmap; + if (!vma) /* Can happen if dup_mmap() received an OOM */ + return; + lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); -- cgit v1.2.3 From b1aabecd55931ee754f6a913969516b26a0e682e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2009 15:21:44 +0200 Subject: mm: Export symbol ksize() Commit 7b2cd92adc5430b0c1adeb120971852b4ea1ab08 ("crypto: api - Fix zeroing on free") added modular user of ksize(). Export that to fix crypto.ko compilation. Cc: Herbert Xu Signed-off-by: Kirill A. Shutemov Signed-off-by: Pekka Enberg --- mm/slab.c | 1 + mm/slob.c | 1 + mm/slub.c | 1 + 3 files changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index ddc41f337d5..4d00855629c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4457,3 +4457,4 @@ size_t ksize(const void *objp) return obj_size(virt_to_cache(objp)); } +EXPORT_SYMBOL(ksize); diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed..52bc8a2bd9e 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -521,6 +521,7 @@ size_t ksize(const void *block) } else return sp->page.private; } +EXPORT_SYMBOL(ksize); struct kmem_cache { unsigned int size, align; diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a2..0280eee6cf3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2736,6 +2736,7 @@ size_t ksize(const void *object) */ return s->size; } +EXPORT_SYMBOL(ksize); void kfree(const void *x) { -- cgit v1.2.3 From 3a4c6800f31ea8395628af5e7e490270ee5d0585 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 12 Feb 2009 04:34:23 +0100 Subject: Fix page writeback thinko, causing Berkeley DB slowdown A bug was introduced into write_cache_pages cyclic writeout by commit 31a12666d8f0c22235297e1c1575f82061480029 ("mm: write_cache_pages cyclic fix"). The intention (and comments) is that we should cycle back and look for more dirty pages at the beginning of the file if there is no more work to be done. But the !done condition was dropped from the test. This means that any time the page writeout loop breaks (eg. due to nr_to_write == 0), we will set index to 0, then goto again. This will set done_index to index, then find done is set, so will proceed to the end of the function. When updating mapping->writeback_index for cyclic writeout, we now use done_index == 0, so we're always cycling back to 0. This seemed to be causing random mmap writes (slapadd and iozone) to start writing more pages from the LRU and writeout would slowdown, and caused bugzilla entry http://bugzilla.kernel.org/show_bug.cgi?id=12604 about Berkeley DB slowing down dramatically. With this patch, iozone random write performance is increased nearly 5x on my system (iozone -B -r 4k -s 64k -s 512m -s 1200m on ext2). Signed-off-by: Nick Piggin Reported-and-tested-by: Jan Kara Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6106a5c7ed4..3c84128596b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1079,7 +1079,7 @@ continue_unlock: pagevec_release(&pvec); cond_resched(); } - if (!cycled) { + if (!cycled && !done) { /* * range_cyclic: * We hit the last page and there is more work to be done: wrap -- cgit v1.2.3