From 994fc28c7b1e697ac56befe4aecabf23f0689f46 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 15 Dec 2005 14:28:17 -0800 Subject: [PATCH] add AOP_TRUNCATED_PAGE, prepend AOP_ to WRITEPAGE_ACTIVATE readpage(), prepare_write(), and commit_write() callers are updated to understand the special return code AOP_TRUNCATED_PAGE in the style of writepage() and WRITEPAGE_ACTIVATE. AOP_TRUNCATED_PAGE tells the caller that the callee has unlocked the page and that the operation should be tried again with a new page. OCFS2 uses this to detect and work around a lock inversion in its aop methods. There should be no change in behaviour for methods that don't return AOP_TRUNCATED_PAGE. WRITEPAGE_ACTIVATE is also prepended with AOP_ for consistency and they are made enums so that kerneldoc can be used to document their semantics. Signed-off-by: Zach Brown --- mm/filemap.c | 73 ++++++++++++++++++++++++++++++++++++++++------------------ mm/readahead.c | 15 +++++++----- mm/shmem.c | 2 +- mm/vmscan.c | 2 +- 4 files changed, 61 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 33a28bfde15..6e1d08a2b8b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -831,8 +831,13 @@ readpage: /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); - if (unlikely(error)) + if (unlikely(error)) { + if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } goto readpage_error; + } if (!PageUptodate(page)) { lock_page(page); @@ -1152,26 +1157,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) { struct address_space *mapping = file->f_mapping; struct page *page; - int error; + int ret; - page = page_cache_alloc_cold(mapping); - if (!page) - return -ENOMEM; + do { + page = page_cache_alloc_cold(mapping); + if (!page) + return -ENOMEM; + + ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + if (ret == 0) + ret = mapping->a_ops->readpage(file, page); + else if (ret == -EEXIST) + ret = 0; /* losing race to add is OK */ - error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); - if (!error) { - error = mapping->a_ops->readpage(file, page); page_cache_release(page); - return error; - } - /* - * We arrive here in the unlikely event that someone - * raced with us and added our page to the cache first - * or we are out of memory for radix-tree nodes. - */ - page_cache_release(page); - return error == -EEXIST ? 
0 : error; + } while (ret == AOP_TRUNCATED_PAGE); + + return ret; } #define MMAP_LOTSAMISS (100) @@ -1331,10 +1334,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1358,10 +1365,14 @@ page_not_uptodate: goto success; } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1444,10 +1455,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1470,10 +1485,14 @@ page_not_uptodate: } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1934,12 +1953,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { loff_t isize = i_size_read(inode); + + if (status != AOP_TRUNCATED_PAGE) + unlock_page(page); + page_cache_release(page); + if (status == AOP_TRUNCATED_PAGE) + continue; /* * prepare_write() may have instantiated a few blocks * outside i_size. Trim these off again. 
*/ - unlock_page(page); - page_cache_release(page); if (pos + bytes > isize) vmtruncate(inode, isize); break; @@ -1952,6 +1975,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, cur_iov, iov_base, bytes); flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); + if (status == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + continue; + } if (likely(copied > 0)) { if (!status) status = copied; diff --git a/mm/readahead.c b/mm/readahead.c index 72e7adbb87c..8d6eeaaa629 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, { unsigned page_idx; struct pagevec lru_pvec; - int ret = 0; + int ret; if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); @@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp, list_del(&page->lru); if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { - mapping->a_ops->readpage(filp, page); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - } else { - page_cache_release(page); + ret = mapping->a_ops->readpage(filp, page); + if (ret != AOP_TRUNCATED_PAGE) { + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + continue; + } /* else fall through to release */ } + page_cache_release(page); } pagevec_lru_add(&lru_pvec); + ret = 0; out: return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index dc25565a61e..d9fc277940d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -855,7 +855,7 @@ unlock: swap_free(swap); redirty: set_page_dirty(page); - return WRITEPAGE_ACTIVATE; /* Return with the page locked */ + return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ } #ifdef CONFIG_NUMA diff --git a/mm/vmscan.c b/mm/vmscan.c index b0cd81c32de..795a050fe47 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) res = mapping->a_ops->writepage(page, &wbc); if (res < 0) handle_write_error(mapping, page, res); - if (res == WRITEPAGE_ACTIVATE) { + if (res == AOP_WRITEPAGE_ACTIVATE) { ClearPageReclaim(page); return PAGE_ACTIVATE; } -- cgit v1.2.3 From 47f3a867f6310d6abfa185ab12baaba7ed1d69af Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 6 Jan 2006 00:10:32 -0800 Subject: [PATCH] mm: fix __alloc_pages cpuset ALLOC_* flags Two changes to the setting of the ALLOC_CPUSET flag in mm/page_alloc.c:__alloc_pages() - A bug fix - the "ignoring mins" case should not be honoring ALLOC_CPUSET. This case of all cases, since it is handling a request that will free up more memory than is asked for (exiting tasks, e.g.) should be allowed to escape cpuset constraints when memory is tight. - A logic change to make it simpler. Honor cpusets even on GFP_ATOMIC (!wait) requests. With this, cpuset confinement applies to all requests except ALLOC_NO_WATERMARKS, so that in a subsequent cleanup patch, I can remove the ALLOC_CPUSET flag entirely. Since I don't know any real reason this logic has to be either way, I am choosing the path of the simplest code. 
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe14a8c87fc..1e49dc7cd61 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -903,8 +903,7 @@ restart: alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - if (wait) - alloc_flags |= ALLOC_CPUSET; + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations @@ -926,7 +925,7 @@ restart: nofail_alloc: /* go through the zonelist yet again, ignoring mins */ page = get_page_from_freelist(gfp_mask, order, - zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); + zonelist, ALLOC_NO_WATERMARKS); if (page) goto got_pg; if (gfp_mask & __GFP_NOFAIL) { -- cgit v1.2.3 From 5ac24eefd1d89bc6aa2817741c3bd5d4205b2efd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 6 Jan 2006 00:10:33 -0800 Subject: [PATCH] memhotplug: __add_section remove unused pgdat definition __add_section defines an unused pointer to the zones pgdat. Remove this definition. This fixes a compile warning. Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f6d4af8af8a..a918f77f02f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, int nr_pages); static int __add_section(struct zone *zone, unsigned long phys_start_pfn) { - struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; int ret; -- cgit v1.2.3 From d7339071f6a8b50101d7ba327926b770f22d5d8b Mon Sep 17 00:00:00 2001 From: Hans Reiser Date: Fri, 6 Jan 2006 00:10:36 -0800 Subject: [PATCH] reiser4: vfs: add truncate_inode_pages_range() This patch makes truncate_inode_pages_range from truncate_inode_pages. truncate_inode_pages became a one-liner call to truncate_inode_pages_range. Reiser4 needs truncate_inode_pages_ranges because it tries to keep correspondence between existences of metadata pointing to data pages and pages to which those metadata point to. So, when metadata of certain part of file is removed from filesystem tree, only pages of corresponding range are to be truncated. (Needed by the madvise(MADV_REMOVE) patch) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 9173ab50060..7dee3274590 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) } /** - * truncate_inode_pages - truncate *all* the pages from an offset + * truncate_inode_pages - truncate range of pages specified by start and + * end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate + * @lend: offset to which to truncate * - * Truncate the page cache at a set offset, removing the pages that are beyond - * that offset (and zeroing out partial pages). + * Truncate the page cache, removing the pages that are between + * specified offsets (and zeroing out partial page + * (if lstart is not page aligned)). * * Truncate takes two passes - the first pass is nonblocking. 
It will not * block on page locks and it will not block on writeback. The second pass @@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. - * - * Called under (and serialised by) inode->i_sem. */ -void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +void truncate_inode_pages_range(struct address_space *mapping, + loff_t lstart, loff_t lend) { const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + pgoff_t end; const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; pgoff_t next; @@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) if (mapping->nrpages == 0) return; + BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + end = (lend >> PAGE_CACHE_SHIFT); + pagevec_init(&pvec, 0); next = start; - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (next <= end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; + if (page_index > end) { + next = page_index; + break; + } + if (page_index > next) next = page_index; next++; @@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) next = start; continue; } + if (pvec.pages[0]->index > end) { + pagevec_release(&pvec); + break; + } for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; + if (page->index > end) + break; lock_page(page); wait_on_page_writeback(page); if (page->index > next) @@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) pagevec_release(&pvec); } } +EXPORT_SYMBOL(truncate_inode_pages_range); +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * + * Called under (and serialised by) inode->i_sem. + */ +void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +{ + truncate_inode_pages_range(mapping, lstart, (loff_t)-1); +} EXPORT_SYMBOL(truncate_inode_pages); /** -- cgit v1.2.3 From f6b3ec238d12c8cc6cc71490c6e3127988460349 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Fri, 6 Jan 2006 00:10:38 -0800 Subject: [PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing store Here is the patch to implement madvise(MADV_REMOVE) - which frees up a given range of pages & its associated backing store. Current implementation supports only shmfs/tmpfs and other filesystems return -ENOSYS. "Some app allocates large tmpfs files, then when some task quits and some client disconnect, some memory can be released. However the only way to release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli Databases want to use this feature to drop a section of their bufferpool (shared memory segments) - without writing back to disk/swap space. This feature is also useful for supporting hot-plug memory on UML. Concerns raised by Andrew Morton: - "We have no plan for holepunching! If we _do_ have such a plan (or might in the future) then what would the API look like? I think sys_holepunch(fd, start, len), so we should start out with that." 
- Using madvise is very weird, because people will ask "why do I need to mmap my file before I can stick a hole in it?" - None of the other madvise operations call into the filesystem in this manner. A broad question is: is this capability an MM operation or a filesytem operation? truncate, for example, is a filesystem operation which sometimes has MM side-effects. madvise is an mm operation and with this patch, it gains FS side-effects, only they're really, really significant ones." Comments: - Andrea suggested the fs operation too but then it's more efficient to have it as a mm operation with fs side effects, because they don't immediatly know fd and physical offset of the range. It's possible to fixup in userland and to use the fs operation but it's more expensive, the vmas are already in the kernel and we can use them. Short term plan & Future Direction: - We seem to need this interface only for shmfs/tmpfs files in the short term. We have to add hooks into the filesystem for correctness and completeness. This is what this patch does. - In the future, plan is to support both fs and mmap apis also. This also involves (other) filesystem specific functions to be implemented. - Current patch doesn't support VM_NONLINEAR - which can be addressed in the future. Signed-off-by: Badari Pulavarty Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Michael Kerrisk Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 35 +++++++++++++++++++++++++++++++++++ mm/memory.c | 25 ++++++++++++++++++++++++- mm/shmem.c | 32 ++++++++++++++++++++++++-------- 3 files changed, 83 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 2b7cf0400a2..ae0ae3ea299 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma, return 0; } +/* + * Application wants to free up the pages and associated backing store. + * This is effectively punching a hole into the middle of a file. + * + * NOTE: Currently, only shmfs/tmpfs is supported for this operation. + * Other filesystems return -ENOSYS. + */ +static long madvise_remove(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct address_space *mapping; + loff_t offset, endoff; + + if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + return -EINVAL; + + if (!vma->vm_file || !vma->vm_file->f_mapping + || !vma->vm_file->f_mapping->host) { + return -EINVAL; + } + + mapping = vma->vm_file->f_mapping; + + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + endoff = (loff_t)(end - vma->vm_start - 1) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + return vmtruncate_range(mapping->host, offset, endoff); +} + static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) @@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_RANDOM: error = madvise_behavior(vma, prev, start, end, behavior); break; + case MADV_REMOVE: + error = madvise_remove(vma, start, end); + break; case MADV_WILLNEED: error = madvise_willneed(vma, prev, start, end); @@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. 
+ * MADV_REMOVE - the application wants to free up the given range of + * pages and associated backing store. * * return values: * zero - success diff --git a/mm/memory.c b/mm/memory.c index d8dde07a365..e249088908c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1770,9 +1770,32 @@ out_big: out_busy: return -ETXTBSY; } - EXPORT_SYMBOL(vmtruncate); +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + */ + if (!inode->i_op || !inode->i_op->truncate_range) + return -ENOSYS; + + down(&inode->i_sem); + down_write(&inode->i_alloc_sem); + unmap_mapping_range(mapping, offset, (end - offset), 1); + truncate_inode_pages_range(mapping, offset, end); + inode->i_op->truncate_range(inode, offset, end); + up_write(&inode->i_alloc_sem); + up(&inode->i_sem); + + return 0; +} +EXPORT_SYMBOL(vmtruncate_range); + /* * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen diff --git a/mm/shmem.c b/mm/shmem.c index d9fc277940d..65c148efa2e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next) } while (next); } -static void shmem_truncate(struct inode *inode) +static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) { struct shmem_inode_info *info = SHMEM_I(inode); unsigned long idx; @@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode) long nr_swaps_freed = 0; int offset; int freed; + int punch_hole = 0; inode->i_ctime = inode->i_mtime = CURRENT_TIME; - idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (idx >= info->next_index) return; spin_lock(&info->lock); info->flags |= SHMEM_TRUNCATE; - limit = info->next_index; - info->next_index = idx; + if (likely(end == (loff_t) -1)) { + limit = info->next_index; + info->next_index = idx; + } else { + limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (limit > info->next_index) + limit = info->next_index; + punch_hole = 1; + } + topdir = info->i_indirect; - if (topdir && idx <= SHMEM_NR_DIRECT) { + if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { info->i_indirect = NULL; nr_pages_to_free++; list_add(&topdir->lru, &pages_to_free); @@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode) set_page_private(subdir, page_private(subdir) - freed); if (offset) spin_unlock(&info->lock); - BUG_ON(page_private(subdir) > offset); + if (!punch_hole) + BUG_ON(page_private(subdir) > offset); } if (offset) offset = 0; - else if (subdir) { + else if (subdir && !page_private(subdir)) { dir[diroff] = NULL; nr_pages_to_free++; list_add(&subdir->lru, &pages_to_free); @@ -594,7 +604,7 @@ done2: * Also, though shmem_getpage checks i_size before adding to * cache, no recheck after: so fix the narrow window there too. 
*/ - truncate_inode_pages(inode->i_mapping, inode->i_size); + truncate_inode_pages_range(inode->i_mapping, start, end); } spin_lock(&info->lock); @@ -614,6 +624,11 @@ done2: } } +static void shmem_truncate(struct inode *inode) +{ + shmem_truncate_range(inode, inode->i_size, (loff_t)-1); +} + static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; @@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = { static struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, .setattr = shmem_notify_change, + .truncate_range = shmem_truncate_range, }; static struct inode_operations shmem_dir_inode_operations = { -- cgit v1.2.3 From f0916794f00be44154102dedaeafe68b743078a2 Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Fri, 6 Jan 2006 00:10:40 -0800 Subject: [PATCH] Hugetlb: Remove duplicate i_size check cleanup Signed-off-by: David Gibson Signed-off-by: Adam Litke Cc: William Lee Irwin III Cc: "Seth, Rohit" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e52df7c471..acb864130f8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -350,19 +350,12 @@ static struct page *find_lock_huge_page(struct address_space *mapping, { struct page *page; int err; - struct inode *inode = mapping->host; - unsigned long size; retry: page = find_lock_page(mapping, idx); if (page) goto out; - /* Check to make sure the mapping hasn't been truncated */ - size = i_size_read(inode) >> HPAGE_SHIFT; - if (idx >= size) - goto out; - if (hugetlb_get_quota(mapping)) goto out; page = alloc_huge_page(); -- cgit v1.2.3 From 85ef47f74afe96c8c23eaa605f28cc01443c905f Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Fri, 6 Jan 2006 00:10:42 -0800 Subject: [PATCH] Hugetlb: Rename find_lock_page to find_or_alloc_huge_page find_lock_huge_page() isn't a great name, since it does extra things not analagous to find_lock_page(). Rename it find_or_alloc_huge_page() which is closer to the mark. Signed-off-by: David Gibson Signed-off-by: Adam Litke Cc: William Lee Irwin III Cc: "Seth, Rohit" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index acb864130f8..fdbbbb90caa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -345,8 +345,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); } -static struct page *find_lock_huge_page(struct address_space *mapping, - unsigned long idx) +static struct page *find_or_alloc_huge_page(struct address_space *mapping, + unsigned long idx) { struct page *page; int err; @@ -398,7 +398,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_lock_huge_page(mapping, idx); + page = find_or_alloc_huge_page(mapping, idx); if (!page) goto out; -- cgit v1.2.3 From 86e5216f8d8aa258ba836caffe2613d79cc9aead Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Fri, 6 Jan 2006 00:10:43 -0800 Subject: [PATCH] Hugetlb: Reorganize hugetlb_fault to prepare for COW This patch splits the "no_page()" type activity into its own function, hugetlb_no_page(). hugetlb_fault() becomes the entry point for hugetlb faults and delegates to the appropriate handler depending on the type of fault. 
Right now we still have only hugetlb_no_page() but a later patch introduces a COW fault. Signed-off-by: David Gibson Signed-off-by: Adam Litke Cc: William Lee Irwin III Cc: "Seth, Rohit" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fdbbbb90caa..cf8225108b2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -376,20 +376,15 @@ out: return page; } -int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) +int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { int ret = VM_FAULT_SIGBUS; unsigned long idx; unsigned long size; - pte_t *pte; struct page *page; struct address_space *mapping; - pte = huge_pte_alloc(mm, address); - if (!pte) - goto out; - mapping = vma->vm_file->f_mapping; idx = ((address - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); @@ -408,11 +403,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto backout; ret = VM_FAULT_MINOR; - if (!pte_none(*pte)) + if (!pte_none(*ptep)) goto backout; add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); - set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); + set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page)); spin_unlock(&mm->page_table_lock); unlock_page(page); out: @@ -426,6 +421,27 @@ backout: goto out; } +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access) +{ + pte_t *ptep; + pte_t entry; + + ptep = huge_pte_alloc(mm, address); + if (!ptep) + return VM_FAULT_OOM; + + entry = *ptep; + if (pte_none(entry)) + return hugetlb_no_page(mm, vma, address, ptep); + + /* + * We could get here if another thread instantiated the pte + * before the test above. + */ + return VM_FAULT_MINOR; +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i) -- cgit v1.2.3 From 1e8f889b10d8d2223105719e36ce45688fedbd59 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 6 Jan 2006 00:10:44 -0800 Subject: [PATCH] Hugetlb: Copy on Write support Implement copy-on-write support for hugetlb mappings so MAP_PRIVATE can be supported. This helps us to safely use hugetlb pages in many more applications. The patch makes the following changes. If needed, I also have it broken out according to the following paragraphs. 1. Add a pair of functions to set/clear write access on huge ptes. The writable check in make_huge_pte is moved out to the caller for use by COW later. 2. Hugetlb copy-on-write requires special case handling in the following situations: - copy_hugetlb_page_range() - Copied pages must be write protected so a COW fault will be triggered (if necessary) if those pages are written to. - find_or_alloc_huge_page() - Only MAP_SHARED pages are added to the page cache. MAP_PRIVATE pages still need to be locked however. 3. Provide hugetlb_cow() and calls from hugetlb_fault() and hugetlb_no_page() which handles the COW fault by making the actual copy. 4. Remove the check in hugetlbfs_file_map() so that MAP_PRIVATE mmaps will be allowed. Make MAP_HUGETLB exempt from the depricated VM_RESERVED mapping check. 
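For illustration, a minimal userspace sketch (not part of this series) of what the change enables: a hugetlbfs file mapped MAP_PRIVATE can now be written, with the first write to each huge page going through hugetlb_cow(). The /mnt/huge mount point and the 2MB huge page size are assumptions.

/* Hypothetical example: MAP_PRIVATE hugetlb mapping exercising the COW path. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumed huge page size */

int main(void)
{
	/* hugetlbfs is assumed to be mounted at /mnt/huge with free huge pages */
	int fd = open("/mnt/huge/private-file", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	char *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(p, 0xaa, HPAGE_SIZE);	/* write fault: hugetlb_no_page() then COW */

	munmap(p, HPAGE_SIZE);
	close(fd);
	return 0;
}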
Signed-off-by: David Gibson Signed-off-by: Adam Litke Cc: William Lee Irwin III Cc: "Seth, Rohit" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 108 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cf8225108b2..da8a211414c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -261,11 +261,12 @@ struct vm_operations_struct hugetlb_vm_ops = { .nopage = hugetlb_nopage, }; -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, + int writable) { pte_t entry; - if (vma->vm_flags & VM_WRITE) { + if (writable) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); } else { @@ -277,12 +278,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) return entry; } +static void set_huge_ptep_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t entry; + + entry = pte_mkwrite(pte_mkdirty(*ptep)); + ptep_set_access_flags(vma, address, ptep, entry, 1); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); +} + + int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pte_t *src_pte, *dst_pte, entry; struct page *ptepage; unsigned long addr; + int cow; + + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { src_pte = huge_pte_offset(src, addr); @@ -294,6 +310,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); if (!pte_none(*src_pte)) { + if (cow) + ptep_set_wrprotect(src, addr, src_pte); entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); @@ -346,7 +364,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } static struct page *find_or_alloc_huge_page(struct address_space *mapping, - unsigned long idx) + unsigned long idx, int shared) { struct page *page; int err; @@ -364,26 +382,80 @@ retry: goto out; } - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); - if (err) { - put_page(page); - hugetlb_put_quota(mapping); - if (err == -EEXIST) - goto retry; - page = NULL; + if (shared) { + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + if (err) { + put_page(page); + hugetlb_put_quota(mapping); + if (err == -EEXIST) + goto retry; + page = NULL; + } + } else { + /* Caller expects a locked page */ + lock_page(page); } out: return page; } +static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, pte_t pte) +{ + struct page *old_page, *new_page; + int i, avoidcopy; + + old_page = pte_page(pte); + + /* If no-one else is actually using this page, avoid the copy + * and just make the page writable */ + avoidcopy = (page_count(old_page) == 1); + if (avoidcopy) { + set_huge_ptep_writable(vma, address, ptep); + return VM_FAULT_MINOR; + } + + page_cache_get(old_page); + new_page = alloc_huge_page(); + + if (!new_page) { + page_cache_release(old_page); + + /* Logically this is OOM, not a SIGBUS, but an OOM + * could cause the kernel to go killing other + * processes which won't help the hugepage situation + * at all (?) 
*/ + return VM_FAULT_SIGBUS; + } + + spin_unlock(&mm->page_table_lock); + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) + copy_user_highpage(new_page + i, old_page + i, + address + i*PAGE_SIZE); + spin_lock(&mm->page_table_lock); + + ptep = huge_pte_offset(mm, address & HPAGE_MASK); + if (likely(pte_same(*ptep, pte))) { + /* Break COW */ + set_huge_pte_at(mm, address, ptep, + make_huge_pte(vma, new_page, 1)); + /* Make the old page be freed below */ + new_page = old_page; + } + page_cache_release(new_page); + page_cache_release(old_page); + return VM_FAULT_MINOR; +} + int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) + unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; unsigned long idx; unsigned long size; struct page *page; struct address_space *mapping; + pte_t new_pte; mapping = vma->vm_file->f_mapping; idx = ((address - vma->vm_start) >> HPAGE_SHIFT) @@ -393,10 +465,13 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_or_alloc_huge_page(mapping, idx); + page = find_or_alloc_huge_page(mapping, idx, + vma->vm_flags & VM_SHARED); if (!page) goto out; + BUG_ON(!PageLocked(page)); + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> HPAGE_SHIFT; if (idx >= size) @@ -407,7 +482,15 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, goto backout; add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); - set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page)); + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, address, ptep, new_pte); + + if (write_access && !(vma->vm_flags & VM_SHARED)) { + /* Optimization, do the COW without a second fault */ + ret = hugetlb_cow(mm, vma, address, ptep, new_pte); + } + spin_unlock(&mm->page_table_lock); unlock_page(page); out: @@ -426,6 +509,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *ptep; pte_t entry; + int ret; ptep = huge_pte_alloc(mm, address); if (!ptep) @@ -433,13 +517,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = *ptep; if (pte_none(entry)) - return hugetlb_no_page(mm, vma, address, ptep); + return hugetlb_no_page(mm, vma, address, ptep, write_access); - /* - * We could get here if another thread instantiated the pte - * before the test above. - */ - return VM_FAULT_MINOR; + ret = VM_FAULT_MINOR; + + spin_lock(&mm->page_table_lock); + /* Check for a racing update before calling hugetlb_cow */ + if (likely(pte_same(entry, *ptep))) + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry); + spin_unlock(&mm->page_table_lock); + + return ret; } int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.2.3 From 96df9333c94d7d5aeceb21f6c5e7ae8ff34753cf Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:10:45 -0800 Subject: [PATCH] mm: dequeue a huge page near to this node This was discussed at http://marc.theaimsgroup.com/?l=linux-kernel&m=113166526217117&w=2 This patch changes the dequeueing to select a huge page near the node executing instead of always beginning to check for free nodes from node 0. This will result in a placement of the huge pages near the executing processor improving performance. 
The existing implementation can place the huge pages far away from the executing processor causing significant degradation of performance. The search starting from zero also means that the lower zones quickly run out of memory. Selecting a huge page near the process distributed the huge pages better. Signed-off-by: Christoph Lameter Cc: William Lee Irwin III Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da8a211414c..e93bd63462f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,14 +40,16 @@ static struct page *dequeue_huge_page(void) { int nid = numa_node_id(); struct page *page = NULL; + struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists; + struct zone **z; - if (list_empty(&hugepage_freelists[nid])) { - for (nid = 0; nid < MAX_NUMNODES; ++nid) - if (!list_empty(&hugepage_freelists[nid])) - break; + for (z = zonelist->zones; *z; z++) { + nid = (*z)->zone_pgdat->node_id; + if (!list_empty(&hugepage_freelists[nid])) + break; } - if (nid >= 0 && nid < MAX_NUMNODES && - !list_empty(&hugepage_freelists[nid])) { + + if (*z) { page = list_entry(hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); -- cgit v1.2.3 From 5da7ca86078964cbfe6c83efc1205904587706fe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:10:46 -0800 Subject: [PATCH] Add NUMA policy support for huge pages. The huge_zonelist() function in the memory policy layer provides an list of zones ordered by NUMA distance. The hugetlb layer will walk that list looking for a zone that has available huge pages but is also in the nodeset of the current cpuset. This patch does not contain the folding of find_or_alloc_huge_page() that was controversial in the earlier discussion. 
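For illustration, a hypothetical userspace sketch of how this is exercised (not part of the patch): installing an MPOL_INTERLEAVE policy on a hugetlb mapping before it is touched makes each huge page fault pick its node via huge_zonelist(). The hugetlbfs mount point, the node numbers and libnuma's mbind() wrapper are assumptions.

#include <fcntl.h>
#include <numaif.h>		/* mbind(), MPOL_INTERLEAVE -- libnuma wrapper assumed */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed huge page size */
#define LENGTH		(4 * HPAGE_SIZE)

int main(void)
{
	unsigned long nodes = (1UL << 0) | (1UL << 1);	/* nodes 0 and 1 assumed online */

	int fd = open("/mnt/huge/interleaved", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	char *p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* The policy must be in place before the huge pages are faulted in. */
	if (mbind(p, LENGTH, MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8, 0))
		perror("mbind");

	memset(p, 0, LENGTH);	/* each fault allocates via huge_zonelist() */

	munmap(p, LENGTH);
	close(fd);
	return 0;
}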
Signed-off-by: Christoph Lameter Cc: Andi Kleen Acked-by: William Lee Irwin III Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 24 ++++++++++++++---------- mm/mempolicy.c | 39 ++++++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e93bd63462f..eb405565949 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include @@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page) free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct vm_area_struct *vma, + unsigned long address) { int nid = numa_node_id(); struct page *page = NULL; - struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists; + struct zonelist *zonelist = huge_zonelist(vma, address); struct zone **z; for (z = zonelist->zones; *z; z++) { @@ -87,13 +90,13 @@ void free_huge_page(struct page *page) spin_unlock(&hugetlb_lock); } -struct page *alloc_huge_page(void) +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { struct page *page; int i; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(); + page = dequeue_huge_page(vma, addr); if (!page) { spin_unlock(&hugetlb_lock); return NULL; @@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count) spin_lock(&hugetlb_lock); try_to_free_low(count); while (count < nr_huge_pages) { - struct page *page = dequeue_huge_page(); + struct page *page = dequeue_huge_page(NULL, 0); if (!page) break; update_and_free_page(page); @@ -365,8 +368,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); } -static struct page *find_or_alloc_huge_page(struct address_space *mapping, - unsigned long idx, int shared) +static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, struct address_space *mapping, + unsigned long idx, int shared) { struct page *page; int err; @@ -378,7 +382,7 @@ retry: if (hugetlb_get_quota(mapping)) goto out; - page = alloc_huge_page(); + page = alloc_huge_page(vma, addr); if (!page) { hugetlb_put_quota(mapping); goto out; @@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_get(old_page); - new_page = alloc_huge_page(); + new_page = alloc_huge_page(vma, address); if (!new_page) { page_cache_release(old_page); @@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_or_alloc_huge_page(mapping, idx, + page = find_or_alloc_huge_page(vma, address, mapping, idx, vma->vm_flags & VM_SHARED); if (!page) goto out; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72f402cc9c9..45c51ac6344 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol, return nid; } +/* Determine a node number for interleave */ +static inline unsigned interleave_nid(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long addr, int shift) +{ + if (vma) { + unsigned long off; + + off = vma->vm_pgoff; + off += (addr - vma->vm_start) >> shift; + return offset_il_node(pol, vma, off); + } else + return interleave_nodes(pol); +} + +/* Return a zonelist suitable for a huge page allocation. 
*/ +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(current, vma, addr); + + if (pol->policy == MPOL_INTERLEAVE) { + unsigned nid; + + nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); + return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); + } + return zonelist_policy(GFP_HIGHUSER, pol); +} + /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, @@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; - if (vma) { - unsigned long off; - off = vma->vm_pgoff; - off += (addr - vma->vm_start) >> PAGE_SHIFT; - nid = offset_il_node(pol, vma, off); - } else { - /* fall back to process interleaving */ - nid = interleave_nodes(pol); - } + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); return alloc_page_interleave(gfp, 0, nid); } return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); -- cgit v1.2.3 From 21abb1478a87e26f5fa71dbcb7cf4264272c2248 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:10:47 -0800 Subject: [PATCH] Remove old node based policy interface from mempolicy.c mempolicy.c contains provisional interface for huge page allocation based on node numbers. This is in use in SLES9 but was never used (AFAIK) in upstream versions of Linux. Huge page allocations now use zonelists to figure out where to allocate pages. The use of zonelists allows us to find the closest hugepage which was the consideration of the NUMA distance for huge page allocations. Remove the obsolete functions. Signed-off-by: Christoph Lameter Cc: Andi Kleen Acked-by: William Lee Irwin III Cc: Adam Litke Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 48 ------------------------------------------------ 1 file changed, 48 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 45c51ac6344..96714e2646a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -960,54 +960,6 @@ void __mpol_free(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -/* - * Hugetlb policy. Same as above, just works with node numbers instead of - * zonelists. - */ - -/* Find first node suitable for an allocation */ -int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) -{ - struct mempolicy *pol = get_vma_policy(current, vma, addr); - - switch (pol->policy) { - case MPOL_DEFAULT: - return numa_node_id(); - case MPOL_BIND: - return pol->v.zonelist->zones[0]->zone_pgdat->node_id; - case MPOL_INTERLEAVE: - return interleave_nodes(pol); - case MPOL_PREFERRED: - return pol->v.preferred_node >= 0 ? - pol->v.preferred_node : numa_node_id(); - } - BUG(); - return 0; -} - -/* Find secondary valid nodes for an allocation */ -int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) -{ - struct mempolicy *pol = get_vma_policy(current, vma, addr); - - switch (pol->policy) { - case MPOL_PREFERRED: - case MPOL_DEFAULT: - case MPOL_INTERLEAVE: - return 1; - case MPOL_BIND: { - struct zone **z; - for (z = pol->v.zonelist->zones; *z; z++) - if ((*z)->zone_pgdat->node_id == nid) - return 1; - return 0; - } - default: - BUG(); - return 0; - } -} - /* * Shared memory backing store policy support. 
* -- cgit v1.2.3 From 6bda666a03f063968833760c5bb5c13062ab9291 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:10:49 -0800 Subject: [PATCH] hugepages: fold find_or_alloc_pages into huge_no_page() The number of parameters for find_or_alloc_page increases significantly after policy support is added to huge pages. Simplify the code by folding find_or_alloc_huge_page() into hugetlb_no_page(). Adam Litke objected to this piece in an earlier patch but I think this is a good simplification. Diffstat shows that we can get rid of almost half of the lines of find_or_alloc_page(). If we can find no consensus then lets simply drop this patch. Signed-off-by: Christoph Lameter Cc: Andi Kleen Acked-by: William Lee Irwin III Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 66 ++++++++++++++++++++++-------------------------------------- 1 file changed, 24 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eb405565949..f4c43d7980b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -368,43 +368,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); } -static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr, struct address_space *mapping, - unsigned long idx, int shared) -{ - struct page *page; - int err; - -retry: - page = find_lock_page(mapping, idx); - if (page) - goto out; - - if (hugetlb_get_quota(mapping)) - goto out; - page = alloc_huge_page(vma, addr); - if (!page) { - hugetlb_put_quota(mapping); - goto out; - } - - if (shared) { - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); - if (err) { - put_page(page); - hugetlb_put_quota(mapping); - if (err == -EEXIST) - goto retry; - page = NULL; - } - } else { - /* Caller expects a locked page */ - lock_page(page); - } -out: - return page; -} - static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte) { @@ -471,12 +434,31 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_or_alloc_huge_page(vma, address, mapping, idx, - vma->vm_flags & VM_SHARED); - if (!page) - goto out; +retry: + page = find_lock_page(mapping, idx); + if (!page) { + if (hugetlb_get_quota(mapping)) + goto out; + page = alloc_huge_page(vma, address); + if (!page) { + hugetlb_put_quota(mapping); + goto out; + } - BUG_ON(!PageLocked(page)); + if (vma->vm_flags & VM_SHARED) { + int err; + + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + if (err) { + put_page(page); + hugetlb_put_quota(mapping); + if (err == -EEXIST) + goto retry; + goto out; + } + } else + lock_page(page); + } spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> HPAGE_SHIFT; -- cgit v1.2.3 From a94b3ab7eab4edcc9b2cb474b188f774c331adf7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 6 Jan 2006 00:10:51 -0800 Subject: [PATCH] mm: remove arch independent NODES_SPAN_OTHER_NODES The NODES_SPAN_OTHER_NODES config option was created so that DISCONTIGMEM could handle pSeries numa layouts. However, support for DISCONTIGMEM has been replaced by SPARSEMEM on powerpc. As a result, this config option and supporting code is no longer needed. I have already sent a patch to Paul that removes the option from powerpc specific code. This removes the arch independent piece. 
Doesn't really matter which is applied first. Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1e49dc7cd61..07825c637a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1708,8 +1708,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { if (!early_pfn_valid(pfn)) continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); set_page_count(page, 1); -- cgit v1.2.3 From c484d41042e6ccb88089ca41e3b3eed1bafdae21 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 6 Jan 2006 00:10:55 -0800 Subject: [PATCH] mm: free_pages_and_swap_cache opt Minor optimization (though it doesn't help in the PREEMPT case, severely constrained by small ZAP_BLOCK_SIZE). free_pages_and_swap_cache works in chunks of 16, calling release_pages which works in chunks of PAGEVEC_SIZE. But PAGEVEC_SIZE was dropped from 16 to 14 in 2.6.10, so we're now doing more spin_lock_irq'ing than necessary: use PAGEVEC_SIZE throughout. Signed-off-by: Hugh Dickins Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index 0df9a57b1de..fc2aecb70a9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page) */ void free_pages_and_swap_cache(struct page **pages, int nr) { - int chunk = 16; struct page **pagep = pages; lru_add_drain(); while (nr) { - int todo = min(chunk, nr); + int todo = min(nr, PAGEVEC_SIZE); int i; for (i = 0; i < todo; i++) -- cgit v1.2.3 From c54ad30c784b84d0275152d0ca80985b21471811 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:56 -0800 Subject: [PATCH] mm: pagealloc opt Slightly optimise some page allocation and freeing functions by taking advantage of knowing whether or not interrupts are disabled. 
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07825c637a5..680cbe5b6ba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -375,11 +375,10 @@ static int free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order) { - unsigned long flags; struct page *page = NULL; int ret = 0; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (!list_empty(list) && count--) { @@ -389,12 +388,13 @@ free_pages_bulk(struct zone *zone, int count, __free_pages_bulk(page, zone, order); ret++; } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return ret; } void __free_pages_ok(struct page *page, unsigned int order) { + unsigned long flags; LIST_HEAD(list); int i; int reserved = 0; @@ -415,7 +415,9 @@ void __free_pages_ok(struct page *page, unsigned int order) list_add(&page->lru, &list); mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<lock, flags); + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { page = __rmqueue(zone, order); if (page == NULL) @@ -552,7 +553,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, allocated++; list_add_tail(&page->lru, list); } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return allocated; } @@ -589,6 +590,7 @@ void drain_remote_pages(void) #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { + unsigned long flags; struct zone *zone; int i; @@ -600,8 +602,10 @@ static void __drain_pages(unsigned int cpu) struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; + local_irq_save(flags); pcp->count -= free_pages_bulk(zone, pcp->count, &pcp->list, 0); + local_irq_restore(flags); } } } @@ -744,7 +748,7 @@ again: if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); - if (pcp->count) { + if (likely(pcp->count)) { page = list_entry(pcp->list.next, struct page, lru); list_del(&page->lru); pcp->count--; -- cgit v1.2.3 From 77a8a78834561398fb4cb1480afa7b0e80b1dd53 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:57 -0800 Subject: [PATCH] mm: set_page_refs opt Inline set_page_refs. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 19 +++++++++++++++++-- mm/page_alloc.c | 17 ----------------- 2 files changed, 17 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 6bf134e8fb3..85004f540e3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -9,5 +9,20 @@ * 2 of the License, or (at your option) any later version. */ -/* page_alloc.c */ -extern void set_page_refs(struct page *page, int order); +static inline void set_page_refs(struct page *page, int order) +{ +#ifdef CONFIG_MMU + set_page_count(page, 1); +#else + int i; + + /* + * We need to reference all the pages for this order, otherwise if + * anyone accesses one of the pages with (get/put) it will be freed. 
+ * - eg: access_process_vm() + */ + for (i = 0; i < (1 << order); i++) + set_page_count(page + i, 1); +#endif /* CONFIG_MMU */ +} + diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 680cbe5b6ba..6d513faf050 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -453,23 +453,6 @@ expand(struct zone *zone, struct page *page, return page; } -void set_page_refs(struct page *page, int order) -{ -#ifdef CONFIG_MMU - set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ -} - /* * This page is about to be returned from the page allocator */ -- cgit v1.2.3 From 92be2e33b155ee76399f51f41fb061f850d02f08 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:57 -0800 Subject: [PATCH] mm: microopt conditions Micro optimise some conditionals where we don't need lazy evaluation. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d513faf050..b0647b51527 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -336,9 +336,9 @@ static inline void __free_pages_bulk (struct page *page, static inline int free_pages_check(const char *function, struct page *page) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -348,7 +348,7 @@ static inline int free_pages_check(const char *function, struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved ))) + 1 << PG_reserved )))) bad_page(function, page); if (PageDirty(page)) __ClearPageDirty(page); @@ -458,9 +458,9 @@ expand(struct zone *zone, struct page *page, */ static int prep_new_page(struct page *page, int order) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -471,7 +471,7 @@ static int prep_new_page(struct page *page, int order) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved ))) + 1 << PG_reserved )))) bad_page(__FUNCTION__, page); /* -- cgit v1.2.3 From 13e7444b0ec59f96d81a4e8c379d5f38fc5f2cc1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:58 -0800 Subject: [PATCH] mm: remove bad_range bad_range is supposed to be a temporary check. It would be a pity to throw it out. Make it depend on CONFIG_DEBUG_VM instead. CONFIG_HOLES_IN_ZONE systems were relying on this to check pfn_valid in the page allocator. Add that to page_is_buddy instead. 
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b0647b51527..088712f2ac0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -81,6 +81,7 @@ int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; +#ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { int ret = 0; @@ -122,6 +123,13 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } +#else +static inline int bad_range(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + static void bad_page(const char *function, struct page *page) { printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", @@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is free && - * (b) the buddy is on the buddy system && - * (c) a page and its buddy have the same order. + * (a) the buddy is not in a hole && + * (b) the buddy is free && + * (c) the buddy is on the buddy system && + * (d) a page and its buddy have the same order. * for recording page's order, we use page_private(page) and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) { +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (PagePrivate(page) && (page_order(page) == order) && page_count(page) == 0) @@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page, struct free_area *area; struct page *buddy; - combined_idx = __find_combined_index(page_idx, order); buddy = __page_find_buddy(page, page_idx, order); - - if (bad_range(zone, buddy)) - break; if (!page_is_buddy(buddy, order)) break; /* Move the buddy up one level. */ + list_del(&buddy->lru); area = zone->free_area + order; area->nr_free--; rmv_page_order(buddy); + combined_idx = __find_combined_index(page_idx, order); page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; -- cgit v1.2.3 From 2d92c5c9150a2a9ca3dc25da58d5042e17a96b6a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:59 -0800 Subject: [PATCH] mm: remove pcp low struct per_cpu_pages.low is useless. Remove it. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 088712f2ac0..7cff958e781 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -740,7 +740,7 @@ again: page = NULL; pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) + if (!pcp->count) pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); if (likely(pcp->count)) { @@ -1345,10 +1345,9 @@ void show_free_areas(void) pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", + printk("cpu %d %s: high %d, batch %d used:%d\n", cpu, temperature ? 
"cold" : "hot", - pageset->pcp[temperature].low, pageset->pcp[temperature].high, pageset->pcp[temperature].batch, pageset->pcp[temperature].count); @@ -1790,14 +1789,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp[0]; /* hot */ pcp->count = 0; - pcp->low = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); pcp = &p->pcp[1]; /* cold*/ pcp->count = 0; - pcp->low = 0; pcp->high = 2 * batch; pcp->batch = max(1UL, batch/2); INIT_LIST_HEAD(&pcp->list); @@ -2193,12 +2190,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, "\n cpu: %i pcp: %i" "\n count: %i" - "\n low: %i" "\n high: %i" "\n batch: %i", i, j, pageset->pcp[j].count, - pageset->pcp[j].low, pageset->pcp[j].high, pageset->pcp[j].batch); } -- cgit v1.2.3 From a86b1f53166a260ced8f3c8c526945bf496f2e78 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:00 -0800 Subject: [PATCH] mm: page_state fixes read_page_state and __get_page_state only traverse online CPUs, which will cause results to fluctuate when CPUs are plugged in or out. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cff958e781..379618747de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1169,12 +1169,11 @@ EXPORT_SYMBOL(nr_pagecache); DEFINE_PER_CPU(long, nr_pagecache_local) = 0; #endif -void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) +static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; memset(ret, 0, sizeof(*ret)); - cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { @@ -1227,7 +1226,7 @@ unsigned long __read_page_state(unsigned long offset) unsigned long ret = 0; int cpu; - for_each_online_cpu(cpu) { + for_each_cpu(cpu) { unsigned long in; in = (unsigned long)&per_cpu(page_states, cpu) + offset; -- cgit v1.2.3 From 085cc7d5de3cc662da7ea78296464a0d52f3f01f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:01 -0800 Subject: [PATCH] mm: page_alloc cleanups Small cleanups that does not change generated code with the gcc's I've tested with. 
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 379618747de..925b0b985f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -447,8 +447,7 @@ void __free_pages_ok(struct page *page, unsigned int order) * * -- wli */ -static inline struct page * -expand(struct zone *zone, struct page *page, +static inline void expand(struct zone *zone, struct page *page, int low, int high, struct free_area *area) { unsigned long size = 1 << high; @@ -462,7 +461,6 @@ expand(struct zone *zone, struct page *page, area->nr_free++; set_page_order(&page[size], high); } - return page; } /* @@ -522,7 +520,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) rmv_page_order(page); area->nr_free--; zone->free_pages -= 1UL << order; - return expand(zone, page, order, current_order, area); + expand(zone, page, order, current_order, area); + return page; } return NULL; @@ -537,19 +536,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list) { int i; - int allocated = 0; - struct page *page; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + struct page *page = __rmqueue(zone, order); + if (unlikely(page == NULL)) break; - allocated++; list_add_tail(&page->lru, list); } spin_unlock(&zone->lock); - return allocated; + return i; } #ifdef CONFIG_NUMA -- cgit v1.2.3 From 008857c1a49ccffc31a54c3ea7e182833bd61304 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Fri, 6 Jan 2006 00:11:01 -0800 Subject: [PATCH] Cleanup bootmem allocator and fix alloc_bootmem_low Patch cleans up the alloc_bootmem fix for swiotlb. Patch removes alloc_bootmem_*_limit api and fixes alloc_boot_*low api to do the right thing -- allocate from low32 memory. 
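A minimal usage sketch, not taken from the patch: a boot-time caller that genuinely needs memory below 4GB (the swiotlb bounce buffer is the motivating case) now calls __alloc_bootmem_low() directly. The function panics rather than returning NULL, so no error path is needed. The pool size and function name below are illustrative only.

        #include <linux/bootmem.h>
        #include <linux/string.h>

        #define DMA_POOL_BYTES  (1UL << 20)     /* 1MB, illustrative */

        void __init early_dma_pool_init(void)
        {
                void *pool;

                /* page-aligned, no preferred goal, guaranteed below 4GB */
                pool = __alloc_bootmem_low(DMA_POOL_BYTES, PAGE_SIZE, 0);
                memset(pool, 0, DMA_POOL_BYTES);
        }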
Signed-off-by: Ravikiran Thirumalai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 16b9465eb4e..cbb82ee14fb 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -393,15 +393,14 @@ unsigned long __init free_all_bootmem (void) return(free_all_bootmem_core(NODE_DATA(0))); } -void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit) +void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { pg_data_t *pgdat = pgdat_list; void *ptr; for_each_pgdat(pgdat) if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal, limit))) + align, goal, 0))) return(ptr); /* @@ -413,15 +412,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un } -void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, + unsigned long goal) { void *ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return (ptr); - return __alloc_bootmem_limit(size, align, goal, limit); + return __alloc_bootmem(size, align, goal); } +#define LOW32LIMIT 0xffffffff + +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) +{ + pg_data_t *pgdat = pgdat_list; + void *ptr; + + for_each_pgdat(pgdat) + if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, + align, goal, LOW32LIMIT))) + return(ptr); + + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size); + panic("Out of low memory"); + return NULL; +} + +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); +} -- cgit v1.2.3 From a226f6c899799fe2c4919daa0767ac579c88f7bd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Jan 2006 00:11:08 -0800 Subject: [PATCH] FRV: Clean up bootmem allocator's page freeing algorithm The attached patch cleans up the way the bootmem allocator frees pages. A new function, __free_pages_bootmem(), is provided in mm/page_alloc.c that is called from mm/bootmem.c to turn pages over to the main allocator. All the bits of code to initialise pages (clearing PG_reserved and setting the page count) are moved to here. The checks on page validity are removed, on the assumption that the struct page arrays will have been prepared correctly. 
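For reference, the fast path in free_all_bootmem_core() (visible in the diff below) hands a whole bitmap word to the allocator at once: with BITS_PER_LONG = 64, ffs(64) - 1 = 6 and 1 << 6 = 64, so a completely free word becomes a single order-6 block; on a 32-bit machine the same computation gives order 5, i.e. 32 pages.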
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 20 ++++---------------- mm/internal.h | 2 ++ mm/page_alloc.c | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index cbb82ee14fb..35c32290f71 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) unsigned long v = ~map[i / BITS_PER_LONG]; if (gofast && v == ~0UL) { - int j, order; + int order; page = pfn_to_page(pfn); count += BITS_PER_LONG; - __ClearPageReserved(page); order = ffs(BITS_PER_LONG) - 1; - set_page_refs(page, order); - for (j = 1; j < BITS_PER_LONG; j++) { - if (j + 16 < BITS_PER_LONG) - prefetchw(page + j + 16); - __ClearPageReserved(page + j); - set_page_count(page + j, 0); - } - __free_pages(page, order); + __free_pages_bootmem(page, order); i += BITS_PER_LONG; page += BITS_PER_LONG; } else if (v) { @@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) for (m = 1; m && i < idx; m<<=1, page++, i++) { if (v & m) { count++; - __ClearPageReserved(page); - set_page_refs(page, 0); - __free_page(page); + __free_pages_bootmem(page, 0); } } } else { @@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) count = 0; for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { count++; - __ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); + __free_pages_bootmem(page, 0); } total += count; bdata->node_bootmem_map = NULL; diff --git a/mm/internal.h b/mm/internal.h index 85004f540e3..17256bb2f4e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -26,3 +26,5 @@ static inline void set_page_refs(struct page *page, int order) #endif /* CONFIG_MMU */ } +extern void fastcall __init __free_pages_bootmem(struct page *page, + unsigned int order); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 925b0b985f7..cdad3249cf7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,8 @@ unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; +static void fastcall free_hot_cold_page(struct page *page, int cold); + /* * results with 256, 32 in the lowmem_reserve sysctl: * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) @@ -432,6 +434,39 @@ void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } +/* + * permit the bootmem allocator to evade page validation on high-order frees + */ +void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) +{ + if (order == 0) { + __ClearPageReserved(page); + set_page_count(page, 0); + + free_hot_cold_page(page, 0); + } else { + LIST_HEAD(list); + int loop; + + for (loop = 0; loop < BITS_PER_LONG; loop++) { + struct page *p = &page[loop]; + + if (loop + 16 < BITS_PER_LONG) + prefetchw(p + 16); + __ClearPageReserved(p); + set_page_count(p, 0); + } + + arch_free_page(page, order); + + mod_page_state(pgfree, 1 << order); + + list_add(&page->lru, &list); + kernel_map_pages(page, 1 << order, 0); + free_pages_bulk(page_zone(page), 1, &list, order); + } +} + /* * The order of subdivision here is critical for the IO subsystem. 
@@ -671,7 +706,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) /* * Free a 0-order page */ -static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); static void fastcall free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); -- cgit v1.2.3 From bbfbb7cec9dd7266534b2b4b9c8be2fa425bbfc9 Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Fri, 6 Jan 2006 00:11:08 -0800 Subject: [PATCH] find_lock_page(): call __lock_page() directly. As find_lock_page() already checks with TestSetPageLocked() that page is locked, there is no need to call lock_page() that will try-lock page again (chances of page being unlocked in between are small). Call __lock_page() directly, this saves one atomic operation. Also, mark truncate-while-slept path as unlikely while we are here. (akpm: ug. But this is actually a common path for normal old read()s against a page which is under readahead I/O so ho-hum.) Signed-off-by: Nikita Danilov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 6e1d08a2b8b..4ef24a39768 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -555,11 +555,12 @@ repeat: page_cache_get(page); if (TestSetPageLocked(page)) { read_unlock_irq(&mapping->tree_lock); - lock_page(page); + __lock_page(page); read_lock_irq(&mapping->tree_lock); /* Has the page been truncated while we slept? */ - if (page->mapping != mapping || page->index != offset) { + if (unlikely(page->mapping != mapping || + page->index != offset)) { unlock_page(page); page_cache_release(page); goto repeat; -- cgit v1.2.3 From 7756b9e4e321c3c83c7aa5b9532d3e7fd7ddeb4a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Jan 2006 00:11:09 -0800 Subject: [PATCH] kill last zone_reclaim() bits Remove the last bits of Martin's ill-fated sys_set_zone_reclaim(). Cc: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 80 ------------------------------------------------------------- 1 file changed, 80 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 795a050fe47..b2baca7645d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -74,9 +74,6 @@ struct scan_control { int may_writepage; - /* Can pages be swapped as part of reclaim? */ - int may_swap; - /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. * In this context, it doesn't matter that we scan the @@ -430,8 +427,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { - if (!sc->may_swap) - goto keep_locked; if (!add_to_swap(page)) goto activate_locked; } @@ -952,7 +947,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.gfp_mask = gfp_mask; sc.may_writepage = 0; - sc.may_swap = 1; inc_page_state(allocstall); @@ -1055,7 +1049,6 @@ loop_again: total_reclaimed = 0; sc.gfp_mask = GFP_KERNEL; sc.may_writepage = 0; - sc.may_swap = 1; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1353,76 +1346,3 @@ static int __init kswapd_init(void) } module_init(kswapd_init) - - -/* - * Try to free up some pages from this zone through reclaim. 
- */ -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) -{ - struct scan_control sc; - int nr_pages = 1 << order; - int total_reclaimed = 0; - - /* The reclaim may sleep, so don't do it if sleep isn't allowed */ - if (!(gfp_mask & __GFP_WAIT)) - return 0; - if (zone->all_unreclaimable) - return 0; - - sc.gfp_mask = gfp_mask; - sc.may_writepage = 0; - sc.may_swap = 0; - sc.nr_mapped = read_page_state(nr_mapped); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - /* scan at the highest priority */ - sc.priority = 0; - disable_swap_token(); - - if (nr_pages > SWAP_CLUSTER_MAX) - sc.swap_cluster_max = nr_pages; - else - sc.swap_cluster_max = SWAP_CLUSTER_MAX; - - /* Don't reclaim the zone if there are other reclaimers active */ - if (atomic_read(&zone->reclaim_in_progress) > 0) - goto out; - - shrink_zone(zone, &sc); - total_reclaimed = sc.nr_reclaimed; - - out: - return total_reclaimed; -} - -asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, - unsigned int state) -{ - struct zone *z; - int i; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (node >= MAX_NUMNODES || !node_online(node)) - return -EINVAL; - - /* This will break if we ever add more zones */ - if (!(zone & (1<node_zones[i]; - - if (state) - z->reclaim_pages = 1; - else - z->reclaim_pages = 0; - } - - return 0; -} -- cgit v1.2.3 From 9328b8faae922e52073785ed6c1eaa8565648a0e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:10 -0800 Subject: [PATCH] mm: dma32 zone statistics Add dma32 to zone statistics. Also attempt to arrange struct page_state a bit better (visually). Signed-off-by: Nick Piggin Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cdad3249cf7..e12154d9c4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2277,32 +2277,40 @@ static char *vmstat_text[] = { "pgpgout", "pswpin", "pswpout", - "pgalloc_high", + "pgalloc_high", "pgalloc_normal", + "pgalloc_dma32", "pgalloc_dma", + "pgfree", "pgactivate", "pgdeactivate", "pgfault", "pgmajfault", + "pgrefill_high", "pgrefill_normal", + "pgrefill_dma32", "pgrefill_dma", "pgsteal_high", "pgsteal_normal", + "pgsteal_dma32", "pgsteal_dma", + "pgscan_kswapd_high", "pgscan_kswapd_normal", - + "pgscan_kswapd_dma32", "pgscan_kswapd_dma", + "pgscan_direct_high", "pgscan_direct_normal", + "pgscan_direct_dma32", "pgscan_direct_dma", - "pginodesteal", + "pginodesteal", "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", -- cgit v1.2.3 From 224abf92b2f439a9030f21d2926ec8047d1ffcdb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:11 -0800 Subject: [PATCH] mm: bad_page optimisation Cut down size slightly by not passing bad_page the function name (it should be able to be determined by dump_stack()). And cut down the number of printks in bad_page. Also, cut down some branching in the destroy_compound_page path. 
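The branch reduction relies on the same trick as the earlier "microopt conditions" patch in this series: when every operand is a cheap test with no side effects, the checks can be folded with bitwise '|' rather than short-circuiting '||', so the compiler emits a single unlikely() branch. A standalone illustration, with made-up names:

        /* Illustrative only: fold independent, side-effect-free checks
         * into one branch.  Each comparison already yields 0 or 1, so
         * bitwise OR gives the same truth value as logical OR without
         * the extra conditional jumps. */
        static inline int page_state_is_bad(int mapcount, int count,
                        unsigned long flags, unsigned long bad_flags)
        {
                return (mapcount != 0) |
                       (count != 0) |
                       ((flags & bad_flags) != 0);
        }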
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e12154d9c4e..b9fd2c238f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -132,16 +132,16 @@ static inline int bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(const char *function, struct page *page) -{ - printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", - function, current->comm, page); - printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, - page->mapping, page_mapcount(page), page_count(page)); - printk(KERN_EMERG "Backtrace:\n"); +static void bad_page(struct page *page) +{ + printk(KERN_EMERG "Bad page state in process '%s'\n" + "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" + "Trying to fix it up, but a reboot is needed\n" + "Backtrace:\n", + current->comm, page, (int)(2*sizeof(unsigned long)), + (unsigned long)page->flags, page->mapping, + page_mapcount(page), page_count(page)); dump_stack(); - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); page->flags &= ~(1 << PG_lru | 1 << PG_private | 1 << PG_locked | @@ -194,19 +194,15 @@ static void destroy_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - if (!PageCompound(page)) - return; - - if (page[1].index != order) - bad_page(__FUNCTION__, page); + if (unlikely(page[1].index != order)) + bad_page(page); for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - if (!PageCompound(p)) - bad_page(__FUNCTION__, page); - if (page_private(p) != (unsigned long)page) - bad_page(__FUNCTION__, page); + if (unlikely(!PageCompound(p) | + (page_private(p) != (unsigned long)page))) + bad_page(page); ClearPageCompound(p); } } @@ -316,7 +312,7 @@ static inline void __free_pages_bulk (struct page *page, unsigned long page_idx; int order_size = 1 << order; - if (unlikely(order)) + if (unlikely(PageCompound(page))) destroy_compound_page(page, order); page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); @@ -348,7 +344,7 @@ static inline void __free_pages_bulk (struct page *page, zone->free_area[order].nr_free++; } -static inline int free_pages_check(const char *function, struct page *page) +static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -363,7 +359,7 @@ static inline int free_pages_check(const char *function, struct page *page) 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved )))) - bad_page(function, page); + bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); /* @@ -422,7 +418,7 @@ void __free_pages_ok(struct page *page, unsigned int order) #endif for (i = 0 ; i < (1 << order) ; ++i) - reserved += free_pages_check(__FUNCTION__, page + i); + reserved += free_pages_check(page + i); if (reserved) return; @@ -517,7 +513,7 @@ static int prep_new_page(struct page *page, int order) 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved )))) - bad_page(__FUNCTION__, page); + bad_page(page); /* * For now, we report if PG_reserved was found set, but do not @@ -716,7 +712,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (PageAnon(page)) page->mapping = NULL; - if (free_pages_check(__FUNCTION__, page)) + if (free_pages_check(page)) return; inc_page_state(pgfree); -- 
cgit v1.2.3 From 9617d95e6e9ffd883cf90a89724fe60d7ab22f9a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:12 -0800 Subject: [PATCH] mm: rmap optimisation Optimise rmap functions by minimising atomic operations when we know there will be no concurrent modifications. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 +++--- mm/rmap.c | 49 ++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 41 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e249088908c..d7ca7de10f4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1498,7 +1498,7 @@ gotten: update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); lru_cache_add_active(new_page); - page_add_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address); /* Free the old page.. */ new_page = old_page; @@ -1978,7 +1978,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); SetPageReferenced(page); - page_add_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address); } else { /* Map the ZERO_PAGE - vm_page_prot is readonly */ page = ZERO_PAGE(address); @@ -2109,7 +2109,7 @@ retry: if (anon) { inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); - page_add_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address); } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); diff --git a/mm/rmap.c b/mm/rmap.c index f853c6def15..4107f64ff74 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -434,6 +434,26 @@ int page_referenced(struct page *page, int is_locked) return referenced; } +/** + * page_set_anon_rmap - setup new anonymous rmap + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + */ +static void __page_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + BUG_ON(!anon_vma); + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + + page->index = linear_page_index(vma, address); + + inc_page_state(nr_mapped); +} + /** * page_add_anon_rmap - add pte mapping to an anonymous page * @page: the page to add the mapping to @@ -445,20 +465,27 @@ int page_referenced(struct page *page, int is_locked) void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - if (atomic_inc_and_test(&page->_mapcount)) { - struct anon_vma *anon_vma = vma->anon_vma; - - BUG_ON(!anon_vma); - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - page->mapping = (struct address_space *) anon_vma; - - page->index = linear_page_index(vma, address); - - inc_page_state(nr_mapped); - } + if (atomic_inc_and_test(&page->_mapcount)) + __page_set_anon_rmap(page, vma, address); /* else checking page index and mapping is racy */ } +/* + * page_add_new_anon_rmap - add pte mapping to a new anonymous page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + * + * Same as page_add_anon_rmap but must only be called on *new* pages. + * This means the inc-and-test can be bypassed. 
+ */ +void page_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ + __page_set_anon_rmap(page, vma, address); +} + /** * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to -- cgit v1.2.3 From 41e9b63b35b52cf918a4ffdb8d77862ab824aa8b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:13 -0800 Subject: [PATCH] mm: pfault optimisation This atomic operation is superfluous: the pte will be added with the referenced bit set, and the page will be referenced through this mapping after the page fault handler returns anyway. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index d7ca7de10f4..7197f9bcd38 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1977,7 +1977,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto release; inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); - SetPageReferenced(page); page_add_new_anon_rmap(page, vma, address); } else { /* Map the ZERO_PAGE - vm_page_prot is readonly */ -- cgit v1.2.3 From 210fe530305ee50cd889fe9250168228b2994f32 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Jan 2006 00:11:14 -0800 Subject: [PATCH] vmscan: balancing fix Revert a patch which went into 2.6.8-rc1. The changelog for that patch was: The shrink_zone() logic can, under some circumstances, cause far too many pages to be reclaimed. Say, we're scanning at high priority and suddenly hit a large number of reclaimable pages on the LRU. Change things so we bale out when SWAP_CLUSTER_MAX pages have been reclaimed. Problem is, this change caused significant imbalance in inter-zone scan balancing by truncating scans of larger zones. Suppose, for example, ZONE_HIGHMEM is 10x the size of ZONE_NORMAL. The zone balancing algorithm would require that if we're scanning 100 pages of ZONE_HIGHMEM, we should scan 10 pages of ZONE_NORMAL. But this logic will cause the scanning of ZONE_HIGHMEM to bale out after only 32 pages are reclaimed. Thus effectively causing smaller zones to be scanned relatively harder than large ones. Now I need to remember what the workload was which caused me to write this patch originally, then fix it up in a different way... 
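For context, the balancing the changelog refers to sizes each zone's scan target proportionally to the zone, roughly as in the sketch below (simplified; the real shrink_zone() accumulates a per-zone counter). Bailing out after a fixed 32 reclaimed pages therefore truncates the large zone's much bigger target while the small zone's target is met in full, which is the imbalance being reverted.

        /* Simplified sketch of the proportional target, not the exact
         * shrink_zone() code: a zone ten times larger gets a target ten
         * times larger at every priority level. */
        static unsigned long scan_target(unsigned long zone_lru_pages, int priority)
        {
                return (zone_lru_pages >> priority) + 1;
        }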
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index b2baca7645d..5c8a412b43f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -63,9 +63,6 @@ struct scan_control { unsigned long nr_mapped; /* From page_state */ - /* How many pages shrink_cache() should reclaim */ - int nr_to_reclaim; - /* Ask shrink_caches, or shrink_zone to scan at this priority */ unsigned int priority; @@ -656,7 +653,6 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) if (current_is_kswapd()) mod_page_state(kswapd_steal, nr_freed); mod_page_state_zone(zone, pgsteal, nr_freed); - sc->nr_to_reclaim -= nr_freed; spin_lock_irq(&zone->lru_lock); /* @@ -856,8 +852,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) else nr_inactive = 0; - sc->nr_to_reclaim = sc->swap_cluster_max; - while (nr_active || nr_inactive) { if (nr_active) { sc->nr_to_scan = min(nr_active, @@ -871,8 +865,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) (unsigned long)sc->swap_cluster_max); nr_inactive -= sc->nr_to_scan; shrink_cache(zone, sc); - if (sc->nr_to_reclaim <= 0) - break; } } -- cgit v1.2.3 From 80bfed904c690642db9d4178950735299160950b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Jan 2006 00:11:14 -0800 Subject: [PATCH] consolidate lru_add_drain() and lru_drain_cache() Cc: Christoph Lameter Cc: Rajesh Shah Cc: Li Shaohua Cc: Srivatsa Vaddagiri Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 73d351439ef..ee6d71ccfa5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page) put_cpu_var(lru_add_active_pvecs); } -void lru_add_drain(void) +static void __lru_add_drain(int cpu) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); + /* CPU is dead, so no locking needed. */ if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &__get_cpu_var(lru_add_active_pvecs); + pvec = &per_cpu(lru_add_active_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_pvecs); +} + +void lru_add_drain(void) +{ + __lru_add_drain(get_cpu()); + put_cpu(); } /* @@ -412,17 +418,6 @@ void vm_acct_memory(long pages) } #ifdef CONFIG_HOTPLUG_CPU -static void lru_drain_cache(unsigned int cpu) -{ - struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); - - /* CPU is dead, so no locking needed. */ - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); -} /* Drop the CPU's cached committed space back into the central pool. */ static int cpu_swap_callback(struct notifier_block *nfb, @@ -435,7 +430,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, if (action == CPU_DEAD) { atomic_add(*committed, &vm_committed_space); *committed = 0; - lru_drain_cache((long)hcpu); + __lru_add_drain((long)hcpu); } return NOTIFY_OK; } -- cgit v1.2.3 From f3fe65122da05e1cd4c9140340d96ea2f95d0c49 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 6 Jan 2006 00:11:15 -0800 Subject: [PATCH] mm: add populated_zone() helper There are numerous places we check whether a zone is populated or not. 
Provide a helper function to check for populated zones and convert all checks for zone->present_pages. Signed-off-by: Con Kolivas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- mm/vmscan.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b9fd2c238f1..8f3de5af92d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1358,7 +1358,7 @@ void show_free_areas(void) show_node(zone); printk("%s per-cpu:", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk(" empty\n"); continue; } else @@ -1435,7 +1435,7 @@ void show_free_areas(void) show_node(zone); printk("%s: ", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk("empty\n"); continue; } @@ -2134,7 +2134,7 @@ static int frag_show(struct seq_file *m, void *arg) int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!zone->present_pages) + if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2167,7 +2167,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { int i; - if (!zone->present_pages) + if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5c8a412b43f..7681d8ee04f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -897,7 +897,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - if (zone->present_pages == 0) + if (!populated_zone(zone)) continue; if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) @@ -1069,7 +1069,7 @@ loop_again: for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; - if (zone->present_pages == 0) + if (!populated_zone(zone)) continue; if (zone->all_unreclaimable && @@ -1106,7 +1106,7 @@ scan: struct zone *zone = pgdat->node_zones + i; int nr_slab; - if (zone->present_pages == 0) + if (!populated_zone(zone)) continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) @@ -1258,7 +1258,7 @@ void wakeup_kswapd(struct zone *zone, int order) { pg_data_t *pgdat; - if (zone->present_pages == 0) + if (!populated_zone(zone)) return; pgdat = zone->zone_pgdat; -- cgit v1.2.3 From 1a93205bdffd9d7278d4a66081cdb48452522a58 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:16 -0800 Subject: [PATCH] mm: simplify build_zonelists_node by removing the case statement. Simplify build_zonelists_node by removing the case statement. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f3de5af92d..7adc9526d32 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1455,35 +1455,23 @@ void show_free_areas(void) /* * Builds allocation fallback zone lists. + * + * Add all populated zones of a node to the zonelist. 
*/ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) -{ - switch (k) { - struct zone *zone; - default: - BUG(); - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { +static int __init build_zonelists_node(pg_data_t *pgdat, + struct zonelist *zonelist, int j, int k) +{ + struct zone *zone; + + BUG_ON(k > ZONE_HIGHMEM); + for (zone = pgdat->node_zones + k; zone >= pgdat->node_zones; zone--) { + if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG(); + BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL); #endif zonelist->zones[j++] = zone; } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA32: - zone = pgdat->node_zones + ZONE_DMA32; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) - zonelist->zones[j++] = zone; } - return j; } -- cgit v1.2.3 From 4be38e351c5f455f6f490f5aff29053e33ab4f99 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:17 -0800 Subject: [PATCH] mm: move determination of policy_zone into page allocator Currently the function to build a zonelist for a BIND policy has the side effect to set the policy_zone. This seems to be a bit strange. policy zone seems to not be initialized elsewhere and therefore 0. Do we police ZONE_DMA if no bind policy has been used yet? This patch moves the determination of the zone to apply policies to into the page allocator. We determine the zone while building the zonelist for nodes. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 15 +++------------ mm/page_alloc.c | 2 ++ 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 96714e2646a..0f1d2b8a952 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -93,7 +93,7 @@ static kmem_cache_t *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ -static int policy_zone; +int policy_zone = ZONE_DMA; struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ @@ -131,17 +131,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) if (!zl) return NULL; num = 0; - for_each_node_mask(nd, *nodes) { - int k; - for (k = MAX_NR_ZONES-1; k >= 0; k--) { - struct zone *z = &NODE_DATA(nd)->node_zones[k]; - if (!z->present_pages) - continue; - zl->zones[num++] = z; - if (k > policy_zone) - policy_zone = k; - } - } + for_each_node_mask(nd, *nodes) + zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; zl->zones[num] = NULL; return zl; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7adc9526d32..512e3f4d496 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1470,6 +1471,7 @@ static int __init build_zonelists_node(pg_data_t *pgdat, BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL); #endif zonelist->zones[j++] = zone; + check_highest_zone(k); } } return j; -- cgit v1.2.3 From 02a68a5ebc7dd823da7496116f42290103e1e4a9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:18 -0800 Subject: [PATCH] Fix zone policy determination The use k in the inner loop means that the highest zone nr is always used if any zone of a node is populated. 
This means that the policy zone is not correctly determined on arches that do no use HIGHMEM like ia64. Change the loop to decrement k which also simplifies the BUG_ON. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 512e3f4d496..ca978992c89 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1465,15 +1465,19 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zone *zone; BUG_ON(k > ZONE_HIGHMEM); - for (zone = pgdat->node_zones + k; zone >= pgdat->node_zones; zone--) { + + do { + zone = pgdat->node_zones + k; if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL); + BUG_ON(k > ZONE_NORMAL); #endif zonelist->zones[j++] = zone; check_highest_zone(k); } - } + k--; + + } while (k >= 0); return j; } -- cgit v1.2.3 From 070f80326a215d8e6c4fd6f175e28eb446c492bc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:19 -0800 Subject: [PATCH] build_zonelists_node(): rename args Give j and r meaningful names. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca978992c89..7f580779abd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1460,25 +1460,25 @@ void show_free_areas(void) * Add all populated zones of a node to the zonelist. */ static int __init build_zonelists_node(pg_data_t *pgdat, - struct zonelist *zonelist, int j, int k) + struct zonelist *zonelist, int nr_zones, int zone_type) { struct zone *zone; - BUG_ON(k > ZONE_HIGHMEM); + BUG_ON(zone_type > ZONE_HIGHMEM); do { - zone = pgdat->node_zones + k; + zone = pgdat->node_zones + zone_type; if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG_ON(k > ZONE_NORMAL); + BUG_ON(zone_type > ZONE_NORMAL); #endif - zonelist->zones[j++] = zone; - check_highest_zone(k); + zonelist->zones[nr_zones++] = zone; + check_highest_zone(zone_type); } - k--; + zone_type--; - } while (k >= 0); - return j; + } while (zone_type >= 0); + return nr_zones; } static inline int highest_zone(int zone_bits) -- cgit v1.2.3 From a74609fafa2e5cc31d558012abaaa55ec9ad9da4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:20 -0800 Subject: [PATCH] mm: page_state opt Optimise page_state manipulations by introducing interrupt unsafe accessors to page_state fields. Callers must provide their own locking (either disable interrupts or not update from interrupt context). Switch over the hot callsites that can easily be moved under interrupts off sections. 
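A short sketch of the calling convention the new accessors assume (not code from the patch): the caller is responsible for having interrupts off, here by taking an irq-safe lock it needs anyway, and can then use the cheap non-atomic __mod_page_state_zone()/__mod_page_state() forms instead of the irq-disabling ones. The helper name is hypothetical; the counters are ones the patch itself touches.

        #include <linux/mm.h>
        #include <linux/page-flags.h>
        #include <linux/spinlock.h>

        /* Illustrative helper: account one reclaim pass while
         * zone->lru_lock already has interrupts disabled for us. */
        static void note_reclaim_stats(struct zone *zone,
                        unsigned long nr_scanned, unsigned long nr_freed)
        {
                unsigned long flags;

                spin_lock_irqsave(&zone->lru_lock, flags);  /* irqs now off */
                __mod_page_state_zone(zone, pgscan_direct, nr_scanned);
                __mod_page_state_zone(zone, pgsteal, nr_freed);
                spin_unlock_irqrestore(&zone->lru_lock, flags);
        }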
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 89 ++++++++++++++++++++++++++++++++------------------------- mm/rmap.c | 10 +++++-- mm/vmscan.c | 27 +++++++++-------- 3 files changed, 72 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7f580779abd..fd47494cb98 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -424,9 +424,9 @@ void __free_pages_ok(struct page *page, unsigned int order) return; list_add(&page->lru, &list); - mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<zone_pgdat; pg_data_t *orig = zonelist->zones[0]->zone_pgdat; struct per_cpu_pageset *p; - local_irq_save(flags); - cpu = smp_processor_id(); - p = zone_pcp(z,cpu); + p = zone_pcp(z, cpu); if (pg == orig) { p->numa_hit++; } else { @@ -696,7 +692,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) p->local_node++; else p->other_node++; - local_irq_restore(flags); #endif } @@ -716,11 +711,11 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (free_pages_check(page)) return; - inc_page_state(pgfree); kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); + __inc_page_state(pgfree); list_add(&page->lru, &pcp->list); pcp->count++; if (pcp->count >= pcp->high) @@ -753,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page * -buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) +static struct page *buffered_rmqueue(struct zonelist *zonelist, + struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); + int cpu; again: + cpu = get_cpu(); if (order == 0) { struct per_cpu_pages *pcp; - page = NULL; - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); - if (!pcp->count) + if (!pcp->count) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); - if (likely(pcp->count)) { - page = list_entry(pcp->list.next, struct page, lru); - list_del(&page->lru); - pcp->count--; + if (unlikely(!pcp->count)) + goto failed; } - local_irq_restore(flags); - put_cpu(); + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); + if (!page) + goto failed; } - if (page != NULL) { - BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); - if (prep_new_page(page, order)) - goto again; + __mod_page_state_zone(zone, pgalloc, 1 << order); + zone_statistics(zonelist, zone, cpu); + local_irq_restore(flags); + put_cpu(); - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + BUG_ON(bad_range(zone, page)); + if (prep_new_page(page, order)) + goto again; - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); - } + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); return page; + +failed: + local_irq_restore(flags); + put_cpu(); + return NULL; } #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ @@ -871,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, continue; 
} - page = buffered_rmqueue(*z, order, gfp_mask); + page = buffered_rmqueue(zonelist, *z, order, gfp_mask); if (page) { - zone_statistics(zonelist, *z); break; } } while (*(++z) != NULL); @@ -1248,7 +1251,7 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); } -unsigned long __read_page_state(unsigned long offset) +unsigned long read_page_state_offset(unsigned long offset) { unsigned long ret = 0; int cpu; @@ -1262,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset) return ret; } -void __mod_page_state(unsigned long offset, unsigned long delta) +void __mod_page_state_offset(unsigned long offset, unsigned long delta) +{ + void *ptr; + + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; +} +EXPORT_SYMBOL(__mod_page_state_offset); + +void mod_page_state_offset(unsigned long offset, unsigned long delta) { unsigned long flags; - void* ptr; + void *ptr; local_irq_save(flags); ptr = &__get_cpu_var(page_states); - *(unsigned long*)(ptr + offset) += delta; + *(unsigned long *)(ptr + offset) += delta; local_irq_restore(flags); } - -EXPORT_SYMBOL(__mod_page_state); +EXPORT_SYMBOL(mod_page_state_offset); void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) diff --git a/mm/rmap.c b/mm/rmap.c index 4107f64ff74..6f3f7db2712 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -451,7 +451,11 @@ static void __page_set_anon_rmap(struct page *page, page->index = linear_page_index(vma, address); - inc_page_state(nr_mapped); + /* + * nr_mapped state can be updated without turning off + * interrupts because it is not modified via interrupt. + */ + __inc_page_state(nr_mapped); } /** @@ -498,7 +502,7 @@ void page_add_file_rmap(struct page *page) BUG_ON(!pfn_valid(page_to_pfn(page))); if (atomic_inc_and_test(&page->_mapcount)) - inc_page_state(nr_mapped); + __inc_page_state(nr_mapped); } /** @@ -522,7 +526,7 @@ void page_remove_rmap(struct page *page) */ if (page_test_and_clear_dirty(page)) set_page_dirty(page); - dec_page_state(nr_mapped); + __dec_page_state(nr_mapped); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 7681d8ee04f..be8235fb193 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -645,16 +645,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) goto done; max_scan -= nr_scan; - if (current_is_kswapd()) - mod_page_state_zone(zone, pgscan_kswapd, nr_scan); - else - mod_page_state_zone(zone, pgscan_direct, nr_scan); nr_freed = shrink_list(&page_list, sc); - if (current_is_kswapd()) - mod_page_state(kswapd_steal, nr_freed); - mod_page_state_zone(zone, pgsteal, nr_freed); - spin_lock_irq(&zone->lru_lock); + local_irq_disable(); + if (current_is_kswapd()) { + __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); + __mod_page_state(kswapd_steal, nr_freed); + } else + __mod_page_state_zone(zone, pgscan_direct, nr_scan); + __mod_page_state_zone(zone, pgsteal, nr_freed); + + spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. 
*/ @@ -816,11 +817,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } } zone->nr_active += pgmoved; - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); + spin_unlock(&zone->lru_lock); + + __mod_page_state_zone(zone, pgrefill, pgscanned); + __mod_page_state(pgdeactivate, pgdeactivate); + local_irq_enable(); - mod_page_state_zone(zone, pgrefill, pgscanned); - mod_page_state(pgdeactivate, pgdeactivate); + pagevec_release(&pvec); } /* -- cgit v1.2.3 From b0e15190ead07056ab0c3844a499ff35e66d27cc Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Jan 2006 00:11:42 -0800 Subject: [PATCH] NOMMU: Make SYSV IPC SHM use ramfs facilities on NOMMU The attached patch makes the SYSV IPC shared memory facilities use the new ramfs facilities on a no-MMU kernel. The following changes are made: (1) There are now shmem_mmap() and shmem_get_unmapped_area() functions to allow the IPC SHM facilities to commune with the tiny-shmem and shmem code. (2) ramfs files now need resizing using do_truncate() rather than by modifying the inode size directly (see shmem_file_setup()). This causes ramfs to attempt to bind a block of pages of sufficient size to the inode. (3) CONFIG_SYSVIPC is no longer contingent on CONFIG_MMU. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 7 +++++++ mm/shmem.c | 2 +- mm/tiny-shmem.c | 29 ++++++++++++++++++++++++++++- 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index c1196812876..c10262d6823 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } + +struct page *filemap_nopage(struct vm_area_struct *area, + unsigned long address, int *type) +{ + BUG(); + return NULL; +} diff --git a/mm/shmem.c b/mm/shmem.c index 65c148efa2e..a1f2f02af72 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1270,7 +1270,7 @@ out_nomem: return retval; } -static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index b58abcf44ed..cdc6d431972 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) goto close_file; d_instantiate(dentry, inode); - inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ + file->f_vfsmnt = mntget(shm_mnt); file->f_dentry = dentry; file->f_mapping = inode->i_mapping; file->f_op = &ramfs_file_operations; file->f_mode = FMODE_WRITE | FMODE_READ; + + /* notify everyone as to the change of file size */ + error = do_truncate(dentry, size, file); + if (error < 0) + goto close_file; + return file; close_file: @@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page) { return 0; } + +int shmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); +#ifndef CONFIG_MMU + return ramfs_nommu_mmap(file, vma); +#else + return 0; +#endif +} + +#ifndef CONFIG_MMU +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); +} +#endif -- cgit v1.2.3 From c898ec16e83331abde39118e22e9e38335bbb950 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 6 Jan 2006 00:12:07 -0800 Subject: [PATCH] allow flatmem to be disabled when 
only sparsemem is implemented On architectures that implement sparsemem but not discontigmem we want to be able to hide the flatmem option in some cases. On ppc64 for example, when we select NUMA we must not select flatmem. Signed-off-by: Anton Blanchard Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 21eb51d4da8..b3db11f137e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -11,7 +11,7 @@ choice config FLATMEM_MANUAL bool "Flat Memory" - depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE + depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE help This option allows you to change some of the ways that Linux manages its memory internally. Most users will -- cgit v1.2.3 From 3a291a20bd6fcfafb2109031f0760a0d3e92ecd7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:16:37 -0800 Subject: [PATCH] mm: add a new function (needed for swap suspend) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds the function get_swap_page_of_type() allowing us to specify an index in swap_info[] and select a swap_info_struct structure to be used for allocating a swap page. This function (or another one of similar functionality) will be necessary for implementing the image-writing part of swsusp in the user space.  It can also be used for simplifying the current in-kernel implementation of the image-writing part of swsusp. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index edafeace301..6da4b28b896 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -211,6 +211,26 @@ noswap: return (swp_entry_t) {0}; } +swp_entry_t get_swap_page_of_type(int type) +{ + struct swap_info_struct *si; + pgoff_t offset; + + spin_lock(&swap_lock); + si = swap_info + type; + if (si->flags & SWP_WRITEOK) { + nr_swap_pages--; + offset = scan_swap_map(si); + if (offset) { + spin_unlock(&swap_lock); + return swp_entry(type, offset); + } + nr_swap_pages++; + } + spin_unlock(&swap_lock); + return (swp_entry_t) {0}; +} + static struct swap_info_struct * swap_info_get(swp_entry_t entry) { struct swap_info_struct * p; -- cgit v1.2.3
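A usage sketch, not part of the patch, of how an image writer might drive the new helper: take slots from one specific entry in swap_info[] and stop when that device has no free, writable slots left. The writer function is hypothetical and only declared for illustration.

        #include <linux/swap.h>

        /* hypothetical sink for the data; declaration only */
        extern int hypothetical_write_page(swp_entry_t entry, struct page *page);

        /* Write one page of the suspend image to a slot allocated from
         * the swap device at index 'type' in swap_info[]. */
        static int write_image_page_sketch(int type, struct page *page)
        {
                swp_entry_t entry = get_swap_page_of_type(type);

                if (!entry.val)
                        return -ENOSPC; /* device full or not SWP_WRITEOK */

                return hypothetical_write_page(entry, page);
        }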