From bb1f17b0372de93758653ca3454bc0df18dc2e5c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jun 2009 15:31:18 -0700 Subject: mm: consolidate init_mm definition * create mm/init-mm.c, move init_mm there * remove INIT_MM, initialize init_mm with C99 initializer * unexport init_mm on all arches: init_mm is already unexported on x86. One strange place is some OMAP driver (drivers/video/omap/) which won't build modular, but it's already wants get_vm_area() export. Somebody should look there. [akpm@linux-foundation.org: add missing #includes] Signed-off-by: Alexey Dobriyan Cc: Mike Frysinger Cc: Americo Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 1 + mm/init-mm.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 mm/init-mm.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index e89acb090b4..cf76be785ad 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) +obj-y += init-mm.o obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o diff --git a/mm/init-mm.c b/mm/init-mm.c new file mode 100644 index 00000000000..57aba0da966 --- /dev/null +++ b/mm/init-mm.c @@ -0,0 +1,20 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +struct mm_struct init_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; -- cgit v1.2.3 From f7e839dd36fd940b0202cfb7d39b2a1b2dc59b1b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:20 -0700 Subject: readahead: move max_sane_readahead() calls into force_page_cache_readahead() Impact: code simplification. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 2 +- mm/filemap.c | 3 +-- mm/madvise.c | 3 +-- mm/readahead.c | 1 + 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 54a0f8040af..e43359214f6 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) ret = force_page_cache_readahead(mapping, file, start_index, - max_sane_readahead(nrpages)); + nrpages); if (ret > 0) ret = 0; break; diff --git a/mm/filemap.c b/mm/filemap.c index 1b60f30cebf..dcef9fd6b92 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1390,8 +1390,7 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; - force_page_cache_readahead(mapping, filp, index, - max_sane_readahead(nr)); + force_page_cache_readahead(mapping, filp, index, nr); return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index b9ce574827c..e994dcb479d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma, end = vma->vm_end; end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - force_page_cache_readahead(file->f_mapping, - file, start, max_sane_readahead(end - start)); + force_page_cache_readahead(file->f_mapping, file, start, end - start); return 0; } diff --git a/mm/readahead.c b/mm/readahead.c index 133b6d52551..a224182a3a6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -210,6 +210,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; + nr_to_read = max_sane_readahead(nr_to_read); while (nr_to_read) { int err; -- cgit v1.2.3 From fc31d16add13773265cc53d59f2e7594cb3c0a14 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:21 -0700 Subject: readahead: apply max_sane_readahead() limit in ondemand_readahead() Just in case someone aggressively sets a huge readahead size. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index a224182a3a6..bd0f0e88b0a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -357,7 +357,7 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - int max = ra->ra_pages; /* max readahead pages */ + unsigned long max = max_sane_readahead(ra->ra_pages); pgoff_t prev_offset; int sequential; -- cgit v1.2.3 From caca7cb748571a5b39943a9b3e7081feef055e5e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:21 -0700 Subject: readahead: remove one unnecessary radix tree lookup (hit_readahead_marker != 0) means the page at @offset is present, so we can search for non-present page starting from @offset+1. Reported-by: Xu Chenfeng Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index bd0f0e88b0a..56731158309 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -395,7 +395,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); + start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); rcu_read_unlock(); if (!start || start - offset > max) -- cgit v1.2.3 From 160334a0cfa8e578b718f81038026326845d07d7 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:23 -0700 Subject: readahead: increase interleaved readahead size Make sure interleaved readahead size is larger than request size. This also makes the readahead window grow up more quickly. Reported-by: Xu Chenfeng Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 56731158309..16378b90843 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -403,6 +403,7 @@ ondemand_readahead(struct address_space *mapping, ra->start = start; ra->size = start - offset; /* old async_size */ + ra->size += req_size; ra->size = get_next_ra_size(ra, max); ra->async_size = ra->size; goto readit; -- cgit v1.2.3 From 51daa88ebd8e0d437289f589af29d4b39379ea76 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:24 -0700 Subject: readahead: remove sync/async readahead call dependency The readahead call scheme is error-prone in that it expects the call sites to check for async readahead after doing a sync one. I.e. if (!page) page_cache_sync_readahead(); page = find_get_page(); if (page && PageReadahead(page)) page_cache_async_readahead(); This is because PG_readahead could be set by a sync readahead for the _current_ newly faulted in page, and the readahead code simply expects one more callback on the same page to start the async readahead. If the caller fails to do so, it will miss the PG_readahead bits and never able to start an async readahead. Eliminate this insane constraint by piggy-backing the async part into the current readahead window. Now if an async readahead should be started immediately after a sync one, the readahead logic itself will do it. So the following code becomes valid: (the 'else' in particular) if (!page) page_cache_sync_readahead(); else if (PageReadahead(page)) page_cache_async_readahead(); Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 16378b90843..d7c6e143a12 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -421,6 +421,16 @@ ondemand_readahead(struct address_space *mapping, ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: + /* + * Will this read hit the readahead marker made by itself? + * If so, trigger the readahead marker hit now, and merge + * the resulted next readahead window into the current one. + */ + if (offset == ra->start && ra->size == ra->async_size) { + ra->async_size = get_next_ra_size(ra, max); + ra->size += ra->async_size; + } + return ra_submit(ra, mapping, filp); } -- cgit v1.2.3 From ef00e08e26dd5d84271ef706262506b82195e752 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 16 Jun 2009 15:31:25 -0700 Subject: readahead: clean up and simplify the code for filemap page fault readahead This shouldn't really change behavior all that much, but the single rather complex function with read-ahead inside a loop etc is broken up into more manageable pieces. The behaviour is also less subtle, with the read-ahead being done up-front rather than inside some subtle loop and thus avoiding the now unnecessary extra state variables (ie "did_readaround" is gone). Fengguang: the code split in fact fixed a bug reported by Pavel Levshin: the PGMAJFAULT accounting used to be bypassed when MADV_RANDOM is set, in which case the original code will directly jump to no_cached_page reading. Cc: Pavel Levshin Cc: Cc: Nick Piggin Signed-off-by: Wu Fengguang Signed-off-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 156 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 89 insertions(+), 67 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index dcef9fd6b92..82753648559 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1456,6 +1456,68 @@ static int page_cache_read(struct file *file, pgoff_t offset) #define MMAP_LOTSAMISS (100) +/* + * Synchronous readahead happens when we don't even find + * a page in the page cache at all. + */ +static void do_sync_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + pgoff_t offset) +{ + unsigned long ra_pages; + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + + if (VM_SequentialReadHint(vma)) { + page_cache_sync_readahead(mapping, ra, file, offset, 1); + return; + } + + if (ra->mmap_miss < INT_MAX) + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + return; + + ra_pages = max_sane_readahead(ra->ra_pages); + if (ra_pages) { + pgoff_t start = 0; + + if (offset > ra_pages / 2) + start = offset - ra_pages / 2; + do_page_cache_readahead(mapping, file, start, ra_pages); + } +} + +/* + * Asynchronous readahead happens when we find the page and PG_readahead, + * so we want to possibly extend the readahead further.. + */ +static void do_async_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + struct page *page, + pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + if (ra->mmap_miss > 0) + ra->mmap_miss--; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, ra, file, page, offset, 1); +} + /** * filemap_fault - read in file data for page fault handling * @vma: vma in which the fault was taken @@ -1475,78 +1537,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; + pgoff_t offset = vmf->pgoff; struct page *page; pgoff_t size; - int did_readaround = 0; int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) + if (offset >= size) return VM_FAULT_SIGBUS; - /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) - goto no_cached_page; - /* * Do we have something in the page cache already? */ -retry_find: - page = find_lock_page(mapping, vmf->pgoff); - /* - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(vma)) { - if (!page) { - page_cache_sync_readahead(mapping, ra, file, - vmf->pgoff, 1); - page = find_lock_page(mapping, vmf->pgoff); - if (!page) - goto no_cached_page; - } - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, ra, file, page, - vmf->pgoff, 1); - } - } - - if (!page) { - unsigned long ra_pages; - - ra->mmap_miss++; - + page = find_get_page(mapping, offset); + if (likely(page)) { /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. + * We found the page, so try async readahead before + * waiting for the lock. */ - if (ra->mmap_miss > MMAP_LOTSAMISS) - goto no_cached_page; + do_async_mmap_readahead(vma, ra, file, page, offset); + lock_page(page); - /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. - */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (vmf->pgoff > ra_pages / 2) - start = vmf->pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto no_cached_page; } - page = find_lock_page(mapping, vmf->pgoff); + } else { + /* No page in the page cache at all */ + do_sync_mmap_readahead(vma, ra, file, offset); + count_vm_event(PGMAJFAULT); + ret = VM_FAULT_MAJOR; +retry_find: + page = find_lock_page(mapping, offset); if (!page) goto no_cached_page; } - if (!did_readaround) - ra->mmap_miss--; - /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -1554,18 +1582,18 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; - /* Must recheck i_size under page lock */ + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. + */ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { + if (unlikely(offset >= size)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; } - /* - * Found the page and have a reference on it. - */ - ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -1574,7 +1602,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, vmf->pgoff); + error = page_cache_read(file, offset); /* * The page we want has now been added to the page cache. @@ -1594,12 +1622,6 @@ no_cached_page: return VM_FAULT_SIGBUS; page_not_uptodate: - /* IO error path */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, -- cgit v1.2.3 From 70ac23cfa31f68289d4b720c6162b3929ab4de36 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:28 -0700 Subject: readahead: sequential mmap readahead Auto-detect sequential mmap reads and do readahead for them. The sequential mmap readahead will be triggered when - sync readahead: it's a major fault and (prev_offset == offset-1); - async readahead: minor fault on PG_readahead page with valid readahead state. The benefits of doing readahead instead of read-around: - less I/O wait thanks to async readahead - double real I/O size and no more cache hits The single stream case is improved a little. For 100,000 sequential mmap reads: user system cpu total (1-1) plain -mm, 128KB readaround: 3.224 2.554 48.40% 11.838 (1-2) plain -mm, 256KB readaround: 3.170 2.392 46.20% 11.976 (2) patched -mm, 128KB readahead: 3.117 2.448 47.33% 11.607 The patched (2) has smallest total time, since it has no cache hit overheads and less I/O block time(thanks to async readahead). Here the I/O size makes no much difference, since there's only one single stream. Note that (1-1)'s real I/O size is 64KB and (1-2)'s real I/O size is 128KB, since the half of the read-around pages will be readahead cache hits. This is going to make _real_ differences for _concurrent_ IO streams. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 82753648559..99977f0a94e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1472,7 +1472,8 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (VM_RandomReadHint(vma)) return; - if (VM_SequentialReadHint(vma)) { + if (VM_SequentialReadHint(vma) || + offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { page_cache_sync_readahead(mapping, ra, file, offset, 1); return; } -- cgit v1.2.3 From 2fad6f5deee5556f511eab58da78737a23ddb35d Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:29 -0700 Subject: readahead: enforce full readahead size on async mmap readahead We need this in one particular case and two more general ones. Now we do async readahead for sequential mmap reads, and do it with the help of PG_readahead. For normal reads, PG_readahead is the sufficient condition to do a sequential readahead. But unfortunately, for mmap reads, there is a tiny nuisance: [11736.998347] readahead-init0(process: sh/23926, file: sda1/w3m, offset=0:4503599627370495, ra=0+4-3) = 4 [11737.014985] readahead-around(process: w3m/23926, file: sda1/w3m, offset=0:0, ra=290+32-0) = 17 [11737.019488] readahead-around(process: w3m/23926, file: sda1/w3m, offset=0:0, ra=118+32-0) = 32 [11737.024921] readahead-interleaved(process: w3m/23926, file: sda1/w3m, offset=0:2, ra=4+6-6) = 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~ An unfavorably small readahead. The original dumb read-around size could be more efficient. That happened because ld-linux.so does a read(832) in L1 before mmap(), which triggers a 4-page readahead, with the second page tagged PG_readahead. L0: open("/lib/libc.so.6", O_RDONLY) = 3 L1: read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340\342"..., 832) = 832 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ L2: fstat(3, {st_mode=S_IFREG|0755, st_size=1420624, ...}) = 0 L3: mmap(NULL, 3527256, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fac6e51d000 L4: mprotect(0x7fac6e671000, 2097152, PROT_NONE) = 0 L5: mmap(0x7fac6e871000, 20480, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x154000) = 0x7fac6e871000 L6: mmap(0x7fac6e876000, 16984, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fac6e876000 L7: close(3) = 0 In general, the PG_readahead flag will also be hit in cases - sequential reads - clustered random reads A full readahead size is desirable in both cases. Cc: Nick Piggin Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 99977f0a94e..5c0c6518f34 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1516,7 +1516,8 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, if (ra->mmap_miss > 0) ra->mmap_miss--; if (PageReadahead(page)) - page_cache_async_readahead(mapping, ra, file, page, offset, 1); + page_cache_async_readahead(mapping, ra, file, + page, offset, ra->ra_pages); } /** -- cgit v1.2.3 From d30a11004e3411909f2448546f036a011978062e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:30 -0700 Subject: readahead: record mmap read-around states in file_ra_state Mmap read-around now shares the same code style and data structure with readahead code. This also removes do_page_cache_readahead(). Its last user, mmap read-around, has been changed to call ra_submit(). The no-readahead-if-congested logic is dumped by the way. Users will be pretty sensitive about the slow loading of executables. So it's unfavorable to disabled mmap read-around on a congested queue. [akpm@linux-foundation.org: coding-style fixes] Cc: Nick Piggin Signed-off-by: Fengguang Wu Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 12 +++++++----- mm/readahead.c | 23 ++--------------------- 2 files changed, 9 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 5c0c6518f34..734891d0663 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1488,13 +1488,15 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (ra->mmap_miss > MMAP_LOTSAMISS) return; + /* + * mmap read-around + */ ra_pages = max_sane_readahead(ra->ra_pages); if (ra_pages) { - pgoff_t start = 0; - - if (offset > ra_pages / 2) - start = offset - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + ra->start = max_t(long, 0, offset - ra_pages/2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); } } diff --git a/mm/readahead.c b/mm/readahead.c index d7c6e143a12..a7f01fcce9e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -133,15 +133,12 @@ out: } /* - * do_page_cache_readahead actually reads a chunk of disk. It allocates all + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. * * Returns the number of pages requested, or the maximum amount of I/O allowed. - * - * do_page_cache_readahead() returns -1 if it encountered request queue - * congestion. */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, @@ -231,22 +228,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return ret; } -/* - * This version skips the IO if the queue is read-congested, and will tell the - * block layer to abandon the readahead if request allocation would block. - * - * force_page_cache_readahead() will ignore queue congestion and will block on - * request queues. - */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) -{ - if (bdi_read_congested(mapping->backing_dev_info)) - return -1; - - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); -} - /* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. @@ -260,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) /* * Submit IO for the read-ahead request in file_ra_state. */ -static unsigned long ra_submit(struct file_ra_state *ra, +unsigned long ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { int actual; -- cgit v1.2.3 From 045a2529a3513faed2d45bd82f9013b124309d94 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:33 -0700 Subject: readahead: move the random read case to bottom Split all readahead cases, and move the random one to bottom. No behavior changes. This is to prepare for the introduction of context readahead, and make it easy for inserting accounting/tracing points for each case. Signed-off-by: Wu Fengguang Cc: Vladislav Bolkhovitin Cc: Jens Axboe Cc: Jeff Moyer Cc: Nick Piggin Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index a7f01fcce9e..ceed7e4790b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -339,33 +339,25 @@ ondemand_readahead(struct address_space *mapping, unsigned long req_size) { unsigned long max = max_sane_readahead(ra->ra_pages); - pgoff_t prev_offset; - int sequential; + + /* + * start of file + */ + if (!offset) + goto initial_readahead; /* * It's the expected callback offset, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ - if (offset && (offset == (ra->start + ra->size - ra->async_size) || - offset == (ra->start + ra->size))) { + if ((offset == (ra->start + ra->size - ra->async_size) || + offset == (ra->start + ra->size))) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max); ra->async_size = ra->size; goto readit; } - prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; - sequential = offset - prev_offset <= 1UL || req_size > max; - - /* - * Standalone, small read. - * Read as is, and do not pollute the readahead state. - */ - if (!hit_readahead_marker && !sequential) { - return __do_page_cache_readahead(mapping, filp, - offset, req_size, 0); - } - /* * Hit a marked page without valid readahead state. * E.g. interleaved reads. @@ -391,12 +383,24 @@ ondemand_readahead(struct address_space *mapping, } /* - * It may be one of - * - first read on start of file - * - sequential cache miss - * - oversize random read - * Start readahead for it. + * oversize read */ + if (req_size > max) + goto initial_readahead; + + /* + * sequential cache miss + */ + if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) + goto initial_readahead; + + /* + * standalone, small random read + * Read as is, and do not pollute the readahead state. + */ + return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + +initial_readahead: ra->start = offset; ra->size = get_init_ra_size(req_size, max); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; -- cgit v1.2.3 From 10be0b372cac50e2e7a477852f98bf069a97a3fa Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:36 -0700 Subject: readahead: introduce context readahead algorithm Introduce page cache context based readahead algorithm. This is to better support concurrent read streams in general. RATIONALE --------- The current readahead algorithm detects interleaved reads in a _passive_ way. Given a sequence of interleaved streams 1,1001,2,1002,3,4,1003,5,1004,1005,6,... By checking for (offset == prev_offset + 1), it will discover the sequentialness between 3,4 and between 1004,1005, and start doing sequential readahead for the individual streams since page 4 and page 1005. The context readahead algorithm guarantees to discover the sequentialness no matter how the streams are interleaved. For the above example, it will start sequential readahead since page 2 and 1002. The trick is to poke for page @offset-1 in the page cache when it has no other clues on the sequentialness of request @offset: if the current requenst belongs to a sequential stream, that stream must have accessed page @offset-1 recently, and the page will still be cached now. So if page @offset-1 is there, we can take request @offset as a sequential access. BENEFICIARIES ------------- - strictly interleaved reads i.e. 1,1001,2,1002,3,1003,... the current readahead will take them as silly random reads; the context readahead will take them as two sequential streams. - cooperative IO processes i.e. NFS and SCST They create a thread pool, farming off (sequential) IO requests to different threads which will be performing interleaved IO. It was not easy(or possible) to reliably tell from file->f_ra all those cooperative processes working on the same sequential stream, since they will have different file->f_ra instances. And NFSD's file->f_ra is particularly unusable, since their file objects are dynamically created for each request. The nfsd does have code trying to restore the f_ra bits, but not satisfactory. The new scheme is to detect the sequential pattern via looking up the page cache, which provides one single and consistent view of the pages recently accessed. That makes sequential detection for cooperative processes possible. USER REPORT ----------- Vladislav recommends the addition of context readahead as a result of his SCST benchmarks. It leads to 6%~40% performance gains in various cases and achieves equal performance in others. http://lkml.org/lkml/2009/3/19/239 OVERHEADS --------- In theory, it introduces one extra page cache lookup per random read. However the below benchmark shows context readahead to be slightly faster, wondering.. Randomly reading 200MB amount of data on a sparse file, repeat 20 times for each block size. The average throughputs are: original ra context ra gain 4K random reads: 65.561MB/s 65.648MB/s +0.1% 16K random reads: 124.767MB/s 124.951MB/s +0.1% 64K random reads: 162.123MB/s 162.278MB/s +0.1% Cc: Jens Axboe Cc: Jeff Moyer Tested-by: Vladislav Bolkhovitin Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index ceed7e4790b..aa1aa234523 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -329,6 +329,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, * it approaches max_readhead. */ +/* + * Count contiguously cached pages from @offset-1 to @offset-@max, + * this count is a conservative estimation of + * - length of the sequential read sequence, or + * - thrashing threshold in memory tight systems + */ +static pgoff_t count_history_pages(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t offset, unsigned long max) +{ + pgoff_t head; + + rcu_read_lock(); + head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + rcu_read_unlock(); + + return offset - 1 - head; +} + +/* + * page cache context based read-ahead + */ +static int try_context_readahead(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t offset, + unsigned long req_size, + unsigned long max) +{ + pgoff_t size; + + size = count_history_pages(mapping, ra, offset, max); + + /* + * no history pages: + * it could be a random read + */ + if (!size) + return 0; + + /* + * starts from beginning of file: + * it is a strong indication of long-run stream (or whole-file-read) + */ + if (size >= offset) + size *= 2; + + ra->start = offset; + ra->size = get_init_ra_size(size + req_size, max); + ra->async_size = ra->size; + + return 1; +} + /* * A minimal readahead algorithm for trivial sequential/random reads. */ @@ -394,6 +447,13 @@ ondemand_readahead(struct address_space *mapping, if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) goto initial_readahead; + /* + * Query the page cache and look for the traces(cached history pages) + * that a sequential stream would leave behind. + */ + if (try_context_readahead(mapping, ra, offset, req_size, max)) + goto readit; + /* * standalone, small random read * Read as is, and do not pollute the readahead state. -- cgit v1.2.3 From 61b7cbdba2f3c588a0cf3db574c562805454b09b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:36 -0700 Subject: readahead: remove redundant test in shrink_readahead_size_eio() Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 734891d0663..2e9bcc2e797 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); static void shrink_readahead_size_eio(struct file *filp, struct file_ra_state *ra) { - if (!ra->ra_pages) - return; - ra->ra_pages /= 4; } -- cgit v1.2.3 From 7ffc59b4d0bdfa00e882339f85b8a969bb7021e2 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:38 -0700 Subject: readahead: enforce full sync mmap readahead size Now that we do readahead for sequential mmap reads, here is a simple evaluation of the impacts, and one further optimization. It's an NFS-root debian desktop system, readahead size = 60 pages. The numbers are grabbed after a fresh boot into console. approach pgmajfault RA miss ratio mmap IO count avg IO size(pages) A 383 31.6% 383 11 B 225 32.4% 390 11 C 224 32.6% 307 13 case A: mmap sync/async readahead disabled case B: mmap sync/async readahead enabled, with enforced full async readahead size case C: mmap sync/async readahead enabled, with enforced full sync/async readahead size or: A = vanilla 2.6.30-rc1 B = A plus mmap readahead C = B plus this patch The numbers show that - there are good possibilities for random mmap reads to trigger readahead - 'pgmajfault' is reduced by 1/3, due to the _async_ nature of readahead - case C can further reduce IO count by 1/4 - readahead miss ratios are not quite affected The theory is - readahead is _good_ for clustered random reads, and can perform _better_ than readaround because they could be _async_. - async readahead size is guaranteed to be larger than readaround size, and they are _async_, hence will mostly behave better However for B - sync readahead size could be smaller than readaround size, hence may make things worse by produce more smaller IOs which will be fixed by this patch. Final conclusion: - mmap readahead reduced major faults by 1/3 and no obvious overheads; - mmap io can be further reduced by 1/4 with this patch. Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 2e9bcc2e797..6846a902f5c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1471,7 +1471,8 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (VM_SequentialReadHint(vma) || offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { - page_cache_sync_readahead(mapping, ra, file, offset, 1); + page_cache_sync_readahead(mapping, ra, file, offset, + ra->ra_pages); return; } -- cgit v1.2.3 From d2bf6be8ab63aa84e6149aac934649aadf3828b1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:31:39 -0700 Subject: mm: clean up get_user_pages_fast() documentation Move more documentation for get_user_pages_fast into the new kerneldoc comment. Add some comments for get_user_pages as well. Also, move get_user_pages_fast declaration up to get_user_pages. It wasn't there initially because it was once a static inline function. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Nick Piggin Cc: Andy Grover Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/util.c | 16 ++++++++++++---- 2 files changed, 62 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 4126dd16778..891bad0613f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i; } +/** + * get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @len: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. This will result in the page being COWed even + * in MAP_SHARED mappings. You do not want this. + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If len is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) diff --git a/mm/util.c b/mm/util.c index abc65aa7cdf..d5d2213728c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -233,13 +233,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. + * + * get_user_pages_fast provides equivalent functionality to get_user_pages, + * operating on current and current->mm, with force=0 and vma=NULL. However + * unlike get_user_pages, it must be called without mmap_sem held. + * + * get_user_pages_fast may take mmap_sem and page table locks, so no + * assumptions can be made about lack of locking. get_user_pages_fast is to be + * implemented in a way that is advantageous (vs get_user_pages()) when the + * user memory area is already faulted in and present in ptes. However if the + * pages have to be faulted in, it may turn out to be slightly slower so + * callers need to carefully consider what to use. On many architectures, + * get_user_pages_fast simply falls back to get_user_pages. */ int __attribute__((weak)) get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) -- cgit v1.2.3 From 78dc583d3ab43115579cb5f3f7bd12e3548dd5a5 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:31:40 -0700 Subject: vmscan: low order lumpy reclaim also should use PAGEOUT_IO_SYNC Commit 33c120ed2843090e2bd316de1588b8bf8b96cbde ("more aggressively use lumpy reclaim") increased how aggressive lumpy reclaim was by isolating both active and inactive pages for asynchronous lumpy reclaim on costly-high-order pages and for cheap-high-order when memory pressure is high. However, if the system is under heavy pressure and there are dirty pages, asynchronous IO may not be sufficient to reclaim a suitable page in time. This patch causes the caller to enter synchronous lumpy reclaim for costly-high-order pages and for cheap-high-order pages when under memory pressure. Minchan.kim@gmail.com said: Andy added synchronous lumpy reclaim with c661b078fd62abe06fd11fab4ac5e4eeafe26b6d. At that time, lumpy reclaim is not agressive. His intension is just for high-order users.(above PAGE_ALLOC_COSTLY_ORDER). After some time, Rik added aggressive lumpy reclaim with 33c120ed2843090e2bd316de1588b8bf8b96cbde. His intention was to do lumpy reclaim when high-order users and trouble getting a small set of contiguous pages. So we also have to add synchronous pageout for small set of contiguous pages. Cc: Lee Schermerhorn Cc: Andy Whitcroft Acked-by: Peter Zijlstra Cc: Rik van Riel Signed-off-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Reviewed-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 95c08a8cc2b..a6b7d14812e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1061,6 +1061,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scanned = 0; unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + int lumpy_reclaim = 0; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + lumpy_reclaim = 1; + else if (sc->order && priority < DEF_PRIORITY - 2) + lumpy_reclaim = 1; pagevec_init(&pvec, 1); @@ -1073,19 +1086,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_freed; unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = ISOLATE_INACTIVE; - - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - * - * We use the same threshold as pageout congestion_wait below. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - mode = ISOLATE_BOTH; - else if (sc->order && priority < DEF_PRIORITY - 2) - mode = ISOLATE_BOTH; + int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, mode, @@ -1122,7 +1123,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * but that should be acceptable to the caller */ if (nr_freed < nr_taken && !current_is_kswapd() && - sc->order > PAGE_ALLOC_COSTLY_ORDER) { + lumpy_reclaim) { congestion_wait(WRITE, HZ/10); /* -- cgit v1.2.3 From dcf975d58565880a134afb13bde511d1b873ce79 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 16 Jun 2009 15:31:44 -0700 Subject: mm/page-writeback.c: dirty limit type should be unsigned long get_dirty_limits() calls clip_bdi_dirty_limit() and task_dirty_limit() with variable pbdi_dirty as one of the arguments. This variable is an unsigned long * but both functions expect it to be a long *. This causes the following sparse warnings: warning: incorrect type in argument 3 (different signedness) expected long *pbdi_dirty got unsigned long *pbdi_dirty warning: incorrect type in argument 2 (different signedness) expected long *pdirty got unsigned long *pbdi_dirty Fix the warnings by changing the long * to unsigned long * in both functions. Signed-off-by: H Hartley Sweeten Cc: Johannes Weiner Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bb553c3e955..7b0dcea4935 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, * This avoids exceeding the total dirty_limit when the floating averages * fluctuate too quickly. */ -static void -clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) +static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, + unsigned long dirty, unsigned long *pbdi_dirty) { - long avail_dirty; + unsigned long avail_dirty; - avail_dirty = dirty - - (global_page_state(NR_FILE_DIRTY) + + avail_dirty = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_WRITEBACK) + global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK_TEMP)); + global_page_state(NR_WRITEBACK_TEMP); - if (avail_dirty < 0) + if (avail_dirty < dirty) + avail_dirty = dirty - avail_dirty; + else avail_dirty = 0; avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + @@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk, * * dirty -= (dirty/8) * p_{t} */ -static void task_dirty_limit(struct task_struct *tsk, long *pdirty) +static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) { long numerator, denominator; - long dirty = *pdirty; + unsigned long dirty = *pdirty; u64 inv = dirty >> 3; task_dirties_fraction(tsk, &numerator, &denominator); -- cgit v1.2.3 From 58568d2a8215cb6f55caf2332017d7bdff954e1c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Jun 2009 15:31:49 -0700 Subject: cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 143 ++++++++++++++++++++++++++++++++++++++++---------------- mm/page_alloc.c | 5 +- 2 files changed, 104 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6fdc04..46bdf9ddf2b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) return 0; } -/* Create a new policy */ +/* + * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if + * any, for the new policy. mpol_new() has already validated the nodes + * parameter with respect to the policy mode and flags. But, we need to + * handle an empty nodemask with MPOL_PREFERRED here. + * + * Must be called holding task's alloc_lock to protect task's mems_allowed + * and mempolicy. May also be called holding the mmap_semaphore for write. + */ +static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +{ + nodemask_t cpuset_context_nmask; + int ret; + + /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ + if (pol == NULL) + return 0; + + VM_BUG_ON(!nodes); + if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) + nodes = NULL; /* explicit local allocation */ + else { + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&cpuset_context_nmask, nodes, + &cpuset_current_mems_allowed); + else + nodes_and(cpuset_context_nmask, *nodes, + cpuset_current_mems_allowed); + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; + else + pol->w.cpuset_mems_allowed = + cpuset_current_mems_allowed; + } + + ret = mpol_ops[pol->mode].create(pol, + nodes ? &cpuset_context_nmask : NULL); + return ret; +} + +/* + * This function just creates a new policy, does some check and simple + * initialization. You must invoke mpol_set_nodemask() to set nodes. + */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; - nodemask_t cpuset_context_nmask; - int ret; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); @@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); - nodes = NULL; /* flag local alloc */ } } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy->mode = mode; policy->flags = flags; - if (nodes) { - /* - * cpuset related setup doesn't apply to local allocation - */ - cpuset_update_task_memory_state(); - if (flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); - else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); - if (mpol_store_user_nodemask(policy)) - policy->w.user_nodemask = *nodes; - else - policy->w.cpuset_mems_allowed = - cpuset_mems_allowed(current); - } - - ret = mpol_ops[mode].create(policy, - nodes ? &cpuset_context_nmask : NULL); - if (ret < 0) { - kmem_cache_free(policy_cache, policy); - return ERR_PTR(ret); - } return policy; } @@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. + * + * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) @@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void) static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { - struct mempolicy *new; + struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + int ret; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) @@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, */ if (mm) down_write(&mm->mmap_sem); - mpol_put(current->mempolicy); + task_lock(current); + ret = mpol_set_nodemask(new, nodes); + if (ret) { + task_unlock(current); + if (mm) + up_write(&mm->mmap_sem); + mpol_put(new); + return ret; + } + old = current->mempolicy; current->mempolicy = new; mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); + task_unlock(current); if (mm) up_write(&mm->mmap_sem); + mpol_put(old); return 0; } /* * Return nodemask for policy for get_mempolicy() query + * + * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { @@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; @@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ + task_lock(current); *nmask = cpuset_current_mems_allowed; + task_unlock(current); return 0; } @@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } err = 0; - if (nmask) + if (nmask) { + task_lock(current); get_policy_nodemask(pol, nmask); + task_unlock(current); + } out: mpol_cond_put(pol); @@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len, return err; } down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask); + task_unlock(current); + if (err) { + up_write(&mm->mmap_sem); + mpol_put(new); + return err; + } vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); @@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; - cpuset_update_task_memory_state(); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; - if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_task_memory_state(); if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; @@ -1854,6 +1894,8 @@ restart: */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { + int ret; + sp->root = RB_ROOT; /* empty tree == default mempolicy */ spin_lock_init(&sp->lock); @@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - mpol_put(mpol); /* drop our ref on sb mpol */ - if (IS_ERR(new)) + if (IS_ERR(new)) { + mpol_put(mpol); /* drop our ref on sb mpol */ return; /* no valid nodemask intersection */ + } + + task_lock(current); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + task_unlock(current); + mpol_put(mpol); /* drop our ref on sb mpol */ + if (ret) { + mpol_put(new); + return; + } /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); @@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) err = 1; - else if (no_context) - new->w.user_nodemask = nodes; /* save for contextualization */ + else { + int ret; + + task_lock(current); + ret = mpol_set_nodemask(new, &nodes); + task_unlock(current); + if (ret) + err = 1; + else if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } + } out: /* Restore string for error message */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 17d5f539a9a..7cc3179e359 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1569,10 +1569,7 @@ nofail_alloc: /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - /* - * The task's cpuset might have expanded its set of allowable nodes - */ - cpuset_update_task_memory_state(); + p->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); -- cgit v1.2.3 From 6c0db4664b49417d80988953e69c323721353227 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 16 Jun 2009 15:31:50 -0700 Subject: mm: alloc_large_system_hash check order On an x86_64 with 4GB ram, tcp_init()'s call to alloc_large_system_hash(), to allocate tcp_hashinfo.ehash, is now triggering an mmotm WARN_ON_ONCE on order >= MAX_ORDER - it's hoping for order 11. alloc_large_system_hash() had better make its own check on the order. Signed-off-by: Hugh Dickins Cc: David Miller Cc: Mel Gorman Cc: Eric Dumazet Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cc3179e359..cbed869fd83 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4511,7 +4511,10 @@ void *__init alloc_large_system_hash(const char *tablename, table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { unsigned long order = get_order(size); - table = (void*) __get_free_pages(GFP_ATOMIC, order); + + if (order < MAX_ORDER) + table = (void *)__get_free_pages(GFP_ATOMIC, + order); /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table. -- cgit v1.2.3 From d239171e4f6efd58d7e423853056b1b6a74f1446 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:52 -0700 Subject: page allocator: replace __alloc_pages_internal() with __alloc_pages_nodemask() The start of a large patch series to clean up and optimise the page allocator. The performance improvements are in a wide range depending on the exact machine but the results I've seen so fair are approximately; kernbench: 0 to 0.12% (elapsed time) 0.49% to 3.20% (sys time) aim9: -4% to 30% (for page_test and brk_test) tbench: -1% to 4% hackbench: -2.5% to 3.45% (mostly within the noise though) netperf-udp -1.34% to 4.06% (varies between machines a bit) netperf-tcp -0.44% to 5.22% (varies between machines a bit) I haven't sysbench figures at hand, but previously they were within the -0.5% to 2% range. On netperf, the client and server were bound to opposite number CPUs to maximise the problems with cache line bouncing of the struct pages so I expect different people to report different results for netperf depending on their exact machine and how they ran the test (different machines, same cpus client/server, shared cache but two threads client/server, different socket client/server etc). I also measured the vmlinux sizes for a single x86-based config with CONFIG_DEBUG_INFO enabled but not CONFIG_DEBUG_VM. The core of the .config is based on the Debian Lenny kernel config so I expect it to be reasonably typical. This patch: __alloc_pages_internal is the core page allocator function but essentially it is an alias of __alloc_pages_nodemask. Naming a publicly available and exported function "internal" is also a big ugly. This patch renames __alloc_pages_internal() to __alloc_pages_nodemask() and deletes the old nodemask function. Warning - This patch renames an exported symbol. No kernel driver is affected by external drivers calling __alloc_pages_internal() should change the call to __alloc_pages_nodemask() without any alteration of parameters. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbed869fd83..d58df903150 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1458,7 +1458,7 @@ try_next_zone: * This is the 'heart' of the zoned buddy allocator. */ struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { const gfp_t wait = gfp_mask & __GFP_WAIT; @@ -1667,7 +1667,7 @@ nopage: got_pg: return page; } -EXPORT_SYMBOL(__alloc_pages_internal); +EXPORT_SYMBOL(__alloc_pages_nodemask); /* * Common helper functions. -- cgit v1.2.3 From b3c466ce512923298ae8c0121d3e9f397a3f1210 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:53 -0700 Subject: page allocator: do not sanity check order in the fast path No user of the allocator API should be passing in an order >= MAX_ORDER but we check for it on each and every allocation. Delete this check and make it a VM_BUG_ON check further down the call path. [akpm@linux-foundation.org: s/VM_BUG_ON/WARN_ON_ONCE/] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d58df903150..bfbd95c0610 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1401,6 +1401,9 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, classzone_idx = zone_idx(preferred_zone); + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return NULL; + zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. -- cgit v1.2.3 From 6484eb3e2a81807722c5f28efef94d8338b7b996 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:54 -0700 Subject: page allocator: do not check NUMA node ID when the caller knows the node is valid Callers of alloc_pages_node() can optionally specify -1 as a node to mean "allocate from the current node". However, a number of the callers in fast paths know for a fact their node is valid. To avoid a comparison and branch, this patch adds alloc_pages_exact_node() that only checks the nid with VM_BUG_ON(). Callers that know their node is valid are then converted. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Acked-by: Paul Mundt [for the SLOB NUMA bits] Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- mm/slab.c | 4 ++-- mm/slob.c | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 6846a902f5c..22396713feb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e83ad2c9228..2f8241f300f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -630,7 +630,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -649,7 +649,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * Use a helper variable to find the next node and then * copy it back to hugetlb_next_nid afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. Move nid forward in the mask even diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 46bdf9ddf2b..e08e2c4da63 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* diff --git a/mm/migrate.c b/mm/migrate.c index 068655d8f88..5a24923e7fd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } diff --git a/mm/slab.c b/mm/slab.c index 18e3164de09..bb3254c95cd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1707,7 +1707,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_exact_node(nodeid, flags, cachep->gfporder); if (!page) return NULL; @@ -3261,7 +3261,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, -1); + obj = kmem_getpages(cache, local_flags, numa_node_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { diff --git a/mm/slob.c b/mm/slob.c index 12f26149992..64f6db1943b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -46,7 +46,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. In the event that a node id is explicitly - * provided, alloc_pages_node() with the specified node id is used + * provided, alloc_pages_exact_node() with the specified node id is used * instead. The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). * @@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) #ifdef CONFIG_NUMA if (node != -1) - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_exact_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); -- cgit v1.2.3 From 7f82af9742a9346794ecc1515139daed480e7025 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:56 -0700 Subject: page allocator: check only once if the zonelist is suitable for the allocation It is possible with __GFP_THISNODE that no zones are suitable. This patch makes sure the check is only made once. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bfbd95c0610..6be8fcb6f74 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1483,9 +1483,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return NULL; -restart: - z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ - + /* the list of zones suitable for gfp_mask */ + z = zonelist->_zonerefs; if (unlikely(!z->zone)) { /* * Happens if we have an empty zonelist as a result of @@ -1494,6 +1493,7 @@ restart: return NULL; } +restart: page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) -- cgit v1.2.3 From 11e33f6a55ed7847d9c8ffe185ef87faf7806abe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:57 -0700 Subject: page allocator: break up the allocator entry point into fast and slow paths The core of the page allocator is one giant function which allocates memory on the stack and makes calculations that may not be needed for every allocation. This patch breaks up the allocator path into fast and slow paths for clarity. Note the slow paths are still inlined but the entry is marked unlikely. If they were not inlined, it actally increases text size to generate the as there is only one call site. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 353 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 228 insertions(+), 125 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6be8fcb6f74..512bf9a618c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1457,47 +1457,171 @@ try_next_zone: return page; } -/* - * This is the 'heart' of the zoned buddy allocator. - */ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +static inline int +should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long pages_reclaimed) { - const gfp_t wait = gfp_mask & __GFP_WAIT; - enum zone_type high_zoneidx = gfp_zone(gfp_mask); - struct zoneref *z; - struct zone *zone; - struct page *page; - struct reclaim_state reclaim_state; - struct task_struct *p = current; - int do_retry; - int alloc_flags; - unsigned long did_some_progress; - unsigned long pages_reclaimed = 0; + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + return 0; - lockdep_trace_alloc(gfp_mask); + /* + * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER + * means __GFP_NOFAIL, but that may not be true in other + * implementations. + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return 1; - might_sleep_if(wait); + /* + * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is + * specified, then we retry until we no longer reclaim any pages + * (above), or we've reclaimed an order of pages at least as + * large as the allocation's order. In both cases, if the + * allocation still fails, we stop retrying. + */ + if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) + return 1; - if (should_fail_alloc_page(gfp_mask, order)) - return NULL; + /* + * Don't let big-order allocations loop unless the caller + * explicitly requests that. + */ + if (gfp_mask & __GFP_NOFAIL) + return 1; - /* the list of zones suitable for gfp_mask */ - z = zonelist->_zonerefs; - if (unlikely(!z->zone)) { - /* - * Happens if we have an empty zonelist as a result of - * GFP_THISNODE being used on a memoryless node - */ + return 0; +} + +static inline struct page * +__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask) +{ + struct page *page; + + /* Acquire the OOM killer lock for the zones in zonelist */ + if (!try_set_zone_oom(zonelist, gfp_mask)) { + schedule_timeout_uninterruptible(1); return NULL; } -restart: - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); + /* + * Go through the zonelist yet one more time, keep very high watermark + * here, this is only to catch a parallel oom killing, we must fail if + * we're still under heavy pressure. + */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, + order, zonelist, high_zoneidx, + ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) - goto got_pg; + goto out; + + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + + /* Exhausted what can be done so it's blamo time */ + out_of_memory(zonelist, gfp_mask, order); + +out: + clear_zonelist_oom(zonelist, gfp_mask); + return page; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress) +{ + struct page *page = NULL; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + + /* + * The task's cpuset might have expanded its set of allowable nodes + */ + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; + + cond_resched(); + + if (order != 0) + drain_all_pages(); + + if (likely(*did_some_progress)) + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, alloc_flags); + return page; +} + +static inline int +is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask) +{ + if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) + && !in_interrupt()) + return 1; + return 0; +} + +/* + * This is called in the allocator slow-path if the allocation request is of + * sufficient urgency to ignore watermarks and take other desperate measures + */ +static inline struct page * +__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask) +{ + struct page *page; + + do { + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); + + if (!page && gfp_mask & __GFP_NOFAIL) + congestion_wait(WRITE, HZ/50); + } while (!page && (gfp_mask & __GFP_NOFAIL)); + + return page; +} + +static inline +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, + enum zone_type high_zoneidx) +{ + struct zoneref *z; + struct zone *zone; + + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order); +} + +static inline struct page * +__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask) +{ + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct page *page = NULL; + int alloc_flags; + unsigned long pages_reclaimed = 0; + unsigned long did_some_progress; + struct task_struct *p = current; /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and @@ -1510,8 +1634,7 @@ restart: if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wake_all_kswapd(order, zonelist, high_zoneidx); /* * OK, we're below the kswapd watermark and have kicked background @@ -1531,6 +1654,7 @@ restart: if (wait) alloc_flags |= ALLOC_CPUSET; +restart: /* * Go through the zonelist again. Let __GFP_HIGH and allocations * coming from realtime tasks go deeper into reserves. @@ -1544,23 +1668,18 @@ restart: if (page) goto got_pg; - /* This allocation should allow future memory freeing. */ - rebalance: - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) - && !in_interrupt()) { + /* Allocate without watermarks if the context allows */ + if (is_allocation_high_priority(p, gfp_mask)) { + /* Do not dip into emergency reserves if specified */ if (!(gfp_mask & __GFP_NOMEMALLOC)) { -nofail_alloc: - /* go through the zonelist yet again, ignoring mins */ - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); + page = __alloc_pages_high_priority(gfp_mask, order, + zonelist, high_zoneidx, nodemask); if (page) goto got_pg; - if (gfp_mask & __GFP_NOFAIL) { - congestion_wait(WRITE, HZ/50); - goto nofail_alloc; - } } + + /* Ensure no recursion into the allocator */ goto nopage; } @@ -1568,93 +1687,42 @@ nofail_alloc: if (!wait) goto nopage; - cond_resched(); - - /* We now go into synchronous reclaim */ - cpuset_memory_pressure_bump(); - - p->flags |= PF_MEMALLOC; - - lockdep_set_current_reclaim_state(gfp_mask); - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; - - did_some_progress = try_to_free_pages(zonelist, order, - gfp_mask, nodemask); - - p->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + /* Try direct reclaim and then allocating */ + page = __alloc_pages_direct_reclaim(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, &did_some_progress); + if (page) + goto got_pg; - cond_resched(); + /* + * If we failed to make any progress reclaiming, then we are + * running out of options and have to consider going OOM + */ + if (!did_some_progress) { + if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + page = __alloc_pages_may_oom(gfp_mask, order, + zonelist, high_zoneidx, + nodemask); + if (page) + goto got_pg; - if (order != 0) - drain_all_pages(); + /* + * The OOM killer does not trigger for high-order allocations + * but if no progress is being made, there are no other + * options and retrying is unlikely to help + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto nopage; - if (likely(did_some_progress)) { - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, alloc_flags); - if (page) - goto got_pg; - } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { - if (!try_set_zone_oom(zonelist, gfp_mask)) { - schedule_timeout_uninterruptible(1); goto restart; } - - /* - * Go through the zonelist yet one more time, keep - * very high watermark here, this is only to catch - * a parallel oom killing, we must fail if we're still - * under heavy pressure. - */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, - order, zonelist, high_zoneidx, - ALLOC_WMARK_HIGH|ALLOC_CPUSET); - if (page) { - clear_zonelist_oom(zonelist, gfp_mask); - goto got_pg; - } - - /* The OOM killer will not help higher order allocs so fail */ - if (order > PAGE_ALLOC_COSTLY_ORDER) { - clear_zonelist_oom(zonelist, gfp_mask); - goto nopage; - } - - out_of_memory(zonelist, gfp_mask, order); - clear_zonelist_oom(zonelist, gfp_mask); - goto restart; } - /* - * Don't let big-order allocations loop unless the caller explicitly - * requests that. Wait for some write requests to complete then retry. - * - * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER - * means __GFP_NOFAIL, but that may not be true in other - * implementations. - * - * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is - * specified, then we retry until we no longer reclaim any pages - * (above), or we've reclaimed an order of pages at least as - * large as the allocation's order. In both cases, if the - * allocation still fails, we stop retrying. - */ + /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - do_retry = 0; - if (!(gfp_mask & __GFP_NORETRY)) { - if (order <= PAGE_ALLOC_COSTLY_ORDER) { - do_retry = 1; - } else { - if (gfp_mask & __GFP_REPEAT && - pages_reclaimed < (1 << order)) - do_retry = 1; - } - if (gfp_mask & __GFP_NOFAIL) - do_retry = 1; - } - if (do_retry) { + if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + /* Wait for some write requests to complete then retry */ congestion_wait(WRITE, HZ/50); goto rebalance; } @@ -1669,6 +1737,41 @@ nopage: } got_pg: return page; + +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct page *page; + + lockdep_trace_alloc(gfp_mask); + + might_sleep_if(gfp_mask & __GFP_WAIT); + + if (should_fail_alloc_page(gfp_mask, order)) + return NULL; + + /* + * Check the zones suitable for the gfp_mask contain at least one + * valid zone. It's possible to have an empty zonelist as a result + * of GFP_THISNODE and a memoryless node + */ + if (unlikely(!zonelist->_zonerefs->zone)) + return NULL; + + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (unlikely(!page)) + page = __alloc_pages_slowpath(gfp_mask, order, + zonelist, high_zoneidx, nodemask); + + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); -- cgit v1.2.3 From 49255c619fbd482d704289b5eb2795f8e3b7ff2e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:58 -0700 Subject: page allocator: move check for disabled anti-fragmentation out of fastpath On low-memory systems, anti-fragmentation gets disabled as there is nothing it can do and it would just incur overhead shuffling pages between lists constantly. Currently the check is made in the free page fast path for every page. This patch moves it to a slow path. On machines with low memory, there will be small amount of additional overhead as pages get shuffled between lists but it should quickly settle. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 512bf9a618c..b09859629e9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -168,6 +168,10 @@ int page_group_by_mobility_disabled __read_mostly; static void set_pageblock_migratetype(struct page *page, int migratetype) { + + if (unlikely(page_group_by_mobility_disabled)) + migratetype = MIGRATE_UNMOVABLE; + set_pageblock_flags_group(page, (unsigned long)migratetype, PB_migrate, PB_migrate_end); } -- cgit v1.2.3 From 5117f45d11a9ee62d9b086f1312f3f31781ff155 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:59 -0700 Subject: page allocator: calculate the preferred zone for allocation only once get_page_from_freelist() can be called multiple times for an allocation. Part of this calculates the preferred_zone which is the first usable zone in the zonelist but the zone depends on the GFP flags specified at the beginning of the allocation call. This patch calculates preferred_zone once. It's safe to do this because if preferred_zone is NULL at the start of the call, no amount of direct reclaim or other actions will change the fact the allocation will fail. [akpm@linux-foundation.org: remove (void) casts] Signed-off-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Christoph Lameter Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 54 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b09859629e9..8fc6d1f42f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1388,26 +1388,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) */ static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, - struct zonelist *zonelist, int high_zoneidx, int alloc_flags) + struct zonelist *zonelist, int high_zoneidx, int alloc_flags, + struct zone *preferred_zone) { struct zoneref *z; struct page *page = NULL; int classzone_idx; - struct zone *zone, *preferred_zone; + struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, - &preferred_zone); - if (!preferred_zone) - return NULL; - - classzone_idx = zone_idx(preferred_zone); - if (WARN_ON_ONCE(order >= MAX_ORDER)) return NULL; + classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. @@ -1500,7 +1495,7 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask) + nodemask_t *nodemask, struct zone *preferred_zone) { struct page *page; @@ -1517,7 +1512,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, - ALLOC_WMARK_HIGH|ALLOC_CPUSET); + ALLOC_WMARK_HIGH|ALLOC_CPUSET, + preferred_zone); if (page) goto out; @@ -1537,7 +1533,8 @@ out: static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress) + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + unsigned long *did_some_progress) { struct page *page = NULL; struct reclaim_state reclaim_state; @@ -1569,7 +1566,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (likely(*did_some_progress)) page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, alloc_flags); + zonelist, high_zoneidx, + alloc_flags, preferred_zone); return page; } @@ -1589,13 +1587,14 @@ is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask) static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask) + nodemask_t *nodemask, struct zone *preferred_zone) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, + preferred_zone); if (!page && gfp_mask & __GFP_NOFAIL) congestion_wait(WRITE, HZ/50); @@ -1618,7 +1617,7 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask) + nodemask_t *nodemask, struct zone *preferred_zone) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; @@ -1668,7 +1667,8 @@ restart: * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, - high_zoneidx, alloc_flags); + high_zoneidx, alloc_flags, + preferred_zone); if (page) goto got_pg; @@ -1678,7 +1678,7 @@ rebalance: /* Do not dip into emergency reserves if specified */ if (!(gfp_mask & __GFP_NOMEMALLOC)) { page = __alloc_pages_high_priority(gfp_mask, order, - zonelist, high_zoneidx, nodemask); + zonelist, high_zoneidx, nodemask, preferred_zone); if (page) goto got_pg; } @@ -1695,7 +1695,8 @@ rebalance: page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, nodemask, - alloc_flags, &did_some_progress); + alloc_flags, preferred_zone, + &did_some_progress); if (page) goto got_pg; @@ -1707,7 +1708,7 @@ rebalance: if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, - nodemask); + nodemask, preferred_zone); if (page) goto got_pg; @@ -1752,6 +1753,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zone *preferred_zone; struct page *page; lockdep_trace_alloc(gfp_mask); @@ -1769,11 +1771,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + /* The preferred zone is used for statistics later */ + first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + if (!preferred_zone) + return NULL; + + /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, + preferred_zone); if (unlikely(!page)) page = __alloc_pages_slowpath(gfp_mask, order, - zonelist, high_zoneidx, nodemask); + zonelist, high_zoneidx, nodemask, + preferred_zone); return page; } -- cgit v1.2.3 From 3dd2826698b6902aafd9441ce28ebb44735fd0d6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:00 -0700 Subject: page allocator: calculate the migratetype for allocation only once GFP mask is converted into a migratetype when deciding which pagelist to take a page from. However, it is happening multiple times per allocation, at least once per zone traversed. Calculate it once. Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8fc6d1f42f0..d3be076ea9c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1061,13 +1061,13 @@ void split_page(struct page *page, unsigned int order) * or two. */ static struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags) + struct zone *zone, int order, gfp_t gfp_flags, + int migratetype) { unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); int cpu; - int migratetype = allocflags_to_migratetype(gfp_flags); again: cpu = get_cpu(); @@ -1389,7 +1389,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone) + struct zone *preferred_zone, int migratetype) { struct zoneref *z; struct page *page = NULL; @@ -1433,7 +1433,8 @@ zonelist_scan: } } - page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); + page = buffered_rmqueue(preferred_zone, zone, order, + gfp_mask, migratetype); if (page) break; this_zone_full: @@ -1495,7 +1496,8 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone) + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) { struct page *page; @@ -1513,7 +1515,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone); + preferred_zone, migratetype); if (page) goto out; @@ -1534,7 +1536,7 @@ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress) { struct page *page = NULL; struct reclaim_state reclaim_state; @@ -1567,7 +1569,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (likely(*did_some_progress)) page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, - alloc_flags, preferred_zone); + alloc_flags, preferred_zone, + migratetype); return page; } @@ -1587,14 +1590,15 @@ is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask) static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone) + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone); + preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) congestion_wait(WRITE, HZ/50); @@ -1617,7 +1621,8 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone) + nodemask_t *nodemask, struct zone *preferred_zone, + int migratetype) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; @@ -1668,7 +1673,8 @@ restart: */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags, - preferred_zone); + preferred_zone, + migratetype); if (page) goto got_pg; @@ -1678,7 +1684,8 @@ rebalance: /* Do not dip into emergency reserves if specified */ if (!(gfp_mask & __GFP_NOMEMALLOC)) { page = __alloc_pages_high_priority(gfp_mask, order, - zonelist, high_zoneidx, nodemask, preferred_zone); + zonelist, high_zoneidx, nodemask, preferred_zone, + migratetype); if (page) goto got_pg; } @@ -1696,7 +1703,7 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - &did_some_progress); + migratetype, &did_some_progress); if (page) goto got_pg; @@ -1708,7 +1715,8 @@ rebalance: if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, - nodemask, preferred_zone); + nodemask, preferred_zone, + migratetype); if (page) goto got_pg; @@ -1755,6 +1763,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; struct page *page; + int migratetype = allocflags_to_migratetype(gfp_mask); lockdep_trace_alloc(gfp_mask); @@ -1779,11 +1788,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, - preferred_zone); + preferred_zone, migratetype); if (unlikely(!page)) page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone); + preferred_zone, migratetype); return page; } -- cgit v1.2.3 From 341ce06f69abfafa31b9468410a13dbd60e2b237 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Jun 2009 15:32:02 -0700 Subject: page allocator: calculate the alloc_flags for allocation only once Factor out the mapping between GFP and alloc_flags only once. Once factored out, it only needs to be calculated once but some care must be taken. [neilb@suse.de says] As the test: - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) - && !in_interrupt()) { - if (!(gfp_mask & __GFP_NOMEMALLOC)) { has been replaced with a slightly weaker one: + if (alloc_flags & ALLOC_NO_WATERMARKS) { Without care, this would allow recursion into the allocator via direct reclaim. This patch ensures we do not recurse when PF_MEMALLOC is set but TF_MEMDIE callers are now allowed to directly reclaim where they would have been prevented in the past. Signed-off-by: Peter Zijlstra Acked-by: Pekka Enberg Signed-off-by: Mel Gorman Cc: Neil Brown Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 94 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d3be076ea9c..ef870fb92f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1574,15 +1574,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, return page; } -static inline int -is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask) -{ - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) - && !in_interrupt()) - return 1; - return 0; -} - /* * This is called in the allocator slow-path if the allocation request is of * sufficient urgency to ignore watermarks and take other desperate measures @@ -1618,6 +1609,42 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, wakeup_kswapd(zone, order); } +static inline int +gfp_to_alloc_flags(gfp_t gfp_mask) +{ + struct task_struct *p = current; + int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + const gfp_t wait = gfp_mask & __GFP_WAIT; + + /* + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). + */ + if (gfp_mask & __GFP_HIGH) + alloc_flags |= ALLOC_HIGH; + + if (!wait) { + alloc_flags |= ALLOC_HARDER; + /* + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ + alloc_flags &= ~ALLOC_CPUSET; + } else if (unlikely(rt_task(p))) + alloc_flags |= ALLOC_HARDER; + + if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { + if (!in_interrupt() && + ((p->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) + alloc_flags |= ALLOC_NO_WATERMARKS; + } + + return alloc_flags; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -1648,56 +1675,35 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according * to how we want to proceed. - * - * The caller may dip into page reserves a bit more if the caller - * cannot run direct reclaim, or if the caller has realtime scheduling - * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags = ALLOC_WMARK_MIN; - if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) - alloc_flags |= ALLOC_HARDER; - if (gfp_mask & __GFP_HIGH) - alloc_flags |= ALLOC_HIGH; - if (wait) - alloc_flags |= ALLOC_CPUSET; + alloc_flags = gfp_to_alloc_flags(gfp_mask); restart: - /* - * Go through the zonelist again. Let __GFP_HIGH and allocations - * coming from realtime tasks go deeper into reserves. - * - * This is the last chance, in general, before the goto nopage. - * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. - */ + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, - high_zoneidx, alloc_flags, - preferred_zone, - migratetype); + high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); if (page) goto got_pg; rebalance: /* Allocate without watermarks if the context allows */ - if (is_allocation_high_priority(p, gfp_mask)) { - /* Do not dip into emergency reserves if specified */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) { - page = __alloc_pages_high_priority(gfp_mask, order, - zonelist, high_zoneidx, nodemask, preferred_zone, - migratetype); - if (page) - goto got_pg; - } - - /* Ensure no recursion into the allocator */ - goto nopage; + if (alloc_flags & ALLOC_NO_WATERMARKS) { + page = __alloc_pages_high_priority(gfp_mask, order, + zonelist, high_zoneidx, nodemask, + preferred_zone, migratetype); + if (page) + goto got_pg; } /* Atomic allocations - we can't balance anything */ if (!wait) goto nopage; + /* Avoid recursion of direct reclaim */ + if (p->flags & PF_MEMALLOC) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, -- cgit v1.2.3 From a56f57ff94c25d5d80def06f3ed8fe7f99147762 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:02 -0700 Subject: page allocator: remove a branch by assuming __GFP_HIGH == ALLOC_HIGH Allocations that specify __GFP_HIGH get the ALLOC_HIGH flag. If these flags are equal to each other, we can eliminate a branch. [akpm@linux-foundation.org: Suggested the hack] Signed-off-by: Mel Gorman Reviewed-by: Pekka Enberg Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef870fb92f7..94f33e2b7f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1616,14 +1616,16 @@ gfp_to_alloc_flags(gfp_t gfp_mask) int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; const gfp_t wait = gfp_mask & __GFP_WAIT; + /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ + BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); + /* * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - if (gfp_mask & __GFP_HIGH) - alloc_flags |= ALLOC_HIGH; + alloc_flags |= (gfp_mask & __GFP_HIGH); if (!wait) { alloc_flags |= ALLOC_HARDER; -- cgit v1.2.3 From 728ec980fb9fa2d65d9e05444079a53615985e7b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:04 -0700 Subject: page allocator: inline __rmqueue_smallest() Inline __rmqueue_smallest by altering flow very slightly so that there is only one call site. Because there is only one call-site, this function can then be inlined without causing text bloat. On an x86-based config, this patch reduces text by 16 bytes. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94f33e2b7f0..04713f649fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -661,7 +661,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists */ -static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, +static inline +struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { unsigned int current_order; @@ -831,8 +832,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, } } - /* Use MIGRATE_RESERVE rather than fail an allocation */ - return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); + return NULL; } /* @@ -844,11 +844,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) + if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { page = __rmqueue_fallback(zone, order, migratetype); + /* + * Use MIGRATE_RESERVE rather than fail an allocation. goto + * is used because __rmqueue_smallest is an inline function + * and we want just one call site + */ + if (!page) { + migratetype = MIGRATE_RESERVE; + goto retry_reserve; + } + } + return page; } -- cgit v1.2.3 From 0a15c3e9f649f71464ac39e6378f1fde6f995322 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:05 -0700 Subject: page allocator: inline buffered_rmqueue() buffered_rmqueue() is in the fast path so inline it. Because it only has one call site, this function can then be inlined without causing text bloat. On an x86-based config, it made no difference as the savings were padded out by NOP instructions. Milage varies but text will either decrease in size or remain static. Signed-off-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 04713f649fd..c101921e6a6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1072,7 +1072,8 @@ void split_page(struct page *page, unsigned int order) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page *buffered_rmqueue(struct zone *preferred_zone, +static inline +struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, int order, gfp_t gfp_flags, int migratetype) { -- cgit v1.2.3 From 0ac3a4099b0171ff965836182bc688bb8ca01058 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:06 -0700 Subject: page allocator: inline __rmqueue_fallback() __rmqueue_fallback() is in the slow path but has only one call site. Because there is only one call-site, this function can then be inlined without causing text bloat. On an x86-based config, it made no difference as the savings were padded out by NOP instructions. Milage varies but text will either decrease in size or remain static. Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c101921e6a6..91e29b3ed2b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -771,8 +771,8 @@ static int move_freepages_block(struct zone *zone, struct page *page, } /* Remove an element from the buddy allocator from the fallback list */ -static struct page *__rmqueue_fallback(struct zone *zone, int order, - int start_migratetype) +static inline struct page * +__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { struct free_area * area; int current_order; -- cgit v1.2.3 From ed0ae21dc5fe3b9ad4cf1c7bb2bfd2ad596c481c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:07 -0700 Subject: page allocator: do not call get_pageblock_migratetype() more than necessary get_pageblock_migratetype() is potentially called twice for every page free. Once, when being freed to the pcp lists and once when being freed back to buddy. When freeing from the pcp lists, it is known what the pageblock type was at the time of free so use it rather than rechecking. In low memory situations under memory pressure, this might skew anti-fragmentation slightly but the interference is minimal and decisions that are fragmenting memory are being made anyway. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 91e29b3ed2b..8f334d339b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -452,16 +452,18 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, - struct zone *zone, unsigned int order) + struct zone *zone, unsigned int order, + int migratetype) { unsigned long page_idx; int order_size = 1 << order; - int migratetype = get_pageblock_migratetype(page); if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) return; + VM_BUG_ON(migratetype == -1); + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); VM_BUG_ON(page_idx & (order_size - 1)); @@ -530,17 +532,18 @@ static void free_pages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); - __free_one_page(page, zone, order); + __free_one_page(page, zone, order, page_private(page)); } spin_unlock(&zone->lock); } -static void free_one_page(struct zone *zone, struct page *page, int order) +static void free_one_page(struct zone *zone, struct page *page, int order, + int migratetype) { spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; - __free_one_page(page, zone, order); + __free_one_page(page, zone, order, migratetype); spin_unlock(&zone->lock); } @@ -565,7 +568,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, order); + free_one_page(page_zone(page), page, order, + get_pageblock_migratetype(page)); local_irq_restore(flags); } -- cgit v1.2.3 From da456f14d2f2d7350f2b9440af79c85a34c7eed5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:08 -0700 Subject: page allocator: do not disable interrupts in free_page_mlock() free_page_mlock() tests and clears PG_mlocked using locked versions of the bit operations. If set, it disables interrupts to update counters and this happens on every page free even though interrupts are disabled very shortly afterwards a second time. This is wasteful. This patch splits what free_page_mlock() does. The bit check is still made. However, the update of counters is delayed until the interrupts are disabled and the non-lock version for clearing the bit is used. One potential weirdness with this split is that the counters do not get updated if the bad_page() check is triggered but a system showing bad pages is getting screwed already. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: Pekka Enberg Reviewed-by: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Acked-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 11 +++-------- mm/page_alloc.c | 8 +++++++- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 987bb03fbdd..58ec1bc262c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -157,14 +157,9 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) */ static inline void free_page_mlock(struct page *page) { - if (unlikely(TestClearPageMlocked(page))) { - unsigned long flags; - - local_irq_save(flags); - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); - local_irq_restore(flags); - } + __ClearPageMlocked(page); + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); } #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f334d339b0..03a386d24ef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -495,7 +495,6 @@ static inline void __free_one_page(struct page *page, static inline int free_pages_check(struct page *page) { - free_page_mlock(page); if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (page_count(page) != 0) | @@ -552,6 +551,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; + int clearMlocked = PageMlocked(page); for (i = 0 ; i < (1 << order) ; ++i) bad += free_pages_check(page + i); @@ -567,6 +567,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) kernel_map_pages(page, 1 << order, 0); local_irq_save(flags); + if (unlikely(clearMlocked)) + free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order, get_pageblock_migratetype(page)); @@ -1013,6 +1015,7 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + int clearMlocked = PageMlocked(page); if (PageAnon(page)) page->mapping = NULL; @@ -1028,7 +1031,10 @@ static void free_hot_cold_page(struct page *page, int cold) pcp = &zone_pcp(zone, get_cpu())->pcp; local_irq_save(flags); + if (unlikely(clearMlocked)) + free_page_mlock(page); __count_vm_event(PGFREE); + if (cold) list_add_tail(&page->lru, &pcp->list); else -- cgit v1.2.3 From d395b73428d9748fb70b33477c9b2acae62f360a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:09 -0700 Subject: page allocator: do not setup zonelist cache when there is only one node There is a zonelist cache which is used to track zones that are not in the allowed cpuset or found to be recently full. This is to reduce cache footprint on large machines. On smaller machines, it just incurs cost for no gain. This patch only uses the zonelist cache when there are NUMA nodes. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03a386d24ef..fd8e3ca0cf3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1464,8 +1464,11 @@ this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); try_next_zone: - if (NUMA_BUILD && !did_zlc_setup) { - /* we do zlc_setup after the first zone is tried */ + if (NUMA_BUILD && !did_zlc_setup && num_online_nodes() > 1) { + /* + * we do zlc_setup after the first zone is tried but only + * if there are multiple nodes make it worthwhile + */ allowednodes = zlc_setup(zonelist, alloc_flags); zlc_active = 1; did_zlc_setup = 1; -- cgit v1.2.3 From a3af9c389a7f3e675313f442fdd8c247c1cdb66b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:32:10 -0700 Subject: page allocator: do not check for compound pages during the page allocator sanity checks A number of sanity checks are made on each page allocation and free including that the page count is zero. page_count() checks for compound pages and checks the count of the head page if true. However, in these paths, we do not care if the page is compound or not as the count of each tail page should also be zero. This patch makes two changes to the use of page_count() in the free path. It converts one check of page_count() to a VM_BUG_ON() as the count should have been unconditionally checked earlier in the free path. It also avoids checking for compound pages. [mel@csn.ul.ie: Wrote changelog] Signed-off-by: Mel Gorman Signed-off-by: Nick Piggin Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd8e3ca0cf3..8485735fc69 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -421,7 +421,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; if (PageBuddy(buddy) && page_order(buddy) == order) { - BUG_ON(page_count(buddy) != 0); + VM_BUG_ON(page_count(buddy) != 0); return 1; } return 0; @@ -497,7 +497,7 @@ static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_count(page) != 0) | + (atomic_read(&page->_count) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { bad_page(page); return 1; @@ -642,7 +642,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_count(page) != 0) | + (atomic_read(&page->_count) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { bad_page(page); return 1; -- cgit v1.2.3 From 418589663d6011de9006425b6c5721e1544fb47a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:12 -0700 Subject: page allocator: use allocation flags as an index to the zone watermark ALLOC_WMARK_MIN, ALLOC_WMARK_LOW and ALLOC_WMARK_HIGH determin whether pages_min, pages_low or pages_high is used as the zone watermark when allocating the pages. Two branches in the allocator hotpath determine which watermark to use. This patch uses the flags as an array index into a watermark array that is indexed with WMARK_* defines accessed via helpers. All call sites that use zone->pages_* are updated to use the helpers for accessing the values and the array offsets for setting. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 51 ++++++++++++++++++++++++++------------------------- mm/vmscan.c | 39 +++++++++++++++++++++------------------ mm/vmstat.c | 6 +++--- 3 files changed, 50 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8485735fc69..abe26003124 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1150,10 +1150,15 @@ failed: return NULL; } -#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ -#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ -#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ -#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ @@ -1440,14 +1445,10 @@ zonelist_scan: !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; - if (alloc_flags & ALLOC_WMARK_MIN) - mark = zone->pages_min; - else if (alloc_flags & ALLOC_WMARK_LOW) - mark = zone->pages_low; - else - mark = zone->pages_high; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) { if (!zone_reclaim_mode || @@ -1959,7 +1960,7 @@ static unsigned int nr_free_zone_pages(int offset) for_each_zone_zonelist(zone, z, zonelist, offset) { unsigned long size = zone->present_pages; - unsigned long high = zone->pages_high; + unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; } @@ -2096,9 +2097,9 @@ void show_free_areas(void) "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), + K(min_wmark_pages(zone)), + K(low_wmark_pages(zone)), + K(high_wmark_pages(zone)), K(zone_page_state(zone, NR_ACTIVE_ANON)), K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), @@ -2702,8 +2703,8 @@ static inline unsigned long wait_table_bits(unsigned long size) /* * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on zone->pages_min. The memory within the - * reserve will tend to store contiguous free pages. Setting min_free_kbytes + * of blocks reserved is based on min_wmark_pages(zone). The memory within + * the reserve will tend to store contiguous free pages. Setting min_free_kbytes * higher will lead to a bigger reserve which will get freed as contiguous * blocks as reclaim kicks in */ @@ -2716,7 +2717,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; - reserve = roundup(zone->pages_min, pageblock_nr_pages) >> + reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -4319,8 +4320,8 @@ static void calculate_totalreserve_pages(void) max = zone->lowmem_reserve[j]; } - /* we treat pages_high as reserved pages. */ - max += zone->pages_high; + /* we treat the high watermark as reserved pages. */ + max += high_wmark_pages(zone); if (max > zone->present_pages) max = zone->present_pages; @@ -4400,7 +4401,7 @@ void setup_per_zone_pages_min(void) * need highmem pages, so cap pages_min to a small * value here. * - * The (pages_high-pages_low) and (pages_low-pages_min) + * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas controls asynch page reclaim, and so should * not be capped for highmem. */ @@ -4411,17 +4412,17 @@ void setup_per_zone_pages_min(void) min_pages = SWAP_CLUSTER_MAX; if (min_pages > 128) min_pages = 128; - zone->pages_min = min_pages; + zone->watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = tmp; + zone->watermark[WMARK_MIN] = tmp; } - zone->pages_low = zone->pages_min + (tmp >> 2); - zone->pages_high = zone->pages_min + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -4566,7 +4567,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * whenever sysctl_lowmem_reserve_ratio changes. * * The reserve ratio obviously has absolutely no relation with the - * pages_min watermarks. The lowmem reserve ratio can only make sense + * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, diff --git a/mm/vmscan.c b/mm/vmscan.c index a6b7d14812e..e5245d05164 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1401,7 +1401,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= zone->pages_high)) { + if (unlikely(file + free <= high_wmark_pages(zone))) { percent[0] = 100; percent[1] = 0; return; @@ -1533,11 +1533,13 @@ static void shrink_zone(int priority, struct zone *zone, * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over pages_high. Because: + * We reclaim from a zone even if that zone is over high_wmark_pages(zone). + * Because: * a) The caller may be trying to free *extra* pages to satisfy a higher-order * allocation or - * b) The zones may be over pages_high but they must go *over* pages_high to - * satisfy the `incremental min' zone defense algorithm. + * b) The target zone may be at high_wmark_pages(zone) but the lower zones + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' + * zone defense algorithm. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. @@ -1743,7 +1745,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, /* * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at pages_high. + * they are all at high_wmark_pages(zone). * * Returns the number of pages which were actually freed. * @@ -1756,11 +1758,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * the zone for when the problem goes away. * * kswapd scans the zones in the highmem->normal->dma direction. It skips - * zones which have free_pages > pages_high, but once a zone is found to have - * free_pages <= pages_high, we scan that zone and the lower zones regardless - * of the number of free pages in the lower zones. This interoperates with - * the page allocator fallback scheme to ensure that aging of pages is balanced - * across the zones. + * zones which have free_pages > high_wmark_pages(zone), but once a zone is + * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the + * lower zones regardless of the number of free pages in the lower zones. This + * interoperates with the page allocator fallback scheme to ensure that aging + * of pages is balanced across the zones. */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { @@ -1781,7 +1783,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) }; /* * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. + * this zone was successfully refilled to + * free_pages == high_wmark_pages(zone). */ int temp_priority[MAX_NR_ZONES]; @@ -1826,8 +1829,8 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, zone->pages_high, - 0, 0)) { + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), 0, 0)) { end_zone = i; break; } @@ -1861,8 +1864,8 @@ loop_again: priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) all_zones_ok = 0; temp_priority[i] = priority; sc.nr_scanned = 0; @@ -1871,8 +1874,8 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ - if (!zone_watermark_ok(zone, order, 8*zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -2038,7 +2041,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; diff --git a/mm/vmstat.c b/mm/vmstat.c index 74d66dba0cb..415110772c7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -714,9 +714,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), - zone->pages_min, - zone->pages_low, - zone->pages_high, + min_wmark_pages(zone), + low_wmark_pages(zone), + high_wmark_pages(zone), zone->pages_scanned, zone->lru[LRU_ACTIVE_ANON].nr_scan, zone->lru[LRU_INACTIVE_ANON].nr_scan, -- cgit v1.2.3 From f2260e6b1f4eba0f5b5906795117791b5c660154 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:13 -0700 Subject: page allocator: update NR_FREE_PAGES only as necessary When pages are being freed to the buddy allocator, the zone NR_FREE_PAGES counter must be updated. In the case of bulk per-cpu page freeing, it's updated once per page. This retouches cache lines more than necessary. Update the counters one per per-cpu bulk free. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index abe26003124..d56e377ad08 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -456,7 +456,6 @@ static inline void __free_one_page(struct page *page, int migratetype) { unsigned long page_idx; - int order_size = 1 << order; if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) @@ -466,10 +465,9 @@ static inline void __free_one_page(struct page *page, page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - VM_BUG_ON(page_idx & (order_size - 1)); + VM_BUG_ON(page_idx & ((1 << order) - 1)); VM_BUG_ON(bad_range(zone, page)); - __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); while (order < MAX_ORDER-1) { unsigned long combined_idx; struct page *buddy; @@ -524,6 +522,8 @@ static void free_pages_bulk(struct zone *zone, int count, spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; + + __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); while (count--) { struct page *page; @@ -542,6 +542,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; + + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); __free_one_page(page, zone, order, migratetype); spin_unlock(&zone->lock); } @@ -686,7 +688,6 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, list_del(&page->lru); rmv_page_order(page); area->nr_free--; - __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); expand(zone, page, order, current_order, area, migratetype); return page; } @@ -826,8 +827,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); - __mod_zone_page_state(zone, NR_FREE_PAGES, - -(1UL << order)); if (current_order == pageblock_order) set_pageblock_migratetype(page, @@ -900,6 +899,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, set_page_private(page, migratetype); list = &page->lru; } + __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); return i; } @@ -1129,6 +1129,7 @@ again: } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); if (!page) goto failed; -- cgit v1.2.3 From 974709bdb2a34db378fc84140220f363f558d0d6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:14 -0700 Subject: page allocator: get the pageblock migratetype without disabling interrupts Local interrupts are disabled when freeing pages to the PCP list. Part of that free checks what the migratetype of the pageblock the page is in but it checks this with interrupts disabled and interupts should never be disabled longer than necessary. This patch checks the pagetype with interrupts enabled with the impact that it is possible a page is freed to the wrong list when a pageblock changes type. As that block is now already considered mixed from an anti-fragmentation perspective, it's not of vital importance. Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d56e377ad08..e60e4147433 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1030,6 +1030,7 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; + set_page_private(page, get_pageblock_migratetype(page)); local_irq_save(flags); if (unlikely(clearMlocked)) free_page_mlock(page); @@ -1039,7 +1040,6 @@ static void free_hot_cold_page(struct page *page, int cold) list_add_tail(&page->lru, &pcp->list); else list_add(&page->lru, &pcp->list); - set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; if (pcp->count >= pcp->high) { free_pages_bulk(zone, pcp->batch, &pcp->list, 0); -- cgit v1.2.3 From 62bc62a873116805774ffd37d7f86aa4faa832b1 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 16 Jun 2009 15:32:15 -0700 Subject: page allocator: use a pre-calculated value instead of num_online_nodes() in fast paths num_online_nodes() is called in a number of places but most often by the page allocator when deciding whether the zonelist needs to be filtered based on cpusets or the zonelist cache. This is actually a heavy function and touches a number of cache lines. This patch stores the number of online nodes at boot time and updates the value when nodes get onlined and offlined. The value is then used in a number of important paths in place of num_online_nodes(). [rientjes@google.com: do not override definition of node_set_online() with macro] Signed-off-by: Christoph Lameter Signed-off-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- mm/page_alloc.c | 10 ++++++---- mm/slub.c | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2f8241f300f..7b9b6015b2e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -875,7 +875,7 @@ static void return_unused_surplus_pages(struct hstate *h, * can no longer free unreserved surplus pages. This occurs when * the nodes with surplus pages have no free pages. */ - unsigned long remaining_iterations = num_online_nodes(); + unsigned long remaining_iterations = nr_online_nodes; /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -904,7 +904,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; nr_pages--; - remaining_iterations = num_online_nodes(); + remaining_iterations = nr_online_nodes; } } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e60e4147433..0c9f406e3c4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -161,7 +161,9 @@ static unsigned long __meminitdata dma_reserve; #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; +int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); +EXPORT_SYMBOL(nr_online_nodes); #endif int page_group_by_mobility_disabled __read_mostly; @@ -1466,7 +1468,7 @@ this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); try_next_zone: - if (NUMA_BUILD && !did_zlc_setup && num_online_nodes() > 1) { + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup after the first zone is tried but only * if there are multiple nodes make it worthwhile @@ -2265,7 +2267,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } -#define MAX_NODE_LOAD (num_online_nodes()) +#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@ -2474,7 +2476,7 @@ static void build_zonelists(pg_data_t *pgdat) /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = num_online_nodes(); + load = nr_online_nodes; prev_node = local_node; nodes_clear(used_mask); @@ -2625,7 +2627,7 @@ void build_all_zonelists(void) printk("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", - num_online_nodes(), + nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); diff --git a/mm/slub.c b/mm/slub.c index 30354bfeb43..1c950775d6b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3737,7 +3737,7 @@ static int list_locations(struct kmem_cache *s, char *buf, to_cpumask(l->cpus)); } - if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, -- cgit v1.2.3 From b6e68bc1baed9b6972a250aba66b8c5276cf6fb1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:16 -0700 Subject: page allocator: slab: use nr_online_nodes to check for a NUMA platform SLAB currently avoids checking a bitmap repeatedly by checking once and storing a flag. When the addition of nr_online_nodes as a cheaper version of num_online_nodes(), this check can be replaced by nr_online_nodes. (Christoph did a patch that this is lifted almost verbatim from) Signed-off-by: Christoph Lameter Signed-off-by: Mel Gorman Cc: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index bb3254c95cd..744ab9a665a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -898,7 +898,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, */ static int use_alien_caches __read_mostly = 1; -static int numa_platform __read_mostly = 1; static int __init noaliencache_setup(char *s) { use_alien_caches = 0; @@ -1457,10 +1456,8 @@ void __init kmem_cache_init(void) int order; int node; - if (num_possible_nodes() == 1) { + if (num_possible_nodes() == 1) use_alien_caches = 0; - numa_platform = 0; - } for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); @@ -3590,7 +3587,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) * variable to skip the call, which is mostly likely to be present in * the cache. */ - if (numa_platform && cache_free_alien(cachep, objp)) + if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { -- cgit v1.2.3 From 092cead6175bb1b3d3078a34ba71c939d526c70b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:32:17 -0700 Subject: page allocator: move free_page_mlock() to page_alloc.c Currently, free_page_mlock() is only called from page_alloc.c. Thus, we can move it to page_alloc.c. Cc: Lee Schermerhorn Cc: Mel Gorman Cc: Christoph Lameter Cc: Pekka Enberg Cc: Dave Hansen Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 13 ------------- mm/page_alloc.c | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 58ec1bc262c..4b1672a8cf7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -150,18 +150,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } -/* - * free_page_mlock() -- clean up attempts to free and mlocked() page. - * Page should not be on lru, so no need to fix that up. - * free_pages_check() will verify... - */ -static inline void free_page_mlock(struct page *page) -{ - __ClearPageMlocked(page); - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); -} - #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { @@ -170,7 +158,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } -static inline void free_page_mlock(struct page *page) { } #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0c9f406e3c4..5dac5d8cb14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -493,6 +493,22 @@ static inline void __free_one_page(struct page *page, zone->free_area[order].nr_free++; } +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT +/* + * free_page_mlock() -- clean up attempts to free and mlocked() page. + * Page should not be on lru, so no need to fix that up. + * free_pages_check() will verify... + */ +static inline void free_page_mlock(struct page *page) +{ + __ClearPageMlocked(page); + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); +} +#else +static void free_page_mlock(struct page *page) { } +#endif + static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | -- cgit v1.2.3 From 72807a74c0172376bba6b5b27702c9f702b526e9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:18 -0700 Subject: page allocator: sanity check order in the page allocator slow path Callers may speculatively call different allocators in order of preference trying to allocate a buffer of a given size. The order needed to allocate this may be larger than what the page allocator can normally handle. While the allocator mostly does the right thing, it should not direct reclaim or wakeup kswapd with a bogus order. This patch sanity checks the order in the slow path and returns NULL if it is too large. Signed-off-by: Mel Gorman Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dac5d8cb14..85759cdd697 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1446,9 +1446,6 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - if (WARN_ON_ONCE(order >= MAX_ORDER)) - return NULL; - classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* @@ -1706,6 +1703,15 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; struct task_struct *p = current; + /* + * In the slowpath, we sanity check order to avoid ever trying to + * reclaim >= MAX_ORDER areas which will never succeed. Callers may + * be using allocators in order of preference for an area that is + * too large. + */ + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return NULL; + /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and * __GFP_NOWARN set) should not cause reclaim since the subsystem -- cgit v1.2.3 From a1dd268cf6306565a31a48deff8bf4f6b4b105f7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:19 -0700 Subject: mm: use alloc_pages_exact() in alloc_large_system_hash() to avoid duplicated logic alloc_large_system_hash() has logic for freeing pages at the end of an excessively large power-of-two buffer that is a duplicate of what is in alloc_pages_exact(). This patch converts alloc_large_system_hash() to use alloc_pages_exact(). Signed-off-by: Mel Gorman Acked-by: Hugh Dickins Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 85759cdd697..8ca06d87dc1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4699,26 +4699,13 @@ void *__init alloc_large_system_hash(const char *tablename, else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { - unsigned long order = get_order(size); - - if (order < MAX_ORDER) - table = (void *)__get_free_pages(GFP_ATOMIC, - order); /* * If bucketsize is not a power-of-two, we may free - * some pages at the end of hash table. + * some pages at the end of hash table which + * alloc_pages_exact() automatically does */ - if (table) { - unsigned long alloc_end = (unsigned long)table + - (PAGE_SIZE << order); - unsigned long used = (unsigned long)table + - PAGE_ALIGN(size); - split_page(virt_to_page(table), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } - } + if (get_order(size) < MAX_ORDER) + table = alloc_pages_exact(size, GFP_ATOMIC); } } while (!table && size > PAGE_SIZE && --log2qty); -- cgit v1.2.3 From 20a0307c0396c2edb651401d2f2db193dda2f3c9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:22 -0700 Subject: mm: introduce PageHuge() for testing huge/gigantic pages A series of patches to enhance the /proc/pagemap interface and to add a userspace executable which can be used to present the pagemap data. Export 10 more flags to end users (and more for kernel developers): 11. KPF_MMAP (pseudo flag) memory mapped page 12. KPF_ANON (pseudo flag) memory mapped page (anonymous) 13. KPF_SWAPCACHE page is in swap cache 14. KPF_SWAPBACKED page is swap/RAM backed 15. KPF_COMPOUND_HEAD (*) 16. KPF_COMPOUND_TAIL (*) 17. KPF_HUGE hugeTLB pages 18. KPF_UNEVICTABLE page is in the unevictable LRU list 19. KPF_HWPOISON hardware detected corruption 20. KPF_NOPAGE (pseudo flag) no page frame at the address (*) For compound pages, exporting _both_ head/tail info enables users to tell where a compound page starts/ends, and its order. a simple demo of the page-types tool # ./page-types -h page-types [options] -r|--raw Raw mode, for kernel developers -a|--addr addr-spec Walk a range of pages -b|--bits bits-spec Walk pages with specified bits -l|--list Show page details in ranges -L|--list-each Show page details one by one -N|--no-summary Don't show summay info -h|--help Show this usage message addr-spec: N one page at offset N (unit: pages) N+M pages range from N to N+M-1 N,M pages range from N to M-1 N, pages range from N to end ,M pages range from 0 to M bits-spec: bit1,bit2 (flags & (bit1|bit2)) != 0 bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1 bit1,~bit2 (flags & (bit1|bit2)) == bit1 =bit1,bit2 flags == (bit1|bit2) bit-names: locked error referenced uptodate dirty lru active slab writeback reclaim buddy mmap anonymous swapcache swapbacked compound_head compound_tail huge unevictable hwpoison nopage reserved(r) mlocked(r) mappedtodisk(r) private(r) private_2(r) owner_private(r) arch(r) uncached(r) readahead(o) slob_free(o) slub_frozen(o) slub_debug(o) (r) raw mode bits (o) overloaded bits # ./page-types flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 487369 1903 _________________________________ 0x0000000000000014 5 0 __R_D____________________________ referenced,dirty 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000000000024 34 0 __R__l___________________________ referenced,lru 0x0000000000000028 3838 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 48 0 ___U_l_______________________I___ uptodate,lru,readahead 0x000000000000002c 6478 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x0000000000000040 8344 32 ______A__________________________ active 0x0000000000000060 1 0 _____lA__________________________ lru,active 0x0000000000000068 348 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x000000000000006c 988 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 503 1 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 30 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types -r flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 468002 1828 _________________________________ 0x0000000100000000 19102 74 _____________________r___________ reserved 0x0000000000008000 41 0 _______________H_________________ compound_head 0x0000000000010000 188 0 ________________T________________ compound_tail 0x0000000000008014 1 0 __R_D__________H_________________ referenced,dirty,compound_head 0x0000000000010014 4 0 __R_D___________T________________ referenced,dirty,compound_tail 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000800000024 34 0 __R__l__________________P________ referenced,lru,private 0x0000000000000028 3794 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 46 0 ___U_l_______________________I___ uptodate,lru,readahead 0x0000000400000028 44 0 ___U_l_________________d_________ uptodate,lru,mappedtodisk 0x0001000400000028 2 0 ___U_l_________________d_____I___ uptodate,lru,mappedtodisk,readahead 0x000000000000002c 6434 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x000000040000002c 14 0 __RU_l_________________d_________ referenced,uptodate,lru,mappedtodisk 0x000000080000002c 30 0 __RU_l__________________P________ referenced,uptodate,lru,private 0x0000000800000040 8124 31 ______A_________________P________ active,private 0x0000000000000040 219 0 ______A__________________________ active 0x0000000800000060 1 0 _____lA_________________P________ lru,active,private 0x0000000000000068 322 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x0000000400000068 13 0 ___U_lA________________d_________ uptodate,lru,active,mappedtodisk 0x0000000800000068 12 0 ___U_lA_________________P________ uptodate,lru,active,private 0x000000000000006c 977 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x000000040000006c 5 0 __RU_lA________________d_________ referenced,uptodate,lru,active,mappedtodisk 0x000000080000006c 3 0 __RU_lA_________________P________ referenced,uptodate,lru,active,private 0x0000000c0000006c 3 0 __RU_lA________________dP________ referenced,uptodate,lru,active,mappedtodisk,private 0x0000000c00000068 1 0 ___U_lA________________dP________ uptodate,lru,active,mappedtodisk,private 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 538 2 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005008 2 0 ___U________a_b__________________ uptodate,anonymous,swapbacked 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x000000000000580c 1 0 __RU_______Ma_b__________________ referenced,uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 29 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types --raw --list --no-summary --bits reserved offset count flags 0 15 _____________________r___________ 31 4 _____________________r___________ 159 97 _____________________r___________ 4096 2067 _____________________r___________ 6752 2390 _____________________r___________ 9355 3 _____________________r___________ 9728 14526 _____________________r___________ This patch: Introduce PageHuge(), which identifies huge/gigantic pages by their dedicated compound destructor functions. Also move prep_compound_gigantic_page() to hugetlb.c and make __free_pages_ok() non-static. Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 98 ++++++++++++++++++++++++++++++++++++--------------------- mm/internal.h | 5 ++- mm/page_alloc.c | 17 ---------- 3 files changed, 65 insertions(+), 55 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7b9b6015b2e..a56e6f3ce97 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -578,41 +578,6 @@ static void free_huge_page(struct page *page) hugetlb_put_quota(mapping, 1); } -/* - * Increment or decrement surplus_huge_pages. Keep node-specific counters - * balanced by operating on them in a round-robin fashion. - * Returns 1 if an adjustment was made. - */ -static int adjust_pool_surplus(struct hstate *h, int delta) -{ - static int prev_nid; - int nid = prev_nid; - int ret = 0; - - VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) - continue; - - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (nid != prev_nid); - - prev_nid = nid; - return ret; -} - static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); @@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } +static void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} + +int PageHuge(struct page *page) +{ + compound_page_dtor *dtor; + + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + dtor = get_compound_page_dtor(page); + + return dtor == free_huge_page; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) } #endif +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. + */ +static int adjust_pool_surplus(struct hstate *h, int delta) +{ + static int prev_nid; + int nid = prev_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + do { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 && !h->surplus_huge_pages_node[nid]) + continue; + /* Surplus cannot exceed the total number of pages */ + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) + continue; + + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; + ret = 1; + break; + } while (nid != prev_nid); + + prev_nid = nid; + return ret; +} + #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { diff --git a/mm/internal.h b/mm/internal.h index 4b1672a8cf7..b4ac332e807 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -16,9 +16,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -extern void prep_compound_page(struct page *page, unsigned long order); -extern void prep_compound_gigantic_page(struct page *page, unsigned long order); - static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); @@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page); */ extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void prep_compound_page(struct page *page, unsigned long order); + /* * function for dealing with page's order in buddy system. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8ca06d87dc1..131655cdb6b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -300,23 +300,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -#ifdef CONFIG_HUGETLBFS -void prep_compound_gigantic_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - struct page *p = page + 1; - - set_compound_page_dtor(page, free_compound_page); - set_compound_order(page, order); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); - p->first_page = page; - } -} -#endif - static int destroy_compound_page(struct page *page, unsigned long order) { int i; -- cgit v1.2.3 From 56e49d218890f49b0057710a4b6fef31f5ffbfec Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 16 Jun 2009 15:32:28 -0700 Subject: vmscan: evict use-once pages first When the file LRU lists are dominated by streaming IO pages, evict those pages first, before considering evicting other pages. This should be safe from deadlocks or performance problems because only three things can happen to an inactive file page: 1) referenced twice and promoted to the active list 2) evicted by the pageout code 3) under IO, after which it will get evicted or promoted The pages freed in this way can either be reused for streaming IO, or allocated for something else. If the pages are used for streaming IO, this pageout pattern continues. Otherwise, we will fall back to the normal pageout pattern. Signed-off-by: Rik van Riel Reported-by: Elladan Cc: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Lee Schermerhorn Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++++++++ mm/vmscan.c | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78eb8552818..70db6e0a5ee 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -570,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 0; } +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + unsigned long active; + unsigned long inactive; + + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + + return (active > inactive); +} + unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) diff --git a/mm/vmscan.c b/mm/vmscan.c index e5245d05164..9673437a545 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1351,12 +1351,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_file_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_FILE); + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + + return (active > inactive); +} + +/** + * inactive_file_is_low - check if file pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * When the system is doing streaming IO, memory pressure here + * ensures that active file pages get deactivated, until more + * than half of the file pages are on the inactive list. + * + * Once we get to that situation, protect the system's working + * set from being evicted by disabling active file page aging. + * + * This uses a different ratio than the anonymous pages, because + * the page cache uses a use-once replacement algorithm. + */ +static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scanning_global_lru(sc)) + low = inactive_file_is_low_global(zone); + else + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); + return low; +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE) { + if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } -- cgit v1.2.3 From 6e08a369ee10b361ac1cdcdf4fabd420fd08beb3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:29 -0700 Subject: vmscan: cleanup the scan batching code The vmscan batching logic is twisting. Move it into a standalone function nr_scan_try_batch() and document it. No behavior change. Signed-off-by: Wu Fengguang Acked-by: Rik van Riel Cc: Nick Piggin Cc: Christoph Lameter Acked-by: Peter Zijlstra Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/vmscan.c | 39 ++++++++++++++++++++++++++++----------- mm/vmstat.c | 8 ++++---- 3 files changed, 33 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 131655cdb6b..e5b8f628d16 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3657,7 +3657,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9673437a545..d4da097533c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1492,6 +1492,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, percent[1] = 100 - percent[0]; } +/* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan, + unsigned long swap_cluster_max) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= swap_cluster_max) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. @@ -1517,14 +1537,11 @@ static void shrink_zone(int priority, struct zone *zone, scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) { - zone->lru[l].nr_scan += scan; - nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= swap_cluster_max) - zone->lru[l].nr_scan = 0; - else - nr[l] = 0; - } else + if (scanning_global_lru(sc)) + nr[l] = nr_scan_try_batch(scan, + &zone->lru[l].nr_saved_scan, + swap_cluster_max); + else nr[l] = scan; } @@ -2124,11 +2141,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; + if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { unsigned long nr_to_scan; - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; nr_to_scan = min(nr_pages, lru_pages); nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); diff --git a/mm/vmstat.c b/mm/vmstat.c index 415110772c7..84c05555691 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -718,10 +718,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, low_wmark_pages(zone), high_wmark_pages(zone), zone->pages_scanned, - zone->lru[LRU_ACTIVE_ANON].nr_scan, - zone->lru[LRU_INACTIVE_ANON].nr_scan, - zone->lru[LRU_ACTIVE_FILE].nr_scan, - zone->lru[LRU_INACTIVE_FILE].nr_scan, + zone->lru[LRU_ACTIVE_ANON].nr_saved_scan, + zone->lru[LRU_INACTIVE_ANON].nr_saved_scan, + zone->lru[LRU_ACTIVE_FILE].nr_saved_scan, + zone->lru[LRU_INACTIVE_FILE].nr_saved_scan, zone->spanned_pages, zone->present_pages); -- cgit v1.2.3 From 08d9ae7cbbd0c5c07573d072ec771e997a9a39e0 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:30 -0700 Subject: vmscan: don't export nr_saved_scan in /proc/zoneinfo The lru->nr_saved_scan's are not meaningful counters for even kernel developers. They typically are smaller than 32 and are always 0 for large lists. So remove them from /proc/zoneinfo. Hopefully this interface change won't break too many scripts. /proc/zoneinfo is too unstructured to be script friendly, and I wonder the affected scripts - if there are any - are still bleeding since the not long ago commit "vmscan: split LRU lists into anon & file sets", which also touched the "scanned" line :) If we are to re-export accumulated vmscan counts in the future, they can go to new lines in /proc/zoneinfo instead of the current form, or to /sys/devices/system/node/node0/meminfo? Signed-off-by: Wu Fengguang Acked-by: Rik van Riel Cc: Nick Piggin Acked-by: Christoph Lameter Cc: Peter Zijlstra Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 84c05555691..1e151cf6bf8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -710,7 +710,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" + "\n scanned %lu" "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), @@ -718,10 +718,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, low_wmark_pages(zone), high_wmark_pages(zone), zone->pages_scanned, - zone->lru[LRU_ACTIVE_ANON].nr_saved_scan, - zone->lru[LRU_INACTIVE_ANON].nr_saved_scan, - zone->lru[LRU_ACTIVE_FILE].nr_saved_scan, - zone->lru[LRU_INACTIVE_FILE].nr_saved_scan, zone->spanned_pages, zone->present_pages); -- cgit v1.2.3 From af166777cf451f0373b952ce6766dc1c25385686 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:31 -0700 Subject: vmscan: ZVC updates in shrink_active_list() can be done once This effectively lifts the unit of updates to nr_inactive_* and pgdeactivate from PAGEVEC_SIZE=14 to SWAP_CLUSTER_MAX=32, or MAX_ORDER_NR_PAGES=1024 for reclaim_zone(). Cc: Johannes Weiner Acked-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Wu Fengguang Cc: Peter Zijlstra Cc: Christoph Lameter Cc: KOSAKI Motohiro Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index d4da097533c..7592d8eb114 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1223,7 +1223,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { unsigned long pgmoved; - int pgdeactivate = 0; unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); @@ -1252,7 +1251,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; + pgmoved = 0; /* count referenced (mapping) mapped pages */ while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1286,7 +1285,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[!!file] += pgmoved; - pgmoved = 0; + pgmoved = 0; /* count pages moved to inactive list */ while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); @@ -1299,10 +1298,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, mem_cgroup_add_lru_list(page, lru); pgmoved++; if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); spin_unlock_irq(&zone->lru_lock); - pgdeactivate += pgmoved; - pgmoved = 0; if (buffer_heads_over_limit) pagevec_strip(&pvec); __pagevec_release(&pvec); @@ -1310,9 +1306,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - pgdeactivate += pgmoved; __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgdeactivate); + __count_vm_events(PGDEACTIVATE, pgmoved); spin_unlock_irq(&zone->lru_lock); if (buffer_heads_over_limit) pagevec_strip(&pvec); -- cgit v1.2.3 From 5c87eada68fe5d29a5f67528f81b6e45124f579b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 16 Jun 2009 15:32:32 -0700 Subject: mm: setup_per_zone_inactive_ratio - do not call for int_sqrt if not needed int_sqrt() returns 0 if its argument is zero so call it if only needed. Signed-off-by: Cyrill Gorcunov Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e5b8f628d16..db8c46ffa9f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4471,8 +4471,9 @@ static void setup_per_zone_inactive_ratio(void) /* Zone size in gigabytes */ gb = zone->present_pages >> (30 - PAGE_SHIFT); - ratio = int_sqrt(10 * gb); - if (!ratio) + if (gb) + ratio = int_sqrt(10 * gb); + else ratio = 1; zone->inactive_ratio = ratio; -- cgit v1.2.3 From e9bb35df6f813ca46f8e6273add657643c7df73f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 16 Jun 2009 15:32:32 -0700 Subject: mm: setup_per_zone_inactive_ratio - fix comment and make it __init The caller of setup_per_zone_inactive_ratio is an __init function. There is no need to keep the callee after it completed as well. Also fix a comment. Acked-by: David Rientjes Signed-off-by: Cyrill Gorcunov Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index db8c46ffa9f..076463cb21b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4440,8 +4440,6 @@ void setup_per_zone_pages_min(void) } /** - * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. - * * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. @@ -4462,7 +4460,7 @@ void setup_per_zone_pages_min(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void setup_per_zone_inactive_ratio(void) +static void __init setup_per_zone_inactive_ratio(void) { struct zone *zone; -- cgit v1.2.3 From f8ad0f499fad5cdbcaaa2d97542b2db869b5a770 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:33 -0700 Subject: mm: introduce follow_pte() A generic readonly page table lookup helper to map an address space and an address from it to a pte. Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 891bad0613f..7135d6b2599 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3103,6 +3103,43 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ +static int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out; + + /* We cannot handle huge page PFN maps. Luckily they don't exist. */ + if (pmd_huge(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + if (!ptep) + goto out; + if (!pte_present(*ptep)) + goto unlock; + *ptepp = ptep; + return 0; +unlock: + pte_unmap_unlock(ptep, *ptlp); +out: + return -EINVAL; +} + #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, -- cgit v1.2.3 From 03668a4debf4f50de55c34b6e66dae63e1c73716 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:34 -0700 Subject: mm: use generic follow_pte() in follow_phys() Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 36 +++++------------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7135d6b2599..24ff20480bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3145,50 +3145,24 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; + int ret = -EINVAL; pte_t *ptep, pte; spinlock_t *ptl; - resource_size_t phys_addr = 0; - struct mm_struct *mm = vma->vm_mm; - int ret = -EINVAL; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto out; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto out; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - - /* We cannot handle huge page PFN maps. Luckily they don't exist. */ - if (pmd_huge(*pmd)) + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!ptep) - goto out; - pte = *ptep; - if (!pte_present(pte)) - goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - phys_addr = pte_pfn(pte); - phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ *prot = pgprot_val(pte_pgprot(pte)); - *phys = phys_addr; - ret = 0; + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + ret = 0; unlock: pte_unmap_unlock(ptep, ptl); out: -- cgit v1.2.3 From 3b6748e2dd69906af3835db4dc9d1c8a3ee4c68c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:35 -0700 Subject: mm: introduce follow_pfn() Analoguous to follow_phys(), add a helper that looks up the PFN at a user virtual address in an IO mapping or a raw PFN mapping. Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 24ff20480bb..d5d1653d60a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3140,6 +3140,35 @@ out: return -EINVAL; } +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(follow_pfn); + #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, -- cgit v1.2.3 From dab48dab37d2770824420d1e01730a107fade1aa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 16 Jun 2009 15:32:37 -0700 Subject: page-allocator: warn if __GFP_NOFAIL is used for a large allocation __GFP_NOFAIL is a bad fiction. Allocations _can_ fail, and callers should detect and suitably handle this (and not by lamely moving the infinite loop up to the caller level either). Attempting to use __GFP_NOFAIL for a higher-order allocation is even worse, so add a once-off runtime check for this to slap people around for even thinking about trying it. Cc: David Rientjes Acked-by: Mel Gorman Acked-by: Peter Zijlstra Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 076463cb21b..61290ea721c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1128,6 +1128,19 @@ again: list_del(&page->lru); pcp->count--; } else { + if (unlikely(gfp_flags & __GFP_NOFAIL)) { + /* + * __GFP_NOFAIL is not to be used in new code. + * + * All __GFP_NOFAIL callers should be fixed so that they + * properly detect and handle allocation failures. + * + * We most definitely don't want callers attempting to + * allocate greater than single-page units with + * __GFP_NOFAIL. + */ + WARN_ON_ONCE(order > 0); + } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); -- cgit v1.2.3 From 75927af8bcb940dad4fe281713d526cb520869ff Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:32:38 -0700 Subject: mm: madvise(): correct return code The posix_madvise() function succeeds (and does nothing) when called with parameters (NULL, 0, -1); according to LSB tests, it should fail with EINVAL because -1 is not a valid flag. When called with a valid address and size, it correctly fails. So perform an initial check for valid flags first. Reported-by: Jiri Dluhos Signed-off-by: Nick Piggin Reviewed-and-Tested-by: WANG Cong Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index e994dcb479d..76eb4193acd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -238,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, break; default: - error = -EINVAL; + BUG(); break; } return error; } +static int +madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_DOFORK: + case MADV_DONTFORK: + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 1; + + default: + return 0; + } +} /* * The madvise(2) system call. * @@ -289,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; + if (!madvise_behavior_valid(behavior)) + return error; + write = madvise_need_mmap_write(behavior); if (write) down_write(¤t->mm->mmap_sem); -- cgit v1.2.3 From 7f33d49a2ed546e01f7b1d0607661810f2421859 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Jun 2009 15:32:41 -0700 Subject: mm, PM/Freezer: Disable OOM killer when tasks are frozen Currently, the following scenario appears to be possible in theory: * Tasks are frozen for hibernation or suspend. * Free pages are almost exhausted. * Certain piece of code in the suspend code path attempts to allocate some memory using GFP_KERNEL and allocation order less than or equal to PAGE_ALLOC_COSTLY_ORDER. * __alloc_pages_internal() cannot find a free page so it invokes the OOM killer. * The OOM killer attempts to kill a task, but the task is frozen, so it doesn't die immediately. * __alloc_pages_internal() jumps to 'restart', unsuccessfully tries to find a free page and invokes the OOM killer. * No progress can be made. Although it is now hard to trigger during hibernation due to the memory shrinking carried out by the hibernation code, it is theoretically possible to trigger during suspend after the memory shrinking has been removed from that code path. Moreover, since memory allocations are going to be used for the hibernation memory shrinking, it will be even more likely to happen during hibernation. To prevent it from happening, introduce the oom_killer_disabled switch that will cause __alloc_pages_internal() to fail in the situations in which the OOM killer would have been called and make the freezer set this switch after tasks have been successfully frozen. [akpm@linux-foundation.org: be nicer to the namespace] Signed-off-by: Rafael J. Wysocki Cc: Fengguang Wu Cc: David Rientjes Acked-by: Pavel Machek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 61290ea721c..5b09488d0f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -178,6 +178,8 @@ static void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } +bool oom_killer_disabled __read_mostly; + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -1769,6 +1771,8 @@ rebalance: */ if (!did_some_progress) { if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_killer_disabled) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, -- cgit v1.2.3 From 35282a2de4e5e4e173ab61aa9d7015886021a821 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Tue, 16 Jun 2009 15:32:43 -0700 Subject: migration: only migrate_prep() once per move_pages() migrate_prep() is fairly expensive (72us on 16-core barcelona 1.9GHz). Commit 3140a2273009c01c27d316f35ab76a37e105fdd8 improved move_pages() throughput by breaking it into chunks, but it also made migrate_prep() be called once per chunk (every 128pages or so) instead of once per move_pages(). This patch reverts to calling migrate_prep() only once per chunk as we did before 2.6.29. It is also a followup to commit 0aedadf91a70a11c4a3e7c7d99b21e5528af8d5d ("mm: move migrate_prep out from under mmap_sem"). This improves migration throughput on the above machine from 600MB/s to 750MB/s. Signed-off-by: Brice Goglin Acked-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Heiko Carstens Cc: Nick Piggin Cc: Hugh Dickins Cc: Rik van Riel Cc: Lee Schermerhorn Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 5a24923e7fd..939888f9dda 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, struct page_to_node *pp; LIST_HEAD(pagelist); - migrate_prep(); down_read(&mm->mmap_sem); /* @@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); if (!pm) goto out; + + migrate_prep(); + /* * Store a chunk of page_to_node array in a page, * but keep the last one as a marker -- cgit v1.2.3 From 69c854817566db82c362797b4a6521d0b00fe1d8 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Tue, 16 Jun 2009 15:32:44 -0700 Subject: vmscan: prevent shrinking of active anon lru list in case of no swap space V3 shrink_zone() can deactivate active anon pages even if we don't have a swap device. Many embedded products don't have a swap device. So the deactivation of anon pages is unnecessary. This patch prevents unnecessary deactivation of anon lru pages. But, it don't prevent aging of anon pages to swap out. Signed-off-by: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Johannes Weiner Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7592d8eb114..879d034930c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1570,7 +1570,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc)) + if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); -- cgit v1.2.3 From bc75d33f0fc1d56e734db1f56d3cfc8097b8e0cf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:48 -0700 Subject: page-allocator: clean up functions related to pages_min Change the names of two functions. It doesn't affect behavior. Presently, setup_per_zone_pages_min() changes low, high of zone as well as min. So a better name is setup_per_zone_wmarks(). That's because Mel changed zone->pages_[hig/low/min] to zone->watermark array in "page allocator: replace the watermark-related union in struct zone with a watermark[] array". * setup_per_zone_pages_min => setup_per_zone_wmarks Of course, we have to change init_per_zone_pages_min, too. There are not pages_min any more. * init_per_zone_pages_min => init_per_zone_wmark_min [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5fd6d..037291e15b2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -422,7 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); if (onlined_pages) { kswapd_run(zone_to_nid(zone)); node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b09488d0f5..8629c84fd9b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4396,12 +4396,13 @@ static void setup_per_zone_lowmem_reserve(void) } /** - * setup_per_zone_pages_min - called when min_free_kbytes changes. + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-added * - * Ensures that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. */ -void setup_per_zone_pages_min(void) +void setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4519,7 +4520,7 @@ static void __init setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_pages_min(void) +static int __init init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; @@ -4530,12 +4531,12 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 128; if (min_free_kbytes > 65536) min_free_kbytes = 65536; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_pages_min) +module_init(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -4547,7 +4548,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, { proc_dointvec(table, write, file, buffer, length, ppos); if (write) - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); return 0; } -- cgit v1.2.3 From 96cb4df5ddf5e6d5785b5acd4003e3689b87f896 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:49 -0700 Subject: page-allocator: add inactive ratio calculation function of each zone Factor the per-zone arithemetic inside setup_per_zone_inactive_ratio()'s loop into a a separate function, calculate_zone_inactive_ratio(). This function will be used in a later patch [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8629c84fd9b..303607f1d32 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4478,22 +4478,26 @@ void setup_per_zone_wmarks(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void __init setup_per_zone_inactive_ratio(void) +void calculate_zone_inactive_ratio(struct zone *zone) { - struct zone *zone; + unsigned int gb, ratio; - for_each_zone(zone) { - unsigned int gb, ratio; + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + if (gb) + ratio = int_sqrt(10 * gb); + else + ratio = 1; - /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); - if (gb) - ratio = int_sqrt(10 * gb); - else - ratio = 1; + zone->inactive_ratio = ratio; +} - zone->inactive_ratio = ratio; - } +static void __init setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) + calculate_zone_inactive_ratio(zone); } /* -- cgit v1.2.3 From bce7394a3ef82b8477952fbab838e4a6e8cb47d2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:50 -0700 Subject: page-allocator: reset wmark_min and inactive ratio of zone when hotplug happens Solve two problems. Whenever memory hotplug sucessfully happens, zone->present_pages have to be changed. 1) Now memory hotplug calls setup_per_zone_wmark_min only when online_pages called, not offline_pages. It breaks balance. 2) If zone->present_pages is changed, we also have to change zone->inactive_ratio. That's because inactive_ratio depends on zone->present_pages. Signed-off-by: Minchan Kim Acked-by: Yasunori Goto Cc: Rik van Riel Cc: KOSAKI Motohiro Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++++ mm/page_alloc.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 037291e15b2..e4412a676c8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -423,6 +423,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->zone_pgdat->node_present_pages += onlined_pages; setup_per_zone_wmarks(); + calculate_zone_inactive_ratio(zone); if (onlined_pages) { kswapd_run(zone_to_nid(zone)); node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); @@ -832,6 +833,9 @@ repeat: totalram_pages -= offlined_pages; num_physpages -= offlined_pages; + setup_per_zone_wmarks(); + calculate_zone_inactive_ratio(zone); + vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 303607f1d32..00e293734fc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4397,7 +4397,7 @@ static void setup_per_zone_lowmem_reserve(void) /** * setup_per_zone_wmarks - called when min_free_kbytes changes - * or when memory is hot-added + * or when memory is hot-{added|removed} * * Ensures that the watermark[min,low,high] values for each zone are set * correctly with respect to min_free_kbytes. -- cgit v1.2.3 From 6837765963f1723e80ca97b1fae660f3a60d77df Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:32:51 -0700 Subject: mm: remove CONFIG_UNEVICTABLE_LRU config option Currently, nobody wants to turn UNEVICTABLE_LRU off. Thus this configurability is unnecessary. Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Andi Kleen Acked-by: Minchan Kim Cc: David Woodhouse Cc: Matt Mackall Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 14 +------------- mm/internal.h | 6 ------ mm/mlock.c | 22 ---------------------- mm/page_alloc.c | 9 --------- mm/rmap.c | 3 +-- mm/vmscan.c | 17 ----------------- mm/vmstat.c | 4 ---- 7 files changed, 2 insertions(+), 73 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 71830ba7b98..97d2c88b745 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -203,25 +203,13 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config UNEVICTABLE_LRU - bool "Add LRU list to track non-evictable pages" - default y - help - Keeps unevictable pages off of the active and inactive pageout - lists, so kswapd will not waste CPU time or have its balancing - algorithms thrown off by scanning these pages. Selecting this - will use one page flag and increase the code size a little, - say Y unless you know what you are doing. - - See Documentation/vm/unevictable-lru.txt for more information. - config HAVE_MLOCK bool default y if MMU=y config HAVE_MLOCKED_PAGE_BIT bool - default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + default y if HAVE_MLOCK=y config MMU_NOTIFIER bool diff --git a/mm/internal.h b/mm/internal.h index b4ac332e807..f02c7508068 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -73,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * unevictable_migrate_page() called only from migrate_page_copy() to * migrate unevictable flag to new page. @@ -85,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) if (TestClearPageUnevictable(old)) SetPageUnevictable(new); } -#else -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ -} -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* diff --git a/mm/mlock.c b/mm/mlock.c index ac130433c7d..45eb650b965 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -31,7 +31,6 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); -#ifdef CONFIG_UNEVICTABLE_LRU /* * Mlocked pages are marked with PageMlocked() flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate @@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -#else /* CONFIG_UNEVICTABLE_LRU */ - -/* - * Just make pages present if VM_LOCKED. No-op if unlocking. - */ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) -{ - if (mlock && (vma->vm_flags & VM_LOCKED)) - return make_pages_present(start, end); - return 0; -} - -static inline int __mlock_posix_error_return(long retval) -{ - return 0; -} - -#endif /* CONFIG_UNEVICTABLE_LRU */ - /** * mlock_vma_pages_range() - mlock pages in specified vma range. * @vma - the vma containing the specfied address range diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 00e293734fc..c95a77cd581 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2077,19 +2077,14 @@ void show_free_areas(void) printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" " inactive_file:%lu" -//TODO: check/adjust line lengths -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lu" -#endif " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_INACTIVE_FILE), -#ifdef CONFIG_UNEVICTABLE_LRU global_page_state(NR_UNEVICTABLE), -#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -2113,9 +2108,7 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lukB" -#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2129,9 +2122,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU K(zone_page_state(zone, NR_UNEVICTABLE)), -#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") diff --git a/mm/rmap.c b/mm/rmap.c index 23122af3261..316c9d6930a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1202,7 +1202,6 @@ int try_to_unmap(struct page *page, int migration) return ret; } -#ifdef CONFIG_UNEVICTABLE_LRU /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked @@ -1226,4 +1225,4 @@ int try_to_munlock(struct page *page) else return try_to_unmap_file(page, 1, 0); } -#endif + diff --git a/mm/vmscan.c b/mm/vmscan.c index 879d034930c..2c4b945b011 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -514,7 +514,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) * * lru_lock must not be held, interrupts must be enabled. */ -#ifdef CONFIG_UNEVICTABLE_LRU void putback_lru_page(struct page *page) { int lru; @@ -568,20 +567,6 @@ redo: put_page(page); /* drop ref from isolate */ } -#else /* CONFIG_UNEVICTABLE_LRU */ - -void putback_lru_page(struct page *page) -{ - int lru; - VM_BUG_ON(PageLRU(page)); - - lru = !!TestClearPageActive(page) + page_is_file_cache(page); - lru_cache_add_lru(page, lru); - put_page(page); -} -#endif /* CONFIG_UNEVICTABLE_LRU */ - - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -2470,7 +2455,6 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * page_evictable - test whether a page is evictable * @page: the page to test @@ -2717,4 +2701,3 @@ void scan_unevictable_unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e151cf6bf8..1e3aa8139f2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -629,10 +629,8 @@ static const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", -#ifdef CONFIG_UNEVICTABLE_LRU "nr_unevictable", "nr_mlock", -#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -687,7 +685,6 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif -#ifdef CONFIG_UNEVICTABLE_LRU "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -697,7 +694,6 @@ static const char * const vmstat_text[] = { "unevictable_pgs_stranded", "unevictable_pgs_mlockfreed", #endif -#endif }; static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, -- cgit v1.2.3 From cb4b86ba47bb0937b71fb825b3ed88adf7a190f0 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:52 -0700 Subject: mm: add swap cache interface for swap reference In a following patch, the usage of swap cache is recorded into swap_map. This patch is for necessary interface changes to do that. 2 interfaces: - swapcache_prepare() - swapcache_free() are added for allocating/freeing refcnt from swap-cache to existing swap entries. But implementation itself is not changed under this patch. At adding swapcache_free(), memcg's hook code is moved under swapcache_free(). This is better than using scattered hooks. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Acked-by: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- mm/swap_state.c | 11 +++++------ mm/swapfile.c | 19 +++++++++++++++++++ mm/vmscan.c | 3 +-- 4 files changed, 26 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 0132fbd45a2..47ab1918228 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); - swap_free(swap); + swapcache_free(swap, NULL); redirty: set_page_dirty(page); if (wbc->for_reclaim) diff --git a/mm/swap_state.c b/mm/swap_state.c index 1416e7e9e02..19bdf3017a9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -162,11 +162,11 @@ int add_to_swap(struct page *page) return 1; case -EEXIST: /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); + swapcache_free(entry, NULL); continue; default: /* -ENOMEM radix-tree allocation failure */ - swap_free(entry); + swapcache_free(entry, NULL); return 0; } } @@ -188,8 +188,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&swapper_space.tree_lock); - mem_cgroup_uncharge_swapcache(page, entry); - swap_free(entry); + swapcache_free(entry, page); page_cache_release(page); } @@ -293,7 +292,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - if (!swap_duplicate(entry)) + if (!swapcache_prepare(entry)) break; /* @@ -317,7 +316,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } ClearPageSwapBacked(new_page); __clear_page_locked(new_page); - swap_free(entry); + swapcache_free(entry, NULL); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe0ab6..3187079903f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -509,6 +509,16 @@ void swap_free(swp_entry_t entry) } } +/* + * Called after dropping swapcache to decrease refcnt to swap entries. + */ +void swapcache_free(swp_entry_t entry, struct page *page) +{ + if (page) + mem_cgroup_uncharge_swapcache(page, entry); + return swap_free(entry); +} + /* * How many references to page are currently swapped out? */ @@ -1979,6 +1989,15 @@ bad_file: goto out; } +/* + * Called when allocating swap cache for exising swap entry, + */ +int swapcache_prepare(swp_entry_t entry) +{ + return swap_duplicate(entry); +} + + struct swap_info_struct * get_swap_info_struct(unsigned type) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 2c4b945b011..52339dd7bf8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_swapcache(page, swap); - swap_free(swap); + swapcache_free(swap, page); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); -- cgit v1.2.3 From 355cfa73ddff2fb8fa14e93bd94a057cc022512e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:53 -0700 Subject: mm: modify swap_map and add SWAP_HAS_CACHE flag This is a part of the patches for fixing memcg's swap accountinf leak. But, IMHO, not a bad patch even if no memcg. There are 2 kinds of references to swap. - reference from swap entry - reference from swap cache Then, - If there is swap cache && swap's refcnt is 1, there is only swap cache. (*) swapcount(entry) == 1 && find_get_page(swapper_space, entry) != NULL This counting logic have worked well for a long time. But considering that we cannot know there is a _real_ reference or not by swap_map[], current usage of counter is not very good. This patch adds a flag SWAP_HAS_CACHE and recored information that a swap entry has a cache or not. This will remove -1 magic used in swapfile.c and be a help to avoid unnecessary find_get_page(). Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Daisuke Nishimura Cc: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 5 +- mm/swapfile.c | 214 +++++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 163 insertions(+), 56 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index 19bdf3017a9..b9ca029673a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -292,7 +292,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - if (!swapcache_prepare(entry)) + err = swapcache_prepare(entry); + if (err == -EEXIST) /* seems racy */ + continue; + if (err) /* swp entry is obsolete ? */ break; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 3187079903f..0d7296971ad 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,6 +53,33 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +/* For reference count accounting in swap_map */ +/* enum for swap_map[] handling. internal use only */ +enum { + SWAP_MAP = 0, /* ops for reference from swap users */ + SWAP_CACHE, /* ops for reference from swap cache */ +}; + +static inline int swap_count(unsigned short ent) +{ + return ent & SWAP_COUNT_MASK; +} + +static inline bool swap_has_cache(unsigned short ent) +{ + return !!(ent & SWAP_HAS_CACHE); +} + +static inline unsigned short encode_swapmap(int count, bool has_cache) +{ + unsigned short ret = count; + + if (has_cache) + return SWAP_HAS_CACHE | ret; + return ret; +} + + /* * We need this because the bdev->unplug_fn can sleep and we cannot * hold swap_lock while calling the unplug_fn. And swap_lock @@ -167,7 +194,8 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 -static inline unsigned long scan_swap_map(struct swap_info_struct *si) +static inline unsigned long scan_swap_map(struct swap_info_struct *si, + int cache) { unsigned long offset; unsigned long scan_base; @@ -285,7 +313,10 @@ checks: si->lowest_bit = si->max; si->highest_bit = 0; } - si->swap_map[offset] = 1; + if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ + si->swap_map[offset] = encode_swapmap(0, true); + else /* at suspend */ + si->swap_map[offset] = encode_swapmap(1, false); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -401,7 +432,8 @@ swp_entry_t get_swap_page(void) continue; swap_list.next = next; - offset = scan_swap_map(si); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_CACHE); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -415,6 +447,7 @@ noswap: return (swp_entry_t) {0}; } +/* The only caller of this function is now susupend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -424,7 +457,8 @@ swp_entry_t get_swap_page_of_type(int type) si = swap_info + type; if (si->flags & SWP_WRITEOK) { nr_swap_pages--; - offset = scan_swap_map(si); + /* This is called for allocating swap entry, not cache */ + offset = scan_swap_map(si, SWAP_MAP); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -471,25 +505,38 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) +static int swap_entry_free(struct swap_info_struct *p, + swp_entry_t ent, int cache) { unsigned long offset = swp_offset(ent); - int count = p->swap_map[offset]; - - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = count; - if (!count) { - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) - p->highest_bit = offset; - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = p - swap_info; - nr_swap_pages++; - p->inuse_pages--; - mem_cgroup_uncharge_swap(ent); + int count = swap_count(p->swap_map[offset]); + bool has_cache; + + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_MAP) { /* dropping usage count of swap */ + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = encode_swapmap(count, has_cache); } + } else { /* dropping swap cache flag */ + VM_BUG_ON(!has_cache); + p->swap_map[offset] = encode_swapmap(count, false); + + } + /* return code. */ + count = p->swap_map[offset]; + /* free if no reference */ + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = p - swap_info; + nr_swap_pages++; + p->inuse_pages--; + mem_cgroup_uncharge_swap(ent); } return count; } @@ -504,7 +551,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry); + swap_entry_free(p, entry, SWAP_MAP); spin_unlock(&swap_lock); } } @@ -514,9 +561,16 @@ void swap_free(swp_entry_t entry) */ void swapcache_free(swp_entry_t entry, struct page *page) { + struct swap_info_struct *p; + if (page) mem_cgroup_uncharge_swapcache(page, entry); - return swap_free(entry); + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, entry, SWAP_CACHE); + spin_unlock(&swap_lock); + } + return; } /* @@ -531,8 +585,7 @@ static inline int page_swapcount(struct page *page) entry.val = page_private(page); p = swap_info_get(entry); if (p) { - /* Subtract the 1 for the swap cache itself */ - count = p->swap_map[swp_offset(entry)] - 1; + count = swap_count(p->swap_map[swp_offset(entry)]); spin_unlock(&swap_lock); } return count; @@ -594,7 +647,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry) == 1) { + if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -901,7 +954,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, i = 1; } count = si->swap_map[i]; - if (count && count != SWAP_MAP_BAD) + if (count && swap_count(count) != SWAP_MAP_BAD) break; } return i; @@ -1005,13 +1058,13 @@ static int try_to_unuse(unsigned int type) */ shmem = 0; swcount = *swap_map; - if (swcount > 1) { + if (swap_count(swcount)) { if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else retval = unuse_mm(start_mm, entry, page); } - if (*swap_map > 1) { + if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; struct mm_struct *new_start_mm = start_mm; @@ -1021,7 +1074,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (*swap_map > 1 && !retval && !shmem && + while (swap_count(*swap_map) && !retval && !shmem && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -1033,14 +1086,16 @@ static int try_to_unuse(unsigned int type) cond_resched(); swcount = *swap_map; - if (swcount <= 1) + if (!swap_count(swcount)) /* any usage ? */ ; else if (mm == &init_mm) { set_start_mm = 1; shmem = shmem_unuse(entry, page); } else retval = unuse_mm(mm, entry, page); - if (set_start_mm && *swap_map < swcount) { + + if (set_start_mm && + swap_count(*swap_map) < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); new_start_mm = mm; @@ -1067,21 +1122,25 @@ static int try_to_unuse(unsigned int type) } /* - * How could swap count reach 0x7fff when the maximum - * pid is 0x7fff, and there's no way to repeat a swap - * page within an mm (except in shmem, where it's the - * shared object which takes the reference count)? - * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. - * + * How could swap count reach 0x7ffe ? + * There's no way to repeat a swap page within an mm + * (except in shmem, where it's the shared object which takes + * the reference count)? + * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned + * short is too small....) * If that's wrong, then we should worry more about * exit_mmap() and do_munmap() cases described above: * we might be resetting SWAP_MAP_MAX too early here. * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. */ - if (*swap_map == SWAP_MAP_MAX) { + /* We might release the lock_page() in unuse_mm(). */ + if (!PageSwapCache(page) || page_private(page) != entry.val) + goto retry; + + if (swap_count(*swap_map) == SWAP_MAP_MAX) { spin_lock(&swap_lock); - *swap_map = 1; + *swap_map = encode_swapmap(0, true); spin_unlock(&swap_lock); reset_overflow = 1; } @@ -1099,7 +1158,8 @@ static int try_to_unuse(unsigned int type) * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. */ - if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { + if (swap_count(*swap_map) && + PageDirty(page) && PageSwapCache(page)) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; @@ -1126,6 +1186,7 @@ static int try_to_unuse(unsigned int type) * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); +retry: unlock_page(page); page_cache_release(page); @@ -1952,15 +2013,23 @@ void si_swapinfo(struct sysinfo *val) * * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as * "permanent", but will be reclaimed by the next swapoff. + * Returns error code in following case. + * - success -> 0 + * - swp_entry is invalid -> EINVAL + * - swp_entry is migration entry -> EINVAL + * - swap-cache reference is requested but there is already one. -> EEXIST + * - swap-cache reference is requested but the entry is not used. -> ENOENT */ -int swap_duplicate(swp_entry_t entry) +static int __swap_duplicate(swp_entry_t entry, bool cache) { struct swap_info_struct * p; unsigned long offset, type; - int result = 0; + int result = -EINVAL; + int count; + bool has_cache; if (is_migration_entry(entry)) - return 1; + return -EINVAL; type = swp_type(entry); if (type >= nr_swapfiles) @@ -1969,17 +2038,40 @@ int swap_duplicate(swp_entry_t entry) offset = swp_offset(entry); spin_lock(&swap_lock); - if (offset < p->max && p->swap_map[offset]) { - if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { - p->swap_map[offset]++; - result = 1; - } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + + if (unlikely(offset >= p->max)) + goto unlock_out; + + count = swap_count(p->swap_map[offset]); + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ + + /* set SWAP_HAS_CACHE if there is no cache and entry is used */ + if (!has_cache && count) { + p->swap_map[offset] = encode_swapmap(count, true); + result = 0; + } else if (has_cache) /* someone added cache */ + result = -EEXIST; + else if (!count) /* no users */ + result = -ENOENT; + + } else if (count || has_cache) { + if (count < SWAP_MAP_MAX - 1) { + p->swap_map[offset] = encode_swapmap(count + 1, + has_cache); + result = 0; + } else if (count <= SWAP_MAP_MAX) { if (swap_overflow++ < 5) - printk(KERN_WARNING "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = SWAP_MAP_MAX; - result = 1; + printk(KERN_WARNING + "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, + has_cache); + result = 0; } - } + } else + result = -ENOENT; /* unused swap entry */ +unlock_out: spin_unlock(&swap_lock); out: return result; @@ -1988,13 +2080,25 @@ bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } +/* + * increase reference count of swap entry by 1. + */ +void swap_duplicate(swp_entry_t entry) +{ + __swap_duplicate(entry, SWAP_MAP); +} /* + * @entry: swap entry for which we allocate swap cache. + * * Called when allocating swap cache for exising swap entry, + * This can return error codes. Returns 0 at success. + * -EBUSY means there is a swap cache. + * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry) { - return swap_duplicate(entry); + return __swap_duplicate(entry, SWAP_CACHE); } @@ -2035,7 +2139,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } /* Count contiguous allocated slots below our target */ @@ -2043,7 +2147,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } spin_unlock(&swap_lock); -- cgit v1.2.3 From c9e444103b5e7a5a3519f9913f59767f92e33baf Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:54 -0700 Subject: mm: reuse unused swap entry if necessary Presently we can know a swap entry is just used as SwapCache via swap_map, without looking up swap cache. Then, we have a chance to reuse swap-cache-only swap entries in get_swap_pages(). This patch tries to free swap-cache-only swap entries if swap is not enough. Note: We hit following path when swap_cluster code cannot find a free cluster. Then, vm_swap_full() is not only condition to allow the kernel to reclaim unused swap. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Tested-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 0d7296971ad..28faa01cf57 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -79,6 +79,32 @@ static inline unsigned short encode_swapmap(int count, bool has_cache) return ret; } +/* returnes 1 if swap entry is freed */ +static int +__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) +{ + int type = si - swap_info; + swp_entry_t entry = swp_entry(type, offset); + struct page *page; + int ret = 0; + + page = find_get_page(&swapper_space, entry.val); + if (!page) + return 0; + /* + * This function is called from scan_swap_map() and it's called + * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. + * We have to use trylock for avoiding deadlock. This is a special + * case and you should use try_to_free_swap() with explicit lock_page() + * in usual operations. + */ + if (trylock_page(page)) { + ret = try_to_free_swap(page); + unlock_page(page); + } + page_cache_release(page); + return ret; +} /* * We need this because the bdev->unplug_fn can sleep and we cannot @@ -301,6 +327,19 @@ checks: goto no_page; if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; + + /* reuse swap entry of cache-only swap if not busy. */ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&swap_lock); + swap_was_freed = __try_to_reclaim_swap(si, offset); + spin_lock(&swap_lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed) + goto checks; + goto scan; /* check next one */ + } + if (si->swap_map[offset]) goto scan; @@ -382,6 +421,10 @@ scan: spin_lock(&swap_lock); goto checks; } + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; @@ -393,6 +436,10 @@ scan: spin_lock(&swap_lock); goto checks; } + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; -- cgit v1.2.3 From 2ff05b2b4eac2e63d345fc731ea151a060247f53 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:56 -0700 Subject: oom: move oom_adj value from task_struct to mm_struct The per-task oom_adj value is a characteristic of its mm more than the task itself since it's not possible to oom kill any thread that shares the mm. If a task were to be killed while attached to an mm that could not be freed because another thread were set to OOM_DISABLE, it would have needlessly been terminated since there is no potential for future memory freeing. This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct mm_struct. This requires task_lock() on a task to check its oom_adj value to protect against exec, but it's already necessary to take the lock when dereferencing the mm to find the total VM size for the badness heuristic. This fixes a livelock if the oom killer chooses a task and another thread sharing the same memory has an oom_adj value of OOM_DISABLE. This occurs because oom_kill_task() repeatedly returns 1 and refuses to kill the chosen task while select_bad_process() will repeatedly choose the same task during the next retry. Taking task_lock() in select_bad_process() to check for OOM_DISABLE and in oom_kill_task() to check for threads sharing the same memory will be removed in the next patch in this series where it will no longer be necessary. Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since these threads are immune from oom killing already. They simply report an oom_adj value of OOM_DISABLE. Cc: Nick Piggin Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7b2460e922..b60913520ef 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; + int oom_adj; task_lock(p); mm = p->mm; @@ -65,6 +66,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } + oom_adj = mm->oom_adj; /* * The memory size of the process is the basis for the badness. @@ -148,15 +150,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oomkilladj. + * Adjust the score by oom_adj. */ - if (p->oomkilladj) { - if (p->oomkilladj > 0) { + if (oom_adj) { + if (oom_adj > 0) { if (!points) points = 1; - points <<= p->oomkilladj; + points <<= oom_adj; } else - points >>= -(p->oomkilladj); + points >>= -(oom_adj); } #ifdef DEBUG @@ -251,8 +253,12 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - if (p->oomkilladj == OOM_DISABLE) + task_lock(p); + if (p->mm && p->mm->oom_adj == OOM_DISABLE) { + task_unlock(p); continue; + } + task_unlock(p); points = badness(p, uptime.tv_sec); if (points > *ppoints || !chosen) { @@ -304,8 +310,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, - p->comm); + get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -367,8 +372,12 @@ static int oom_kill_task(struct task_struct *p) * Don't kill the process if any threads are set to OOM_DISABLE */ do_each_thread(g, q) { - if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + task_lock(q); + if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) { + task_unlock(q); return 1; + } + task_unlock(q); } while_each_thread(g, q); __oom_kill_task(p, 1); @@ -393,10 +402,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", - current->comm, gfp_mask, order, current->oomkilladj); task_lock(current); + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oom_adj=%d\n", + current->comm, gfp_mask, order, + current->mm ? current->mm->oom_adj : OOM_DISABLE); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); -- cgit v1.2.3 From 4d8b9135c30ccbe46e621fefd862969819003fd6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:57 -0700 Subject: oom: avoid unnecessary mm locking and scanning for OOM_DISABLE This moves the check for OOM_DISABLE to the badness heuristic so it is only necessary to hold task_lock() once. If the mm is OOM_DISABLE, the score is 0, which is also correctly exported via /proc/pid/oom_score. This requires that tasks with badness scores of 0 are prohibited from being oom killed, which makes sense since they would not allow for future memory freeing anyway. Since the oom_adj value is a characteristic of an mm and not a task, it is no longer necessary to check the oom_adj value for threads sharing the same memory (except when simply issuing SIGKILLs for threads in other thread groups). Cc: Nick Piggin Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b60913520ef..cdcf89cb9ff 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -67,6 +67,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) return 0; } oom_adj = mm->oom_adj; + if (oom_adj == OOM_DISABLE) { + task_unlock(p); + return 0; + } /* * The memory size of the process is the basis for the badness. @@ -253,15 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - task_lock(p); - if (p->mm && p->mm->oom_adj == OOM_DISABLE) { - task_unlock(p); - continue; - } - task_unlock(p); - points = badness(p, uptime.tv_sec); - if (points > *ppoints || !chosen) { + if (points > *ppoints) { chosen = p; *ppoints = points; } @@ -354,32 +351,13 @@ static int oom_kill_task(struct task_struct *p) struct mm_struct *mm; struct task_struct *g, *q; + task_lock(p); mm = p->mm; - - /* WARNING: mm may not be dereferenced since we did not obtain its - * value from get_task_mm(p). This is OK since all we need to do is - * compare mm to q->mm below. - * - * Furthermore, even if mm contains a non-NULL value, p->mm may - * change to NULL at any time since we do not hold task_lock(p). - * However, this is of no concern to us. - */ - - if (mm == NULL) + if (!mm || mm->oom_adj == OOM_DISABLE) { + task_unlock(p); return 1; - - /* - * Don't kill the process if any threads are set to OOM_DISABLE - */ - do_each_thread(g, q) { - task_lock(q); - if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) { - task_unlock(q); - return 1; - } - task_unlock(q); - } while_each_thread(g, q); - + } + task_unlock(p); __oom_kill_task(p, 1); /* -- cgit v1.2.3 From 82553a937f12352c26fe457510ebab3f512cd3fa Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:58 -0700 Subject: oom: invoke oom killer for __GFP_NOFAIL The oom killer must be invoked regardless of the order if the allocation is __GFP_NOFAIL, otherwise it will loop forever when reclaim fails to free some memory. Cc: Nick Piggin Acked-by: Rik van Riel Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Dave Hansen Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c95a77cd581..c5fb017c943 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1561,7 +1561,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) goto out; /* Exhausted what can be done so it's blamo time */ @@ -1781,11 +1781,13 @@ rebalance: goto got_pg; /* - * The OOM killer does not trigger for high-order allocations - * but if no progress is being made, there are no other - * options and retrying is unlikely to help + * The OOM killer does not trigger for high-order + * ~__GFP_NOFAIL allocations so if no progress is being + * made, there are no other options and retrying is + * unlikely to help. */ - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (order > PAGE_ALLOC_COSTLY_ORDER && + !(gfp_mask & __GFP_NOFAIL)) goto nopage; goto restart; -- cgit v1.2.3 From 286973552f051404abdb58dd9b2f8f7558efe4e5 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Tue, 16 Jun 2009 15:32:59 -0700 Subject: mm: remove __invalidate_mapping_pages variant Remove __invalidate_mapping_pages atomic variant now that its sole caller can sleep (fixed in eccb95cee4f0d56faa46ef22fb94dd4a3578d3eb ("vfs: fix lock inversion in drop_pagecache_sb()")). This fixes softlockups that can occur while in the drop_caches path. Signed-off-by: Mike Waychison Cc: Jan Kara Cc: Wu Fengguang Cc: Dave Chinner Cc: Nick Piggin Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 12e1579f916..ccc3ecf7cb9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) } EXPORT_SYMBOL(truncate_inode_pages); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, bool be_atomic) +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) { struct pagevec pvec; pgoff_t next = start; @@ -309,30 +322,10 @@ unlock: break; } pagevec_release(&pvec); - if (likely(!be_atomic)) - cond_resched(); + cond_resched(); } return ret; } - -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. - */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - return __invalidate_mapping_pages(mapping, start, end, false); -} EXPORT_SYMBOL(invalidate_mapping_pages); /* -- cgit v1.2.3 From 73d60b7f747176dbdff826c4127d22e1fd3f9f74 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 16 Jun 2009 15:33:00 -0700 Subject: page-allocator: clear N_HIGH_MEMORY map before we set it again SRAT tables may contains nodes of very small size. The arch code may decide to not activate such a node. However, currently the early boot code sets N_HIGH_MEMORY for such nodes. These nodes therefore seem to be active although these nodes have no present pages. For 64bit N_HIGH_MEMORY == N_NORMAL_MEMORY, so that works for 64 bit too Signed-off-by: Yinghai Lu Tested-by: Jack Steiner Acked-by: Christoph Lameter Cc: Mel Gorman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5fb017c943..6407cbfccd7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4204,6 +4204,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) early_node_map[i].start_pfn, early_node_map[i].end_pfn); + /* + * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init + * that node_mask, clear it at first + */ + nodes_clear(node_states[N_HIGH_MEMORY]); /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); -- cgit v1.2.3 From 8192da6a8811ab6c3d29dc590a5f94a377c43739 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:33:01 -0700 Subject: mm: remove annotation of gfp_mask in add_to_swap Hugh removed add_to_swap's gfp_mask argument. (mm: remove gfp_mask from add_to_swap) So we have to remove annotation of gfp_mask of the function. Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index b9ca029673a..b62e7f56051 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -124,7 +124,6 @@ void __delete_from_swap_cache(struct page *page) /** * add_to_swap - allocate swap space for a page * @page: page we want to move to swap - * @gfp_mask: memory allocation flags * * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. -- cgit v1.2.3 From aca8bf323edd31ad462dc98c107c23a5c6022ca2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm: remove file argument from swap_readpage() The file argument resulted from address_space's readpage long time ago. We don't use it any more. Let's remove unnecessary argement. Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 2 +- mm/swap_state.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 3023c475e04..c6f3e5071de 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -120,7 +120,7 @@ out: return ret; } -int swap_readpage(struct file *file, struct page *page) +int swap_readpage(struct page *page) { struct bio *bio; int ret = 0; diff --git a/mm/swap_state.c b/mm/swap_state.c index b62e7f56051..42cd38eba79 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -313,7 +313,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Initiate read into locked page and return. */ lru_cache_add_anon(new_page); - swap_readpage(NULL, new_page); + swap_readpage(new_page); return new_page; } ClearPageSwapBacked(new_page); -- cgit v1.2.3 From 168f5ac668f63dfb64439766e3ef9e866b83719d Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm cleanup: shmem_file_setup: 'char *' -> 'const char *' for name argument As function shmem_file_setup does not modify/allocate/free/pass given filename - mark it as const. Signed-off-by: Sergei Trofimovich Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 47ab1918228..e89d7ec18ed 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { int error; struct file *file; -- cgit v1.2.3 From 6fe6b7e35785e3232ffe7f81d3893f1316710a02 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:05 -0700 Subject: vmscan: report vm_flags in page_referenced() Collect vma->vm_flags of the VMAs that actually referenced the page. This is preparing for more informed reclaim heuristics, eg. to protect executable file pages more aggressively. For now only the VM_EXEC bit will be used by the caller. Thanks to Johannes, Peter and Minchan for all the good tips. Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 37 ++++++++++++++++++++++++++----------- mm/vmscan.c | 7 +++++-- 2 files changed, 31 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 316c9d6930a..c9ccc1a72dc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) * repeatedly from either page_referenced_anon or page_referenced_file. */ static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount) + struct vm_area_struct *vma, + unsigned int *mapcount, + unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -381,11 +383,14 @@ out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: + if (referenced) + *vm_flags |= vma->vm_flags; return referenced; } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct anon_vma *anon_vma; @@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page, * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and * check/clear the referenced flag. This is done by following the page->mapping @@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. */ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct address_space *mapping = page->mapping; @@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page, * @page: the page to test * @is_locked: caller holds lock on the page * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. */ -int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont) +int page_referenced(struct page *page, + int is_locked, + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { int referenced = 0; if (TestClearPageReferenced(page)) referenced++; + *vm_flags = 0; if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont); + referenced += page_referenced_anon(page, mem_cont, + vm_flags); else if (is_locked) - referenced += page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, mem_cont, + vm_flags); else if (!trylock_page(page)) referenced++; else { if (page->mapping) - referenced += - page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, + mem_cont, vm_flags); unlock_page(page); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 52339dd7bf8..6be2068f61c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -577,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pagevec freed_pvec; int pgactivate = 0; unsigned long nr_reclaimed = 0; + unsigned long vm_flags; cond_resched(); @@ -627,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, sc->mem_cgroup); + referenced = page_referenced(page, 1, + sc->mem_cgroup, &vm_flags); /* In active use or really unfreeable? Activate it. */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced && page_mapping_inuse(page)) @@ -1208,6 +1210,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, { unsigned long pgmoved; unsigned long pgscanned; + unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); struct page *page; @@ -1248,7 +1251,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup)) + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) pgmoved++; list_add(&page->lru, &l_inactive); -- cgit v1.2.3 From 8cab4754d24a0f2e05920170c845bd84472814c6 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:12 -0700 Subject: vmscan: make mapped executable pages the first class citizen Protect referenced PROT_EXEC mapped pages from being deactivated. PROT_EXEC(or its internal presentation VM_EXEC) pages normally belong to some currently running executables and their linked libraries, they shall really be cached aggressively to provide good user experiences. Thanks to Johannes Weiner for the advice to reuse the VMA walk in page_referenced() to get the PROT_EXEC bit. [more details] ( The consequences of this patch will have to be discussed together with Rik van Riel's recent patch "vmscan: evict use-once pages first". ) ( Some of the good points and insights are taken into this changelog. Thanks to all the involved people for the great LKML discussions. ) the problem =========== For a typical desktop, the most precious working set is composed of *actively accessed* (1) memory mapped executables (2) and their anonymous pages (3) and other files (4) and the dcache/icache/.. slabs while the least important data are (5) infrequently used or use-once files For a typical desktop, one major problem is busty and large amount of (5) use-once files flushing out the working set. Inside the working set, (4) dcache/icache have already been too sticky ;-) So we only have to care (2) anonymous and (1)(3) file pages. anonymous pages =============== Anonymous pages are effectively immune to the streaming IO attack, because we now have separate file/anon LRU lists. When the use-once files crowd into the file LRU, the list's "quality" is significantly lowered. Therefore the scan balance policy in get_scan_ratio() will choose to scan the (low quality) file LRU much more frequently than the anon LRU. file pages ========== Rik proposed to *not* scan the active file LRU when the inactive list grows larger than active list. This guarantees that when there are use-once streaming IO, and the working set is not too large(so that active_size < inactive_size), the active file LRU will *not* be scanned at all. So the not-too-large working set can be well protected. But there are also situations where the file working set is a bit large so that (active_size >= inactive_size), or the streaming IOs are not purely use-once. In these cases, the active list will be scanned slowly. Because the current shrink_active_list() policy is to deactivate active pages regardless of their referenced bits. The deactivated pages become susceptible to the streaming IO attack: the inactive list could be scanned fast (500MB / 50MBps = 10s) so that the deactivated pages don't have enough time to get re-referenced. Because a user tend to switch between windows in intervals from seconds to minutes. This patch holds mapped executable pages in the active list as long as they are referenced during each full scan of the active list. Because the active list is normally scanned much slower, they get longer grace time (eg. 100s) for further references, which better matches the pace of user operations. Therefore this patch greatly prolongs the in-cache time of executable code, when there are moderate memory pressures. before patch: guaranteed to be cached if reference intervals < I after patch: guaranteed to be cached if reference intervals < I+A (except when randomly reclaimed by the lumpy reclaim) where A = time to fully scan the active file LRU I = time to fully scan the inactive file LRU Note that normally A >> I. side effects ============ This patch is safe in general, it restores the pre-2.6.28 mmap() behavior but in a much smaller and well targeted scope. One may worry about some one to abuse the PROT_EXEC heuristic. But as Andrew Morton stated, there are other tricks to getting that sort of boost. Another concern is the PROT_EXEC mapped pages growing large in rare cases, and therefore hurting reclaim efficiency. But a sane application targeted for large audience will never use PROT_EXEC for data mappings. If some home made application tries to abuse that bit, it shall be aware of the consequences. If it is abused to scale of 2/3 total memory, it gains nothing but overheads. benchmarks ========== 1) memory tight desktop 1.1) brief summary - clock time and major faults are reduced by 50%; - pswpin numbers are reduced to ~1/3. That means X desktop responsiveness is doubled under high memory/swap pressure. 1.2) test scenario - nfsroot gnome desktop with 512M physical memory - run some programs, and switch between the existing windows after starting each new program. 1.3) progress timing (seconds) before after programs 0.02 0.02 N xeyes 0.75 0.76 N firefox 2.02 1.88 N nautilus 3.36 3.17 N nautilus --browser 5.26 4.89 N gthumb 7.12 6.47 N gedit 9.22 8.16 N xpdf /usr/share/doc/shared-mime-info/shared-mime-info-spec.pdf 13.58 12.55 N xterm 15.87 14.57 N mlterm 18.63 17.06 N gnome-terminal 21.16 18.90 N urxvt 26.24 23.48 N gnome-system-monitor 28.72 26.52 N gnome-help 32.15 29.65 N gnome-dictionary 39.66 36.12 N /usr/games/sol 43.16 39.27 N /usr/games/gnometris 48.65 42.56 N /usr/games/gnect 53.31 47.03 N /usr/games/gtali 58.60 52.05 N /usr/games/iagno 65.77 55.42 N /usr/games/gnotravex 70.76 61.47 N /usr/games/mahjongg 76.15 67.11 N /usr/games/gnome-sudoku 86.32 75.15 N /usr/games/glines 92.21 79.70 N /usr/games/glchess 103.79 88.48 N /usr/games/gnomine 113.84 96.51 N /usr/games/gnotski 124.40 102.19 N /usr/games/gnibbles 137.41 114.93 N /usr/games/gnobots2 155.53 125.02 N /usr/games/blackjack 179.85 135.11 N /usr/games/same-gnome 224.49 154.50 N /usr/bin/gnome-window-properties 248.44 162.09 N /usr/bin/gnome-default-applications-properties 282.62 173.29 N /usr/bin/gnome-at-properties 323.72 188.21 N /usr/bin/gnome-typing-monitor 363.99 199.93 N /usr/bin/gnome-at-visual 394.21 206.95 N /usr/bin/gnome-sound-properties 435.14 224.49 N /usr/bin/gnome-at-mobility 463.05 234.11 N /usr/bin/gnome-keybinding-properties 503.75 248.59 N /usr/bin/gnome-about-me 554.00 276.27 N /usr/bin/gnome-display-properties 615.48 304.39 N /usr/bin/gnome-network-preferences 693.03 342.01 N /usr/bin/gnome-mouse-properties 759.90 388.58 N /usr/bin/gnome-appearance-properties 937.90 508.47 N /usr/bin/gnome-control-center 1109.75 587.57 N /usr/bin/gnome-keyboard-properties 1399.05 758.16 N : oocalc 1524.64 830.03 N : oodraw 1684.31 900.03 N : ooimpress 1874.04 993.91 N : oomath 2115.12 1081.89 N : ooweb 2369.02 1161.99 N : oowriter Note that the last ": oo*" commands are actually commented out. 1.4) vmstat numbers (some relevant ones are marked with *) before after nr_free_pages 1293 3898 nr_inactive_anon 59956 53460 nr_active_anon 26815 30026 nr_inactive_file 2657 3218 nr_active_file 2019 2806 nr_unevictable 4 4 nr_mlock 4 4 nr_anon_pages 26706 27859 *nr_mapped 3542 4469 nr_file_pages 72232 67681 nr_dirty 1 0 nr_writeback 123 19 nr_slab_reclaimable 3375 3534 nr_slab_unreclaimable 11405 10665 nr_page_table_pages 8106 7864 nr_unstable 0 0 nr_bounce 0 0 *nr_vmscan_write 394776 230839 nr_writeback_temp 0 0 numa_hit 6843353 3318676 numa_miss 0 0 numa_foreign 0 0 numa_interleave 1719 1719 numa_local 6843353 3318676 numa_other 0 0 *pgpgin 5954683 2057175 *pgpgout 1578276 922744 *pswpin 1486615 512238 *pswpout 394568 230685 pgalloc_dma 277432 56602 pgalloc_dma32 6769477 3310348 pgalloc_normal 0 0 pgalloc_movable 0 0 pgfree 7048396 3371118 pgactivate 2036343 1471492 pgdeactivate 2189691 1612829 pgfault 3702176 3100702 *pgmajfault 452116 201343 pgrefill_dma 12185 7127 pgrefill_dma32 334384 653703 pgrefill_normal 0 0 pgrefill_movable 0 0 pgsteal_dma 74214 22179 pgsteal_dma32 3334164 1638029 pgsteal_normal 0 0 pgsteal_movable 0 0 pgscan_kswapd_dma 1081421 1216199 pgscan_kswapd_dma32 58979118 46002810 pgscan_kswapd_normal 0 0 pgscan_kswapd_movable 0 0 pgscan_direct_dma 2015438 1086109 pgscan_direct_dma32 55787823 36101597 pgscan_direct_normal 0 0 pgscan_direct_movable 0 0 pginodesteal 3461 7281 slabs_scanned 564864 527616 kswapd_steal 2889797 1448082 kswapd_inodesteal 14827 14835 pageoutrun 43459 21562 allocstall 9653 4032 pgrotated 384216 228631 1.5) free numbers at the end of the tests before patch: total used free shared buffers cached Mem: 474 467 7 0 0 236 -/+ buffers/cache: 230 243 Swap: 1023 418 605 after patch: total used free shared buffers cached Mem: 474 457 16 0 0 236 -/+ buffers/cache: 221 253 Swap: 1023 404 619 2) memory flushing in a file server 2.1) brief summary The number of major faults from 50 to 3 during 10% cache hot reads. That means this patch successfully stops major faults when the active file list is slowly scanned when there are partially cache hot streaming IO. 2.2) test scenario Do 100000 pread(size=110 pages, offset=(i*100) pages), where 10% of the pages will be activated: for i in `seq 0 100 10000000`; do echo $i 110; done > pattern-hot-10 iotrace.rb --load pattern-hot-10 --play /b/sparse vmmon nr_mapped nr_active_file nr_inactive_file pgmajfault pgdeactivate pgfree and monitor /proc/vmstat during the time. The test box has 2G memory. I carried out tests on fresh booted console as well as X desktop, and fetched the vmstat numbers on (1) begin: shortly after the big read IO starts; (2) end: just before the big read IO stops; (3) restore: the big read IO stops and the zsh working set restored (4) restore X: after IO, switch back and forth between the urxvt and firefox windows to restore their working set. 2.3) console mode results nr_mapped nr_active_file nr_inactive_file pgmajfault pgdeactivate pgfree 2.6.29 VM_EXEC protection ON: begin: 2481 2237 8694 630 0 574299 end: 275 231976 233914 633 776271 20933042 restore: 370 232154 234524 691 777183 20958453 2.6.29 VM_EXEC protection ON (second run): begin: 2434 2237 8493 629 0 574195 end: 284 231970 233536 632 771918 20896129 restore: 399 232218 234789 690 774526 20957909 2.6.30-rc4-mm VM_EXEC protection OFF: begin: 2479 2344 9659 210 0 579643 end: 284 232010 234142 260 772776 20917184 restore: 379 232159 234371 301 774888 20967849 The above console numbers show that - The startup pgmajfault of 2.6.30-rc4-mm is merely 1/3 that of 2.6.29. I'd attribute that improvement to the mmap readahead improvements :-) - The pgmajfault increment during the file copy is 633-630=3 vs 260-210=50. That's a huge improvement - which means with the VM_EXEC protection logic, active mmap pages is pretty safe even under partially cache hot streaming IO. - when active:inactive file lru size reaches 1:1, their scan rates is 1:20.8 under 10% cache hot IO. (computed with formula Dpgdeactivate:Dpgfree) That roughly means the active mmap pages get 20.8 more chances to get re-referenced to stay in memory. - The absolute nr_mapped drops considerably to 1/9 during the big IO, and the dropped pages are mostly inactive ones. The patch has almost no impact in this aspect, that means it won't unnecessarily increase memory pressure. (In contrast, your 20% mmap protection ratio will keep them all, and therefore eliminate the extra 41 major faults to restore working set of zsh etc.) The iotrace.rb read throughput is 151.194384MB/s 284.198252s 100001x 450560b --load pattern-hot-10 --play /b/sparse which means the inactive list is rotated at the speed of 250MB/s, so a full scan of which takes about 3.5 seconds, while a full scan of active file list takes about 77 seconds. 2.4) X mode results We can reach roughly the same conclusions for X desktop: nr_mapped nr_active_file nr_inactive_file pgmajfault pgdeactivate pgfree 2.6.30-rc4-mm VM_EXEC protection ON: begin: 9740 8920 64075 561 0 678360 end: 768 218254 220029 565 798953 21057006 restore: 857 218543 220987 606 799462 21075710 restore X: 2414 218560 225344 797 799462 21080795 2.6.30-rc4-mm VM_EXEC protection OFF: begin: 9368 5035 26389 554 0 633391 end: 770 218449 221230 661 646472 17832500 restore: 1113 218466 220978 710 649881 17905235 restore X: 2687 218650 225484 947 802700 21083584 - the absolute nr_mapped drops considerably (to 1/13 of the original size) during the streaming IO. - the delta of pgmajfault is 3 vs 107 during IO, or 236 vs 393 during the whole process. Cc: Elladan Cc: Nick Piggin Cc: Andi Kleen Cc: Christoph Lameter Acked-by: Rik van Riel Acked-by: Peter Zijlstra Acked-by: KOSAKI Motohiro Reviewed-by: Johannes Weiner Reviewed-by: Minchan Kim Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 52 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 6be2068f61c..1024979d658 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1212,6 +1212,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, unsigned long pgscanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; struct pagevec pvec; @@ -1251,28 +1252,42 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { pgmoved++; + /* + * Identify referenced, file-backed active pages and + * give them one more trip around the active list. So + * that executable code get better chances to stay in + * memory under moderate memory pressure. Anon pages + * are not likely to be evicted by use-once streaming + * IO, plus JVM can create lots of anon VM_EXEC pages, + * so we ignore them here. + */ + if ((vm_flags & VM_EXEC) && !PageAnon(page)) { + list_add(&page->lru, &l_active); + continue; + } + } list_add(&page->lru, &l_inactive); } /* - * Move the pages to the [file or anon] inactive list. + * Move pages back to the lru list. */ pagevec_init(&pvec, 1); - lru = LRU_BASE + file * LRU_FILE; spin_lock_irq(&zone->lru_lock); /* - * Count referenced pages from currently used mappings as - * rotated, even though they are moved to the inactive list. - * This helps balance scan pressure between file and anonymous - * pages in get_scan_ratio. + * Count referenced pages from currently used mappings as rotated, + * even though only some of them are actually re-activated. This + * helps balance scan pressure between file and anonymous pages in + * get_scan_ratio. */ reclaim_stat->recent_rotated[!!file] += pgmoved; pgmoved = 0; /* count pages moved to inactive list */ + lru = LRU_BASE + file * LRU_FILE; while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); @@ -1295,6 +1310,29 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgmoved); + + pgmoved = 0; /* count pages moved back to active list */ + lru = LRU_ACTIVE + file * LRU_FILE; + while (!list_empty(&l_active)) { + page = lru_to_page(&l_active); + prefetchw_prev_lru_page(page, &l_active, flags); + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + VM_BUG_ON(!PageActive(page)); + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_add_lru_list(page, lru); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + spin_unlock_irq(&zone->lru_lock); if (buffer_heads_over_limit) pagevec_strip(&pvec); -- cgit v1.2.3 From 3eb4140f0389bdada022d5e8efd88504ad30df14 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:13 -0700 Subject: vmscan: merge duplicate code in shrink_active_list() The "move pages to active list" and "move pages to inactive list" code blocks are mostly identical and can be served by a function. Thanks to Andrew Morton for pointing this out. Note that buffer_heads_over_limit check will also be carried out for re-activated pages, which is slightly different from pre-2.6.28 kernels. Also, Rik's "vmscan: evict use-once pages first" patch could totally stop scans of active file list when memory pressure is low. So the net effect could be, the number of buffer heads is now more likely to grow large. However that's fine according to Johannes' comments: I don't think that this could be harmful. We just preserve the buffer mappings of what we consider the working set and with low memory pressure, as you say, this set is not big. As to stripping of reactivated pages: the only pages we re-activate for now are those VM_EXEC mapped ones. Since we don't expect IO from or to these pages, removing the buffer mappings in case they grow too large should be okay, I guess. Cc: Pekka Enberg Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 95 +++++++++++++++++++++++++++---------------------------------- 1 file changed, 42 insertions(+), 53 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1024979d658..f3a55d1f9ab 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1204,6 +1204,43 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) * But we had to alter page->flags anyway. */ +static void move_active_pages_to_lru(struct zone *zone, + struct list_head *list, + enum lru_list lru) +{ + unsigned long pgmoved = 0; + struct pagevec pvec; + struct page *page; + + pagevec_init(&pvec, 1); + + while (!list_empty(list)) { + page = lru_to_page(list); + prefetchw_prev_lru_page(page, list, flags); + + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + + VM_BUG_ON(!PageActive(page)); + if (!is_active_lru(lru)) + ClearPageActive(page); /* we are de-activating */ + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_add_lru_list(page, lru); + pgmoved++; + + if (!pagevec_add(&pvec, page) || list_empty(list)) { + spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) + __count_vm_events(PGDEACTIVATE, pgmoved); +} static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) @@ -1215,8 +1252,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct pagevec pvec; - enum lru_list lru; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); lru_add_drain(); @@ -1233,6 +1268,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[!!file] += pgmoved; + __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); else @@ -1275,8 +1311,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* * Move pages back to the lru list. */ - pagevec_init(&pvec, 1); - spin_lock_irq(&zone->lru_lock); /* * Count referenced pages from currently used mappings as rotated, @@ -1286,57 +1320,12 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[!!file] += pgmoved; - pgmoved = 0; /* count pages moved to inactive list */ - lru = LRU_BASE + file * LRU_FILE; - while (!list_empty(&l_inactive)) { - page = lru_to_page(&l_inactive); - prefetchw_prev_lru_page(page, &l_inactive, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - ClearPageActive(page); - - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgmoved); - - pgmoved = 0; /* count pages moved back to active list */ - lru = LRU_ACTIVE + file * LRU_FILE; - while (!list_empty(&l_active)) { - page = lru_to_page(&l_active); - prefetchw_prev_lru_page(page, &l_active, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + move_active_pages_to_lru(zone, &l_active, + LRU_ACTIVE + file * LRU_FILE); + move_active_pages_to_lru(zone, &l_inactive, + LRU_BASE + file * LRU_FILE); spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - pagevec_release(&pvec); } static int inactive_anon_is_low_global(struct zone *zone) -- cgit v1.2.3 From 9198e96c06744517e3b18fce8be6db61e96a3227 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Tue, 16 Jun 2009 15:33:15 -0700 Subject: vmscan: handle may_swap more strictly Commit 2e2e425989080cc534fc0fca154cae515f971cf5 ("vmscan,memcg: reintroduce sc->may_swap) add may_swap flag and handle it at get_scan_ratio(). But the result of get_scan_ratio() is ignored when priority == 0, so anon lru is scanned even if may_swap == 0 or nr_swap_pages == 0. IMHO, this is not an expected behavior. As for memcg especially, because of this behavior many and many pages are swapped-out just in vain when oom is invoked by mem+swap limit. This patch is for handling may_swap flag more strictly. Signed-off-by: Daisuke Nishimura Reviewed-by: KOSAKI Motohiro Cc: Minchan Kim Cc: Johannes Weiner Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f3a55d1f9ab..057e44b97aa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1430,13 +1430,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - percent[0] = 0; - percent[1] = 100; - return; - } - anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + @@ -1534,15 +1527,22 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long swap_cluster_max = sc->swap_cluster_max; + int noswap = 0; - get_scan_ratio(zone, sc, percent); + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + percent[0] = 0; + percent[1] = 100; + } else + get_scan_ratio(zone, sc, percent); for_each_evictable_lru(l) { int file = is_file_lru(l); unsigned long scan; scan = zone_nr_pages(zone, sc, l); - if (priority) { + if (priority || noswap) { scan >>= priority; scan = (scan * percent[file]) / 100; } -- cgit v1.2.3 From 81236810226f71bd9ff77321c8e8276dae7efc61 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:33:16 -0700 Subject: oom: only oom kill exiting tasks with attached memory When a task is chosen for oom kill and is found to be PF_EXITING, __oom_kill_task() is called to elevate the task's timeslice and give it access to memory reserves so that it may quickly exit. This privilege is unnecessary, however, if the task has already detached its mm. Although its possible for the mm to become detached later since task_lock() is not held, __oom_kill_task() will simply be a no-op in such circumstances. Subsequently, it is no longer necessary to warn about killing mm-less tasks since it is a no-op. Signed-off-by: David Rientjes Acked-by: Rik van Riel Cc: Balbir Singh Cc: Minchan Kim Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index cdcf89cb9ff..175a67a78a9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -325,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) { - WARN_ON(1); - printk(KERN_WARNING "tried to kill an mm-less task!\n"); + if (!p->mm) return; - } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -397,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly + * if its mm is still attached. */ - if (p->flags & PF_EXITING) { + if (p->mm && (p->flags & PF_EXITING)) { __oom_kill_task(p, 0); return 0; } -- cgit v1.2.3 From 90afa5de6f3fa89a733861e843377302479fcf7e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:20 -0700 Subject: vmscan: properly account for the number of page cache pages zone_reclaim() can reclaim A bug was brought to my attention against a distro kernel but it affects mainline and I believe problems like this have been reported in various guises on the mailing lists although I don't have specific examples at the moment. The reported problem was that malloc() stalled for a long time (minutes in some cases) if a large tmpfs mount was occupying a large percentage of memory overall. The pages did not get cleaned or reclaimed by zone_reclaim() because the zone_reclaim_mode was unsuitable, but the lists are uselessly scanned frequencly making the CPU spin at near 100%. This patchset intends to address that bug and bring the behaviour of zone_reclaim() more in line with expectations which were noticed during investigation. It is based on top of mmotm and takes advantage of Kosaki's work with respect to zone_reclaim(). Patch 1 fixes the heuristics that zone_reclaim() uses to determine if the scan should go ahead. The broken heuristic is what was causing the malloc() stall as it uselessly scanned the LRU constantly. Currently, zone_reclaim is assuming zone_reclaim_mode is 1 and historically it could not deal with tmpfs pages at all. This fixes up the heuristic so that an unnecessary scan is more likely to be correctly avoided. Patch 2 notes that zone_reclaim() returning a failure automatically means the zone is marked full. This is not always true. It could have failed because the GFP mask or zone_reclaim_mode were unsuitable. Patch 3 introduces a counter zreclaim_failed that will increment each time the zone_reclaim scan-avoidance heuristics fail. If that counter is rapidly increasing, then zone_reclaim_mode should be set to 0 as a temporarily resolution and a bug reported because the scan-avoidance heuristic is still broken. This patch: On NUMA machines, the administrator can configure zone_reclaim_mode that is a more targetted form of direct reclaim. On machines with large NUMA distances for example, a zone_reclaim_mode defaults to 1 meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. There is a heuristic that determines if the scan is worthwhile but the problem is that the heuristic is not being properly applied and is basically assuming zone_reclaim_mode is 1 if it is enabled. The lack of proper detection can manfiest as high CPU usage as the LRU list is scanned uselessly. Historically, once enabled it was depending on NR_FILE_PAGES which may include swapcache pages that the reclaim_mode cannot deal with. Patch vmscan-change-the-number-of-the-unmapped-files-in-zone-reclaim.patch by Kosaki Motohiro noted that zone_page_state(zone, NR_FILE_PAGES) included pages that were not file-backed such as swapcache and made a calculation based on the inactive, active and mapped files. This is far superior when zone_reclaim==1 but if RECLAIM_SWAP is set, then NR_FILE_PAGES is a reasonable starting figure. This patch alters how zone_reclaim() works out how many pages it might be able to reclaim given the current reclaim_mode. If RECLAIM_SWAP is set in the reclaim_mode it will either consider NR_FILE_PAGES as potential candidates or else use NR_{IN}ACTIVE}_PAGES-NR_FILE_MAPPED to discount swapcache and other non-file-backed pages. If RECLAIM_WRITE is not set, then NR_FILE_DIRTY number of pages are not candidates. If RECLAIM_SWAP is not set, then NR_FILE_MAPPED are not. [kosaki.motohiro@jp.fujitsu.com: Estimate unmapped pages minus tmpfs pages] [fengguang.wu@intel.com: Fix underflow problem in Kosaki's estimate] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Acked-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 52 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 057e44b97aa..79a98d98ed3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2356,6 +2356,48 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; +static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +{ + unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); + unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_FILE); + + /* + * It's possible for there to be more file mapped pages than + * accounted for by the pages on the file LRU lists because + * tmpfs pages accounted for as ANON can also be FILE_MAPPED + */ + return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; +} + +/* Work out how many page cache pages we can reclaim in this reclaim_mode */ +static long zone_pagecache_reclaimable(struct zone *zone) +{ + long nr_pagecache_reclaimable; + long delta = 0; + + /* + * If RECLAIM_SWAP is set, then all file pages are considered + * potentially reclaimable. Otherwise, we have to worry about + * pages like swapcache and zone_unmapped_file_pages() provides + * a better estimate + */ + if (zone_reclaim_mode & RECLAIM_SWAP) + nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + else + nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + + /* If we can't clean pages, remove dirty pages from consideration */ + if (!(zone_reclaim_mode & RECLAIM_WRITE)) + delta += zone_page_state(zone, NR_FILE_DIRTY); + + /* Watch for any possible underflows due to delta */ + if (unlikely(delta > nr_pagecache_reclaimable)) + delta = nr_pagecache_reclaimable; + + return nr_pagecache_reclaimable - delta; +} + /* * Try to free up some pages from this zone through reclaim. */ @@ -2390,9 +2432,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) > - zone->min_unmapped_pages) { + if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. @@ -2450,10 +2490,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * if less than a specified percentage of the zone is used by * unmapped file backed pages. */ - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages - && zone_page_state(zone, NR_SLAB_RECLAIMABLE) - <= zone->min_slab_pages) + if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return 0; if (zone_is_all_unreclaimable(zone)) -- cgit v1.2.3 From fa5e084e43eb14c14942027e1e2e894aeed96097 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:22 -0700 Subject: vmscan: do not unconditionally treat zones that fail zone_reclaim() as full On NUMA machines, the administrator can configure zone_reclaim_mode that is a more targetted form of direct reclaim. On machines with large NUMA distances for example, a zone_reclaim_mode defaults to 1 meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. The problem is that zone_reclaim() failing at all means the zone gets marked full. This can cause situations where a zone is usable, but is being skipped because it has been considered full. Take a situation where a large tmpfs mount is occuping a large percentage of memory overall. The pages do not get cleaned or reclaimed by zone_reclaim(), but the zone gets marked full and the zonelist cache considers them not worth trying in the future. This patch makes zone_reclaim() return more fine-grained information about what occured when zone_reclaim() failued. The zone only gets marked full if it really is unreclaimable. If it's a case that the scan did not occur or if enough pages were not reclaimed with the limited reclaim_mode, then the zone is simply skipped. There is a side-effect to this patch. Currently, if zone_reclaim() successfully reclaimed SWAP_CLUSTER_MAX, an allocation attempt would go ahead. With this patch applied, zone watermarks are rechecked after zone_reclaim() does some work. This bug was introduced by commit 9276b1bc96a132f4068fdee00983c532f43d3a26 ("memory page_alloc zonelist caching speedup") way back in 2.6.19 when the zonelist_cache was introduced. It was not intended that zone_reclaim() aggressively consider the zone to be full when it failed as full direct reclaim can still be an option. Due to the age of the bug, it should be considered a -stable candidate. Signed-off-by: Mel Gorman Reviewed-by: Wu Fengguang Reviewed-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 4 ++++ mm/page_alloc.c | 26 ++++++++++++++++++++++---- mm/vmscan.c | 11 ++++++----- 3 files changed, 32 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index f02c7508068..f290c4db528 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -259,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas); +#define ZONE_RECLAIM_NOSCAN -2 +#define ZONE_RECLAIM_FULL -1 +#define ZONE_RECLAIM_SOME 0 +#define ZONE_RECLAIM_SUCCESS 1 #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6407cbfccd7..2f457a756d4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1462,15 +1462,33 @@ zonelist_scan: BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; + int ret; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (!zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) { - if (!zone_reclaim_mode || - !zone_reclaim(zone, gfp_mask, order)) + if (zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) + goto try_this_zone; + + if (zone_reclaim_mode == 0) + goto this_zone_full; + + ret = zone_reclaim(zone, gfp_mask, order); + switch (ret) { + case ZONE_RECLAIM_NOSCAN: + /* did not scan */ + goto try_next_zone; + case ZONE_RECLAIM_FULL: + /* scanned but unreclaimable */ + goto this_zone_full; + default: + /* did we reclaim enough */ + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) goto this_zone_full; } } +try_this_zone: page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask, migratetype); if (page) diff --git a/mm/vmscan.c b/mm/vmscan.c index 79a98d98ed3..16c82a868e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2492,16 +2492,16 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) - return 0; + return ZONE_RECLAIM_FULL; if (zone_is_all_unreclaimable(zone)) - return 0; + return ZONE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) - return 0; + return ZONE_RECLAIM_NOSCAN; /* * Only run zone reclaim on the local zone or on zones that do not @@ -2511,10 +2511,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ node_id = zone_to_nid(zone); if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return 0; + return ZONE_RECLAIM_NOSCAN; if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) - return 0; + return ZONE_RECLAIM_NOSCAN; + ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); -- cgit v1.2.3 From 24cf72518c79cdcda486ed26074ff8151291cf65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:23 -0700 Subject: vmscan: count the number of times zone_reclaim() scans and fails On NUMA machines, the administrator can configure zone_reclaim_mode that is a more targetted form of direct reclaim. On machines with large NUMA distances for example, a zone_reclaim_mode defaults to 1 meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. There is a heuristic that determines if the scan is worthwhile but it is possible that the heuristic will fail and the CPU gets tied up scanning uselessly. Detecting the situation requires some guesswork and experimentation so this patch adds a counter "zreclaim_failed" to /proc/vmstat. If during high CPU utilisation this counter is increasing rapidly, then the resolution to the problem may be to set /proc/sys/vm/zone_reclaim_mode to 0. [akpm@linux-foundation.org: name things consistently] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +++ mm/vmstat.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 16c82a868e2..3018ad75613 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2519,6 +2519,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + return ret; } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e3aa8139f2..138bed53706 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -673,6 +673,9 @@ static const char * const vmstat_text[] = { TEXTS_FOR_ZONES("pgscan_kswapd") TEXTS_FOR_ZONES("pgscan_direct") +#ifdef CONFIG_NUMA + "zone_reclaim_failed", +#endif "pginodesteal", "slabs_scanned", "kswapd_steal", -- cgit v1.2.3 From ee993b135ec75a93bd5c45e636bb210d2975159b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:33:24 -0700 Subject: mm: fix lumpy reclaim lru handling at isolate_lru_pages At lumpy reclaim, a page failed to be taken by __isolate_lru_page() can be pushed back to "src" list by list_move(). But the page may not be from "src" list. This pushes the page back to wrong LRU. And list_move() itself is unnecessary because the page is not on top of LRU. Then, leave it as it is if __isolate_lru_page() fails. Reviewed-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 3018ad75613..4139aa52b94 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -929,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode, file)) { - case 0: + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); nr_taken++; scan++; - break; - - case -EBUSY: - /* else it is being freed elsewhere */ - list_move(&cursor_page->lru, src); - default: - break; /* ! on LRU or wrong list */ } } } -- cgit v1.2.3