From caeab084deb61cd2d51cb8facc0e894a5b406aa4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 12 Mar 2008 23:57:49 -0700 Subject: slub page alloc fallback: Enable interrupts for GFP_WAIT. The fallback path needs to enable interrupts like done for the other page allocator calls. This was not necessary with the alternate fast path since we handled irq enable/disable in the slow path. The regular fastpath handles irq enable/disable around calls to the slow path so we need to restore the proper status before calling the page allocator from the slowpath. Signed-off-by: Christoph Lameter --- mm/slub.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 96d63eb3ab1..ca71d5b81e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1536,9 +1536,15 @@ new_slab: * That is only possible if certain conditions are met that are being * checked when a slab is created. */ - if (!(gfpflags & __GFP_NORETRY) && (s->flags & __PAGE_ALLOC_FALLBACK)) - return kmalloc_large(s->objsize, gfpflags); - + if (!(gfpflags & __GFP_NORETRY) && + (s->flags & __PAGE_ALLOC_FALLBACK)) { + if (gfpflags & __GFP_WAIT) + local_irq_enable(); + object = kmalloc_large(s->objsize, gfpflags); + if (gfpflags & __GFP_WAIT) + local_irq_disable(); + return object; + } return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) -- cgit v1.2.3 From 7682486b3ee06f800d5b11033371c7c5e92e3057 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:00:40 -0700 Subject: mm: fix various kernel-doc comments Fix various kernel-doc notation in mm/: filemap.c: add function short description; convert 2 to kernel-doc fremap.c: change parameter 'prot' to @prot pagewalk.c: change "-" in function parameters to ":" slab.c: fix short description of kmem_ptr_validate() swap.c: fix description & parameters of put_pages_list() swap_state.c: fix function parameters vmalloc.c: change "@returns" to "Returns:" since that is not a parameter Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 20 +++++++++++++++++--- mm/fremap.c | 2 +- mm/pagewalk.c | 10 +++++----- mm/slab.c | 5 ++--- mm/swap.c | 5 ++--- mm/swap_state.c | 2 ++ mm/vmalloc.c | 6 ++++-- 7 files changed, 33 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index df343d1e634..07e9d9258b4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -343,7 +343,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, EXPORT_SYMBOL(sync_page_range); /** - * sync_page_range_nolock + * sync_page_range_nolock - write & wait on all pages in the passed range without locking * @inode: target inode * @mapping: target address_space * @pos: beginning offset in pages to write @@ -611,7 +611,10 @@ int __lock_page_killable(struct page *page) sync_page_killable, TASK_KILLABLE); } -/* +/** + * __lock_page_nosync - get a lock on the page, without calling sync_page() + * @page: the page to lock + * * Variant of lock_page that does not require the caller to hold a reference * on the page's mapping. */ @@ -1538,9 +1541,20 @@ repeat: return page; } -/* +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * * Same as read_cache_page, but don't wait for page to become unlocked * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. */ struct page *read_cache_page_async(struct address_space *mapping, pgoff_t index, diff --git a/mm/fremap.c b/mm/fremap.c index 69a37c2bdf8..07a9c82ce1a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -113,7 +113,7 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, * mmap()/mremap() it does not create any new vmas. The new mappings are * also safe across swapout. * - * NOTE: the 'prot' parameter right now is ignored (but must be zero), + * NOTE: the @prot parameter right now is ignored (but must be zero), * and the vma's default protection is used. Arbitrary protections * might be implemented in the future. */ diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b4f27d22da9..1cf1417ef8b 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -77,11 +77,11 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, /** * walk_page_range - walk a memory map's page tables with a callback - * @mm - memory map to walk - * @addr - starting address - * @end - ending address - * @walk - set of callbacks to invoke for each level of the tree - * @private - private data passed to the callback function + * @mm: memory map to walk + * @addr: starting address + * @end: ending address + * @walk: set of callbacks to invoke for each level of the tree + * @private: private data passed to the callback function * * Recursively walk the page table for the memory area in a VMA, * calling supplied callbacks. Callbacks are called in-order (first diff --git a/mm/slab.c b/mm/slab.c index e6c698f5567..bb4070e1079 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3624,12 +3624,11 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** - * kmem_ptr_validate - check if an untrusted pointer might - * be a slab entry. + * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. * @cachep: the cache we're checking against * @ptr: pointer to validate * - * This verifies that the untrusted pointer looks sane: + * This verifies that the untrusted pointer looks sane; * it is _not_ a guarantee that the pointer is actually * part of the slab cache in question, but it at least * validates that the pointer can be dereferenced and diff --git a/mm/swap.c b/mm/swap.c index d4ec59aa5c4..aa1139ccf3a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -78,12 +78,11 @@ void put_page(struct page *page) EXPORT_SYMBOL(put_page); /** - * put_pages_list(): release a list of pages + * put_pages_list() - release a list of pages + * @pages: list of pages threaded on page->lru * * Release a list of pages which are strung together on page.lru. Currently * used by read_cache_pages() and related error recovery code. - * - * @pages: list of pages threaded on page->lru */ void put_pages_list(struct list_head *pages) { diff --git a/mm/swap_state.c b/mm/swap_state.c index ec42f01a8d0..50757ee3f9f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -115,6 +115,7 @@ void __delete_from_swap_cache(struct page *page) /** * add_to_swap - allocate swap space for a page * @page: page we want to move to swap + * @gfp_mask: memory allocation flags * * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. @@ -315,6 +316,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags * @vma: user vma this address belongs to * @addr: target address for mempolicy * diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 950c0be9ca8..ecf91f8034b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -757,7 +757,8 @@ finished: * @vma: vma to cover (map full range of vma) * @addr: vmalloc memory * @pgoff: number of pages into addr before first page to map - * @returns: 0 for success, -Exxx on failure + * + * Returns: 0 for success, -Exxx on failure * * This function checks that addr is a valid vmalloc'ed area, and * that it is big enough to cover the vma. Will return failure if @@ -829,7 +830,8 @@ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) /** * alloc_vm_area - allocate a range of kernel address space * @size: size of the area - * @returns: NULL on failure, vm_struct on success + * + * Returns: NULL on failure, vm_struct on success * * This function reserves a range of kernel address space, and * allocates pagetables to map that range. No actual mappings -- cgit v1.2.3 From 46711810200c50e639ffc52e755b3dba9b4c82a3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:00:41 -0700 Subject: mm/shmem and tiny-shmem: fix some kernel-doc Convert tiny-shmem.c function comments to kernel-doc. Add parameters and convert/fix other kernel-doc in shmem.c. Signed-off-by: Randy Dunlap Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 25 ++++++++++--------------- mm/tiny-shmem.c | 8 +++----- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 3372bc579e8..f514dd392cd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -244,9 +244,8 @@ static void shmem_free_inode(struct super_block *sb) } } -/* +/** * shmem_recalc_inode - recalculate the size of an inode - * * @inode: inode to recalc * * We have to calculate the free blocks since the mm can drop @@ -270,9 +269,8 @@ static void shmem_recalc_inode(struct inode *inode) } } -/* +/** * shmem_swp_entry - find the swap vector position in the info structure - * * @info: info structure for the inode * @index: index of the page to find * @page: optional page to add to the structure. Has to be preset to @@ -374,13 +372,13 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns } } -/* +/** * shmem_swp_alloc - get the position of the swap entry for the page. - * If it does not exist allocate the entry. - * * @info: info structure for the inode * @index: index of the page to find * @sgp: check and recheck i_size? skip allocation? + * + * If the entry does not exist, allocate it. */ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) { @@ -440,9 +438,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long return entry; } -/* +/** * shmem_free_swp - free some swap entries in a directory - * * @dir: pointer to the directory * @edir: pointer after last entry of the directory * @punch_lock: pointer to spinlock when needed for the holepunch case @@ -2022,7 +2019,7 @@ static const struct inode_operations shmem_symlink_inode_operations = { }; #ifdef CONFIG_TMPFS_POSIX_ACL -/** +/* * Superblocks without xattr inode operations will get security.* xattr * support from the VFS "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at @@ -2561,12 +2558,11 @@ out4: } module_init(init_tmpfs) -/* +/** * shmem_file_setup - get an unlinked file living in tmpfs - * * @name: name for dentry (to be seen in /proc//maps * @size: size to be set for the file - * + * @flags: vm_flags */ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) { @@ -2621,9 +2617,8 @@ put_memory: return ERR_PTR(error); } -/* +/** * shmem_zero_setup - setup a shared anonymous mapping - * * @vma: the vma to be mmapped is prepared by do_mmap_pgoff */ int shmem_zero_setup(struct vm_area_struct *vma) diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 702083638c1..f0f55875dd6 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -39,12 +39,11 @@ static int __init init_tmpfs(void) } module_init(init_tmpfs) -/* +/** * shmem_file_setup - get an unlinked file living in tmpfs - * * @name: name for dentry (to be seen in /proc//maps * @size: size to be set for the file - * + * @flags: vm_flags */ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) { @@ -95,9 +94,8 @@ put_memory: return ERR_PTR(error); } -/* +/** * shmem_zero_setup - setup a shared anonymous mapping - * * @vma: the vma to be mmapped is prepared by do_mmap_pgoff */ int shmem_zero_setup(struct vm_area_struct *vma) -- cgit v1.2.3 From 1b578df02207a67a29e8ced4db3b36d89df52fef Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:00:42 -0700 Subject: mm/oom_kill: fix kernel-doc Fix kernel-doc notation in oom_kill.c. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 44b2da11bf4..f255eda693b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -37,6 +37,7 @@ static DEFINE_SPINLOCK(zone_scan_mutex); * badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate * @uptime: current uptime in seconds + * @mem: target memory controller * * The formula used is relatively simple and documented inline in the * function. The main rationale is that we want to select a good task @@ -264,6 +265,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, } /** + * dump_tasks - dump current memory state of all system tasks + * @mem: target memory controller + * * Dumps the current memory state of all system tasks, excluding kernel threads. * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj * score, and name. @@ -298,7 +302,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } while_each_thread(g, p); } -/** +/* * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO * set. @@ -504,6 +508,9 @@ void clear_zonelist_oom(struct zonelist *zonelist) /** * out_of_memory - kill the "best" process when we run out of memory + * @zonelist: zonelist pointer + * @gfp_mask: memory allocation flags + * @order: amount of memory being requested as a power of 2 * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) -- cgit v1.2.3 From 77f6078aa8945a18a7780694940e52be0322c2b8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:00:42 -0700 Subject: mm: highmem kernel-doc additions Add kernel-doc comments to highmem.c. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 35d47733cde..7da4a7b6af1 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -104,8 +104,9 @@ static void flush_all_zero_pkmaps(void) flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } -/* Flush all unused kmap mappings in order to remove stray - mappings. */ +/** + * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings + */ void kmap_flush_unused(void) { spin_lock(&kmap_lock); @@ -163,6 +164,14 @@ start: return vaddr; } +/** + * kmap_high - map a highmem page into memory + * @page: &struct page to map + * + * Returns the page's virtual memory address. + * + * We cannot call this from interrupts, as it may block. + */ void *kmap_high(struct page *page) { unsigned long vaddr; @@ -170,8 +179,6 @@ void *kmap_high(struct page *page) /* * For highmem pages, we can't trust "virtual" until * after we have the lock. - * - * We cannot call this from interrupts, as it may block */ spin_lock(&kmap_lock); vaddr = (unsigned long)page_address(page); @@ -185,6 +192,10 @@ void *kmap_high(struct page *page) EXPORT_SYMBOL(kmap_high); +/** + * kunmap_high - map a highmem page into memory + * @page: &struct page to unmap + */ void kunmap_high(struct page *page) { unsigned long vaddr; @@ -259,6 +270,12 @@ static struct page_address_slot *page_slot(struct page *page) return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; } +/** + * page_address - get the mapped virtual address of a page + * @page: &struct page to get the virtual address of + * + * Returns the page's virtual address. + */ void *page_address(struct page *page) { unsigned long flags; @@ -288,6 +305,11 @@ done: EXPORT_SYMBOL(page_address); +/** + * set_page_address - set a page's virtual address + * @page: &struct page to set + * @virtual: virtual address to use + */ void set_page_address(struct page *page, void *virtual) { unsigned long flags; -- cgit v1.2.3 From 43d8eac44f28d384d2377dcdd1407f51f79dda55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:00:43 -0700 Subject: mm: rmap kernel-doc fixes Correct kernel-doc function names and parameters in rmap.c. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 0c9a2df06c3..997f06907b6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -335,6 +335,7 @@ static int page_referenced_anon(struct page *page, /** * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. + * @mem_cont: target memory controller * * For an object-based mapped page, find all the places it is mapped and * check/clear the referenced flag. This is done by following the page->mapping @@ -402,6 +403,7 @@ static int page_referenced_file(struct page *page, * page_referenced - test if the page was referenced * @page: the page to test * @is_locked: caller holds lock on the page + * @mem_cont: target memory controller * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. @@ -506,7 +508,7 @@ int page_mkclean(struct page *page) EXPORT_SYMBOL_GPL(page_mkclean); /** - * page_set_anon_rmap - setup new anonymous rmap + * __page_set_anon_rmap - setup new anonymous rmap * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped @@ -530,7 +532,7 @@ static void __page_set_anon_rmap(struct page *page, } /** - * page_set_anon_rmap - sanity check anonymous rmap addition + * __page_check_anon_rmap - sanity check anonymous rmap addition * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped @@ -583,7 +585,7 @@ void page_add_anon_rmap(struct page *page, } } -/* +/** * page_add_new_anon_rmap - add pte mapping to a new anonymous page * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added @@ -623,6 +625,8 @@ void page_add_file_rmap(struct page *page) /** * page_dup_rmap - duplicate pte mapping to a page * @page: the page to add the mapping to + * @vma: the vm area being duplicated + * @address: the user virtual address mapped * * For copy_page_range only: minimal extract from page_add_file_rmap / * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's @@ -642,6 +646,7 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from + * @vma: the vm area in which the mapping is removed * * The caller needs to hold the pte lock. */ @@ -890,6 +895,7 @@ static int try_to_unmap_anon(struct page *page, int migration) /** * try_to_unmap_file - unmap file page using the object-based rmap method * @page: the page to unmap + * @migration: migration flag * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. @@ -986,6 +992,7 @@ out: /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped + * @migration: migration flag * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. -- cgit v1.2.3 From 52ea27eb4cd5f250f33638029a134ff03c5e6bbb Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 19 Mar 2008 17:00:45 -0700 Subject: memcgroup: fix check for thread being a group leader in memcgroup The check t->pid == t->pid is not the blessed way to check whether a task is a group leader. This is not about the code beautifulness only, but about pid namespaces fixes - both the tgid and the pid fields on the task_struct are (slowly :( ) becoming deprecated. Besides, the thread_group_leader() macro makes only one dereference :) Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8b9f6cae938..9b648bd6345 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1079,7 +1079,7 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, * Only thread group leaders are allowed to migrate, the mm_struct is * in effect owned by the leader */ - if (p->tgid != p->pid) + if (!thread_group_leader(p)) goto out; css_get(&mem->css); -- cgit v1.2.3 From f7850d932fc69cb4bad83117f0bef1a658cce350 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Mar 2008 17:01:02 -0700 Subject: mm/readahead: fix kernel-doc notation Fix kernel-doc notation in mm/readahead.c. Change ":" to ";" so that it doesn't get treated as a doc section heading. Move the comment block ending "*/" to a line by itself so that the text on that last line is not lost (dropped). Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index c9c50ca1ec3..8762e898897 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -443,9 +443,10 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * pagecache pages * * page_cache_async_ondemand() should be called when a page is used which - * has the PG_readahead flag: this is a marker to suggest that the application + * has the PG_readahead flag; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in - * more pages. */ + * more pages. + */ void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, -- cgit v1.2.3 From 5a982cbc7b3fe6cf72266f319286f29963c71b9e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 24 Mar 2008 12:29:45 -0700 Subject: mm: fix boundary checking in free_bootmem_core With numa enabled, some callers could have a range of memory on one node but try to free that on other node. This can cause some pages to be freed wrongly. For example: when we try to allocate 128g boot ram early for gart/swiotlb, and free that range later so gart/swiotlb can get some range afterwards. With this patch, we don't need to care which node holds the range, just loop to call free_bootmem_node for all online nodes. This patch makes free_bootmem_core() more robust by trimming the sidx and eidx according the ram range that the node has. And make the free_bootmem_core handle this out of range case. We could use bdata_list to make sure the range can be freed for sure. So next time, we don't need to loop online nodes and could use free_bootmem directly. Signed-off-by: Yinghai Lu Cc: Andi Kleen Cc: Yasunori Goto Cc: KAMEZAWA Hiroyuki Acked-by: Ingo Molnar Tested-by: Ingo Molnar Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index f6ff4337b42..2ccea700968 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -125,6 +125,7 @@ static int __init reserve_bootmem_core(bootmem_data_t *bdata, BUG_ON(!size); BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn); BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn); + BUG_ON(addr < bdata->node_boot_start); sidx = PFN_DOWN(addr - bdata->node_boot_start); eidx = PFN_UP(addr + size - bdata->node_boot_start); @@ -156,21 +157,31 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long sidx, eidx; unsigned long i; + BUG_ON(!size); + + /* out range */ + if (addr + size < bdata->node_boot_start || + PFN_DOWN(addr) > bdata->node_low_pfn) + return; /* * round down end of usable mem, partially free pages are * considered reserved. */ - BUG_ON(!size); - BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn); - if (addr < bdata->last_success) + if (addr >= bdata->node_boot_start && addr < bdata->last_success) bdata->last_success = addr; /* - * Round up the beginning of the address. + * Round up to index to the range. */ - sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); + if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start)) + sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); + else + sidx = 0; + eidx = PFN_DOWN(addr + size - bdata->node_boot_start); + if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) + eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); for (i = sidx; i < eidx; i++) { if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) @@ -421,7 +432,9 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, void __init free_bootmem(unsigned long addr, unsigned long size) { - free_bootmem_core(NODE_DATA(0)->bdata, addr, size); + bootmem_data_t *bdata; + list_for_each_entry(bdata, &bdata_list, list) + free_bootmem_core(bdata, addr, size); } unsigned long __init free_all_bootmem(void) -- cgit v1.2.3 From 4dd4b920218326231156c7991ce5b94afad841c3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 24 Mar 2008 12:29:52 -0700 Subject: revert "kswapd should only wait on IO if there is IO" Revert commit f1a9ee758de7de1e040de849fdef46e6802ea117: Author: Rik van Riel Date: Thu Feb 7 00:14:08 2008 -0800 kswapd should only wait on IO if there is IO The current kswapd (and try_to_free_pages) code has an oddity where the code will wait on IO, even if there is no IO in flight. This problem is notable especially when the system scans through many unfreeable pages, causing unnecessary stalls in the VM. Additionally, tasks without __GFP_FS or __GFP_IO in the direct reclaim path will sleep if a significant number of pages are encountered that should be written out. This gives kswapd a chance to write out those pages, while the direct reclaim task sleeps. Signed-off-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Because of large latencies and interactivity problems reported by Carlos, here: http://lkml.org/lkml/2008/3/22/211 Cc: Rik van Riel Cc: "Carlos R. Mafra" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 45711585684..4046434046e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -70,13 +70,6 @@ struct scan_control { int order; - /* - * Pages that have (or should have) IO pending. If we run into - * a lot of these, we're better off waiting a little for IO to - * finish rather than scanning more pages in the VM. - */ - int nr_io_pages; - /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -512,10 +505,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) wait_on_page_writeback(page); - else { - sc->nr_io_pages++; + else goto keep_locked; - } } referenced = page_referenced(page, 1, sc->mem_cgroup); @@ -554,10 +545,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) goto keep_locked; - if (!may_enter_fs) { - sc->nr_io_pages++; + if (!may_enter_fs) goto keep_locked; - } if (!sc->may_writepage) goto keep_locked; @@ -568,10 +557,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page) || PageDirty(page)) { - sc->nr_io_pages++; + if (PageWriteback(page) || PageDirty(page)) goto keep; - } /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. @@ -1344,7 +1331,6 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; - sc->nr_io_pages = 0; if (!priority) disable_swap_token(); nr_reclaimed += shrink_zones(priority, zones, sc); @@ -1379,8 +1365,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, } /* Take a nap, wait for some writeback to complete */ - if (sc->nr_scanned && priority < DEF_PRIORITY - 2 && - sc->nr_io_pages > sc->swap_cluster_max) + if (sc->nr_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); } /* top priority shrink_caches still had more to do? don't OOM, then */ @@ -1514,7 +1499,6 @@ loop_again: if (!priority) disable_swap_token(); - sc.nr_io_pages = 0; all_zones_ok = 1; /* @@ -1607,8 +1591,7 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2 && - sc.nr_io_pages > sc.swap_cluster_max) + if (total_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); /* -- cgit v1.2.3