From 7a81b88cb53e335ff7d019e6398c95792c817d93 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Wed, 7 Jan 2009 18:07:48 -0800
Subject: memcg: introduce charge-commit-cancel style of functions

There is a small race in do_swap_page(). When the swapped-in page is
charged, its mapcount can be greater than 0. But, at the same time, some
process (which shares it) calls unmap, the mapcount goes 1 -> 0, and the
page is uncharged.

      CPUA                          CPUB
                                    mapcount == 1.
  (1) charge if mapcount==0
                                    zap_pte_range()
                                    (2) mapcount 1 => 0.
                                    (3) uncharge(). (success)
  (4) set page's rmap()
      mapcount 0 => 1

Then, this swap page's account is leaked. To fix this, I added a new
interface:

  - charge ... account PAGE_SIZE to the res_counter and try to free pages
               if necessary.
  - commit ... register the page_cgroup and add it to the LRU if
               necessary.
  - cancel ... uncharge PAGE_SIZE because of do_swap_page() failure.

      CPUA
  (1) charge (always)
  (2) set page's rmap (mapcount > 0)
  (3) commit ... decide whether the charge was necessary or not, after
                 set_pte().

This protocol uses the PCG_USED bit on page_cgroup to avoid over
accounting. The usual mem_cgroup_charge_common() does charge -> commit at
a time.

This patch also adds the following functions to clarify all charges:

  - mem_cgroup_newpage_charge() ... replacement for mem_cgroup_charge(),
    called against newly allocated anon pages.
  - mem_cgroup_charge_migrate_fixup() ... called only from
    remove_migration_ptes(); we'll have to rewrite this later (this patch
    just keeps the old behavior). This function will be removed by an
    additional patch to make migration clearer.

Good for clarifying "what we do".

Then, we have the 4 following charge points:

  - newpage
  - swap-in
  - add-to-cache
  - migration

[akpm@linux-foundation.org: add missing inline directives to stubs]
Signed-off-by: KAMEZAWA Hiroyuki
Reviewed-by: Daisuke Nishimura
Cc: Balbir Singh
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/swapfile.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm/swapfile.c')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index eec5ca758a2..fb926efb516 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -690,17 +690,18 @@ unsigned int count_swap_pages(int type, int free)
 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
+	struct mem_cgroup *ptr = NULL;
 	spinlock_t *ptl;
 	pte_t *pte;
 	int ret = 1;
 
-	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+	if (mem_cgroup_try_charge(vma->vm_mm, GFP_KERNEL, &ptr))
 		ret = -ENOMEM;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
 		if (ret > 0)
-			mem_cgroup_uncharge_page(page);
+			mem_cgroup_cancel_charge_swapin(ptr);
 		ret = 0;
 		goto out;
 	}
@@ -710,6 +711,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
 	page_add_anon_rmap(page, vma, addr);
+	mem_cgroup_commit_charge_swapin(page, ptr);
 	swap_free(entry);
 	/*
 	 * Move the page to the active list so it is not
--
cgit v1.2.3
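[Editorial note] For readers new to the interface, the calling convention
can be condensed into one hypothetical function. This is only a sketch
distilled from the unuse_pte() hunk above: the function name is invented,
the pte lock taken by the real code is omitted, and only the ordering of
the three steps is the point. The memcg calls and their argument lists are
taken verbatim from the patch.

	/*
	 * Condensed, hypothetical caller of the charge-commit-cancel API.
	 * Not the real unuse_pte(); locking and error paths are elided.
	 */
	static int charge_commit_cancel_example(struct vm_area_struct *vma,
						unsigned long addr, pte_t *pte,
						swp_entry_t entry,
						struct page *page)
	{
		struct mem_cgroup *ptr = NULL;

		/* (1) charge: always account PAGE_SIZE first, reclaiming if needed */
		if (mem_cgroup_try_charge(vma->vm_mm, GFP_KERNEL, &ptr))
			return -ENOMEM;

		/* pte no longer holds our swap entry? then the charge is unwanted */
		if (!pte_same(*pte, swp_entry_to_pte(entry))) {
			/* cancel: give the PAGE_SIZE back to the res_counter */
			mem_cgroup_cancel_charge_swapin(ptr);
			return 0;
		}

		/* (2) make the page visible: install the pte and the reverse mapping */
		set_pte_at(vma->vm_mm, addr, pte,
			   pte_mkold(mk_pte(page, vma->vm_page_prot)));
		page_add_anon_rmap(page, vma, addr);

		/*
		 * (3) commit: bind the charge to the page (sets PCG_USED); if the
		 * page turned out to be charged already, the extra charge is dropped
		 * instead of being counted twice.
		 */
		mem_cgroup_commit_charge_swapin(page, ptr);
		swap_free(entry);
		return 1;
	}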
From bced0520fe462bb94021dcabd32e99630c171be2 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Wed, 7 Jan 2009 18:07:49 -0800
Subject: memcg: fix gfp_mask of callers of charge

Fix misuse of GFP_KERNEL.

Now, most callers of the mem_cgroup_charge_xxx functions use GFP_KERNEL.
I think this comes from the fact that page_cgroup *was* dynamically
allocated. But now, we allocate all page_cgroup at boot. And
mem_cgroup_try_to_free_pages() reclaims memory with GFP_HIGHUSER_MOVABLE
plus the specified GFP_RECLAIM_MASK.

  * This is because we just want to reduce memory usage. "Where should we
    reclaim from?" is not a problem in memcg.

This patch modifies gfp masks to be GFP_HIGHUSER_MOVABLE where possible.

Note: This patch is not for fixing behavior but for showing sane
information in source code.

Signed-off-by: KAMEZAWA Hiroyuki
Reviewed-by: Daisuke Nishimura
Cc: Balbir Singh
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/swapfile.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/swapfile.c')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index fb926efb516..ddc6d92be2c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -695,7 +695,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t *pte;
 	int ret = 1;
 
-	if (mem_cgroup_try_charge(vma->vm_mm, GFP_KERNEL, &ptr))
+	if (mem_cgroup_try_charge(vma->vm_mm, GFP_HIGHUSER_MOVABLE, &ptr))
 		ret = -ENOMEM;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
--
cgit v1.2.3
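[Editorial note] As an aside on the mask being swapped in here: the change
is purely informational, since memcg reclaim does not care which zones the
caller could allocate from. The definitions below are quoted from memory
of include/linux/gfp.h of that era and should be checked against the
actual tree before being relied on:

	/* include/linux/gfp.h, 2.6.28-era -- quoted from memory, verify in-tree */
	#define GFP_KERNEL		(__GFP_WAIT | __GFP_IO | __GFP_FS)
	#define GFP_HIGHUSER_MOVABLE	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
					 __GFP_HARDWALL | __GFP_HIGHMEM | \
					 __GFP_MOVABLE)

Only the bits that survive GFP_RECLAIM_MASK influence how memcg reclaim
behaves; the extra zone/movability bits in GFP_HIGHUSER_MOVABLE serve as
documentation that user-visible pages are being charged, which is also why
a later patch in this series feels free to revert the change.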
From 27a7faa0779dd13729196c1a818c294f44bbd1ee Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Wed, 7 Jan 2009 18:07:58 -0800
Subject: memcg: swap cgroup for remembering usage

For accounting swap, we need a record per swap entry, at least.

This patch adds the following functions:

  - swap_cgroup_swapon()  ... called from swapon.
  - swap_cgroup_swapoff() ... called at the end of swapoff.
  - swap_cgroup_record()  ... record information of a swap entry.
  - swap_cgroup_lookup()  ... look up information of a swap entry.

This patch just implements "how to record information". There is no actual
method for limiting the usage of swap yet.

These routines use a flat table for record and lookup. A "wise" lookup
system like a radix tree requires memory allocation when inserting new
records, but swap-out is usually called under memory shortage (or when
memcg hits its limit). So, I used static allocation. (Maybe dynamic
allocation is not very hard, but it adds an additional memory allocation
in the memory-shortage path.)

Note 1: In this, we use a pointer to record information, and this means
8 bytes per swap entry. I think we can reduce this when we create an
"id of cgroup" in the range of 0-65535 or 0-255.

Reported-by: Daisuke Nishimura
Reviewed-by: Daisuke Nishimura
Tested-by: Daisuke Nishimura
Reported-by: Hugh Dickins
Reported-by: Balbir Singh
Reported-by: Andrew Morton
Signed-off-by: KAMEZAWA Hiroyuki
Cc: Pavel Emelianov
Cc: Li Zefan
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/swapfile.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'mm/swapfile.c')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index ddc6d92be2c..1e7a715a386 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,6 +33,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
+#include <linux/page_cgroup.h>
 
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -1494,6 +1495,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+	/* Destroy swap account informatin */
+	swap_cgroup_swapoff(type);
+
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
@@ -1811,6 +1815,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		}
 		swap_map[page_nr] = SWAP_MAP_BAD;
 	}
+
+	error = swap_cgroup_swapon(type, maxpages);
+	if (error)
+		goto bad_swap;
+
 	nr_good_pages = swap_header->info.last_page -
 			swap_header->info.nr_badpages -
 			1 /* header page */;
@@ -1882,6 +1891,7 @@ bad_swap:
 		bd_release(bdev);
 	}
 	destroy_swap_extents(p);
+	swap_cgroup_swapoff(type);
 bad_swap_2:
 	spin_lock(&swap_lock);
 	p->swap_file = NULL;
--
cgit v1.2.3
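[Editorial note] Since this commit only shows the swapon/swapoff hook sites
in mm/swapfile.c, it may help to sketch what the flat table behind
swap_cgroup_swapon()/swap_cgroup_record()/swap_cgroup_lookup() amounts to.
The sketch below is not the real implementation (that lives elsewhere in
this series, in mm/page_cgroup.c, and builds the table out of individually
allocated pages); it compresses the idea into one vmalloc'd array of
pointers per swap type and leaves out locking and error handling:

	/* Illustrative only: a flat array of mem_cgroup pointers per swap
	 * type, indexed by swap offset.  One pointer (8 bytes on 64-bit)
	 * per swap entry, allocated up front at swapon time. */
	#include <linux/vmalloc.h>
	#include <linux/swap.h>
	#include <linux/memcontrol.h>

	static struct mem_cgroup **swap_record[MAX_SWAPFILES];

	int swap_cgroup_swapon(int type, unsigned long maxpages)
	{
		swap_record[type] = vmalloc(maxpages * sizeof(struct mem_cgroup *));
		if (!swap_record[type])
			return -ENOMEM;
		memset(swap_record[type], 0,
		       maxpages * sizeof(struct mem_cgroup *));
		return 0;
	}

	void swap_cgroup_swapoff(int type)
	{
		vfree(swap_record[type]);
		swap_record[type] = NULL;
	}

	/* remember who owns this swap entry; return the previous owner */
	struct mem_cgroup *swap_cgroup_record(swp_entry_t ent,
					      struct mem_cgroup *mem)
	{
		struct mem_cgroup *old;

		old = swap_record[swp_type(ent)][swp_offset(ent)];
		swap_record[swp_type(ent)][swp_offset(ent)] = mem;
		return old;
	}

	struct mem_cgroup *swap_cgroup_lookup(swp_entry_t ent)
	{
		return swap_record[swp_type(ent)][swp_offset(ent)];
	}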
From 8c7c6e34a1256a5082d38c8e9bd1474476912715 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Wed, 7 Jan 2009 18:08:00 -0800
Subject: memcg: mem+swap controller core

This patch implements a per-cgroup limit for usage of memory+swap. Even
though there is SwapCache, double counting of swap cache and swap entries
is avoided.

The mem+swap controller works as follows:

  - memory usage is limited by memory.limit_in_bytes.
  - memory + swap usage is limited by memory.memsw.limit_in_bytes.

This has the following benefits:

  - A user can limit the total resource usage of mem+swap.

    Without this, because the memory resource controller doesn't take care
    of swap usage, a process can exhaust all the swap (by a memory leak).
    We can avoid this case.

    And swap is a shared resource, but it cannot be reclaimed (it does not
    go back to memory) until it's used. This characteristic can be trouble
    when the memory is divided into some parts by cpuset or memcg. Assume
    group A and group B. After some applications execute, the system can
    be...

      Group A -- very large free memory space, but occupies 99% of swap.
      Group B -- under memory shortage, but cannot use swap... it's nearly
                 full.

    The ability to set an appropriate swap limit for each group is
    required.

Maybe someone will wonder "why not swap but mem+swap?"

  - The global LRU (kswapd) can swap out arbitrary pages. Swap-out means
    moving the account from memory to swap... there is no change in the
    usage of mem+swap.

    In other words, when we want to limit the usage of swap without
    affecting the global LRU, the mem+swap limit is better than just
    limiting swap.

The accounting target information is stored in swap_cgroup, which is a
per-swap-entry record. Charging is done as follows:

  map
    - charge page and memsw.

  unmap
    - uncharge page/memsw if not SwapCache.

  swap-out (__delete_from_swap_cache)
    - uncharge page
    - record mem_cgroup information to swap_cgroup.

  swap-in (do_swap_page)
    - charged as page and memsw.
      record in swap_cgroup is cleared.
      memsw accounting is decremented.

  swap-free (swap_free())
    - if the swap entry is freed, memsw is uncharged by PAGE_SIZE.

There are people who work in never-swap environments and consider swap as
something bad. For such people, this mem+swap controller extension is just
an overhead. This overhead is avoided by a config or boot option. (See
Kconfig; the details are not in this patch.)

TODO:
  - Maybe more optimization can be done in the swap-in path (but it's not
    very safe). We just do simple accounting at this stage.

[nishimura@mxp.nes.nec.co.jp: make resize limit hold mutex]
[hugh@veritas.com: memswap controller core swapcache fixes]
Signed-off-by: KAMEZAWA Hiroyuki
Cc: Li Zefan
Cc: Balbir Singh
Cc: Pavel Emelyanov
Signed-off-by: Daisuke Nishimura
Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/swapfile.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'mm/swapfile.c')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1e7a715a386..0579d9069b6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -471,8 +471,9 @@ out:
 	return NULL;
 }
 
-static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
 {
+	unsigned long offset = swp_offset(ent);
 	int count = p->swap_map[offset];
 
 	if (count < SWAP_MAP_MAX) {
@@ -487,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
 				swap_list.next = p - swap_info;
 			nr_swap_pages++;
 			p->inuse_pages--;
+			mem_cgroup_uncharge_swap(ent);
 		}
 	}
 	return count;
@@ -502,7 +504,7 @@ void swap_free(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		swap_entry_free(p, swp_offset(entry));
+		swap_entry_free(p, entry);
 		spin_unlock(&swap_lock);
 	}
 }
@@ -582,7 +584,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, swp_offset(entry)) == 1) {
+		if (swap_entry_free(p, entry) == 1) {
 			page = find_get_page(&swapper_space, entry.val);
 			if (page && !trylock_page(page)) {
 				page_cache_release(page);
@@ -696,7 +698,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t *pte;
 	int ret = 1;
 
-	if (mem_cgroup_try_charge(vma->vm_mm, GFP_HIGHUSER_MOVABLE, &ptr))
+	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
+					GFP_HIGHUSER_MOVABLE, &ptr))
 		ret = -ENOMEM;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
--
cgit v1.2.3
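[Editorial note] The charge/uncharge points listed above obey one invariant
that is easy to lose track of: swap-out moves the account from memory to
swap without changing mem+swap, and only swap_free() gives the memsw part
back. The fragment below is a stand-alone toy model (plain userspace C;
the counters and function names are invented for illustration and have
nothing to do with the kernel sources) that walks a single page through
those transitions:

	#include <assert.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	/* "usage" mirrors memory.usage_in_bytes,
	 * "memsw" mirrors memory.memsw.usage_in_bytes */
	static unsigned long usage, memsw;

	static void map_page(void)        { usage += PAGE_SIZE; memsw += PAGE_SIZE; }
	static void swap_out_page(void)   { usage -= PAGE_SIZE; /* memsw unchanged */ }
	static void swap_in_page(void)    { usage += PAGE_SIZE; memsw += PAGE_SIZE; }
	static void swap_free_entry(void) { memsw -= PAGE_SIZE; }
	static void unmap_page(void)      { usage -= PAGE_SIZE; memsw -= PAGE_SIZE; }

	int main(void)
	{
		map_page();        /* map: charge page and memsw                  */
		swap_out_page();   /* swap-out: page uncharged, account kept in swap */
		assert(usage == 0 && memsw == PAGE_SIZE);

		swap_in_page();    /* swap-in: charged as page and memsw again     */
		swap_free_entry(); /* swap_free(): the entry's memsw share dropped */
		assert(usage == PAGE_SIZE && memsw == PAGE_SIZE);

		unmap_page();      /* unmap: uncharge page and memsw               */
		assert(usage == 0 && memsw == 0);

		printf("mem+swap accounting transitions hold\n");
		return 0;
	}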
From 2c26fdd70c3094fa3e84caf9ef434911933d5477 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Wed, 7 Jan 2009 18:08:10 -0800
Subject: memcg: revert gfp mask fix

My patch, memcg-fix-gfp_mask-of-callers-of-charge.patch, changed the
gfp_mask of callers of charge to GFP_HIGHUSER_MOVABLE to show what will
happen at memory reclaim.

But in a recent discussion, it was NACKed because it sounds ugly.

This patch reverts it and adds some clean-up to the gfp_mask of callers of
charge. No behavior change, but it needs review before generating a HUNK
in a deep queue.

This patch also adds an explanation of the meaning of the gfp_mask passed
to the charge functions in memcontrol.h.

Signed-off-by: KAMEZAWA Hiroyuki
Cc: Balbir Singh
Cc: Daisuke Nishimura
Cc: Hugh Dickins
Cc: KOSAKI Motohiro
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/swapfile.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm/swapfile.c')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0579d9069b6..da422c47e2e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -698,8 +698,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t *pte;
 	int ret = 1;
 
-	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
-					GFP_HIGHUSER_MOVABLE, &ptr))
+	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr))
 		ret = -ENOMEM;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
--
cgit v1.2.3