author     Anton Altaparmakov <aia21@hera.kernel.org>    2006-03-23 17:05:47 +0000
committer  Anton Altaparmakov <aia21@hera.kernel.org>    2006-03-23 17:05:47 +0000
commit     a05ba4561fa3ad8b64a27577d0d38c190f60f762 (patch)
tree       5eb7561113e006b7bad0bef50dec6821962b1b36 /kernel
parent     74293759002aa7db0179158c20676a034614577b (diff)
parent     b0e6e962992b76580f4900b166a337bad7c1e81b (diff)
Merge branch 'master' of /home/aia21/ntfs-2.6/
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpuset.c          | 212
-rw-r--r--  kernel/exit.c            |   4
-rw-r--r--  kernel/fork.c            |   8
-rw-r--r--  kernel/kprobes.c         |  14
-rw-r--r--  kernel/kthread.c         |   7
-rw-r--r--  kernel/module.c          |  53
-rw-r--r--  kernel/panic.c           |  97
-rw-r--r--  kernel/posix-timers.c    |   1
-rw-r--r--  kernel/power/Makefile    |   2
-rw-r--r--  kernel/power/disk.c      |  20
-rw-r--r--  kernel/power/main.c      |   2
-rw-r--r--  kernel/power/pm.c        |  21
-rw-r--r--  kernel/power/power.h     |  75
-rw-r--r--  kernel/power/process.c   |  61
-rw-r--r--  kernel/power/snapshot.c  | 335
-rw-r--r--  kernel/power/swap.c      | 544
-rw-r--r--  kernel/power/swsusp.c    | 887
-rw-r--r--  kernel/power/user.c      | 333
-rw-r--r--  kernel/profile.c         |  11
-rw-r--r--  kernel/rcupdate.c        |  14
-rw-r--r--  kernel/sched.c           |  13
-rw-r--r--  kernel/signal.c          |  11
-rw-r--r--  kernel/spinlock.c        |   9
-rw-r--r--  kernel/sys.c             |  46
24 files changed, 1705 insertions, 1075 deletions
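
Most hunks in this diff apply the same semaphore-to-mutex conversion: DECLARE_MUTEX/down()/up() become DEFINE_MUTEX/mutex_lock()/mutex_unlock() from <linux/mutex.h>. As a minimal sketch of that pattern (illustrative only, with a hypothetical lock name, not code from this commit):

    #include <linux/mutex.h>

    /* was: static DECLARE_MUTEX(example_sem); */
    static DEFINE_MUTEX(example_mutex);

    static void example_critical_section(void)
    {
    	mutex_lock(&example_mutex);	/* was: down(&example_sem); */
    	/* ... touch data protected by the lock ... */
    	mutex_unlock(&example_mutex);	/* was: up(&example_sem); */
    }

Interruptible acquisition follows the same shape: down_interruptible() becomes mutex_lock_interruptible(), as seen in sys_delete_module() and sys_init_module() below.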
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 12815d3f1a0..c86ee051b73 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -53,7 +53,7 @@ #include <asm/uaccess.h> #include <asm/atomic.h> -#include <asm/semaphore.h> +#include <linux/mutex.h> #define CPUSET_SUPER_MAGIC 0x27e0eb @@ -168,63 +168,57 @@ static struct vfsmount *cpuset_mount; static struct super_block *cpuset_sb; /* - * We have two global cpuset semaphores below. They can nest. - * It is ok to first take manage_sem, then nest callback_sem. We also + * We have two global cpuset mutexes below. They can nest. + * It is ok to first take manage_mutex, then nest callback_mutex. We also * require taking task_lock() when dereferencing a tasks cpuset pointer. * See "The task_lock() exception", at the end of this comment. * - * A task must hold both semaphores to modify cpusets. If a task - * holds manage_sem, then it blocks others wanting that semaphore, - * ensuring that it is the only task able to also acquire callback_sem + * A task must hold both mutexes to modify cpusets. If a task + * holds manage_mutex, then it blocks others wanting that mutex, + * ensuring that it is the only task able to also acquire callback_mutex * and be able to modify cpusets. It can perform various checks on * the cpuset structure first, knowing nothing will change. It can - * also allocate memory while just holding manage_sem. While it is + * also allocate memory while just holding manage_mutex. While it is * performing these checks, various callback routines can briefly - * acquire callback_sem to query cpusets. Once it is ready to make - * the changes, it takes callback_sem, blocking everyone else. + * acquire callback_mutex to query cpusets. Once it is ready to make + * the changes, it takes callback_mutex, blocking everyone else. * * Calls to the kernel memory allocator can not be made while holding - * callback_sem, as that would risk double tripping on callback_sem + * callback_mutex, as that would risk double tripping on callback_mutex * from one of the callbacks into the cpuset code from within * __alloc_pages(). * - * If a task is only holding callback_sem, then it has read-only + * If a task is only holding callback_mutex, then it has read-only * access to cpusets. * * The task_struct fields mems_allowed and mems_generation may only * be accessed in the context of that task, so require no locks. * * Any task can increment and decrement the count field without lock. - * So in general, code holding manage_sem or callback_sem can't rely + * So in general, code holding manage_mutex or callback_mutex can't rely * on the count field not changing. However, if the count goes to - * zero, then only attach_task(), which holds both semaphores, can + * zero, then only attach_task(), which holds both mutexes, can * increment it again. Because a count of zero means that no tasks * are currently attached, therefore there is no way a task attached * to that cpuset can fork (the other way to increment the count). - * So code holding manage_sem or callback_sem can safely assume that + * So code holding manage_mutex or callback_mutex can safely assume that * if the count is zero, it will stay zero. Similarly, if a task - * holds manage_sem or callback_sem on a cpuset with zero count, it + * holds manage_mutex or callback_mutex on a cpuset with zero count, it * knows that the cpuset won't be removed, as cpuset_rmdir() needs - * both of those semaphores. 
- * - * A possible optimization to improve parallelism would be to make - * callback_sem a R/W semaphore (rwsem), allowing the callback routines - * to proceed in parallel, with read access, until the holder of - * manage_sem needed to take this rwsem for exclusive write access - * and modify some cpusets. + * both of those mutexes. * * The cpuset_common_file_write handler for operations that modify - * the cpuset hierarchy holds manage_sem across the entire operation, + * the cpuset hierarchy holds manage_mutex across the entire operation, * single threading all such cpuset modifications across the system. * - * The cpuset_common_file_read() handlers only hold callback_sem across + * The cpuset_common_file_read() handlers only hold callback_mutex across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't - * (usually) take either semaphore. These are the two most performance + * (usually) take either mutex. These are the two most performance * critical pieces of code here. The exception occurs on cpuset_exit(), - * when a task in a notify_on_release cpuset exits. Then manage_sem + * when a task in a notify_on_release cpuset exits. Then manage_mutex * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. @@ -242,9 +236,9 @@ static struct super_block *cpuset_sb; * * The need for this exception arises from the action of attach_task(), * which overwrites one tasks cpuset pointer with another. It does - * so using both semaphores, however there are several performance + * so using both mutexes, however there are several performance * critical places that need to reference task->cpuset without the - * expense of grabbing a system global semaphore. Therefore except as + * expense of grabbing a system global mutex. Therefore except as * noted below, when dereferencing or, as in attach_task(), modifying * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task->alloc_lock) already in the task_struct routinely used for @@ -256,8 +250,8 @@ static struct super_block *cpuset_sb; * the routine cpuset_update_task_memory_state(). */ -static DECLARE_MUTEX(manage_sem); -static DECLARE_MUTEX(callback_sem); +static DEFINE_MUTEX(manage_mutex); +static DEFINE_MUTEX(callback_mutex); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -432,7 +426,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) } /* - * Call with manage_sem held. Writes path of cpuset into buf. + * Call with manage_mutex held. Writes path of cpuset into buf. * Returns 0 on success, -errno on error. */ @@ -484,11 +478,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * status of the /sbin/cpuset_release_agent task, so no sense holding * our caller up for that. * - * When we had only one cpuset semaphore, we had to call this + * When we had only one cpuset mutex, we had to call this * without holding it, to avoid deadlock when call_usermodehelper() * allocated memory. With two locks, we could now call this while - * holding manage_sem, but we still don't, so as to minimize - * the time manage_sem is held. + * holding manage_mutex, but we still don't, so as to minimize + * the time manage_mutex is held. 
*/ static void cpuset_release_agent(const char *pathbuf) @@ -520,15 +514,15 @@ static void cpuset_release_agent(const char *pathbuf) * cs is notify_on_release() and now both the user count is zero and * the list of children is empty, prepare cpuset path in a kmalloc'd * buffer, to be returned via ppathbuf, so that the caller can invoke - * cpuset_release_agent() with it later on, once manage_sem is dropped. - * Call here with manage_sem held. + * cpuset_release_agent() with it later on, once manage_mutex is dropped. + * Call here with manage_mutex held. * * This check_for_release() routine is responsible for kmalloc'ing * pathbuf. The above cpuset_release_agent() is responsible for * kfree'ing pathbuf. The caller of these routines is responsible * for providing a pathbuf pointer, initialized to NULL, then - * calling check_for_release() with manage_sem held and the address - * of the pathbuf pointer, then dropping manage_sem, then calling + * calling check_for_release() with manage_mutex held and the address + * of the pathbuf pointer, then dropping manage_mutex, then calling * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ @@ -559,7 +553,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) * One way or another, we guarantee to return some non-empty subset * of cpu_online_map. * - * Call with callback_sem held. + * Call with callback_mutex held. */ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) @@ -583,7 +577,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) * One way or another, we guarantee to return some non-empty subset * of node_online_map. * - * Call with callback_sem held. + * Call with callback_mutex held. */ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) @@ -608,12 +602,12 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * current->cpuset if a task has its memory placement changed. * Do not call this routine if in_interrupt(). * - * Call without callback_sem or task_lock() held. May be called - * with or without manage_sem held. Doesn't need task_lock to guard + * Call without callback_mutex or task_lock() held. May be called + * with or without manage_mutex held. Doesn't need task_lock to guard * against another task changing a non-NULL cpuset pointer to NULL, * as that is only done by a task on itself, and if the current task * is here, it is not simultaneously in the exit code NULL'ing its - * cpuset pointer. This routine also might acquire callback_sem and + * cpuset pointer. This routine also might acquire callback_mutex and * current->mm->mmap_sem during call. * * Reading current->cpuset->mems_generation doesn't need task_lock @@ -658,13 +652,13 @@ void cpuset_update_task_memory_state(void) } if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { - down(&callback_sem); + mutex_lock(&callback_mutex); task_lock(tsk); cs = tsk->cpuset; /* Maybe changed when task not locked */ guarantee_online_mems(cs, &tsk->mems_allowed); tsk->cpuset_mems_generation = cs->mems_generation; task_unlock(tsk); - up(&callback_sem); + mutex_unlock(&callback_mutex); mpol_rebind_task(tsk, &tsk->mems_allowed); } } @@ -674,7 +668,7 @@ void cpuset_update_task_memory_state(void) * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding manage_sem. + * are only set if the other's are set. Call holding manage_mutex. 
*/ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -692,7 +686,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * manage_sem held. + * manage_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -746,7 +740,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * exclusive child cpusets * Build these two partitions by calling partition_sched_domains * - * Call with manage_sem held. May nest a call to the + * Call with manage_mutex held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ @@ -792,7 +786,7 @@ static void update_cpu_domains(struct cpuset *cur) } /* - * Call with manage_sem held. May take callback_sem during call. + * Call with manage_mutex held. May take callback_mutex during call. */ static int update_cpumask(struct cpuset *cs, char *buf) @@ -811,9 +805,9 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (retval < 0) return retval; cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); - down(&callback_sem); + mutex_lock(&callback_mutex); cs->cpus_allowed = trialcs.cpus_allowed; - up(&callback_sem); + mutex_unlock(&callback_mutex); if (is_cpu_exclusive(cs) && !cpus_unchanged) update_cpu_domains(cs); return 0; @@ -827,7 +821,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) * the cpuset is marked 'memory_migrate', migrate the tasks * pages to the new memory. * - * Call with manage_sem held. May take callback_sem during call. + * Call with manage_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -862,11 +856,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) if (retval < 0) goto done; - down(&callback_sem); + mutex_lock(&callback_mutex); cs->mems_allowed = trialcs.mems_allowed; atomic_inc(&cpuset_mems_generation); cs->mems_generation = atomic_read(&cpuset_mems_generation); - up(&callback_sem); + mutex_unlock(&callback_mutex); set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ @@ -922,7 +916,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) * tasklist_lock. Forks can happen again now - the mpol_copy() * cpuset_being_rebound check will catch such forks, and rebind * their vma mempolicies too. Because we still hold the global - * cpuset manage_sem, we know that no other rebind effort will + * cpuset manage_mutex, we know that no other rebind effort will * be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -948,7 +942,7 @@ done: } /* - * Call with manage_sem held. + * Call with manage_mutex held. */ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) @@ -967,7 +961,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 * - * Call with manage_sem held. + * Call with manage_mutex held. 
*/ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) @@ -989,12 +983,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return err; cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); - down(&callback_sem); + mutex_lock(&callback_mutex); if (turning_on) set_bit(bit, &cs->flags); else clear_bit(bit, &cs->flags); - up(&callback_sem); + mutex_unlock(&callback_mutex); if (cpu_exclusive_changed) update_cpu_domains(cs); @@ -1104,7 +1098,7 @@ static int fmeter_getrate(struct fmeter *fmp) * writing the path of the old cpuset in 'ppathbuf' if it needs to be * notified on release. * - * Call holding manage_sem. May take callback_sem and task_lock of + * Call holding manage_mutex. May take callback_mutex and task_lock of * the task 'pid' during call. */ @@ -1144,13 +1138,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } - down(&callback_sem); + mutex_lock(&callback_mutex); task_lock(tsk); oldcs = tsk->cpuset; if (!oldcs) { task_unlock(tsk); - up(&callback_sem); + mutex_unlock(&callback_mutex); put_task_struct(tsk); return -ESRCH; } @@ -1164,7 +1158,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) from = oldcs->mems_allowed; to = cs->mems_allowed; - up(&callback_sem); + mutex_unlock(&callback_mutex); mm = get_task_mm(tsk); if (mm) { @@ -1221,7 +1215,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - down(&manage_sem); + mutex_lock(&manage_mutex); if (is_removed(cs)) { retval = -ENODEV; @@ -1264,7 +1258,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - up(&manage_sem); + mutex_unlock(&manage_mutex); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -1304,9 +1298,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - down(&callback_sem); + mutex_lock(&callback_mutex); mask = cs->cpus_allowed; - up(&callback_sem); + mutex_unlock(&callback_mutex); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -1315,9 +1309,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - down(&callback_sem); + mutex_lock(&callback_mutex); mask = cs->mems_allowed; - up(&callback_sem); + mutex_unlock(&callback_mutex); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -1598,7 +1592,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) * Handle an open on 'tasks' file. Prepare a buffer listing the * process id's of tasks currently attached to the cpuset being opened. * - * Does not require any specific cpuset semaphores, and does not take any. + * Does not require any specific cpuset mutexes, and does not take any. */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { @@ -1754,7 +1748,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) * name: name of the new cpuset. Will be strcpy'ed. 
* mode: mode to set on new inode * - * Must be called with the semaphore on the parent inode held + * Must be called with the mutex on the parent inode held */ static long cpuset_create(struct cpuset *parent, const char *name, int mode) @@ -1766,7 +1760,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - down(&manage_sem); + mutex_lock(&manage_mutex); cpuset_update_task_memory_state(); cs->flags = 0; if (notify_on_release(parent)) @@ -1782,28 +1776,28 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) cs->parent = parent; - down(&callback_sem); + mutex_lock(&callback_mutex); list_add(&cs->sibling, &cs->parent->children); number_of_cpusets++; - up(&callback_sem); + mutex_unlock(&callback_mutex); err = cpuset_create_dir(cs, name, mode); if (err < 0) goto err; /* - * Release manage_sem before cpuset_populate_dir() because it + * Release manage_mutex before cpuset_populate_dir() because it * will down() this new directory's i_mutex and if we race with * another mkdir, we might deadlock. */ - up(&manage_sem); + mutex_unlock(&manage_mutex); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - up(&manage_sem); + mutex_unlock(&manage_mutex); kfree(cs); return err; } @@ -1825,18 +1819,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_mutex already */ - down(&manage_sem); + mutex_lock(&manage_mutex); cpuset_update_task_memory_state(); if (atomic_read(&cs->count) > 0) { - up(&manage_sem); + mutex_unlock(&manage_mutex); return -EBUSY; } if (!list_empty(&cs->children)) { - up(&manage_sem); + mutex_unlock(&manage_mutex); return -EBUSY; } parent = cs->parent; - down(&callback_sem); + mutex_lock(&callback_mutex); set_bit(CS_REMOVED, &cs->flags); if (is_cpu_exclusive(cs)) update_cpu_domains(cs); @@ -1848,10 +1842,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) cpuset_d_remove_dir(d); dput(d); number_of_cpusets--; - up(&callback_sem); + mutex_unlock(&callback_mutex); if (list_empty(&parent->children)) check_for_release(parent, &pathbuf); - up(&manage_sem); + mutex_unlock(&manage_mutex); cpuset_release_agent(pathbuf); return 0; } @@ -1960,19 +1954,19 @@ void cpuset_fork(struct task_struct *child) * Description: Detach cpuset from @tsk and release it. * * Note that cpusets marked notify_on_release force every task in - * them to take the global manage_sem semaphore when exiting. + * them to take the global manage_mutex mutex when exiting. * This could impact scaling on very large systems. Be reluctant to * use notify_on_release cpusets where very high task exit scaling * is required on large systems. * * Don't even think about derefencing 'cs' after the cpuset use count - * goes to zero, except inside a critical section guarded by manage_sem - * or callback_sem. Otherwise a zero cpuset use count is a license to + * goes to zero, except inside a critical section guarded by manage_mutex + * or callback_mutex. Otherwise a zero cpuset use count is a license to * any other task to nuke the cpuset immediately, via cpuset_rmdir(). * - * This routine has to take manage_sem, not callback_sem, because - * it is holding that semaphore while calling check_for_release(), - * which calls kmalloc(), so can't be called holding callback__sem(). 
+ * This routine has to take manage_mutex, not callback_mutex, because + * it is holding that mutex while calling check_for_release(), + * which calls kmalloc(), so can't be called holding callback_mutex(). * * We don't need to task_lock() this reference to tsk->cpuset, * because tsk is already marked PF_EXITING, so attach_task() won't @@ -2022,10 +2016,10 @@ void cpuset_exit(struct task_struct *tsk) if (notify_on_release(cs)) { char *pathbuf = NULL; - down(&manage_sem); + mutex_lock(&manage_mutex); if (atomic_dec_and_test(&cs->count)) check_for_release(cs, &pathbuf); - up(&manage_sem); + mutex_unlock(&manage_mutex); cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); @@ -2046,11 +2040,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) { cpumask_t mask; - down(&callback_sem); + mutex_lock(&callback_mutex); task_lock(tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock(tsk); - up(&callback_sem); + mutex_unlock(&callback_mutex); return mask; } @@ -2074,11 +2068,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { nodemask_t mask; - down(&callback_sem); + mutex_lock(&callback_mutex); task_lock(tsk); guarantee_online_mems(tsk->cpuset, &mask); task_unlock(tsk); - up(&callback_sem); + mutex_unlock(&callback_mutex); return mask; } @@ -2104,7 +2098,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) /* * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive - * ancestor to the specified cpuset. Call holding callback_sem. + * ancestor to the specified cpuset. Call holding callback_mutex. * If no ancestor is mem_exclusive (an unusual configuration), then * returns the root cpuset. */ @@ -2131,12 +2125,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_KERNEL allocations are not so marked, so can escape to the * nearest mem_exclusive ancestor cpuset. * - * Scanning up parent cpusets requires callback_sem. The __alloc_pages() + * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() * routine only calls here with __GFP_HARDWALL bit _not_ set if * it's a GFP_KERNEL allocation, and all nodes in the current tasks * mems_allowed came up empty on the first pass over the zonelist. * So only GFP_KERNEL allocations, if all nodes in the cpuset are - * short of memory, might require taking the callback_sem semaphore. + * short of memory, might require taking the callback_mutex mutex. * * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing @@ -2171,31 +2165,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return 1; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - down(&callback_sem); + mutex_lock(&callback_mutex); task_lock(current); cs = nearest_exclusive_ancestor(current->cpuset); task_unlock(current); allowed = node_isset(node, cs->mems_allowed); - up(&callback_sem); + mutex_unlock(&callback_mutex); return allowed; } /** * cpuset_lock - lock out any changes to cpuset structures * - * The out of memory (oom) code needs to lock down cpusets + * The out of memory (oom) code needs to mutex_lock cpusets * from being changed while it scans the tasklist looking for a - * task in an overlapping cpuset. Expose callback_sem via this + * task in an overlapping cpuset. Expose callback_mutex via this * cpuset_lock() routine, so the oom code can lock it, before * locking the task list. The tasklist_lock is a spinlock, so - * must be taken inside callback_sem. 
+ * must be taken inside callback_mutex. */ void cpuset_lock(void) { - down(&callback_sem); + mutex_lock(&callback_mutex); } /** @@ -2206,7 +2200,7 @@ void cpuset_lock(void) void cpuset_unlock(void) { - up(&callback_sem); + mutex_unlock(&callback_mutex); } /** @@ -2218,7 +2212,7 @@ void cpuset_unlock(void) * determine if task @p's memory usage might impact the memory * available to the current task. * - * Call while holding callback_sem. + * Call while holding callback_mutex. **/ int cpuset_excl_nodes_overlap(const struct task_struct *p) @@ -2289,7 +2283,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take manage_sem, keeping attach_task() from changing it + * and we take manage_mutex, keeping attach_task() from changing it * anyway. */ @@ -2305,7 +2299,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - down(&manage_sem); + mutex_lock(&manage_mutex); cs = tsk->cpuset; if (!cs) { retval = -EINVAL; @@ -2318,7 +2312,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - up(&manage_sem); + mutex_unlock(&manage_mutex); kfree(buf); return retval; } diff --git a/kernel/exit.c b/kernel/exit.c index d1e8d500a7e..8037405e136 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -345,9 +345,9 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); - down(&tty_sem); + mutex_lock(&tty_mutex); current->signal->tty = NULL; - up(&tty_sem); + mutex_unlock(&tty_mutex); /* Block and flush all signals */ sigfillset(&blocked); diff --git a/kernel/fork.c b/kernel/fork.c index 9bd7b65ee41..c79ae0b19a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -607,12 +607,12 @@ static struct files_struct *alloc_files(void) atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + newf->next_fd = 0; fdt = &newf->fdtab; - fdt->next_fd = 0; fdt->max_fds = NR_OPEN_DEFAULT; - fdt->max_fdset = __FD_SETSIZE; - fdt->close_on_exec = &newf->close_on_exec_init; - fdt->open_fds = &newf->open_fds_init; + fdt->max_fdset = EMBEDDED_FD_SET_SIZE; + fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; + fdt->open_fds = (fd_set *)&newf->open_fds_init; fdt->fd = &newf->fd_array[0]; INIT_RCU_HEAD(&fdt->rcu); fdt->free_files = NULL; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fef1af8a73c..1fb9f753ef6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -48,7 +48,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; -DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; @@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, } p->nmissed = 0; - down(&kprobe_mutex); + mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); if (old_p) { ret = register_aggr_kprobe(old_p, p); @@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, arch_arm_kprobe(p); out: - up(&kprobe_mutex); + mutex_unlock(&kprobe_mutex); if (ret && probed_mod) module_put(probed_mod); @@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p) struct kprobe *old_p, *list_p; int cleanup_p; - down(&kprobe_mutex); + mutex_lock(&kprobe_mutex); old_p = 
get_kprobe(p->addr); if (unlikely(!old_p)) { - up(&kprobe_mutex); + mutex_unlock(&kprobe_mutex); return; } if (p != old_p) { @@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p) if (list_p == p) /* kprobe p is a valid probe */ goto valid_p; - up(&kprobe_mutex); + mutex_unlock(&kprobe_mutex); return; } valid_p: @@ -523,7 +523,7 @@ valid_p: cleanup_p = 0; } - up(&kprobe_mutex); + mutex_unlock(&kprobe_mutex); synchronize_sched(); if (p->mod_refcounted && diff --git a/kernel/kthread.c b/kernel/kthread.c index e75950a1092..6a5373868a9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -12,6 +12,7 @@ #include <linux/unistd.h> #include <linux/file.h> #include <linux/module.h> +#include <linux/mutex.h> #include <asm/semaphore.h> /* @@ -41,7 +42,7 @@ struct kthread_stop_info /* Thread stopping is done by setthing this var: lock serializes * multiple kthread_stop calls. */ -static DECLARE_MUTEX(kthread_stop_lock); +static DEFINE_MUTEX(kthread_stop_lock); static struct kthread_stop_info kthread_stop_info; int kthread_should_stop(void) @@ -173,7 +174,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; - down(&kthread_stop_lock); + mutex_lock(&kthread_stop_lock); /* It could exit after stop_info.k set, but before wake_up_process. */ get_task_struct(k); @@ -194,7 +195,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) wait_for_completion(&kthread_stop_info.done); kthread_stop_info.k = NULL; ret = kthread_stop_info.err; - up(&kthread_stop_lock); + mutex_unlock(&kthread_stop_lock); return ret; } diff --git a/kernel/module.c b/kernel/module.c index 77764f22f02..fb404299082 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -39,6 +39,7 @@ #include <linux/device.h> #include <linux/string.h> #include <linux/sched.h> +#include <linux/mutex.h> #include <asm/uaccess.h> #include <asm/semaphore.h> #include <asm/cacheflush.h> @@ -60,18 +61,18 @@ static DEFINE_SPINLOCK(modlist_lock); /* List of modules, protected by module_mutex AND modlist_lock */ -static DECLARE_MUTEX(module_mutex); +static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); -static DECLARE_MUTEX(notify_mutex); +static DEFINE_MUTEX(notify_mutex); static struct notifier_block * module_notify_list; int register_module_notifier(struct notifier_block * nb) { int err; - down(¬ify_mutex); + mutex_lock(¬ify_mutex); err = notifier_chain_register(&module_notify_list, nb); - up(¬ify_mutex); + mutex_unlock(¬ify_mutex); return err; } EXPORT_SYMBOL(register_module_notifier); @@ -79,9 +80,9 @@ EXPORT_SYMBOL(register_module_notifier); int unregister_module_notifier(struct notifier_block * nb) { int err; - down(¬ify_mutex); + mutex_lock(¬ify_mutex); err = notifier_chain_unregister(&module_notify_list, nb); - up(¬ify_mutex); + mutex_unlock(¬ify_mutex); return err; } EXPORT_SYMBOL(unregister_module_notifier); @@ -601,7 +602,7 @@ static void free_module(struct module *mod); static void wait_for_zero_refcount(struct module *mod) { /* Since we might sleep for some time, drop the semaphore first */ - up(&module_mutex); + mutex_unlock(&module_mutex); for (;;) { DEBUGP("Looking at refcount...\n"); set_current_state(TASK_UNINTERRUPTIBLE); @@ -610,7 +611,7 @@ static void wait_for_zero_refcount(struct module *mod) schedule(); } current->state = TASK_RUNNING; - down(&module_mutex); + mutex_lock(&module_mutex); } asmlinkage long @@ -627,7 +628,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if 
(down_interruptible(&module_mutex) != 0) + if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; mod = find_module(name); @@ -676,14 +677,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags) /* Final destruction now noone is using it. */ if (mod->exit != NULL) { - up(&module_mutex); + mutex_unlock(&module_mutex); mod->exit(); - down(&module_mutex); + mutex_lock(&module_mutex); } free_module(mod); out: - up(&module_mutex); + mutex_unlock(&module_mutex); return ret; } @@ -1972,13 +1973,13 @@ sys_init_module(void __user *umod, return -EPERM; /* Only one module load at a time, please */ - if (down_interruptible(&module_mutex) != 0) + if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; /* Do all the hard work */ mod = load_module(umod, len, uargs); if (IS_ERR(mod)) { - up(&module_mutex); + mutex_unlock(&module_mutex); return PTR_ERR(mod); } @@ -1987,11 +1988,11 @@ sys_init_module(void __user *umod, stop_machine_run(__link_module, mod, NR_CPUS); /* Drop lock so they can recurse */ - up(&module_mutex); + mutex_unlock(&module_mutex); - down(¬ify_mutex); + mutex_lock(¬ify_mutex); notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); - up(¬ify_mutex); + mutex_unlock(¬ify_mutex); /* Start the module */ if (mod->init != NULL) @@ -2006,15 +2007,15 @@ sys_init_module(void __user *umod, mod->name); else { module_put(mod); - down(&module_mutex); + mutex_lock(&module_mutex); free_module(mod); - up(&module_mutex); + mutex_unlock(&module_mutex); } return ret; } /* Now it's a first class citizen! */ - down(&module_mutex); + mutex_lock(&module_mutex); mod->state = MODULE_STATE_LIVE; /* Drop initial reference. */ module_put(mod); @@ -2022,7 +2023,7 @@ sys_init_module(void __user *umod, mod->module_init = NULL; mod->init_size = 0; mod->init_text_size = 0; - up(&module_mutex); + mutex_unlock(&module_mutex); return 0; } @@ -2112,7 +2113,7 @@ struct module *module_get_kallsym(unsigned int symnum, { struct module *mod; - down(&module_mutex); + mutex_lock(&module_mutex); list_for_each_entry(mod, &modules, list) { if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; @@ -2120,12 +2121,12 @@ struct module *module_get_kallsym(unsigned int symnum, strncpy(namebuf, mod->strtab + mod->symtab[symnum].st_name, 127); - up(&module_mutex); + mutex_unlock(&module_mutex); return mod; } symnum -= mod->num_symtab; } - up(&module_mutex); + mutex_unlock(&module_mutex); return NULL; } @@ -2168,7 +2169,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) struct list_head *i; loff_t n = 0; - down(&module_mutex); + mutex_lock(&module_mutex); list_for_each(i, &modules) { if (n++ == *pos) break; @@ -2189,7 +2190,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos) static void m_stop(struct seq_file *m, void *p) { - up(&module_mutex); + mutex_unlock(&module_mutex); } static int m_show(struct seq_file *m, void *p) diff --git a/kernel/panic.c b/kernel/panic.c index 126dc43f1c7..acd95adddb9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -20,10 +20,13 @@ #include <linux/nmi.h> #include <linux/kexec.h> -int panic_timeout; int panic_on_oops; int tainted; +static int pause_on_oops; +static int pause_on_oops_flag; +static DEFINE_SPINLOCK(pause_on_oops_lock); +int panic_timeout; EXPORT_SYMBOL(panic_timeout); struct notifier_block *panic_notifier_list; @@ -174,3 +177,95 @@ void add_taint(unsigned flag) tainted |= flag; } EXPORT_SYMBOL(add_taint); + +static int __init pause_on_oops_setup(char *str) +{ + pause_on_oops = simple_strtoul(str, NULL, 0); + 
return 1; +} +__setup("pause_on_oops=", pause_on_oops_setup); + +static void spin_msec(int msecs) +{ + int i; + + for (i = 0; i < msecs; i++) { + touch_nmi_watchdog(); + mdelay(1); + } +} + +/* + * It just happens that oops_enter() and oops_exit() are identically + * implemented... + */ +static void do_oops_enter_exit(void) +{ + unsigned long flags; + static int spin_counter; + + if (!pause_on_oops) + return; + + spin_lock_irqsave(&pause_on_oops_lock, flags); + if (pause_on_oops_flag == 0) { + /* This CPU may now print the oops message */ + pause_on_oops_flag = 1; + } else { + /* We need to stall this CPU */ + if (!spin_counter) { + /* This CPU gets to do the counting */ + spin_counter = pause_on_oops; + do { + spin_unlock(&pause_on_oops_lock); + spin_msec(MSEC_PER_SEC); + spin_lock(&pause_on_oops_lock); + } while (--spin_counter); + pause_on_oops_flag = 0; + } else { + /* This CPU waits for a different one */ + while (spin_counter) { + spin_unlock(&pause_on_oops_lock); + spin_msec(1); + spin_lock(&pause_on_oops_lock); + } + } + } + spin_unlock_irqrestore(&pause_on_oops_lock, flags); +} + +/* + * Return true if the calling CPU is allowed to print oops-related info. This + * is a bit racy.. + */ +int oops_may_print(void) +{ + return pause_on_oops_flag == 0; +} + +/* + * Called when the architecture enters its oops handler, before it prints + * anything. If this is the first CPU to oops, and it's oopsing the first time + * then let it proceed. + * + * This is all enabled by the pause_on_oops kernel boot option. We do all this + * to ensure that oopses don't scroll off the screen. It has the side-effect + * of preventing later-oopsing CPUs from mucking up the display, too. + * + * It turns out that the CPU which is allowed to print ends up pausing for the + * right duration, whereas all the other CPUs pause for twice as long: once in + * oops_enter(), once in oops_exit(). + */ +void oops_enter(void) +{ + do_oops_enter_exit(); +} + +/* + * Called when the architecture exits its oops handler, after printing + * everything. 
+ */ +void oops_exit(void) +{ + do_oops_enter_exit(); +} diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index fa895fc2ecf..9944379360b 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -35,6 +35,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/time.h> +#include <linux/mutex.h> #include <asm/uaccess.h> #include <asm/semaphore.h> diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 04be7d0d96a..8d0af3d37a4 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,7 +5,7 @@ endif obj-y := main.o process.o console.o obj-$(CONFIG_PM_LEGACY) += pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o obj-$(CONFIG_SUSPEND_SMP) += smp.o diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 0b43847dc98..81d4d982f3f 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,17 +22,6 @@ #include "power.h" -extern suspend_disk_method_t pm_disk_mode; - -extern int swsusp_shrink_memory(void); -extern int swsusp_suspend(void); -extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); -extern int swsusp_check(void); -extern int swsusp_read(struct pbe **pblist_ptr); -extern void swsusp_close(void); -extern int swsusp_resume(void); - - static int noresume = 0; char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; @@ -70,10 +59,6 @@ static void power_down(suspend_disk_method_t mode) while(1); } - -static int in_suspend __nosavedata = 0; - - static inline void platform_finish(void) { if (pm_disk_mode == PM_DISK_PLATFORM) { @@ -87,7 +72,6 @@ static int prepare_processes(void) int error; pm_prepare_console(); - sys_sync(); disable_nonboot_cpus(); if (freeze_processes()) { @@ -145,7 +129,7 @@ int pm_suspend_disk(void) if (in_suspend) { device_resume(); pr_debug("PM: writing image.\n"); - error = swsusp_write(pagedir_nosave, nr_copy_pages); + error = swsusp_write(); if (!error) power_down(pm_disk_mode); else { @@ -216,7 +200,7 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read(&pagedir_nosave))) { + if ((error = swsusp_read())) { swsusp_free(); goto Thaw; } diff --git a/kernel/power/main.c b/kernel/power/main.c index 9cb235cba4a..ee371f50cca 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state) } -static int suspend_enter(suspend_state_t state) +int suspend_enter(suspend_state_t state) { int error = 0; unsigned long flags; diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 33c508e857d..0f6908cce1d 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -25,6 +25,7 @@ #include <linux/pm.h> #include <linux/pm_legacy.h> #include <linux/interrupt.h> +#include <linux/mutex.h> int pm_active; @@ -40,7 +41,7 @@ int pm_active; * until a resume but that will be fine. 
*/ -static DECLARE_MUTEX(pm_devs_lock); +static DEFINE_MUTEX(pm_devs_lock); static LIST_HEAD(pm_devs); /** @@ -67,9 +68,9 @@ struct pm_dev *pm_register(pm_dev_t type, dev->id = id; dev->callback = callback; - down(&pm_devs_lock); + mutex_lock(&pm_devs_lock); list_add(&dev->entry, &pm_devs); - up(&pm_devs_lock); + mutex_unlock(&pm_devs_lock); } return dev; } @@ -85,9 +86,9 @@ struct pm_dev *pm_register(pm_dev_t type, void pm_unregister(struct pm_dev *dev) { if (dev) { - down(&pm_devs_lock); + mutex_lock(&pm_devs_lock); list_del(&dev->entry); - up(&pm_devs_lock); + mutex_unlock(&pm_devs_lock); kfree(dev); } @@ -118,7 +119,7 @@ void pm_unregister_all(pm_callback callback) if (!callback) return; - down(&pm_devs_lock); + mutex_lock(&pm_devs_lock); entry = pm_devs.next; while (entry != &pm_devs) { struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); @@ -126,7 +127,7 @@ void pm_unregister_all(pm_callback callback) if (dev->callback == callback) __pm_unregister(dev); } - up(&pm_devs_lock); + mutex_unlock(&pm_devs_lock); } /** @@ -234,7 +235,7 @@ int pm_send_all(pm_request_t rqst, void *data) { struct list_head *entry; - down(&pm_devs_lock); + mutex_lock(&pm_devs_lock); entry = pm_devs.next; while (entry != &pm_devs) { struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); @@ -246,13 +247,13 @@ int pm_send_all(pm_request_t rqst, void *data) */ if (rqst == PM_SUSPEND) pm_undo_all(dev); - up(&pm_devs_lock); + mutex_unlock(&pm_devs_lock); return status; } } entry = entry->next; } - up(&pm_devs_lock); + mutex_unlock(&pm_devs_lock); return 0; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 388dba68084..f06f12f2176 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -8,6 +8,7 @@ struct swsusp_info { int cpus; unsigned long image_pages; unsigned long pages; + unsigned long size; } __attribute__((aligned(PAGE_SIZE))); @@ -37,21 +38,79 @@ extern struct subsystem power_subsys; /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; -extern unsigned int nr_copy_pages; extern struct pbe *pagedir_nosave; /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; +extern int in_suspend; +extern dev_t swsusp_resume_device; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); extern unsigned int count_data_pages(void); -extern void free_pagedir(struct pbe *pblist); -extern void release_eaten_pages(void); -extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); + +struct snapshot_handle { + loff_t offset; + unsigned int page; + unsigned int page_offset; + unsigned int prev; + struct pbe *pbe; + void *buffer; + unsigned int buf_offset; +}; + +#define data_of(handle) ((handle).buffer + (handle).buf_offset) + +extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); +extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); +int snapshot_image_loaded(struct snapshot_handle *handle); + +#define SNAPSHOT_IOC_MAGIC '3' +#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) +#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2) +#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) +#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4) +#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5) +#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) +#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) +#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) 
+#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) +#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) +#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) +#define SNAPSHOT_IOC_MAXNR 11 + +/** + * The bitmap is used for tracing allocated swap pages + * + * The entire bitmap consists of a number of bitmap_page + * structures linked with the help of the .next member. + * Thus each page can be allocated individually, so we only + * need to make 0-order memory allocations to create + * the bitmap. + */ + +#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *)) +#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long)) +#define BITS_PER_CHUNK (sizeof(long) * 8) +#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK) + +struct bitmap_page { + unsigned long chunks[BITMAP_PAGE_CHUNKS]; + struct bitmap_page *next; +}; + +extern void free_bitmap(struct bitmap_page *bitmap); +extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); +extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); +extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); + +extern int swsusp_check(void); +extern int swsusp_shrink_memory(void); extern void swsusp_free(void); -extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); -extern unsigned int snapshot_nr_pages(void); -extern struct pbe *snapshot_pblist(void); -extern void snapshot_pblist_set(struct pbe *pblist); +extern int swsusp_suspend(void); +extern int swsusp_resume(void); +extern int swsusp_read(void); +extern int swsusp_write(void); +extern void swsusp_close(void); +extern int suspend_enter(suspend_state_t state); diff --git a/kernel/power/process.c b/kernel/power/process.c index 28de118f7a0..8ac7c35fad7 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -12,11 +12,12 @@ #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/module.h> +#include <linux/syscalls.h> /* * Timeout for stopping processes */ -#define TIMEOUT (6 * HZ) +#define TIMEOUT (20 * HZ) static inline int freezeable(struct task_struct * p) @@ -54,38 +55,62 @@ void refrigerator(void) current->state = save; } +static inline void freeze_process(struct task_struct *p) +{ + unsigned long flags; + + if (!freezing(p)) { + freeze(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +} + /* 0 = success, else # of processes that we failed to stop */ int freeze_processes(void) { - int todo; + int todo, nr_user, user_frozen; unsigned long start_time; struct task_struct *g, *p; unsigned long flags; printk( "Stopping tasks: " ); start_time = jiffies; + user_frozen = 0; do { - todo = 0; + nr_user = todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { if (!freezeable(p)) continue; if (frozen(p)) continue; - - freeze(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - todo++; + if (p->mm && !(p->flags & PF_BORROWED_MM)) { + /* The task is a user-space one. 
+ * Freeze it unless there's a vfork completion + * pending + */ + if (!p->vfork_done) + freeze_process(p); + nr_user++; + } else { + /* Freeze only if the user space is frozen */ + if (user_frozen) + freeze_process(p); + todo++; + } } while_each_thread(g, p); read_unlock(&tasklist_lock); + todo += nr_user; + if (!user_frozen && !nr_user) { + sys_sync(); + start_time = jiffies; + } + user_frozen = !nr_user; yield(); /* Yield is okay here */ - if (todo && time_after(jiffies, start_time + TIMEOUT)) { - printk( "\n" ); - printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); + if (todo && time_after(jiffies, start_time + TIMEOUT)) break; - } } while(todo); /* This does not unfreeze processes that are already frozen @@ -94,8 +119,14 @@ int freeze_processes(void) * but it cleans up leftover PF_FREEZE requests. */ if (todo) { + printk( "\n" ); + printk(KERN_ERR " stopping tasks timed out " + "after %d seconds (%d tasks remaining):\n", + TIMEOUT / HZ, todo); read_lock(&tasklist_lock); - do_each_thread(g, p) + do_each_thread(g, p) { + if (freezeable(p) && !frozen(p)) + printk(KERN_ERR " %s\n", p->comm); if (freezing(p)) { pr_debug(" clean up: %s\n", p->comm); p->flags &= ~PF_FREEZE; @@ -103,7 +134,7 @@ int freeze_processes(void) recalc_sigpending_tsk(p); spin_unlock_irqrestore(&p->sighand->siglock, flags); } - while_each_thread(g, p); + } while_each_thread(g, p); read_unlock(&tasklist_lock); return todo; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 8d5a5986d62..c5863d02c89 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -10,6 +10,7 @@ */ +#include <linux/version.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/suspend.h> @@ -34,7 +35,9 @@ #include "power.h" struct pbe *pagedir_nosave; -unsigned int nr_copy_pages; +static unsigned int nr_copy_pages; +static unsigned int nr_meta_pages; +static unsigned long *buffer; #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void) @@ -80,7 +83,7 @@ static int save_highmem_zone(struct zone *zone) void *kaddr; unsigned long pfn = zone_pfn + zone->zone_start_pfn; - if (!(pfn%1000)) + if (!(pfn%10000)) printk("."); if (!pfn_valid(pfn)) continue; @@ -119,13 +122,15 @@ int save_highmem(void) struct zone *zone; int res = 0; - pr_debug("swsusp: Saving Highmem\n"); + pr_debug("swsusp: Saving Highmem"); + drain_local_pages(); for_each_zone (zone) { if (is_highmem(zone)) res = save_highmem_zone(zone); if (res) return res; } + printk("\n"); return 0; } @@ -235,7 +240,7 @@ static void copy_data_pages(struct pbe *pblist) * free_pagedir - free pages allocated with alloc_pagedir() */ -void free_pagedir(struct pbe *pblist) +static void free_pagedir(struct pbe *pblist) { struct pbe *pbe; @@ -301,7 +306,7 @@ struct eaten_page { static struct eaten_page *eaten_pages = NULL; -void release_eaten_pages(void) +static void release_eaten_pages(void) { struct eaten_page *p, *q; @@ -376,7 +381,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed if (!nr_pages) return NULL; - pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); pblist = alloc_image_page(gfp_mask, safe_needed); /* FIXME: rewrite this ugly loop */ for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; @@ -388,7 +392,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed free_pagedir(pblist); pblist = NULL; } else - create_pbe_list(pblist, nr_pages); + create_pbe_list(pblist, nr_pages); return pblist; } @@ -414,6 +418,10 @@ void swsusp_free(void) } } } + 
nr_copy_pages = 0; + nr_meta_pages = 0; + pagedir_nosave = NULL; + buffer = NULL; } @@ -437,7 +445,7 @@ static int enough_free_mem(unsigned int nr_pages) (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } -int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) +static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) { struct pbe *p; @@ -504,7 +512,318 @@ asmlinkage int swsusp_save(void) */ nr_copy_pages = nr_pages; + nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); return 0; } + +static void init_header(struct swsusp_info *info) +{ + memset(info, 0, sizeof(struct swsusp_info)); + info->version_code = LINUX_VERSION_CODE; + info->num_physpages = num_physpages; + memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); + info->cpus = num_online_cpus(); + info->image_pages = nr_copy_pages; + info->pages = nr_copy_pages + nr_meta_pages + 1; + info->size = info->pages; + info->size <<= PAGE_SHIFT; +} + +/** + * pack_orig_addresses - the .orig_address fields of the PBEs from the + * list starting at @pbe are stored in the array @buf[] (1 page) + */ + +static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe) +{ + int j; + + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + buf[j] = pbe->orig_address; + pbe = pbe->next; + } + if (!pbe) + for (; j < PAGE_SIZE / sizeof(long); j++) + buf[j] = 0; + return pbe; +} + +/** + * snapshot_read_next - used for reading the system memory snapshot. + * + * On the first call to it @handle should point to a zeroed + * snapshot_handle structure. The structure gets updated and a pointer + * to it should be passed to this function every next time. + * + * The @count parameter should contain the number of bytes the caller + * wants to read from the snapshot. It must not be zero. + * + * On success the function returns a positive number. Then, the caller + * is allowed to read up to the returned number of bytes from the memory + * location computed by the data_of() macro. The number returned + * may be smaller than @count, but this only happens if the read would + * cross a page boundary otherwise. + * + * The function returns 0 to indicate the end of data stream condition, + * and a negative number is returned on error. In such cases the + * structure pointed to by @handle is not updated and should not be used + * any more. 
+ */ + +int snapshot_read_next(struct snapshot_handle *handle, size_t count) +{ + if (handle->page > nr_meta_pages + nr_copy_pages) + return 0; + if (!buffer) { + /* This makes the buffer be freed by swsusp_free() */ + buffer = alloc_image_page(GFP_ATOMIC, 0); + if (!buffer) + return -ENOMEM; + } + if (!handle->offset) { + init_header((struct swsusp_info *)buffer); + handle->buffer = buffer; + handle->pbe = pagedir_nosave; + } + if (handle->prev < handle->page) { + if (handle->page <= nr_meta_pages) { + handle->pbe = pack_orig_addresses(buffer, handle->pbe); + if (!handle->pbe) + handle->pbe = pagedir_nosave; + } else { + handle->buffer = (void *)handle->pbe->address; + handle->pbe = handle->pbe->next; + } + handle->prev = handle->page; + } + handle->buf_offset = handle->page_offset; + if (handle->page_offset + count >= PAGE_SIZE) { + count = PAGE_SIZE - handle->page_offset; + handle->page_offset = 0; + handle->page++; + } else { + handle->page_offset += count; + } + handle->offset += count; + return count; +} + +/** + * mark_unsafe_pages - mark the pages that cannot be used for storing + * the image during resume, because they conflict with the pages that + * had been used before suspend + */ + +static int mark_unsafe_pages(struct pbe *pblist) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe *p; + + if (!pblist) /* a sanity check */ + return -EINVAL; + + /* Clear page flags */ + for_each_zone (zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) + ClearPageNosaveFree(pfn_to_page(zone_pfn + + zone->zone_start_pfn)); + } + + /* Mark orig addresses */ + for_each_pbe (p, pblist) { + if (virt_addr_valid(p->orig_address)) + SetPageNosaveFree(virt_to_page(p->orig_address)); + else + return -EFAULT; + } + + return 0; +} + +static void copy_page_backup_list(struct pbe *dst, struct pbe *src) +{ + /* We assume both lists contain the same number of elements */ + while (src) { + dst->orig_address = src->orig_address; + dst = dst->next; + src = src->next; + } +} + +static int check_header(struct swsusp_info *info) +{ + char *reason = NULL; + + if (info->version_code != LINUX_VERSION_CODE) + reason = "kernel version"; + if (info->num_physpages != num_physpages) + reason = "memory size"; + if (strcmp(info->uts.sysname,system_utsname.sysname)) + reason = "system type"; + if (strcmp(info->uts.release,system_utsname.release)) + reason = "kernel release"; + if (strcmp(info->uts.version,system_utsname.version)) + reason = "version"; + if (strcmp(info->uts.machine,system_utsname.machine)) + reason = "machine"; + if (reason) { + printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); + return -EPERM; + } + return 0; +} + +/** + * load header - check the image header and copy data from it + */ + +static int load_header(struct snapshot_handle *handle, + struct swsusp_info *info) +{ + int error; + struct pbe *pblist; + + error = check_header(info); + if (!error) { + pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0); + if (!pblist) + return -ENOMEM; + pagedir_nosave = pblist; + handle->pbe = pblist; + nr_copy_pages = info->image_pages; + nr_meta_pages = info->pages - info->image_pages - 1; + } + return error; +} + +/** + * unpack_orig_addresses - copy the elements of @buf[] (1 page) to + * the PBEs in the list starting at @pbe + */ + +static inline struct pbe *unpack_orig_addresses(unsigned long *buf, + struct pbe *pbe) +{ + int j; + + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + pbe->orig_address = buf[j]; + 
pbe = pbe->next; + } + return pbe; +} + +/** + * create_image - use metadata contained in the PBE list + * pointed to by pagedir_nosave to mark the pages that will + * be overwritten in the process of restoring the system + * memory state from the image and allocate memory for + * the image avoiding these pages + */ + +static int create_image(struct snapshot_handle *handle) +{ + int error = 0; + struct pbe *p, *pblist; + + p = pagedir_nosave; + error = mark_unsafe_pages(p); + if (!error) { + pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); + if (pblist) + copy_page_backup_list(pblist, p); + free_pagedir(p); + if (!pblist) + error = -ENOMEM; + } + if (!error) + error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + if (!error) { + release_eaten_pages(); + pagedir_nosave = pblist; + } else { + pagedir_nosave = NULL; + handle->pbe = NULL; + nr_copy_pages = 0; + nr_meta_pages = 0; + } + return error; +} + +/** + * snapshot_write_next - used for writing the system memory snapshot. + * + * On the first call to it @handle should point to a zeroed + * snapshot_handle structure. The structure gets updated and a pointer + * to it should be passed to this function every next time. + * + * The @count parameter should contain the number of bytes the caller + * wants to write to the image. It must not be zero. + * + * On success the function returns a positive number. Then, the caller + * is allowed to write up to the returned number of bytes to the memory + * location computed by the data_of() macro. The number returned + * may be smaller than @count, but this only happens if the write would + * cross a page boundary otherwise. + * + * The function returns 0 to indicate the "end of file" condition, + * and a negative number is returned on error. In such cases the + * structure pointed to by @handle is not updated and should not be used + * any more. 
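Correspondingly, an editorial sketch (not patch code) of the restore side feeding pages into snapshot_write_next(); produce_page() is a hypothetical source of image data, playing the role that swap_read_page() plays in load_image() in kernel/power/swap.c:

static int fill_snapshot(int (*produce_page)(void *buf))
{
	struct snapshot_handle handle;
	int ret;

	memset(&handle, 0, sizeof(struct snapshot_handle));
	do {
		ret = snapshot_write_next(&handle, PAGE_SIZE);
		if (ret > 0) {
			int error = produce_page(data_of(handle));

			if (error)
				return error;
		}
	} while (ret > 0);
	if (ret < 0)
		return ret;
	return snapshot_image_loaded(&handle) ? 0 : -ENODATA;
}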
+ */ + +int snapshot_write_next(struct snapshot_handle *handle, size_t count) +{ + int error = 0; + + if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) + return 0; + if (!buffer) { + /* This makes the buffer be freed by swsusp_free() */ + buffer = alloc_image_page(GFP_ATOMIC, 0); + if (!buffer) + return -ENOMEM; + } + if (!handle->offset) + handle->buffer = buffer; + if (handle->prev < handle->page) { + if (!handle->prev) { + error = load_header(handle, (struct swsusp_info *)buffer); + if (error) + return error; + } else if (handle->prev <= nr_meta_pages) { + handle->pbe = unpack_orig_addresses(buffer, handle->pbe); + if (!handle->pbe) { + error = create_image(handle); + if (error) + return error; + handle->pbe = pagedir_nosave; + handle->buffer = (void *)handle->pbe->address; + } + } else { + handle->pbe = handle->pbe->next; + handle->buffer = (void *)handle->pbe->address; + } + handle->prev = handle->page; + } + handle->buf_offset = handle->page_offset; + if (handle->page_offset + count >= PAGE_SIZE) { + count = PAGE_SIZE - handle->page_offset; + handle->page_offset = 0; + handle->page++; + } else { + handle->page_offset += count; + } + handle->offset += count; + return count; +} + +int snapshot_image_loaded(struct snapshot_handle *handle) +{ + return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || + handle->page <= nr_meta_pages + nr_copy_pages); +} diff --git a/kernel/power/swap.c b/kernel/power/swap.c new file mode 100644 index 00000000000..9177f3f73a6 --- /dev/null +++ b/kernel/power/swap.c @@ -0,0 +1,544 @@ +/* + * linux/kernel/power/swap.c + * + * This file provides functions for reading the suspend image from + * and writing it to a swap partition. + * + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * This file is released under the GPLv2. + * + */ + +#include <linux/module.h> +#include <linux/smp_lock.h> +#include <linux/file.h> +#include <linux/utsname.h> +#include <linux/version.h> +#include <linux/delay.h> +#include <linux/bitops.h> +#include <linux/genhd.h> +#include <linux/device.h> +#include <linux/buffer_head.h> +#include <linux/bio.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pm.h> + +#include "power.h" + +extern char resume_file[]; + +#define SWSUSP_SIG "S1SUSPEND" + +static struct swsusp_header { + char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; + swp_entry_t image; + char orig_sig[10]; + char sig[10]; +} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; + +/* + * Saving part... 
+ */ + +static unsigned short root_swap = 0xffff; + +static int mark_swapfiles(swp_entry_t start) +{ + int error; + + rw_swap_page_sync(READ, + swp_entry(root_swap, 0), + virt_to_page((unsigned long)&swsusp_header)); + if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || + !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { + memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); + memcpy(swsusp_header.sig,SWSUSP_SIG, 10); + swsusp_header.image = start; + error = rw_swap_page_sync(WRITE, + swp_entry(root_swap, 0), + virt_to_page((unsigned long) + &swsusp_header)); + } else { + pr_debug("swsusp: Partition is not swap space.\n"); + error = -ENODEV; + } + return error; +} + +/** + * swsusp_swap_check - check if the resume device is a swap device + * and get its index (if so) + */ + +static int swsusp_swap_check(void) /* This is called before saving image */ +{ + int res = swap_type_of(swsusp_resume_device); + + if (res >= 0) { + root_swap = res; + return 0; + } + return res; +} + +/** + * write_page - Write one page to given swap location. + * @buf: Address we're writing. + * @offset: Offset of the swap page we're writing to. + */ + +static int write_page(void *buf, unsigned long offset) +{ + swp_entry_t entry; + int error = -ENOSPC; + + if (offset) { + entry = swp_entry(root_swap, offset); + error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf)); + } + return error; +} + +/* + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page + * structures that contain each an array of MAP_PAGE_SIZE swap entries. + * These structures are stored on the swap and linked together with the + * help of the .next_swap member. + * + * The swap map is created during suspend. The swap map pages are + * allocated and populated one at a time, so we only need one memory + * page to set up the entire structure. + * + * During resume we also only need to use one swap_map_page structure + * at a time. 
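A small worked detail (editorial, not from the patch): since each swap_map_page defined below holds MAP_PAGE_ENTRIES data-page entries plus the link to the next map page, the number of map pages an image needs follows directly:

static inline unsigned int swap_map_pages_needed(unsigned int image_pages)
{
	/* one swap_map_page describes MAP_PAGE_ENTRIES image pages */
	return (image_pages + MAP_PAGE_ENTRIES - 1) / MAP_PAGE_ENTRIES;
}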
+ */ + +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) + +struct swap_map_page { + unsigned long entries[MAP_PAGE_ENTRIES]; + unsigned long next_swap; +}; + +/** + * The swap_map_handle structure is used for handling swap in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + unsigned long cur_swap; + struct bitmap_page *bitmap; + unsigned int k; +}; + +static void release_swap_writer(struct swap_map_handle *handle) +{ + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; + if (handle->bitmap) + free_bitmap(handle->bitmap); + handle->bitmap = NULL; +} + +static int get_swap_writer(struct swap_map_handle *handle) +{ + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); + if (!handle->cur) + return -ENOMEM; + handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0)); + if (!handle->bitmap) { + release_swap_writer(handle); + return -ENOMEM; + } + handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); + if (!handle->cur_swap) { + release_swap_writer(handle); + return -ENOSPC; + } + handle->k = 0; + return 0; +} + +static int swap_write_page(struct swap_map_handle *handle, void *buf) +{ + int error; + unsigned long offset; + + if (!handle->cur) + return -EINVAL; + offset = alloc_swap_page(root_swap, handle->bitmap); + error = write_page(buf, offset); + if (error) + return error; + handle->cur->entries[handle->k++] = offset; + if (handle->k >= MAP_PAGE_ENTRIES) { + offset = alloc_swap_page(root_swap, handle->bitmap); + if (!offset) + return -ENOSPC; + handle->cur->next_swap = offset; + error = write_page(handle->cur, handle->cur_swap); + if (error) + return error; + memset(handle->cur, 0, PAGE_SIZE); + handle->cur_swap = offset; + handle->k = 0; + } + return 0; +} + +static int flush_swap_writer(struct swap_map_handle *handle) +{ + if (handle->cur && handle->cur_swap) + return write_page(handle->cur, handle->cur_swap); + else + return -EINVAL; +} + +/** + * save_image - save the suspend image data + */ + +static int save_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_pages) +{ + unsigned int m; + int ret; + int error = 0; + + printk("Saving image data pages (%u pages) ... ", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + do { + ret = snapshot_read_next(snapshot, PAGE_SIZE); + if (ret > 0) { + error = swap_write_page(handle, data_of(*snapshot)); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + } while (ret > 0); + if (!error) + printk("\b\b\b\bdone\n"); + return error; +} + +/** + * enough_swap - Make sure we have enough swap to save the image. + * + * Returns TRUE or FALSE after checking the total amount of swap + * space avaiable from the resume partition. + */ + +static int enough_swap(unsigned int nr_pages) +{ + unsigned int free_swap = count_swap_pages(root_swap, 1); + + pr_debug("swsusp: free swap pages: %u\n", free_swap); + return free_swap > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); +} + +/** + * swsusp_write - Write entire image and metadata. + * + * It is important _NOT_ to umount filesystems at this point. We want + * them synced (in case something goes wrong) but we DO not want to mark + * filesystem clean: it is not. (And it does not matter, if we resume + * correctly, we'll mark system clean, anyway.) 
+ */ + +int swsusp_write(void) +{ + struct swap_map_handle handle; + struct snapshot_handle snapshot; + struct swsusp_info *header; + unsigned long start; + int error; + + if ((error = swsusp_swap_check())) { + printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); + return error; + } + memset(&snapshot, 0, sizeof(struct snapshot_handle)); + error = snapshot_read_next(&snapshot, PAGE_SIZE); + if (error < PAGE_SIZE) + return error < 0 ? error : -EFAULT; + header = (struct swsusp_info *)data_of(snapshot); + if (!enough_swap(header->pages)) { + printk(KERN_ERR "swsusp: Not enough free swap\n"); + return -ENOSPC; + } + error = get_swap_writer(&handle); + if (!error) { + start = handle.cur_swap; + error = swap_write_page(&handle, header); + } + if (!error) + error = save_image(&handle, &snapshot, header->pages - 1); + if (!error) { + flush_swap_writer(&handle); + printk("S"); + error = mark_swapfiles(swp_entry(root_swap, start)); + printk("|\n"); + } + if (error) + free_all_swap_pages(root_swap, handle.bitmap); + release_swap_writer(&handle); + return error; +} + +/* + * Using bio to read from swap. + * This code requires a bit more work than just using buffer heads + * but, it is the recommended way for 2.5/2.6. + * The following are to signal the beginning and end of I/O. Bios + * finish asynchronously, while we want them to happen synchronously. + * A simple atomic_t, and a wait loop take care of this problem. + */ + +static atomic_t io_done = ATOMIC_INIT(0); + +static int end_io(struct bio *bio, unsigned int num, int err) +{ + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + panic("I/O error reading memory image"); + atomic_set(&io_done, 0); + return 0; +} + +static struct block_device *resume_bdev; + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * + * Straight from the textbook - allocate and initialize the bio. + * If we're writing, make sure the page is marked as dirty. + * Then submit it and wait. 
+ */ + +static int submit(int rw, pgoff_t page_off, void *page) +{ + int error = 0; + struct bio *bio; + + bio = bio_alloc(GFP_ATOMIC, 1); + if (!bio) + return -ENOMEM; + bio->bi_sector = page_off * (PAGE_SIZE >> 9); + bio->bi_bdev = resume_bdev; + bio->bi_end_io = end_io; + + if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { + printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); + error = -EFAULT; + goto Done; + } + + atomic_set(&io_done, 1); + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + while (atomic_read(&io_done)) + yield(); + if (rw == READ) + bio_set_pages_dirty(bio); + Done: + bio_put(bio); + return error; +} + +static int bio_read_page(pgoff_t page_off, void *page) +{ + return submit(READ, page_off, page); +} + +static int bio_write_page(pgoff_t page_off, void *page) +{ + return submit(WRITE, page_off, page); +} + +/** + * The following functions allow us to read data using a swap map + * in a file-alike way + */ + +static void release_swap_reader(struct swap_map_handle *handle) +{ + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; +} + +static int get_swap_reader(struct swap_map_handle *handle, + swp_entry_t start) +{ + int error; + + if (!swp_offset(start)) + return -EINVAL; + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + if (!handle->cur) + return -ENOMEM; + error = bio_read_page(swp_offset(start), handle->cur); + if (error) { + release_swap_reader(handle); + return error; + } + handle->k = 0; + return 0; +} + +static int swap_read_page(struct swap_map_handle *handle, void *buf) +{ + unsigned long offset; + int error; + + if (!handle->cur) + return -EINVAL; + offset = handle->cur->entries[handle->k]; + if (!offset) + return -EFAULT; + error = bio_read_page(offset, buf); + if (error) + return error; + if (++handle->k >= MAP_PAGE_ENTRIES) { + handle->k = 0; + offset = handle->cur->next_swap; + if (!offset) + release_swap_reader(handle); + else + error = bio_read_page(offset, handle->cur); + } + return error; +} + +/** + * load_image - load the image using the swap map handle + * @handle and the snapshot handle @snapshot + * (assume there are @nr_pages pages to load) + */ + +static int load_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_pages) +{ + unsigned int m; + int ret; + int error = 0; + + printk("Loading image data pages (%u pages) ... ", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + do { + ret = snapshot_write_next(snapshot, PAGE_SIZE); + if (ret > 0) { + error = swap_read_page(handle, data_of(*snapshot)); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + } while (ret > 0); + if (!error) + printk("\b\b\b\bdone\n"); + if (!snapshot_image_loaded(snapshot)) + error = -ENODATA; + return error; +} + +int swsusp_read(void) +{ + int error; + struct swap_map_handle handle; + struct snapshot_handle snapshot; + struct swsusp_info *header; + + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return PTR_ERR(resume_bdev); + } + + memset(&snapshot, 0, sizeof(struct snapshot_handle)); + error = snapshot_write_next(&snapshot, PAGE_SIZE); + if (error < PAGE_SIZE) + return error < 0 ? 
error : -EFAULT; + header = (struct swsusp_info *)data_of(snapshot); + error = get_swap_reader(&handle, swsusp_header.image); + if (!error) + error = swap_read_page(&handle, header); + if (!error) + error = load_image(&handle, &snapshot, header->pages - 1); + release_swap_reader(&handle); + + blkdev_put(resume_bdev); + + if (!error) + pr_debug("swsusp: Reading resume file was successful\n"); + else + pr_debug("swsusp: Error %d resuming\n", error); + return error; +} + +/** + * swsusp_check - Check for swsusp signature in the resume device + */ + +int swsusp_check(void) +{ + int error; + + resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + if (!IS_ERR(resume_bdev)) { + set_blocksize(resume_bdev, PAGE_SIZE); + memset(&swsusp_header, 0, sizeof(swsusp_header)); + if ((error = bio_read_page(0, &swsusp_header))) + return error; + if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { + memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + /* Reset swap signature now */ + error = bio_write_page(0, &swsusp_header); + } else { + return -EINVAL; + } + if (error) + blkdev_put(resume_bdev); + else + pr_debug("swsusp: Signature found, resuming\n"); + } else { + error = PTR_ERR(resume_bdev); + } + + if (error) + pr_debug("swsusp: Error %d check for resume file\n", error); + + return error; +} + +/** + * swsusp_close - close swap device. + */ + +void swsusp_close(void) +{ + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return; + } + + blkdev_put(resume_bdev); +} diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 2d9d08f72f7..c4016cbbd3e 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -31,41 +31,24 @@ * Fixed runaway init * * Rafael J. Wysocki <rjw@sisk.pl> - * Added the swap map data structure and reworked the handling of swap + * Reworked the freeing of memory and the handling of swap * * More state savers are welcome. Especially for the scsi layer... * * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt */ -#include <linux/module.h> #include <linux/mm.h> #include <linux/suspend.h> -#include <linux/smp_lock.h> -#include <linux/file.h> -#include <linux/utsname.h> -#include <linux/version.h> -#include <linux/delay.h> -#include <linux/bitops.h> #include <linux/spinlock.h> -#include <linux/genhd.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/swap.h> #include <linux/pm.h> -#include <linux/device.h> -#include <linux/buffer_head.h> #include <linux/swapops.h> #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/highmem.h> -#include <linux/bio.h> - -#include <asm/uaccess.h> -#include <asm/mmu_context.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/io.h> #include "power.h" @@ -77,6 +60,8 @@ */ unsigned long image_size = 500 * 1024 * 1024; +int in_suspend __nosavedata = 0; + #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void); int save_highmem(void); @@ -87,471 +72,97 @@ static int restore_highmem(void) { return 0; } static unsigned int count_highmem_pages(void) { return 0; } #endif -extern char resume_file[]; - -#define SWSUSP_SIG "S1SUSPEND" - -static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; - swp_entry_t image; - char orig_sig[10]; - char sig[10]; -} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; - -static struct swsusp_info swsusp_info; - -/* - * Saving part... 
- */ - -static unsigned short root_swap = 0xffff; - -static int mark_swapfiles(swp_entry_t start) -{ - int error; - - rw_swap_page_sync(READ, - swp_entry(root_swap, 0), - virt_to_page((unsigned long)&swsusp_header)); - if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || - !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { - memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); - memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - swsusp_header.image = start; - error = rw_swap_page_sync(WRITE, - swp_entry(root_swap, 0), - virt_to_page((unsigned long) - &swsusp_header)); - } else { - pr_debug("swsusp: Partition is not swap space.\n"); - error = -ENODEV; - } - return error; -} - -/* - * Check whether the swap device is the specified resume - * device, irrespective of whether they are specified by - * identical names. - * - * (Thus, device inode aliasing is allowed. You can say /dev/hda4 - * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs] - * and they'll be considered the same device. This is *necessary* for - * devfs, since the resume code can only recognize the form /dev/hda4, - * but the suspend code would see the long name.) - */ -static inline int is_resume_device(const struct swap_info_struct *swap_info) -{ - struct file *file = swap_info->swap_file; - struct inode *inode = file->f_dentry->d_inode; - - return S_ISBLK(inode->i_mode) && - swsusp_resume_device == MKDEV(imajor(inode), iminor(inode)); -} - -static int swsusp_swap_check(void) /* This is called before saving image */ -{ - int i; - - spin_lock(&swap_lock); - for (i = 0; i < MAX_SWAPFILES; i++) { - if (!(swap_info[i].flags & SWP_WRITEOK)) - continue; - if (!swsusp_resume_device || is_resume_device(swap_info + i)) { - spin_unlock(&swap_lock); - root_swap = i; - return 0; - } - } - spin_unlock(&swap_lock); - return -ENODEV; -} - -/** - * write_page - Write one page to a fresh swap location. - * @addr: Address we're writing. - * @loc: Place to store the entry we used. - * - * Allocate a new swap entry and 'sync' it. Note we discard -EIO - * errors. That is an artifact left over from swsusp. It did not - * check the return of rw_swap_page_sync() at all, since most pages - * written back to swap would return -EIO. - * This is a partial improvement, since we will at least return other - * errors, though we need to eventually fix the damn code. - */ -static int write_page(unsigned long addr, swp_entry_t *loc) -{ - swp_entry_t entry; - int error = -ENOSPC; - - entry = get_swap_page_of_type(root_swap); - if (swp_offset(entry)) { - error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); - if (!error || error == -EIO) - *loc = entry; - } - return error; -} - /** - * Swap map-handling functions - * - * The swap map is a data structure used for keeping track of each page - * written to the swap. It consists of many swap_map_page structures - * that contain each an array of MAP_PAGE_SIZE swap entries. - * These structures are linked together with the help of either the - * .next (in memory) or the .next_swap (in swap) member. + * The following functions are used for tracing the allocated + * swap pages, so that they can be freed in case of an error. * - * The swap map is created during suspend. At that time we need to keep - * it in memory, because we have to free all of the allocated swap - * entries if an error occurs. The memory needed is preallocated - * so that we know in advance if there's enough of it. 
- * - * The first swap_map_page structure is filled with the swap entries that - * correspond to the first MAP_PAGE_SIZE data pages written to swap and - * so on. After the all of the data pages have been written, the order - * of the swap_map_page structures in the map is reversed so that they - * can be read from swap in the original order. This causes the data - * pages to be loaded in exactly the same order in which they have been - * saved. - * - * During resume we only need to use one swap_map_page structure - * at a time, which means that we only need to use two memory pages for - * reading the image - one for reading the swap_map_page structures - * and the second for reading the data pages from swap. + * The functions operate on a linked bitmap structure defined + * in power.h */ -#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ - / sizeof(swp_entry_t)) - -struct swap_map_page { - swp_entry_t entries[MAP_PAGE_SIZE]; - swp_entry_t next_swap; - struct swap_map_page *next; -}; - -static inline void free_swap_map(struct swap_map_page *swap_map) +void free_bitmap(struct bitmap_page *bitmap) { - struct swap_map_page *swp; + struct bitmap_page *bp; - while (swap_map) { - swp = swap_map->next; - free_page((unsigned long)swap_map); - swap_map = swp; + while (bitmap) { + bp = bitmap->next; + free_page((unsigned long)bitmap); + bitmap = bp; } } -static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) +struct bitmap_page *alloc_bitmap(unsigned int nr_bits) { - struct swap_map_page *swap_map, *swp; - unsigned n = 0; + struct bitmap_page *bitmap, *bp; + unsigned int n; - if (!nr_pages) + if (!nr_bits) return NULL; - pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); - swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); - swp = swap_map; - for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { - swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); - swp = swp->next; - if (!swp) { - free_swap_map(swap_map); + bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); + bp = bitmap; + for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) { + bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); + bp = bp->next; + if (!bp) { + free_bitmap(bitmap); return NULL; } } - return swap_map; + return bitmap; } -/** - * reverse_swap_map - reverse the order of pages in the swap map - * @swap_map - */ - -static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) -{ - struct swap_map_page *prev, *next; - - prev = NULL; - while (swap_map) { - next = swap_map->next; - swap_map->next = prev; - prev = swap_map; - swap_map = next; - } - return prev; -} - -/** - * free_swap_map_entries - free the swap entries allocated to store - * the swap map @swap_map (this is only called in case of an error) - */ -static inline void free_swap_map_entries(struct swap_map_page *swap_map) -{ - while (swap_map) { - if (swap_map->next_swap.val) - swap_free(swap_map->next_swap); - swap_map = swap_map->next; - } -} - -/** - * save_swap_map - save the swap map used for tracing the data pages - * stored in the swap - */ - -static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) -{ - swp_entry_t entry = (swp_entry_t){0}; - int error; - - while (swap_map) { - swap_map->next_swap = entry; - if ((error = write_page((unsigned long)swap_map, &entry))) - return error; - swap_map = swap_map->next; - } - *start = entry; - return 0; -} - -/** - * free_image_entries - free the swap entries allocated to store - * 
the image data pages (this is only called in case of an error) - */ - -static inline void free_image_entries(struct swap_map_page *swp) +static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) { - unsigned k; + unsigned int n; - while (swp) { - for (k = 0; k < MAP_PAGE_SIZE; k++) - if (swp->entries[k].val) - swap_free(swp->entries[k]); - swp = swp->next; + n = BITMAP_PAGE_BITS; + while (bitmap && n <= bit) { + n += BITMAP_PAGE_BITS; + bitmap = bitmap->next; } -} - -/** - * The swap_map_handle structure is used for handling the swap map in - * a file-alike way - */ - -struct swap_map_handle { - struct swap_map_page *cur; - unsigned int k; -}; - -static inline void init_swap_map_handle(struct swap_map_handle *handle, - struct swap_map_page *map) -{ - handle->cur = map; - handle->k = 0; -} - -static inline int swap_map_write_page(struct swap_map_handle *handle, - unsigned long addr) -{ - int error; - - error = write_page(addr, handle->cur->entries + handle->k); - if (error) - return error; - if (++handle->k >= MAP_PAGE_SIZE) { - handle->cur = handle->cur->next; - handle->k = 0; + if (!bitmap) + return -EINVAL; + n -= BITMAP_PAGE_BITS; + bit -= n; + n = 0; + while (bit >= BITS_PER_CHUNK) { + bit -= BITS_PER_CHUNK; + n++; } + bitmap->chunks[n] |= (1UL << bit); return 0; } -/** - * save_image_data - save the data pages pointed to by the PBEs - * from the list @pblist using the swap map handle @handle - * (assume there are @nr_pages data pages to save) - */ - -static int save_image_data(struct pbe *pblist, - struct swap_map_handle *handle, - unsigned int nr_pages) -{ - unsigned int m; - struct pbe *p; - int error = 0; - - printk("Saving image data pages (%u pages) ... ", nr_pages); - m = nr_pages / 100; - if (!m) - m = 1; - nr_pages = 0; - for_each_pbe (p, pblist) { - error = swap_map_write_page(handle, p->address); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - if (!error) - printk("\b\b\b\bdone\n"); - return error; -} - -static void dump_info(void) -{ - pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code); - pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages); - pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname); - pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename); - pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release); - pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version); - pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine); - pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); - pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); - pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); - pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); -} - -static void init_header(unsigned int nr_pages) -{ - memset(&swsusp_info, 0, sizeof(swsusp_info)); - swsusp_info.version_code = LINUX_VERSION_CODE; - swsusp_info.num_physpages = num_physpages; - memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); - - swsusp_info.cpus = num_online_cpus(); - swsusp_info.image_pages = nr_pages; - swsusp_info.pages = nr_pages + - ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; -} - -/** - * pack_orig_addresses - the .orig_address fields of the PBEs from the - * list starting at @pbe are stored in the array @buf[] (1 page) - */ - -static inline struct pbe *pack_orig_addresses(unsigned long *buf, - struct pbe *pbe) -{ - int j; - - for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { - buf[j] = pbe->orig_address; - pbe = 
pbe->next; - } - if (!pbe) - for (; j < PAGE_SIZE / sizeof(long); j++) - buf[j] = 0; - return pbe; -} - -/** - * save_image_metadata - save the .orig_address fields of the PBEs - * from the list @pblist using the swap map handle @handle - */ - -static int save_image_metadata(struct pbe *pblist, - struct swap_map_handle *handle) +unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) { - unsigned long *buf; - unsigned int n = 0; - struct pbe *p; - int error = 0; + unsigned long offset; - printk("Saving image metadata ... "); - buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); - if (!buf) - return -ENOMEM; - p = pblist; - while (p) { - p = pack_orig_addresses(buf, p); - error = swap_map_write_page(handle, (unsigned long)buf); - if (error) - break; - n++; + offset = swp_offset(get_swap_page_of_type(swap)); + if (offset) { + if (bitmap_set(bitmap, offset)) { + swap_free(swp_entry(swap, offset)); + offset = 0; + } } - free_page((unsigned long)buf); - if (!error) - printk("done (%u pages saved)\n", n); - return error; + return offset; } -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space avaiable from the resume partition. - */ - -static int enough_swap(unsigned int nr_pages) +void free_all_swap_pages(int swap, struct bitmap_page *bitmap) { - unsigned int free_swap = swap_info[root_swap].pages - - swap_info[root_swap].inuse_pages; - - pr_debug("swsusp: free swap pages: %u\n", free_swap); - return free_swap > (nr_pages + PAGES_FOR_IO + - (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); -} + unsigned int bit, n; + unsigned long test; -/** - * swsusp_write - Write entire image and metadata. - * - * It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) 
- */ - -int swsusp_write(struct pbe *pblist, unsigned int nr_pages) -{ - struct swap_map_page *swap_map; - struct swap_map_handle handle; - swp_entry_t start; - int error; - - if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); - return error; - } - if (!enough_swap(nr_pages)) { - printk(KERN_ERR "swsusp: Not enough free swap\n"); - return -ENOSPC; + bit = 0; + while (bitmap) { + for (n = 0; n < BITMAP_PAGE_CHUNKS; n++) + for (test = 1UL; test; test <<= 1) { + if (bitmap->chunks[n] & test) + swap_free(swp_entry(swap, bit)); + bit++; + } + bitmap = bitmap->next; } - - init_header(nr_pages); - swap_map = alloc_swap_map(swsusp_info.pages); - if (!swap_map) - return -ENOMEM; - init_swap_map_handle(&handle, swap_map); - - error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); - if (!error) - error = save_image_metadata(pblist, &handle); - if (!error) - error = save_image_data(pblist, &handle, nr_pages); - if (error) - goto Free_image_entries; - - swap_map = reverse_swap_map(swap_map); - error = save_swap_map(swap_map, &start); - if (error) - goto Free_map_entries; - - dump_info(); - printk( "S" ); - error = mark_swapfiles(start); - printk( "|\n" ); - if (error) - goto Free_map_entries; - -Free_swap_map: - free_swap_map(swap_map); - return error; - -Free_map_entries: - free_swap_map_entries(swap_map); -Free_image_entries: - free_image_entries(swap_map); - goto Free_swap_map; } /** @@ -660,379 +271,3 @@ int swsusp_resume(void) local_irq_enable(); return error; } - -/** - * mark_unsafe_pages - mark the pages that cannot be used for storing - * the image during resume, because they conflict with the pages that - * had been used before suspend - */ - -static void mark_unsafe_pages(struct pbe *pblist) -{ - struct zone *zone; - unsigned long zone_pfn; - struct pbe *p; - - if (!pblist) /* a sanity check */ - return; - - /* Clear page flags */ - for_each_zone (zone) { - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - if (pfn_valid(zone_pfn + zone->zone_start_pfn)) - ClearPageNosaveFree(pfn_to_page(zone_pfn + - zone->zone_start_pfn)); - } - - /* Mark orig addresses */ - for_each_pbe (p, pblist) - SetPageNosaveFree(virt_to_page(p->orig_address)); - -} - -static void copy_page_backup_list(struct pbe *dst, struct pbe *src) -{ - /* We assume both lists contain the same number of elements */ - while (src) { - dst->orig_address = src->orig_address; - dst = dst->next; - src = src->next; - } -} - -/* - * Using bio to read from swap. - * This code requires a bit more work than just using buffer heads - * but, it is the recommended way for 2.5/2.6. - * The following are to signal the beginning and end of I/O. Bios - * finish asynchronously, while we want them to happen synchronously. - * A simple atomic_t, and a wait loop take care of this problem. - */ - -static atomic_t io_done = ATOMIC_INIT(0); - -static int end_io(struct bio *bio, unsigned int num, int err) -{ - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - panic("I/O error reading memory image"); - atomic_set(&io_done, 0); - return 0; -} - -static struct block_device *resume_bdev; - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * - * Straight from the textbook - allocate and initialize the bio. - * If we're writing, make sure the page is marked as dirty. - * Then submit it and wait. 
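To show how the bitmap helpers added above fit together (alloc_bitmap, alloc_swap_page, free_all_swap_pages, free_bitmap), here is an illustrative sketch only, not code from the patch; store_offset() is a hypothetical consumer of the allocated swap offsets:

static int allocate_image_swap(int swap, unsigned int nr_pages,
			       int (*store_offset)(unsigned long offset))
{
	struct bitmap_page *bitmap;
	int error = 0;

	bitmap = alloc_bitmap(count_swap_pages(swap, 0));
	if (!bitmap)
		return -ENOMEM;
	while (nr_pages-- && !error) {
		unsigned long offset = alloc_swap_page(swap, bitmap);

		error = offset ? store_offset(offset) : -ENOSPC;
	}
	if (error)
		free_all_swap_pages(swap, bitmap);	/* roll back every page we took */
	free_bitmap(bitmap);
	return error;
}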
- */ - -static int submit(int rw, pgoff_t page_off, void *page) -{ - int error = 0; - struct bio *bio; - - bio = bio_alloc(GFP_ATOMIC, 1); - if (!bio) - return -ENOMEM; - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_io; - - if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { - printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); - error = -EFAULT; - goto Done; - } - - - atomic_set(&io_done, 1); - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - while (atomic_read(&io_done)) - yield(); - if (rw == READ) - bio_set_pages_dirty(bio); - Done: - bio_put(bio); - return error; -} - -static int bio_read_page(pgoff_t page_off, void *page) -{ - return submit(READ, page_off, page); -} - -static int bio_write_page(pgoff_t page_off, void *page) -{ - return submit(WRITE, page_off, page); -} - -/** - * The following functions allow us to read data using a swap map - * in a file-alike way - */ - -static inline void release_swap_map_reader(struct swap_map_handle *handle) -{ - if (handle->cur) - free_page((unsigned long)handle->cur); - handle->cur = NULL; -} - -static inline int get_swap_map_reader(struct swap_map_handle *handle, - swp_entry_t start) -{ - int error; - - if (!swp_offset(start)) - return -EINVAL; - handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); - if (!handle->cur) - return -ENOMEM; - error = bio_read_page(swp_offset(start), handle->cur); - if (error) { - release_swap_map_reader(handle); - return error; - } - handle->k = 0; - return 0; -} - -static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) -{ - unsigned long offset; - int error; - - if (!handle->cur) - return -EINVAL; - offset = swp_offset(handle->cur->entries[handle->k]); - if (!offset) - return -EINVAL; - error = bio_read_page(offset, buf); - if (error) - return error; - if (++handle->k >= MAP_PAGE_SIZE) { - handle->k = 0; - offset = swp_offset(handle->cur->next_swap); - if (!offset) - release_swap_map_reader(handle); - else - error = bio_read_page(offset, handle->cur); - } - return error; -} - -static int check_header(void) -{ - char *reason = NULL; - - dump_info(); - if (swsusp_info.version_code != LINUX_VERSION_CODE) - reason = "kernel version"; - if (swsusp_info.num_physpages != num_physpages) - reason = "memory size"; - if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) - reason = "system type"; - if (strcmp(swsusp_info.uts.release,system_utsname.release)) - reason = "kernel release"; - if (strcmp(swsusp_info.uts.version,system_utsname.version)) - reason = "version"; - if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) - reason = "machine"; - if (reason) { - printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); - return -EPERM; - } - return 0; -} - -/** - * load_image_data - load the image data using the swap map handle - * @handle and store them using the page backup list @pblist - * (assume there are @nr_pages pages to load) - */ - -static int load_image_data(struct pbe *pblist, - struct swap_map_handle *handle, - unsigned int nr_pages) -{ - int error; - unsigned int m; - struct pbe *p; - - if (!pblist) - return -EINVAL; - printk("Loading image data pages (%u pages) ... 
", nr_pages); - m = nr_pages / 100; - if (!m) - m = 1; - nr_pages = 0; - p = pblist; - while (p) { - error = swap_map_read_page(handle, (void *)p->address); - if (error) - break; - p = p->next; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - if (!error) - printk("\b\b\b\bdone\n"); - return error; -} - -/** - * unpack_orig_addresses - copy the elements of @buf[] (1 page) to - * the PBEs in the list starting at @pbe - */ - -static inline struct pbe *unpack_orig_addresses(unsigned long *buf, - struct pbe *pbe) -{ - int j; - - for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { - pbe->orig_address = buf[j]; - pbe = pbe->next; - } - return pbe; -} - -/** - * load_image_metadata - load the image metadata using the swap map - * handle @handle and put them into the PBEs in the list @pblist - */ - -static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) -{ - struct pbe *p; - unsigned long *buf; - unsigned int n = 0; - int error = 0; - - printk("Loading image metadata ... "); - buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); - if (!buf) - return -ENOMEM; - p = pblist; - while (p) { - error = swap_map_read_page(handle, buf); - if (error) - break; - p = unpack_orig_addresses(buf, p); - n++; - } - free_page((unsigned long)buf); - if (!error) - printk("done (%u pages loaded)\n", n); - return error; -} - -int swsusp_read(struct pbe **pblist_ptr) -{ - int error; - struct pbe *p, *pblist; - struct swap_map_handle handle; - unsigned int nr_pages; - - if (IS_ERR(resume_bdev)) { - pr_debug("swsusp: block device not initialised\n"); - return PTR_ERR(resume_bdev); - } - - error = get_swap_map_reader(&handle, swsusp_header.image); - if (!error) - error = swap_map_read_page(&handle, &swsusp_info); - if (!error) - error = check_header(); - if (error) - return error; - nr_pages = swsusp_info.image_pages; - p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); - if (!p) - return -ENOMEM; - error = load_image_metadata(p, &handle); - if (!error) { - mark_unsafe_pages(p); - pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); - if (pblist) - copy_page_backup_list(pblist, p); - free_pagedir(p); - if (!pblist) - error = -ENOMEM; - - /* Allocate memory for the image and read the data from swap */ - if (!error) - error = alloc_data_pages(pblist, GFP_ATOMIC, 1); - if (!error) { - release_eaten_pages(); - error = load_image_data(pblist, &handle, nr_pages); - } - if (!error) - *pblist_ptr = pblist; - } - release_swap_map_reader(&handle); - - blkdev_put(resume_bdev); - - if (!error) - pr_debug("swsusp: Reading resume file was successful\n"); - else - pr_debug("swsusp: Error %d resuming\n", error); - return error; -} - -/** - * swsusp_check - Check for swsusp signature in the resume device - */ - -int swsusp_check(void) -{ - int error; - - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); - if (!IS_ERR(resume_bdev)) { - set_blocksize(resume_bdev, PAGE_SIZE); - memset(&swsusp_header, 0, sizeof(swsusp_header)); - if ((error = bio_read_page(0, &swsusp_header))) - return error; - if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { - memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); - /* Reset swap signature now */ - error = bio_write_page(0, &swsusp_header); - } else { - return -EINVAL; - } - if (error) - blkdev_put(resume_bdev); - else - pr_debug("swsusp: Signature found, resuming\n"); - } else { - error = PTR_ERR(resume_bdev); - } - - if (error) - pr_debug("swsusp: Error %d check for resume file\n", error); - - return error; -} - -/** - * swsusp_close - 
close swap device. - */ - -void swsusp_close(void) -{ - if (IS_ERR(resume_bdev)) { - pr_debug("swsusp: block device not initialised\n"); - return; - } - - blkdev_put(resume_bdev); -} diff --git a/kernel/power/user.c b/kernel/power/user.c new file mode 100644 index 00000000000..3f1539fbe48 --- /dev/null +++ b/kernel/power/user.c @@ -0,0 +1,333 @@ +/* + * linux/kernel/power/user.c + * + * This file provides the user space interface for software suspend/resume. + * + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * This file is released under the GPLv2. + * + */ + +#include <linux/suspend.h> +#include <linux/syscalls.h> +#include <linux/string.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pm.h> +#include <linux/fs.h> + +#include <asm/uaccess.h> + +#include "power.h" + +#define SNAPSHOT_MINOR 231 + +static struct snapshot_data { + struct snapshot_handle handle; + int swap; + struct bitmap_page *bitmap; + int mode; + char frozen; + char ready; +} snapshot_state; + +static atomic_t device_available = ATOMIC_INIT(1); + +static int snapshot_open(struct inode *inode, struct file *filp) +{ + struct snapshot_data *data; + + if (!atomic_add_unless(&device_available, -1, 0)) + return -EBUSY; + + if ((filp->f_flags & O_ACCMODE) == O_RDWR) + return -ENOSYS; + + nonseekable_open(inode, filp); + data = &snapshot_state; + filp->private_data = data; + memset(&data->handle, 0, sizeof(struct snapshot_handle)); + if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { + data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; + data->mode = O_RDONLY; + } else { + data->swap = -1; + data->mode = O_WRONLY; + } + data->bitmap = NULL; + data->frozen = 0; + data->ready = 0; + + return 0; +} + +static int snapshot_release(struct inode *inode, struct file *filp) +{ + struct snapshot_data *data; + + swsusp_free(); + data = filp->private_data; + free_all_swap_pages(data->swap, data->bitmap); + free_bitmap(data->bitmap); + if (data->frozen) { + down(&pm_sem); + thaw_processes(); + enable_nonboot_cpus(); + up(&pm_sem); + } + atomic_inc(&device_available); + return 0; +} + +static ssize_t snapshot_read(struct file *filp, char __user *buf, + size_t count, loff_t *offp) +{ + struct snapshot_data *data; + ssize_t res; + + data = filp->private_data; + res = snapshot_read_next(&data->handle, count); + if (res > 0) { + if (copy_to_user(buf, data_of(data->handle), res)) + res = -EFAULT; + else + *offp = data->handle.offset; + } + return res; +} + +static ssize_t snapshot_write(struct file *filp, const char __user *buf, + size_t count, loff_t *offp) +{ + struct snapshot_data *data; + ssize_t res; + + data = filp->private_data; + res = snapshot_write_next(&data->handle, count); + if (res > 0) { + if (copy_from_user(data_of(data->handle), buf, res)) + res = -EFAULT; + else + *offp = data->handle.offset; + } + return res; +} + +static int snapshot_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int error = 0; + struct snapshot_data *data; + loff_t offset, avail; + + if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) + return -ENOTTY; + if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) + return -ENOTTY; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + data = filp->private_data; + + switch (cmd) { + + case SNAPSHOT_FREEZE: + if (data->frozen) + break; + down(&pm_sem); + disable_nonboot_cpus(); + if (freeze_processes()) { + thaw_processes(); + enable_nonboot_cpus(); + error = -EBUSY; 
+ } + up(&pm_sem); + if (!error) + data->frozen = 1; + break; + + case SNAPSHOT_UNFREEZE: + if (!data->frozen) + break; + down(&pm_sem); + thaw_processes(); + enable_nonboot_cpus(); + up(&pm_sem); + data->frozen = 0; + break; + + case SNAPSHOT_ATOMIC_SNAPSHOT: + if (data->mode != O_RDONLY || !data->frozen || data->ready) { + error = -EPERM; + break; + } + down(&pm_sem); + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (!error) { + error = device_suspend(PMSG_FREEZE); + if (!error) { + in_suspend = 1; + error = swsusp_suspend(); + device_resume(); + } + } + up(&pm_sem); + if (!error) + error = put_user(in_suspend, (unsigned int __user *)arg); + if (!error) + data->ready = 1; + break; + + case SNAPSHOT_ATOMIC_RESTORE: + if (data->mode != O_WRONLY || !data->frozen || + !snapshot_image_loaded(&data->handle)) { + error = -EPERM; + break; + } + down(&pm_sem); + pm_prepare_console(); + error = device_suspend(PMSG_FREEZE); + if (!error) { + error = swsusp_resume(); + device_resume(); + } + pm_restore_console(); + up(&pm_sem); + break; + + case SNAPSHOT_FREE: + swsusp_free(); + memset(&data->handle, 0, sizeof(struct snapshot_handle)); + data->ready = 0; + break; + + case SNAPSHOT_SET_IMAGE_SIZE: + image_size = arg; + break; + + case SNAPSHOT_AVAIL_SWAP: + avail = count_swap_pages(data->swap, 1); + avail <<= PAGE_SHIFT; + error = put_user(avail, (loff_t __user *)arg); + break; + + case SNAPSHOT_GET_SWAP_PAGE: + if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { + error = -ENODEV; + break; + } + if (!data->bitmap) { + data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0)); + if (!data->bitmap) { + error = -ENOMEM; + break; + } + } + offset = alloc_swap_page(data->swap, data->bitmap); + if (offset) { + offset <<= PAGE_SHIFT; + error = put_user(offset, (loff_t __user *)arg); + } else { + error = -ENOSPC; + } + break; + + case SNAPSHOT_FREE_SWAP_PAGES: + if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { + error = -ENODEV; + break; + } + free_all_swap_pages(data->swap, data->bitmap); + free_bitmap(data->bitmap); + data->bitmap = NULL; + break; + + case SNAPSHOT_SET_SWAP_FILE: + if (!data->bitmap) { + /* + * User space encodes device types as two-byte values, + * so we need to recode them + */ + if (old_decode_dev(arg)) { + data->swap = swap_type_of(old_decode_dev(arg)); + if (data->swap < 0) + error = -ENODEV; + } else { + data->swap = -1; + error = -EINVAL; + } + } else { + error = -EPERM; + } + break; + + case SNAPSHOT_S2RAM: + if (!data->frozen) { + error = -EPERM; + break; + } + + if (down_trylock(&pm_sem)) { + error = -EBUSY; + break; + } + + if (pm_ops->prepare) { + error = pm_ops->prepare(PM_SUSPEND_MEM); + if (error) + goto OutS3; + } + + /* Put devices to sleep */ + error = device_suspend(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "Failed to suspend some devices.\n"); + } else { + /* Enter S3, system is already frozen */ + suspend_enter(PM_SUSPEND_MEM); + + /* Wake up devices */ + device_resume(); + } + + if (pm_ops->finish) + pm_ops->finish(PM_SUSPEND_MEM); + +OutS3: + up(&pm_sem); + break; + + default: + error = -ENOTTY; + + } + + return error; +} + +static struct file_operations snapshot_fops = { + .open = snapshot_open, + .release = snapshot_release, + .read = snapshot_read, + .write = snapshot_write, + .llseek = no_llseek, + .ioctl = snapshot_ioctl, +}; + +static struct miscdevice snapshot_device = { + .minor = SNAPSHOT_MINOR, + .name = "snapshot", + .fops = &snapshot_fops, +}; + +static int __init snapshot_device_init(void) +{ + 
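The ioctl interface above is easiest to see from user space. A rough sketch of a suspend tool writing the image out follows; it assumes the SNAPSHOT_* ioctl numbers declared in kernel/power/power.h, a 4096-byte page size, and omits error handling:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int save_image_to_fd(int out_fd)
{
	char buf[4096];				/* assumed PAGE_SIZE */
	int in_suspend = 0;
	ssize_t n;
	int dev = open("/dev/snapshot", O_RDONLY);

	if (dev < 0)
		return -1;
	ioctl(dev, SNAPSHOT_FREEZE, 0);			/* freeze user space */
	ioctl(dev, SNAPSHOT_ATOMIC_SNAPSHOT, &in_suspend);
	if (in_suspend)					/* image was just created */
		while ((n = read(dev, buf, sizeof(buf))) > 0)
			write(out_fd, buf, n);
	ioctl(dev, SNAPSHOT_UNFREEZE, 0);
	close(dev);
	return 0;
}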
return misc_register(&snapshot_device); +}; + +device_initcall(snapshot_device_init); diff --git a/kernel/profile.c b/kernel/profile.c index f89248e6d70..ad81f799a9b 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -23,6 +23,7 @@ #include <linux/cpu.h> #include <linux/profile.h> #include <linux/highmem.h> +#include <linux/mutex.h> #include <asm/sections.h> #include <asm/semaphore.h> @@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); -static DECLARE_MUTEX(profile_flip_mutex); +static DEFINE_MUTEX(profile_flip_mutex); #endif /* CONFIG_SMP */ static int __init profile_setup(char * str) @@ -243,7 +244,7 @@ static void profile_flip_buffers(void) { int i, j, cpu; - down(&profile_flip_mutex); + mutex_lock(&profile_flip_mutex); j = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); on_each_cpu(__profile_flip_buffers, NULL, 0, 1); @@ -259,14 +260,14 @@ static void profile_flip_buffers(void) hits[i].hits = hits[i].pc = 0; } } - up(&profile_flip_mutex); + mutex_unlock(&profile_flip_mutex); } static void profile_discard_flip_buffers(void) { int i, cpu; - down(&profile_flip_mutex); + mutex_lock(&profile_flip_mutex); i = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); on_each_cpu(__profile_flip_buffers, NULL, 0, 1); @@ -274,7 +275,7 @@ static void profile_discard_flip_buffers(void) struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); } - up(&profile_flip_mutex); + mutex_unlock(&profile_flip_mutex); } void profile_hit(int type, void *__pc) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index fedf5e36975..6df1559b1c0 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -47,15 +47,16 @@ #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/cpu.h> +#include <linux/mutex.h> /* Definition for rcupdate control block. 
*/ -struct rcu_ctrlblk rcu_ctrlblk = { +static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, .lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE, }; -struct rcu_ctrlblk rcu_bh_ctrlblk = { +static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, .lock = SPIN_LOCK_UNLOCKED, @@ -75,7 +76,7 @@ static int rsinterval = 1000; #endif static atomic_t rcu_barrier_cpu_count; -static struct semaphore rcu_barrier_sema; +static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; #ifdef CONFIG_SMP @@ -207,13 +208,13 @@ static void rcu_barrier_func(void *notused) void rcu_barrier(void) { BUG_ON(in_interrupt()); - /* Take cpucontrol semaphore to protect against CPU hotplug */ - down(&rcu_barrier_sema); + /* Take cpucontrol mutex to protect against CPU hotplug */ + mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); atomic_set(&rcu_barrier_cpu_count, 0); on_each_cpu(rcu_barrier_func, NULL, 0, 1); wait_for_completion(&rcu_barrier_completion); - up(&rcu_barrier_sema); + mutex_unlock(&rcu_barrier_mutex); } EXPORT_SYMBOL_GPL(rcu_barrier); @@ -549,7 +550,6 @@ static struct notifier_block __devinitdata rcu_nb = { */ void __init rcu_init(void) { - sema_init(&rcu_barrier_sema, 1); rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/kernel/sched.c b/kernel/sched.c index 6b6e0d70eb3..7ffaabd64f8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -237,6 +237,7 @@ struct runqueue { task_t *migration_thread; struct list_head migration_queue; + int cpu; #endif #ifdef CONFIG_SCHEDSTATS @@ -1654,6 +1655,9 @@ unsigned long nr_iowait(void) /* * double_rq_lock - safely lock two runqueues * + * We must take them in cpu order to match code in + * dependent_sleeper and wake_dependent_sleeper. + * * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. 
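Reduced to its essentials (an illustrative sketch, not the patch's code), the cpu-ordering rule described above and applied in double_rq_lock() and double_lock_balance() below avoids the classic ABBA deadlock by making both lock takers agree on one order:

static void lock_two_runqueues(runqueue_t *rq1, runqueue_t *rq2)
{
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
		return;
	}
	/* always take the lower-numbered cpu's lock first */
	if (rq1->cpu < rq2->cpu) {
		spin_lock(&rq1->lock);
		spin_lock(&rq2->lock);
	} else {
		spin_lock(&rq2->lock);
		spin_lock(&rq1->lock);
	}
}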
*/ @@ -1665,7 +1669,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { - if (rq1 < rq2) { + if (rq1->cpu < rq2->cpu) { spin_lock(&rq1->lock); spin_lock(&rq2->lock); } else { @@ -1701,7 +1705,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) __acquires(this_rq->lock) { if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { + if (busiest->cpu < this_rq->cpu) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); @@ -2869,7 +2873,7 @@ asmlinkage void __sched schedule(void) */ if (likely(!current->exit_state)) { if (unlikely(in_atomic())) { - printk(KERN_ERR "scheduling while atomic: " + printk(KERN_ERR "BUG: scheduling while atomic: " "%s/0x%08x/%d\n", current->comm, preempt_count(), current->pid); dump_stack(); @@ -6029,6 +6033,7 @@ void __init sched_init(void) rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); + rq->cpu = i; #endif atomic_set(&rq->nr_iowait, 0); @@ -6069,7 +6074,7 @@ void __might_sleep(char *file, int line) if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; - printk(KERN_ERR "Debug: sleeping function called from invalid" + printk(KERN_ERR "BUG: sleeping function called from invalid" " context at %s:%d\n", file, line); printk("in_atomic():%d, irqs_disabled():%d\n", in_atomic(), irqs_disabled()); diff --git a/kernel/signal.c b/kernel/signal.c index ea154104a00..75f7341b0c3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1922,6 +1922,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, sigset_t *mask = ¤t->blocked; int signr = 0; + try_to_freeze(); + relock: spin_lock_irq(¤t->sighand->siglock); for (;;) { @@ -2099,10 +2101,11 @@ long do_no_restart_syscall(struct restart_block *param) int sigprocmask(int how, sigset_t *set, sigset_t *oldset) { int error; - sigset_t old_block; spin_lock_irq(¤t->sighand->siglock); - old_block = current->blocked; + if (oldset) + *oldset = current->blocked; + error = 0; switch (how) { case SIG_BLOCK: @@ -2119,8 +2122,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) } recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (oldset) - *oldset = old_block; + return error; } @@ -2307,7 +2309,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, timeout = schedule_timeout_interruptible(timeout); - try_to_freeze(); spin_lock_irq(¤t->sighand->siglock); sig = dequeue_signal(current, &these, &info); current->blocked = current->real_blocked; diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 0375fcd5921..d1b810782bc 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock); #define BUILD_LOCK_OPS(op, locktype) \ void __lockfunc _##op##_lock(locktype##_t *lock) \ { \ - preempt_disable(); \ for (;;) { \ + preempt_disable(); \ if (likely(_raw_##op##_trylock(lock))) \ break; \ preempt_enable(); \ + \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ cpu_relax(); \ - preempt_disable(); \ } \ (lock)->break_lock = 0; \ } \ @@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ - preempt_disable(); \ for (;;) { \ + preempt_disable(); \ local_irq_save(flags); \ if (likely(_raw_##op##_trylock(lock))) \ break; \ local_irq_restore(flags); \ - \ preempt_enable(); \ + \ if (!(lock)->break_lock) \ 
(lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ cpu_relax(); \ - preempt_disable(); \ } \ (lock)->break_lock = 0; \ return flags; \ diff --git a/kernel/sys.c b/kernel/sys.c index f91218a5463..c0fcad9f826 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1227,7 +1227,7 @@ asmlinkage long sys_setsid(void) struct pid *pid; int err = -EPERM; - down(&tty_sem); + mutex_lock(&tty_mutex); write_lock_irq(&tasklist_lock); pid = find_pid(PIDTYPE_PGID, group_leader->pid); @@ -1241,7 +1241,7 @@ asmlinkage long sys_setsid(void) err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); - up(&tty_sem); + mutex_unlock(&tty_mutex); return err; } @@ -1677,9 +1677,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) * a lot simpler! (Which we're not doing right now because we're not * measuring them yet). * - * This expects to be called with tasklist_lock read-locked or better, - * and the siglock not locked. It may momentarily take the siglock. - * * When sampling multiple threads for RUSAGE_SELF, under SMP we might have * races with threads incrementing their own counters. But since word * reads are atomic, we either get new values or old values and we don't @@ -1687,6 +1684,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) * the c* fields from p->signal from races with exit.c updating those * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. + * + * tasklist_lock locking optimisation: + * If we are current and single threaded, we do not need to take the tasklist + * lock or the siglock. No one else can take our signal_struct away, + * no one else can reap the children to update signal->c* counters, and + * no one else can race with the signal-> fields. + * If we do not take the tasklist_lock, the signal-> fields could be read + * out of order while another thread was just exiting. So we place a + * read memory barrier when we avoid the lock. On the writer side, + * write memory barrier is implied in __exit_signal as __exit_signal releases + * the siglock spinlock after updating the signal-> fields. + * + * We don't really need the siglock when we access the non c* fields + * of the signal_struct (for RUSAGE_SELF) even in multithreaded + * case, since we take the tasklist lock for read and the non c* signal-> + * fields are updated only in __exit_signal, which is called with + * tasklist_lock taken for write, hence these two threads cannot execute + * concurrently. 
+ * */ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) @@ -1694,13 +1710,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; + int need_lock = 0; memset((char *) r, 0, sizeof *r); + utime = stime = cputime_zero; - if (unlikely(!p->signal)) - return; + if (p != current || !thread_group_empty(p)) + need_lock = 1; - utime = stime = cputime_zero; + if (need_lock) { + read_lock(&tasklist_lock); + if (unlikely(!p->signal)) { + read_unlock(&tasklist_lock); + return; + } + } else + /* See locking comments above */ + smp_rmb(); switch (who) { case RUSAGE_BOTH: @@ -1740,6 +1766,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) BUG(); } + if (need_lock) + read_unlock(&tasklist_lock); cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); } @@ -1747,9 +1775,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) int getrusage(struct task_struct *p, int who, struct rusage __user *ru) { struct rusage r; - read_lock(&tasklist_lock); k_getrusage(p, who, &r); - read_unlock(&tasklist_lock); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } |
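The tasklist_lock optimisation described above reduces to the following pattern (an editorial sketch with the statistics gathering elided, not the patch's own code):

static void sample_signal_stats(struct task_struct *p)
{
	int need_lock = (p != current || !thread_group_empty(p));

	if (need_lock) {
		read_lock(&tasklist_lock);
		if (unlikely(!p->signal)) {
			read_unlock(&tasklist_lock);
			return;
		}
	} else {
		/* pairs with the barrier implied by __exit_signal() */
		smp_rmb();
	}

	/* ... accumulate utime/stime and the signal->c* counters here ... */

	if (need_lock)
		read_unlock(&tasklist_lock);
}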