aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt4
-rw-r--r--kernel/configs.c15
-rw-r--r--kernel/cpu.c66
-rw-r--r--kernel/cpuset.c7
-rw-r--r--kernel/exit.c18
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/futex.c988
-rw-r--r--kernel/futex_compat.c22
-rw-r--r--kernel/hrtimer.c2
-rw-r--r--kernel/irq/handle.c1
-rw-r--r--kernel/kmod.c6
-rw-r--r--kernel/kthread.c113
-rw-r--r--kernel/module.c10
-rw-r--r--kernel/mutex.c8
-rw-r--r--kernel/power/disk.c195
-rw-r--r--kernel/power/main.c42
-rw-r--r--kernel/power/power.h7
-rw-r--r--kernel/power/snapshot.c14
-rw-r--r--kernel/power/user.c13
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/rcupdate.c2
-rw-r--r--kernel/relay.c37
-rw-r--r--kernel/rtmutex.c41
-rw-r--r--kernel/rtmutex_common.h34
-rw-r--r--kernel/sched.c72
-rw-r--r--kernel/signal.c140
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/softlockup.c4
-rw-r--r--kernel/sys.c98
-rw-r--r--kernel/sysctl.c12
-rw-r--r--kernel/time/clocksource.c51
-rw-r--r--kernel/time/timer_list.c25
-rw-r--r--kernel/timer.c14
-rw-r--r--kernel/wait.c2
-rw-r--r--kernel/workqueue.c783
35 files changed, 1738 insertions, 1120 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0b46a5dff4c..c64ce9c1420 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -23,7 +23,7 @@ config PREEMPT_VOLUNTARY
"explicit preemption points" to the kernel code. These new
preemption points have been selected to reduce the maximum
latency of rescheduling, providing faster application reactions,
- at the cost of slighly lower throughput.
+ at the cost of slightly lower throughput.
This allows reaction to interactive events by allowing a
low priority process to voluntarily preempt itself even if it
@@ -43,7 +43,7 @@ config PREEMPT
even if it is in kernel mode executing a system call and would
otherwise not be about to reach a natural preemption point.
This allows applications to run more 'smoothly' even when the
- system is under load, at the cost of slighly lower throughput
+ system is under load, at the cost of slightly lower throughput
and a slight runtime overhead to kernel code.
Select this if you are building a kernel for a desktop or
diff --git a/kernel/configs.c b/kernel/configs.c
index 8fa1fb28f8a..e84d3f9c6c7 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -61,18 +61,9 @@ static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
- loff_t pos = *offset;
- ssize_t count;
-
- if (pos >= kernel_config_data_size)
- return 0;
-
- count = min(len, (size_t)(kernel_config_data_size - pos));
- if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
- return -EFAULT;
-
- *offset += count;
- return count;
+ return simple_read_from_buffer(buf, len, offset,
+ kernel_config_data + MAGIC_SIZE,
+ kernel_config_data_size);
}
static const struct file_operations ikconfig_file_ops = {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 36e70845cfc..208cf3497c1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -97,7 +97,7 @@ static inline void check_for_tasks(int cpu)
(!cputime_eq(p->utime, cputime_zero) ||
!cputime_eq(p->stime, cputime_zero)))
printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
- (state = %ld, flags = %lx) \n",
+ (state = %ld, flags = %x) \n",
p->comm, p->pid, cpu, p->state, p->flags);
}
write_unlock_irq(&tasklist_lock);
@@ -120,11 +120,13 @@ static int take_cpu_down(void *unused)
}
/* Requires cpu_add_remove_lock to be held */
-static int _cpu_down(unsigned int cpu)
+static int _cpu_down(unsigned int cpu, int tasks_frozen)
{
- int err;
+ int err, nr_calls = 0;
struct task_struct *p;
cpumask_t old_allowed, tmp;
+ void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
if (num_online_cpus() == 1)
return -EBUSY;
@@ -132,12 +134,16 @@ static int _cpu_down(unsigned int cpu)
if (!cpu_online(cpu))
return -EINVAL;
- err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
- (void *)(long)cpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
+ err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
+ hcpu, -1, &nr_calls);
if (err == NOTIFY_BAD) {
+ __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
+ hcpu, nr_calls, NULL);
printk("%s: attempt to take down CPU %u failed\n",
__FUNCTION__, cpu);
- return -EINVAL;
+ err = -EINVAL;
+ goto out_release;
}
/* Ensure that we are not runnable on dying cpu */
@@ -152,8 +158,8 @@ static int _cpu_down(unsigned int cpu)
if (IS_ERR(p) || cpu_online(cpu)) {
/* CPU didn't die: tell everyone. Can't complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
- (void *)(long)cpu) == NOTIFY_BAD)
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
+ hcpu) == NOTIFY_BAD)
BUG();
if (IS_ERR(p)) {
@@ -170,13 +176,9 @@ static int _cpu_down(unsigned int cpu)
/* This actually kills the CPU. */
__cpu_die(cpu);
- /* Move it here so it can run. */
- kthread_bind(p, get_cpu());
- put_cpu();
-
/* CPU is completely dead: tell everyone. Too late to complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD,
- (void *)(long)cpu) == NOTIFY_BAD)
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
+ hcpu) == NOTIFY_BAD)
BUG();
check_for_tasks(cpu);
@@ -185,6 +187,8 @@ out_thread:
err = kthread_stop(p);
out_allowed:
set_cpus_allowed(current, old_allowed);
+out_release:
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
return err;
}
@@ -196,7 +200,7 @@ int cpu_down(unsigned int cpu)
if (cpu_hotplug_disabled)
err = -EBUSY;
else
- err = _cpu_down(cpu);
+ err = _cpu_down(cpu, 0);
mutex_unlock(&cpu_add_remove_lock);
return err;
@@ -204,15 +208,18 @@ int cpu_down(unsigned int cpu)
#endif /*CONFIG_HOTPLUG_CPU*/
/* Requires cpu_add_remove_lock to be held */
-static int __cpuinit _cpu_up(unsigned int cpu)
+static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
{
- int ret;
+ int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;
- ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
+ ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
+ -1, &nr_calls);
if (ret == NOTIFY_BAD) {
printk("%s: attempt to bring up CPU %u failed\n",
__FUNCTION__, cpu);
@@ -229,12 +236,13 @@ static int __cpuinit _cpu_up(unsigned int cpu)
BUG_ON(!cpu_online(cpu));
/* Now call notifier in preparation. */
- raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
- raw_notifier_call_chain(&cpu_chain,
- CPU_UP_CANCELED, hcpu);
+ __raw_notifier_call_chain(&cpu_chain,
+ CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
return ret;
}
@@ -247,19 +255,13 @@ int __cpuinit cpu_up(unsigned int cpu)
if (cpu_hotplug_disabled)
err = -EBUSY;
else
- err = _cpu_up(cpu);
+ err = _cpu_up(cpu, 0);
mutex_unlock(&cpu_add_remove_lock);
return err;
}
#ifdef CONFIG_SUSPEND_SMP
-/* Needed to prevent the microcode driver from requesting firmware in its CPU
- * hotplug notifier during the suspend/resume.
- */
-int suspend_cpu_hotplug;
-EXPORT_SYMBOL(suspend_cpu_hotplug);
-
static cpumask_t frozen_cpus;
int disable_nonboot_cpus(void)
@@ -267,7 +269,6 @@ int disable_nonboot_cpus(void)
int cpu, first_cpu, error = 0;
mutex_lock(&cpu_add_remove_lock);
- suspend_cpu_hotplug = 1;
first_cpu = first_cpu(cpu_online_map);
/* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@@ -277,7 +278,7 @@ int disable_nonboot_cpus(void)
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
- error = _cpu_down(cpu);
+ error = _cpu_down(cpu, 1);
if (!error) {
cpu_set(cpu, frozen_cpus);
printk("CPU%d is down\n", cpu);
@@ -294,7 +295,6 @@ int disable_nonboot_cpus(void)
} else {
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
- suspend_cpu_hotplug = 0;
mutex_unlock(&cpu_add_remove_lock);
return error;
}
@@ -309,10 +309,9 @@ void enable_nonboot_cpus(void)
if (cpus_empty(frozen_cpus))
goto out;
- suspend_cpu_hotplug = 1;
printk("Enabling non-boot CPUs ...\n");
for_each_cpu_mask(cpu, frozen_cpus) {
- error = _cpu_up(cpu);
+ error = _cpu_up(cpu, 1);
if (!error) {
printk("CPU%d is up\n", cpu);
continue;
@@ -320,7 +319,6 @@ void enable_nonboot_cpus(void)
printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
}
cpus_clear(frozen_cpus);
- suspend_cpu_hotplug = 0;
out:
mutex_unlock(&cpu_add_remove_lock);
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 88b416dfbc7..f57854b0892 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1772,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
{
struct ctr_struct *ctr = file->private_data;
- if (*ppos + nbytes > ctr->bufsz)
- nbytes = ctr->bufsz - *ppos;
- if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
- return -EFAULT;
- *ppos += nbytes;
- return nbytes;
+ return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
}
static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
diff --git a/kernel/exit.c b/kernel/exit.c
index f5a7abb621f..b0c6f0c3a2d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -26,6 +26,7 @@
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
+#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
@@ -254,26 +255,25 @@ static int has_stopped_jobs(struct pid *pgrp)
}
/**
- * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to.
+ * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
*
* If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to init so that
- * it is correctly cleaned up on exit.
+ * it ever exits, it should generally reparent itself to kthreadd so it
+ * isn't in the way of other processes and is correctly cleaned up on exit.
*
* The various task state such as scheduling policy and priority may have
* been inherited from a user process, so we reset them to sane values here.
*
- * NOTE that reparent_to_init() gives the caller full capabilities.
+ * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
*/
-static void reparent_to_init(void)
+static void reparent_to_kthreadd(void)
{
write_lock_irq(&tasklist_lock);
ptrace_unlink(current);
/* Reparent to init */
remove_parent(current);
- current->parent = child_reaper(current);
- current->real_parent = child_reaper(current);
+ current->real_parent = current->parent = kthreadd_task;
add_parent(current);
/* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -347,7 +347,7 @@ int disallow_signal(int sig)
return -EINVAL;
spin_lock_irq(&current->sighand->siglock);
- sigaddset(&current->blocked, sig);
+ current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
return 0;
@@ -400,7 +400,7 @@ void daemonize(const char *name, ...)
current->files = init_task.files;
atomic_inc(&current->files->count);
- reparent_to_init();
+ reparent_to_kthreadd();
}
EXPORT_SYMBOL(daemonize);
diff --git a/kernel/fork.c b/kernel/fork.c
index a8dd75d4992..5dd3979747f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,7 +105,7 @@ static struct kmem_cache *mm_cachep;
void free_task(struct task_struct *tsk)
{
- free_thread_info(tsk->thread_info);
+ free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
free_task_struct(tsk);
}
@@ -175,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
}
*tsk = *orig;
- tsk->thread_info = ti;
+ tsk->stack = ti;
setup_thread_stack(tsk, orig);
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/futex.c b/kernel/futex.c
index 600bc9d801f..b7ce15c67e3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
*
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -53,6 +56,12 @@
#include "rtmutex_common.h"
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
@@ -81,12 +90,12 @@ struct futex_pi_state {
* we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
- * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
* The order of wakup is always to make the first condition true, then
* wake up q->waiters, then make the second condition true.
*/
struct futex_q {
- struct list_head list;
+ struct plist_node list;
wait_queue_head_t waiters;
/* Which hash list lock to use: */
@@ -102,14 +111,20 @@ struct futex_q {
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
struct task_struct *task;
+
+ /*
+ * This waiter is used in case of requeue from a
+ * normal futex to a PI-futex
+ */
+ struct rt_mutex_waiter waiter;
};
/*
* Split the global futex_lock into every hash list lock.
*/
struct futex_hash_bucket {
- spinlock_t lock;
- struct list_head chain;
+ spinlock_t lock;
+ struct plist_head chain;
};
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -138,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
&& key1->both.offset == key2->both.offset);
}
-/*
- * Get parameters which are the keys for a futex.
+/**
+ * get_futex_key - Get parameters which are the keys for a futex.
+ * @uaddr: virtual address of the futex
+ * @shared: NULL for a PROCESS_PRIVATE futex,
+ * &current->mm->mmap_sem for a PROCESS_SHARED futex
+ * @key: address where result is stored.
+ *
+ * Returns a negative error code or 0
+ * The key words are stored in *key on success.
*
* For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
- * Returns: 0, or negative error code.
- * The key words are stored in *key on success.
- *
- * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
+ * fshared is NULL for PROCESS_PRIVATE futexes
+ * For other futexes, it points to &current->mm->mmap_sem and
+ * caller must have taken the reader lock. but NOT any spinlocks.
*/
-int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
+ union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -162,11 +184,25 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
- if (unlikely((key->both.offset % sizeof(u32)) != 0))
+ if (unlikely((address % sizeof(u32)) != 0))
return -EINVAL;
address -= key->both.offset;
/*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we dont even have to find the underlying vma.
+ * Note : We do have to check 'uaddr' is a valid user address,
+ * but access_ok() should be faster than find_vma()
+ */
+ if (!fshared) {
+ if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+ return -EFAULT;
+ key->private.mm = mm;
+ key->private.address = address;
+ return 0;
+ }
+ /*
* The futex is hashed differently depending on whether
* it's in a shared or private mapping. So check vma first.
*/
@@ -180,6 +216,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+ /* Save the user address in the ley */
+ key->uaddr = uaddr;
+
/*
* Private mappings are handled in a simple way.
*
@@ -190,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
* mappings of _writable_ handles.
*/
if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+ key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
key->private.mm = mm;
key->private.address = address;
return 0;
@@ -199,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
* Linear file mappings are also simple.
*/
key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
- key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
+ vma->vm_pgoff);
@@ -227,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);
* Take a reference to the resource addressed by a key.
* Can be called while holding spinlocks.
*
- * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
- * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
*/
inline void get_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr != 0) {
- if (key->both.offset & 1)
+ if (key->both.ptr == 0)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
atomic_inc(&key->shared.inode->i_count);
- else
+ break;
+ case FUT_OFF_MMSHARED:
atomic_inc(&key->private.mm->mm_count);
+ break;
}
}
EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -247,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
*/
void drop_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr != 0) {
- if (key->both.offset & 1)
+ if (key->both.ptr == 0)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
iput(key->shared.inode);
- else
+ break;
+ case FUT_OFF_MMSHARED:
mmdrop(key->private.mm);
+ break;
}
}
EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -268,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
}
/*
- * Fault handling. Called with current->mm->mmap_sem held.
+ * Fault handling.
+ * if fshared is non NULL, current->mm->mmap_sem is already held
*/
-static int futex_handle_fault(unsigned long address, int attempt)
+static int futex_handle_fault(unsigned long address,
+ struct rw_semaphore *fshared, int attempt)
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
+ int ret = -EFAULT;
- if (attempt > 2 || !(vma = find_vma(mm, address)) ||
- vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
- return -EFAULT;
+ if (attempt > 2)
+ return ret;
- switch (handle_mm_fault(mm, vma, address, 1)) {
- case VM_FAULT_MINOR:
- current->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- current->maj_flt++;
- break;
- default:
- return -EFAULT;
+ if (!fshared)
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ if (vma && address >= vma->vm_start &&
+ (vma->vm_flags & VM_WRITE)) {
+ switch (handle_mm_fault(mm, vma, address, 1)) {
+ case VM_FAULT_MINOR:
+ ret = 0;
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ ret = 0;
+ current->maj_flt++;
+ break;
+ }
}
- return 0;
+ if (!fshared)
+ up_read(&mm->mmap_sem);
+ return ret;
}
/*
@@ -439,18 +495,19 @@ void exit_pi_state_list(struct task_struct *curr)
}
static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
- struct list_head *head;
+ struct plist_head *head;
struct task_struct *p;
pid_t pid;
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
- if (match_futex(&this->key, &me->key)) {
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex(&this->key, key)) {
/*
* Another waiter already exists - bump up
* the refcount and return its pi_state:
@@ -465,7 +522,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
WARN_ON(!atomic_read(&pi_state->refcount));
atomic_inc(&pi_state->refcount);
- me->pi_state = pi_state;
+ *ps = pi_state;
return 0;
}
@@ -492,7 +549,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
/* Store the key for possible exit cleanups: */
- pi_state->key = me->key;
+ pi_state->key = *key;
spin_lock_irq(&p->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
@@ -502,7 +559,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
put_task_struct(p);
- me->pi_state = pi_state;
+ *ps = pi_state;
return 0;
}
@@ -513,12 +570,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
*/
static void wake_futex(struct futex_q *q)
{
- list_del_init(&q->list);
+ plist_del(&q->list, &q->list.plist);
if (q->filp)
send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
/*
* The lock in wake_up_all() is a crucial memory barrier after the
- * list_del_init() and also before assigning to q->lock_ptr.
+ * plist_del() and also before assigning to q->lock_ptr.
*/
wake_up_all(&q->waiters);
/*
@@ -562,6 +619,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
*/
if (!(uval & FUTEX_OWNER_DIED)) {
newval = FUTEX_WAITERS | new_owner->pid;
+ /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -629,17 +688,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
-static int futex_wake(u32 __user *uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
+ int nr_wake)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct list_head *head;
+ struct plist_head *head;
union futex_key key;
int ret;
- down_read(&current->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &key);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -647,7 +708,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
spin_lock(&hb->lock);
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
if (this->pi_state) {
ret = -EINVAL;
@@ -661,7 +722,261 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
spin_unlock(&hb->lock);
out:
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
+ return ret;
+}
+
+/*
+ * Called from futex_requeue_pi.
+ * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
+ * PI-futex value; search its associated pi_state if an owner exist
+ * or create a new one without owner.
+ */
+static inline int
+lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **pi_state)
+{
+ u32 curval, uval, newval;
+
+retry:
+ /*
+ * We can't handle a fault cleanly because we can't
+ * release the locks here. Simply return the fault.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
+ return -EFAULT;
+
+ /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
+ if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
+ != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
+ /*
+ * No waiters yet, we prepare the futex to have some waiters.
+ */
+
+ uval = curval;
+ newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
+
+ pagefault_disable();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ pagefault_enable();
+
+ if (unlikely(curval == -EFAULT))
+ return -EFAULT;
+ if (unlikely(curval != uval))
+ goto retry;
+ }
+
+ if (!(curval & FUTEX_TID_MASK)
+ || lookup_pi_state(curval, hb, key, pi_state)) {
+ /* the futex has no owner (yet) or the lookup failed:
+ allocate one pi_state without owner */
+
+ *pi_state = alloc_pi_state();
+
+ /* Already stores the key: */
+ (*pi_state)->key = *key;
+
+ /* init the mutex without owner */
+ __rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
+ }
+
+ return 0;
+}
+
+/*
+ * Keep the first nr_wake waiter from futex1, wake up one,
+ * and requeue the next nr_requeue waiters following hashed on
+ * one physical page to another physical page (PI-futex uaddr2)
+ */
+static int futex_requeue_pi(u32 __user *uaddr1,
+ struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
+ int nr_wake, int nr_requeue, u32 *cmpval)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct plist_head *head1;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state2 = NULL;
+ struct rt_mutex_waiter *waiter, *top_waiter = NULL;
+ struct rt_mutex *lock2 = NULL;
+ int ret, drop_count = 0;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+retry:
+ /*
+ * First take all the futex related locks:
+ */
+ if (fshared)
+ down_read(fshared);
+
+ ret = get_futex_key(uaddr1, fshared, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, fshared, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = get_futex_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /*
+ * If we would have faulted, release mmap_sem, fault
+ * it in and start all over again.
+ */
+ if (fshared)
+ up_read(fshared);
+
+ ret = get_user(curval, uaddr1);
+
+ if (!ret)
+ goto retry;
+
+ return ret;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ head1 = &hb1->chain;
+ plist_for_each_entry_safe(this, next, head1, list) {
+ if (!match_futex (&this->key, &key1))
+ continue;
+ if (++ret <= nr_wake) {
+ wake_futex(this);
+ } else {
+ /*
+ * FIRST: get and set the pi_state
+ */
+ if (!pi_state2) {
+ int s;
+ /* do this only the first time we requeue someone */
+ s = lookup_pi_state_for_requeue(uaddr2, hb2,
+ &key2, &pi_state2);
+ if (s) {
+ ret = s;
+ goto out_unlock;
+ }
+
+ lock2 = &pi_state2->pi_mutex;
+ spin_lock(&lock2->wait_lock);
+
+ /* Save the top waiter of the wait_list */
+ if (rt_mutex_has_waiters(lock2))
+ top_waiter = rt_mutex_top_waiter(lock2);
+ } else
+ atomic_inc(&pi_state2->refcount);
+
+
+ this->pi_state = pi_state2;
+
+ /*
+ * SECOND: requeue futex_q to the correct hashbucket
+ */
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(head1 != &hb2->chain)) {
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
+ this->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ this->list.plist.lock = &hb2->lock;
+#endif
+ }
+ this->key = key2;
+ get_futex_key_refs(&key2);
+ drop_count++;
+
+
+ /*
+ * THIRD: queue it to lock2
+ */
+ spin_lock_irq(&this->task->pi_lock);
+ waiter = &this->waiter;
+ waiter->task = this->task;
+ waiter->lock = lock2;
+ plist_node_init(&waiter->list_entry, this->task->prio);
+ plist_node_init(&waiter->pi_list_entry, this->task->prio);
+ plist_add(&waiter->list_entry, &lock2->wait_list);
+ this->task->pi_blocked_on = waiter;
+ spin_unlock_irq(&this->task->pi_lock);
+
+ if (ret - nr_wake >= nr_requeue)
+ break;
+ }
+ }
+
+ /* If we've requeued some tasks and the top_waiter of the rt_mutex
+ has changed, we must adjust the priority of the owner, if any */
+ if (drop_count) {
+ struct task_struct *owner = rt_mutex_owner(lock2);
+ if (owner &&
+ (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
+ int chain_walk = 0;
+
+ spin_lock_irq(&owner->pi_lock);
+ if (top_waiter)
+ plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+ else
+ /*
+ * There was no waiters before the requeue,
+ * the flag must be updated
+ */
+ mark_rt_mutex_waiters(lock2);
+
+ plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+ __rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on) {
+ chain_walk = 1;
+ get_task_struct(owner);
+ }
+
+ spin_unlock_irq(&owner->pi_lock);
+ spin_unlock(&lock2->wait_lock);
+
+ if (chain_walk)
+ rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
+ current);
+ } else {
+ /* No owner or the top_waiter does not change */
+ mark_rt_mutex_waiters(lock2);
+ spin_unlock(&lock2->wait_lock);
+ }
+ }
+
+out_unlock:
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /* drop_futex_key_refs() must be called outside the spinlocks. */
+ while (--drop_count >= 0)
+ drop_futex_key_refs(&key1);
+
+out:
+ if (fshared)
+ up_read(fshared);
return ret;
}
@@ -670,22 +985,24 @@ out:
* to this virtual address:
*/
static int
-futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
- struct list_head *head;
+ struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
retryfull:
- down_read(&current->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr1, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
@@ -725,11 +1042,10 @@ retry:
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr2,
- attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr2,
+ fshared, attempt);
+ if (ret)
goto out;
- }
goto retry;
}
@@ -737,7 +1053,8 @@ retry:
* If we would have faulted, release mmap_sem,
* fault it in and start all over again.
*/
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(dummy, uaddr2);
if (ret)
@@ -748,7 +1065,7 @@ retry:
head = &hb1->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key1)) {
wake_futex(this);
if (++ret >= nr_wake)
@@ -760,7 +1077,7 @@ retry:
head = &hb2->chain;
op_ret = 0;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key2)) {
wake_futex(this);
if (++op_ret >= nr_wake2)
@@ -774,7 +1091,8 @@ retry:
if (hb1 != hb2)
spin_unlock(&hb2->lock);
out:
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
@@ -782,22 +1100,24 @@ out:
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
-static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
int nr_wake, int nr_requeue, u32 *cmpval)
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
- struct list_head *head1;
+ struct plist_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
retry:
- down_read(&current->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr1, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
@@ -820,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
* If we would have faulted, release mmap_sem, fault
* it in and start all over again.
*/
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(curval, uaddr1);
@@ -836,7 +1157,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
}
head1 = &hb1->chain;
- list_for_each_entry_safe(this, next, head1, list) {
+ plist_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
continue;
if (++ret <= nr_wake) {
@@ -847,9 +1168,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
* requeue.
*/
if (likely(head1 != &hb2->chain)) {
- list_move_tail(&this->list, &hb2->chain);
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
this->lock_ptr = &hb2->lock;
- }
+#ifdef CONFIG_DEBUG_PI_LIST
+ this->list.plist.lock = &hb2->lock;
+#endif
+ }
this->key = key2;
get_futex_key_refs(&key2);
drop_count++;
@@ -869,7 +1194,8 @@ out_unlock:
drop_futex_key_refs(&key1);
out:
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
@@ -894,7 +1220,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
- list_add_tail(&q->list, &hb->chain);
+ int prio;
+
+ /*
+ * The priority used to register this element is
+ * - either the real thread-priority for the real-time threads
+ * (i.e. threads with a priority lower than MAX_RT_PRIO)
+ * - or MAX_RT_PRIO for non-RT threads.
+ * Thus, all RT-threads are woken first in priority order, and
+ * the others are woken last, in FIFO order.
+ */
+ prio = min(current->normal_prio, MAX_RT_PRIO);
+
+ plist_node_init(&q->list, prio);
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb->lock;
+#endif
+ plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
}
@@ -949,8 +1291,8 @@ static int unqueue_me(struct futex_q *q)
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(list_empty(&q->list));
- list_del(&q->list);
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
BUG_ON(q->pi_state);
@@ -964,39 +1306,104 @@ static int unqueue_me(struct futex_q *q)
/*
* PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock is held on entry and dropped here.
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here.
*/
-static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+static void unqueue_me_pi(struct futex_q *q)
{
- WARN_ON(list_empty(&q->list));
- list_del(&q->list);
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
q->pi_state = NULL;
- spin_unlock(&hb->lock);
+ spin_unlock(q->lock_ptr);
drop_futex_key_refs(&q->key);
}
+/*
+ * Fixup the pi_state owner with current.
+ *
+ * The cur->mm semaphore must be held, it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
+ struct futex_q *q,
+ struct futex_hash_bucket *hb,
+ struct task_struct *curr)
+{
+ u32 newtid = curr->pid | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+ u32 uval, curval, newval;
+ int ret;
+
+ /* Owner died? */
+ if (pi_state->owner != NULL) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+ } else
+ newtid |= FUTEX_OWNER_DIED;
+
+ pi_state->owner = curr;
+
+ spin_lock_irq(&curr->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &curr->pi_state_list);
+ spin_unlock_irq(&curr->pi_lock);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(q);
+ if (fshared)
+ up_read(fshared);
+ /*
+ * We own it, so we have to replace the pending owner
+ * TID. This must be atomic as we have preserve the
+ * owner died bit here.
+ */
+ ret = get_user(uval, uaddr);
+ while (!ret) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval, newval);
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+ return ret;
+}
+
+/*
+ * In case we must use restart_block to restart a futex_wait,
+ * we encode in the 'arg3' shared capability
+ */
+#define ARG3_SHARED 1
+
static long futex_wait_restart(struct restart_block *restart);
-static int futex_wait_abstime(u32 __user *uaddr, u32 val,
- int timed, unsigned long abs_time)
+static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+ u32 val, ktime_t *abs_time)
{
struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
struct futex_hash_bucket *hb;
struct futex_q q;
- unsigned long time_left = 0;
u32 uval;
int ret;
+ struct hrtimer_sleeper t, *to = NULL;
+ int rem = 0;
q.pi_state = NULL;
retry:
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &q.key);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
@@ -1019,8 +1426,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
* a wakeup when *uaddr != val on entry to the syscall. This is
* rare, but normal.
*
- * We hold the mmap semaphore, so the mapping cannot have changed
- * since we looked it up in get_futex_key.
+ * for shared futexes, we hold the mmap semaphore, so the mapping
+ * cannot have changed since we looked it up in get_futex_key.
*/
ret = get_futex_value_locked(&uval, uaddr);
@@ -1031,7 +1438,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
* If we would have faulted, release mmap_sem, fault it in and
* start all over again.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
@@ -1043,6 +1451,14 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
if (uval != val)
goto out_unlock_release_sem;
+ /*
+ * This rt_mutex_waiter structure is prepared here and will
+ * be used only if this task is requeued from a normal futex to
+ * a PI-futex with futex_requeue_pi.
+ */
+ debug_rt_mutex_init_waiter(&q.waiter);
+ q.waiter.task = NULL;
+
/* Only actually queue if *uaddr contained val. */
__queue_me(&q, hb);
@@ -1050,7 +1466,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
/*
* There might have been scheduling since the queue_me(), as we
@@ -1065,23 +1482,33 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&q.waiters, &wait);
/*
- * !list_empty() is safe here without any lock.
+ * !plist_node_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- time_left = 0;
- if (likely(!list_empty(&q.list))) {
- unsigned long rel_time;
-
- if (timed) {
- unsigned long now = jiffies;
- if (time_after(now, abs_time))
- rel_time = 0;
- else
- rel_time = abs_time - now;
- } else
- rel_time = MAX_SCHEDULE_TIMEOUT;
+ if (likely(!plist_node_empty(&q.list))) {
+ if (!abs_time)
+ schedule();
+ else {
+ to = &t;
+ hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(&t, current);
+ t.timer.expires = *abs_time;
- time_left = schedule_timeout(rel_time);
+ hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
+
+ /*
+ * the timer could have already expired, in which
+ * case current would be flagged for rescheduling.
+ * Don't bother calling schedule.
+ */
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+
+ /* Flag if a timeout occured */
+ rem = (t.task == NULL);
+ }
}
__set_current_state(TASK_RUNNING);
@@ -1090,17 +1517,80 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
* we are the only user of it.
*/
+ if (q.pi_state) {
+ /*
+ * We were woken but have been requeued on a PI-futex.
+ * We have to complete the lock acquisition by taking
+ * the rtmutex.
+ */
+
+ struct rt_mutex *lock = &q.pi_state->pi_mutex;
+
+ spin_lock(&lock->wait_lock);
+ if (unlikely(q.waiter.task)) {
+ remove_waiter(lock, &q.waiter);
+ }
+ spin_unlock(&lock->wait_lock);
+
+ if (rem)
+ ret = -ETIMEDOUT;
+ else
+ ret = rt_mutex_timed_lock(lock, to, 1);
+
+ if (fshared)
+ down_read(fshared);
+ spin_lock(q.lock_ptr);
+
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (!ret && q.pi_state->owner != curr) {
+ /*
+ * We MUST play with the futex we were requeued on,
+ * NOT the current futex.
+ * We can retrieve it from the key of the pi_state
+ */
+ uaddr = q.pi_state->key.uaddr;
+
+ /* mmap_sem and hash_bucket lock are unlocked at
+ return of this function */
+ ret = fixup_pi_state_owner(uaddr, fshared,
+ &q, hb, curr);
+ } else {
+ /*
+ * Catch the rare case, where the lock was released
+ * when we were on the way back before we locked
+ * the hash bucket.
+ */
+ if (ret && q.pi_state->owner == curr) {
+ if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+ ret = 0;
+ }
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+ if (fshared)
+ up_read(fshared);
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
+ return ret;
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
return 0;
- if (time_left == 0)
+ if (rem)
return -ETIMEDOUT;
/*
* We expect signal_pending(current), but another thread may
* have handled it for us already.
*/
- if (time_left == MAX_SCHEDULE_TIMEOUT)
+ if (!abs_time)
return -ERESTARTSYS;
else {
struct restart_block *restart;
@@ -1108,8 +1598,10 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
restart->fn = futex_wait_restart;
restart->arg0 = (unsigned long)uaddr;
restart->arg1 = (unsigned long)val;
- restart->arg2 = (unsigned long)timed;
- restart->arg3 = abs_time;
+ restart->arg2 = (unsigned long)abs_time;
+ restart->arg3 = 0;
+ if (fshared)
+ restart->arg3 |= ARG3_SHARED;
return -ERESTART_RESTARTBLOCK;
}
@@ -1117,65 +1609,111 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
queue_unlock(&q, hb);
out_release_sem:
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
-static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
-{
- int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
- return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
-}
static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = (u32 __user *)restart->arg0;
u32 val = (u32)restart->arg1;
- int timed = (int)restart->arg2;
- unsigned long abs_time = restart->arg3;
+ ktime_t *abs_time = (ktime_t *)restart->arg2;
+ struct rw_semaphore *fshared = NULL;
restart->fn = do_no_restart_syscall;
- return (long)futex_wait_abstime(uaddr, val, timed, abs_time);
+ if (restart->arg3 & ARG3_SHARED)
+ fshared = &current->mm->mmap_sem;
+ return (long)futex_wait(uaddr, fshared, val, abs_time);
}
+static void set_pi_futex_owner(struct futex_hash_bucket *hb,
+ union futex_key *key, struct task_struct *p)
+{
+ struct plist_head *head;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex *lock;
+
+ /* Search a waiter that should already exists */
+
+ head = &hb->chain;
+
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, key)) {
+ pi_state = this->pi_state;
+ break;
+ }
+ }
+
+ BUG_ON(!pi_state);
+
+ /* set p as pi_state's owner */
+ lock = &pi_state->pi_mutex;
+
+ spin_lock(&lock->wait_lock);
+ spin_lock_irq(&p->pi_lock);
+
+ list_add(&pi_state->list, &p->pi_state_list);
+ pi_state->owner = p;
+
+
+ /* set p as pi_mutex's owner */
+ debug_rt_mutex_proxy_lock(lock, p);
+ WARN_ON(rt_mutex_owner(lock));
+ rt_mutex_set_owner(lock, p, 0);
+ rt_mutex_deadlock_account_lock(lock, p);
+
+ plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
+ &p->pi_waiters);
+ __rt_mutex_adjust_prio(p);
+
+ spin_unlock_irq(&p->pi_lock);
+ spin_unlock(&lock->wait_lock);
+}
+
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
- long nsec, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+ int detect, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct task_struct *curr = current;
struct futex_hash_bucket *hb;
u32 uval, newval, curval;
struct futex_q q;
- int ret, attempt = 0;
+ int ret, lock_held, attempt = 0;
if (refill_pi_state_cache())
return -ENOMEM;
- if (sec != MAX_SCHEDULE_TIMEOUT) {
+ if (time) {
to = &timeout;
hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
- to->timer.expires = ktime_set(sec, nsec);
+ to->timer.expires = *time;
}
q.pi_state = NULL;
retry:
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &q.key);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
hb = queue_lock(&q, -1, NULL);
retry_locked:
+ lock_held = 0;
+
/*
* To avoid races, we attempt to take the lock here again
* (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1194,7 +1732,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
if (!detect && 0)
force_sig(SIGKILL, current);
- ret = -EDEADLK;
+ /*
+ * Normally, this check is done in user space.
+ * In case of requeue, the owner may attempt to lock this futex,
+ * even if the ownership has already been given by the previous
+ * waker.
+ * In the usual case, this is a case of deadlock, but not in case
+ * of REQUEUE_PI.
+ */
+ if (!(curval & FUTEX_WAITER_REQUEUED))
+ ret = -EDEADLK;
goto out_unlock_release_sem;
}
@@ -1206,7 +1753,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
goto out_unlock_release_sem;
uval = curval;
- newval = uval | FUTEX_WAITERS;
+ /*
+ * In case of a requeue, check if there already is an owner
+ * If not, just take the futex.
+ */
+ if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
+ /* set current as futex owner */
+ newval = curval | current->pid;
+ lock_held = 1;
+ } else
+ /* Set the WAITERS flag, so the owner will know it has someone
+ to wake at next unlock */
+ newval = curval | FUTEX_WAITERS;
pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1217,11 +1775,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
if (unlikely(curval != uval))
goto retry_locked;
+ if (lock_held) {
+ set_pi_futex_owner(hb, &q.key, curr);
+ goto out_unlock_release_sem;
+ }
+
/*
* We dont have the lock. Look up the PI state (or create it if
* we are the first waiter):
*/
- ret = lookup_pi_state(uval, hb, &q);
+ ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
if (unlikely(ret)) {
/*
@@ -1263,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
WARN_ON(!q.pi_state);
/*
@@ -1277,52 +1841,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
ret = ret ? 0 : -EWOULDBLOCK;
}
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
spin_lock(q.lock_ptr);
/*
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case.
*/
- if (!ret && q.pi_state->owner != curr) {
- u32 newtid = current->pid | FUTEX_WAITERS;
-
- /* Owner died? */
- if (q.pi_state->owner != NULL) {
- spin_lock_irq(&q.pi_state->owner->pi_lock);
- WARN_ON(list_empty(&q.pi_state->list));
- list_del_init(&q.pi_state->list);
- spin_unlock_irq(&q.pi_state->owner->pi_lock);
- } else
- newtid |= FUTEX_OWNER_DIED;
-
- q.pi_state->owner = current;
-
- spin_lock_irq(&current->pi_lock);
- WARN_ON(!list_empty(&q.pi_state->list));
- list_add(&q.pi_state->list, &current->pi_state_list);
- spin_unlock_irq(&current->pi_lock);
-
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
- /*
- * We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
- * owner died bit here.
- */
- ret = get_user(uval, uaddr);
- while (!ret) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
- if (curval == -EFAULT)
- ret = -EFAULT;
- if (curval == uval)
- break;
- uval = curval;
- }
- } else {
+ if (!ret && q.pi_state->owner != curr)
+ /* mmap_sem is unlocked at return of this function */
+ ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
+ else {
/*
* Catch the rare case, where the lock was released
* when we were on the way back before we locked
@@ -1333,8 +1863,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
ret = 0;
}
/* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
+ unqueue_me_pi(&q);
+ if (fshared)
+ up_read(fshared);
}
if (!detect && ret == -EDEADLK && 0)
@@ -1346,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
queue_unlock(&q, hb);
out_release_sem:
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
uaddr_faulted:
@@ -1357,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr, fshared,
+ attempt);
+ if (ret)
goto out_unlock_release_sem;
- }
goto retry_locked;
}
queue_unlock(&q, hb);
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
@@ -1379,12 +1912,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-static int futex_unlock_pi(u32 __user *uaddr)
+static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
u32 uval;
- struct list_head *head;
+ struct plist_head *head;
union futex_key key;
int ret, attempt = 0;
@@ -1399,9 +1932,10 @@ retry:
/*
* First take all the futex related locks:
*/
- down_read(&current->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &key);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -1435,7 +1969,7 @@ retry_locked:
*/
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (!match_futex (&this->key, &key))
continue;
ret = wake_futex_pi(uaddr, uval, this);
@@ -1460,7 +1994,8 @@ retry_locked:
out_unlock:
spin_unlock(&hb->lock);
out:
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
@@ -1472,15 +2007,16 @@ pi_faulted:
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr, fshared,
+ attempt);
+ if (ret)
goto out_unlock;
- }
goto retry_locked;
}
spin_unlock(&hb->lock);
- up_read(&current->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
@@ -1509,10 +2045,10 @@ static unsigned int futex_poll(struct file *filp,
poll_wait(filp, &q->waiters, wait);
/*
- * list_empty() is safe here without any lock.
+ * plist_node_empty() is safe here without any lock.
* q->lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (list_empty(&q->list))
+ if (plist_node_empty(&q->list))
ret = POLLIN | POLLRDNORM;
return ret;
@@ -1532,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
struct futex_q *q;
struct file *filp;
int ret, err;
+ struct rw_semaphore *fshared;
static unsigned long printk_interval;
if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -1573,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
}
q->pi_state = NULL;
- down_read(&current->mm->mmap_sem);
- err = get_futex_key(uaddr, &q->key);
+ fshared = &current->mm->mmap_sem;
+ down_read(fshared);
+ err = get_futex_key(uaddr, fshared, &q->key);
if (unlikely(err != 0)) {
- up_read(&current->mm->mmap_sem);
+ up_read(fshared);
kfree(q);
goto error;
}
@@ -1589,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
filp->private_data = q;
queue_me(q, ret, filp);
- up_read(&current->mm->mmap_sem);
+ up_read(fshared);
/* Now we map fd to filp, so userspace can access it */
fd_install(ret, filp);
@@ -1702,6 +2240,8 @@ retry:
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ /* Also keep the FUTEX_WAITER_REQUEUED flag if set */
+ mval |= (uval & FUTEX_WAITER_REQUEUED);
nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
if (nval == -EFAULT)
@@ -1716,7 +2256,7 @@ retry:
*/
if (!pi) {
if (uval & FUTEX_WAITERS)
- futex_wake(uaddr, 1);
+ futex_wake(uaddr, &curr->mm->mmap_sem, 1);
}
}
return 0;
@@ -1772,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
return;
if (pending)
- handle_futex_death((void __user *)pending + futex_offset, curr, pip);
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip);
while (entry != &head->list) {
/*
@@ -1798,39 +2339,47 @@ void exit_robust_list(struct task_struct *curr)
}
}
-long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
int ret;
+ int cmd = op & FUTEX_CMD_MASK;
+ struct rw_semaphore *fshared = NULL;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ fshared = &current->mm->mmap_sem;
- switch (op) {
+ switch (cmd) {
case FUTEX_WAIT:
- ret = futex_wait(uaddr, val, timeout);
+ ret = futex_wait(uaddr, fshared, val, timeout);
break;
case FUTEX_WAKE:
- ret = futex_wake(uaddr, val);
+ ret = futex_wake(uaddr, fshared, val);
break;
case FUTEX_FD:
/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
ret = futex_fd(uaddr, val);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
break;
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+ ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
break;
case FUTEX_LOCK_PI:
- ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+ ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
break;
case FUTEX_UNLOCK_PI:
- ret = futex_unlock_pi(uaddr);
+ ret = futex_unlock_pi(uaddr, fshared);
break;
case FUTEX_TRYLOCK_PI:
- ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+ ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+ break;
+ case FUTEX_CMP_REQUEUE_PI:
+ ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
break;
default:
ret = -ENOSYS;
@@ -1843,29 +2392,30 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
struct timespec __user *utime, u32 __user *uaddr2,
u32 val3)
{
- struct timespec t;
- unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
u32 val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
- if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
- if (copy_from_user(&t, utime, sizeof(t)) != 0)
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
+ if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
- if (!timespec_valid(&t))
+ if (!timespec_valid(&ts))
return -EINVAL;
- if (op == FUTEX_WAIT)
- timeout = timespec_to_jiffies(&t) + 1;
- else {
- timeout = t.tv_sec;
- val2 = t.tv_nsec;
- }
+
+ t = timespec_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add(ktime_get(), t);
+ tp = &t;
}
/*
- * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
+ * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
*/
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
+ || cmd == FUTEX_CMP_REQUEUE_PI)
val2 = (u32) (unsigned long) utime;
- return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
static int futexfs_get_sb(struct file_system_type *fs_type,
@@ -1895,7 +2445,7 @@ static int __init init(void)
}
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
- INIT_LIST_HEAD(&futex_queues[i].chain);
+ plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
spin_lock_init(&futex_queues[i].lock);
}
return 0;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 50f24eea6cd..338a9b489fb 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -141,24 +141,24 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
struct compat_timespec __user *utime, u32 __user *uaddr2,
u32 val3)
{
- struct timespec t;
- unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
int val2 = 0;
if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
- if (get_compat_timespec(&t, utime))
+ if (get_compat_timespec(&ts, utime))
return -EFAULT;
- if (!timespec_valid(&t))
+ if (!timespec_valid(&ts))
return -EINVAL;
+
+ t = timespec_to_ktime(ts);
if (op == FUTEX_WAIT)
- timeout = timespec_to_jiffies(&t) + 1;
- else {
- timeout = t.tv_sec;
- val2 = t.tv_nsec;
- }
+ t = ktime_add(ktime_get(), t);
+ tp = &t;
}
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+ || op == FUTEX_CMP_REQUEUE_PI)
val2 = (int) (unsigned long) utime;
- return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c9f4f044a8a..23c03f43e19 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1411,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
init_hrtimers_cpu(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
migrate_hrtimers(cpu);
break;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 32e1ab1477d..e391cbb1f56 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,7 +22,6 @@
* handle_bad_irq - handle spurious and unhandled irqs
* @irq: the interrupt number
* @desc: description of the interrupt
- * @regs: pointer to a register structure
*
* Handles spurious and unhandled IRQ's. It also prints a debugmessage.
*/
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 49cc4b9c1a8..4d32eb07717 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -135,7 +135,6 @@ static int ____call_usermodehelper(void *data)
/* Unblock all signals and set the session keyring. */
new_session = key_get(sub_info->ring);
- flush_signals(current);
spin_lock_irq(&current->sighand->siglock);
old_session = __install_session_keyring(current, new_session);
flush_signal_handlers(current, 1);
@@ -186,14 +185,9 @@ static int wait_for_helper(void *data)
{
struct subprocess_info *sub_info = data;
pid_t pid;
- struct k_sigaction sa;
/* Install a handler: if SIGCLD isn't handled sys_wait4 won't
* populate the status, but will return -ECHILD. */
- sa.sa.sa_handler = SIG_IGN;
- sa.sa.sa_flags = 0;
- siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
- do_sigaction(SIGCHLD, &sa, NULL);
allow_signal(SIGCHLD);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 87c50ccd1d4..df8a8e8f6ca 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,7 +1,7 @@
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
*
- * Creation is done via keventd, so that we get a clean environment
+ * Creation is done via kthreadd, so that we get a clean environment
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
@@ -15,24 +15,22 @@
#include <linux/mutex.h>
#include <asm/semaphore.h>
-/*
- * We dont want to execute off keventd since it might
- * hold a semaphore our callers hold too:
- */
-static struct workqueue_struct *helper_wq;
+static DEFINE_SPINLOCK(kthread_create_lock);
+static LIST_HEAD(kthread_create_list);
+struct task_struct *kthreadd_task;
struct kthread_create_info
{
- /* Information passed to kthread() from keventd. */
+ /* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
struct completion started;
- /* Result passed back to kthread_create() from keventd. */
+ /* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
struct completion done;
- struct work_struct work;
+ struct list_head list;
};
struct kthread_stop_info
@@ -60,42 +58,17 @@ int kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
-static void kthread_exit_files(void)
-{
- struct fs_struct *fs;
- struct task_struct *tsk = current;
-
- exit_fs(tsk); /* current->fs->count--; */
- fs = init_task.fs;
- tsk->fs = fs;
- atomic_inc(&fs->count);
- exit_files(tsk);
- current->files = init_task.files;
- atomic_inc(&tsk->files->count);
-}
-
static int kthread(void *_create)
{
struct kthread_create_info *create = _create;
int (*threadfn)(void *data);
void *data;
- sigset_t blocked;
int ret = -EINTR;
- kthread_exit_files();
-
- /* Copy data: it's on keventd's stack */
+ /* Copy data: it's on kthread's stack */
threadfn = create->threadfn;
data = create->data;
- /* Block and flush all signals (in case we're not from keventd). */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /* By default we can run anywhere, unlike keventd. */
- set_cpus_allowed(current, CPU_MASK_ALL);
-
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_INTERRUPTIBLE);
complete(&create->started);
@@ -112,11 +85,8 @@ static int kthread(void *_create)
return 0;
}
-/* We are keventd: create a thread. */
-static void keventd_create_kthread(struct work_struct *work)
+static void create_kthread(struct kthread_create_info *create)
{
- struct kthread_create_info *create =
- container_of(work, struct kthread_create_info, work);
int pid;
/* We want our own signal handler (we take no signals by default). */
@@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
create.data = data;
init_completion(&create.started);
init_completion(&create.done);
- INIT_WORK(&create.work, keventd_create_kthread);
-
- /*
- * The workqueue needs to start up first:
- */
- if (!helper_wq)
- create.work.func(&create.work);
- else {
- queue_work(helper_wq, &create.work);
- wait_for_completion(&create.done);
- }
+
+ spin_lock(&kthread_create_lock);
+ list_add_tail(&create.list, &kthread_create_list);
+ wake_up_process(kthreadd_task);
+ spin_unlock(&kthread_create_lock);
+
+ wait_for_completion(&create.done);
+
if (!IS_ERR(create.result)) {
va_list args;
va_start(args, namefmt);
@@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
namefmt, args);
va_end(args);
}
-
return create.result;
}
EXPORT_SYMBOL(kthread_create);
@@ -245,12 +211,47 @@ int kthread_stop(struct task_struct *k)
}
EXPORT_SYMBOL(kthread_stop);
-static __init int helper_init(void)
+
+static __init void kthreadd_setup(void)
{
- helper_wq = create_singlethread_workqueue("kthread");
- BUG_ON(!helper_wq);
+ struct task_struct *tsk = current;
- return 0;
+ set_task_comm(tsk, "kthreadd");
+
+ ignore_signals(tsk);
+
+ set_user_nice(tsk, -5);
+ set_cpus_allowed(tsk, CPU_MASK_ALL);
}
-core_initcall(helper_init);
+int kthreadd(void *unused)
+{
+ /* Setup a clean context for our children to inherit. */
+ kthreadd_setup();
+
+ current->flags |= PF_NOFREEZE;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty(&kthread_create_list))
+ schedule();
+ __set_current_state(TASK_RUNNING);
+
+ spin_lock(&kthread_create_lock);
+ while (!list_empty(&kthread_create_list)) {
+ struct kthread_create_info *create;
+
+ create = list_entry(kthread_create_list.next,
+ struct kthread_create_info, list);
+ list_del_init(&create->list);
+ spin_unlock(&kthread_create_lock);
+
+ create_kthread(create);
+
+ spin_lock(&kthread_create_lock);
+ }
+ spin_unlock(&kthread_create_lock);
+ }
+
+ return 0;
+}
diff --git a/kernel/module.c b/kernel/module.c
index d36e45477fa..9bd93de01f4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -96,9 +96,9 @@ static inline void add_taint_module(struct module *mod, unsigned flag)
mod->taints |= flag;
}
-/* A thread that wants to hold a reference to a module only while it
- * is running can call ths to safely exit.
- * nfsd and lockd use this.
+/*
+ * A thread that wants to hold a reference to a module only while it
+ * is running can call this to safely exit. nfsd and lockd use this.
*/
void __module_put_and_exit(struct module *mod, long code)
{
@@ -1199,7 +1199,7 @@ static int __unlink_module(void *_mod)
return 0;
}
-/* Free a module, remove from lists, etc (must hold module mutex). */
+/* Free a module, remove from lists, etc (must hold module_mutex). */
static void free_module(struct module *mod)
{
/* Delete from various lists */
@@ -1246,7 +1246,7 @@ EXPORT_SYMBOL_GPL(__symbol_get);
/*
* Ensure that an exported symbol [global namespace] does not already exist
- * in the Kernel or in some other modules exported symbol table.
+ * in the kernel or in some other module's exported symbol table.
*/
static int verify_export_symbols(struct module *mod)
{
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e7cbbb82765..303eab18484 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -133,7 +133,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
debug_mutex_lock_common(lock, &waiter);
mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- debug_mutex_add_waiter(lock, &waiter, task->thread_info);
+ debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
@@ -159,7 +159,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
*/
if (unlikely(state == TASK_INTERRUPTIBLE &&
signal_pending(task))) {
- mutex_remove_waiter(lock, &waiter, task->thread_info);
+ mutex_remove_waiter(lock, &waiter, task_thread_info(task));
mutex_release(&lock->dep_map, 1, _RET_IP_);
spin_unlock_mutex(&lock->wait_lock, flags);
@@ -175,8 +175,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
}
/* got the lock - rejoice! */
- mutex_remove_waiter(lock, &waiter, task->thread_info);
- debug_mutex_set_owner(lock, task->thread_info);
+ mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+ debug_mutex_set_owner(lock, task_thread_info(task));
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 06331374d86..b5f0543ed84 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
+enum {
+ HIBERNATION_INVALID,
+ HIBERNATION_PLATFORM,
+ HIBERNATION_TEST,
+ HIBERNATION_TESTPROC,
+ HIBERNATION_SHUTDOWN,
+ HIBERNATION_REBOOT,
+ /* keep last */
+ __HIBERNATION_AFTER_LAST
+};
+#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
+#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
+
+static int hibernation_mode = HIBERNATION_SHUTDOWN;
+
+struct hibernation_ops *hibernation_ops;
+
+/**
+ * hibernation_set_ops - set the global hibernate operations
+ * @ops: the hibernation operations to use in subsequent hibernation transitions
+ */
+
+void hibernation_set_ops(struct hibernation_ops *ops)
+{
+ if (ops && !(ops->prepare && ops->enter && ops->finish)) {
+ WARN_ON(1);
+ return;
+ }
+ mutex_lock(&pm_mutex);
+ hibernation_ops = ops;
+ if (ops)
+ hibernation_mode = HIBERNATION_PLATFORM;
+ else if (hibernation_mode == HIBERNATION_PLATFORM)
+ hibernation_mode = HIBERNATION_SHUTDOWN;
+
+ mutex_unlock(&pm_mutex);
+}
+
+
/**
* platform_prepare - prepare the machine for hibernation using the
* platform driver if so configured and return an error code if it fails
*/
-static inline int platform_prepare(void)
+static int platform_prepare(void)
{
- int error = 0;
+ return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ?
+ hibernation_ops->prepare() : 0;
+}
- switch (pm_disk_mode) {
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- break;
- default:
- if (pm_ops && pm_ops->prepare)
- error = pm_ops->prepare(PM_SUSPEND_DISK);
- }
- return error;
+/**
+ * platform_finish - switch the machine to the normal mode of operation
+ * using the platform driver (must be called after platform_prepare())
+ */
+
+static void platform_finish(void)
+{
+ if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops)
+ hibernation_ops->finish();
}
/**
- * power_down - Shut machine down for hibernate.
+ * power_down - Shut the machine down for hibernation.
*
* Use the platform driver, if configured so; otherwise try
* to power off or reboot.
@@ -61,20 +100,20 @@ static inline int platform_prepare(void)
static void power_down(void)
{
- switch (pm_disk_mode) {
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
+ switch (hibernation_mode) {
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
break;
- case PM_DISK_SHUTDOWN:
+ case HIBERNATION_SHUTDOWN:
kernel_power_off();
break;
- case PM_DISK_REBOOT:
+ case HIBERNATION_REBOOT:
kernel_restart(NULL);
break;
- default:
- if (pm_ops && pm_ops->enter) {
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops) {
kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
- pm_ops->enter(PM_SUSPEND_DISK);
+ hibernation_ops->enter();
break;
}
}
@@ -87,20 +126,6 @@ static void power_down(void)
while(1);
}
-static inline void platform_finish(void)
-{
- switch (pm_disk_mode) {
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- break;
- default:
- if (pm_ops && pm_ops->finish)
- pm_ops->finish(PM_SUSPEND_DISK);
- }
-}
-
static void unprepare_processes(void)
{
thaw_processes();
@@ -120,13 +145,10 @@ static int prepare_processes(void)
}
/**
- * pm_suspend_disk - The granpappy of hibernation power management.
- *
- * If not, then call swsusp to do its thing, then figure out how
- * to power down the system.
+ * hibernate - The granpappy of the built-in hibernation management
*/
-int pm_suspend_disk(void)
+int hibernate(void)
{
int error;
@@ -143,7 +165,8 @@ int pm_suspend_disk(void)
if (error)
goto Finish;
- if (pm_disk_mode == PM_DISK_TESTPROC) {
+ mutex_lock(&pm_mutex);
+ if (hibernation_mode == HIBERNATION_TESTPROC) {
printk("swsusp debug: Waiting for 5 seconds.\n");
mdelay(5000);
goto Thaw;
@@ -168,7 +191,7 @@ int pm_suspend_disk(void)
if (error)
goto Enable_cpus;
- if (pm_disk_mode == PM_DISK_TEST) {
+ if (hibernation_mode == HIBERNATION_TEST) {
printk("swsusp debug: Waiting for 5 seconds.\n");
mdelay(5000);
goto Enable_cpus;
@@ -205,6 +228,7 @@ int pm_suspend_disk(void)
device_resume();
resume_console();
Thaw:
+ mutex_unlock(&pm_mutex);
unprepare_processes();
Finish:
free_basic_memory_bitmaps();
@@ -220,7 +244,7 @@ int pm_suspend_disk(void)
* Called as a late_initcall (so all devices are discovered and
* initialized), we call swsusp to see if we have a saved image or not.
* If so, we quiesce devices, the restore the saved image. We will
- * return above (in pm_suspend_disk() ) if everything goes well.
+ * return above (in hibernate() ) if everything goes well.
* Otherwise, we fail gracefully and return to the normally
* scheduled program.
*
@@ -315,25 +339,26 @@ static int software_resume(void)
late_initcall(software_resume);
-static const char * const pm_disk_modes[] = {
- [PM_DISK_PLATFORM] = "platform",
- [PM_DISK_SHUTDOWN] = "shutdown",
- [PM_DISK_REBOOT] = "reboot",
- [PM_DISK_TEST] = "test",
- [PM_DISK_TESTPROC] = "testproc",
+static const char * const hibernation_modes[] = {
+ [HIBERNATION_PLATFORM] = "platform",
+ [HIBERNATION_SHUTDOWN] = "shutdown",
+ [HIBERNATION_REBOOT] = "reboot",
+ [HIBERNATION_TEST] = "test",
+ [HIBERNATION_TESTPROC] = "testproc",
};
/**
- * disk - Control suspend-to-disk mode
+ * disk - Control hibernation mode
*
* Suspend-to-disk can be handled in several ways. We have a few options
* for putting the system to sleep - using the platform driver (e.g. ACPI
- * or other pm_ops), powering off the system or rebooting the system
- * (for testing) as well as the two test modes.
+ * or other hibernation_ops), powering off the system or rebooting the
+ * system (for testing) as well as the two test modes.
*
* The system can support 'platform', and that is known a priori (and
- * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot'
- * as alternatives, as well as the test modes 'test' and 'testproc'.
+ * encoded by the presence of hibernation_ops). However, the user may
+ * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the
+ * test modes, 'test' or 'testproc'.
*
* show() will display what the mode is currently set to.
* store() will accept one of
@@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = {
* 'testproc'
*
* It will only change to 'platform' if the system
- * supports it (as determined from pm_ops->pm_disk_mode).
+ * supports it (as determined by having hibernation_ops).
*/
static ssize_t disk_show(struct kset *kset, char *buf)
@@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf)
int i;
char *start = buf;
- for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
- if (!pm_disk_modes[i])
+ for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+ if (!hibernation_modes[i])
continue;
switch (i) {
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
+ case HIBERNATION_SHUTDOWN:
+ case HIBERNATION_REBOOT:
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
break;
- default:
- if (pm_ops && pm_ops->enter &&
- (i == pm_ops->pm_disk_mode))
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops)
break;
/* not a valid mode, continue with loop */
continue;
}
- if (i == pm_disk_mode)
- buf += sprintf(buf, "[%s]", pm_disk_modes[i]);
+ if (i == hibernation_mode)
+ buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
else
- buf += sprintf(buf, "%s", pm_disk_modes[i]);
- if (i+1 != PM_DISK_MAX)
- buf += sprintf(buf, " ");
+ buf += sprintf(buf, "%s ", hibernation_modes[i]);
}
buf += sprintf(buf, "\n");
return buf-start;
@@ -387,39 +409,38 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
int i;
int len;
char *p;
- suspend_disk_method_t mode = 0;
+ int mode = HIBERNATION_INVALID;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
mutex_lock(&pm_mutex);
- for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
- if (!strncmp(buf, pm_disk_modes[i], len)) {
+ for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+ if (!strncmp(buf, hibernation_modes[i], len)) {
mode = i;
break;
}
}
- if (mode) {
+ if (mode != HIBERNATION_INVALID) {
switch (mode) {
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
- pm_disk_mode = mode;
+ case HIBERNATION_SHUTDOWN:
+ case HIBERNATION_REBOOT:
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
+ hibernation_mode = mode;
break;
- default:
- if (pm_ops && pm_ops->enter &&
- (mode == pm_ops->pm_disk_mode))
- pm_disk_mode = mode;
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops)
+ hibernation_mode = mode;
else
error = -EINVAL;
}
- } else {
+ } else
error = -EINVAL;
- }
- pr_debug("PM: suspend-to-disk mode set to '%s'\n",
- pm_disk_modes[mode]);
+ if (!error)
+ pr_debug("PM: suspend-to-disk mode set to '%s'\n",
+ hibernation_modes[mode]);
mutex_unlock(&pm_mutex);
return error ? error : n;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6dda685e7e..40d56a31245 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -30,7 +30,6 @@
DEFINE_MUTEX(pm_mutex);
struct pm_ops *pm_ops;
-suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
/**
* pm_set_ops - Set the global power method table.
@@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops)
{
mutex_lock(&pm_mutex);
pm_ops = ops;
- if (ops && ops->pm_disk_mode != PM_DISK_INVALID) {
- pm_disk_mode = ops->pm_disk_mode;
- } else
- pm_disk_mode = PM_DISK_SHUTDOWN;
mutex_unlock(&pm_mutex);
}
@@ -184,24 +179,12 @@ static void suspend_finish(suspend_state_t state)
static const char * const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
- [PM_SUSPEND_DISK] = "disk",
};
static inline int valid_state(suspend_state_t state)
{
- /* Suspend-to-disk does not really need low-level support.
- * It can work with shutdown/reboot if needed. If it isn't
- * configured, then it cannot be supported.
- */
- if (state == PM_SUSPEND_DISK)
-#ifdef CONFIG_SOFTWARE_SUSPEND
- return 1;
-#else
- return 0;
-#endif
-
- /* all other states need lowlevel support and need to be
- * valid to the lowlevel implementation, no valid callback
+ /* All states need lowlevel support and need to be valid
+ * to the lowlevel implementation, no valid callback
* implies that none are valid. */
if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))
return 0;
@@ -229,11 +212,6 @@ static int enter_state(suspend_state_t state)
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
- if (state == PM_SUSPEND_DISK) {
- error = pm_suspend_disk();
- goto Unlock;
- }
-
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
if ((error = suspend_prepare(state)))
goto Unlock;
@@ -251,7 +229,7 @@ static int enter_state(suspend_state_t state)
/**
* pm_suspend - Externally visible function for suspending system.
- * @state: Enumarted value of state to enter.
+ * @state: Enumerated value of state to enter.
*
* Determine whether or not value is within range, get state
* structure, and enter (above).
@@ -289,7 +267,13 @@ static ssize_t state_show(struct kset *kset, char *buf)
if (pm_states[i] && valid_state(i))
s += sprintf(s,"%s ", pm_states[i]);
}
- s += sprintf(s,"\n");
+#ifdef CONFIG_SOFTWARE_SUSPEND
+ s += sprintf(s, "%s\n", "disk");
+#else
+ if (s != buf)
+ /* convert the last space to a newline */
+ *(s-1) = '\n';
+#endif
return (s - buf);
}
@@ -304,6 +288,12 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
+ /* First, check if we are requested to hibernate */
+ if (!strncmp(buf, "disk", len)) {
+ error = hibernate();
+ return error ? error : n;
+ }
+
for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
if (*s && !strncmp(buf, *s, len))
break;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 34b43542785..51381487103 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -25,12 +25,7 @@ struct swsusp_info {
*/
#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
-extern int pm_suspend_disk(void);
-#else
-static inline int pm_suspend_disk(void)
-{
- return -EPERM;
-}
+extern struct hibernation_ops *hibernation_ops;
#endif
extern int pfn_is_nosave(unsigned long);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b7039772b05..a3b7854b8f7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -607,7 +607,8 @@ static LIST_HEAD(nosave_regions);
*/
void __init
-register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
+__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
+ int use_kmalloc)
{
struct nosave_region *region;
@@ -623,8 +624,13 @@ register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
goto Report;
}
}
- /* This allocation cannot fail */
- region = alloc_bootmem_low(sizeof(struct nosave_region));
+ if (use_kmalloc) {
+ /* during init, this shouldn't fail */
+ region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
+ BUG_ON(!region);
+ } else
+ /* This allocation cannot fail */
+ region = alloc_bootmem_low(sizeof(struct nosave_region));
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
@@ -1227,7 +1233,7 @@ asmlinkage int swsusp_save(void)
nr_copy_pages = nr_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
+ printk("swsusp: critical section: done (%d pages copied)\n", nr_pages);
return 0;
}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 040560d9c31..24d7d78e6f4 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -130,16 +130,16 @@ static inline int platform_prepare(void)
{
int error = 0;
- if (pm_ops && pm_ops->prepare)
- error = pm_ops->prepare(PM_SUSPEND_DISK);
+ if (hibernation_ops)
+ error = hibernation_ops->prepare();
return error;
}
static inline void platform_finish(void)
{
- if (pm_ops && pm_ops->finish)
- pm_ops->finish(PM_SUSPEND_DISK);
+ if (hibernation_ops)
+ hibernation_ops->finish();
}
static inline int snapshot_suspend(int platform_suspend)
@@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
switch (arg) {
case PMOPS_PREPARE:
- if (pm_ops && pm_ops->enter) {
+ if (hibernation_ops) {
data->platform_suspend = 1;
error = 0;
} else {
@@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
case PMOPS_ENTER:
if (data->platform_suspend) {
kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
- error = pm_ops->enter(PM_SUSPEND_DISK);
- error = 0;
+ error = hibernation_ops->enter();
}
break;
diff --git a/kernel/profile.c b/kernel/profile.c
index 9bfadb248dd..cc91b9bf759 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -340,6 +340,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
@@ -365,10 +366,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
__free_page(page);
return NOTIFY_BAD;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
cpu_set(cpu, prof_cpu_mask);
break;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
cpu_clear(cpu, prof_cpu_mask);
if (per_cpu(cpu_profile_hits, cpu)[0]) {
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 3554b76da84..2c2dd8410dc 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
long cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
rcu_offline_cpu(cpu);
break;
default:
diff --git a/kernel/relay.c b/kernel/relay.c
index 577f251c7e2..4311101b0ca 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = {
/**
* wakeup_readers - wake up readers waiting on a channel
- * @work: work struct that contains the the channel buffer
+ * @data: contains the channel buffer
*
- * This is the work function used to defer reader waking. The
- * reason waking is deferred is that calling directly from write
- * causes problems if you're writing from say the scheduler.
+ * This is the timer function used to defer reader waking.
*/
-static void wakeup_readers(struct work_struct *work)
+static void wakeup_readers(unsigned long data)
{
- struct rchan_buf *buf =
- container_of(work, struct rchan_buf, wake_readers.work);
+ struct rchan_buf *buf = (struct rchan_buf *)data;
wake_up_interruptible(&buf->read_wait);
}
@@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
if (init) {
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
- INIT_DELAYED_WORK(&buf->wake_readers, NULL);
- } else {
- cancel_delayed_work(&buf->wake_readers);
- flush_scheduled_work();
- }
+ setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
+ } else
+ del_timer_sync(&buf->timer);
buf->subbufs_produced = 0;
buf->subbufs_consumed = 0;
@@ -447,8 +442,7 @@ end:
static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
- cancel_delayed_work(&buf->wake_readers);
- flush_scheduled_work();
+ del_timer_sync(&buf->timer);
kref_put(&buf->kref, relay_remove_buf);
}
@@ -490,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
switch(action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
mutex_lock(&relay_channels_mutex);
list_for_each_entry(chan, &relay_channels, list) {
if (chan->buf[hotcpu])
@@ -506,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
mutex_unlock(&relay_channels_mutex);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
/* No need to flush the cpu : will be flushed upon
* final relay_flush() call. */
break;
@@ -608,11 +604,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
buf->padding[old_subbuf];
smp_mb();
- if (waitqueue_active(&buf->read_wait)) {
- PREPARE_DELAYED_WORK(&buf->wake_readers,
- wakeup_readers);
- schedule_delayed_work(&buf->wake_readers, 1);
- }
+ if (waitqueue_active(&buf->read_wait))
+ /*
+ * Calling wake_up_interruptible() from here
+ * will deadlock if we happen to be logging
+ * from the scheduler (trying to re-grab
+ * rq->lock), so defer it.
+ */
+ __mod_timer(&buf->timer, jiffies + 1);
}
old = buf->data;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 180978cb2f7..12879f6c1ec 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -56,7 +56,7 @@
* state.
*/
-static void
+void
rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
unsigned long mask)
{
@@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
}
/*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
- */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
-# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
- unsigned long owner, *p = (unsigned long *) &lock->owner;
-
- do {
- owner = *p;
- } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
-}
-#else
-# define rt_mutex_cmpxchg(l,c,n) (0)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
- lock->owner = (struct task_struct *)
- ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
-}
-#endif
-
-/*
* Calculate task priority from the waiter list priority
*
* Return task->normal_prio when the waiter list is empty or when
@@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task)
*
* This can be both boosting and unboosting. task->pi_lock must be held.
*/
-static void __rt_mutex_adjust_prio(struct task_struct *task)
+void __rt_mutex_adjust_prio(struct task_struct *task)
{
int prio = rt_mutex_getprio(task);
@@ -159,11 +136,11 @@ int max_lock_depth = 1024;
* Decreases task's usage by one - may thus free the task.
* Returns 0 or -EDEADLK.
*/
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- int deadlock_detect,
- struct rt_mutex *orig_lock,
- struct rt_mutex_waiter *orig_waiter,
- struct task_struct *top_task)
+int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
{
struct rt_mutex *lock;
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
*
* Must be called with lock->wait_lock held
*/
-static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
{
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856e791..242ec7ee740 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
}
/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n) (0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
* PI-futex support (proxy locking functions, etc.):
*/
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+
+extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+ unsigned long mask);
+extern void __rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task);
+extern void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index a3a04085e79..799d23b4e35 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -169,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
#define TASK_PREEMPTS_CURR(p, rq) \
- (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active))
+ ((p)->prio < (rq)->curr->prio)
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
@@ -305,6 +305,7 @@ struct rq {
};
static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_MUTEX(sched_hotcpu_mutex);
static inline int cpu_of(struct rq *rq)
{
@@ -4076,13 +4077,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
struct prio_array *array;
unsigned long flags;
struct rq *rq;
- int delta;
+ int oldprio;
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
- delta = prio - p->prio;
+ oldprio = p->prio;
array = p->array;
if (array)
dequeue_task(p, array);
@@ -4098,11 +4099,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
enqueue_task(p, array);
/*
* Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if our priority became higher
- * than the current's.
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
*/
- if (TASK_PREEMPTS_CURR(p, rq) ||
- (delta > 0 && task_running(rq, p)))
+ if (task_running(rq, p)) {
+ if (p->prio > oldprio)
+ resched_task(rq->curr);
+ } else if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
task_rq_unlock(rq, &flags);
@@ -4150,12 +4153,10 @@ void set_user_nice(struct task_struct *p, long nice)
enqueue_task(p, array);
inc_raw_weighted_load(rq, p);
/*
- * Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if our priority became higher
- * than the current's.
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
*/
- if (TASK_PREEMPTS_CURR(p, rq) ||
- (delta > 0 && task_running(rq, p)))
+ if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
out_unlock:
@@ -4382,11 +4383,13 @@ recheck:
__activate_task(p, rq);
/*
* Reschedule if we are currently running on this runqueue and
- * our priority decreased, or our priority became higher
- * than the current's.
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
*/
- if (TASK_PREEMPTS_CURR(p, rq) ||
- (task_running(rq, p) && p->prio > oldprio))
+ if (task_running(rq, p)) {
+ if (p->prio > oldprio)
+ resched_task(rq->curr);
+ } else if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
__task_rq_unlock(rq);
@@ -4518,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
struct task_struct *p;
int retval;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
if (!p) {
read_unlock(&tasklist_lock);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return -ESRCH;
}
@@ -4551,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
out_unlock:
put_task_struct(p);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return retval;
}
@@ -4608,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
struct task_struct *p;
int retval;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
read_lock(&tasklist_lock);
retval = -ESRCH;
@@ -4624,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
out_unlock:
read_unlock(&tasklist_lock);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
if (retval)
return retval;
@@ -5386,7 +5389,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
struct rq *rq;
switch (action) {
+ case CPU_LOCK_ACQUIRE:
+ mutex_lock(&sched_hotcpu_mutex);
+ break;
+
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
if (IS_ERR(p))
return NOTIFY_BAD;
@@ -5400,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
/* Strictly unneccessary, as first user will wake it. */
wake_up_process(cpu_rq(cpu)->migration_thread);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!cpu_rq(cpu)->migration_thread)
break;
/* Unbind it from offline cpu so it can run. Fall thru. */
@@ -5416,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
@@ -5431,7 +5442,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
BUG_ON(rq->nr_running != 0);
/* No need to migrate the tasks: it was best-effort if
- * they didn't do lock_cpu_hotplug(). Just wake up
+ * they didn't take sched_hotcpu_mutex. Just wake up
* the requestors. */
spin_lock_irq(&rq->lock);
while (!list_empty(&rq->migration_queue)) {
@@ -5445,6 +5456,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
spin_unlock_irq(&rq->lock);
break;
#endif
+ case CPU_LOCK_RELEASE:
+ mutex_unlock(&sched_hotcpu_mutex);
+ break;
}
return NOTIFY_OK;
}
@@ -6820,10 +6834,10 @@ int arch_reinit_sched_domains(void)
{
int err;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
detach_destroy_domains(&cpu_online_map);
err = arch_init_sched_domains(&cpu_online_map);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return err;
}
@@ -6902,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb,
{
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
detach_destroy_domains(&cpu_online_map);
return NOTIFY_OK;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
/*
* Fall through and re-initialise the domains.
*/
@@ -6928,12 +6948,12 @@ void __init sched_init_smp(void)
{
cpumask_t non_isolated_cpus;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
arch_init_sched_domains(&cpu_online_map);
cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
if (cpus_empty(non_isolated_cpus))
cpu_set(smp_processor_id(), non_isolated_cpus);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
diff --git a/kernel/signal.c b/kernel/signal.c
index 1368e67c848..2ac3a668d9d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -38,125 +38,6 @@
static struct kmem_cache *sigqueue_cachep;
-/*
- * In POSIX a signal is sent either to a specific thread (Linux task)
- * or to the process as a whole (Linux thread group). How the signal
- * is sent determines whether it's to one thread or the whole group,
- * which determines which signal mask(s) are involved in blocking it
- * from being delivered until later. When the signal is delivered,
- * either it's caught or ignored by a user handler or it has a default
- * effect that applies to the whole thread group (POSIX process).
- *
- * The possible effects an unblocked signal set to SIG_DFL can have are:
- * ignore - Nothing Happens
- * terminate - kill the process, i.e. all threads in the group,
- * similar to exit_group. The group leader (only) reports
- * WIFSIGNALED status to its parent.
- * coredump - write a core dump file describing all threads using
- * the same mm and then kill all those threads
- * stop - stop all the threads in the group, i.e. TASK_STOPPED state
- *
- * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
- * Other signals when not blocked and set to SIG_DFL behaves as follows.
- * The job control signals also have other special effects.
- *
- * +--------------------+------------------+
- * | POSIX signal | default action |
- * +--------------------+------------------+
- * | SIGHUP | terminate |
- * | SIGINT | terminate |
- * | SIGQUIT | coredump |
- * | SIGILL | coredump |
- * | SIGTRAP | coredump |
- * | SIGABRT/SIGIOT | coredump |
- * | SIGBUS | coredump |
- * | SIGFPE | coredump |
- * | SIGKILL | terminate(+) |
- * | SIGUSR1 | terminate |
- * | SIGSEGV | coredump |
- * | SIGUSR2 | terminate |
- * | SIGPIPE | terminate |
- * | SIGALRM | terminate |
- * | SIGTERM | terminate |
- * | SIGCHLD | ignore |
- * | SIGCONT | ignore(*) |
- * | SIGSTOP | stop(*)(+) |
- * | SIGTSTP | stop(*) |
- * | SIGTTIN | stop(*) |
- * | SIGTTOU | stop(*) |
- * | SIGURG | ignore |
- * | SIGXCPU | coredump |
- * | SIGXFSZ | coredump |
- * | SIGVTALRM | terminate |
- * | SIGPROF | terminate |
- * | SIGPOLL/SIGIO | terminate |
- * | SIGSYS/SIGUNUSED | coredump |
- * | SIGSTKFLT | terminate |
- * | SIGWINCH | ignore |
- * | SIGPWR | terminate |
- * | SIGRTMIN-SIGRTMAX | terminate |
- * +--------------------+------------------+
- * | non-POSIX signal | default action |
- * +--------------------+------------------+
- * | SIGEMT | coredump |
- * +--------------------+------------------+
- *
- * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
- * (*) Special job control effects:
- * When SIGCONT is sent, it resumes the process (all threads in the group)
- * from TASK_STOPPED state and also clears any pending/queued stop signals
- * (any of those marked with "stop(*)"). This happens regardless of blocking,
- * catching, or ignoring SIGCONT. When any stop signal is sent, it clears
- * any pending/queued SIGCONT signals; this happens regardless of blocking,
- * catching, or ignored the stop signal, though (except for SIGSTOP) the
- * default action of stopping the process may happen later or never.
- */
-
-#ifdef SIGEMT
-#define M_SIGEMT M(SIGEMT)
-#else
-#define M_SIGEMT 0
-#endif
-
-#if SIGRTMIN > BITS_PER_LONG
-#define M(sig) (1ULL << ((sig)-1))
-#else
-#define M(sig) (1UL << ((sig)-1))
-#endif
-#define T(sig, mask) (M(sig) & (mask))
-
-#define SIG_KERNEL_ONLY_MASK (\
- M(SIGKILL) | M(SIGSTOP) )
-
-#define SIG_KERNEL_STOP_MASK (\
- M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) )
-
-#define SIG_KERNEL_COREDUMP_MASK (\
- M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \
- M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \
- M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT )
-
-#define SIG_KERNEL_IGNORE_MASK (\
- M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) )
-
-#define sig_kernel_only(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK))
-#define sig_kernel_coredump(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK))
-#define sig_kernel_ignore(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK))
-#define sig_kernel_stop(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
-
-#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
-
-#define sig_user_defined(t, signr) \
- (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
- ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
-
-#define sig_fatal(t, signr) \
- (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
- (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
static int sig_ignored(struct task_struct *t, int sig)
{
@@ -328,6 +209,16 @@ void flush_signals(struct task_struct *t)
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
+void ignore_signals(struct task_struct *t)
+{
+ int i;
+
+ for (i = 0; i < _NSIG; ++i)
+ t->sighand->action[i].sa.sa_handler = SIG_IGN;
+
+ flush_signals(t);
+}
+
/*
* Flush all handlers for a task.
*/
@@ -1032,17 +923,6 @@ void zap_other_threads(struct task_struct *p)
if (t->exit_state)
continue;
- /*
- * We don't want to notify the parent, since we are
- * killed as part of a thread group due to another
- * thread doing an execve() or similar. So set the
- * exit signal to -1 to allow immediate reaping of
- * the process. But don't detach the thread group
- * leader.
- */
- if (t != p->group_leader)
- t->exit_signal = -1;
-
/* SIGKILL will be handled before any pending SIGSTOP */
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8b75008e2bd..0b9886a00e7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
if (IS_ERR(p)) {
printk("ksoftirqd for %i failed\n", hotcpu);
@@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
per_cpu(ksoftirqd, hotcpu) = p;
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
wake_up_process(per_cpu(ksoftirqd, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(ksoftirqd, hotcpu))
break;
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(ksoftirqd, hotcpu),
any_online_cpu(cpu_online_map));
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
p = per_cpu(ksoftirqd, hotcpu);
per_cpu(ksoftirqd, hotcpu) = NULL;
kthread_stop(p);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 8fa7040247a..0131e296ffb 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -146,6 +146,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
BUG_ON(per_cpu(watchdog_task, hotcpu));
p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
if (IS_ERR(p)) {
@@ -157,16 +158,19 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
kthread_bind(p, hotcpu);
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(watchdog_task, hotcpu))
break;
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(watchdog_task, hotcpu),
any_online_cpu(cpu_online_map));
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
p = per_cpu(watchdog_task, hotcpu);
per_cpu(watchdog_task, hotcpu) = NULL;
kthread_stop(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 926bf9d7ac4..cdb7e9457ba 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -134,19 +134,39 @@ static int notifier_chain_unregister(struct notifier_block **nl,
return -ENOENT;
}
+/**
+ * notifier_call_chain - Informs the registered notifiers about an event.
+ * @nl: Pointer to head of the blocking notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: Number of notifier functions to be called. Don't care
+ * value of this parameter is -1.
+ * @nr_calls: Records the number of notifications sent. Don't care
+ * value of this field is NULL.
+ * @returns: notifier_call_chain returns the value returned by the
+ * last notifier function called.
+ */
+
static int __kprobes notifier_call_chain(struct notifier_block **nl,
- unsigned long val, void *v)
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
struct notifier_block *nb, *next_nb;
nb = rcu_dereference(*nl);
- while (nb) {
+
+ while (nb && nr_to_call) {
next_nb = rcu_dereference(nb->next);
ret = nb->notifier_call(nb, val, v);
+
+ if (nr_calls)
+ (*nr_calls)++;
+
if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
break;
nb = next_nb;
+ nr_to_call--;
}
return ret;
}
@@ -205,10 +225,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
/**
- * atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See the comment for notifier_call_chain.
+ * @nr_calls: See the comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in an atomic context, so they must not block.
@@ -222,19 +244,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
* of the last notifier function called.
*/
-int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v)
+int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret;
rcu_read_lock();
- ret = notifier_call_chain(&nh->head, val, v);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
+
+int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
/*
* Blocking notifier chain routines. All access to the chain is
* synchronized by an rwsem.
@@ -304,10 +334,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
/**
- * blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -320,8 +352,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
* of the last notifier function called.
*/
-int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
- unsigned long val, void *v)
+int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
@@ -332,12 +365,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
*/
if (rcu_dereference(nh->head)) {
down_read(&nh->rwsem);
- ret = notifier_call_chain(&nh->head, val, v);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
+ nr_calls);
up_read(&nh->rwsem);
}
return ret;
}
+EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
+}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
/*
@@ -383,10 +423,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
/**
- * raw_notifier_call_chain - Call functions in a raw notifier chain
+ * __raw_notifier_call_chain - Call functions in a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in an undefined context.
@@ -400,10 +442,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
* of the last notifier function called.
*/
+int __raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+}
+
+EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
+
int raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v)
{
- return notifier_call_chain(&nh->head, val, v);
+ return __raw_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
@@ -478,10 +529,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
/**
- * srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -494,18 +547,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
* of the last notifier function called.
*/
-int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
- unsigned long val, void *v)
+int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret;
int idx;
idx = srcu_read_lock(&nh->srcu);
- ret = notifier_call_chain(&nh->head, val, v);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
srcu_read_unlock(&nh->srcu, idx);
return ret;
}
+EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
+}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
/**
@@ -881,7 +941,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
#ifdef CONFIG_SOFTWARE_SUSPEND
case LINUX_REBOOT_CMD_SW_SUSPEND:
{
- int ret = pm_suspend(PM_SUSPEND_DISK);
+ int ret = hibernate();
unlock_kernel();
return ret;
}
@@ -1292,7 +1352,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
}
/*
- * Samma på svenska..
+ * Samma på svenska..
*/
asmlinkage long sys_setfsgid(gid_t gid)
{
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f0664bd5011..4073353abd4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -77,6 +77,7 @@ extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
extern int maps_protect;
+extern int sysctl_stat_interval;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -857,6 +858,17 @@ static ctl_table vm_table[] = {
.extra2 = &one_hundred,
},
#endif
+#ifdef CONFIG_SMP
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "stat_interval",
+ .data = &sysctl_stat_interval,
+ .maxlen = sizeof(sysctl_stat_interval),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+#endif
#if defined(CONFIG_X86_32) || \
(defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
{
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index fe5c7db2424..3db5c3c460d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -74,15 +74,17 @@ static struct clocksource *watchdog;
static struct timer_list watchdog_timer;
static DEFINE_SPINLOCK(watchdog_lock);
static cycle_t watchdog_last;
+static int watchdog_resumed;
+
/*
- * Interval: 0.5sec Treshold: 0.0625s
+ * Interval: 0.5sec Threshold: 0.0625s
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4)
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
{
- if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD)
+ if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
return;
printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
@@ -98,15 +100,26 @@ static void clocksource_watchdog(unsigned long data)
struct clocksource *cs, *tmp;
cycle_t csnow, wdnow;
int64_t wd_nsec, cs_nsec;
+ int resumed;
spin_lock(&watchdog_lock);
+ resumed = watchdog_resumed;
+ if (unlikely(resumed))
+ watchdog_resumed = 0;
+
wdnow = watchdog->read();
wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
watchdog_last = wdnow;
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
csnow = cs->read();
+
+ if (unlikely(resumed)) {
+ cs->wd_last = csnow;
+ continue;
+ }
+
/* Initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
@@ -136,6 +149,13 @@ static void clocksource_watchdog(unsigned long data)
}
spin_unlock(&watchdog_lock);
}
+static void clocksource_resume_watchdog(void)
+{
+ spin_lock(&watchdog_lock);
+ watchdog_resumed = 1;
+ spin_unlock(&watchdog_lock);
+}
+
static void clocksource_check_watchdog(struct clocksource *cs)
{
struct clocksource *cse;
@@ -182,9 +202,34 @@ static void clocksource_check_watchdog(struct clocksource *cs)
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
+
+static inline void clocksource_resume_watchdog(void) { }
#endif
/**
+ * clocksource_resume - resume the clocksource(s)
+ */
+void clocksource_resume(void)
+{
+ struct list_head *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+
+ list_for_each(tmp, &clocksource_list) {
+ struct clocksource *cs;
+
+ cs = list_entry(tmp, struct clocksource, list);
+ if (cs->resume)
+ cs->resume();
+ }
+
+ clocksource_resume_watchdog();
+
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+
+/**
* clocksource_get_next - Returns the selected clocksource
*
*/
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index b734ca4bc75..8bbcfb77f7d 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -65,7 +65,7 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
#endif
SEQ_printf(m, "\n");
- SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n",
+ SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n",
(unsigned long long)ktime_to_ns(timer->expires),
(unsigned long long)(ktime_to_ns(timer->expires) - now));
}
@@ -111,14 +111,14 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
SEQ_printf(m, " .index: %d\n",
base->index);
- SEQ_printf(m, " .resolution: %Ld nsecs\n",
+ SEQ_printf(m, " .resolution: %Lu nsecs\n",
(unsigned long long)ktime_to_ns(base->resolution));
SEQ_printf(m, " .get_time: ");
print_name_offset(m, base->get_time);
SEQ_printf(m, "\n");
#ifdef CONFIG_HIGH_RES_TIMERS
- SEQ_printf(m, " .offset: %Ld nsecs\n",
- ktime_to_ns(base->offset));
+ SEQ_printf(m, " .offset: %Lu nsecs\n",
+ (unsigned long long) ktime_to_ns(base->offset));
#endif
SEQ_printf(m, "active timers:\n");
print_active_timers(m, base, now);
@@ -135,10 +135,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
print_base(m, cpu_base->clock_base + i, now);
}
#define P(x) \
- SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x))
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(cpu_base->x))
#define P_ns(x) \
- SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
- (u64)(ktime_to_ns(cpu_base->x)))
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(cpu_base->x)))
#ifdef CONFIG_HIGH_RES_TIMERS
P_ns(expires_next);
@@ -150,10 +151,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
#ifdef CONFIG_TICK_ONESHOT
# define P(x) \
- SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x))
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(ts->x))
# define P_ns(x) \
- SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
- (u64)(ktime_to_ns(ts->x)))
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(ts->x)))
{
struct tick_sched *ts = tick_get_tick_sched(cpu);
P(nohz_mode);
@@ -167,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P(last_jiffies);
P(next_jiffies);
P_ns(idle_expires);
- SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies);
+ SEQ_printf(m, "jiffies: %Lu\n",
+ (unsigned long long)jiffies);
}
#endif
diff --git a/kernel/timer.c b/kernel/timer.c
index 7a6448340f9..59a28b1752f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -92,24 +92,24 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
{
- return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
+ return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG);
}
static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
{
- return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
+ return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG);
}
static inline void timer_set_deferrable(struct timer_list *timer)
{
- timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
- TBASE_DEFERRABLE_FLAG));
+ timer->base = (tvec_base_t *)((unsigned long)timer->base |
+ TBASE_DEFERRABLE_FLAG);
}
static inline void
timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
{
- timer->base = (tvec_base_t *)((unsigned long)(new_base) |
+ timer->base = (tvec_base_t *)((unsigned long)new_base |
tbase_get_deferrable(timer->base));
}
@@ -1293,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
long cpu = (long)hcpu;
switch(action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
if (init_timers_cpu(cpu) < 0)
return NOTIFY_BAD;
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
migrate_timers(cpu);
break;
#endif
@@ -1497,6 +1499,8 @@ unregister_time_interpolator(struct time_interpolator *ti)
prev = &curr->next;
}
+ clocksource_resume();
+
write_seqlock_irqsave(&xtime_lock, flags);
if (ti == time_interpolator) {
/* we lost the best time-interpolator: */
diff --git a/kernel/wait.c b/kernel/wait.c
index 59a82f63275..444ddbfaefc 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -61,7 +61,7 @@ EXPORT_SYMBOL(remove_wait_queue);
* The spin_unlock() itself is semi-permeable and only protects
* one way (it only protects stuff inside the critical region and
* stops them from bleeding out - it would still allow subsequent
- * loads to move into the the critical region).
+ * loads to move into the critical region).
*/
void fastcall
prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6fa5e63085..fb56fedd5c0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -36,30 +36,20 @@
/*
* The per-CPU workqueue (if single thread, we always use the first
* possible cpu).
- *
- * The sequence counters are for flush_scheduled_work(). It wants to wait
- * until all currently-scheduled works are completed, but it doesn't
- * want to be livelocked by new, incoming ones. So it waits until
- * remove_sequence is >= the insert_sequence which pertained when
- * flush_scheduled_work() was called.
*/
struct cpu_workqueue_struct {
spinlock_t lock;
- long remove_sequence; /* Least-recently added (next to run) */
- long insert_sequence; /* Next to add */
-
struct list_head worklist;
wait_queue_head_t more_work;
- wait_queue_head_t work_done;
+ struct work_struct *current_work;
struct workqueue_struct *wq;
struct task_struct *thread;
+ int should_stop;
int run_depth; /* Detect run_workqueue() recursion depth */
-
- int freezeable; /* Freeze the thread during suspend */
} ____cacheline_aligned;
/*
@@ -68,8 +58,10 @@ struct cpu_workqueue_struct {
*/
struct workqueue_struct {
struct cpu_workqueue_struct *cpu_wq;
+ struct list_head list;
const char *name;
- struct list_head list; /* Empty if single thread */
+ int singlethread;
+ int freezeable; /* Freeze threads during suspend */
};
/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
@@ -77,106 +69,68 @@ struct workqueue_struct {
static DEFINE_MUTEX(workqueue_mutex);
static LIST_HEAD(workqueues);
-static int singlethread_cpu;
+static int singlethread_cpu __read_mostly;
+static cpumask_t cpu_singlethread_map __read_mostly;
+/* optimization, we could use cpu_possible_map */
+static cpumask_t cpu_populated_map __read_mostly;
/* If it's single threaded, it isn't in the list of workqueues. */
static inline int is_single_threaded(struct workqueue_struct *wq)
{
- return list_empty(&wq->list);
+ return wq->singlethread;
+}
+
+static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
+{
+ return is_single_threaded(wq)
+ ? &cpu_singlethread_map : &cpu_populated_map;
+}
+
+static
+struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
+{
+ if (unlikely(is_single_threaded(wq)))
+ cpu = singlethread_cpu;
+ return per_cpu_ptr(wq->cpu_wq, cpu);
}
/*
* Set the workqueue on which a work item is to be run
* - Must *only* be called if the pending flag is set
*/
-static inline void set_wq_data(struct work_struct *work, void *wq)
+static inline void set_wq_data(struct work_struct *work,
+ struct cpu_workqueue_struct *cwq)
{
unsigned long new;
BUG_ON(!work_pending(work));
- new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
+ new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
atomic_long_set(&work->data, new);
}
-static inline void *get_wq_data(struct work_struct *work)
+static inline
+struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
{
return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
}
-static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
+static void insert_work(struct cpu_workqueue_struct *cwq,
+ struct work_struct *work, int tail)
{
- int ret = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&cwq->lock, flags);
+ set_wq_data(work, cwq);
/*
- * We need to re-validate the work info after we've gotten
- * the cpu_workqueue lock. We can run the work now iff:
- *
- * - the wq_data still matches the cpu_workqueue_struct
- * - AND the work is still marked pending
- * - AND the work is still on a list (which will be this
- * workqueue_struct list)
- *
- * All these conditions are important, because we
- * need to protect against the work being run right
- * now on another CPU (all but the last one might be
- * true if it's currently running and has not been
- * released yet, for example).
+ * Ensure that we get the right work->data if we see the
+ * result of list_add() below, see try_to_grab_pending().
*/
- if (get_wq_data(work) == cwq
- && work_pending(work)
- && !list_empty(&work->entry)) {
- work_func_t f = work->func;
- list_del_init(&work->entry);
- spin_unlock_irqrestore(&cwq->lock, flags);
-
- if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
- work_release(work);
- f(work);
-
- spin_lock_irqsave(&cwq->lock, flags);
- cwq->remove_sequence++;
- wake_up(&cwq->work_done);
- ret = 1;
- }
- spin_unlock_irqrestore(&cwq->lock, flags);
- return ret;
-}
-
-/**
- * run_scheduled_work - run scheduled work synchronously
- * @work: work to run
- *
- * This checks if the work was pending, and runs it
- * synchronously if so. It returns a boolean to indicate
- * whether it had any scheduled work to run or not.
- *
- * NOTE! This _only_ works for normal work_structs. You
- * CANNOT use this for delayed work, because the wq data
- * for delayed work will not point properly to the per-
- * CPU workqueue struct, but will change!
- */
-int fastcall run_scheduled_work(struct work_struct *work)
-{
- for (;;) {
- struct cpu_workqueue_struct *cwq;
-
- if (!work_pending(work))
- return 0;
- if (list_empty(&work->entry))
- return 0;
- /* NOTE! This depends intimately on __queue_work! */
- cwq = get_wq_data(work);
- if (!cwq)
- return 0;
- if (__run_work(cwq, work))
- return 1;
- }
+ smp_wmb();
+ if (tail)
+ list_add_tail(&work->entry, &cwq->worklist);
+ else
+ list_add(&work->entry, &cwq->worklist);
+ wake_up(&cwq->more_work);
}
-EXPORT_SYMBOL(run_scheduled_work);
/* Preempt must be disabled. */
static void __queue_work(struct cpu_workqueue_struct *cwq,
@@ -185,10 +139,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
unsigned long flags;
spin_lock_irqsave(&cwq->lock, flags);
- set_wq_data(work, cwq);
- list_add_tail(&work->entry, &cwq->worklist);
- cwq->insert_sequence++;
- wake_up(&cwq->more_work);
+ insert_work(cwq, work, 1);
spin_unlock_irqrestore(&cwq->lock, flags);
}
@@ -204,16 +155,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
*/
int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
- int ret = 0, cpu = get_cpu();
+ int ret = 0;
if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- if (unlikely(is_single_threaded(wq)))
- cpu = singlethread_cpu;
BUG_ON(!list_empty(&work->entry));
- __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+ __queue_work(wq_per_cpu(wq, get_cpu()), work);
+ put_cpu();
ret = 1;
}
- put_cpu();
return ret;
}
EXPORT_SYMBOL_GPL(queue_work);
@@ -221,13 +170,10 @@ EXPORT_SYMBOL_GPL(queue_work);
void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
- struct workqueue_struct *wq = get_wq_data(&dwork->work);
- int cpu = smp_processor_id();
+ struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
+ struct workqueue_struct *wq = cwq->wq;
- if (unlikely(is_single_threaded(wq)))
- cpu = singlethread_cpu;
-
- __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
+ __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
}
/**
@@ -241,27 +187,11 @@ void delayed_work_timer_fn(unsigned long __data)
int fastcall queue_delayed_work(struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
- int ret = 0;
- struct timer_list *timer = &dwork->timer;
- struct work_struct *work = &dwork->work;
-
- timer_stats_timer_set_start_info(timer);
+ timer_stats_timer_set_start_info(&dwork->timer);
if (delay == 0)
- return queue_work(wq, work);
-
- if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- BUG_ON(timer_pending(timer));
- BUG_ON(!list_empty(&work->entry));
+ return queue_work(wq, &dwork->work);
- /* This stores wq for the moment, for the timer_fn */
- set_wq_data(work, wq);
- timer->expires = jiffies + delay;
- timer->data = (unsigned long)dwork;
- timer->function = delayed_work_timer_fn;
- add_timer(timer);
- ret = 1;
- }
- return ret;
+ return queue_delayed_work_on(-1, wq, dwork, delay);
}
EXPORT_SYMBOL_GPL(queue_delayed_work);
@@ -285,12 +215,16 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));
- /* This stores wq for the moment, for the timer_fn */
- set_wq_data(work, wq);
+ /* This stores cwq for the moment, for the timer_fn */
+ set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
timer->expires = jiffies + delay;
timer->data = (unsigned long)dwork;
timer->function = delayed_work_timer_fn;
- add_timer_on(timer, cpu);
+
+ if (unlikely(cpu >= 0))
+ add_timer_on(timer, cpu);
+ else
+ add_timer(timer);
ret = 1;
}
return ret;
@@ -299,13 +233,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
- unsigned long flags;
-
- /*
- * Keep taking off work from the queue until
- * done.
- */
- spin_lock_irqsave(&cwq->lock, flags);
+ spin_lock_irq(&cwq->lock);
cwq->run_depth++;
if (cwq->run_depth > 3) {
/* morton gets to eat his hat */
@@ -318,12 +246,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
struct work_struct, entry);
work_func_t f = work->func;
+ cwq->current_work = work;
list_del_init(cwq->worklist.next);
- spin_unlock_irqrestore(&cwq->lock, flags);
+ spin_unlock_irq(&cwq->lock);
BUG_ON(get_wq_data(work) != cwq);
- if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
- work_release(work);
+ work_clear_pending(work);
f(work);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
@@ -337,63 +265,81 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
dump_stack();
}
- spin_lock_irqsave(&cwq->lock, flags);
- cwq->remove_sequence++;
- wake_up(&cwq->work_done);
+ spin_lock_irq(&cwq->lock);
+ cwq->current_work = NULL;
}
cwq->run_depth--;
- spin_unlock_irqrestore(&cwq->lock, flags);
+ spin_unlock_irq(&cwq->lock);
+}
+
+/*
+ * NOTE: the caller must not touch *cwq if this func returns true
+ */
+static int cwq_should_stop(struct cpu_workqueue_struct *cwq)
+{
+ int should_stop = cwq->should_stop;
+
+ if (unlikely(should_stop)) {
+ spin_lock_irq(&cwq->lock);
+ should_stop = cwq->should_stop && list_empty(&cwq->worklist);
+ if (should_stop)
+ cwq->thread = NULL;
+ spin_unlock_irq(&cwq->lock);
+ }
+
+ return should_stop;
}
static int worker_thread(void *__cwq)
{
struct cpu_workqueue_struct *cwq = __cwq;
- DECLARE_WAITQUEUE(wait, current);
- struct k_sigaction sa;
- sigset_t blocked;
+ DEFINE_WAIT(wait);
- if (!cwq->freezeable)
+ if (!cwq->wq->freezeable)
current->flags |= PF_NOFREEZE;
set_user_nice(current, -5);
- /* Block and flush all signals */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /*
- * We inherited MPOL_INTERLEAVE from the booting kernel.
- * Set MPOL_DEFAULT to insure node local allocations.
- */
- numa_default_policy();
-
- /* SIG_IGN makes children autoreap: see do_notify_parent(). */
- sa.sa.sa_handler = SIG_IGN;
- sa.sa.sa_flags = 0;
- siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
- do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
+ for (;;) {
+ prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
+ if (!freezing(current) && !cwq->should_stop
+ && list_empty(&cwq->worklist))
+ schedule();
+ finish_wait(&cwq->more_work, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- if (cwq->freezeable)
- try_to_freeze();
+ try_to_freeze();
- add_wait_queue(&cwq->more_work, &wait);
- if (list_empty(&cwq->worklist))
- schedule();
- else
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&cwq->more_work, &wait);
+ if (cwq_should_stop(cwq))
+ break;
- if (!list_empty(&cwq->worklist))
- run_workqueue(cwq);
- set_current_state(TASK_INTERRUPTIBLE);
+ run_workqueue(cwq);
}
- __set_current_state(TASK_RUNNING);
+
return 0;
}
+struct wq_barrier {
+ struct work_struct work;
+ struct completion done;
+};
+
+static void wq_barrier_func(struct work_struct *work)
+{
+ struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
+ complete(&barr->done);
+}
+
+static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+ struct wq_barrier *barr, int tail)
+{
+ INIT_WORK(&barr->work, wq_barrier_func);
+ __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
+
+ init_completion(&barr->done);
+
+ insert_work(cwq, &barr->work, tail);
+}
+
static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
{
if (cwq->thread == current) {
@@ -403,21 +349,18 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
*/
run_workqueue(cwq);
} else {
- DEFINE_WAIT(wait);
- long sequence_needed;
+ struct wq_barrier barr;
+ int active = 0;
spin_lock_irq(&cwq->lock);
- sequence_needed = cwq->insert_sequence;
-
- while (sequence_needed - cwq->remove_sequence > 0) {
- prepare_to_wait(&cwq->work_done, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&cwq->lock);
- schedule();
- spin_lock_irq(&cwq->lock);
+ if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+ insert_wq_barrier(cwq, &barr, 1);
+ active = 1;
}
- finish_wait(&cwq->work_done, &wait);
spin_unlock_irq(&cwq->lock);
+
+ if (active)
+ wait_for_completion(&barr.done);
}
}
@@ -428,151 +371,145 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
* Forces execution of the workqueue and blocks until its completion.
* This is typically used in driver shutdown handlers.
*
- * This function will sample each workqueue's current insert_sequence number and
- * will sleep until the head sequence is greater than or equal to that. This
- * means that we sleep until all works which were queued on entry have been
- * handled, but we are not livelocked by new incoming ones.
+ * We sleep until all works which were queued on entry have been handled,
+ * but we are not livelocked by new incoming ones.
*
* This function used to run the workqueues itself. Now we just wait for the
* helper threads to do it.
*/
void fastcall flush_workqueue(struct workqueue_struct *wq)
{
+ const cpumask_t *cpu_map = wq_cpu_map(wq);
+ int cpu;
+
might_sleep();
+ for_each_cpu_mask(cpu, *cpu_map)
+ flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
+}
+EXPORT_SYMBOL_GPL(flush_workqueue);
- if (is_single_threaded(wq)) {
- /* Always use first cpu's area. */
- flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
- } else {
- int cpu;
+/*
+ * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit,
+ * so this work can't be re-armed in any way.
+ */
+static int try_to_grab_pending(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq;
+ int ret = 0;
- mutex_lock(&workqueue_mutex);
- for_each_online_cpu(cpu)
- flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
- mutex_unlock(&workqueue_mutex);
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
+ return 1;
+
+ /*
+ * The queueing is in progress, or it is already queued. Try to
+ * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+ */
+
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return ret;
+
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&work->entry)) {
+ /*
+ * This work is queued, but perhaps we locked the wrong cwq.
+ * In that case we must see the new value after rmb(), see
+ * insert_work()->wmb().
+ */
+ smp_rmb();
+ if (cwq == get_wq_data(work)) {
+ list_del_init(&work->entry);
+ ret = 1;
+ }
}
+ spin_unlock_irq(&cwq->lock);
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(flush_workqueue);
-static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
- int cpu, int freezeable)
+static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
+ struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- struct task_struct *p;
+ struct wq_barrier barr;
+ int running = 0;
- spin_lock_init(&cwq->lock);
- cwq->wq = wq;
- cwq->thread = NULL;
- cwq->insert_sequence = 0;
- cwq->remove_sequence = 0;
- cwq->freezeable = freezeable;
- INIT_LIST_HEAD(&cwq->worklist);
- init_waitqueue_head(&cwq->more_work);
- init_waitqueue_head(&cwq->work_done);
+ spin_lock_irq(&cwq->lock);
+ if (unlikely(cwq->current_work == work)) {
+ insert_wq_barrier(cwq, &barr, 0);
+ running = 1;
+ }
+ spin_unlock_irq(&cwq->lock);
- if (is_single_threaded(wq))
- p = kthread_create(worker_thread, cwq, "%s", wq->name);
- else
- p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
- if (IS_ERR(p))
- return NULL;
- cwq->thread = p;
- return p;
+ if (unlikely(running))
+ wait_for_completion(&barr.done);
}
-struct workqueue_struct *__create_workqueue(const char *name,
- int singlethread, int freezeable)
+static void wait_on_work(struct work_struct *work)
{
- int cpu, destroy = 0;
+ struct cpu_workqueue_struct *cwq;
struct workqueue_struct *wq;
- struct task_struct *p;
+ const cpumask_t *cpu_map;
+ int cpu;
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
- if (!wq)
- return NULL;
+ might_sleep();
- wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
- if (!wq->cpu_wq) {
- kfree(wq);
- return NULL;
- }
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return;
- wq->name = name;
- mutex_lock(&workqueue_mutex);
- if (singlethread) {
- INIT_LIST_HEAD(&wq->list);
- p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
- if (!p)
- destroy = 1;
- else
- wake_up_process(p);
- } else {
- list_add(&wq->list, &workqueues);
- for_each_online_cpu(cpu) {
- p = create_workqueue_thread(wq, cpu, freezeable);
- if (p) {
- kthread_bind(p, cpu);
- wake_up_process(p);
- } else
- destroy = 1;
- }
- }
- mutex_unlock(&workqueue_mutex);
+ wq = cwq->wq;
+ cpu_map = wq_cpu_map(wq);
- /*
- * Was there any error during startup? If yes then clean up:
- */
- if (destroy) {
- destroy_workqueue(wq);
- wq = NULL;
- }
- return wq;
+ for_each_cpu_mask(cpu, *cpu_map)
+ wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
}
-EXPORT_SYMBOL_GPL(__create_workqueue);
-static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
+/**
+ * cancel_work_sync - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
+ *
+ * cancel_work_sync() will cancel the work if it is queued. If the work's
+ * callback appears to be running, cancel_work_sync() will block until it
+ * has completed.
+ *
+ * It is possible to use this function if the work re-queues itself. It can
+ * cancel the work even if it migrates to another workqueue, however in that
+ * case it only guarantees that work->func() has completed on the last queued
+ * workqueue.
+ *
+ * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
+ * pending, otherwise it goes into a busy-wait loop until the timer expires.
+ *
+ * The caller must ensure that workqueue_struct on which this work was last
+ * queued can't be destroyed before this function returns.
+ */
+void cancel_work_sync(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq;
- unsigned long flags;
- struct task_struct *p;
-
- cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- spin_lock_irqsave(&cwq->lock, flags);
- p = cwq->thread;
- cwq->thread = NULL;
- spin_unlock_irqrestore(&cwq->lock, flags);
- if (p)
- kthread_stop(p);
+ while (!try_to_grab_pending(work))
+ cpu_relax();
+ wait_on_work(work);
+ work_clear_pending(work);
}
+EXPORT_SYMBOL_GPL(cancel_work_sync);
/**
- * destroy_workqueue - safely terminate a workqueue
- * @wq: target workqueue
+ * cancel_rearming_delayed_work - reliably kill off a delayed work.
+ * @dwork: the delayed work struct
*
- * Safely destroy a workqueue. All work currently pending will be done first.
+ * It is possible to use this function if @dwork rearms itself via queue_work()
+ * or queue_delayed_work(). See also the comment for cancel_work_sync().
*/
-void destroy_workqueue(struct workqueue_struct *wq)
+void cancel_rearming_delayed_work(struct delayed_work *dwork)
{
- int cpu;
-
- flush_workqueue(wq);
-
- /* We don't need the distraction of CPUs appearing and vanishing. */
- mutex_lock(&workqueue_mutex);
- if (is_single_threaded(wq))
- cleanup_workqueue_thread(wq, singlethread_cpu);
- else {
- for_each_online_cpu(cpu)
- cleanup_workqueue_thread(wq, cpu);
- list_del(&wq->list);
- }
- mutex_unlock(&workqueue_mutex);
- free_percpu(wq->cpu_wq);
- kfree(wq);
+ while (!del_timer(&dwork->timer) &&
+ !try_to_grab_pending(&dwork->work))
+ cpu_relax();
+ wait_on_work(&dwork->work);
+ work_clear_pending(&dwork->work);
}
-EXPORT_SYMBOL_GPL(destroy_workqueue);
+EXPORT_SYMBOL(cancel_rearming_delayed_work);
-static struct workqueue_struct *keventd_wq;
+static struct workqueue_struct *keventd_wq __read_mostly;
/**
* schedule_work - put work task in global workqueue
@@ -638,7 +575,7 @@ int schedule_on_each_cpu(work_func_t func)
if (!works)
return -ENOMEM;
- mutex_lock(&workqueue_mutex);
+ preempt_disable(); /* CPU hotplug */
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
@@ -646,7 +583,7 @@ int schedule_on_each_cpu(work_func_t func)
set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
}
- mutex_unlock(&workqueue_mutex);
+ preempt_enable();
flush_workqueue(keventd_wq);
free_percpu(works);
return 0;
@@ -659,29 +596,6 @@ void flush_scheduled_work(void)
EXPORT_SYMBOL(flush_scheduled_work);
/**
- * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work.
- * @wq: the controlling workqueue structure
- * @dwork: the delayed work struct
- */
-void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
- struct delayed_work *dwork)
-{
- while (!cancel_delayed_work(dwork))
- flush_workqueue(wq);
-}
-EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
-
-/**
- * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work.
- * @dwork: the delayed work struct
- */
-void cancel_rearming_delayed_work(struct delayed_work *dwork)
-{
- cancel_rearming_delayed_workqueue(keventd_wq, dwork);
-}
-EXPORT_SYMBOL(cancel_rearming_delayed_work);
-
-/**
* execute_in_process_context - reliably execute the routine with user context
* @fn: the function to execute
* @ew: guaranteed storage for the execute work structure (must
@@ -728,94 +642,209 @@ int current_is_keventd(void)
}
-/* Take the work from this (downed) CPU. */
-static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
+static struct cpu_workqueue_struct *
+init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
{
struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- struct list_head list;
- struct work_struct *work;
- spin_lock_irq(&cwq->lock);
- list_replace_init(&cwq->worklist, &list);
+ cwq->wq = wq;
+ spin_lock_init(&cwq->lock);
+ INIT_LIST_HEAD(&cwq->worklist);
+ init_waitqueue_head(&cwq->more_work);
+
+ return cwq;
+}
+
+static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+{
+ struct workqueue_struct *wq = cwq->wq;
+ const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
+ struct task_struct *p;
+
+ p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
+ /*
+ * Nobody can add the work_struct to this cwq,
+ * if (caller is __create_workqueue)
+ * nobody should see this wq
+ * else // caller is CPU_UP_PREPARE
+ * cpu is not on cpu_online_map
+ * so we can abort safely.
+ */
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ cwq->thread = p;
+ cwq->should_stop = 0;
+
+ return 0;
+}
+
+static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+{
+ struct task_struct *p = cwq->thread;
- while (!list_empty(&list)) {
- printk("Taking work for %s\n", wq->name);
- work = list_entry(list.next,struct work_struct,entry);
- list_del(&work->entry);
- __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
+ if (p != NULL) {
+ if (cpu >= 0)
+ kthread_bind(p, cpu);
+ wake_up_process(p);
}
- spin_unlock_irq(&cwq->lock);
}
-/* We're holding the cpucontrol mutex here */
-static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+struct workqueue_struct *__create_workqueue(const char *name,
+ int singlethread, int freezeable)
{
- unsigned int hotcpu = (unsigned long)hcpu;
struct workqueue_struct *wq;
+ struct cpu_workqueue_struct *cwq;
+ int err = 0, cpu;
- switch (action) {
- case CPU_UP_PREPARE:
- mutex_lock(&workqueue_mutex);
- /* Create a new workqueue thread for it. */
- list_for_each_entry(wq, &workqueues, list) {
- if (!create_workqueue_thread(wq, hotcpu, 0)) {
- printk("workqueue for %i failed\n", hotcpu);
- return NOTIFY_BAD;
- }
- }
- break;
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq)
+ return NULL;
- case CPU_ONLINE:
- /* Kick off worker threads. */
- list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq;
+ wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
+ if (!wq->cpu_wq) {
+ kfree(wq);
+ return NULL;
+ }
- cwq = per_cpu_ptr(wq->cpu_wq, hotcpu);
- kthread_bind(cwq->thread, hotcpu);
- wake_up_process(cwq->thread);
- }
- mutex_unlock(&workqueue_mutex);
- break;
+ wq->name = name;
+ wq->singlethread = singlethread;
+ wq->freezeable = freezeable;
+ INIT_LIST_HEAD(&wq->list);
- case CPU_UP_CANCELED:
- list_for_each_entry(wq, &workqueues, list) {
- if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
+ if (singlethread) {
+ cwq = init_cpu_workqueue(wq, singlethread_cpu);
+ err = create_workqueue_thread(cwq, singlethread_cpu);
+ start_workqueue_thread(cwq, -1);
+ } else {
+ mutex_lock(&workqueue_mutex);
+ list_add(&wq->list, &workqueues);
+
+ for_each_possible_cpu(cpu) {
+ cwq = init_cpu_workqueue(wq, cpu);
+ if (err || !cpu_online(cpu))
continue;
- /* Unbind so it can run. */
- kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
- any_online_cpu(cpu_online_map));
- cleanup_workqueue_thread(wq, hotcpu);
+ err = create_workqueue_thread(cwq, cpu);
+ start_workqueue_thread(cwq, cpu);
}
mutex_unlock(&workqueue_mutex);
- break;
+ }
+
+ if (err) {
+ destroy_workqueue(wq);
+ wq = NULL;
+ }
+ return wq;
+}
+EXPORT_SYMBOL_GPL(__create_workqueue);
+
+static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+{
+ struct wq_barrier barr;
+ int alive = 0;
+
+ spin_lock_irq(&cwq->lock);
+ if (cwq->thread != NULL) {
+ insert_wq_barrier(cwq, &barr, 1);
+ cwq->should_stop = 1;
+ alive = 1;
+ }
+ spin_unlock_irq(&cwq->lock);
+
+ if (alive) {
+ wait_for_completion(&barr.done);
- case CPU_DOWN_PREPARE:
+ while (unlikely(cwq->thread != NULL))
+ cpu_relax();
+ /*
+ * Wait until cwq->thread unlocks cwq->lock,
+ * it won't touch *cwq after that.
+ */
+ smp_rmb();
+ spin_unlock_wait(&cwq->lock);
+ }
+}
+
+/**
+ * destroy_workqueue - safely terminate a workqueue
+ * @wq: target workqueue
+ *
+ * Safely destroy a workqueue. All work currently pending will be done first.
+ */
+void destroy_workqueue(struct workqueue_struct *wq)
+{
+ const cpumask_t *cpu_map = wq_cpu_map(wq);
+ struct cpu_workqueue_struct *cwq;
+ int cpu;
+
+ mutex_lock(&workqueue_mutex);
+ list_del(&wq->list);
+ mutex_unlock(&workqueue_mutex);
+
+ for_each_cpu_mask(cpu, *cpu_map) {
+ cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+ cleanup_workqueue_thread(cwq, cpu);
+ }
+
+ free_percpu(wq->cpu_wq);
+ kfree(wq);
+}
+EXPORT_SYMBOL_GPL(destroy_workqueue);
+
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct cpu_workqueue_struct *cwq;
+ struct workqueue_struct *wq;
+
+ action &= ~CPU_TASKS_FROZEN;
+
+ switch (action) {
+ case CPU_LOCK_ACQUIRE:
mutex_lock(&workqueue_mutex);
- break;
+ return NOTIFY_OK;
- case CPU_DOWN_FAILED:
+ case CPU_LOCK_RELEASE:
mutex_unlock(&workqueue_mutex);
- break;
+ return NOTIFY_OK;
- case CPU_DEAD:
- list_for_each_entry(wq, &workqueues, list)
- cleanup_workqueue_thread(wq, hotcpu);
- list_for_each_entry(wq, &workqueues, list)
- take_over_work(wq, hotcpu);
- mutex_unlock(&workqueue_mutex);
- break;
+ case CPU_UP_PREPARE:
+ cpu_set(cpu, cpu_populated_map);
+ }
+
+ list_for_each_entry(wq, &workqueues, list) {
+ cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (!create_workqueue_thread(cwq, cpu))
+ break;
+ printk(KERN_ERR "workqueue for %i failed\n", cpu);
+ return NOTIFY_BAD;
+
+ case CPU_ONLINE:
+ start_workqueue_thread(cwq, cpu);
+ break;
+
+ case CPU_UP_CANCELED:
+ start_workqueue_thread(cwq, -1);
+ case CPU_DEAD:
+ cleanup_workqueue_thread(cwq, cpu);
+ break;
+ }
}
return NOTIFY_OK;
}
-void init_workqueues(void)
+void __init init_workqueues(void)
{
+ cpu_populated_map = cpu_online_map;
singlethread_cpu = first_cpu(cpu_possible_map);
+ cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);
hotcpu_notifier(workqueue_cpu_callback, 0);
keventd_wq = create_workqueue("events");
BUG_ON(!keventd_wq);
}
-