aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c222
-rw-r--r--kernel/cgroup.c312
-rw-r--r--kernel/cpu.c5
-rw-r--r--kernel/cpuset.c357
-rw-r--r--kernel/delayacct.c16
-rw-r--r--kernel/exec_domain.c1
-rw-r--r--kernel/exit.c116
-rw-r--r--kernel/fork.c118
-rw-r--r--kernel/irq/chip.c12
-rw-r--r--kernel/irq/manage.c7
-rw-r--r--kernel/kallsyms.c2
-rw-r--r--kernel/kexec.c104
-rw-r--r--kernel/kmod.c9
-rw-r--r--kernel/kprobes.c132
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/marker.c25
-rw-r--r--kernel/ns_cgroup.c8
-rw-r--r--kernel/nsproxy.c8
-rw-r--r--kernel/panic.c22
-rw-r--r--kernel/pid.c10
-rw-r--r--kernel/pid_namespace.c10
-rw-r--r--kernel/posix-timers.c21
-rw-r--r--kernel/power/main.c7
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/printk.c17
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/relay.c170
-rw-r--r--kernel/res_counter.c48
-rw-r--r--kernel/sched.c36
-rw-r--r--kernel/signal.c179
-rw-r--r--kernel/smp.c4
-rw-r--r--kernel/softirq.c3
-rw-r--r--kernel/softlockup.c25
-rw-r--r--kernel/sys.c35
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c170
-rw-r--r--kernel/sysctl_check.c2
-rw-r--r--kernel/taskstats.c2
-rw-r--r--kernel/trace/trace.c3
-rw-r--r--kernel/trace/trace_irqsoff.c8
-rw-r--r--kernel/trace/trace_sched_wakeup.c27
-rw-r--r--kernel/trace/trace_sysprof.c2
-rw-r--r--kernel/tsacct.c25
-rw-r--r--kernel/workqueue.c112
46 files changed, 1477 insertions, 930 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 15ab63ffe64..54f69837d35 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,7 +2,7 @@
# Makefile for the linux kernel.
#
-obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
+obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
@@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
CFLAGS_REMOVE_sched.o = -mno-spe -pg
endif
+obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
diff --git a/kernel/acct.c b/kernel/acct.c
index 91e1cfd734d..dd68b905941 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,37 +75,39 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct,
+ struct pid_namespace *ns, struct file *);
/*
* This structure is used so that all the data protected by lock
* can be placed in the same cache line as the lock. This primes
* the cache line to have the data after getting the lock.
*/
-struct acct_glbs {
- spinlock_t lock;
+struct bsd_acct_struct {
volatile int active;
volatile int needcheck;
struct file *file;
struct pid_namespace *ns;
struct timer_list timer;
+ struct list_head list;
};
-static struct acct_glbs acct_globals __cacheline_aligned =
- {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
+static DEFINE_SPINLOCK(acct_lock);
+static LIST_HEAD(acct_list);
/*
* Called whenever the timer says to check the free space.
*/
-static void acct_timeout(unsigned long unused)
+static void acct_timeout(unsigned long x)
{
- acct_globals.needcheck = 1;
+ struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
+ acct->needcheck = 1;
}
/*
* Check the amount of free space and suspend/resume accordingly.
*/
-static int check_free_space(struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
{
struct kstatfs sbuf;
int res;
@@ -113,11 +115,11 @@ static int check_free_space(struct file *file)
sector_t resume;
sector_t suspend;
- spin_lock(&acct_globals.lock);
- res = acct_globals.active;
- if (!file || !acct_globals.needcheck)
+ spin_lock(&acct_lock);
+ res = acct->active;
+ if (!file || !acct->needcheck)
goto out;
- spin_unlock(&acct_globals.lock);
+ spin_unlock(&acct_lock);
/* May block */
if (vfs_statfs(file->f_path.dentry, &sbuf))
@@ -136,35 +138,35 @@ static int check_free_space(struct file *file)
act = 0;
/*
- * If some joker switched acct_globals.file under us we'ld better be
+ * If some joker switched acct->file under us we'ld better be
* silent and _not_ touch anything.
*/
- spin_lock(&acct_globals.lock);
- if (file != acct_globals.file) {
+ spin_lock(&acct_lock);
+ if (file != acct->file) {
if (act)
res = act>0;
goto out;
}
- if (acct_globals.active) {
+ if (acct->active) {
if (act < 0) {
- acct_globals.active = 0;
+ acct->active = 0;
printk(KERN_INFO "Process accounting paused\n");
}
} else {
if (act > 0) {
- acct_globals.active = 1;
+ acct->active = 1;
printk(KERN_INFO "Process accounting resumed\n");
}
}
- del_timer(&acct_globals.timer);
- acct_globals.needcheck = 0;
- acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
- add_timer(&acct_globals.timer);
- res = acct_globals.active;
+ del_timer(&acct->timer);
+ acct->needcheck = 0;
+ acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+ add_timer(&acct->timer);
+ res = acct->active;
out:
- spin_unlock(&acct_globals.lock);
+ spin_unlock(&acct_lock);
return res;
}
@@ -172,39 +174,41 @@ out:
* Close the old accounting file (if currently open) and then replace
* it with file (if non-NULL).
*
- * NOTE: acct_globals.lock MUST be held on entry and exit.
+ * NOTE: acct_lock MUST be held on entry and exit.
*/
-static void acct_file_reopen(struct file *file)
+static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
+ struct pid_namespace *ns)
{
struct file *old_acct = NULL;
struct pid_namespace *old_ns = NULL;
- if (acct_globals.file) {
- old_acct = acct_globals.file;
- old_ns = acct_globals.ns;
- del_timer(&acct_globals.timer);
- acct_globals.active = 0;
- acct_globals.needcheck = 0;
- acct_globals.file = NULL;
+ if (acct->file) {
+ old_acct = acct->file;
+ old_ns = acct->ns;
+ del_timer(&acct->timer);
+ acct->active = 0;
+ acct->needcheck = 0;
+ acct->file = NULL;
+ acct->ns = NULL;
+ list_del(&acct->list);
}
if (file) {
- acct_globals.file = file;
- acct_globals.ns = get_pid_ns(task_active_pid_ns(current));
- acct_globals.needcheck = 0;
- acct_globals.active = 1;
+ acct->file = file;
+ acct->ns = ns;
+ acct->needcheck = 0;
+ acct->active = 1;
+ list_add(&acct->list, &acct_list);
/* It's been deleted if it was used before so this is safe */
- init_timer(&acct_globals.timer);
- acct_globals.timer.function = acct_timeout;
- acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
- add_timer(&acct_globals.timer);
+ setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
+ acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+ add_timer(&acct->timer);
}
if (old_acct) {
mnt_unpin(old_acct->f_path.mnt);
- spin_unlock(&acct_globals.lock);
- do_acct_process(old_ns, old_acct);
+ spin_unlock(&acct_lock);
+ do_acct_process(acct, old_ns, old_acct);
filp_close(old_acct, NULL);
- put_pid_ns(old_ns);
- spin_lock(&acct_globals.lock);
+ spin_lock(&acct_lock);
}
}
@@ -212,6 +216,8 @@ static int acct_on(char *name)
{
struct file *file;
int error;
+ struct pid_namespace *ns;
+ struct bsd_acct_struct *acct = NULL;
/* Difference from BSD - they don't do O_APPEND */
file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
@@ -228,18 +234,34 @@ static int acct_on(char *name)
return -EIO;
}
+ ns = task_active_pid_ns(current);
+ if (ns->bacct == NULL) {
+ acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+ if (acct == NULL) {
+ filp_close(file, NULL);
+ return -ENOMEM;
+ }
+ }
+
error = security_acct(file);
if (error) {
+ kfree(acct);
filp_close(file, NULL);
return error;
}
- spin_lock(&acct_globals.lock);
+ spin_lock(&acct_lock);
+ if (ns->bacct == NULL) {
+ ns->bacct = acct;
+ acct = NULL;
+ }
+
mnt_pin(file->f_path.mnt);
- acct_file_reopen(file);
- spin_unlock(&acct_globals.lock);
+ acct_file_reopen(ns->bacct, file, ns);
+ spin_unlock(&acct_lock);
mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+ kfree(acct);
return 0;
}
@@ -269,11 +291,17 @@ asmlinkage long sys_acct(const char __user *name)
error = acct_on(tmp);
putname(tmp);
} else {
+ struct bsd_acct_struct *acct;
+
+ acct = task_active_pid_ns(current)->bacct;
+ if (acct == NULL)
+ return 0;
+
error = security_acct(NULL);
if (!error) {
- spin_lock(&acct_globals.lock);
- acct_file_reopen(NULL);
- spin_unlock(&acct_globals.lock);
+ spin_lock(&acct_lock);
+ acct_file_reopen(acct, NULL, NULL);
+ spin_unlock(&acct_lock);
}
}
return error;
@@ -288,10 +316,16 @@ asmlinkage long sys_acct(const char __user *name)
*/
void acct_auto_close_mnt(struct vfsmount *m)
{
- spin_lock(&acct_globals.lock);
- if (acct_globals.file && acct_globals.file->f_path.mnt == m)
- acct_file_reopen(NULL);
- spin_unlock(&acct_globals.lock);
+ struct bsd_acct_struct *acct;
+
+ spin_lock(&acct_lock);
+restart:
+ list_for_each_entry(acct, &acct_list, list)
+ if (acct->file && acct->file->f_path.mnt == m) {
+ acct_file_reopen(acct, NULL, NULL);
+ goto restart;
+ }
+ spin_unlock(&acct_lock);
}
/**
@@ -303,12 +337,31 @@ void acct_auto_close_mnt(struct vfsmount *m)
*/
void acct_auto_close(struct super_block *sb)
{
- spin_lock(&acct_globals.lock);
- if (acct_globals.file &&
- acct_globals.file->f_path.mnt->mnt_sb == sb) {
- acct_file_reopen(NULL);
+ struct bsd_acct_struct *acct;
+
+ spin_lock(&acct_lock);
+restart:
+ list_for_each_entry(acct, &acct_list, list)
+ if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+ acct_file_reopen(acct, NULL, NULL);
+ goto restart;
+ }
+ spin_unlock(&acct_lock);
+}
+
+void acct_exit_ns(struct pid_namespace *ns)
+{
+ struct bsd_acct_struct *acct;
+
+ spin_lock(&acct_lock);
+ acct = ns->bacct;
+ if (acct != NULL) {
+ if (acct->file != NULL)
+ acct_file_reopen(acct, NULL, NULL);
+
+ kfree(acct);
}
- spin_unlock(&acct_globals.lock);
+ spin_unlock(&acct_lock);
}
/*
@@ -425,7 +478,8 @@ static u32 encode_float(u64 value)
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
-static void do_acct_process(struct pid_namespace *ns, struct file *file)
+static void do_acct_process(struct bsd_acct_struct *acct,
+ struct pid_namespace *ns, struct file *file)
{
struct pacct_struct *pacct = &current->signal->pacct;
acct_t ac;
@@ -440,7 +494,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
* First check to see if there is enough free_space to continue
* the process accounting system.
*/
- if (!check_free_space(file))
+ if (!check_free_space(acct, file))
return;
/*
@@ -577,34 +631,46 @@ void acct_collect(long exitcode, int group_dead)
spin_unlock_irq(&current->sighand->siglock);
}
-/**
- * acct_process - now just a wrapper around do_acct_process
- * @exitcode: task exit code
- *
- * handles process accounting for an exiting task
- */
-void acct_process(void)
+static void acct_process_in_ns(struct pid_namespace *ns)
{
struct file *file = NULL;
- struct pid_namespace *ns;
+ struct bsd_acct_struct *acct;
+ acct = ns->bacct;
/*
* accelerate the common fastpath:
*/
- if (!acct_globals.file)
+ if (!acct || !acct->file)
return;
- spin_lock(&acct_globals.lock);
- file = acct_globals.file;
+ spin_lock(&acct_lock);
+ file = acct->file;
if (unlikely(!file)) {
- spin_unlock(&acct_globals.lock);
+ spin_unlock(&acct_lock);
return;
}
get_file(file);
- ns = get_pid_ns(acct_globals.ns);
- spin_unlock(&acct_globals.lock);
+ spin_unlock(&acct_lock);
- do_acct_process(ns, file);
+ do_acct_process(acct, ns, file);
fput(file);
- put_pid_ns(ns);
+}
+
+/**
+ * acct_process - now just a wrapper around acct_process_in_ns,
+ * which in turn is a wrapper around do_acct_process.
+ *
+ * handles process accounting for an exiting task
+ */
+void acct_process(void)
+{
+ struct pid_namespace *ns;
+
+ /*
+ * This loop is safe lockless, since current is still
+ * alive and holds its namespace, which in turn holds
+ * its parent.
+ */
+ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
+ acct_process_in_ns(ns);
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15ac0e1e4f4..657f8f8d93a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,6 +45,7 @@
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hash.h>
+#include <linux/namei.h>
#include <asm/atomic.h>
@@ -89,11 +90,7 @@ struct cgroupfs_root {
/* Hierarchy-specific flags */
unsigned long flags;
- /* The path to use for release notifications. No locking
- * between setting and use - so if userspace updates this
- * while child cgroups exist, you could miss a
- * notification. We ensure that it's always a valid
- * NUL-terminated string */
+ /* The path to use for release notifications. */
char release_agent_path[PATH_MAX];
};
@@ -118,7 +115,7 @@ static int root_count;
* extra work in the fork/exit path if none of the subsystems need to
* be called.
*/
-static int need_forkexit_callback;
+static int need_forkexit_callback __read_mostly;
static int need_mm_owner_callback __read_mostly;
/* convenient tests for these bits */
@@ -220,7 +217,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
* task until after the first call to cgroup_iter_start(). This
* reduces the fork()/exit() overhead for people who have cgroups
* compiled into their kernel but not actually in use */
-static int use_task_css_set_links;
+static int use_task_css_set_links __read_mostly;
/* When we create or destroy a css_set, the operation simply
* takes/releases a reference count on all the cgroups referenced
@@ -241,17 +238,20 @@ static int use_task_css_set_links;
*/
static void unlink_css_set(struct css_set *cg)
{
+ struct cg_cgroup_link *link;
+ struct cg_cgroup_link *saved_link;
+
write_lock(&css_set_lock);
hlist_del(&cg->hlist);
css_set_count--;
- while (!list_empty(&cg->cg_links)) {
- struct cg_cgroup_link *link;
- link = list_entry(cg->cg_links.next,
- struct cg_cgroup_link, cg_link_list);
+
+ list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+ cg_link_list) {
list_del(&link->cg_link_list);
list_del(&link->cgrp_link_list);
kfree(link);
}
+
write_unlock(&css_set_lock);
}
@@ -363,15 +363,14 @@ static struct css_set *find_existing_css_set(
static int allocate_cg_links(int count, struct list_head *tmp)
{
struct cg_cgroup_link *link;
+ struct cg_cgroup_link *saved_link;
int i;
INIT_LIST_HEAD(tmp);
for (i = 0; i < count; i++) {
link = kmalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
- while (!list_empty(tmp)) {
- link = list_entry(tmp->next,
- struct cg_cgroup_link,
- cgrp_link_list);
+ list_for_each_entry_safe(link, saved_link, tmp,
+ cgrp_link_list) {
list_del(&link->cgrp_link_list);
kfree(link);
}
@@ -384,11 +383,10 @@ static int allocate_cg_links(int count, struct list_head *tmp)
static void free_cg_links(struct list_head *tmp)
{
- while (!list_empty(tmp)) {
- struct cg_cgroup_link *link;
- link = list_entry(tmp->next,
- struct cg_cgroup_link,
- cgrp_link_list);
+ struct cg_cgroup_link *link;
+ struct cg_cgroup_link *saved_link;
+
+ list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
list_del(&link->cgrp_link_list);
kfree(link);
}
@@ -415,11 +413,11 @@ static struct css_set *find_css_set(
/* First see if we already have a cgroup group that matches
* the desired set */
- write_lock(&css_set_lock);
+ read_lock(&css_set_lock);
res = find_existing_css_set(oldcg, cgrp, template);
if (res)
get_css_set(res);
- write_unlock(&css_set_lock);
+ read_unlock(&css_set_lock);
if (res)
return res;
@@ -507,10 +505,6 @@ static struct css_set *find_css_set(
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
- * The cgroup_common_file_write handler for operations that modify
- * the cgroup hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cgroup modifications across the system.
- *
* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
* (usually) take cgroup_mutex. These are the two most performance
* critical pieces of code here. The exception occurs on cgroup_exit(),
@@ -1093,6 +1087,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
struct cgroupfs_root *root = sb->s_fs_info;
struct cgroup *cgrp = &root->top_cgroup;
int ret;
+ struct cg_cgroup_link *link;
+ struct cg_cgroup_link *saved_link;
BUG_ON(!root);
@@ -1112,10 +1108,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
* root cgroup
*/
write_lock(&css_set_lock);
- while (!list_empty(&cgrp->css_sets)) {
- struct cg_cgroup_link *link;
- link = list_entry(cgrp->css_sets.next,
- struct cg_cgroup_link, cgrp_link_list);
+
+ list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
+ cgrp_link_list) {
list_del(&link->cg_link_list);
list_del(&link->cgrp_link_list);
kfree(link);
@@ -1281,18 +1276,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with
- * cgroup_mutex, may take task_lock of task
+ * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
+ * held. May take task_lock of task
*/
-static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
{
- pid_t pid;
struct task_struct *tsk;
int ret;
- if (sscanf(pidbuf, "%d", &pid) != 1)
- return -EIO;
-
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
@@ -1318,6 +1309,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
return ret;
}
+static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+{
+ int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+ ret = attach_task_by_pid(cgrp, pid);
+ cgroup_unlock();
+ return ret;
+}
+
/* The various types of files and directories in a cgroup file system */
enum cgroup_filetype {
FILE_ROOT,
@@ -1327,12 +1328,54 @@ enum cgroup_filetype {
FILE_RELEASE_AGENT,
};
+/**
+ * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
+ * @cgrp: the cgroup to be checked for liveness
+ *
+ * On success, returns true; the lock should be later released with
+ * cgroup_unlock(). On failure returns false with no lock held.
+ */
+bool cgroup_lock_live_group(struct cgroup *cgrp)
+{
+ mutex_lock(&cgroup_mutex);
+ if (cgroup_is_removed(cgrp)) {
+ mutex_unlock(&cgroup_mutex);
+ return false;
+ }
+ return true;
+}
+
+static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+ strcpy(cgrp->root->release_agent_path, buffer);
+ cgroup_unlock();
+ return 0;
+}
+
+static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *seq)
+{
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+ seq_puts(seq, cgrp->root->release_agent_path);
+ seq_putc(seq, '\n');
+ cgroup_unlock();
+ return 0;
+}
+
+/* A buffer size big enough for numbers or short strings */
+#define CGROUP_LOCAL_BUFFER_SIZE 64
+
static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
struct file *file,
const char __user *userbuf,
size_t nbytes, loff_t *unused_ppos)
{
- char buffer[64];
+ char buffer[CGROUP_LOCAL_BUFFER_SIZE];
int retval = 0;
char *end;
@@ -1361,68 +1404,36 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
return retval;
}
-static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
- struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *unused_ppos)
{
- enum cgroup_filetype type = cft->private;
- char *buffer;
+ char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
int retval = 0;
+ size_t max_bytes = cft->max_write_len;
+ char *buffer = local_buffer;
- if (nbytes >= PATH_MAX)
+ if (!max_bytes)
+ max_bytes = sizeof(local_buffer) - 1;
+ if (nbytes >= max_bytes)
return -E2BIG;
-
- /* +1 for nul-terminator */
- buffer = kmalloc(nbytes + 1, GFP_KERNEL);
- if (buffer == NULL)
- return -ENOMEM;
-
- if (copy_from_user(buffer, userbuf, nbytes)) {
- retval = -EFAULT;
- goto out1;
+ /* Allocate a dynamic buffer if we need one */
+ if (nbytes >= sizeof(local_buffer)) {
+ buffer = kmalloc(nbytes + 1, GFP_KERNEL);
+ if (buffer == NULL)
+ return -ENOMEM;
}
- buffer[nbytes] = 0; /* nul-terminate */
- strstrip(buffer); /* strip -just- trailing whitespace */
-
- mutex_lock(&cgroup_mutex);
+ if (nbytes && copy_from_user(buffer, userbuf, nbytes))
+ return -EFAULT;
- /*
- * This was already checked for in cgroup_file_write(), but
- * check again now we're holding cgroup_mutex.
- */
- if (cgroup_is_removed(cgrp)) {
- retval = -ENODEV;
- goto out2;
- }
-
- switch (type) {
- case FILE_TASKLIST:
- retval = attach_task_by_pid(cgrp, buffer);
- break;
- case FILE_NOTIFY_ON_RELEASE:
- clear_bit(CGRP_RELEASABLE, &cgrp->flags);
- if (simple_strtoul(buffer, NULL, 10) != 0)
- set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
- else
- clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
- break;
- case FILE_RELEASE_AGENT:
- BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
- strcpy(cgrp->root->release_agent_path, buffer);
- break;
- default:
- retval = -EINVAL;
- goto out2;
- }
-
- if (retval == 0)
+ buffer[nbytes] = 0; /* nul-terminate */
+ strstrip(buffer);
+ retval = cft->write_string(cgrp, cft, buffer);
+ if (!retval)
retval = nbytes;
-out2:
- mutex_unlock(&cgroup_mutex);
-out1:
- kfree(buffer);
+ if (buffer != local_buffer)
+ kfree(buffer);
return retval;
}
@@ -1438,6 +1449,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
return cft->write(cgrp, cft, file, buf, nbytes, ppos);
if (cft->write_u64 || cft->write_s64)
return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+ if (cft->write_string)
+ return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
if (cft->trigger) {
int ret = cft->trigger(cgrp, (unsigned int)cft->private);
return ret ? ret : nbytes;
@@ -1450,7 +1463,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
char __user *buf, size_t nbytes,
loff_t *ppos)
{
- char tmp[64];
+ char tmp[CGROUP_LOCAL_BUFFER_SIZE];
u64 val = cft->read_u64(cgrp, cft);
int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
@@ -1462,56 +1475,13 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
char __user *buf, size_t nbytes,
loff_t *ppos)
{
- char tmp[64];
+ char tmp[CGROUP_LOCAL_BUFFER_SIZE];
s64 val = cft->read_s64(cgrp, cft);
int len = sprintf(tmp, "%lld\n", (long long) val);
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
-static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
- struct cftype *cft,
- struct file *file,
- char __user *buf,
- size_t nbytes, loff_t *ppos)
-{
- enum cgroup_filetype type = cft->private;
- char *page;
- ssize_t retval = 0;
- char *s;
-
- if (!(page = (char *)__get_free_page(GFP_KERNEL)))
- return -ENOMEM;
-
- s = page;
-
- switch (type) {
- case FILE_RELEASE_AGENT:
- {
- struct cgroupfs_root *root;
- size_t n;
- mutex_lock(&cgroup_mutex);
- root = cgrp->root;
- n = strnlen(root->release_agent_path,
- sizeof(root->release_agent_path));
- n = min(n, (size_t) PAGE_SIZE);
- strncpy(s, root->release_agent_path, n);
- mutex_unlock(&cgroup_mutex);
- s += n;
- break;
- }
- default:
- retval = -EINVAL;
- goto out;
- }
- *s++ = '\n';
-
- retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
-out:
- free_page((unsigned long)page);
- return retval;
-}
-
static ssize_t cgroup_file_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos)
{
@@ -1560,7 +1530,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
return cft->read_seq_string(state->cgroup, cft, m);
}
-int cgroup_seqfile_release(struct inode *inode, struct file *file)
+static int cgroup_seqfile_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
kfree(seq->private);
@@ -1569,6 +1539,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file)
static struct file_operations cgroup_seqfile_operations = {
.read = seq_read,
+ .write = cgroup_file_write,
.llseek = seq_lseek,
.release = cgroup_seqfile_release,
};
@@ -1756,15 +1727,11 @@ int cgroup_add_files(struct cgroup *cgrp,
int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
- struct list_head *l;
+ struct cg_cgroup_link *link;
read_lock(&css_set_lock);
- l = cgrp->css_sets.next;
- while (l != &cgrp->css_sets) {
- struct cg_cgroup_link *link =
- list_entry(l, struct cg_cgroup_link, cgrp_link_list);
+ list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
count += atomic_read(&link->cg->ref.refcount);
- l = l->next;
}
read_unlock(&css_set_lock);
return count;
@@ -2227,6 +2194,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
return notify_on_release(cgrp);
}
+static int cgroup_write_notify_on_release(struct cgroup *cgrp,
+ struct cftype *cft,
+ u64 val)
+{
+ clear_bit(CGRP_RELEASABLE, &cgrp->flags);
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ return 0;
+}
+
/*
* for the common functions, 'private' gives the type of file
*/
@@ -2235,7 +2214,7 @@ static struct cftype files[] = {
.name = "tasks",
.open = cgroup_tasks_open,
.read = cgroup_tasks_read,
- .write = cgroup_common_file_write,
+ .write_u64 = cgroup_tasks_write,
.release = cgroup_tasks_release,
.private = FILE_TASKLIST,
},
@@ -2243,15 +2222,16 @@ static struct cftype files[] = {
{
.name = "notify_on_release",
.read_u64 = cgroup_read_notify_on_release,
- .write = cgroup_common_file_write,
+ .write_u64 = cgroup_write_notify_on_release,
.private = FILE_NOTIFY_ON_RELEASE,
},
};
static struct cftype cft_release_agent = {
.name = "release_agent",
- .read = cgroup_common_file_read,
- .write = cgroup_common_file_write,
+ .read_seq_string = cgroup_release_agent_show,
+ .write_string = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX,
.private = FILE_RELEASE_AGENT,
};
@@ -2869,16 +2849,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
* cgroup_clone - clone the cgroup the given subsystem is attached to
* @tsk: the task to be moved
* @subsys: the given subsystem
+ * @nodename: the name for the new cgroup
*
* Duplicate the current cgroup in the hierarchy that the given
* subsystem is attached to, and move this task into the new
* child.
*/
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
+int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
+ char *nodename)
{
struct dentry *dentry;
int ret = 0;
- char nodename[MAX_CGROUP_TYPE_NAMELEN];
struct cgroup *parent, *child;
struct inode *inode;
struct css_set *cg;
@@ -2903,8 +2884,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
cg = tsk->cgroups;
parent = task_cgroup(tsk, subsys->subsys_id);
- snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
-
/* Pin the hierarchy */
atomic_inc(&parent->root->sb->s_active);
@@ -3078,27 +3057,24 @@ static void cgroup_release_agent(struct work_struct *work)
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
- char *pathbuf;
+ char *pathbuf = NULL, *agentbuf = NULL;
struct cgroup *cgrp = list_entry(release_list.next,
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
spin_unlock(&release_list_lock);
pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!pathbuf) {
- spin_lock(&release_list_lock);
- continue;
- }
-
- if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) {
- kfree(pathbuf);
- spin_lock(&release_list_lock);
- continue;
- }
+ if (!pathbuf)
+ goto continue_free;
+ if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+ goto continue_free;
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!agentbuf)
+ goto continue_free;
i = 0;
- argv[i++] = cgrp->root->release_agent_path;
- argv[i++] = (char *)pathbuf;
+ argv[i++] = agentbuf;
+ argv[i++] = pathbuf;
argv[i] = NULL;
i = 0;
@@ -3112,8 +3088,10 @@ static void cgroup_release_agent(struct work_struct *work)
* be a slow process */
mutex_unlock(&cgroup_mutex);
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- kfree(pathbuf);
mutex_lock(&cgroup_mutex);
+ continue_free:
+ kfree(pathbuf);
+ kfree(agentbuf);
spin_lock(&release_list_lock);
}
spin_unlock(&release_list_lock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2cc409ce0a8..10ba5f1004a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -285,6 +285,11 @@ out_allowed:
set_cpus_allowed_ptr(current, &old_allowed);
out_release:
cpu_hotplug_done();
+ if (!err) {
+ if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
+ hcpu) == NOTIFY_BAD)
+ BUG();
+ }
return err;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5738910c34..91cf85b36dd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -227,10 +227,6 @@ static struct cpuset top_cpuset = {
* The task_struct fields mems_allowed and mems_generation may only
* be accessed in the context of that task, so require no locks.
*
- * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cpuset modifications across the system.
- *
* The cpuset_common_file_read() handlers only hold callback_mutex across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
@@ -369,7 +365,7 @@ void cpuset_update_task_memory_state(void)
my_cpusets_mem_gen = top_cpuset.mems_generation;
} else {
rcu_read_lock();
- my_cpusets_mem_gen = task_cs(current)->mems_generation;
+ my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
rcu_read_unlock();
}
@@ -500,11 +496,16 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
/*
* rebuild_sched_domains()
*
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
+ * This routine will be called to rebuild the scheduler's dynamic
+ * sched domains:
+ * - if the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes,
+ * - or if the 'cpus' allowed changes in any cpuset which has that
+ * flag enabled,
+ * - or if the 'sched_relax_domain_level' of any cpuset which has
+ * that flag enabled and with non-empty 'cpus' changes,
+ * - or if any cpuset with non-empty 'cpus' is removed,
+ * - or if a cpu gets offlined.
*
* This routine builds a partial partition of the systems CPUs
* (the set of non-overlappping cpumask_t's in the array 'part'
@@ -609,8 +610,13 @@ void rebuild_sched_domains(void)
while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
struct cgroup *cont;
struct cpuset *child; /* scans child cpusets of cp */
+
+ if (cpus_empty(cp->cpus_allowed))
+ continue;
+
if (is_sched_load_balance(cp))
csa[csn++] = cp;
+
list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
child = cgroup_cs(cont);
__kfifo_put(q, (void *)&child, sizeof(cp));
@@ -703,36 +709,6 @@ done:
/* Don't kfree(dattr) -- partition_sched_domains() does that. */
}
-static inline int started_after_time(struct task_struct *t1,
- struct timespec *time,
- struct task_struct *t2)
-{
- int start_diff = timespec_compare(&t1->start_time, time);
- if (start_diff > 0) {
- return 1;
- } else if (start_diff < 0) {
- return 0;
- } else {
- /*
- * Arbitrarily, if two processes started at the same
- * time, we'll say that the lower pointer value
- * started first. Note that t2 may have exited by now
- * so this may not be a valid pointer any longer, but
- * that's fine - it still serves to distinguish
- * between two tasks started (effectively)
- * simultaneously.
- */
- return t1 > t2;
- }
-}
-
-static inline int started_after(void *p1, void *p2)
-{
- struct task_struct *t1 = p1;
- struct task_struct *t2 = p2;
- return started_after_time(t1, &t2->start_time, t2);
-}
-
/**
* cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
* @tsk: task to test
@@ -768,15 +744,49 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
}
/**
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ *
+ * Called with cgroup_mutex held
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ *
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_cpumask(struct cpuset *cs)
+{
+ struct cgroup_scanner scan;
+ struct ptr_heap heap;
+ int retval;
+
+ /*
+ * cgroup_scan_tasks() will initialize heap->gt for us.
+ * heap_init() is still needed here for we should not change
+ * cs->cpus_allowed when heap_init() fails.
+ */
+ retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+ if (retval)
+ return retval;
+
+ scan.cg = cs->css.cgroup;
+ scan.test_task = cpuset_test_cpumask;
+ scan.process_task = cpuset_change_cpumask;
+ scan.heap = &heap;
+ retval = cgroup_scan_tasks(&scan);
+
+ heap_free(&heap);
+ return retval;
+}
+
+/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
* @buf: buffer of cpu numbers written to this cpuset
*/
-static int update_cpumask(struct cpuset *cs, char *buf)
+static int update_cpumask(struct cpuset *cs, const char *buf)
{
struct cpuset trialcs;
- struct cgroup_scanner scan;
- struct ptr_heap heap;
int retval;
int is_load_balanced;
@@ -792,7 +802,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
* that parsing. The validate_change() call ensures that cpusets
* with tasks have cpus.
*/
- buf = strstrip(buf);
if (!*buf) {
cpus_clear(trialcs.cpus_allowed);
} else {
@@ -811,10 +820,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
return 0;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
- if (retval)
- return retval;
-
is_load_balanced = is_sched_load_balance(&trialcs);
mutex_lock(&callback_mutex);
@@ -825,12 +830,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
* Scan tasks in the cpuset, and update the cpumasks of any
* that need an update.
*/
- scan.cg = cs->css.cgroup;
- scan.test_task = cpuset_test_cpumask;
- scan.process_task = cpuset_change_cpumask;
- scan.heap = &heap;
- cgroup_scan_tasks(&scan);
- heap_free(&heap);
+ retval = update_tasks_cpumask(cs);
+ if (retval < 0)
+ return retval;
if (is_load_balanced)
rebuild_sched_domains();
@@ -886,74 +888,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
mutex_unlock(&callback_mutex);
}
-/*
- * Handle user request to change the 'mems' memory placement
- * of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
- *
- * Call with cgroup_mutex held. May take callback_mutex during call.
- * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
- * their mempolicies to the cpusets new mems_allowed.
- */
-
static void *cpuset_being_rebound;
-static int update_nodemask(struct cpuset *cs, char *buf)
+/**
+ * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
+ * @oldmem: old mems_allowed of cpuset cs
+ *
+ * Called with cgroup_mutex held
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
{
- struct cpuset trialcs;
- nodemask_t oldmem;
struct task_struct *p;
struct mm_struct **mmarray;
int i, n, ntasks;
int migrate;
int fudge;
- int retval;
struct cgroup_iter it;
-
- /*
- * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
- * it's read-only
- */
- if (cs == &top_cpuset)
- return -EACCES;
-
- trialcs = *cs;
-
- /*
- * An empty mems_allowed is ok iff there are no tasks in the cpuset.
- * Since nodelist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have memory.
- */
- buf = strstrip(buf);
- if (!*buf) {
- nodes_clear(trialcs.mems_allowed);
- } else {
- retval = nodelist_parse(buf, trialcs.mems_allowed);
- if (retval < 0)
- goto done;
-
- if (!nodes_subset(trialcs.mems_allowed,
- node_states[N_HIGH_MEMORY]))
- return -EINVAL;
- }
- oldmem = cs->mems_allowed;
- if (nodes_equal(oldmem, trialcs.mems_allowed)) {
- retval = 0; /* Too easy - nothing to do */
- goto done;
- }
- retval = validate_change(cs, &trialcs);
- if (retval < 0)
- goto done;
-
- mutex_lock(&callback_mutex);
- cs->mems_allowed = trialcs.mems_allowed;
- cs->mems_generation = cpuset_mems_generation++;
- mutex_unlock(&callback_mutex);
+ int retval;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
@@ -1020,7 +973,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
- cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
+ cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
mmput(mm);
}
@@ -1032,6 +985,70 @@ done:
return retval;
}
+/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset. Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies and if
+ * the cpuset is marked 'memory_migrate', migrate the tasks
+ * pages to the new memory.
+ *
+ * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
+ */
+static int update_nodemask(struct cpuset *cs, const char *buf)
+{
+ struct cpuset trialcs;
+ nodemask_t oldmem;
+ int retval;
+
+ /*
+ * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+ * it's read-only
+ */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
+ trialcs = *cs;
+
+ /*
+ * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+ * Since nodelist_parse() fails on an empty mask, we special case
+ * that parsing. The validate_change() call ensures that cpusets
+ * with tasks have memory.
+ */
+ if (!*buf) {
+ nodes_clear(trialcs.mems_allowed);
+ } else {
+ retval = nodelist_parse(buf, trialcs.mems_allowed);
+ if (retval < 0)
+ goto done;
+
+ if (!nodes_subset(trialcs.mems_allowed,
+ node_states[N_HIGH_MEMORY]))
+ return -EINVAL;
+ }
+ oldmem = cs->mems_allowed;
+ if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+ retval = 0; /* Too easy - nothing to do */
+ goto done;
+ }
+ retval = validate_change(cs, &trialcs);
+ if (retval < 0)
+ goto done;
+
+ mutex_lock(&callback_mutex);
+ cs->mems_allowed = trialcs.mems_allowed;
+ cs->mems_generation = cpuset_mems_generation++;
+ mutex_unlock(&callback_mutex);
+
+ retval = update_tasks_nodemask(cs, &oldmem);
+done:
+ return retval;
+}
+
int current_cpuset_is_being_rebound(void)
{
return task_cs(current) == cpuset_being_rebound;
@@ -1044,7 +1061,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
- rebuild_sched_domains();
+ if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+ rebuild_sched_domains();
}
return 0;
@@ -1256,72 +1274,14 @@ typedef enum {
FILE_SPREAD_SLAB,
} cpuset_filetype_t;
-static ssize_t cpuset_common_file_write(struct cgroup *cont,
- struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
-{
- struct cpuset *cs = cgroup_cs(cont);
- cpuset_filetype_t type = cft->private;
- char *buffer;
- int retval = 0;
-
- /* Crude upper limit on largest legitimate cpulist user might write. */
- if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
- return -E2BIG;
-
- /* +1 for nul-terminator */
- buffer = kmalloc(nbytes + 1, GFP_KERNEL);
- if (!buffer)
- return -ENOMEM;
-
- if (copy_from_user(buffer, userbuf, nbytes)) {
- retval = -EFAULT;
- goto out1;
- }
- buffer[nbytes] = 0; /* nul-terminate */
-
- cgroup_lock();
-
- if (cgroup_is_removed(cont)) {
- retval = -ENODEV;
- goto out2;
- }
-
- switch (type) {
- case FILE_CPULIST:
- retval = update_cpumask(cs, buffer);
- break;
- case FILE_MEMLIST:
- retval = update_nodemask(cs, buffer);
- break;
- default:
- retval = -EINVAL;
- goto out2;
- }
-
- if (retval == 0)
- retval = nbytes;
-out2:
- cgroup_unlock();
-out1:
- kfree(buffer);
- return retval;
-}
-
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
- cgroup_lock();
-
- if (cgroup_is_removed(cgrp)) {
- cgroup_unlock();
+ if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
- }
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1367,12 +1327,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
- cgroup_lock();
-
- if (cgroup_is_removed(cgrp)) {
- cgroup_unlock();
+ if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
- }
+
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
retval = update_relax_domain_level(cs, val);
@@ -1386,6 +1343,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
}
/*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
+ const char *buf)
+{
+ int retval = 0;
+
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
+ switch (cft->private) {
+ case FILE_CPULIST:
+ retval = update_cpumask(cgroup_cs(cgrp), buf);
+ break;
+ case FILE_MEMLIST:
+ retval = update_nodemask(cgroup_cs(cgrp), buf);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+ cgroup_unlock();
+ return retval;
+}
+
+/*
* These ascii lists should be read in a single call, by using a user
* buffer large enough to hold the entire map. If read in smaller
* chunks, there is no guarantee of atomicity. Since the display format
@@ -1504,14 +1487,16 @@ static struct cftype files[] = {
{
.name = "cpus",
.read = cpuset_common_file_read,
- .write = cpuset_common_file_write,
+ .write_string = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
},
{
.name = "mems",
.read = cpuset_common_file_read,
- .write = cpuset_common_file_write,
+ .write_string = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
},
@@ -1792,7 +1777,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
scan.scan.heap = NULL;
scan.to = to->css.cgroup;
- if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+ if (cgroup_scan_tasks(&scan.scan))
printk(KERN_ERR "move_member_tasks_to_cpuset: "
"cgroup_scan_tasks failed\n");
}
@@ -1852,6 +1837,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
struct cpuset *child; /* scans child cpusets of cp */
struct list_head queue;
struct cgroup *cont;
+ nodemask_t oldmems;
INIT_LIST_HEAD(&queue);
@@ -1871,6 +1857,8 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
continue;
+ oldmems = cp->mems_allowed;
+
/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1882,6 +1870,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
if (cpus_empty(cp->cpus_allowed) ||
nodes_empty(cp->mems_allowed))
remove_tasks_in_empty_cpuset(cp);
+ else {
+ update_tasks_cpumask(cp);
+ update_tasks_nodemask(cp, &oldmems);
+ }
}
}
@@ -1974,7 +1966,6 @@ void __init cpuset_init_smp(void)
}
/**
-
* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
* @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 10e43fd8b72..b3179dad71b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -145,8 +145,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+ tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
+ d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
+ d->freepages_count += tsk->delays->freepages_count;
spin_unlock_irqrestore(&tsk->delays->lock, flags);
done:
@@ -165,3 +168,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
return ret;
}
+void __delayacct_freepages_start(void)
+{
+ delayacct_start(&current->delays->freepages_start);
+}
+
+void __delayacct_freepages_end(void)
+{
+ delayacct_end(&current->delays->freepages_start,
+ &current->delays->freepages_end,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
+}
+
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c1ef192aa65..0d407e88673 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -168,7 +168,6 @@ __set_personality(u_long personality)
current->personality = personality;
oep = current_thread_info()->exec_domain;
current_thread_info()->exec_domain = ep;
- set_fs_altroot();
module_put(oep->module);
return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index 93d2711b938..0caf590548a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
#include <linux/resource.h>
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/tracehook.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -85,7 +86,6 @@ static void __exit_signal(struct task_struct *tsk)
BUG_ON(!sig);
BUG_ON(!atomic_read(&sig->count));
- rcu_read_lock();
sighand = rcu_dereference(tsk->sighand);
spin_lock(&sighand->siglock);
@@ -121,6 +121,18 @@ static void __exit_signal(struct task_struct *tsk)
sig->nivcsw += tsk->nivcsw;
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
+#ifdef CONFIG_TASK_XACCT
+ sig->rchar += tsk->rchar;
+ sig->wchar += tsk->wchar;
+ sig->syscr += tsk->syscr;
+ sig->syscw += tsk->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ sig->ioac.read_bytes += tsk->ioac.read_bytes;
+ sig->ioac.write_bytes += tsk->ioac.write_bytes;
+ sig->ioac.cancelled_write_bytes +=
+ tsk->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig = NULL; /* Marker for below. */
}
@@ -136,7 +148,6 @@ static void __exit_signal(struct task_struct *tsk)
tsk->signal = NULL;
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
- rcu_read_unlock();
__cleanup_sighand(sighand);
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
@@ -152,27 +163,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(container_of(rhp, struct task_struct, rcu));
}
-/*
- * Do final ptrace-related cleanup of a zombie being reaped.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-static void ptrace_release_task(struct task_struct *p)
-{
- BUG_ON(!list_empty(&p->ptraced));
- ptrace_unlink(p);
- BUG_ON(!list_empty(&p->ptrace_entry));
-}
void release_task(struct task_struct * p)
{
struct task_struct *leader;
int zap_leader;
repeat:
+ tracehook_prepare_release_task(p);
atomic_dec(&p->user->processes);
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
- ptrace_release_task(p);
+ tracehook_finish_release_task(p);
__exit_signal(p);
/*
@@ -194,6 +195,13 @@ repeat:
* that case.
*/
zap_leader = task_detached(leader);
+
+ /*
+ * This maintains the invariant that release_task()
+ * only runs on a task in EXIT_DEAD, just for sanity.
+ */
+ if (zap_leader)
+ leader->exit_state = EXIT_DEAD;
}
write_unlock_irq(&tasklist_lock);
@@ -432,7 +440,7 @@ void daemonize(const char *name, ...)
* We don't want to have TIF_FREEZE set if the system-wide hibernation
* or suspend transition begins right now.
*/
- current->flags |= PF_NOFREEZE;
+ current->flags |= (PF_NOFREEZE | PF_KTHREAD);
if (current->nsproxy != &init_nsproxy) {
get_nsproxy(&init_nsproxy);
@@ -557,8 +565,6 @@ void put_fs_struct(struct fs_struct *fs)
if (atomic_dec_and_test(&fs->count)) {
path_put(&fs->root);
path_put(&fs->pwd);
- if (fs->altroot.dentry)
- path_put(&fs->altroot);
kmem_cache_free(fs_cachep, fs);
}
}
@@ -666,26 +672,40 @@ assign_new_owner:
static void exit_mm(struct task_struct * tsk)
{
struct mm_struct *mm = tsk->mm;
+ struct core_state *core_state;
mm_release(tsk, mm);
if (!mm)
return;
/*
* Serialize with any possible pending coredump.
- * We must hold mmap_sem around checking core_waiters
+ * We must hold mmap_sem around checking core_state
* and clearing tsk->mm. The core-inducing thread
- * will increment core_waiters for each thread in the
+ * will increment ->nr_threads for each thread in the
* group with ->mm != NULL.
*/
down_read(&mm->mmap_sem);
- if (mm->core_waiters) {
+ core_state = mm->core_state;
+ if (core_state) {
+ struct core_thread self;
up_read(&mm->mmap_sem);
- down_write(&mm->mmap_sem);
- if (!--mm->core_waiters)
- complete(mm->core_startup_done);
- up_write(&mm->mmap_sem);
- wait_for_completion(&mm->core_done);
+ self.task = tsk;
+ self.next = xchg(&core_state->dumper.next, &self);
+ /*
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
+ */
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
+
+ for (;;) {
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ schedule();
+ }
+ __set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
}
atomic_inc(&mm->mm_count);
@@ -863,7 +883,8 @@ static void forget_original_parent(struct task_struct *father)
*/
static void exit_notify(struct task_struct *tsk, int group_dead)
{
- int state;
+ int signal;
+ void *cookie;
/*
* This does two things:
@@ -900,22 +921,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
!capable(CAP_KILL))
tsk->exit_signal = SIGCHLD;
- /* If something other than our normal parent is ptracing us, then
- * send it a SIGCHLD instead of honoring exit_signal. exit_signal
- * only has special meaning to our real parent.
- */
- if (!task_detached(tsk) && thread_group_empty(tsk)) {
- int signal = ptrace_reparented(tsk) ?
- SIGCHLD : tsk->exit_signal;
- do_notify_parent(tsk, signal);
- } else if (tsk->ptrace) {
- do_notify_parent(tsk, SIGCHLD);
- }
+ signal = tracehook_notify_death(tsk, &cookie, group_dead);
+ if (signal > 0)
+ signal = do_notify_parent(tsk, signal);
- state = EXIT_ZOMBIE;
- if (task_detached(tsk) && likely(!tsk->ptrace))
- state = EXIT_DEAD;
- tsk->exit_state = state;
+ tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE;
/* mt-exec, de_thread() is waiting for us */
if (thread_group_leader(tsk) &&
@@ -925,8 +935,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
write_unlock_irq(&tasklist_lock);
+ tracehook_report_death(tsk, signal, cookie, group_dead);
+
/* If the process is dead, release it - nobody will wait for it */
- if (state == EXIT_DEAD)
+ if (signal < 0)
release_task(tsk);
}
@@ -1005,10 +1017,7 @@ NORET_TYPE void do_exit(long code)
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
- if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
- current->ptrace_message = code;
- ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
- }
+ tracehook_report_exit(&code);
/*
* We're taking recursive faults here in do_exit. Safest is to just
@@ -1354,6 +1363,21 @@ static int wait_task_zombie(struct task_struct *p, int options,
psig->coublock +=
task_io_get_oublock(p) +
sig->oublock + sig->coublock;
+#ifdef CONFIG_TASK_XACCT
+ psig->rchar += p->rchar + sig->rchar;
+ psig->wchar += p->wchar + sig->wchar;
+ psig->syscr += p->syscr + sig->syscr;
+ psig->syscw += p->syscw + sig->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ psig->ioac.read_bytes +=
+ p->ioac.read_bytes + sig->ioac.read_bytes;
+ psig->ioac.write_bytes +=
+ p->ioac.write_bytes + sig->ioac.write_bytes;
+ psig->ioac.cancelled_write_bytes +=
+ p->ioac.cancelled_write_bytes +
+ sig->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
spin_unlock_irq(&p->parent->sighand->siglock);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 552c8d8e77a..5e050c1317c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,6 +37,7 @@
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
+#include <linux/tracehook.h>
#include <linux/futex.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
@@ -93,6 +94,23 @@ int nr_processes(void)
static struct kmem_cache *task_struct_cachep;
#endif
+#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+{
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ gfp_t mask = GFP_KERNEL | __GFP_ZERO;
+#else
+ gfp_t mask = GFP_KERNEL;
+#endif
+ return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+}
+
+static inline void free_thread_info(struct thread_info *ti)
+{
+ free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+}
+#endif
+
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -383,7 +401,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
INIT_LIST_HEAD(&mm->mmlist);
mm->flags = (current->mm) ? current->mm->flags
: MMF_DUMP_FILTER_DEFAULT;
- mm->core_waiters = 0;
+ mm->core_state = NULL;
mm->nr_ptes = 0;
set_mm_counter(mm, file_rss, 0);
set_mm_counter(mm, anon_rss, 0);
@@ -457,7 +475,7 @@ EXPORT_SYMBOL_GPL(mmput);
/**
* get_task_mm - acquire a reference to the task's mm
*
- * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning
+ * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
* this kernel workthread has transiently adopted a user mm with use_mm,
* to do its AIO) is not set and if so returns a reference to it, after
* bumping up the use count. User must release the mm via mmput()
@@ -470,7 +488,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
task_lock(task);
mm = task->mm;
if (mm) {
- if (task->flags & PF_BORROWED_MM)
+ if (task->flags & PF_KTHREAD)
mm = NULL;
else
atomic_inc(&mm->mm_users);
@@ -639,13 +657,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
path_get(&old->root);
fs->pwd = old->pwd;
path_get(&old->pwd);
- if (old->altroot.dentry) {
- fs->altroot = old->altroot;
- path_get(&old->altroot);
- } else {
- fs->altroot.mnt = NULL;
- fs->altroot.dentry = NULL;
- }
read_unlock(&old->lock);
}
return fs;
@@ -795,6 +806,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+#ifdef CONFIG_TASK_XACCT
+ sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
+#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ memset(&sig->ioac, 0, sizeof(sig->ioac));
+#endif
sig->sum_sched_runtime = 0;
INIT_LIST_HEAD(&sig->cpu_timers[0]);
INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -842,8 +859,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
new_flags &= ~PF_SUPERPRIV;
new_flags |= PF_FORKNOEXEC;
- if (!(clone_flags & CLONE_PTRACE))
- p->ptrace = 0;
+ new_flags |= PF_STARTING;
p->flags = new_flags;
clear_freeze_flag(p);
}
@@ -884,7 +900,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
struct pt_regs *regs,
unsigned long stack_size,
int __user *child_tidptr,
- struct pid *pid)
+ struct pid *pid,
+ int trace)
{
int retval;
struct task_struct *p;
@@ -1090,6 +1107,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;
+ if (current->nsproxy != p->nsproxy) {
+ retval = ns_cgroup_clone(p, pid);
+ if (retval)
+ goto bad_fork_free_pid;
+ }
+
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
@@ -1134,8 +1157,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
*/
p->group_leader = p;
INIT_LIST_HEAD(&p->thread_group);
- INIT_LIST_HEAD(&p->ptrace_entry);
- INIT_LIST_HEAD(&p->ptraced);
/* Now that the task is set up, run cgroup callbacks if
* necessary. We need to run them before the task is visible
@@ -1166,7 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->real_parent = current->real_parent;
else
p->real_parent = current;
- p->parent = p->real_parent;
spin_lock(&current->sighand->siglock);
@@ -1208,8 +1228,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (likely(p->pid)) {
list_add_tail(&p->sibling, &p->real_parent->children);
- if (unlikely(p->ptrace & PT_PTRACED))
- __ptrace_link(p, current->parent);
+ tracehook_finish_clone(p, clone_flags, trace);
if (thread_group_leader(p)) {
if (clone_flags & CLONE_NEWPID)
@@ -1294,29 +1313,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
struct pt_regs regs;
task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
- &init_struct_pid);
+ &init_struct_pid, 0);
if (!IS_ERR(task))
init_idle(task, cpu);
return task;
}
-static int fork_traceflag(unsigned clone_flags)
-{
- if (clone_flags & CLONE_UNTRACED)
- return 0;
- else if (clone_flags & CLONE_VFORK) {
- if (current->ptrace & PT_TRACE_VFORK)
- return PTRACE_EVENT_VFORK;
- } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
- if (current->ptrace & PT_TRACE_CLONE)
- return PTRACE_EVENT_CLONE;
- } else if (current->ptrace & PT_TRACE_FORK)
- return PTRACE_EVENT_FORK;
-
- return 0;
-}
-
/*
* Ok, this is the main fork-routine.
*
@@ -1351,14 +1354,14 @@ long do_fork(unsigned long clone_flags,
}
}
- if (unlikely(current->ptrace)) {
- trace = fork_traceflag (clone_flags);
- if (trace)
- clone_flags |= CLONE_PTRACE;
- }
+ /*
+ * When called from kernel_thread, don't do user tracing stuff.
+ */
+ if (likely(user_mode(regs)))
+ trace = tracehook_prepare_clone(clone_flags);
p = copy_process(clone_flags, stack_start, regs, stack_size,
- child_tidptr, NULL);
+ child_tidptr, NULL, trace);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
@@ -1376,32 +1379,35 @@ long do_fork(unsigned long clone_flags,
init_completion(&vfork);
}
- if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
+ tracehook_report_clone(trace, regs, clone_flags, nr, p);
+
+ /*
+ * We set PF_STARTING at creation in case tracing wants to
+ * use this to distinguish a fully live task from one that
+ * hasn't gotten to tracehook_report_clone() yet. Now we
+ * clear it and set the child going.
+ */
+ p->flags &= ~PF_STARTING;
+
+ if (unlikely(clone_flags & CLONE_STOPPED)) {
/*
* We'll start up with an immediate SIGSTOP.
*/
sigaddset(&p->pending.signal, SIGSTOP);
set_tsk_thread_flag(p, TIF_SIGPENDING);
- }
-
- if (!(clone_flags & CLONE_STOPPED))
- wake_up_new_task(p, clone_flags);
- else
__set_task_state(p, TASK_STOPPED);
-
- if (unlikely (trace)) {
- current->ptrace_message = nr;
- ptrace_notify ((trace << 8) | SIGTRAP);
+ } else {
+ wake_up_new_task(p, clone_flags);
}
+ tracehook_report_clone_complete(trace, regs,
+ clone_flags, nr, p);
+
if (clone_flags & CLONE_VFORK) {
freezer_do_not_count();
wait_for_completion(&vfork);
freezer_count();
- if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
- current->ptrace_message = nr;
- ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
- }
+ tracehook_report_vfork_done(p, nr);
}
} else {
nr = PTR_ERR(p);
@@ -1413,7 +1419,7 @@ long do_fork(unsigned long clone_flags,
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
-static void sighand_ctor(struct kmem_cache *cachep, void *data)
+static void sighand_ctor(void *data)
{
struct sighand_struct *sighand = data;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 964964baefa..3cd441ebf5d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -28,8 +28,7 @@ void dynamic_irq_init(unsigned int irq)
unsigned long flags;
if (irq >= NR_IRQS) {
- printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
- WARN_ON(1);
+ WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
return;
}
@@ -62,8 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq)
unsigned long flags;
if (irq >= NR_IRQS) {
- printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
- WARN_ON(1);
+ WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
return;
}
@@ -71,9 +69,8 @@ void dynamic_irq_cleanup(unsigned int irq)
spin_lock_irqsave(&desc->lock, flags);
if (desc->action) {
spin_unlock_irqrestore(&desc->lock, flags);
- printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n",
+ WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
irq);
- WARN_ON(1);
return;
}
desc->msi_desc = NULL;
@@ -96,8 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
unsigned long flags;
if (irq >= NR_IRQS) {
- printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
- WARN_ON(1);
+ WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
return -EINVAL;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5bc6e5ecc49..152abfd3589 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -177,8 +177,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
{
switch (desc->depth) {
case 0:
- printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
- WARN_ON(1);
+ WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
break;
case 1: {
unsigned int status = desc->status & ~IRQ_DISABLED;
@@ -260,9 +259,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
}
} else {
if (desc->wake_depth == 0) {
- printk(KERN_WARNING "Unbalanced IRQ %d "
- "wake disable\n", irq);
- WARN_ON(1);
+ WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
} else if (--desc->wake_depth == 0) {
ret = set_irq_wake_real(irq, on);
if (ret)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6fc0040f3e3..38fc10ac754 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -176,7 +176,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
high = kallsyms_num_syms;
while (high - low > 1) {
- mid = (low + high) / 2;
+ mid = low + (high - low) / 2;
if (kallsyms_addresses[mid] <= addr)
low = mid;
else
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1c5fcacbcf3..c8a4370e2a3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -24,6 +24,12 @@
#include <linux/utsrelease.h>
#include <linux/utsname.h>
#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -242,6 +248,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
goto out;
}
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ printk(KERN_ERR "Could not allocate swap buffer\n");
+ goto out;
+ }
+
result = 0;
out:
if (result == 0)
@@ -589,14 +601,12 @@ static void kimage_free_extra_pages(struct kimage *image)
kimage_free_page_list(&image->unuseable_pages);
}
-static int kimage_terminate(struct kimage *image)
+static void kimage_terminate(struct kimage *image)
{
if (*image->entry != 0)
image->entry++;
*image->entry = IND_DONE;
-
- return 0;
}
#define for_each_kimage_entry(image, ptr, entry) \
@@ -988,6 +998,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
if (result)
goto out;
+ if (flags & KEXEC_PRESERVE_CONTEXT)
+ image->preserve_context = 1;
result = machine_kexec_prepare(image);
if (result)
goto out;
@@ -997,9 +1009,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
if (result)
goto out;
}
- result = kimage_terminate(image);
- if (result)
- goto out;
+ kimage_terminate(image);
}
/* Install the new kernel, and Uninstall the old */
image = xchg(dest_image, image);
@@ -1415,3 +1425,85 @@ static int __init crash_save_vmcoreinfo_init(void)
}
module_init(crash_save_vmcoreinfo_init)
+
+/**
+ * kernel_kexec - reboot the system
+ *
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+ int error = 0;
+
+ if (xchg(&kexec_lock, 1))
+ return -EBUSY;
+ if (!kexec_image) {
+ error = -EINVAL;
+ goto Unlock;
+ }
+
+ if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+ mutex_lock(&pm_mutex);
+ pm_prepare_console();
+ error = freeze_processes();
+ if (error) {
+ error = -EBUSY;
+ goto Restore_console;
+ }
+ suspend_console();
+ error = device_suspend(PMSG_FREEZE);
+ if (error)
+ goto Resume_console;
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Resume_devices;
+ local_irq_disable();
+ /* At this point, device_suspend() has been called,
+ * but *not* device_power_down(). We *must*
+ * device_power_down() now. Otherwise, drivers for
+ * some devices (e.g. interrupt controllers) become
+ * desynchronized with the actual state of the
+ * hardware at resume time, and evil weirdness ensues.
+ */
+ error = device_power_down(PMSG_FREEZE);
+ if (error)
+ goto Enable_irqs;
+ save_processor_state();
+#endif
+ } else {
+ blocking_notifier_call_chain(&reboot_notifier_list,
+ SYS_RESTART, NULL);
+ system_state = SYSTEM_RESTART;
+ device_shutdown();
+ sysdev_shutdown();
+ printk(KERN_EMERG "Starting new kernel\n");
+ machine_shutdown();
+ }
+
+ machine_kexec(kexec_image);
+
+ if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+ restore_processor_state();
+ device_power_up(PMSG_RESTORE);
+ Enable_irqs:
+ local_irq_enable();
+ enable_nonboot_cpus();
+ Resume_devices:
+ device_resume(PMSG_RESTORE);
+ Resume_console:
+ resume_console();
+ thaw_processes();
+ Restore_console:
+ pm_restore_console();
+ mutex_unlock(&pm_mutex);
+#endif
+ }
+
+ Unlock:
+ xchg(&kexec_lock, 0);
+
+ return error;
+}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2989f67c444..2456d1a0bef 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -352,16 +352,17 @@ static inline void register_pm_notifier_callback(void) {}
* @path: path to usermode executable
* @argv: arg vector for process
* @envp: environment for process
+ * @gfp_mask: gfp mask for memory allocation
*
* Returns either %NULL on allocation failure, or a subprocess_info
* structure. This should be passed to call_usermodehelper_exec to
* exec the process and free the structure.
*/
-struct subprocess_info *call_usermodehelper_setup(char *path,
- char **argv, char **envp)
+struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
+ char **envp, gfp_t gfp_mask)
{
struct subprocess_info *sub_info;
- sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
+ sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
if (!sub_info)
goto out;
@@ -494,7 +495,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
struct subprocess_info *sub_info;
int ret;
- sub_info = call_usermodehelper_setup(path, argv, envp);
+ sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
if (sub_info == NULL)
return -ENOMEM;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1485ca8d0e0..75bc2cd9ebc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -62,6 +62,7 @@
addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
#endif
+static int kprobes_initialized;
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
static bool kprobe_enabled;
DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
-DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
+static struct {
+ spinlock_t lock ____cacheline_aligned;
+} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+
+static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
+{
+ return &(kretprobe_table_locks[hash].lock);
+}
/*
* Normally, functions that we'd want to prohibit kprobes in, are marked
@@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
return;
}
-/* Called with kretprobe_lock held */
void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
struct hlist_head *head)
{
+ struct kretprobe *rp = ri->rp;
+
/* remove rp inst off the rprobe_inst_table */
hlist_del(&ri->hlist);
- if (ri->rp) {
- /* remove rp inst off the used list */
- hlist_del(&ri->uflist);
- /* put rp inst back onto the free list */
- INIT_HLIST_NODE(&ri->uflist);
- hlist_add_head(&ri->uflist, &ri->rp->free_instances);
+ INIT_HLIST_NODE(&ri->hlist);
+ if (likely(rp)) {
+ spin_lock(&rp->lock);
+ hlist_add_head(&ri->hlist, &rp->free_instances);
+ spin_unlock(&rp->lock);
} else
/* Unregistering */
hlist_add_head(&ri->hlist, head);
}
-struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
+void kretprobe_hash_lock(struct task_struct *tsk,
+ struct hlist_head **head, unsigned long *flags)
{
- return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+ unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+ spinlock_t *hlist_lock;
+
+ *head = &kretprobe_inst_table[hash];
+ hlist_lock = kretprobe_table_lock_ptr(hash);
+ spin_lock_irqsave(hlist_lock, *flags);
+}
+
+void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
+{
+ spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+ spin_lock_irqsave(hlist_lock, *flags);
+}
+
+void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
+{
+ unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+ spinlock_t *hlist_lock;
+
+ hlist_lock = kretprobe_table_lock_ptr(hash);
+ spin_unlock_irqrestore(hlist_lock, *flags);
+}
+
+void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+{
+ spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+ spin_unlock_irqrestore(hlist_lock, *flags);
}
/*
@@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
struct kretprobe_instance *ri;
struct hlist_head *head, empty_rp;
struct hlist_node *node, *tmp;
- unsigned long flags = 0;
+ unsigned long hash, flags = 0;
- INIT_HLIST_HEAD(&empty_rp);
- spin_lock_irqsave(&kretprobe_lock, flags);
- head = kretprobe_inst_table_head(tk);
+ if (unlikely(!kprobes_initialized))
+ /* Early boot. kretprobe_table_locks not yet initialized. */
+ return;
+
+ hash = hash_ptr(tk, KPROBE_HASH_BITS);
+ head = &kretprobe_inst_table[hash];
+ kretprobe_table_lock(hash, &flags);
hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
if (ri->task == tk)
recycle_rp_inst(ri, &empty_rp);
}
- spin_unlock_irqrestore(&kretprobe_lock, flags);
-
+ kretprobe_table_unlock(hash, &flags);
+ INIT_HLIST_HEAD(&empty_rp);
hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
@@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp)
struct kretprobe_instance *ri;
struct hlist_node *pos, *next;
- hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) {
- hlist_del(&ri->uflist);
+ hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
+ hlist_del(&ri->hlist);
kfree(ri);
}
}
static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
{
- unsigned long flags;
+ unsigned long flags, hash;
struct kretprobe_instance *ri;
struct hlist_node *pos, *next;
+ struct hlist_head *head;
+
/* No race here */
- spin_lock_irqsave(&kretprobe_lock, flags);
- hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
- ri->rp = NULL;
- hlist_del(&ri->uflist);
+ for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
+ kretprobe_table_lock(hash, &flags);
+ head = &kretprobe_inst_table[hash];
+ hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+ if (ri->rp == rp)
+ ri->rp = NULL;
+ }
+ kretprobe_table_unlock(hash, &flags);
}
- spin_unlock_irqrestore(&kretprobe_lock, flags);
free_rp_inst(rp);
}
@@ -831,32 +875,37 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
- unsigned long flags = 0;
+ unsigned long hash, flags = 0;
+ struct kretprobe_instance *ri;
/*TODO: consider to only swap the RA after the last pre_handler fired */
- spin_lock_irqsave(&kretprobe_lock, flags);
+ hash = hash_ptr(current, KPROBE_HASH_BITS);
+ spin_lock_irqsave(&rp->lock, flags);
if (!hlist_empty(&rp->free_instances)) {
- struct kretprobe_instance *ri;
-
ri = hlist_entry(rp->free_instances.first,
- struct kretprobe_instance, uflist);
+ struct kretprobe_instance, hlist);
+ hlist_del(&ri->hlist);
+ spin_unlock_irqrestore(&rp->lock, flags);
+
ri->rp = rp;
ri->task = current;
if (rp->entry_handler && rp->entry_handler(ri, regs)) {
- spin_unlock_irqrestore(&kretprobe_lock, flags);
+ spin_unlock_irqrestore(&rp->lock, flags);
return 0;
}
arch_prepare_kretprobe(ri, regs);
/* XXX(hch): why is there no hlist_move_head? */
- hlist_del(&ri->uflist);
- hlist_add_head(&ri->uflist, &ri->rp->used_instances);
- hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task));
- } else
+ INIT_HLIST_NODE(&ri->hlist);
+ kretprobe_table_lock(hash, &flags);
+ hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
+ kretprobe_table_unlock(hash, &flags);
+ } else {
rp->nmissed++;
- spin_unlock_irqrestore(&kretprobe_lock, flags);
+ spin_unlock_irqrestore(&rp->lock, flags);
+ }
return 0;
}
@@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
rp->maxactive = NR_CPUS;
#endif
}
- INIT_HLIST_HEAD(&rp->used_instances);
+ spin_lock_init(&rp->lock);
INIT_HLIST_HEAD(&rp->free_instances);
for (i = 0; i < rp->maxactive; i++) {
inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
free_rp_inst(rp);
return -ENOMEM;
}
- INIT_HLIST_NODE(&inst->uflist);
- hlist_add_head(&inst->uflist, &rp->free_instances);
+ INIT_HLIST_NODE(&inst->hlist);
+ hlist_add_head(&inst->hlist, &rp->free_instances);
}
rp->nmissed = 0;
@@ -1009,6 +1058,7 @@ static int __init init_kprobes(void)
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
INIT_HLIST_HEAD(&kprobe_table[i]);
INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+ spin_lock_init(&(kretprobe_table_locks[i].lock));
}
/*
@@ -1050,6 +1100,7 @@ static int __init init_kprobes(void)
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
+ kprobes_initialized = (err == 0);
if (!err)
init_test_probes();
@@ -1286,13 +1337,8 @@ EXPORT_SYMBOL_GPL(register_jprobe);
EXPORT_SYMBOL_GPL(unregister_jprobe);
EXPORT_SYMBOL_GPL(register_jprobes);
EXPORT_SYMBOL_GPL(unregister_jprobes);
-#ifdef CONFIG_KPROBES
EXPORT_SYMBOL_GPL(jprobe_return);
-#endif
-
-#ifdef CONFIG_KPROBES
EXPORT_SYMBOL_GPL(register_kretprobe);
EXPORT_SYMBOL_GPL(unregister_kretprobe);
EXPORT_SYMBOL_GPL(register_kretprobes);
EXPORT_SYMBOL_GPL(unregister_kretprobes);
-#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6111c27491b..96cff2f8710 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
return;
}
/* Must have done schedule() in kthread() before we set_task_cpu */
- wait_task_inactive(k);
+ wait_task_inactive(k, 0);
set_task_cpu(k, cpu);
k->cpus_allowed = cpumask_of_cpu(cpu);
k->rt.nr_cpus_allowed = 1;
diff --git a/kernel/marker.c b/kernel/marker.c
index 1abfb923b76..971da531790 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -441,7 +441,7 @@ static int remove_marker(const char *name)
hlist_del(&e->hlist);
/* Make sure the call_rcu has been executed */
if (e->rcu_pending)
- rcu_barrier();
+ rcu_barrier_sched();
kfree(e);
return 0;
}
@@ -476,7 +476,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
hlist_del(&(*entry)->hlist);
/* Make sure the call_rcu has been executed */
if ((*entry)->rcu_pending)
- rcu_barrier();
+ rcu_barrier_sched();
kfree(*entry);
*entry = e;
trace_mark(core_marker_format, "name %s format %s",
@@ -655,7 +655,7 @@ int marker_probe_register(const char *name, const char *format,
* make sure it's executed now.
*/
if (entry->rcu_pending)
- rcu_barrier();
+ rcu_barrier_sched();
old = marker_entry_add_probe(entry, probe, probe_private);
if (IS_ERR(old)) {
ret = PTR_ERR(old);
@@ -670,10 +670,7 @@ int marker_probe_register(const char *name, const char *format,
entry->rcu_pending = 1;
/* write rcu_pending before calling the RCU callback */
smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
- synchronize_sched(); /* Until we have the call_rcu_sched() */
-#endif
- call_rcu(&entry->rcu, free_old_closure);
+ call_rcu_sched(&entry->rcu, free_old_closure);
end:
mutex_unlock(&markers_mutex);
return ret;
@@ -704,7 +701,7 @@ int marker_probe_unregister(const char *name,
if (!entry)
goto end;
if (entry->rcu_pending)
- rcu_barrier();
+ rcu_barrier_sched();
old = marker_entry_remove_probe(entry, probe, probe_private);
mutex_unlock(&markers_mutex);
marker_update_probes(); /* may update entry */
@@ -716,10 +713,7 @@ int marker_probe_unregister(const char *name,
entry->rcu_pending = 1;
/* write rcu_pending before calling the RCU callback */
smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
- synchronize_sched(); /* Until we have the call_rcu_sched() */
-#endif
- call_rcu(&entry->rcu, free_old_closure);
+ call_rcu_sched(&entry->rcu, free_old_closure);
remove_marker(name); /* Ignore busy error message */
ret = 0;
end:
@@ -786,7 +780,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
goto end;
}
if (entry->rcu_pending)
- rcu_barrier();
+ rcu_barrier_sched();
old = marker_entry_remove_probe(entry, NULL, probe_private);
mutex_unlock(&markers_mutex);
marker_update_probes(); /* may update entry */
@@ -797,10 +791,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
entry->rcu_pending = 1;
/* write rcu_pending before calling the RCU callback */
smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
- synchronize_sched(); /* Until we have the call_rcu_sched() */
-#endif
- call_rcu(&entry->rcu, free_old_closure);
+ call_rcu_sched(&entry->rcu, free_old_closure);
remove_marker(entry->name); /* Ignore busy error message */
end:
mutex_unlock(&markers_mutex);
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 48d7ed6fc3a..43c2111cd54 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
+#include <linux/proc_fs.h>
#include <linux/slab.h>
#include <linux/nsproxy.h>
@@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns(
struct ns_cgroup, css);
}
-int ns_cgroup_clone(struct task_struct *task)
+int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
{
- return cgroup_clone(task, &ns_subsys);
+ char name[PROC_NUMBUF];
+
+ snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
+ return cgroup_clone(task, &ns_subsys, name);
}
/*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index adc785146a1..21575fc46d0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -157,12 +157,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
goto out;
}
- err = ns_cgroup_clone(tsk);
- if (err) {
- put_nsproxy(new_ns);
- goto out;
- }
-
tsk->nsproxy = new_ns;
out:
@@ -209,7 +203,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
goto out;
}
- err = ns_cgroup_clone(current);
+ err = ns_cgroup_clone(current, task_pid(current));
if (err)
put_nsproxy(*new_nsp);
diff --git a/kernel/panic.c b/kernel/panic.c
index 425567f45b9..12c5a0a6c89 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -318,6 +318,28 @@ void warn_on_slowpath(const char *file, int line)
add_taint(TAINT_WARN);
}
EXPORT_SYMBOL(warn_on_slowpath);
+
+
+void warn_slowpath(const char *file, int line, const char *fmt, ...)
+{
+ va_list args;
+ char function[KSYM_SYMBOL_LEN];
+ unsigned long caller = (unsigned long)__builtin_return_address(0);
+ sprint_symbol(function, caller);
+
+ printk(KERN_WARNING "------------[ cut here ]------------\n");
+ printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
+ line, function);
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+
+ print_modules();
+ dump_stack();
+ print_oops_end_marker();
+ add_taint(TAINT_WARN);
+}
+EXPORT_SYMBOL(warn_slowpath);
#endif
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/pid.c b/kernel/pid.c
index 30bd5d4b2ac..064e76afa50 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -309,12 +309,6 @@ struct pid *find_vpid(int nr)
}
EXPORT_SYMBOL_GPL(find_vpid);
-struct pid *find_pid(int nr)
-{
- return find_pid_ns(nr, &init_pid_ns);
-}
-EXPORT_SYMBOL_GPL(find_pid);
-
/*
* attach_pid() must be called with the tasklist_lock write-held.
*/
@@ -435,6 +429,7 @@ struct pid *find_get_pid(pid_t nr)
return pid;
}
+EXPORT_SYMBOL_GPL(find_get_pid);
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
@@ -482,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns);
/*
* Used by proc to find the first pid that is greater then or equal to nr.
*
- * If there is a pid at nr this function is exactly the same as find_pid.
+ * If there is a pid at nr this function is exactly the same as find_pid_ns.
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
@@ -497,7 +492,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return pid;
}
-EXPORT_SYMBOL_GPL(find_get_pid);
/*
* The pid hash table is scaled according to the amount of memory in the
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 98702b4b885..ea567b78d1a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,6 +12,7 @@
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
#include <linux/err.h>
+#include <linux/acct.h>
#define BITS_PER_PAGE (PAGE_SIZE*8)
@@ -71,7 +72,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
struct pid_namespace *ns;
int i;
- ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
+ ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
goto out;
@@ -84,17 +85,13 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
goto out_free_map;
kref_init(&ns->kref);
- ns->last_pid = 0;
- ns->child_reaper = NULL;
ns->level = level;
set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
- for (i = 1; i < PIDMAP_ENTRIES; i++) {
- ns->pidmap[i].page = NULL;
+ for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
- }
return ns;
@@ -185,6 +182,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
/* Child reaper for the pid namespace is going away */
pid_ns->child_reaper = NULL;
+ acct_exit_ns(pid_ns);
return;
}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dbd8398ddb0..9a21681aa80 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -449,9 +449,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
spin_unlock_irqrestore(&idr_lock, flags);
}
sigqueue_free(tmr->sigq);
- if (unlikely(tmr->it_process) &&
- tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
- put_task_struct(tmr->it_process);
kmem_cache_free(posix_timers_cache, tmr);
}
@@ -856,11 +853,10 @@ retry_delete:
* This keeps any tasks waiting on the spin lock from thinking
* they got something (see the lock code above).
*/
- if (timer->it_process) {
- if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
- put_task_struct(timer->it_process);
- timer->it_process = NULL;
- }
+ if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+ put_task_struct(timer->it_process);
+ timer->it_process = NULL;
+
unlock_timer(timer, flags);
release_posix_timer(timer, IT_ID_SET);
return 0;
@@ -885,11 +881,10 @@ retry_delete:
* This keeps any tasks waiting on the spin lock from thinking
* they got something (see the lock code above).
*/
- if (timer->it_process) {
- if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
- put_task_struct(timer->it_process);
- timer->it_process = NULL;
- }
+ if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+ put_task_struct(timer->it_process);
+ timer->it_process = NULL;
+
unlock_timer(timer, flags);
release_posix_timer(timer, IT_ID_SET);
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 95bff23ecda..0b7476f5d2a 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -635,6 +635,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
}
if (status < 0)
printk(err_suspend, status);
+
+ /* Some platforms can't detect that the alarm triggered the
+ * wakeup, or (accordingly) disable it after it afterwards.
+ * It's supposed to give oneshot behavior; cope.
+ */
+ alm.enabled = false;
+ rtc_set_alarm(rtc, &alm);
}
static int __init has_wakealarm(struct device *dev, void *name_ptr)
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 700f44ec840..acc0c101dbd 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void);
extern int pfn_is_nosave(unsigned long);
-extern struct mutex pm_mutex;
-
#define power_attr(_name) \
static struct kobj_attribute _name##_attr = { \
.attr = { \
diff --git a/kernel/printk.c b/kernel/printk.c
index 3f7a2a94583..a7f7559c5f6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1308,6 +1308,8 @@ void tty_write_message(struct tty_struct *tty, char *msg)
}
#if defined CONFIG_PRINTK
+
+DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
/*
* printk rate limiting, lifted from the networking subsystem.
*
@@ -1315,22 +1317,9 @@ void tty_write_message(struct tty_struct *tty, char *msg)
* every printk_ratelimit_jiffies to make a denial-of-service
* attack impossible.
*/
-int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
-{
- return __ratelimit(ratelimit_jiffies, ratelimit_burst);
-}
-EXPORT_SYMBOL(__printk_ratelimit);
-
-/* minimum time in jiffies between messages */
-int printk_ratelimit_jiffies = 5 * HZ;
-
-/* number of messages we send before ratelimiting */
-int printk_ratelimit_burst = 10;
-
int printk_ratelimit(void)
{
- return __printk_ratelimit(printk_ratelimit_jiffies,
- printk_ratelimit_burst);
+ return __ratelimit(&printk_ratelimit_state);
}
EXPORT_SYMBOL(printk_ratelimit);
diff --git a/kernel/profile.c b/kernel/profile.c
index 58926411eb2..cd26bed4cc2 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -112,8 +112,6 @@ void __init profile_init(void)
/* Profile event notifications */
-#ifdef CONFIG_PROFILING
-
static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
@@ -203,8 +201,6 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
}
EXPORT_SYMBOL_GPL(unregister_timer_hook);
-#endif /* CONFIG_PROFILING */
-
#ifdef CONFIG_SMP
/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8392a9da645..082b3fcb32a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
read_unlock(&tasklist_lock);
if (!ret && !kill)
- wait_task_inactive(child);
+ ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
/* All systems go.. */
return ret;
diff --git a/kernel/relay.c b/kernel/relay.c
index 7de644cdec4..04006ef970b 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan)
}
EXPORT_SYMBOL_GPL(relay_reset);
+static inline void relay_set_buf_dentry(struct rchan_buf *buf,
+ struct dentry *dentry)
+{
+ buf->dentry = dentry;
+ buf->dentry->d_inode->i_size = buf->early_bytes;
+}
+
+static struct dentry *relay_create_buf_file(struct rchan *chan,
+ struct rchan_buf *buf,
+ unsigned int cpu)
+{
+ struct dentry *dentry;
+ char *tmpname;
+
+ tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!tmpname)
+ return NULL;
+ snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
+
+ /* Create file in fs */
+ dentry = chan->cb->create_buf_file(tmpname, chan->parent,
+ S_IRUSR, buf,
+ &chan->is_global);
+
+ kfree(tmpname);
+
+ return dentry;
+}
+
/*
* relay_open_buf - create a new relay channel buffer
*
@@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
{
struct rchan_buf *buf = NULL;
struct dentry *dentry;
- char *tmpname;
if (chan->is_global)
return chan->buf[0];
- tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!tmpname)
- goto end;
- snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
-
buf = relay_create_buf(chan);
if (!buf)
- goto free_name;
+ return NULL;
+
+ if (chan->has_base_filename) {
+ dentry = relay_create_buf_file(chan, buf, cpu);
+ if (!dentry)
+ goto free_buf;
+ relay_set_buf_dentry(buf, dentry);
+ }
buf->cpu = cpu;
__relay_reset(buf, 1);
- /* Create file in fs */
- dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
- buf, &chan->is_global);
- if (!dentry)
- goto free_buf;
-
- buf->dentry = dentry;
-
if(chan->is_global) {
chan->buf[0] = buf;
buf->cpu = 0;
}
- goto free_name;
+ return buf;
free_buf:
relay_destroy_buf(buf);
- buf = NULL;
-free_name:
- kfree(tmpname);
-end:
- return buf;
+ return NULL;
}
/**
@@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
/**
* relay_open - create a new relay channel
- * @base_filename: base name of files to create
- * @parent: dentry of parent directory, %NULL for root directory
+ * @base_filename: base name of files to create, %NULL for buffering only
+ * @parent: dentry of parent directory, %NULL for root directory or buffer
* @subbuf_size: size of sub-buffers
* @n_subbufs: number of sub-buffers
* @cb: client callback functions
@@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename,
{
unsigned int i;
struct rchan *chan;
- if (!base_filename)
- return NULL;
if (!(subbuf_size && n_subbufs))
return NULL;
@@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename,
chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
chan->parent = parent;
chan->private_data = private_data;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ if (base_filename) {
+ chan->has_base_filename = 1;
+ strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ }
setup_callbacks(chan, cb);
kref_init(&chan->kref);
@@ -604,6 +623,94 @@ free_bufs:
}
EXPORT_SYMBOL_GPL(relay_open);
+struct rchan_percpu_buf_dispatcher {
+ struct rchan_buf *buf;
+ struct dentry *dentry;
+};
+
+/* Called in atomic context. */
+static void __relay_set_buf_dentry(void *info)
+{
+ struct rchan_percpu_buf_dispatcher *p = info;
+
+ relay_set_buf_dentry(p->buf, p->dentry);
+}
+
+/**
+ * relay_late_setup_files - triggers file creation
+ * @chan: channel to operate on
+ * @base_filename: base name of files to create
+ * @parent: dentry of parent directory, %NULL for root directory
+ *
+ * Returns 0 if successful, non-zero otherwise.
+ *
+ * Use to setup files for a previously buffer-only channel.
+ * Useful to do early tracing in kernel, before VFS is up, for example.
+ */
+int relay_late_setup_files(struct rchan *chan,
+ const char *base_filename,
+ struct dentry *parent)
+{
+ int err = 0;
+ unsigned int i, curr_cpu;
+ unsigned long flags;
+ struct dentry *dentry;
+ struct rchan_percpu_buf_dispatcher disp;
+
+ if (!chan || !base_filename)
+ return -EINVAL;
+
+ strlcpy(chan->base_filename, base_filename, NAME_MAX);
+
+ mutex_lock(&relay_channels_mutex);
+ /* Is chan already set up? */
+ if (unlikely(chan->has_base_filename))
+ return -EEXIST;
+ chan->has_base_filename = 1;
+ chan->parent = parent;
+ curr_cpu = get_cpu();
+ /*
+ * The CPU hotplug notifier ran before us and created buffers with
+ * no files associated. So it's safe to call relay_setup_buf_file()
+ * on all currently online CPUs.
+ */
+ for_each_online_cpu(i) {
+ if (unlikely(!chan->buf[i])) {
+ printk(KERN_ERR "relay_late_setup_files: CPU %u "
+ "has no buffer, it must have!\n", i);
+ BUG();
+ err = -EINVAL;
+ break;
+ }
+
+ dentry = relay_create_buf_file(chan, chan->buf[i], i);
+ if (unlikely(!dentry)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (curr_cpu == i) {
+ local_irq_save(flags);
+ relay_set_buf_dentry(chan->buf[i], dentry);
+ local_irq_restore(flags);
+ } else {
+ disp.buf = chan->buf[i];
+ disp.dentry = dentry;
+ smp_mb();
+ /* relay_channels_mutex must be held, so wait. */
+ err = smp_call_function_single(i,
+ __relay_set_buf_dentry,
+ &disp, 1);
+ }
+ if (unlikely(err))
+ break;
+ }
+ put_cpu();
+ mutex_unlock(&relay_channels_mutex);
+
+ return err;
+}
+
/**
* relay_switch_subbuf - switch to a new sub-buffer
* @buf: channel buffer
@@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
buf->padding[old_subbuf] = buf->prev_padding;
buf->subbufs_produced++;
- buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
- buf->padding[old_subbuf];
+ if (buf->dentry)
+ buf->dentry->d_inode->i_size +=
+ buf->chan->subbuf_size -
+ buf->padding[old_subbuf];
+ else
+ buf->early_bytes += buf->chan->subbuf_size -
+ buf->padding[old_subbuf];
smp_mb();
if (waitqueue_active(&buf->read_wait))
/*
@@ -1237,4 +1349,4 @@ static __init int relay_init(void)
return 0;
}
-module_init(relay_init);
+early_initcall(relay_init);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d3c61b4ebef..f275c8eca77 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/res_counter.h>
#include <linux/uaccess.h>
+#include <linux/mm.h>
void res_counter_init(struct res_counter *counter)
{
@@ -102,44 +103,37 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
return *res_counter_member(counter, member);
}
-ssize_t res_counter_write(struct res_counter *counter, int member,
- const char __user *userbuf, size_t nbytes, loff_t *pos,
- int (*write_strategy)(char *st_buf, unsigned long long *val))
+int res_counter_memparse_write_strategy(const char *buf,
+ unsigned long long *res)
{
- int ret;
- char *buf, *end;
- unsigned long flags;
- unsigned long long tmp, *val;
-
- buf = kmalloc(nbytes + 1, GFP_KERNEL);
- ret = -ENOMEM;
- if (buf == NULL)
- goto out;
+ char *end;
+ /* FIXME - make memparse() take const char* args */
+ *res = memparse((char *)buf, &end);
+ if (*end != '\0')
+ return -EINVAL;
- buf[nbytes] = '\0';
- ret = -EFAULT;
- if (copy_from_user(buf, userbuf, nbytes))
- goto out_free;
+ *res = PAGE_ALIGN(*res);
+ return 0;
+}
- ret = -EINVAL;
+int res_counter_write(struct res_counter *counter, int member,
+ const char *buf, write_strategy_fn write_strategy)
+{
+ char *end;
+ unsigned long flags;
+ unsigned long long tmp, *val;
- strstrip(buf);
if (write_strategy) {
- if (write_strategy(buf, &tmp)) {
- goto out_free;
- }
+ if (write_strategy(buf, &tmp))
+ return -EINVAL;
} else {
tmp = simple_strtoull(buf, &end, 10);
if (*end != '\0')
- goto out_free;
+ return -EINVAL;
}
spin_lock_irqsave(&counter->lock, flags);
val = res_counter_member(counter, member);
*val = tmp;
spin_unlock_irqrestore(&counter->lock, flags);
- ret = nbytes;
-out_free:
- kfree(buf);
-out:
- return ret;
+ return 0;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 6acf749d333..0236958addc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1867,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
/*
* wait_task_inactive - wait for a thread to unschedule.
*
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change. If it changes, i.e. @p might have woken up,
+ * then return zero. When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count). If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
* The caller must ensure that the task *will* unschedule sometime soon,
* else this function might spin for a *long* time. This function can't
* be called with interrupts off, or it may introduce deadlock with
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
-void wait_task_inactive(struct task_struct *p)
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
unsigned long flags;
int running, on_rq;
+ unsigned long ncsw;
struct rq *rq;
for (;;) {
@@ -1899,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p)
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
- while (task_running(rq, p))
+ while (task_running(rq, p)) {
+ if (match_state && unlikely(p->state != match_state))
+ return 0;
cpu_relax();
+ }
/*
* Ok, time to look more closely! We need the rq
@@ -1910,9 +1921,21 @@ void wait_task_inactive(struct task_struct *p)
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
on_rq = p->se.on_rq;
+ ncsw = 0;
+ if (!match_state || p->state == match_state) {
+ ncsw = p->nivcsw + p->nvcsw;
+ if (unlikely(!ncsw))
+ ncsw = 1;
+ }
task_rq_unlock(rq, &flags);
/*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
* Was it really running after all now that we
* checked with the proper locks actually held?
*
@@ -1944,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p)
*/
break;
}
+
+ return ncsw;
}
/***
@@ -4046,6 +4071,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
cpustat->nice = cputime64_add(cpustat->nice, tmp);
else
cpustat->user = cputime64_add(cpustat->user, tmp);
+ /* Account for user time used */
+ acct_update_integrals(p);
}
/*
@@ -6387,7 +6414,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
.priority = 10
};
-void __init migration_init(void)
+static int __init migration_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
int err;
@@ -6397,7 +6424,10 @@ void __init migration_init(void)
BUG_ON(err == NOTIFY_BAD);
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
+
+ return err;
}
+early_initcall(migration_init);
#endif
#ifdef CONFIG_SMP
diff --git a/kernel/signal.c b/kernel/signal.c
index 6c0958e52ea..954f77d7e3b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/signalfd.h>
+#include <linux/tracehook.h>
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
@@ -39,24 +40,21 @@
static struct kmem_cache *sigqueue_cachep;
-static int __sig_ignored(struct task_struct *t, int sig)
+static void __user *sig_handler(struct task_struct *t, int sig)
{
- void __user *handler;
+ return t->sighand->action[sig - 1].sa.sa_handler;
+}
+static int sig_handler_ignored(void __user *handler, int sig)
+{
/* Is it explicitly or implicitly ignored? */
-
- handler = t->sighand->action[sig - 1].sa.sa_handler;
return handler == SIG_IGN ||
(handler == SIG_DFL && sig_kernel_ignore(sig));
}
static int sig_ignored(struct task_struct *t, int sig)
{
- /*
- * Tracers always want to know about signals..
- */
- if (t->ptrace & PT_PTRACED)
- return 0;
+ void __user *handler;
/*
* Blocked signals are never ignored, since the
@@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
- return __sig_ignored(t, sig);
+ handler = sig_handler(t, sig);
+ if (!sig_handler_ignored(handler, sig))
+ return 0;
+
+ /*
+ * Tracers may want to know about even ignored signals.
+ */
+ return !tracehook_consider_ignored_signal(t, sig, handler);
}
/*
@@ -129,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t)
void recalc_sigpending(void)
{
- if (!recalc_sigpending_tsk(current) && !freezing(current))
+ if (unlikely(tracehook_force_sigpending()))
+ set_thread_flag(TIF_SIGPENDING);
+ else if (!recalc_sigpending_tsk(current) && !freezing(current))
clear_thread_flag(TIF_SIGPENDING);
}
@@ -295,12 +302,12 @@ flush_signal_handlers(struct task_struct *t, int force_default)
int unhandled_signal(struct task_struct *tsk, int sig)
{
+ void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
if (is_global_init(tsk))
return 1;
- if (tsk->ptrace & PT_PTRACED)
+ if (handler != SIG_IGN && handler != SIG_DFL)
return 0;
- return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
- (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
+ return !tracehook_consider_fatal_signal(tsk, sig, handler);
}
@@ -338,13 +345,9 @@ unblock_all_signals(void)
spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
-static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
{
struct sigqueue *q, *first = NULL;
- int still_pending = 0;
-
- if (unlikely(!sigismember(&list->signal, sig)))
- return 0;
/*
* Collect the siginfo appropriate to this signal. Check if
@@ -352,33 +355,30 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
*/
list_for_each_entry(q, &list->list, list) {
if (q->info.si_signo == sig) {
- if (first) {
- still_pending = 1;
- break;
- }
+ if (first)
+ goto still_pending;
first = q;
}
}
+
+ sigdelset(&list->signal, sig);
+
if (first) {
+still_pending:
list_del_init(&first->list);
copy_siginfo(info, &first->info);
__sigqueue_free(first);
- if (!still_pending)
- sigdelset(&list->signal, sig);
} else {
-
/* Ok, it wasn't in the queue. This must be
a fast-pathed signal or we must have been
out of queue space. So zero out the info.
*/
- sigdelset(&list->signal, sig);
info->si_signo = sig;
info->si_errno = 0;
info->si_code = 0;
info->si_pid = 0;
info->si_uid = 0;
}
- return 1;
}
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
@@ -396,8 +396,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
}
}
- if (!collect_signal(sig, pending, info))
- sig = 0;
+ collect_signal(sig, pending, info);
}
return sig;
@@ -462,8 +461,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* is to alert stop-signal processing code when another
* processor has come along and cleared the flag.
*/
- if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
- tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+ tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
}
if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
/*
@@ -600,9 +598,6 @@ static int check_kill_permission(int sig, struct siginfo *info,
return security_task_kill(t, info, sig, 0);
}
-/* forward decl */
-static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
-
/*
* Handle magic process-wide effects of stop/continue signals. Unlike
* the signal actions, these happen immediately at signal-generation
@@ -765,7 +760,8 @@ static void complete_signal(int sig, struct task_struct *p, int group)
if (sig_fatal(p, sig) &&
!(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
- (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
+ (sig == SIGKILL ||
+ !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
/*
* This signal will be fatal to the whole group.
*/
@@ -1125,7 +1121,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
* is probably wrong. Should make it like BSD or SYSV.
*/
-static int kill_something_info(int sig, struct siginfo *info, int pid)
+static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
{
int ret;
@@ -1237,17 +1233,6 @@ int kill_pid(struct pid *pid, int sig, int priv)
}
EXPORT_SYMBOL(kill_pid);
-int
-kill_proc(pid_t pid, int sig, int priv)
-{
- int ret;
-
- rcu_read_lock();
- ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
- rcu_read_unlock();
- return ret;
-}
-
/*
* These functions support sending signals using preallocated sigqueue
* structures. This is needed "because realtime applications cannot
@@ -1343,9 +1328,11 @@ static inline void __wake_up_parent(struct task_struct *p,
/*
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
*/
-
-void do_notify_parent(struct task_struct *tsk, int sig)
+int do_notify_parent(struct task_struct *tsk, int sig)
{
struct siginfo info;
unsigned long flags;
@@ -1379,10 +1366,9 @@ void do_notify_parent(struct task_struct *tsk, int sig)
info.si_uid = tsk->uid;
- /* FIXME: find out whether or not this is supposed to be c*time. */
- info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
+ info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
tsk->signal->utime));
- info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime,
+ info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
tsk->signal->stime));
info.si_status = tsk->exit_code & 0x7f;
@@ -1417,12 +1403,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
*/
tsk->exit_signal = -1;
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
- sig = 0;
+ sig = -1;
}
if (valid_signal(sig) && sig > 0)
__group_send_sig_info(sig, &info, tsk->parent);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
+
+ return sig;
}
static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
@@ -1450,9 +1438,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
info.si_uid = tsk->uid;
- /* FIXME: find out whether or not this is supposed to be c*time. */
- info.si_utime = cputime_to_jiffies(tsk->utime);
- info.si_stime = cputime_to_jiffies(tsk->stime);
+ info.si_utime = cputime_to_clock_t(tsk->utime);
+ info.si_stime = cputime_to_clock_t(tsk->stime);
info.si_code = why;
switch (why) {
@@ -1491,10 +1478,10 @@ static inline int may_ptrace_stop(void)
* is a deadlock situation, and pointless because our tracer
* is dead so don't allow us to stop.
* If SIGKILL was already sent before the caller unlocked
- * ->siglock we must see ->core_waiters != 0. Otherwise it
+ * ->siglock we must see ->core_state != NULL. Otherwise it
* is safe to enter schedule().
*/
- if (unlikely(current->mm->core_waiters) &&
+ if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
return 0;
@@ -1507,9 +1494,8 @@ static inline int may_ptrace_stop(void)
*/
static int sigkill_pending(struct task_struct *tsk)
{
- return ((sigismember(&tsk->pending.signal, SIGKILL) ||
- sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) &&
- !unlikely(sigismember(&tsk->blocked, SIGKILL)));
+ return sigismember(&tsk->pending.signal, SIGKILL) ||
+ sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
}
/*
@@ -1525,8 +1511,6 @@ static int sigkill_pending(struct task_struct *tsk)
*/
static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
{
- int killed = 0;
-
if (arch_ptrace_stop_needed(exit_code, info)) {
/*
* The arch code has something special to do before a
@@ -1542,7 +1526,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
spin_unlock_irq(&current->sighand->siglock);
arch_ptrace_stop(exit_code, info);
spin_lock_irq(&current->sighand->siglock);
- killed = sigkill_pending(current);
+ if (sigkill_pending(current))
+ return;
}
/*
@@ -1559,7 +1544,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
__set_current_state(TASK_TRACED);
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
- if (!unlikely(killed) && may_ptrace_stop()) {
+ if (may_ptrace_stop()) {
do_notify_parent_cldstop(current, CLD_TRAPPED);
read_unlock(&tasklist_lock);
schedule();
@@ -1623,7 +1608,7 @@ finish_stop(int stop_count)
* a group stop in progress and we are the last to stop,
* report to the parent. When ptraced, every thread reports itself.
*/
- if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
+ if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(current, CLD_STOPPED);
read_unlock(&tasklist_lock);
@@ -1658,8 +1643,7 @@ static int do_signal_stop(int signr)
} else {
struct task_struct *t;
- if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE))
- != SIGNAL_STOP_DEQUEUED) ||
+ if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
unlikely(signal_group_exit(sig)))
return 0;
/*
@@ -1760,6 +1744,9 @@ relock:
signal->flags &= ~SIGNAL_CLD_MASK;
spin_unlock_irq(&sighand->siglock);
+ if (unlikely(!tracehook_notify_jctl(1, why)))
+ goto relock;
+
read_lock(&tasklist_lock);
do_notify_parent_cldstop(current->group_leader, why);
read_unlock(&tasklist_lock);
@@ -1773,17 +1760,33 @@ relock:
do_signal_stop(0))
goto relock;
- signr = dequeue_signal(current, &current->blocked, info);
- if (!signr)
- break; /* will return 0 */
+ /*
+ * Tracing can induce an artifical signal and choose sigaction.
+ * The return value in @signr determines the default action,
+ * but @info->si_signo is the signal number we will report.
+ */
+ signr = tracehook_get_signal(current, regs, info, return_ka);
+ if (unlikely(signr < 0))
+ goto relock;
+ if (unlikely(signr != 0))
+ ka = return_ka;
+ else {
+ signr = dequeue_signal(current, &current->blocked,
+ info);
- if (signr != SIGKILL) {
- signr = ptrace_signal(signr, info, regs, cookie);
if (!signr)
- continue;
+ break; /* will return 0 */
+
+ if (signr != SIGKILL) {
+ signr = ptrace_signal(signr, info,
+ regs, cookie);
+ if (!signr)
+ continue;
+ }
+
+ ka = &sighand->action[signr-1];
}
- ka = &sighand->action[signr-1];
if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
continue;
if (ka->sa.sa_handler != SIG_DFL) {
@@ -1831,7 +1834,7 @@ relock:
spin_lock_irq(&sighand->siglock);
}
- if (likely(do_signal_stop(signr))) {
+ if (likely(do_signal_stop(info->si_signo))) {
/* It released the siglock. */
goto relock;
}
@@ -1852,7 +1855,7 @@ relock:
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
- print_fatal_signal(regs, signr);
+ print_fatal_signal(regs, info->si_signo);
/*
* If it was able to dump core, this kills all
* other threads in the group and synchronizes with
@@ -1861,13 +1864,13 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump((long)signr, signr, regs);
+ do_coredump(info->si_signo, info->si_signo, regs);
}
/*
* Death signals, no core dump.
*/
- do_group_exit(signr);
+ do_group_exit(info->si_signo);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
@@ -1909,7 +1912,7 @@ void exit_signals(struct task_struct *tsk)
out:
spin_unlock_irq(&tsk->sighand->siglock);
- if (unlikely(group_stop)) {
+ if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(tsk, CLD_STOPPED);
read_unlock(&tasklist_lock);
@@ -1920,8 +1923,6 @@ EXPORT_SYMBOL(recalc_sigpending);
EXPORT_SYMBOL_GPL(dequeue_signal);
EXPORT_SYMBOL(flush_signals);
EXPORT_SYMBOL(force_sig);
-EXPORT_SYMBOL(kill_proc);
-EXPORT_SYMBOL(ptrace_notify);
EXPORT_SYMBOL(send_sig);
EXPORT_SYMBOL(send_sig_info);
EXPORT_SYMBOL(sigprocmask);
@@ -2196,7 +2197,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
}
asmlinkage long
-sys_kill(int pid, int sig)
+sys_kill(pid_t pid, int sig)
{
struct siginfo info;
@@ -2209,7 +2210,7 @@ sys_kill(int pid, int sig)
return kill_something_info(sig, &info, pid);
}
-static int do_tkill(int tgid, int pid, int sig)
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
int error;
struct siginfo info;
@@ -2255,7 +2256,7 @@ static int do_tkill(int tgid, int pid, int sig)
* exists but it's not belonging to the target process anymore. This
* method solves the problem of threads exiting and PIDs getting reused.
*/
-asmlinkage long sys_tgkill(int tgid, int pid, int sig)
+asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
{
/* This is only valid for single tasks */
if (pid <= 0 || tgid <= 0)
@@ -2268,7 +2269,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
* Send a signal to only one task, even if it's a CLONE_THREAD task.
*/
asmlinkage long
-sys_tkill(int pid, int sig)
+sys_tkill(pid_t pid, int sig)
{
/* This is only valid for single tasks */
if (pid <= 0)
@@ -2278,7 +2279,7 @@ sys_tkill(int pid, int sig)
}
asmlinkage long
-sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
+sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo)
{
siginfo_t info;
@@ -2325,7 +2326,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
* (for example, SIGCHLD), shall cause the pending signal to
* be discarded, whether or not it is blocked"
*/
- if (__sig_ignored(t, sig)) {
+ if (sig_handler_ignored(sig_handler(t, sig), sig)) {
sigemptyset(&mask);
sigaddset(&mask, sig);
rm_from_queue_full(&mask, &t->signal->shared_pending);
diff --git a/kernel/smp.c b/kernel/smp.c
index 462c785ca1e..96fc7c0edc5 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,7 +33,7 @@ struct call_single_queue {
spinlock_t lock;
};
-void __cpuinit init_call_single_data(void)
+static int __cpuinit init_call_single_data(void)
{
int i;
@@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void)
spin_lock_init(&q->lock);
INIT_LIST_HEAD(&q->list);
}
+ return 0;
}
+early_initcall(init_call_single_data);
static void csd_flag_wait(struct call_single_data *data)
{
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f6b03d56c2b..c506f266a6b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
-__init int spawn_ksoftirqd(void)
+static __init int spawn_ksoftirqd(void)
{
void *cpu = (void *)(long)smp_processor_id();
int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void)
register_cpu_notifier(&cpu_nfb);
return 0;
}
+early_initcall(spawn_ksoftirqd);
#ifdef CONFIG_SMP
/*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7bd8d1aadd5..b75b492fbfc 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -338,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
-__init void spawn_softlockup_task(void)
+static int __initdata nosoftlockup;
+
+static int __init nosoftlockup_setup(char *str)
+{
+ nosoftlockup = 1;
+ return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+
+static int __init spawn_softlockup_task(void)
{
void *cpu = (void *)(long)smp_processor_id();
- int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ int err;
- BUG_ON(err == NOTIFY_BAD);
+ if (nosoftlockup)
+ return 0;
+
+ err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ if (err == NOTIFY_BAD) {
+ BUG();
+ return 1;
+ }
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+ return 0;
}
+early_initcall(spawn_softlockup_task);
diff --git a/kernel/sys.c b/kernel/sys.c
index 14e97282eb6..c01858090a9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -301,26 +301,6 @@ void kernel_restart(char *cmd)
}
EXPORT_SYMBOL_GPL(kernel_restart);
-/**
- * kernel_kexec - reboot the system
- *
- * Move into place and start executing a preloaded standalone
- * executable. If nothing was preloaded return an error.
- */
-static void kernel_kexec(void)
-{
-#ifdef CONFIG_KEXEC
- struct kimage *image;
- image = xchg(&kexec_image, NULL);
- if (!image)
- return;
- kernel_restart_prepare(NULL);
- printk(KERN_EMERG "Starting new kernel\n");
- machine_shutdown();
- machine_kexec(image);
-#endif
-}
-
static void kernel_shutdown_prepare(enum system_states state)
{
blocking_notifier_call_chain(&reboot_notifier_list,
@@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
kernel_restart(buffer);
break;
+#ifdef CONFIG_KEXEC
case LINUX_REBOOT_CMD_KEXEC:
- kernel_kexec();
- unlock_kernel();
- return -EINVAL;
+ {
+ int ret;
+ ret = kernel_kexec();
+ unlock_kernel();
+ return ret;
+ }
+#endif
#ifdef CONFIG_HIBERNATION
case LINUX_REBOOT_CMD_SW_SUSPEND:
@@ -1343,8 +1328,6 @@ EXPORT_SYMBOL(in_egroup_p);
DECLARE_RWSEM(uts_sem);
-EXPORT_SYMBOL(uts_sem);
-
asmlinkage long sys_newuname(struct new_utsname __user * name)
{
int errno = 0;
@@ -1795,7 +1778,7 @@ int orderly_poweroff(bool force)
goto out;
}
- info = call_usermodehelper_setup(argv[0], argv, envp);
+ info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
if (info == NULL) {
argv_free(argv);
goto out;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bd66ac5406f..08d6e1bb99a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -57,6 +57,7 @@ cond_syscall(compat_sys_set_robust_list);
cond_syscall(sys_get_robust_list);
cond_syscall(compat_sys_get_robust_list);
cond_syscall(sys_epoll_create);
+cond_syscall(sys_epoll_create1);
cond_syscall(sys_epoll_ctl);
cond_syscall(sys_epoll_wait);
cond_syscall(sys_epoll_pwait);
@@ -159,6 +160,7 @@ cond_syscall(sys_ioprio_get);
cond_syscall(sys_signalfd);
cond_syscall(sys_signalfd4);
cond_syscall(compat_sys_signalfd);
+cond_syscall(compat_sys_signalfd4);
cond_syscall(sys_timerfd_create);
cond_syscall(sys_timerfd_settime);
cond_syscall(sys_timerfd_gettime);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1a8299d1fe5..911d846f050 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -160,12 +160,13 @@ static struct ctl_table root_table[];
static struct ctl_table_root sysctl_table_root;
static struct ctl_table_header root_table_header = {
.ctl_table = root_table,
- .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
+ .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
.root = &sysctl_table_root,
+ .set = &sysctl_table_root.default_set,
};
static struct ctl_table_root sysctl_table_root = {
.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
- .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
+ .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
};
static struct ctl_table kern_table[];
@@ -624,7 +625,7 @@ static struct ctl_table kern_table[] = {
{
.ctl_name = KERN_PRINTK_RATELIMIT,
.procname = "printk_ratelimit",
- .data = &printk_ratelimit_jiffies,
+ .data = &printk_ratelimit_state.interval,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_jiffies,
@@ -633,7 +634,7 @@ static struct ctl_table kern_table[] = {
{
.ctl_name = KERN_PRINTK_RATELIMIT_BURST,
.procname = "printk_ratelimit_burst",
- .data = &printk_ratelimit_burst,
+ .data = &printk_ratelimit_state.burst,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
@@ -1386,6 +1387,9 @@ static void start_unregistering(struct ctl_table_header *p)
spin_unlock(&sysctl_lock);
wait_for_completion(&wait);
spin_lock(&sysctl_lock);
+ } else {
+ /* anything non-NULL; we'll never dereference it */
+ p->unregistering = ERR_PTR(-EINVAL);
}
/*
* do not remove from the list until nobody holds it; walking the
@@ -1394,6 +1398,32 @@ static void start_unregistering(struct ctl_table_header *p)
list_del_init(&p->ctl_entry);
}
+void sysctl_head_get(struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ head->count++;
+ spin_unlock(&sysctl_lock);
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ if (!--head->count)
+ kfree(head);
+ spin_unlock(&sysctl_lock);
+}
+
+struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+ if (!head)
+ BUG();
+ spin_lock(&sysctl_lock);
+ if (!use_table(head))
+ head = ERR_PTR(-ENOENT);
+ spin_unlock(&sysctl_lock);
+ return head;
+}
+
void sysctl_head_finish(struct ctl_table_header *head)
{
if (!head)
@@ -1403,14 +1433,20 @@ void sysctl_head_finish(struct ctl_table_header *head)
spin_unlock(&sysctl_lock);
}
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+ struct ctl_table_set *set = &root->default_set;
+ if (root->lookup)
+ set = root->lookup(root, namespaces);
+ return set;
+}
+
static struct list_head *
lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
{
- struct list_head *header_list;
- header_list = &root->header_list;
- if (root->lookup)
- header_list = root->lookup(root, namespaces);
- return header_list;
+ struct ctl_table_set *set = lookup_header_set(root, namespaces);
+ return &set->list;
}
struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
@@ -1480,9 +1516,9 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
int op = 0, rc;
if (oldval)
- op |= 004;
+ op |= MAY_READ;
if (newval)
- op |= 002;
+ op |= MAY_WRITE;
if (sysctl_perm(root, table, op))
return -EPERM;
@@ -1524,7 +1560,7 @@ repeat:
if (n == table->ctl_name) {
int error;
if (table->child) {
- if (sysctl_perm(root, table, 001))
+ if (sysctl_perm(root, table, MAY_EXEC))
return -EPERM;
name++;
nlen--;
@@ -1599,7 +1635,7 @@ static int test_perm(int mode, int op)
mode >>= 6;
else if (in_egroup_p(0))
mode >>= 3;
- if ((mode & op & 0007) == op)
+ if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
return 0;
return -EACCES;
}
@@ -1609,7 +1645,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
int error;
int mode;
- error = security_sysctl(table, op);
+ error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
if (error)
return error;
@@ -1644,6 +1680,52 @@ static __init int sysctl_init(void)
core_initcall(sysctl_init);
+static int is_branch_in(struct ctl_table *branch, struct ctl_table *table)
+{
+ struct ctl_table *p;
+ const char *s = branch->procname;
+
+ /* branch should have named subdirectory as its first element */
+ if (!s || !branch->child)
+ return 0;
+
+ /* ... and nothing else */
+ if (branch[1].procname || branch[1].ctl_name)
+ return 0;
+
+ /* table should contain subdirectory with the same name */
+ for (p = table; p->procname || p->ctl_name; p++) {
+ if (!p->child)
+ continue;
+ if (p->procname && strcmp(p->procname, s) == 0)
+ return 1;
+ }
+ return 0;
+}
+
+/* see if attaching q to p would be an improvement */
+static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
+{
+ struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
+ int is_better = 0;
+ int not_in_parent = !p->attached_by;
+
+ while (is_branch_in(by, to)) {
+ if (by == q->attached_by)
+ is_better = 1;
+ if (to == p->attached_by)
+ not_in_parent = 1;
+ by = by->child;
+ to = to->child;
+ }
+
+ if (is_better && not_in_parent) {
+ q->attached_by = by;
+ q->attached_to = to;
+ q->parent = p;
+ }
+}
+
/**
* __register_sysctl_paths - register a sysctl hierarchy
* @root: List of sysctl headers to register on
@@ -1720,10 +1802,10 @@ struct ctl_table_header *__register_sysctl_paths(
struct nsproxy *namespaces,
const struct ctl_path *path, struct ctl_table *table)
{
- struct list_head *header_list;
struct ctl_table_header *header;
struct ctl_table *new, **prevp;
unsigned int n, npath;
+ struct ctl_table_set *set;
/* Count the path components */
for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
@@ -1765,6 +1847,7 @@ struct ctl_table_header *__register_sysctl_paths(
header->unregistering = NULL;
header->root = root;
sysctl_set_parent(NULL, header->ctl_table);
+ header->count = 1;
#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
if (sysctl_check_table(namespaces, header->ctl_table)) {
kfree(header);
@@ -1772,8 +1855,20 @@ struct ctl_table_header *__register_sysctl_paths(
}
#endif
spin_lock(&sysctl_lock);
- header_list = lookup_header_list(root, namespaces);
- list_add_tail(&header->ctl_entry, header_list);
+ header->set = lookup_header_set(root, namespaces);
+ header->attached_by = header->ctl_table;
+ header->attached_to = root_table;
+ header->parent = &root_table_header;
+ for (set = header->set; set; set = set->parent) {
+ struct ctl_table_header *p;
+ list_for_each_entry(p, &set->list, ctl_entry) {
+ if (p->unregistering)
+ continue;
+ try_attach(p, header);
+ }
+ }
+ header->parent->count++;
+ list_add_tail(&header->ctl_entry, &header->set->list);
spin_unlock(&sysctl_lock);
return header;
@@ -1828,8 +1923,37 @@ void unregister_sysctl_table(struct ctl_table_header * header)
spin_lock(&sysctl_lock);
start_unregistering(header);
+ if (!--header->parent->count) {
+ WARN_ON(1);
+ kfree(header->parent);
+ }
+ if (!--header->count)
+ kfree(header);
spin_unlock(&sysctl_lock);
- kfree(header);
+}
+
+int sysctl_is_seen(struct ctl_table_header *p)
+{
+ struct ctl_table_set *set = p->set;
+ int res;
+ spin_lock(&sysctl_lock);
+ if (p->unregistering)
+ res = 0;
+ else if (!set->is_seen)
+ res = 1;
+ else
+ res = set->is_seen(set);
+ spin_unlock(&sysctl_lock);
+ return res;
+}
+
+void setup_sysctl_set(struct ctl_table_set *p,
+ struct ctl_table_set *parent,
+ int (*is_seen)(struct ctl_table_set *))
+{
+ INIT_LIST_HEAD(&p->list);
+ p->parent = parent ? parent : &sysctl_table_root.default_set;
+ p->is_seen = is_seen;
}
#else /* !CONFIG_SYSCTL */
@@ -1848,6 +1972,16 @@ void unregister_sysctl_table(struct ctl_table_header * table)
{
}
+void setup_sysctl_set(struct ctl_table_set *p,
+ struct ctl_table_set *parent,
+ int (*is_seen)(struct ctl_table_set *))
+{
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+}
+
#endif /* CONFIG_SYSCTL */
/*
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c09350d564f..c35da23ab8f 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1532,6 +1532,8 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
sysctl_check_leaf(namespaces, table, &fail);
}
sysctl_check_bin_path(table, &fail);
+ if (table->mode > 0777)
+ set_fail(&fail, table, "bogus .mode");
if (fail) {
set_fail(&fail, table, NULL);
error = -EINVAL;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 06b17547f4e..bd6be76303c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -35,7 +35,7 @@
*/
#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
-static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static DEFINE_PER_CPU(__u32, taskstats_seqnum);
static int family_registered;
struct kmem_cache *taskstats_cache;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 868e121c8e3..fc20e09a6cb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1203,9 +1203,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
iter->pos = *pos;
- if (last_ent && !ent)
- seq_puts(m, "\n\nvim:ft=help\n");
-
return ent;
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 421d6fe3650..ece6cfb649f 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -253,12 +253,14 @@ void start_critical_timings(void)
if (preempt_trace() || irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
+EXPORT_SYMBOL_GPL(start_critical_timings);
void stop_critical_timings(void)
{
if (preempt_trace() || irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
+EXPORT_SYMBOL_GPL(stop_critical_timings);
#ifdef CONFIG_IRQSOFF_TRACER
#ifdef CONFIG_PROVE_LOCKING
@@ -337,12 +339,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
#ifdef CONFIG_PREEMPT_TRACER
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
- stop_critical_timing(a0, a1);
+ if (preempt_trace())
+ stop_critical_timing(a0, a1);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
- start_critical_timing(a0, a1);
+ if (preempt_trace())
+ start_critical_timing(a0, a1);
}
#endif /* CONFIG_PREEMPT_TRACER */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3c8d61df447..e303ccb62cd 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -26,7 +26,8 @@ static struct task_struct *wakeup_task;
static int wakeup_cpu;
static unsigned wakeup_prio = -1;
-static DEFINE_SPINLOCK(wakeup_lock);
+static raw_spinlock_t wakeup_lock =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
static void __wakeup_reset(struct trace_array *tr);
@@ -56,7 +57,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
if (unlikely(disabled != 1))
goto out;
- spin_lock_irqsave(&wakeup_lock, flags);
+ local_irq_save(flags);
+ __raw_spin_lock(&wakeup_lock);
if (unlikely(!wakeup_task))
goto unlock;
@@ -71,7 +73,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
trace_function(tr, data, ip, parent_ip, flags);
unlock:
- spin_unlock_irqrestore(&wakeup_lock, flags);
+ __raw_spin_unlock(&wakeup_lock);
+ local_irq_restore(flags);
out:
atomic_dec(&data->disabled);
@@ -145,7 +148,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
if (likely(disabled != 1))
goto out;
- spin_lock_irqsave(&wakeup_lock, flags);
+ local_irq_save(flags);
+ __raw_spin_lock(&wakeup_lock);
/* We could race with grabbing wakeup_lock */
if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -174,7 +178,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
out_unlock:
__wakeup_reset(tr);
- spin_unlock_irqrestore(&wakeup_lock, flags);
+ __raw_spin_unlock(&wakeup_lock);
+ local_irq_restore(flags);
out:
atomic_dec(&tr->data[cpu]->disabled);
}
@@ -209,8 +214,6 @@ static void __wakeup_reset(struct trace_array *tr)
struct trace_array_cpu *data;
int cpu;
- assert_spin_locked(&wakeup_lock);
-
for_each_possible_cpu(cpu) {
data = tr->data[cpu];
tracing_reset(data);
@@ -229,9 +232,11 @@ static void wakeup_reset(struct trace_array *tr)
{
unsigned long flags;
- spin_lock_irqsave(&wakeup_lock, flags);
+ local_irq_save(flags);
+ __raw_spin_lock(&wakeup_lock);
__wakeup_reset(tr);
- spin_unlock_irqrestore(&wakeup_lock, flags);
+ __raw_spin_unlock(&wakeup_lock);
+ local_irq_restore(flags);
}
static void
@@ -252,7 +257,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
goto out;
/* interrupts should be off from try_to_wake_up */
- spin_lock(&wakeup_lock);
+ __raw_spin_lock(&wakeup_lock);
/* check for races. */
if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -274,7 +279,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
CALLER_ADDR1, CALLER_ADDR2, flags);
out_locked:
- spin_unlock(&wakeup_lock);
+ __raw_spin_unlock(&wakeup_lock);
out:
atomic_dec(&tr->data[cpu]->disabled);
}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 63528086337..ce2d723c10e 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -161,7 +161,7 @@ static void timer_notify(struct pt_regs *regs, int cpu)
__trace_special(tr, data, 2, regs->ip, 0);
while (i < sample_max_depth) {
- frame.next_fp = 0;
+ frame.next_fp = NULL;
frame.return_address = 0;
if (!copy_stack_frame(fp, &frame))
break;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961..3da47ccdc5e 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -28,14 +28,14 @@
void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
{
struct timespec uptime, ts;
- s64 ac_etime;
+ u64 ac_etime;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
/* calculate task elapsed time in timespec */
do_posix_clock_monotonic_gettime(&uptime);
ts = timespec_sub(uptime, tsk->start_time);
- /* rebase elapsed time to usec */
+ /* rebase elapsed time to usec (should never be negative) */
ac_etime = timespec_to_ns(&ts);
do_div(ac_etime, NSEC_PER_USEC);
stats->ac_etime = ac_etime;
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
{
struct mm_struct *mm;
- /* convert pages-jiffies to Mbyte-usec */
- stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
- stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
+ /* convert pages-usec to Mbyte-usec */
+ stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
+ stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
mm = get_task_mm(p);
if (mm) {
/* adjust to KB unit */
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
void acct_update_integrals(struct task_struct *tsk)
{
if (likely(tsk->mm)) {
- long delta = cputime_to_jiffies(
- cputime_sub(tsk->stime, tsk->acct_stimexpd));
+ cputime_t time, dtime;
+ struct timeval value;
+ u64 delta;
+
+ time = tsk->stime + tsk->utime;
+ dtime = cputime_sub(time, tsk->acct_timexpd);
+ jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+ delta = value.tv_sec;
+ delta = delta * USEC_PER_SEC + value.tv_usec;
if (delta == 0)
return;
- tsk->acct_stimexpd = tsk->stime;
+ tsk->acct_timexpd = time;
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
}
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
*/
void acct_clear_integrals(struct task_struct *tsk)
{
- tsk->acct_stimexpd = 0;
+ tsk->acct_timexpd = 0;
tsk->acct_rss_mem1 = 0;
tsk->acct_vm_mem1 = 0;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6fd158b2102..ec7e4f62aaf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -125,7 +125,7 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
}
static void insert_work(struct cpu_workqueue_struct *cwq,
- struct work_struct *work, int tail)
+ struct work_struct *work, struct list_head *head)
{
set_wq_data(work, cwq);
/*
@@ -133,10 +133,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
* result of list_add() below, see try_to_grab_pending().
*/
smp_wmb();
- if (tail)
- list_add_tail(&work->entry, &cwq->worklist);
- else
- list_add(&work->entry, &cwq->worklist);
+ list_add_tail(&work->entry, head);
wake_up(&cwq->more_work);
}
@@ -146,7 +143,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
unsigned long flags;
spin_lock_irqsave(&cwq->lock, flags);
- insert_work(cwq, work, 1);
+ insert_work(cwq, work, &cwq->worklist);
spin_unlock_irqrestore(&cwq->lock, flags);
}
@@ -162,14 +159,11 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
*/
int queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
- int ret = 0;
+ int ret;
+
+ ret = queue_work_on(get_cpu(), wq, work);
+ put_cpu();
- if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- BUG_ON(!list_empty(&work->entry));
- __queue_work(wq_per_cpu(wq, get_cpu()), work);
- put_cpu();
- ret = 1;
- }
return ret;
}
EXPORT_SYMBOL_GPL(queue_work);
@@ -361,14 +355,14 @@ static void wq_barrier_func(struct work_struct *work)
}
static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
- struct wq_barrier *barr, int tail)
+ struct wq_barrier *barr, struct list_head *head)
{
INIT_WORK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
init_completion(&barr->done);
- insert_work(cwq, &barr->work, tail);
+ insert_work(cwq, &barr->work, head);
}
static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -388,7 +382,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
active = 0;
spin_lock_irq(&cwq->lock);
if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
- insert_wq_barrier(cwq, &barr, 1);
+ insert_wq_barrier(cwq, &barr, &cwq->worklist);
active = 1;
}
spin_unlock_irq(&cwq->lock);
@@ -426,6 +420,57 @@ void flush_workqueue(struct workqueue_struct *wq)
}
EXPORT_SYMBOL_GPL(flush_workqueue);
+/**
+ * flush_work - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
+ *
+ * Returns false if @work has already terminated.
+ *
+ * It is expected that, prior to calling flush_work(), the caller has
+ * arranged for the work to not be requeued, otherwise it doesn't make
+ * sense to use this function.
+ */
+int flush_work(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq;
+ struct list_head *prev;
+ struct wq_barrier barr;
+
+ might_sleep();
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return 0;
+
+ lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+ lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+
+ prev = NULL;
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&work->entry)) {
+ /*
+ * See the comment near try_to_grab_pending()->smp_rmb().
+ * If it was re-queued under us we are not going to wait.
+ */
+ smp_rmb();
+ if (unlikely(cwq != get_wq_data(work)))
+ goto out;
+ prev = &work->entry;
+ } else {
+ if (cwq->current_work != work)
+ goto out;
+ prev = &cwq->worklist;
+ }
+ insert_wq_barrier(cwq, &barr, prev->next);
+out:
+ spin_unlock_irq(&cwq->lock);
+ if (!prev)
+ return 0;
+
+ wait_for_completion(&barr.done);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(flush_work);
+
/*
* Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
* so this work can't be re-armed in any way.
@@ -473,7 +518,7 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
spin_lock_irq(&cwq->lock);
if (unlikely(cwq->current_work == work)) {
- insert_wq_barrier(cwq, &barr, 0);
+ insert_wq_barrier(cwq, &barr, cwq->worklist.next);
running = 1;
}
spin_unlock_irq(&cwq->lock);
@@ -644,10 +689,10 @@ int schedule_on_each_cpu(work_func_t func)
struct work_struct *work = per_cpu_ptr(works, cpu);
INIT_WORK(work, func);
- set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
- __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
+ schedule_work_on(cpu, work);
}
- flush_workqueue(keventd_wq);
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(works, cpu));
put_online_cpus();
free_percpu(works);
return 0;
@@ -784,7 +829,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
err = create_workqueue_thread(cwq, singlethread_cpu);
start_workqueue_thread(cwq, -1);
} else {
- get_online_cpus();
+ cpu_maps_update_begin();
spin_lock(&workqueue_lock);
list_add(&wq->list, &workqueues);
spin_unlock(&workqueue_lock);
@@ -796,7 +841,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
err = create_workqueue_thread(cwq, cpu);
start_workqueue_thread(cwq, cpu);
}
- put_online_cpus();
+ cpu_maps_update_done();
}
if (err) {
@@ -810,8 +855,8 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key);
static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
{
/*
- * Our caller is either destroy_workqueue() or CPU_DEAD,
- * get_online_cpus() protects cwq->thread.
+ * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
+ * cpu_add_remove_lock protects cwq->thread.
*/
if (cwq->thread == NULL)
return;
@@ -821,7 +866,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
flush_cpu_workqueue(cwq);
/*
- * If the caller is CPU_DEAD and cwq->worklist was not empty,
+ * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
* a concurrent flush_workqueue() can insert a barrier after us.
* However, in that case run_workqueue() won't return and check
* kthread_should_stop() until it flushes all work_struct's.
@@ -845,14 +890,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
const cpumask_t *cpu_map = wq_cpu_map(wq);
int cpu;
- get_online_cpus();
+ cpu_maps_update_begin();
spin_lock(&workqueue_lock);
list_del(&wq->list);
spin_unlock(&workqueue_lock);
for_each_cpu_mask_nr(cpu, *cpu_map)
cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
- put_online_cpus();
+ cpu_maps_update_done();
free_percpu(wq->cpu_wq);
kfree(wq);
@@ -866,6 +911,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
unsigned int cpu = (unsigned long)hcpu;
struct cpu_workqueue_struct *cwq;
struct workqueue_struct *wq;
+ int ret = NOTIFY_OK;
action &= ~CPU_TASKS_FROZEN;
@@ -873,7 +919,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
cpu_set(cpu, cpu_populated_map);
}
-
+undo:
list_for_each_entry(wq, &workqueues, list) {
cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -883,7 +929,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
break;
printk(KERN_ERR "workqueue [%s] for %i failed\n",
wq->name, cpu);
- return NOTIFY_BAD;
+ action = CPU_UP_CANCELED;
+ ret = NOTIFY_BAD;
+ goto undo;
case CPU_ONLINE:
start_workqueue_thread(cwq, cpu);
@@ -891,7 +939,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
case CPU_UP_CANCELED:
start_workqueue_thread(cwq, -1);
- case CPU_DEAD:
+ case CPU_POST_DEAD:
cleanup_workqueue_thread(cwq);
break;
}
@@ -899,11 +947,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_CANCELED:
- case CPU_DEAD:
+ case CPU_POST_DEAD:
cpu_clear(cpu, cpu_populated_map);
}
- return NOTIFY_OK;
+ return ret;
}
void __init init_workqueues(void)