aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit_tree.c48
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/exec_domain.c33
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/futex.c11
-rw-r--r--kernel/hrtimer.c206
-rw-r--r--kernel/kexec.c1
-rw-r--r--kernel/module.c343
-rw-r--r--kernel/panic.c17
-rw-r--r--kernel/params.c276
-rw-r--r--kernel/posix-timers.c10
-rw-r--r--kernel/power/disk.c2
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/rcupdate.c19
-rw-r--r--kernel/rtmutex.c3
-rw-r--r--kernel/sched.c62
-rw-r--r--kernel/sched_fair.c229
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_idletask.c5
-rw-r--r--kernel/sched_rt.c5
-rw-r--r--kernel/sched_stats.h11
-rw-r--r--kernel/stop_machine.c120
-rw-r--r--kernel/sys.c10
-rw-r--r--kernel/sysctl.c10
-rw-r--r--kernel/time.c18
-rw-r--r--kernel/time/ntp.c3
-rw-r--r--kernel/time/tick-sched.c35
-rw-r--r--kernel/time/timer_list.c8
-rw-r--r--kernel/workqueue.c7
30 files changed, 914 insertions, 600 deletions
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f7921a2ecf1..8ba0e0d934f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -532,7 +532,7 @@ void audit_trim_trees(void)
list_add(&cursor, &tree_list);
while (cursor.next != &tree_list) {
struct audit_tree *tree;
- struct nameidata nd;
+ struct path path;
struct vfsmount *root_mnt;
struct node *node;
struct list_head list;
@@ -544,12 +544,12 @@ void audit_trim_trees(void)
list_add(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
- err = path_lookup(tree->pathname, 0, &nd);
+ err = kern_path(tree->pathname, 0, &path);
if (err)
goto skip_it;
- root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
- path_put(&nd.path);
+ root_mnt = collect_mounts(path.mnt, path.dentry);
+ path_put(&path);
if (!root_mnt)
goto skip_it;
@@ -580,19 +580,19 @@ skip_it:
}
static int is_under(struct vfsmount *mnt, struct dentry *dentry,
- struct nameidata *nd)
+ struct path *path)
{
- if (mnt != nd->path.mnt) {
+ if (mnt != path->mnt) {
for (;;) {
if (mnt->mnt_parent == mnt)
return 0;
- if (mnt->mnt_parent == nd->path.mnt)
+ if (mnt->mnt_parent == path->mnt)
break;
mnt = mnt->mnt_parent;
}
dentry = mnt->mnt_mountpoint;
}
- return is_subdir(dentry, nd->path.dentry);
+ return is_subdir(dentry, path->dentry);
}
int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
@@ -618,7 +618,7 @@ void audit_put_tree(struct audit_tree *tree)
int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
- struct nameidata nd;
+ struct path path;
struct vfsmount *mnt, *p;
struct list_head list;
int err;
@@ -637,11 +637,11 @@ int audit_add_tree_rule(struct audit_krule *rule)
/* do not set rule->tree yet */
mutex_unlock(&audit_filter_mutex);
- err = path_lookup(tree->pathname, 0, &nd);
+ err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
- mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
- path_put(&nd.path);
+ mnt = collect_mounts(path.mnt, path.dentry);
+ path_put(&path);
if (!mnt) {
err = -ENOMEM;
goto Err;
@@ -690,29 +690,29 @@ int audit_tag_tree(char *old, char *new)
{
struct list_head cursor, barrier;
int failed = 0;
- struct nameidata nd;
+ struct path path;
struct vfsmount *tagged;
struct list_head list;
struct vfsmount *mnt;
struct dentry *dentry;
int err;
- err = path_lookup(new, 0, &nd);
+ err = kern_path(new, 0, &path);
if (err)
return err;
- tagged = collect_mounts(nd.path.mnt, nd.path.dentry);
- path_put(&nd.path);
+ tagged = collect_mounts(path.mnt, path.dentry);
+ path_put(&path);
if (!tagged)
return -ENOMEM;
- err = path_lookup(old, 0, &nd);
+ err = kern_path(old, 0, &path);
if (err) {
drop_collected_mounts(tagged);
return err;
}
- mnt = mntget(nd.path.mnt);
- dentry = dget(nd.path.dentry);
- path_put(&nd.path);
+ mnt = mntget(path.mnt);
+ dentry = dget(path.dentry);
+ path_put(&path);
if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
follow_up(&mnt, &dentry);
@@ -733,7 +733,7 @@ int audit_tag_tree(char *old, char *new)
list_add(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
- err = path_lookup(tree->pathname, 0, &nd);
+ err = kern_path(tree->pathname, 0, &path);
if (err) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -741,15 +741,15 @@ int audit_tag_tree(char *old, char *new)
}
spin_lock(&vfsmount_lock);
- if (!is_under(mnt, dentry, &nd)) {
+ if (!is_under(mnt, dentry, &path)) {
spin_unlock(&vfsmount_lock);
- path_put(&nd.path);
+ path_put(&path);
put_tree(tree);
mutex_lock(&audit_filter_mutex);
continue;
}
spin_unlock(&vfsmount_lock);
- path_put(&nd.path);
+ path_put(&path);
list_for_each_entry(p, &list, mnt_list) {
failed = tag_chunk(p->mnt_root->d_inode, tree);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 046c1609606..35eebd5510c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2104,7 +2104,7 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
down_read(&cgrp->pids_mutex);
if (pid) {
int end = cgrp->pids_length;
- int i;
+
while (index < end) {
int mid = (index + end) / 2;
if (cgrp->tasks_pids[mid] == pid) {
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0d407e88673..0511716e942 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -12,7 +12,9 @@
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/personality.h>
+#include <linux/proc_fs.h>
#include <linux/sched.h>
+#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
@@ -173,20 +175,39 @@ __set_personality(u_long personality)
return 0;
}
-int
-get_exec_domain_list(char *page)
+#ifdef CONFIG_PROC_FS
+static int execdomains_proc_show(struct seq_file *m, void *v)
{
struct exec_domain *ep;
- int len = 0;
read_lock(&exec_domains_lock);
- for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next)
- len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n",
+ for (ep = exec_domains; ep; ep = ep->next)
+ seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
ep->pers_low, ep->pers_high, ep->name,
module_name(ep->module));
read_unlock(&exec_domains_lock);
- return (len);
+ return 0;
+}
+
+static int execdomains_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, execdomains_proc_show, NULL);
+}
+
+static const struct file_operations execdomains_proc_fops = {
+ .open = execdomains_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init proc_execdomains_init(void)
+{
+ proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
+ return 0;
}
+module_init(proc_execdomains_init);
+#endif
asmlinkage long
sys_personality(u_long personality)
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d093552dd6..f6083561dfe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1018,6 +1018,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;
+ p->default_timer_slack_ns = current->timer_slack_ns;
+
#ifdef CONFIG_DETECT_SOFTLOCKUP
p->last_switch_count = 0;
p->last_switch_timestamp = 0;
diff --git a/kernel/futex.c b/kernel/futex.c
index 7d1136e97c1..8af10027514 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1296,13 +1296,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
if (!abs_time)
schedule();
else {
+ unsigned long slack;
+ slack = current->timer_slack_ns;
+ if (rt_task(current))
+ slack = 0;
hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
hrtimer_init_sleeper(&t, current);
- t.timer.expires = *abs_time;
+ hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
- hrtimer_start(&t.timer, t.timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
if (!hrtimer_active(&t.timer))
t.task = NULL;
@@ -1404,7 +1407,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
- to->timer.expires = *time;
+ hrtimer_set_expires(&to->timer, *time);
}
q.pi_state = NULL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 95978f48e03..2b465dfde42 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -517,7 +517,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
if (!base->first)
continue;
timer = rb_entry(base->first, struct hrtimer, node);
- expires = ktime_sub(timer->expires, base->offset);
+ expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires.tv64 < cpu_base->expires_next.tv64)
cpu_base->expires_next = expires;
}
@@ -539,10 +539,10 @@ static int hrtimer_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base)
{
ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
- ktime_t expires = ktime_sub(timer->expires, base->offset);
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
int res;
- WARN_ON_ONCE(timer->expires.tv64 < 0);
+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
/*
* When the callback is running, we do not reprogram the clock event
@@ -795,7 +795,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
u64 orun = 1;
ktime_t delta;
- delta = ktime_sub(now, timer->expires);
+ delta = ktime_sub(now, hrtimer_get_expires(timer));
if (delta.tv64 < 0)
return 0;
@@ -807,8 +807,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
s64 incr = ktime_to_ns(interval);
orun = ktime_divns(delta, incr);
- timer->expires = ktime_add_ns(timer->expires, incr * orun);
- if (timer->expires.tv64 > now.tv64)
+ hrtimer_add_expires_ns(timer, incr * orun);
+ if (hrtimer_get_expires_tv64(timer) > now.tv64)
return orun;
/*
* This (and the ktime_add() below) is the
@@ -816,7 +816,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
*/
orun++;
}
- timer->expires = ktime_add_safe(timer->expires, interval);
+ hrtimer_add_expires(timer, interval);
return orun;
}
@@ -848,7 +848,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
* We dont care about collisions. Nodes with
* the same expiry time stay together.
*/
- if (timer->expires.tv64 < entry->expires.tv64) {
+ if (hrtimer_get_expires_tv64(timer) <
+ hrtimer_get_expires_tv64(entry)) {
link = &(*link)->rb_left;
} else {
link = &(*link)->rb_right;
@@ -945,9 +946,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
}
/**
- * hrtimer_start - (re)start an relative timer on the current CPU
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
* @tim: expiry time
+ * @delta_ns: "slack" range for the timer
* @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
*
* Returns:
@@ -955,7 +957,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
* 1 when the timer was active
*/
int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
+ const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
@@ -983,7 +986,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
#endif
}
- timer->expires = tim;
+ hrtimer_set_expires_range_ns(timer, tim, delta_ns);
timer_stats_hrtimer_set_start_info(timer);
@@ -1016,8 +1019,26 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
return ret;
}
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+/**
+ * hrtimer_start - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ * 0 on success
+ * 1 when the timer was active
+ */
+int
+hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+{
+ return hrtimer_start_range_ns(timer, tim, 0, mode);
+}
EXPORT_SYMBOL_GPL(hrtimer_start);
+
/**
* hrtimer_try_to_cancel - try to deactivate a timer
* @timer: hrtimer to stop
@@ -1077,7 +1098,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
ktime_t rem;
base = lock_hrtimer_base(timer, &flags);
- rem = ktime_sub(timer->expires, base->get_time());
+ rem = hrtimer_expires_remaining(timer);
unlock_hrtimer_base(timer, &flags);
return rem;
@@ -1109,7 +1130,7 @@ ktime_t hrtimer_get_next_event(void)
continue;
timer = rb_entry(base->first, struct hrtimer, node);
- delta.tv64 = timer->expires.tv64;
+ delta.tv64 = hrtimer_get_expires_tv64(timer);
delta = ktime_sub(delta, base->get_time());
if (delta.tv64 < mindelta.tv64)
mindelta.tv64 = delta.tv64;
@@ -1310,10 +1331,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
timer = rb_entry(node, struct hrtimer, node);
- if (basenow.tv64 < timer->expires.tv64) {
+ /*
+ * The immediate goal for using the softexpires is
+ * minimizing wakeups, not running timers at the
+ * earliest interrupt after their soft expiration.
+ * This allows us to avoid using a Priority Search
+ * Tree, which can answer a stabbing querry for
+ * overlapping intervals and instead use the simple
+ * BST we already have.
+ * We don't add extra wakeups by delaying timers that
+ * are right-of a not yet expired timer, because that
+ * timer will have to trigger a wakeup anyway.
+ */
+
+ if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
ktime_t expires;
- expires = ktime_sub(timer->expires,
+ expires = ktime_sub(hrtimer_get_expires(timer),
base->offset);
if (expires.tv64 < expires_next.tv64)
expires_next = expires;
@@ -1349,6 +1383,30 @@ void hrtimer_interrupt(struct clock_event_device *dev)
raise_softirq(HRTIMER_SOFTIRQ);
}
+/**
+ * hrtimer_peek_ahead_timers -- run soft-expired timers now
+ *
+ * hrtimer_peek_ahead_timers will peek at the timer queue of
+ * the current cpu and check if there are any timers for which
+ * the soft expires time has passed. If any such timers exist,
+ * they are run immediately and then removed from the timer queue.
+ *
+ */
+void hrtimer_peek_ahead_timers(void)
+{
+ struct tick_device *td;
+ unsigned long flags;
+
+ if (!hrtimer_hres_active())
+ return;
+
+ local_irq_save(flags);
+ td = &__get_cpu_var(tick_cpu_device);
+ if (td && td->evtdev)
+ hrtimer_interrupt(td->evtdev);
+ local_irq_restore(flags);
+}
+
static void run_hrtimer_softirq(struct softirq_action *h)
{
run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
@@ -1414,7 +1472,8 @@ void hrtimer_run_queues(void)
struct hrtimer *timer;
timer = rb_entry(node, struct hrtimer, node);
- if (base->softirq_time.tv64 <= timer->expires.tv64)
+ if (base->softirq_time.tv64 <=
+ hrtimer_get_expires_tv64(timer))
break;
if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
@@ -1462,7 +1521,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
do {
set_current_state(TASK_INTERRUPTIBLE);
- hrtimer_start(&t->timer, t->timer.expires, mode);
+ hrtimer_start_expires(&t->timer, mode);
if (!hrtimer_active(&t->timer))
t->task = NULL;
@@ -1484,7 +1543,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
struct timespec rmt;
ktime_t rem;
- rem = ktime_sub(timer->expires, timer->base->get_time());
+ rem = hrtimer_expires_remaining(timer);
if (rem.tv64 <= 0)
return 0;
rmt = ktime_to_timespec(rem);
@@ -1503,7 +1562,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
HRTIMER_MODE_ABS);
- t.timer.expires.tv64 = restart->nanosleep.expires;
+ hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
if (do_nanosleep(&t, HRTIMER_MODE_ABS))
goto out;
@@ -1528,9 +1587,14 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
+ unsigned long slack;
+
+ slack = current->timer_slack_ns;
+ if (rt_task(current))
+ slack = 0;
hrtimer_init_on_stack(&t.timer, clockid, mode);
- t.timer.expires = timespec_to_ktime(*rqtp);
+ hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
if (do_nanosleep(&t, mode))
goto out;
@@ -1550,7 +1614,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.index = t.timer.base->index;
restart->nanosleep.rmtp = rmtp;
- restart->nanosleep.expires = t.timer.expires.tv64;
+ restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
ret = -ERESTART_RESTARTBLOCK;
out:
@@ -1752,3 +1816,103 @@ void __init hrtimers_init(void)
#endif
}
+/**
+ * schedule_hrtimeout_range - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @delta: slack in expires timeout (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * The @delta argument gives the kernel the freedom to schedule the
+ * actual wakeup to a time that is both power and performance friendly.
+ * The kernel give the normal best effort behavior for "@expires+@delta",
+ * but may decide to fire the timer earlier, but no earlier than @expires.
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+ const enum hrtimer_mode mode)
+{
+ struct hrtimer_sleeper t;
+
+ /*
+ * Optimize when a zero timeout value is given. It does not
+ * matter whether this is an absolute or a relative time.
+ */
+ if (expires && !expires->tv64) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ /*
+ * A NULL parameter means "inifinte"
+ */
+ if (!expires) {
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ return -EINTR;
+ }
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
+ hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
+
+ hrtimer_init_sleeper(&t, current);
+
+ hrtimer_start_expires(&t.timer, mode);
+ if (!hrtimer_active(&t.timer))
+ t.task = NULL;
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ return !t.task ? 0 : -EINTR;
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout(ktime_t *expires,
+ const enum hrtimer_mode mode)
+{
+ return schedule_hrtimeout_range(expires, 0, mode);
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 777ac458ac9..ac0fde7b54d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -30,6 +30,7 @@
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/console.h>
+#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/uaccess.h>
diff --git a/kernel/module.c b/kernel/module.c
index 0d8d21ee792..1f4cc00e0c2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,11 +20,13 @@
#include <linux/moduleloader.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
+#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/elf.h>
+#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
@@ -42,6 +44,7 @@
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/unwind.h>
+#include <linux/rculist.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <linux/license.h>
@@ -63,7 +66,7 @@
#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
/* List of modules, protected by module_mutex or preempt_disable
- * (add/delete uses stop_machine). */
+ * (delete uses stop_machine/add uses RCU list operations). */
static DEFINE_MUTEX(module_mutex);
static LIST_HEAD(modules);
@@ -132,6 +135,29 @@ static unsigned int find_sec(Elf_Ehdr *hdr,
return 0;
}
+/* Find a module section, or NULL. */
+static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs,
+ const char *secstrings, const char *name)
+{
+ /* Section 0 has sh_addr 0. */
+ return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr;
+}
+
+/* Find a module section, or NULL. Fill in number of "objects" in section. */
+static void *section_objs(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const char *name,
+ size_t object_size,
+ unsigned int *num)
+{
+ unsigned int sec = find_sec(hdr, sechdrs, secstrings, name);
+
+ /* Section 0 has sh_addr 0 and sh_size 0. */
+ *num = sechdrs[sec].sh_size / object_size;
+ return (void *)sechdrs[sec].sh_addr;
+}
+
/* Provided by the linker */
extern const struct kernel_symbol __start___ksymtab[];
extern const struct kernel_symbol __stop___ksymtab[];
@@ -218,7 +244,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
return true;
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
struct symsearch arr[] = {
{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
NOT_GPL_ONLY, false },
@@ -1394,17 +1420,6 @@ static void mod_kobject_remove(struct module *mod)
}
/*
- * link the module with the whole machine is stopped with interrupts off
- * - this defends against kallsyms not taking locks
- */
-static int __link_module(void *_mod)
-{
- struct module *mod = _mod;
- list_add(&mod->list, &modules);
- return 0;
-}
-
-/*
* unlink the module with the whole machine is stopped with interrupts off
* - this defends against kallsyms not taking locks
*/
@@ -1789,32 +1804,20 @@ static inline void add_kallsyms(struct module *mod,
}
#endif /* CONFIG_KALLSYMS */
-#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
-static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex)
+static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
{
- struct mod_debug *debug_info;
- unsigned long pos, end;
- unsigned int num_verbose;
-
- pos = sechdrs[verboseindex].sh_addr;
- num_verbose = sechdrs[verboseindex].sh_size /
- sizeof(struct mod_debug);
- end = pos + (num_verbose * sizeof(struct mod_debug));
+#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
+ unsigned int i;
- for (; pos < end; pos += sizeof(struct mod_debug)) {
- debug_info = (struct mod_debug *)pos;
- register_dynamic_debug_module(debug_info->modname,
- debug_info->type, debug_info->logical_modname,
- debug_info->flag_names, debug_info->hash,
- debug_info->hash2);
+ for (i = 0; i < num; i++) {
+ register_dynamic_debug_module(debug[i].modname,
+ debug[i].type,
+ debug[i].logical_modname,
+ debug[i].flag_names,
+ debug[i].hash, debug[i].hash2);
}
-}
-#else
-static inline void dynamic_printk_setup(Elf_Shdr *sechdrs,
- unsigned int verboseindex)
-{
-}
#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
+}
static void *module_alloc_update_bounds(unsigned long size)
{
@@ -1843,37 +1846,14 @@ static noinline struct module *load_module(void __user *umod,
unsigned int i;
unsigned int symindex = 0;
unsigned int strindex = 0;
- unsigned int setupindex;
- unsigned int exindex;
- unsigned int exportindex;
- unsigned int modindex;
- unsigned int obsparmindex;
- unsigned int infoindex;
- unsigned int gplindex;
- unsigned int crcindex;
- unsigned int gplcrcindex;
- unsigned int versindex;
- unsigned int pcpuindex;
- unsigned int gplfutureindex;
- unsigned int gplfuturecrcindex;
+ unsigned int modindex, versindex, infoindex, pcpuindex;
unsigned int unwindex = 0;
-#ifdef CONFIG_UNUSED_SYMBOLS
- unsigned int unusedindex;
- unsigned int unusedcrcindex;
- unsigned int unusedgplindex;
- unsigned int unusedgplcrcindex;
-#endif
- unsigned int markersindex;
- unsigned int markersstringsindex;
- unsigned int verboseindex;
- unsigned int tracepointsindex;
- unsigned int tracepointsstringsindex;
- unsigned int mcountindex;
+ unsigned int num_kp, num_mcount;
+ struct kernel_param *kp;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
- void *mseg;
- struct exception_table_entry *extable;
+ unsigned long *mseg;
mm_segment_t old_fs;
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -1937,6 +1917,7 @@ static noinline struct module *load_module(void __user *umod,
err = -ENOEXEC;
goto free_hdr;
}
+ /* This is temporary: point mod into copy of data. */
mod = (void *)sechdrs[modindex].sh_addr;
if (symindex == 0) {
@@ -1946,22 +1927,6 @@ static noinline struct module *load_module(void __user *umod,
goto free_hdr;
}
- /* Optional sections */
- exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
- gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
- gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
- crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
- gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
- gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
-#ifdef CONFIG_UNUSED_SYMBOLS
- unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
- unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
- unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
- unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
-#endif
- setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
- exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
- obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
@@ -2117,42 +2082,57 @@ static noinline struct module *load_module(void __user *umod,
if (err < 0)
goto cleanup;
- /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */
- mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms);
- mod->syms = (void *)sechdrs[exportindex].sh_addr;
- if (crcindex)
- mod->crcs = (void *)sechdrs[crcindex].sh_addr;
- mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms);
- mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
- if (gplcrcindex)
- mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
- mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
- sizeof(*mod->gpl_future_syms);
- mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
- if (gplfuturecrcindex)
- mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
+ /* Now we've got everything in the final locations, we can
+ * find optional sections. */
+ kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
+ &num_kp);
+ mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
+ sizeof(*mod->syms), &mod->num_syms);
+ mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
+ mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
+ sizeof(*mod->gpl_syms),
+ &mod->num_gpl_syms);
+ mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
+ mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
+ "__ksymtab_gpl_future",
+ sizeof(*mod->gpl_future_syms),
+ &mod->num_gpl_future_syms);
+ mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
+ "__kcrctab_gpl_future");
#ifdef CONFIG_UNUSED_SYMBOLS
- mod->num_unused_syms = sechdrs[unusedindex].sh_size /
- sizeof(*mod->unused_syms);
- mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
- sizeof(*mod->unused_gpl_syms);
- mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
- if (unusedcrcindex)
- mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
- mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
- if (unusedgplcrcindex)
- mod->unused_gpl_crcs
- = (void *)sechdrs[unusedgplcrcindex].sh_addr;
+ mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
+ "__ksymtab_unused",
+ sizeof(*mod->unused_syms),
+ &mod->num_unused_syms);
+ mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
+ "__kcrctab_unused");
+ mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
+ "__ksymtab_unused_gpl",
+ sizeof(*mod->unused_gpl_syms),
+ &mod->num_unused_gpl_syms);
+ mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
+ "__kcrctab_unused_gpl");
+#endif
+
+#ifdef CONFIG_MARKERS
+ mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
+ sizeof(*mod->markers), &mod->num_markers);
+#endif
+#ifdef CONFIG_TRACEPOINTS
+ mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
+ "__tracepoints",
+ sizeof(*mod->tracepoints),
+ &mod->num_tracepoints);
#endif
#ifdef CONFIG_MODVERSIONS
- if ((mod->num_syms && !crcindex)
- || (mod->num_gpl_syms && !gplcrcindex)
- || (mod->num_gpl_future_syms && !gplfuturecrcindex)
+ if ((mod->num_syms && !mod->crcs)
+ || (mod->num_gpl_syms && !mod->gpl_crcs)
+ || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
#ifdef CONFIG_UNUSED_SYMBOLS
- || (mod->num_unused_syms && !unusedcrcindex)
- || (mod->num_unused_gpl_syms && !unusedgplcrcindex)
+ || (mod->num_unused_syms && !mod->unused_crcs)
+ || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
#endif
) {
printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
@@ -2161,16 +2141,6 @@ static noinline struct module *load_module(void __user *umod,
goto cleanup;
}
#endif
- markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
- markersstringsindex = find_sec(hdr, sechdrs, secstrings,
- "__markers_strings");
- verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose");
- tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints");
- tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings,
- "__tracepoints_strings");
-
- mcountindex = find_sec(hdr, sechdrs, secstrings,
- "__mcount_loc");
/* Now do relocations. */
for (i = 1; i < hdr->e_shnum; i++) {
@@ -2193,28 +2163,16 @@ static noinline struct module *load_module(void __user *umod,
if (err < 0)
goto cleanup;
}
-#ifdef CONFIG_MARKERS
- mod->markers = (void *)sechdrs[markersindex].sh_addr;
- mod->num_markers =
- sechdrs[markersindex].sh_size / sizeof(*mod->markers);
-#endif
-#ifdef CONFIG_TRACEPOINTS
- mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr;
- mod->num_tracepoints =
- sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints);
-#endif
-
/* Find duplicate symbols */
err = verify_export_symbols(mod);
-
if (err < 0)
goto cleanup;
/* Set up and sort exception table */
- mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
- mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
- sort_extable(extable, extable + mod->num_exentries);
+ mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
+ sizeof(*mod->extable), &mod->num_exentries);
+ sort_extable(mod->extable, mod->extable + mod->num_exentries);
/* Finally, copy percpu area over. */
percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
@@ -2223,11 +2181,17 @@ static noinline struct module *load_module(void __user *umod,
add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
if (!mod->taints) {
+ struct mod_debug *debug;
+ unsigned int num_debug;
+
#ifdef CONFIG_MARKERS
marker_update_probe_range(mod->markers,
mod->markers + mod->num_markers);
#endif
- dynamic_printk_setup(sechdrs, verboseindex);
+ debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
+ sizeof(*debug), &num_debug);
+ dynamic_printk_setup(debug, num_debug);
+
#ifdef CONFIG_TRACEPOINTS
tracepoint_update_probe_range(mod->tracepoints,
mod->tracepoints + mod->num_tracepoints);
@@ -2235,8 +2199,9 @@ static noinline struct module *load_module(void __user *umod,
}
/* sechdrs[0].sh_size is always zero */
- mseg = (void *)sechdrs[mcountindex].sh_addr;
- ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size);
+ mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
+ sizeof(*mseg), &num_mcount);
+ ftrace_init_module(mseg, mseg + num_mcount);
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
@@ -2261,30 +2226,24 @@ static noinline struct module *load_module(void __user *umod,
set_fs(old_fs);
mod->args = args;
- if (obsparmindex)
+ if (section_addr(hdr, sechdrs, secstrings, "__obsparm"))
printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
mod->name);
/* Now sew it into the lists so we can get lockdep and oops
- * info during argument parsing. Noone should access us, since
- * strong_try_module_get() will fail. */
- stop_machine(__link_module, mod, NULL);
-
- /* Size of section 0 is 0, so this works well if no params */
- err = parse_args(mod->name, mod->args,
- (struct kernel_param *)
- sechdrs[setupindex].sh_addr,
- sechdrs[setupindex].sh_size
- / sizeof(struct kernel_param),
- NULL);
+ * info during argument parsing. Noone should access us, since
+ * strong_try_module_get() will fail.
+ * lockdep/oops can run asynchronous, so use the RCU list insertion
+ * function to insert in a way safe to concurrent readers.
+ * The mutex protects against concurrent writers.
+ */
+ list_add_rcu(&mod->list, &modules);
+
+ err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
if (err < 0)
goto unlink;
- err = mod_sysfs_setup(mod,
- (struct kernel_param *)
- sechdrs[setupindex].sh_addr,
- sechdrs[setupindex].sh_size
- / sizeof(struct kernel_param));
+ err = mod_sysfs_setup(mod, kp, num_kp);
if (err < 0)
goto unlink;
add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2473,7 +2432,7 @@ const char *module_address_lookup(unsigned long addr,
const char *ret = NULL;
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (within(addr, mod->module_init, mod->init_size)
|| within(addr, mod->module_core, mod->core_size)) {
if (modname)
@@ -2496,7 +2455,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
struct module *mod;
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (within(addr, mod->module_init, mod->init_size) ||
within(addr, mod->module_core, mod->core_size)) {
const char *sym;
@@ -2520,7 +2479,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
struct module *mod;
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (within(addr, mod->module_init, mod->init_size) ||
within(addr, mod->module_core, mod->core_size)) {
const char *sym;
@@ -2547,7 +2506,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
struct module *mod;
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (symnum < mod->num_symtab) {
*value = mod->symtab[symnum].st_value;
*type = mod->symtab[symnum].st_info;
@@ -2590,7 +2549,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
ret = mod_find_symname(mod, colon+1);
*colon = ':';
} else {
- list_for_each_entry(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list)
if ((ret = mod_find_symname(mod, name)) != 0)
break;
}
@@ -2599,23 +2558,6 @@ unsigned long module_kallsyms_lookup_name(const char *name)
}
#endif /* CONFIG_KALLSYMS */
-/* Called by the /proc file system to return a list of modules. */
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
- mutex_lock(&module_mutex);
- return seq_list_start(&modules, *pos);
-}
-
-static void *m_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &modules, pos);
-}
-
-static void m_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&module_mutex);
-}
-
static char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
@@ -2649,6 +2591,24 @@ static char *module_flags(struct module *mod, char *buf)
return buf;
}
+#ifdef CONFIG_PROC_FS
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&module_mutex);
+ return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&module_mutex);
+}
+
static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
@@ -2679,13 +2639,33 @@ static int m_show(struct seq_file *m, void *p)
Where refcount is a number or -, and deps is a comma-separated list
of depends or -.
*/
-const struct seq_operations modules_op = {
+static const struct seq_operations modules_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = m_show
};
+static int modules_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &modules_op);
+}
+
+static const struct file_operations proc_modules_operations = {
+ .open = modules_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+ proc_create("modules", 0, NULL, &proc_modules_operations);
+ return 0;
+}
+module_init(proc_modules_init);
+#endif
+
/* Given an address, look for it in the module exception tables. */
const struct exception_table_entry *search_module_extables(unsigned long addr)
{
@@ -2693,7 +2673,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
struct module *mod;
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (mod->num_exentries == 0)
continue;
@@ -2719,7 +2699,7 @@ int is_module_address(unsigned long addr)
preempt_disable();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (within(addr, mod->module_core, mod->core_size)) {
preempt_enable();
return 1;
@@ -2740,7 +2720,7 @@ struct module *__module_text_address(unsigned long addr)
if (addr < module_addr_min || addr > module_addr_max)
return NULL;
- list_for_each_entry(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list)
if (within(addr, mod->module_init, mod->init_text_size)
|| within(addr, mod->module_core, mod->core_text_size))
return mod;
@@ -2765,8 +2745,11 @@ void print_modules(void)
char buf[8];
printk("Modules linked in:");
- list_for_each_entry(mod, &modules, list)
+ /* Most callers should already have preempt disabled, but make sure */
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list)
printk(" %s%s", mod->name, module_flags(mod, buf));
+ preempt_enable();
if (last_unloaded_module[0])
printk(" [last unloaded: %s]", last_unloaded_module);
printk("\n");
diff --git a/kernel/panic.c b/kernel/panic.c
index bda561ef3cd..6513aac8e99 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,13 +34,6 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
-static int __init panic_setup(char *str)
-{
- panic_timeout = simple_strtoul(str, NULL, 0);
- return 1;
-}
-__setup("panic=", panic_setup);
-
static long no_blink(long time)
{
return 0;
@@ -218,13 +211,6 @@ void add_taint(unsigned flag)
}
EXPORT_SYMBOL(add_taint);
-static int __init pause_on_oops_setup(char *str)
-{
- pause_on_oops = simple_strtoul(str, NULL, 0);
- return 1;
-}
-__setup("pause_on_oops=", pause_on_oops_setup);
-
static void spin_msec(int msecs)
{
int i;
@@ -384,3 +370,6 @@ void __stack_chk_fail(void)
}
EXPORT_SYMBOL(__stack_chk_fail);
#endif
+
+core_param(panic, panic_timeout, int, 0644);
+core_param(pause_on_oops, pause_on_oops, int, 0644);
diff --git a/kernel/params.c b/kernel/params.c
index afc46a23eb6..a1e3025b19a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -373,6 +373,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
}
/* sysfs output in /sys/modules/XYZ/parameters/ */
+#define to_module_attr(n) container_of(n, struct module_attribute, attr);
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
extern struct kernel_param __start___param[], __stop___param[];
@@ -384,6 +386,7 @@ struct param_attribute
struct module_param_attrs
{
+ unsigned int num;
struct attribute_group grp;
struct param_attribute attrs[0];
};
@@ -434,93 +437,120 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
#ifdef CONFIG_SYSFS
/*
- * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
- * @mk: struct module_kobject (contains parent kobject)
- * @kparam: array of struct kernel_param, the actual parameter definitions
- * @num_params: number of entries in array
- * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules"
+ * add_sysfs_param - add a parameter to sysfs
+ * @mk: struct module_kobject
+ * @kparam: the actual parameter definition to add to sysfs
+ * @name: name of parameter
*
- * Create a kobject for a (per-module) group of parameters, and create files
- * in sysfs. A pointer to the param_kobject is returned on success,
- * NULL if there's no parameter to export, or other ERR_PTR(err).
+ * Create a kobject if for a (per-module) parameter if mp NULL, and
+ * create file in sysfs. Returns an error on out of memory. Always cleans up
+ * if there's an error.
*/
-static __modinit struct module_param_attrs *
-param_sysfs_setup(struct module_kobject *mk,
- struct kernel_param *kparam,
- unsigned int num_params,
- unsigned int name_skip)
+static __modinit int add_sysfs_param(struct module_kobject *mk,
+ struct kernel_param *kp,
+ const char *name)
{
- struct module_param_attrs *mp;
- unsigned int valid_attrs = 0;
- unsigned int i, size[2];
- struct param_attribute *pattr;
- struct attribute **gattr;
- int err;
-
- for (i=0; i<num_params; i++) {
- if (kparam[i].perm)
- valid_attrs++;
+ struct module_param_attrs *new;
+ struct attribute **attrs;
+ int err, num;
+
+ /* We don't bother calling this with invisible parameters. */
+ BUG_ON(!kp->perm);
+
+ if (!mk->mp) {
+ num = 0;
+ attrs = NULL;
+ } else {
+ num = mk->mp->num;
+ attrs = mk->mp->grp.attrs;
}
- if (!valid_attrs)
- return NULL;
-
- size[0] = ALIGN(sizeof(*mp) +
- valid_attrs * sizeof(mp->attrs[0]),
- sizeof(mp->grp.attrs[0]));
- size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
-
- mp = kzalloc(size[0] + size[1], GFP_KERNEL);
- if (!mp)
- return ERR_PTR(-ENOMEM);
-
- mp->grp.name = "parameters";
- mp->grp.attrs = (void *)mp + size[0];
-
- pattr = &mp->attrs[0];
- gattr = &mp->grp.attrs[0];
- for (i = 0; i < num_params; i++) {
- struct kernel_param *kp = &kparam[i];
- if (kp->perm) {
- pattr->param = kp;
- pattr->mattr.show = param_attr_show;
- pattr->mattr.store = param_attr_store;
- pattr->mattr.attr.name = (char *)&kp->name[name_skip];
- pattr->mattr.attr.mode = kp->perm;
- *(gattr++) = &(pattr++)->mattr.attr;
- }
+ /* Enlarge. */
+ new = krealloc(mk->mp,
+ sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
+ GFP_KERNEL);
+ if (!new) {
+ kfree(mk->mp);
+ err = -ENOMEM;
+ goto fail;
}
- *gattr = NULL;
-
- if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) {
- kfree(mp);
- return ERR_PTR(err);
+ attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
+ if (!attrs) {
+ err = -ENOMEM;
+ goto fail_free_new;
}
- return mp;
+
+ /* Sysfs wants everything zeroed. */
+ memset(new, 0, sizeof(*new));
+ memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
+ memset(&attrs[num], 0, sizeof(attrs[num]));
+ new->grp.name = "parameters";
+ new->grp.attrs = attrs;
+
+ /* Tack new one on the end. */
+ new->attrs[num].param = kp;
+ new->attrs[num].mattr.show = param_attr_show;
+ new->attrs[num].mattr.store = param_attr_store;
+ new->attrs[num].mattr.attr.name = (char *)name;
+ new->attrs[num].mattr.attr.mode = kp->perm;
+ new->num = num+1;
+
+ /* Fix up all the pointers, since krealloc can move us */
+ for (num = 0; num < new->num; num++)
+ new->grp.attrs[num] = &new->attrs[num].mattr.attr;
+ new->grp.attrs[num] = NULL;
+
+ mk->mp = new;
+ return 0;
+
+fail_free_new:
+ kfree(new);
+fail:
+ mk->mp = NULL;
+ return err;
}
#ifdef CONFIG_MODULES
+static void free_module_param_attrs(struct module_kobject *mk)
+{
+ kfree(mk->mp->grp.attrs);
+ kfree(mk->mp);
+ mk->mp = NULL;
+}
+
/*
* module_param_sysfs_setup - setup sysfs support for one module
* @mod: module
* @kparam: module parameters (array)
* @num_params: number of module parameters
*
- * Adds sysfs entries for module parameters, and creates a link from
- * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/
+ * Adds sysfs entries for module parameters under
+ * /sys/module/[mod->name]/parameters/
*/
int module_param_sysfs_setup(struct module *mod,
struct kernel_param *kparam,
unsigned int num_params)
{
- struct module_param_attrs *mp;
+ int i, err;
+ bool params = false;
+
+ for (i = 0; i < num_params; i++) {
+ if (kparam[i].perm == 0)
+ continue;
+ err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
+ if (err)
+ return err;
+ params = true;
+ }
- mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0);
- if (IS_ERR(mp))
- return PTR_ERR(mp);
+ if (!params)
+ return 0;
- mod->param_attrs = mp;
- return 0;
+ /* Create the param group. */
+ err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
+ if (err)
+ free_module_param_attrs(&mod->mkobj);
+ return err;
}
/*
@@ -532,43 +562,55 @@ int module_param_sysfs_setup(struct module *mod,
*/
void module_param_sysfs_remove(struct module *mod)
{
- if (mod->param_attrs) {
- sysfs_remove_group(&mod->mkobj.kobj,
- &mod->param_attrs->grp);
+ if (mod->mkobj.mp) {
+ sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
/* We are positive that no one is using any param
* attrs at this point. Deallocate immediately. */
- kfree(mod->param_attrs);
- mod->param_attrs = NULL;
+ free_module_param_attrs(&mod->mkobj);
}
}
#endif
-/*
- * kernel_param_sysfs_setup - wrapper for built-in params support
- */
-static void __init kernel_param_sysfs_setup(const char *name,
- struct kernel_param *kparam,
- unsigned int num_params,
- unsigned int name_skip)
+static void __init kernel_add_sysfs_param(const char *name,
+ struct kernel_param *kparam,
+ unsigned int name_skip)
{
struct module_kobject *mk;
- int ret;
+ struct kobject *kobj;
+ int err;
- mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
- BUG_ON(!mk);
-
- mk->mod = THIS_MODULE;
- mk->kobj.kset = module_kset;
- ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
- if (ret) {
- kobject_put(&mk->kobj);
- printk(KERN_ERR "Module '%s' failed to be added to sysfs, "
- "error number %d\n", name, ret);
- printk(KERN_ERR "The system will be unstable now.\n");
- return;
+ kobj = kset_find_obj(module_kset, name);
+ if (kobj) {
+ /* We already have one. Remove params so we can add more. */
+ mk = to_module_kobject(kobj);
+ /* We need to remove it before adding parameters. */
+ sysfs_remove_group(&mk->kobj, &mk->mp->grp);
+ } else {
+ mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+ BUG_ON(!mk);
+
+ mk->mod = THIS_MODULE;
+ mk->kobj.kset = module_kset;
+ err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
+ "%s", name);
+ if (err) {
+ kobject_put(&mk->kobj);
+ printk(KERN_ERR "Module '%s' failed add to sysfs, "
+ "error number %d\n", name, err);
+ printk(KERN_ERR "The system will be unstable now.\n");
+ return;
+ }
+ /* So that exit path is even. */
+ kobject_get(&mk->kobj);
}
- param_sysfs_setup(mk, kparam, num_params, name_skip);
+
+ /* These should not fail at boot. */
+ err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
+ BUG_ON(err);
+ err = sysfs_create_group(&mk->kobj, &mk->mp->grp);
+ BUG_ON(err);
kobject_uevent(&mk->kobj, KOBJ_ADD);
+ kobject_put(&mk->kobj);
}
/*
@@ -579,60 +621,36 @@ static void __init kernel_param_sysfs_setup(const char *name,
* The "module" name (KBUILD_MODNAME) is stored before a dot, the
* "parameter" name is stored behind a dot in kernel_param->name. So,
* extract the "module" name for all built-in kernel_param-eters,
- * and for all who have the same, call kernel_param_sysfs_setup.
+ * and for all who have the same, call kernel_add_sysfs_param.
*/
static void __init param_sysfs_builtin(void)
{
- struct kernel_param *kp, *kp_begin = NULL;
- unsigned int i, name_len, count = 0;
- char modname[MODULE_NAME_LEN + 1] = "";
+ struct kernel_param *kp;
+ unsigned int name_len;
+ char modname[MODULE_NAME_LEN];
- for (i=0; i < __stop___param - __start___param; i++) {
+ for (kp = __start___param; kp < __stop___param; kp++) {
char *dot;
- size_t max_name_len;
- kp = &__start___param[i];
- max_name_len =
- min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
+ if (kp->perm == 0)
+ continue;
- dot = memchr(kp->name, '.', max_name_len);
+ dot = strchr(kp->name, '.');
if (!dot) {
- DEBUGP("couldn't find period in first %d characters "
- "of %s\n", MODULE_NAME_LEN, kp->name);
- continue;
- }
- name_len = dot - kp->name;
-
- /* new kbuild_modname? */
- if (strlen(modname) != name_len
- || strncmp(modname, kp->name, name_len) != 0) {
- /* add a new kobject for previous kernel_params. */
- if (count)
- kernel_param_sysfs_setup(modname,
- kp_begin,
- count,
- strlen(modname)+1);
-
- strncpy(modname, kp->name, name_len);
- modname[name_len] = '\0';
- count = 0;
- kp_begin = kp;
+ /* This happens for core_param() */
+ strcpy(modname, "kernel");
+ name_len = 0;
+ } else {
+ name_len = dot - kp->name + 1;
+ strlcpy(modname, kp->name, name_len);
}
- count++;
+ kernel_add_sysfs_param(modname, kp, name_len);
}
-
- /* last kernel_params need to be registered as well */
- if (count)
- kernel_param_sysfs_setup(modname, kp_begin, count,
- strlen(modname)+1);
}
/* module-related sysfs stuff */
-#define to_module_attr(n) container_of(n, struct module_attribute, attr);
-#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
-
static ssize_t module_attr_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index b931d7cedbf..5e79c662294 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -639,7 +639,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
- remaining = ktime_sub(timer->expires, now);
+ remaining = ktime_sub(hrtimer_get_expires(timer), now);
/* Return 0 only, when the timer is expired and not pending */
if (remaining.tv64 <= 0) {
/*
@@ -733,7 +733,7 @@ common_timer_set(struct k_itimer *timr, int flags,
hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
timr->it.real.timer.function = posix_timer_fn;
- timer->expires = timespec_to_ktime(new_setting->it_value);
+ hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
/* Convert interval */
timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
@@ -742,14 +742,12 @@ common_timer_set(struct k_itimer *timr, int flags,
if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
/* Setup correct expiry time for relative timers */
if (mode == HRTIMER_MODE_REL) {
- timer->expires =
- ktime_add_safe(timer->expires,
- timer->base->get_time());
+ hrtimer_add_expires(timer, timer->base->get_time());
}
return 0;
}
- hrtimer_start(timer, timer->expires, mode);
+ hrtimer_start_expires(timer, mode);
return 0;
}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 331f9836383..c9d74083746 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -651,7 +651,7 @@ static int software_resume(void)
pr_debug("PM: Preparing processes for restore.\n");
error = prepare_processes();
if (error) {
- swsusp_close();
+ swsusp_close(FMODE_READ);
goto Done;
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index acc0c101dbd..46b5ec7a3af 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -153,7 +153,7 @@ extern int swsusp_shrink_memory(void);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
-extern void swsusp_close(void);
+extern void swsusp_close(fmode_t);
struct timeval;
/* kernel/power/swsusp.c */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 80ccac849e4..b7713b53d07 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -172,13 +172,13 @@ static int swsusp_swap_check(void) /* This is called before saving image */
return res;
root_swap = res;
- res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR);
+ res = blkdev_get(resume_bdev, FMODE_WRITE);
if (res)
return res;
res = set_blocksize(resume_bdev, PAGE_SIZE);
if (res < 0)
- blkdev_put(resume_bdev);
+ blkdev_put(resume_bdev, FMODE_WRITE);
return res;
}
@@ -426,7 +426,7 @@ int swsusp_write(unsigned int flags)
release_swap_writer(&handle);
out:
- swsusp_close();
+ swsusp_close(FMODE_WRITE);
return error;
}
@@ -574,7 +574,7 @@ int swsusp_read(unsigned int *flags_p)
error = load_image(&handle, &snapshot, header->pages - 1);
release_swap_reader(&handle);
- blkdev_put(resume_bdev);
+ blkdev_put(resume_bdev, FMODE_READ);
if (!error)
pr_debug("PM: Image successfully loaded\n");
@@ -609,7 +609,7 @@ int swsusp_check(void)
return -EINVAL;
}
if (error)
- blkdev_put(resume_bdev);
+ blkdev_put(resume_bdev, FMODE_READ);
else
pr_debug("PM: Signature found, resuming\n");
} else {
@@ -626,14 +626,14 @@ int swsusp_check(void)
* swsusp_close - close swap device.
*/
-void swsusp_close(void)
+void swsusp_close(fmode_t mode)
{
if (IS_ERR(resume_bdev)) {
pr_debug("PM: Image device not initialised\n");
return;
}
- blkdev_put(resume_bdev);
+ blkdev_put(resume_bdev, mode); /* move up */
}
static int swsusp_header_init(void)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 467d5940f62..ad63af8b252 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -119,18 +119,19 @@ static void _rcu_barrier(enum rcu_barrier type)
/* Take cpucontrol mutex to protect against CPU hotplug */
mutex_lock(&rcu_barrier_mutex);
init_completion(&rcu_barrier_completion);
- atomic_set(&rcu_barrier_cpu_count, 0);
/*
- * The queueing of callbacks in all CPUs must be atomic with
- * respect to RCU, otherwise one CPU may queue a callback,
- * wait for a grace period, decrement barrier count and call
- * complete(), while other CPUs have not yet queued anything.
- * So, we need to make sure that grace periods cannot complete
- * until all the callbacks are queued.
+ * Initialize rcu_barrier_cpu_count to 1, then invoke
+ * rcu_barrier_func() on each CPU, so that each CPU also has
+ * incremented rcu_barrier_cpu_count. Only then is it safe to
+ * decrement rcu_barrier_cpu_count -- otherwise the first CPU
+ * might complete its grace period before all of the other CPUs
+ * did their increment, causing this function to return too
+ * early.
*/
- rcu_read_lock();
+ atomic_set(&rcu_barrier_cpu_count, 1);
on_each_cpu(rcu_barrier_func, (void *)type, 1);
- rcu_read_unlock();
+ if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+ complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
}
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 6522ae5b14a..69d9cb921ff 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -631,8 +631,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
/* Setup the timer, when timeout != NULL */
if (unlikely(timeout)) {
- hrtimer_start(&timeout->timer, timeout->timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
if (!hrtimer_active(&timeout->timer))
timeout->task = NULL;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index d906f72b42d..e8819bc6f46 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
+#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
@@ -227,9 +228,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start(&rt_b->rt_period_timer,
- rt_b->rt_period_timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -386,7 +386,6 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
- u64 pair_start;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
@@ -819,6 +818,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
unsigned int sysctl_sched_shares_ratelimit = 250000;
/*
+ * Inject some fuzzyness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
*/
@@ -1064,7 +1070,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
- timer->expires = time;
+ hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
hrtimer_restart(timer);
@@ -1454,8 +1460,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
* Calculate and set the cpu's group shares.
*/
static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares, unsigned long sd_rq_weight)
+update_group_shares_cpu(struct task_group *tg, int cpu,
+ unsigned long sd_shares, unsigned long sd_rq_weight)
{
int boost = 0;
unsigned long shares;
@@ -1486,19 +1492,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
*
*/
shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
- /*
- * record the actual number of shares, not the boosted amount.
- */
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- tg->cfs_rq[cpu]->rq_weight = rq_weight;
+ if (abs(shares - tg->se[cpu]->load.weight) >
+ sysctl_sched_shares_thresh) {
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
+ spin_lock_irqsave(&rq->lock, flags);
+ /*
+ * record the actual number of shares, not the boosted amount.
+ */
+ tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+ tg->cfs_rq[cpu]->rq_weight = rq_weight;
- __set_se_shares(tg->se[cpu], shares);
+ __set_se_shares(tg->se[cpu], shares);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
}
/*
@@ -1527,14 +1537,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
if (!rq_weight)
rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
- for_each_cpu_mask(i, sd->span) {
- struct rq *rq = cpu_rq(i);
- unsigned long flags;
-
- spin_lock_irqsave(&rq->lock, flags);
- __update_group_shares_cpu(tg, i, shares, rq_weight);
- spin_unlock_irqrestore(&rq->lock, flags);
- }
+ for_each_cpu_mask(i, sd->span)
+ update_group_shares_cpu(tg, i, shares, rq_weight);
return 0;
}
@@ -3339,7 +3343,7 @@ small_imbalance:
} else
this_load_per_task = cpu_avg_load_per_task(this_cpu);
- if (max_load - this_load + 2*busiest_load_per_task >=
+ if (max_load - this_load + busiest_load_per_task >=
busiest_load_per_task * imbn) {
*imbalance = busiest_load_per_task;
return busiest;
@@ -4443,12 +4447,8 @@ need_resched_nonpreemptible:
if (sched_feat(HRTICK))
hrtick_clear(rq);
- /*
- * Do the rq-clock update outside the rq lock:
- */
- local_irq_disable();
+ spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- spin_lock(&rq->lock);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f604dae7131..ce514afd78f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+static const struct sched_class fair_sched_class;
+
/**************************************************************
* CFS operations on generic schedulable entities:
*/
@@ -141,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+ int depth = 0;
+
+ for_each_sched_entity(se)
+ depth++;
+
+ return depth;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+ int se_depth, pse_depth;
+
+ /*
+ * preemption test can be made between sibling entities who are in the
+ * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+ * both tasks until we find their ancestors who are siblings of common
+ * parent.
+ */
+
+ /* First walk up until both entities are at same depth */
+ se_depth = depth_se(*se);
+ pse_depth = depth_se(*pse);
+
+ while (se_depth > pse_depth) {
+ se_depth--;
+ *se = parent_entity(*se);
+ }
+
+ while (pse_depth > se_depth) {
+ pse_depth--;
+ *pse = parent_entity(*pse);
+ }
+
+ while (!is_same_group(*se, *pse)) {
+ *se = parent_entity(*se);
+ *pse = parent_entity(*pse);
+ }
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -191,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return NULL;
}
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -221,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
return se->vruntime - cfs_rq->min_vruntime;
}
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+ u64 vruntime = cfs_rq->min_vruntime;
+
+ if (cfs_rq->curr)
+ vruntime = cfs_rq->curr->vruntime;
+
+ if (cfs_rq->rb_leftmost) {
+ struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+ struct sched_entity,
+ run_node);
+
+ if (vruntime == cfs_rq->min_vruntime)
+ vruntime = se->vruntime;
+ else
+ vruntime = min_vruntime(vruntime, se->vruntime);
+ }
+
+ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
/*
* Enqueue an entity into the rb-tree:
*/
@@ -254,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
- if (leftmost) {
+ if (leftmost)
cfs_rq->rb_leftmost = &se->run_node;
- /*
- * maintain cfs_rq->min_vruntime to be a monotonic increasing
- * value tracking the leftmost vruntime in the tree.
- */
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime, se->vruntime);
- }
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,18 +336,9 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->rb_leftmost == &se->run_node) {
struct rb_node *next_node;
- struct sched_entity *next;
next_node = rb_next(&se->run_node);
cfs_rq->rb_leftmost = next_node;
-
- if (next_node) {
- next = rb_entry(next_node,
- struct sched_entity, run_node);
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime,
- next->vruntime);
- }
}
if (cfs_rq->next == se)
@@ -334,7 +389,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
#endif
/*
- * delta *= w / rw
+ * delta *= P[w / rw]
*/
static inline unsigned long
calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +403,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se)
}
/*
- * delta *= rw / w
+ * delta /= w
*/
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
- for_each_sched_entity(se) {
- delta = calc_delta_mine(delta,
- cfs_rq_of(se)->load.weight, &se->load);
- }
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
return delta;
}
@@ -386,26 +439,26 @@ static u64 __sched_period(unsigned long nr_running)
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
*
- * s = p*w/rw
+ * s = p*P[w/rw]
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+ unsigned long nr_running = cfs_rq->nr_running;
+
+ if (unlikely(!se->on_rq))
+ nr_running++;
+
+ return calc_delta_weight(__sched_period(nr_running), se);
}
/*
* We calculate the vruntime slice of a to be inserted task
*
- * vs = s*rw/w = p
+ * vs = s/w
*/
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned long nr_running = cfs_rq->nr_running;
-
- if (!se->on_rq)
- nr_running++;
-
- return __sched_period(nr_running);
+ return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
/*
@@ -424,6 +477,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
+ update_min_vruntime(cfs_rq);
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -613,13 +667,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
- u64 vruntime;
-
- if (first_fair(cfs_rq)) {
- vruntime = min_vruntime(cfs_rq->min_vruntime,
- __pick_next_entity(cfs_rq)->vruntime);
- } else
- vruntime = cfs_rq->min_vruntime;
+ u64 vruntime = cfs_rq->min_vruntime;
/*
* The 'current' period is already promised to the current tasks,
@@ -628,7 +676,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
* stays open at the end.
*/
if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice_add(cfs_rq, se);
+ vruntime += sched_vslice(cfs_rq, se);
if (!initial) {
/* sleeps upto a single latency don't count. */
@@ -696,6 +744,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
+ update_min_vruntime(cfs_rq);
}
/*
@@ -742,16 +791,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
static struct sched_entity *
pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rq *rq = rq_of(cfs_rq);
- u64 pair_slice = rq->clock - cfs_rq->pair_start;
-
- if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
- cfs_rq->pair_start = rq->clock;
+ if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1)
return se;
- }
return cfs_rq->next;
}
@@ -849,11 +896,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
hrtick_start(rq, delta);
}
}
+
+/*
+ * called from enqueue/dequeue and updates the hrtick when the
+ * current task is from our class and nr_running is low enough
+ * to matter.
+ */
+static void hrtick_update(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+
+ if (curr->sched_class != &fair_sched_class)
+ return;
+
+ if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+ hrtick_start_fair(rq, curr);
+}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
+
+static inline void hrtick_update(struct rq *rq)
+{
+}
#endif
/*
@@ -874,7 +941,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
wakeup = 1;
}
- hrtick_start_fair(rq, rq->curr);
+ hrtick_update(rq);
}
/*
@@ -896,7 +963,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
sleep = 1;
}
- hrtick_start_fair(rq, rq->curr);
+ hrtick_update(rq);
}
/*
@@ -1002,8 +1069,6 @@ static inline int wake_idle(int cpu, struct task_struct *p)
#ifdef CONFIG_SMP
-static const struct sched_class fair_sched_class;
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* effective_load() calculates the load change as seen from the root_task_group
@@ -1104,10 +1169,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
- if (!sync && sched_feat(SYNC_WAKEUPS) &&
- curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost)
- sync = 1;
+ if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+ p->se.avg_overlap > sysctl_sched_migration_cost))
+ sync = 0;
/*
* If sync wakeup then subtract the (maximum possible)
@@ -1226,13 +1290,42 @@ static unsigned long wakeup_gran(struct sched_entity *se)
* More easily preempt - nice tasks, while not making it harder for
* + nice tasks.
*/
- if (sched_feat(ASYM_GRAN))
- gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
+ if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
+ gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
return gran;
}
/*
+ * Should 'se' preempt 'curr'.
+ *
+ * |s1
+ * |s2
+ * |s3
+ * g
+ * |<--->|c
+ *
+ * w(c, s1) = -1
+ * w(c, s2) = 0
+ * w(c, s3) = 1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+ s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+ if (vdiff <= 0)
+ return -1;
+
+ gran = wakeup_gran(curr);
+ if (vdiff > gran)
+ return 1;
+
+ return 0;
+}
+
+/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
@@ -1240,7 +1333,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se, *pse = &p->se;
- s64 delta_exec;
if (unlikely(rt_prio(p->prio))) {
update_rq_clock(rq);
@@ -1278,9 +1370,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
return;
}
- delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
- if (delta_exec > wakeup_gran(pse))
- resched_task(curr);
+ find_matching_se(&se, &pse);
+
+ while (se) {
+ BUG_ON(!pse);
+
+ if (wakeup_preempt_entity(se, pse) == 1) {
+ resched_task(curr);
+ break;
+ }
+
+ se = parent_entity(se);
+ pse = parent_entity(pse);
+ }
}
static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1576,9 +1678,6 @@ static const struct sched_class fair_sched_class = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_fair,
-#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_wakeup,
@@ -1586,6 +1685,8 @@ static const struct sched_class fair_sched_class = {
.put_prev_task = put_prev_task_fair,
#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_fair,
+
.load_balance = load_balance_fair,
.move_one_task = move_one_task_fair,
#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 7c9e8f4a049..fda01621829 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -5,7 +5,7 @@ SCHED_FEAT(START_DEBIT, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)
SCHED_FEAT(CACHE_HOT_BUDDY, 1)
SCHED_FEAT(SYNC_WAKEUPS, 1)
-SCHED_FEAT(HRTICK, 1)
+SCHED_FEAT(HRTICK, 0)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(ASYM_GRAN, 1)
SCHED_FEAT(LB_BIAS, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index dec4ccabe2f..8a21a2e28c1 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -105,9 +105,6 @@ static const struct sched_class idle_sched_class = {
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_idle,
-#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_curr_idle,
@@ -115,6 +112,8 @@ static const struct sched_class idle_sched_class = {
.put_prev_task = put_prev_task_idle,
#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_idle,
+
.load_balance = load_balance_idle,
.move_one_task = move_one_task_idle,
#endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b446dc87494..d9ba9d5f99d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1504,9 +1504,6 @@ static const struct sched_class rt_sched_class = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_rt,
-#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_curr_rt,
@@ -1514,6 +1511,8 @@ static const struct sched_class rt_sched_class = {
.put_prev_task = put_prev_task_rt,
#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_rt,
+
.load_balance = load_balance_rt,
.move_one_task = move_one_task_rt,
.set_cpus_allowed = set_cpus_allowed_rt,
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index b8c156979cf..ee71bec1da6 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -9,7 +9,7 @@
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
- int mask_len = NR_CPUS/32 * 9;
+ int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
char *mask_str = kmalloc(mask_len, GFP_KERNEL);
if (mask_str == NULL)
@@ -90,13 +90,20 @@ static int schedstat_open(struct inode *inode, struct file *file)
return res;
}
-const struct file_operations proc_schedstat_operations = {
+static const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
+static int __init proc_schedstat_init(void)
+{
+ proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+ return 0;
+}
+module_init(proc_schedstat_init);
+
/*
* Expects runqueue lock to be held for atomicity of update
*/
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index af3c7cea258..9bc4c00872c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -37,9 +37,13 @@ struct stop_machine_data {
/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
static unsigned int num_threads;
static atomic_t thread_ack;
-static struct completion finished;
static DEFINE_MUTEX(lock);
+static struct workqueue_struct *stop_machine_wq;
+static struct stop_machine_data active, idle;
+static const cpumask_t *active_cpus;
+static void *stop_machine_work;
+
static void set_state(enum stopmachine_state newstate)
{
/* Reset ack counter. */
@@ -51,21 +55,26 @@ static void set_state(enum stopmachine_state newstate)
/* Last one to ack a state moves to the next state. */
static void ack_state(void)
{
- if (atomic_dec_and_test(&thread_ack)) {
- /* If we're the last one to ack the EXIT, we're finished. */
- if (state == STOPMACHINE_EXIT)
- complete(&finished);
- else
- set_state(state + 1);
- }
+ if (atomic_dec_and_test(&thread_ack))
+ set_state(state + 1);
}
-/* This is the actual thread which stops the CPU. It exits by itself rather
- * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
-static int stop_cpu(struct stop_machine_data *smdata)
+/* This is the actual function which stops the CPU. It runs
+ * in the context of a dedicated stopmachine workqueue. */
+static void stop_cpu(struct work_struct *unused)
{
enum stopmachine_state curstate = STOPMACHINE_NONE;
-
+ struct stop_machine_data *smdata = &idle;
+ int cpu = smp_processor_id();
+ int err;
+
+ if (!active_cpus) {
+ if (cpu == first_cpu(cpu_online_map))
+ smdata = &active;
+ } else {
+ if (cpu_isset(cpu, *active_cpus))
+ smdata = &active;
+ }
/* Simple state machine */
do {
/* Chill out and ensure we re-read stopmachine_state. */
@@ -78,9 +87,11 @@ static int stop_cpu(struct stop_machine_data *smdata)
hard_irq_disable();
break;
case STOPMACHINE_RUN:
- /* |= allows error detection if functions on
- * multiple CPUs. */
- smdata->fnret |= smdata->fn(smdata->data);
+ /* On multiple CPUs only a single error code
+ * is needed to tell that something failed. */
+ err = smdata->fn(smdata->data);
+ if (err)
+ smdata->fnret = err;
break;
default:
break;
@@ -90,7 +101,6 @@ static int stop_cpu(struct stop_machine_data *smdata)
} while (curstate != STOPMACHINE_EXIT);
local_irq_enable();
- do_exit(0);
}
/* Callback for CPUs which aren't supposed to do anything. */
@@ -101,78 +111,34 @@ static int chill(void *unused)
int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
{
- int i, err;
- struct stop_machine_data active, idle;
- struct task_struct **threads;
+ struct work_struct *sm_work;
+ int i;
+ /* Set up initial state. */
+ mutex_lock(&lock);
+ num_threads = num_online_cpus();
+ active_cpus = cpus;
active.fn = fn;
active.data = data;
active.fnret = 0;
idle.fn = chill;
idle.data = NULL;
- /* This could be too big for stack on large machines. */
- threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
- if (!threads)
- return -ENOMEM;
-
- /* Set up initial state. */
- mutex_lock(&lock);
- init_completion(&finished);
- num_threads = num_online_cpus();
set_state(STOPMACHINE_PREPARE);
- for_each_online_cpu(i) {
- struct stop_machine_data *smdata = &idle;
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-
- if (!cpus) {
- if (i == first_cpu(cpu_online_map))
- smdata = &active;
- } else {
- if (cpu_isset(i, *cpus))
- smdata = &active;
- }
-
- threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u",
- i);
- if (IS_ERR(threads[i])) {
- err = PTR_ERR(threads[i]);
- threads[i] = NULL;
- goto kill_threads;
- }
-
- /* Place it onto correct cpu. */
- kthread_bind(threads[i], i);
-
- /* Make it highest prio. */
- if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param))
- BUG();
- }
-
- /* We've created all the threads. Wake them all: hold this CPU so one
+ /* Schedule the stop_cpu work on all cpus: hold this CPU so one
* doesn't hit this CPU until we're ready. */
get_cpu();
- for_each_online_cpu(i)
- wake_up_process(threads[i]);
-
+ for_each_online_cpu(i) {
+ sm_work = percpu_ptr(stop_machine_work, i);
+ INIT_WORK(sm_work, stop_cpu);
+ queue_work_on(i, stop_machine_wq, sm_work);
+ }
/* This will release the thread on our CPU. */
put_cpu();
- wait_for_completion(&finished);
+ flush_workqueue(stop_machine_wq);
mutex_unlock(&lock);
-
- kfree(threads);
-
return active.fnret;
-
-kill_threads:
- for_each_online_cpu(i)
- if (threads[i])
- kthread_stop(threads[i]);
- mutex_unlock(&lock);
-
- kfree(threads);
- return err;
}
int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
@@ -187,3 +153,11 @@ int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
return ret;
}
EXPORT_SYMBOL_GPL(stop_machine);
+
+static int __init stop_machine_init(void)
+{
+ stop_machine_wq = create_rt_workqueue("kstop");
+ stop_machine_work = alloc_percpu(struct work_struct);
+ return 0;
+}
+core_initcall(stop_machine_init);
diff --git a/kernel/sys.c b/kernel/sys.c
index 53879cdae48..31deba8f7d1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1716,6 +1716,16 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
case PR_SET_TSC:
error = SET_TSC_CTL(arg2);
break;
+ case PR_GET_TIMERSLACK:
+ error = current->timer_slack_ns;
+ break;
+ case PR_SET_TIMERSLACK:
+ if (arg2 <= 0)
+ current->timer_slack_ns =
+ current->default_timer_slack_ns;
+ else
+ current->timer_slack_ns = arg2;
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b3cc73931d1..a13bd4dfaeb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -276,6 +276,16 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_shares_thresh",
+ .data = &sysctl_sched_shares_thresh,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.maxlen = sizeof(unsigned int),
diff --git a/kernel/time.c b/kernel/time.c
index 6a08660b4fa..d63a4336fad 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -669,3 +669,21 @@ EXPORT_SYMBOL(get_jiffies_64);
#endif
EXPORT_SYMBOL(jiffies);
+
+/*
+ * Add two timespec values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0)
+ */
+struct timespec timespec_add_safe(const struct timespec lhs,
+ const struct timespec rhs)
+{
+ struct timespec res;
+
+ set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
+ lhs.tv_nsec + rhs.tv_nsec);
+
+ if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
+ res.tv_sec = TIME_T_MAX;
+
+ return res;
+}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 1a20715bfd6..8ff15e5d486 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -142,8 +142,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
time_state = TIME_OOP;
printk(KERN_NOTICE "Clock: "
"inserting leap second 23:59:60 UTC\n");
- leap_timer.expires = ktime_add_ns(leap_timer.expires,
- NSEC_PER_SEC);
+ hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
res = HRTIMER_RESTART;
break;
case TIME_DEL:
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0581c11fe6c..5bbb1044f84 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -300,7 +300,7 @@ void tick_nohz_stop_sched_tick(int inidle)
goto out;
}
- ts->idle_tick = ts->sched_timer.expires;
+ ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
rcu_enter_nohz();
@@ -380,21 +380,21 @@ ktime_t tick_nohz_get_sleep_length(void)
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
hrtimer_cancel(&ts->sched_timer);
- ts->sched_timer.expires = ts->idle_tick;
+ hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
while (1) {
/* Forward the time to expire in the future */
hrtimer_forward(&ts->sched_timer, now, tick_period);
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer,
- ts->sched_timer.expires,
+ hrtimer_start_expires(&ts->sched_timer,
HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
} else {
- if (!tick_program_event(ts->sched_timer.expires, 0))
+ if (!tick_program_event(
+ hrtimer_get_expires(&ts->sched_timer), 0))
break;
}
/* Update jiffies and reread time */
@@ -456,14 +456,16 @@ void tick_nohz_restart_sched_tick(void)
*/
ts->tick_stopped = 0;
ts->idle_exittime = now;
+
tick_nohz_restart(ts, now);
+
local_irq_enable();
}
static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
hrtimer_forward(&ts->sched_timer, now, tick_period);
- return tick_program_event(ts->sched_timer.expires, 0);
+ return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
}
/*
@@ -542,7 +544,7 @@ static void tick_nohz_switch_to_nohz(void)
next = tick_init_jiffy_update();
for (;;) {
- ts->sched_timer.expires = next;
+ hrtimer_set_expires(&ts->sched_timer, next);
if (!tick_program_event(next, 0))
break;
next = ktime_add(next, tick_period);
@@ -567,11 +569,21 @@ static void tick_nohz_switch_to_nohz(void)
static void tick_nohz_kick_tick(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t delta, now;
if (!ts->tick_stopped)
return;
- tick_nohz_restart(ts, ktime_get());
+ /*
+ * Do not touch the tick device, when the next expiry is either
+ * already reached or less/equal than the tick period.
+ */
+ now = ktime_get();
+ delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
+ if (delta.tv64 <= tick_period.tv64)
+ return;
+
+ tick_nohz_restart(ts, now);
}
#else
@@ -668,16 +680,15 @@ void tick_setup_sched_timer(void)
ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
/* Get the next period (per cpu) */
- ts->sched_timer.expires = tick_init_jiffy_update();
+ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
offset = ktime_to_ns(tick_period) >> 1;
do_div(offset, num_possible_cpus());
offset *= smp_processor_id();
- ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
+ hrtimer_add_expires_ns(&ts->sched_timer, offset);
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index f6426911e35..a999b92a127 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -66,9 +66,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
#endif
SEQ_printf(m, "\n");
- SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n",
- (unsigned long long)ktime_to_ns(timer->expires),
- (long long)(ktime_to_ns(timer->expires) - now));
+ SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
+ (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
+ (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
+ (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
+ (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
}
static void
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 714afad4653..f928f2a87b9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -62,6 +62,7 @@ struct workqueue_struct {
const char *name;
int singlethread;
int freezeable; /* Freeze threads during suspend */
+ int rt;
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
@@ -766,6 +767,7 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
struct workqueue_struct *wq = cwq->wq;
const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
struct task_struct *p;
@@ -781,7 +783,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
*/
if (IS_ERR(p))
return PTR_ERR(p);
-
+ if (cwq->wq->rt)
+ sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
cwq->thread = p;
return 0;
@@ -801,6 +804,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
struct workqueue_struct *__create_workqueue_key(const char *name,
int singlethread,
int freezeable,
+ int rt,
struct lock_class_key *key,
const char *lock_name)
{
@@ -822,6 +826,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
wq->singlethread = singlethread;
wq->freezeable = freezeable;
+ wq->rt = rt;
INIT_LIST_HEAD(&wq->list);
if (singlethread) {