aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched.c
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-12-31 08:31:57 +0100
committerIngo Molnar <mingo@elte.hu>2008-12-31 08:31:57 +0100
commita9de18eb761f7c1c860964b2e5addc1a35c7e861 (patch)
tree886e75fdfd09690cd262ca69cb7f5d1d42b48602 /kernel/sched.c
parentb2aaf8f74cdc84a9182f6cabf198b7763bcb9d40 (diff)
parent6a94cb73064c952255336cc57731904174b2c58f (diff)
Merge branch 'linus' into stackprotector
Conflicts: arch/x86/include/asm/pda.h kernel/fork.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c540
1 files changed, 301 insertions, 239 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index d897a524e7d..c731dd820d1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
+#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
@@ -71,6 +72,7 @@
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
+#include <trace/sched.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -116,6 +118,12 @@
*/
#define RUNTIME_INF ((u64)~0ULL)
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
#ifdef CONFIG_SMP
/*
* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -201,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
}
static inline int rt_bandwidth_enabled(void)
@@ -226,9 +233,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start(&rt_b->rt_period_timer,
- rt_b->rt_period_timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -260,6 +266,10 @@ struct task_group {
struct cgroup_subsys_state css;
#endif
+#ifdef CONFIG_USER_SCHED
+ uid_t uid;
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
@@ -285,6 +295,12 @@ struct task_group {
#ifdef CONFIG_USER_SCHED
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+ user->tg->uid = user->uid;
+}
+
/*
* Root task group.
* Every UID task group (including init_task_group aka UID-0) will
@@ -344,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p)
struct task_group *tg;
#ifdef CONFIG_USER_SCHED
- tg = p->user->tg;
+ rcu_read_lock();
+ tg = __task_cred(p)->user->tg;
+ rcu_read_unlock();
#elif defined(CONFIG_CGROUP_SCHED)
tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
struct task_group, css);
@@ -385,7 +403,6 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
- u64 pair_start;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
@@ -397,9 +414,9 @@ struct cfs_rq {
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next;
+ struct sched_entity *curr, *next, *last;
- unsigned long nr_spread_over;
+ unsigned int nr_spread_over;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -586,6 +603,8 @@ struct rq {
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_exp_empty;
@@ -703,45 +722,18 @@ static __read_mostly char *sched_feat_names[] = {
#undef SCHED_FEAT
-static int sched_feat_open(struct inode *inode, struct file *filp)
+static int sched_feat_show(struct seq_file *m, void *v)
{
- filp->private_data = inode->i_private;
- return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char *buf;
- int r = 0;
- int len = 0;
int i;
for (i = 0; sched_feat_names[i]; i++) {
- len += strlen(sched_feat_names[i]);
- len += 4;
+ if (!(sysctl_sched_features & (1UL << i)))
+ seq_puts(m, "NO_");
+ seq_printf(m, "%s ", sched_feat_names[i]);
}
+ seq_puts(m, "\n");
- buf = kmalloc(len + 2, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- for (i = 0; sched_feat_names[i]; i++) {
- if (sysctl_sched_features & (1UL << i))
- r += sprintf(buf + r, "%s ", sched_feat_names[i]);
- else
- r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
- }
-
- r += sprintf(buf + r, "\n");
- WARN_ON(r >= len + 2);
-
- r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
- kfree(buf);
-
- return r;
+ return 0;
}
static ssize_t
@@ -786,10 +778,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
return cnt;
}
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_feat_show, NULL);
+}
+
static struct file_operations sched_feat_fops = {
- .open = sched_feat_open,
- .read = sched_feat_read,
- .write = sched_feat_write,
+ .open = sched_feat_open,
+ .write = sched_feat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
};
static __init int sched_init_debug(void)
@@ -818,6 +817,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
unsigned int sysctl_sched_shares_ratelimit = 250000;
/*
+ * Inject some fuzzyness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
*/
@@ -962,6 +968,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
}
}
+void task_rq_unlock_wait(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+
+ smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+ spin_unlock_wait(&rq->lock);
+}
+
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
@@ -1063,7 +1077,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
- timer->expires = time;
+ hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
hrtimer_restart(timer);
@@ -1124,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq)
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
@@ -1438,9 +1451,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
+ unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
- if (rq->nr_running)
- rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+ if (nr_running)
+ rq->avg_load_per_task = rq->load.weight / nr_running;
+ else
+ rq->avg_load_per_task = 0;
return rq->avg_load_per_task;
}
@@ -1453,30 +1469,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
* Calculate and set the cpu's group shares.
*/
static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares, unsigned long sd_rq_weight)
+update_group_shares_cpu(struct task_group *tg, int cpu,
+ unsigned long sd_shares, unsigned long sd_rq_weight)
{
- int boost = 0;
unsigned long shares;
unsigned long rq_weight;
if (!tg->se[cpu])
return;
- rq_weight = tg->cfs_rq[cpu]->load.weight;
-
- /*
- * If there are currently no tasks on the cpu pretend there is one of
- * average load so that when a new task gets to run here it will not
- * get delayed by group starvation.
- */
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- if (unlikely(rq_weight > sd_rq_weight))
- rq_weight = sd_rq_weight;
+ rq_weight = tg->cfs_rq[cpu]->rq_weight;
/*
* \Sum shares * rq_weight
@@ -1484,20 +1486,20 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
* \Sum rq_weight
*
*/
- shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ shares = (sd_shares * rq_weight) / sd_rq_weight;
+ shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
- /*
- * record the actual number of shares, not the boosted amount.
- */
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- tg->cfs_rq[cpu]->rq_weight = rq_weight;
+ if (abs(shares - tg->se[cpu]->load.weight) >
+ sysctl_sched_shares_thresh) {
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
+ spin_lock_irqsave(&rq->lock, flags);
+ tg->cfs_rq[cpu]->shares = shares;
- __set_se_shares(tg->se[cpu], shares);
+ __set_se_shares(tg->se[cpu], shares);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
}
/*
@@ -1507,13 +1509,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
*/
static int tg_shares_up(struct task_group *tg, void *data)
{
- unsigned long rq_weight = 0;
+ unsigned long weight, rq_weight = 0;
unsigned long shares = 0;
struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
- rq_weight += tg->cfs_rq[i]->load.weight;
+ /*
+ * If there are currently no tasks on the cpu pretend there
+ * is one of average load so that when a new task gets to
+ * run here it will not get delayed by group starvation.
+ */
+ weight = tg->cfs_rq[i]->load.weight;
+ if (!weight)
+ weight = NICE_0_LOAD;
+
+ tg->cfs_rq[i]->rq_weight = weight;
+ rq_weight += weight;
shares += tg->cfs_rq[i]->shares;
}
@@ -1523,17 +1535,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
shares = tg->shares;
- if (!rq_weight)
- rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
- for_each_cpu_mask(i, sd->span) {
- struct rq *rq = cpu_rq(i);
- unsigned long flags;
-
- spin_lock_irqsave(&rq->lock, flags);
- __update_group_shares_cpu(tg, i, shares, rq_weight);
- spin_unlock_irqrestore(&rq->lock, flags);
- }
+ for_each_cpu_mask(i, sd->span)
+ update_group_shares_cpu(tg, i, shares, rq_weight);
return 0;
}
@@ -1596,6 +1599,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
#endif
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ int ret = 0;
+
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work good under rq->lock */
+ spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
+ if (unlikely(!spin_trylock(&busiest->lock))) {
+ if (busiest < this_rq) {
+ spin_unlock(&this_rq->lock);
+ spin_lock(&busiest->lock);
+ spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+ ret = 1;
+ } else
+ spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+ }
+ return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(busiest->lock)
+{
+ spin_unlock(&busiest->lock);
+ lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1800,7 +1836,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
/*
* Buddy candidates are cache hot:
*/
- if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+ if (sched_feat(CACHE_HOT_BUDDY) &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
return 1;
if (p->sched_class != &fair_sched_class)
@@ -1827,6 +1865,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
clock_offset = old_rq->clock - new_rq->clock;
+ trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
#ifdef CONFIG_SCHEDSTATS
if (p->se.wait_start)
p->se.wait_start -= clock_offset;
@@ -1936,6 +1976,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
+ trace_sched_wait_task(rq, p);
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
@@ -2235,6 +2276,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
smp_wmb();
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
old_state = p->state;
if (!(old_state & state))
goto out;
@@ -2292,14 +2334,11 @@ out_activate:
schedstat_inc(p, se.nr_wakeups_local);
else
schedstat_inc(p, se.nr_wakeups_remote);
- update_rq_clock(rq);
activate_task(rq, p, 1);
success = 1;
out_running:
- trace_mark(kernel_sched_wakeup,
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- p->pid, p->state, rq, p, rq->curr);
+ trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
@@ -2432,9 +2471,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
p->sched_class->task_new(rq, p);
inc_nr_running(rq);
}
- trace_mark(kernel_sched_wakeup_new,
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- p->pid, p->state, rq, p, rq->curr);
+ trace_sched_wakeup_new(rq, p, 1);
check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
@@ -2607,11 +2644,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_mark(kernel_sched_schedule,
- "prev_pid %d next_pid %d prev_state %ld "
- "## rq %p prev %p next %p",
- prev->pid, next->pid, prev->state,
- rq, prev, next);
+ trace_sched_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -2801,40 +2834,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
}
/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
- __releases(this_rq->lock)
- __acquires(busiest->lock)
- __acquires(this_rq->lock)
-{
- int ret = 0;
-
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work good under rq->lock */
- spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
- if (unlikely(!spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- spin_unlock(&this_rq->lock);
- spin_lock(&busiest->lock);
- spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
- ret = 1;
- } else
- spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
- }
- return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
- __releases(busiest->lock)
-{
- spin_unlock(&busiest->lock);
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
-/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
* allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -3344,7 +3343,7 @@ small_imbalance:
} else
this_load_per_task = cpu_avg_load_per_task(this_cpu);
- if (max_load - this_load + 2*busiest_load_per_task >=
+ if (max_load - this_load + busiest_load_per_task >=
busiest_load_per_task * imbn) {
*imbalance = busiest_load_per_task;
return busiest;
@@ -3695,7 +3694,7 @@ out_balanced:
static void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
- int pulled_task = -1;
+ int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
cpumask_t tmpmask;
@@ -4052,23 +4051,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);
/*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
*/
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
- u64 ns, delta_exec;
struct rq *rq;
+ u64 ns = 0;
rq = task_rq_lock(p, &flags);
- ns = p->se.sum_exec_runtime;
+
if (task_current(rq, p)) {
+ u64 delta_exec;
+
update_rq_clock(rq);
delta_exec = rq->clock - p->se.exec_start;
if ((s64)delta_exec > 0)
- ns += delta_exec;
+ ns = delta_exec;
}
+
task_rq_unlock(rq, &flags);
return ns;
@@ -4085,6 +4087,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
cputime64_t tmp;
p->utime = cputime_add(p->utime, cputime);
+ account_group_user_time(p, cputime);
/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
@@ -4109,6 +4112,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
tmp = cputime_to_cputime64(cputime);
p->utime = cputime_add(p->utime, cputime);
+ account_group_user_time(p, cputime);
p->gtime = cputime_add(p->gtime, cputime);
cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4144,6 +4148,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
}
p->stime = cputime_add(p->stime, cputime);
+ account_group_system_time(p, cputime);
/* Add system time to cpustat. */
tmp = cputime_to_cputime64(cputime);
@@ -4320,7 +4325,7 @@ void __kprobes sub_preempt_count(int val)
/*
* Underflow?
*/
- if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
return;
/*
* Is the spinlock portion underflowing?
@@ -4441,12 +4446,8 @@ need_resched_nonpreemptible:
if (sched_feat(HRTICK))
hrtick_clear(rq);
- /*
- * Do the rq-clock update outside the rq lock:
- */
- local_irq_disable();
+ spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- spin_lock(&rq->lock);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -5119,6 +5120,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
set_load_weight(p);
}
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ bool match;
+
+ rcu_read_lock();
+ pcred = __task_cred(p);
+ match = (cred->euid == pcred->euid ||
+ cred->euid == pcred->uid);
+ rcu_read_unlock();
+ return match;
+}
+
static int __sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param, bool user)
{
@@ -5178,8 +5195,7 @@ recheck:
return -EPERM;
/* can't change other user's priorities */
- if ((current->euid != p->euid) &&
- (current->euid != p->uid))
+ if (!check_same_owner(p))
return -EPERM;
}
@@ -5411,8 +5427,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
read_unlock(&tasklist_lock);
retval = -EPERM;
- if ((current->euid != p->euid) && (current->euid != p->uid) &&
- !capable(CAP_SYS_NICE))
+ if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p, 0, NULL);
@@ -5851,6 +5866,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ spin_lock_irqsave(&rq->lock, flags);
+
__sched_fork(idle);
idle->se.exec_start = sched_clock();
@@ -5858,7 +5875,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->cpus_allowed = cpumask_of_cpu(cpu);
__set_task_cpu(idle, cpu);
- spin_lock_irqsave(&rq->lock, flags);
rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
idle->oncpu = 1;
@@ -5875,6 +5891,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
+ ftrace_graph_init_task(idle);
}
/*
@@ -6105,7 +6122,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
/*
* Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
*/
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
@@ -6566,7 +6582,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
req = list_entry(rq->migration_queue.next,
struct migration_req, list);
list_del_init(&req->list);
+ spin_unlock_irq(&rq->lock);
complete(&req->done);
+ spin_lock_irq(&rq->lock);
}
spin_unlock_irq(&rq->lock);
break;
@@ -6615,28 +6633,6 @@ early_initcall(migration_init);
#ifdef CONFIG_SCHED_DEBUG
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
- switch (lvl) {
- case SD_LV_NONE:
- return "NONE";
- case SD_LV_SIBLING:
- return "SIBLING";
- case SD_LV_MC:
- return "MC";
- case SD_LV_CPU:
- return "CPU";
- case SD_LV_NODE:
- return "NODE";
- case SD_LV_ALLNODES:
- return "ALLNODES";
- case SD_LV_MAX:
- return "MAX";
-
- }
- return "MAX";
-}
-
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_t *groupmask)
{
@@ -6656,8 +6652,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %s level %s\n",
- str, sd_level_to_string(sd->level));
+ printk(KERN_CONT "span %s level %s\n", str, sd->name);
if (!cpu_isset(cpu, sd->span)) {
printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6793,6 +6788,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES);
+ if (nr_node_ids == 1)
+ pflags &= ~SD_SERIALIZE;
}
if (~cflags & pflags)
return 0;
@@ -6868,15 +6865,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */
- for (tmp = sd; tmp; tmp = tmp->parent) {
+ for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
if (!parent)
break;
+
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
- }
+ } else
+ tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
@@ -7311,13 +7310,21 @@ struct allmasks {
};
#if NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC 1
-#define SCHED_CPUMASK_FREE(v) kfree(v)
-#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+ *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+ kfree(masks);
+}
#else
-#define SCHED_CPUMASK_ALLOC 0
-#define SCHED_CPUMASK_FREE(v)
-#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
#endif
#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
@@ -7393,9 +7400,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
return -ENOMEM;
}
-#if SCHED_CPUMASK_ALLOC
/* get space for all scratch cpumask variables */
- allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+ sched_cpumask_alloc(&allmasks);
if (!allmasks) {
printk(KERN_WARNING "Cannot alloc cpumask array\n");
kfree(rd);
@@ -7404,7 +7410,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
#endif
return -ENOMEM;
}
-#endif
+
tmpmask = (cpumask_t *)allmasks;
@@ -7658,13 +7664,14 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
cpu_attach_domain(sd, rd, i);
}
- SCHED_CPUMASK_FREE((void *)allmasks);
+ sched_cpumask_free(allmasks);
return 0;
#ifdef CONFIG_NUMA
error:
free_sched_groups(cpu_map, tmpmask);
- SCHED_CPUMASK_FREE((void *)allmasks);
+ sched_cpumask_free(allmasks);
+ kfree(rd);
return -ENOMEM;
#endif
}
@@ -7686,8 +7693,14 @@ static struct sched_domain_attr *dattr_cur;
*/
static cpumask_t fallback_doms;
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
{
+ return 0;
}
/*
@@ -7727,8 +7740,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
cpumask_t tmpmask;
int i;
- unregister_sched_domain_sysctl();
-
for_each_cpu_mask_nr(i, *cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
synchronize_sched();
@@ -7766,13 +7777,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
*
* The passed in 'doms_new' should be kmalloc'd. This routine takes
* ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * ndoms_new == 1, and partition_sched_domains() will fallback to
+ * the single partition 'fallback_doms', it also forces the domains
+ * to be rebuilt.
*
- * If doms_new==NULL it will be replaced with cpu_online_map.
- * ndoms_new==0 is a special case for destroying existing domains.
- * It will not create the default domain.
+ * If doms_new == NULL it will be replaced with cpu_online_map.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
*
* Call with hotplug lock held
*/
@@ -7780,17 +7792,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
struct sched_domain_attr *dattr_new)
{
int i, j, n;
+ int new_topology;
mutex_lock(&sched_domains_mutex);
/* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl();
+ /* Let architecture update cpu core mappings. */
+ new_topology = arch_update_cpu_topology();
+
n = doms_new ? ndoms_new : 0;
/* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < n; j++) {
+ for (j = 0; j < n && !new_topology; j++) {
if (cpus_equal(doms_cur[i], doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
@@ -7805,12 +7821,12 @@ match1:
ndoms_cur = 0;
doms_new = &fallback_doms;
cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
- dattr_new = NULL;
+ WARN_ON_ONCE(dattr_new);
}
/* Build new domains */
for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < ndoms_cur; j++) {
+ for (j = 0; j < ndoms_cur && !new_topology; j++) {
if (cpus_equal(doms_new[i], doms_cur[j])
&& dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2;
@@ -8465,7 +8481,7 @@ static
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
struct cfs_rq *cfs_rq;
- struct sched_entity *se, *parent_se;
+ struct sched_entity *se;
struct rq *rq;
int i;
@@ -8481,18 +8497,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
for_each_possible_cpu(i) {
rq = cpu_rq(i);
- cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
- GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+ GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
goto err;
- se = kmalloc_node(sizeof(struct sched_entity),
- GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ se = kzalloc_node(sizeof(struct sched_entity),
+ GFP_KERNEL, cpu_to_node(i));
if (!se)
goto err;
- parent_se = parent ? parent->se[i] : NULL;
- init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
}
return 1;
@@ -8553,7 +8568,7 @@ static
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
struct rt_rq *rt_rq;
- struct sched_rt_entity *rt_se, *parent_se;
+ struct sched_rt_entity *rt_se;
struct rq *rq;
int i;
@@ -8570,18 +8585,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
for_each_possible_cpu(i) {
rq = cpu_rq(i);
- rt_rq = kmalloc_node(sizeof(struct rt_rq),
- GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ rt_rq = kzalloc_node(sizeof(struct rt_rq),
+ GFP_KERNEL, cpu_to_node(i));
if (!rt_rq)
goto err;
- rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
- GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+ GFP_KERNEL, cpu_to_node(i));
if (!rt_se)
goto err;
- parent_se = parent ? parent->rt_se[i] : NULL;
- init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+ init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
}
return 1;
@@ -9224,11 +9238,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
* (balbir@in.ibm.com).
*/
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 *cpuusage;
+ struct cpuacct *parent;
};
struct cgroup_subsys cpuacct_subsys;
@@ -9262,6 +9277,9 @@ static struct cgroup_subsys_state *cpuacct_create(
return ERR_PTR(-ENOMEM);
}
+ if (cgrp->parent)
+ ca->parent = cgroup_ca(cgrp->parent);
+
return &ca->css;
}
@@ -9275,6 +9293,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
kfree(ca);
}
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 data;
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ */
+ spin_lock_irq(&cpu_rq(cpu)->lock);
+ data = *cpuusage;
+ spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ data = *cpuusage;
+#endif
+
+ return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+ spin_lock_irq(&cpu_rq(cpu)->lock);
+ *cpuusage = val;
+ spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ *cpuusage = val;
+#endif
+}
+
/* return total cpu usage (in nanoseconds) of a group */
static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
{
@@ -9282,17 +9335,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
u64 totalcpuusage = 0;
int i;
- for_each_possible_cpu(i) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
- /*
- * Take rq->lock to make 64-bit addition safe on 32-bit
- * platforms.
- */
- spin_lock_irq(&cpu_rq(i)->lock);
- totalcpuusage += *cpuusage;
- spin_unlock_irq(&cpu_rq(i)->lock);
- }
+ for_each_present_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i);
return totalcpuusage;
}
@@ -9309,23 +9353,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
goto out;
}
- for_each_possible_cpu(i) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+ for_each_present_cpu(i)
+ cpuacct_cpuusage_write(ca, i, 0);
- spin_lock_irq(&cpu_rq(i)->lock);
- *cpuusage = 0;
- spin_unlock_irq(&cpu_rq(i)->lock);
- }
out:
return err;
}
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct cpuacct *ca = cgroup_ca(cgroup);
+ u64 percpu;
+ int i;
+
+ for_each_present_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i);
+ seq_printf(m, "%llu ", (unsigned long long) percpu);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
static struct cftype files[] = {
{
.name = "usage",
.read_u64 = cpuusage_read,
.write_u64 = cpuusage_write,
},
+ {
+ .name = "usage_percpu",
+ .read_seq_string = cpuacct_percpu_seq_read,
+ },
+
};
static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9341,14 +9401,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
+ int cpu;
if (!cpuacct_subsys.active)
return;
+ cpu = task_cpu(tsk);
ca = task_ca(tsk);
- if (ca) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
+ for (; ca; ca = ca->parent) {
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
}