Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  106
1 file changed, 61 insertions, 45 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index d906f72b42d..558e5f28426 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
+#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
@@ -227,9 +228,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start(&rt_b->rt_period_timer,
- rt_b->rt_period_timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -386,7 +386,6 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
- u64 pair_start;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
@@ -398,9 +397,9 @@ struct cfs_rq {
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next;
+ struct sched_entity *curr, *next, *last;
- unsigned long nr_spread_over;
+ unsigned int nr_spread_over;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -819,6 +818,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
unsigned int sysctl_sched_shares_ratelimit = 250000;
/*
+ * Inject some fuzzyness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
*/
@@ -963,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
}
}
+void task_rq_unlock_wait(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+
+ smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+ spin_unlock_wait(&rq->lock);
+}
+
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
@@ -1064,7 +1078,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
- timer->expires = time;
+ hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
hrtimer_restart(timer);
@@ -1442,6 +1456,8 @@ static unsigned long cpu_avg_load_per_task(int cpu)
if (rq->nr_running)
rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+ else
+ rq->avg_load_per_task = 0;
return rq->avg_load_per_task;
}
@@ -1454,8 +1470,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
* Calculate and set the cpu's group shares.
*/
static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares, unsigned long sd_rq_weight)
+update_group_shares_cpu(struct task_group *tg, int cpu,
+ unsigned long sd_shares, unsigned long sd_rq_weight)
{
int boost = 0;
unsigned long shares;
@@ -1486,19 +1502,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
*
*/
shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
- /*
- * record the actual number of shares, not the boosted amount.
- */
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- tg->cfs_rq[cpu]->rq_weight = rq_weight;
+ if (abs(shares - tg->se[cpu]->load.weight) >
+ sysctl_sched_shares_thresh) {
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
+ spin_lock_irqsave(&rq->lock, flags);
+ /*
+ * record the actual number of shares, not the boosted amount.
+ */
+ tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+ tg->cfs_rq[cpu]->rq_weight = rq_weight;
- __set_se_shares(tg->se[cpu], shares);
+ __set_se_shares(tg->se[cpu], shares);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
}
/*
@@ -1527,14 +1547,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
if (!rq_weight)
rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
- for_each_cpu_mask(i, sd->span) {
- struct rq *rq = cpu_rq(i);
- unsigned long flags;
-
- spin_lock_irqsave(&rq->lock, flags);
- __update_group_shares_cpu(tg, i, shares, rq_weight);
- spin_unlock_irqrestore(&rq->lock, flags);
- }
+ for_each_cpu_mask(i, sd->span)
+ update_group_shares_cpu(tg, i, shares, rq_weight);
return 0;
}
@@ -1801,7 +1815,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
/*
* Buddy candidates are cache hot:
*/
- if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+ if (sched_feat(CACHE_HOT_BUDDY) &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
return 1;
if (p->sched_class != &fair_sched_class)
@@ -3339,7 +3355,7 @@ small_imbalance:
} else
this_load_per_task = cpu_avg_load_per_task(this_cpu);
- if (max_load - this_load + 2*busiest_load_per_task >=
+ if (max_load - this_load + busiest_load_per_task >=
busiest_load_per_task * imbn) {
*imbalance = busiest_load_per_task;
return busiest;
@@ -4186,7 +4202,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
if (p == rq->idle) {
p->stime = cputime_add(p->stime, steal);
- account_group_system_time(p, steal);
if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
else
@@ -4322,7 +4337,7 @@ void __kprobes sub_preempt_count(int val)
/*
* Underflow?
*/
- if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
return;
/*
* Is the spinlock portion underflowing?
@@ -4443,12 +4458,8 @@ need_resched_nonpreemptible:
if (sched_feat(HRTICK))
hrtick_clear(rq);
- /*
- * Do the rq-clock update outside the rq lock:
- */
- local_irq_disable();
+ spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- spin_lock(&rq->lock);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -5858,6 +5869,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ spin_lock_irqsave(&rq->lock, flags);
+
__sched_fork(idle);
idle->se.exec_start = sched_clock();
@@ -5865,7 +5878,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->cpus_allowed = cpumask_of_cpu(cpu);
__set_task_cpu(idle, cpu);
- spin_lock_irqsave(&rq->lock, flags);
rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
idle->oncpu = 1;
@@ -6875,15 +6887,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */
- for (tmp = sd; tmp; tmp = tmp->parent) {
+ for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
if (!parent)
break;
+
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
- }
+ } else
+ tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
@@ -7672,6 +7686,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
error:
free_sched_groups(cpu_map, tmpmask);
SCHED_CPUMASK_FREE((void *)allmasks);
+ kfree(rd);
return -ENOMEM;
#endif
}
@@ -7773,13 +7788,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
*
* The passed in 'doms_new' should be kmalloc'd. This routine takes
* ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * ndoms_new == 1, and partition_sched_domains() will fallback to
+ * the single partition 'fallback_doms', it also forces the domains
+ * to be rebuilt.
*
- * If doms_new==NULL it will be replaced with cpu_online_map.
- * ndoms_new==0 is a special case for destroying existing domains.
- * It will not create the default domain.
+ * If doms_new == NULL it will be replaced with cpu_online_map.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
*
* Call with hotplug lock held
*/